Lazy load flask_compress

UI - Content compression was not obeying FLASK_ENABLE_COMPRESSION, should be off by default due to a memory leak in flask_compress & socket.io
0.53.5
2026-02-21 05:36:06 +00:00 · 2026-02-20 08:56:25 +01:00 · 2026-02-20 08:54:10 +01:00 · 2026-02-20 00:57:52 +01:00 · 2026-02-20 00:55:37 +01:00
6 changed files with 217 additions and 28 deletions
--- a/changedetectionio/init.py
+++ b/changedetectionio/init.py
@@ -2,7 +2,7 @@

 # Read more https://github.com/dgtlmoon/changedetection.io/wiki
 # Semver means never use .01, or 00. Should be .1.
-__version__ = '0.53.4'
+__version__ = '0.53.5'

 from changedetectionio.strtobool import strtobool
 from json.decoder import JSONDecodeError
--- a/changedetectionio/flask_app.py
+++ b/changedetectionio/flask_app.py
@@ -27,7 +27,6 @@ from flask import (
    session,
    url_for,
 )
-from flask_compress import Compress as FlaskCompress
 from flask_restful import abort, Api
 from flask_cors import CORS

@@ -74,14 +73,14 @@ CORS(app)
 # There's also a bug between flask compress and socketio that causes some kind of slow memory leak
 # It's better to use compression on your reverse proxy (nginx etc) instead.
 if strtobool(os.getenv("FLASK_ENABLE_COMPRESSION")):
+    from flask_compress import Compress as FlaskCompress
    app.config['COMPRESS_MIN_SIZE'] = 2096
    app.config['COMPRESS_MIMETYPES'] = ['text/html', 'text/css', 'text/javascript', 'application/json', 'application/javascript', 'image/svg+xml']
    # Use gzip only - smaller memory footprint than zstd/brotli (4-8KB vs 200-500KB contexts)
    app.config['COMPRESS_ALGORITHM'] = ['gzip']
+    compress = FlaskCompress()
+    compress.init_app(app)

-compress = FlaskCompress()
-
-compress.init_app(app)
 app.config['TEMPLATES_AUTO_RELOAD'] = False


--- a/changedetectionio/html_tools.py
+++ b/changedetectionio/html_tools.py
@@ -561,31 +561,33 @@ def html_to_text(html_content: str, render_anchor_tag_content=False, is_rss=Fals
        )
    else:
        parser_config = None
-
    if is_rss:
        html_content = re.sub(r'<title([\s>])', r'<h1\1', html_content)
        html_content = re.sub(r'</title>', r'</h1>', html_content)
    else:
-        # Strip bloat in one pass, SPA's often dump 10Mb+ into the <head> for styles, which is not needed
-        # Causing inscriptis to silently exit when more than ~10MB is found.
-        # All we are doing here is converting the HTML to text, no CSS layout etc
-        # Use backreference (\1) to ensure opening/closing tags match (prevents <style> matching </svg> in CSS data URIs)
-        html_content = re.sub(r'<(style|script|svg|noscript)[^>]*>.*?</\1>|<(?:link|meta)[^>]*/?>|<!--.*?-->',
-                              '', html_content, flags=re.DOTALL | re.IGNORECASE)
+        # Use BS4 html.parser to strip bloat — SPAs often dump 10MB+ of CSS/JS into <head>,
+        # causing inscriptis to silently give up. Regex-based stripping is unsafe because tags
+        # can appear inside JSON data attributes with JS-escaped closing tags (e.g. <\/script>),
+        # causing the regex to scan past the intended close and eat real page content.
+        from bs4 import BeautifulSoup
+        soup = BeautifulSoup(html_content, 'html.parser')
+        # Strip tags that inscriptis cannot render as meaningful text and which can be very large.
+        # svg/math: produce path-data/MathML garbage; canvas/iframe/template: no inscriptis handlers.
+        # video/audio/picture are kept — they may contain meaningful fallback text or captions.
+        for tag in soup.find_all(['head', 'script', 'style', 'noscript', 'svg',
+                                  'math', 'canvas', 'iframe', 'template']):
+            tag.decompose()

-        # SPAs often use <body style="display:none"> to hide content until JS loads
-        # inscriptis respects CSS display rules, so we need to remove these hiding styles
-        # to extract the actual page content
-        body_style_pattern = r'(<body[^>]*)\s+style\s*=\s*["\']([^"\']*\b(?:display\s*:\s*none|visibility\s*:\s*hidden)\b[^"\']*)["\']'
-
-        # Check if body has hiding styles that need to be fixed
-        body_match = re.search(body_style_pattern, html_content, flags=re.IGNORECASE)
-        if body_match:
-            from loguru import logger
-            logger.debug(f"html_to_text: Removing hiding styles from body tag (found: '{body_match.group(2)}')")
-
-        html_content = re.sub(body_style_pattern, r'\1', html_content, flags=re.IGNORECASE)
+        # SPAs often use <body style="display:none"> to hide content until JS loads.
+        # inscriptis respects CSS display rules, so strip hiding styles from the body tag.
+        body_tag = soup.find('body')
+        if body_tag and body_tag.get('style'):
+            style = body_tag['style']
+            if re.search(r'\b(?:display\s*:\s*none|visibility\s*:\s*hidden)\b', style, re.IGNORECASE):
+                logger.debug(f"html_to_text: Removing hiding styles from body tag (found: '{style}')")
+                del body_tag['style']

+        html_content = str(soup)

    text_content = get_text(html_content, config=parser_config)
    return text_content
--- a/changedetectionio/realtime/socket_server.py
+++ b/changedetectionio/realtime/socket_server.py
@@ -199,11 +199,25 @@ def handle_watch_update(socketio, **kwargs):
        logger.error(f"Socket.IO error in handle_watch_update: {str(e)}")


+def _patch_flask_request_context_session():
+    """Flask 3.1 removed the session setter from RequestContext, but Flask-SocketIO 5.6.0
+    still assigns to it directly (ctx.session = ...).  Restore a setter that writes the
+    private _session attribute so the two libraries work together.
+    """
+    from flask.ctx import RequestContext
+    if getattr(RequestContext.session, 'fset', None) is not None:
+        return  # Already has a setter (future Flask version restored it)
+    original_prop = RequestContext.session
+    RequestContext.session = original_prop.setter(lambda self, value: setattr(self, '_session', value))
+
+
 def init_socketio(app, datastore):
    """Initialize SocketIO with the main Flask app"""
    import platform
    import sys

+    _patch_flask_request_context_session()
+
    # Platform-specific async_mode selection for better stability
    system = platform.system().lower()
    python_version = sys.version_info
--- a/changedetectionio/tests/unit/test_html_to_text.py
+++ b/changedetectionio/tests/unit/test_html_to_text.py
@@ -453,6 +453,175 @@ class TestHtmlToText(unittest.TestCase):



+    def test_script_with_closing_tag_in_string_does_not_eat_content(self):
+        """
+        Script tag containing </script> inside a JS string must not prematurely end the block.
+
+        This is the classic regex failure mode: the old pattern would find the first </script>
+        inside the JS string literal and stop there, leaving the tail of the script block
+        (plus any following content) exposed as raw text. BS4 parses the HTML correctly.
+        """
+        html = '''<html><body>
+<p>Before script</p>
+<script>
+var html = "<div>foo<\\/script><p>bar</p>";
+var also = 1;
+</script>
+<p>AFTER SCRIPT</p>
+</body></html>'''
+
+        text = html_to_text(html)
+        assert 'Before script' in text
+        assert 'AFTER SCRIPT' in text
+        # Script internals must not leak
+        assert 'var html' not in text
+        assert 'var also' not in text
+
+    def test_content_sandwiched_between_multiple_body_scripts(self):
+        """Content between multiple script/style blocks in the body must all survive."""
+        html = '''<html><body>
+<script>var a = 1;</script>
+<p>CONTENT A</p>
+<style>.x { color: red; }</style>
+<p>CONTENT B</p>
+<script>var b = 2;</script>
+<p>CONTENT C</p>
+<style>.y { color: blue; }</style>
+<p>CONTENT D</p>
+</body></html>'''
+
+        text = html_to_text(html)
+        for label in ['CONTENT A', 'CONTENT B', 'CONTENT C', 'CONTENT D']:
+            assert label in text, f"'{label}' was eaten by script/style stripping"
+        assert 'var a' not in text
+        assert 'var b' not in text
+        assert 'color: red' not in text
+        assert 'color: blue' not in text
+
+    def test_unicode_and_international_content_preserved(self):
+        """Non-ASCII content (umlauts, CJK, soft hyphens) must survive stripping."""
+        html = '''<html><body>
+<style>.x{color:red}</style>
+<p>German: Aus\xadge\xadbucht! — ANMELDUNG — Fan\xadday 2026</p>
+<p>Chinese: \u6ce8\u518c</p>
+<p>Japanese: \u767b\u9332</p>
+<p>Korean: \ub4f1\ub85d</p>
+<p>Emoji: \U0001f4e2</p>
+<script>var x = 1;</script>
+</body></html>'''
+
+        text = html_to_text(html)
+        assert 'ANMELDUNG' in text
+        assert '\u6ce8\u518c' in text   # Chinese
+        assert '\u767b\u9332' in text   # Japanese
+        assert '\ub4f1\ub85d' in text   # Korean
+
+    def test_style_with_type_attribute_is_stripped(self):
+        """<style type="text/css"> (with type attribute) must be stripped just like bare <style>."""
+        html = '''<html><body>
+<style type="text/css">.important { display: none; }</style>
+<p>VISIBLE CONTENT</p>
+</body></html>'''
+
+        text = html_to_text(html)
+        assert 'VISIBLE CONTENT' in text
+        assert '.important' not in text
+        assert 'display: none' not in text
+
+    def test_ldjson_script_is_stripped(self):
+        """<script type="application/ld+json"> must be stripped — raw JSON must not appear as text."""
+        html = '''<html><body>
+<script type="application/ld+json">
+{"@type": "Product", "name": "Widget", "price": "9.99"}
+</script>
+<p>PRODUCT PAGE</p>
+</body></html>'''
+
+        text = html_to_text(html)
+        assert 'PRODUCT PAGE' in text
+        assert '@type' not in text
+        assert '"price"' not in text
+
+    def test_inline_svg_is_stripped_entirely(self):
+        """
+        Inline SVG elements in the body are stripped by BS4 before passing to inscriptis.
+        SVGs can be huge (icon libraries, data visualisations) and produce garbage path-data
+        text. The old regex code explicitly stripped <svg>; the BS4 path must do the same.
+        """
+        html = '''<html><body>
+<p>Before SVG</p>
+<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24">
+    <path d="M14 5L7 12L14 19Z" fill="none"/>
+    <circle cx="12" cy="12" r="10"/>
+</svg>
+<p>After SVG</p>
+</body></html>'''
+
+        text = html_to_text(html)
+        assert 'Before SVG' in text
+        assert 'After SVG' in text
+        assert 'M14 5L7' not in text, "SVG path data should not appear in text output"
+        assert 'viewBox' not in text, "SVG attributes should not appear in text output"
+
+    def test_tag_inside_json_data_attribute_does_not_eat_content(self):
+        """
+        Tags inside JSON data attributes with JS-escaped closing tags must not eat real content.
+
+        Real-world case: Elementor/JetEngine WordPress widgets embed HTML (including SVG icons)
+        inside JSON data attributes like data-slider-atts. The HTML inside is JS-escaped, so
+        closing tags appear as <\\/svg> rather than </svg>.
+
+        The old regex approach would find <svg> inside the attribute value, then fail to find
+        <\\/svg> as a matching close tag, and scan forward to the next real </svg> in the DOM —
+        eating tens of kilobytes of actual page content in the process.
+        """
+        html = '''<!DOCTYPE html>
+<html>
+<head><title>Test</title></head>
+<body>
+<div class="slider" data-slider-atts="{&quot;prevArrow&quot;:&quot;<i class=\\&quot;icon\\&quot;><svg width=\\&quot;24\\&quot; height=\\&quot;24\\&quot; viewBox=\\&quot;0 0 24 24\\&quot; xmlns=\\&quot;http:\\/\\/www.w3.org\\/2000\\/svg\\&quot;><path d=\\&quot;M14 5L7 12L14 19\\&quot;\\/><\\/svg><\\/i>&quot;}">
+</div>
+<div class="content">
+    <h1>IMPORTANT CONTENT</h1>
+    <p>This text must not be eaten by the tag-stripping logic.</p>
+</div>
+<svg><circle cx="50" cy="50" r="40"/></svg>
+</body>
+</html>'''
+
+        text = html_to_text(html)
+
+        assert 'IMPORTANT CONTENT' in text, (
+            "Content after a JS-escaped tag in a data attribute was incorrectly stripped. "
+            "The tag-stripping logic is matching <tag> inside attribute values and scanning "
+            "forward to the next real closing tag in the DOM."
+        )
+        assert 'This text must not be eaten' in text
+
+    def test_script_inside_json_data_attribute_does_not_eat_content(self):
+        """Same issue as above but with <script> embedded in a data attribute with JS-escaped closing tag."""
+        html = '''<!DOCTYPE html>
+<html>
+<head><title>Test</title></head>
+<body>
+<div data-config="{&quot;template&quot;:&quot;<script type=\\&quot;text\\/javascript\\&quot;>var x=1;<\\/script>&quot;}">
+</div>
+<div>
+    <h1>MUST SURVIVE</h1>
+    <p>Real content after the data attribute with embedded script tag.</p>
+</div>
+<script>var real = 1;</script>
+</body>
+</html>'''
+
+        text = html_to_text(html)
+
+        assert 'MUST SURVIVE' in text, (
+            "Content after a JS-escaped <script> in a data attribute was incorrectly stripped."
+        )
+        assert 'Real content after the data attribute' in text
+
+
 if __name__ == '__main__':
    # Can run this file directly for quick testing
    unittest.main()
--- a/requirements.txt
+++ b/requirements.txt
@@ -9,10 +9,15 @@ flask_restful
 flask_cors # For the Chrome extension to operate
 # janus # No longer needed - using pure threading.Queue for multi-loop support
 flask_wtf~=1.2
-flask~=3.1
-flask-socketio~=5.6.0
-python-socketio~=5.16.1
-python-engineio~=4.13.1
+# Flask 3.1 removed the session setter on RequestContext; the patch in
+# changedetectionio/realtime/socket_server.py restores it so Flask-SocketIO works.
+# Require >=3.1 (the versions the patch is written for); <4 guards against unknown breaking changes.
+flask>=3.1,<4
+# Flask-SocketIO 5.x still does ctx.session = ... directly; the patch above handles it.
+# >=5.5.0 ensures the threading async_mode we rely on is available.
+flask-socketio>=5.5.0,<6
+python-socketio>=5.11.0,<6
+python-engineio>=4.9.0,<5
 inscriptis~=2.2
 pytz
 timeago~=1.0
Author	SHA1	Message	Date
dgtlmoon	7a51f1e4bf	Lazy load flask_compress Some checks are pending Build and push containers / metadata (push) Waiting to run Details Build and push containers / build-push-containers (push) Waiting to run Details Publish Python 🐍distribution 📦 to PyPI and TestPyPI / Build distribution 📦 (push) Waiting to run Details Publish Python 🐍distribution 📦 to PyPI and TestPyPI / Test the built package works basically. (push) Blocked by required conditions Details Publish Python 🐍distribution 📦 to PyPI and TestPyPI / Publish Python 🐍 distribution 📦 to PyPI (push) Blocked by required conditions Details ChangeDetection.io App Test / lint-code (push) Waiting to run Details ChangeDetection.io App Test / test-application-3-10 (push) Blocked by required conditions Details ChangeDetection.io App Test / test-application-3-11 (push) Blocked by required conditions Details ChangeDetection.io App Test / test-application-3-12 (push) Blocked by required conditions Details ChangeDetection.io App Test / test-application-3-13 (push) Blocked by required conditions Details	2026-02-20 08:56:25 +01:00
dgtlmoon	91dee697f9	UI - Content compression was not obeying FLASK_ENABLE_COMPRESSION, should be off by default due to a memory leak in flask_compress & socket.io	2026-02-20 08:54:10 +01:00
dgtlmoon	4128acf95a	0.53.5 Some checks failed ChangeDetection.io Container Build Test / Build linux/amd64 (alpine) (push) Waiting to run Details ChangeDetection.io Container Build Test / Build linux/arm64 (alpine) (push) Waiting to run Details ChangeDetection.io Container Build Test / Build linux/amd64 (main) (push) Waiting to run Details ChangeDetection.io Container Build Test / Build linux/arm/v7 (main) (push) Waiting to run Details ChangeDetection.io Container Build Test / Build linux/arm/v8 (main) (push) Waiting to run Details ChangeDetection.io Container Build Test / Build linux/arm64 (main) (push) Waiting to run Details Build and push containers / metadata (push) Has been cancelled Details Build and push containers / build-push-containers (push) Has been cancelled Details Publish Python 🐍distribution 📦 to PyPI and TestPyPI / Build distribution 📦 (push) Has been cancelled Details Publish Python 🐍distribution 📦 to PyPI and TestPyPI / Test the built package works basically. (push) Has been cancelled Details Publish Python 🐍distribution 📦 to PyPI and TestPyPI / Publish Python 🐍 distribution 📦 to PyPI (push) Has been cancelled Details ChangeDetection.io App Test / lint-code (push) Has been cancelled Details ChangeDetection.io App Test / test-application-3-10 (push) Has been cancelled Details ChangeDetection.io App Test / test-application-3-11 (push) Has been cancelled Details ChangeDetection.io App Test / test-application-3-12 (push) Has been cancelled Details ChangeDetection.io App Test / test-application-3-13 (push) Has been cancelled Details	2026-02-20 00:57:52 +01:00
dgtlmoon	7c8d59c795	Fixing bad replacement of metadata causing possible content removal #3906 (#3908 )	2026-02-20 00:55:37 +01:00