Merge branch 'master' into large-html-to-text-error-200-blank

Merge branch 'large-html-to-text-error-200-blank' of github.com:dgtlmoon/changedetection.io into large-html-to-text-error-200-blank
Fix: Some SPAs with long content - Stripping tags must also find matching close tag
2026-02-21 13:46:08 +00:00 · 2026-02-17 16:42:45 +01:00 · 2026-02-17 16:42:09 +01:00 · 2026-02-17 16:41:50 +01:00 · 2026-02-17 15:38:36 +01:00 · 2026-02-17 15:37:35 +01:00
2 changed files with 91 additions and 1 deletions
--- a/changedetectionio/html_tools.py
+++ b/changedetectionio/html_tools.py
@@ -569,7 +569,8 @@ def html_to_text(html_content: str, render_anchor_tag_content=False, is_rss=Fals
        # Strip bloat in one pass, SPA's often dump 10Mb+ into the <head> for styles, which is not needed
        # Causing inscriptis to silently exit when more than ~10MB is found.
        # All we are doing here is converting the HTML to text, no CSS layout etc
-        html_content = re.sub(r'<(?:style|script|svg|noscript)[^>]*>.*?</(?:style|script|svg|noscript)>|<(?:link|meta)[^>]*/?>|<!--.*?-->',
+        # Use backreference (\1) to ensure opening/closing tags match (prevents <style> matching </svg> in CSS data URIs)
+        html_content = re.sub(r'<(style|script|svg|noscript)[^>]*>.*?</\1>|<(?:link|meta)[^>]*/?>|<!--.*?-->',
                              '', html_content, flags=re.DOTALL | re.IGNORECASE)

        # SPAs often use <body style="display:none"> to hide content until JS loads
--- a/changedetectionio/tests/unit/test_html_to_text.py
+++ b/changedetectionio/tests/unit/test_html_to_text.py
@@ -362,6 +362,95 @@ class TestHtmlToText(unittest.TestCase):

        print("  ✓ All display:none body tag tests passed")

+    def test_style_tag_with_svg_data_uri(self):
+        """
+        Test that style tags containing SVG data URIs are properly stripped.
+
+        Some WordPress and modern sites embed SVG as data URIs in CSS, which contains
+        <svg> and </svg> tags within the style content. The regex must use backreferences
+        to ensure <style> matches </style> (not </svg> inside the CSS).
+
+        This was causing errors where the regex would match <style> and stop at the first
+        </svg> it encountered inside a CSS data URI, breaking the HTML structure.
+        """
+        # Real-world example from WordPress wp-block-image styles
+        html = '''<!DOCTYPE html>
+<html>
+<head>
+    <style id='wp-block-image-inline-css'>
+.wp-block-image>a,.wp-block-image>figure>a{display:inline-block}.wp-block-image img{box-sizing:border-box;height:auto;max-width:100%;vertical-align:bottom}@supports ((-webkit-mask-image:none) or (mask-image:none)) or (-webkit-mask-image:none){.wp-block-image.is-style-circle-mask img{border-radius:0;-webkit-mask-image:url('data:image/svg+xml;utf8,<svg viewBox="0 0 100 100" xmlns="http://www.w3.org/2000/svg"><circle cx="50" cy="50" r="50"/></svg>');mask-image:url('data:image/svg+xml;utf8,<svg viewBox="0 0 100 100" xmlns="http://www.w3.org/2000/svg"><circle cx="50" cy="50" r="50"/></svg>');mask-mode:alpha}}
+    </style>
+</head>
+<body>
+    <h1>Test Heading</h1>
+    <p>This is the actual content that should be extracted.</p>
+    <div class="wp-block-image">
+        <img src="test.jpg" alt="Test image">
+    </div>
+</body>
+</html>'''
+
+        # This should not crash and should extract the body content
+        text = html_to_text(html)
+
+        # Verify the actual body content was extracted
+        assert text is not None, "html_to_text returned None"
+        assert len(text) > 0, "html_to_text returned empty string"
+        assert 'Test Heading' in text, "Failed to extract heading"
+        assert 'actual content that should be extracted' in text, "Failed to extract paragraph"
+
+        # Verify CSS content was stripped (including the SVG data URI)
+        assert '.wp-block-image' not in text, "CSS class selector leaked into text"
+        assert 'mask-image' not in text, "CSS property leaked into text"
+        assert 'data:image/svg+xml' not in text, "SVG data URI leaked into text"
+        assert 'viewBox' not in text, "SVG attributes leaked into text"
+
+        # Verify no broken HTML structure
+        assert '<style' not in text, "Unclosed style tag in output"
+        assert '</svg>' not in text, "SVG closing tag leaked into text"
+
+        print("  ✓ Style tag with SVG data URI test passed")
+
+    def test_style_tag_closes_correctly(self):
+        """
+        Test that each tag type (style, script, svg) closes with the correct closing tag.
+
+        Before the fix, the regex used (?:style|script|svg|noscript) for both opening and
+        closing tags, which meant <style> could incorrectly match </svg> as its closing tag.
+        With backreferences, <style> must close with </style>, <svg> with </svg>, etc.
+        """
+        # Test nested tags where incorrect matching would break
+        html = '''<!DOCTYPE html>
+<html>
+<head>
+    <style>
+        body { background: url('data:image/svg+xml,<svg><rect/></svg>'); }
+    </style>
+    <script>
+        const svg = '<svg><path d="M0,0"/></svg>';
+    </script>
+</head>
+<body>
+    <h1>Content</h1>
+    <svg><circle cx="50" cy="50" r="40"/></svg>
+    <p>After SVG</p>
+</body>
+</html>'''
+
+        text = html_to_text(html)
+
+        # Should extract body content
+        assert 'Content' in text, "Failed to extract heading"
+        assert 'After SVG' in text, "Failed to extract content after SVG"
+
+        # Should strip all style/script/svg content
+        assert 'background:' not in text, "Style content leaked"
+        assert 'const svg' not in text, "Script content leaked"
+        assert '<circle' not in text, "SVG element leaked"
+        assert 'data:image/svg+xml' not in text, "Data URI leaked"
+
+        print("  ✓ Tag closing validation test passed")
+


 if __name__ == '__main__':
Author	SHA1	Message	Date
dgtlmoon	ae85393531	Merge branch 'master' into large-html-to-text-error-200-blank	2026-02-17 16:42:45 +01:00
dgtlmoon	3391481c2e	Merge branch 'large-html-to-text-error-200-blank' of github.com:dgtlmoon/changedetection.io into large-html-to-text-error-200-blank	2026-02-17 16:42:09 +01:00
dgtlmoon	d4f3640cb4	Fix: Some SPAs with long content - Stripping tags must also find matching close tag	2026-02-17 16:41:50 +01:00
dgtlmoon	9d4df30f1e	Merge branch 'master' into large-html-to-text-error-200-blank	2026-02-17 15:38:36 +01:00
dgtlmoon	46a898ffda	Fix: Some SPA's also set body content to display: none which breaks text output	2026-02-17 15:37:35 +01:00
dgtlmoon	c6f9fb922d	Adding test	2026-02-17 14:29:30 +01:00
dgtlmoon	fc82790db8	"Error 200 no content" - Some very large SPA pages make HTML to Text fail by dumping 10Mb+ into page header, strip extras.	2026-02-17 14:18:45 +01:00