mirror of
https://github.com/dgtlmoon/changedetection.io.git
synced 2026-02-21 13:46:08 +00:00
Compare commits
7 Commits
api-extend
...
large-html
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
ae85393531 | ||
|
|
3391481c2e | ||
|
|
d4f3640cb4 | ||
|
|
9d4df30f1e | ||
|
|
46a898ffda | ||
|
|
c6f9fb922d | ||
|
|
fc82790db8 |
@@ -569,7 +569,8 @@ def html_to_text(html_content: str, render_anchor_tag_content=False, is_rss=Fals
|
||||
# Strip bloat in one pass, SPA's often dump 10Mb+ into the <head> for styles, which is not needed
|
||||
# Causing inscriptis to silently exit when more than ~10MB is found.
|
||||
# All we are doing here is converting the HTML to text, no CSS layout etc
|
||||
html_content = re.sub(r'<(?:style|script|svg|noscript)[^>]*>.*?</(?:style|script|svg|noscript)>|<(?:link|meta)[^>]*/?>|<!--.*?-->',
|
||||
# Use backreference (\1) to ensure opening/closing tags match (prevents <style> matching </svg> in CSS data URIs)
|
||||
html_content = re.sub(r'<(style|script|svg|noscript)[^>]*>.*?</\1>|<(?:link|meta)[^>]*/?>|<!--.*?-->',
|
||||
'', html_content, flags=re.DOTALL | re.IGNORECASE)
|
||||
|
||||
# SPAs often use <body style="display:none"> to hide content until JS loads
|
||||
|
||||
@@ -362,6 +362,95 @@ class TestHtmlToText(unittest.TestCase):
|
||||
|
||||
print(" ✓ All display:none body tag tests passed")
|
||||
|
||||
def test_style_tag_with_svg_data_uri(self):
|
||||
"""
|
||||
Test that style tags containing SVG data URIs are properly stripped.
|
||||
|
||||
Some WordPress and modern sites embed SVG as data URIs in CSS, which contains
|
||||
<svg> and </svg> tags within the style content. The regex must use backreferences
|
||||
to ensure <style> matches </style> (not </svg> inside the CSS).
|
||||
|
||||
This was causing errors where the regex would match <style> and stop at the first
|
||||
</svg> it encountered inside a CSS data URI, breaking the HTML structure.
|
||||
"""
|
||||
# Real-world example from WordPress wp-block-image styles
|
||||
html = '''<!DOCTYPE html>
|
||||
<html>
|
||||
<head>
|
||||
<style id='wp-block-image-inline-css'>
|
||||
.wp-block-image>a,.wp-block-image>figure>a{display:inline-block}.wp-block-image img{box-sizing:border-box;height:auto;max-width:100%;vertical-align:bottom}@supports ((-webkit-mask-image:none) or (mask-image:none)) or (-webkit-mask-image:none){.wp-block-image.is-style-circle-mask img{border-radius:0;-webkit-mask-image:url('data:image/svg+xml;utf8,<svg viewBox="0 0 100 100" xmlns="http://www.w3.org/2000/svg"><circle cx="50" cy="50" r="50"/></svg>');mask-image:url('data:image/svg+xml;utf8,<svg viewBox="0 0 100 100" xmlns="http://www.w3.org/2000/svg"><circle cx="50" cy="50" r="50"/></svg>');mask-mode:alpha}}
|
||||
</style>
|
||||
</head>
|
||||
<body>
|
||||
<h1>Test Heading</h1>
|
||||
<p>This is the actual content that should be extracted.</p>
|
||||
<div class="wp-block-image">
|
||||
<img src="test.jpg" alt="Test image">
|
||||
</div>
|
||||
</body>
|
||||
</html>'''
|
||||
|
||||
# This should not crash and should extract the body content
|
||||
text = html_to_text(html)
|
||||
|
||||
# Verify the actual body content was extracted
|
||||
assert text is not None, "html_to_text returned None"
|
||||
assert len(text) > 0, "html_to_text returned empty string"
|
||||
assert 'Test Heading' in text, "Failed to extract heading"
|
||||
assert 'actual content that should be extracted' in text, "Failed to extract paragraph"
|
||||
|
||||
# Verify CSS content was stripped (including the SVG data URI)
|
||||
assert '.wp-block-image' not in text, "CSS class selector leaked into text"
|
||||
assert 'mask-image' not in text, "CSS property leaked into text"
|
||||
assert 'data:image/svg+xml' not in text, "SVG data URI leaked into text"
|
||||
assert 'viewBox' not in text, "SVG attributes leaked into text"
|
||||
|
||||
# Verify no broken HTML structure
|
||||
assert '<style' not in text, "Unclosed style tag in output"
|
||||
assert '</svg>' not in text, "SVG closing tag leaked into text"
|
||||
|
||||
print(" ✓ Style tag with SVG data URI test passed")
|
||||
|
||||
def test_style_tag_closes_correctly(self):
|
||||
"""
|
||||
Test that each tag type (style, script, svg) closes with the correct closing tag.
|
||||
|
||||
Before the fix, the regex used (?:style|script|svg|noscript) for both opening and
|
||||
closing tags, which meant <style> could incorrectly match </svg> as its closing tag.
|
||||
With backreferences, <style> must close with </style>, <svg> with </svg>, etc.
|
||||
"""
|
||||
# Test nested tags where incorrect matching would break
|
||||
html = '''<!DOCTYPE html>
|
||||
<html>
|
||||
<head>
|
||||
<style>
|
||||
body { background: url('data:image/svg+xml,<svg><rect/></svg>'); }
|
||||
</style>
|
||||
<script>
|
||||
const svg = '<svg><path d="M0,0"/></svg>';
|
||||
</script>
|
||||
</head>
|
||||
<body>
|
||||
<h1>Content</h1>
|
||||
<svg><circle cx="50" cy="50" r="40"/></svg>
|
||||
<p>After SVG</p>
|
||||
</body>
|
||||
</html>'''
|
||||
|
||||
text = html_to_text(html)
|
||||
|
||||
# Should extract body content
|
||||
assert 'Content' in text, "Failed to extract heading"
|
||||
assert 'After SVG' in text, "Failed to extract content after SVG"
|
||||
|
||||
# Should strip all style/script/svg content
|
||||
assert 'background:' not in text, "Style content leaked"
|
||||
assert 'const svg' not in text, "Script content leaked"
|
||||
assert '<circle' not in text, "SVG element leaked"
|
||||
assert 'data:image/svg+xml' not in text, "Data URI leaked"
|
||||
|
||||
print(" ✓ Tag closing validation test passed")
|
||||
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
|
||||
Reference in New Issue
Block a user