mirror of
https://github.com/dgtlmoon/changedetection.io.git
synced 2026-05-01 15:20:33 +00:00
628 lines
24 KiB
Python
628 lines
24 KiB
Python
#!/usr/bin/env python3
|
||
# coding=utf-8
|
||
|
||
"""Unit tests for html_tools.html_to_text function."""
|
||
|
||
import hashlib
|
||
import threading
|
||
import unittest
|
||
from queue import Queue
|
||
|
||
from changedetectionio.html_tools import html_to_text
|
||
|
||
|
||
class TestHtmlToText(unittest.TestCase):
    """Exercise ``html_to_text`` for correctness, tag stripping and thread-safety.

    Every test hands a literal HTML (or RSS) snippet to ``html_to_text`` and
    inspects the returned string with plain ``assert`` statements, so the
    suite runs under both ``pytest`` and ``unittest``.
    """

    def test_basic_text_extraction(self):
        """Heading and paragraph text is returned without any markup."""
        html = '<html><body><h1>Title</h1><p>Paragraph text.</p></body></html>'
        text = html_to_text(html)

        assert 'Title' in text
        assert 'Paragraph text.' in text
        assert '<' not in text  # HTML tags should be stripped
        assert '>' not in text

    def test_empty_html(self):
        """A document with an empty body yields empty or whitespace-only text."""
        html = '<html><body></body></html>'
        text = html_to_text(html)

        assert text.strip() == ''

    def test_nested_elements(self):
        """Text is collected from elements nested several levels deep."""
        html = '''
        <html>
            <body>
                <div>
                    <h1>Header</h1>
                    <div>
                        <p>First paragraph</p>
                        <p>Second paragraph</p>
                    </div>
                </div>
            </body>
        </html>
        '''
        text = html_to_text(html)

        for fragment in ('Header', 'First paragraph', 'Second paragraph'):
            assert fragment in text

    def test_anchor_tag_rendering(self):
        """``render_anchor_tag_content`` toggles whether link targets are rendered."""
        html = '<html><body><a href="https://example.com">Link text</a></body></html>'

        # Without rendering anchors: only the link label is present
        text_without = html_to_text(html, render_anchor_tag_content=False)
        assert 'Link text' in text_without
        assert 'https://example.com' not in text_without

        # With rendering anchors: the label plus either the URL or a bracketed label
        text_with = html_to_text(html, render_anchor_tag_content=True)
        assert 'Link text' in text_with
        assert 'https://example.com' in text_with or '[Link text]' in text_with

    def test_rss_mode(self):
        """With ``is_rss=True`` both the item title and description text are kept."""
        html = '<item><title>RSS Title</title><description>Content</description></item>'

        # is_rss=True should convert <title> to <h1>
        text = html_to_text(html, is_rss=True)

        assert 'RSS Title' in text
        assert 'Content' in text

    def test_special_characters(self):
        """Character entities in the input survive as readable text."""
        # The markup must carry real entities: a literal <special> would be parsed
        # as an (unknown) element and its name would never reach the text output.
        html = '<html><body><p>Test &amp; &lt;special&gt; characters</p></body></html>'
        text = html_to_text(html)

        # 'Test &' is also a prefix of an undecoded 'Test &amp;', so this single
        # check accepts both the decoded and the undecoded rendering.
        assert 'Test &' in text
        assert 'special' in text

    def test_whitespace_handling(self):
        """Separate paragraphs end up on separate lines."""
        html = '<html><body><p>Line 1</p><p>Line 2</p></body></html>'
        text = html_to_text(html)

        assert 'Line 1' in text
        assert 'Line 2' in text
        assert text.count('\n') >= 1  # At least one newline

    def test_deterministic_output(self):
        """Converting the same HTML repeatedly always gives the same text."""
        html = '<html><body><h1>Test</h1><p>Content here</p></body></html>'

        # Convert ten times; a set collapses identical outputs to one entry
        distinct_outputs = {html_to_text(html) for _ in range(10)}

        assert len(distinct_outputs) == 1, "html_to_text should be deterministic"

    def test_thread_safety_determinism(self):
        """
        Test that html_to_text produces deterministic output under high concurrency.

        50 threads each convert the same document 10 times; the MD5 of every
        result is collected and all of them must be identical. Worker exceptions
        are recorded and the number of collected results is checked, so a thread
        that dies part-way cannot make the comparison pass on a partial result set.
        """
        html = '''
        <html>
            <head><title>Test Page</title></head>
            <body>
                <h1>Main Heading</h1>
                <div class="content">
                    <p>First paragraph with <b>bold text</b>.</p>
                    <p>Second paragraph with <i>italic text</i>.</p>
                    <ul>
                        <li>Item 1</li>
                        <li>Item 2</li>
                        <li>Item 3</li>
                    </ul>
                </div>
            </body>
        </html>
        '''

        num_threads = 50
        iterations_per_thread = 10
        results_queue = Queue()
        errors = []

        def worker(worker_id, iterations=iterations_per_thread):
            """Convert the document ``iterations`` times, queueing one MD5 per run."""
            try:
                for i in range(iterations):
                    text = html_to_text(html)
                    md5 = hashlib.md5(text.encode('utf-8')).hexdigest()
                    results_queue.put((worker_id, i, md5))
            except Exception as e:
                # An exception inside a thread never reaches the test runner on
                # its own; keep it so the assertions below can report it.
                errors.append(e)

        # Launch many threads simultaneously
        threads = []
        for i in range(num_threads):
            t = threading.Thread(target=worker, args=(i,))
            threads.append(t)
            t.start()

        # Wait for all threads to complete
        for t in threads:
            t.join()

        # Collect all MD5 results (safe to poll empty(): every producer has finished)
        md5_values = []
        while not results_queue.empty():
            _, _, md5 = results_queue.get()
            md5_values.append(md5)

        assert not errors, f"Thread-safety errors occurred: {errors}"

        expected_total = num_threads * iterations_per_thread
        assert len(md5_values) == expected_total, (
            f"Expected {expected_total} conversions, collected {len(md5_values)}"
        )

        # All MD5s should be identical
        unique_md5s = set(md5_values)

        assert len(unique_md5s) == 1, (
            f"Thread-safety issue detected! Found {len(unique_md5s)} different MD5 values: {unique_md5s}. "
            "The thread-local parser fix may not be working correctly."
        )

        print(f"✓ Thread-safety test passed: {len(md5_values)} conversions, all identical")

    def test_thread_safety_basic(self):
        """Ten threads convert the same snippet at once without errors or differing output."""
        results = []
        errors = []

        def worker():
            """Convert one snippet, recording either the text or the exception raised."""
            try:
                html = '<html><body><h1>Test</h1><p>Content</p></body></html>'
                text = html_to_text(html)
                results.append(text)
            except Exception as e:
                errors.append(e)

        # Launch 10 threads simultaneously
        threads = [threading.Thread(target=worker) for _ in range(10)]
        for t in threads:
            t.start()
        for t in threads:
            t.join()

        # Should have no errors
        assert len(errors) == 0, f"Thread-safety errors occurred: {errors}"

        # All results should be identical
        assert len(set(results)) == 1, "All threads should produce identical output"

        print(f"✓ Basic thread-safety test passed: {len(results)} threads, no errors")

    def test_large_html_with_bloated_head(self):
        """
        Test that html_to_text can handle large HTML documents with massive <head> bloat.

        SPAs often dump 10MB+ of styles, scripts, and other bloat into the <head> section.
        This can cause inscriptis to silently exit when processing very large documents.
        The fix strips <style>, <script>, <svg>, <noscript>, <link>, <meta>, and HTML comments
        before processing, allowing extraction of actual body content.
        """
        # Massive style block: 19 chars x 200,000 (~3.8MB)
        large_style = '<style>' + '.class{color:red;}\n' * 200000 + '</style>\n'

        # Massive script block: 22 chars x 200,000 (~4.4MB)
        large_script = '<script>' + 'console.log("bloat");\n' * 200000 + '</script>\n'

        # SVG bloat: 37 chars x 50,000 (~1.9MB)
        svg_bloat = '<svg><path d="M0,0 L100,100"/></svg>\n' * 50000

        # meta/link tags: ~2.1MB each
        meta_bloat = '<meta name="description" content="bloat"/>\n' * 50000
        link_bloat = '<link rel="stylesheet" href="bloat.css"/>\n' * 50000

        # HTML comments (~1.2MB)
        comment_bloat = '<!-- This is bloat -->\n' * 50000

        # noscript bloat (~0.4MB)
        noscript_bloat = '<noscript>Enable JavaScript</noscript>\n' * 10000

        # Build the large HTML document
        html = f'''<!DOCTYPE html>
<html>
<head>
    <title>Test Page</title>
    {large_style}
    {large_script}
    {svg_bloat}
    {meta_bloat}
    {link_bloat}
    {comment_bloat}
    {noscript_bloat}
</head>
<body>
    <h1>Important Heading</h1>
    <p>This is the actual content that should be extracted.</p>
    <div>
        <p>First paragraph with meaningful text.</p>
        <p>Second paragraph with more content.</p>
    </div>
    <footer>Footer text</footer>
</body>
</html>
'''

        # Verify the HTML is actually large. NOTE: the multipliers above add up to
        # roughly 15.1 MiB, only just over this floor - shrinking any of them will
        # trip the assertion.
        html_size_mb = len(html) / (1024 * 1024)
        assert html_size_mb > 15, f"HTML should be >15MB, got {html_size_mb:.2f}MB"

        print(f" Testing {html_size_mb:.2f}MB HTML document with bloated head...")

        # This should not crash or silently exit
        text = html_to_text(html)

        # Verify we got actual text output (not empty/None)
        assert text is not None, "html_to_text returned None"
        assert len(text) > 0, "html_to_text returned empty string"

        # Verify the actual body content was extracted
        assert 'Important Heading' in text, "Failed to extract heading"
        assert 'actual content that should be extracted' in text, "Failed to extract paragraph"
        assert 'First paragraph with meaningful text' in text, "Failed to extract first paragraph"
        assert 'Second paragraph with more content' in text, "Failed to extract second paragraph"
        assert 'Footer text' in text, "Failed to extract footer"

        # Verify bloat was stripped (output should be tiny compared to input)
        text_size_kb = len(text) / 1024
        assert text_size_kb < 1, f"Output too large ({text_size_kb:.2f}KB), bloat not stripped"

        # Verify no CSS, script content, or SVG leaked through
        assert 'color:red' not in text, "Style content leaked into text output"
        assert 'console.log' not in text, "Script content leaked into text output"
        assert '<path' not in text, "SVG content leaked into text output"
        assert 'bloat.css' not in text, "Link href leaked into text output"

        print(f" ✓ Successfully processed {html_size_mb:.2f}MB HTML -> {text_size_kb:.2f}KB text")

    def test_body_display_none_spa_pattern(self):
        """
        Test that html_to_text can extract content from pages with display:none body.

        SPAs (Single Page Applications) often use <body style="display:none"> to hide content
        until JavaScript loads and renders the page. inscriptis respects CSS display rules,
        so without preprocessing, it would skip all content and return only newlines.

        The fix strips display:none and visibility:hidden styles from the body tag before
        processing, allowing text extraction from client-side rendered applications.
        """
        # Test case 1: Basic display:none
        html1 = '''<!DOCTYPE html>
<html lang="en">
<head><title>What's New – Fluxguard</title></head>
<body style="display:none">
    <h1>Important Heading</h1>
    <p>This is actual content that should be extracted.</p>
    <div>
        <p>First paragraph with meaningful text.</p>
        <p>Second paragraph with more content.</p>
    </div>
</body>
</html>'''

        text1 = html_to_text(html1)

        # Before fix: would return ~33 newlines, len(text) ~= 33
        # After fix: should extract actual content, len(text) > 100
        assert len(text1) > 100, f"Expected substantial text output, got {len(text1)} chars"
        assert 'Important Heading' in text1, "Failed to extract heading from display:none body"
        assert 'actual content' in text1, "Failed to extract paragraph from display:none body"
        assert 'First paragraph' in text1, "Failed to extract nested content"

        # Should not be mostly newlines
        newline_ratio = text1.count('\n') / len(text1)
        assert newline_ratio < 0.5, f"Output is mostly newlines ({newline_ratio:.2%}), content not extracted"

        # Remaining variants: (markup, [(expected fragment, failure message), ...])
        body_variants = [
            # Test case 2: visibility:hidden (another hiding pattern)
            (
                '<html><body style="visibility:hidden"><h1>Hidden Content</h1><p>Test paragraph.</p></body></html>',
                [
                    ('Hidden Content', "Failed to extract content from visibility:hidden body"),
                    ('Test paragraph', "Failed to extract paragraph from visibility:hidden body"),
                ],
            ),
            # Test case 3: Mixed styles (display:none with other CSS)
            (
                '<html><body style="color: red; display:none; font-size: 12px"><p>Mixed style content</p></body></html>',
                [('Mixed style content', "Failed to extract content from body with mixed styles")],
            ),
            # Test case 4: Case insensitivity (DISPLAY:NONE uppercase)
            (
                '<html><body style="DISPLAY:NONE"><p>Uppercase style</p></body></html>',
                [('Uppercase style', "Failed to handle uppercase DISPLAY:NONE")],
            ),
            # Test case 5: Space variations (display: none vs display:none)
            (
                '<html><body style="display: none"><p>With spaces</p></body></html>',
                [('With spaces', "Failed to handle 'display: none' with space")],
            ),
            # Test case 6: Body with other attributes (class, id)
            (
                '<html><body class="foo" style="display:none" id="bar"><p>With attributes</p></body></html>',
                [('With attributes', "Failed to extract from body with multiple attributes")],
            ),
            # Test case 7: opacity:0 does not hide content from inscriptis,
            # so the content should simply be there
            (
                '<html><body style="opacity:0"><p>Transparent content</p></body></html>',
                [('Transparent content', "Incorrectly stripped opacity:0 style")],
            ),
        ]

        for markup, expectations in body_variants:
            extracted = html_to_text(markup)
            for fragment, failure_message in expectations:
                assert fragment in extracted, failure_message

        print(" ✓ All display:none body tag tests passed")

    def test_style_tag_with_svg_data_uri(self):
        """
        Test that style tags containing SVG data URIs are properly stripped.

        Some WordPress and modern sites embed SVG as data URIs in CSS, which contains
        <svg> and </svg> tags within the style content. The regex must use backreferences
        to ensure <style> matches </style> (not </svg> inside the CSS).

        This was causing errors where the regex would match <style> and stop at the first
        </svg> it encountered inside a CSS data URI, breaking the HTML structure.
        """
        # Real-world example from WordPress wp-block-image styles
        html = '''<!DOCTYPE html>
<html>
<head>
<style id='wp-block-image-inline-css'>
.wp-block-image>a,.wp-block-image>figure>a{display:inline-block}.wp-block-image img{box-sizing:border-box;height:auto;max-width:100%;vertical-align:bottom}@supports ((-webkit-mask-image:none) or (mask-image:none)) or (-webkit-mask-image:none){.wp-block-image.is-style-circle-mask img{border-radius:0;-webkit-mask-image:url('data:image/svg+xml;utf8,<svg viewBox="0 0 100 100" xmlns="http://www.w3.org/2000/svg"><circle cx="50" cy="50" r="50"/></svg>');mask-image:url('data:image/svg+xml;utf8,<svg viewBox="0 0 100 100" xmlns="http://www.w3.org/2000/svg"><circle cx="50" cy="50" r="50"/></svg>');mask-mode:alpha}}
</style>
</head>
<body>
    <h1>Test Heading</h1>
    <p>This is the actual content that should be extracted.</p>
    <div class="wp-block-image">
        <img src="test.jpg" alt="Test image">
    </div>
</body>
</html>'''

        # This should not crash and should extract the body content
        text = html_to_text(html)

        # Verify the actual body content was extracted
        assert text is not None, "html_to_text returned None"
        assert len(text) > 0, "html_to_text returned empty string"
        assert 'Test Heading' in text, "Failed to extract heading"
        assert 'actual content that should be extracted' in text, "Failed to extract paragraph"

        # Verify CSS content was stripped (including the SVG data URI)
        assert '.wp-block-image' not in text, "CSS class selector leaked into text"
        assert 'mask-image' not in text, "CSS property leaked into text"
        assert 'data:image/svg+xml' not in text, "SVG data URI leaked into text"
        assert 'viewBox' not in text, "SVG attributes leaked into text"

        # Verify no broken HTML structure
        assert '<style' not in text, "Unclosed style tag in output"
        assert '</svg>' not in text, "SVG closing tag leaked into text"

        print(" ✓ Style tag with SVG data URI test passed")

    def test_style_tag_closes_correctly(self):
        """
        Test that each tag type (style, script, svg) closes with the correct closing tag.

        Before the fix, the regex used (?:style|script|svg|noscript) for both opening and
        closing tags, which meant <style> could incorrectly match </svg> as its closing tag.
        With backreferences, <style> must close with </style>, <svg> with </svg>, etc.
        """
        # Test nested tags where incorrect matching would break
        html = '''<!DOCTYPE html>
<html>
<head>
<style>
body { background: url('data:image/svg+xml,<svg><rect/></svg>'); }
</style>
<script>
const svg = '<svg><path d="M0,0"/></svg>';
</script>
</head>
<body>
    <h1>Content</h1>
    <svg><circle cx="50" cy="50" r="40"/></svg>
    <p>After SVG</p>
</body>
</html>'''

        text = html_to_text(html)

        # Should extract body content
        assert 'Content' in text, "Failed to extract heading"
        assert 'After SVG' in text, "Failed to extract content after SVG"

        # Should strip all style/script/svg content
        assert 'background:' not in text, "Style content leaked"
        assert 'const svg' not in text, "Script content leaked"
        assert '<circle' not in text, "SVG element leaked"
        assert 'data:image/svg+xml' not in text, "Data URI leaked"

        print(" ✓ Tag closing validation test passed")

    def test_script_with_closing_tag_in_string_does_not_eat_content(self):
        """
        A script whose JS string holds an escaped closing script tag must be removed as a whole.

        This is the classic regex failure mode: the old pattern would find the first closing
        tag inside the JS string literal and stop there, leaving the tail of the script block
        (plus any following content) exposed as raw text. BS4 parses the HTML correctly.
        """
        html = '''<html><body>
<p>Before script</p>
<script>
var html = "<div>foo<\\/script><p>bar</p>";
var also = 1;
</script>
<p>AFTER SCRIPT</p>
</body></html>'''

        text = html_to_text(html)

        assert 'Before script' in text
        assert 'AFTER SCRIPT' in text
        # Script internals must not leak
        assert 'var html' not in text
        assert 'var also' not in text

    def test_content_sandwiched_between_multiple_body_scripts(self):
        """Content between multiple script/style blocks in the body must all survive."""
        html = '''<html><body>
<script>var a = 1;</script>
<p>CONTENT A</p>
<style>.x { color: red; }</style>
<p>CONTENT B</p>
<script>var b = 2;</script>
<p>CONTENT C</p>
<style>.y { color: blue; }</style>
<p>CONTENT D</p>
</body></html>'''

        text = html_to_text(html)

        for label in ['CONTENT A', 'CONTENT B', 'CONTENT C', 'CONTENT D']:
            assert label in text, f"'{label}' was eaten by script/style stripping"

        for leaked in ('var a', 'var b', 'color: red', 'color: blue'):
            assert leaked not in text

    def test_unicode_and_international_content_preserved(self):
        """Non-ASCII content (umlauts, CJK, soft hyphens) must survive stripping."""
        html = '''<html><body>
<style>.x{color:red}</style>
<p>German: Aus\xadge\xadbucht! — ANMELDUNG — Fan\xadday 2026</p>
<p>Chinese: \u6ce8\u518c</p>
<p>Japanese: \u767b\u9332</p>
<p>Korean: \ub4f1\ub85d</p>
<p>Emoji: \U0001f4e2</p>
<script>var x = 1;</script>
</body></html>'''

        text = html_to_text(html)

        assert 'ANMELDUNG' in text
        assert '\u6ce8\u518c' in text  # Chinese
        assert '\u767b\u9332' in text  # Japanese
        assert '\ub4f1\ub85d' in text  # Korean

    def test_style_with_type_attribute_is_stripped(self):
        """<style type="text/css"> (with type attribute) must be stripped just like bare <style>."""
        html = '''<html><body>
<style type="text/css">.important { display: none; }</style>
<p>VISIBLE CONTENT</p>
</body></html>'''

        text = html_to_text(html)

        assert 'VISIBLE CONTENT' in text
        assert '.important' not in text
        assert 'display: none' not in text

    def test_ldjson_script_is_stripped(self):
        """<script type="application/ld+json"> must be stripped — raw JSON must not appear as text."""
        html = '''<html><body>
<script type="application/ld+json">
{"@type": "Product", "name": "Widget", "price": "9.99"}
</script>
<p>PRODUCT PAGE</p>
</body></html>'''

        text = html_to_text(html)

        assert 'PRODUCT PAGE' in text
        assert '@type' not in text
        assert '"price"' not in text

    def test_inline_svg_is_stripped_entirely(self):
        """
        Inline SVG elements in the body are stripped by BS4 before passing to inscriptis.

        SVGs can be huge (icon libraries, data visualisations) and produce garbage path-data
        text. The old regex code explicitly stripped <svg>; the BS4 path must do the same.
        """
        html = '''<html><body>
<p>Before SVG</p>
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24">
  <path d="M14 5L7 12L14 19Z" fill="none"/>
  <circle cx="12" cy="12" r="10"/>
</svg>
<p>After SVG</p>
</body></html>'''

        text = html_to_text(html)

        assert 'Before SVG' in text
        assert 'After SVG' in text
        assert 'M14 5L7' not in text, "SVG path data should not appear in text output"
        assert 'viewBox' not in text, "SVG attributes should not appear in text output"

    def test_tag_inside_json_data_attribute_does_not_eat_content(self):
        r"""
        Tags inside JSON data attributes with JS-escaped closing tags must not eat real content.

        Real-world case: Elementor/JetEngine WordPress widgets embed HTML (including SVG icons)
        inside JSON data attributes like data-slider-atts. The HTML inside is JS-escaped, so
        closing tags appear as <\/svg> rather than </svg>.

        The old regex approach would find <svg> inside the attribute value, then fail to find
        <\/svg> as a matching close tag, and scan forward to the next real </svg> in the DOM —
        eating tens of kilobytes of actual page content in the process.
        """
        # The JSON's double quotes are written as &quot; so the whole JSON text,
        # including the embedded <svg>, stays inside the attribute value.
        html = '''<!DOCTYPE html>
<html>
<head><title>Test</title></head>
<body>
<div class="slider" data-slider-atts="{&quot;prevArrow&quot;:&quot;<i class=\\&quot;icon\\&quot;><svg width=\\&quot;24\\&quot; height=\\&quot;24\\&quot; viewBox=\\&quot;0 0 24 24\\&quot; xmlns=\\&quot;http:\\/\\/www.w3.org\\/2000\\/svg\\&quot;><path d=\\&quot;M14 5L7 12L14 19\\&quot;\\/><\\/svg><\\/i>&quot;}">
</div>
<div class="content">
    <h1>IMPORTANT CONTENT</h1>
    <p>This text must not be eaten by the tag-stripping logic.</p>
</div>
<svg><circle cx="50" cy="50" r="40"/></svg>
</body>
</html>'''

        text = html_to_text(html)

        assert 'IMPORTANT CONTENT' in text, (
            "Content after a JS-escaped tag in a data attribute was incorrectly stripped. "
            "The tag-stripping logic is matching <tag> inside attribute values and scanning "
            "forward to the next real closing tag in the DOM."
        )
        assert 'This text must not be eaten' in text

    def test_script_inside_json_data_attribute_does_not_eat_content(self):
        """Same attribute-embedding scenario, but with a <script> and a JS-escaped closing tag."""
        # As in the SVG variant, &quot; keeps the JSON (and its <script>) inside the attribute.
        html = '''<!DOCTYPE html>
<html>
<head><title>Test</title></head>
<body>
<div data-config="{&quot;template&quot;:&quot;<script type=\\&quot;text\\/javascript\\&quot;>var x=1;<\\/script>&quot;}">
</div>
<div>
    <h1>MUST SURVIVE</h1>
    <p>Real content after the data attribute with embedded script tag.</p>
</div>
<script>var real = 1;</script>
</body>
</html>'''

        text = html_to_text(html)

        assert 'MUST SURVIVE' in text, (
            "Content after a JS-escaped <script> in a data attribute was incorrectly stripped."
        )
        assert 'Real content after the data attribute' in text
|
||
|
||
|
||
if __name__ == "__main__":
    # Running the module as a script hands control to unittest's own runner,
    # which discovers and executes the test cases defined in this file.
    unittest.main()
|