#!/usr/bin/env python3
# coding=utf-8
"""Unit tests for html_tools.html_to_text function.

NOTE(review): this file was recovered from a mangled copy in which the HTML
markup inside the fixture string literals had been stripped.  The assertions
and test logic survived intact; the HTML fixtures below were reconstructed
from those assertions and the test docstrings.  Confirm exact fixture content
against version control before relying on it.
"""

import hashlib
import threading
import unittest
from queue import Queue

from changedetectionio.html_tools import html_to_text


class TestHtmlToText(unittest.TestCase):
    """Test html_to_text function for correctness and thread-safety."""

    def test_basic_text_extraction(self):
        """Test basic HTML to text conversion."""
        html = '<html><body><h1>Title</h1><p>Paragraph text.</p></body></html>'
        text = html_to_text(html)
        assert 'Title' in text
        assert 'Paragraph text.' in text
        assert '<' not in text  # HTML tags should be stripped
        assert '>' not in text

    def test_empty_html(self):
        """Test handling of empty HTML."""
        html = ''
        text = html_to_text(html)
        # Should return empty or whitespace only
        assert text.strip() == ''

    def test_nested_elements(self):
        """Test extraction from nested HTML elements."""
        html = '''<html>
        <body>
            <div>
                <h1>Header</h1>
                <div>
                    <p>First paragraph</p>
                    <p>Second paragraph</p>
                </div>
            </div>
        </body>
        </html>'''
        text = html_to_text(html)
        assert 'Header' in text
        assert 'First paragraph' in text
        assert 'Second paragraph' in text

    def test_anchor_tag_rendering(self):
        """Test anchor tag rendering option."""
        html = '<html><body><a href="https://example.com">Link text</a></body></html>'
        # Without rendering anchors the href must not appear in the output
        text_without = html_to_text(html, render_anchor_tag_content=False)
        assert 'Link text' in text_without
        assert 'https://example.com' not in text_without
        # With rendering anchors the href (or a bracketed form) must appear
        text_with = html_to_text(html, render_anchor_tag_content=True)
        assert 'Link text' in text_with
        assert 'https://example.com' in text_with or '[Link text]' in text_with

    def test_rss_mode(self):
        """Test RSS mode converts title tags to h1."""
        html = '<rss><channel><item><title>RSS Title</title><description>Content</description></item></channel></rss>'
        # is_rss=True should convert <title> to <h1>
        text = html_to_text(html, is_rss=True)
        assert 'RSS Title' in text
        assert 'Content' in text

    def test_special_characters(self):
        """Test handling of special characters and entities."""
        html = '<html><body><p>Test &amp; <special> characters</p></body></html>'
        text = html_to_text(html)
        # Entities should be decoded
        assert 'Test &' in text or 'Test &amp;' in text
        assert 'special' in text

    def test_whitespace_handling(self):
        """Test that whitespace is properly handled."""
        html = '<html><body><p>Line 1</p><p>Line 2</p></body></html>'
        text = html_to_text(html)
        # Should have some separation between lines
        assert 'Line 1' in text
        assert 'Line 2' in text
        assert text.count('\n') >= 1  # At least one newline

    def test_deterministic_output(self):
        """Test that the same HTML always produces the same text."""
        html = '<html><body><h1>Test</h1><p>Content here</p></body></html>'
        # Extract text multiple times
        results = [html_to_text(html) for _ in range(10)]
        # All results should be identical
        assert len(set(results)) == 1, "html_to_text should be deterministic"

    def test_thread_safety_determinism(self):
        """
        Test that html_to_text produces deterministic output under high concurrency.

        This verifies that lxml's default parser (used by inscriptis.get_text)
        is thread-safe and produces consistent results when called from
        multiple threads simultaneously.
        """
        html = '''
        <html>
        <head><title>Test Page</title></head>
        <body>
            <h1>Main Heading</h1>
            <p>First paragraph with <b>bold</b> text.</p>
            <p>Second paragraph with <i>italic</i> text.</p>
        </body>
        </html>
        '''
        results_queue = Queue()

        def worker(worker_id, iterations=10):
            """Worker that converts HTML to text multiple times."""
            for i in range(iterations):
                text = html_to_text(html)
                md5 = hashlib.md5(text.encode('utf-8')).hexdigest()
                results_queue.put((worker_id, i, md5))

        # Launch many threads simultaneously
        num_threads = 50
        threads = []
        for i in range(num_threads):
            t = threading.Thread(target=worker, args=(i,))
            threads.append(t)
            t.start()

        # Wait for all threads to complete
        for t in threads:
            t.join()

        # Collect all MD5 results
        md5_values = []
        while not results_queue.empty():
            _, _, md5 = results_queue.get()
            md5_values.append(md5)

        # All MD5s should be identical
        unique_md5s = set(md5_values)
        assert len(unique_md5s) == 1, (
            f"Thread-safety issue detected! Found {len(unique_md5s)} different MD5 values: {unique_md5s}. "
            "The thread-local parser fix may not be working correctly."
        )
        print(f"✓ Thread-safety test passed: {len(md5_values)} conversions, all identical")

    def test_thread_safety_basic(self):
        """Verify basic thread safety - multiple threads can call html_to_text simultaneously."""
        results = []
        errors = []

        def worker():
            """Worker that converts HTML to text."""
            try:
                html = '<html><body><h1>Test</h1><p>Content</p></body></html>'
                text = html_to_text(html)
                results.append(text)
            except Exception as e:
                errors.append(e)

        # Launch 10 threads simultaneously
        threads = [threading.Thread(target=worker) for _ in range(10)]
        for t in threads:
            t.start()
        for t in threads:
            t.join()

        # Should have no errors
        assert len(errors) == 0, f"Thread-safety errors occurred: {errors}"
        # All results should be identical
        assert len(set(results)) == 1, "All threads should produce identical output"
        print(f"✓ Basic thread-safety test passed: {len(results)} threads, no errors")

    def test_large_html_with_bloated_head(self):
        """
        Test that html_to_text can handle large HTML documents with massive bloat.

        SPAs often dump 10MB+ of styles, scripts, and other bloat into the
        <head> section.  This can cause inscriptis to silently exit when
        processing very large documents.  The fix strips <style>, <script>,
        <svg>, <noscript>, <meta>, <link> tags and HTML comments before the
        text extraction step.
        """
        # Generate massive style block (~10MB)
        large_style = '<style>' + '.bloat { color:red; font-size:12px; } ' * 260000 + '</style>\n'
        # Generate massive script block (~5MB)
        large_script = '<script>' + 'console.log("bloat padding payload");' * 140000 + '</script>\n'
        # Generate lots of SVG bloat (~3MB)
        svg_bloat = '<svg viewBox="0 0 100 100"><path d="M0 0 L100 100 Z"/></svg>\n' * 50000
        # Generate meta/link tags (~2MB each)
        meta_bloat = '<meta name="generator" content="bloated-spa-framework">\n' * 50000
        link_bloat = '<link rel="preload" href="/static/chunk.js" as="script">\n' * 50000
        # Generate HTML comments (~1MB)
        comment_bloat = '<!-- webpack build artifact comment -->\n' * 50000
        # Generate noscript bloat
        noscript_bloat = '<noscript><div>Please enable JavaScript</div></noscript>\n' * 10000

        # Build the large HTML document
        html = f'''<html>
<head>
<title>Test Page</title>
{large_style}
{large_script}
{svg_bloat}
{meta_bloat}
{link_bloat}
{comment_bloat}
{noscript_bloat}
</head>
<body>
    <h1>Important Heading</h1>
    <p>This is the actual content that should be extracted.</p>
    <p>First paragraph with meaningful text.</p>
    <p>Second paragraph with more content.</p>
    <footer>Footer text</footer>
</body>
</html>'''

        # Verify the HTML is actually large (should be ~20MB+)
        html_size_mb = len(html) / (1024 * 1024)
        assert html_size_mb > 15, f"HTML should be >15MB, got {html_size_mb:.2f}MB"
        print(f" Testing {html_size_mb:.2f}MB HTML document with bloated head...")

        # This should not crash or silently exit
        text = html_to_text(html)

        # Verify we got actual text output (not empty/None)
        assert text is not None, "html_to_text returned None"
        assert len(text) > 0, "html_to_text returned empty string"

        # Verify the actual body content was extracted
        assert 'Important Heading' in text, "Failed to extract heading"
        assert 'actual content that should be extracted' in text, "Failed to extract paragraph"
        assert 'First paragraph with meaningful text' in text, "Failed to extract first paragraph"
        assert 'Second paragraph with more content' in text, "Failed to extract second paragraph"
        assert 'Footer text' in text, "Failed to extract footer"

        # Verify bloat was stripped (output should be tiny compared to input)
        text_size_kb = len(text) / 1024
        assert text_size_kb < 1, f"Output too large ({text_size_kb:.2f}KB), bloat not stripped"

        # Verify no CSS, script content, or SVG leaked through
        assert 'color:red' not in text, "Style content leaked into text output"
        assert 'console.log' not in text, "Script content leaked into text output"
        assert '<svg' not in text, "SVG markup leaked into text output"
        print(f" ✓ Large HTML test passed: {html_size_mb:.2f}MB HTML -> {text_size_kb:.2f}KB text")

    def test_body_display_none_spa_pattern(self):
        """
        Test that html_to_text can extract content from pages with display:none body.

        SPAs (Single Page Applications) often use <body style="display:none">
        to hide content until JavaScript loads and renders the page.
        inscriptis respects CSS display rules, so without preprocessing, it
        would skip all content and return only newlines.

        The fix strips display:none and visibility:hidden styles from the body
        tag before processing, allowing text extraction from client-side
        rendered applications.
        """
        # Test case 1: Basic display:none
        html1 = '''<html>
        <head><title>What's New – Fluxguard</title></head>
        <body style="display:none">
            <h1>Important Heading</h1>
            <p>This is actual content that should be extracted.</p>
            <div>
                <p>First paragraph with meaningful text.</p>
                <p>Second paragraph with more content.</p>
            </div>
        </body>
        </html>'''
        text1 = html_to_text(html1)
        # Before fix: would return ~33 newlines, len(text) ~= 33
        # After fix: should extract actual content, len(text) > 100
        assert len(text1) > 100, f"Expected substantial text output, got {len(text1)} chars"
        assert 'Important Heading' in text1, "Failed to extract heading from display:none body"
        assert 'actual content' in text1, "Failed to extract paragraph from display:none body"
        assert 'First paragraph' in text1, "Failed to extract nested content"
        # Should not be mostly newlines
        newline_ratio = text1.count('\n') / len(text1)
        assert newline_ratio < 0.5, f"Output is mostly newlines ({newline_ratio:.2%}), content not extracted"

        # Test case 2: visibility:hidden (another hiding pattern)
        html2 = '<html><body style="visibility:hidden"><h1>Hidden Content</h1><p>Test paragraph.</p></body></html>'
        text2 = html_to_text(html2)
        assert 'Hidden Content' in text2, "Failed to extract content from visibility:hidden body"
        assert 'Test paragraph' in text2, "Failed to extract paragraph from visibility:hidden body"

        # Test case 3: Mixed styles (display:none with other CSS)
        html3 = '<html><body style="margin:0; display:none; padding:0"><p>Mixed style content</p></body></html>'
        text3 = html_to_text(html3)
        assert 'Mixed style content' in text3, "Failed to extract content from body with mixed styles"

        # Test case 4: Case insensitivity (DISPLAY:NONE uppercase)
        html4 = '<html><body style="DISPLAY:NONE"><p>Uppercase style</p></body></html>'
        text4 = html_to_text(html4)
        assert 'Uppercase style' in text4, "Failed to handle uppercase DISPLAY:NONE"

        # Test case 5: Space variations (display: none vs display:none)
        html5 = '<html><body style="display: none"><p>With spaces</p></body></html>'
        text5 = html_to_text(html5)
        assert 'With spaces' in text5, "Failed to handle 'display: none' with space"

        # Test case 6: Body with other attributes (class, id)
        html6 = '<html><body class="main" id="app" style="display:none"><p>With attributes</p></body></html>'
        text6 = html_to_text(html6)
        assert 'With attributes' in text6, "Failed to extract from body with multiple attributes"

        # Test case 7: Should NOT affect opacity:0 (which doesn't hide from inscriptis)
        html7 = '<html><body style="opacity:0"><p>Transparent content</p></body></html>'
        text7 = html_to_text(html7)
        # Opacity doesn't affect inscriptis text extraction, content should be there
        assert 'Transparent content' in text7, "Incorrectly stripped opacity:0 style"

        print(" ✓ All display:none body tag tests passed")

    def test_style_tag_with_svg_data_uri(self):
        """
        Test that style tags containing SVG data URIs are properly stripped.

        Some WordPress and modern sites embed SVG as data URIs in CSS, which
        contains <svg> and <path> tags within the style content.  The regex
        must use backreferences to ensure <style> is closed by </style> (not
        by a tag embedded inside the CSS).  This was causing errors where the
        regex would match tags inside the data URI.
        """
        html = '''<html>
        <head>
        <style>
        .wp-block-image figure {
            mask-image: url('data:image/svg+xml;utf8,<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 2L2 22h20L12 2z"/></svg>');
        }
        </style>
        </head>
        <body>
            <h1>Test Heading</h1>
            <p>This is the actual content that should be extracted.</p>
            <img src="test.jpg" alt="Test image">
        </body>
        </html>'''

        # This should not crash and should extract the body content
        text = html_to_text(html)

        # Verify the actual body content was extracted
        assert text is not None, "html_to_text returned None"
        assert len(text) > 0, "html_to_text returned empty string"
        assert 'Test Heading' in text, "Failed to extract heading"
        assert 'actual content that should be extracted' in text, "Failed to extract paragraph"

        # Verify CSS content was stripped (including the SVG data URI)
        assert '.wp-block-image' not in text, "CSS class selector leaked into text"
        assert 'mask-image' not in text, "CSS property leaked into text"
        assert 'data:image/svg+xml' not in text, "SVG data URI leaked into text"
        assert 'viewBox' not in text, "SVG attributes leaked into text"

        # Verify no broken HTML structure
        assert '</svg>' not in text, "SVG closing tag leaked into text"
        print(" ✓ Style tag with SVG data URI test passed")

    def test_style_tag_closes_correctly(self):
        """
        Test that each tag type (style, script, svg) closes with the correct closing tag.

        Before the fix, the regex used (?:style|script|svg|noscript) for both
        opening and closing tags, which meant <style> could be closed by
        </script>, <svg> by </style>, etc.
        """
        # Test nested tags where incorrect matching would break
        html = '''<html>
        <head>
        <style>body { background: url("image.png"); }</style>
        <script>const svg = '<svg></svg>';</script>
        </head>
        <body>
            <h1>Content</h1>
            <svg viewBox="0 0 10 10"><path d="M0 0 L10 10"/></svg>
            <p>After SVG</p>
        </body>
        </html>'''
        text = html_to_text(html)
        # Should extract body content
        assert 'Content' in text, "Failed to extract heading"
        assert 'After SVG' in text, "Failed to extract content after SVG"
        # Should strip all style/script/svg content
        assert 'background:' not in text, "Style content leaked"
        assert 'const svg' not in text, "Script content leaked"
        assert 'M0 0' not in text, "SVG path data leaked"

    def test_script_with_closing_tag_in_js_string(self):
        """
        A closing tag inside a JS string must not prematurely end the <script> block.

        This is the classic regex failure mode: the old pattern would find the
        first closing tag inside the JS string literal and stop there, leaving
        the tail of the script block (plus any following content) exposed as
        raw text.  BS4 parses the HTML correctly.
        """
        html = '''<html>
        <body>
            <p>Before script</p>
            <script>
            var html = "<div><\\/div>";
            var also = "more script source after the embedded closing tag";
            </script>
            <p>AFTER SCRIPT</p>
        </body>
        </html>'''
        text = html_to_text(html)
        assert 'Before script' in text
        assert 'AFTER SCRIPT' in text
        # Script internals must not leak
        assert 'var html' not in text
        assert 'var also' not in text

    def test_content_sandwiched_between_multiple_body_scripts(self):
        """Content between multiple script/style blocks in the body must all survive."""
        html = '''<html>
        <body>
            <h1>CONTENT A</h1>
            <script>var a = 1;</script>
            <p>CONTENT B</p>
            <style>p { color: red; }</style>
            <p>CONTENT C</p>
            <script>var b = 2;</script>
            <style>div { color: blue; }</style>
            <p>CONTENT D</p>
        </body>
        </html>'''
        text = html_to_text(html)
        for label in ['CONTENT A', 'CONTENT B', 'CONTENT C', 'CONTENT D']:
            assert label in text, f"'{label}' was eaten by script/style stripping"
        assert 'var a' not in text
        assert 'var b' not in text
        assert 'color: red' not in text
        assert 'color: blue' not in text

    def test_unicode_and_international_content_preserved(self):
        """Non-ASCII content (umlauts, CJK, soft hyphens) must survive stripping."""
        html = '''<html>
        <body>
            <script>var stripped = true;</script>
            <p>German: Aus\xadge\xadbucht! — ANMELDUNG — Fan\xadday 2026</p>
            <p>Chinese: \u6ce8\u518c</p>
            <p>Japanese: \u767b\u9332</p>
            <p>Korean: \ub4f1\ub85d</p>
            <p>Emoji: \U0001f4e2</p>
        </body>
        </html>'''
        text = html_to_text(html)
        assert 'ANMELDUNG' in text
        assert '\u6ce8\u518c' in text  # Chinese
        assert '\u767b\u9332' in text  # Japanese
        assert '\ub4f1\ub85d' in text  # Korean

    def test_style_with_type_attribute_is_stripped(self):
        """<style> tags carrying a type attribute must be stripped like bare <style> tags."""
        html = '''<html>
        <body>
            <style type="text/css">.important { display: none; }</style>
            <p>VISIBLE CONTENT</p>
        </body>
        </html>'''
        text = html_to_text(html)
        assert 'VISIBLE CONTENT' in text
        assert '.important' not in text
        assert 'display: none' not in text

    def test_ldjson_script_is_stripped(self):
        """<script type="application/ld+json"> structured data must not leak into the text."""
        html = '''<html>
        <body>
            <script type="application/ld+json">{"@context": "https://schema.org", "@type": "Product", "price": "19.99"}</script>
            <h1>PRODUCT PAGE</h1>
        </body>
        </html>'''
        text = html_to_text(html)
        assert 'PRODUCT PAGE' in text
        assert '@type' not in text
        assert '"price"' not in text

    def test_inline_svg_is_stripped_entirely(self):
        """
        Inline SVG elements in the body are stripped by BS4 before passing to inscriptis.

        SVGs can be huge (icon libraries, data visualisations) and produce
        garbage path-data text.  The old regex code explicitly stripped
        <svg>...</svg>; the BS4 path must do the same.
        """
        html = '''<html>
        <body>
            <p>Before SVG</p>
            <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24">
                <path d="M14 5L7 12L14 19" stroke="black"/>
            </svg>
            <p>After SVG</p>
        </body>
        </html>'''
        text = html_to_text(html)
        assert 'Before SVG' in text
        assert 'After SVG' in text
        assert 'M14 5L7' not in text, "SVG path data should not appear in text output"
        assert 'viewBox' not in text, "SVG attributes should not appear in text output"

    def test_tag_inside_json_data_attribute_does_not_eat_content(self):
        """
        Tags inside JSON data attributes with JS-escaped closing tags must not eat real content.

        Real-world case: Elementor/JetEngine WordPress widgets embed HTML
        (including SVG icons) inside JSON data attributes like
        data-slider-atts.  The HTML inside is JS-escaped, so closing tags
        appear as <\\/svg> rather than </svg>.  The old regex approach would
        find <svg inside the attribute value, then fail to find <\\/svg> as a
        matching close tag, and scan forward to the next real </svg> in the
        DOM — eating tens of kilobytes of actual page content in the process.
        """
        html = '''<html>
        <head><title>Test</title></head>
        <body>
            <div data-slider-atts='{"prev_arrow": "<svg viewBox=\\"0 0 24 24\\"><path d=\\"M0 0\\"><\\/path><\\/svg>"}'>Slider</div>
            <h1>IMPORTANT CONTENT</h1>
            <p>This text must not be eaten by the tag-stripping logic.</p>
            <svg viewBox="0 0 24 24"><path d="M5 5 L10 10"/></svg>
        </body>
        </html>'''
        text = html_to_text(html)
        assert 'IMPORTANT CONTENT' in text, (
            "Content after a JS-escaped <svg> tag in a data attribute was incorrectly stripped. "
            "The tag-stripping logic is matching inside attribute values and scanning "
            "forward to the next real closing tag in the DOM."
        )
        assert 'This text must not be eaten' in text

    def test_script_inside_json_data_attribute_does_not_eat_content(self):
        """Same issue as above but with <script> tags embedded in JSON data attributes."""
        html = '''<html>
        <body>
            <div data-widget-config='{"template": "<script>init();<\\/script>"}'>Widget</div>
            <h1>MUST SURVIVE</h1>
            <p>Trailing content after the widget config.</p>
            <script>var real = 1;</script>
        </body>
        </html>'''
        text = html_to_text(html)
        assert 'MUST SURVIVE' in text, (
            "Content after a JS-escaped </script> tag in a data attribute was "
            "incorrectly stripped."
        )
        assert 'Trailing content' in text
        assert 'var real' not in text


if __name__ == '__main__':
    unittest.main()