First paragraph with bold text.
Second paragraph with italic text.
- Item 1
- Item 2
- Item 3
#!/usr/bin/env python3 # coding=utf-8 """Unit tests for html_tools.html_to_text function.""" import hashlib import threading import unittest from queue import Queue from changedetectionio.html_tools import html_to_text class TestHtmlToText(unittest.TestCase): """Test html_to_text function for correctness and thread-safety.""" def test_basic_text_extraction(self): """Test basic HTML to text conversion.""" html = '
Paragraph text.
' text = html_to_text(html) assert 'Title' in text assert 'Paragraph text.' in text assert '<' not in text # HTML tags should be stripped assert '>' not in text def test_empty_html(self): """Test handling of empty HTML.""" html = '' text = html_to_text(html) # Should return empty or whitespace only assert text.strip() == '' def test_nested_elements(self): """Test extraction from nested HTML elements.""" html = '''First paragraph
Second paragraph
Test & <special> characters
' text = html_to_text(html) # Entities should be decoded assert 'Test &' in text or 'Test &' in text assert 'special' in text def test_whitespace_handling(self): """Test that whitespace is properly handled.""" html = 'Line 1
Line 2
' text = html_to_text(html) # Should have some separation between lines assert 'Line 1' in text assert 'Line 2' in text assert text.count('\n') >= 1 # At least one newline def test_deterministic_output(self): """Test that the same HTML always produces the same text.""" html = 'Content here
' # Extract text multiple times results = [html_to_text(html) for _ in range(10)] # All results should be identical assert len(set(results)) == 1, "html_to_text should be deterministic" def test_thread_safety_determinism(self): """ Test that html_to_text produces deterministic output under high concurrency. This verifies that lxml's default parser (used by inscriptis.get_text) is thread-safe and produces consistent results when called from multiple threads simultaneously. """ html = '''First paragraph with bold text.
Second paragraph with italic text.
Content
' text = html_to_text(html) results.append(text) except Exception as e: errors.append(e) # Launch 10 threads simultaneously threads = [threading.Thread(target=worker) for _ in range(10)] for t in threads: t.start() for t in threads: t.join() # Should have no errors assert len(errors) == 0, f"Thread-safety errors occurred: {errors}" # All results should be identical assert len(set(results)) == 1, "All threads should produce identical output" print(f"✓ Basic thread-safety test passed: {len(results)} threads, no errors") def test_large_html_with_bloated_head(self): """ Test that html_to_text can handle large HTML documents with massive bloat. SPAs often dump 10MB+ of styles, scripts, and other bloat into the section. This can cause inscriptis to silently exit when processing very large documents. The fix strips \n' # Generate massive script block (~5MB) large_script = '\n' # Generate lots of SVG bloat (~3MB) svg_bloat = '\n' * 50000 # Generate meta/link tags (~2MB) meta_bloat = '\n' * 50000 link_bloat = '\n' * 50000 # Generate HTML comments (~1MB) comment_bloat = '\n' * 50000 # Generate noscript bloat noscript_bloat = '\n' * 10000 # Build the large HTML document html = f'''This is the actual content that should be extracted.
First paragraph with meaningful text.
Second paragraph with more content.
This is actual content that should be extracted.
First paragraph with meaningful text.
Second paragraph with more content.
Test paragraph.
' text2 = html_to_text(html2) assert 'Hidden Content' in text2, "Failed to extract content from visibility:hidden body" assert 'Test paragraph' in text2, "Failed to extract paragraph from visibility:hidden body" # Test case 3: Mixed styles (display:none with other CSS) html3 = 'Mixed style content
' text3 = html_to_text(html3) assert 'Mixed style content' in text3, "Failed to extract content from body with mixed styles" # Test case 4: Case insensitivity (DISPLAY:NONE uppercase) html4 = 'Uppercase style
' text4 = html_to_text(html4) assert 'Uppercase style' in text4, "Failed to handle uppercase DISPLAY:NONE" # Test case 5: Space variations (display: none vs display:none) html5 = 'With spaces
' text5 = html_to_text(html5) assert 'With spaces' in text5, "Failed to handle 'display: none' with space" # Test case 6: Body with other attributes (class, id) html6 = 'With attributes
' text6 = html_to_text(html6) assert 'With attributes' in text6, "Failed to extract from body with multiple attributes" # Test case 7: Should NOT affect opacity:0 (which doesn't hide from inscriptis) html7 = 'Transparent content
' text7 = html_to_text(html7) # Opacity doesn't affect inscriptis text extraction, content should be there assert 'Transparent content' in text7, "Incorrectly stripped opacity:0 style" print(" ✓ All display:none body tag tests passed") def test_style_tag_with_svg_data_uri(self): """ Test that style tags containing SVG data URIs are properly stripped. Some WordPress and modern sites embed SVG as data URIs in CSS, which contains tags within the style content. The regex must use backreferences to ensure (not inside the CSS). This was causing errors where the regex would matchThis is the actual content that should be extracted.
After SVG
''' text = html_to_text(html) # Should extract body content assert 'Content' in text, "Failed to extract heading" assert 'After SVG' in text, "Failed to extract content after SVG" # Should strip all style/script/svg content assert 'background:' not in text, "Style content leaked" assert 'const svg' not in text, "Script content leaked" assert 'Before script
AFTER SCRIPT
''' text = html_to_text(html) assert 'Before script' in text assert 'AFTER SCRIPT' in text # Script internals must not leak assert 'var html' not in text assert 'var also' not in text def test_content_sandwiched_between_multiple_body_scripts(self): """Content between multiple script/style blocks in the body must all survive.""" html = '''CONTENT A
CONTENT B
CONTENT C
CONTENT D
''' text = html_to_text(html) for label in ['CONTENT A', 'CONTENT B', 'CONTENT C', 'CONTENT D']: assert label in text, f"'{label}' was eaten by script/style stripping" assert 'var a' not in text assert 'var b' not in text assert 'color: red' not in text assert 'color: blue' not in text def test_unicode_and_international_content_preserved(self): """Non-ASCII content (umlauts, CJK, soft hyphens) must survive stripping.""" html = '''German: Aus\xadge\xadbucht! — ANMELDUNG — Fan\xadday 2026
Chinese: \u6ce8\u518c
Japanese: \u767b\u9332
Korean: \ub4f1\ub85d
Emoji: \U0001f4e2
''' text = html_to_text(html) assert 'ANMELDUNG' in text assert '\u6ce8\u518c' in text # Chinese assert '\u767b\u9332' in text # Japanese assert '\ub4f1\ub85d' in text # Korean def test_style_with_type_attribute_is_stripped(self): """VISIBLE CONTENT