First paragraph with bold text.
Second paragraph with italic text.
- Item 1
- Item 2
- Item 3
#!/usr/bin/env python3 # coding=utf-8 """Unit tests for html_tools.html_to_text function.""" import hashlib import threading import unittest from queue import Queue from changedetectionio.html_tools import html_to_text class TestHtmlToText(unittest.TestCase): """Test html_to_text function for correctness and thread-safety.""" def test_basic_text_extraction(self): """Test basic HTML to text conversion.""" html = '
Paragraph text.
' text = html_to_text(html) assert 'Title' in text assert 'Paragraph text.' in text assert '<' not in text # HTML tags should be stripped assert '>' not in text def test_empty_html(self): """Test handling of empty HTML.""" html = '' text = html_to_text(html) # Should return empty or whitespace only assert text.strip() == '' def test_nested_elements(self): """Test extraction from nested HTML elements.""" html = '''First paragraph
Second paragraph
Test & <special> characters
' text = html_to_text(html) # Entities should be decoded assert 'Test &' in text or 'Test &' in text assert 'special' in text def test_whitespace_handling(self): """Test that whitespace is properly handled.""" html = 'Line 1
Line 2
' text = html_to_text(html) # Should have some separation between lines assert 'Line 1' in text assert 'Line 2' in text assert text.count('\n') >= 1 # At least one newline def test_deterministic_output(self): """Test that the same HTML always produces the same text.""" html = 'Content here
' # Extract text multiple times results = [html_to_text(html) for _ in range(10)] # All results should be identical assert len(set(results)) == 1, "html_to_text should be deterministic" def test_thread_safety_determinism(self): """ Test that html_to_text produces deterministic output under high concurrency. This verifies that lxml's default parser (used by inscriptis.get_text) is thread-safe and produces consistent results when called from multiple threads simultaneously. """ html = '''First paragraph with bold text.
Second paragraph with italic text.
Content
' text = html_to_text(html) results.append(text) except Exception as e: errors.append(e) # Launch 10 threads simultaneously threads = [threading.Thread(target=worker) for _ in range(10)] for t in threads: t.start() for t in threads: t.join() # Should have no errors assert len(errors) == 0, f"Thread-safety errors occurred: {errors}" # All results should be identical assert len(set(results)) == 1, "All threads should produce identical output" print(f"✓ Basic thread-safety test passed: {len(results)} threads, no errors") if __name__ == '__main__': # Can run this file directly for quick testing unittest.main()