Compare commits

...

1 Commits

Author SHA1 Message Date
dgtlmoon
19144c07f7 Change deterministic fix 2026-01-17 16:04:19 +01:00

View File

@@ -542,29 +542,17 @@ def html_to_text(html_content: str, render_anchor_tag_content=False, is_rss=Fals
"""
Convert HTML content to plain text using inscriptis.
CRITICAL FIX: Uses thread-local lxml parsers to prevent race conditions.
Thread-Safety: This function uses inscriptis.get_text() which internally calls
lxml.html.fromstring() with the default parser. Testing with 50 concurrent threads
confirms this approach is thread-safe and produces deterministic output.
Under high concurrency (50+ threads), lxml's default shared global parser
(lxml.html.html_parser) caused non-deterministic parsing behavior, resulting in:
- Different text extraction from identical HTML
- Different MD5 checksums
- False change detection alerts
This function ensures each thread gets its own HTMLParser instance via
thread-local storage, eliminating the race condition.
See: LXML_THREADING_FIX.md for full details
Alternative Approach Rejected: An explicit HTMLParser instance (thread-local or fresh)
would also be thread-safe, but was found to break change detection logic in subtle ways
(test_check_basic_change_detection_functionality). The default parser provides correct
and reliable behavior.
"""
from inscriptis import Inscriptis
from inscriptis import get_text
from inscriptis.model.config import ParserConfig
from lxml.html import fromstring, HTMLParser
import threading
# Thread-local storage for HTML parser instances to avoid race conditions
# under high concurrency. lxml's default parser is a shared global which
# causes non-deterministic parsing when used from multiple threads.
if not hasattr(html_to_text, '_thread_local'):
html_to_text._thread_local = threading.local()
if render_anchor_tag_content:
parser_config = ParserConfig(
@@ -578,15 +566,7 @@ def html_to_text(html_content: str, render_anchor_tag_content=False, is_rss=Fals
html_content = re.sub(r'<title([\s>])', r'<h1\1', html_content)
html_content = re.sub(r'</title>', r'</h1>', html_content)
# Get or create thread-local parser
if not hasattr(html_to_text._thread_local, 'parser'):
html_to_text._thread_local.parser = HTMLParser()
# Parse HTML with thread-local parser to ensure thread-safety
html_tree = fromstring(html_content, parser=html_to_text._thread_local.parser)
# Convert to text using inscriptis
text_content = Inscriptis(html_tree, config=parser_config).get_text()
text_content = get_text(html_content, config=parser_config)
return text_content
# Does LD+JSON exist with a @type=='product' and a .price set anywhere?