Compare commits


2 Commits

Author     SHA1         Message                     Date
dgtlmoon   541ed62bba   More translatable strings   2026-01-17 13:59:08 +01:00
dgtlmoon   b1a45964e6   Language updates            2026-01-17 13:54:06 +01:00
5 changed files with 4 additions and 257 deletions

View File

@@ -84,7 +84,6 @@ jobs:
           docker run test-changedetectionio bash -c 'python3 -m unittest changedetectionio.tests.unit.test_watch_model'
           docker run test-changedetectionio bash -c 'python3 -m unittest changedetectionio.tests.unit.test_jinja2_security'
           docker run test-changedetectionio bash -c 'python3 -m unittest changedetectionio.tests.unit.test_semver'
-          docker run test-changedetectionio bash -c 'python3 -m unittest changedetectionio.tests.unit.test_html_to_text'
   # Basic pytest tests with ancillary services
   basic-tests:
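The workflow runs each unit module through python3 -m unittest inside the test container. For local debugging, the same selection can be made programmatically; the sketch below is illustrative only (it assumes the changedetectionio package is importable in the current environment) and mirrors one of the commands above.

# Programmatic equivalent of `python3 -m unittest changedetectionio.tests.unit.test_semver`,
# assuming changedetectionio is installed/importable in the current environment.
import unittest

suite = unittest.defaultTestLoader.loadTestsFromName('changedetectionio.tests.unit.test_semver')
unittest.TextTestRunner(verbosity=2).run(suite)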

View File

@@ -75,6 +75,7 @@ async def async_update_worker(worker_id, q, notification_q, app, datastore, exec
continue
uuid = queued_item_data.item.get('uuid')
# RACE CONDITION FIX: Check if this UUID is already being processed by another worker
from changedetectionio import worker_handler
from changedetectionio.queuedWatchMetaData import PrioritizedItem
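The "RACE CONDITION FIX" comment refers to a duplicate-processing guard that lives in worker_handler, which this hunk does not show. As a rough illustration only (the names and structure here are assumptions, not changedetectionio's actual worker_handler API), such a guard typically amounts to an atomically updated set of in-flight UUIDs:

# Illustrative sketch of a "UUID already being processed" guard; not the project's worker_handler API.
import threading

_in_flight = set()
_in_flight_lock = threading.Lock()

def try_claim(uuid: str) -> bool:
    """Atomically claim a UUID; False means another worker already holds it."""
    with _in_flight_lock:
        if uuid in _in_flight:
            return False
        _in_flight.add(uuid)
        return True

def release(uuid: str) -> None:
    """Release a claimed UUID once processing finishes (success or failure)."""
    with _in_flight_lock:
        _in_flight.discard(uuid)

def process(uuid: str) -> None:
    if not try_claim(uuid):
        return  # skip: another worker is already handling this watch
    try:
        ...  # fetch / diff the watch here
    finally:
        release(uuid)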

View File

@@ -1,5 +1,4 @@
 import time
-import threading
 from flask import Blueprint, request, redirect, url_for, flash, render_template, session
 from flask_babel import gettext
 from loguru import logger
@@ -152,24 +151,9 @@ def construct_blueprint(datastore: ChangeDetectionStore, update_q, worker_handle
         confirmtext = request.form.get('confirmtext')
         if confirmtext == 'clear':
-            # Run in background thread to avoid blocking
-            def clear_history_background():
-                # Capture UUIDs first to avoid race conditions
-                watch_uuids = list(datastore.data['watching'].keys())
-                logger.info(f"Background: Clearing history for {len(watch_uuids)} watches")
-                for uuid in watch_uuids:
-                    try:
-                        datastore.clear_watch_history(uuid)
-                    except Exception as e:
-                        logger.error(f"Error clearing history for watch {uuid}: {e}")
-                logger.info("Background: Completed clearing history")
-            # Start daemon thread
-            threading.Thread(target=clear_history_background, daemon=True).start()
-            flash(gettext("History clearing started in background"))
+            for uuid in datastore.data['watching'].keys():
+                datastore.clear_watch_history(uuid)
+            flash(gettext("Cleared snapshot history for all watches"))
         else:
             flash(gettext('Incorrect confirmation text.'), 'error')
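One detail worth noting in the removed block: it snapshots the watch UUIDs with list(...) before looping, whereas the synchronous replacement iterates the live keys() view. If the watch dictionary is mutated while that view is being iterated (for example, a watch added or deleted by another request), CPython raises RuntimeError. A minimal stand-alone sketch of the difference, using a plain dict in place of the datastore:

# Stand-in for datastore.data['watching']; real values are watch objects, not empty dicts.
watching = {'uuid-1': {}, 'uuid-2': {}, 'uuid-3': {}}

# Iterating the live view while the dict changes size fails:
try:
    for uuid in watching.keys():
        watching.pop(uuid)  # simulate a concurrent delete
except RuntimeError as e:
    print('live-view iteration failed:', e)

# Snapshotting the keys first (as the removed background code did) tolerates mutation:
watching = {'uuid-1': {}, 'uuid-2': {}, 'uuid-3': {}}
for uuid in list(watching.keys()):
    watching.pop(uuid)
print('cleared', 3 - len(watching), 'watches via the snapshot')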

View File

@@ -539,18 +539,6 @@ def cdata_in_document_to_text(html_content: str, render_anchor_tag_content=False
 def html_to_text(html_content: str, render_anchor_tag_content=False, is_rss=False, timeout=10) -> str:
     """
     Convert HTML content to plain text using inscriptis.
-
-    Thread-Safety: This function uses inscriptis.get_text() which internally calls
-    lxml.html.fromstring() with the default parser. Testing with 50 concurrent threads
-    confirms this approach is thread-safe and produces deterministic output.
-
-    Alternative Approach Rejected: An explicit HTMLParser instance (thread-local or fresh)
-    would also be thread-safe, but was found to break change detection logic in subtle ways
-    (test_check_basic_change_detection_functionality). The default parser provides correct
-    and reliable behavior.
     """
     from inscriptis import get_text
     from inscriptis.model.config import ParserConfig
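The docstring text removed here contrasts two strategies: relying on lxml's default parser via inscriptis.get_text() (kept), versus giving each thread its own explicit parser (rejected because it subtly changed change-detection output). Purely for illustration, a thread-local lxml parser would look roughly like the sketch below; the helper names are assumptions, and the text_content() flattening is a crude stand-in for inscriptis, not what changedetectionio does.

# Illustrative only: per-thread lxml parser (the rejected alternative), with a crude text flatten.
import threading

import lxml.html

_thread_local = threading.local()

def _thread_parser() -> lxml.html.HTMLParser:
    """Create one lxml HTML parser per thread and reuse it on later calls."""
    if not hasattr(_thread_local, 'parser'):
        _thread_local.parser = lxml.html.HTMLParser()
    return _thread_local.parser

def html_to_text_threadlocal(html_content: str) -> str:
    tree = lxml.html.fromstring(html_content, parser=_thread_parser())
    return tree.text_content()

print(html_to_text_threadlocal('<html><body><h1>Title</h1><p>Hello world</p></body></html>'))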

View File

@@ -1,225 +0,0 @@
#!/usr/bin/env python3
# coding=utf-8
"""Unit tests for html_tools.html_to_text function."""

import hashlib
import threading
from queue import Queue

import pytest

from changedetectionio.html_tools import html_to_text


class TestHtmlToText:
    """Test html_to_text function for correctness and thread-safety."""

    def test_basic_text_extraction(self):
        """Test basic HTML to text conversion."""
        html = '<html><body><h1>Title</h1><p>Paragraph text.</p></body></html>'
        text = html_to_text(html)
        assert 'Title' in text
        assert 'Paragraph text.' in text
        assert '<' not in text  # HTML tags should be stripped
        assert '>' not in text

    def test_empty_html(self):
        """Test handling of empty HTML."""
        html = '<html><body></body></html>'
        text = html_to_text(html)
        # Should return empty or whitespace only
        assert text.strip() == ''

    def test_nested_elements(self):
        """Test extraction from nested HTML elements."""
        html = '''
        <html>
            <body>
                <div>
                    <h1>Header</h1>
                    <div>
                        <p>First paragraph</p>
                        <p>Second paragraph</p>
                    </div>
                </div>
            </body>
        </html>
        '''
        text = html_to_text(html)
        assert 'Header' in text
        assert 'First paragraph' in text
        assert 'Second paragraph' in text

    def test_anchor_tag_rendering(self):
        """Test anchor tag rendering option."""
        html = '<html><body><a href="https://example.com">Link text</a></body></html>'

        # Without rendering anchors
        text_without = html_to_text(html, render_anchor_tag_content=False)
        assert 'Link text' in text_without
        assert 'https://example.com' not in text_without

        # With rendering anchors
        text_with = html_to_text(html, render_anchor_tag_content=True)
        assert 'Link text' in text_with
        assert 'https://example.com' in text_with or '[Link text]' in text_with

    def test_rss_mode(self):
        """Test RSS mode converts title tags to h1."""
        html = '<item><title>RSS Title</title><description>Content</description></item>'
        # is_rss=True should convert <title> to <h1>
        text = html_to_text(html, is_rss=True)
        assert 'RSS Title' in text
        assert 'Content' in text

    def test_special_characters(self):
        """Test handling of special characters and entities."""
        html = '<html><body><p>Test &amp; &lt;special&gt; characters</p></body></html>'
        text = html_to_text(html)
        # Entities should be decoded
        assert 'Test &' in text or 'Test &amp;' in text
        assert 'special' in text

    def test_whitespace_handling(self):
        """Test that whitespace is properly handled."""
        html = '<html><body><p>Line 1</p><p>Line 2</p></body></html>'
        text = html_to_text(html)
        # Should have some separation between lines
        assert 'Line 1' in text
        assert 'Line 2' in text
        assert text.count('\n') >= 1  # At least one newline

    def test_deterministic_output(self):
        """Test that the same HTML always produces the same text."""
        html = '<html><body><h1>Test</h1><p>Content here</p></body></html>'
        # Extract text multiple times
        results = [html_to_text(html) for _ in range(10)]
        # All results should be identical
        assert len(set(results)) == 1, "html_to_text should be deterministic"

    def test_thread_safety_determinism(self):
        """
        Test that html_to_text produces deterministic output under high concurrency.

        This is the critical test for the lxml threading bug fix.
        Without the thread-local parser fix, this test would occasionally fail
        under high concurrency when multiple threads share the global parser.
        """
        html = '''
        <html>
            <head><title>Test Page</title></head>
            <body>
                <h1>Main Heading</h1>
                <div class="content">
                    <p>First paragraph with <b>bold text</b>.</p>
                    <p>Second paragraph with <i>italic text</i>.</p>
                    <ul>
                        <li>Item 1</li>
                        <li>Item 2</li>
                        <li>Item 3</li>
                    </ul>
                </div>
            </body>
        </html>
        '''

        results_queue = Queue()

        def worker(worker_id, iterations=10):
            """Worker that converts HTML to text multiple times."""
            for i in range(iterations):
                text = html_to_text(html)
                md5 = hashlib.md5(text.encode('utf-8')).hexdigest()
                results_queue.put((worker_id, i, md5))

        # Launch many threads simultaneously
        num_threads = 50
        threads = []
        for i in range(num_threads):
            t = threading.Thread(target=worker, args=(i,))
            threads.append(t)
            t.start()

        # Wait for all threads to complete
        for t in threads:
            t.join()

        # Collect all MD5 results
        md5_values = []
        while not results_queue.empty():
            _, _, md5 = results_queue.get()
            md5_values.append(md5)

        # All MD5s should be identical
        unique_md5s = set(md5_values)
        assert len(unique_md5s) == 1, (
            f"Thread-safety issue detected! Found {len(unique_md5s)} different MD5 values: {unique_md5s}. "
            "The thread-local parser fix may not be working correctly."
        )
        print(f"✓ Thread-safety test passed: {len(md5_values)} conversions, all identical")

    def test_thread_local_parser_exists(self):
        """Verify that thread-local storage is properly initialized."""
        # Call html_to_text at least once to initialize thread-local storage
        html_to_text('<html><body>Test</body></html>')
        # Check that thread-local storage attribute exists
        assert hasattr(html_to_text, '_thread_local'), (
            "html_to_text should have _thread_local attribute for thread-safe parsers"
        )

    def test_different_threads_get_different_parsers(self):
        """Verify that different threads CAN get different parser instances."""
        parser_ids = Queue()

        def get_parser_id():
            """Get the parser ID in this thread."""
            # Trigger parser creation
            html_to_text('<html><body>Test</body></html>')
            # Get the parser instance for this thread
            if hasattr(html_to_text._thread_local, 'parser'):
                parser = html_to_text._thread_local.parser
                parser_ids.put(id(parser))

        # Launch multiple threads
        threads = []
        for _ in range(5):
            t = threading.Thread(target=get_parser_id)
            threads.append(t)
            t.start()

        for t in threads:
            t.join()

        # Collect all parser IDs
        ids = []
        while not parser_ids.empty():
            ids.append(parser_ids.get())

        # We should have at least 2 different parser instances
        # (threads can reuse IDs after completion, so not necessarily all unique)
        unique_ids = set(ids)
        assert len(unique_ids) >= 2, (
            f"Expected at least 2 unique parsers, but got {len(unique_ids)}. "
            "Thread-local storage may not be working correctly."
        )
        print(f"✓ Parser isolation test passed: {len(ids)} threads, {len(unique_ids)} unique parsers")


if __name__ == '__main__':
    # Can run this file directly for quick testing
    pytest.main([__file__, '-v'])