mirror of
https://github.com/dgtlmoon/changedetection.io.git
synced 2026-01-25 16:40:19 +00:00
Compare commits
2 Commits
content-fa
...
lang-impro
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
541ed62bba | ||
|
|
b1a45964e6 |
@@ -84,7 +84,6 @@ jobs:
|
||||
docker run test-changedetectionio bash -c 'python3 -m unittest changedetectionio.tests.unit.test_watch_model'
|
||||
docker run test-changedetectionio bash -c 'python3 -m unittest changedetectionio.tests.unit.test_jinja2_security'
|
||||
docker run test-changedetectionio bash -c 'python3 -m unittest changedetectionio.tests.unit.test_semver'
|
||||
docker run test-changedetectionio bash -c 'python3 -m unittest changedetectionio.tests.unit.test_html_to_text'
|
||||
|
||||
# Basic pytest tests with ancillary services
|
||||
basic-tests:
|
||||
|
||||
@@ -75,6 +75,7 @@ async def async_update_worker(worker_id, q, notification_q, app, datastore, exec
|
||||
continue
|
||||
|
||||
uuid = queued_item_data.item.get('uuid')
|
||||
|
||||
# RACE CONDITION FIX: Check if this UUID is already being processed by another worker
|
||||
from changedetectionio import worker_handler
|
||||
from changedetectionio.queuedWatchMetaData import PrioritizedItem
|
||||
|
||||
@@ -1,5 +1,4 @@
|
||||
import time
|
||||
import threading
|
||||
from flask import Blueprint, request, redirect, url_for, flash, render_template, session
|
||||
from flask_babel import gettext
|
||||
from loguru import logger
|
||||
@@ -152,24 +151,9 @@ def construct_blueprint(datastore: ChangeDetectionStore, update_q, worker_handle
|
||||
confirmtext = request.form.get('confirmtext')
|
||||
|
||||
if confirmtext == 'clear':
|
||||
# Run in background thread to avoid blocking
|
||||
def clear_history_background():
|
||||
# Capture UUIDs first to avoid race conditions
|
||||
watch_uuids = list(datastore.data['watching'].keys())
|
||||
logger.info(f"Background: Clearing history for {len(watch_uuids)} watches")
|
||||
|
||||
for uuid in watch_uuids:
|
||||
try:
|
||||
datastore.clear_watch_history(uuid)
|
||||
except Exception as e:
|
||||
logger.error(f"Error clearing history for watch {uuid}: {e}")
|
||||
|
||||
logger.info("Background: Completed clearing history")
|
||||
|
||||
# Start daemon thread
|
||||
threading.Thread(target=clear_history_background, daemon=True).start()
|
||||
|
||||
flash(gettext("History clearing started in background"))
|
||||
for uuid in datastore.data['watching'].keys():
|
||||
datastore.clear_watch_history(uuid)
|
||||
flash(gettext("Cleared snapshot history for all watches"))
|
||||
else:
|
||||
flash(gettext('Incorrect confirmation text.'), 'error')
|
||||
|
||||
|
||||
@@ -539,18 +539,6 @@ def cdata_in_document_to_text(html_content: str, render_anchor_tag_content=False
|
||||
|
||||
|
||||
def html_to_text(html_content: str, render_anchor_tag_content=False, is_rss=False, timeout=10) -> str:
|
||||
"""
|
||||
Convert HTML content to plain text using inscriptis.
|
||||
|
||||
Thread-Safety: This function uses inscriptis.get_text() which internally calls
|
||||
lxml.html.fromstring() with the default parser. Testing with 50 concurrent threads
|
||||
confirms this approach is thread-safe and produces deterministic output.
|
||||
|
||||
Alternative Approach Rejected: An explicit HTMLParser instance (thread-local or fresh)
|
||||
would also be thread-safe, but was found to break change detection logic in subtle ways
|
||||
(test_check_basic_change_detection_functionality). The default parser provides correct
|
||||
and reliable behavior.
|
||||
"""
|
||||
from inscriptis import get_text
|
||||
from inscriptis.model.config import ParserConfig
|
||||
|
||||
|
||||
@@ -1,225 +0,0 @@
|
||||
#!/usr/bin/env python3
|
||||
# coding=utf-8
|
||||
|
||||
"""Unit tests for html_tools.html_to_text function."""
|
||||
|
||||
import hashlib
|
||||
import threading
|
||||
from queue import Queue
|
||||
import pytest
|
||||
|
||||
from changedetectionio.html_tools import html_to_text
|
||||
|
||||
|
||||
class TestHtmlToText:
    """Test html_to_text for correctness and thread-safety.

    These are behavior-level tests only: per the html_to_text docstring,
    the implementation relies on inscriptis.get_text() with lxml's default
    parser (the explicit thread-local parser approach was rejected), so no
    test here inspects parser internals.
    """

    def test_basic_text_extraction(self):
        """Basic HTML to text conversion keeps text and strips all tags."""
        html = '<html><body><h1>Title</h1><p>Paragraph text.</p></body></html>'
        text = html_to_text(html)

        assert 'Title' in text
        assert 'Paragraph text.' in text
        assert '<' not in text  # HTML tags should be stripped
        assert '>' not in text

    def test_empty_html(self):
        """An empty document body yields empty or whitespace-only text."""
        html = '<html><body></body></html>'
        text = html_to_text(html)

        # Should return empty or whitespace only
        assert text.strip() == ''

    def test_nested_elements(self):
        """Text is extracted from arbitrarily nested HTML elements."""
        html = '''
        <html>
            <body>
                <div>
                    <h1>Header</h1>
                    <div>
                        <p>First paragraph</p>
                        <p>Second paragraph</p>
                    </div>
                </div>
            </body>
        </html>
        '''
        text = html_to_text(html)

        assert 'Header' in text
        assert 'First paragraph' in text
        assert 'Second paragraph' in text

    def test_anchor_tag_rendering(self):
        """render_anchor_tag_content toggles whether hrefs appear in output."""
        html = '<html><body><a href="https://example.com">Link text</a></body></html>'

        # Without rendering anchors
        text_without = html_to_text(html, render_anchor_tag_content=False)
        assert 'Link text' in text_without
        assert 'https://example.com' not in text_without

        # With rendering anchors
        text_with = html_to_text(html, render_anchor_tag_content=True)
        assert 'Link text' in text_with
        assert 'https://example.com' in text_with or '[Link text]' in text_with

    def test_rss_mode(self):
        """RSS mode treats <title> tags as headings so they survive extraction."""
        html = '<item><title>RSS Title</title><description>Content</description></item>'

        # is_rss=True should convert <title> to <h1>
        text = html_to_text(html, is_rss=True)

        assert 'RSS Title' in text
        assert 'Content' in text

    def test_special_characters(self):
        """HTML entities are decoded in the text output."""
        # Use an encoded ampersand so the decode step is actually exercised.
        html = '<html><body><p>Test &amp; <special> characters</p></body></html>'
        text = html_to_text(html)

        # Entities should be decoded. (Fixed: the previous assertion OR-ed
        # two identical strings, making the check tautological.)
        assert 'Test &' in text
        assert 'special' in text

    def test_whitespace_handling(self):
        """Block-level elements produce line separation in the output."""
        html = '<html><body><p>Line 1</p><p>Line 2</p></body></html>'
        text = html_to_text(html)

        # Should have some separation between lines
        assert 'Line 1' in text
        assert 'Line 2' in text
        assert text.count('\n') >= 1  # At least one newline

    def test_deterministic_output(self):
        """The same HTML input always produces byte-identical text."""
        html = '<html><body><h1>Test</h1><p>Content here</p></body></html>'

        # Extract text multiple times
        results = [html_to_text(html) for _ in range(10)]

        # All results should be identical
        assert len(set(results)) == 1, "html_to_text should be deterministic"

    def test_thread_safety_determinism(self):
        """
        html_to_text must produce deterministic output under high concurrency.

        50 worker threads each convert the same document 10 times; every
        conversion must hash to the same MD5. A divergent hash indicates a
        shared-parser race corrupting the extracted text.
        """
        html = '''
        <html>
            <head><title>Test Page</title></head>
            <body>
                <h1>Main Heading</h1>
                <div class="content">
                    <p>First paragraph with <b>bold text</b>.</p>
                    <p>Second paragraph with <i>italic text</i>.</p>
                    <ul>
                        <li>Item 1</li>
                        <li>Item 2</li>
                        <li>Item 3</li>
                    </ul>
                </div>
            </body>
        </html>
        '''

        results_queue = Queue()

        def worker(worker_id, iterations=10):
            """Worker that converts HTML to text multiple times."""
            for i in range(iterations):
                text = html_to_text(html)
                md5 = hashlib.md5(text.encode('utf-8')).hexdigest()
                results_queue.put((worker_id, i, md5))

        # Launch many threads simultaneously
        num_threads = 50
        threads = []

        for i in range(num_threads):
            t = threading.Thread(target=worker, args=(i,))
            threads.append(t)
            t.start()

        # Wait for all threads to complete
        for t in threads:
            t.join()

        # Collect all MD5 results
        md5_values = []
        while not results_queue.empty():
            _, _, md5 = results_queue.get()
            md5_values.append(md5)

        # All MD5s should be identical
        unique_md5s = set(md5_values)

        assert len(unique_md5s) == 1, (
            f"Thread-safety issue detected! Found {len(unique_md5s)} different MD5 values: {unique_md5s}."
        )

        print(f"✓ Thread-safety test passed: {len(md5_values)} conversions, all identical")

    # NOTE(review): the former tests `test_thread_local_parser_exists` and
    # `test_different_threads_get_different_parsers` asserted the presence of a
    # `html_to_text._thread_local` parser cache. Per the html_to_text docstring,
    # the thread-local parser approach was rejected in favour of inscriptis'
    # default parser, so those implementation-coupled tests would fail
    # unconditionally and have been removed. Thread-safety is still covered
    # behaviorally by test_thread_safety_determinism above.
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # Allow running this test module directly (outside normal pytest
    # collection) for quick local testing; '-v' enables verbose output.
    pytest.main([__file__, '-v'])
|
||||
Reference in New Issue
Block a user