Compare commits

...

3 Commits

3 changed files with 134 additions and 9 deletions

View File

@@ -328,8 +328,9 @@ Math: {{ 1 + 1 }}") }}
{{ render_checkbox_field(form.filter_text_replaced) }}
{{ render_checkbox_field(form.filter_text_removed) }}
<span class="pure-form-message-inline">Note: Depending on the length and similarity of the text on each line, the algorithm may, for example, treat a change as an <strong>addition</strong> instead of a <strong>replacement</strong>.</span><br>
<span class="pure-form-message-inline">&nbsp;So it's always better to select <strong>Added</strong>+<strong>Replaced</strong> when you're interested in new content.</span><br>
<span class="pure-form-message-inline">&nbsp;When content is merely moved in a list, it will also trigger an <strong>addition</strong>, consider enabling <code><strong>Only trigger when unique lines appear</strong></code></span>
<span class="pure-form-message-inline">So it's always better to select <strong>Added</strong>+<strong>Replaced</strong> when you're interested in new content.</span><br>
<span class="pure-form-message-inline">When content is merely moved in a list, it will also trigger an <strong>addition</strong>; consider enabling <code><strong>Only trigger when unique lines appear</strong></code>.</span><br>
<span class="pure-form-message-inline">The full snapshot is still saved (this does not strip added/changed/removed lines); it only limits what triggers the change detection.</span>
</fieldset>
<fieldset class="pure-control-group">
{{ render_checkbox_field(form.check_unique_lines) }}

View File

@@ -459,15 +459,27 @@ class perform_site_check(difference_detection_processor):
# Save text before ignore filters (for diff calculation)
text_content_before_ignored_filter = stripped_text
# Save full content before diff filtering for consistent MD5 calculation
full_content_for_md5 = None
# === DIFF FILTERING ===
# If user wants specific diff types (added/removed/replaced only)
if watch.has_special_diff_filter_options_set() and len(watch.history.keys()):
stripped_text = self._apply_diff_filtering(watch, stripped_text, text_content_before_ignored_filter)
if stripped_text is None:
# No differences found, but content exists
# Save full content BEFORE applying diff filtering
# This ensures MD5 is always calculated from full content, not the filtered diff
full_content_for_md5 = stripped_text
filtered_diff = self._apply_diff_filtering(watch, stripped_text, text_content_before_ignored_filter)
if filtered_diff is None:
# No matching differences found (e.g., only removed lines when user wants added/replaced)
# Calculate MD5 of full content and return early
c = ChecksumCalculator.calculate(text_content_before_ignored_filter, ignore_whitespace=True)
return False, {'previous_md5': c}, text_content_before_ignored_filter.encode('utf-8')
# Has matching changes - use filtered diff for trigger_text evaluation and display,
# but full_content_for_md5 will be used later for MD5 calculation
stripped_text = filtered_diff
# === EMPTY PAGE CHECK ===
empty_pages_are_a_change = self.datastore.data['settings']['application'].get('empty_pages_are_a_change', False)
if not stream_content_type.is_json and not empty_pages_are_a_change and len(stripped_text.strip()) == 0:
@@ -495,17 +507,23 @@ class perform_site_check(difference_detection_processor):
stripped_text = transformer.sort_alphabetically(stripped_text)
# === CHECKSUM CALCULATION ===
text_for_checksuming = stripped_text
# When diff filtering is active, use full content for MD5, not the filtered diff
# This ensures consistent MD5 calculation regardless of what changes occurred
if full_content_for_md5 is not None:
text_for_checksuming = full_content_for_md5
else:
text_for_checksuming = stripped_text
# Apply ignore_text for checksum calculation
if filter_config.ignore_text:
text_for_checksuming = html_tools.strip_ignore_text(stripped_text, filter_config.ignore_text)
text_for_checksuming = html_tools.strip_ignore_text(text_for_checksuming, filter_config.ignore_text)
# Optionally remove ignored lines from output
# Note: Only apply to stripped_text if we're not using full_content_for_md5
strip_ignored_lines = watch.get('strip_ignored_lines')
if strip_ignored_lines is None:
strip_ignored_lines = self.datastore.data['settings']['application'].get('strip_ignored_lines')
if strip_ignored_lines:
if strip_ignored_lines and full_content_for_md5 is None:
stripped_text = text_for_checksuming
# Calculate checksum
@@ -571,7 +589,12 @@ class perform_site_check(difference_detection_processor):
if 'text_for_checksuming' in locals() and text_for_checksuming is not stripped_text:
del text_for_checksuming
return changed_detected, update_obj, stripped_text
# When diff filtering is active, return full content for history snapshots
# stripped_text contains only the filtered diff, which would create confusing/broken snapshots
if full_content_for_md5 is not None:
return changed_detected, update_obj, full_content_for_md5
else:
return changed_detected, update_obj, stripped_text
def _apply_diff_filtering(self, watch, stripped_text, text_before_filter):
"""Apply user's diff filtering preferences (show only added/removed/replaced lines)."""

View File

@@ -183,3 +183,104 @@ def test_check_add_line_contains_trigger(client, live_server, measure_memory_usa
assert '网站监测 内容更新了'.encode('utf-8') in response
delete_all_watches(client)
def _recheck_and_get_watchlist(client):
    """Queue a recheck, wait for all checks to finish, and return the watchlist page body.

    Returns the raw response bytes of the watchlist index so callers can
    assert on the presence or absence of the ``has-unread-changes`` marker.
    """
    client.get(url_for("ui.form_watch_checknow"), follow_redirects=True)
    wait_for_all_checks(client)
    return client.get(url_for("watchlist.index")).data


def test_consistent_md5_with_diff_filtering(client, live_server, measure_memory_usage):
    """
    Test that MD5 checksums are calculated consistently when diff filtering is active.

    This test ensures that after a change is detected with diff filtering enabled,
    subsequent checks with identical content don't trigger false positives.

    Bug: Previously, MD5 was calculated from the filtered diff (partial content)
    when changes were found, but from full content when no changes were found.
    This caused false positives on the next check with identical content.

    Fix: Always calculate MD5 from full content, regardless of diff filtering.
    """
    delete_all_watches(client)
    time.sleep(1)

    # Setup initial content
    set_original()
    test_url = url_for('test_endpoint', _external=True)
    uuid = client.application.config.get('DATASTORE').add_watch(url=test_url)

    # Configure: Only track ADDED and REPLACED lines, ignore REMOVED lines.
    # Address the watch by the uuid we just created rather than a positional alias.
    res = client.post(
        url_for("ui.ui_edit.edit_page", uuid=uuid),
        data={
            "url": test_url,
            'processor': 'text_json_diff',
            'fetch_backend': "html_requests",
            'filter_text_added': 'y',  # Track added lines
            'filter_text_replaced': 'y',  # Track replaced lines
            'filter_text_removed': '',  # Don't track removed lines
            "time_between_check_use_default": "y",
        },
        follow_redirects=True,
    )
    assert b"Updated watch." in res.data

    # CHECK 1: Initial baseline
    page = _recheck_and_get_watchlist(client)
    assert b'has-unread-changes' not in page  # First check, no change

    # Mark as viewed to start fresh
    client.get(url_for("ui.mark_all_viewed"), follow_redirects=True)

    # CHECK 2: Remove a line (should NOT trigger - removed lines are filtered out)
    set_original(excluding='Something irrelevant')
    page = _recheck_and_get_watchlist(client)
    assert b'has-unread-changes' not in page  # No change (removed line filtered)

    # CHECK 3: Add a line (should trigger - added lines are tracked)
    set_original(excluding='Something irrelevant', add_line='<p>New exciting feature!</p>')
    page = _recheck_and_get_watchlist(client)
    assert b'has-unread-changes' in page  # Change detected (added line)

    # Mark as viewed
    client.get(url_for("ui.mark_all_viewed"), follow_redirects=True)

    # CHECK 4: Same content as CHECK 3 (THE CRITICAL TEST - should NOT trigger)
    # This is where the bug would manifest: false positive change detection
    # because previous MD5 was from filtered diff, current MD5 is from full content
    set_original(excluding='Something irrelevant', add_line='<p>New exciting feature!</p>')
    page = _recheck_and_get_watchlist(client)

    # CRITICAL ASSERTION: Should NOT detect change (content is identical to CHECK 3)
    assert b'has-unread-changes' not in page, \
        "False positive! Content identical to previous check but change was detected. " \
        "MD5 calculation is inconsistent with diff filtering."

    # CHECK 5: Verify system still detects real changes (replace a line)
    # Change "Some initial text" to "Some modified text"
    modified_content = """<html>
<body>
<p>Some modified text</p>
<p>So let's see what happens.</p>
<p>and a new line!</p>
<p>The golden line</p>
<p>New exciting feature!</p>
<p>A BREAK TO MAKE THE TOP LINE STAY AS "REMOVED" OR IT WILL GET COUNTED AS "CHANGED INTO"</p>
</body>
</html>
"""
    # Explicit encoding so the fixture bytes do not depend on the host locale.
    with open("test-datastore/endpoint-content.txt", "w", encoding="utf-8") as f:
        f.write(modified_content)

    page = _recheck_and_get_watchlist(client)
    assert b'has-unread-changes' in page  # Change detected (replaced line)

    delete_all_watches(client)