mirror of
https://github.com/dgtlmoon/changedetection.io.git
synced 2025-10-30 06:07:50 +00:00
Compare commits
3 Commits
08169c23f3
...
filter-not
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
69af4edb0f | ||
|
|
2d3a1bc53b | ||
|
|
ce66ef1a9b |
@@ -328,8 +328,9 @@ Math: {{ 1 + 1 }}") }}
|
||||
{{ render_checkbox_field(form.filter_text_replaced) }}
|
||||
{{ render_checkbox_field(form.filter_text_removed) }}
|
||||
<span class="pure-form-message-inline">Note: Depending on the length and similarity of the text on each line, the algorithm may consider an <strong>addition</strong> instead of <strong>replacement</strong> for example.</span><br>
|
||||
<span class="pure-form-message-inline"> So it's always better to select <strong>Added</strong>+<strong>Replaced</strong> when you're interested in new content.</span><br>
|
||||
<span class="pure-form-message-inline"> When content is merely moved in a list, it will also trigger an <strong>addition</strong>, consider enabling <code><strong>Only trigger when unique lines appear</strong></code></span>
|
||||
<span class="pure-form-message-inline">So it's always better to select <strong>Added</strong>+<strong>Replaced</strong> when you're interested in new content.</span><br>
|
||||
<span class="pure-form-message-inline">When content is merely moved in a list, it will also trigger an <strong>addition</strong>, consider enabling <code><strong>Only trigger when unique lines appear</strong></code>.</span><br>
|
||||
<span class="pure-form-message-inline">The full snapshot is still saved (this does not strip added/changed/removed lines), only limits the triggers of the change detection.</span>
|
||||
</fieldset>
|
||||
<fieldset class="pure-control-group">
|
||||
{{ render_checkbox_field(form.check_unique_lines) }}
|
||||
|
||||
@@ -459,15 +459,27 @@ class perform_site_check(difference_detection_processor):
|
||||
# Save text before ignore filters (for diff calculation)
|
||||
text_content_before_ignored_filter = stripped_text
|
||||
|
||||
# Save full content before diff filtering for consistent MD5 calculation
|
||||
full_content_for_md5 = None
|
||||
|
||||
# === DIFF FILTERING ===
|
||||
# If user wants specific diff types (added/removed/replaced only)
|
||||
if watch.has_special_diff_filter_options_set() and len(watch.history.keys()):
|
||||
stripped_text = self._apply_diff_filtering(watch, stripped_text, text_content_before_ignored_filter)
|
||||
if stripped_text is None:
|
||||
# No differences found, but content exists
|
||||
# Save full content BEFORE applying diff filtering
|
||||
# This ensures MD5 is always calculated from full content, not the filtered diff
|
||||
full_content_for_md5 = stripped_text
|
||||
|
||||
filtered_diff = self._apply_diff_filtering(watch, stripped_text, text_content_before_ignored_filter)
|
||||
if filtered_diff is None:
|
||||
# No matching differences found (e.g., only removed lines when user wants added/replaced)
|
||||
# Calculate MD5 of full content and return early
|
||||
c = ChecksumCalculator.calculate(text_content_before_ignored_filter, ignore_whitespace=True)
|
||||
return False, {'previous_md5': c}, text_content_before_ignored_filter.encode('utf-8')
|
||||
|
||||
# Has matching changes - use filtered diff for trigger_text evaluation and display,
|
||||
# but full_content_for_md5 will be used later for MD5 calculation
|
||||
stripped_text = filtered_diff
|
||||
|
||||
# === EMPTY PAGE CHECK ===
|
||||
empty_pages_are_a_change = self.datastore.data['settings']['application'].get('empty_pages_are_a_change', False)
|
||||
if not stream_content_type.is_json and not empty_pages_are_a_change and len(stripped_text.strip()) == 0:
|
||||
@@ -495,17 +507,23 @@ class perform_site_check(difference_detection_processor):
|
||||
stripped_text = transformer.sort_alphabetically(stripped_text)
|
||||
|
||||
# === CHECKSUM CALCULATION ===
|
||||
text_for_checksuming = stripped_text
|
||||
# When diff filtering is active, use full content for MD5, not the filtered diff
|
||||
# This ensures consistent MD5 calculation regardless of what changes occurred
|
||||
if full_content_for_md5 is not None:
|
||||
text_for_checksuming = full_content_for_md5
|
||||
else:
|
||||
text_for_checksuming = stripped_text
|
||||
|
||||
# Apply ignore_text for checksum calculation
|
||||
if filter_config.ignore_text:
|
||||
text_for_checksuming = html_tools.strip_ignore_text(stripped_text, filter_config.ignore_text)
|
||||
text_for_checksuming = html_tools.strip_ignore_text(text_for_checksuming, filter_config.ignore_text)
|
||||
|
||||
# Optionally remove ignored lines from output
|
||||
# Note: Only apply to stripped_text if we're not using full_content_for_md5
|
||||
strip_ignored_lines = watch.get('strip_ignored_lines')
|
||||
if strip_ignored_lines is None:
|
||||
strip_ignored_lines = self.datastore.data['settings']['application'].get('strip_ignored_lines')
|
||||
if strip_ignored_lines:
|
||||
if strip_ignored_lines and full_content_for_md5 is None:
|
||||
stripped_text = text_for_checksuming
|
||||
|
||||
# Calculate checksum
|
||||
@@ -571,7 +589,12 @@ class perform_site_check(difference_detection_processor):
|
||||
if 'text_for_checksuming' in locals() and text_for_checksuming is not stripped_text:
|
||||
del text_for_checksuming
|
||||
|
||||
return changed_detected, update_obj, stripped_text
|
||||
# When diff filtering is active, return full content for history snapshots
|
||||
# stripped_text contains only the filtered diff, which would create confusing/broken snapshots
|
||||
if full_content_for_md5 is not None:
|
||||
return changed_detected, update_obj, full_content_for_md5
|
||||
else:
|
||||
return changed_detected, update_obj, stripped_text
|
||||
|
||||
def _apply_diff_filtering(self, watch, stripped_text, text_before_filter):
|
||||
"""Apply user's diff filtering preferences (show only added/removed/replaced lines)."""
|
||||
|
||||
@@ -183,3 +183,104 @@ def test_check_add_line_contains_trigger(client, live_server, measure_memory_usa
|
||||
assert '网站监测 内容更新了'.encode('utf-8') in response
|
||||
|
||||
delete_all_watches(client)
|
||||
|
||||
|
||||
def test_consistent_md5_with_diff_filtering(client, live_server, measure_memory_usage):
    """
    Regression test: the stored MD5 must be stable while diff filtering is on.

    Scenario guarded against: with "added"/"replaced" filtering enabled, the
    checksum used to be taken from the filtered diff (partial content) when a
    change matched, but from the full content when nothing matched.  The next
    check of byte-identical content then compared two differently-derived
    checksums and reported a change that never happened.

    Expected behaviour: the MD5 is always derived from the full content, so
    re-checking unchanged content is never flagged, while genuine additions
    and replacements still are.
    """
    # Marker present in the watchlist HTML when a watch has unviewed changes.
    unread_marker = b'has-unread-changes'

    def recheck_and_fetch_watchlist():
        """Queue a recheck, wait for it to complete, return the watchlist page body."""
        client.get(url_for("ui.form_watch_checknow"), follow_redirects=True)
        wait_for_all_checks(client)
        return client.get(url_for("watchlist.index")).data

    def mark_everything_viewed():
        """Clear the unread state so the next assertion starts from a clean slate."""
        client.get(url_for("ui.mark_all_viewed"), follow_redirects=True)

    delete_all_watches(client)
    time.sleep(1)

    # --- Setup: baseline endpoint content and a watch pointing at it ---------
    set_original()
    test_url = url_for('test_endpoint', _external=True)
    # The returned identifier is not needed: the edit form below addresses the
    # watch as "first" (presumably resolved to the first watch by the route).
    client.application.config.get('DATASTORE').add_watch(url=test_url)

    # Trigger only on ADDED and REPLACED lines; REMOVED lines are not tracked.
    edit_form = {
        "url": test_url,
        "processor": "text_json_diff",
        "fetch_backend": "html_requests",
        "filter_text_added": "y",       # track added lines
        "filter_text_replaced": "y",    # track replaced lines
        "filter_text_removed": "",      # do not track removed lines
        "time_between_check_use_default": "y",
    }
    edit_response = client.post(
        url_for("ui.ui_edit.edit_page", uuid="first"),
        data=edit_form,
        follow_redirects=True,
    )
    assert b"Updated watch." in edit_response.data

    # --- CHECK 1: first fetch only establishes the baseline ------------------
    watchlist_html = recheck_and_fetch_watchlist()
    assert unread_marker not in watchlist_html  # First check, no change

    mark_everything_viewed()

    # --- CHECK 2: a line disappears -> must NOT trigger (removals ignored) ---
    set_original(excluding='Something irrelevant')
    watchlist_html = recheck_and_fetch_watchlist()
    assert unread_marker not in watchlist_html  # No change (removed line filtered)

    # --- CHECK 3: a line appears -> must trigger (additions tracked) ---------
    set_original(excluding='Something irrelevant', add_line='<p>New exciting feature!</p>')
    watchlist_html = recheck_and_fetch_watchlist()
    assert unread_marker in watchlist_html  # Change detected (added line)

    mark_everything_viewed()

    # --- CHECK 4: identical content to CHECK 3 -> must NOT trigger -----------
    # This is where the old behaviour produced a false positive: the previous
    # MD5 came from the filtered diff while the current one comes from the
    # full content.
    set_original(excluding='Something irrelevant', add_line='<p>New exciting feature!</p>')
    watchlist_html = recheck_and_fetch_watchlist()
    assert unread_marker not in watchlist_html, (
        "False positive! Content identical to previous check but change was detected. "
        "MD5 calculation is inconsistent with diff filtering."
    )

    # --- CHECK 5: a real replacement must still be detected ------------------
    # "Some initial text" becomes "Some modified text".
    modified_content = """<html>
<body>
<p>Some modified text</p>
<p>So let's see what happens.</p>
<p>and a new line!</p>
<p>The golden line</p>
<p>New exciting feature!</p>
<p>A BREAK TO MAKE THE TOP LINE STAY AS "REMOVED" OR IT WILL GET COUNTED AS "CHANGED INTO"</p>
</body>
</html>
"""
    with open("test-datastore/endpoint-content.txt", "w") as endpoint_file:
        endpoint_file.write(modified_content)

    watchlist_html = recheck_and_fetch_watchlist()
    assert unread_marker in watchlist_html  # Change detected (replaced line)

    delete_all_watches(client)
|
||||
|
||||
Reference in New Issue
Block a user