mirror of
https://github.com/dgtlmoon/changedetection.io.git
synced 2026-05-29 13:01:08 +00:00
Text filters - Process subtractive_selectors first (#4142)
This commit is contained in:
@@ -495,16 +495,17 @@ class perform_site_check(difference_detection_processor):
|
||||
# Start with content reference, avoid copy until modification
|
||||
html_content = content
|
||||
|
||||
# Apply include filters (CSS, XPath, JSON)
|
||||
# Except for plaintext (incase they tried to confuse the system, it will HTML escape
|
||||
#if not stream_content_type.is_plaintext:
|
||||
if filter_config.has_include_filters:
|
||||
html_content = content_processor.apply_include_filters(content, stream_content_type)
|
||||
|
||||
# Apply subtractive selectors
|
||||
# Apply subtractive selectors first so include filters operate on already-cleaned content.
|
||||
# Otherwise a subtractive selector that relies on ancestor context (e.g. ".main .ads")
|
||||
# cannot match after the include filter has extracted the inner element and stripped
|
||||
# the parent wrapper.
|
||||
if filter_config.has_subtractive_selectors:
|
||||
html_content = content_processor.apply_subtractive_selectors(html_content)
|
||||
|
||||
# Apply include filters (CSS, XPath, JSON)
|
||||
if filter_config.has_include_filters:
|
||||
html_content = content_processor.apply_include_filters(html_content, stream_content_type)
|
||||
|
||||
# === TEXT EXTRACTION ===
|
||||
if watch.is_source_type_url:
|
||||
# For source URLs, keep raw content
|
||||
|
||||
@@ -251,3 +251,41 @@ body > table > tr:nth-child(3) > td:nth-child(3)""",
|
||||
# First column should exist
|
||||
assert b"Emil" in res.data
|
||||
|
||||
|
||||
# Re PR #978: subtractive_selectors must run BEFORE include_filters so that selectors
|
||||
# relying on ancestor context (e.g. ".main .ad") can still match. If include runs first,
|
||||
# the ancestor wrapper is stripped and the subtractive selector matches nothing.
|
||||
def test_subtractive_selectors_applied_before_include_filters(client, live_server, measure_memory_usage, datastore_path):
|
||||
page_html = """<html><body>
|
||||
<div class="main">
|
||||
<p class="keep">first kept paragraph</p>
|
||||
<p class="advertisement">noisy advertisement text</p>
|
||||
<p class="keep">second kept paragraph</p>
|
||||
</div>
|
||||
</body></html>
|
||||
"""
|
||||
with open(os.path.join(datastore_path, "endpoint-content.txt"), "w") as f:
|
||||
f.write(page_html)
|
||||
|
||||
test_url = url_for("test_endpoint", _external=True)
|
||||
client.application.config.get('DATASTORE').add_watch(
|
||||
url=test_url,
|
||||
extras={
|
||||
# Include filter strips the .main wrapper from the output
|
||||
"include_filters": [".main p"],
|
||||
# Subtractive selector depends on the .main ancestor — only effective if it runs first
|
||||
"subtractive_selectors": [".main .advertisement"],
|
||||
},
|
||||
)
|
||||
client.get(url_for("ui.form_watch_checknow"), follow_redirects=True)
|
||||
wait_for_all_checks(client)
|
||||
|
||||
res = client.get(
|
||||
url_for("ui.ui_preview.preview_page", uuid="first"),
|
||||
follow_redirects=True,
|
||||
)
|
||||
|
||||
assert b"first kept paragraph" in res.data
|
||||
assert b"second kept paragraph" in res.data
|
||||
# The bug: ad survives if include filter runs first
|
||||
assert b"noisy advertisement text" not in res.data
|
||||
|
||||
Reference in New Issue
Block a user