diff --git a/changedetectionio/processors/text_json_diff/processor.py b/changedetectionio/processors/text_json_diff/processor.py index 2fe438f4..5b7312d2 100644 --- a/changedetectionio/processors/text_json_diff/processor.py +++ b/changedetectionio/processors/text_json_diff/processor.py @@ -495,16 +495,17 @@ class perform_site_check(difference_detection_processor): # Start with content reference, avoid copy until modification html_content = content - # Apply include filters (CSS, XPath, JSON) - # Except for plaintext (incase they tried to confuse the system, it will HTML escape - #if not stream_content_type.is_plaintext: - if filter_config.has_include_filters: - html_content = content_processor.apply_include_filters(content, stream_content_type) - - # Apply subtractive selectors + # Apply subtractive selectors first so include filters operate on already-cleaned content. + # Otherwise a subtractive selector that relies on ancestor context (e.g. ".main .ads") + # cannot match after the include filter has extracted the inner element and stripped + # the parent wrapper. if filter_config.has_subtractive_selectors: html_content = content_processor.apply_subtractive_selectors(html_content) + # Apply include filters (CSS, XPath, JSON) + if filter_config.has_include_filters: + html_content = content_processor.apply_include_filters(html_content, stream_content_type) + # === TEXT EXTRACTION === if watch.is_source_type_url: # For source URLs, keep raw content diff --git a/changedetectionio/tests/test_element_removal.py b/changedetectionio/tests/test_element_removal.py index 5fde2206..562e8453 100644 --- a/changedetectionio/tests/test_element_removal.py +++ b/changedetectionio/tests/test_element_removal.py @@ -251,3 +251,41 @@ body > table > tr:nth-child(3) > td:nth-child(3)""", # First column should exist assert b"Emil" in res.data + +# Re PR #978: subtractive_selectors must run BEFORE include_filters so that selectors +# relying on ancestor context (e.g. ".main .ad") can still match. If include runs first, +# the ancestor wrapper is stripped and the subtractive selector matches nothing. +def test_subtractive_selectors_applied_before_include_filters(client, live_server, measure_memory_usage, datastore_path): + page_html = """ +
+

first kept paragraph

+ +

second kept paragraph

+
+ +""" + with open(os.path.join(datastore_path, "endpoint-content.txt"), "w") as f: + f.write(page_html) + + test_url = url_for("test_endpoint", _external=True) + client.application.config.get('DATASTORE').add_watch( + url=test_url, + extras={ + # Include filter strips the .main wrapper from the output + "include_filters": [".main p"], + # Subtractive selector depends on the .main ancestor — only effective if it runs first + "subtractive_selectors": [".main .advertisement"], + }, + ) + client.get(url_for("ui.form_watch_checknow"), follow_redirects=True) + wait_for_all_checks(client) + + res = client.get( + url_for("ui.ui_preview.preview_page", uuid="first"), + follow_redirects=True, + ) + + assert b"first kept paragraph" in res.data + assert b"second kept paragraph" in res.data + # The bug: ad survives if include filter runs first + assert b"noisy advertisement text" not in res.data