first kept paragraph
+noisy advertisement text
+second kept paragraph
+diff --git a/changedetectionio/processors/text_json_diff/processor.py b/changedetectionio/processors/text_json_diff/processor.py index 2fe438f4..5b7312d2 100644 --- a/changedetectionio/processors/text_json_diff/processor.py +++ b/changedetectionio/processors/text_json_diff/processor.py @@ -495,16 +495,17 @@ class perform_site_check(difference_detection_processor): # Start with content reference, avoid copy until modification html_content = content - # Apply include filters (CSS, XPath, JSON) - # Except for plaintext (incase they tried to confuse the system, it will HTML escape - #if not stream_content_type.is_plaintext: - if filter_config.has_include_filters: - html_content = content_processor.apply_include_filters(content, stream_content_type) - - # Apply subtractive selectors + # Apply subtractive selectors first so include filters operate on already-cleaned content. + # Otherwise a subtractive selector that relies on ancestor context (e.g. ".main .ads") + # cannot match after the include filter has extracted the inner element and stripped + # the parent wrapper. if filter_config.has_subtractive_selectors: html_content = content_processor.apply_subtractive_selectors(html_content) + # Apply include filters (CSS, XPath, JSON) + if filter_config.has_include_filters: + html_content = content_processor.apply_include_filters(html_content, stream_content_type) + # === TEXT EXTRACTION === if watch.is_source_type_url: # For source URLs, keep raw content diff --git a/changedetectionio/tests/test_element_removal.py b/changedetectionio/tests/test_element_removal.py index 5fde2206..562e8453 100644 --- a/changedetectionio/tests/test_element_removal.py +++ b/changedetectionio/tests/test_element_removal.py @@ -251,3 +251,41 @@ body > table > tr:nth-child(3) > td:nth-child(3)""", # First column should exist assert b"Emil" in res.data + +# Re PR #978: subtractive_selectors must run BEFORE include_filters so that selectors +# relying on ancestor context (e.g. ".main .ad") can still match. If include runs first, +# the ancestor wrapper is stripped and the subtractive selector matches nothing. +def test_subtractive_selectors_applied_before_include_filters(client, live_server, measure_memory_usage, datastore_path): + page_html = """
+first kept paragraph
+noisy advertisement text
+second kept paragraph
+