Re #3483 Don't touch original content if no filters found

2025-12-03 22:55:33 +00:00 · 2025-10-10 13:04:47 +02:00
2 changed files with 13 additions and 3 deletions
--- a/changedetectionio/html_tools.py
+++ b/changedetectionio/html_tools.py
@@ -1,5 +1,4 @@
 from loguru import logger
-from lxml import etree
 from typing import List
 import html
 import json
@@ -58,13 +57,17 @@ def include_filters(include_filters, html_content, append_pretty_line_formatting

    return html_block

-def subtractive_css_selector(css_selector, html_content):
+def subtractive_css_selector(css_selector, content):
    from bs4 import BeautifulSoup
-    soup = BeautifulSoup(html_content, "html.parser")
+    soup = BeautifulSoup(content, "html.parser")

    # So that the elements dont shift their index, build a list of elements here which will be pointers to their place in the DOM
    elements_to_remove = soup.select(css_selector)

+    if not elements_to_remove:
+        # Better to return the original than rebuild with BeautifulSoup
+        return content
+
    # Then, remove them in a separate loop
    for item in elements_to_remove:
        item.decompose()
@@ -72,6 +75,7 @@ def subtractive_css_selector(css_selector, html_content):
    return str(soup)

 def subtractive_xpath_selector(selectors: List[str], html_content: str) -> str:
+    from lxml import etree
    # Parse the HTML content using lxml
    html_tree = etree.HTML(html_content)

@@ -83,6 +87,10 @@ def subtractive_xpath_selector(selectors: List[str], html_content: str) -> str:
        # Collect elements for each selector
        elements_to_remove.extend(html_tree.xpath(selector))

+    # If no elements were found, return the original HTML content
+    if not elements_to_remove:
+        return html_content
+
    # Then, remove them in a separate loop
    for element in elements_to_remove:
        if element.getparent() is not None:  # Ensure the element has a parent before removing
--- a/changedetectionio/processors/text_json_diff/processor.py
+++ b/changedetectionio/processors/text_json_diff/processor.py
@@ -404,6 +404,8 @@ class perform_site_check(difference_detection_processor):
        html_content = content

        # Apply include filters (CSS, XPath, JSON)
+        # Except for plaintext (in case they tried to confuse the system, it will HTML escape)
+        #if not stream_content_type.is_plaintext:
        if filter_config.has_include_filters:
            html_content = content_processor.apply_include_filters(content, stream_content_type)