Compare commits

...

4 Commits

Author SHA1 Message Date
dgtlmoon
365df6cf81 Merge branch 'master' into enhanced-RSS-and-register-all-namespaces 2024-09-11 11:32:00 +02:00
dgtlmoon
76729f2106 repair error handling 2024-09-11 10:14:31 +02:00
dgtlmoon
591dd5b570 Adding validation 2024-09-10 19:10:47 +02:00
dgtlmoon
919812bf8b Automatically apply any XML/RSS namespaces 2024-09-10 14:31:09 +02:00
5 changed files with 98 additions and 36 deletions

View File

@@ -729,6 +729,12 @@ def changedetection_app(config=None, datastore_o=None):
for p in datastore.proxy_list:
form.proxy.choices.append(tuple((p, datastore.proxy_list[p]['label'])))
# Add some HTML to be used for form validation
if datastore.data['watching'][uuid].history.keys():
timestamp = list(datastore.data['watching'][uuid].history.keys())[-1]
form.last_html_for_form_validation = datastore.data['watching'][uuid].get_fetched_html(timestamp)
else:
form.last_html_for_form_validation = "<html><body></body></html>"
if request.method == 'POST' and form.validate():

View File

@@ -1,6 +1,9 @@
import os
import re
import elementpath
from changedetectionio.html_tools import xpath_filter, xpath1_filter
from changedetectionio.strtobool import strtobool
from wtforms import (
@@ -322,52 +325,39 @@ class ValidateCSSJSONXPATHInput(object):
self.allow_json = allow_json
def __call__(self, form, field):
from lxml.etree import XPathEvalError
if isinstance(field.data, str):
data = [field.data]
else:
data = field.data
for line in data:
# Nothing to see here
if not len(line.strip()):
return
line = line.strip()
# Does it look like XPath?
if line.strip()[0] == '/' or line.strip().startswith('xpath:'):
if not line:
continue
if line.startswith('xpath') or line.startswith('/'):
if not self.allow_xpath:
raise ValidationError("XPath not permitted in this field!")
from lxml import etree, html
import elementpath
# xpath 2.0-3.1
from elementpath.xpath3 import XPath3Parser
tree = html.fromstring("<html></html>")
if line.startswith('xpath1:'):
filter_function = xpath1_filter
else:
line = line.replace('xpath:', '')
filter_function = xpath_filter
try:
elementpath.select(tree, line.strip(), parser=XPath3Parser)
except elementpath.ElementPathError as e:
# Call the determined function
res = filter_function(xpath_filter=line, html_content=form.last_html_for_form_validation)
# It's OK if this is an empty result, we just want to check that it doesn't crash the parser
except (elementpath.ElementPathError,XPathEvalError) as e:
message = field.gettext('\'%s\' is not a valid XPath expression. (%s)')
raise ValidationError(message % (line, str(e)))
except:
except Exception as e:
raise ValidationError("A system-error occurred when validating your XPath expression")
if line.strip().startswith('xpath1:'):
if not self.allow_xpath:
raise ValidationError("XPath not permitted in this field!")
from lxml import etree, html
tree = html.fromstring("<html></html>")
line = re.sub(r'^xpath1:', '', line)
try:
tree.xpath(line.strip())
except etree.XPathEvalError as e:
message = field.gettext('\'%s\' is not a valid XPath expression. (%s)')
raise ValidationError(message % (line, str(e)))
except:
raise ValidationError("A system-error occurred when validating your XPath expression")
if 'json:' in line:
elif 'json:' in line:
if not self.allow_json:
raise ValidationError("JSONPath not permitted in this field!")
@@ -392,7 +382,7 @@ class ValidateCSSJSONXPATHInput(object):
if not self.allow_json:
raise ValidationError("jq not permitted in this field!")
if 'jq:' in line:
elif line.startswith('jq:'):
try:
import jq
except ModuleNotFoundError:

View File

@@ -8,6 +8,7 @@ from xml.sax.saxutils import escape as xml_escape
import json
import re
from loguru import logger
# HTML added to be sure each result matching a filter (.example) gets converted to a new line by Inscriptis
TEXT_FILTER_LIST_LINE_SUFFIX = "<br>"
@@ -108,6 +109,20 @@ def elementpath_tostring(obj):
return str(obj)
def extract_namespaces(xml_content):
    """
    Return a dict of every namespace prefix declared in *xml_content*.

    The document is streamed with lxml's iterparse listening only for
    'start-ns' events, so the full tree is never materialised. If the
    same prefix is declared more than once, the last declaration wins.
    """
    from io import BytesIO
    from lxml import etree

    ns_events = etree.iterparse(BytesIO(xml_content), events=('start-ns',))
    return {prefix: uri for _event, (prefix, uri) in ns_events}
# Return str Utf-8 of matched rules
def xpath_filter(xpath_filter, html_content, append_pretty_line_formatting=False, is_rss=False):
from lxml import etree, html
@@ -123,7 +138,14 @@ def xpath_filter(xpath_filter, html_content, append_pretty_line_formatting=False
tree = html.fromstring(bytes(html_content, encoding='utf-8'), parser=parser)
html_block = ""
r = elementpath.select(tree, xpath_filter.strip(), namespaces={'re': 'http://exslt.org/regular-expressions'}, parser=XPath3Parser)
# Automatically extract all namespaces from the XML content
namespaces = {'re': 'http://exslt.org/regular-expressions'}
try:
namespaces.update(extract_namespaces(html_content.encode('utf-8')))
except Exception as e:
logger.warning(f"Problem extracting namespaces from HTMl/XML content {str(e)}")
r = elementpath.select(tree, xpath_filter.strip(), namespaces=namespaces, parser=XPath3Parser)
#@note: //title/text() won't work where <title>CDATA..
if type(r) != list:

View File

@@ -77,11 +77,12 @@ class perform_site_check(difference_detection_processor):
ctype_header = self.fetcher.get_all_headers().get('content-type', '').lower()
# Go into RSS preprocess for converting CDATA/comment to usable text
if any(substring in ctype_header for substring in ['application/xml', 'application/rss', 'text/xml']):
if '<rss' in self.fetcher.content[:100].lower():
# Ctype_header could be unset if we are just reprocessing the existing content
if any(substring in ctype_header for substring in ['application/xml', 'application/rss', 'text/xml']) or not ctype_header:
top_text = self.fetcher.content[:200].lower().strip()
if '<rss' in top_text or 'search.yahoo.com/mrss/' in top_text:
self.fetcher.content = cdata_in_document_to_text(html_content=self.fetcher.content)
is_rss = True
# source: support, basically treat it as plaintext
if watch.is_source_type_url:
is_html = False

View File

@@ -164,3 +164,46 @@ def test_rss_xpath_filtering(client, live_server, measure_memory_usage):
assert b'Some other description' not in res.data # Should NOT be selected by the xpath
res = client.get(url_for("form_delete", uuid="all"), follow_redirects=True)
def test_namespace_selectors(live_server, client):
    """Verify that namespaced XPath selectors (media:) pass form validation and filter correctly."""
    set_original_cdata_xml()
    #live_server_setup(live_server)

    test_url = url_for('test_endpoint', content_type="application/xml", _external=True)

    response = client.post(
        url_for("import_page"),
        data={"urls": test_url},
        follow_redirects=True,
    )
    assert b"1 Imported" in response.data
    wait_for_all_checks(client)

    watch_uuid = extract_UUID_from_client(client)

    # because it will look for the namespaced stuff during form validation,
    # but on the first check it won't exist..
    client.post(
        url_for("edit_page", uuid=watch_uuid),
        data={
            "include_filters": "//media:thumbnail/@url",
            "fetch_backend": "html_requests",
            "headers": "",
            "proxy": "no-proxy",
            "tags": "",
            "url": test_url,
        },
        follow_redirects=True,
    )
    wait_for_all_checks(client)

    response = client.get(
        url_for("preview_page", uuid="first"),
        follow_redirects=True,
    )
    assert b'CDATA' not in response.data
    assert b'<![' not in response.data
    assert b'https://testsite.com/thumbnail-c224e10d81488e818701c981da04869e.jpg' in response.data

    client.get(url_for("form_delete", uuid="all"), follow_redirects=True)