mirror of https://github.com/dgtlmoon/changedetection.io.git
synced 2025-11-22 01:16:12 +00:00

Compare commits: openapi-me...3458-fixin (2 commits: a0a0ec9942, 86befef0cb)
.github/dependabot.yml (vendored, 4 lines changed)
```diff
@@ -11,4 +11,6 @@ updates:
   - package-ecosystem: pip
     directory: /
     schedule:
-      interval: "weekly"
+      interval: "daily"
+    allow:
+      - dependency-name: "apprise"
```
.github/workflows/codeql-analysis.yml (vendored, 6 lines changed)
```diff
@@ -34,7 +34,7 @@ jobs:

    # Initializes the CodeQL tools for scanning.
    - name: Initialize CodeQL
-     uses: github/codeql-action/init@v4
+     uses: github/codeql-action/init@v3
      with:
        languages: ${{ matrix.language }}
        # If you wish to specify custom queries, you can do so here or in a config file.
@@ -45,7 +45,7 @@ jobs:
    # Autobuild attempts to build any compiled languages (C/C++, C#, or Java).
    # If this step fails, then you should remove it and run the build manually (see below)
    - name: Autobuild
-     uses: github/codeql-action/autobuild@v4
+     uses: github/codeql-action/autobuild@v3

    # ℹ️ Command-line programs to run using the OS shell.
    # 📚 https://git.io/JvXDl
@@ -59,4 +59,4 @@ jobs:
    #   make release

    - name: Perform CodeQL Analysis
-     uses: github/codeql-action/analyze@v4
+     uses: github/codeql-action/analyze@v3
```
changedetectionio/__init__.py

```diff
@@ -2,7 +2,7 @@

 # Read more https://github.com/dgtlmoon/changedetection.io/wiki

-__version__ = '0.50.18'
+__version__ = '0.50.15'

 from changedetectionio.strtobool import strtobool
 from json.decoder import JSONDecodeError
```
changedetectionio/api/__init__.py

```diff
@@ -1,7 +1,10 @@
 import copy
+import yaml
 import functools
 from flask import request, abort
 from loguru import logger
+from openapi_core import OpenAPI
+from openapi_core.contrib.flask import FlaskOpenAPIRequest
 from . import api_schema
 from ..model import watch_base

@@ -31,11 +34,7 @@ schema_delete_notification_urls['required'] = ['notification_urls']

 @functools.cache
 def get_openapi_spec():
-    """Lazy load OpenAPI spec and dependencies only when validation is needed."""
     import os
-    import yaml  # Lazy import - only loaded when API validation is actually used
-    from openapi_core import OpenAPI  # Lazy import - saves ~10.7 MB on startup
-
     spec_path = os.path.join(os.path.dirname(__file__), '../../docs/api-spec.yaml')
     with open(spec_path, 'r') as f:
         spec_dict = yaml.safe_load(f)
@@ -50,9 +49,6 @@ def validate_openapi_request(operation_id):
         try:
             # Skip OpenAPI validation for GET requests since they don't have request bodies
             if request.method.upper() != 'GET':
-                # Lazy import - only loaded when actually validating a request
-                from openapi_core.contrib.flask import FlaskOpenAPIRequest
-
                 spec = get_openapi_spec()
                 openapi_request = FlaskOpenAPIRequest(request)
                 result = spec.unmarshal_request(openapi_request)
```
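The `-` lines above remove a deferred-loading pattern: the heavy imports lived inside a `functools.cache`d function so their cost was only paid when API validation first ran. A minimal standalone sketch of that pattern, with an illustrative file path and function name that are not the project's:

```python
import functools


@functools.cache
def get_spec():
    # Deferred import: the module is only loaded on the first call,
    # so programs that never validate requests never pay the cost.
    import yaml  # assumption: PyYAML is installed

    with open('docs/api-spec.yaml') as f:  # illustrative path
        return yaml.safe_load(f)


# First call imports yaml and parses the file; later calls return the
# memoized result without re-importing or re-reading anything.
spec = get_spec()
```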
changedetectionio/processors/magic.py (file deleted)

```diff
@@ -1,125 +0,0 @@
-"""
-Content Type Detection and Stream Classification
-
-This module provides intelligent content-type detection for changedetection.io.
-It addresses the common problem where HTTP Content-Type headers are missing, incorrect,
-or too generic, which would otherwise cause the wrong processor to be used.
-
-The guess_stream_type class combines:
-1. HTTP Content-Type headers (when available and reliable)
-2. Python-magic library for MIME detection (analyzing actual file content)
-3. Content-based pattern matching for text formats (HTML tags, XML declarations, etc.)
-
-This multi-layered approach ensures accurate detection of RSS feeds, JSON, HTML, PDF,
-plain text, CSV, YAML, and XML formats - even when servers provide misleading headers.
-
-Used by: processors/text_json_diff/processor.py and other content processors
-"""
-
-# When to apply the 'cdata to real HTML' hack
-RSS_XML_CONTENT_TYPES = [
-    "application/rss+xml",
-    "application/rdf+xml",
-    "application/atom+xml",
-    "text/rss+xml",  # rare, non-standard
-    "application/x-rss+xml",  # legacy (older feed software)
-    "application/x-atom+xml",  # legacy (older Atom)
-]
-
-# JSON Content-types
-JSON_CONTENT_TYPES = [
-    "application/activity+json",
-    "application/feed+json",
-    "application/json",
-    "application/ld+json",
-    "application/vnd.api+json",
-]
-
-
-# Generic XML Content-types (non-RSS/Atom)
-XML_CONTENT_TYPES = [
-    "text/xml",
-    "application/xml",
-]
-
-HTML_PATTERNS = ['<!doctype html', '<html', '<head', '<body', '<script', '<iframe', '<div']
-
-from loguru import logger
-
-class guess_stream_type():
-    is_pdf = False
-    is_json = False
-    is_html = False
-    is_plaintext = False
-    is_rss = False
-    is_csv = False
-    is_xml = False  # Generic XML, not RSS/Atom
-    is_yaml = False
-
-    def __init__(self, http_content_header, content):
-        import re
-        magic_content_header = http_content_header
-        test_content = content[:200].lower().strip()
-
-        # Remove whitespace between < and tag name for robust detection (handles '< html', '<\nhtml', etc.)
-        test_content_normalized = re.sub(r'<\s+', '<', test_content)
-
-        # Magic will sometimes call text/plain as text/html!
-        magic_result = None
-        try:
-            import magic
-
-            mime = magic.from_buffer(content[:200], mime=True)  # Send the original content
-            logger.debug(f"Guessing mime type, original content_type '{http_content_header}', mime type detected '{mime}'")
-            if mime and "/" in mime:
-                magic_result = mime
-                # Ignore generic/fallback mime types from magic
-                if mime in ['application/octet-stream', 'application/x-empty', 'binary']:
-                    logger.debug(f"Ignoring generic mime type '{mime}' from magic library")
-                # Trust magic for non-text types immediately
-                elif mime not in ['text/html', 'text/plain']:
-                    magic_content_header = mime
-
-        except Exception as e:
-            logger.error(f"Error getting a more precise mime type from 'magic' library ({str(e)}), using content-based detection")
-
-        # Content-based detection (most reliable for text formats)
-        # Check for HTML patterns first - if found, override magic's text/plain
-        has_html_patterns = any(p in test_content_normalized for p in HTML_PATTERNS)
-
-        # Always trust headers first
-        if 'text/plain' in http_content_header:
-            self.is_plaintext = True
-        if any(s in http_content_header for s in RSS_XML_CONTENT_TYPES):
-            self.is_rss = True
-        elif any(s in http_content_header for s in JSON_CONTENT_TYPES):
-            self.is_json = True
-        elif any(s in http_content_header for s in XML_CONTENT_TYPES):
-            # Only mark as generic XML if not already detected as RSS
-            if not self.is_rss:
-                self.is_xml = True
-        elif 'pdf' in magic_content_header:
-            self.is_pdf = True
-        ###
-        elif has_html_patterns or http_content_header == 'text/html':
-            self.is_html = True
-        # If magic says text/plain and we found no HTML patterns, trust it
-        elif magic_result == 'text/plain':
-            self.is_plaintext = True
-            logger.debug(f"Trusting magic's text/plain result (no HTML patterns detected)")
-        elif any(s in magic_content_header for s in JSON_CONTENT_TYPES):
-            self.is_json = True
-        # magic will call a rss document 'xml'
-        elif '<rss' in test_content_normalized or '<feed' in test_content_normalized or any(s in magic_content_header for s in RSS_XML_CONTENT_TYPES):
-            self.is_rss = True
-        elif test_content_normalized.startswith('<?xml') or any(s in magic_content_header for s in XML_CONTENT_TYPES):
-            # Generic XML that's not RSS/Atom (RSS/Atom checked above)
-            self.is_xml = True
-        elif '%pdf-1' in test_content:
-            self.is_pdf = True
-        elif http_content_header.startswith('text/'):
-            self.is_plaintext = True
-        # Only trust magic for 'text' if no other patterns matched
-        elif 'text' in magic_content_header:
-            self.is_plaintext = True
```
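For orientation, a sketch of how a caller could drive the deleted class; the constructor signature is taken from the diff above, but the sample header and body values are invented:

```python
from changedetectionio.processors.magic import guess_stream_type

# An RSS content-type header wins immediately (headers are checked first)
rss = guess_stream_type(http_content_header='application/rss+xml',
                        content='<rss version="2.0"><channel></channel></rss>')
assert rss.is_rss  # 'application/rss+xml' is in RSS_XML_CONTENT_TYPES

# With no usable header, the HTML tag patterns in the body decide
page = guess_stream_type(http_content_header='',
                         content='<!DOCTYPE html><html><body>hi</body></html>')
assert page.is_html
```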
changedetectionio/processors/text_json_diff/processor.py

```diff
@@ -13,8 +13,6 @@ from changedetectionio import html_tools, content_fetchers
 from changedetectionio.blueprint.price_data_follower import PRICE_DATA_TRACK_ACCEPT, PRICE_DATA_TRACK_REJECT
 from loguru import logger

-from changedetectionio.processors.magic import guess_stream_type
-
 urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

 name = 'Webpage Text/HTML, JSON and PDF changes'
@@ -22,9 +20,6 @@ description = 'Detects all text changes where possible'

 json_filter_prefixes = ['json:', 'jq:', 'jqraw:']

-# Assume it's this type if the server says nothing on content-type
-DEFAULT_WHEN_NO_CONTENT_TYPE_HEADER = 'text/html'
-
 class FilterNotFoundInResponse(ValueError):
     def __init__(self, msg, screenshot=None, xpath_data=None):
         self.screenshot = screenshot
@@ -50,9 +45,6 @@ class perform_site_check(difference_detection_processor):
         if not watch:
             raise Exception("Watch no longer exists.")

-        ctype_header = self.fetcher.get_all_headers().get('content-type', DEFAULT_WHEN_NO_CONTENT_TYPE_HEADER).lower()
-        stream_content_type = guess_stream_type(http_content_header=ctype_header, content=self.fetcher.content)
-
         # Unset any existing notification error
         update_obj = {'last_notification_error': False, 'last_error': False}

@@ -62,7 +54,7 @@ class perform_site_check(difference_detection_processor):
         self.xpath_data = self.fetcher.xpath_data

         # Track the content type
-        update_obj['content_type'] = ctype_header
+        update_obj['content_type'] = self.fetcher.get_all_headers().get('content-type', '').lower()

         # Watches added automatically in the queue manager will skip if its the same checksum as the previous run
         # Saves a lot of CPU
@@ -77,12 +69,24 @@ class perform_site_check(difference_detection_processor):
         # https://stackoverflow.com/questions/41817578/basic-method-chaining ?
         # return content().textfilter().jsonextract().checksumcompare() ?

+        is_json = 'application/json' in self.fetcher.get_all_headers().get('content-type', '').lower()
+        is_html = not is_json
+        is_rss = False

+        ctype_header = self.fetcher.get_all_headers().get('content-type', '').lower()
         # Go into RSS preprocess for converting CDATA/comment to usable text
-        if stream_content_type.is_rss:
-            self.fetcher.content = cdata_in_document_to_text(html_content=self.fetcher.content)
+        if any(substring in ctype_header for substring in ['application/xml', 'application/rss', 'text/xml']):
+            if '<rss' in self.fetcher.content[:100].lower():
+                self.fetcher.content = cdata_in_document_to_text(html_content=self.fetcher.content)
+                is_rss = True

-        if watch.is_pdf or stream_content_type.is_pdf:
+        # source: support, basically treat it as plaintext
+        if watch.is_source_type_url:
+            is_html = False
+            is_json = False
+
+        inline_pdf = self.fetcher.get_all_headers().get('content-disposition', '') and '%PDF-1' in self.fetcher.content[:10]
+        if watch.is_pdf or 'application/pdf' in self.fetcher.get_all_headers().get('content-type', '').lower() or inline_pdf:
             from shutil import which
             tool = os.getenv("PDF_TO_HTML_TOOL", "pdftohtml")
             if not which(tool):
```
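The RSS branch above unwraps CDATA so the wrapped text becomes visible to later text filters. The general idea behind such a preprocessing step (this is not the project's `cdata_in_document_to_text` implementation, just an illustration of the transform):

```python
import re

# Unwrap CDATA sections so their payload reads as ordinary element text
feed = '<link><![CDATA[ https://example.com/item/1 ]]></link>'
unwrapped = re.sub(r'<!\[CDATA\[(.*?)\]\]>',
                   lambda m: m.group(1).strip(),
                   feed, flags=re.DOTALL)
assert unwrapped == '<link>https://example.com/item/1</link>'
```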
```diff
@@ -126,12 +130,11 @@ class perform_site_check(difference_detection_processor):
         has_filter_rule = len(include_filters_rule) and len(include_filters_rule[0].strip())
         has_subtractive_selectors = len(subtractive_selectors) and len(subtractive_selectors[0].strip())

-        if stream_content_type.is_json:
-            if not has_filter_rule:
-                # Force a reformat
-                include_filters_rule.append("json:$")
-                has_filter_rule = True
+        if is_json and not has_filter_rule:
+            include_filters_rule.append("json:$")
+            has_filter_rule = True

+        if is_json:
             # Sort the JSON so we dont get false alerts when the content is just re-ordered
             try:
                 self.fetcher.content = json.dumps(json.loads(self.fetcher.content), sort_keys=True)
```
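The `sort_keys=True` round-trip above is what stops re-ordered JSON from registering as a change; a standalone illustration with invented payloads:

```python
import json

a = '{"price": 9.99, "stock": 3}'
b = '{"stock": 3, "price": 9.99}'   # same data, different key order

assert a != b  # a naive checksum of the raw bodies would flag a change

def normalise(s):
    # Parse and re-serialise with sorted keys to get a canonical form
    return json.dumps(json.loads(s), sort_keys=True)

assert normalise(a) == normalise(b)  # identical after canonicalisation
```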
```diff
@@ -143,21 +146,30 @@ class perform_site_check(difference_detection_processor):
             for filter in include_filters_rule:
                 if any(prefix in filter for prefix in json_filter_prefixes):
                     stripped_text_from_html += html_tools.extract_json_as_string(content=self.fetcher.content, json_filter=filter)
-            if stripped_text_from_html:
-                stream_content_type.is_json = True
-                stream_content_type.is_html = False
+                    is_html = False

-        # We have 'watch.is_source_type_url' because we should be able to use selectors on the raw HTML but return just that selected HTML
-        if stream_content_type.is_html or watch.is_source_type_url or stream_content_type.is_plaintext or stream_content_type.is_rss or stream_content_type.is_xml or stream_content_type.is_pdf:
+        if is_html or watch.is_source_type_url:

             # CSS Filter, extract the HTML that matches and feed that into the existing inscriptis::get_text
             self.fetcher.content = html_tools.workarounds_for_obfuscations(self.fetcher.content)
             html_content = self.fetcher.content
+            content_type = self.fetcher.get_all_headers().get('content-type', '').lower()
+            is_attachment = 'attachment' in self.fetcher.get_all_headers().get('content-disposition', '').lower() or 'octet-stream' in content_type

-            # Some kind of "text" but definitely not RSS looking
-            if stream_content_type.is_plaintext:
+            # Try to detect better mime types if its a download or not announced as HTML
+            if is_attachment:
+                logger.debug(f"Got a reply that may be a download or possibly a text attachment, checking..")
+                try:
+                    import magic
+                    mime = magic.from_buffer(html_content, mime=True)
+                    logger.debug(f"Guessing mime type, original content_type '{content_type}', mime type detected '{mime}'")
+                    if mime and "/" in mime:  # looks valid and is a valid mime type
+                        content_type = mime
+                except Exception as e:
+                    logger.error(f"Error getting a more precise mime type from 'magic' library ({str(e)}")
+
+            if 'text/' in content_type and not 'html' in content_type:
                 # Don't run get_text or xpath/css filters on plaintext
-                # We are not HTML, we are not any kind of RSS, doesnt even look like HTML
                 stripped_text_from_html = html_content
             else:
                 # If not JSON, and if it's not text/plain..
```
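The `+` branch above sniffs the real type of forced downloads; a standalone sketch of the same idea using python-magic, with invented header and body values:

```python
import magic  # python-magic, the same library the + lines rely on

headers = {
    'content-type': 'application/octet-stream',  # typical "force download" header
    'content-disposition': 'attachment; filename="notes.txt"',
}
body = b"line one\nline two\n"

content_type = headers['content-type']
if 'octet-stream' in content_type or 'attachment' in headers.get('content-disposition', ''):
    mime = magic.from_buffer(body, mime=True)  # inspects the bytes, e.g. 'text/plain'
    if mime and '/' in mime:
        content_type = mime  # prefer the sniffed type over the misleading header
```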
```diff
@@ -174,13 +186,13 @@ class perform_site_check(difference_detection_processor):
                         html_content += html_tools.xpath_filter(xpath_filter=filter_rule.replace('xpath:', ''),
                                                                 html_content=self.fetcher.content,
                                                                 append_pretty_line_formatting=not watch.is_source_type_url,
-                                                                is_rss=stream_content_type.is_rss)
+                                                                is_rss=is_rss)

                     elif filter_rule.startswith('xpath1:'):
                         html_content += html_tools.xpath1_filter(xpath_filter=filter_rule.replace('xpath1:', ''),
                                                                  html_content=self.fetcher.content,
                                                                  append_pretty_line_formatting=not watch.is_source_type_url,
-                                                                 is_rss=stream_content_type.is_rss)
+                                                                 is_rss=is_rss)
                     else:
                         html_content += html_tools.include_filters(include_filters=filter_rule,
                                                                    html_content=self.fetcher.content,
@@ -199,7 +211,7 @@ class perform_site_check(difference_detection_processor):
             do_anchor = self.datastore.data["settings"]["application"].get("render_anchor_tag_content", False)
             stripped_text_from_html = html_tools.html_to_text(html_content=html_content,
                                                               render_anchor_tag_content=do_anchor,
-                                                              is_rss=stream_content_type.is_rss)  # 1874 activate the <title workaround hack
+                                                              is_rss=is_rss)  # 1874 activate the <title workaround hack

             if watch.get('trim_text_whitespace'):
                 stripped_text_from_html = '\n'.join(line.strip() for line in stripped_text_from_html.replace("\n\n", "\n").splitlines())
@@ -238,7 +250,7 @@ class perform_site_check(difference_detection_processor):

         # Treat pages with no renderable text content as a change? No by default
         empty_pages_are_a_change = self.datastore.data['settings']['application'].get('empty_pages_are_a_change', False)
-        if not stream_content_type.is_json and not empty_pages_are_a_change and len(stripped_text_from_html.strip()) == 0:
+        if not is_json and not empty_pages_are_a_change and len(stripped_text_from_html.strip()) == 0:
             raise content_fetchers.exceptions.ReplyWithContentButNoText(url=url,
                                                                         status_code=self.fetcher.get_last_status_code(),
                                                                         screenshot=self.fetcher.screenshot,
```
Tests: non-text mime types and forced downloads

```diff
@@ -174,8 +174,6 @@ def test_non_text_mime_or_downloads(client, live_server, measure_memory_usage):
    but once the server sends content-type: application/octet-stream (which is usually done to force the browser to show the Download dialog),
    changedetection somehow ignores all line breaks and treats the document file as if everything is on one line.

-    WHAT THIS DOES - makes the system rely on 'magic' to determine what is it
-
    :param client:
    :param live_server:
    :param measure_memory_usage:
@@ -273,7 +271,6 @@ got it\r\n
        url_for("ui.ui_views.preview_page", uuid="first"),
        follow_redirects=True
    )
-
    assert b"some random text that should be split by line\n" in res.data
    ####

@@ -295,36 +292,3 @@ got it\r\n

    res = client.get(url_for("ui.form_delete", uuid="all"), follow_redirects=True)

-# Server says its plaintext, we should always treat it as plaintext
-def test_plaintext_even_if_xml_content(client, live_server, measure_memory_usage):
-
-    with open("test-datastore/endpoint-content.txt", "w") as f:
-        f.write("""<?xml version="1.0" encoding="utf-8"?>
-<resources xmlns:tools="http://schemas.android.com/tools">
-    <!--Activity and fragment titles-->
-    <string name="feed_update_receiver_name">Abonnementen bijwerken</string>
-</resources>
-""")
-
-    test_url = url_for('test_endpoint', content_type="text/plain", _external=True)
-
-    # Add our URL to the import page
-    res = client.post(
-        url_for("imports.import_page"),
-        data={"urls": test_url},
-        follow_redirects=True
-    )
-
-    assert b"1 Imported" in res.data
-
-    wait_for_all_checks(client)
-
-    res = client.get(
-        url_for("ui.ui_views.preview_page", uuid="first"),
-        follow_redirects=True
-    )
-
-    assert b'<string name="feed_update_receiver_name"' in res.data
-
-    res = client.get(url_for("ui.form_delete", uuid="all"), follow_redirects=True)
```
Tests: tag UI limits

```diff
@@ -264,6 +264,8 @@ def test_limit_tag_ui(client, live_server, measure_memory_usage):
    client.get(url_for('ui.mark_all_viewed', tag=tag_uuid), follow_redirects=True)
    wait_for_all_checks(client)

+    with open('/tmp/fuck.html', 'wb') as f:
+        f.write(res.data)
    # Should be only 1 unviewed
    res = client.get(url_for("watchlist.index"))
    assert res.data.count(b' unviewed ') == 1
```
Tests: history consistency

```diff
@@ -3,8 +3,9 @@
 import time
 import os
 import json
+import logging
 from flask import url_for
-from .util import wait_for_all_checks
+from .util import live_server_setup, wait_for_all_checks
 from urllib.parse import urlparse, parse_qs

 def test_consistent_history(client, live_server, measure_memory_usage):
```
Tests: RSS/CDATA handling

```diff
@@ -111,7 +111,7 @@ def test_basic_cdata_rss_markup(client, live_server, measure_memory_usage):

    set_original_cdata_xml()

-    test_url = url_for('test_endpoint', content_type="application/atom+xml; charset=UTF-8", _external=True)
+    test_url = url_for('test_endpoint', content_type="application/xml", _external=True)

    # Add our URL to the import page
    res = client.post(
@@ -139,7 +139,7 @@ def test_rss_xpath_filtering(client, live_server, measure_memory_usage):

    set_original_cdata_xml()

-    test_url = url_for('test_endpoint', content_type="application/atom+xml; charset=UTF-8", _external=True)
+    test_url = url_for('test_endpoint', content_type="application/xml", _external=True)

    res = client.post(
        url_for("ui.ui_views.form_quick_watch_add"),
```
Tests: xpath selectors and RSS feeds

```diff
@@ -1,42 +1,12 @@
 # -*- coding: utf-8 -*-
+import time
 from flask import url_for
-from .util import wait_for_all_checks
-from ..processors.magic import RSS_XML_CONTENT_TYPES
+from .util import live_server_setup, wait_for_all_checks
+from ..html_tools import *


-def set_rss_atom_feed_response(header=''):
-    test_return_data = f"""{header}<!-- Generated on Wed, 08 Oct 2025 08:42:33 -0700, really really honestly -->
-<rss xmlns:atom="http://www.w3.org/2005/Atom" version="2.0">
-<channel>
-<atom:link href="https://store.waterpowered.com/news/collection//" rel="self" type="application/rss+xml"/>
-<title>RSS Feed</title>
-<link>
-<![CDATA[ https://store.waterpowered.com/news/collection// ]]>
-</link>
-<description>
-<![CDATA[ Events and Announcements for ]]>
-</description>
-<language>en-us</language>
-<generator>water News RSS</generator>
-<item>
-<title> 🍁 Lets go discount</title>
-<description><p class="bb_paragraph">ok heres the description</p></description>
-<link>
-<![CDATA[ https://store.waterpowered.com/news/app/1643320/view/511845698831908921 ]]>
-</link>
-<pubDate>Wed, 08 Oct 2025 15:28:55 +0000</pubDate>
-<guid isPermaLink="true">https://store.waterpowered.com/news/app/1643320/view/511845698831908921</guid>
-<enclosure url="https://clan.fastly.waterstatic.com/images/40721482/42822e5f00b2becf520ace9500981bb56f3a89f2.jpg" length="0" type="image/jpeg"/>
-</item>
-</channel>
-</rss>"""
-
-    with open("test-datastore/endpoint-content.txt", "w") as f:
-        f.write(test_return_data)
-
-    return None
-
-
```
```diff
@@ -605,47 +575,3 @@ def test_xpath_20_function_string_join_matches(client, live_server, measure_memo

    client.get(url_for("ui.form_delete", uuid="all"), follow_redirects=True)

-
-def _subtest_xpath_rss(client, content_type='text/html'):
-
-    # Add our URL to the import page
-    test_url = url_for('test_endpoint', content_type=content_type, _external=True)
-    res = client.post(
-        url_for("ui.ui_views.form_quick_watch_add"),
-        data={"url": test_url, "tags": '', 'edit_and_watch_submit_button': 'Edit > Watch'},
-        follow_redirects=True
-    )
-
-    assert b"Watch added in Paused state, saving will unpause" in res.data
-
-    res = client.post(
-        url_for("ui.ui_edit.edit_page", uuid="first", unpause_on_save=1),
-        data={
-            "url": test_url,
-            "include_filters": "xpath://item",
-            "tags": '',
-            "fetch_backend": "html_requests",
-            "time_between_check_use_default": "y",
-        },
-        follow_redirects=True
-    )
-
-    assert b"unpaused" in res.data
-    wait_for_all_checks(client)
-
-    res = client.get(
-        url_for("ui.ui_views.preview_page", uuid="first"),
-        follow_redirects=True
-    )
-
-    assert b"Lets go discount" in res.data, f"When testing for Lets go discount called with content type '{content_type}'"
-    assert b"Events and Announcements" not in res.data, f"When testing for Lets go discount called with content type '{content_type}'"  # It should not be here because thats not our selector target
-
-    client.get(url_for("ui.form_delete", uuid="all"), follow_redirects=True)
-
-
-# Be sure all-in-the-wild types of RSS feeds work with xpath
-def test_rss_xpath(client, live_server):
-    for feed_header in ['', '<?xml version="1.0" encoding="utf-8"?>']:
-        set_rss_atom_feed_response(header=feed_header)
-        for content_type in RSS_XML_CONTENT_TYPES:
-            _subtest_xpath_rss(client, content_type=content_type)
```
requirements.txt

```diff
@@ -12,7 +12,7 @@ flask_wtf~=1.2
 flask~=2.3
 flask-socketio~=5.5.1
 python-socketio~=5.13.0
-python-engineio~=4.12.3
+python-engineio~=4.12.0
 inscriptis~=2.2
 pytz
 timeago~=1.0
@@ -135,7 +135,7 @@ tzdata
 pluggy ~= 1.5

 # Needed for testing, cross-platform for process and system monitoring
-psutil==7.1.0
+psutil==7.0.0

 ruff >= 0.11.2
 pre_commit >= 4.2.0
```