woops

Fixing PDF
Rename arg to be more clear
2025-11-01 07:08:47 +00:00 · 2025-10-08 23:44:07 +02:00 · 2025-10-08 23:18:25 +02:00 · 2025-10-08 23:12:06 +02:00 · 2025-10-08 23:10:40 +02:00 · 2025-10-08 23:00:40 +02:00
6 changed files with 253 additions and 53 deletions
--- a/changedetectionio/processors/magic.py
+++ b/changedetectionio/processors/magic.py
@@ -0,0 +1,138 @@
+"""
+Content Type Detection and Stream Classification
+
+This module provides intelligent content-type detection for changedetection.io.
+It addresses the common problem where HTTP Content-Type headers are missing, incorrect,
+or too generic, which would otherwise cause the wrong processor to be used.
+
+The guess_stream_type class combines:
+1. HTTP Content-Type headers (when available and reliable)
+2. Python-magic library for MIME detection (analyzing actual file content)
+3. Content-based pattern matching for text formats (HTML tags, XML declarations, etc.)
+
+This multi-layered approach ensures accurate detection of RSS feeds, JSON, HTML, PDF,
+plain text, CSV, YAML, and XML formats - even when servers provide misleading headers.
+
+Used by: processors/text_json_diff/processor.py and other content processors
+"""
+
+# When to apply the 'cdata to real HTML' hack
+RSS_XML_CONTENT_TYPES = [
+    "application/rss+xml",
+    "application/rdf+xml",
+    "text/xml",
+    "application/xml",
+    "application/atom+xml",
+    "text/rss+xml",  # rare, non-standard
+    "application/x-rss+xml",  # legacy (older feed software)
+    "application/x-atom+xml",  # legacy (older Atom)
+]
+
+# JSON Content-types
+JSON_CONTENT_TYPES = [
+    "application/activity+json",
+    "application/feed+json",
+    "application/json",
+    "application/ld+json",
+    "application/vnd.api+json",
+]
+
+# CSV Content-types
+CSV_CONTENT_TYPES = [
+    "text/csv",
+    "application/csv",
+]
+
+# Generic XML Content-types (non-RSS/Atom)
+XML_CONTENT_TYPES = [
+    "text/xml",
+    "application/xml",
+]
+
+# YAML Content-types
+YAML_CONTENT_TYPES = [
+    "text/yaml",
+    "text/x-yaml",
+    "application/yaml",
+    "application/x-yaml",
+]
+
+HTML_PATTERNS = ['<!doctype html', '<html', '<head', '<body', '<script', '<iframe', '<div']
+
+import re
+import magic
+from loguru import logger
+
+
+class guess_stream_type():
+    is_pdf = False
+    is_json = False
+    is_html = False
+    is_plaintext = False
+    is_rss = False
+    is_csv = False
+    is_xml = False  # Generic XML, not RSS/Atom
+    is_yaml = False
+
+    def __init__(self, http_content_header, content):
+
+        magic_content_header = http_content_header
+        test_content = content[:200].lower().strip()
+
+        # Remove whitespace between < and tag name for robust detection (handles '< html', '<\nhtml', etc.)
+        test_content_normalized = re.sub(r'<\s+', '<', test_content)
+
+        # Magic will sometimes call text/plain as text/html!
+        magic_result = None
+        try:
+            mime = magic.from_buffer(content[:200], mime=True) # Send the original content
+            logger.debug(f"Guessing mime type, original content_type '{http_content_header}', mime type detected '{mime}'")
+            if mime and "/" in mime:
+                magic_result = mime
+                # Ignore generic/fallback mime types from magic
+                if mime in ['application/octet-stream', 'application/x-empty', 'binary']:
+                    logger.debug(f"Ignoring generic mime type '{mime}' from magic library")
+                # Trust magic for non-text types immediately
+                elif mime not in ['text/html', 'text/plain']:
+                    magic_content_header = mime
+
+        except Exception as e:
+            logger.error(f"Error getting a more precise mime type from 'magic' library ({str(e)}), using content-based detection")
+
+        # Content-based detection (most reliable for text formats)
+        # Check for HTML patterns first - if found, override magic's text/plain
+        has_html_patterns = any(p in test_content_normalized for p in HTML_PATTERNS)
+
+        # Always trust headers first
+        if any(s in http_content_header for s in RSS_XML_CONTENT_TYPES) or any(s in magic_content_header for s in RSS_XML_CONTENT_TYPES):
+            self.is_rss = True
+        elif any(s in http_content_header for s in JSON_CONTENT_TYPES) or any(s in magic_content_header for s in JSON_CONTENT_TYPES):
+            self.is_json = True
+        elif any(s in http_content_header for s in CSV_CONTENT_TYPES) or any(s in magic_content_header for s in CSV_CONTENT_TYPES):
+            self.is_csv = True
+        elif any(s in http_content_header for s in XML_CONTENT_TYPES) or any(s in magic_content_header for s in XML_CONTENT_TYPES):
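+            # Only mark as generic XML if not already detected as RSS (NOTE: both XML types are also listed in RSS_XML_CONTENT_TYPES, so the RSS branch above matches first)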
+            if not self.is_rss:
+                self.is_xml = True
+        elif any(s in http_content_header for s in YAML_CONTENT_TYPES) or any(s in magic_content_header for s in YAML_CONTENT_TYPES):
+            self.is_yaml = True
+        elif 'pdf' in magic_content_header:
+            self.is_pdf = True
+###
+        elif has_html_patterns or http_content_header == 'text/html':
+            self.is_html = True
+        # If magic says text/plain and we found no HTML patterns, trust it
+        elif magic_result == 'text/plain':
+            self.is_plaintext = True
+            logger.debug(f"Trusting magic's text/plain result (no HTML patterns detected)")
+        elif '<rss' in test_content_normalized or '<feed' in test_content_normalized:
+            self.is_rss = True
+        elif test_content_normalized.startswith('<?xml'):
+            # Generic XML that's not RSS/Atom (RSS/Atom checked above)
+            self.is_xml = True
+        elif '%pdf-1' in test_content:
+            self.is_pdf = True
+        # Only trust magic for 'text' if no other patterns matched
+        elif 'text' in magic_content_header:
+            self.is_plaintext = True
+
--- a/changedetectionio/processors/text_json_diff/processor.py
+++ b/changedetectionio/processors/text_json_diff/processor.py
@@ -13,6 +13,8 @@ from changedetectionio import html_tools, content_fetchers
 from changedetectionio.blueprint.price_data_follower import PRICE_DATA_TRACK_ACCEPT, PRICE_DATA_TRACK_REJECT
 from loguru import logger

+from changedetectionio.processors.magic import guess_stream_type
+
 urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

 name = 'Webpage Text/HTML, JSON and PDF changes'
@@ -20,6 +22,9 @@ description = 'Detects all text changes where possible'

 json_filter_prefixes = ['json:', 'jq:', 'jqraw:']

+# Assume it's this type if the server says nothing on content-type
+DEFAULT_WHEN_NO_CONTENT_TYPE_HEADER = 'text/html'
+
 class FilterNotFoundInResponse(ValueError):
    def __init__(self, msg, screenshot=None, xpath_data=None):
        self.screenshot = screenshot
@@ -45,6 +50,9 @@ class perform_site_check(difference_detection_processor):
        if not watch:
            raise Exception("Watch no longer exists.")

+        ctype_header = self.fetcher.get_all_headers().get('content-type', DEFAULT_WHEN_NO_CONTENT_TYPE_HEADER).lower()
+        stream_content_type = guess_stream_type(http_content_header=ctype_header, content=self.fetcher.content)
+
        # Unset any existing notification error
        update_obj = {'last_notification_error': False, 'last_error': False}

@@ -54,7 +62,7 @@ class perform_site_check(difference_detection_processor):
        self.xpath_data = self.fetcher.xpath_data

        # Track the content type
-        update_obj['content_type'] = self.fetcher.get_all_headers().get('content-type', '').lower()
+        update_obj['content_type'] = ctype_header

        # Watches added automatically in the queue manager will skip if its the same checksum as the previous run
        # Saves a lot of CPU
@@ -69,24 +77,12 @@ class perform_site_check(difference_detection_processor):
        # https://stackoverflow.com/questions/41817578/basic-method-chaining ?
        # return content().textfilter().jsonextract().checksumcompare() ?

-        is_json = 'application/json' in self.fetcher.get_all_headers().get('content-type', '').lower()
-        is_html = not is_json
-        is_rss = False

-        ctype_header = self.fetcher.get_all_headers().get('content-type', '').lower()
        # Go into RSS preprocess for converting CDATA/comment to usable text
-        if any(substring in ctype_header for substring in ['application/xml', 'application/rss', 'text/xml']):
-            if '<rss' in self.fetcher.content[:100].lower():
-                self.fetcher.content = cdata_in_document_to_text(html_content=self.fetcher.content)
-                is_rss = True
+        if stream_content_type.is_rss:
+            self.fetcher.content = cdata_in_document_to_text(html_content=self.fetcher.content)

-        # source: support, basically treat it as plaintext
-        if watch.is_source_type_url:
-            is_html = False
-            is_json = False
-
-        inline_pdf = self.fetcher.get_all_headers().get('content-disposition', '') and '%PDF-1' in self.fetcher.content[:10]
-        if watch.is_pdf or 'application/pdf' in self.fetcher.get_all_headers().get('content-type', '').lower() or inline_pdf:
+        if watch.is_pdf or stream_content_type.is_pdf:
            from shutil import which
            tool = os.getenv("PDF_TO_HTML_TOOL", "pdftohtml")
            if not which(tool):
@@ -130,11 +126,12 @@ class perform_site_check(difference_detection_processor):
        has_filter_rule = len(include_filters_rule) and len(include_filters_rule[0].strip())
        has_subtractive_selectors = len(subtractive_selectors) and len(subtractive_selectors[0].strip())

-        if is_json and not has_filter_rule:
-            include_filters_rule.append("json:$")
-            has_filter_rule = True
+        if stream_content_type.is_json:
+            if not has_filter_rule:
+                # Force a reformat
+                include_filters_rule.append("json:$")
+                has_filter_rule = True

-        if is_json:
            # Sort the JSON so we dont get false alerts when the content is just re-ordered
            try:
                self.fetcher.content = json.dumps(json.loads(self.fetcher.content), sort_keys=True)
@@ -142,34 +139,25 @@ class perform_site_check(difference_detection_processor):
                # Might have just been a snippet, or otherwise bad JSON, continue
                pass

-        if has_filter_rule:
-            for filter in include_filters_rule:
-                if any(prefix in filter for prefix in json_filter_prefixes):
-                    stripped_text_from_html += html_tools.extract_json_as_string(content=self.fetcher.content, json_filter=filter)
-                    is_html = False
+            if has_filter_rule:
+                for filter in include_filters_rule:
+                    if any(prefix in filter for prefix in json_filter_prefixes):
+                        stripped_text_from_html += html_tools.extract_json_as_string(content=self.fetcher.content, json_filter=filter)
+                        if stripped_text_from_html:
+                            stream_content_type.is_json = True
+                            stream_content_type.is_html = False

-        if is_html or watch.is_source_type_url:
+        # We have 'watch.is_source_type_url' because we should be able to use selectors on the raw HTML but return just that selected HTML
+        if stream_content_type.is_html or watch.is_source_type_url or stream_content_type.is_plaintext or stream_content_type.is_rss or stream_content_type.is_xml or stream_content_type.is_pdf:

            # CSS Filter, extract the HTML that matches and feed that into the existing inscriptis::get_text
            self.fetcher.content = html_tools.workarounds_for_obfuscations(self.fetcher.content)
            html_content = self.fetcher.content
-            content_type = self.fetcher.get_all_headers().get('content-type', '').lower()
-            is_attachment = 'attachment' in self.fetcher.get_all_headers().get('content-disposition', '').lower() or 'octet-stream' in content_type

-            # Try to detect better mime types if its a download or not announced as HTML
-            if is_attachment:
-                logger.debug(f"Got a reply that may be a download or possibly a text attachment, checking..")
-                try:
-                    import magic
-                    mime = magic.from_buffer(html_content, mime=True)
-                    logger.debug(f"Guessing mime type, original content_type '{content_type}', mime type detected '{mime}'")
-                    if mime and "/" in mime: # looks valid and is a valid mime type
-                        content_type = mime
-                except Exception as e:
-                    logger.error(f"Error getting a more precise mime type from 'magic' library ({str(e)}")
-
-            if 'text/' in content_type and not 'html' in content_type:
+            # Some kind of "text" but definitely not RSS looking
+            if stream_content_type.is_plaintext:
                # Don't run get_text or xpath/css filters on plaintext
+                # We are not HTML, we are not any kind of RSS, and it doesn't even look like HTML
                stripped_text_from_html = html_content
            else:
                # If not JSON, and if it's not text/plain..
@@ -186,13 +174,13 @@ class perform_site_check(difference_detection_processor):
                            html_content += html_tools.xpath_filter(xpath_filter=filter_rule.replace('xpath:', ''),
                                                                    html_content=self.fetcher.content,
                                                                    append_pretty_line_formatting=not watch.is_source_type_url,
-                                                                    is_rss=is_rss)
+                                                                    is_rss=stream_content_type.is_rss)

                        elif filter_rule.startswith('xpath1:'):
                            html_content += html_tools.xpath1_filter(xpath_filter=filter_rule.replace('xpath1:', ''),
                                                                     html_content=self.fetcher.content,
                                                                     append_pretty_line_formatting=not watch.is_source_type_url,
-                                                                     is_rss=is_rss)
+                                                                     is_rss=stream_content_type.is_rss)
                        else:
                            html_content += html_tools.include_filters(include_filters=filter_rule,
                                                                       html_content=self.fetcher.content,
@@ -211,7 +199,7 @@ class perform_site_check(difference_detection_processor):
                    do_anchor = self.datastore.data["settings"]["application"].get("render_anchor_tag_content", False)
                    stripped_text_from_html = html_tools.html_to_text(html_content=html_content,
                                                                      render_anchor_tag_content=do_anchor,
-                                                                      is_rss=is_rss)  # 1874 activate the <title workaround hack
+                                                                      is_rss=stream_content_type.is_rss)  # 1874 activate the <title workaround hack

        if watch.get('trim_text_whitespace'):
            stripped_text_from_html = '\n'.join(line.strip() for line in stripped_text_from_html.replace("\n\n", "\n").splitlines())
@@ -250,7 +238,7 @@ class perform_site_check(difference_detection_processor):

        # Treat pages with no renderable text content as a change? No by default
        empty_pages_are_a_change = self.datastore.data['settings']['application'].get('empty_pages_are_a_change', False)
-        if not is_json and not empty_pages_are_a_change and len(stripped_text_from_html.strip()) == 0:
+        if not stream_content_type.is_json and not empty_pages_are_a_change and len(stripped_text_from_html.strip()) == 0:
            raise content_fetchers.exceptions.ReplyWithContentButNoText(url=url,
                                                            status_code=self.fetcher.get_last_status_code(),
                                                            screenshot=self.fetcher.screenshot,
--- a/changedetectionio/tests/test_backend.py
+++ b/changedetectionio/tests/test_backend.py
@@ -174,6 +174,8 @@ def test_non_text_mime_or_downloads(client, live_server, measure_memory_usage):
    but once the server sends content-type: application/octet-stream (which is usually done to force the browser to show the Download dialog),
    changedetection somehow ignores all line breaks and treats the document file as if everything is on one line.

+    WHAT THIS DOES - makes the system rely on 'magic' to determine what it is
+
    :param client:
    :param live_server:
    :param measure_memory_usage:
@@ -271,6 +273,7 @@ got it\r\n
        url_for("ui.ui_views.preview_page", uuid="first"),
        follow_redirects=True
    )
+
    assert b"some random text that should be split by line\n" in res.data
    ####

--- a/changedetectionio/tests/test_group.py
+++ b/changedetectionio/tests/test_group.py
@@ -264,8 +264,6 @@ def test_limit_tag_ui(client, live_server, measure_memory_usage):
    client.get(url_for('ui.mark_all_viewed', tag=tag_uuid), follow_redirects=True)
    wait_for_all_checks(client)

-    with open('/tmp/fuck.html', 'wb') as f:
-        f.write(res.data)
    # Should be only 1 unviewed
    res = client.get(url_for("watchlist.index"))
    assert res.data.count(b' unviewed ') == 1
--- a/changedetectionio/tests/test_history_consistency.py
+++ b/changedetectionio/tests/test_history_consistency.py
@@ -3,9 +3,8 @@
 import time
 import os
 import json
-import logging
 from flask import url_for
-from .util import live_server_setup, wait_for_all_checks
+from .util import wait_for_all_checks
 from urllib.parse import urlparse, parse_qs

 def test_consistent_history(client, live_server, measure_memory_usage):
--- a/changedetectionio/tests/test_xpath_selector.py
+++ b/changedetectionio/tests/test_xpath_selector.py
@@ -1,12 +1,42 @@
 # -*- coding: utf-8 -*-

-import time
+
 from flask import url_for
-from .util import live_server_setup, wait_for_all_checks
-
-from ..html_tools import *
+from .util import  wait_for_all_checks
+from ..processors.magic import RSS_XML_CONTENT_TYPES


+def set_rss_atom_feed_response(header=''):
+    test_return_data = f"""{header}<!-- Generated on Wed, 08 Oct 2025 08:42:33 -0700, really really honestly  -->
+<rss xmlns:atom="http://www.w3.org/2005/Atom" version="2.0">
+<channel>
+    <atom:link href="https://store.waterpowered.com/news/collection//" rel="self" type="application/rss+xml"/>
+    <title>RSS Feed</title>
+    <link>
+        <![CDATA[ https://store.waterpowered.com/news/collection// ]]>
+    </link>
+    <description>
+        <![CDATA[ Events and Announcements for ]]>
+    </description>
+    <language>en-us</language>
+    <generator>water News RSS</generator>
+    <item>
+        <title> 🍁 Lets go discount</title>
+        <description><p class="bb_paragraph">ok heres the description</p></description>
+        <link>
+        <![CDATA[ https://store.waterpowered.com/news/app/1643320/view/511845698831908921 ]]>
+        </link>
+        <pubDate>Wed, 08 Oct 2025 15:28:55 +0000</pubDate>
+        <guid isPermaLink="true">https://store.waterpowered.com/news/app/1643320/view/511845698831908921</guid>
+        <enclosure url="https://clan.fastly.waterstatic.com/images/40721482/42822e5f00b2becf520ace9500981bb56f3a89f2.jpg" length="0" type="image/jpeg"/>
+    </item>
+</channel>
+</rss>"""
+
+    with open("test-datastore/endpoint-content.txt", "w") as f:
+        f.write(test_return_data)
+
+    return None



@@ -575,3 +605,47 @@ def test_xpath_20_function_string_join_matches(client, live_server, measure_memo

    client.get(url_for("ui.form_delete", uuid="all"), follow_redirects=True)

+
+def _subtest_xpath_rss(client, content_type='text/html'):
+
+    # Add our URL to the import page
+    test_url = url_for('test_endpoint', content_type=content_type, _external=True)
+    res = client.post(
+        url_for("ui.ui_views.form_quick_watch_add"),
+        data={"url": test_url, "tags": '', 'edit_and_watch_submit_button': 'Edit > Watch'},
+        follow_redirects=True
+    )
+
+    assert b"Watch added in Paused state, saving will unpause" in res.data
+
+    res = client.post(
+        url_for("ui.ui_edit.edit_page", uuid="first", unpause_on_save=1),
+        data={
+            "url": test_url,
+            "include_filters": "xpath://item",
+            "tags": '',
+            "fetch_backend": "html_requests",
+            "time_between_check_use_default": "y",
+        },
+        follow_redirects=True
+    )
+
+    assert b"unpaused" in res.data
+    wait_for_all_checks(client)
+
+    res = client.get(
+        url_for("ui.ui_views.preview_page", uuid="first"),
+        follow_redirects=True
+    )
+
+    assert b"Lets go discount" in res.data, f"When testing for Lets go discount called with content type '{content_type}'"
+    assert b"Events and Announcements" not in res.data, f"When testing for Lets go discount called with content type '{content_type}'" # It should not be here because that's not our selector target
+
+    client.get(url_for("ui.form_delete", uuid="all"), follow_redirects=True)
+
+# Be sure all-in-the-wild types of RSS feeds work with xpath
+def test_rss_xpath(client, live_server):
+    for feed_header in ['', '<?xml version="1.0" encoding="utf-8"?>']:
+        set_rss_atom_feed_response(header=feed_header)
+        for content_type in RSS_XML_CONTENT_TYPES:
+            _subtest_xpath_rss(client, content_type=content_type)
Author	SHA1	Message	Date
dgtlmoon	a3b40234fc	woops Some checks failed Publish Python 🐍distribution 📦 to PyPI and TestPyPI / Build distribution 📦 (push) Has been cancelled Details ChangeDetection.io App Test / lint-code (push) Has been cancelled Details Publish Python 🐍distribution 📦 to PyPI and TestPyPI / Test the built 📦 package works basically. (push) Has been cancelled Details Publish Python 🐍distribution 📦 to PyPI and TestPyPI / Publish Python 🐍 distribution 📦 to PyPI (push) Has been cancelled Details ChangeDetection.io App Test / test-application-3-10 (push) Has been cancelled Details ChangeDetection.io App Test / test-application-3-11 (push) Has been cancelled Details ChangeDetection.io App Test / test-application-3-12 (push) Has been cancelled Details ChangeDetection.io App Test / test-application-3-13 (push) Has been cancelled Details	2025-10-08 23:44:07 +02:00
dgtlmoon	4c1f089a06	Fixing PDF	2025-10-08 23:18:25 +02:00
dgtlmoon	09b052c6b1	Rename arg to be more clear	2025-10-08 23:12:06 +02:00
dgtlmoon	bcde39253e	tweak info	2025-10-08 23:10:40 +02:00
dgtlmoon	f770fa3765	WIP	2025-10-08 23:00:40 +02:00
dgtlmoon	b6c6f3a312	WIP	2025-10-08 22:26:12 +02:00
dgtlmoon	6de0b312e7	Misc fixes	2025-10-08 22:24:17 +02:00
dgtlmoon	013f3117b6	Moving mime/content type detection to its own helper	2025-10-08 22:07:40 +02:00
dgtlmoon	edd64fb1dd	Remove debug :-)	2025-10-08 19:15:25 +02:00
dgtlmoon	82833abf1a	move logic	2025-10-08 19:14:30 +02:00
dgtlmoon	6aa253df5f	Improve JSON detection	2025-10-08 19:04:10 +02:00
dgtlmoon	15912999b6	Improving RSS/HTML/Plaintext detection	2025-10-08 18:58:13 +02:00