Refactor content type detection, fixing more xpath issues for RSS types (#3465) #3462 #3391

2025-12-12 02:55:43 +00:00 · 2025-10-09 00:14:28 +02:00
parent 584b6e378d
commit f7dfc9bbb8
6 changed files with 253 additions and 53 deletions
--- a/changedetectionio/processors/magic.py
+++ b/changedetectionio/processors/magic.py
@@ -0,0 +1,138 @@
+"""
+Content Type Detection and Stream Classification
+
+This module provides intelligent content-type detection for changedetection.io.
+It addresses the common problem where HTTP Content-Type headers are missing, incorrect,
+or too generic, which would otherwise cause the wrong processor to be used.
+
+The guess_stream_type class combines:
+1. HTTP Content-Type headers (when available and reliable)
+2. Python-magic library for MIME detection (analyzing actual file content)
+3. Content-based pattern matching for text formats (HTML tags, XML declarations, etc.)
+
+This multi-layered approach ensures accurate detection of RSS feeds, JSON, HTML, PDF,
+plain text, CSV, YAML, and XML formats - even when servers provide misleading headers.
+
+Used by: processors/text_json_diff/processor.py and other content processors
+"""
+
+# Generic XML Content-types (non-RSS/Atom)
+XML_CONTENT_TYPES = ["text/xml", "application/xml"]
+
+# When to apply the 'cdata to real HTML' hack
+# Every generic XML type is part of this list too, spliced in at the same position it always had.
+RSS_XML_CONTENT_TYPES = [
+    "application/rss+xml",
+    "application/rdf+xml",
+    *XML_CONTENT_TYPES,
+    "application/atom+xml",
+    "text/rss+xml",  # rare, non-standard
+    "application/x-rss+xml",  # legacy (older feed software)
+    "application/x-atom+xml",  # legacy (older Atom)
+]
+
+# JSON Content-types, matched as substrings of the Content-Type header
+JSON_CONTENT_TYPES = [
+    "application/activity+json",
+    "application/feed+json",
+    "application/json",
+    "application/ld+json",
+    "application/vnd.api+json",
+]
+
+# CSV Content-types
+CSV_CONTENT_TYPES = ["text/csv", "application/csv"]
+
+# YAML Content-types
+YAML_CONTENT_TYPES = ["text/yaml", "text/x-yaml", "application/yaml", "application/x-yaml"]
+
+# Lowercase markers that make the start of a document count as HTML
+HTML_PATTERNS = [
+    '<!doctype html',
+    '<html',
+    '<head',
+    '<body',
+    '<script',
+    '<iframe',
+    '<div',
+]
+
+import re
+import magic
+from loguru import logger
+
+
+class guess_stream_type():
+    """
+    Classify a fetched response from its Content-Type header and the first 200 characters of its body.
+
+    After construction at most one of the ``is_*`` flags is True on the instance; every
+    other flag keeps its False class-level default. When nothing matches, all stay False.
+
+    :param http_content_header: Content-Type header value. It is matched case-sensitively against
+        lowercase MIME types, so pass it lowercased. ``None`` is treated as an empty header.
+    :param content: Response body as ``str`` (``bytes`` are decoded as UTF-8 for the text checks).
+        ``None`` is treated as an empty body.
+    """
+    is_pdf = False
+    is_json = False
+    is_html = False
+    is_plaintext = False
+    is_rss = False
+    is_csv = False
+    is_xml = False  # Generic XML, not RSS/Atom
+    is_yaml = False
+
+    def __init__(self, http_content_header, content):
+        # A missing header or body must not crash detection, it just means "no hints from there"
+        http_content_header = http_content_header or ''
+        content = content or ''
+
+        # Starts out as the HTTP header, replaced below when magic detects a specific non-text type
+        magic_content_header = http_content_header
+
+        # The pattern checks need text: a bytes body would make re.sub() with a str pattern raise TypeError.
+        # errors='replace' because the 200 byte cut can land in the middle of a multi-byte character.
+        head = content[:200]
+        if isinstance(head, (bytes, bytearray)):
+            head = head.decode('utf-8', errors='replace')
+        test_content = head.lower().strip()
+
+        # Remove whitespace between < and tag name for robust detection (handles '< html', '<\nhtml', etc.)
+        test_content_normalized = re.sub(r'<\s+', '<', test_content)
+
+        # Magic will sometimes call text/plain as text/html!
+        magic_result = None
+        try:
+            mime = magic.from_buffer(content[:200], mime=True)  # Send the original content
+            logger.debug(f"Guessing mime type, original content_type '{http_content_header}', mime type detected '{mime}'")
+            if mime and "/" in mime:
+                magic_result = mime
+                # Ignore generic/fallback mime types from magic
+                if mime in ['application/octet-stream', 'application/x-empty', 'binary']:
+                    logger.debug(f"Ignoring generic mime type '{mime}' from magic library")
+                # Trust magic for non-text types immediately
+                elif mime not in ['text/html', 'text/plain']:
+                    magic_content_header = mime
+
+        except Exception as e:
+            # Best effort only: a failing magic library must not stop the header/content based detection
+            logger.error(f"Error getting a more precise mime type from 'magic' library ({str(e)}), using content-based detection")
+
+        # Content-based detection (most reliable for text formats)
+        # Check for HTML patterns first - if found, override magic's text/plain
+        has_html_patterns = any(p in test_content_normalized for p in HTML_PATTERNS)
+
+        # Always trust headers first
+        if any(s in http_content_header for s in RSS_XML_CONTENT_TYPES) or any(s in magic_content_header for s in RSS_XML_CONTENT_TYPES):
+            self.is_rss = True
+        elif any(s in http_content_header for s in JSON_CONTENT_TYPES) or any(s in magic_content_header for s in JSON_CONTENT_TYPES):
+            self.is_json = True
+        elif any(s in http_content_header for s in CSV_CONTENT_TYPES) or any(s in magic_content_header for s in CSV_CONTENT_TYPES):
+            self.is_csv = True
+        elif any(s in http_content_header for s in XML_CONTENT_TYPES) or any(s in magic_content_header for s in XML_CONTENT_TYPES):
+            # NOTE(review): every entry of XML_CONTENT_TYPES is also in RSS_XML_CONTENT_TYPES, so the RSS
+            # branch above always wins and this branch cannot be reached with the current lists - confirm
+            # whether generic XML headers are meant to be treated as RSS.
+            # Only mark as generic XML if not already detected as RSS
+            if not self.is_rss:
+                self.is_xml = True
+        elif any(s in http_content_header for s in YAML_CONTENT_TYPES) or any(s in magic_content_header for s in YAML_CONTENT_TYPES):
+            self.is_yaml = True
+        elif 'pdf' in magic_content_header:
+            self.is_pdf = True
+        # NOTE(review): the header test is an exact match, so 'text/html; charset=utf-8' does not
+        # count as an HTML header here and falls through to the checks below - confirm this is intended.
+        elif has_html_patterns or http_content_header == 'text/html':
+            self.is_html = True
+        # If magic says text/plain and we found no HTML patterns, trust it
+        elif magic_result == 'text/plain':
+            self.is_plaintext = True
+            logger.debug("Trusting magic's text/plain result (no HTML patterns detected)")
+        elif '<rss' in test_content_normalized or '<feed' in test_content_normalized:
+            self.is_rss = True
+        elif test_content_normalized.startswith('<?xml'):
+            # Generic XML that's not RSS/Atom (RSS/Atom checked above)
+            self.is_xml = True
+        elif '%pdf-1' in test_content:
+            self.is_pdf = True
+        # Only trust magic for 'text' if no other patterns matched
+        elif 'text' in magic_content_header:
+            self.is_plaintext = True
+
--- a/changedetectionio/processors/text_json_diff/processor.py
+++ b/changedetectionio/processors/text_json_diff/processor.py
@@ -13,6 +13,8 @@ from changedetectionio import html_tools, content_fetchers
 from changedetectionio.blueprint.price_data_follower import PRICE_DATA_TRACK_ACCEPT, PRICE_DATA_TRACK_REJECT
 from loguru import logger

+from changedetectionio.processors.magic import guess_stream_type
+
 urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

 name = 'Webpage Text/HTML, JSON and PDF changes'
@@ -20,6 +22,9 @@ description = 'Detects all text changes where possible'

 json_filter_prefixes = ['json:', 'jq:', 'jqraw:']

+# Assume it's this type if the server says nothing on content-type
+DEFAULT_WHEN_NO_CONTENT_TYPE_HEADER = 'text/html'
+
 class FilterNotFoundInResponse(ValueError):
    def __init__(self, msg, screenshot=None, xpath_data=None):
        self.screenshot = screenshot
@@ -45,6 +50,9 @@ class perform_site_check(difference_detection_processor):
        if not watch:
            raise Exception("Watch no longer exists.")

+        ctype_header = self.fetcher.get_all_headers().get('content-type', DEFAULT_WHEN_NO_CONTENT_TYPE_HEADER).lower()
+        stream_content_type = guess_stream_type(http_content_header=ctype_header, content=self.fetcher.content)
+
        # Unset any existing notification error
        update_obj = {'last_notification_error': False, 'last_error': False}

@@ -54,7 +62,7 @@ class perform_site_check(difference_detection_processor):
        self.xpath_data = self.fetcher.xpath_data

        # Track the content type
-        update_obj['content_type'] = self.fetcher.get_all_headers().get('content-type', '').lower()
+        update_obj['content_type'] = ctype_header

        # Watches added automatically in the queue manager will skip if its the same checksum as the previous run
        # Saves a lot of CPU
@@ -69,24 +77,12 @@ class perform_site_check(difference_detection_processor):
        # https://stackoverflow.com/questions/41817578/basic-method-chaining ?
        # return content().textfilter().jsonextract().checksumcompare() ?

-        is_json = 'application/json' in self.fetcher.get_all_headers().get('content-type', '').lower()
-        is_html = not is_json
-        is_rss = False

-        ctype_header = self.fetcher.get_all_headers().get('content-type', '').lower()
        # Go into RSS preprocess for converting CDATA/comment to usable text
-        if any(substring in ctype_header for substring in ['application/xml', 'application/rss', 'text/xml']):
-            if '<rss' in self.fetcher.content[:100].lower():
-                self.fetcher.content = cdata_in_document_to_text(html_content=self.fetcher.content)
-                is_rss = True
+        if stream_content_type.is_rss:
+            self.fetcher.content = cdata_in_document_to_text(html_content=self.fetcher.content)

-        # source: support, basically treat it as plaintext
-        if watch.is_source_type_url:
-            is_html = False
-            is_json = False
-
-        inline_pdf = self.fetcher.get_all_headers().get('content-disposition', '') and '%PDF-1' in self.fetcher.content[:10]
-        if watch.is_pdf or 'application/pdf' in self.fetcher.get_all_headers().get('content-type', '').lower() or inline_pdf:
+        if watch.is_pdf or stream_content_type.is_pdf:
            from shutil import which
            tool = os.getenv("PDF_TO_HTML_TOOL", "pdftohtml")
            if not which(tool):
@@ -130,11 +126,12 @@ class perform_site_check(difference_detection_processor):
        has_filter_rule = len(include_filters_rule) and len(include_filters_rule[0].strip())
        has_subtractive_selectors = len(subtractive_selectors) and len(subtractive_selectors[0].strip())

-        if is_json and not has_filter_rule:
-            include_filters_rule.append("json:$")
-            has_filter_rule = True
+        if stream_content_type.is_json:
+            if not has_filter_rule:
+                # Force a reformat
+                include_filters_rule.append("json:$")
+                has_filter_rule = True

-        if is_json:
            # Sort the JSON so we dont get false alerts when the content is just re-ordered
            try:
                self.fetcher.content = json.dumps(json.loads(self.fetcher.content), sort_keys=True)
@@ -142,34 +139,25 @@ class perform_site_check(difference_detection_processor):
                # Might have just been a snippet, or otherwise bad JSON, continue
                pass

-        if has_filter_rule:
-            for filter in include_filters_rule:
-                if any(prefix in filter for prefix in json_filter_prefixes):
-                    stripped_text_from_html += html_tools.extract_json_as_string(content=self.fetcher.content, json_filter=filter)
-                    is_html = False
+            if has_filter_rule:
+                for filter in include_filters_rule:
+                    if any(prefix in filter for prefix in json_filter_prefixes):
+                        stripped_text_from_html += html_tools.extract_json_as_string(content=self.fetcher.content, json_filter=filter)
+                        if stripped_text_from_html:
+                            stream_content_type.is_json = True
+                            stream_content_type.is_html = False

-        if is_html or watch.is_source_type_url:
+        # We have 'watch.is_source_type_url' because we should be able to use selectors on the raw HTML but return just that selected HTML
+        if stream_content_type.is_html or watch.is_source_type_url or stream_content_type.is_plaintext or stream_content_type.is_rss or stream_content_type.is_xml or stream_content_type.is_pdf:

            # CSS Filter, extract the HTML that matches and feed that into the existing inscriptis::get_text
            self.fetcher.content = html_tools.workarounds_for_obfuscations(self.fetcher.content)
            html_content = self.fetcher.content
-            content_type = self.fetcher.get_all_headers().get('content-type', '').lower()
-            is_attachment = 'attachment' in self.fetcher.get_all_headers().get('content-disposition', '').lower() or 'octet-stream' in content_type

-            # Try to detect better mime types if its a download or not announced as HTML
-            if is_attachment:
-                logger.debug(f"Got a reply that may be a download or possibly a text attachment, checking..")
-                try:
-                    import magic
-                    mime = magic.from_buffer(html_content, mime=True)
-                    logger.debug(f"Guessing mime type, original content_type '{content_type}', mime type detected '{mime}'")
-                    if mime and "/" in mime: # looks valid and is a valid mime type
-                        content_type = mime
-                except Exception as e:
-                    logger.error(f"Error getting a more precise mime type from 'magic' library ({str(e)}")
-
-            if 'text/' in content_type and not 'html' in content_type:
+            # Some kind of "text" but definitely not RSS looking
+            if stream_content_type.is_plaintext:
                # Don't run get_text or xpath/css filters on plaintext
+                # We are not HTML, we are not any kind of RSS, doesnt even look like HTML
                stripped_text_from_html = html_content
            else:
                # If not JSON, and if it's not text/plain..
@@ -186,13 +174,13 @@ class perform_site_check(difference_detection_processor):
                            html_content += html_tools.xpath_filter(xpath_filter=filter_rule.replace('xpath:', ''),
                                                                    html_content=self.fetcher.content,
                                                                    append_pretty_line_formatting=not watch.is_source_type_url,
-                                                                    is_rss=is_rss)
+                                                                    is_rss=stream_content_type.is_rss)

                        elif filter_rule.startswith('xpath1:'):
                            html_content += html_tools.xpath1_filter(xpath_filter=filter_rule.replace('xpath1:', ''),
                                                                     html_content=self.fetcher.content,
                                                                     append_pretty_line_formatting=not watch.is_source_type_url,
-                                                                     is_rss=is_rss)
+                                                                     is_rss=stream_content_type.is_rss)
                        else:
                            html_content += html_tools.include_filters(include_filters=filter_rule,
                                                                       html_content=self.fetcher.content,
@@ -211,7 +199,7 @@ class perform_site_check(difference_detection_processor):
                    do_anchor = self.datastore.data["settings"]["application"].get("render_anchor_tag_content", False)
                    stripped_text_from_html = html_tools.html_to_text(html_content=html_content,
                                                                      render_anchor_tag_content=do_anchor,
-                                                                      is_rss=is_rss)  # 1874 activate the <title workaround hack
+                                                                      is_rss=stream_content_type.is_rss)  # 1874 activate the <title workaround hack

        if watch.get('trim_text_whitespace'):
            stripped_text_from_html = '\n'.join(line.strip() for line in stripped_text_from_html.replace("\n\n", "\n").splitlines())
@@ -250,7 +238,7 @@ class perform_site_check(difference_detection_processor):

        # Treat pages with no renderable text content as a change? No by default
        empty_pages_are_a_change = self.datastore.data['settings']['application'].get('empty_pages_are_a_change', False)
-        if not is_json and not empty_pages_are_a_change and len(stripped_text_from_html.strip()) == 0:
+        if not stream_content_type.is_json and not empty_pages_are_a_change and len(stripped_text_from_html.strip()) == 0:
            raise content_fetchers.exceptions.ReplyWithContentButNoText(url=url,
                                                            status_code=self.fetcher.get_last_status_code(),
                                                            screenshot=self.fetcher.screenshot,