Re #3486 - Fixing and adding test for RSS/Atom not being converted to text when server sends "text/xml" instead of the "application/atom+xml" header (#3487)

2025-10-30 14:17:40 +00:00 · 2025-10-10 16:29:02 +02:00
parent b59ce190ac
commit bb6d4c2756
2 changed files with 18 additions and 16 deletions
--- a/changedetectionio/processors/magic.py
+++ b/changedetectionio/processors/magic.py
@@ -94,24 +94,21 @@ class guess_stream_type():
            self.is_rss = True
        elif any(s in http_content_header for s in JSON_CONTENT_TYPES):
            self.is_json = True
+        elif 'pdf' in magic_content_header:
+            self.is_pdf = True
+        elif has_html_patterns or http_content_header == 'text/html':
+            self.is_html = True
+        elif any(s in magic_content_header for s in JSON_CONTENT_TYPES):
+            self.is_json = True
+        # magic will report an RSS document as plain 'xml'
+        # Endpoints rarely send the right header (usually just text/xml), so we also check the content for <rss or <feed
+        # This also triggers the automatic CDATA text parser, so the RSS comes back as a nice content list
+        elif '<rss' in test_content_normalized or '<feed' in test_content_normalized or any(s in magic_content_header for s in RSS_XML_CONTENT_TYPES):
+            self.is_rss = True
        elif any(s in http_content_header for s in XML_CONTENT_TYPES):
            # Only mark as generic XML if not already detected as RSS
            if not self.is_rss:
                self.is_xml = True
-        elif 'pdf' in magic_content_header:
-            self.is_pdf = True
-###
-        elif has_html_patterns or http_content_header == 'text/html':
-            self.is_html = True
-        # If magic says text/plain and we found no HTML patterns, trust it
-        elif magic_result == 'text/plain':
-            self.is_plaintext = True
-            logger.debug(f"Trusting magic's text/plain result (no HTML patterns detected)")
-        elif any(s in magic_content_header for s in JSON_CONTENT_TYPES):
-            self.is_json = True
-        # magic will call a rss document 'xml'
-        elif '<rss' in test_content_normalized or '<feed' in test_content_normalized or any(s in magic_content_header for s in RSS_XML_CONTENT_TYPES):
-            self.is_rss = True
        elif test_content_normalized.startswith('<?xml') or any(s in magic_content_header for s in XML_CONTENT_TYPES):
            # Generic XML that's not RSS/Atom (RSS/Atom checked above)
            self.is_xml = True
@@ -122,4 +119,8 @@ class guess_stream_type():
        # Only trust magic for 'text' if no other patterns matched
        elif 'text' in magic_content_header:
            self.is_plaintext = True
+        # If magic says text/plain and we found no HTML patterns, trust it
+        elif magic_result == 'text/plain':
+            self.is_plaintext = True
+            logger.debug(f"Trusting magic's text/plain result (no HTML patterns detected)")

--- a/changedetectionio/tests/test_rss.py
+++ b/changedetectionio/tests/test_rss.py
@@ -110,8 +110,9 @@ def test_basic_cdata_rss_markup(client, live_server, measure_memory_usage):
    

    set_original_cdata_xml()
-
-    test_url = url_for('test_endpoint', content_type="application/atom+xml; charset=UTF-8", _external=True)
+    # Endpoints rarely send the right header (usually just text/xml), so the content is also checked for <rss
+    # This also triggers the automatic CDATA text parser, so the RSS comes back as a nice content list
+    test_url = url_for('test_endpoint', content_type="text/xml; charset=UTF-8", _external=True)

    # Add our URL to the import page
    uuid = client.application.config.get('DATASTORE').add_watch(url=test_url)