Re #3486 - Fix RSS/Atom feeds not being converted to text when the server sends a "text/xml" Content-Type header instead of "application/atom+xml", and add a test for this case

2025-12-18 22:15:37 +00:00 · 2025-10-10 15:58:33 +02:00
2 changed files with 18 additions and 16 deletions
--- a/changedetectionio/processors/magic.py
+++ b/changedetectionio/processors/magic.py
@@ -94,24 +94,21 @@ class guess_stream_type():
            self.is_rss = True
        elif any(s in http_content_header for s in JSON_CONTENT_TYPES):
            self.is_json = True
        elif 'pdf' in magic_content_header:
            self.is_pdf = True
        elif has_html_patterns or http_content_header == 'text/html':
            self.is_html = True
        elif any(s in magic_content_header for s in JSON_CONTENT_TYPES):
            self.is_json = True
        # magic will call a rss document 'xml'
        # Rarely do endpoints give the right header, usually just text/xml, so we check also for <rss
        # This also triggers the automatic CDATA text parser so the RSS goes back a nice content list
        elif '<rss' in test_content_normalized or '<feed' in test_content_normalized or any(s in magic_content_header for s in RSS_XML_CONTENT_TYPES):
            self.is_rss = True
        elif any(s in http_content_header for s in XML_CONTENT_TYPES):
            # Only mark as generic XML if not already detected as RSS
            if not self.is_rss:
                self.is_xml = True
        elif 'pdf' in magic_content_header:
            self.is_pdf = True
 ###
        elif has_html_patterns or http_content_header == 'text/html':
            self.is_html = True
        # If magic says text/plain and we found no HTML patterns, trust it
        elif magic_result == 'text/plain':
            self.is_plaintext = True
            logger.debug(f"Trusting magic's text/plain result (no HTML patterns detected)")
        elif any(s in magic_content_header for s in JSON_CONTENT_TYPES):
            self.is_json = True
        # magic will call a rss document 'xml'
        elif '<rss' in test_content_normalized or '<feed' in test_content_normalized or any(s in magic_content_header for s in RSS_XML_CONTENT_TYPES):
            self.is_rss = True
        elif test_content_normalized.startswith('<?xml') or any(s in magic_content_header for s in XML_CONTENT_TYPES):
            # Generic XML that's not RSS/Atom (RSS/Atom checked above)
            self.is_xml = True
@@ -122,4 +119,8 @@ class guess_stream_type():
        # Only trust magic for 'text' if no other patterns matched
        elif 'text' in magic_content_header:
            self.is_plaintext = True
        # If magic says text/plain and we found no HTML patterns, trust it
        elif magic_result == 'text/plain':
            self.is_plaintext = True
            logger.debug(f"Trusting magic's text/plain result (no HTML patterns detected)")
--- a/changedetectionio/tests/test_rss.py
+++ b/changedetectionio/tests/test_rss.py
@@ -110,8 +110,9 @@ def test_basic_cdata_rss_markup(client, live_server, measure_memory_usage):
    set_original_cdata_xml()
-
+    # Rarely do endpoints give the right header, usually just text/xml, so we check also for <rss
-    test_url = url_for('test_endpoint', content_type="application/atom+xml; charset=UTF-8", _external=True)
+    # This also triggers the automatic CDATA text parser so the RSS goes back a nice content list
    test_url = url_for('test_endpoint', content_type="text/xml; charset=UTF-8", _external=True)
    # Add our URL to the import page
    uuid = client.application.config.get('DATASTORE').add_watch(url=test_url)