remove unused types

use correct header type for RSS tests
Re #3472 - always use plaintext if its announced as such
2025-11-28 20:33:22 +00:00 · 2025-10-09 10:48:48 +02:00 · 2025-10-09 10:20:45 +02:00 · 2025-10-09 10:01:19 +02:00
3 changed files with 47 additions and 26 deletions
--- a/changedetectionio/processors/magic.py
+++ b/changedetectionio/processors/magic.py
@@ -20,8 +20,6 @@ Used by: processors/text_json_diff/processor.py and other content processors
 RSS_XML_CONTENT_TYPES = [
    "application/rss+xml",
    "application/rdf+xml",
-    "text/xml",
-    "application/xml",
    "application/atom+xml",
    "text/rss+xml",  # rare, non-standard
    "application/x-rss+xml",  # legacy (older feed software)
@@ -37,11 +35,6 @@ JSON_CONTENT_TYPES = [
    "application/vnd.api+json",
 ]

-# CSV Content-types
-CSV_CONTENT_TYPES = [
-    "text/csv",
-    "application/csv",
-]

 # Generic XML Content-types (non-RSS/Atom)
 XML_CONTENT_TYPES = [
@@ -49,14 +42,6 @@ XML_CONTENT_TYPES = [
    "application/xml",
 ]

-# YAML Content-types
-YAML_CONTENT_TYPES = [
-    "text/yaml",
-    "text/x-yaml",
-    "application/yaml",
-    "application/x-yaml",
-]
-
 HTML_PATTERNS = ['<!doctype html', '<html', '<head', '<body', '<script', '<iframe', '<div']

 import re
@@ -104,18 +89,16 @@ class guess_stream_type():
        has_html_patterns = any(p in test_content_normalized for p in HTML_PATTERNS)

        # Always trust headers first
-        if any(s in http_content_header for s in RSS_XML_CONTENT_TYPES) or any(s in magic_content_header for s in RSS_XML_CONTENT_TYPES):
+        if 'text/plain' in http_content_header:
+            self.is_plaintext = True
+        if any(s in http_content_header for s in RSS_XML_CONTENT_TYPES):
            self.is_rss = True
-        elif any(s in http_content_header for s in JSON_CONTENT_TYPES) or any(s in magic_content_header for s in JSON_CONTENT_TYPES):
+        elif any(s in http_content_header for s in JSON_CONTENT_TYPES):
            self.is_json = True
-        elif any(s in http_content_header for s in CSV_CONTENT_TYPES) or any(s in magic_content_header for s in CSV_CONTENT_TYPES):
-            self.is_csv = True
-        elif any(s in http_content_header for s in XML_CONTENT_TYPES) or any(s in magic_content_header for s in XML_CONTENT_TYPES):
+        elif any(s in http_content_header for s in XML_CONTENT_TYPES):
            # Only mark as generic XML if not already detected as RSS
            if not self.is_rss:
                self.is_xml = True
-        elif any(s in http_content_header for s in YAML_CONTENT_TYPES) or any(s in magic_content_header for s in YAML_CONTENT_TYPES):
-            self.is_yaml = True
        elif 'pdf' in magic_content_header:
            self.is_pdf = True
 ###
@@ -125,13 +108,18 @@ class guess_stream_type():
        elif magic_result == 'text/plain':
            self.is_plaintext = True
            logger.debug(f"Trusting magic's text/plain result (no HTML patterns detected)")
-        elif '<rss' in test_content_normalized or '<feed' in test_content_normalized:
+        elif any(s in magic_content_header for s in JSON_CONTENT_TYPES):
+            self.is_json = True
+        # magic will call an RSS document 'xml'
+        elif '<rss' in test_content_normalized or '<feed' in test_content_normalized or any(s in magic_content_header for s in RSS_XML_CONTENT_TYPES):
            self.is_rss = True
-        elif test_content_normalized.startswith('<?xml'):
+        elif test_content_normalized.startswith('<?xml') or any(s in magic_content_header for s in XML_CONTENT_TYPES):
            # Generic XML that's not RSS/Atom (RSS/Atom checked above)
            self.is_xml = True
        elif '%pdf-1' in test_content:
            self.is_pdf = True
+        elif http_content_header.startswith('text/'):
+            self.is_plaintext = True
        # Only trust magic for 'text' if no other patterns matched
        elif 'text' in magic_content_header:
            self.is_plaintext = True
--- a/changedetectionio/tests/test_backend.py
+++ b/changedetectionio/tests/test_backend.py
@@ -295,3 +295,36 @@ got it\r\n

    res = client.get(url_for("ui.form_delete", uuid="all"), follow_redirects=True)

+# Server says it's plaintext, so we should always treat it as plaintext
+def test_plaintext_even_if_xml_content(client, live_server, measure_memory_usage):
+
+    with open("test-datastore/endpoint-content.txt", "w") as f:
+        f.write("""<?xml version="1.0" encoding="utf-8"?>
+<resources xmlns:tools="http://schemas.android.com/tools">
+    <!--Activity and fragment titles-->
+    <string name="feed_update_receiver_name">Abonnementen bijwerken</string>
+</resources>
+""")
+
+    test_url = url_for('test_endpoint', content_type="text/plain", _external=True)
+
+    # Add our URL to the import page
+    res = client.post(
+        url_for("imports.import_page"),
+        data={"urls": test_url},
+        follow_redirects=True
+    )
+
+    assert b"1 Imported" in res.data
+
+    wait_for_all_checks(client)
+
+    res = client.get(
+        url_for("ui.ui_views.preview_page", uuid="first"),
+        follow_redirects=True
+    )
+
+    assert b'&lt;string name=&#34;feed_update_receiver_name&#34;' in res.data
+
+    res = client.get(url_for("ui.form_delete", uuid="all"), follow_redirects=True)
+
--- a/changedetectionio/tests/test_rss.py
+++ b/changedetectionio/tests/test_rss.py
@@ -111,7 +111,7 @@ def test_basic_cdata_rss_markup(client, live_server, measure_memory_usage):

    set_original_cdata_xml()

-    test_url = url_for('test_endpoint', content_type="application/xml", _external=True)
+    test_url = url_for('test_endpoint', content_type="application/atom+xml; charset=UTF-8", _external=True)

    # Add our URL to the import page
    res = client.post(
@@ -139,7 +139,7 @@ def test_rss_xpath_filtering(client, live_server, measure_memory_usage):

    set_original_cdata_xml()

-    test_url = url_for('test_endpoint', content_type="application/xml", _external=True)
+    test_url = url_for('test_endpoint', content_type="application/atom+xml; charset=UTF-8", _external=True)

    res = client.post(
        url_for("ui.ui_views.form_quick_watch_add"),
Author	SHA1	Message	Date
dgtlmoon	871466ce37	remove unused types	2025-10-09 10:48:48 +02:00
dgtlmoon	2e31a1a36e	use correct header type for RSS tests	2025-10-09 10:20:45 +02:00
dgtlmoon	3dccac9615	Re #3472 - always use plaintext if its announced as such	2025-10-09 10:01:19 +02:00