From 0c9c475f3249df72cff0036b128a423aee5b85fe Mon Sep 17 00:00:00 2001
From: dgtlmoon <dgtlmoon@gmail.com>
Date: Mon, 6 Oct 2025 15:39:07 +0200
Subject: [PATCH] Fixing bad detection of text text/plain in previous release,
 adding automated test (#3460)

---
 .../processors/text_json_diff/processor.py    |  4 +-
 changedetectionio/tests/test_backend.py       | 77 +++++++++++++++++++
 2 files changed, 79 insertions(+), 2 deletions(-)
diff --git a/changedetectionio/processors/text_json_diff/processor.py b/changedetectionio/processors/text_json_diff/processor.py
index 62630ed0..45d64421 100644
--- a/changedetectionio/processors/text_json_diff/processor.py
+++ b/changedetectionio/processors/text_json_diff/processor.py
@@ -154,10 +154,10 @@ class perform_site_check(difference_detection_processor):
             self.fetcher.content = html_tools.workarounds_for_obfuscations(self.fetcher.content)
             html_content = self.fetcher.content
             content_type = self.fetcher.get_all_headers().get('content-type', '').lower()
-            is_attachment = 'attachment' in self.fetcher.get_all_headers().get('content-disposition', '').lower()
+            is_attachment = 'attachment' in self.fetcher.get_all_headers().get('content-disposition', '').lower() or 'octet-stream' in content_type
 
             # Try to detect better mime types if its a download or not announced as HTML
-            if is_attachment or 'octet-stream' in content_type or not 'html' in content_type:
+            if is_attachment:
                 logger.debug(f"Got a reply that may be a download or possibly a text attachment, checking..")
                 try:
                     import magic
diff --git a/changedetectionio/tests/test_backend.py b/changedetectionio/tests/test_backend.py
index 2636466e..1447e7bf 100644
--- a/changedetectionio/tests/test_backend.py
+++ b/changedetectionio/tests/test_backend.py
@@ -167,6 +167,18 @@ def test_check_basic_change_detection_functionality(client, live_server, measure
     assert b'Deleted' in res.data
 
 def test_non_text_mime_or_downloads(client, live_server, measure_memory_usage):
+    """
+
+    https://github.com/dgtlmoon/changedetection.io/issues/3434
+    I noticed that a watched website can be monitored fine as long as the server sends content-type: text/plain; charset=utf-8,
+    but once the server sends content-type: application/octet-stream (which is usually done to force the browser to show the Download dialog),
+    changedetection somehow ignores all line breaks and treats the document file as if everything is on one line.
+
+    :param client:
+    :param live_server:
+    :param measure_memory_usage:
+    :return:
+    """
     with open("test-datastore/endpoint-content.txt", "w") as f:
         f.write("""some random text that should be split by line
 and not parsed with html_to_text
@@ -215,3 +227,68 @@ got it\r\n
 
     res = client.get(url_for("ui.form_delete", uuid="all"), follow_redirects=True)
 
+
+def test_standard_text_plain(client, live_server, measure_memory_usage):
+    """
+
+    https://github.com/dgtlmoon/changedetection.io/issues/3434
+    I noticed that a watched website can be monitored fine as long as the server sends content-type: text/plain; charset=utf-8,
+    but once the server sends content-type: application/octet-stream (which is usually done to force the browser to show the Download dialog),
+    changedetection somehow ignores all line breaks and treats the document file as if everything is on one line.
+
+    The real bug here can be that it will try to process plain-text as HTML, losing <etc>
+
+    :param client:
+    :param live_server:
+    :param measure_memory_usage:
+    :return:
+    """
+    with open("test-datastore/endpoint-content.txt", "w") as f:
+        f.write("""some random text that should be split by line
+and not parsed with html_to_text
+<title>Even this title should stay because we are just plain text</title>
+this way we know that it correctly parsed as plain text
+\r\n
+ok\r\n
+got it\r\n
+""")
+
+    test_url = url_for('test_endpoint', content_type="text/plain", _external=True)
+
+    # Add our URL to the import page
+    res = client.post(
+        url_for("imports.import_page"),
+        data={"urls": test_url},
+        follow_redirects=True
+    )
+
+    assert b"1 Imported" in res.data
+
+    wait_for_all_checks(client)
+
+    ### check the front end
+    res = client.get(
+        url_for("ui.ui_views.preview_page", uuid="first"),
+        follow_redirects=True
+    )
+    assert b"some random text that should be split by line\n" in res.data
+    ####
+
+    # Check the snapshot by API that it has linefeeds too
+    watch_uuid = next(iter(live_server.app.config['DATASTORE'].data['watching']))
+    api_key = live_server.app.config['DATASTORE'].data['settings']['application'].get('api_access_token')
+    res = client.get(
+        url_for("watchhistory", uuid=watch_uuid),
+        headers={'x-api-key': api_key},
+    )
+
+    # Fetch a snapshot by timestamp, check the right one was found
+    res = client.get(
+        url_for("watchsinglehistory", uuid=watch_uuid, timestamp=list(res.json.keys())[-1]),
+        headers={'x-api-key': api_key},
+    )
+    assert b"some random text that should be split by line\n" in res.data
+    assert b"<title>Even this title should stay because we are just plain text</title>" in res.data
+
+    res = client.get(url_for("ui.form_delete", uuid="all"), follow_redirects=True)
+