From 0c9c475f3249df72cff0036b128a423aee5b85fe Mon Sep 17 00:00:00 2001 From: dgtlmoon Date: Mon, 6 Oct 2025 15:39:07 +0200 Subject: [PATCH] Fixing bad detection of text text/plain in previous release, adding automated test (#3460) --- .../processors/text_json_diff/processor.py | 4 +- changedetectionio/tests/test_backend.py | 77 +++++++++++++++++++ 2 files changed, 79 insertions(+), 2 deletions(-) diff --git a/changedetectionio/processors/text_json_diff/processor.py b/changedetectionio/processors/text_json_diff/processor.py index 62630ed0..45d64421 100644 --- a/changedetectionio/processors/text_json_diff/processor.py +++ b/changedetectionio/processors/text_json_diff/processor.py @@ -154,10 +154,10 @@ class perform_site_check(difference_detection_processor): self.fetcher.content = html_tools.workarounds_for_obfuscations(self.fetcher.content) html_content = self.fetcher.content content_type = self.fetcher.get_all_headers().get('content-type', '').lower() - is_attachment = 'attachment' in self.fetcher.get_all_headers().get('content-disposition', '').lower() + is_attachment = 'attachment' in self.fetcher.get_all_headers().get('content-disposition', '').lower() or 'octet-stream' in content_type # Try to detect better mime types if its a download or not announced as HTML - if is_attachment or 'octet-stream' in content_type or not 'html' in content_type: + if is_attachment: logger.debug(f"Got a reply that may be a download or possibly a text attachment, checking..") try: import magic diff --git a/changedetectionio/tests/test_backend.py b/changedetectionio/tests/test_backend.py index 2636466e..1447e7bf 100644 --- a/changedetectionio/tests/test_backend.py +++ b/changedetectionio/tests/test_backend.py @@ -167,6 +167,18 @@ def test_check_basic_change_detection_functionality(client, live_server, measure assert b'Deleted' in res.data def test_non_text_mime_or_downloads(client, live_server, measure_memory_usage): + """ + + 
https://github.com/dgtlmoon/changedetection.io/issues/3434 + I noticed that a watched website can be monitored fine as long as the server sends content-type: text/plain; charset=utf-8, + but once the server sends content-type: application/octet-stream (which is usually done to force the browser to show the Download dialog), + changedetection somehow ignores all line breaks and treats the document file as if everything is on one line. + + :param client: + :param live_server: + :param measure_memory_usage: + :return: + """ with open("test-datastore/endpoint-content.txt", "w") as f: f.write("""some random text that should be split by line and not parsed with html_to_text @@ -215,3 +227,68 @@ got it\r\n res = client.get(url_for("ui.form_delete", uuid="all"), follow_redirects=True) + +def test_standard_text_plain(client, live_server, measure_memory_usage): + """ + + https://github.com/dgtlmoon/changedetection.io/issues/3434 + I noticed that a watched website can be monitored fine as long as the server sends content-type: text/plain; charset=utf-8, + but once the server sends content-type: application/octet-stream (which is usually done to force the browser to show the Download dialog), + changedetection somehow ignores all line breaks and treats the document file as if everything is on one line. 
+ + The real bug here can be that it will try to process plain-text as HTML, losing the line breaks. + + :param client: + :param live_server: + :param measure_memory_usage: + :return: + """ + with open("test-datastore/endpoint-content.txt", "w") as f: + f.write("""some random text that should be split by line +and not parsed with html_to_text +Even this title should stay because we are just plain text +this way we know that it correctly parsed as plain text +\r\n +ok\r\n +got it\r\n +""") + + test_url = url_for('test_endpoint', content_type="text/plain", _external=True) + + # Add our URL to the import page + res = client.post( + url_for("imports.import_page"), + data={"urls": test_url}, + follow_redirects=True + ) + + assert b"1 Imported" in res.data + + wait_for_all_checks(client) + + ### check the front end + res = client.get( + url_for("ui.ui_views.preview_page", uuid="first"), + follow_redirects=True + ) + assert b"some random text that should be split by line\n" in res.data + #### + + # Check the snapshot by API that it has linefeeds too + watch_uuid = next(iter(live_server.app.config['DATASTORE'].data['watching'])) + api_key = live_server.app.config['DATASTORE'].data['settings']['application'].get('api_access_token') + res = client.get( + url_for("watchhistory", uuid=watch_uuid), + headers={'x-api-key': api_key}, + ) + + # Fetch a snapshot by timestamp, check the right one was found + res = client.get( + url_for("watchsinglehistory", uuid=watch_uuid, timestamp=list(res.json.keys())[-1]), + headers={'x-api-key': api_key}, + ) + assert b"some random text that should be split by line\n" in res.data + assert b"Even this title should stay because we are just plain text" in res.data + + res = client.get(url_for("ui.form_delete", uuid="all"), follow_redirects=True) +