Process text/* non-HTML content in its original format, keeping line breaks; auto-detect attachments/downloads as text or HTML. WARNING: will trigger false changes for some existing text-file watches. #3434 (#3435)

2025-11-19 16:06:10 +00:00 · 2025-09-19 10:42:34 +02:00
parent b2f9aec383
commit 01c1ac4c0c
2 changed files with 66 additions and 2 deletions
--- a/changedetectionio/processors/text_json_diff/processor.py
+++ b/changedetectionio/processors/text_json_diff/processor.py
@@ -153,12 +153,26 @@ class perform_site_check(difference_detection_processor):
            # CSS Filter, extract the HTML that matches and feed that into the existing inscriptis::get_text
            self.fetcher.content = html_tools.workarounds_for_obfuscations(self.fetcher.content)
            html_content = self.fetcher.content
+            content_type = self.fetcher.get_all_headers().get('content-type', '').lower()
+            is_attachment = 'attachment' in self.fetcher.get_all_headers().get('content-disposition', '').lower()

-            # If not JSON,  and if it's not text/plain..
-            if 'text/plain' in self.fetcher.get_all_headers().get('content-type', '').lower():
+            # Try to detect a more precise MIME type if it's a download or not announced as HTML
+            if is_attachment or 'octet-stream' in content_type or not 'html' in content_type:
+                logger.debug(f"Got a reply that may be a download or possibly a text attachment, checking..")
+                try:
+                    import magic
+                    mime = magic.from_buffer(html_content, mime=True)
+                    logger.debug(f"Guessing mime type, original content_type '{content_type}', mime type detected '{mime}'")
+                    if mime and "/" in mime: # looks valid and is a valid mime type
+                        content_type = mime
+                except Exception as e:
+                    logger.error(f"Error getting a more precise mime type from 'magic' library ({str(e)}")
+
+            if 'text/' in content_type and not 'html' in content_type:
                # Don't run get_text or xpath/css filters on plaintext
                stripped_text_from_html = html_content
            else:
+                # If not JSON, and if it's not text/plain..
                # Does it have some ld+json price data? used for easier monitoring
                update_obj['has_ldjson_price_data'] = html_tools.has_ldjson_product_info(self.fetcher.content)