From bcde39253ef47dbc50222f9f6cb3da8cc976cf41 Mon Sep 17 00:00:00 2001
From: dgtlmoon
Date: Wed, 8 Oct 2025 23:10:40 +0200
Subject: [PATCH] tweak info

---
 changedetectionio/processors/magic.py | 20 ++++++++++++++++++--
 1 file changed, 18 insertions(+), 2 deletions(-)

diff --git a/changedetectionio/processors/magic.py b/changedetectionio/processors/magic.py
index 8c2f919a..dcd3fca9 100644
--- a/changedetectionio/processors/magic.py
+++ b/changedetectionio/processors/magic.py
@@ -1,5 +1,22 @@
+"""
+Content Type Detection and Stream Classification
+
+This module provides intelligent content-type detection for changedetection.io.
+It addresses the common problem where HTTP Content-Type headers are missing, incorrect,
+or too generic, which would otherwise cause the wrong processor to be used.
+
+The guess_stream_type class combines:
+1. HTTP Content-Type headers (when available and reliable)
+2. Python-magic library for MIME detection (analyzing actual file content)
+3. Content-based pattern matching for text formats (HTML tags, XML declarations, etc.)
+
+This multi-layered approach ensures accurate detection of RSS feeds, JSON, HTML, PDF,
+plain text, CSV, YAML, and XML formats - even when servers provide misleading headers.
+
+Used by: processors/text_json_diff/processor.py and other content processors
+"""
+
 # When to apply the 'cdata to real HTML' hack
-# @todo Some heuristic check instead? first and last bytes? maybe some new def that gets header+first 200 bytes? then we can unittest
 RSS_XML_CONTENT_TYPES = [
     "application/rss+xml",
     "application/rdf+xml",
@@ -12,7 +29,6 @@ RSS_XML_CONTENT_TYPES = [
 ]
 
 # JSON Content-types
-# @todo Some heuristic check instead? first and last bytes? maybe some new def that gets header+first 200 bytes? then we can unittest
 JSON_CONTENT_TYPES = [
     "application/activity+json",
     "application/feed+json",