tweak info

2026-07-08 00:10:46 +00:00 · 2025-10-08 23:10:40 +02:00
parent f770fa3765
commit bcde39253e
1 changed files with 18 additions and 2 deletions
@@ -1,5 +1,22 @@
+"""
+Content Type Detection and Stream Classification
+
+This module provides intelligent content-type detection for changedetection.io.
+It addresses the common problem where HTTP Content-Type headers are missing, incorrect,
+or too generic, which would otherwise cause the wrong processor to be used.
+
+The guess_stream_type class combines:
+1. HTTP Content-Type headers (when available and reliable)
+2. The python-magic library for MIME detection (analyzing actual file content)
+3. Content-based pattern matching for text formats (HTML tags, XML declarations, etc.)
+
+This multi-layered approach ensures accurate detection of RSS feeds, JSON, HTML, PDF,
+plain text, CSV, YAML, and XML formats - even when servers provide misleading headers.
+
+Used by: processors/text_json_diff/processor.py and other content processors
+"""
+
 # When to apply the 'cdata to real HTML' hack
-# @todo Some heuristic check instead? first and last bytes? maybe some new def that gets header+first 200 bytes? then we can unittest
 RSS_XML_CONTENT_TYPES = [
    "application/rss+xml",
    "application/rdf+xml",
@@ -12,7 +29,6 @@ RSS_XML_CONTENT_TYPES = [
 ]

 # JSON Content-types
-# @todo Some heuristic check instead? first and last bytes? maybe some new def that gets header+first 200 bytes? then we can unittest
 JSON_CONTENT_TYPES = [
    "application/activity+json",
    "application/feed+json",