tweak info

This commit is contained in:
dgtlmoon
2025-10-08 23:10:40 +02:00
parent f770fa3765
commit bcde39253e
+18 -2
View File
@@ -1,5 +1,22 @@
"""
Content Type Detection and Stream Classification
This module provides intelligent content-type detection for changedetection.io.
It addresses the common problem where HTTP Content-Type headers are missing, incorrect,
or too generic, which would otherwise cause the wrong processor to be used.
The guess_stream_type class combines:
1. HTTP Content-Type headers (when available and reliable)
2. The python-magic library for MIME detection (analyzing the actual file content)
3. Content-based pattern matching for text formats (HTML tags, XML declarations, etc.)
This multi-layered approach ensures accurate detection of RSS feeds, JSON, HTML, PDF,
plain text, CSV, YAML, and XML formats - even when servers provide misleading headers.
Used by: processors/text_json_diff/processor.py and other content processors
"""
# When to apply the 'cdata to real HTML' hack
# @todo Use a heuristic check instead? E.g. a new function that takes the header plus the first 200 bytes (and maybe the first and last bytes) of the content, so that it can be unit-tested.
RSS_XML_CONTENT_TYPES = [
"application/rss+xml",
"application/rdf+xml",
@@ -12,7 +29,6 @@ RSS_XML_CONTENT_TYPES = [
]
# JSON Content-types
# @todo Use a heuristic check instead? E.g. a new function that takes the header plus the first 200 bytes (and maybe the first and last bytes) of the content, so that it can be unit-tested.
JSON_CONTENT_TYPES = [
"application/activity+json",
"application/feed+json",