mirror of
https://github.com/dgtlmoon/changedetection.io.git
synced 2026-05-03 00:00:54 +00:00
tweak info
This commit is contained in:
@@ -1,5 +1,22 @@
|
||||
"""
|
||||
Content Type Detection and Stream Classification
|
||||
|
||||
This module provides intelligent content-type detection for changedetection.io.
|
||||
It addresses the common problem where HTTP Content-Type headers are missing, incorrect,
|
||||
or too generic, which would otherwise cause the wrong processor to be used.
|
||||
|
||||
The guess_stream_type class combines:
|
||||
1. HTTP Content-Type headers (when available and reliable)
|
||||
2. The python-magic library for MIME detection (analyzing the actual file content)
|
||||
3. Content-based pattern matching for text formats (HTML tags, XML declarations, etc.)
|
||||
|
||||
This multi-layered approach ensures accurate detection of RSS feeds, JSON, HTML, PDF,
|
||||
plain text, CSV, YAML, and XML formats - even when servers provide misleading headers.
|
||||
|
||||
Used by: processors/text_json_diff/processor.py and other content processors
|
||||
"""
|
||||
|
||||
# When to apply the 'cdata to real HTML' hack
|
||||
# @todo Some heuristic check instead? first and last bytes? maybe some new def that gets header+first 200 bytes? then we can unittest
|
||||
RSS_XML_CONTENT_TYPES = [
|
||||
"application/rss+xml",
|
||||
"application/rdf+xml",
|
||||
@@ -12,7 +29,6 @@ RSS_XML_CONTENT_TYPES = [
|
||||
]
|
||||
|
||||
# JSON Content-types
|
||||
# @todo Some heuristic check instead? first and last bytes? maybe some new def that gets header+first 200 bytes? then we can unittest
|
||||
JSON_CONTENT_TYPES = [
|
||||
"application/activity+json",
|
||||
"application/feed+json",
|
||||
|
||||
Reference in New Issue
Block a user