mirror of
https://github.com/dgtlmoon/changedetection.io.git
synced 2025-11-01 07:08:47 +00:00
Compare commits
12 Commits
puremagic-
...
3462-impro
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
a3b40234fc | ||
|
|
4c1f089a06 | ||
|
|
09b052c6b1 | ||
|
|
bcde39253e | ||
|
|
f770fa3765 | ||
|
|
b6c6f3a312 | ||
|
|
6de0b312e7 | ||
|
|
013f3117b6 | ||
|
|
edd64fb1dd | ||
|
|
82833abf1a | ||
|
|
6aa253df5f | ||
|
|
15912999b6 |
138
changedetectionio/processors/magic.py
Normal file
138
changedetectionio/processors/magic.py
Normal file
@@ -0,0 +1,138 @@
|
||||
"""
|
||||
Content Type Detection and Stream Classification
|
||||
|
||||
This module provides intelligent content-type detection for changedetection.io.
|
||||
It addresses the common problem where HTTP Content-Type headers are missing, incorrect,
|
||||
or too generic, which would otherwise cause the wrong processor to be used.
|
||||
|
||||
The guess_stream_type class combines:
|
||||
1. HTTP Content-Type headers (when available and reliable)
|
||||
2. Python-magic library for MIME detection (analyzing actual file content)
|
||||
3. Content-based pattern matching for text formats (HTML tags, XML declarations, etc.)
|
||||
|
||||
This multi-layered approach ensures accurate detection of RSS feeds, JSON, HTML, PDF,
|
||||
plain text, CSV, YAML, and XML formats - even when servers provide misleading headers.
|
||||
|
||||
Used by: processors/text_json_diff/processor.py and other content processors
|
||||
"""
|
||||
|
||||
# When to apply the 'cdata to real HTML' hack
|
||||
RSS_XML_CONTENT_TYPES = [
|
||||
"application/rss+xml",
|
||||
"application/rdf+xml",
|
||||
"text/xml",
|
||||
"application/xml",
|
||||
"application/atom+xml",
|
||||
"text/rss+xml", # rare, non-standard
|
||||
"application/x-rss+xml", # legacy (older feed software)
|
||||
"application/x-atom+xml", # legacy (older Atom)
|
||||
]
|
||||
|
||||
# JSON Content-types
|
||||
JSON_CONTENT_TYPES = [
|
||||
"application/activity+json",
|
||||
"application/feed+json",
|
||||
"application/json",
|
||||
"application/ld+json",
|
||||
"application/vnd.api+json",
|
||||
]
|
||||
|
||||
# CSV Content-types
|
||||
CSV_CONTENT_TYPES = [
|
||||
"text/csv",
|
||||
"application/csv",
|
||||
]
|
||||
|
||||
# Generic XML Content-types (non-RSS/Atom)
|
||||
XML_CONTENT_TYPES = [
|
||||
"text/xml",
|
||||
"application/xml",
|
||||
]
|
||||
|
||||
# YAML Content-types
|
||||
YAML_CONTENT_TYPES = [
|
||||
"text/yaml",
|
||||
"text/x-yaml",
|
||||
"application/yaml",
|
||||
"application/x-yaml",
|
||||
]
|
||||
|
||||
HTML_PATTERNS = ['<!doctype html', '<html', '<head', '<body', '<script', '<iframe', '<div']
|
||||
|
||||
import re
|
||||
import magic
|
||||
from loguru import logger
|
||||
|
||||
|
||||
class guess_stream_type():
    """Classify a fetched document from its Content-Type header and its first bytes.

    After construction at most one of the ``is_*`` flags is True on the instance.
    The decision order is:

    1. Content-type lists (RSS/XML, JSON, CSV, generic XML, YAML), matched against
       both the HTTP header and the MIME type reported by ``magic``.
    2. ``pdf`` appearing in the effective MIME type.
    3. HTML markers in the first 200 characters, or a header that is exactly
       ``text/html``.
    4. ``magic`` reporting ``text/plain``.
    5. Content sniffing for ``<rss``/``<feed``, a leading ``<?xml``, or ``%pdf-1``.
    6. Any effective MIME type containing ``text`` (treated as plain text).

    If nothing matches, every flag keeps its False default.
    """

    # Classification flags (class-level defaults; __init__ overrides on the instance)
    is_pdf = False
    is_json = False
    is_html = False
    is_plaintext = False
    is_rss = False
    is_csv = False
    is_xml = False  # Generic XML, not RSS/Atom
    is_yaml = False

    def __init__(self, http_content_header, content):
        """Inspect the header and the first 200 characters/bytes of the content.

        :param http_content_header: Content-Type header value; None is treated as ''.
        :param content: response body as str or bytes; None is treated as empty.
        """
        # A missing header or body must not crash the substring checks below
        http_content_header = http_content_header or ''
        if content is None:
            content = ''

        magic_content_header = http_content_header
        head = content[:200]

        # The pattern checks below use str operations, so decode a bytes body
        # (lossily if needed) purely for sniffing; 'magic' still gets the original.
        if isinstance(head, (bytes, bytearray)):
            head_text = bytes(head).decode('utf-8', errors='replace')
        else:
            head_text = head
        test_content = head_text.lower().strip()

        # Remove whitespace between < and tag name for robust detection (handles '< html', '<\nhtml', etc.)
        test_content_normalized = re.sub(r'<\s+', '<', test_content)

        # Magic will sometimes call text/plain as text/html!
        magic_result = None
        try:
            mime = magic.from_buffer(head, mime=True)  # Send the original content
            logger.debug(f"Guessing mime type, original content_type '{http_content_header}', mime type detected '{mime}'")
            if mime and "/" in mime:
                magic_result = mime
                # Ignore generic/fallback mime types from magic
                if mime in ['application/octet-stream', 'application/x-empty', 'binary']:
                    logger.debug(f"Ignoring generic mime type '{mime}' from magic library")
                # Trust magic for non-text types immediately
                elif mime not in ['text/html', 'text/plain']:
                    magic_content_header = mime

        except Exception as e:
            # Best effort: detection continues with the header and content sniffing only
            logger.error(f"Error getting a more precise mime type from 'magic' library ({str(e)}), using content-based detection")

        # Content-based detection (most reliable for text formats)
        # Check for HTML patterns first - if found, override magic's text/plain
        has_html_patterns = any(p in test_content_normalized for p in HTML_PATTERNS)

        def _declared_as(content_types):
            # True when the HTTP header or the magic-derived type contains any listed type
            return (any(s in http_content_header for s in content_types)
                    or any(s in magic_content_header for s in content_types))

        # Always trust headers first
        if _declared_as(RSS_XML_CONTENT_TYPES):
            self.is_rss = True
        elif _declared_as(JSON_CONTENT_TYPES):
            self.is_json = True
        elif _declared_as(CSV_CONTENT_TYPES):
            self.is_csv = True
        elif _declared_as(XML_CONTENT_TYPES):
            # NOTE(review): with the lists as defined in this module, every entry of
            # XML_CONTENT_TYPES is also in RSS_XML_CONTENT_TYPES, so the RSS branch
            # above matches first and this branch is not reached - confirm intent.
            # Only mark as generic XML if not already detected as RSS
            if not self.is_rss:
                self.is_xml = True
        elif _declared_as(YAML_CONTENT_TYPES):
            self.is_yaml = True
        elif 'pdf' in magic_content_header:
            self.is_pdf = True
        ###
        elif has_html_patterns or http_content_header == 'text/html':
            self.is_html = True
        # If magic says text/plain and we found no HTML patterns, trust it
        elif magic_result == 'text/plain':
            self.is_plaintext = True
            logger.debug("Trusting magic's text/plain result (no HTML patterns detected)")
        elif '<rss' in test_content_normalized or '<feed' in test_content_normalized:
            self.is_rss = True
        elif test_content_normalized.startswith('<?xml'):
            # Generic XML that's not RSS/Atom (RSS/Atom checked above)
            self.is_xml = True
        elif '%pdf-1' in test_content:
            self.is_pdf = True
        # Only trust magic for 'text' if no other patterns matched
        elif 'text' in magic_content_header:
            self.is_plaintext = True
|
||||
|
||||
@@ -13,6 +13,8 @@ from changedetectionio import html_tools, content_fetchers
|
||||
from changedetectionio.blueprint.price_data_follower import PRICE_DATA_TRACK_ACCEPT, PRICE_DATA_TRACK_REJECT
|
||||
from loguru import logger
|
||||
|
||||
from changedetectionio.processors.magic import guess_stream_type
|
||||
|
||||
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
|
||||
|
||||
name = 'Webpage Text/HTML, JSON and PDF changes'
|
||||
@@ -20,6 +22,9 @@ description = 'Detects all text changes where possible'
|
||||
|
||||
json_filter_prefixes = ['json:', 'jq:', 'jqraw:']
|
||||
|
||||
# Assume it's this type if the server says nothing on content-type
|
||||
DEFAULT_WHEN_NO_CONTENT_TYPE_HEADER = 'text/html'
|
||||
|
||||
class FilterNotFoundInResponse(ValueError):
|
||||
def __init__(self, msg, screenshot=None, xpath_data=None):
|
||||
self.screenshot = screenshot
|
||||
@@ -45,6 +50,9 @@ class perform_site_check(difference_detection_processor):
|
||||
if not watch:
|
||||
raise Exception("Watch no longer exists.")
|
||||
|
||||
ctype_header = self.fetcher.get_all_headers().get('content-type', DEFAULT_WHEN_NO_CONTENT_TYPE_HEADER).lower()
|
||||
stream_content_type = guess_stream_type(http_content_header=ctype_header, content=self.fetcher.content)
|
||||
|
||||
# Unset any existing notification error
|
||||
update_obj = {'last_notification_error': False, 'last_error': False}
|
||||
|
||||
@@ -54,7 +62,7 @@ class perform_site_check(difference_detection_processor):
|
||||
self.xpath_data = self.fetcher.xpath_data
|
||||
|
||||
# Track the content type
|
||||
update_obj['content_type'] = self.fetcher.get_all_headers().get('content-type', '').lower()
|
||||
update_obj['content_type'] = ctype_header
|
||||
|
||||
# Watches added automatically in the queue manager will skip if its the same checksum as the previous run
|
||||
# Saves a lot of CPU
|
||||
@@ -69,24 +77,12 @@ class perform_site_check(difference_detection_processor):
|
||||
# https://stackoverflow.com/questions/41817578/basic-method-chaining ?
|
||||
# return content().textfilter().jsonextract().checksumcompare() ?
|
||||
|
||||
is_json = 'application/json' in self.fetcher.get_all_headers().get('content-type', '').lower()
|
||||
is_html = not is_json
|
||||
is_rss = False
|
||||
|
||||
ctype_header = self.fetcher.get_all_headers().get('content-type', '').lower()
|
||||
# Go into RSS preprocess for converting CDATA/comment to usable text
|
||||
if any(substring in ctype_header for substring in ['application/xml', 'application/rss', 'text/xml']):
|
||||
if '<rss' in self.fetcher.content[:100].lower():
|
||||
self.fetcher.content = cdata_in_document_to_text(html_content=self.fetcher.content)
|
||||
is_rss = True
|
||||
if stream_content_type.is_rss:
|
||||
self.fetcher.content = cdata_in_document_to_text(html_content=self.fetcher.content)
|
||||
|
||||
# source: support, basically treat it as plaintext
|
||||
if watch.is_source_type_url:
|
||||
is_html = False
|
||||
is_json = False
|
||||
|
||||
inline_pdf = self.fetcher.get_all_headers().get('content-disposition', '') and '%PDF-1' in self.fetcher.content[:10]
|
||||
if watch.is_pdf or 'application/pdf' in self.fetcher.get_all_headers().get('content-type', '').lower() or inline_pdf:
|
||||
if watch.is_pdf or stream_content_type.is_pdf:
|
||||
from shutil import which
|
||||
tool = os.getenv("PDF_TO_HTML_TOOL", "pdftohtml")
|
||||
if not which(tool):
|
||||
@@ -130,11 +126,12 @@ class perform_site_check(difference_detection_processor):
|
||||
has_filter_rule = len(include_filters_rule) and len(include_filters_rule[0].strip())
|
||||
has_subtractive_selectors = len(subtractive_selectors) and len(subtractive_selectors[0].strip())
|
||||
|
||||
if is_json and not has_filter_rule:
|
||||
include_filters_rule.append("json:$")
|
||||
has_filter_rule = True
|
||||
if stream_content_type.is_json:
|
||||
if not has_filter_rule:
|
||||
# Force a reformat
|
||||
include_filters_rule.append("json:$")
|
||||
has_filter_rule = True
|
||||
|
||||
if is_json:
|
||||
# Sort the JSON so we dont get false alerts when the content is just re-ordered
|
||||
try:
|
||||
self.fetcher.content = json.dumps(json.loads(self.fetcher.content), sort_keys=True)
|
||||
@@ -142,34 +139,25 @@ class perform_site_check(difference_detection_processor):
|
||||
# Might have just been a snippet, or otherwise bad JSON, continue
|
||||
pass
|
||||
|
||||
if has_filter_rule:
|
||||
for filter in include_filters_rule:
|
||||
if any(prefix in filter for prefix in json_filter_prefixes):
|
||||
stripped_text_from_html += html_tools.extract_json_as_string(content=self.fetcher.content, json_filter=filter)
|
||||
is_html = False
|
||||
if has_filter_rule:
|
||||
for filter in include_filters_rule:
|
||||
if any(prefix in filter for prefix in json_filter_prefixes):
|
||||
stripped_text_from_html += html_tools.extract_json_as_string(content=self.fetcher.content, json_filter=filter)
|
||||
if stripped_text_from_html:
|
||||
stream_content_type.is_json = True
|
||||
stream_content_type.is_html = False
|
||||
|
||||
if is_html or watch.is_source_type_url:
|
||||
# We have 'watch.is_source_type_url' because we should be able to use selectors on the raw HTML but return just that selected HTML
|
||||
if stream_content_type.is_html or watch.is_source_type_url or stream_content_type.is_plaintext or stream_content_type.is_rss or stream_content_type.is_xml or stream_content_type.is_pdf:
|
||||
|
||||
# CSS Filter, extract the HTML that matches and feed that into the existing inscriptis::get_text
|
||||
self.fetcher.content = html_tools.workarounds_for_obfuscations(self.fetcher.content)
|
||||
html_content = self.fetcher.content
|
||||
content_type = self.fetcher.get_all_headers().get('content-type', '').lower()
|
||||
is_attachment = 'attachment' in self.fetcher.get_all_headers().get('content-disposition', '').lower() or 'octet-stream' in content_type
|
||||
|
||||
# Try to detect better mime types if its a download or not announced as HTML
|
||||
if is_attachment:
|
||||
logger.debug(f"Got a reply that may be a download or possibly a text attachment, checking..")
|
||||
try:
|
||||
import magic
|
||||
mime = magic.from_buffer(html_content, mime=True)
|
||||
logger.debug(f"Guessing mime type, original content_type '{content_type}', mime type detected '{mime}'")
|
||||
if mime and "/" in mime: # looks valid and is a valid mime type
|
||||
content_type = mime
|
||||
except Exception as e:
|
||||
logger.error(f"Error getting a more precise mime type from 'magic' library ({str(e)}")
|
||||
|
||||
if 'text/' in content_type and not 'html' in content_type:
|
||||
# Some kind of "text" but definitely not RSS looking
|
||||
if stream_content_type.is_plaintext:
|
||||
# Don't run get_text or xpath/css filters on plaintext
|
||||
# We are not HTML, we are not any kind of RSS, doesnt even look like HTML
|
||||
stripped_text_from_html = html_content
|
||||
else:
|
||||
# If not JSON, and if it's not text/plain..
|
||||
@@ -186,13 +174,13 @@ class perform_site_check(difference_detection_processor):
|
||||
html_content += html_tools.xpath_filter(xpath_filter=filter_rule.replace('xpath:', ''),
|
||||
html_content=self.fetcher.content,
|
||||
append_pretty_line_formatting=not watch.is_source_type_url,
|
||||
is_rss=is_rss)
|
||||
is_rss=stream_content_type.is_rss)
|
||||
|
||||
elif filter_rule.startswith('xpath1:'):
|
||||
html_content += html_tools.xpath1_filter(xpath_filter=filter_rule.replace('xpath1:', ''),
|
||||
html_content=self.fetcher.content,
|
||||
append_pretty_line_formatting=not watch.is_source_type_url,
|
||||
is_rss=is_rss)
|
||||
is_rss=stream_content_type.is_rss)
|
||||
else:
|
||||
html_content += html_tools.include_filters(include_filters=filter_rule,
|
||||
html_content=self.fetcher.content,
|
||||
@@ -211,7 +199,7 @@ class perform_site_check(difference_detection_processor):
|
||||
do_anchor = self.datastore.data["settings"]["application"].get("render_anchor_tag_content", False)
|
||||
stripped_text_from_html = html_tools.html_to_text(html_content=html_content,
|
||||
render_anchor_tag_content=do_anchor,
|
||||
is_rss=is_rss) # 1874 activate the <title workaround hack
|
||||
is_rss=stream_content_type.is_rss) # 1874 activate the <title workaround hack
|
||||
|
||||
if watch.get('trim_text_whitespace'):
|
||||
stripped_text_from_html = '\n'.join(line.strip() for line in stripped_text_from_html.replace("\n\n", "\n").splitlines())
|
||||
@@ -250,7 +238,7 @@ class perform_site_check(difference_detection_processor):
|
||||
|
||||
# Treat pages with no renderable text content as a change? No by default
|
||||
empty_pages_are_a_change = self.datastore.data['settings']['application'].get('empty_pages_are_a_change', False)
|
||||
if not is_json and not empty_pages_are_a_change and len(stripped_text_from_html.strip()) == 0:
|
||||
if not stream_content_type.is_json and not empty_pages_are_a_change and len(stripped_text_from_html.strip()) == 0:
|
||||
raise content_fetchers.exceptions.ReplyWithContentButNoText(url=url,
|
||||
status_code=self.fetcher.get_last_status_code(),
|
||||
screenshot=self.fetcher.screenshot,
|
||||
|
||||
@@ -174,6 +174,8 @@ def test_non_text_mime_or_downloads(client, live_server, measure_memory_usage):
|
||||
but once the server sends content-type: application/octet-stream (which is usually done to force the browser to show the Download dialog),
|
||||
changedetection somehow ignores all line breaks and treats the document file as if everything is on one line.
|
||||
|
||||
WHAT THIS DOES - makes the system rely on 'magic' to determine what is it
|
||||
|
||||
:param client:
|
||||
:param live_server:
|
||||
:param measure_memory_usage:
|
||||
@@ -271,6 +273,7 @@ got it\r\n
|
||||
url_for("ui.ui_views.preview_page", uuid="first"),
|
||||
follow_redirects=True
|
||||
)
|
||||
|
||||
assert b"some random text that should be split by line\n" in res.data
|
||||
####
|
||||
|
||||
|
||||
@@ -264,8 +264,6 @@ def test_limit_tag_ui(client, live_server, measure_memory_usage):
|
||||
client.get(url_for('ui.mark_all_viewed', tag=tag_uuid), follow_redirects=True)
|
||||
wait_for_all_checks(client)
|
||||
|
||||
with open('/tmp/fuck.html', 'wb') as f:
|
||||
f.write(res.data)
|
||||
# Should be only 1 unviewed
|
||||
res = client.get(url_for("watchlist.index"))
|
||||
assert res.data.count(b' unviewed ') == 1
|
||||
|
||||
@@ -3,9 +3,8 @@
|
||||
import time
|
||||
import os
|
||||
import json
|
||||
import logging
|
||||
from flask import url_for
|
||||
from .util import live_server_setup, wait_for_all_checks
|
||||
from .util import wait_for_all_checks
|
||||
from urllib.parse import urlparse, parse_qs
|
||||
|
||||
def test_consistent_history(client, live_server, measure_memory_usage):
|
||||
|
||||
@@ -1,12 +1,42 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
import time
|
||||
|
||||
from flask import url_for
|
||||
from .util import live_server_setup, wait_for_all_checks
|
||||
|
||||
from ..html_tools import *
|
||||
from .util import wait_for_all_checks
|
||||
from ..processors.magic import RSS_XML_CONTENT_TYPES
|
||||
|
||||
|
||||
def set_rss_atom_feed_response(header=''):
    """Write a sample RSS 2.0 feed to the file served by the test endpoint.

    :param header: text placed before the feed body, e.g. an ``<?xml ...?>``
        declaration; empty by default.
    :return: None
    """
    test_return_data = f"""{header}<!-- Generated on Wed, 08 Oct 2025 08:42:33 -0700, really really honestly -->
<rss xmlns:atom="http://www.w3.org/2005/Atom" version="2.0">
<channel>
<atom:link href="https://store.waterpowered.com/news/collection//" rel="self" type="application/rss+xml"/>
<title>RSS Feed</title>
<link>
<![CDATA[ https://store.waterpowered.com/news/collection// ]]>
</link>
<description>
<![CDATA[ Events and Announcements for ]]>
</description>
<language>en-us</language>
<generator>water News RSS</generator>
<item>
<title> 🍁 Lets go discount</title>
<description><p class="bb_paragraph">ok heres the description</p></description>
<link>
<![CDATA[ https://store.waterpowered.com/news/app/1643320/view/511845698831908921 ]]>
</link>
<pubDate>Wed, 08 Oct 2025 15:28:55 +0000</pubDate>
<guid isPermaLink="true">https://store.waterpowered.com/news/app/1643320/view/511845698831908921</guid>
<enclosure url="https://clan.fastly.waterstatic.com/images/40721482/42822e5f00b2becf520ace9500981bb56f3a89f2.jpg" length="0" type="image/jpeg"/>
</item>
</channel>
</rss>"""

    # The payload contains a non-ASCII emoji, so pin the encoding instead of
    # relying on the platform default (which may not be able to encode it).
    with open("test-datastore/endpoint-content.txt", "w", encoding="utf-8") as f:
        f.write(test_return_data)

    return None
|
||||
|
||||
|
||||
|
||||
@@ -575,3 +605,47 @@ def test_xpath_20_function_string_join_matches(client, live_server, measure_memo
|
||||
|
||||
client.get(url_for("ui.form_delete", uuid="all"), follow_redirects=True)
|
||||
|
||||
|
||||
def _subtest_xpath_rss(client, content_type='text/html'):
    """Add a watch with an ``xpath://item`` filter and check the preview output.

    The watch points at the test endpoint, which is asked to reply with
    ``content_type``. The preview must contain the item text and must not contain
    the channel description. All watches are deleted afterwards.

    :param client: test client used to drive the UI endpoints.
    :param content_type: Content-Type the test endpoint should reply with.
    """
    # Add our URL to the import page
    test_url = url_for('test_endpoint', content_type=content_type, _external=True)
    res = client.post(
        url_for("ui.ui_views.form_quick_watch_add"),
        data={"url": test_url, "tags": '', 'edit_and_watch_submit_button': 'Edit > Watch'},
        follow_redirects=True
    )

    assert b"Watch added in Paused state, saving will unpause" in res.data

    res = client.post(
        url_for("ui.ui_edit.edit_page", uuid="first", unpause_on_save=1),
        data={
            "url": test_url,
            "include_filters": "xpath://item",
            "tags": '',
            "fetch_backend": "html_requests",
            "time_between_check_use_default": "y",
        },
        follow_redirects=True
    )

    assert b"unpaused" in res.data
    wait_for_all_checks(client)

    res = client.get(
        url_for("ui.ui_views.preview_page", uuid="first"),
        follow_redirects=True
    )

    assert b"Lets go discount" in res.data, f"When testing for Lets go discount called with content type '{content_type}'"
    # It should not be here because thats not our selector target
    assert b"Events and Announcements" not in res.data, f"When testing for Lets go discount called with content type '{content_type}'"

    client.get(url_for("ui.form_delete", uuid="all"), follow_redirects=True)
|
||||
|
||||
# Be sure all-in-the-wild types of RSS feeds work with xpath
def test_rss_xpath(client, live_server):
    """Run the xpath RSS sub-test for every RSS/XML content type.

    Each content type is exercised twice: once with a feed that has no XML
    declaration and once with a feed that starts with one.
    """
    for feed_header in ['', '<?xml version="1.0" encoding="utf-8"?>']:
        set_rss_atom_feed_response(header=feed_header)
        for content_type in RSS_XML_CONTENT_TYPES:
            _subtest_xpath_rss(client, content_type=content_type)
|
||||
|
||||
Reference in New Issue
Block a user