Compare commits

...

12 Commits

Author SHA1 Message Date
dgtlmoon
a3b40234fc woops 2025-10-08 23:44:07 +02:00
dgtlmoon
4c1f089a06 Fixing PDF 2025-10-08 23:18:25 +02:00
dgtlmoon
09b052c6b1 Rename arg to be more clear 2025-10-08 23:12:06 +02:00
dgtlmoon
bcde39253e tweak info 2025-10-08 23:10:40 +02:00
dgtlmoon
f770fa3765 WIP 2025-10-08 23:00:40 +02:00
dgtlmoon
b6c6f3a312 WIP 2025-10-08 22:26:12 +02:00
dgtlmoon
6de0b312e7 Misc fixes 2025-10-08 22:24:17 +02:00
dgtlmoon
013f3117b6 Moving mime/content type detection to its own helper 2025-10-08 22:07:40 +02:00
dgtlmoon
edd64fb1dd Remove debug :-) 2025-10-08 19:15:25 +02:00
dgtlmoon
82833abf1a move logic 2025-10-08 19:14:30 +02:00
dgtlmoon
6aa253df5f Improve JSON detection 2025-10-08 19:04:10 +02:00
dgtlmoon
15912999b6 Improving RSS/HTML/Plaintext detection 2025-10-08 18:58:13 +02:00
6 changed files with 253 additions and 53 deletions

View File

@@ -0,0 +1,138 @@
"""
Content Type Detection and Stream Classification
This module provides intelligent content-type detection for changedetection.io.
It addresses the common problem where HTTP Content-Type headers are missing, incorrect,
or too generic, which would otherwise cause the wrong processor to be used.
The guess_stream_type class combines:
1. HTTP Content-Type headers (when available and reliable)
2. Python-magic library for MIME detection (analyzing actual file content)
3. Content-based pattern matching for text formats (HTML tags, XML declarations, etc.)
This multi-layered approach ensures accurate detection of RSS feeds, JSON, HTML, PDF,
plain text, CSV, YAML, and XML formats - even when servers provide misleading headers.
Used by: processors/text_json_diff/processor.py and other content processors
"""
# When to apply the 'cdata to real HTML' hack
RSS_XML_CONTENT_TYPES = [
"application/rss+xml",
"application/rdf+xml",
"text/xml",
"application/xml",
"application/atom+xml",
"text/rss+xml", # rare, non-standard
"application/x-rss+xml", # legacy (older feed software)
"application/x-atom+xml", # legacy (older Atom)
]
# JSON Content-types
JSON_CONTENT_TYPES = [
"application/activity+json",
"application/feed+json",
"application/json",
"application/ld+json",
"application/vnd.api+json",
]
# CSV Content-types
CSV_CONTENT_TYPES = [
"text/csv",
"application/csv",
]
# Generic XML Content-types (non-RSS/Atom)
XML_CONTENT_TYPES = [
"text/xml",
"application/xml",
]
# YAML Content-types
YAML_CONTENT_TYPES = [
"text/yaml",
"text/x-yaml",
"application/yaml",
"application/x-yaml",
]
HTML_PATTERNS = ['<!doctype html', '<html', '<head', '<body', '<script', '<iframe', '<div']
import re
import magic
from loguru import logger
class guess_stream_type():
is_pdf = False
is_json = False
is_html = False
is_plaintext = False
is_rss = False
is_csv = False
is_xml = False # Generic XML, not RSS/Atom
is_yaml = False
def __init__(self, http_content_header, content):
magic_content_header = http_content_header
test_content = content[:200].lower().strip()
# Remove whitespace between < and tag name for robust detection (handles '< html', '<\nhtml', etc.)
test_content_normalized = re.sub(r'<\s+', '<', test_content)
# Magic will sometimes call text/plain as text/html!
magic_result = None
try:
mime = magic.from_buffer(content[:200], mime=True) # Send the original content
logger.debug(f"Guessing mime type, original content_type '{http_content_header}', mime type detected '{mime}'")
if mime and "/" in mime:
magic_result = mime
# Ignore generic/fallback mime types from magic
if mime in ['application/octet-stream', 'application/x-empty', 'binary']:
logger.debug(f"Ignoring generic mime type '{mime}' from magic library")
# Trust magic for non-text types immediately
elif mime not in ['text/html', 'text/plain']:
magic_content_header = mime
except Exception as e:
logger.error(f"Error getting a more precise mime type from 'magic' library ({str(e)}), using content-based detection")
# Content-based detection (most reliable for text formats)
# Check for HTML patterns first - if found, override magic's text/plain
has_html_patterns = any(p in test_content_normalized for p in HTML_PATTERNS)
# Always trust headers first
if any(s in http_content_header for s in RSS_XML_CONTENT_TYPES) or any(s in magic_content_header for s in RSS_XML_CONTENT_TYPES):
self.is_rss = True
elif any(s in http_content_header for s in JSON_CONTENT_TYPES) or any(s in magic_content_header for s in JSON_CONTENT_TYPES):
self.is_json = True
elif any(s in http_content_header for s in CSV_CONTENT_TYPES) or any(s in magic_content_header for s in CSV_CONTENT_TYPES):
self.is_csv = True
elif any(s in http_content_header for s in XML_CONTENT_TYPES) or any(s in magic_content_header for s in XML_CONTENT_TYPES):
# Only mark as generic XML if not already detected as RSS
if not self.is_rss:
self.is_xml = True
elif any(s in http_content_header for s in YAML_CONTENT_TYPES) or any(s in magic_content_header for s in YAML_CONTENT_TYPES):
self.is_yaml = True
elif 'pdf' in magic_content_header:
self.is_pdf = True
###
elif has_html_patterns or http_content_header == 'text/html':
self.is_html = True
# If magic says text/plain and we found no HTML patterns, trust it
elif magic_result == 'text/plain':
self.is_plaintext = True
logger.debug(f"Trusting magic's text/plain result (no HTML patterns detected)")
elif '<rss' in test_content_normalized or '<feed' in test_content_normalized:
self.is_rss = True
elif test_content_normalized.startswith('<?xml'):
# Generic XML that's not RSS/Atom (RSS/Atom checked above)
self.is_xml = True
elif '%pdf-1' in test_content:
self.is_pdf = True
# Only trust magic for 'text' if no other patterns matched
elif 'text' in magic_content_header:
self.is_plaintext = True
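
For orientation, a minimal usage sketch of the new helper (illustration only, not part of the commit; the header and body values are invented, and the keyword arguments match the call site in processor.py below):

# Hedged usage sketch of guess_stream_type - not from the diff.
from changedetectionio.processors.magic import guess_stream_type

# A feed served with a trustworthy Content-Type: the header branch fires first,
# before python-magic or the content-pattern fallbacks are consulted.
body = '<?xml version="1.0"?><rss version="2.0"><channel><title>x</title></channel></rss>'
guessed = guess_stream_type(http_content_header='application/rss+xml', content=body)
assert guessed.is_rss and not guessed.is_html

# With a generic header such as 'application/octet-stream' the class instead
# falls back to python-magic plus the content patterns, so the resulting flags
# then depend on what libmagic reports for the buffer.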

View File

@@ -13,6 +13,8 @@ from changedetectionio import html_tools, content_fetchers
from changedetectionio.blueprint.price_data_follower import PRICE_DATA_TRACK_ACCEPT, PRICE_DATA_TRACK_REJECT
from loguru import logger
from changedetectionio.processors.magic import guess_stream_type
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
name = 'Webpage Text/HTML, JSON and PDF changes'
@@ -20,6 +22,9 @@ description = 'Detects all text changes where possible'
json_filter_prefixes = ['json:', 'jq:', 'jqraw:']
# Assume it's this type if the server says nothing on content-type
DEFAULT_WHEN_NO_CONTENT_TYPE_HEADER = 'text/html'
class FilterNotFoundInResponse(ValueError):
def __init__(self, msg, screenshot=None, xpath_data=None):
self.screenshot = screenshot
@@ -45,6 +50,9 @@ class perform_site_check(difference_detection_processor):
if not watch:
raise Exception("Watch no longer exists.")
ctype_header = self.fetcher.get_all_headers().get('content-type', DEFAULT_WHEN_NO_CONTENT_TYPE_HEADER).lower()
stream_content_type = guess_stream_type(http_content_header=ctype_header, content=self.fetcher.content)
# Unset any existing notification error
update_obj = {'last_notification_error': False, 'last_error': False}
@@ -54,7 +62,7 @@ class perform_site_check(difference_detection_processor):
self.xpath_data = self.fetcher.xpath_data
# Track the content type
update_obj['content_type'] = self.fetcher.get_all_headers().get('content-type', '').lower()
update_obj['content_type'] = ctype_header
# Watches added automatically in the queue manager will skip if it's the same checksum as the previous run
# Saves a lot of CPU
@@ -69,24 +77,12 @@ class perform_site_check(difference_detection_processor):
# https://stackoverflow.com/questions/41817578/basic-method-chaining ?
# return content().textfilter().jsonextract().checksumcompare() ?
is_json = 'application/json' in self.fetcher.get_all_headers().get('content-type', '').lower()
is_html = not is_json
is_rss = False
ctype_header = self.fetcher.get_all_headers().get('content-type', '').lower()
# Go into RSS preprocess for converting CDATA/comment to usable text
if any(substring in ctype_header for substring in ['application/xml', 'application/rss', 'text/xml']):
if '<rss' in self.fetcher.content[:100].lower():
self.fetcher.content = cdata_in_document_to_text(html_content=self.fetcher.content)
is_rss = True
if stream_content_type.is_rss:
self.fetcher.content = cdata_in_document_to_text(html_content=self.fetcher.content)
# source: support, basically treat it as plaintext
if watch.is_source_type_url:
is_html = False
is_json = False
inline_pdf = self.fetcher.get_all_headers().get('content-disposition', '') and '%PDF-1' in self.fetcher.content[:10]
if watch.is_pdf or 'application/pdf' in self.fetcher.get_all_headers().get('content-type', '').lower() or inline_pdf:
if watch.is_pdf or stream_content_type.is_pdf:
from shutil import which
tool = os.getenv("PDF_TO_HTML_TOOL", "pdftohtml")
if not which(tool):
@@ -130,11 +126,12 @@ class perform_site_check(difference_detection_processor):
has_filter_rule = len(include_filters_rule) and len(include_filters_rule[0].strip())
has_subtractive_selectors = len(subtractive_selectors) and len(subtractive_selectors[0].strip())
if is_json and not has_filter_rule:
include_filters_rule.append("json:$")
has_filter_rule = True
if stream_content_type.is_json:
if not has_filter_rule:
# Force a reformat
include_filters_rule.append("json:$")
has_filter_rule = True
if is_json:
# Sort the JSON so we don't get false alerts when the content is just re-ordered
try:
self.fetcher.content = json.dumps(json.loads(self.fetcher.content), sort_keys=True)
@@ -142,34 +139,25 @@ class perform_site_check(difference_detection_processor):
# Might have just been a snippet, or otherwise bad JSON, continue
pass
if has_filter_rule:
for filter in include_filters_rule:
if any(prefix in filter for prefix in json_filter_prefixes):
stripped_text_from_html += html_tools.extract_json_as_string(content=self.fetcher.content, json_filter=filter)
is_html = False
if has_filter_rule:
for filter in include_filters_rule:
if any(prefix in filter for prefix in json_filter_prefixes):
stripped_text_from_html += html_tools.extract_json_as_string(content=self.fetcher.content, json_filter=filter)
if stripped_text_from_html:
stream_content_type.is_json = True
stream_content_type.is_html = False
if is_html or watch.is_source_type_url:
# We have 'watch.is_source_type_url' because we should be able to use selectors on the raw HTML but return just that selected HTML
if stream_content_type.is_html or watch.is_source_type_url or stream_content_type.is_plaintext or stream_content_type.is_rss or stream_content_type.is_xml or stream_content_type.is_pdf:
# CSS Filter, extract the HTML that matches and feed that into the existing inscriptis::get_text
self.fetcher.content = html_tools.workarounds_for_obfuscations(self.fetcher.content)
html_content = self.fetcher.content
content_type = self.fetcher.get_all_headers().get('content-type', '').lower()
is_attachment = 'attachment' in self.fetcher.get_all_headers().get('content-disposition', '').lower() or 'octet-stream' in content_type
# Try to detect better mime types if its a download or not announced as HTML
if is_attachment:
logger.debug(f"Got a reply that may be a download or possibly a text attachment, checking..")
try:
import magic
mime = magic.from_buffer(html_content, mime=True)
logger.debug(f"Guessing mime type, original content_type '{content_type}', mime type detected '{mime}'")
if mime and "/" in mime: # looks valid and is a valid mime type
content_type = mime
except Exception as e:
logger.error(f"Error getting a more precise mime type from 'magic' library ({str(e)}")
if 'text/' in content_type and not 'html' in content_type:
# Some kind of "text" but definitely not RSS looking
if stream_content_type.is_plaintext:
# Don't run get_text or xpath/css filters on plaintext
# We are not HTML, we are not any kind of RSS, doesn't even look like HTML
stripped_text_from_html = html_content
else:
# If not JSON, and if it's not text/plain..
@@ -186,13 +174,13 @@ class perform_site_check(difference_detection_processor):
html_content += html_tools.xpath_filter(xpath_filter=filter_rule.replace('xpath:', ''),
html_content=self.fetcher.content,
append_pretty_line_formatting=not watch.is_source_type_url,
is_rss=is_rss)
is_rss=stream_content_type.is_rss)
elif filter_rule.startswith('xpath1:'):
html_content += html_tools.xpath1_filter(xpath_filter=filter_rule.replace('xpath1:', ''),
html_content=self.fetcher.content,
append_pretty_line_formatting=not watch.is_source_type_url,
is_rss=is_rss)
is_rss=stream_content_type.is_rss)
else:
html_content += html_tools.include_filters(include_filters=filter_rule,
html_content=self.fetcher.content,
@@ -211,7 +199,7 @@ class perform_site_check(difference_detection_processor):
do_anchor = self.datastore.data["settings"]["application"].get("render_anchor_tag_content", False)
stripped_text_from_html = html_tools.html_to_text(html_content=html_content,
render_anchor_tag_content=do_anchor,
is_rss=is_rss) # 1874 activate the <title workaround hack
is_rss=stream_content_type.is_rss) # 1874 activate the <title workaround hack
if watch.get('trim_text_whitespace'):
stripped_text_from_html = '\n'.join(line.strip() for line in stripped_text_from_html.replace("\n\n", "\n").splitlines())
@@ -250,7 +238,7 @@ class perform_site_check(difference_detection_processor):
# Treat pages with no renderable text content as a change? No by default
empty_pages_are_a_change = self.datastore.data['settings']['application'].get('empty_pages_are_a_change', False)
if not is_json and not empty_pages_are_a_change and len(stripped_text_from_html.strip()) == 0:
if not stream_content_type.is_json and not empty_pages_are_a_change and len(stripped_text_from_html.strip()) == 0:
raise content_fetchers.exceptions.ReplyWithContentButNoText(url=url,
status_code=self.fetcher.get_last_status_code(),
screenshot=self.fetcher.screenshot,
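
One detail from the hunks above deserves a note: the json.loads/json.dumps round-trip with sort_keys=True is what suppresses ordering-only changes. A standalone illustration (plain Python, not part of the commit):

import json

a = '{"b": 2, "a": 1}'
b = '{"a": 1, "b": 2}'

# Same data, different key order - after canonicalising with sort_keys=True the
# change detector sees identical text, so no false alert is raised.
assert json.dumps(json.loads(a), sort_keys=True) == json.dumps(json.loads(b), sort_keys=True)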

View File

@@ -174,6 +174,8 @@ def test_non_text_mime_or_downloads(client, live_server, measure_memory_usage):
but once the server sends content-type: application/octet-stream (which is usually done to force the browser to show the Download dialog),
changedetection somehow ignores all line breaks and treats the document file as if everything is on one line.
WHAT THIS DOES - makes the system rely on 'magic' to determine what it is
:param client:
:param live_server:
:param measure_memory_usage:
@@ -271,6 +273,7 @@ got it\r\n
url_for("ui.ui_views.preview_page", uuid="first"),
follow_redirects=True
)
assert b"some random text that should be split by line\n" in res.data
####
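
The docstring above is the motivation for the generic-mime handling in magic.py: 'application/octet-stream' is exactly the kind of header the new helper refuses to trust. A hedged sketch of what this test relies on (the sample body is invented; the outcome ultimately depends on what libmagic reports):

# Sketch only - 'application/octet-stream' is in the generic-mime ignore list in
# magic.py above, so classification falls through to python-magic and the
# content checks. For a plain ASCII body libmagic normally reports text/plain,
# which should land in the is_plaintext branch rather than being treated as HTML.
from changedetectionio.processors.magic import guess_stream_type

guessed = guess_stream_type(http_content_header='application/octet-stream',
                            content='some random text that should be split by line\n')
# Expected (not guaranteed): guessed.is_plaintext is True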

View File

@@ -264,8 +264,6 @@ def test_limit_tag_ui(client, live_server, measure_memory_usage):
client.get(url_for('ui.mark_all_viewed', tag=tag_uuid), follow_redirects=True)
wait_for_all_checks(client)
with open('/tmp/fuck.html', 'wb') as f:
f.write(res.data)
# Should be only 1 unviewed
res = client.get(url_for("watchlist.index"))
assert res.data.count(b' unviewed ') == 1

View File

@@ -3,9 +3,8 @@
import time
import os
import json
import logging
from flask import url_for
from .util import live_server_setup, wait_for_all_checks
from .util import wait_for_all_checks
from urllib.parse import urlparse, parse_qs
def test_consistent_history(client, live_server, measure_memory_usage):

View File

@@ -1,12 +1,42 @@
# -*- coding: utf-8 -*-
import time
from flask import url_for
from .util import live_server_setup, wait_for_all_checks
from ..html_tools import *
from .util import wait_for_all_checks
from ..processors.magic import RSS_XML_CONTENT_TYPES
def set_rss_atom_feed_response(header=''):
    test_return_data = f"""{header}<!-- Generated on Wed, 08 Oct 2025 08:42:33 -0700, really really honestly -->
<rss xmlns:atom="http://www.w3.org/2005/Atom" version="2.0">
<channel>
<atom:link href="https://store.waterpowered.com/news/collection//" rel="self" type="application/rss+xml"/>
<title>RSS Feed</title>
<link>
<![CDATA[ https://store.waterpowered.com/news/collection// ]]>
</link>
<description>
<![CDATA[ Events and Announcements for ]]>
</description>
<language>en-us</language>
<generator>water News RSS</generator>
<item>
<title> 🍁 Lets go discount</title>
<description><p class="bb_paragraph">ok heres the description</p></description>
<link>
<![CDATA[ https://store.waterpowered.com/news/app/1643320/view/511845698831908921 ]]>
</link>
<pubDate>Wed, 08 Oct 2025 15:28:55 +0000</pubDate>
<guid isPermaLink="true">https://store.waterpowered.com/news/app/1643320/view/511845698831908921</guid>
<enclosure url="https://clan.fastly.waterstatic.com/images/40721482/42822e5f00b2becf520ace9500981bb56f3a89f2.jpg" length="0" type="image/jpeg"/>
</item>
</channel>
</rss>"""

    with open("test-datastore/endpoint-content.txt", "w") as f:
        f.write(test_return_data)

    return None
@@ -575,3 +605,47 @@ def test_xpath_20_function_string_join_matches(client, live_server, measure_memo
client.get(url_for("ui.form_delete", uuid="all"), follow_redirects=True)
def _subtest_xpath_rss(client, content_type='text/html'):
    # Add our URL to the import page
    test_url = url_for('test_endpoint', content_type=content_type, _external=True)

    res = client.post(
        url_for("ui.ui_views.form_quick_watch_add"),
        data={"url": test_url, "tags": '', 'edit_and_watch_submit_button': 'Edit > Watch'},
        follow_redirects=True
    )
    assert b"Watch added in Paused state, saving will unpause" in res.data

    res = client.post(
        url_for("ui.ui_edit.edit_page", uuid="first", unpause_on_save=1),
        data={
            "url": test_url,
            "include_filters": "xpath://item",
            "tags": '',
            "fetch_backend": "html_requests",
            "time_between_check_use_default": "y",
        },
        follow_redirects=True
    )
    assert b"unpaused" in res.data

    wait_for_all_checks(client)

    res = client.get(
        url_for("ui.ui_views.preview_page", uuid="first"),
        follow_redirects=True
    )

    assert b"Lets go discount" in res.data, f"Expected 'Lets go discount' when called with content type '{content_type}'"
    # It should not be here because that's not our selector target
    assert b"Events and Announcements" not in res.data, f"Did not expect 'Events and Announcements' when called with content type '{content_type}'"

    client.get(url_for("ui.form_delete", uuid="all"), follow_redirects=True)
# Be sure all-in-the-wild types of RSS feeds work with xpath
def test_rss_xpath(client, live_server):
    for feed_header in ['', '<?xml version="1.0" encoding="utf-8"?>']:
        set_rss_atom_feed_response(header=feed_header)
        for content_type in RSS_XML_CONTENT_TYPES:
            _subtest_xpath_rss(client, content_type=content_type)