mirror of
https://github.com/dgtlmoon/changedetection.io.git
synced 2025-11-08 10:36:32 +00:00
Compare commits
2 Commits
openapi-me
...
socks-prox
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
3704580990 | ||
|
|
e2fa021f80 |
4
.github/dependabot.yml
vendored
4
.github/dependabot.yml
vendored
@@ -11,4 +11,6 @@ updates:
|
||||
- package-ecosystem: pip
|
||||
directory: /
|
||||
schedule:
|
||||
interval: "weekly"
|
||||
interval: "daily"
|
||||
allow:
|
||||
- dependency-name: "apprise"
|
||||
|
||||
6
.github/workflows/codeql-analysis.yml
vendored
6
.github/workflows/codeql-analysis.yml
vendored
@@ -34,7 +34,7 @@ jobs:
|
||||
|
||||
# Initializes the CodeQL tools for scanning.
|
||||
- name: Initialize CodeQL
|
||||
uses: github/codeql-action/init@v4
|
||||
uses: github/codeql-action/init@v3
|
||||
with:
|
||||
languages: ${{ matrix.language }}
|
||||
# If you wish to specify custom queries, you can do so here or in a config file.
|
||||
@@ -45,7 +45,7 @@ jobs:
|
||||
# Autobuild attempts to build any compiled languages (C/C++, C#, or Java).
|
||||
# If this step fails, then you should remove it and run the build manually (see below)
|
||||
- name: Autobuild
|
||||
uses: github/codeql-action/autobuild@v4
|
||||
uses: github/codeql-action/autobuild@v3
|
||||
|
||||
# ℹ️ Command-line programs to run using the OS shell.
|
||||
# 📚 https://git.io/JvXDl
|
||||
@@ -59,4 +59,4 @@ jobs:
|
||||
# make release
|
||||
|
||||
- name: Perform CodeQL Analysis
|
||||
uses: github/codeql-action/analyze@v4
|
||||
uses: github/codeql-action/analyze@v3
|
||||
|
||||
4
.github/workflows/containers.yml
vendored
4
.github/workflows/containers.yml
vendored
@@ -95,7 +95,7 @@ jobs:
|
||||
push: true
|
||||
tags: |
|
||||
${{ secrets.DOCKER_HUB_USERNAME }}/changedetection.io:dev,ghcr.io/${{ github.repository }}:dev
|
||||
platforms: linux/amd64,linux/arm64,linux/arm/v7,linux/arm/v8
|
||||
platforms: linux/amd64,linux/arm64,linux/arm/v7,linux/arm/v8,linux/arm64/v8
|
||||
cache-from: type=gha
|
||||
cache-to: type=gha,mode=max
|
||||
|
||||
@@ -133,7 +133,7 @@ jobs:
|
||||
file: ./Dockerfile
|
||||
push: true
|
||||
tags: ${{ steps.meta.outputs.tags }}
|
||||
platforms: linux/amd64,linux/arm64,linux/arm/v7,linux/arm/v8
|
||||
platforms: linux/amd64,linux/arm64,linux/arm/v7,linux/arm/v8,linux/arm64/v8
|
||||
cache-from: type=gha
|
||||
cache-to: type=gha,mode=max
|
||||
# Looks like this was disabled
|
||||
|
||||
4
.github/workflows/test-container-build.yml
vendored
4
.github/workflows/test-container-build.yml
vendored
@@ -38,6 +38,8 @@ jobs:
|
||||
dockerfile: ./Dockerfile
|
||||
- platform: linux/arm/v8
|
||||
dockerfile: ./Dockerfile
|
||||
- platform: linux/arm64/v8
|
||||
dockerfile: ./Dockerfile
|
||||
# Alpine Dockerfile platforms (musl via alpine check)
|
||||
- platform: linux/amd64
|
||||
dockerfile: ./.github/test/Dockerfile-alpine
|
||||
@@ -74,5 +76,5 @@ jobs:
|
||||
file: ${{ matrix.dockerfile }}
|
||||
platforms: ${{ matrix.platform }}
|
||||
cache-from: type=gha
|
||||
cache-to: type=gha,mode=min
|
||||
cache-to: type=gha,mode=max
|
||||
|
||||
|
||||
@@ -2,7 +2,7 @@
|
||||
|
||||
# Read more https://github.com/dgtlmoon/changedetection.io/wiki
|
||||
|
||||
__version__ = '0.50.18'
|
||||
__version__ = '0.50.14'
|
||||
|
||||
from changedetectionio.strtobool import strtobool
|
||||
from json.decoder import JSONDecodeError
|
||||
|
||||
@@ -1,7 +1,10 @@
|
||||
import copy
|
||||
import yaml
|
||||
import functools
|
||||
from flask import request, abort
|
||||
from loguru import logger
|
||||
from openapi_core import OpenAPI
|
||||
from openapi_core.contrib.flask import FlaskOpenAPIRequest
|
||||
from . import api_schema
|
||||
from ..model import watch_base
|
||||
|
||||
@@ -31,11 +34,7 @@ schema_delete_notification_urls['required'] = ['notification_urls']
|
||||
|
||||
@functools.cache
|
||||
def get_openapi_spec():
|
||||
"""Lazy load OpenAPI spec and dependencies only when validation is needed."""
|
||||
import os
|
||||
import yaml # Lazy import - only loaded when API validation is actually used
|
||||
from openapi_core import OpenAPI # Lazy import - saves ~10.7 MB on startup
|
||||
|
||||
spec_path = os.path.join(os.path.dirname(__file__), '../../docs/api-spec.yaml')
|
||||
with open(spec_path, 'r') as f:
|
||||
spec_dict = yaml.safe_load(f)
|
||||
@@ -50,9 +49,6 @@ def validate_openapi_request(operation_id):
|
||||
try:
|
||||
# Skip OpenAPI validation for GET requests since they don't have request bodies
|
||||
if request.method.upper() != 'GET':
|
||||
# Lazy import - only loaded when actually validating a request
|
||||
from openapi_core.contrib.flask import FlaskOpenAPIRequest
|
||||
|
||||
spec = get_openapi_spec()
|
||||
openapi_request = FlaskOpenAPIRequest(request)
|
||||
result = spec.unmarshal_request(openapi_request)
|
||||
|
||||
@@ -191,12 +191,6 @@ nav
|
||||
</ul>
|
||||
</span>
|
||||
</fieldset>
|
||||
<fieldset class="pure-group">
|
||||
{{ render_checkbox_field(form.application.form.strip_ignored_lines) }}
|
||||
<span class="pure-form-message-inline">Remove any text that appears in the "Ignore text" from the output (otherwise its just ignored for change-detection)<br>
|
||||
<i>Note:</i> Changing this will change the status of your existing watches, possibly trigger alerts etc.
|
||||
</span>
|
||||
</fieldset>
|
||||
</div>
|
||||
|
||||
<div class="tab-pane-inner" id="api">
|
||||
|
||||
@@ -759,7 +759,6 @@ class processor_text_json_diff_form(commonSettingsForm):
|
||||
check_unique_lines = BooleanField('Only trigger when unique lines appear in all history', default=False)
|
||||
remove_duplicate_lines = BooleanField('Remove duplicate lines of text', default=False)
|
||||
sort_text_alphabetically = BooleanField('Sort text alphabetically', default=False)
|
||||
strip_ignored_lines = TernaryNoneBooleanField('Strip ignored lines', default=None)
|
||||
trim_text_whitespace = BooleanField('Trim whitespace before and after text', default=False)
|
||||
|
||||
filter_text_added = BooleanField('Added lines', default=True)
|
||||
@@ -937,7 +936,6 @@ class globalSettingsApplicationForm(commonSettingsForm):
|
||||
removepassword_button = SubmitField('Remove password', render_kw={"class": "pure-button pure-button-primary"})
|
||||
render_anchor_tag_content = BooleanField('Render anchor tag content', default=False)
|
||||
shared_diff_access = BooleanField('Allow anonymous access to watch history page when password is enabled', default=False, validators=[validators.Optional()])
|
||||
strip_ignored_lines = BooleanField('Strip ignored lines')
|
||||
rss_hide_muted_watches = BooleanField('Hide muted watches from RSS feed', default=True,
|
||||
validators=[validators.Optional()])
|
||||
filter_failure_notification_threshold_attempts = IntegerField('Number of times the filter can be missing before sending a notification',
|
||||
|
||||
@@ -57,7 +57,6 @@ class model(dict):
|
||||
'rss_hide_muted_watches': True,
|
||||
'schema_version' : 0,
|
||||
'shared_diff_access': False,
|
||||
'strip_ignored_lines': False,
|
||||
'tags': {}, #@todo use Tag.model initialisers
|
||||
'timezone': None, # Default IANA timezone name
|
||||
'webdriver_delay': None , # Extra delay in seconds before extracting text
|
||||
|
||||
@@ -58,7 +58,6 @@ class watch_base(dict):
|
||||
'proxy': None, # Preferred proxy connection
|
||||
'remote_server_reply': None, # From 'server' reply header
|
||||
'sort_text_alphabetically': False,
|
||||
'strip_ignored_lines': None,
|
||||
'subtractive_selectors': [],
|
||||
'tag': '', # Old system of text name for a tag, to be removed
|
||||
'tags': [], # list of UUIDs to App.Tags
|
||||
|
||||
@@ -1,125 +0,0 @@
|
||||
"""
|
||||
Content Type Detection and Stream Classification
|
||||
|
||||
This module provides intelligent content-type detection for changedetection.io.
|
||||
It addresses the common problem where HTTP Content-Type headers are missing, incorrect,
|
||||
or too generic, which would otherwise cause the wrong processor to be used.
|
||||
|
||||
The guess_stream_type class combines:
|
||||
1. HTTP Content-Type headers (when available and reliable)
|
||||
2. Python-magic library for MIME detection (analyzing actual file content)
|
||||
3. Content-based pattern matching for text formats (HTML tags, XML declarations, etc.)
|
||||
|
||||
This multi-layered approach ensures accurate detection of RSS feeds, JSON, HTML, PDF,
|
||||
plain text, CSV, YAML, and XML formats - even when servers provide misleading headers.
|
||||
|
||||
Used by: processors/text_json_diff/processor.py and other content processors
|
||||
"""
|
||||
|
||||
# When to apply the 'cdata to real HTML' hack
|
||||
RSS_XML_CONTENT_TYPES = [
|
||||
"application/rss+xml",
|
||||
"application/rdf+xml",
|
||||
"application/atom+xml",
|
||||
"text/rss+xml", # rare, non-standard
|
||||
"application/x-rss+xml", # legacy (older feed software)
|
||||
"application/x-atom+xml", # legacy (older Atom)
|
||||
]
|
||||
|
||||
# JSON Content-types
|
||||
JSON_CONTENT_TYPES = [
|
||||
"application/activity+json",
|
||||
"application/feed+json",
|
||||
"application/json",
|
||||
"application/ld+json",
|
||||
"application/vnd.api+json",
|
||||
]
|
||||
|
||||
|
||||
# Generic XML Content-types (non-RSS/Atom)
|
||||
XML_CONTENT_TYPES = [
|
||||
"text/xml",
|
||||
"application/xml",
|
||||
]
|
||||
|
||||
HTML_PATTERNS = ['<!doctype html', '<html', '<head', '<body', '<script', '<iframe', '<div']
|
||||
|
||||
from loguru import logger
|
||||
|
||||
class guess_stream_type():
|
||||
is_pdf = False
|
||||
is_json = False
|
||||
is_html = False
|
||||
is_plaintext = False
|
||||
is_rss = False
|
||||
is_csv = False
|
||||
is_xml = False # Generic XML, not RSS/Atom
|
||||
is_yaml = False
|
||||
|
||||
def __init__(self, http_content_header, content):
|
||||
import re
|
||||
magic_content_header = http_content_header
|
||||
test_content = content[:200].lower().strip()
|
||||
|
||||
# Remove whitespace between < and tag name for robust detection (handles '< html', '<\nhtml', etc.)
|
||||
test_content_normalized = re.sub(r'<\s+', '<', test_content)
|
||||
|
||||
# Magic will sometimes call text/plain as text/html!
|
||||
magic_result = None
|
||||
try:
|
||||
import magic
|
||||
|
||||
mime = magic.from_buffer(content[:200], mime=True) # Send the original content
|
||||
logger.debug(f"Guessing mime type, original content_type '{http_content_header}', mime type detected '{mime}'")
|
||||
if mime and "/" in mime:
|
||||
magic_result = mime
|
||||
# Ignore generic/fallback mime types from magic
|
||||
if mime in ['application/octet-stream', 'application/x-empty', 'binary']:
|
||||
logger.debug(f"Ignoring generic mime type '{mime}' from magic library")
|
||||
# Trust magic for non-text types immediately
|
||||
elif mime not in ['text/html', 'text/plain']:
|
||||
magic_content_header = mime
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error getting a more precise mime type from 'magic' library ({str(e)}), using content-based detection")
|
||||
|
||||
# Content-based detection (most reliable for text formats)
|
||||
# Check for HTML patterns first - if found, override magic's text/plain
|
||||
has_html_patterns = any(p in test_content_normalized for p in HTML_PATTERNS)
|
||||
|
||||
# Always trust headers first
|
||||
if 'text/plain' in http_content_header:
|
||||
self.is_plaintext = True
|
||||
if any(s in http_content_header for s in RSS_XML_CONTENT_TYPES):
|
||||
self.is_rss = True
|
||||
elif any(s in http_content_header for s in JSON_CONTENT_TYPES):
|
||||
self.is_json = True
|
||||
elif any(s in http_content_header for s in XML_CONTENT_TYPES):
|
||||
# Only mark as generic XML if not already detected as RSS
|
||||
if not self.is_rss:
|
||||
self.is_xml = True
|
||||
elif 'pdf' in magic_content_header:
|
||||
self.is_pdf = True
|
||||
###
|
||||
elif has_html_patterns or http_content_header == 'text/html':
|
||||
self.is_html = True
|
||||
# If magic says text/plain and we found no HTML patterns, trust it
|
||||
elif magic_result == 'text/plain':
|
||||
self.is_plaintext = True
|
||||
logger.debug(f"Trusting magic's text/plain result (no HTML patterns detected)")
|
||||
elif any(s in magic_content_header for s in JSON_CONTENT_TYPES):
|
||||
self.is_json = True
|
||||
# magic will call a rss document 'xml'
|
||||
elif '<rss' in test_content_normalized or '<feed' in test_content_normalized or any(s in magic_content_header for s in RSS_XML_CONTENT_TYPES):
|
||||
self.is_rss = True
|
||||
elif test_content_normalized.startswith('<?xml') or any(s in magic_content_header for s in XML_CONTENT_TYPES):
|
||||
# Generic XML that's not RSS/Atom (RSS/Atom checked above)
|
||||
self.is_xml = True
|
||||
elif '%pdf-1' in test_content:
|
||||
self.is_pdf = True
|
||||
elif http_content_header.startswith('text/'):
|
||||
self.is_plaintext = True
|
||||
# Only trust magic for 'text' if no other patterns matched
|
||||
elif 'text' in magic_content_header:
|
||||
self.is_plaintext = True
|
||||
|
||||
@@ -13,8 +13,6 @@ from changedetectionio import html_tools, content_fetchers
|
||||
from changedetectionio.blueprint.price_data_follower import PRICE_DATA_TRACK_ACCEPT, PRICE_DATA_TRACK_REJECT
|
||||
from loguru import logger
|
||||
|
||||
from changedetectionio.processors.magic import guess_stream_type
|
||||
|
||||
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
|
||||
|
||||
name = 'Webpage Text/HTML, JSON and PDF changes'
|
||||
@@ -22,9 +20,6 @@ description = 'Detects all text changes where possible'
|
||||
|
||||
json_filter_prefixes = ['json:', 'jq:', 'jqraw:']
|
||||
|
||||
# Assume it's this type if the server says nothing on content-type
|
||||
DEFAULT_WHEN_NO_CONTENT_TYPE_HEADER = 'text/html'
|
||||
|
||||
class FilterNotFoundInResponse(ValueError):
|
||||
def __init__(self, msg, screenshot=None, xpath_data=None):
|
||||
self.screenshot = screenshot
|
||||
@@ -50,9 +45,6 @@ class perform_site_check(difference_detection_processor):
|
||||
if not watch:
|
||||
raise Exception("Watch no longer exists.")
|
||||
|
||||
ctype_header = self.fetcher.get_all_headers().get('content-type', DEFAULT_WHEN_NO_CONTENT_TYPE_HEADER).lower()
|
||||
stream_content_type = guess_stream_type(http_content_header=ctype_header, content=self.fetcher.content)
|
||||
|
||||
# Unset any existing notification error
|
||||
update_obj = {'last_notification_error': False, 'last_error': False}
|
||||
|
||||
@@ -62,7 +54,7 @@ class perform_site_check(difference_detection_processor):
|
||||
self.xpath_data = self.fetcher.xpath_data
|
||||
|
||||
# Track the content type
|
||||
update_obj['content_type'] = ctype_header
|
||||
update_obj['content_type'] = self.fetcher.get_all_headers().get('content-type', '').lower()
|
||||
|
||||
# Watches added automatically in the queue manager will skip if its the same checksum as the previous run
|
||||
# Saves a lot of CPU
|
||||
@@ -77,12 +69,24 @@ class perform_site_check(difference_detection_processor):
|
||||
# https://stackoverflow.com/questions/41817578/basic-method-chaining ?
|
||||
# return content().textfilter().jsonextract().checksumcompare() ?
|
||||
|
||||
is_json = 'application/json' in self.fetcher.get_all_headers().get('content-type', '').lower()
|
||||
is_html = not is_json
|
||||
is_rss = False
|
||||
|
||||
ctype_header = self.fetcher.get_all_headers().get('content-type', '').lower()
|
||||
# Go into RSS preprocess for converting CDATA/comment to usable text
|
||||
if stream_content_type.is_rss:
|
||||
self.fetcher.content = cdata_in_document_to_text(html_content=self.fetcher.content)
|
||||
if any(substring in ctype_header for substring in ['application/xml', 'application/rss', 'text/xml']):
|
||||
if '<rss' in self.fetcher.content[:100].lower():
|
||||
self.fetcher.content = cdata_in_document_to_text(html_content=self.fetcher.content)
|
||||
is_rss = True
|
||||
|
||||
if watch.is_pdf or stream_content_type.is_pdf:
|
||||
# source: support, basically treat it as plaintext
|
||||
if watch.is_source_type_url:
|
||||
is_html = False
|
||||
is_json = False
|
||||
|
||||
inline_pdf = self.fetcher.get_all_headers().get('content-disposition', '') and '%PDF-1' in self.fetcher.content[:10]
|
||||
if watch.is_pdf or 'application/pdf' in self.fetcher.get_all_headers().get('content-type', '').lower() or inline_pdf:
|
||||
from shutil import which
|
||||
tool = os.getenv("PDF_TO_HTML_TOOL", "pdftohtml")
|
||||
if not which(tool):
|
||||
@@ -126,12 +130,11 @@ class perform_site_check(difference_detection_processor):
|
||||
has_filter_rule = len(include_filters_rule) and len(include_filters_rule[0].strip())
|
||||
has_subtractive_selectors = len(subtractive_selectors) and len(subtractive_selectors[0].strip())
|
||||
|
||||
if stream_content_type.is_json:
|
||||
if not has_filter_rule:
|
||||
# Force a reformat
|
||||
include_filters_rule.append("json:$")
|
||||
has_filter_rule = True
|
||||
if is_json and not has_filter_rule:
|
||||
include_filters_rule.append("json:$")
|
||||
has_filter_rule = True
|
||||
|
||||
if is_json:
|
||||
# Sort the JSON so we dont get false alerts when the content is just re-ordered
|
||||
try:
|
||||
self.fetcher.content = json.dumps(json.loads(self.fetcher.content), sort_keys=True)
|
||||
@@ -139,25 +142,34 @@ class perform_site_check(difference_detection_processor):
|
||||
# Might have just been a snippet, or otherwise bad JSON, continue
|
||||
pass
|
||||
|
||||
if has_filter_rule:
|
||||
for filter in include_filters_rule:
|
||||
if any(prefix in filter for prefix in json_filter_prefixes):
|
||||
stripped_text_from_html += html_tools.extract_json_as_string(content=self.fetcher.content, json_filter=filter)
|
||||
if stripped_text_from_html:
|
||||
stream_content_type.is_json = True
|
||||
stream_content_type.is_html = False
|
||||
if has_filter_rule:
|
||||
for filter in include_filters_rule:
|
||||
if any(prefix in filter for prefix in json_filter_prefixes):
|
||||
stripped_text_from_html += html_tools.extract_json_as_string(content=self.fetcher.content, json_filter=filter)
|
||||
is_html = False
|
||||
|
||||
# We have 'watch.is_source_type_url' because we should be able to use selectors on the raw HTML but return just that selected HTML
|
||||
if stream_content_type.is_html or watch.is_source_type_url or stream_content_type.is_plaintext or stream_content_type.is_rss or stream_content_type.is_xml or stream_content_type.is_pdf:
|
||||
if is_html or watch.is_source_type_url:
|
||||
|
||||
# CSS Filter, extract the HTML that matches and feed that into the existing inscriptis::get_text
|
||||
self.fetcher.content = html_tools.workarounds_for_obfuscations(self.fetcher.content)
|
||||
html_content = self.fetcher.content
|
||||
content_type = self.fetcher.get_all_headers().get('content-type', '').lower()
|
||||
is_attachment = 'attachment' in self.fetcher.get_all_headers().get('content-disposition', '').lower()
|
||||
|
||||
# Some kind of "text" but definitely not RSS looking
|
||||
if stream_content_type.is_plaintext:
|
||||
# Try to detect better mime types if its a download or not announced as HTML
|
||||
if is_attachment or 'octet-stream' in content_type or not 'html' in content_type:
|
||||
logger.debug(f"Got a reply that may be a download or possibly a text attachment, checking..")
|
||||
try:
|
||||
import magic
|
||||
mime = magic.from_buffer(html_content, mime=True)
|
||||
logger.debug(f"Guessing mime type, original content_type '{content_type}', mime type detected '{mime}'")
|
||||
if mime and "/" in mime: # looks valid and is a valid mime type
|
||||
content_type = mime
|
||||
except Exception as e:
|
||||
logger.error(f"Error getting a more precise mime type from 'magic' library ({str(e)}")
|
||||
|
||||
if 'text/' in content_type and not 'html' in content_type:
|
||||
# Don't run get_text or xpath/css filters on plaintext
|
||||
# We are not HTML, we are not any kind of RSS, doesnt even look like HTML
|
||||
stripped_text_from_html = html_content
|
||||
else:
|
||||
# If not JSON, and if it's not text/plain..
|
||||
@@ -174,13 +186,13 @@ class perform_site_check(difference_detection_processor):
|
||||
html_content += html_tools.xpath_filter(xpath_filter=filter_rule.replace('xpath:', ''),
|
||||
html_content=self.fetcher.content,
|
||||
append_pretty_line_formatting=not watch.is_source_type_url,
|
||||
is_rss=stream_content_type.is_rss)
|
||||
is_rss=is_rss)
|
||||
|
||||
elif filter_rule.startswith('xpath1:'):
|
||||
html_content += html_tools.xpath1_filter(xpath_filter=filter_rule.replace('xpath1:', ''),
|
||||
html_content=self.fetcher.content,
|
||||
append_pretty_line_formatting=not watch.is_source_type_url,
|
||||
is_rss=stream_content_type.is_rss)
|
||||
is_rss=is_rss)
|
||||
else:
|
||||
html_content += html_tools.include_filters(include_filters=filter_rule,
|
||||
html_content=self.fetcher.content,
|
||||
@@ -199,7 +211,7 @@ class perform_site_check(difference_detection_processor):
|
||||
do_anchor = self.datastore.data["settings"]["application"].get("render_anchor_tag_content", False)
|
||||
stripped_text_from_html = html_tools.html_to_text(html_content=html_content,
|
||||
render_anchor_tag_content=do_anchor,
|
||||
is_rss=stream_content_type.is_rss) # 1874 activate the <title workaround hack
|
||||
is_rss=is_rss) # 1874 activate the <title workaround hack
|
||||
|
||||
if watch.get('trim_text_whitespace'):
|
||||
stripped_text_from_html = '\n'.join(line.strip() for line in stripped_text_from_html.replace("\n\n", "\n").splitlines())
|
||||
@@ -238,7 +250,7 @@ class perform_site_check(difference_detection_processor):
|
||||
|
||||
# Treat pages with no renderable text content as a change? No by default
|
||||
empty_pages_are_a_change = self.datastore.data['settings']['application'].get('empty_pages_are_a_change', False)
|
||||
if not stream_content_type.is_json and not empty_pages_are_a_change and len(stripped_text_from_html.strip()) == 0:
|
||||
if not is_json and not empty_pages_are_a_change and len(stripped_text_from_html.strip()) == 0:
|
||||
raise content_fetchers.exceptions.ReplyWithContentButNoText(url=url,
|
||||
status_code=self.fetcher.get_last_status_code(),
|
||||
screenshot=self.fetcher.screenshot,
|
||||
@@ -303,11 +315,6 @@ class perform_site_check(difference_detection_processor):
|
||||
text_for_checksuming = stripped_text_from_html
|
||||
if text_to_ignore:
|
||||
text_for_checksuming = html_tools.strip_ignore_text(stripped_text_from_html, text_to_ignore)
|
||||
# Some people prefer to also completely remove it
|
||||
strip_ignored_lines = watch.get('strip_ignored_lines') if watch.get('strip_ignored_lines') is not None else self.datastore.data['settings']['application'].get('strip_ignored_lines')
|
||||
if strip_ignored_lines:
|
||||
# @todo add test in the 'preview' mode, check the widget works? compare to datastruct
|
||||
stripped_text_from_html = text_for_checksuming
|
||||
|
||||
# Re #133 - if we should strip whitespaces from triggering the change detected comparison
|
||||
if text_for_checksuming and self.datastore.data['settings']['application'].get('ignore_whitespace', False):
|
||||
|
||||
@@ -34,6 +34,7 @@
|
||||
transition: all 0.2s ease;
|
||||
cursor: pointer;
|
||||
display: block;
|
||||
min-width: 60px;
|
||||
text-align: center;
|
||||
}
|
||||
|
||||
|
||||
File diff suppressed because one or more lines are too long
@@ -26,10 +26,7 @@
|
||||
<li>Changing this will affect the comparison checksum which may trigger an alert</li>
|
||||
</ul>
|
||||
</span>
|
||||
<br><br>
|
||||
<div class="pure-control-group">
|
||||
{{ render_ternary_field(form.strip_ignored_lines) }}
|
||||
</div>
|
||||
|
||||
</fieldset>
|
||||
|
||||
<fieldset>
|
||||
|
||||
@@ -167,20 +167,6 @@ def test_check_basic_change_detection_functionality(client, live_server, measure
|
||||
assert b'Deleted' in res.data
|
||||
|
||||
def test_non_text_mime_or_downloads(client, live_server, measure_memory_usage):
|
||||
"""
|
||||
|
||||
https://github.com/dgtlmoon/changedetection.io/issues/3434
|
||||
I noticed that a watched website can be monitored fine as long as the server sends content-type: text/plain; charset=utf-8,
|
||||
but once the server sends content-type: application/octet-stream (which is usually done to force the browser to show the Download dialog),
|
||||
changedetection somehow ignores all line breaks and treats the document file as if everything is on one line.
|
||||
|
||||
WHAT THIS DOES - makes the system rely on 'magic' to determine what is it
|
||||
|
||||
:param client:
|
||||
:param live_server:
|
||||
:param measure_memory_usage:
|
||||
:return:
|
||||
"""
|
||||
with open("test-datastore/endpoint-content.txt", "w") as f:
|
||||
f.write("""some random text that should be split by line
|
||||
and not parsed with html_to_text
|
||||
@@ -229,102 +215,3 @@ got it\r\n
|
||||
|
||||
res = client.get(url_for("ui.form_delete", uuid="all"), follow_redirects=True)
|
||||
|
||||
|
||||
def test_standard_text_plain(client, live_server, measure_memory_usage):
|
||||
"""
|
||||
|
||||
https://github.com/dgtlmoon/changedetection.io/issues/3434
|
||||
I noticed that a watched website can be monitored fine as long as the server sends content-type: text/plain; charset=utf-8,
|
||||
but once the server sends content-type: application/octet-stream (which is usually done to force the browser to show the Download dialog),
|
||||
changedetection somehow ignores all line breaks and treats the document file as if everything is on one line.
|
||||
|
||||
The real bug here can be that it will try to process plain-text as HTML, losing <etc>
|
||||
|
||||
:param client:
|
||||
:param live_server:
|
||||
:param measure_memory_usage:
|
||||
:return:
|
||||
"""
|
||||
with open("test-datastore/endpoint-content.txt", "w") as f:
|
||||
f.write("""some random text that should be split by line
|
||||
and not parsed with html_to_text
|
||||
<title>Even this title should stay because we are just plain text</title>
|
||||
this way we know that it correctly parsed as plain text
|
||||
\r\n
|
||||
ok\r\n
|
||||
got it\r\n
|
||||
""")
|
||||
|
||||
test_url = url_for('test_endpoint', content_type="text/plain", _external=True)
|
||||
|
||||
# Add our URL to the import page
|
||||
res = client.post(
|
||||
url_for("imports.import_page"),
|
||||
data={"urls": test_url},
|
||||
follow_redirects=True
|
||||
)
|
||||
|
||||
assert b"1 Imported" in res.data
|
||||
|
||||
wait_for_all_checks(client)
|
||||
|
||||
### check the front end
|
||||
res = client.get(
|
||||
url_for("ui.ui_views.preview_page", uuid="first"),
|
||||
follow_redirects=True
|
||||
)
|
||||
|
||||
assert b"some random text that should be split by line\n" in res.data
|
||||
####
|
||||
|
||||
# Check the snapshot by API that it has linefeeds too
|
||||
watch_uuid = next(iter(live_server.app.config['DATASTORE'].data['watching']))
|
||||
api_key = live_server.app.config['DATASTORE'].data['settings']['application'].get('api_access_token')
|
||||
res = client.get(
|
||||
url_for("watchhistory", uuid=watch_uuid),
|
||||
headers={'x-api-key': api_key},
|
||||
)
|
||||
|
||||
# Fetch a snapshot by timestamp, check the right one was found
|
||||
res = client.get(
|
||||
url_for("watchsinglehistory", uuid=watch_uuid, timestamp=list(res.json.keys())[-1]),
|
||||
headers={'x-api-key': api_key},
|
||||
)
|
||||
assert b"some random text that should be split by line\n" in res.data
|
||||
assert b"<title>Even this title should stay because we are just plain text</title>" in res.data
|
||||
|
||||
res = client.get(url_for("ui.form_delete", uuid="all"), follow_redirects=True)
|
||||
|
||||
# Server says its plaintext, we should always treat it as plaintext
|
||||
def test_plaintext_even_if_xml_content(client, live_server, measure_memory_usage):
|
||||
|
||||
with open("test-datastore/endpoint-content.txt", "w") as f:
|
||||
f.write("""<?xml version="1.0" encoding="utf-8"?>
|
||||
<resources xmlns:tools="http://schemas.android.com/tools">
|
||||
<!--Activity and fragment titles-->
|
||||
<string name="feed_update_receiver_name">Abonnementen bijwerken</string>
|
||||
</resources>
|
||||
""")
|
||||
|
||||
test_url = url_for('test_endpoint', content_type="text/plain", _external=True)
|
||||
|
||||
# Add our URL to the import page
|
||||
res = client.post(
|
||||
url_for("imports.import_page"),
|
||||
data={"urls": test_url},
|
||||
follow_redirects=True
|
||||
)
|
||||
|
||||
assert b"1 Imported" in res.data
|
||||
|
||||
wait_for_all_checks(client)
|
||||
|
||||
res = client.get(
|
||||
url_for("ui.ui_views.preview_page", uuid="first"),
|
||||
follow_redirects=True
|
||||
)
|
||||
|
||||
assert b'<string name="feed_update_receiver_name"' in res.data
|
||||
|
||||
res = client.get(url_for("ui.form_delete", uuid="all"), follow_redirects=True)
|
||||
|
||||
|
||||
@@ -264,6 +264,8 @@ def test_limit_tag_ui(client, live_server, measure_memory_usage):
|
||||
client.get(url_for('ui.mark_all_viewed', tag=tag_uuid), follow_redirects=True)
|
||||
wait_for_all_checks(client)
|
||||
|
||||
with open('/tmp/fuck.html', 'wb') as f:
|
||||
f.write(res.data)
|
||||
# Should be only 1 unviewed
|
||||
res = client.get(url_for("watchlist.index"))
|
||||
assert res.data.count(b' unviewed ') == 1
|
||||
|
||||
@@ -3,8 +3,9 @@
|
||||
import time
|
||||
import os
|
||||
import json
|
||||
import logging
|
||||
from flask import url_for
|
||||
from .util import wait_for_all_checks
|
||||
from .util import live_server_setup, wait_for_all_checks
|
||||
from urllib.parse import urlparse, parse_qs
|
||||
|
||||
def test_consistent_history(client, live_server, measure_memory_usage):
|
||||
|
||||
@@ -58,39 +58,3 @@ def test_ignore(client, live_server, measure_memory_usage):
|
||||
# Should be in base.html
|
||||
assert b'csrftoken' in res.data
|
||||
|
||||
|
||||
def test_strip_ignore_lines(client, live_server, measure_memory_usage):
|
||||
# live_server_setup(live_server) # Setup on conftest per function
|
||||
set_original_ignore_response()
|
||||
|
||||
|
||||
# Goto the settings page, add our ignore text
|
||||
res = client.post(
|
||||
url_for("settings.settings_page"),
|
||||
data={
|
||||
"requests-time_between_check-minutes": 180,
|
||||
"application-ignore_whitespace": "y",
|
||||
"application-strip_ignored_lines": "y",
|
||||
"application-global_ignore_text": "Which is across multiple",
|
||||
'application-fetch_backend': "html_requests"
|
||||
},
|
||||
follow_redirects=True
|
||||
)
|
||||
assert b"Settings updated." in res.data
|
||||
|
||||
test_url = url_for('test_endpoint', _external=True)
|
||||
res = client.post(
|
||||
url_for("imports.import_page"),
|
||||
data={"urls": test_url},
|
||||
follow_redirects=True
|
||||
)
|
||||
assert b"1 Imported" in res.data
|
||||
|
||||
# Give the thread time to pick it up
|
||||
wait_for_all_checks(client)
|
||||
uuid = next(iter(live_server.app.config['DATASTORE'].data['watching']))
|
||||
|
||||
# It should not be in the preview anymore
|
||||
res = client.get(url_for("ui.ui_views.preview_page", uuid=uuid))
|
||||
assert b'<div class="ignored">' not in res.data
|
||||
assert b'Which is across multiple' not in res.data
|
||||
|
||||
@@ -111,7 +111,7 @@ def test_basic_cdata_rss_markup(client, live_server, measure_memory_usage):
|
||||
|
||||
set_original_cdata_xml()
|
||||
|
||||
test_url = url_for('test_endpoint', content_type="application/atom+xml; charset=UTF-8", _external=True)
|
||||
test_url = url_for('test_endpoint', content_type="application/xml", _external=True)
|
||||
|
||||
# Add our URL to the import page
|
||||
res = client.post(
|
||||
@@ -139,7 +139,7 @@ def test_rss_xpath_filtering(client, live_server, measure_memory_usage):
|
||||
|
||||
set_original_cdata_xml()
|
||||
|
||||
test_url = url_for('test_endpoint', content_type="application/atom+xml; charset=UTF-8", _external=True)
|
||||
test_url = url_for('test_endpoint', content_type="application/xml", _external=True)
|
||||
|
||||
res = client.post(
|
||||
url_for("ui.ui_views.form_quick_watch_add"),
|
||||
|
||||
@@ -1,42 +1,12 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
|
||||
import time
|
||||
from flask import url_for
|
||||
from .util import wait_for_all_checks
|
||||
from ..processors.magic import RSS_XML_CONTENT_TYPES
|
||||
from .util import live_server_setup, wait_for_all_checks
|
||||
|
||||
from ..html_tools import *
|
||||
|
||||
|
||||
def set_rss_atom_feed_response(header=''):
|
||||
test_return_data = f"""{header}<!-- Generated on Wed, 08 Oct 2025 08:42:33 -0700, really really honestly -->
|
||||
<rss xmlns:atom="http://www.w3.org/2005/Atom" version="2.0">
|
||||
<channel>
|
||||
<atom:link href="https://store.waterpowered.com/news/collection//" rel="self" type="application/rss+xml"/>
|
||||
<title>RSS Feed</title>
|
||||
<link>
|
||||
<![CDATA[ https://store.waterpowered.com/news/collection// ]]>
|
||||
</link>
|
||||
<description>
|
||||
<![CDATA[ Events and Announcements for ]]>
|
||||
</description>
|
||||
<language>en-us</language>
|
||||
<generator>water News RSS</generator>
|
||||
<item>
|
||||
<title> 🍁 Lets go discount</title>
|
||||
<description><p class="bb_paragraph">ok heres the description</p></description>
|
||||
<link>
|
||||
<![CDATA[ https://store.waterpowered.com/news/app/1643320/view/511845698831908921 ]]>
|
||||
</link>
|
||||
<pubDate>Wed, 08 Oct 2025 15:28:55 +0000</pubDate>
|
||||
<guid isPermaLink="true">https://store.waterpowered.com/news/app/1643320/view/511845698831908921</guid>
|
||||
<enclosure url="https://clan.fastly.waterstatic.com/images/40721482/42822e5f00b2becf520ace9500981bb56f3a89f2.jpg" length="0" type="image/jpeg"/>
|
||||
</item>
|
||||
</channel>
|
||||
</rss>"""
|
||||
|
||||
with open("test-datastore/endpoint-content.txt", "w") as f:
|
||||
f.write(test_return_data)
|
||||
|
||||
return None
|
||||
|
||||
|
||||
|
||||
@@ -605,47 +575,3 @@ def test_xpath_20_function_string_join_matches(client, live_server, measure_memo
|
||||
|
||||
client.get(url_for("ui.form_delete", uuid="all"), follow_redirects=True)
|
||||
|
||||
|
||||
def _subtest_xpath_rss(client, content_type='text/html'):
|
||||
|
||||
# Add our URL to the import page
|
||||
test_url = url_for('test_endpoint', content_type=content_type, _external=True)
|
||||
res = client.post(
|
||||
url_for("ui.ui_views.form_quick_watch_add"),
|
||||
data={"url": test_url, "tags": '', 'edit_and_watch_submit_button': 'Edit > Watch'},
|
||||
follow_redirects=True
|
||||
)
|
||||
|
||||
assert b"Watch added in Paused state, saving will unpause" in res.data
|
||||
|
||||
res = client.post(
|
||||
url_for("ui.ui_edit.edit_page", uuid="first", unpause_on_save=1),
|
||||
data={
|
||||
"url": test_url,
|
||||
"include_filters": "xpath://item",
|
||||
"tags": '',
|
||||
"fetch_backend": "html_requests",
|
||||
"time_between_check_use_default": "y",
|
||||
},
|
||||
follow_redirects=True
|
||||
)
|
||||
|
||||
assert b"unpaused" in res.data
|
||||
wait_for_all_checks(client)
|
||||
|
||||
res = client.get(
|
||||
url_for("ui.ui_views.preview_page", uuid="first"),
|
||||
follow_redirects=True
|
||||
)
|
||||
|
||||
assert b"Lets go discount" in res.data, f"When testing for Lets go discount called with content type '{content_type}'"
|
||||
assert b"Events and Announcements" not in res.data, f"When testing for Lets go discount called with content type '{content_type}'" # It should not be here because thats not our selector target
|
||||
|
||||
client.get(url_for("ui.form_delete", uuid="all"), follow_redirects=True)
|
||||
|
||||
# Be sure all-in-the-wild types of RSS feeds work with xpath
|
||||
def test_rss_xpath(client, live_server):
|
||||
for feed_header in ['', '<?xml version="1.0" encoding="utf-8"?>']:
|
||||
set_rss_atom_feed_response(header=feed_header)
|
||||
for content_type in RSS_XML_CONTENT_TYPES:
|
||||
_subtest_xpath_rss(client, content_type=content_type)
|
||||
|
||||
@@ -12,7 +12,7 @@ flask_wtf~=1.2
|
||||
flask~=2.3
|
||||
flask-socketio~=5.5.1
|
||||
python-socketio~=5.13.0
|
||||
python-engineio~=4.12.3
|
||||
python-engineio~=4.12.0
|
||||
inscriptis~=2.2
|
||||
pytz
|
||||
timeago~=1.0
|
||||
@@ -39,7 +39,7 @@ jsonpath-ng~=1.5.3
|
||||
# jq not available on Windows so must be installed manually
|
||||
|
||||
# Notification library
|
||||
apprise==1.9.5
|
||||
apprise==1.9.4
|
||||
|
||||
# - Needed for apprise/spush, and maybe others? hopefully doesnt trigger a rust compile.
|
||||
# - Requires extra wheel for rPi, adds build time for arm/v8 which is not in piwheels
|
||||
@@ -135,7 +135,7 @@ tzdata
|
||||
pluggy ~= 1.5
|
||||
|
||||
# Needed for testing, cross-platform for process and system monitoring
|
||||
psutil==7.1.0
|
||||
psutil==7.0.0
|
||||
|
||||
ruff >= 0.11.2
|
||||
pre_commit >= 4.2.0
|
||||
|
||||
Reference in New Issue
Block a user