mirror of https://github.com/dgtlmoon/changedetection.io.git
synced 2025-11-22 01:16:12 +00:00

Compare commits: openapi-me...3458-fixin (2 commits: a0a0ec9942, 86befef0cb)
.github/dependabot.yml (vendored, 4 lines changed)
```diff
@@ -11,4 +11,6 @@ updates:
   - package-ecosystem: pip
     directory: /
     schedule:
-      interval: "weekly"
+      interval: "daily"
+    allow:
+      - dependency-name: "apprise"
```
.github/workflows/codeql-analysis.yml (vendored, 6 lines changed)
```diff
@@ -34,7 +34,7 @@ jobs:

    # Initializes the CodeQL tools for scanning.
    - name: Initialize CodeQL
-     uses: github/codeql-action/init@v4
+     uses: github/codeql-action/init@v3
      with:
        languages: ${{ matrix.language }}
        # If you wish to specify custom queries, you can do so here or in a config file.
@@ -45,7 +45,7 @@ jobs:
    # Autobuild attempts to build any compiled languages (C/C++, C#, or Java).
    # If this step fails, then you should remove it and run the build manually (see below)
    - name: Autobuild
-     uses: github/codeql-action/autobuild@v4
+     uses: github/codeql-action/autobuild@v3

    # ℹ️ Command-line programs to run using the OS shell.
    # 📚 https://git.io/JvXDl
@@ -59,4 +59,4 @@ jobs:
    #   make release

    - name: Perform CodeQL Analysis
-     uses: github/codeql-action/analyze@v4
+     uses: github/codeql-action/analyze@v3
```
changedetectionio/__init__.py

```diff
@@ -2,7 +2,7 @@

 # Read more https://github.com/dgtlmoon/changedetection.io/wiki

-__version__ = '0.50.18'
+__version__ = '0.50.15'

 from changedetectionio.strtobool import strtobool
 from json.decoder import JSONDecodeError
```
changedetectionio/api/__init__.py

```diff
@@ -1,7 +1,10 @@
 import copy
+import yaml
 import functools
 from flask import request, abort
 from loguru import logger
+from openapi_core import OpenAPI
+from openapi_core.contrib.flask import FlaskOpenAPIRequest
 from . import api_schema
 from ..model import watch_base

@@ -31,11 +34,7 @@ schema_delete_notification_urls['required'] = ['notification_urls']

 @functools.cache
 def get_openapi_spec():
-    """Lazy load OpenAPI spec and dependencies only when validation is needed."""
     import os
-    import yaml  # Lazy import - only loaded when API validation is actually used
-    from openapi_core import OpenAPI  # Lazy import - saves ~10.7 MB on startup
-
     spec_path = os.path.join(os.path.dirname(__file__), '../../docs/api-spec.yaml')
     with open(spec_path, 'r') as f:
         spec_dict = yaml.safe_load(f)
@@ -50,9 +49,6 @@ def validate_openapi_request(operation_id):
         try:
             # Skip OpenAPI validation for GET requests since they don't have request bodies
             if request.method.upper() != 'GET':
-                # Lazy import - only loaded when actually validating a request
-                from openapi_core.contrib.flask import FlaskOpenAPIRequest
-
                 spec = get_openapi_spec()
                 openapi_request = FlaskOpenAPIRequest(request)
                 result = spec.unmarshal_request(openapi_request)
```
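The `-` lines above remove a deferred-loading pattern: the heavy imports lived inside a `functools.cache`d function so their cost was only paid when API validation first ran. A minimal standalone sketch of that pattern, with an illustrative file path and function name that are not the project's:

```python
import functools


@functools.cache
def get_spec():
    # Deferred import: the module is only loaded on the first call,
    # so programs that never validate requests never pay the cost.
    import yaml  # assumption: PyYAML is installed

    with open('docs/api-spec.yaml') as f:  # illustrative path
        return yaml.safe_load(f)


# First call imports yaml and parses the file; later calls return the
# memoized result without re-importing or re-reading anything.
spec = get_spec()
```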
changedetectionio/processors/magic.py (file deleted)

```diff
@@ -1,125 +0,0 @@
-"""
-Content Type Detection and Stream Classification
-
-This module provides intelligent content-type detection for changedetection.io.
-It addresses the common problem where HTTP Content-Type headers are missing, incorrect,
-or too generic, which would otherwise cause the wrong processor to be used.
-
-The guess_stream_type class combines:
-1. HTTP Content-Type headers (when available and reliable)
-2. Python-magic library for MIME detection (analyzing actual file content)
-3. Content-based pattern matching for text formats (HTML tags, XML declarations, etc.)
-
-This multi-layered approach ensures accurate detection of RSS feeds, JSON, HTML, PDF,
-plain text, CSV, YAML, and XML formats - even when servers provide misleading headers.
-
-Used by: processors/text_json_diff/processor.py and other content processors
-"""
-
-# When to apply the 'cdata to real HTML' hack
-RSS_XML_CONTENT_TYPES = [
-    "application/rss+xml",
-    "application/rdf+xml",
-    "application/atom+xml",
-    "text/rss+xml",  # rare, non-standard
-    "application/x-rss+xml",  # legacy (older feed software)
-    "application/x-atom+xml",  # legacy (older Atom)
-]
-
-# JSON Content-types
-JSON_CONTENT_TYPES = [
-    "application/activity+json",
-    "application/feed+json",
-    "application/json",
-    "application/ld+json",
-    "application/vnd.api+json",
-]
-
-
-# Generic XML Content-types (non-RSS/Atom)
-XML_CONTENT_TYPES = [
-    "text/xml",
-    "application/xml",
-]
-
-HTML_PATTERNS = ['<!doctype html', '<html', '<head', '<body', '<script', '<iframe', '<div']
-
-from loguru import logger
-
-class guess_stream_type():
-    is_pdf = False
-    is_json = False
-    is_html = False
-    is_plaintext = False
-    is_rss = False
-    is_csv = False
-    is_xml = False  # Generic XML, not RSS/Atom
-    is_yaml = False
-
-    def __init__(self, http_content_header, content):
-        import re
-        magic_content_header = http_content_header
-        test_content = content[:200].lower().strip()
-
-        # Remove whitespace between < and tag name for robust detection (handles '< html', '<\nhtml', etc.)
-        test_content_normalized = re.sub(r'<\s+', '<', test_content)
-
-        # Magic will sometimes call text/plain as text/html!
-        magic_result = None
-        try:
-            import magic
-
-            mime = magic.from_buffer(content[:200], mime=True)  # Send the original content
-            logger.debug(f"Guessing mime type, original content_type '{http_content_header}', mime type detected '{mime}'")
-            if mime and "/" in mime:
-                magic_result = mime
-                # Ignore generic/fallback mime types from magic
-                if mime in ['application/octet-stream', 'application/x-empty', 'binary']:
-                    logger.debug(f"Ignoring generic mime type '{mime}' from magic library")
-                # Trust magic for non-text types immediately
-                elif mime not in ['text/html', 'text/plain']:
-                    magic_content_header = mime
-
-        except Exception as e:
-            logger.error(f"Error getting a more precise mime type from 'magic' library ({str(e)}), using content-based detection")
-
-        # Content-based detection (most reliable for text formats)
-        # Check for HTML patterns first - if found, override magic's text/plain
-        has_html_patterns = any(p in test_content_normalized for p in HTML_PATTERNS)
-
-        # Always trust headers first
-        if 'text/plain' in http_content_header:
-            self.is_plaintext = True
-        if any(s in http_content_header for s in RSS_XML_CONTENT_TYPES):
-            self.is_rss = True
-        elif any(s in http_content_header for s in JSON_CONTENT_TYPES):
-            self.is_json = True
-        elif any(s in http_content_header for s in XML_CONTENT_TYPES):
-            # Only mark as generic XML if not already detected as RSS
-            if not self.is_rss:
-                self.is_xml = True
-        elif 'pdf' in magic_content_header:
-            self.is_pdf = True
-        ###
-        elif has_html_patterns or http_content_header == 'text/html':
-            self.is_html = True
-        # If magic says text/plain and we found no HTML patterns, trust it
-        elif magic_result == 'text/plain':
-            self.is_plaintext = True
-            logger.debug(f"Trusting magic's text/plain result (no HTML patterns detected)")
-        elif any(s in magic_content_header for s in JSON_CONTENT_TYPES):
-            self.is_json = True
-        # magic will call a rss document 'xml'
-        elif '<rss' in test_content_normalized or '<feed' in test_content_normalized or any(s in magic_content_header for s in RSS_XML_CONTENT_TYPES):
-            self.is_rss = True
-        elif test_content_normalized.startswith('<?xml') or any(s in magic_content_header for s in XML_CONTENT_TYPES):
-            # Generic XML that's not RSS/Atom (RSS/Atom checked above)
-            self.is_xml = True
-        elif '%pdf-1' in test_content:
-            self.is_pdf = True
-        elif http_content_header.startswith('text/'):
-            self.is_plaintext = True
-        # Only trust magic for 'text' if no other patterns matched
-        elif 'text' in magic_content_header:
-            self.is_plaintext = True
```
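For orientation, a sketch of how a caller could drive the deleted class; the constructor signature is taken from the diff above, but the sample header and body values are invented:

```python
from changedetectionio.processors.magic import guess_stream_type

# An RSS content-type header wins immediately (headers are checked first)
rss = guess_stream_type(http_content_header='application/rss+xml',
                        content='<rss version="2.0"><channel></channel></rss>')
assert rss.is_rss  # 'application/rss+xml' is in RSS_XML_CONTENT_TYPES

# With no usable header, the HTML tag patterns in the body decide
page = guess_stream_type(http_content_header='',
                         content='<!DOCTYPE html><html><body>hi</body></html>')
assert page.is_html
```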
changedetectionio/processors/text_json_diff/processor.py

```diff
@@ -13,8 +13,6 @@ from changedetectionio import html_tools, content_fetchers
 from changedetectionio.blueprint.price_data_follower import PRICE_DATA_TRACK_ACCEPT, PRICE_DATA_TRACK_REJECT
 from loguru import logger

-from changedetectionio.processors.magic import guess_stream_type
-
 urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

 name = 'Webpage Text/HTML, JSON and PDF changes'
@@ -22,9 +20,6 @@ description = 'Detects all text changes where possible'

 json_filter_prefixes = ['json:', 'jq:', 'jqraw:']

-# Assume it's this type if the server says nothing on content-type
-DEFAULT_WHEN_NO_CONTENT_TYPE_HEADER = 'text/html'
-
 class FilterNotFoundInResponse(ValueError):
     def __init__(self, msg, screenshot=None, xpath_data=None):
         self.screenshot = screenshot
@@ -50,9 +45,6 @@ class perform_site_check(difference_detection_processor):
         if not watch:
             raise Exception("Watch no longer exists.")

-        ctype_header = self.fetcher.get_all_headers().get('content-type', DEFAULT_WHEN_NO_CONTENT_TYPE_HEADER).lower()
-        stream_content_type = guess_stream_type(http_content_header=ctype_header, content=self.fetcher.content)
-
         # Unset any existing notification error
         update_obj = {'last_notification_error': False, 'last_error': False}

@@ -62,7 +54,7 @@ class perform_site_check(difference_detection_processor):
         self.xpath_data = self.fetcher.xpath_data

         # Track the content type
-        update_obj['content_type'] = ctype_header
+        update_obj['content_type'] = self.fetcher.get_all_headers().get('content-type', '').lower()

         # Watches added automatically in the queue manager will skip if its the same checksum as the previous run
         # Saves a lot of CPU
@@ -77,12 +69,24 @@ class perform_site_check(difference_detection_processor):
         # https://stackoverflow.com/questions/41817578/basic-method-chaining ?
         # return content().textfilter().jsonextract().checksumcompare() ?

+        is_json = 'application/json' in self.fetcher.get_all_headers().get('content-type', '').lower()
+        is_html = not is_json
+        is_rss = False

+        ctype_header = self.fetcher.get_all_headers().get('content-type', '').lower()
         # Go into RSS preprocess for converting CDATA/comment to usable text
-        if stream_content_type.is_rss:
-            self.fetcher.content = cdata_in_document_to_text(html_content=self.fetcher.content)
+        if any(substring in ctype_header for substring in ['application/xml', 'application/rss', 'text/xml']):
+            if '<rss' in self.fetcher.content[:100].lower():
+                self.fetcher.content = cdata_in_document_to_text(html_content=self.fetcher.content)
+                is_rss = True

-        if watch.is_pdf or stream_content_type.is_pdf:
+        # source: support, basically treat it as plaintext
+        if watch.is_source_type_url:
+            is_html = False
+            is_json = False
+
+        inline_pdf = self.fetcher.get_all_headers().get('content-disposition', '') and '%PDF-1' in self.fetcher.content[:10]
+        if watch.is_pdf or 'application/pdf' in self.fetcher.get_all_headers().get('content-type', '').lower() or inline_pdf:
             from shutil import which
             tool = os.getenv("PDF_TO_HTML_TOOL", "pdftohtml")
             if not which(tool):
```
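The RSS branch above unwraps CDATA so the wrapped text becomes visible to later text filters. The general idea behind such a preprocessing step (this is not the project's `cdata_in_document_to_text` implementation, just an illustration of the transform):

```python
import re

# Unwrap CDATA sections so their payload reads as ordinary element text
feed = '<link><![CDATA[ https://example.com/item/1 ]]></link>'
unwrapped = re.sub(r'<!\[CDATA\[(.*?)\]\]>',
                   lambda m: m.group(1).strip(),
                   feed, flags=re.DOTALL)
assert unwrapped == '<link>https://example.com/item/1</link>'
```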
```diff
@@ -126,12 +130,11 @@ class perform_site_check(difference_detection_processor):
         has_filter_rule = len(include_filters_rule) and len(include_filters_rule[0].strip())
         has_subtractive_selectors = len(subtractive_selectors) and len(subtractive_selectors[0].strip())

-        if stream_content_type.is_json:
-            if not has_filter_rule:
-                # Force a reformat
-                include_filters_rule.append("json:$")
-                has_filter_rule = True
+        if is_json and not has_filter_rule:
+            include_filters_rule.append("json:$")
+            has_filter_rule = True

+        if is_json:
             # Sort the JSON so we dont get false alerts when the content is just re-ordered
             try:
                 self.fetcher.content = json.dumps(json.loads(self.fetcher.content), sort_keys=True)
```
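The `sort_keys=True` round-trip above is what stops re-ordered JSON from registering as a change; a standalone illustration with invented payloads:

```python
import json

a = '{"price": 9.99, "stock": 3}'
b = '{"stock": 3, "price": 9.99}'   # same data, different key order

assert a != b  # a naive checksum of the raw bodies would flag a change

def normalise(s):
    # Parse and re-serialise with sorted keys to get a canonical form
    return json.dumps(json.loads(s), sort_keys=True)

assert normalise(a) == normalise(b)  # identical after canonicalisation
```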
```diff
@@ -143,21 +146,30 @@ class perform_site_check(difference_detection_processor):
             for filter in include_filters_rule:
                 if any(prefix in filter for prefix in json_filter_prefixes):
                     stripped_text_from_html += html_tools.extract_json_as_string(content=self.fetcher.content, json_filter=filter)
-            if stripped_text_from_html:
-                stream_content_type.is_json = True
-                stream_content_type.is_html = False
+                    is_html = False

-        # We have 'watch.is_source_type_url' because we should be able to use selectors on the raw HTML but return just that selected HTML
-        if stream_content_type.is_html or watch.is_source_type_url or stream_content_type.is_plaintext or stream_content_type.is_rss or stream_content_type.is_xml or stream_content_type.is_pdf:
+        if is_html or watch.is_source_type_url:

             # CSS Filter, extract the HTML that matches and feed that into the existing inscriptis::get_text
             self.fetcher.content = html_tools.workarounds_for_obfuscations(self.fetcher.content)
             html_content = self.fetcher.content
+            content_type = self.fetcher.get_all_headers().get('content-type', '').lower()
+            is_attachment = 'attachment' in self.fetcher.get_all_headers().get('content-disposition', '').lower() or 'octet-stream' in content_type

-            # Some kind of "text" but definitely not RSS looking
-            if stream_content_type.is_plaintext:
+            # Try to detect better mime types if its a download or not announced as HTML
+            if is_attachment:
+                logger.debug(f"Got a reply that may be a download or possibly a text attachment, checking..")
+                try:
+                    import magic
+                    mime = magic.from_buffer(html_content, mime=True)
+                    logger.debug(f"Guessing mime type, original content_type '{content_type}', mime type detected '{mime}'")
+                    if mime and "/" in mime:  # looks valid and is a valid mime type
+                        content_type = mime
+                except Exception as e:
+                    logger.error(f"Error getting a more precise mime type from 'magic' library ({str(e)}")
+
+            if 'text/' in content_type and not 'html' in content_type:
                 # Don't run get_text or xpath/css filters on plaintext
-                # We are not HTML, we are not any kind of RSS, doesnt even look like HTML
                 stripped_text_from_html = html_content
             else:
                 # If not JSON, and if it's not text/plain..
```
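The `+` branch above sniffs the real type of forced downloads; a standalone sketch of the same idea using python-magic, with invented header and body values:

```python
import magic  # python-magic, the same library the + lines rely on

headers = {
    'content-type': 'application/octet-stream',  # typical "force download" header
    'content-disposition': 'attachment; filename="notes.txt"',
}
body = b"line one\nline two\n"

content_type = headers['content-type']
if 'octet-stream' in content_type or 'attachment' in headers.get('content-disposition', ''):
    mime = magic.from_buffer(body, mime=True)  # inspects the bytes, e.g. 'text/plain'
    if mime and '/' in mime:
        content_type = mime  # prefer the sniffed type over the misleading header
```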
```diff
@@ -174,13 +186,13 @@ class perform_site_check(difference_detection_processor):
                         html_content += html_tools.xpath_filter(xpath_filter=filter_rule.replace('xpath:', ''),
                                                                 html_content=self.fetcher.content,
                                                                 append_pretty_line_formatting=not watch.is_source_type_url,
-                                                                is_rss=stream_content_type.is_rss)
+                                                                is_rss=is_rss)

                     elif filter_rule.startswith('xpath1:'):
                         html_content += html_tools.xpath1_filter(xpath_filter=filter_rule.replace('xpath1:', ''),
                                                                  html_content=self.fetcher.content,
                                                                  append_pretty_line_formatting=not watch.is_source_type_url,
-                                                                 is_rss=stream_content_type.is_rss)
+                                                                 is_rss=is_rss)
                     else:
                         html_content += html_tools.include_filters(include_filters=filter_rule,
                                                                    html_content=self.fetcher.content,
@@ -199,7 +211,7 @@ class perform_site_check(difference_detection_processor):
             do_anchor = self.datastore.data["settings"]["application"].get("render_anchor_tag_content", False)
             stripped_text_from_html = html_tools.html_to_text(html_content=html_content,
                                                               render_anchor_tag_content=do_anchor,
-                                                              is_rss=stream_content_type.is_rss)  # 1874 activate the <title workaround hack
+                                                              is_rss=is_rss)  # 1874 activate the <title workaround hack

             if watch.get('trim_text_whitespace'):
                 stripped_text_from_html = '\n'.join(line.strip() for line in stripped_text_from_html.replace("\n\n", "\n").splitlines())
@@ -238,7 +250,7 @@ class perform_site_check(difference_detection_processor):

         # Treat pages with no renderable text content as a change? No by default
         empty_pages_are_a_change = self.datastore.data['settings']['application'].get('empty_pages_are_a_change', False)
-        if not stream_content_type.is_json and not empty_pages_are_a_change and len(stripped_text_from_html.strip()) == 0:
+        if not is_json and not empty_pages_are_a_change and len(stripped_text_from_html.strip()) == 0:
             raise content_fetchers.exceptions.ReplyWithContentButNoText(url=url,
                                                                         status_code=self.fetcher.get_last_status_code(),
                                                                         screenshot=self.fetcher.screenshot,
```
Tests: non-text mime types and forced downloads

```diff
@@ -174,8 +174,6 @@ def test_non_text_mime_or_downloads(client, live_server, measure_memory_usage):
    but once the server sends content-type: application/octet-stream (which is usually done to force the browser to show the Download dialog),
    changedetection somehow ignores all line breaks and treats the document file as if everything is on one line.

-    WHAT THIS DOES - makes the system rely on 'magic' to determine what is it
-
    :param client:
    :param live_server:
    :param measure_memory_usage:
@@ -273,7 +271,6 @@ got it\r\n
        url_for("ui.ui_views.preview_page", uuid="first"),
        follow_redirects=True
    )
-
    assert b"some random text that should be split by line\n" in res.data
    ####

@@ -295,36 +292,3 @@ got it\r\n

    res = client.get(url_for("ui.form_delete", uuid="all"), follow_redirects=True)

-# Server says its plaintext, we should always treat it as plaintext
-def test_plaintext_even_if_xml_content(client, live_server, measure_memory_usage):
-
-    with open("test-datastore/endpoint-content.txt", "w") as f:
-        f.write("""<?xml version="1.0" encoding="utf-8"?>
-<resources xmlns:tools="http://schemas.android.com/tools">
-    <!--Activity and fragment titles-->
-    <string name="feed_update_receiver_name">Abonnementen bijwerken</string>
-</resources>
-""")
-
-    test_url = url_for('test_endpoint', content_type="text/plain", _external=True)
-
-    # Add our URL to the import page
-    res = client.post(
-        url_for("imports.import_page"),
-        data={"urls": test_url},
-        follow_redirects=True
-    )
-
-    assert b"1 Imported" in res.data
-
-    wait_for_all_checks(client)
-
-    res = client.get(
-        url_for("ui.ui_views.preview_page", uuid="first"),
-        follow_redirects=True
-    )
-
-    assert b'<string name="feed_update_receiver_name"' in res.data
-
-    res = client.get(url_for("ui.form_delete", uuid="all"), follow_redirects=True)
```
Tests: tag UI limits

```diff
@@ -264,6 +264,8 @@ def test_limit_tag_ui(client, live_server, measure_memory_usage):
    client.get(url_for('ui.mark_all_viewed', tag=tag_uuid), follow_redirects=True)
    wait_for_all_checks(client)

+    with open('/tmp/fuck.html', 'wb') as f:
+        f.write(res.data)
    # Should be only 1 unviewed
    res = client.get(url_for("watchlist.index"))
    assert res.data.count(b' unviewed ') == 1
```
Tests: history consistency

```diff
@@ -3,8 +3,9 @@
 import time
 import os
 import json
+import logging
 from flask import url_for
-from .util import wait_for_all_checks
+from .util import live_server_setup, wait_for_all_checks
 from urllib.parse import urlparse, parse_qs

 def test_consistent_history(client, live_server, measure_memory_usage):
```
Tests: RSS/CDATA handling

```diff
@@ -111,7 +111,7 @@ def test_basic_cdata_rss_markup(client, live_server, measure_memory_usage):

    set_original_cdata_xml()

-    test_url = url_for('test_endpoint', content_type="application/atom+xml; charset=UTF-8", _external=True)
+    test_url = url_for('test_endpoint', content_type="application/xml", _external=True)

    # Add our URL to the import page
    res = client.post(
@@ -139,7 +139,7 @@ def test_rss_xpath_filtering(client, live_server, measure_memory_usage):

    set_original_cdata_xml()

-    test_url = url_for('test_endpoint', content_type="application/atom+xml; charset=UTF-8", _external=True)
+    test_url = url_for('test_endpoint', content_type="application/xml", _external=True)

    res = client.post(
        url_for("ui.ui_views.form_quick_watch_add"),
```
Tests: xpath selectors and RSS feeds

```diff
@@ -1,42 +1,12 @@
 # -*- coding: utf-8 -*-
+import time
 from flask import url_for
-from .util import wait_for_all_checks
-from ..processors.magic import RSS_XML_CONTENT_TYPES
+from .util import live_server_setup, wait_for_all_checks
+from ..html_tools import *


-def set_rss_atom_feed_response(header=''):
-    test_return_data = f"""{header}<!-- Generated on Wed, 08 Oct 2025 08:42:33 -0700, really really honestly -->
-<rss xmlns:atom="http://www.w3.org/2005/Atom" version="2.0">
-<channel>
-<atom:link href="https://store.waterpowered.com/news/collection//" rel="self" type="application/rss+xml"/>
-<title>RSS Feed</title>
-<link>
-<![CDATA[ https://store.waterpowered.com/news/collection// ]]>
-</link>
-<description>
-<![CDATA[ Events and Announcements for ]]>
-</description>
-<language>en-us</language>
-<generator>water News RSS</generator>
-<item>
-<title> 🍁 Lets go discount</title>
-<description><p class="bb_paragraph">ok heres the description</p></description>
-<link>
-<![CDATA[ https://store.waterpowered.com/news/app/1643320/view/511845698831908921 ]]>
-</link>
-<pubDate>Wed, 08 Oct 2025 15:28:55 +0000</pubDate>
-<guid isPermaLink="true">https://store.waterpowered.com/news/app/1643320/view/511845698831908921</guid>
-<enclosure url="https://clan.fastly.waterstatic.com/images/40721482/42822e5f00b2becf520ace9500981bb56f3a89f2.jpg" length="0" type="image/jpeg"/>
-</item>
-</channel>
-</rss>"""
-
-    with open("test-datastore/endpoint-content.txt", "w") as f:
-        f.write(test_return_data)
-
-    return None
-
-
```
```diff
@@ -605,47 +575,3 @@ def test_xpath_20_function_string_join_matches(client, live_server, measure_memo

    client.get(url_for("ui.form_delete", uuid="all"), follow_redirects=True)

-
-def _subtest_xpath_rss(client, content_type='text/html'):
-
-    # Add our URL to the import page
-    test_url = url_for('test_endpoint', content_type=content_type, _external=True)
-    res = client.post(
-        url_for("ui.ui_views.form_quick_watch_add"),
-        data={"url": test_url, "tags": '', 'edit_and_watch_submit_button': 'Edit > Watch'},
-        follow_redirects=True
-    )
-
-    assert b"Watch added in Paused state, saving will unpause" in res.data
-
-    res = client.post(
-        url_for("ui.ui_edit.edit_page", uuid="first", unpause_on_save=1),
-        data={
-            "url": test_url,
-            "include_filters": "xpath://item",
-            "tags": '',
-            "fetch_backend": "html_requests",
-            "time_between_check_use_default": "y",
-        },
-        follow_redirects=True
-    )
-
-    assert b"unpaused" in res.data
-    wait_for_all_checks(client)
-
-    res = client.get(
-        url_for("ui.ui_views.preview_page", uuid="first"),
-        follow_redirects=True
-    )
-
-    assert b"Lets go discount" in res.data, f"When testing for Lets go discount called with content type '{content_type}'"
-    assert b"Events and Announcements" not in res.data, f"When testing for Lets go discount called with content type '{content_type}'"  # It should not be here because thats not our selector target
-
-    client.get(url_for("ui.form_delete", uuid="all"), follow_redirects=True)
-
-
-# Be sure all-in-the-wild types of RSS feeds work with xpath
-def test_rss_xpath(client, live_server):
-    for feed_header in ['', '<?xml version="1.0" encoding="utf-8"?>']:
-        set_rss_atom_feed_response(header=feed_header)
-        for content_type in RSS_XML_CONTENT_TYPES:
-            _subtest_xpath_rss(client, content_type=content_type)
```
requirements.txt

```diff
@@ -12,7 +12,7 @@ flask_wtf~=1.2
 flask~=2.3
 flask-socketio~=5.5.1
 python-socketio~=5.13.0
-python-engineio~=4.12.3
+python-engineio~=4.12.0
 inscriptis~=2.2
 pytz
 timeago~=1.0
@@ -135,7 +135,7 @@ tzdata
 pluggy ~= 1.5

 # Needed for testing, cross-platform for process and system monitoring
-psutil==7.1.0
+psutil==7.0.0

 ruff >= 0.11.2
 pre_commit >= 4.2.0
```