Compare commits


2 Commits

Author     SHA1        Message                                 Date
dgtlmoon   3704580990  Pin bs4                                 2025-09-29 11:14:42 +02:00
dgtlmoon   e2fa021f80  Attempt to fix socks test server test   2025-09-29 10:37:25 +02:00
22 changed files with 76 additions and 426 deletions

View File

@@ -11,4 +11,6 @@ updates:
- package-ecosystem: pip
directory: /
schedule:
interval: "weekly"
interval: "daily"
allow:
- dependency-name: "apprise"

View File

@@ -34,7 +34,7 @@ jobs:
# Initializes the CodeQL tools for scanning.
- name: Initialize CodeQL
uses: github/codeql-action/init@v4
uses: github/codeql-action/init@v3
with:
languages: ${{ matrix.language }}
# If you wish to specify custom queries, you can do so here or in a config file.
@@ -45,7 +45,7 @@ jobs:
# Autobuild attempts to build any compiled languages (C/C++, C#, or Java).
# If this step fails, then you should remove it and run the build manually (see below)
- name: Autobuild
uses: github/codeql-action/autobuild@v4
uses: github/codeql-action/autobuild@v3
# Command-line programs to run using the OS shell.
# 📚 https://git.io/JvXDl
@@ -59,4 +59,4 @@ jobs:
# make release
- name: Perform CodeQL Analysis
uses: github/codeql-action/analyze@v4
uses: github/codeql-action/analyze@v3

View File

@@ -95,7 +95,7 @@ jobs:
push: true
tags: |
${{ secrets.DOCKER_HUB_USERNAME }}/changedetection.io:dev,ghcr.io/${{ github.repository }}:dev
platforms: linux/amd64,linux/arm64,linux/arm/v7,linux/arm/v8
platforms: linux/amd64,linux/arm64,linux/arm/v7,linux/arm/v8,linux/arm64/v8
cache-from: type=gha
cache-to: type=gha,mode=max
@@ -133,7 +133,7 @@ jobs:
file: ./Dockerfile
push: true
tags: ${{ steps.meta.outputs.tags }}
platforms: linux/amd64,linux/arm64,linux/arm/v7,linux/arm/v8
platforms: linux/amd64,linux/arm64,linux/arm/v7,linux/arm/v8,linux/arm64/v8
cache-from: type=gha
cache-to: type=gha,mode=max
# Looks like this was disabled

View File

@@ -38,6 +38,8 @@ jobs:
dockerfile: ./Dockerfile
- platform: linux/arm/v8
dockerfile: ./Dockerfile
- platform: linux/arm64/v8
dockerfile: ./Dockerfile
# Alpine Dockerfile platforms (musl via alpine check)
- platform: linux/amd64
dockerfile: ./.github/test/Dockerfile-alpine
@@ -74,5 +76,5 @@ jobs:
file: ${{ matrix.dockerfile }}
platforms: ${{ matrix.platform }}
cache-from: type=gha
cache-to: type=gha,mode=min
cache-to: type=gha,mode=max

View File

@@ -2,7 +2,7 @@
# Read more https://github.com/dgtlmoon/changedetection.io/wiki
__version__ = '0.50.18'
__version__ = '0.50.14'
from changedetectionio.strtobool import strtobool
from json.decoder import JSONDecodeError

View File

@@ -1,7 +1,10 @@
import copy
import yaml
import functools
from flask import request, abort
from loguru import logger
from openapi_core import OpenAPI
from openapi_core.contrib.flask import FlaskOpenAPIRequest
from . import api_schema
from ..model import watch_base
@@ -31,11 +34,7 @@ schema_delete_notification_urls['required'] = ['notification_urls']
@functools.cache
def get_openapi_spec():
"""Lazy load OpenAPI spec and dependencies only when validation is needed."""
import os
import yaml # Lazy import - only loaded when API validation is actually used
from openapi_core import OpenAPI # Lazy import - saves ~10.7 MB on startup
spec_path = os.path.join(os.path.dirname(__file__), '../../docs/api-spec.yaml')
with open(spec_path, 'r') as f:
spec_dict = yaml.safe_load(f)
@@ -50,9 +49,6 @@ def validate_openapi_request(operation_id):
try:
# Skip OpenAPI validation for GET requests since they don't have request bodies
if request.method.upper() != 'GET':
# Lazy import - only loaded when actually validating a request
from openapi_core.contrib.flask import FlaskOpenAPIRequest
spec = get_openapi_spec()
openapi_request = FlaskOpenAPIRequest(request)
result = spec.unmarshal_request(openapi_request)
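One side of this hunk imports yaml and openapi_core at module level; the other defers them until get_openapi_spec() is first called, so startup never pays for dependencies the process may never use. A minimal sketch of that deferred-import pattern, assuming the spec path shown above and what I take to be openapi-core's OpenAPI.from_dict constructor (an assumption; the return line sits outside this hunk):

import functools
import os

@functools.cache
def get_openapi_spec():
    """Parse the OpenAPI spec on first use and memoise the result."""
    # Deferred imports: code paths that never validate an API request
    # never pay the yaml / openapi_core import cost.
    import yaml
    from openapi_core import OpenAPI

    spec_path = os.path.join(os.path.dirname(__file__), '../../docs/api-spec.yaml')
    with open(spec_path, 'r') as f:
        spec_dict = yaml.safe_load(f)
    return OpenAPI.from_dict(spec_dict)

Because of functools.cache the YAML file is read and parsed exactly once per process; every later call returns the same object.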

View File

@@ -191,12 +191,6 @@ nav
</ul>
</span>
</fieldset>
<fieldset class="pure-group">
{{ render_checkbox_field(form.application.form.strip_ignored_lines) }}
<span class="pure-form-message-inline">Remove any text that appears in the "Ignore text" from the output (otherwise its just ignored for change-detection)<br>
<i>Note:</i> Changing this will change the status of your existing watches, possibly trigger alerts etc.
</span>
</fieldset>
</div>
<div class="tab-pane-inner" id="api">

View File

@@ -759,7 +759,6 @@ class processor_text_json_diff_form(commonSettingsForm):
check_unique_lines = BooleanField('Only trigger when unique lines appear in all history', default=False)
remove_duplicate_lines = BooleanField('Remove duplicate lines of text', default=False)
sort_text_alphabetically = BooleanField('Sort text alphabetically', default=False)
strip_ignored_lines = TernaryNoneBooleanField('Strip ignored lines', default=None)
trim_text_whitespace = BooleanField('Trim whitespace before and after text', default=False)
filter_text_added = BooleanField('Added lines', default=True)
@@ -937,7 +936,6 @@ class globalSettingsApplicationForm(commonSettingsForm):
removepassword_button = SubmitField('Remove password', render_kw={"class": "pure-button pure-button-primary"})
render_anchor_tag_content = BooleanField('Render anchor tag content', default=False)
shared_diff_access = BooleanField('Allow anonymous access to watch history page when password is enabled', default=False, validators=[validators.Optional()])
strip_ignored_lines = BooleanField('Strip ignored lines')
rss_hide_muted_watches = BooleanField('Hide muted watches from RSS feed', default=True,
validators=[validators.Optional()])
filter_failure_notification_threshold_attempts = IntegerField('Number of times the filter can be missing before sending a notification',
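The watch-level field here is a three-state value (on / off / unset) while the global settings form keeps a plain checkbox; a watch left at None inherits the application-wide default, as the processor hunk later in this diff resolves it. A small self-contained sketch of that resolution order, with plain dictionaries standing in for the watch and the settings store:

def resolve_strip_ignored_lines(watch: dict, app_settings: dict) -> bool:
    """A per-watch tri-state overrides the global boolean only when it is actually set."""
    per_watch = watch.get('strip_ignored_lines')   # True, False, or None (= inherit)
    if per_watch is not None:
        return per_watch
    return bool(app_settings.get('strip_ignored_lines', False))

# The watch explicitly disables the option even though it is enabled globally.
assert resolve_strip_ignored_lines({'strip_ignored_lines': False},
                                   {'strip_ignored_lines': True}) is False
# An unset watch inherits the global value.
assert resolve_strip_ignored_lines({'strip_ignored_lines': None},
                                   {'strip_ignored_lines': True}) is True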

View File

@@ -57,7 +57,6 @@ class model(dict):
'rss_hide_muted_watches': True,
'schema_version' : 0,
'shared_diff_access': False,
'strip_ignored_lines': False,
'tags': {}, #@todo use Tag.model initialisers
'timezone': None, # Default IANA timezone name
'webdriver_delay': None , # Extra delay in seconds before extracting text

View File

@@ -58,7 +58,6 @@ class watch_base(dict):
'proxy': None, # Preferred proxy connection
'remote_server_reply': None, # From 'server' reply header
'sort_text_alphabetically': False,
'strip_ignored_lines': None,
'subtractive_selectors': [],
'tag': '', # Old system of text name for a tag, to be removed
'tags': [], # list of UUIDs to App.Tags

View File

@@ -1,125 +0,0 @@
"""
Content Type Detection and Stream Classification
This module provides intelligent content-type detection for changedetection.io.
It addresses the common problem where HTTP Content-Type headers are missing, incorrect,
or too generic, which would otherwise cause the wrong processor to be used.
The guess_stream_type class combines:
1. HTTP Content-Type headers (when available and reliable)
2. Python-magic library for MIME detection (analyzing actual file content)
3. Content-based pattern matching for text formats (HTML tags, XML declarations, etc.)
This multi-layered approach ensures accurate detection of RSS feeds, JSON, HTML, PDF,
plain text, CSV, YAML, and XML formats - even when servers provide misleading headers.
Used by: processors/text_json_diff/processor.py and other content processors
"""
# When to apply the 'cdata to real HTML' hack
RSS_XML_CONTENT_TYPES = [
"application/rss+xml",
"application/rdf+xml",
"application/atom+xml",
"text/rss+xml", # rare, non-standard
"application/x-rss+xml", # legacy (older feed software)
"application/x-atom+xml", # legacy (older Atom)
]
# JSON Content-types
JSON_CONTENT_TYPES = [
"application/activity+json",
"application/feed+json",
"application/json",
"application/ld+json",
"application/vnd.api+json",
]
# Generic XML Content-types (non-RSS/Atom)
XML_CONTENT_TYPES = [
"text/xml",
"application/xml",
]
HTML_PATTERNS = ['<!doctype html', '<html', '<head', '<body', '<script', '<iframe', '<div']
from loguru import logger
class guess_stream_type():
is_pdf = False
is_json = False
is_html = False
is_plaintext = False
is_rss = False
is_csv = False
is_xml = False # Generic XML, not RSS/Atom
is_yaml = False
def __init__(self, http_content_header, content):
import re
magic_content_header = http_content_header
test_content = content[:200].lower().strip()
# Remove whitespace between < and tag name for robust detection (handles '< html', '<\nhtml', etc.)
test_content_normalized = re.sub(r'<\s+', '<', test_content)
# Magic will sometimes call text/plain as text/html!
magic_result = None
try:
import magic
mime = magic.from_buffer(content[:200], mime=True) # Send the original content
logger.debug(f"Guessing mime type, original content_type '{http_content_header}', mime type detected '{mime}'")
if mime and "/" in mime:
magic_result = mime
# Ignore generic/fallback mime types from magic
if mime in ['application/octet-stream', 'application/x-empty', 'binary']:
logger.debug(f"Ignoring generic mime type '{mime}' from magic library")
# Trust magic for non-text types immediately
elif mime not in ['text/html', 'text/plain']:
magic_content_header = mime
except Exception as e:
logger.error(f"Error getting a more precise mime type from 'magic' library ({str(e)}), using content-based detection")
# Content-based detection (most reliable for text formats)
# Check for HTML patterns first - if found, override magic's text/plain
has_html_patterns = any(p in test_content_normalized for p in HTML_PATTERNS)
# Always trust headers first
if 'text/plain' in http_content_header:
self.is_plaintext = True
if any(s in http_content_header for s in RSS_XML_CONTENT_TYPES):
self.is_rss = True
elif any(s in http_content_header for s in JSON_CONTENT_TYPES):
self.is_json = True
elif any(s in http_content_header for s in XML_CONTENT_TYPES):
# Only mark as generic XML if not already detected as RSS
if not self.is_rss:
self.is_xml = True
elif 'pdf' in magic_content_header:
self.is_pdf = True
###
elif has_html_patterns or http_content_header == 'text/html':
self.is_html = True
# If magic says text/plain and we found no HTML patterns, trust it
elif magic_result == 'text/plain':
self.is_plaintext = True
logger.debug(f"Trusting magic's text/plain result (no HTML patterns detected)")
elif any(s in magic_content_header for s in JSON_CONTENT_TYPES):
self.is_json = True
# magic will call a rss document 'xml'
elif '<rss' in test_content_normalized or '<feed' in test_content_normalized or any(s in magic_content_header for s in RSS_XML_CONTENT_TYPES):
self.is_rss = True
elif test_content_normalized.startswith('<?xml') or any(s in magic_content_header for s in XML_CONTENT_TYPES):
# Generic XML that's not RSS/Atom (RSS/Atom checked above)
self.is_xml = True
elif '%pdf-1' in test_content:
self.is_pdf = True
elif http_content_header.startswith('text/'):
self.is_plaintext = True
# Only trust magic for 'text' if no other patterns matched
elif 'text' in magic_content_header:
self.is_plaintext = True
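The class above layers three signals - the server's Content-Type header, python-magic sniffing of the body, and simple tag/declaration pattern checks - and always trusts an explicit header first. A short usage sketch with synthetic inputs (the import path is the one used by the processor elsewhere in this diff; if python-magic is unavailable the class logs the error and falls back to the pattern checks):

from changedetectionio.processors.magic import guess_stream_type

# Header says "download", but the body is clearly HTML - the pattern check wins.
body = "<!DOCTYPE html><html><head><title>x</title></head><body>hi</body></html>"
guessed = guess_stream_type(http_content_header='application/octet-stream', content=body)
print(guessed.is_html)       # True
print(guessed.is_plaintext)  # False

# An explicit text/plain header is trusted even if the body looks like XML.
xml_body = '<?xml version="1.0" encoding="utf-8"?><resources><string name="a">b</string></resources>'
guessed = guess_stream_type(http_content_header='text/plain', content=xml_body)
print(guessed.is_plaintext)  # True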

View File

@@ -13,8 +13,6 @@ from changedetectionio import html_tools, content_fetchers
from changedetectionio.blueprint.price_data_follower import PRICE_DATA_TRACK_ACCEPT, PRICE_DATA_TRACK_REJECT
from loguru import logger
from changedetectionio.processors.magic import guess_stream_type
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
name = 'Webpage Text/HTML, JSON and PDF changes'
@@ -22,9 +20,6 @@ description = 'Detects all text changes where possible'
json_filter_prefixes = ['json:', 'jq:', 'jqraw:']
# Assume it's this type if the server says nothing on content-type
DEFAULT_WHEN_NO_CONTENT_TYPE_HEADER = 'text/html'
class FilterNotFoundInResponse(ValueError):
def __init__(self, msg, screenshot=None, xpath_data=None):
self.screenshot = screenshot
@@ -50,9 +45,6 @@ class perform_site_check(difference_detection_processor):
if not watch:
raise Exception("Watch no longer exists.")
ctype_header = self.fetcher.get_all_headers().get('content-type', DEFAULT_WHEN_NO_CONTENT_TYPE_HEADER).lower()
stream_content_type = guess_stream_type(http_content_header=ctype_header, content=self.fetcher.content)
# Unset any existing notification error
update_obj = {'last_notification_error': False, 'last_error': False}
@@ -62,7 +54,7 @@ class perform_site_check(difference_detection_processor):
self.xpath_data = self.fetcher.xpath_data
# Track the content type
update_obj['content_type'] = ctype_header
update_obj['content_type'] = self.fetcher.get_all_headers().get('content-type', '').lower()
# Watches added automatically in the queue manager will skip if its the same checksum as the previous run
# Saves a lot of CPU
@@ -77,12 +69,24 @@ class perform_site_check(difference_detection_processor):
# https://stackoverflow.com/questions/41817578/basic-method-chaining ?
# return content().textfilter().jsonextract().checksumcompare() ?
is_json = 'application/json' in self.fetcher.get_all_headers().get('content-type', '').lower()
is_html = not is_json
is_rss = False
ctype_header = self.fetcher.get_all_headers().get('content-type', '').lower()
# Go into RSS preprocess for converting CDATA/comment to usable text
if stream_content_type.is_rss:
self.fetcher.content = cdata_in_document_to_text(html_content=self.fetcher.content)
if any(substring in ctype_header for substring in ['application/xml', 'application/rss', 'text/xml']):
if '<rss' in self.fetcher.content[:100].lower():
self.fetcher.content = cdata_in_document_to_text(html_content=self.fetcher.content)
is_rss = True
if watch.is_pdf or stream_content_type.is_pdf:
# source: support, basically treat it as plaintext
if watch.is_source_type_url:
is_html = False
is_json = False
inline_pdf = self.fetcher.get_all_headers().get('content-disposition', '') and '%PDF-1' in self.fetcher.content[:10]
if watch.is_pdf or 'application/pdf' in self.fetcher.get_all_headers().get('content-type', '').lower() or inline_pdf:
from shutil import which
tool = os.getenv("PDF_TO_HTML_TOOL", "pdftohtml")
if not which(tool):
@@ -126,12 +130,11 @@ class perform_site_check(difference_detection_processor):
has_filter_rule = len(include_filters_rule) and len(include_filters_rule[0].strip())
has_subtractive_selectors = len(subtractive_selectors) and len(subtractive_selectors[0].strip())
if stream_content_type.is_json:
if not has_filter_rule:
# Force a reformat
include_filters_rule.append("json:$")
has_filter_rule = True
if is_json and not has_filter_rule:
include_filters_rule.append("json:$")
has_filter_rule = True
if is_json:
# Sort the JSON so we dont get false alerts when the content is just re-ordered
try:
self.fetcher.content = json.dumps(json.loads(self.fetcher.content), sort_keys=True)
@@ -139,25 +142,34 @@ class perform_site_check(difference_detection_processor):
# Might have just been a snippet, or otherwise bad JSON, continue
pass
if has_filter_rule:
for filter in include_filters_rule:
if any(prefix in filter for prefix in json_filter_prefixes):
stripped_text_from_html += html_tools.extract_json_as_string(content=self.fetcher.content, json_filter=filter)
if stripped_text_from_html:
stream_content_type.is_json = True
stream_content_type.is_html = False
if has_filter_rule:
for filter in include_filters_rule:
if any(prefix in filter for prefix in json_filter_prefixes):
stripped_text_from_html += html_tools.extract_json_as_string(content=self.fetcher.content, json_filter=filter)
is_html = False
# We have 'watch.is_source_type_url' because we should be able to use selectors on the raw HTML but return just that selected HTML
if stream_content_type.is_html or watch.is_source_type_url or stream_content_type.is_plaintext or stream_content_type.is_rss or stream_content_type.is_xml or stream_content_type.is_pdf:
if is_html or watch.is_source_type_url:
# CSS Filter, extract the HTML that matches and feed that into the existing inscriptis::get_text
self.fetcher.content = html_tools.workarounds_for_obfuscations(self.fetcher.content)
html_content = self.fetcher.content
content_type = self.fetcher.get_all_headers().get('content-type', '').lower()
is_attachment = 'attachment' in self.fetcher.get_all_headers().get('content-disposition', '').lower()
# Some kind of "text" but definitely not RSS looking
if stream_content_type.is_plaintext:
# Try to detect better mime types if its a download or not announced as HTML
if is_attachment or 'octet-stream' in content_type or not 'html' in content_type:
logger.debug(f"Got a reply that may be a download or possibly a text attachment, checking..")
try:
import magic
mime = magic.from_buffer(html_content, mime=True)
logger.debug(f"Guessing mime type, original content_type '{content_type}', mime type detected '{mime}'")
if mime and "/" in mime: # looks valid and is a valid mime type
content_type = mime
except Exception as e:
logger.error(f"Error getting a more precise mime type from 'magic' library ({str(e)}")
if 'text/' in content_type and not 'html' in content_type:
# Don't run get_text or xpath/css filters on plaintext
# We are not HTML, we are not any kind of RSS, doesnt even look like HTML
stripped_text_from_html = html_content
else:
# If not JSON, and if it's not text/plain..
@@ -174,13 +186,13 @@ class perform_site_check(difference_detection_processor):
html_content += html_tools.xpath_filter(xpath_filter=filter_rule.replace('xpath:', ''),
html_content=self.fetcher.content,
append_pretty_line_formatting=not watch.is_source_type_url,
is_rss=stream_content_type.is_rss)
is_rss=is_rss)
elif filter_rule.startswith('xpath1:'):
html_content += html_tools.xpath1_filter(xpath_filter=filter_rule.replace('xpath1:', ''),
html_content=self.fetcher.content,
append_pretty_line_formatting=not watch.is_source_type_url,
is_rss=stream_content_type.is_rss)
is_rss=is_rss)
else:
html_content += html_tools.include_filters(include_filters=filter_rule,
html_content=self.fetcher.content,
@@ -199,7 +211,7 @@ class perform_site_check(difference_detection_processor):
do_anchor = self.datastore.data["settings"]["application"].get("render_anchor_tag_content", False)
stripped_text_from_html = html_tools.html_to_text(html_content=html_content,
render_anchor_tag_content=do_anchor,
is_rss=stream_content_type.is_rss) # 1874 activate the <title workaround hack
is_rss=is_rss) # 1874 activate the <title workaround hack
if watch.get('trim_text_whitespace'):
stripped_text_from_html = '\n'.join(line.strip() for line in stripped_text_from_html.replace("\n\n", "\n").splitlines())
@@ -238,7 +250,7 @@ class perform_site_check(difference_detection_processor):
# Treat pages with no renderable text content as a change? No by default
empty_pages_are_a_change = self.datastore.data['settings']['application'].get('empty_pages_are_a_change', False)
if not stream_content_type.is_json and not empty_pages_are_a_change and len(stripped_text_from_html.strip()) == 0:
if not is_json and not empty_pages_are_a_change and len(stripped_text_from_html.strip()) == 0:
raise content_fetchers.exceptions.ReplyWithContentButNoText(url=url,
status_code=self.fetcher.get_last_status_code(),
screenshot=self.fetcher.screenshot,
@@ -303,11 +315,6 @@ class perform_site_check(difference_detection_processor):
text_for_checksuming = stripped_text_from_html
if text_to_ignore:
text_for_checksuming = html_tools.strip_ignore_text(stripped_text_from_html, text_to_ignore)
# Some people prefer to also completely remove it
strip_ignored_lines = watch.get('strip_ignored_lines') if watch.get('strip_ignored_lines') is not None else self.datastore.data['settings']['application'].get('strip_ignored_lines')
if strip_ignored_lines:
# @todo add test in the 'preview' mode, check the widget works? compare to datastruct
stripped_text_from_html = text_for_checksuming
# Re #133 - if we should strip whitespaces from triggering the change detected comparison
if text_for_checksuming and self.datastore.data['settings']['application'].get('ignore_whitespace', False):
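Both variants of the processor above keep the same normalisation trick for JSON bodies: re-serialise with sorted keys before the text is checksummed, so a server that merely reorders keys never registers as a change. A minimal standalone illustration (the checksum function here is my own sketch, not the project's):

import hashlib
import json

def normalised_checksum(raw: str) -> str:
    """Sort JSON keys before hashing so key order alone never looks like a change."""
    try:
        raw = json.dumps(json.loads(raw), sort_keys=True)
    except json.JSONDecodeError:
        pass  # not valid JSON (maybe just a snippet) - hash the raw text as-is
    return hashlib.md5(raw.encode('utf-8')).hexdigest()

a = '{"price": 10, "currency": "EUR"}'
b = '{"currency": "EUR", "price": 10}'   # same data, different key order
assert normalised_checksum(a) == normalised_checksum(b)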

View File

@@ -34,6 +34,7 @@
transition: all 0.2s ease;
cursor: pointer;
display: block;
min-width: 60px;
text-align: center;
}

File diff suppressed because one or more lines are too long

View File

@@ -26,10 +26,7 @@
<li>Changing this will affect the comparison checksum which may trigger an alert</li>
</ul>
</span>
<br><br>
<div class="pure-control-group">
{{ render_ternary_field(form.strip_ignored_lines) }}
</div>
</fieldset>
<fieldset>

View File

@@ -167,20 +167,6 @@ def test_check_basic_change_detection_functionality(client, live_server, measure
assert b'Deleted' in res.data
def test_non_text_mime_or_downloads(client, live_server, measure_memory_usage):
"""
https://github.com/dgtlmoon/changedetection.io/issues/3434
I noticed that a watched website can be monitored fine as long as the server sends content-type: text/plain; charset=utf-8,
but once the server sends content-type: application/octet-stream (which is usually done to force the browser to show the Download dialog),
changedetection somehow ignores all line breaks and treats the document file as if everything is on one line.
WHAT THIS DOES - makes the system rely on 'magic' to determine what is it
:param client:
:param live_server:
:param measure_memory_usage:
:return:
"""
with open("test-datastore/endpoint-content.txt", "w") as f:
f.write("""some random text that should be split by line
and not parsed with html_to_text
@@ -229,102 +215,3 @@ got it\r\n
res = client.get(url_for("ui.form_delete", uuid="all"), follow_redirects=True)
def test_standard_text_plain(client, live_server, measure_memory_usage):
"""
https://github.com/dgtlmoon/changedetection.io/issues/3434
I noticed that a watched website can be monitored fine as long as the server sends content-type: text/plain; charset=utf-8,
but once the server sends content-type: application/octet-stream (which is usually done to force the browser to show the Download dialog),
changedetection somehow ignores all line breaks and treats the document file as if everything is on one line.
The real bug here can be that it will try to process plain-text as HTML, losing <etc>
:param client:
:param live_server:
:param measure_memory_usage:
:return:
"""
with open("test-datastore/endpoint-content.txt", "w") as f:
f.write("""some random text that should be split by line
and not parsed with html_to_text
<title>Even this title should stay because we are just plain text</title>
this way we know that it correctly parsed as plain text
\r\n
ok\r\n
got it\r\n
""")
test_url = url_for('test_endpoint', content_type="text/plain", _external=True)
# Add our URL to the import page
res = client.post(
url_for("imports.import_page"),
data={"urls": test_url},
follow_redirects=True
)
assert b"1 Imported" in res.data
wait_for_all_checks(client)
### check the front end
res = client.get(
url_for("ui.ui_views.preview_page", uuid="first"),
follow_redirects=True
)
assert b"some random text that should be split by line\n" in res.data
####
# Check the snapshot by API that it has linefeeds too
watch_uuid = next(iter(live_server.app.config['DATASTORE'].data['watching']))
api_key = live_server.app.config['DATASTORE'].data['settings']['application'].get('api_access_token')
res = client.get(
url_for("watchhistory", uuid=watch_uuid),
headers={'x-api-key': api_key},
)
# Fetch a snapshot by timestamp, check the right one was found
res = client.get(
url_for("watchsinglehistory", uuid=watch_uuid, timestamp=list(res.json.keys())[-1]),
headers={'x-api-key': api_key},
)
assert b"some random text that should be split by line\n" in res.data
assert b"<title>Even this title should stay because we are just plain text</title>" in res.data
res = client.get(url_for("ui.form_delete", uuid="all"), follow_redirects=True)
# Server says its plaintext, we should always treat it as plaintext
def test_plaintext_even_if_xml_content(client, live_server, measure_memory_usage):
with open("test-datastore/endpoint-content.txt", "w") as f:
f.write("""<?xml version="1.0" encoding="utf-8"?>
<resources xmlns:tools="http://schemas.android.com/tools">
<!--Activity and fragment titles-->
<string name="feed_update_receiver_name">Abonnementen bijwerken</string>
</resources>
""")
test_url = url_for('test_endpoint', content_type="text/plain", _external=True)
# Add our URL to the import page
res = client.post(
url_for("imports.import_page"),
data={"urls": test_url},
follow_redirects=True
)
assert b"1 Imported" in res.data
wait_for_all_checks(client)
res = client.get(
url_for("ui.ui_views.preview_page", uuid="first"),
follow_redirects=True
)
assert b'&lt;string name=&#34;feed_update_receiver_name&#34;' in res.data
res = client.get(url_for("ui.form_delete", uuid="all"), follow_redirects=True)
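These tests all guard the same behaviour: a body served as text/plain (or forced to application/octet-stream to trigger a download dialog) must keep its line breaks and literal tags rather than being run through the HTML-to-text pipeline. A small sketch of the difference, assuming inscriptis (pinned in the requirements below) as the HTML text extractor:

from inscriptis import get_text

body = ("some random text that should be split by line\n"
        "and not parsed with html_to_text\n"
        "<title>Even this title should stay because we are just plain text</title>\n")

# Treated as plain text: line breaks and the literal <title> markup survive.
as_plaintext = body

# Pushed through an HTML text extractor instead: tags are consumed and the
# whitespace is re-flowed, so the original line layout is lost.
as_html = get_text(body)

assert "<title>" in as_plaintext
assert "<title>" not in as_html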

View File

@@ -264,6 +264,8 @@ def test_limit_tag_ui(client, live_server, measure_memory_usage):
client.get(url_for('ui.mark_all_viewed', tag=tag_uuid), follow_redirects=True)
wait_for_all_checks(client)
with open('/tmp/fuck.html', 'wb') as f:
f.write(res.data)
# Should be only 1 unviewed
res = client.get(url_for("watchlist.index"))
assert res.data.count(b' unviewed ') == 1

View File

@@ -3,8 +3,9 @@
import time
import os
import json
import logging
from flask import url_for
from .util import wait_for_all_checks
from .util import live_server_setup, wait_for_all_checks
from urllib.parse import urlparse, parse_qs
def test_consistent_history(client, live_server, measure_memory_usage):

View File

@@ -58,39 +58,3 @@ def test_ignore(client, live_server, measure_memory_usage):
# Should be in base.html
assert b'csrftoken' in res.data
def test_strip_ignore_lines(client, live_server, measure_memory_usage):
# live_server_setup(live_server) # Setup on conftest per function
set_original_ignore_response()
# Goto the settings page, add our ignore text
res = client.post(
url_for("settings.settings_page"),
data={
"requests-time_between_check-minutes": 180,
"application-ignore_whitespace": "y",
"application-strip_ignored_lines": "y",
"application-global_ignore_text": "Which is across multiple",
'application-fetch_backend': "html_requests"
},
follow_redirects=True
)
assert b"Settings updated." in res.data
test_url = url_for('test_endpoint', _external=True)
res = client.post(
url_for("imports.import_page"),
data={"urls": test_url},
follow_redirects=True
)
assert b"1 Imported" in res.data
# Give the thread time to pick it up
wait_for_all_checks(client)
uuid = next(iter(live_server.app.config['DATASTORE'].data['watching']))
# It should not be in the preview anymore
res = client.get(url_for("ui.ui_views.preview_page", uuid=uuid))
assert b'<div class="ignored">' not in res.data
assert b'Which is across multiple' not in res.data
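The test above drives the 'strip ignored lines' option end to end: normally a line matching the ignore rules is only excluded from the change comparison but still shown (highlighted) in the preview; with the option enabled it is removed from the rendered text entirely, which is why the assertions expect neither the ignored-line markup nor the phrase itself. A rough, self-contained illustration of the distinction (not the project's actual strip_ignore_text implementation, and the snapshot text is synthetic):

def strip_ignored(text: str, ignore_phrases: list) -> str:
    """Drop whole lines containing any ignore phrase (case-insensitive)."""
    kept = [line for line in text.splitlines()
            if not any(p.lower() in line.lower() for p in ignore_phrases)]
    return "\n".join(kept)

snapshot = ("Some random text here\n"
            "Which is across multiple lines\n"
            "And a final line")

stripped = strip_ignored(snapshot, ["Which is across multiple"])
print(stripped)
# Some random text here
# And a final line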

View File

@@ -111,7 +111,7 @@ def test_basic_cdata_rss_markup(client, live_server, measure_memory_usage):
set_original_cdata_xml()
test_url = url_for('test_endpoint', content_type="application/atom+xml; charset=UTF-8", _external=True)
test_url = url_for('test_endpoint', content_type="application/xml", _external=True)
# Add our URL to the import page
res = client.post(
@@ -139,7 +139,7 @@ def test_rss_xpath_filtering(client, live_server, measure_memory_usage):
set_original_cdata_xml()
test_url = url_for('test_endpoint', content_type="application/atom+xml; charset=UTF-8", _external=True)
test_url = url_for('test_endpoint', content_type="application/xml", _external=True)
res = client.post(
url_for("ui.ui_views.form_quick_watch_add"),

View File

@@ -1,42 +1,12 @@
# -*- coding: utf-8 -*-
import time
from flask import url_for
from .util import wait_for_all_checks
from ..processors.magic import RSS_XML_CONTENT_TYPES
from .util import live_server_setup, wait_for_all_checks
from ..html_tools import *
def set_rss_atom_feed_response(header=''):
test_return_data = f"""{header}<!-- Generated on Wed, 08 Oct 2025 08:42:33 -0700, really really honestly -->
<rss xmlns:atom="http://www.w3.org/2005/Atom" version="2.0">
<channel>
<atom:link href="https://store.waterpowered.com/news/collection//" rel="self" type="application/rss+xml"/>
<title>RSS Feed</title>
<link>
<![CDATA[ https://store.waterpowered.com/news/collection// ]]>
</link>
<description>
<![CDATA[ Events and Announcements for ]]>
</description>
<language>en-us</language>
<generator>water News RSS</generator>
<item>
<title> 🍁 Lets go discount</title>
<description><p class="bb_paragraph">ok heres the description</p></description>
<link>
<![CDATA[ https://store.waterpowered.com/news/app/1643320/view/511845698831908921 ]]>
</link>
<pubDate>Wed, 08 Oct 2025 15:28:55 +0000</pubDate>
<guid isPermaLink="true">https://store.waterpowered.com/news/app/1643320/view/511845698831908921</guid>
<enclosure url="https://clan.fastly.waterstatic.com/images/40721482/42822e5f00b2becf520ace9500981bb56f3a89f2.jpg" length="0" type="image/jpeg"/>
</item>
</channel>
</rss>"""
with open("test-datastore/endpoint-content.txt", "w") as f:
f.write(test_return_data)
return None
@@ -605,47 +575,3 @@ def test_xpath_20_function_string_join_matches(client, live_server, measure_memo
client.get(url_for("ui.form_delete", uuid="all"), follow_redirects=True)
def _subtest_xpath_rss(client, content_type='text/html'):
# Add our URL to the import page
test_url = url_for('test_endpoint', content_type=content_type, _external=True)
res = client.post(
url_for("ui.ui_views.form_quick_watch_add"),
data={"url": test_url, "tags": '', 'edit_and_watch_submit_button': 'Edit > Watch'},
follow_redirects=True
)
assert b"Watch added in Paused state, saving will unpause" in res.data
res = client.post(
url_for("ui.ui_edit.edit_page", uuid="first", unpause_on_save=1),
data={
"url": test_url,
"include_filters": "xpath://item",
"tags": '',
"fetch_backend": "html_requests",
"time_between_check_use_default": "y",
},
follow_redirects=True
)
assert b"unpaused" in res.data
wait_for_all_checks(client)
res = client.get(
url_for("ui.ui_views.preview_page", uuid="first"),
follow_redirects=True
)
assert b"Lets go discount" in res.data, f"When testing for Lets go discount called with content type '{content_type}'"
assert b"Events and Announcements" not in res.data, f"When testing for Lets go discount called with content type '{content_type}'" # It should not be here because thats not our selector target
client.get(url_for("ui.form_delete", uuid="all"), follow_redirects=True)
# Be sure all-in-the-wild types of RSS feeds work with xpath
def test_rss_xpath(client, live_server):
for feed_header in ['', '<?xml version="1.0" encoding="utf-8"?>']:
set_rss_atom_feed_response(header=feed_header)
for content_type in RSS_XML_CONTENT_TYPES:
_subtest_xpath_rss(client, content_type=content_type)
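The feed fixture above wraps its links and descriptions in CDATA sections, so inside the raw XML the embedded <p> markup is plain character data; an XPath or HTML pass over the document would not see those elements at all. The processor's cdata_in_document_to_text helper unwraps them before filtering; a crude regex stand-in (not the project's implementation) shows the effect:

import re

rss = '<description><![CDATA[<p class="bb_paragraph">ok heres the description</p>]]></description>'

# After unwrapping, the <p> element is real markup again and visible to
# xpath://item style filters and the HTML-to-text step.
unwrapped = re.sub(r'<!\[CDATA\[(.*?)\]\]>', r'\1', rss, flags=re.DOTALL)
print(unwrapped)
# <description><p class="bb_paragraph">ok heres the description</p></description>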

View File

@@ -12,7 +12,7 @@ flask_wtf~=1.2
flask~=2.3
flask-socketio~=5.5.1
python-socketio~=5.13.0
python-engineio~=4.12.3
python-engineio~=4.12.0
inscriptis~=2.2
pytz
timeago~=1.0
@@ -39,7 +39,7 @@ jsonpath-ng~=1.5.3
# jq not available on Windows so must be installed manually
# Notification library
apprise==1.9.5
apprise==1.9.4
# - Needed for apprise/spush, and maybe others? hopefully doesnt trigger a rust compile.
# - Requires extra wheel for rPi, adds build time for arm/v8 which is not in piwheels
@@ -135,7 +135,7 @@ tzdata
pluggy ~= 1.5
# Needed for testing, cross-platform for process and system monitoring
psutil==7.1.0
psutil==7.0.0
ruff >= 0.11.2
pre_commit >= 4.2.0