Compare commits

..

3 Commits

Author SHA1 Message Date
dgtlmoon 4c72ff1d13 oops 2026-02-11 06:34:32 +01:00
dgtlmoon c6557dbc67 Merge branch 'master' into 3854-API-tags-missing-response 2026-02-11 06:32:01 +01:00
dgtlmoon 7060e9bc02 Include missing tags reply Re #3854 2026-02-11 06:17:17 +01:00
86 changed files with 1012 additions and 5560 deletions
-33
View File
@@ -1,33 +0,0 @@
server {
listen 80;
server_name localhost;
# Test basic reverse proxy to changedetection.io
location / {
proxy_pass http://changedet-app:5000;
proxy_set_header Host $host;
proxy_set_header X-Real-IP $remote_addr;
proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
proxy_set_header X-Forwarded-Proto $scheme;
# WebSocket support
proxy_http_version 1.1;
proxy_set_header Upgrade $http_upgrade;
proxy_set_header Connection "upgrade";
}
# Test subpath deployment with X-Forwarded-Prefix
location /changedet-sub/ {
proxy_pass http://changedet-app:5000/;
proxy_set_header X-Forwarded-Prefix /changedet-sub;
proxy_set_header Host $host;
proxy_set_header X-Real-IP $remote_addr;
proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
proxy_set_header X-Forwarded-Proto $scheme;
# WebSocket support
proxy_http_version 1.1;
proxy_set_header Upgrade $http_upgrade;
proxy_set_header Connection "upgrade";
}
}
@@ -103,7 +103,7 @@ jobs:
docker run test-changedetectionio bash -c 'python3 -m unittest changedetectionio.tests.unit.test_watch_model'
docker run test-changedetectionio bash -c 'python3 -m unittest changedetectionio.tests.unit.test_jinja2_security'
docker run test-changedetectionio bash -c 'python3 -m unittest changedetectionio.tests.unit.test_semver'
docker run test-changedetectionio bash -c 'python3 -m unittest changedetectionio.tests.unit.test_html_to_text'
docker run test-changedetectionio bash -c 'python3 -m unittest changedetectionio.tests.unit.test_html_to_text'
# Basic pytest tests with ancillary services
basic-tests:
@@ -324,175 +324,6 @@ jobs:
run: |
docker run --rm --network changedet-network test-changedetectionio bash -c 'cd changedetectionio;pytest tests/smtp/test_notification_smtp.py'
nginx-reverse-proxy:
runs-on: ubuntu-latest
needs: build
timeout-minutes: 10
env:
PYTHON_VERSION: ${{ inputs.python-version }}
steps:
- uses: actions/checkout@v6
- name: Download Docker image artifact
uses: actions/download-artifact@v7
with:
name: test-changedetectionio-${{ env.PYTHON_VERSION }}
path: /tmp
- name: Load Docker image
run: |
docker load -i /tmp/test-changedetectionio.tar
- name: Spin up services
run: |
docker network create changedet-network
# Start changedetection.io container with X-Forwarded headers support
docker run --name changedet-app --hostname changedet-app --network changedet-network \
-e USE_X_SETTINGS=true \
-d test-changedetectionio
sleep 3
- name: Start nginx reverse proxy
run: |
# Start nginx with our test configuration
docker run --name nginx-proxy --network changedet-network -d -p 8080:80 --rm \
-v ${{ github.workspace }}/.github/nginx-reverse-proxy-test.conf:/etc/nginx/conf.d/default.conf:ro \
nginx:alpine
sleep 2
- name: Test reverse proxy - root path
run: |
echo "=== Testing nginx reverse proxy at root path ==="
curl --retry-connrefused --retry 6 -s http://localhost:8080/ > /tmp/nginx-test-root.html
# Check for changedetection.io UI elements
if grep -q "checkbox-uuid" /tmp/nginx-test-root.html; then
echo "✓ Found checkbox-uuid in response"
else
echo "ERROR: checkbox-uuid not found in response"
cat /tmp/nginx-test-root.html
exit 1
fi
# Check for watchlist content
if grep -q -i "watch" /tmp/nginx-test-root.html; then
echo "✓ Found watch/watchlist content in response"
else
echo "ERROR: watchlist content not found"
cat /tmp/nginx-test-root.html
exit 1
fi
echo "✓ Root path reverse proxy working correctly"
- name: Test reverse proxy - subpath with X-Forwarded-Prefix
run: |
echo "=== Testing nginx reverse proxy at subpath /changedet-sub/ ==="
curl --retry-connrefused --retry 6 -s http://localhost:8080/changedet-sub/ > /tmp/nginx-test-subpath.html
# Check for changedetection.io UI elements
if grep -q "checkbox-uuid" /tmp/nginx-test-subpath.html; then
echo "✓ Found checkbox-uuid in subpath response"
else
echo "ERROR: checkbox-uuid not found in subpath response"
cat /tmp/nginx-test-subpath.html
exit 1
fi
echo "✓ Subpath reverse proxy working correctly"
- name: Test API through reverse proxy subpath
run: |
echo "=== Testing API endpoints through nginx subpath /changedet-sub/ ==="
# Extract API key from the changedetection.io datastore
API_KEY=$(docker exec changedet-app cat /datastore/changedetection.json | grep -o '"api_access_token": *"[^"]*"' | cut -d'"' -f4)
if [ -z "$API_KEY" ]; then
echo "ERROR: Could not extract API key from datastore"
docker exec changedet-app cat /datastore/changedetection.json
exit 1
fi
echo "✓ Extracted API key: ${API_KEY:0:8}..."
# Create a watch via API through nginx proxy subpath
echo "Creating watch via POST to /changedet-sub/api/v1/watch"
RESPONSE=$(curl -s -w "\n%{http_code}" -X POST "http://localhost:8080/changedet-sub/api/v1/watch" \
-H "x-api-key: ${API_KEY}" \
-H "Content-Type: application/json" \
-d '{
"url": "https://example.com/test-nginx-proxy",
"tag": "nginx-test"
}')
HTTP_CODE=$(echo "$RESPONSE" | tail -n1)
BODY=$(echo "$RESPONSE" | head -n-1)
if [ "$HTTP_CODE" != "201" ]; then
echo "ERROR: Expected HTTP 201, got $HTTP_CODE"
echo "Response: $BODY"
exit 1
fi
echo "✓ Watch created successfully (HTTP 201)"
# Extract the watch UUID from response
WATCH_UUID=$(echo "$BODY" | grep -o '"uuid": *"[^"]*"' | cut -d'"' -f4)
echo "✓ Watch UUID: $WATCH_UUID"
# Update the watch via PUT through nginx proxy subpath
echo "Updating watch via PUT to /changedet-sub/api/v1/watch/${WATCH_UUID}"
RESPONSE=$(curl -s -w "\n%{http_code}" -X PUT "http://localhost:8080/changedet-sub/api/v1/watch/${WATCH_UUID}" \
-H "x-api-key: ${API_KEY}" \
-H "Content-Type: application/json" \
-d '{
"paused": true
}')
HTTP_CODE=$(echo "$RESPONSE" | tail -n1)
BODY=$(echo "$RESPONSE" | head -n-1)
if [ "$HTTP_CODE" != "200" ]; then
echo "ERROR: Expected HTTP 200, got $HTTP_CODE"
echo "Response: $BODY"
exit 1
fi
if echo "$BODY" | grep -q 'OK'; then
echo "✓ Watch updated successfully (HTTP 200, response: OK)"
else
echo "ERROR: Expected response 'OK', got: $BODY"
echo "Response: $BODY"
exit 1
fi
# Verify the watch is paused via GET
echo "Verifying watch is paused via GET"
RESPONSE=$(curl -s "http://localhost:8080/changedet-sub/api/v1/watch/${WATCH_UUID}" \
-H "x-api-key: ${API_KEY}")
if echo "$RESPONSE" | grep -q '"paused": *true'; then
echo "✓ Watch is paused as expected"
else
echo "ERROR: Watch paused state not confirmed"
echo "Response: $RESPONSE"
exit 1
fi
echo "✓ API tests through nginx subpath completed successfully"
- name: Cleanup nginx test
if: always()
run: |
docker logs nginx-proxy || true
docker logs changedet-app || true
docker stop nginx-proxy changedet-app || true
docker rm nginx-proxy changedet-app || true
# Proxy tests
proxy-tests:
runs-on: ubuntu-latest
@@ -685,142 +516,3 @@ jobs:
exit 1
fi
docker rm sig-test
# Upgrade path test
upgrade-path-test:
runs-on: ubuntu-latest
needs: build
timeout-minutes: 25
env:
PYTHON_VERSION: ${{ inputs.python-version }}
steps:
- uses: actions/checkout@v6
with:
fetch-depth: 0 # Fetch all history and tags for upgrade testing
- name: Set up Python ${{ env.PYTHON_VERSION }}
uses: actions/setup-python@v6
with:
python-version: ${{ env.PYTHON_VERSION }}
- name: Check upgrade works without error
run: |
echo "=== Testing upgrade path from 0.49.1 to ${{ github.ref_name }} (${{ github.sha }}) ==="
# Checkout old version and create datastore
git checkout 0.49.1
python3 -m venv .venv
source .venv/bin/activate
pip install -r requirements.txt
pip install 'pyOpenSSL>=23.2.0'
echo "=== Running version 0.49.1 to create datastore ==="
python3 ./changedetection.py -C -d /tmp/data &
APP_PID=$!
# Wait for app to be ready
echo "Waiting for 0.49.1 to be ready..."
sleep 6
# Extract API key from datastore (0.49.1 uses url-watches.json)
API_KEY=$(jq -r '.settings.application.api_access_token // empty' /tmp/data/url-watches.json)
echo "API Key: ${API_KEY:0:8}..."
# Create a watch with tag "github-group-test" via API
echo "Creating test watch with tag via API..."
curl -X POST "http://127.0.0.1:5000/api/v1/watch" \
-H "x-api-key: ${API_KEY}" \
-H "Content-Type: application/json" \
--show-error --fail \
--retry 6 --retry-delay 1 --retry-connrefused \
-d '{
"url": "https://example.com/upgrade-test",
"tag": "github-group-test"
}'
echo "✓ Created watch with tag 'github-group-test'"
# Create a specific test URL watch
echo "Creating test URL watch via API..."
curl -X POST "http://127.0.0.1:5000/api/v1/watch" \
-H "x-api-key: ${API_KEY}" \
-H "Content-Type: application/json" \
--show-error --fail \
-d '{
"url": "http://localhost/test.txt"
}'
echo "✓ Created watch for 'http://localhost/test.txt' in version 0.49.1"
# Stop the old version gracefully
kill $APP_PID
wait $APP_PID || true
echo "✓ Version 0.49.1 stopped"
# Upgrade to current version (use commit SHA since we're in detached HEAD)
echo "Upgrading to commit ${{ github.sha }}"
git checkout ${{ github.sha }}
pip install -r requirements.txt
echo "=== Running current version (commit ${{ github.sha }}) with old datastore (testing mode) ==="
TESTING_SHUTDOWN_AFTER_DATASTORE_LOAD=1 python3 ./changedetection.py -d /tmp/data > /tmp/upgrade-test.log 2>&1
echo "=== Upgrade test output ==="
cat /tmp/upgrade-test.log
echo "✓ Datastore upgraded successfully"
# Now start the current version normally to verify the tag survived
echo "=== Starting current version to verify tag exists after upgrade ==="
timeout 20 python3 ./changedetection.py -d /tmp/data > /tmp/ui-test.log 2>&1 &
APP_PID=$!
# Wait for app to be ready and fetch UI
echo "Waiting for current version to be ready..."
sleep 5
curl --retry 6 --retry-delay 1 --retry-connrefused --silent http://127.0.0.1:5000 > /tmp/ui-output.html
# Verify tag exists in UI
if grep -q "github-group-test" /tmp/ui-output.html; then
echo "✓ Tag 'github-group-test' found in UI after upgrade"
else
echo "ERROR: Tag 'github-group-test' not found in UI after upgrade"
echo "=== UI Output ==="
cat /tmp/ui-output.html
echo "=== App Log ==="
cat /tmp/ui-test.log
kill $APP_PID || true
exit 1
fi
# Verify test URL exists in UI
if grep -q "http://localhost/test.txt" /tmp/ui-output.html; then
echo "✓ Watch URL 'http://localhost/test.txt' found in UI after upgrade"
else
echo "ERROR: Watch URL 'http://localhost/test.txt' not found in UI after upgrade"
echo "=== UI Output ==="
cat /tmp/ui-output.html
echo "=== App Log ==="
cat /tmp/ui-test.log
kill $APP_PID || true
exit 1
fi
# Cleanup
kill $APP_PID || true
wait $APP_PID || true
echo ""
echo "✓✓✓ Upgrade test passed: 0.49.1 → ${{ github.ref_name }} ✓✓✓"
echo " - Commit: ${{ github.sha }}"
echo " - Datastore migrated successfully"
echo " - Tag 'github-group-test' survived upgrade"
echo " - Watch URL 'http://localhost/test.txt' survived upgrade"
echo "✓ Upgrade test passed: 0.49.1 → ${{ github.ref_name }}"
- name: Upload upgrade test logs
if: always()
uses: actions/upload-artifact@v6
with:
name: upgrade-test-logs-py${{ env.PYTHON_VERSION }}
path: /tmp/upgrade-test.log
+3 -11
View File
@@ -2,7 +2,7 @@
# Read more https://github.com/dgtlmoon/changedetection.io/wiki
# Semver means never use .01, or 00. Should be .1.
__version__ = '0.53.7'
__version__ = '0.52.9'
from changedetectionio.strtobool import strtobool
from json.decoder import JSONDecodeError
@@ -371,15 +371,7 @@ def main():
# Dont' start if the JSON DB looks corrupt
logger.critical(f"ERROR: JSON DB or Proxy List JSON at '{app_config['datastore_path']}' appears to be corrupt, aborting.")
logger.critical(str(e))
sys.exit(1)
# Testing mode: Exit cleanly after datastore initialization (for CI/CD upgrade tests)
if os.environ.get('TESTING_SHUTDOWN_AFTER_DATASTORE_LOAD'):
logger.success(f"TESTING MODE: Datastore loaded successfully from {app_config['datastore_path']}")
logger.success(f"TESTING MODE: Schema version: {datastore.data['settings']['application'].get('schema_version', 'unknown')}")
logger.success(f"TESTING MODE: Loaded {len(datastore.data['watching'])} watches")
logger.success("TESTING MODE: Exiting cleanly (TESTING_SHUTDOWN_AFTER_DATASTORE_LOAD is set)")
sys.exit(0)
return
# Apply all_paused setting if specified via CLI
if all_paused is not None:
@@ -610,7 +602,7 @@ def main():
@app.context_processor
def inject_template_globals():
return dict(right_sticky="v"+__version__,
return dict(right_sticky="v{}".format(datastore.data['version_tag']),
new_version_available=app.config['NEW_VERSION_AVAILABLE'],
has_password=datastore.data['settings']['application']['password'] != False,
socket_io_enabled=datastore.data['settings']['application'].get('ui', {}).get('socket_io_enabled', True),
+7 -169
View File
@@ -4,10 +4,6 @@ from flask import request
from functools import wraps
from . import auth, validate_openapi_request
from ..validate_url import is_safe_valid_url
import json
# Number of URLs above which import switches to background processing
IMPORT_SWITCH_TO_BACKGROUND_THRESHOLD = 20
def default_content_type(content_type='text/plain'):
@@ -23,76 +19,6 @@ def default_content_type(content_type='text/plain'):
return decorator
def convert_query_param_to_type(value, schema_property):
"""
Convert a query parameter string to the appropriate type based on schema definition.
Args:
value: String value from query parameter
schema_property: Schema property definition with 'type' or 'anyOf' field
Returns:
Converted value in the appropriate type
Supports both OpenAPI 3.1 formats:
- type: [string, 'null'] (array format)
- anyOf: [{type: string}, {type: null}] (anyOf format)
"""
prop_type = schema_property.get('type')
# Handle OpenAPI 3.1 type arrays: type: [string, 'null']
if isinstance(prop_type, list):
# Use the first non-null type from the array
for t in prop_type:
if t != 'null':
prop_type = t
break
else:
prop_type = None
# Handle anyOf schemas (older format)
elif 'anyOf' in schema_property:
# Use the first non-null type from anyOf
for option in schema_property['anyOf']:
if option.get('type') and option.get('type') != 'null':
prop_type = option.get('type')
break
else:
prop_type = None
# Handle array type (e.g., notification_urls)
if prop_type == 'array':
# Support both comma-separated and JSON array format
if value.startswith('['):
try:
return json.loads(value)
except json.JSONDecodeError:
return [v.strip() for v in value.split(',')]
return [v.strip() for v in value.split(',')]
# Handle object type (e.g., time_between_check, headers)
elif prop_type == 'object':
try:
return json.loads(value)
except json.JSONDecodeError:
raise ValueError(f"Invalid JSON object for field: {value}")
# Handle boolean type
elif prop_type == 'boolean':
return strtobool(value)
# Handle integer type
elif prop_type == 'integer':
return int(value)
# Handle number type (float)
elif prop_type == 'number':
return float(value)
# Default: return as string
return value
class Import(Resource):
def __init__(self, **kwargs):
# datastore is a black box dependency
@@ -102,128 +28,40 @@ class Import(Resource):
@default_content_type('text/plain') #3547 #3542
@validate_openapi_request('importWatches')
def post(self):
"""Import a list of watched URLs with optional watch configuration."""
from . import get_watch_schema_properties
# Special parameters that are NOT watch configuration
special_params = {'tag', 'tag_uuids', 'dedupe', 'proxy'}
"""Import a list of watched URLs."""
extras = {}
# Handle special 'proxy' parameter
if request.args.get('proxy'):
plist = self.datastore.proxy_list
if not request.args.get('proxy') in plist:
proxy_list_str = ', '.join(plist) if plist else 'none configured'
return f"Invalid proxy choice, currently supported proxies are '{proxy_list_str}'", 400
return "Invalid proxy choice, currently supported proxies are '{}'".format(', '.join(plist)), 400
else:
extras['proxy'] = request.args.get('proxy')
# Handle special 'dedupe' parameter
dedupe = strtobool(request.args.get('dedupe', 'true'))
# Handle special 'tag' and 'tag_uuids' parameters
tags = request.args.get('tag')
tag_uuids = request.args.get('tag_uuids')
if tag_uuids:
tag_uuids = tag_uuids.split(',')
# Extract ALL other query parameters as watch configuration
# Get schema from OpenAPI spec (replaces old schema_create_watch)
schema_properties = get_watch_schema_properties()
for param_name, param_value in request.args.items():
# Skip special parameters
if param_name in special_params:
continue
# Skip if not in schema (unknown parameter)
if param_name not in schema_properties:
return f"Unknown watch configuration parameter: {param_name}", 400
# Convert to appropriate type based on schema
try:
converted_value = convert_query_param_to_type(param_value, schema_properties[param_name])
extras[param_name] = converted_value
except (ValueError, json.JSONDecodeError) as e:
return f"Invalid value for parameter '{param_name}': {str(e)}", 400
# Validate processor if provided
if 'processor' in extras:
from changedetectionio.processors import available_processors
available = [p[0] for p in available_processors()]
if extras['processor'] not in available:
return f"Invalid processor '{extras['processor']}'. Available processors: {', '.join(available)}", 400
# Validate fetch_backend if provided
if 'fetch_backend' in extras:
from changedetectionio.content_fetchers import available_fetchers
available = [f[0] for f in available_fetchers()]
# Also allow 'system' and extra_browser_* patterns
is_valid = (
extras['fetch_backend'] == 'system' or
extras['fetch_backend'] in available or
extras['fetch_backend'].startswith('extra_browser_')
)
if not is_valid:
return f"Invalid fetch_backend '{extras['fetch_backend']}'. Available: system, {', '.join(available)}", 400
# Validate notification_urls if provided
if 'notification_urls' in extras:
from wtforms import ValidationError
from changedetectionio.api.Notifications import validate_notification_urls
try:
validate_notification_urls(extras['notification_urls'])
except ValidationError as e:
return f"Invalid notification_urls: {str(e)}", 400
urls = request.get_data().decode('utf8').splitlines()
# Clean and validate URLs upfront
urls_to_import = []
added = []
for url in urls:
url = url.strip()
if not len(url):
continue
# Validate URL
# If hosts that only contain alphanumerics are allowed ("localhost" for example)
if not is_safe_valid_url(url):
return f"Invalid or unsupported URL - {url}", 400
# Check for duplicates if dedupe is enabled
if dedupe and self.datastore.url_exists(url):
continue
urls_to_import.append(url)
new_uuid = self.datastore.add_watch(url=url, extras=extras, tag=tags, tag_uuids=tag_uuids)
added.append(new_uuid)
# For small imports, process synchronously for immediate feedback
if len(urls_to_import) < IMPORT_SWITCH_TO_BACKGROUND_THRESHOLD:
added = []
for url in urls_to_import:
new_uuid = self.datastore.add_watch(url=url, extras=extras, tag=tags, tag_uuids=tag_uuids)
added.append(new_uuid)
return added, 200
# For large imports (>= 20), process in background thread
else:
import threading
from loguru import logger
def import_watches_background():
"""Background thread to import watches - discarded after completion."""
try:
added_count = 0
for url in urls_to_import:
try:
self.datastore.add_watch(url=url, extras=extras, tag=tags, tag_uuids=tag_uuids)
added_count += 1
except Exception as e:
logger.error(f"Error importing URL {url}: {e}")
logger.info(f"Background import complete: {added_count} watches created")
except Exception as e:
logger.error(f"Error in background import: {e}")
# Start background thread and return immediately
thread = threading.Thread(target=import_watches_background, daemon=True, name="ImportWatches-Background")
thread.start()
return {'status': f'Importing {len(urls_to_import)} URLs in background', 'count': len(urls_to_import)}, 202
return added
+5
View File
@@ -1,6 +1,8 @@
from flask_expects_json import expects_json
from flask_restful import Resource, abort
from flask import request
from . import auth, validate_openapi_request
from . import schema_create_notification_urls, schema_delete_notification_urls
class Notifications(Resource):
def __init__(self, **kwargs):
@@ -20,6 +22,7 @@ class Notifications(Resource):
@auth.check_token
@validate_openapi_request('addNotifications')
@expects_json(schema_create_notification_urls)
def post(self):
"""Create Notification URLs."""
@@ -47,6 +50,7 @@ class Notifications(Resource):
@auth.check_token
@validate_openapi_request('replaceNotifications')
@expects_json(schema_create_notification_urls)
def put(self):
"""Replace Notification URLs."""
json_data = request.get_json()
@@ -69,6 +73,7 @@ class Notifications(Resource):
@auth.check_token
@validate_openapi_request('deleteNotifications')
@expects_json(schema_delete_notification_urls)
def delete(self):
"""Delete Notification URLs."""
+20 -60
View File
@@ -1,5 +1,6 @@
from changedetectionio import queuedWatchMetaData
from changedetectionio import worker_pool
from flask_expects_json import expects_json
from flask_restful import abort, Resource
from loguru import logger
@@ -7,7 +8,8 @@ import threading
from flask import request
from . import auth
from . import validate_openapi_request
# Import schemas from __init__.py
from . import schema_tag, schema_create_tag, schema_update_tag, validate_openapi_request
class Tag(Resource):
@@ -67,25 +69,7 @@ class Tag(Resource):
tag.commit()
return "OK", 200
# Filter out Watch-specific runtime fields that don't apply to Tags (yet)
# TODO: Future enhancement - aggregate these values from all Watches that have this tag:
# - check_count: sum of all watches' check_count
# - last_checked: most recent last_checked from all watches
# - last_changed: most recent last_changed from all watches
# - consecutive_filter_failures: count of watches with failures
# - etc.
# These come from watch_base inheritance but currently have no meaningful value for Tags
watch_only_fields = {
'browser_steps_last_error_step', 'check_count', 'consecutive_filter_failures',
'content-type', 'fetch_time', 'last_changed', 'last_checked', 'last_error',
'last_notification_error', 'last_viewed', 'notification_alert_count',
'page_title', 'previous_md5', 'remote_server_reply'
}
# Create clean tag dict without Watch-specific fields
clean_tag = {k: v for k, v in tag.items() if k not in watch_only_fields}
return clean_tag
return tag
@auth.check_token
@validate_openapi_request('deleteTag')
@@ -97,6 +81,17 @@ class Tag(Resource):
# Delete the tag, and any tag reference
del self.datastore.data['settings']['application']['tags'][uuid]
# Delete tag.json file if it exists
import os
tag_dir = os.path.join(self.datastore.datastore_path, uuid)
tag_json = os.path.join(tag_dir, "tag.json")
if os.path.exists(tag_json):
try:
os.unlink(tag_json)
logger.info(f"Deleted tag.json for tag {uuid}")
except Exception as e:
logger.error(f"Failed to delete tag.json for tag {uuid}: {e}")
# Remove tag from all watches
for watch_uuid, watch in self.datastore.data['watching'].items():
if watch.get('tags') and uuid in watch['tags']:
@@ -107,73 +102,38 @@ class Tag(Resource):
@auth.check_token
@validate_openapi_request('updateTag')
@expects_json(schema_update_tag)
def put(self, uuid):
"""Update tag information."""
tag = self.datastore.data['settings']['application']['tags'].get(uuid)
if not tag:
abort(404, message='No tag exists with the UUID of {}'.format(uuid))
# Make a mutable copy of request.json for modification
json_data = dict(request.json)
# Validate notification_urls if provided
if 'notification_urls' in json_data:
if 'notification_urls' in request.json:
from wtforms import ValidationError
from changedetectionio.api.Notifications import validate_notification_urls
try:
notification_urls = json_data.get('notification_urls', [])
notification_urls = request.json.get('notification_urls', [])
validate_notification_urls(notification_urls)
except ValidationError as e:
return str(e), 400
# Filter out readOnly fields (extracted from OpenAPI spec Tag schema)
# These are system-managed fields that should never be user-settable
from . import get_readonly_tag_fields
readonly_fields = get_readonly_tag_fields()
# Tag model inherits from watch_base but has no @property attributes of its own
# So we only need to filter readOnly fields
for field in readonly_fields:
json_data.pop(field, None)
# Validate remaining fields - reject truly unknown fields
# Get valid fields from Tag schema
from . import get_tag_schema_properties
valid_fields = set(get_tag_schema_properties().keys())
# Check for unknown fields
unknown_fields = set(json_data.keys()) - valid_fields
if unknown_fields:
return f"Unknown field(s): {', '.join(sorted(unknown_fields))}", 400
tag.update(json_data)
tag.update(request.json)
tag.commit()
# Clear checksums for all watches using this tag to force reprocessing
# Tag changes affect inherited configuration
cleared_count = self.datastore.clear_checksums_for_tag(uuid)
logger.info(f"Tag {uuid} updated via API, cleared {cleared_count} watch checksums")
return "OK", 200
@auth.check_token
@validate_openapi_request('createTag')
# Only cares for {'title': 'xxxx'}
def post(self):
"""Create a single tag/group."""
json_data = request.get_json()
title = json_data.get("title",'').strip()
# Validate that only valid fields are provided
# Get valid fields from Tag schema
from . import get_tag_schema_properties
valid_fields = set(get_tag_schema_properties().keys())
# Check for unknown fields
unknown_fields = set(json_data.keys()) - valid_fields
if unknown_fields:
return f"Unknown field(s): {', '.join(sorted(unknown_fields))}", 400
new_uuid = self.datastore.add_tag(title=title)
if new_uuid:
+5 -30
View File
@@ -8,11 +8,13 @@ from . import auth
from changedetectionio import queuedWatchMetaData, strtobool
from changedetectionio import worker_pool
from flask import request, make_response, send_from_directory
from flask_expects_json import expects_json
from flask_restful import abort, Resource
from loguru import logger
import copy
from . import validate_openapi_request, get_readonly_watch_fields
# Import schemas from __init__.py
from . import schema, schema_create_watch, schema_update_watch, validate_openapi_request
from ..notification import valid_notification_formats
from ..notification.handler import newline_re
@@ -119,6 +121,7 @@ class Watch(Resource):
@auth.check_token
@validate_openapi_request('updateWatch')
@expects_json(schema_update_watch)
def put(self, uuid):
"""Update watch information."""
watch = self.datastore.data['watching'].get(uuid)
@@ -172,35 +175,6 @@ class Watch(Resource):
# Extract and remove processor config fields from json_data
processor_config_data = processors.extract_processor_config_from_form_data(json_data)
# Filter out readOnly fields (extracted from OpenAPI spec Watch schema)
# These are system-managed fields that should never be user-settable
readonly_fields = get_readonly_watch_fields()
# Also filter out @property attributes (computed/derived values from the model)
# These are not stored and should be ignored in PUT requests
from changedetectionio.model.Watch import model as WatchModel
property_fields = WatchModel.get_property_names()
# Combine both sets of fields to ignore
fields_to_ignore = readonly_fields | property_fields
# Remove all ignored fields from update data
for field in fields_to_ignore:
json_data.pop(field, None)
# Validate remaining fields - reject truly unknown fields
# Get valid fields from WatchBase schema
from . import get_watch_schema_properties
valid_fields = set(get_watch_schema_properties().keys())
# Also allow last_viewed (explicitly defined in UpdateWatch schema)
valid_fields.add('last_viewed')
# Check for unknown fields
unknown_fields = set(json_data.keys()) - valid_fields
if unknown_fields:
return f"Unknown field(s): {', '.join(sorted(unknown_fields))}", 400
# Update watch with regular (non-processor-config) fields
watch.update(json_data)
watch.commit()
@@ -419,6 +393,7 @@ class CreateWatch(Resource):
@auth.check_token
@validate_openapi_request('createWatch')
@expects_json(schema_create_watch)
def post(self):
"""Create a single watch."""
+37 -97
View File
@@ -1,6 +1,41 @@
import copy
import functools
from flask import request, abort
from loguru import logger
from . import api_schema
from ..model import watch_base
# Build a JSON Schema atleast partially based on our Watch model
watch_base_config = watch_base()
schema = api_schema.build_watch_json_schema(watch_base_config)
schema_create_watch = copy.deepcopy(schema)
schema_create_watch['required'] = ['url']
del schema_create_watch['properties']['last_viewed']
# Allow processor_config_* fields (handled separately in endpoint)
schema_create_watch['patternProperties'] = {
'^processor_config_': {'type': ['string', 'number', 'boolean', 'object', 'array', 'null']}
}
schema_update_watch = copy.deepcopy(schema)
schema_update_watch['additionalProperties'] = False
# Allow processor_config_* fields (handled separately in endpoint)
schema_update_watch['patternProperties'] = {
'^processor_config_': {'type': ['string', 'number', 'boolean', 'object', 'array', 'null']}
}
# Tag schema is also based on watch_base since Tag inherits from it
schema_tag = copy.deepcopy(schema)
schema_create_tag = copy.deepcopy(schema_tag)
schema_create_tag['required'] = ['title']
schema_update_tag = copy.deepcopy(schema_tag)
schema_update_tag['additionalProperties'] = False
schema_notification_urls = copy.deepcopy(schema)
schema_create_notification_urls = copy.deepcopy(schema_notification_urls)
schema_create_notification_urls['required'] = ['notification_urls']
schema_delete_notification_urls = copy.deepcopy(schema_notification_urls)
schema_delete_notification_urls['required'] = ['notification_urls']
@functools.cache
def get_openapi_spec():
@@ -19,79 +54,6 @@ def get_openapi_spec():
_openapi_spec = OpenAPI.from_dict(spec_dict)
return _openapi_spec
@functools.cache
def get_openapi_schema_dict():
"""
Get the raw OpenAPI spec dictionary for schema access.
Used by Import endpoint to validate and convert query parameters.
Returns the YAML dict directly (not the OpenAPI object).
"""
import os
import yaml
spec_path = os.path.join(os.path.dirname(__file__), '../../docs/api-spec.yaml')
if not os.path.exists(spec_path):
spec_path = os.path.join(os.path.dirname(__file__), '../docs/api-spec.yaml')
with open(spec_path, 'r', encoding='utf-8') as f:
return yaml.safe_load(f)
@functools.cache
def _resolve_schema_properties(schema_name):
"""
Generic helper to resolve schema properties, including allOf inheritance.
Args:
schema_name: Name of the schema (e.g., 'WatchBase', 'Watch', 'Tag')
Returns:
dict: All properties including inherited ones from $ref schemas
"""
spec_dict = get_openapi_schema_dict()
schema = spec_dict['components']['schemas'].get(schema_name, {})
properties = {}
# Handle allOf (schema inheritance)
if 'allOf' in schema:
for item in schema['allOf']:
# Resolve $ref to parent schema
if '$ref' in item:
ref_path = item['$ref'].split('/')[-1]
ref_schema = spec_dict['components']['schemas'].get(ref_path, {})
properties.update(ref_schema.get('properties', {}))
# Add schema-specific properties
if 'properties' in item:
properties.update(item['properties'])
else:
# Direct properties (no inheritance)
properties = schema.get('properties', {})
return properties
@functools.cache
def get_watch_schema_properties():
"""
Extract watch schema properties from OpenAPI spec for Import endpoint.
Returns WatchBase properties (all writable Watch fields).
"""
return _resolve_schema_properties('WatchBase')
# Import readonly field utilities from shared module (avoids circular dependencies with model layer)
from changedetectionio.model.schema_utils import get_readonly_watch_fields, get_readonly_tag_fields
@functools.cache
def get_tag_schema_properties():
"""
Extract Tag schema properties from OpenAPI spec.
Returns WatchBase properties + Tag-specific properties (overrides_watch).
"""
return _resolve_schema_properties('Tag')
def validate_openapi_request(operation_id):
"""Decorator to validate incoming requests against OpenAPI spec."""
def decorator(f):
@@ -103,7 +65,6 @@ def validate_openapi_request(operation_id):
if request.method.upper() != 'GET':
# Lazy import - only loaded when actually validating a request
from openapi_core.contrib.flask import FlaskOpenAPIRequest
from openapi_core.templating.paths.exceptions import ServerNotFound, PathNotFound, PathError
spec = get_openapi_spec()
openapi_request = FlaskOpenAPIRequest(request)
@@ -111,29 +72,8 @@ def validate_openapi_request(operation_id):
if result.errors:
error_details = []
for error in result.errors:
# Skip path/server validation errors for reverse proxy compatibility
# Flask routing already validates that endpoints exist (returns 404 if not).
# OpenAPI validation here is primarily for request body schema validation.
# When behind nginx/reverse proxy, URLs may have path prefixes that don't
# match the OpenAPI server definitions, causing false positives.
if isinstance(error, PathError):
logger.debug(f"API Call - Skipping path/server validation (delegated to Flask): {error}")
continue
error_str = str(error)
# Extract detailed schema errors from __cause__
if hasattr(error, '__cause__') and hasattr(error.__cause__, 'schema_errors'):
for schema_error in error.__cause__.schema_errors:
field = '.'.join(str(p) for p in schema_error.path) if schema_error.path else 'body'
msg = schema_error.message if hasattr(schema_error, 'message') else str(schema_error)
error_details.append(f"{field}: {msg}")
else:
error_details.append(error_str)
# Only raise if we have actual validation errors (not path/server issues)
if error_details:
logger.error(f"API Call - Validation failed: {'; '.join(error_details)}")
raise BadRequest(f"Validation failed: {'; '.join(error_details)}")
error_details.append(str(error))
raise BadRequest(f"OpenAPI validation failed: {error_details}")
except BadRequest:
# Re-raise BadRequest exceptions (validation failures)
raise
+162
View File
@@ -0,0 +1,162 @@
# Responsible for building the storage dict into a set of rules ("JSON Schema") acceptable via the API
# Probably other ways to solve this when the backend switches to some ORM
from changedetectionio.notification import valid_notification_formats
def build_time_between_check_json_schema():
# Setup time between check schema
schema_properties_time_between_check = {
"type": "object",
"additionalProperties": False,
"properties": {}
}
for p in ['weeks', 'days', 'hours', 'minutes', 'seconds']:
schema_properties_time_between_check['properties'][p] = {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
]
}
return schema_properties_time_between_check
def build_watch_json_schema(d):
# Base JSON schema
schema = {
'type': 'object',
'properties': {},
}
for k, v in d.items():
# @todo 'integer' is not covered here because its almost always for internal usage
if isinstance(v, type(None)):
schema['properties'][k] = {
"anyOf": [
{"type": "null"},
]
}
elif isinstance(v, list):
schema['properties'][k] = {
"anyOf": [
{"type": "array",
# Always is an array of strings, like text or regex or something
"items": {
"type": "string",
"maxLength": 5000
}
},
]
}
elif isinstance(v, bool):
schema['properties'][k] = {
"anyOf": [
{"type": "boolean"},
]
}
elif isinstance(v, str):
schema['properties'][k] = {
"anyOf": [
{"type": "string",
"maxLength": 5000},
]
}
# Can also be a string (or None by default above)
for v in ['body',
'notification_body',
'notification_format',
'notification_title',
'proxy',
'tag',
'title',
'webdriver_js_execute_code'
]:
schema['properties'][v]['anyOf'].append({'type': 'string', "maxLength": 5000})
for v in ['last_viewed']:
schema['properties'][v] = {
"type": "integer",
"description": "Unix timestamp in seconds of the last time the watch was viewed.",
"minimum": 0
}
# None or Boolean
schema['properties']['track_ldjson_price_data']['anyOf'].append({'type': 'boolean'})
schema['properties']['method'] = {"type": "string",
"enum": ["GET", "POST", "DELETE", "PUT"]
}
schema['properties']['fetch_backend']['anyOf'].append({"type": "string",
"enum": ["html_requests", "html_webdriver"]
})
schema['properties']['processor'] = {"anyOf": [
{"type": "string", "enum": ["restock_diff", "text_json_diff"]},
{"type": "null"}
]}
# All headers must be key/value type dict
schema['properties']['headers'] = {
"type": "object",
"patternProperties": {
# Should always be a string:string type value
".*": {"type": "string"},
}
}
schema['properties']['notification_format'] = {'type': 'string',
'enum': list(valid_notification_formats.keys())
}
# Stuff that shouldn't be available but is just state-storage
for v in ['previous_md5', 'last_error', 'has_ldjson_price_data', 'previous_md5_before_filters', 'uuid']:
del schema['properties'][v]
schema['properties']['webdriver_delay']['anyOf'].append({'type': 'integer'})
schema['properties']['time_between_check'] = build_time_between_check_json_schema()
schema['properties']['time_between_check_use_default'] = {
"type": "boolean",
"default": True,
"description": "Whether to use global settings for time between checks - defaults to true if not set"
}
schema['properties']['browser_steps'] = {
"anyOf": [
{
"type": "array",
"items": {
"type": "object",
"properties": {
"operation": {
"type": ["string", "null"],
"maxLength": 5000 # Allows null and any string up to 5000 chars (including "")
},
"selector": {
"type": ["string", "null"],
"maxLength": 5000
},
"optional_value": {
"type": ["string", "null"],
"maxLength": 5000
}
},
"required": ["operation", "selector", "optional_value"],
"additionalProperties": False # No extra keys allowed
}
},
{"type": "null"}, # Allows null for `browser_steps`
{"type": "array", "maxItems": 0} # Allows empty array []
]
}
# headers ?
return schema
@@ -13,7 +13,7 @@ from loguru import logger
BACKUP_FILENAME_FORMAT = "changedetection-backup-{}.zip"
def create_backup(datastore_path, watches: dict, tags: dict = None):
def create_backup(datastore_path, watches: dict):
logger.debug("Creating backup...")
import zipfile
from pathlib import Path
@@ -45,15 +45,6 @@ def create_backup(datastore_path, watches: dict, tags: dict = None):
if os.path.isfile(secret_file):
zipObj.write(secret_file, arcname="secret.txt")
# Add tag data directories (each tag has its own {uuid}/tag.json)
for uuid, tag in (tags or {}).items():
for f in Path(tag.data_dir).glob('*'):
zipObj.write(f,
arcname=os.path.join(f.parts[-2], f.parts[-1]),
compress_type=zipfile.ZIP_DEFLATED,
compresslevel=8)
logger.debug(f"Added tag '{tag.get('title')}' ({uuid}) to backup")
# Add any data in the watch data directory.
for uuid, w in watches.items():
for f in Path(w.data_dir).glob('*'):
@@ -97,10 +88,7 @@ def create_backup(datastore_path, watches: dict, tags: dict = None):
def construct_blueprint(datastore: ChangeDetectionStore):
from .restore import construct_restore_blueprint
backups_blueprint = Blueprint('backups', __name__, template_folder="templates")
backups_blueprint.register_blueprint(construct_restore_blueprint(datastore))
backup_threads = []
@login_optionally_required
@@ -108,17 +96,16 @@ def construct_blueprint(datastore: ChangeDetectionStore):
def request_backup():
if any(thread.is_alive() for thread in backup_threads):
flash(gettext("A backup is already running, check back in a few minutes"), "error")
return redirect(url_for('backups.create'))
return redirect(url_for('backups.index'))
if len(find_backups()) > int(os.getenv("MAX_NUMBER_BACKUPS", 100)):
flash(gettext("Maximum number of backups reached, please remove some"), "error")
return redirect(url_for('backups.create'))
return redirect(url_for('backups.index'))
# With immediate persistence, all data is already saved
zip_thread = threading.Thread(
target=create_backup,
args=(datastore.datastore_path, datastore.data.get("watching")),
kwargs={'tags': datastore.data['settings']['application'].get('tags', {})},
daemon=True,
name="BackupCreator"
)
@@ -126,7 +113,7 @@ def construct_blueprint(datastore: ChangeDetectionStore):
backup_threads.append(zip_thread)
flash(gettext("Backup building in background, check back in a few minutes."))
return redirect(url_for('backups.create'))
return redirect(url_for('backups.index'))
def find_backups():
backup_filepath = os.path.join(datastore.datastore_path, BACKUP_FILENAME_FORMAT.format("*"))
@@ -168,14 +155,14 @@ def construct_blueprint(datastore: ChangeDetectionStore):
return send_from_directory(os.path.abspath(datastore.datastore_path), filename, as_attachment=True)
@login_optionally_required
@backups_blueprint.route("/", methods=['GET'])
@backups_blueprint.route("/create", methods=['GET'])
def create():
@backups_blueprint.route("", methods=['GET'])
def index():
backups = find_backups()
output = render_template("backup_create.html",
output = render_template("overview.html",
available_backups=backups,
backup_running=any(thread.is_alive() for thread in backup_threads)
)
return output
@login_optionally_required
@@ -189,6 +176,6 @@ def construct_blueprint(datastore: ChangeDetectionStore):
flash(gettext("Backups were deleted."))
return redirect(url_for('backups.create'))
return redirect(url_for('backups.index'))
return backups_blueprint
@@ -1,208 +0,0 @@
import io
import json
import os
import shutil
import tempfile
import threading
import zipfile
from flask import Blueprint, render_template, flash, url_for, redirect, request
from flask_babel import gettext, lazy_gettext as _l
from wtforms import Form, BooleanField, SubmitField
from flask_wtf.file import FileField, FileAllowed
from loguru import logger
from changedetectionio.flask_app import login_optionally_required
class RestoreForm(Form):
zip_file = FileField(_l('Backup zip file'), validators=[
FileAllowed(['zip'], _l('Must be a .zip backup file!'))
])
include_groups = BooleanField(_l('Include groups'), default=True)
include_groups_replace_existing = BooleanField(_l('Replace existing groups of the same UUID'), default=True)
include_watches = BooleanField(_l('Include watches'), default=True)
include_watches_replace_existing = BooleanField(_l('Replace existing watches of the same UUID'), default=True)
submit = SubmitField(_l('Restore backup'))
def import_from_zip(zip_stream, datastore, include_groups, include_groups_replace, include_watches, include_watches_replace):
"""
Extract and import watches and groups from a backup zip stream.
Mirrors the store's _load_watches / _load_tags loading pattern:
- UUID dirs with tag.json Tag.model + tag_obj.commit()
- UUID dirs with watch.json rehydrate_entity + watch_obj.commit()
Returns a dict with counts: restored_groups, skipped_groups, restored_watches, skipped_watches.
Raises zipfile.BadZipFile if the stream is not a valid zip.
"""
from changedetectionio.model import Tag
restored_groups = 0
skipped_groups = 0
restored_watches = 0
skipped_watches = 0
current_tags = datastore.data['settings']['application'].get('tags', {})
current_watches = datastore.data['watching']
with tempfile.TemporaryDirectory() as tmpdir:
logger.debug(f"Restore: extracting zip to {tmpdir}")
with zipfile.ZipFile(zip_stream, 'r') as zf:
zf.extractall(tmpdir)
logger.debug("Restore: zip extracted, scanning UUID directories")
for entry in os.scandir(tmpdir):
if not entry.is_dir():
continue
uuid = entry.name
tag_json_path = os.path.join(entry.path, 'tag.json')
watch_json_path = os.path.join(entry.path, 'watch.json')
# --- Tags (groups) ---
if include_groups and os.path.exists(tag_json_path):
if uuid in current_tags and not include_groups_replace:
logger.debug(f"Restore: skipping existing group {uuid} (replace not requested)")
skipped_groups += 1
continue
try:
with open(tag_json_path, 'r', encoding='utf-8') as f:
tag_data = json.load(f)
except (json.JSONDecodeError, IOError) as e:
logger.error(f"Restore: failed to read tag.json for {uuid}: {e}")
continue
title = tag_data.get('title', uuid)
logger.debug(f"Restore: importing group '{title}' ({uuid})")
# Mirror _load_tags: set uuid and force processor
tag_data['uuid'] = uuid
tag_data['processor'] = 'restock_diff'
# Copy the UUID directory so data_dir exists for commit()
dst_dir = os.path.join(datastore.datastore_path, uuid)
if os.path.exists(dst_dir):
shutil.rmtree(dst_dir)
shutil.copytree(entry.path, dst_dir)
tag_obj = Tag.model(
datastore_path=datastore.datastore_path,
__datastore=datastore.data,
default=tag_data
)
current_tags[uuid] = tag_obj
tag_obj.commit()
restored_groups += 1
logger.success(f"Restore: group '{title}' ({uuid}) restored")
# --- Watches ---
elif include_watches and os.path.exists(watch_json_path):
if uuid in current_watches and not include_watches_replace:
logger.debug(f"Restore: skipping existing watch {uuid} (replace not requested)")
skipped_watches += 1
continue
try:
with open(watch_json_path, 'r', encoding='utf-8') as f:
watch_data = json.load(f)
except (json.JSONDecodeError, IOError) as e:
logger.error(f"Restore: failed to read watch.json for {uuid}: {e}")
continue
url = watch_data.get('url', uuid)
logger.debug(f"Restore: importing watch '{url}' ({uuid})")
# Copy UUID directory first so data_dir and history files exist
dst_dir = os.path.join(datastore.datastore_path, uuid)
if os.path.exists(dst_dir):
shutil.rmtree(dst_dir)
shutil.copytree(entry.path, dst_dir)
# Mirror _load_watches / rehydrate_entity
watch_data['uuid'] = uuid
watch_obj = datastore.rehydrate_entity(uuid, watch_data)
current_watches[uuid] = watch_obj
watch_obj.commit()
restored_watches += 1
logger.success(f"Restore: watch '{url}' ({uuid}) restored")
logger.debug(f"Restore: scan complete - groups {restored_groups} restored / {skipped_groups} skipped, "
f"watches {restored_watches} restored / {skipped_watches} skipped")
# Persist changedetection.json (includes the updated tags dict)
logger.debug("Restore: committing datastore settings")
datastore.commit()
return {
'restored_groups': restored_groups,
'skipped_groups': skipped_groups,
'restored_watches': restored_watches,
'skipped_watches': skipped_watches,
}
def construct_restore_blueprint(datastore):
restore_blueprint = Blueprint('restore', __name__, template_folder="templates")
restore_threads = []
@login_optionally_required
@restore_blueprint.route("/restore", methods=['GET'])
def restore():
form = RestoreForm()
return render_template("backup_restore.html",
form=form,
restore_running=any(t.is_alive() for t in restore_threads))
@login_optionally_required
@restore_blueprint.route("/restore/start", methods=['POST'])
def backups_restore_start():
if any(t.is_alive() for t in restore_threads):
flash(gettext("A restore is already running, check back in a few minutes"), "error")
return redirect(url_for('backups.restore.restore'))
zip_file = request.files.get('zip_file')
if not zip_file or not zip_file.filename:
flash(gettext("No file uploaded"), "error")
return redirect(url_for('backups.restore.restore'))
if not zip_file.filename.lower().endswith('.zip'):
flash(gettext("File must be a .zip backup file"), "error")
return redirect(url_for('backups.restore.restore'))
# Read into memory now — the request stream is gone once we return
try:
zip_bytes = io.BytesIO(zip_file.read())
zipfile.ZipFile(zip_bytes) # quick validity check before spawning
zip_bytes.seek(0)
except zipfile.BadZipFile:
flash(gettext("Invalid or corrupted zip file"), "error")
return redirect(url_for('backups.restore.restore'))
include_groups = request.form.get('include_groups') == 'y'
include_groups_replace = request.form.get('include_groups_replace_existing') == 'y'
include_watches = request.form.get('include_watches') == 'y'
include_watches_replace = request.form.get('include_watches_replace_existing') == 'y'
restore_thread = threading.Thread(
target=import_from_zip,
kwargs={
'zip_stream': zip_bytes,
'datastore': datastore,
'include_groups': include_groups,
'include_groups_replace': include_groups_replace,
'include_watches': include_watches,
'include_watches_replace': include_watches_replace,
},
daemon=True,
name="BackupRestore"
)
restore_thread.start()
restore_threads.append(restore_thread)
flash(gettext("Restore started in background, check back in a few minutes."))
return redirect(url_for('backups.restore.restore'))
return restore_blueprint
@@ -1,49 +0,0 @@
{% extends 'base.html' %}
{% block content %}
{% from '_helpers.html' import render_simple_field, render_field %}
<div class="edit-form">
<div class="tabs collapsable">
<ul>
<li class="tab active" id=""><a href="{{ url_for('backups.create') }}">{{ _('Create') }}</a></li>
<li class="tab"><a href="{{ url_for('backups.restore.restore') }}">{{ _('Restore') }}</a></li>
</ul>
</div>
<div class="box-wrap inner">
<div id="general">
{% if backup_running %}
<p>
<span class="spinner"></span>&nbsp;<strong>{{ _('A backup is running!') }}</strong>
</p>
{% endif %}
<p>
{{ _('Here you can download and request a new backup, when a backup is completed you will see it listed below.') }}
</p>
<br>
{% if available_backups %}
<ul>
{% for backup in available_backups %}
<li>
<a href="{{ url_for('backups.download_backup', filename=backup["filename"]) }}">{{ backup["filename"] }}</a> {{ backup["filesize"] }} {{ _('Mb') }}
</li>
{% endfor %}
</ul>
{% else %}
<p>
<strong>{{ _('No backups found.') }}</strong>
</p>
{% endif %}
<a class="pure-button pure-button-primary"
href="{{ url_for('backups.request_backup') }}">{{ _('Create backup') }}</a>
{% if available_backups %}
<a class="pure-button button-small button-error "
href="{{ url_for('backups.remove_backups') }}">{{ _('Remove backups') }}</a>
{% endif %}
</div>
</div>
</div>
{% endblock %}
@@ -1,58 +0,0 @@
{% extends 'base.html' %}
{% block content %}
{% from '_helpers.html' import render_field, render_checkbox_field %}
<div class="edit-form">
<div class="tabs collapsable">
<ul>
<li class="tab"><a href="{{ url_for('backups.create') }}">{{ _('Create') }}</a></li>
<li class="tab active"><a href="{{ url_for('backups.restore.restore') }}">{{ _('Restore') }}</a></li>
</ul>
</div>
<div class="box-wrap inner">
<div id="general">
{% if restore_running %}
<p>
<span class="spinner"></span>&nbsp;<strong>{{ _('A restore is running!') }}</strong>
</p>
{% endif %}
<p>{{ _('Restore a backup. Must be a .zip backup file created on/after v0.53.1 (new database layout).') }}</p>
<p>{{ _('Note: This does not override the main application settings, only watches and groups.') }}</p>
<form class="pure-form pure-form-stacked settings"
action="{{ url_for('backups.restore.backups_restore_start') }}"
method="POST"
enctype="multipart/form-data">
<input type="hidden" name="csrf_token" value="{{ csrf_token() }}">
<div class="pure-control-group">
{{ render_checkbox_field(form.include_groups) }}
<span class="pure-form-message-inline">{{ _('Include all groups found in backup?') }}</span>
</div>
<div class="pure-control-group">
{{ render_checkbox_field(form.include_groups_replace_existing) }}
<span class="pure-form-message-inline">{{ _('Replace any existing groups of the same UUID?') }}</span>
</div>
<div class="pure-control-group">
{{ render_checkbox_field(form.include_watches) }}
<span class="pure-form-message-inline">{{ _('Include all watches found in backup?') }}</span>
</div>
<div class="pure-control-group">
{{ render_checkbox_field(form.include_watches_replace_existing) }}
<span class="pure-form-message-inline">{{ _('Replace any existing watches of the same UUID?') }}</span>
</div>
<div class="pure-control-group">
{{ render_field(form.zip_file) }}
</div>
<div class="pure-controls">
<button type="submit" class="pure-button pure-button-primary">{{ _('Restore backup') }}</button>
</div>
</form>
</div>
</div>
</div>
{% endblock %}
@@ -0,0 +1,36 @@
{% extends 'base.html' %}
{% block content %}
{% from '_helpers.html' import render_simple_field, render_field %}
<div class="edit-form">
<div class="box-wrap inner">
<h2>{{ _('Backups') }}</h2>
{% if backup_running %}
<p>
<span class="spinner"></span>&nbsp;<strong>{{ _('A backup is running!') }}</strong>
</p>
{% endif %}
<p>
{{ _('Here you can download and request a new backup, when a backup is completed you will see it listed below.') }}
</p>
<br>
{% if available_backups %}
<ul>
{% for backup in available_backups %}
<li><a href="{{ url_for('backups.download_backup', filename=backup["filename"]) }}">{{ backup["filename"] }}</a> {{ backup["filesize"] }} {{ _('Mb') }}</li>
{% endfor %}
</ul>
{% else %}
<p>
<strong>{{ _('No backups found.') }}</strong>
</p>
{% endif %}
<a class="pure-button pure-button-primary" href="{{ url_for('backups.request_backup') }}">{{ _('Create backup') }}</a>
{% if available_backups %}
<a class="pure-button button-small button-error " href="{{ url_for('backups.remove_backups') }}">{{ _('Remove backups') }}</a>
{% endif %}
</div>
</div>
{% endblock %}
@@ -174,7 +174,7 @@ def construct_blueprint(datastore: ChangeDetectionStore):
browser_steps_blueprint = Blueprint('browser_steps', __name__, template_folder="templates")
async def start_browsersteps_session(watch_uuid):
from changedetectionio.browser_steps import browser_steps
from . import browser_steps
import time
from playwright.async_api import async_playwright
@@ -238,6 +238,7 @@ def construct_blueprint(datastore: ChangeDetectionStore):
@browser_steps_blueprint.route("/browsersteps_start_session", methods=['GET'])
def browsersteps_start_session():
# A new session was requested, return sessionID
import asyncio
import uuid
browsersteps_session_id = str(uuid.uuid4())
watch_uuid = request.args.get('uuid')
@@ -300,10 +301,11 @@ def construct_blueprint(datastore: ChangeDetectionStore):
@browser_steps_blueprint.route("/browsersteps_update", methods=['POST'])
def browsersteps_ui_update():
import base64
import playwright._impl._errors
from changedetectionio.blueprint.browser_steps import browser_steps
remaining = 0
remaining =0
uuid = request.args.get('uuid')
goto_website_url_first_step = request.args.get('goto_website_url_first_step')
browsersteps_session_id = request.args.get('browsersteps_session_id')
@@ -314,33 +316,33 @@ def construct_blueprint(datastore: ChangeDetectionStore):
return make_response('No session exists under that ID', 500)
is_last_step = False
# @todo - should always be an existing session
if goto_website_url_first_step:
logger.debug("Going to site (requested automatically before stepping)..")
step_operation = "Goto site"
step_selector = None
step_optional_value = None
else:
# Actions - step/apply/etc, do the thing and return state
if request.method == 'POST':
# @todo - should always be an existing session
step_operation = request.form.get('operation')
step_selector = request.form.get('selector')
step_optional_value = request.form.get('optional_value')
is_last_step = strtobool(request.form.get('is_last_step'))
try:
# Run the async call_action method in the dedicated browser steps event loop
run_async_in_browser_loop(
browsersteps_sessions[browsersteps_session_id]['browserstepper'].call_action(
action_name=step_operation,
selector=step_selector,
optional_value=step_optional_value
try:
# Run the async call_action method in the dedicated browser steps event loop
run_async_in_browser_loop(
browsersteps_sessions[browsersteps_session_id]['browserstepper'].call_action(
action_name=step_operation,
selector=step_selector,
optional_value=step_optional_value
)
)
)
except Exception as e:
logger.error(f"Exception when calling step operation {step_operation} {str(e)}")
# Try to find something of value to give back to the user
return make_response(str(e).splitlines()[0], 401)
except Exception as e:
logger.error(f"Exception when calling step operation {step_operation} {str(e)}")
# Try to find something of value to give back to the user
return make_response(str(e).splitlines()[0], 401)
# if not this_session.page:
# cleanup_playwright_session()
# return make_response('Browser session ran out of time :( Please reload this page.', 401)
# Screenshots and other info only needed on requesting a step (POST)
try:
@@ -348,7 +350,7 @@ def construct_blueprint(datastore: ChangeDetectionStore):
(screenshot, xpath_data) = run_async_in_browser_loop(
browsersteps_sessions[browsersteps_session_id]['browserstepper'].get_current_state()
)
if is_last_step:
watch = datastore.data['watching'].get(uuid)
u = browsersteps_sessions[browsersteps_session_id]['browserstepper'].page.url
@@ -8,17 +8,6 @@ from changedetectionio.content_fetchers import SCREENSHOT_MAX_HEIGHT_DEFAULT
from changedetectionio.content_fetchers.base import manage_user_agent
from changedetectionio.jinja2_custom import render as jinja_render
def browser_steps_get_valid_steps(browser_steps: list):
if browser_steps is not None and len(browser_steps):
valid_steps = list(filter(
lambda s: (s['operation'] and len(s['operation']) and s['operation'] != 'Choose one'),browser_steps))
# Just incase they selected Goto site by accident with older JS
if valid_steps and valid_steps[0]['operation'] == 'Goto site':
del(valid_steps[0])
return valid_steps
return []
# Two flags, tell the JS which of the "Selector" or "Value" field should be enabled in the front end
@@ -16,11 +16,6 @@
<form class="pure-form" action="{{url_for('imports.import_page')}}" method="POST" enctype="multipart/form-data">
<input type="hidden" name="csrf_token" value="{{ csrf_token() }}">
<div class="tab-pane-inner" id="url-list">
<p>
{{ _('Restoring changedetection.io backups is in the') }}<a href="{{ url_for('backups.restore.restore') }}"> {{ _('backups section') }}</a>.
<br>
</p>
<div class="pure-control-group">
{{ _('Enter one URL per line, and optionally add tags for each URL after a space, delineated by comma (,):') }}
<br>
@@ -42,6 +37,9 @@
</div>
<div class="tab-pane-inner" id="distill-io">
<div class="pure-control-group">
{{ _('Copy and Paste your Distill.io watch \'export\' file, this should be a JSON file.') }}<br>
{{ _('This is') }} <i>{{ _('experimental') }}</i>, {{ _('supported fields are') }} <code>name</code>, <code>uri</code>, <code>tags</code>, <code>config:selections</code>, {{ _('the rest (including') }} <code>schedule</code>) {{ _('are ignored.') }}
@@ -51,6 +49,8 @@
{{ _('Be sure to set your default fetcher to Chrome if required.') }}<br>
</p>
</div>
<textarea name="distill-io" class="pure-input-1-2" style="width: 100%;
font-family:monospace;
white-space: pre;
@@ -114,7 +114,6 @@
</div>
</div>
<button type="submit" class="pure-button pure-input-1-2 pure-button-primary">{{ _('Import') }}</button>
</form>
</div>
@@ -83,10 +83,6 @@ def construct_blueprint(datastore: ChangeDetectionStore):
datastore.data['settings']['requests'].update(form.data['requests'])
datastore.commit()
# Clear all checksums to force reprocessing with new settings
# Global settings can affect watch behavior (filters, rendering, etc.)
datastore.clear_all_last_checksums()
# Adjust worker count if it changed
if new_worker_count != old_worker_count:
from changedetectionio import worker_pool
@@ -25,7 +25,7 @@
<li class="tab"><a href="#ui-options">{{ _('UI Options') }}</a></li>
<li class="tab"><a href="#api">{{ _('API') }}</a></li>
<li class="tab"><a href="#rss">{{ _('RSS') }}</a></li>
<li class="tab"><a href="{{ url_for('backups.create') }}">{{ _('Backups') }}</a></li>
<li class="tab"><a href="{{ url_for('backups.index') }}">{{ _('Backups') }}</a></li>
<li class="tab"><a href="#timedate">{{ _('Time & Date') }}</a></li>
<li class="tab"><a href="#proxies">{{ _('CAPTCHA & Proxies') }}</a></li>
{% if plugin_tabs %}
+25 -9
View File
@@ -70,6 +70,17 @@ def construct_blueprint(datastore: ChangeDetectionStore):
if datastore.data['settings']['application']['tags'].get(uuid):
del datastore.data['settings']['application']['tags'][uuid]
# Delete tag.json file if it exists
import os
tag_dir = os.path.join(datastore.datastore_path, uuid)
tag_json = os.path.join(tag_dir, "tag.json")
if os.path.exists(tag_json):
try:
os.unlink(tag_json)
logger.info(f"Deleted tag.json for tag {uuid}")
except Exception as e:
logger.error(f"Failed to delete tag.json for tag {uuid}: {e}")
# Remove tag from all watches in background thread to avoid blocking
def remove_tag_background(tag_uuid):
"""Background thread to remove tag from watches - discarded after completion."""
@@ -116,11 +127,19 @@ def construct_blueprint(datastore: ChangeDetectionStore):
@tags_blueprint.route("/delete_all", methods=['GET'])
@login_optionally_required
def delete_all():
# Delete all tag.json files
import os
for tag_uuid in list(datastore.data['settings']['application']['tags'].keys()):
# TagsDict 'del' handler will remove the dir
del datastore.data['settings']['application']['tags'][tag_uuid]
tag_dir = os.path.join(datastore.datastore_path, tag_uuid)
tag_json = os.path.join(tag_dir, "tag.json")
if os.path.exists(tag_json):
try:
os.unlink(tag_json)
except Exception as e:
logger.error(f"Failed to delete tag.json for tag {tag_uuid}: {e}")
# Clear all tags from settings immediately
datastore.data['settings']['application']['tags'] = {}
# Clear tags from all watches in background thread to avoid blocking
def clear_all_tags_background():
@@ -225,15 +244,12 @@ def construct_blueprint(datastore: ChangeDetectionStore):
tag.update(form.data)
tag['processor'] = 'restock_diff'
tag.commit()
# Clear checksums for all watches using this tag to force reprocessing
# Tag changes affect inherited configuration
cleared_count = datastore.clear_checksums_for_tag(uuid)
logger.info(f"Tag {uuid} updated, cleared {cleared_count} watch checksums")
flash(gettext("Updated"))
return redirect(url_for('tags.tags_overview_page'))
@tags_blueprint.route("/delete/<string:uuid>", methods=['GET'])
def form_tag_delete(uuid):
return redirect(url_for('tags.tags_overview_page'))
return tags_blueprint
+9 -15
View File
@@ -194,9 +194,9 @@ def construct_blueprint(datastore: ChangeDetectionStore, update_q, worker_pool,
tag_limit = request.args.get('tag')
now = int(time.time())
# Mark watches as viewed - use background thread only for large watch counts
def mark_viewed_impl():
"""Mark watches as viewed - can run synchronously or in background thread."""
# Mark watches as viewed in background thread to avoid blocking
def mark_viewed_background():
"""Background thread to mark watches as viewed - discarded after completion."""
marked_count = 0
try:
for watch_uuid, watch in datastore.data['watching'].items():
@@ -209,21 +209,15 @@ def construct_blueprint(datastore: ChangeDetectionStore, update_q, worker_pool,
datastore.set_last_viewed(watch_uuid, now)
marked_count += 1
logger.info(f"Marking complete: {marked_count} watches marked as viewed")
logger.info(f"Background marking complete: {marked_count} watches marked as viewed")
except Exception as e:
logger.error(f"Error marking as viewed: {e}")
logger.error(f"Error in background mark as viewed: {e}")
# For small watch counts (< 10), run synchronously to avoid race conditions in tests
# For larger counts, use background thread to avoid blocking the UI
watch_count = len(datastore.data['watching'])
if watch_count < 10:
# Run synchronously for small watch counts
mark_viewed_impl()
else:
# Start background thread for large watch counts
thread = threading.Thread(target=mark_viewed_impl, daemon=True)
thread.start()
# Start background thread and return immediately
thread = threading.Thread(target=mark_viewed_background, daemon=True)
thread.start()
flash(gettext("Marking watches as viewed in background..."))
return redirect(url_for('watchlist.index', tag=tag_limit))
@ui_blueprint.route("/delete", methods=['GET'])
+1 -51
View File
@@ -26,7 +26,7 @@ def construct_blueprint(datastore: ChangeDetectionStore, update_q, queuedWatchMe
# https://wtforms.readthedocs.io/en/3.0.x/forms/#wtforms.form.Form.populate_obj ?
def edit_page(uuid):
from changedetectionio import forms
from changedetectionio.browser_steps.browser_steps import browser_step_ui_config
from changedetectionio.blueprint.browser_steps.browser_steps import browser_step_ui_config
from changedetectionio import processors
import importlib
@@ -354,56 +354,6 @@ def construct_blueprint(datastore: ChangeDetectionStore, update_q, queuedWatchMe
# Return a 500 error
abort(500)
@edit_blueprint.route("/edit/<string:uuid>/get-data-package", methods=['GET'])
@login_optionally_required
def watch_get_data_package(uuid):
"""Download all data for a single watch as a zip file"""
from io import BytesIO
from flask import send_file
import zipfile
from pathlib import Path
import datetime
watch = datastore.data['watching'].get(uuid)
if not watch:
abort(404)
# Create zip in memory
memory_file = BytesIO()
with zipfile.ZipFile(memory_file, 'w',
compression=zipfile.ZIP_DEFLATED,
compresslevel=8) as zipObj:
# Add the watch's JSON file if it exists
watch_json_path = os.path.join(watch.data_dir, 'watch.json')
if os.path.isfile(watch_json_path):
zipObj.write(watch_json_path,
arcname=os.path.join(uuid, 'watch.json'),
compress_type=zipfile.ZIP_DEFLATED,
compresslevel=8)
# Add all files in the watch data directory
if os.path.isdir(watch.data_dir):
for f in Path(watch.data_dir).glob('*'):
if f.is_file() and f.name != 'watch.json': # Skip watch.json since we already added it
zipObj.write(f,
arcname=os.path.join(uuid, f.name),
compress_type=zipfile.ZIP_DEFLATED,
compresslevel=8)
# Seek to beginning of file
memory_file.seek(0)
# Generate filename with timestamp
timestamp = datetime.datetime.now().strftime("%Y%m%d%H%M%S")
filename = f"watch-data-{uuid[:8]}-{timestamp}.zip"
return send_file(memory_file,
as_attachment=True,
download_name=filename,
mimetype='application/zip')
# Ajax callback
@edit_blueprint.route("/edit/<string:uuid>/preview-rendered", methods=['POST'])
@login_optionally_required
@@ -488,7 +488,6 @@ Math: {{ 1 + 1 }}") }}
{% if watch.history_n %}
<p>
<a href="{{url_for('ui.ui_edit.watch_get_latest_html', uuid=uuid)}}" class="pure-button button-small">{{ _('Download latest HTML snapshot') }}</a>
<a href="{{url_for('ui.ui_edit.watch_get_data_package', uuid=uuid)}}" class="pure-button button-small">{{ _('Download watch data package') }}</a>
</p>
{% endif %}
@@ -304,13 +304,12 @@ html[data-darkmode="true"] .watch-tag-list.tag-{{ class_name }} {
</span>
{%- endif -%}
{%- if watch.get('restock') and watch['restock'].get('price') -%}
{%- if watch['restock']['price'] is number -%}
{%- if watch.get('restock') and watch['restock']['price'] != None -%}
{%- if watch['restock']['price'] != None -%}
<span class="restock-label price" title="{{ _('Price') }}">
{{ watch['restock']['price']|format_number_locale if watch['restock'].get('price') else '' }} {{ watch['restock'].get('currency','') }}
</span>
{%- else -%} <!-- watch['restock']['price']' is not a number, cant output it -->
{%- endif -%}
{%- endif -%}
{%- elif not watch.has_restock_info -%}
<span class="restock-label error">{{ _('No information') }}</span>
{%- endif -%}
+18 -3
View File
@@ -38,6 +38,7 @@ def manage_user_agent(headers, current_ua=''):
return None
class Fetcher():
browser_connection_is_custom = None
browser_connection_url = None
@@ -162,16 +163,30 @@ class Fetcher():
"""
return {k.lower(): v for k, v in self.headers.items()}
def browser_steps_get_valid_steps(self):
if self.browser_steps is not None and len(self.browser_steps):
valid_steps = list(filter(
lambda s: (s['operation'] and len(s['operation']) and s['operation'] != 'Choose one'),
self.browser_steps))
# Just incase they selected Goto site by accident with older JS
if valid_steps and valid_steps[0]['operation'] == 'Goto site':
del(valid_steps[0])
return valid_steps
return None
async def iterate_browser_steps(self, start_url=None):
from changedetectionio.browser_steps.browser_steps import steppable_browser_interface, browser_steps_get_valid_steps
from changedetectionio.blueprint.browser_steps.browser_steps import steppable_browser_interface
from playwright._impl._errors import TimeoutError, Error
from changedetectionio.jinja2_custom import render as jinja_render
step_n = 0
if self.browser_steps:
if self.browser_steps is not None and len(self.browser_steps):
interface = steppable_browser_interface(start_url=start_url)
interface.page = self.page
valid_steps = browser_steps_get_valid_steps(self.browser_steps)
valid_steps = self.browser_steps_get_valid_steps()
for step in valid_steps:
step_n += 1
@@ -295,7 +295,7 @@ class fetcher(Fetcher):
self.page.on("console", lambda msg: logger.debug(f"Playwright console: Watch URL: {url} {msg.type}: {msg.text} {msg.args}"))
# Re-use as much code from browser steps as possible so its the same
from changedetectionio.browser_steps.browser_steps import steppable_browser_interface
from changedetectionio.blueprint.browser_steps.browser_steps import steppable_browser_interface
browsersteps_interface = steppable_browser_interface(start_url=url)
browsersteps_interface.page = self.page
@@ -362,7 +362,7 @@ class fetcher(Fetcher):
# Wrap remaining operations in try/finally to ensure cleanup
try:
# Run Browser Steps here
if self.browser_steps:
if self.browser_steps_get_valid_steps():
try:
await self.iterate_browser_steps(start_url=url)
except BrowserStepsStepException:
@@ -86,8 +86,8 @@ async def capture_full_page(page, screenshot_format='JPEG', watch_uuid=None, loc
# better than scrollTo incase they override it in the page
await page.evaluate(
"""(y) => {
const el = document.scrollingElement;
if (el) el.scrollTop = y;
document.documentElement.scrollTop = y;
document.body.scrollTop = y;
}""",
y
)
@@ -305,8 +305,6 @@ class fetcher(Fetcher):
await asyncio.wait_for(self.browser.close(), timeout=3.0)
except Exception as cleanup_error:
logger.error(f"[{watch_uuid}] Failed to cleanup browser after page creation failure: {cleanup_error}")
finally:
self.browser = None
raise
# Add console handler to capture console.log from favicon fetcher
@@ -458,7 +456,7 @@ class fetcher(Fetcher):
# Run Browser Steps here
# @todo not yet supported, we switch to playwright in this case
# if self.browser_steps:
# if self.browser_steps_get_valid_steps():
# self.iterate_browser_steps()
@@ -534,14 +532,6 @@ class fetcher(Fetcher):
)
except asyncio.TimeoutError:
raise (BrowserFetchTimedOut(msg=f"Browser connected but was unable to process the page in {max_time} seconds."))
finally:
# Internal cleanup on any exception/timeout - call quit() immediately
# This prevents connection leaks during exception bursts
# Worker.py's quit() call becomes a redundant safety net (idempotent)
try:
await self.quit(watch={'uuid': watch_uuid} if watch_uuid else None)
except Exception as cleanup_error:
logger.error(f"[{watch_uuid}] Error during internal quit() cleanup: {cleanup_error}")
# Plugin registration for built-in fetcher
@@ -3,7 +3,7 @@ import hashlib
import os
import re
import asyncio
from functools import partial
from changedetectionio import strtobool
from changedetectionio.content_fetchers.exceptions import BrowserStepsInUnsupportedFetcher, EmptyReply, Non200ErrorCodeReceived
from changedetectionio.content_fetchers.base import Fetcher
@@ -36,7 +36,7 @@ class fetcher(Fetcher):
import requests
from requests.exceptions import ProxyError, ConnectionError, RequestException
if self.browser_steps:
if self.browser_steps_get_valid_steps():
raise BrowserStepsInUnsupportedFetcher(url=url)
proxies = {}
@@ -184,6 +184,7 @@ class fetcher(Fetcher):
)
async def quit(self, watch=None):
# In case they switched to `requests` fetcher from something else
# Then the screenshot could be old, in any case, it's not used here.
# REMOVE_REQUESTS_OLD_SCREENSHOTS - Mainly used for testing
+11 -20
View File
@@ -27,6 +27,7 @@ from flask import (
session,
url_for,
)
from flask_compress import Compress as FlaskCompress
from flask_restful import abort, Api
from flask_cors import CORS
@@ -69,18 +70,14 @@ socketio_server = None
# Enable CORS, especially useful for the Chrome extension to operate from anywhere
CORS(app)
# Flask-Compress handles HTTP compression, Socket.IO compression disabled to prevent memory leak.
# There's also a bug between flask compress and socketio that causes some kind of slow memory leak
# It's better to use compression on your reverse proxy (nginx etc) instead.
if strtobool(os.getenv("FLASK_ENABLE_COMPRESSION")):
from flask_compress import Compress as FlaskCompress
app.config['COMPRESS_MIN_SIZE'] = 2096
app.config['COMPRESS_MIMETYPES'] = ['text/html', 'text/css', 'text/javascript', 'application/json', 'application/javascript', 'image/svg+xml']
# Use gzip only - smaller memory footprint than zstd/brotli (4-8KB vs 200-500KB contexts)
app.config['COMPRESS_ALGORITHM'] = ['gzip']
compress = FlaskCompress()
compress.init_app(app)
# Super handy for compressing large BrowserSteps responses and others
# Flask-Compress handles HTTP compression, Socket.IO compression disabled to prevent memory leak
compress = FlaskCompress()
app.config['COMPRESS_MIN_SIZE'] = 2096
app.config['COMPRESS_MIMETYPES'] = ['text/html', 'text/css', 'text/javascript', 'application/json', 'application/javascript', 'image/svg+xml']
# Use gzip only - smaller memory footprint than zstd/brotli (4-8KB vs 200-500KB contexts)
app.config['COMPRESS_ALGORITHM'] = ['gzip']
compress.init_app(app)
app.config['TEMPLATES_AUTO_RELOAD'] = False
@@ -711,14 +708,8 @@ def changedetection_app(config=None, datastore_o=None):
def static_content(group, filename):
from flask import make_response
import re
# Strict sanitization: only allow a-z, 0-9, and underscore (blocks .. and other traversal)
group = re.sub(r'[^a-z0-9_-]+', '', group.lower())
filename = filename
# Additional safety: reject if sanitization resulted in empty strings
if not group or not filename:
abort(404)
group = re.sub(r'[^\w.-]+', '', group.lower())
filename = re.sub(r'[^\w.-]+', '', filename.lower())
if group == 'screenshot':
# Could be sensitive, follow password requirements
+6 -2
View File
@@ -7,6 +7,8 @@ from flask_babel import lazy_gettext as _l, gettext
from changedetectionio.blueprint.rss import RSS_FORMAT_TYPES, RSS_TEMPLATE_TYPE_OPTIONS, RSS_TEMPLATE_HTML_DEFAULT
from changedetectionio.conditions.form import ConditionFormRow
from changedetectionio.notification_service import NotificationContextData
from changedetectionio.processors.image_ssim_diff import SCREENSHOT_COMPARISON_THRESHOLD_OPTIONS, \
SCREENSHOT_COMPARISON_THRESHOLD_OPTIONS_DEFAULT
from changedetectionio.strtobool import strtobool
from changedetectionio import processors
@@ -35,7 +37,7 @@ from changedetectionio.widgets import TernaryNoneBooleanField
# default
# each select <option data-enabled="enabled-0-0"
from changedetectionio.browser_steps.browser_steps import browser_step_ui_config
from changedetectionio.blueprint.browser_steps.browser_steps import browser_step_ui_config
from changedetectionio import html_tools, content_fetchers
@@ -492,6 +494,7 @@ class ValidateJinja2Template(object):
Validates that a {token} is from a valid set
"""
def __call__(self, form, field):
from changedetectionio import notification
from changedetectionio.jinja2_custom import create_jinja_env
from jinja2 import BaseLoader, TemplateSyntaxError, UndefinedError
from jinja2.meta import find_undeclared_variables
@@ -817,7 +820,8 @@ class processor_text_json_diff_form(commonSettingsForm):
filter_text_removed = BooleanField(_l('Removed lines'), default=True)
trigger_text = StringListField(_l('Keyword triggers - Trigger/wait for text'), [validators.Optional(), ValidateListRegex()])
browser_steps = FieldList(FormField(SingleBrowserStep), min_entries=10)
if os.getenv("PLAYWRIGHT_DRIVER_URL"):
browser_steps = FieldList(FormField(SingleBrowserStep), min_entries=10)
text_should_not_be_present = StringListField(_l('Block change-detection while text matches'), [validators.Optional(), ValidateListRegex()])
webdriver_js_execute_code = TextAreaField(_l('Execute JavaScript before change detection'), render_kw={"rows": "5"}, validators=[validators.Optional()])
+1 -24
View File
@@ -561,33 +561,10 @@ def html_to_text(html_content: str, render_anchor_tag_content=False, is_rss=Fals
)
else:
parser_config = None
if is_rss:
html_content = re.sub(r'<title([\s>])', r'<h1\1', html_content)
html_content = re.sub(r'</title>', r'</h1>', html_content)
else:
# Use BS4 html.parser to strip bloat — SPA's often dump 10MB+ of CSS/JS into <head>,
# causing inscriptis to silently give up. Regex-based stripping is unsafe because tags
# can appear inside JSON data attributes with JS-escaped closing tags (e.g. <\/script>),
# causing the regex to scan past the intended close and eat real page content.
from bs4 import BeautifulSoup
soup = BeautifulSoup(html_content, 'html.parser')
# Strip tags that inscriptis cannot render as meaningful text and which can be very large.
# svg/math: produce path-data/MathML garbage; canvas/iframe/template: no inscriptis handlers.
# video/audio/picture are kept — they may contain meaningful fallback text or captions.
for tag in soup.find_all(['head', 'script', 'style', 'noscript', 'svg',
'math', 'canvas', 'iframe', 'template']):
tag.decompose()
# SPAs often use <body style="display:none"> to hide content until JS loads.
# inscriptis respects CSS display rules, so strip hiding styles from the body tag.
body_tag = soup.find('body')
if body_tag and body_tag.get('style'):
style = body_tag['style']
if re.search(r'\b(?:display\s*:\s*none|visibility\s*:\s*hidden)\b', style, re.IGNORECASE):
logger.debug(f"html_to_text: Removing hiding styles from body tag (found: '{style}')")
del body_tag['style']
html_content = str(soup)
text_content = get_text(html_content, config=parser_config)
return text_content
+2 -9
View File
@@ -2,7 +2,6 @@ from os import getenv
from copy import deepcopy
from changedetectionio.blueprint.rss import RSS_FORMAT_TYPES, RSS_CONTENT_FORMAT_DEFAULT
from changedetectionio.model.Tags import TagsDict
from changedetectionio.notification import (
default_notification_body,
@@ -69,7 +68,7 @@ class model(dict):
'schema_version' : 0,
'shared_diff_access': False,
'strip_ignored_lines': False,
'tags': None, # Initialized in __init__ with real datastore_path
'tags': {}, #@todo use Tag.model initialisers
'webdriver_delay': None , # Extra delay in seconds before extracting text
'ui': {
'use_page_title_in_list': True,
@@ -81,16 +80,10 @@ class model(dict):
}
}
def __init__(self, *arg, datastore_path=None, **kw):
def __init__(self, *arg, **kw):
super(model, self).__init__(*arg, **kw)
# Capture any tags data passed in before base_config overwrites the structure
existing_tags = self.get('settings', {}).get('application', {}).get('tags') or {}
# CRITICAL: deepcopy to avoid sharing mutable objects between instances
self.update(deepcopy(self.base_config))
# TagsDict requires the real datastore_path at runtime (cannot be set at class-definition time)
if datastore_path is None:
raise ValueError("App.model() requires 'datastore_path' keyword argument")
self['settings']['application']['tags'] = TagsDict(existing_tags, datastore_path=datastore_path)
def parse_headers_from_text_file(filepath):
+2
View File
@@ -20,9 +20,11 @@ See: Watch.py model docstring for full Pydantic architecture explanation
See: processors/restock_diff/processor.py:184-192 for current manual implementation
"""
import os
from changedetectionio.model import watch_base
from changedetectionio.model.persistence import EntityPersistenceMixin
class model(EntityPersistenceMixin, watch_base):
"""
Tag domain model - groups watches and can override their settings.
-39
View File
@@ -1,39 +0,0 @@
import os
import shutil
from pathlib import Path
from loguru import logger
_SENTINEL = object()
class TagsDict(dict):
"""Dict subclass that removes the corresponding tag.json file when a tag is deleted."""
def __init__(self, *args, datastore_path: str | os.PathLike, **kwargs) -> None:
self._datastore_path = Path(datastore_path)
super().__init__(*args, **kwargs)
def __delitem__(self, key: str) -> None:
super().__delitem__(key)
tag_dir = self._datastore_path / key
tag_json_file = tag_dir / "tag.json"
if not os.path.exists(tag_json_file):
logger.critical(f"Aborting deletion of directory '{tag_dir}' because '{tag_json_file}' does not exist.")
return
try:
shutil.rmtree(tag_dir)
logger.info(f"Deleted tag directory for tag {key!r}")
except FileNotFoundError:
pass
except OSError as e:
logger.error(f"Failed to delete tag directory for tag {key!r}: {e}")
def pop(self, key: str, default=_SENTINEL):
"""Remove and return tag, deleting its tag.json file. Raises KeyError if missing and no default given."""
if key in self:
value = self[key]
del self[key]
return value
if default is _SENTINEL:
raise KeyError(key)
return default
+5 -10
View File
@@ -335,6 +335,7 @@ class model(EntityPersistenceMixin, watch_base):
'last_notification_error': False,
'last_viewed': 0,
'previous_md5': False,
'previous_md5_before_filters': False,
'remote_server_reply': None,
'track_ldjson_price_data': None
})
@@ -385,16 +386,10 @@ class model(EntityPersistenceMixin, watch_base):
@property
def is_pdf(self):
url = str(self.get("url") or "").lower()
content_type = str(self.get("content-type") or "").lower()
if content_type in ("none", "null", ""):
content_type = ""
return (
url.endswith(".pdf")
or content_type.split(";")[0].strip() == "application/pdf"
)
# content_type field is set in the future
# https://github.com/dgtlmoon/changedetection.io/issues/1392
# Not sure the best logic here
return self.get('url', '').lower().endswith('.pdf') or 'pdf' in self.get('content_type', '').lower()
@property
def label(self):
+6 -157
View File
@@ -2,12 +2,10 @@ import os
import uuid
from changedetectionio import strtobool
from .persistence import EntityPersistenceMixin, _determine_entity_type
from .persistence import EntityPersistenceMixin
__all__ = ['EntityPersistenceMixin', 'watch_base']
from ..browser_steps.browser_steps import browser_steps_get_valid_steps
USE_SYSTEM_DEFAULT_NOTIFICATION_FORMAT_FOR_WATCH = 'System default'
CONDITIONS_MATCH_LOGIC_DEFAULT = 'ALL'
@@ -28,7 +26,6 @@ class watch_base(dict):
- Configuration override chain resolution (Watch Tag Global)
- Immutability options
- Better testing
- USE https://docs.pydantic.dev/latest/integrations/datamodel_code_generator TO BUILD THE MODEL FROM THE API-SPEC!!!
CHAIN RESOLUTION ARCHITECTURE:
The dream is a 3-level override hierarchy:
@@ -131,6 +128,7 @@ class watch_base(dict):
fetch_time (float): Duration of last fetch in seconds
consecutive_filter_failures (int): Counter for consecutive filter match failures
previous_md5 (str|bool): MD5 hash of previous content
previous_md5_before_filters (str|bool): MD5 hash before filters applied
history_snapshot_max_length (int|None): Max history snapshots to keep (None = use global)
Conditions:
@@ -167,10 +165,6 @@ class watch_base(dict):
if kw.get('datastore_path'):
del kw['datastore_path']
# IMPORTANT: Don't initialize __watch_was_edited yet!
# We'll initialize it AFTER the initial update() call below
# This prevents marking the watch as edited during initialization
self.update({
# Custom notification content
# Re #110, so then if this is set to None, we know to use the default value instead
@@ -179,7 +173,7 @@ class watch_base(dict):
'body': None,
'browser_steps': [],
'browser_steps_last_error_step': None,
'conditions' : [],
'conditions' : {},
'conditions_match_logic': CONDITIONS_MATCH_LOGIC_DEFAULT,
'check_count': 0,
'check_unique_lines': False, # On change-detected, compare against all history if its something new
@@ -216,6 +210,7 @@ class watch_base(dict):
'page_title': None, # <title> from the page
'paused': False,
'previous_md5': False,
'previous_md5_before_filters': False, # Used for skipping changedetection entirely
'processor': 'text_json_diff', # could be restock_diff or others from .processors
'price_change_threshold_percent': None,
'proxy': None, # Preferred proxy connection
@@ -301,157 +296,9 @@ class watch_base(dict):
super(watch_base, self).__init__(*arg, **kw)
# Check if we're being initialized from an existing watch object
# that has was_edited=True, so we can preserve the flag
preserve_edited_flag = False
if self.get('default'):
# When creating a new watch object from an existing one (e.g., changing processor),
# preserve the was_edited flag if it was True
default_watch = self.get('default')
if hasattr(default_watch, 'was_edited') and default_watch.was_edited:
preserve_edited_flag = True
del self['default']
# NOW initialize the edited flag after all initial setup is complete
# This ensures initialization doesn't trigger the edited flag
# But preserve it if the source watch had it set to True
self.__watch_was_edited = preserve_edited_flag
def _mark_field_as_edited(self, key):
"""
Helper to mark a field as edited if it's writable.
Internal method used by __setitem__, update(), pop(), etc.
"""
# Don't track edits during initial load or if already edited
if not hasattr(self, '_watch_base__watch_was_edited'):
return
if self.__watch_was_edited:
return # Already marked as edited
# Import from shared schema utilities (no circular dependency)
from .schema_utils import get_readonly_watch_fields
readonly_fields = get_readonly_watch_fields()
# Additional system-managed fields not in OpenAPI spec (yet)
# These are set by processors/workers and should not trigger edited flag
additional_system_fields = {
'last_check_status', # Set by processors
'restock', # Set by restock processor
'last_viewed', # Set by mark_all_viewed endpoint
}
# Only mark as edited if this is a user-writable field
if key not in readonly_fields and key not in additional_system_fields:
self.__watch_was_edited = True
def __setitem__(self, key, value):
"""
Override dict.__setitem__ to track when writable watch fields are modified.
This enables skipping reprocessing when:
1. HTML content is unchanged (checksumFromPreviousCheckWasTheSame)
2. AND watch configuration was not edited
Only sets the edited flag when field is NOT in readonly_fields (from OpenAPI spec).
"""
# Set the value first (always)
super().__setitem__(key, value)
# Mark as edited if writable field
self._mark_field_as_edited(key)
def __delitem__(self, key):
"""Override dict.__delitem__ to track deletions of writable fields."""
super().__delitem__(key)
self._mark_field_as_edited(key)
def update(self, *args, **kwargs):
if args and args[0].get('browser_steps'):
args[0]['browser_steps'] = browser_steps_get_valid_steps(args[0].get('browser_steps'))
"""Override dict.update() to track modifications to writable fields."""
# Call parent update first
super().update(*args, **kwargs)
# Mark as edited for any writable fields that were updated
# Handle both update(dict) and update(key=value) forms
if args:
for key in args[0].keys():
self._mark_field_as_edited(key)
for key in kwargs.keys():
self._mark_field_as_edited(key)
def pop(self, key, *args):
"""Override dict.pop() to track removal of writable fields."""
result = super().pop(key, *args)
self._mark_field_as_edited(key)
return result
def setdefault(self, key, default=None):
"""Override dict.setdefault() to track modifications to writable fields."""
# Only marks as edited if key didn't exist (i.e., a new value was set)
existed = key in self
result = super().setdefault(key, default)
if not existed:
self._mark_field_as_edited(key)
return result
@property
def was_edited(self):
"""
Check if watch configuration was edited since last processing.
Returns:
bool: True if writable fields were modified, False otherwise
"""
return getattr(self, '_watch_base__watch_was_edited', False)
def reset_watch_edited_flag(self):
"""
Reset the watch edited flag after successful processing.
Call this after processing completes to allow future content-only change detection.
"""
self.__watch_was_edited = False
@classmethod
def get_property_names(cls):
"""
Get all @property attribute names from this model class using introspection.
This discovers computed/derived properties that are not stored in the datastore.
These properties should be filtered out during PUT/POST requests.
Returns:
frozenset: Immutable set of @property attribute names from the model class
"""
import functools
# Create a cached version if it doesn't exist
if not hasattr(cls, '_cached_get_property_names'):
@functools.cache
def _get_props():
properties = set()
# Use introspection to find all @property attributes
for name in dir(cls):
# Skip private/magic attributes
if name.startswith('_'):
continue
try:
attr = getattr(cls, name)
# Check if it's a property descriptor
if isinstance(attr, property):
properties.add(name)
except (AttributeError, TypeError):
continue
return frozenset(properties)
cls._cached_get_property_names = _get_props
return cls._cached_get_property_names()
def __deepcopy__(self, memo):
"""
Custom deepcopy for all watch_base subclasses (Watch, Tag, etc.).
@@ -664,8 +511,10 @@ class watch_base(dict):
# Save to disk via subclass implementation
try:
# Determine entity type from module name (Watch.py -> watch, Tag.py -> tag)
from changedetectionio.model.persistence import _determine_entity_type
entity_type = _determine_entity_type(self.__class__)
filename = f"{entity_type}.json"
self._save_to_disk(data_dict, uuid)
logger.debug(f"Committed {entity_type} {uuid} to {uuid}/{filename}")
except Exception as e:
-92
View File
@@ -1,92 +0,0 @@
"""
Schema utilities for Watch and Tag models.
Provides functions to extract readonly fields and properties from OpenAPI spec.
Shared by both the model layer and API layer to avoid circular dependencies.
"""
import functools
@functools.cache
def get_openapi_schema_dict():
"""
Get the raw OpenAPI spec dictionary for schema access.
Returns the YAML dict directly (not the OpenAPI object).
"""
import os
import yaml
spec_path = os.path.join(os.path.dirname(__file__), '../../docs/api-spec.yaml')
if not os.path.exists(spec_path):
spec_path = os.path.join(os.path.dirname(__file__), '../docs/api-spec.yaml')
with open(spec_path, 'r', encoding='utf-8') as f:
return yaml.safe_load(f)
@functools.cache
def _resolve_readonly_fields(schema_name):
"""
Generic helper to resolve readOnly fields, including allOf inheritance.
Args:
schema_name: Name of the schema (e.g., 'Watch', 'Tag')
Returns:
frozenset: All readOnly field names including inherited ones
"""
spec_dict = get_openapi_schema_dict()
schema = spec_dict['components']['schemas'].get(schema_name, {})
readonly_fields = set()
# Handle allOf (schema inheritance)
if 'allOf' in schema:
for item in schema['allOf']:
# Resolve $ref to parent schema
if '$ref' in item:
ref_path = item['$ref'].split('/')[-1]
ref_schema = spec_dict['components']['schemas'].get(ref_path, {})
if 'properties' in ref_schema:
for field_name, field_def in ref_schema['properties'].items():
if field_def.get('readOnly') is True:
readonly_fields.add(field_name)
# Check schema-specific properties
if 'properties' in item:
for field_name, field_def in item['properties'].items():
if field_def.get('readOnly') is True:
readonly_fields.add(field_name)
else:
# Direct properties (no inheritance)
if 'properties' in schema:
for field_name, field_def in schema['properties'].items():
if field_def.get('readOnly') is True:
readonly_fields.add(field_name)
return frozenset(readonly_fields)
@functools.cache
def get_readonly_watch_fields():
"""
Extract readOnly field names from Watch schema in OpenAPI spec.
Returns readOnly fields from WatchBase (uuid, date_created) + Watch-specific readOnly fields.
Used by:
- model/watch_base.py: Track when writable fields are edited
- api/Watch.py: Filter readonly fields from PUT requests
"""
return _resolve_readonly_fields('Watch')
@functools.cache
def get_readonly_tag_fields():
"""
Extract readOnly field names from Tag schema in OpenAPI spec.
Returns readOnly fields from WatchBase (uuid, date_created) + Tag-specific readOnly fields.
"""
return _resolve_readonly_fields('Tag')
+1 -108
View File
@@ -129,51 +129,6 @@ class ChangeDetectionSpec:
"""
pass
@hookspec
def update_handler_alter(update_handler, watch, datastore):
"""Modify or wrap the update_handler before it processes a watch.
This hook is called after the update_handler (perform_site_check instance) is created
but before it calls call_browser() and run_changedetection(). Plugins can use this to:
- Wrap the handler to add logging/metrics
- Modify handler configuration
- Add custom preprocessing logic
Args:
update_handler: The perform_site_check instance that will process the watch
watch: The watch dict being processed
datastore: The application datastore
Returns:
object or None: Return a modified/wrapped handler, or None to keep the original.
If multiple plugins return handlers, they are chained in registration order.
"""
pass
@hookspec
def update_finalize(update_handler, watch, datastore, processing_exception):
"""Called after watch processing completes (success or failure).
This hook is called in the finally block after all processing is complete,
allowing plugins to perform cleanup, update metrics, or log final status.
The plugin can access update_handler.last_logging_insert_id if it was stored
during update_handler_alter, and use processing_exception to determine if
the processing succeeded or failed.
Args:
update_handler: The perform_site_check instance (may be None if creation failed)
watch: The watch dict that was processed (may be None if not loaded)
datastore: The application datastore
processing_exception: The exception from the main processing block, or None if successful.
This does NOT include cleanup exceptions - only exceptions from
the actual watch processing (fetch, diff, etc).
Returns:
None: This hook doesn't return a value
"""
pass
# Set up Plugin Manager
plugin_manager = pluggy.PluginManager(PLUGIN_NAMESPACE)
@@ -544,66 +499,4 @@ def get_plugin_template_paths():
template_paths.append(templates_dir)
logger.debug(f"Added plugin template path: {templates_dir}")
return template_paths
def apply_update_handler_alter(update_handler, watch, datastore):
"""Apply update_handler_alter hooks from all plugins.
Allows plugins to wrap or modify the update_handler before it processes a watch.
Multiple plugins can chain modifications - each plugin receives the result from
the previous plugin.
Args:
update_handler: The perform_site_check instance to potentially modify
watch: The watch dict being processed
datastore: The application datastore
Returns:
object: The (potentially modified/wrapped) update_handler
"""
# Get all plugins that implement the update_handler_alter hook
results = plugin_manager.hook.update_handler_alter(
update_handler=update_handler,
watch=watch,
datastore=datastore
)
# Chain results - each plugin gets the result from the previous one
current_handler = update_handler
if results:
for result in results:
if result is not None:
logger.debug(f"Plugin modified update_handler for watch {watch.get('uuid')}")
current_handler = result
return current_handler
def apply_update_finalize(update_handler, watch, datastore, processing_exception):
"""Apply update_finalize hooks from all plugins.
Called in the finally block after watch processing completes, allowing plugins
to perform cleanup, update metrics, or log final status.
Args:
update_handler: The perform_site_check instance (may be None)
watch: The watch dict that was processed (may be None)
datastore: The application datastore
processing_exception: The exception from processing, or None if successful
Returns:
None
"""
try:
# Call all plugins that implement the update_finalize hook
plugin_manager.hook.update_finalize(
update_handler=update_handler,
watch=watch,
datastore=datastore,
processing_exception=processing_exception
)
except Exception as e:
# Don't let plugin errors crash the worker
logger.error(f"Error in update_finalize hook: {e}")
logger.exception(f"update_finalize hook exception details:")
return template_paths
+7 -24
View File
@@ -1,6 +1,6 @@
from functools import lru_cache
from loguru import logger
from flask_babel import gettext, get_locale
from flask_babel import gettext
import importlib
import inspect
import os
@@ -190,15 +190,14 @@ def get_plugin_processor_metadata():
logger.warning(f"Error getting plugin processor metadata: {e}")
return metadata
@lru_cache(maxsize=32)
def _available_processors_cached(locale_str):
"""
Internal cached function that includes locale in cache key.
This ensures translations are cached per-language instead of globally.
:param locale_str: The locale string (e.g., 'en', 'it', 'zh')
:return: A list of tuples (processor_name, translated_description, weight)
def available_processors():
"""
Get a list of processors by name and description for the UI elements.
Can be filtered via DISABLED_PROCESSORS environment variable (comma-separated list).
:return: A list :)
"""
processor_classes = find_processors()
# Check if DISABLED_PROCESSORS env var is set
@@ -257,22 +256,6 @@ def _available_processors_cached(locale_str):
# Return as tuples without weight (for backwards compatibility)
return [(name, desc) for name, desc, weight in available]
def available_processors():
"""
Get a list of processors by name and description for the UI elements.
Can be filtered via DISABLED_PROCESSORS environment variable (comma-separated list).
This function delegates to a locale-aware cached version to ensure translations
are cached per-language instead of globally.
:return: A list of tuples (processor_name, translated_description)
"""
# Get current locale and use it as cache key
# Convert Babel Locale object to string for use as cache key
locale = get_locale()
locale_str = str(locale) if locale else 'en'
return _available_processors_cached(locale_str)
def get_default_processor():
"""
+2 -71
View File
@@ -1,7 +1,5 @@
import re
import hashlib
from changedetectionio.browser_steps.browser_steps import browser_steps_get_valid_steps
from changedetectionio.content_fetchers.base import Fetcher
from changedetectionio.strtobool import strtobool
from copy import deepcopy
@@ -21,7 +19,6 @@ class difference_detection_processor():
xpath_data = None
preferred_proxy = None
screenshot_format = SCREENSHOT_FORMAT_JPEG
last_raw_content_checksum = None
def __init__(self, datastore, watch_uuid):
self.datastore = datastore
@@ -37,64 +34,6 @@ class difference_detection_processor():
# Generic fetcher that should be extended (requests, playwright etc)
self.fetcher = Fetcher()
# Load the last raw content checksum from file
self.read_last_raw_content_checksum()
def update_last_raw_content_checksum(self, checksum):
"""
Save the raw content MD5 checksum to file.
This is used for skip logic - avoid reprocessing if raw HTML unchanged.
"""
if not checksum:
return
watch = self.datastore.data['watching'].get(self.watch_uuid)
if not watch:
return
data_dir = watch.data_dir
if not data_dir:
return
watch.ensure_data_dir_exists()
checksum_file = os.path.join(data_dir, 'last-checksum.txt')
try:
with open(checksum_file, 'w', encoding='utf-8') as f:
f.write(checksum)
self.last_raw_content_checksum = checksum
except IOError as e:
logger.warning(f"Failed to write checksum file for {self.watch_uuid}: {e}")
def read_last_raw_content_checksum(self):
"""
Read the last raw content MD5 checksum from file.
Returns None if file doesn't exist (first run) or can't be read.
"""
watch = self.datastore.data['watching'].get(self.watch_uuid)
if not watch:
self.last_raw_content_checksum = None
return
data_dir = watch.data_dir
if not data_dir:
self.last_raw_content_checksum = None
return
checksum_file = os.path.join(data_dir, 'last-checksum.txt')
if not os.path.isfile(checksum_file):
self.last_raw_content_checksum = None
return
try:
with open(checksum_file, 'r', encoding='utf-8') as f:
self.last_raw_content_checksum = f.read().strip()
except IOError as e:
logger.warning(f"Failed to read checksum file for {self.watch_uuid}: {e}")
self.last_raw_content_checksum = None
async def call_browser(self, preferred_proxy_id=None):
from requests.structures import CaseInsensitiveDict
@@ -171,7 +110,7 @@ class difference_detection_processor():
)
if self.watch.has_browser_steps:
self.fetcher.browser_steps = browser_steps_get_valid_steps(self.watch.get('browser_steps', []))
self.fetcher.browser_steps = self.watch.get('browser_steps', [])
self.fetcher.browser_steps_screenshot_path = os.path.join(self.datastore.datastore_path, self.watch.get('uuid'))
# Tweak the base config with the per-watch ones
@@ -318,16 +257,8 @@ class difference_detection_processor():
except IOError as e:
logger.error(f"Failed to write extra watch config {filename}: {e}")
def get_raw_document_checksum(self):
checksum = None
if self.fetcher.content:
checksum = hashlib.md5(self.fetcher.content.encode('utf-8')).hexdigest()
return checksum
@abstractmethod
def run_changedetection(self, watch, force_reprocess=False):
def run_changedetection(self, watch):
update_obj = {'last_notification_error': False, 'last_error': False}
some_data = 'xxxxx'
update_obj["previous_md5"] = hashlib.md5(some_data.encode('utf-8')).hexdigest()
@@ -30,7 +30,7 @@ class perform_site_check(difference_detection_processor):
# Override to use PNG format for better image comparison (JPEG compression creates noise)
screenshot_format = SCREENSHOT_FORMAT_PNG
def run_changedetection(self, watch, force_reprocess=False):
def run_changedetection(self, watch):
"""
Perform screenshot comparison using OpenCV subprocess handler.
@@ -2,7 +2,6 @@ from ..base import difference_detection_processor
from ..exceptions import ProcessorException
from . import Restock
from loguru import logger
from changedetectionio.content_fetchers.exceptions import checksumFromPreviousCheckWasTheSame
import urllib3
import time
@@ -57,259 +56,6 @@ def _deduplicate_prices(data):
return list(unique_data)
# =============================================================================
# MEMORY MANAGEMENT: Why We Use Multiprocessing (Linux Only)
# =============================================================================
#
# The get_itemprop_availability() function uses 'extruct' to parse HTML metadata
# (JSON-LD, microdata, OpenGraph, etc). Extruct internally uses lxml, which wraps
# libxml2 - a C library that allocates memory at the C level.
#
# Memory Leak Problem:
# --------------------
# 1. lxml's document_fromstring() creates thousands of Python objects backed by
# C-level allocations (nodes, attributes, text content)
# 2. Python's garbage collector can mark these objects as collectible, but
# cannot force the OS to reclaim the actual C-level memory
# 3. malloc/free typically doesn't return memory to OS - it just marks it as
# "free in the process address space"
# 4. With repeated parsing of large HTML (5MB+ pages), memory accumulates even
# after Python GC runs
#
# Why Multiprocessing Fixes This:
# --------------------------------
# When a subprocess exits, the OS forcibly reclaims ALL memory including C-level
# allocations that Python GC couldn't release. This ensures clean memory state
# after each extraction.
#
# Performance Impact:
# -------------------
# - Memray analysis showed 1.2M document_fromstring allocations per page
# - Without subprocess: memory grows by ~50-500MB per parse and lingers
# - With subprocess: ~35MB overhead but forces full cleanup after each run
# - Trade-off: 35MB resource_tracker vs 500MB+ accumulated leak = much better at scale
#
# References:
# -----------
# - lxml memory issues: https://medium.com/devopss-hole/python-lxml-memory-leak-b8d0b1000dc7
# - libxml2 caching behavior: https://www.mail-archive.com/lxml@python.org/msg00026.html
# - GC limitations with C extensions: https://benbernardblog.com/tracking-down-a-freaky-python-memory-leak-part-2/
#
# Additional Context:
# -------------------
# - jsonpath_ng (used to query the parsed data) is pure Python and doesn't leak
# - The leak is specifically from lxml's document parsing, not the JSONPath queries
# - Linux-only because multiprocessing spawn is well-tested there; other platforms
# use direct call as fallback
#
# Alternative Solution (Future Optimization):
# -------------------------------------------
# This entire problem could be avoided by using regex to extract just the machine
# data blocks (JSON-LD, microdata, OpenGraph tags) BEFORE parsing with lxml:
#
# 1. Use regex to extract <script type="application/ld+json">...</script> blocks
# 2. Use regex to extract <meta property="og:*"> tags
# 3. Use regex to find itemprop/itemtype attributes and their containing elements
# 4. Parse ONLY those extracted snippets instead of the entire HTML document
#
# Benefits:
# - Avoids parsing 5MB of HTML when we only need a few KB of metadata
# - Eliminates the lxml memory leak entirely
# - Faster extraction (regex is much faster than DOM parsing)
# - No subprocess overhead needed
#
# Trade-offs:
# - Regex for HTML is brittle (comments, CDATA, edge cases)
# - Microdata extraction would be complex (need to track element boundaries)
# - Would need extensive testing to ensure we don't miss valid data
# - extruct is battle-tested; regex solution would need similar maturity
#
# For now, the subprocess approach is safer and leverages existing extruct code.
# =============================================================================
def _extract_itemprop_availability_worker(pipe_conn):
"""
Subprocess worker for itemprop extraction (Linux memory management).
Uses spawn multiprocessing to isolate extruct/lxml memory allocations.
When the subprocess exits, the OS reclaims ALL memory including lxml's
C-level allocations that Python's GC cannot release.
Args:
pipe_conn: Pipe connection to receive HTML and send result
"""
import json
import gc
html_content = None
result_data = None
try:
# Receive HTML as raw bytes (no pickle)
html_bytes = pipe_conn.recv_bytes()
html_content = html_bytes.decode('utf-8')
# Explicitly delete html_bytes to free memory
del html_bytes
gc.collect()
# Perform extraction in subprocess (uses extruct/lxml)
result_data = get_itemprop_availability(html_content)
# Convert Restock object to dict for JSON serialization
result = {
'success': True,
'data': dict(result_data) if result_data else {}
}
pipe_conn.send_bytes(json.dumps(result).encode('utf-8'))
# Clean up before exit
del result_data, html_content, result
gc.collect()
except MoreThanOnePriceFound:
# Serialize the specific exception type
result = {
'success': False,
'exception_type': 'MoreThanOnePriceFound'
}
pipe_conn.send_bytes(json.dumps(result).encode('utf-8'))
except Exception as e:
# Serialize other exceptions
result = {
'success': False,
'exception_type': type(e).__name__,
'exception_message': str(e)
}
pipe_conn.send_bytes(json.dumps(result).encode('utf-8'))
finally:
# Final cleanup before subprocess exits
# Variables may already be deleted in try block, so use try/except
try:
del html_content
except (NameError, UnboundLocalError):
pass
try:
del result_data
except (NameError, UnboundLocalError):
pass
gc.collect()
pipe_conn.close()
def extract_itemprop_availability_safe(html_content) -> Restock:
"""
Extract itemprop availability with hybrid approach for memory efficiency.
Strategy (fastest to slowest, least to most memory):
1. Try pure Python extraction (JSON-LD, OpenGraph, microdata) - covers 80%+ of cases
2. Fall back to extruct with subprocess isolation on Linux for complex cases
Args:
html_content: HTML string to parse
Returns:
Restock: Extracted availability data
Raises:
MoreThanOnePriceFound: When multiple prices detected
Other exceptions: From extruct/parsing
"""
import platform
# Step 1: Try pure Python extraction first (fast, no lxml, no memory leak)
try:
from .pure_python_extractor import extract_metadata_pure_python, query_price_availability
logger.trace("Attempting pure Python metadata extraction (no lxml)")
extracted_data = extract_metadata_pure_python(html_content)
price_data = query_price_availability(extracted_data)
# If we got price AND availability, we're done!
if price_data.get('price') and price_data.get('availability'):
result = Restock(price_data)
logger.debug(f"Pure Python extraction successful: {dict(result)}")
return result
# If we got some data but not everything, still try extruct for completeness
if price_data.get('price') or price_data.get('availability'):
logger.debug(f"Pure Python extraction partial: {price_data}, will try extruct for completeness")
except Exception as e:
logger.debug(f"Pure Python extraction failed: {e}, falling back to extruct")
# Step 2: Fall back to extruct (uses lxml, needs subprocess on Linux)
logger.trace("Falling back to extruct (lxml-based) with subprocess isolation")
# Only use subprocess isolation on Linux
# Other platforms may have issues with spawn or don't need the aggressive memory management
if platform.system() == 'Linux':
import multiprocessing
import json
import gc
try:
ctx = multiprocessing.get_context('spawn')
parent_conn, child_conn = ctx.Pipe()
p = ctx.Process(target=_extract_itemprop_availability_worker, args=(child_conn,))
p.start()
# Send HTML as raw bytes (no pickle)
html_bytes = html_content.encode('utf-8')
parent_conn.send_bytes(html_bytes)
# Explicitly delete html_bytes copy immediately after sending
del html_bytes
gc.collect()
# Receive result as JSON
result_bytes = parent_conn.recv_bytes()
result = json.loads(result_bytes.decode('utf-8'))
# Wait for subprocess to complete
p.join()
# Close pipes
parent_conn.close()
child_conn.close()
# Clean up all subprocess-related objects
del p, parent_conn, child_conn, result_bytes
gc.collect()
# Handle result or re-raise exception
if result['success']:
# Reconstruct Restock object from dict
restock_obj = Restock(result['data'])
# Clean up result dict
del result
gc.collect()
return restock_obj
else:
# Re-raise the exception that occurred in subprocess
exception_type = result['exception_type']
exception_msg = result.get('exception_message', '')
del result
gc.collect()
if exception_type == 'MoreThanOnePriceFound':
raise MoreThanOnePriceFound()
else:
raise Exception(f"{exception_type}: {exception_msg}")
except Exception as e:
# If multiprocessing itself fails, log and fall back to direct call
logger.warning(f"Subprocess extraction failed: {e}, falling back to direct call")
gc.collect()
return get_itemprop_availability(html_content)
else:
# Non-Linux: direct call (no subprocess overhead needed)
return get_itemprop_availability(html_content)
# should return Restock()
# add casting?
def get_itemprop_availability(html_content) -> Restock:
@@ -404,37 +150,22 @@ class perform_site_check(difference_detection_processor):
screenshot = None
xpath_data = None
def run_changedetection(self, watch, force_reprocess=False):
def run_changedetection(self, watch):
import hashlib
if not watch:
raise Exception("Watch no longer exists.")
current_raw_document_checksum = self.get_raw_document_checksum()
# Skip processing only if BOTH conditions are true:
# 1. HTML content unchanged (checksum matches last saved checksum)
# 2. Watch configuration was not edited (including trigger_text, filters, etc.)
# The was_edited flag handles all watch configuration changes, so we don't need
# separate checks for trigger_text or other processing rules.
if (not force_reprocess and
not watch.was_edited and
self.last_raw_content_checksum and
self.last_raw_content_checksum == current_raw_document_checksum):
raise checksumFromPreviousCheckWasTheSame()
# Unset any existing notification error
update_obj = {'last_notification_error': False, 'last_error': False, 'restock': Restock()}
self.screenshot = self.fetcher.screenshot
self.xpath_data = self.fetcher.xpath_data
# Track the content type (readonly field, doesn't trigger was_edited)
update_obj['content-type'] = self.fetcher.headers.get('Content-Type', '') # Use hyphen (matches OpenAPI spec)
# Track the content type
update_obj['content_type'] = self.fetcher.headers.get('Content-Type', '')
update_obj["last_check_status"] = self.fetcher.get_last_status_code()
# Save the raw content checksum to file (processor implementation detail, not watch config)
self.update_last_raw_content_checksum(current_raw_document_checksum)
# Only try to process restock information (like scraping for keywords) if the page was actually rendered correctly.
# Otherwise it will assume "in stock" because nothing suggesting the opposite was found
from ...html_tools import html_to_text
@@ -465,9 +196,8 @@ class perform_site_check(difference_detection_processor):
multiple_prices_found = False
# Try built-in extraction first, this will scan metadata in the HTML
# On Linux, this runs in a subprocess to prevent lxml/extruct memory leaks
try:
itemprop_availability = extract_itemprop_availability_safe(self.fetcher.content)
itemprop_availability = get_itemprop_availability(self.fetcher.content)
except MoreThanOnePriceFound as e:
# Don't raise immediately - let plugins try to handle this case
# Plugins might be able to determine which price is correct
@@ -1,286 +0,0 @@
"""
Pure Python metadata extractor - no lxml, no memory leaks.
This module provides a fast, memory-efficient alternative to extruct for common
e-commerce metadata extraction. It handles:
- JSON-LD (covers 80%+ of modern sites)
- OpenGraph meta tags
- Basic microdata attributes
Uses Python's built-in html.parser instead of lxml/libxml2, avoiding C-level
memory allocation issues. For edge cases, the main processor can fall back to
extruct (with subprocess isolation on Linux).
"""
from html.parser import HTMLParser
import json
import re
from loguru import logger
class JSONLDExtractor(HTMLParser):
"""
Extract JSON-LD structured data from HTML.
Finds all <script type="application/ld+json"> tags and parses their content.
Handles multiple JSON-LD blocks on the same page.
"""
def __init__(self):
super().__init__()
self.in_jsonld = False
self.data = [] # List of all parsed JSON-LD objects
self.current_script = []
def handle_starttag(self, tag, attrs):
if tag == 'script':
# Check if this is a JSON-LD script tag
for attr, value in attrs:
if attr == 'type' and value == 'application/ld+json':
self.in_jsonld = True
self.current_script = []
break
def handle_data(self, data):
if self.in_jsonld:
self.current_script.append(data)
def handle_endtag(self, tag):
if tag == 'script' and self.in_jsonld:
# Parse the accumulated script content
script_content = ''.join(self.current_script)
if script_content.strip():
try:
# Parse JSON (handles both objects and arrays)
parsed = json.loads(script_content)
if isinstance(parsed, list):
self.data.extend(parsed)
else:
self.data.append(parsed)
except json.JSONDecodeError as e:
logger.debug(f"Failed to parse JSON-LD: {e}")
pass
self.in_jsonld = False
self.current_script = []
class OpenGraphExtractor(HTMLParser):
"""
Extract OpenGraph meta tags from HTML.
Finds <meta property="og:*"> tags commonly used for social media sharing.
"""
def __init__(self):
super().__init__()
self.og_data = {}
def handle_starttag(self, tag, attrs):
if tag == 'meta':
attrs_dict = dict(attrs)
prop = attrs_dict.get('property', '')
# Extract OpenGraph properties
if prop.startswith('og:'):
content = attrs_dict.get('content', '')
if content:
self.og_data[prop] = content
class MicrodataExtractor(HTMLParser):
"""
Extract basic microdata attributes from HTML.
Finds elements with itemprop attributes. This is a simplified extractor
that doesn't handle nested itemscope/itemtype hierarchies - for complex
cases, use extruct as fallback.
"""
def __init__(self):
super().__init__()
self.microdata = {}
self.current_itemprop = None
def handle_starttag(self, tag, attrs):
attrs_dict = dict(attrs)
if 'itemprop' in attrs_dict:
itemprop = attrs_dict['itemprop']
# Price/currency/availability can be in content/href attributes
if itemprop == 'price':
if 'content' in attrs_dict:
self.microdata['price'] = attrs_dict['content']
else:
self.current_itemprop = 'price'
elif itemprop == 'priceCurrency':
if 'content' in attrs_dict:
self.microdata['currency'] = attrs_dict['content']
else:
self.current_itemprop = 'priceCurrency'
elif itemprop == 'availability':
# Can be in href (link) or content (meta)
if 'href' in attrs_dict:
self.microdata['availability'] = attrs_dict['href']
elif 'content' in attrs_dict:
self.microdata['availability'] = attrs_dict['content']
else:
self.current_itemprop = 'availability'
def handle_data(self, data):
# Capture text content for itemprop elements
if self.current_itemprop == 'price':
# Try to extract numeric price from text
try:
price_text = re.sub(r'[^\d.]', '', data.strip())
if price_text:
self.microdata['price'] = float(price_text)
except ValueError:
pass
elif self.current_itemprop == 'priceCurrency':
currency = data.strip()
if currency:
self.microdata['currency'] = currency
elif self.current_itemprop == 'availability':
availability = data.strip()
if availability:
self.microdata['availability'] = availability
def handle_endtag(self, tag):
# Reset current itemprop after closing tag
self.current_itemprop = None
def extract_metadata_pure_python(html_content):
"""
Extract structured metadata from HTML using pure Python parsers.
Returns a dict with three keys:
- 'json-ld': List of parsed JSON-LD objects
- 'opengraph': Dict of OpenGraph properties
- 'microdata': Dict of microdata properties
Args:
html_content: HTML string to parse
Returns:
dict: Extracted metadata in three formats
"""
result = {
'json-ld': [],
'opengraph': {},
'microdata': {}
}
# Extract JSON-LD
try:
jsonld_extractor = JSONLDExtractor()
jsonld_extractor.feed(html_content)
result['json-ld'] = jsonld_extractor.data
logger.trace(f"Pure Python: Found {len(jsonld_extractor.data)} JSON-LD blocks")
except Exception as e:
logger.debug(f"JSON-LD extraction failed: {e}")
# Extract OpenGraph
try:
og_extractor = OpenGraphExtractor()
og_extractor.feed(html_content)
result['opengraph'] = og_extractor.og_data
if result['opengraph']:
logger.trace(f"Pure Python: Found {len(og_extractor.og_data)} OpenGraph tags")
except Exception as e:
logger.debug(f"OpenGraph extraction failed: {e}")
# Extract Microdata
try:
microdata_extractor = MicrodataExtractor()
microdata_extractor.feed(html_content)
result['microdata'] = microdata_extractor.microdata
if result['microdata']:
logger.trace(f"Pure Python: Found microdata: {result['microdata']}")
except Exception as e:
logger.debug(f"Microdata extraction failed: {e}")
return result
def query_price_availability(extracted_data):
"""
Query extracted metadata for price and availability information.
Uses jsonpath_ng to query JSON-LD data (same approach as extruct).
Falls back to OpenGraph and microdata if JSON-LD doesn't have the data.
Args:
extracted_data: Dict from extract_metadata_pure_python()
Returns:
dict: {'price': float, 'currency': str, 'availability': str}
"""
from jsonpath_ng import parse
result = {}
# 1. Try JSON-LD first (most reliable and common)
for data in extracted_data.get('json-ld', []):
try:
# Use jsonpath to find price/availability anywhere in the structure
price_parse = parse('$..(price|Price)')
availability_parse = parse('$..(availability|Availability)')
currency_parse = parse('$..(priceCurrency|currency|priceCurrency)')
price_results = [m.value for m in price_parse.find(data)]
if price_results and not result.get('price'):
# Handle various price formats
price_val = price_results[0]
if isinstance(price_val, (int, float)):
result['price'] = float(price_val)
elif isinstance(price_val, str):
# Extract numeric value from string
try:
result['price'] = float(re.sub(r'[^\d.]', '', price_val))
except ValueError:
pass
avail_results = [m.value for m in availability_parse.find(data)]
if avail_results and not result.get('availability'):
result['availability'] = str(avail_results[0])
curr_results = [m.value for m in currency_parse.find(data)]
if curr_results and not result.get('currency'):
result['currency'] = str(curr_results[0])
# If we found price, this JSON-LD block is good
if result.get('price'):
logger.debug(f"Pure Python: Found price data in JSON-LD: {result}")
break
except Exception as e:
logger.debug(f"Error querying JSON-LD: {e}")
continue
# 2. Try OpenGraph if JSON-LD didn't provide everything
og_data = extracted_data.get('opengraph', {})
if not result.get('price') and 'og:price:amount' in og_data:
try:
result['price'] = float(og_data['og:price:amount'])
except ValueError:
pass
if not result.get('currency') and 'og:price:currency' in og_data:
result['currency'] = og_data['og:price:currency']
if not result.get('availability') and 'og:availability' in og_data:
result['availability'] = og_data['og:availability']
# 3. Use microdata as last resort
microdata = extracted_data.get('microdata', {})
if not result.get('price') and 'price' in microdata:
result['price'] = microdata['price']
if not result.get('currency') and 'currency' in microdata:
result['currency'] = microdata['currency']
if not result.get('availability') and 'availability' in microdata:
result['availability'] = microdata['availability']
return result
@@ -17,8 +17,7 @@ def _task(watch, update_handler):
try:
# The slow process (we run 2 of these in parallel)
# Always force reprocess for preview - we want to show the filtered content regardless of checksums
changed_detected, update_obj, text_after_filter = update_handler.run_changedetection(watch=watch, force_reprocess=True)
changed_detected, update_obj, text_after_filter = update_handler.run_changedetection(watch=watch)
except FilterNotFoundInResponse as e:
text_after_filter = f"Filter not found in HTML: {str(e)}"
except ReplyWithContentButNoText as e:
@@ -7,7 +7,6 @@ import re
import urllib3
from changedetectionio.conditions import execute_ruleset_against_all_plugins
from changedetectionio.content_fetchers.exceptions import checksumFromPreviousCheckWasTheSame
from ..base import difference_detection_processor
from changedetectionio.html_tools import PERL_STYLE_REGEX, cdata_in_document_to_text, TRANSLATE_WHITESPACE_TABLE
from changedetectionio import html_tools, content_fetchers
@@ -347,7 +346,6 @@ class ContentProcessor:
def extract_text_from_html(self, html_content, stream_content_type):
"""Convert HTML to plain text."""
do_anchor = self.datastore.data["settings"]["application"].get("render_anchor_tag_content", False)
return html_tools.html_to_text(
html_content=html_content,
render_anchor_tag_content=do_anchor,
@@ -370,24 +368,12 @@ class ChecksumCalculator:
# (set_proxy_from_list)
class perform_site_check(difference_detection_processor):
def run_changedetection(self, watch, force_reprocess=False):
def run_changedetection(self, watch):
changed_detected = False
if not watch:
raise Exception("Watch no longer exists.")
current_raw_document_checksum = self.get_raw_document_checksum()
# Skip processing only if BOTH conditions are true:
# 1. HTML content unchanged (checksum matches last saved checksum)
# 2. Watch configuration was not edited (including trigger_text, filters, etc.)
# The was_edited flag handles all watch configuration changes, so we don't need
# separate checks for trigger_text or other processing rules.
if (not force_reprocess and
not watch.was_edited and
self.last_raw_content_checksum and
self.last_raw_content_checksum == current_raw_document_checksum):
raise checksumFromPreviousCheckWasTheSame()
# Initialize components
filter_config = FilterConfig(watch, self.datastore)
content_processor = ContentProcessor(self.fetcher, watch, filter_config, self.datastore)
@@ -405,11 +391,9 @@ class perform_site_check(difference_detection_processor):
self.screenshot = self.fetcher.screenshot
self.xpath_data = self.fetcher.xpath_data
# Track the content type (readonly field, doesn't trigger was_edited)
update_obj['content-type'] = ctype_header # Use hyphen (matches OpenAPI spec and watch_base default)
# Save the raw content checksum to file (processor implementation detail, not watch config)
self.update_last_raw_content_checksum(current_raw_document_checksum)
# Track the content type and checksum before filters
update_obj['content_type'] = ctype_header
update_obj['previous_md5_before_filters'] = hashlib.md5(self.fetcher.content.encode('utf-8')).hexdigest()
# === CONTENT PREPROCESSING ===
# Avoid creating unnecessary intermediate string copies by reassigning only when needed
@@ -198,6 +198,7 @@ def handle_watch_update(socketio, **kwargs):
except Exception as e:
logger.error(f"Socket.IO error in handle_watch_update: {str(e)}")
def init_socketio(app, datastore):
"""Initialize SocketIO with the main Flask app"""
import platform
+80 -64
View File
@@ -17,6 +17,8 @@ $(document).ready(function () {
set_scale();
});
// Should always be disabled
$('#browser_steps-0-operation option[value="Goto site"]').prop("selected", "selected");
$('#browser_steps-0-operation').attr('disabled', 'disabled');
$('#browsersteps-click-start').click(function () {
$("#browsersteps-click-start").fadeOut();
@@ -43,6 +45,12 @@ $(document).ready(function () {
browsersteps_session_id = false;
apply_buttons_disabled = false;
ctx.clearRect(0, 0, c.width, c.height);
set_first_gotosite_disabled();
}
function set_first_gotosite_disabled() {
$('#browser_steps >li:first-child select').val('Goto site').attr('disabled', 'disabled');
$('#browser_steps >li:first-child').css('opacity', '0.5');
}
// Show seconds remaining until the browser interface needs to restart the session
@@ -235,54 +243,14 @@ $(document).ready(function () {
ctx.fill();
}
// Reusable AJAX function for browser step operations
function executeBrowserStep(url, data = {}) {
$('#browser-steps-ui .loader .spinner').fadeIn();
apply_buttons_disabled = true;
$('ul#browser_steps li .control .apply').css('opacity', 0.5);
$("#browsersteps-img").css('opacity', 0.65);
return $.ajax({
method: "POST",
url: url,
data: data,
statusCode: {
400: function () {
alert("There was a problem processing the request, please reload the page.");
$("#loading-status-text").hide();
$('#browser-steps-ui .loader .spinner').fadeOut();
},
401: function (data) {
alert(data.responseText);
$("#loading-status-text").hide();
$('#browser-steps-ui .loader .spinner').fadeOut();
}
}
}).done(function (data) {
xpath_data = data.xpath_data;
$('#browsersteps-img').attr('src', data.screenshot);
$('#browser-steps-ui .loader .spinner').fadeOut();
apply_buttons_disabled = false;
$("#browsersteps-img").css('opacity', 1);
$('ul#browser_steps li .control .apply').css('opacity', 1);
$("#loading-status-text").hide();
}).fail(function (data) {
console.log(data);
if (data.responseText && data.responseText.includes("Browser session expired")) {
disable_browsersteps_ui();
}
apply_buttons_disabled = false;
$("#loading-status-text").hide();
$('ul#browser_steps li .control .apply').css('opacity', 1);
$("#browsersteps-img").css('opacity', 1);
});
}
function start() {
console.log("Starting browser-steps UI");
browsersteps_session_id = false;
// @todo This setting of the first one should be done at the datalayer but wtforms doesnt wanna play nice
$('#browser_steps >li:first-child').removeClass('empty');
set_first_gotosite_disabled();
$('#browser-steps-ui .loader .spinner').show();
// Request a new session
$('.clear,.remove', $('#browser_steps >li:first-child')).hide();
$.ajax({
type: "GET",
url: browser_steps_start_url,
@@ -299,12 +267,11 @@ $(document).ready(function () {
}).done(function (data) {
$("#loading-status-text").fadeIn();
browsersteps_session_id = data.browsersteps_session_id;
// This should trigger 'Goto site'
console.log("Got startup response, requesting Goto-Site (first) step fake click");
$('#browser_steps >li:first-child .apply').click();
browser_interface_seconds_remaining = 500;
// Request goto_site operation
executeBrowserStep(
browser_steps_sync_url + "&browsersteps_session_id=" + browsersteps_session_id + "&goto_website_url_first_step=true"
);
set_first_gotosite_disabled();
}).fail(function (data) {
console.log(data);
alert('There was an error communicating with the server.');
@@ -313,6 +280,7 @@ $(document).ready(function () {
}
function disable_browsersteps_ui() {
set_first_gotosite_disabled();
$("#browser-steps-ui").css('opacity', '0.3');
$('#browsersteps-selector-canvas').off("mousemove mousedown click");
}
@@ -360,13 +328,16 @@ $(document).ready(function () {
// Add the extra buttons to the steps
$('ul#browser_steps li').each(function (i) {
var s = '<div class="control">' + '<a data-step-index=' + i + ' class="pure-button button-secondary button-green button-xsmall apply" >Apply</a>&nbsp;';
s += `<a data-step-index="${i}" class="pure-button button-secondary button-xsmall clear" >Clear</a>&nbsp;` +
`<a data-step-index="${i}" class="pure-button button-secondary button-red button-xsmall remove" >Remove</a>`;
if (i > 0) {
// The first step never gets these (Goto-site)
s += `<a data-step-index="${i}" class="pure-button button-secondary button-xsmall clear" >Clear</a>&nbsp;` +
`<a data-step-index="${i}" class="pure-button button-secondary button-red button-xsmall remove" >Remove</a>`;
// if a screenshot is available
if (browser_steps_available_screenshots.includes(i.toString())) {
var d = (browser_steps_last_error_step === i+1) ? 'before' : 'after';
s += `&nbsp;<a data-step-index="${i}" class="pure-button button-secondary button-xsmall show-screenshot" title="Show screenshot from last run" data-type="${d}">Pic</a>&nbsp;`;
// if a screenshot is available
if (browser_steps_available_screenshots.includes(i.toString())) {
var d = (browser_steps_last_error_step === i+1) ? 'before' : 'after';
s += `&nbsp;<a data-step-index="${i}" class="pure-button button-secondary button-xsmall show-screenshot" title="Show screenshot from last run" data-type="${d}">Pic</a>&nbsp;`;
}
}
s += '</div>';
$(this).append(s)
@@ -405,35 +376,80 @@ $(document).ready(function () {
});
$('ul#browser_steps li .control .apply').click(function (event) {
// sequential requests @todo refactor
if (apply_buttons_disabled) {
return;
}
var current_data = $(event.currentTarget).closest('li');
$('#browser-steps-ui .loader .spinner').fadeIn();
apply_buttons_disabled = true;
$('ul#browser_steps li .control .apply').css('opacity', 0.5);
$("#browsersteps-img").css('opacity', 0.65);
var is_last_step = 0;
var step_n = $(event.currentTarget).data('step-index');
// Determine if this is the last configured step
var is_last_step = 0;
// On the last step, we should also be getting data ready for the visual selector
$('ul#browser_steps li select').each(function (i) {
if ($(this).val() !== 'Choose one') {
is_last_step += 1;
}
});
is_last_step = (is_last_step == (step_n + 1));
if (is_last_step == (step_n + 1)) {
is_last_step = true;
} else {
is_last_step = false;
}
console.log("Requesting step via POST " + $("select[id$='operation']", current_data).first().val());
// Execute the browser step
executeBrowserStep(
browser_steps_sync_url + "&browsersteps_session_id=" + browsersteps_session_id,
{
// POST the currently clicked step form widget back and await response, redraw
$.ajax({
method: "POST",
url: browser_steps_sync_url + "&browsersteps_session_id=" + browsersteps_session_id,
data: {
'operation': $("select[id$='operation']", current_data).first().val(),
'selector': $("input[id$='selector']", current_data).first().val(),
'optional_value': $("input[id$='optional_value']", current_data).first().val(),
'step_n': step_n,
'is_last_step': is_last_step
},
statusCode: {
400: function () {
// More than likely the CSRF token was lost when the server restarted
alert("There was a problem processing the request, please reload the page.");
$("#loading-status-text").hide();
$('#browser-steps-ui .loader .spinner').fadeOut();
},
401: function (data) {
// More than likely the CSRF token was lost when the server restarted
alert(data.responseText);
$("#loading-status-text").hide();
$('#browser-steps-ui .loader .spinner').fadeOut();
}
}
);
}).done(function (data) {
// it should return the new state (selectors available and screenshot)
xpath_data = data.xpath_data;
$('#browsersteps-img').attr('src', data.screenshot);
$('#browser-steps-ui .loader .spinner').fadeOut();
apply_buttons_disabled = false;
$("#browsersteps-img").css('opacity', 1);
$('ul#browser_steps li .control .apply').css('opacity', 1);
$("#loading-status-text").hide();
set_first_gotosite_disabled();
}).fail(function (data) {
console.log(data);
if (data.responseText.includes("Browser session expired")) {
disable_browsersteps_ui();
}
apply_buttons_disabled = false;
$("#loading-status-text").hide();
$('ul#browser_steps li .control .apply').css('opacity', 1);
$("#browsersteps-img").css('opacity', 1);
});
});
$('ul#browser_steps li .control .show-screenshot').click(function (element) {
+1 -3
View File
@@ -102,9 +102,7 @@
}
// Navigate to search results (always redirect to watchlist home)
// Use base_path if available (for sub-path deployments like /enlighten-richerx)
const basePath = typeof base_path !== 'undefined' ? base_path : '';
window.location.href = basePath + '/?' + params.toString();
window.location.href = '/?' + params.toString();
});
}
});
+1 -1
View File
@@ -1 +1 @@
#diff-form{background:rgba(0,0,0,.05);padding:1em;border-radius:10px;margin-bottom:1em;color:#fff;font-size:.9rem;text-align:center}#diff-form label.from-to-label{width:4rem;text-decoration:none;padding:.5rem}#diff-form label.from-to-label#change-from{color:#b30000;background:#fadad7}#diff-form label.from-to-label#change-to{background:#eaf2c2;color:#406619}#diff-form #diff-style>span{display:inline-block;padding:.3em}#diff-form #diff-style>span label{font-weight:normal}#diff-form *{vertical-align:middle}body.difference-page section.content{padding-top:40px}#diff-ui{background:var(--color-background);padding:1rem;border-radius:5px}@media(min-width: 767px){#diff-ui{min-width:50%}}#diff-ui #text{font-size:11px}#diff-ui pre{white-space:break-spaces;overflow-wrap:anywhere}#diff-ui h1{display:inline;font-size:100%}#diff-ui #result{white-space:pre-wrap;word-break:break-word;overflow-wrap:break-word}#diff-ui .source{position:absolute;right:1%;top:.2em}@-moz-document url-prefix(){#diff-ui body{height:99%}}#diff-ui td#diff-col div{text-align:justify;white-space:pre-wrap}#diff-ui .ignored{background-color:#ccc;opacity:.7}#diff-ui .triggered{background-color:#1b98f8}#diff-ui .ignored.triggered{background-color:red}#diff-ui .tab-pane-inner#screenshot{text-align:center}#diff-ui .tab-pane-inner#screenshot img{max-width:99%}#diff-ui .pure-form button.reset-margin{margin:0px}#diff-ui .diff-fieldset{display:flex;align-items:center;gap:4px;flex-wrap:wrap}#diff-ui ul#highlightSnippetActions{list-style-type:none;display:flex;align-items:center;justify-content:center;gap:1.5rem;flex-wrap:wrap;padding:0;margin:0}#diff-ui ul#highlightSnippetActions li{display:flex;flex-direction:column;align-items:center;text-align:center;padding:.5rem;gap:.3rem}#diff-ui ul#highlightSnippetActions li button,#diff-ui ul#highlightSnippetActions li a{white-space:nowrap}#diff-ui ul#highlightSnippetActions span{font-size:.8rem;color:var(--color-text-input-description)}#diff-ui #cell-diff-jump-visualiser{display:flex;flex-direction:row;gap:1px;background:var(--color-background);border-radius:3px;overflow-x:hidden;position:sticky;top:0;z-index:10;padding-top:1rem;padding-bottom:1rem;justify-content:center}#diff-ui #cell-diff-jump-visualiser>div{flex:1;min-width:1px;max-width:10px;height:10px;background:var(--color-background-button-cancel);opacity:.3;border-radius:1px;transition:opacity .2s;position:relative}#diff-ui #cell-diff-jump-visualiser>div.deletion{background:#b30000;opacity:1}#diff-ui #cell-diff-jump-visualiser>div.insertion{background:#406619;opacity:1}#diff-ui #cell-diff-jump-visualiser>div.note{background:#406619;opacity:1}#diff-ui #cell-diff-jump-visualiser>div.mixed{background:linear-gradient(to right, #b30000 50%, #406619 50%);opacity:1}#diff-ui #cell-diff-jump-visualiser>div.current-position::after{content:"";position:absolute;bottom:-6px;left:50%;transform:translateX(-50%);width:0;height:0;border-left:4px solid rgba(0,0,0,0);border-right:4px solid rgba(0,0,0,0);border-bottom:4px solid var(--color-text)}#diff-ui #cell-diff-jump-visualiser>div:hover{opacity:.8;cursor:pointer}#text-diff-heading-area .snapshot-age{padding:4px;margin:.5rem 0;background-color:var(--color-background-snapshot-age);border-radius:3px;font-weight:bold;margin-bottom:4px}#text-diff-heading-area .snapshot-age.error{background-color:var(--color-error-background-snapshot-age);color:var(--color-error-text-snapshot-age)}#text-diff-heading-area .snapshot-age>*{padding-right:1rem}
#diff-form{background:rgba(0,0,0,.05);padding:1em;border-radius:10px;margin-bottom:1em;color:#fff;font-size:.9rem;text-align:center}#diff-form label.from-to-label{width:4rem;text-decoration:none;padding:.5rem}#diff-form label.from-to-label#change-from{color:#b30000;background:#fadad7}#diff-form label.from-to-label#change-to{background:#eaf2c2;color:#406619}#diff-form #diff-style>span{display:inline-block;padding:.3em}#diff-form #diff-style>span label{font-weight:normal}#diff-form *{vertical-align:middle}body.difference-page section.content{padding-top:40px}#diff-ui{background:var(--color-background);padding:1rem;border-radius:5px}@media(min-width: 767px){#diff-ui{min-width:50%}}#diff-ui #text{font-size:11px}#diff-ui pre{white-space:break-spaces}#diff-ui h1{display:inline;font-size:100%}#diff-ui #result{white-space:pre-wrap;word-break:break-word;overflow-wrap:break-word}#diff-ui .source{position:absolute;right:1%;top:.2em}@-moz-document url-prefix(){#diff-ui body{height:99%}}#diff-ui td#diff-col div{text-align:justify;white-space:pre-wrap}#diff-ui .ignored{background-color:#ccc;opacity:.7}#diff-ui .triggered{background-color:#1b98f8}#diff-ui .ignored.triggered{background-color:red}#diff-ui .tab-pane-inner#screenshot{text-align:center}#diff-ui .tab-pane-inner#screenshot img{max-width:99%}#diff-ui .pure-form button.reset-margin{margin:0px}#diff-ui .diff-fieldset{display:flex;align-items:center;gap:4px;flex-wrap:wrap}#diff-ui ul#highlightSnippetActions{list-style-type:none;display:flex;align-items:center;justify-content:center;gap:1.5rem;flex-wrap:wrap;padding:0;margin:0}#diff-ui ul#highlightSnippetActions li{display:flex;flex-direction:column;align-items:center;text-align:center;padding:.5rem;gap:.3rem}#diff-ui ul#highlightSnippetActions li button,#diff-ui ul#highlightSnippetActions li a{white-space:nowrap}#diff-ui ul#highlightSnippetActions span{font-size:.8rem;color:var(--color-text-input-description)}#diff-ui #cell-diff-jump-visualiser{display:flex;flex-direction:row;gap:1px;background:var(--color-background);border-radius:3px;overflow-x:hidden;position:sticky;top:0;z-index:10;padding-top:1rem;padding-bottom:1rem;justify-content:center}#diff-ui #cell-diff-jump-visualiser>div{flex:1;min-width:1px;max-width:10px;height:10px;background:var(--color-background-button-cancel);opacity:.3;border-radius:1px;transition:opacity .2s;position:relative}#diff-ui #cell-diff-jump-visualiser>div.deletion{background:#b30000;opacity:1}#diff-ui #cell-diff-jump-visualiser>div.insertion{background:#406619;opacity:1}#diff-ui #cell-diff-jump-visualiser>div.note{background:#406619;opacity:1}#diff-ui #cell-diff-jump-visualiser>div.mixed{background:linear-gradient(to right, #b30000 50%, #406619 50%);opacity:1}#diff-ui #cell-diff-jump-visualiser>div.current-position::after{content:"";position:absolute;bottom:-6px;left:50%;transform:translateX(-50%);width:0;height:0;border-left:4px solid rgba(0,0,0,0);border-right:4px solid rgba(0,0,0,0);border-bottom:4px solid var(--color-text)}#diff-ui #cell-diff-jump-visualiser>div:hover{opacity:.8;cursor:pointer}#text-diff-heading-area .snapshot-age{padding:4px;margin:.5rem 0;background-color:var(--color-background-snapshot-age);border-radius:3px;font-weight:bold;margin-bottom:4px}#text-diff-heading-area .snapshot-age.error{background-color:var(--color-error-background-snapshot-age);color:var(--color-error-text-snapshot-age)}#text-diff-heading-area .snapshot-age>*{padding-right:1rem}
@@ -62,7 +62,6 @@ body.difference-page {
pre {
white-space: break-spaces;
overflow-wrap: anywhere;
}
File diff suppressed because one or more lines are too long
+112 -142
View File
@@ -22,8 +22,6 @@ import uuid as uuid_builder
from loguru import logger
from blinker import signal
from ..model.Tags import TagsDict
# Try to import orjson for faster JSON serialization
try:
import orjson
@@ -35,8 +33,9 @@ except ImportError:
from ..processors import get_custom_watch_obj_for_processor
# Import the base class and helpers
from .file_saving_datastore import FileSavingDataStore, load_all_watches, load_all_tags, save_json_atomic
from .file_saving_datastore import FileSavingDataStore, load_all_watches, load_all_tags, save_watch_atomic, save_tag_atomic, save_json_atomic
from .updates import DatastoreUpdatesMixin
from .legacy_loader import has_legacy_datastore
# Because the server will run as a daemon and wont know the URL for notification links when firing off a notification
BASE_URL_NOT_SET_TEXT = '("Base URL" not set - see settings - notifications)'
@@ -79,7 +78,7 @@ class ChangeDetectionStore(DatastoreUpdatesMixin, FileSavingDataStore):
logger.info(f"Backing up changedetection.json due to new version to '{db_path_version_backup}'.")
copyfile(db_path, db_path_version_backup)
def _load_settings(self, filename="changedetection.json"):
def _load_settings(self):
"""
Load settings from storage.
@@ -88,7 +87,7 @@ class ChangeDetectionStore(DatastoreUpdatesMixin, FileSavingDataStore):
Returns:
dict: Settings data loaded from storage
"""
changedetection_json = os.path.join(self.datastore_path, filename)
changedetection_json = os.path.join(self.datastore_path, "changedetection.json")
logger.info(f"Loading settings from {changedetection_json}")
@@ -123,16 +122,6 @@ class ChangeDetectionStore(DatastoreUpdatesMixin, FileSavingDataStore):
if 'application' in settings_data['settings']:
self.__data['settings']['application'].update(settings_data['settings']['application'])
# Use our Tags dict with cleanup helpers etc
# @todo Same for Watches
existing_tags = settings_data.get('settings', {}).get('application', {}).get('tags') or {}
self.__data['settings']['application']['tags'] = TagsDict(existing_tags, datastore_path=self.datastore_path)
# More or less for the old format which had this data in the one url-watches.json
# cant hurt to leave it here,
if 'watching' in settings_data:
self.__data['watching'].update(settings_data['watching'])
def _rehydrate_tags(self):
"""Rehydrate tag entities from stored data into Tag objects with restock_diff processor."""
from ..model import Tag
@@ -157,28 +146,23 @@ class ChangeDetectionStore(DatastoreUpdatesMixin, FileSavingDataStore):
logger.info(f"Rehydrating {watch_count} watches...")
watching_rehydrated = {}
for uuid, watch_dict in self.__data.get('watching', {}).items():
if isinstance(watch_dict, dict):
watching_rehydrated[uuid] = self.rehydrate_entity(uuid, watch_dict)
else:
logger.error(f"Watch UUID {uuid} already rehydrated")
watching_rehydrated[uuid] = self.rehydrate_entity(uuid, watch_dict)
self.__data['watching'] = watching_rehydrated
logger.success(f"Rehydrated {watch_count} watches into Watch objects")
def _load_state(self, main_settings_filename="changedetection.json"):
def _load_state(self):
"""
Load complete datastore state from storage.
Orchestrates loading of settings, watches, and tags using polymorphic methods.
"""
# Load settings
settings_data = self._load_settings(filename=main_settings_filename)
settings_data = self._load_settings()
self._apply_settings(settings_data)
# Load watches, scan them from the disk
# Load watches (polymorphic - parent class method)
self._load_watches()
self._rehydrate_watches()
# Load tags from individual tag.json files
# These will override any tags in settings (migration path)
@@ -203,7 +187,7 @@ class ChangeDetectionStore(DatastoreUpdatesMixin, FileSavingDataStore):
self.datastore_path = datastore_path
# Initialize data structure
self.__data = App.model(datastore_path=datastore_path)
self.__data = App.model()
self.json_store_path = os.path.join(self.datastore_path, "changedetection.json")
# Base definition for all watchers (deepcopy part of #569)
@@ -216,75 +200,112 @@ class ChangeDetectionStore(DatastoreUpdatesMixin, FileSavingDataStore):
# Check if datastore already exists
changedetection_json = os.path.join(self.datastore_path, "changedetection.json")
changedetection_json_old_schema = os.path.join(self.datastore_path, "url-watches.json")
if os.path.exists(changedetection_json):
# Run schema updates if needed
# Pass current schema version from loaded datastore (defaults to 0 if not set)
# Load existing datastore (changedetection.json + watch.json files)
logger.info("Loading existing datastore")
self._load_state()
current_schema = self.data['settings']['application'].get('schema_version', 0)
self.run_updates(current_schema_version=current_schema)
try:
self._load_state()
except Exception as e:
logger.critical(f"Failed to load datastore: {e}")
raise
# Legacy datastore detected - trigger migration, even works if the schema is much before the migration step.
elif os.path.exists(changedetection_json_old_schema):
logger.critical(f"Legacy datastore detected at {changedetection_json_old_schema}, loading and running updates")
self._load_state(main_settings_filename="url-watches.json")
# update 26 will load the whole old config from disk to __data
# Run schema updates if needed
# Pass current schema version from loaded datastore (defaults to 0 if not set)
current_schema = self.__data['settings']['application'].get('schema_version', 0)
self.run_updates(current_schema_version=current_schema)
# Probably tags were also shifted to disk and many other changes, so best to reload here.
self._load_state()
else:
# No datastore yet - check if this is a fresh install or legacy migration
self.init_fresh_install(include_default_watches=include_default_watches,
version_tag=version_tag)
# Maybe they copied a bunch of watch subdirs across too
self._load_state()
# Generate app_guid FIRST (required for all operations)
if "pytest" in sys.modules or "PYTEST_CURRENT_TEST" in os.environ:
self.__data['app_guid'] = "test-" + str(uuid_builder.uuid4())
else:
self.__data['app_guid'] = str(uuid_builder.uuid4())
def init_fresh_install(self, include_default_watches, version_tag):
# Generate app_guid FIRST (required for all operations)
if "pytest" in sys.modules or "PYTEST_CURRENT_TEST" in os.environ:
self.__data['app_guid'] = "test-" + str(uuid_builder.uuid4())
else:
self.__data['app_guid'] = str(uuid_builder.uuid4())
# Generate RSS access token
self.__data['settings']['application']['rss_access_token'] = secrets.token_hex(16)
# Generate RSS access token
self.__data['settings']['application']['rss_access_token'] = secrets.token_hex(16)
# Generate API access token
self.__data['settings']['application']['api_access_token'] = secrets.token_hex(16)
# Generate API access token
self.__data['settings']['application']['api_access_token'] = secrets.token_hex(16)
logger.warning(f"No datastore found, creating new datastore at {self.datastore_path}")
# Check if legacy datastore exists (url-watches.json)
if has_legacy_datastore(self.datastore_path):
# Legacy datastore detected - trigger migration
logger.critical(f"Legacy datastore detected at {self.datastore_path}/url-watches.json")
logger.critical("Migration will be triggered via update_26")
# Set schema version to latest (no updates needed)
latest_update_available = self.get_updates_available().pop()
logger.info(f"Marking fresh install to schema version {latest_update_available}")
self.__data['settings']['application']['schema_version'] = latest_update_available
# Load the legacy datastore
from .legacy_loader import load_legacy_format
legacy_path = os.path.join(self.datastore_path, "url-watches.json")
legacy_data = load_legacy_format(legacy_path)
# Add default watches if requested
if include_default_watches:
self.add_watch(
url='https://news.ycombinator.com/',
tag='Tech news',
extras={'fetch_backend': 'html_requests'}
)
self.add_watch(
url='https://changedetection.io/CHANGELOG.txt',
tag='changedetection.io',
extras={'fetch_backend': 'html_requests'}
)
if not legacy_data:
raise Exception("Failed to load legacy datastore from url-watches.json")
# Create changedetection.json immediately
try:
self._save_settings()
logger.info("Created changedetection.json for new datastore")
except Exception as e:
logger.error(f"Failed to create initial changedetection.json: {e}")
# Merge legacy data with base_config defaults (preserves new fields like 'ui')
# self.__data already has App.model() defaults from line 190
logger.info("Merging legacy data with base_config defaults...")
# Apply top-level fields from legacy data
if 'app_guid' in legacy_data:
self.__data['app_guid'] = legacy_data['app_guid']
if 'build_sha' in legacy_data:
self.__data['build_sha'] = legacy_data['build_sha']
if 'version_tag' in legacy_data:
self.__data['version_tag'] = legacy_data['version_tag']
# Apply watching data (complete replacement as these are user's watches)
if 'watching' in legacy_data:
self.__data['watching'] = legacy_data['watching']
# Merge settings sections (preserves base_config defaults for missing fields)
if 'settings' in legacy_data:
if 'headers' in legacy_data['settings']:
self.__data['settings']['headers'].update(legacy_data['settings']['headers'])
if 'requests' in legacy_data['settings']:
self.__data['settings']['requests'].update(legacy_data['settings']['requests'])
if 'application' in legacy_data['settings']:
# CRITICAL: Use .update() to merge, not replace
# This preserves new fields like 'ui' that exist in base_config
self.__data['settings']['application'].update(legacy_data['settings']['application'])
# CRITICAL: Rehydrate watches from dicts into Watch objects
# This ensures watches have their methods available during migration
self._rehydrate_watches()
# update_26 will save watches to individual files and create changedetection.json
# Next startup will load from new format normally
self.run_updates()
else:
# Fresh install - create new datastore
logger.warning(f"No datastore found, creating new datastore at {self.datastore_path}")
# Set schema version to latest (no updates needed)
updates_available = self.get_updates_available()
self.__data['settings']['application']['schema_version'] = updates_available.pop() if updates_available else 26
# Add default watches if requested
if include_default_watches:
self.add_watch(
url='https://news.ycombinator.com/',
tag='Tech news',
extras={'fetch_backend': 'html_requests'}
)
self.add_watch(
url='https://changedetection.io/CHANGELOG.txt',
tag='changedetection.io',
extras={'fetch_backend': 'html_requests'}
)
# Create changedetection.json immediately
try:
self._save_settings()
logger.info("Created changedetection.json for new datastore")
except Exception as e:
logger.error(f"Failed to create initial changedetection.json: {e}")
# Set version tag
self.__data['version_tag'] = version_tag
@@ -362,12 +383,17 @@ class ChangeDetectionStore(DatastoreUpdatesMixin, FileSavingDataStore):
# Deep copy settings to avoid modifying the original
settings_copy = copy.deepcopy(self.__data['settings'])
# Is saved as {uuid}/tag.json
settings_copy['application']['tags'] = {}
# Only exclude tags if we've already migrated them to individual files (schema >= 28)
# This ensures update_28 can migrate tags from settings
schema_version = self.__data['settings']['application'].get('schema_version', 0)
if schema_version >= 28:
# Tags are in individual tag.json files, don't save to settings
settings_copy['application']['tags'] = {}
# else: keep tags in settings for update_28 migration
return {
'note': 'Settings file - watches are in {uuid}/watch.json, tags are in {uuid}/tag.json',
'app_guid': self.__data.get('app_guid'),
'app_guid': self.__data['app_guid'],
'settings': settings_copy,
'build_sha': self.__data.get('build_sha'),
'version_tag': self.__data.get('version_tag')
@@ -396,14 +422,15 @@ class ChangeDetectionStore(DatastoreUpdatesMixin, FileSavingDataStore):
Implementation of abstract method from FileSavingDataStore.
Delegates to helper function and stores results in internal data structure.
"""
# Store loaded data
# @note this will also work for the old legacy format because self.__data['watching'] should already have them loaded by this point.
self.__data['watching'].update(load_all_watches(
watching = load_all_watches(
self.datastore_path,
self.rehydrate_entity
))
logger.debug(f"Loaded {len(self.__data['watching'])} watches")
)
# Store loaded data
self.__data['watching'] = watching
logger.debug(f"Loaded {len(watching)} watches")
def _load_tags(self):
"""
@@ -468,63 +495,6 @@ class ChangeDetectionStore(DatastoreUpdatesMixin, FileSavingDataStore):
self.__data['settings']['application']['password'] = False
self.commit()
def clear_all_last_checksums(self):
"""
Delete all last-checksum.txt files to force reprocessing of all watches.
This should be called when global settings change, since watches inherit
configuration and need to reprocess even if their individual watch dict
hasn't been modified.
Note: We delete the checksum file rather than setting was_edited=True because:
- was_edited is not persisted across restarts
- File deletion ensures reprocessing works across app restarts
"""
deleted_count = 0
for uuid in self.__data['watching'].keys():
watch = self.__data['watching'][uuid]
if watch.data_dir:
checksum_file = os.path.join(watch.data_dir, 'last-checksum.txt')
if os.path.isfile(checksum_file):
try:
os.remove(checksum_file)
deleted_count += 1
logger.debug(f"Cleared checksum for watch {uuid}")
except OSError as e:
logger.warning(f"Failed to delete checksum file for {uuid}: {e}")
logger.info(f"Cleared {deleted_count} checksum files to force reprocessing")
return deleted_count
def clear_checksums_for_tag(self, tag_uuid):
"""
Delete last-checksum.txt files for all watches using a specific tag.
This should be called when a tag configuration is edited, since watches
inherit tag settings and need to reprocess.
Args:
tag_uuid: UUID of the tag that was modified
Returns:
int: Number of checksum files deleted
"""
deleted_count = 0
for uuid, watch in self.__data['watching'].items():
if watch.get('tags') and tag_uuid in watch['tags']:
if watch.data_dir:
checksum_file = os.path.join(watch.data_dir, 'last-checksum.txt')
if os.path.isfile(checksum_file):
try:
os.remove(checksum_file)
deleted_count += 1
logger.debug(f"Cleared checksum for watch {uuid} (tag {tag_uuid})")
except OSError as e:
logger.warning(f"Failed to delete checksum file for {uuid}: {e}")
logger.info(f"Cleared {deleted_count} checksum files for tag {tag_uuid}")
return deleted_count
def commit(self):
"""
Save settings immediately to disk using atomic write.
@@ -207,6 +207,15 @@ def save_watch_atomic(watch_dir, uuid, watch_dict):
save_entity_atomic(watch_dir, uuid, watch_dict, "watch.json", "watch", max_size_mb=10)
def save_tag_atomic(tag_dir, uuid, tag_dict):
"""
Save a tag to disk using atomic write pattern.
Convenience wrapper around save_entity_atomic for tags.
Kept for backwards compatibility.
"""
save_entity_atomic(tag_dir, uuid, tag_dict, "tag.json", "tag", max_size_mb=1)
def load_watch_from_file(watch_json, uuid, rehydrate_entity_func):
"""
+66
View File
@@ -0,0 +1,66 @@
"""
Legacy format loader for url-watches.json.
Provides functions to detect and load from the legacy monolithic JSON format.
Used during migration (update_26) to transition to individual watch.json files.
"""
import os
import json
from loguru import logger
# Try to import orjson for faster JSON serialization
try:
import orjson
HAS_ORJSON = True
except ImportError:
HAS_ORJSON = False
def has_legacy_datastore(datastore_path):
"""
Check if a legacy url-watches.json file exists.
This is used by update_26 to determine if migration is needed.
Args:
datastore_path: Path to datastore directory
Returns:
bool: True if url-watches.json exists
"""
url_watches_json = os.path.join(datastore_path, "url-watches.json")
return os.path.exists(url_watches_json)
def load_legacy_format(json_store_path):
"""
Load datastore from legacy url-watches.json format.
Args:
json_store_path: Full path to url-watches.json file
Returns:
dict: Loaded datastore data with 'watching', 'settings', etc.
None: If file doesn't exist or loading failed
"""
logger.info(f"Loading from legacy format: {json_store_path}")
if not os.path.isfile(json_store_path):
logger.warning(f"Legacy file not found: {json_store_path}")
return None
try:
if HAS_ORJSON:
with open(json_store_path, 'rb') as f:
data = orjson.loads(f.read())
else:
with open(json_store_path, 'r', encoding='utf-8') as f:
data = json.load(f)
logger.info(f"Loaded {len(data.get('watching', {}))} watches from legacy format")
return data
except Exception as e:
logger.error(f"Failed to load legacy format: {e}")
return None
+60 -51
View File
@@ -16,18 +16,12 @@ import time
from loguru import logger
from copy import deepcopy
# Try to import orjson for faster JSON serialization
try:
import orjson
HAS_ORJSON = True
except ImportError:
HAS_ORJSON = False
from ..html_tools import TRANSLATE_WHITESPACE_TABLE
from ..processors.restock_diff import Restock
from ..blueprint.rss import RSS_CONTENT_FORMAT_DEFAULT
from ..model import USE_SYSTEM_DEFAULT_NOTIFICATION_FORMAT_FOR_WATCH
from .file_saving_datastore import save_watch_atomic
def create_backup_tarball(datastore_path, update_number):
"""
@@ -103,7 +97,7 @@ def create_backup_tarball(datastore_path, update_number):
tar.add(tag_json, arcname=f"{entry}/tag.json")
tag_count += 1
logger.success(f"Backup created: {backup_filename} ({watch_count} watches from disk, {tag_count} tags from disk)")
logger.success(f"Backup created: {backup_filename} ({watch_count} watches, {tag_count} tags)")
return backup_path
except Exception as e:
@@ -143,7 +137,6 @@ class DatastoreUpdatesMixin:
return updates_available
def run_updates(self, current_schema_version=None):
import sys
"""
Run all pending schema updates sequentially.
@@ -167,23 +160,6 @@ class DatastoreUpdatesMixin:
4. All changes saved via individual .commit() calls
"""
updates_available = self.get_updates_available()
if self.data.get('watching'):
test_watch = self.data['watching'].get(next(iter(self.data.get('watching', {}))))
from ..model.Watch import model
if not isinstance(test_watch, model):
import sys
logger.critical("Cannot run updates! Watch structure must be re-hydrated back to a Watch model object!")
sys.exit(1)
if self.data['settings']['application'].get('tags',{}):
test_tag = self.data['settings']['application'].get('tags',{}).get(next(iter(self.data['settings']['application'].get('tags',{}))))
from ..model.Tag import model as tag_model
if not isinstance(test_tag, tag_model):
import sys
logger.critical("Cannot run updates! Watch tag/group structure must be re-hydrated back to a Tag model object!")
sys.exit(1)
# Determine current schema version
if current_schema_version is None:
@@ -225,9 +201,10 @@ class DatastoreUpdatesMixin:
try:
update_method = getattr(self, f"update_{update_n}")()
except Exception as e:
logger.critical(f"Error while trying update_{update_n}")
logger.exception(e)
sys.exit(1)
logger.error(f"Error while trying update_{update_n}")
logger.error(e)
# Don't run any more updates
return
else:
# Bump the version
self.data['settings']['application']['schema_version'] = update_n
@@ -578,6 +555,27 @@ class DatastoreUpdatesMixin:
logger.critical("COPY-based migration: url-watches.json will remain intact for rollback")
logger.critical("=" * 80)
# Check if already migrated
changedetection_json = os.path.join(self.datastore_path, "changedetection.json")
if os.path.exists(changedetection_json):
logger.info("Migration already completed (changedetection.json exists), skipping")
return
# Check if we need to load legacy data
from .legacy_loader import has_legacy_datastore, load_legacy_format
if not has_legacy_datastore(self.datastore_path):
logger.info("No legacy datastore found, nothing to migrate")
return
# Load legacy data from url-watches.json
logger.critical("Loading legacy datastore from url-watches.json...")
legacy_path = os.path.join(self.datastore_path, "url-watches.json")
legacy_data = load_legacy_format(legacy_path)
if not legacy_data:
raise Exception("Failed to load legacy datastore from url-watches.json")
# Populate settings from legacy data
logger.info("Populating settings from legacy data...")
watch_count = len(self.data['watching'])
@@ -589,7 +587,9 @@ class DatastoreUpdatesMixin:
saved_count = 0
for uuid, watch in self.data['watching'].items():
try:
watch.commit()
watch_dict = dict(watch)
watch_dir = os.path.join(self.datastore_path, uuid)
save_watch_atomic(watch_dir, uuid, watch_dict)
saved_count += 1
if saved_count % 100 == 0:
@@ -635,19 +635,18 @@ class DatastoreUpdatesMixin:
# Phase 4: Verify settings file exists
logger.critical("Phase 4/4: Verifying changedetection.json exists...")
changedetection_json_new_schema=os.path.join(self.datastore_path, "changedetection.json")
if not os.path.isfile(changedetection_json_new_schema):
import sys
logger.critical("Migration failed, changedetection.json not found after update ran!")
sys.exit(1)
if not os.path.isfile(changedetection_json):
raise Exception(
"Migration failed: changedetection.json not found after save. "
"url-watches.json remains intact, safe to retry."
)
logger.critical("Phase 4 complete: Verified changedetection.json exists")
# Success! Now reload from new format
logger.critical("Reloading datastore from new format...")
# write it to disk, it will be saved without ['watching'] in the JSON db because we find it from disk glob
self._save_settings()
self._load_state() # Includes load_watches
logger.success("Datastore reloaded from new format successfully")
logger.critical("=" * 80)
logger.critical("MIGRATION COMPLETED SUCCESSFULLY!")
@@ -669,9 +668,7 @@ class DatastoreUpdatesMixin:
def update_26(self):
self.migrate_legacy_db_format()
# Re-run tag to JSON migration
def update_29(self):
def update_28(self):
"""
Migrate tags to individual tag.json files.
@@ -686,7 +683,7 @@ class DatastoreUpdatesMixin:
"""
logger.critical("=" * 80)
logger.critical("Running migration: Individual tag persistence (update_28)")
logger.critical("Creating individual tag.json files")
logger.critical("Creating individual tag.json files (tags remain in settings too)")
logger.critical("=" * 80)
tags = self.data['settings']['application'].get('tags', {})
@@ -702,12 +699,28 @@ class DatastoreUpdatesMixin:
failed_count = 0
for uuid, tag_data in tags.items():
if os.path.isfile(os.path.join(self.datastore_path, uuid, "tag.json")):
logger.debug(f"Tag {uuid} tag.json exists, skipping")
continue
try:
tag_data.commit()
# Force save as tag.json (not watch.json) even if object is corrupted
from changedetectionio.store.file_saving_datastore import save_entity_atomic
import os
tag_dir = os.path.join(self.datastore_path, uuid)
os.makedirs(tag_dir, exist_ok=True)
# Convert to dict if it's an object
tag_dict = dict(tag_data) if hasattr(tag_data, '__iter__') else tag_data
# Save explicitly as tag.json
save_entity_atomic(
tag_dir,
uuid,
tag_dict,
filename='tag.json',
entity_type='tag',
max_size_mb=1
)
saved_count += 1
if saved_count % 10 == 0:
logger.info(f" Progress: {saved_count}/{tag_count} tags migrated...")
@@ -724,9 +737,5 @@ class DatastoreUpdatesMixin:
# On next load, _load_tags() will read from tag.json files and merge with settings
logger.info("Tags saved to both settings AND individual tag.json files")
logger.info("Future tag edits will update both locations (dual storage)")
logger.critical("=" * 80)
# write it to disk, it will be saved without ['tags'] in the JSON db because we find it from disk glob
# (left this out by accident in previous update, added tags={} in the changedetection.json save_to_disk)
self._save_settings()
logger.critical("=" * 80)
@@ -10,7 +10,6 @@
<li>{{ _('Trigger text is processed from the result-text that comes out of any CSS/JSON Filters for this monitor') }}</li>
<li>{{ _('Each line is processed separately (think of each line as "OR")') }}</li>
<li>{{ _('Note: Wrap in forward slash / to use regex example:') }} <code>/foo\d/</code></li>
<li>{{ _('You can also use')}} <a href="#conditions">{{ _('conditions')}}</a> - {{ _('"Page text" - with Contains, Starts With, Not Contains and many more' ) }} <code>/foo\d/</code></li>
</ul>
</span>
</div>
-13
View File
@@ -331,7 +331,6 @@ def prepare_test_function(live_server, datastore_path):
# Cleanup: Clear watches and queue after test
try:
from changedetectionio.flask_app import update_q
from pathlib import Path
# Clear the queue to prevent leakage to next test
while not update_q.empty():
@@ -341,18 +340,6 @@ def prepare_test_function(live_server, datastore_path):
break
datastore.data['watching'] = {}
# Delete any old watch metadata JSON files
base_path = Path(datastore.datastore_path).resolve()
max_depth = 2
for file in base_path.rglob("*.json"):
# Calculate depth relative to base path
depth = len(file.relative_to(base_path).parts) - 1
if depth <= max_depth and file.is_file():
file.unlink()
except Exception as e:
logger.warning(f"Error during datastore cleanup: {e}")
+5 -444
View File
@@ -328,68 +328,6 @@ def test_api_simple(client, live_server, measure_memory_usage, datastore_path):
)
assert len(res.json) == 0, "Watch list should be empty"
def test_roundtrip_API(client, live_server, measure_memory_usage, datastore_path):
"""
Test the full round trip, this way we test the default Model fits back into OpenAPI spec
:param client:
:param live_server:
:param measure_memory_usage:
:param datastore_path:
:return:
"""
api_key = live_server.app.config['DATASTORE'].data['settings']['application'].get('api_access_token')
set_original_response(datastore_path=datastore_path)
test_url = url_for('test_endpoint', _external=True)
# Create new
res = client.post(
url_for("createwatch"),
data=json.dumps({"url": test_url}),
headers={'content-type': 'application/json', 'x-api-key': api_key},
follow_redirects=True
)
assert res.status_code == 201
uuid = res.json.get('uuid')
# Now fetch it and send it back
res = client.get(
url_for("watch", uuid=uuid),
headers={'x-api-key': api_key}
)
watch=res.json
# Be sure that 'readOnly' values are never updated in the real watch
watch['last_changed'] = 454444444444
watch['date_created'] = 454444444444
# HTTP PUT ( UPDATE an existing watch )
res = client.put(
url_for("watch", uuid=uuid),
headers={'x-api-key': api_key, 'content-type': 'application/json'},
data=json.dumps(watch),
)
if res.status_code != 200:
print(f"\n=== PUT failed with {res.status_code} ===")
print(f"Error: {res.data}")
assert res.status_code == 200, "HTTP PUT update was sent OK"
res = client.get(
url_for("watch", uuid=uuid),
headers={'x-api-key': api_key}
)
last_changed = res.json.get('last_changed')
assert last_changed != 454444444444
assert last_changed != "454444444444"
date_created = res.json.get('date_created')
assert date_created != 454444444444
assert date_created != "454444444444"
def test_access_denied(client, live_server, measure_memory_usage, datastore_path):
# `config_api_token_enabled` Should be On by default
res = client.get(
@@ -463,9 +401,6 @@ def test_api_watch_PUT_update(client, live_server, measure_memory_usage, datasto
follow_redirects=True
)
if res.status_code != 201:
print(f"\n=== POST createwatch failed with {res.status_code} ===")
print(f"Response: {res.data}")
assert res.status_code == 201
wait_for_all_checks(client)
@@ -529,12 +464,11 @@ def test_api_watch_PUT_update(client, live_server, measure_memory_usage, datasto
)
assert res.status_code == 400, "Should get error 400 when we give a field that doesnt exist"
# Backend validation now rejects unknown fields with a clear error message
assert (b'Unknown field' in res.data or
b'Additional properties are not allowed' in res.data or
b'Unevaluated properties are not allowed' in res.data or
# Message will come from `flask_expects_json`
# With patternProperties for processor_config_*, the error message format changed slightly
assert (b'Additional properties are not allowed' in res.data or
b'does not match any of the regexes' in res.data), \
"Should reject unknown fields with validation error"
"Should reject unknown fields with schema validation error"
# Try a XSS URL
@@ -555,7 +489,6 @@ def test_api_import(client, live_server, measure_memory_usage, datastore_path):
api_key = live_server.app.config['DATASTORE'].data['settings']['application'].get('api_access_token')
# Test 1: Basic import with tag
res = client.post(
url_for("import") + "?tag=import-test",
data='https://website1.com\r\nhttps://website2.com',
@@ -574,239 +507,6 @@ def test_api_import(client, live_server, measure_memory_usage, datastore_path):
res = client.get(url_for('tags.tags_overview_page'))
assert b'import-test' in res.data
# Test 2: Import with watch configuration fields (issue #3845)
# Test string field (include_filters), boolean (paused), and processor
import urllib.parse
params = urllib.parse.urlencode({
'tag': 'config-test',
'include_filters': 'div.content',
'paused': 'true',
'processor': 'text_json_diff',
'title': 'Imported with Config'
})
res = client.post(
url_for("import") + "?" + params,
data='https://website3.com',
headers={'x-api-key': api_key},
follow_redirects=True
)
assert res.status_code == 200
assert len(res.json) == 1
uuid = res.json[0]
# Verify the configuration was applied
watch = live_server.app.config['DATASTORE'].data['watching'][uuid]
assert watch['include_filters'] == ['div.content'], "include_filters should be set as array"
assert watch['paused'] == True, "paused should be True"
assert watch['processor'] == 'text_json_diff', "processor should be set"
assert watch['title'] == 'Imported with Config', "title should be set"
# Test 3: Import with array field (notification_urls) - using valid Apprise format
params = urllib.parse.urlencode({
'tag': 'notification-test',
'notification_urls': 'mailto://test@example.com,mailto://admin@example.com'
})
res = client.post(
url_for("import") + "?" + params,
data='https://website4.com',
headers={'x-api-key': api_key},
follow_redirects=True
)
assert res.status_code == 200
uuid = res.json[0]
watch = live_server.app.config['DATASTORE'].data['watching'][uuid]
assert isinstance(watch['notification_urls'], list), "notification_urls must be stored as a list"
assert len(watch['notification_urls']) == 2, "notification_urls should have 2 entries"
assert 'mailto://test@example.com' in watch['notification_urls'], "notification_urls should contain first email"
assert 'mailto://admin@example.com' in watch['notification_urls'], "notification_urls should contain second email"
# Test 4: Import with object field (time_between_check)
import json
time_config = json.dumps({"hours": 2, "minutes": 30})
params = urllib.parse.urlencode({
'tag': 'schedule-test',
'time_between_check': time_config
})
res = client.post(
url_for("import") + "?" + params,
data='https://website5.com',
headers={'x-api-key': api_key},
follow_redirects=True
)
assert res.status_code == 200
uuid = res.json[0]
watch = live_server.app.config['DATASTORE'].data['watching'][uuid]
assert watch['time_between_check']['hours'] == 2, "time_between_check hours should be 2"
assert watch['time_between_check']['minutes'] == 30, "time_between_check minutes should be 30"
# Test 5: Import with invalid processor (should fail)
res = client.post(
url_for("import") + "?processor=invalid_processor",
data='https://website6.com',
headers={'x-api-key': api_key},
follow_redirects=True
)
assert res.status_code == 400, "Should reject invalid processor"
assert b"Invalid processor" in res.data, "Error message should mention invalid processor"
# Test 6: Import with invalid field (should fail)
res = client.post(
url_for("import") + "?unknown_field=value",
data='https://website7.com',
headers={'x-api-key': api_key},
follow_redirects=True
)
assert res.status_code == 400, "Should reject unknown field"
assert b"Unknown watch configuration parameter" in res.data, "Error message should mention unknown parameter"
# Test 7: Import with complex nested array (browser_steps) - array of objects
browser_steps = json.dumps([
{"operation": "wait", "selector": "5", "optional_value": ""},
{"operation": "click", "selector": "button.submit", "optional_value": ""}
])
params = urllib.parse.urlencode({
'tag': 'browser-test',
'browser_steps': browser_steps
})
res = client.post(
url_for("import") + "?" + params,
data='https://website8.com',
headers={'x-api-key': api_key},
follow_redirects=True
)
assert res.status_code == 200, "Should accept browser_steps array"
uuid = res.json[0]
watch = live_server.app.config['DATASTORE'].data['watching'][uuid]
assert len(watch['browser_steps']) == 2, "Should have 2 browser steps"
assert watch['browser_steps'][0]['operation'] == 'wait', "First step should be wait"
assert watch['browser_steps'][1]['operation'] == 'click', "Second step should be click"
assert watch['browser_steps'][1]['selector'] == 'button.submit', "Second step selector should be button.submit"
# Cleanup
delete_all_watches(client)
def test_api_import_small_synchronous(client, live_server, measure_memory_usage, datastore_path):
"""Test that small imports (< threshold) are processed synchronously"""
from changedetectionio.api.Import import IMPORT_SWITCH_TO_BACKGROUND_THRESHOLD
api_key = live_server.app.config['DATASTORE'].data['settings']['application'].get('api_access_token')
# Use local test endpoint to avoid network delays
test_url_base = url_for('test_endpoint', _external=True)
# Create URLs: threshold - 1 to stay under limit
num_urls = min(5, IMPORT_SWITCH_TO_BACKGROUND_THRESHOLD - 1) # Use small number for faster test
urls = '\n'.join([f'{test_url_base}?id=small-{i}' for i in range(num_urls)])
# Import small batch
res = client.post(
url_for("import") + "?tag=small-test",
data=urls,
headers={'x-api-key': api_key},
follow_redirects=True
)
# Should return 200 OK with UUID list (synchronous)
assert res.status_code == 200, f"Should return 200 for small imports, got {res.status_code}"
assert isinstance(res.json, list), "Response should be a list of UUIDs"
assert len(res.json) == num_urls, f"Should return {num_urls} UUIDs, got {len(res.json)}"
# Verify all watches were created immediately
for uuid in res.json:
assert uuid in live_server.app.config['DATASTORE'].data['watching'], \
f"Watch {uuid} should exist immediately after synchronous import"
print(f"\n✓ Successfully created {num_urls} watches synchronously")
def test_api_import_large_background(client, live_server, measure_memory_usage, datastore_path):
"""Test that large imports (>= threshold) are processed in background thread"""
from changedetectionio.api.Import import IMPORT_SWITCH_TO_BACKGROUND_THRESHOLD
import time
api_key = live_server.app.config['DATASTORE'].data['settings']['application'].get('api_access_token')
# Use local test endpoint to avoid network delays
test_url_base = url_for('test_endpoint', _external=True)
# Create URLs: threshold + 10 to trigger background processing
num_urls = IMPORT_SWITCH_TO_BACKGROUND_THRESHOLD + 10
urls = '\n'.join([f'{test_url_base}?id=bulk-{i}' for i in range(num_urls)])
# Import large batch
res = client.post(
url_for("import") + "?tag=bulk-test",
data=urls,
headers={'x-api-key': api_key},
follow_redirects=True
)
# Should return 202 Accepted (background processing)
assert res.status_code == 202, f"Should return 202 for large imports, got {res.status_code}"
assert b"background" in res.data.lower(), "Response should mention background processing"
# Extract expected count from response
response_json = res.json
assert 'count' in response_json, "Response should include count"
assert response_json['count'] == num_urls, f"Count should be {num_urls}, got {response_json['count']}"
# Wait for background thread to complete (with timeout)
max_wait = 10 # seconds
wait_interval = 0.5
elapsed = 0
watches_created = 0
while elapsed < max_wait:
time.sleep(wait_interval)
elapsed += wait_interval
# Count how many watches have been created
watches_created = len([
uuid for uuid, watch in live_server.app.config['DATASTORE'].data['watching'].items()
if 'id=bulk-' in watch['url']
])
if watches_created == num_urls:
break
# Verify all watches were created
assert watches_created == num_urls, \
f"Expected {num_urls} watches to be created, but found {watches_created} after {elapsed}s"
# Verify watches have correct configuration
bulk_watches = [
watch for watch in live_server.app.config['DATASTORE'].data['watching'].values()
if 'id=bulk-' in watch['url']
]
assert len(bulk_watches) == num_urls, "All bulk watches should exist"
# Check that they have the correct tag
datastore = live_server.app.config['DATASTORE']
# Get UUIDs of bulk watches by filtering the datastore keys
bulk_watch_uuids = [
uuid for uuid, watch in live_server.app.config['DATASTORE'].data['watching'].items()
if 'id=bulk-' in watch['url']
]
for watch_uuid in bulk_watch_uuids:
tags = datastore.get_all_tags_for_watch(uuid=watch_uuid)
tag_names = [t['title'] for t in tags.values()]
assert 'bulk-test' in tag_names, f"Watch {watch_uuid} should have 'bulk-test' tag"
print(f"\n✓ Successfully created {num_urls} watches in background (took {elapsed}s)")
def test_api_conflict_UI_password(client, live_server, measure_memory_usage, datastore_path):
@@ -933,9 +633,7 @@ def test_api_url_validation(client, live_server, measure_memory_usage, datastore
)
assert res.status_code == 400, "Updating watch URL to null should fail"
# Accept either OpenAPI validation error or our custom validation error
assert (b'URL cannot be null' in res.data or
b'Validation failed' in res.data or
b'validation error' in res.data.lower())
assert b'URL cannot be null' in res.data or b'OpenAPI validation failed' in res.data or b'validation error' in res.data.lower()
# Test 8: UPDATE to empty string URL should fail
res = client.put(
@@ -1022,140 +720,3 @@ def test_api_url_validation(client, live_server, measure_memory_usage, datastore
headers={'x-api-key': api_key},
)
delete_all_watches(client)
def test_api_time_between_check_validation(client, live_server, measure_memory_usage, datastore_path):
"""
Test that time_between_check validation works correctly:
- When time_between_check_use_default is false, at least one time value must be > 0
- Values must be valid integers
"""
import json
from flask import url_for
api_key = live_server.app.config['DATASTORE'].data['settings']['application'].get('api_access_token')
# Test 1: time_between_check_use_default=false with NO time_between_check should fail
res = client.post(
url_for("createwatch"),
data=json.dumps({
"url": "https://example.com",
"time_between_check_use_default": False
}),
headers={'content-type': 'application/json', 'x-api-key': api_key},
)
assert res.status_code == 400, "Should fail when time_between_check_use_default=false with no time_between_check"
assert b"At least one time interval" in res.data, "Error message should mention time interval requirement"
# Test 2: time_between_check_use_default=false with ALL zeros should fail
res = client.post(
url_for("createwatch"),
data=json.dumps({
"url": "https://example.com",
"time_between_check_use_default": False,
"time_between_check": {
"weeks": 0,
"days": 0,
"hours": 0,
"minutes": 0,
"seconds": 0
}
}),
headers={'content-type': 'application/json', 'x-api-key': api_key},
)
assert res.status_code == 400, "Should fail when all time values are 0"
assert b"At least one time interval" in res.data, "Error message should mention time interval requirement"
# Test 3: time_between_check_use_default=false with NULL values should fail
res = client.post(
url_for("createwatch"),
data=json.dumps({
"url": "https://example.com",
"time_between_check_use_default": False,
"time_between_check": {
"weeks": None,
"days": None,
"hours": None,
"minutes": None,
"seconds": None
}
}),
headers={'content-type': 'application/json', 'x-api-key': api_key},
)
assert res.status_code == 400, "Should fail when all time values are null"
assert b"At least one time interval" in res.data, "Error message should mention time interval requirement"
# Test 4: time_between_check_use_default=false with valid hours should succeed
res = client.post(
url_for("createwatch"),
data=json.dumps({
"url": "https://example.com",
"time_between_check_use_default": False,
"time_between_check": {
"hours": 2
}
}),
headers={'content-type': 'application/json', 'x-api-key': api_key},
)
assert res.status_code == 201, "Should succeed with valid hours value"
uuid1 = res.json.get('uuid')
# Test 5: time_between_check_use_default=false with valid minutes should succeed
res = client.post(
url_for("createwatch"),
data=json.dumps({
"url": "https://example2.com",
"time_between_check_use_default": False,
"time_between_check": {
"minutes": 30
}
}),
headers={'content-type': 'application/json', 'x-api-key': api_key},
)
assert res.status_code == 201, "Should succeed with valid minutes value"
uuid2 = res.json.get('uuid')
# Test 6: time_between_check_use_default=true (or missing) with no time_between_check should succeed (uses defaults)
res = client.post(
url_for("createwatch"),
data=json.dumps({
"url": "https://example3.com",
"time_between_check_use_default": True
}),
headers={'content-type': 'application/json', 'x-api-key': api_key},
)
assert res.status_code == 201, "Should succeed when using default settings"
uuid3 = res.json.get('uuid')
# Test 7: Default behavior (no time_between_check_use_default field) should use defaults and succeed
res = client.post(
url_for("createwatch"),
data=json.dumps({
"url": "https://example4.com"
}),
headers={'content-type': 'application/json', 'x-api-key': api_key},
)
assert res.status_code == 201, "Should succeed with default behavior (using global settings)"
uuid4 = res.json.get('uuid')
# Test 8: Verify integer type validation - string should fail (OpenAPI validation)
res = client.post(
url_for("createwatch"),
data=json.dumps({
"url": "https://example5.com",
"time_between_check_use_default": False,
"time_between_check": {
"hours": "not_a_number"
}
}),
headers={'content-type': 'application/json', 'x-api-key': api_key},
)
assert res.status_code == 400, "Should fail when time value is not an integer"
assert b"Validation failed" in res.data or b"not of type" in res.data, "Should mention validation/type error"
# Cleanup
for uuid in [uuid1, uuid2, uuid3, uuid4]:
client.delete(
url_for("watch", uuid=uuid),
headers={'x-api-key': api_key},
)
@@ -107,7 +107,7 @@ def test_watch_notification_urls_validation(client, live_server, measure_memory_
headers={'content-type': 'application/json', 'x-api-key': api_key}
)
assert res.status_code == 400, "Should reject non-list notification_urls"
assert b"Validation failed" in res.data or b"is not of type" in res.data
assert b"OpenAPI validation failed" in res.data or b"Request body validation error" in res.data
# Test 6: Verify original URLs are preserved after failed update
res = client.get(
@@ -159,7 +159,7 @@ def test_tag_notification_urls_validation(client, live_server, measure_memory_us
headers={'content-type': 'application/json', 'x-api-key': api_key}
)
assert res.status_code == 400, "Should reject non-list notification_urls"
assert b"Validation failed" in res.data or b"is not of type" in res.data
assert b"OpenAPI validation failed" in res.data or b"Request body validation error" in res.data
# Test 4: Verify original URLs are preserved after failed update
tag = datastore.data['settings']['application']['tags'][tag_uuid]
+10 -19
View File
@@ -9,7 +9,7 @@ by testing various scenarios that should trigger validation errors.
import time
import json
from flask import url_for
from .util import live_server_setup, wait_for_all_checks, delete_all_watches
from .util import live_server_setup, wait_for_all_checks
def test_openapi_validation_invalid_content_type_on_create_watch(client, live_server, measure_memory_usage, datastore_path):
@@ -26,8 +26,7 @@ def test_openapi_validation_invalid_content_type_on_create_watch(client, live_se
# Should get 400 error due to OpenAPI validation failure
assert res.status_code == 400, f"Expected 400 but got {res.status_code}"
assert b"Validation failed" in res.data, "Should contain validation error message"
delete_all_watches(client)
assert b"OpenAPI validation failed" in res.data, "Should contain OpenAPI validation error message"
def test_openapi_validation_missing_required_field_create_watch(client, live_server, measure_memory_usage, datastore_path):
@@ -44,8 +43,7 @@ def test_openapi_validation_missing_required_field_create_watch(client, live_ser
# Should get 400 error due to missing required field
assert res.status_code == 400, f"Expected 400 but got {res.status_code}"
assert b"Validation failed" in res.data, "Should contain validation error message"
delete_all_watches(client)
assert b"OpenAPI validation failed" in res.data, "Should contain OpenAPI validation error message"
def test_openapi_validation_invalid_field_in_request_body(client, live_server, measure_memory_usage, datastore_path):
@@ -82,10 +80,10 @@ def test_openapi_validation_invalid_field_in_request_body(client, live_server, m
# Should get 400 error due to invalid field (this will be caught by internal validation)
# Note: This tests the flow where OpenAPI validation passes but internal validation catches it
assert res.status_code == 400, f"Expected 400 but got {res.status_code}"
# Backend validation now returns "Unknown field(s):" message
assert b"Unknown field" in res.data, \
"Should contain validation error about unknown fields"
delete_all_watches(client)
# With patternProperties for processor_config_*, the error message format changed slightly
assert (b"Additional properties are not allowed" in res.data or
b"does not match any of the regexes" in res.data), \
"Should contain validation error about additional/invalid properties"
def test_openapi_validation_import_wrong_content_type(client, live_server, measure_memory_usage, datastore_path):
@@ -102,8 +100,7 @@ def test_openapi_validation_import_wrong_content_type(client, live_server, measu
# Should get 400 error due to content-type mismatch
assert res.status_code == 400, f"Expected 400 but got {res.status_code}"
assert b"Validation failed" in res.data, "Should contain validation error message"
delete_all_watches(client)
assert b"OpenAPI validation failed" in res.data, "Should contain OpenAPI validation error message"
def test_openapi_validation_import_correct_content_type_succeeds(client, live_server, measure_memory_usage, datastore_path):
@@ -121,7 +118,6 @@ def test_openapi_validation_import_correct_content_type_succeeds(client, live_se
# Should succeed
assert res.status_code == 200, f"Expected 200 but got {res.status_code}"
assert len(res.json) == 2, "Should import 2 URLs"
delete_all_watches(client)
def test_openapi_validation_get_requests_bypass_validation(client, live_server, measure_memory_usage, datastore_path):
@@ -146,7 +142,6 @@ def test_openapi_validation_get_requests_bypass_validation(client, live_server,
# Should return JSON with watch list (empty in this case)
assert isinstance(res.json, dict), "Should return JSON dictionary for watch list"
delete_all_watches(client)
def test_openapi_validation_create_tag_missing_required_title(client, live_server, measure_memory_usage, datastore_path):
@@ -163,14 +158,11 @@ def test_openapi_validation_create_tag_missing_required_title(client, live_serve
# Should get 400 error due to missing required field
assert res.status_code == 400, f"Expected 400 but got {res.status_code}"
assert b"Validation failed" in res.data, "Should contain validation error message"
delete_all_watches(client)
assert b"OpenAPI validation failed" in res.data, "Should contain OpenAPI validation error message"
def test_openapi_validation_watch_update_allows_partial_updates(client, live_server, measure_memory_usage, datastore_path):
"""Test that watch updates allow partial updates without requiring all fields (positive test)."""
#xxx
api_key = live_server.app.config['DATASTORE'].data['settings']['application'].get('api_access_token')
# First create a valid watch
@@ -207,5 +199,4 @@ def test_openapi_validation_watch_update_allows_partial_updates(client, live_ser
)
assert res.status_code == 200
assert res.json.get('title') == 'Updated Title Only', "Title should be updated"
assert res.json.get('url') == 'https://example.com', "URL should remain unchanged"
delete_all_watches(client)
assert res.json.get('url') == 'https://example.com', "URL should remain unchanged"
-53
View File
@@ -176,57 +176,4 @@ def test_api_tags_listing(client, live_server, measure_memory_usage, datastore_p
assert res.status_code == 204
def test_roundtrip_API(client, live_server, measure_memory_usage, datastore_path):
"""
Test the full round trip, this way we test the default Model fits back into OpenAPI spec
:param client:
:param live_server:
:param measure_memory_usage:
:param datastore_path:
:return:
"""
api_key = live_server.app.config['DATASTORE'].data['settings']['application'].get('api_access_token')
set_original_response(datastore_path=datastore_path)
res = client.post(
url_for("tag"),
data=json.dumps({"title": "My tag title"}),
headers={'content-type': 'application/json', 'x-api-key': api_key}
)
assert res.status_code == 201
uuid = res.json.get('uuid')
# Now fetch it and send it back
res = client.get(
url_for("tag", uuid=uuid),
headers={'x-api-key': api_key}
)
tag = res.json
# Only test with date_created (readOnly field that should be filtered out)
# last_changed is Watch-specific and doesn't apply to Tags
tag['date_created'] = 454444444444
# HTTP PUT ( UPDATE an existing watch )
res = client.put(
url_for("tag", uuid=uuid),
headers={'x-api-key': api_key, 'content-type': 'application/json'},
data=json.dumps(tag),
)
if res.status_code != 200:
print(f"\n=== PUT failed with {res.status_code} ===")
print(f"Error: {res.data}")
assert res.status_code == 200, "HTTP PUT update was sent OK"
# Verify readOnly fields like date_created cannot be overridden
res = client.get(
url_for("tag", uuid=uuid),
headers={'x-api-key': api_key}
)
date_created = res.json.get('date_created')
assert date_created != 454444444444, "ReadOnly date_created should not be updateable"
assert date_created != "454444444444", "ReadOnly date_created should not be updateable"
+2
View File
@@ -6,6 +6,8 @@ from flask import url_for
from .util import set_original_response, set_modified_response, live_server_setup, wait_for_all_checks, extract_rss_token_from_UI, \
extract_UUID_from_client, delete_all_watches
sleep_time_for_fetch_thread = 3
# Basic test to check inscriptus is not adding return line chars, basically works etc
def test_inscriptus():
+7 -128
View File
@@ -6,10 +6,11 @@ import io
from zipfile import ZipFile
import re
import time
from changedetectionio.model import Watch, Tag
def test_backup(client, live_server, measure_memory_usage, datastore_path):
# live_server_setup(live_server) # Setup on conftest per function
set_original_response(datastore_path=datastore_path)
@@ -31,7 +32,7 @@ def test_backup(client, live_server, measure_memory_usage, datastore_path):
time.sleep(4)
res = client.get(
url_for("backups.create"),
url_for("backups.index"),
follow_redirects=True
)
# Can see the download link to the backup
@@ -53,11 +54,11 @@ def test_backup(client, live_server, measure_memory_usage, datastore_path):
backup = ZipFile(io.BytesIO(res.data))
l = backup.namelist()
# Check for UUID-based txt files (history, snapshot, and last-checksum)
# Check for UUID-based txt files (history and snapshot)
uuid4hex_txt = re.compile('^[a-f0-9]{8}-?[a-f0-9]{4}-?4[a-f0-9]{3}-?[89ab][a-f0-9]{3}-?[a-f0-9]{12}.*txt', re.I)
txt_files = list(filter(uuid4hex_txt.match, l))
# Should be three txt files in the archive (history, snapshot, and last-checksum)
assert len(txt_files) == 3
# Should be two txt files in the archive (history and the snapshot)
assert len(txt_files) == 2
# Check for watch.json files (new format)
uuid4hex_json = re.compile('^[a-f0-9]{8}-?[a-f0-9]{4}-?4[a-f0-9]{3}-?[89ab][a-f0-9]{3}-?[a-f0-9]{12}/watch\.json$', re.I)
@@ -74,126 +75,4 @@ def test_backup(client, live_server, measure_memory_usage, datastore_path):
follow_redirects=True
)
assert b'No backups found.' in res.data
def test_watch_data_package_download(client, live_server, measure_memory_usage, datastore_path):
"""Test downloading a single watch's data as a zip package"""
set_original_response(datastore_path=datastore_path)
uuid = client.application.config.get('DATASTORE').add_watch(url=url_for('test_endpoint', _external=True))
tag_uuid = client.application.config.get('DATASTORE').add_tag(title="Tasty backup tag")
tag_uuid2 = client.application.config.get('DATASTORE').add_tag(title="Tasty backup tag number two")
client.get(url_for("ui.form_watch_checknow"), follow_redirects=True)
wait_for_all_checks(client)
# Download the watch data package
res = client.get(url_for("ui.ui_edit.watch_get_data_package", uuid=uuid))
# Should get the right zip content type
assert res.content_type == "application/zip"
# Should be PK/ZIP stream (PKzip header)
assert res.data[:2] == b'PK', "File should start with PK (PKzip header)"
assert res.data.count(b'PK') >= 2, "Should have multiple PK markers (zip file structure)"
# Verify zip contents
backup = ZipFile(io.BytesIO(res.data))
files = backup.namelist()
# Should have files in a UUID directory
assert any(uuid in f for f in files), f"Files should be in UUID directory: {files}"
# Should contain watch.json
watch_json_path = f"{uuid}/watch.json"
assert watch_json_path in files, f"Should contain watch.json, got: {files}"
# Should contain history/snapshot files
uuid4hex_txt = re.compile(f'^{re.escape(uuid)}/.*\\.txt', re.I)
txt_files = list(filter(uuid4hex_txt.match, files))
assert len(txt_files) > 0, f"Should have at least one .txt file (history/snapshot), got: {files}"
def test_backup_restore(client, live_server, measure_memory_usage, datastore_path):
"""Test that a full backup zip can be restored — watches and tags survive a round-trip."""
set_original_response(datastore_path=datastore_path)
datastore = live_server.app.config['DATASTORE']
watch_url = url_for('test_endpoint', _external=True)
# Set up: one watch and two tags
uuid = datastore.add_watch(url=watch_url)
tag_uuid = datastore.add_tag(title="Tasty backup tag")
tag_uuid2 = datastore.add_tag(title="Tasty backup tag number two")
client.get(url_for("ui.form_watch_checknow"), follow_redirects=True)
wait_for_all_checks(client)
# Create a full backup
client.get(url_for("backups.request_backup"), follow_redirects=True)
time.sleep(4)
# Download the latest backup zip
res = client.get(url_for("backups.download_backup", filename="latest"), follow_redirects=True)
assert res.content_type == "application/zip"
zip_data = res.data
# Confirm the zip contains both watch.json and tag.json entries
backup = ZipFile(io.BytesIO(zip_data))
names = backup.namelist()
assert f"{uuid}/watch.json" in names, f"watch.json missing from backup: {names}"
assert f"{tag_uuid}/tag.json" in names, f"tag.json for tag 1 missing from backup: {names}"
assert f"{tag_uuid2}/tag.json" in names, f"tag.json for tag 2 missing from backup: {names}"
# --- Wipe everything ---
datastore.delete('all')
client.get(url_for("tags.delete_all"), follow_redirects=True)
assert uuid not in datastore.data['watching'], "Watch should be gone after delete"
assert tag_uuid not in datastore.data['settings']['application']['tags'], "Tag 1 should be gone after delete"
assert tag_uuid2 not in datastore.data['settings']['application']['tags'], "Tag 2 should be gone after delete"
# --- Restore from the backup zip ---
res = client.post(
url_for("backups.restore.backups_restore_start"),
data={
'zip_file': (io.BytesIO(zip_data), 'backup.zip'),
'include_groups': 'y',
'include_groups_replace_existing': 'y',
'include_watches': 'y',
'include_watches_replace_existing': 'y',
},
content_type='multipart/form-data',
follow_redirects=True
)
assert res.status_code == 200
# Wait for the thread to finish
time.sleep(2)
# --- Watch checks ---
restored_watch = datastore.data['watching'].get(uuid)
assert restored_watch is not None, f"Watch {uuid} not found after restore"
assert restored_watch['url'] == watch_url, "Restored watch URL does not match"
assert isinstance(restored_watch, Watch.model), \
f"Watch not properly rehydrated, got {type(restored_watch)}"
assert restored_watch.history_n >= 1, \
f"Restored watch should have at least 1 history entry, got {restored_watch.history_n}"
# --- Tag checks ---
restored_tags = datastore.data['settings']['application']['tags']
restored_tag = restored_tags.get(tag_uuid)
assert restored_tag is not None, f"Tag {tag_uuid} not found after restore"
assert restored_tag['title'] == "Tasty backup tag", "Restored tag 1 title does not match"
assert isinstance(restored_tag, Tag.model), \
f"Tag 1 not properly rehydrated, got {type(restored_tag)}"
restored_tag2 = restored_tags.get(tag_uuid2)
assert restored_tag2 is not None, f"Tag {tag_uuid2} not found after restore"
assert restored_tag2['title'] == "Tasty backup tag number two", "Restored tag 2 title does not match"
assert isinstance(restored_tag2, Tag.model), \
f"Tag 2 not properly rehydrated, got {type(restored_tag2)}"
assert b'No backups found.' in res.data
+9 -6
View File
@@ -71,19 +71,22 @@ def test_include_filters_output():
# Tests the whole stack works with the CSS Filter
def test_check_markup_include_filters_restriction(client, live_server, measure_memory_usage, datastore_path):
sleep_time_for_fetch_thread = 3
include_filters = "#sametext"
set_original_response(datastore_path=datastore_path)
# Give the endpoint time to spin up
time.sleep(1)
# Add our URL to the import page
test_url = url_for('test_endpoint', _external=True)
uuid = client.application.config.get('DATASTORE').add_watch(url=test_url)
client.get(url_for("ui.form_watch_checknow"), follow_redirects=True)
wait_for_all_checks(client)
# Give the thread time to pick it up
time.sleep(sleep_time_for_fetch_thread)
# Goto the edit page, add our ignore text
# Add our URL to the import page
@@ -100,15 +103,15 @@ def test_check_markup_include_filters_restriction(client, live_server, measure_m
)
assert bytes(include_filters.encode('utf-8')) in res.data
wait_for_all_checks(client)
# Give the thread time to pick it up
time.sleep(sleep_time_for_fetch_thread)
# Make a change
set_modified_response(datastore_path=datastore_path)
# Trigger a check
client.get(url_for("ui.form_watch_checknow"), follow_redirects=True)
wait_for_all_checks(client)
# Give the thread time to pick it up
time.sleep(sleep_time_for_fetch_thread)
# It should have 'has-unread-changes' still
# Because it should be looking at only that 'sametext' id
@@ -6,6 +6,10 @@ from urllib.request import urlopen
from .util import set_original_response, set_modified_response, live_server_setup, wait_for_all_checks
import os
sleep_time_for_fetch_thread = 3
def test_check_extract_text_from_diff(client, live_server, measure_memory_usage, datastore_path):
import time
with open(os.path.join(datastore_path, "endpoint-content.txt"), "w") as f:
+4 -2
View File
@@ -5,8 +5,6 @@ from flask import url_for
from .util import live_server_setup, wait_for_all_checks, extract_rss_token_from_UI, get_UUID_for_tag_name, extract_UUID_from_client, delete_all_watches
import os
from ..store import ChangeDetectionStore
# def test_setup(client, live_server, measure_memory_usage, datastore_path):
# live_server_setup(live_server) # Setup on conftest per function
@@ -489,6 +487,7 @@ def test_tag_json_persistence(client, live_server, measure_memory_usage, datasto
- Tag deletion removes tag.json file
"""
import json
from changedetectionio.store import ChangeDetectionStore
datastore = client.application.config.get('DATASTORE')
@@ -570,6 +569,9 @@ def test_tag_json_migration_update_27(client, live_server, measure_memory_usage,
This simulates a pre-update_27 datastore and verifies migration works.
"""
import json
from changedetectionio.store import ChangeDetectionStore
datastore = client.application.config.get('DATASTORE')
# 1. Create multiple tags
tag_names = ['migration-tag-1', 'migration-tag-2', 'migration-tag-3']
@@ -106,7 +106,7 @@ def test_consistent_history(client, live_server, measure_memory_usage, datastore
# Find the snapshot one
for fname in files_in_watch_dir:
if fname != 'history.txt' and fname != 'watch.json' and fname != 'last-checksum.txt' and 'html' not in fname:
if fname != 'history.txt' and fname != 'watch.json' and 'html' not in fname:
if strtobool(os.getenv("TEST_WITH_BROTLI")):
assert fname.endswith('.br'), "Forced TEST_WITH_BROTLI then it should be a .br filename"
@@ -123,18 +123,11 @@ def test_consistent_history(client, live_server, measure_memory_usage, datastore
assert json_obj['watching'][w]['title'], "Watch should have a title set"
assert contents.startswith(watch_title + "x"), f"Snapshot contents in file {fname} should start with '{watch_title}x', got '{contents}'"
# With new format, we have watch.json, so 4 files minimum
# Note: last-checksum.txt may or may not exist - it gets cleared by settings changes,
# and this test changes settings before checking files
# This assertion should be AFTER the loop, not inside it
# With new format, we also have watch.json, so 4 files total
if os.path.exists(changedetection_json):
# 4 required files: watch.json, html.br, history.txt, extracted text snapshot
# last-checksum.txt is optional (cleared by settings changes in this test)
assert len(files_in_watch_dir) >= 4 and len(files_in_watch_dir) <= 5, f"Should be 4-5 files in the dir with new format (last-checksum.txt is optional). Found {len(files_in_watch_dir)}: {files_in_watch_dir}"
assert len(files_in_watch_dir) == 4, "Should be four files in the dir with new format: watch.json, html.br snapshot, history.txt and the extracted text snapshot"
else:
# 3 required files: html.br, history.txt, extracted text snapshot
# last-checksum.txt is optional
assert len(files_in_watch_dir) >= 3 and len(files_in_watch_dir) <= 4, f"Should be 3-4 files in the dir with legacy format (last-checksum.txt is optional). Found {len(files_in_watch_dir)}: {files_in_watch_dir}"
assert len(files_in_watch_dir) == 3, "Should be just three files in the dir with legacy format: html.br snapshot, history.txt and the extracted text snapshot"
# Check that 'default' Watch vars aren't accidentally being saved
if os.path.exists(changedetection_json):
@@ -41,6 +41,7 @@ def set_modified_ignore_response(datastore_path):
def test_render_anchor_tag_content_true(client, live_server, measure_memory_usage, datastore_path):
"""Testing that the link changes are detected when
render_anchor_tag_content setting is set to true"""
sleep_time_for_fetch_thread = 3
# Give the endpoint time to spin up
time.sleep(1)
@@ -100,6 +100,7 @@ def test_normal_page_check_works_with_ignore_status_code(client, live_server, me
# Tests the whole stack works with staus codes ignored
def test_403_page_check_works_with_ignore_status_code(client, live_server, measure_memory_usage, datastore_path):
sleep_time_for_fetch_thread = 3
set_original_response(datastore_path=datastore_path)
@@ -111,7 +112,8 @@ def test_403_page_check_works_with_ignore_status_code(client, live_server, measu
uuid = client.application.config.get('DATASTORE').add_watch(url=test_url)
client.get(url_for("ui.form_watch_checknow"), follow_redirects=True)
wait_for_all_checks(client)
# Give the thread time to pick it up
time.sleep(sleep_time_for_fetch_thread)
# Goto the edit page, check our ignore option
# Add our URL to the import page
@@ -2,9 +2,10 @@
import time
from flask import url_for
from . util import live_server_setup
import os
from .util import live_server_setup, delete_all_watches, wait_for_all_checks
# Should be the same as set_original_ignore_response(datastore_path=datastore_path) but with a little more whitespacing
@@ -49,7 +50,10 @@ def set_original_ignore_response(datastore_path):
# If there was only a change in the whitespacing, then we shouldnt have a change detected
def test_check_ignore_whitespace(client, live_server, measure_memory_usage, datastore_path):
sleep_time_for_fetch_thread = 3
# Give the endpoint time to spin up
time.sleep(1)
set_original_ignore_response(datastore_path=datastore_path)
@@ -70,17 +74,17 @@ def test_check_ignore_whitespace(client, live_server, measure_memory_usage, data
uuid = client.application.config.get('DATASTORE').add_watch(url=test_url)
client.get(url_for("ui.form_watch_checknow"), follow_redirects=True)
wait_for_all_checks(client)
time.sleep(sleep_time_for_fetch_thread)
# Trigger a check
client.get(url_for("ui.form_watch_checknow"), follow_redirects=True)
set_original_ignore_response_but_with_whitespace(datastore_path)
wait_for_all_checks(client)
time.sleep(sleep_time_for_fetch_thread)
# Trigger a check
client.get(url_for("ui.form_watch_checknow"), follow_redirects=True)
# Give the thread time to pick it up
wait_for_all_checks(client)
time.sleep(sleep_time_for_fetch_thread)
# It should report nothing found (no new 'has-unread-changes' class)
res = client.get(url_for("watchlist.index"))
-101
View File
@@ -24,30 +24,6 @@ def set_original_response(datastore_path):
f.write(test_return_data)
return None
def test_favicon(client, live_server, measure_memory_usage, datastore_path):
# Attempt to fetch it, make sure that works
SVG_BASE64 = 'PHN2ZyB4bWxucz0iaHR0cDovL3d3dy53My5vcmcvMjAwMC9zdmciIHZpZXdCb3g9IjAgMCAxIDEiLz4='
uuid = client.application.config.get('DATASTORE').add_watch(url='https://localhost')
live_server.app.config['DATASTORE'].data['watching'][uuid].bump_favicon(url="favicon-set-type.svg",
favicon_base_64=SVG_BASE64
)
res = client.get(url_for('static_content', group='favicon', filename=uuid))
assert res.status_code == 200
assert len(res.data) > 10
res = client.get(url_for('static_content', group='..', filename='__init__.py'))
assert res.status_code != 200
res = client.get(url_for('static_content', group='.', filename='../__init__.py'))
assert res.status_code != 200
# Traverse by filename protection
res = client.get(url_for('static_content', group='js', filename='../styles/styles.css'))
assert res.status_code != 200
def test_bad_access(client, live_server, measure_memory_usage, datastore_path):
res = client.post(
@@ -502,80 +478,3 @@ def test_logout_with_redirect(client, live_server, measure_memory_usage, datasto
# Cleanup
del client.application.config['DATASTORE'].data['settings']['application']['password']
def test_static_directory_traversal(client, live_server, measure_memory_usage, datastore_path):
"""
Test that the static file serving route properly blocks directory traversal attempts.
This tests the fix for GHSA-9jj8-v89v-xjvw (CVE pending).
The vulnerability was in /static/<group>/<filename> where the sanitization regex
allowed dots, enabling "../" traversal to read application source files.
The fix changed the regex from r'[^\w.-]+' to r'[^a-z0-9_]+' which blocks dots.
"""
# Test 1: Direct .. traversal attempt (URL-encoded)
res = client.get(
"/static/%2e%2e/flask_app.py",
follow_redirects=False
)
# Should be blocked (404 or 403)
assert res.status_code in [404, 403], f"Expected 404/403, got {res.status_code}"
# Should NOT contain application source code
assert b"def static_content" not in res.data
assert b"changedetection_app" not in res.data
# Test 2: Direct .. traversal attempt (unencoded)
res = client.get(
"/static/../flask_app.py",
follow_redirects=False
)
assert res.status_code in [404, 403], f"Expected 404/403, got {res.status_code}"
assert b"def static_content" not in res.data
# Test 3: Multiple dots traversal
res = client.get(
"/static/..../flask_app.py",
follow_redirects=False
)
assert res.status_code in [404, 403], f"Expected 404/403, got {res.status_code}"
assert b"def static_content" not in res.data
# Test 4: Try to access other application files
for filename in ["__init__.py", "datastore.py", "store.py"]:
res = client.get(
f"/static/%2e%2e/{filename}",
follow_redirects=False
)
assert res.status_code in [404, 403], f"File {filename} should be blocked"
# Should not contain Python code indicators
assert b"import" not in res.data or b"# Test" in res.data # Allow "1 Imported" etc
# Test 5: Verify legitimate static files still work
# Note: We can't test actual files without knowing what exists,
# but we can verify the sanitization doesn't break valid groups
res = client.get(
"/static/images/test.png", # Will 404 if file doesn't exist, but won't traverse
follow_redirects=False
)
# Should get 404 (file not found) not 403 (blocked)
# This confirms the group name "images" is valid
assert res.status_code == 404
# Test 6: Ensure hyphens and dots are blocked in group names
res = client.get(
"/static/../../../etc/passwd",
follow_redirects=False
)
assert res.status_code in [404, 403]
assert b"root:" not in res.data
# Test 7: Test that underscores still work (they're allowed)
res = client.get(
"/static/visual_selector_data/test.json",
follow_redirects=False
)
# visual_selector_data is a real group, but requires auth
# Should get 403 (not authenticated) or 404 (file not found), not a path traversal
assert res.status_code in [403, 404]
@@ -1,208 +0,0 @@
#!/usr/bin/env python3
"""
Test that changing global settings or tag configurations forces reprocessing.
When settings or tag configurations change, all affected watches need to
reprocess even if their content hasn't changed, because configuration affects
the processing result.
"""
import os
import time
from flask import url_for
from .util import wait_for_all_checks
def test_settings_change_forces_reprocess(client, live_server, measure_memory_usage, datastore_path):
"""
Test that changing global settings clears all checksums to force reprocessing.
"""
# Setup test content
test_html = """<html>
<body>
<p>Test content that stays the same</p>
</body>
</html>
"""
with open(os.path.join(datastore_path, "endpoint-content.txt"), "w") as f:
f.write(test_html)
test_url = url_for('test_endpoint', _external=True)
# Add two watches
datastore = client.application.config.get('DATASTORE')
uuid1 = datastore.add_watch(url=test_url, extras={'title': 'Watch 1'})
uuid2 = datastore.add_watch(url=test_url, extras={'title': 'Watch 2'})
# Unpause watches
datastore.data['watching'][uuid1]['paused'] = False
datastore.data['watching'][uuid2]['paused'] = False
# First check - establishes baseline
client.get(url_for("ui.form_watch_checknow"), follow_redirects=True)
wait_for_all_checks(client)
# Verify checksum files were created
checksum1 = os.path.join(datastore_path, uuid1, 'last-checksum.txt')
checksum2 = os.path.join(datastore_path, uuid2, 'last-checksum.txt')
assert os.path.isfile(checksum1), "First check should create checksum file for watch 1"
assert os.path.isfile(checksum2), "First check should create checksum file for watch 2"
# Change global settings (any setting will do)
res = client.post(
url_for("settings.settings_page"),
data={
"application-empty_pages_are_a_change": "",
"requests-time_between_check-minutes": 180,
'application-fetch_backend': "html_requests"
},
follow_redirects=True
)
assert b"Settings updated." in res.data
# Give it a moment to process
time.sleep(0.5)
# Verify ALL checksum files were deleted
assert not os.path.isfile(checksum1), "Settings change should delete checksum for watch 1"
assert not os.path.isfile(checksum2), "Settings change should delete checksum for watch 2"
# Next check should reprocess (not skip) and recreate checksums
client.get(url_for("ui.form_watch_checknow"), follow_redirects=True)
wait_for_all_checks(client)
# Verify checksum files were recreated
assert os.path.isfile(checksum1), "Reprocessing should recreate checksum file for watch 1"
assert os.path.isfile(checksum2), "Reprocessing should recreate checksum file for watch 2"
print("✓ Settings change forces reprocessing of all watches")
def test_tag_change_forces_reprocess(client, live_server, measure_memory_usage, datastore_path):
"""
Test that changing a tag configuration clears checksums only for watches with that tag.
"""
# Setup test content
test_html = """<html>
<body>
<p>Test content that stays the same</p>
</body>
</html>
"""
with open(os.path.join(datastore_path, "endpoint-content.txt"), "w") as f:
f.write(test_html)
test_url = url_for('test_endpoint', _external=True)
# Create a tag
datastore = client.application.config.get('DATASTORE')
tag_uuid = datastore.add_tag('Test Tag')
# Add watches - one with tag, one without
uuid_with_tag = datastore.add_watch(url=test_url, extras={'title': 'Watch With Tag', 'tags': [tag_uuid]})
uuid_without_tag = datastore.add_watch(url=test_url, extras={'title': 'Watch Without Tag'})
# Unpause watches
datastore.data['watching'][uuid_with_tag]['paused'] = False
datastore.data['watching'][uuid_without_tag]['paused'] = False
# First check - establishes baseline
client.get(url_for("ui.form_watch_checknow"), follow_redirects=True)
wait_for_all_checks(client)
# Verify checksum files were created
checksum_with = os.path.join(datastore_path, uuid_with_tag, 'last-checksum.txt')
checksum_without = os.path.join(datastore_path, uuid_without_tag, 'last-checksum.txt')
assert os.path.isfile(checksum_with), "First check should create checksum for tagged watch"
assert os.path.isfile(checksum_without), "First check should create checksum for untagged watch"
# Edit the tag (change notification_muted as an example)
tag = datastore.data['settings']['application']['tags'][tag_uuid]
res = client.post(
url_for("tags.form_tag_edit_submit", uuid=tag_uuid),
data={
'title': 'Test Tag',
'notification_muted': 'y',
'overrides_watch': 'n'
},
follow_redirects=True
)
assert b"Updated" in res.data
# Give it a moment to process
time.sleep(0.5)
# Verify ONLY the tagged watch's checksum was deleted
assert not os.path.isfile(checksum_with), "Tag change should delete checksum for watch WITH tag"
assert os.path.isfile(checksum_without), "Tag change should NOT delete checksum for watch WITHOUT tag"
# Next check should reprocess tagged watch and recreate its checksum
client.get(url_for("ui.form_watch_checknow"), follow_redirects=True)
wait_for_all_checks(client)
# Verify tagged watch's checksum was recreated
assert os.path.isfile(checksum_with), "Reprocessing should recreate checksum for tagged watch"
assert os.path.isfile(checksum_without), "Untagged watch should still have its checksum"
print("✓ Tag change forces reprocessing only for watches with that tag")
def test_tag_change_via_api_forces_reprocess(client, live_server, measure_memory_usage, datastore_path):
"""
Test that updating a tag via API also clears checksums for affected watches.
"""
# Setup test content
test_html = """<html>
<body>
<p>Test content</p>
</body>
</html>
"""
with open(os.path.join(datastore_path, "endpoint-content.txt"), "w") as f:
f.write(test_html)
test_url = url_for('test_endpoint', _external=True)
# Create a tag
datastore = client.application.config.get('DATASTORE')
tag_uuid = datastore.add_tag('API Test Tag')
# Add watch with tag
uuid_with_tag = datastore.add_watch(url=test_url, extras={'title': 'API Watch'})
datastore.data['watching'][uuid_with_tag]['paused'] = False
datastore.data['watching'][uuid_with_tag]['tags'] = [tag_uuid]
datastore.data['watching'][uuid_with_tag].commit()
# First check
client.get(url_for("ui.form_watch_checknow"), follow_redirects=True)
wait_for_all_checks(client)
# Verify checksum exists
checksum_file = os.path.join(datastore_path, uuid_with_tag, 'last-checksum.txt')
assert os.path.isfile(checksum_file), "First check should create checksum file"
# Update tag via API
res = client.put(
f'/api/v1/tag/{tag_uuid}',
json={'notification_muted': True},
headers={'x-api-key': datastore.data['settings']['application']['api_access_token']}
)
assert res.status_code == 200, f"API call failed with status {res.status_code}: {res.data}"
# Give it more time for async operations
time.sleep(1.0)
# Debug: Check if checksum still exists
if os.path.isfile(checksum_file):
# Read checksum to see if it changed
with open(checksum_file, 'r') as f:
checksum_content = f.read()
print(f"Checksum still exists: {checksum_content}")
# Verify checksum was deleted
assert not os.path.isfile(checksum_file), "API tag update should delete checksum"
print("✓ Tag update via API forces reprocessing")
@@ -6,6 +6,9 @@ from urllib.request import urlopen
from .util import set_original_response, set_modified_response, live_server_setup, delete_all_watches
import re
sleep_time_for_fetch_thread = 3
def test_share_watch(client, live_server, measure_memory_usage, datastore_path):
set_original_response(datastore_path=datastore_path)
+2 -4
View File
@@ -6,6 +6,7 @@ from urllib.request import urlopen
from .util import set_original_response, set_modified_response, live_server_setup, wait_for_all_checks
from ..diff import ADDED_STYLE
sleep_time_for_fetch_thread = 3
def test_check_basic_change_detection_functionality_source(client, live_server, measure_memory_usage, datastore_path):
set_original_response(datastore_path=datastore_path)
@@ -71,10 +72,7 @@ def test_check_ignore_elements(client, live_server, measure_memory_usage, datast
follow_redirects=True
)
client.get(url_for("ui.form_watch_checknow"), follow_redirects=True)
wait_for_all_checks(client)
time.sleep(sleep_time_for_fetch_thread)
res = client.get(
url_for("ui.ui_preview.preview_page", uuid="first"),
@@ -2,8 +2,7 @@
import time
from flask import url_for
from .util import live_server_setup, delete_all_watches, wait_for_all_checks
from . util import live_server_setup, delete_all_watches
import os
@@ -26,6 +25,9 @@ def set_original_ignore_response(datastore_path):
def test_trigger_regex_functionality_with_filter(client, live_server, measure_memory_usage, datastore_path):
# live_server_setup(live_server) # Setup on conftest per function
sleep_time_for_fetch_thread = 3
set_original_ignore_response(datastore_path=datastore_path)
# Give the endpoint time to spin up
@@ -36,7 +38,8 @@ def test_trigger_regex_functionality_with_filter(client, live_server, measure_me
uuid = client.application.config.get('DATASTORE').add_watch(url=test_url)
client.get(url_for("ui.form_watch_checknow"), follow_redirects=True)
wait_for_all_checks(client)
# it needs time to save the original version
time.sleep(sleep_time_for_fetch_thread)
### test regex with filter
res = client.post(
@@ -49,9 +52,8 @@ def test_trigger_regex_functionality_with_filter(client, live_server, measure_me
follow_redirects=True
)
client.get(url_for("ui.form_watch_checknow"), follow_redirects=True)
wait_for_all_checks(client)
# Give the thread time to pick it up
time.sleep(sleep_time_for_fetch_thread)
client.get(url_for("ui.ui_diff.diff_history_page", uuid="first"))
@@ -60,8 +62,7 @@ def test_trigger_regex_functionality_with_filter(client, live_server, measure_me
f.write("<html>some new noise with cool stuff2 ok</html>")
client.get(url_for("ui.form_watch_checknow"), follow_redirects=True)
wait_for_all_checks(client)
time.sleep(sleep_time_for_fetch_thread)
# It should report nothing found (nothing should match the regex and filter)
res = client.get(url_for("watchlist.index"))
@@ -72,8 +73,7 @@ def test_trigger_regex_functionality_with_filter(client, live_server, measure_me
f.write("<html>some new noise with <span id=in-here>cool stuff6</span> ok</html>")
client.get(url_for("ui.form_watch_checknow"), follow_redirects=True)
wait_for_all_checks(client)
time.sleep(sleep_time_for_fetch_thread)
res = client.get(url_for("watchlist.index"))
assert b'has-unread-changes' in res.data
@@ -1,246 +0,0 @@
#!/usr/bin/env python3
"""
Test the watch edited flag functionality.
This tests the private __watch_was_edited flag that tracks when writable
watch fields are modified, which prevents skipping reprocessing when the
watch configuration has changed.
"""
import os
import time
from flask import url_for
from .util import live_server_setup, wait_for_all_checks
def set_test_content(datastore_path):
"""Write test HTML content to endpoint-content.txt for test server."""
test_html = """<html>
<body>
<p>Test content for watch edited flag tests</p>
<p>This content stays the same across checks</p>
</body>
</html>
"""
with open(os.path.join(datastore_path, "endpoint-content.txt"), "w") as f:
f.write(test_html)
def test_watch_edited_flag_lifecycle(client, live_server, measure_memory_usage, datastore_path):
"""
Test the full lifecycle of the was_edited flag:
1. Flag starts False when watch is created
2. Flag becomes True when writable fields are modified
3. Flag is reset False after worker processing
4. Flag stays False when readonly fields are modified
"""
# Setup - Add a watch
test_url = url_for('test_endpoint', _external=True)
res = client.post(
url_for("ui.ui_views.form_quick_watch_add"),
data={"url": test_url, "tags": "", "edit_and_watch_submit_button": "Edit > Watch"},
follow_redirects=True
)
assert b"Watch added" in res.data or b"Updated watch" in res.data
# Get the watch UUID
datastore = client.application.config.get('DATASTORE')
uuid = list(datastore.data['watching'].keys())[0]
watch = datastore.data['watching'][uuid]
# Reset flag after initial form submission (form sets fields which trigger the flag)
watch.reset_watch_edited_flag()
# Test 1: Flag should be False after reset
assert not watch.was_edited, "Flag should be False after reset"
# Test 2: Modify a writable field (title) - flag should become True
watch['title'] = 'New Title'
assert watch.was_edited, "Flag should be True after modifying writable field 'title'"
# Test 3: Reset flag manually (simulating what worker does)
watch.reset_watch_edited_flag()
assert not watch.was_edited, "Flag should be False after reset"
# Test 4: Modify another writable field (url) - flag should become True again
watch['url'] = 'https://example.com'
assert watch.was_edited, "Flag should be True after modifying writable field 'url'"
# Test 5: Reset and modify a readonly field - flag should stay False
watch.reset_watch_edited_flag()
assert not watch.was_edited, "Flag should be False after reset"
# Modify readonly field (uuid) - should not set flag
old_uuid = watch['uuid']
watch['uuid'] = 'readonly-test-uuid'
assert not watch.was_edited, "Flag should stay False when modifying readonly field 'uuid'"
watch['uuid'] = old_uuid # Restore original
# Note: Worker reset behavior is tested in test_check_removed_line_contains_trigger
# and test_watch_edited_flag_prevents_skip
print("✓ All watch edited flag lifecycle tests passed")
def test_watch_edited_flag_dict_methods(client, live_server, measure_memory_usage, datastore_path):
"""
Test that the flag is set correctly by various dict methods:
- __setitem__ (watch['key'] = value)
- update() (watch.update({'key': value}))
- setdefault() (watch.setdefault('key', default))
- pop() (watch.pop('key'))
- __delitem__ (del watch['key'])
"""
# Setup - Add a watch
test_url = url_for('test_endpoint', _external=True)
res = client.post(
url_for("ui.ui_views.form_quick_watch_add"),
data={"url": test_url, "tags": "", "edit_and_watch_submit_button": "Edit > Watch"},
follow_redirects=True
)
datastore = client.application.config.get('DATASTORE')
uuid = list(datastore.data['watching'].keys())[0]
watch = datastore.data['watching'][uuid]
# Test __setitem__
watch.reset_watch_edited_flag()
watch['title'] = 'Test via setitem'
assert watch.was_edited, "Flag should be True after __setitem__ on writable field"
# Test update() with dict
watch.reset_watch_edited_flag()
watch.update({'title': 'Test via update dict'})
assert watch.was_edited, "Flag should be True after update() with writable field"
# Test update() with kwargs
watch.reset_watch_edited_flag()
watch.update(title='Test via update kwargs')
assert watch.was_edited, "Flag should be True after update() kwargs with writable field"
# Test setdefault() on new key
watch.reset_watch_edited_flag()
watch.setdefault('title', 'Should not be set') # Key exists, no change
assert not watch.was_edited, "Flag should stay False when setdefault() doesn't change existing key"
watch.setdefault('custom_field', 'New value') # New key
assert watch.was_edited, "Flag should be True after setdefault() creates new writable field"
# Test pop() on writable field
watch.reset_watch_edited_flag()
watch.pop('custom_field', None)
assert watch.was_edited, "Flag should be True after pop() on writable field"
# Test __delitem__ on writable field
watch.reset_watch_edited_flag()
watch['temp_field'] = 'temp'
watch.reset_watch_edited_flag() # Reset after adding
del watch['temp_field']
assert watch.was_edited, "Flag should be True after __delitem__ on writable field"
print("✓ All dict methods correctly set the flag")
def test_watch_edited_flag_prevents_skip(client, live_server, measure_memory_usage, datastore_path):
"""
Test that the was_edited flag prevents skipping reprocessing.
When watch configuration is edited, it should reprocess even if content unchanged.
After worker processing, flag should be reset and subsequent checks can skip.
"""
# Setup test content
set_test_content(datastore_path)
# Setup - Add a watch
test_url = url_for('test_endpoint', _external=True)
res = client.post(
url_for("ui.ui_views.form_quick_watch_add"),
data={"url": test_url, "tags": "", "edit_and_watch_submit_button": "Edit > Watch"},
follow_redirects=True
)
assert b"Watch added" in res.data or b"Updated watch" in res.data
datastore = client.application.config.get('DATASTORE')
uuid = list(datastore.data['watching'].keys())[0]
watch = datastore.data['watching'][uuid]
# Unpause the watch (watches are paused by default in tests)
watch['paused'] = False
# Run first check to establish baseline
client.get(url_for("ui.form_watch_checknow"), follow_redirects=True)
wait_for_all_checks(client)
# Verify first check completed successfully - checksum file should exist
checksum_file = os.path.join(datastore_path, uuid, 'last-checksum.txt')
assert os.path.isfile(checksum_file), "First check should create last-checksum.txt file"
# Reset the was_edited flag (simulating clean state after processing)
watch.reset_watch_edited_flag()
assert not watch.was_edited, "Flag should be False after reset"
# Run second check without any changes - should skip via checksumFromPreviousCheckWasTheSame
client.get(url_for("ui.form_watch_checknow"), follow_redirects=True)
wait_for_all_checks(client)
# Verify it was skipped (last_check_status should indicate skip)
# Note: The actual skip is tested in test_check_removed_line_contains_trigger
# Here we're focused on the was_edited flag interaction
# Now modify the watch - flag should become True
watch['title'] = 'Modified Title'
assert watch.was_edited, "Flag should be True after modifying watch"
# Run third check - should NOT skip because was_edited=True even though content unchanged
client.get(url_for("ui.form_watch_checknow"), follow_redirects=True)
wait_for_all_checks(client)
# After worker processing, the flag should be reset by the worker
# This reset happens in the processor's run() method after processing completes
assert not watch.was_edited, "Flag should be False after worker processing"
print("✓ was_edited flag correctly prevents skip and is reset by worker")
def test_watch_edited_flag_system_fields(client, live_server, measure_memory_usage, datastore_path):
"""
Test that system fields (readonly + additional system fields) don't trigger the flag.
"""
# Setup - Add a watch
test_url = url_for('test_endpoint', _external=True)
res = client.post(
url_for("ui.ui_views.form_quick_watch_add"),
data={"url": test_url, "tags": "", "edit_and_watch_submit_button": "Edit > Watch"},
follow_redirects=True
)
datastore = client.application.config.get('DATASTORE')
uuid = list(datastore.data['watching'].keys())[0]
watch = datastore.data['watching'][uuid]
# Test readonly fields from OpenAPI spec
readonly_fields = ['uuid', 'date_created', 'last_viewed']
for field in readonly_fields:
watch.reset_watch_edited_flag()
if field in watch:
old_value = watch[field]
watch[field] = 'modified-readonly-value'
assert not watch.was_edited, f"Flag should stay False when modifying readonly field '{field}'"
watch[field] = old_value # Restore
# Test additional system fields not in OpenAPI spec yet
system_fields = ['last_check_status']
for field in system_fields:
watch.reset_watch_edited_flag()
watch[field] = 'system-value'
assert not watch.was_edited, f"Flag should stay False when modifying system field '{field}'"
# Test that content-type (readonly per OpenAPI) doesn't trigger flag
watch.reset_watch_edited_flag()
watch['content-type'] = 'text/html'
assert not watch.was_edited, "Flag should stay False when modifying 'content-type' (readonly)"
print("✓ System fields correctly don't trigger the flag")
@@ -199,428 +199,6 @@ class TestHtmlToText(unittest.TestCase):
print(f"✓ Basic thread-safety test passed: {len(results)} threads, no errors")
def test_large_html_with_bloated_head(self):
"""
Test that html_to_text can handle large HTML documents with massive <head> bloat.
SPAs often dump 10MB+ of styles, scripts, and other bloat into the <head> section.
This can cause inscriptis to silently exit when processing very large documents.
The fix strips <style>, <script>, <svg>, <noscript>, <link>, <meta>, and HTML comments
before processing, allowing extraction of actual body content.
"""
# Generate massive style block (~5MB)
large_style = '<style>' + '.class{color:red;}\n' * 200000 + '</style>\n'
# Generate massive script block (~5MB)
large_script = '<script>' + 'console.log("bloat");\n' * 200000 + '</script>\n'
# Generate lots of SVG bloat (~3MB)
svg_bloat = '<svg><path d="M0,0 L100,100"/></svg>\n' * 50000
# Generate meta/link tags (~2MB)
meta_bloat = '<meta name="description" content="bloat"/>\n' * 50000
link_bloat = '<link rel="stylesheet" href="bloat.css"/>\n' * 50000
# Generate HTML comments (~1MB)
comment_bloat = '<!-- This is bloat -->\n' * 50000
# Generate noscript bloat
noscript_bloat = '<noscript>Enable JavaScript</noscript>\n' * 10000
# Build the large HTML document
html = f'''<!DOCTYPE html>
<html>
<head>
<title>Test Page</title>
{large_style}
{large_script}
{svg_bloat}
{meta_bloat}
{link_bloat}
{comment_bloat}
{noscript_bloat}
</head>
<body>
<h1>Important Heading</h1>
<p>This is the actual content that should be extracted.</p>
<div>
<p>First paragraph with meaningful text.</p>
<p>Second paragraph with more content.</p>
</div>
<footer>Footer text</footer>
</body>
</html>
'''
# Verify the HTML is actually large (should be ~20MB+)
html_size_mb = len(html) / (1024 * 1024)
assert html_size_mb > 15, f"HTML should be >15MB, got {html_size_mb:.2f}MB"
print(f" Testing {html_size_mb:.2f}MB HTML document with bloated head...")
# This should not crash or silently exit
text = html_to_text(html)
# Verify we got actual text output (not empty/None)
assert text is not None, "html_to_text returned None"
assert len(text) > 0, "html_to_text returned empty string"
# Verify the actual body content was extracted
assert 'Important Heading' in text, "Failed to extract heading"
assert 'actual content that should be extracted' in text, "Failed to extract paragraph"
assert 'First paragraph with meaningful text' in text, "Failed to extract first paragraph"
assert 'Second paragraph with more content' in text, "Failed to extract second paragraph"
assert 'Footer text' in text, "Failed to extract footer"
# Verify bloat was stripped (output should be tiny compared to input)
text_size_kb = len(text) / 1024
assert text_size_kb < 1, f"Output too large ({text_size_kb:.2f}KB), bloat not stripped"
# Verify no CSS, script content, or SVG leaked through
assert 'color:red' not in text, "Style content leaked into text output"
assert 'console.log' not in text, "Script content leaked into text output"
assert '<path' not in text, "SVG content leaked into text output"
assert 'bloat.css' not in text, "Link href leaked into text output"
print(f" ✓ Successfully processed {html_size_mb:.2f}MB HTML -> {text_size_kb:.2f}KB text")
def test_body_display_none_spa_pattern(self):
"""
Test that html_to_text can extract content from pages with display:none body.
SPAs (Single Page Applications) often use <body style="display:none"> to hide content
until JavaScript loads and renders the page. inscriptis respects CSS display rules,
so without preprocessing, it would skip all content and return only newlines.
The fix strips display:none and visibility:hidden styles from the body tag before
processing, allowing text extraction from client-side rendered applications.
"""
# Test case 1: Basic display:none
html1 = '''<!DOCTYPE html>
<html lang="en">
<head><title>What's New Fluxguard</title></head>
<body style="display:none">
<h1>Important Heading</h1>
<p>This is actual content that should be extracted.</p>
<div>
<p>First paragraph with meaningful text.</p>
<p>Second paragraph with more content.</p>
</div>
</body>
</html>'''
text1 = html_to_text(html1)
# Before fix: would return ~33 newlines, len(text) ~= 33
# After fix: should extract actual content, len(text) > 100
assert len(text1) > 100, f"Expected substantial text output, got {len(text1)} chars"
assert 'Important Heading' in text1, "Failed to extract heading from display:none body"
assert 'actual content' in text1, "Failed to extract paragraph from display:none body"
assert 'First paragraph' in text1, "Failed to extract nested content"
# Should not be mostly newlines
newline_ratio = text1.count('\n') / len(text1)
assert newline_ratio < 0.5, f"Output is mostly newlines ({newline_ratio:.2%}), content not extracted"
# Test case 2: visibility:hidden (another hiding pattern)
html2 = '<html><body style="visibility:hidden"><h1>Hidden Content</h1><p>Test paragraph.</p></body></html>'
text2 = html_to_text(html2)
assert 'Hidden Content' in text2, "Failed to extract content from visibility:hidden body"
assert 'Test paragraph' in text2, "Failed to extract paragraph from visibility:hidden body"
# Test case 3: Mixed styles (display:none with other CSS)
html3 = '<html><body style="color: red; display:none; font-size: 12px"><p>Mixed style content</p></body></html>'
text3 = html_to_text(html3)
assert 'Mixed style content' in text3, "Failed to extract content from body with mixed styles"
# Test case 4: Case insensitivity (DISPLAY:NONE uppercase)
html4 = '<html><body style="DISPLAY:NONE"><p>Uppercase style</p></body></html>'
text4 = html_to_text(html4)
assert 'Uppercase style' in text4, "Failed to handle uppercase DISPLAY:NONE"
# Test case 5: Space variations (display: none vs display:none)
html5 = '<html><body style="display: none"><p>With spaces</p></body></html>'
text5 = html_to_text(html5)
assert 'With spaces' in text5, "Failed to handle 'display: none' with space"
# Test case 6: Body with other attributes (class, id)
html6 = '<html><body class="foo" style="display:none" id="bar"><p>With attributes</p></body></html>'
text6 = html_to_text(html6)
assert 'With attributes' in text6, "Failed to extract from body with multiple attributes"
# Test case 7: Should NOT affect opacity:0 (which doesn't hide from inscriptis)
html7 = '<html><body style="opacity:0"><p>Transparent content</p></body></html>'
text7 = html_to_text(html7)
# Opacity doesn't affect inscriptis text extraction, content should be there
assert 'Transparent content' in text7, "Incorrectly stripped opacity:0 style"
print(" ✓ All display:none body tag tests passed")
def test_style_tag_with_svg_data_uri(self):
"""
Test that style tags containing SVG data URIs are properly stripped.
Some WordPress and modern sites embed SVG as data URIs in CSS, which contains
<svg> and </svg> tags within the style content. The regex must use backreferences
to ensure <style> matches </style> (not </svg> inside the CSS).
This was causing errors where the regex would match <style> and stop at the first
</svg> it encountered inside a CSS data URI, breaking the HTML structure.
"""
# Real-world example from WordPress wp-block-image styles
html = '''<!DOCTYPE html>
<html>
<head>
<style id='wp-block-image-inline-css'>
.wp-block-image>a,.wp-block-image>figure>a{display:inline-block}.wp-block-image img{box-sizing:border-box;height:auto;max-width:100%;vertical-align:bottom}@supports ((-webkit-mask-image:none) or (mask-image:none)) or (-webkit-mask-image:none){.wp-block-image.is-style-circle-mask img{border-radius:0;-webkit-mask-image:url('data:image/svg+xml;utf8,<svg viewBox="0 0 100 100" xmlns="http://www.w3.org/2000/svg"><circle cx="50" cy="50" r="50"/></svg>');mask-image:url('data:image/svg+xml;utf8,<svg viewBox="0 0 100 100" xmlns="http://www.w3.org/2000/svg"><circle cx="50" cy="50" r="50"/></svg>');mask-mode:alpha}}
</style>
</head>
<body>
<h1>Test Heading</h1>
<p>This is the actual content that should be extracted.</p>
<div class="wp-block-image">
<img src="test.jpg" alt="Test image">
</div>
</body>
</html>'''
# This should not crash and should extract the body content
text = html_to_text(html)
# Verify the actual body content was extracted
assert text is not None, "html_to_text returned None"
assert len(text) > 0, "html_to_text returned empty string"
assert 'Test Heading' in text, "Failed to extract heading"
assert 'actual content that should be extracted' in text, "Failed to extract paragraph"
# Verify CSS content was stripped (including the SVG data URI)
assert '.wp-block-image' not in text, "CSS class selector leaked into text"
assert 'mask-image' not in text, "CSS property leaked into text"
assert 'data:image/svg+xml' not in text, "SVG data URI leaked into text"
assert 'viewBox' not in text, "SVG attributes leaked into text"
# Verify no broken HTML structure
assert '<style' not in text, "Unclosed style tag in output"
assert '</svg>' not in text, "SVG closing tag leaked into text"
print(" ✓ Style tag with SVG data URI test passed")
def test_style_tag_closes_correctly(self):
"""
Test that each tag type (style, script, svg) closes with the correct closing tag.
Before the fix, the regex used (?:style|script|svg|noscript) for both opening and
closing tags, which meant <style> could incorrectly match </svg> as its closing tag.
With backreferences, <style> must close with </style>, <svg> with </svg>, etc.
"""
# Test nested tags where incorrect matching would break
html = '''<!DOCTYPE html>
<html>
<head>
<style>
body { background: url('data:image/svg+xml,<svg><rect/></svg>'); }
</style>
<script>
const svg = '<svg><path d="M0,0"/></svg>';
</script>
</head>
<body>
<h1>Content</h1>
<svg><circle cx="50" cy="50" r="40"/></svg>
<p>After SVG</p>
</body>
</html>'''
text = html_to_text(html)
# Should extract body content
assert 'Content' in text, "Failed to extract heading"
assert 'After SVG' in text, "Failed to extract content after SVG"
# Should strip all style/script/svg content
assert 'background:' not in text, "Style content leaked"
assert 'const svg' not in text, "Script content leaked"
assert '<circle' not in text, "SVG element leaked"
assert 'data:image/svg+xml' not in text, "Data URI leaked"
print(" ✓ Tag closing validation test passed")
def test_script_with_closing_tag_in_string_does_not_eat_content(self):
"""
Script tag containing </script> inside a JS string must not prematurely end the block.
This is the classic regex failure mode: the old pattern would find the first </script>
inside the JS string literal and stop there, leaving the tail of the script block
(plus any following content) exposed as raw text. BS4 parses the HTML correctly.
"""
html = '''<html><body>
<p>Before script</p>
<script>
var html = "<div>foo<\\/script><p>bar</p>";
var also = 1;
</script>
<p>AFTER SCRIPT</p>
</body></html>'''
text = html_to_text(html)
assert 'Before script' in text
assert 'AFTER SCRIPT' in text
# Script internals must not leak
assert 'var html' not in text
assert 'var also' not in text
def test_content_sandwiched_between_multiple_body_scripts(self):
"""Content between multiple script/style blocks in the body must all survive."""
html = '''<html><body>
<script>var a = 1;</script>
<p>CONTENT A</p>
<style>.x { color: red; }</style>
<p>CONTENT B</p>
<script>var b = 2;</script>
<p>CONTENT C</p>
<style>.y { color: blue; }</style>
<p>CONTENT D</p>
</body></html>'''
text = html_to_text(html)
for label in ['CONTENT A', 'CONTENT B', 'CONTENT C', 'CONTENT D']:
assert label in text, f"'{label}' was eaten by script/style stripping"
assert 'var a' not in text
assert 'var b' not in text
assert 'color: red' not in text
assert 'color: blue' not in text
def test_unicode_and_international_content_preserved(self):
"""Non-ASCII content (umlauts, CJK, soft hyphens) must survive stripping."""
html = '''<html><body>
<style>.x{color:red}</style>
<p>German: Aus\xadge\xadbucht! ANMELDUNG Fan\xadday 2026</p>
<p>Chinese: \u6ce8\u518c</p>
<p>Japanese: \u767b\u9332</p>
<p>Korean: \ub4f1\ub85d</p>
<p>Emoji: \U0001f4e2</p>
<script>var x = 1;</script>
</body></html>'''
text = html_to_text(html)
assert 'ANMELDUNG' in text
assert '\u6ce8\u518c' in text # Chinese
assert '\u767b\u9332' in text # Japanese
assert '\ub4f1\ub85d' in text # Korean
def test_style_with_type_attribute_is_stripped(self):
"""<style type="text/css"> (with type attribute) must be stripped just like bare <style>."""
html = '''<html><body>
<style type="text/css">.important { display: none; }</style>
<p>VISIBLE CONTENT</p>
</body></html>'''
text = html_to_text(html)
assert 'VISIBLE CONTENT' in text
assert '.important' not in text
assert 'display: none' not in text
def test_ldjson_script_is_stripped(self):
"""<script type="application/ld+json"> must be stripped — raw JSON must not appear as text."""
html = '''<html><body>
<script type="application/ld+json">
{"@type": "Product", "name": "Widget", "price": "9.99"}
</script>
<p>PRODUCT PAGE</p>
</body></html>'''
text = html_to_text(html)
assert 'PRODUCT PAGE' in text
assert '@type' not in text
assert '"price"' not in text
def test_inline_svg_is_stripped_entirely(self):
"""
Inline SVG elements in the body are stripped by BS4 before passing to inscriptis.
SVGs can be huge (icon libraries, data visualisations) and produce garbage path-data
text. The old regex code explicitly stripped <svg>; the BS4 path must do the same.
"""
html = '''<html><body>
<p>Before SVG</p>
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24">
<path d="M14 5L7 12L14 19Z" fill="none"/>
<circle cx="12" cy="12" r="10"/>
</svg>
<p>After SVG</p>
</body></html>'''
text = html_to_text(html)
assert 'Before SVG' in text
assert 'After SVG' in text
assert 'M14 5L7' not in text, "SVG path data should not appear in text output"
assert 'viewBox' not in text, "SVG attributes should not appear in text output"
def test_tag_inside_json_data_attribute_does_not_eat_content(self):
"""
Tags inside JSON data attributes with JS-escaped closing tags must not eat real content.
Real-world case: Elementor/JetEngine WordPress widgets embed HTML (including SVG icons)
inside JSON data attributes like data-slider-atts. The HTML inside is JS-escaped, so
closing tags appear as <\\/svg> rather than </svg>.
The old regex approach would find <svg> inside the attribute value, then fail to find
<\/svg> as a matching close tag, and scan forward to the next real </svg> in the DOM
eating tens of kilobytes of actual page content in the process.
"""
html = '''<!DOCTYPE html>
<html>
<head><title>Test</title></head>
<body>
<div class="slider" data-slider-atts="{&quot;prevArrow&quot;:&quot;<i class=\\&quot;icon\\&quot;><svg width=\\&quot;24\\&quot; height=\\&quot;24\\&quot; viewBox=\\&quot;0 0 24 24\\&quot; xmlns=\\&quot;http:\\/\\/www.w3.org\\/2000\\/svg\\&quot;><path d=\\&quot;M14 5L7 12L14 19\\&quot;\\/><\\/svg><\\/i>&quot;}">
</div>
<div class="content">
<h1>IMPORTANT CONTENT</h1>
<p>This text must not be eaten by the tag-stripping logic.</p>
</div>
<svg><circle cx="50" cy="50" r="40"/></svg>
</body>
</html>'''
text = html_to_text(html)
assert 'IMPORTANT CONTENT' in text, (
"Content after a JS-escaped tag in a data attribute was incorrectly stripped. "
"The tag-stripping logic is matching <tag> inside attribute values and scanning "
"forward to the next real closing tag in the DOM."
)
assert 'This text must not be eaten' in text
def test_script_inside_json_data_attribute_does_not_eat_content(self):
"""Same issue as above but with <script> embedded in a data attribute with JS-escaped closing tag."""
html = '''<!DOCTYPE html>
<html>
<head><title>Test</title></head>
<body>
<div data-config="{&quot;template&quot;:&quot;<script type=\\&quot;text\\/javascript\\&quot;>var x=1;<\\/script>&quot;}">
</div>
<div>
<h1>MUST SURVIVE</h1>
<p>Real content after the data attribute with embedded script tag.</p>
</div>
<script>var real = 1;</script>
</body>
</html>'''
text = html_to_text(html)
assert 'MUST SURVIVE' in text, (
"Content after a JS-escaped <script> in a data attribute was incorrectly stripped."
)
assert 'Real content after the data attribute' in text
if __name__ == '__main__':
# Can run this file directly for quick testing
@@ -8,7 +8,6 @@ python3 -m pytest changedetectionio/tests/unit/test_time_handler.py -v
"""
import unittest
import unittest.mock
import arrow
from changedetectionio import time_handler
@@ -241,211 +240,6 @@ class TestAmIInsideTime(unittest.TestCase):
# Result depends on current time
self.assertIsInstance(result, bool)
def test_24_hour_schedule_from_midnight(self):
"""Test 24-hour schedule starting at midnight covers entire day."""
timezone_str = 'UTC'
# Test at a specific time: Monday 00:00
test_time = arrow.get('2024-01-01 00:00:00', 'YYYY-MM-DD HH:mm:ss').replace(tzinfo=timezone_str)
day_of_week = test_time.format('dddd') # Monday
# Mock current time for testing
with unittest.mock.patch('arrow.now', return_value=test_time):
result = time_handler.am_i_inside_time(
day_of_week=day_of_week,
time_str="00:00",
timezone_str=timezone_str,
duration=1440 # 24 hours
)
self.assertTrue(result, "Should be active at start of 24-hour schedule")
def test_24_hour_schedule_at_end_of_day(self):
"""Test 24-hour schedule is active at 23:59:59."""
timezone_str = 'UTC'
# Test at Monday 23:59:59
test_time = arrow.get('2024-01-01 23:59:59', 'YYYY-MM-DD HH:mm:ss').replace(tzinfo=timezone_str)
day_of_week = test_time.format('dddd') # Monday
with unittest.mock.patch('arrow.now', return_value=test_time):
result = time_handler.am_i_inside_time(
day_of_week=day_of_week,
time_str="00:00",
timezone_str=timezone_str,
duration=1440 # 24 hours
)
self.assertTrue(result, "Should be active at end of 24-hour schedule")
def test_24_hour_schedule_at_midnight_transition(self):
"""Test 24-hour schedule at exactly midnight transition."""
timezone_str = 'UTC'
# Test at Tuesday 00:00:00 (end of Monday's 24-hour schedule)
test_time = arrow.get('2024-01-02 00:00:00', 'YYYY-MM-DD HH:mm:ss').replace(tzinfo=timezone_str)
monday = test_time.shift(days=-1).format('dddd') # Monday
with unittest.mock.patch('arrow.now', return_value=test_time):
result = time_handler.am_i_inside_time(
day_of_week=monday,
time_str="00:00",
timezone_str=timezone_str,
duration=1440 # 24 hours
)
self.assertTrue(result, "Should include exactly midnight at end of 24-hour schedule")
def test_schedule_crosses_midnight_before_midnight(self):
"""Test schedule crossing midnight - before midnight."""
timezone_str = 'UTC'
# Monday 23:30
test_time = arrow.get('2024-01-01 23:30:00', 'YYYY-MM-DD HH:mm:ss').replace(tzinfo=timezone_str)
day_of_week = test_time.format('dddd') # Monday
with unittest.mock.patch('arrow.now', return_value=test_time):
result = time_handler.am_i_inside_time(
day_of_week=day_of_week,
time_str="23:00",
timezone_str=timezone_str,
duration=120 # 2 hours (until 01:00 next day)
)
self.assertTrue(result, "Should be active before midnight in cross-midnight schedule")
def test_schedule_crosses_midnight_after_midnight(self):
"""Test schedule crossing midnight - after midnight."""
timezone_str = 'UTC'
# Tuesday 00:30
test_time = arrow.get('2024-01-02 00:30:00', 'YYYY-MM-DD HH:mm:ss').replace(tzinfo=timezone_str)
monday = test_time.shift(days=-1).format('dddd') # Monday
with unittest.mock.patch('arrow.now', return_value=test_time):
result = time_handler.am_i_inside_time(
day_of_week=monday,
time_str="23:00",
timezone_str=timezone_str,
duration=120 # 2 hours (until 01:00 Tuesday)
)
self.assertTrue(result, "Should be active after midnight in cross-midnight schedule")
def test_schedule_crosses_midnight_at_exact_end(self):
"""Test schedule crossing midnight at exact end time."""
timezone_str = 'UTC'
# Tuesday 01:00 (exact end of Monday 23:00 + 120 minutes)
test_time = arrow.get('2024-01-02 01:00:00', 'YYYY-MM-DD HH:mm:ss').replace(tzinfo=timezone_str)
monday = test_time.shift(days=-1).format('dddd') # Monday
with unittest.mock.patch('arrow.now', return_value=test_time):
result = time_handler.am_i_inside_time(
day_of_week=monday,
time_str="23:00",
timezone_str=timezone_str,
duration=120 # 2 hours
)
self.assertTrue(result, "Should include exact end time of schedule")
def test_duration_60_minutes(self):
"""Test that duration of 60 minutes works correctly."""
timezone_str = 'UTC'
test_time = arrow.get('2024-01-01 12:30:00', 'YYYY-MM-DD HH:mm:ss').replace(tzinfo=timezone_str)
day_of_week = test_time.format('dddd')
with unittest.mock.patch('arrow.now', return_value=test_time):
result = time_handler.am_i_inside_time(
day_of_week=day_of_week,
time_str="12:00",
timezone_str=timezone_str,
duration=60 # Exactly 60 minutes
)
self.assertTrue(result, "60-minute duration should work")
def test_duration_at_exact_end_minute(self):
"""Test at exact end of 60-minute window."""
timezone_str = 'UTC'
# Exactly 13:00 (end of 12:00 + 60 minutes)
test_time = arrow.get('2024-01-01 13:00:00', 'YYYY-MM-DD HH:mm:ss').replace(tzinfo=timezone_str)
day_of_week = test_time.format('dddd')
with unittest.mock.patch('arrow.now', return_value=test_time):
result = time_handler.am_i_inside_time(
day_of_week=day_of_week,
time_str="12:00",
timezone_str=timezone_str,
duration=60
)
self.assertTrue(result, "Should include exact end minute")
def test_one_second_after_schedule_ends(self):
"""Test one second after schedule should end."""
timezone_str = 'UTC'
# 13:00:01 (one second after 12:00 + 60 minutes)
test_time = arrow.get('2024-01-01 13:00:01', 'YYYY-MM-DD HH:mm:ss').replace(tzinfo=timezone_str)
day_of_week = test_time.format('dddd')
with unittest.mock.patch('arrow.now', return_value=test_time):
result = time_handler.am_i_inside_time(
day_of_week=day_of_week,
time_str="12:00",
timezone_str=timezone_str,
duration=60
)
self.assertFalse(result, "Should be False one second after schedule ends")
def test_multi_day_schedule(self):
"""Test schedule longer than 24 hours (48 hours)."""
timezone_str = 'UTC'
# Tuesday 12:00 (36 hours after Monday 00:00)
test_time = arrow.get('2024-01-02 12:00:00', 'YYYY-MM-DD HH:mm:ss').replace(tzinfo=timezone_str)
monday = test_time.shift(days=-1).format('dddd')
with unittest.mock.patch('arrow.now', return_value=test_time):
result = time_handler.am_i_inside_time(
day_of_week=monday,
time_str="00:00",
timezone_str=timezone_str,
duration=2880 # 48 hours
)
self.assertTrue(result, "Should support multi-day schedules")
def test_schedule_one_minute_duration(self):
"""Test very short 1-minute schedule."""
timezone_str = 'UTC'
test_time = arrow.get('2024-01-01 12:00:30', 'YYYY-MM-DD HH:mm:ss').replace(tzinfo=timezone_str)
day_of_week = test_time.format('dddd')
with unittest.mock.patch('arrow.now', return_value=test_time):
result = time_handler.am_i_inside_time(
day_of_week=day_of_week,
time_str="12:00",
timezone_str=timezone_str,
duration=1 # Just 1 minute
)
self.assertTrue(result, "1-minute schedule should work")
def test_schedule_at_exact_start_time(self):
"""Test at exact start time (00:00:00.000000)."""
timezone_str = 'UTC'
test_time = arrow.get('2024-01-01 12:00:00.000000', 'YYYY-MM-DD HH:mm:ss.SSSSSS').replace(tzinfo=timezone_str)
day_of_week = test_time.format('dddd')
with unittest.mock.patch('arrow.now', return_value=test_time):
result = time_handler.am_i_inside_time(
day_of_week=day_of_week,
time_str="12:00",
timezone_str=timezone_str,
duration=30
)
self.assertTrue(result, "Should include exact start time")
def test_schedule_one_microsecond_before_start(self):
"""Test one microsecond before schedule starts."""
timezone_str = 'UTC'
test_time = arrow.get('2024-01-01 11:59:59.999999', 'YYYY-MM-DD HH:mm:ss.SSSSSS').replace(tzinfo=timezone_str)
day_of_week = test_time.format('dddd')
with unittest.mock.patch('arrow.now', return_value=test_time):
result = time_handler.am_i_inside_time(
day_of_week=day_of_week,
time_str="12:00",
timezone_str=timezone_str,
duration=30
)
self.assertFalse(result, "Should not include time before start")
class TestIsWithinSchedule(unittest.TestCase):
"""Tests for the is_within_schedule function."""
@@ -611,175 +405,6 @@ class TestIsWithinSchedule(unittest.TestCase):
result = time_handler.is_within_schedule(time_schedule_limit)
self.assertTrue(result, "Should handle timezone with whitespace")
def test_schedule_with_60_minutes(self):
"""Test schedule with duration of 0 hours and 60 minutes."""
timezone_str = 'UTC'
now = arrow.now(timezone_str)
current_day = now.format('dddd').lower()
current_hour = now.format('HH:00')
time_schedule_limit = {
'enabled': True,
'timezone': timezone_str,
current_day: {
'enabled': True,
'start_time': current_hour,
'duration': {'hours': 0, 'minutes': 60} # 60 minutes
}
}
result = time_handler.is_within_schedule(time_schedule_limit)
self.assertTrue(result, "Should accept 60 minutes as valid duration")
def test_schedule_with_24_hours(self):
"""Test schedule with duration of 24 hours and 0 minutes."""
timezone_str = 'UTC'
now = arrow.now(timezone_str)
current_day = now.format('dddd').lower()
start_hour = now.format('HH:00')
time_schedule_limit = {
'enabled': True,
'timezone': timezone_str,
current_day: {
'enabled': True,
'start_time': start_hour,
'duration': {'hours': 24, 'minutes': 0} # Full 24 hours
}
}
result = time_handler.is_within_schedule(time_schedule_limit)
self.assertTrue(result, "Should accept 24 hours as valid duration")
def test_schedule_with_90_minutes(self):
"""Test schedule with duration of 0 hours and 90 minutes."""
timezone_str = 'UTC'
now = arrow.now(timezone_str)
current_day = now.format('dddd').lower()
current_hour = now.format('HH:00')
time_schedule_limit = {
'enabled': True,
'timezone': timezone_str,
current_day: {
'enabled': True,
'start_time': current_hour,
'duration': {'hours': 0, 'minutes': 90} # 90 minutes = 1.5 hours
}
}
result = time_handler.is_within_schedule(time_schedule_limit)
self.assertTrue(result, "Should accept 90 minutes as valid duration")
def test_schedule_24_hours_from_midnight(self):
"""Test 24-hour schedule from midnight using is_within_schedule."""
timezone_str = 'UTC'
test_time = arrow.get('2024-01-01 12:00:00', 'YYYY-MM-DD HH:mm:ss').replace(tzinfo=timezone_str)
current_day = test_time.format('dddd').lower() # monday
time_schedule_limit = {
'enabled': True,
'timezone': timezone_str,
current_day: {
'enabled': True,
'start_time': '00:00',
'duration': {'hours': 24, 'minutes': 0}
}
}
with unittest.mock.patch('arrow.now', return_value=test_time):
result = time_handler.is_within_schedule(time_schedule_limit)
self.assertTrue(result, "24-hour schedule from midnight should cover entire day")
def test_schedule_24_hours_at_end_of_day(self):
"""Test 24-hour schedule at 23:59 using is_within_schedule."""
timezone_str = 'UTC'
test_time = arrow.get('2024-01-01 23:59:00', 'YYYY-MM-DD HH:mm:ss').replace(tzinfo=timezone_str)
current_day = test_time.format('dddd').lower()
time_schedule_limit = {
'enabled': True,
'timezone': timezone_str,
current_day: {
'enabled': True,
'start_time': '00:00',
'duration': {'hours': 24, 'minutes': 0}
}
}
with unittest.mock.patch('arrow.now', return_value=test_time):
result = time_handler.is_within_schedule(time_schedule_limit)
self.assertTrue(result, "Should be active at 23:59 in 24-hour schedule")
def test_schedule_crosses_midnight_with_is_within_schedule(self):
"""Test schedule crossing midnight using is_within_schedule."""
timezone_str = 'UTC'
# Tuesday 00:30
test_time = arrow.get('2024-01-02 00:30:00', 'YYYY-MM-DD HH:mm:ss').replace(tzinfo=timezone_str)
# Get Monday as that's when the schedule started
monday = test_time.shift(days=-1).format('dddd').lower()
time_schedule_limit = {
'enabled': True,
'timezone': timezone_str,
'monday': {
'enabled': True,
'start_time': '23:00',
'duration': {'hours': 2, 'minutes': 0} # Until 01:00 Tuesday
},
'tuesday': {
'enabled': False,
'start_time': '09:00',
'duration': {'hours': 8, 'minutes': 0}
}
}
with unittest.mock.patch('arrow.now', return_value=test_time):
result = time_handler.is_within_schedule(time_schedule_limit)
# Note: This checks Tuesday's schedule, not Monday's overlap
# So it should be False because Tuesday is disabled
self.assertFalse(result, "Should check current day (Tuesday), which is disabled")
def test_schedule_with_mixed_hours_minutes(self):
"""Test schedule with both hours and minutes (23 hours 60 minutes = 24 hours)."""
timezone_str = 'UTC'
now = arrow.now(timezone_str)
current_day = now.format('dddd').lower()
current_hour = now.format('HH:00')
time_schedule_limit = {
'enabled': True,
'timezone': timezone_str,
current_day: {
'enabled': True,
'start_time': current_hour,
'duration': {'hours': 23, 'minutes': 60} # = 1440 minutes = 24 hours
}
}
result = time_handler.is_within_schedule(time_schedule_limit)
self.assertTrue(result, "Should handle 23 hours + 60 minutes = 24 hours")
def test_schedule_48_hours(self):
"""Test schedule with 48-hour duration."""
timezone_str = 'UTC'
now = arrow.now(timezone_str)
current_day = now.format('dddd').lower()
start_hour = now.format('HH:00')
time_schedule_limit = {
'enabled': True,
'timezone': timezone_str,
current_day: {
'enabled': True,
'start_time': start_hour,
'duration': {'hours': 48, 'minutes': 0} # 2 full days
}
}
result = time_handler.is_within_schedule(time_schedule_limit)
self.assertTrue(result, "Should support 48-hour (multi-day) schedules")
class TestWeekdayEnum(unittest.TestCase):
"""Tests for the Weekday enum."""
-18
View File
@@ -160,7 +160,6 @@ def extract_UUID_from_client(client):
return uuid.strip()
def delete_all_watches(client=None):
wait_for_all_checks(client)
uuids = list(client.application.config.get('DATASTORE').data['watching'])
for uuid in uuids:
@@ -181,23 +180,6 @@ def delete_all_watches(client=None):
time.sleep(0.2)
# Delete any old watch metadata
from pathlib import Path
base_path = Path(
client.application.config.get('DATASTORE').datastore_path
).resolve()
max_depth = 2
for file in base_path.rglob("*.json"):
# Calculate depth relative to base path
depth = len(file.relative_to(base_path).parts) - 1
if depth <= max_depth and file.is_file():
file.unlink()
def wait_for_all_checks(client=None):
"""
Waits until the queue is empty and workers are idle.
@@ -88,6 +88,7 @@ def test_visual_selector_content_ready(client, live_server, measure_memory_usage
def test_basic_browserstep(client, live_server, measure_memory_usage, datastore_path):
assert os.getenv('PLAYWRIGHT_DRIVER_URL'), "Needs PLAYWRIGHT_DRIVER_URL set for this test"
test_url = url_for('test_interactive_html_endpoint', _external=True)
test_url = test_url.replace('localhost.localdomain', 'cdio')
@@ -107,13 +108,13 @@ def test_basic_browserstep(client, live_server, measure_memory_usage, datastore_
"url": test_url,
"tags": "",
'fetch_backend': "html_webdriver",
'browser_steps-5-operation': 'Enter text in field',
'browser_steps-5-selector': '#test-input-text',
'browser_steps-0-operation': 'Enter text in field',
'browser_steps-0-selector': '#test-input-text',
# Should get set to the actual text (jinja2 rendered)
'browser_steps-5-optional_value': "Hello-Jinja2-{% now 'Europe/Berlin', '%Y-%m-%d' %}",
'browser_steps-8-operation': 'Click element',
'browser_steps-8-selector': 'button[name=test-button]',
'browser_steps-8-optional_value': '',
'browser_steps-0-optional_value': "Hello-Jinja2-{% now 'Europe/Berlin', '%Y-%m-%d' %}",
'browser_steps-1-operation': 'Click element',
'browser_steps-1-selector': 'button[name=test-button]',
'browser_steps-1-optional_value': '',
# For now, cookies doesnt work in headers because it must be a full cookiejar object
'headers': "testheader: yes\buser-agent: MyCustomAgent",
"time_between_check_use_default": "y",
@@ -121,18 +122,9 @@ def test_basic_browserstep(client, live_server, measure_memory_usage, datastore_
follow_redirects=True
)
assert b"unpaused" in res.data
wait_for_all_checks(client)
uuid = next(iter(live_server.app.config['DATASTORE'].data['watching']))
# 3874 - should have tidied up any blanks
watch = live_server.app.config['DATASTORE'].data['watching'][uuid]
assert watch['browser_steps'][0].get('operation') == 'Enter text in field'
assert watch['browser_steps'][1].get('selector') == 'button[name=test-button]'
# This part actually needs the browser, before this we are just testing data
assert os.getenv('PLAYWRIGHT_DRIVER_URL'), "Needs PLAYWRIGHT_DRIVER_URL set for this test"
assert live_server.app.config['DATASTORE'].data['watching'][uuid].history_n >= 1, "Watch history had atleast 1 (everything fetched OK)"
assert b"This text should be removed" not in res.data
+3 -3
View File
@@ -62,19 +62,19 @@ def am_i_inside_time(
# Calculate start and end times for the overlap from the previous day
start_datetime_tz = start_datetime_tz.shift(days=-1)
end_datetime_tz = start_datetime_tz.shift(minutes=duration)
if start_datetime_tz <= now_tz <= end_datetime_tz:
if start_datetime_tz <= now_tz < end_datetime_tz:
return True
# Handle current day's range
if target_weekday == current_weekday:
end_datetime_tz = start_datetime_tz.shift(minutes=duration)
if start_datetime_tz <= now_tz <= end_datetime_tz:
if start_datetime_tz <= now_tz < end_datetime_tz:
return True
# Handle next day's overlap
if target_weekday == (current_weekday + 1) % 7:
end_datetime_tz = start_datetime_tz.shift(minutes=duration)
if now_tz < start_datetime_tz and now_tz.shift(days=1) <= end_datetime_tz:
if now_tz < start_datetime_tz and now_tz.shift(days=1) < end_datetime_tz:
return True
return False
+22 -68
View File
@@ -4,10 +4,11 @@ import changedetectionio.content_fetchers.exceptions as content_fetchers_excepti
from changedetectionio.processors.text_json_diff.processor import FilterNotFoundInResponse
from changedetectionio import html_tools
from changedetectionio import worker_pool
from changedetectionio.flask_app import watch_check_update
from changedetectionio.queuedWatchMetaData import PrioritizedItem
from changedetectionio.pluggy_interface import apply_update_handler_alter, apply_update_finalize
import asyncio
import importlib
import os
import sys
import time
@@ -55,7 +56,6 @@ async def async_update_worker(worker_id, q, notification_q, app, datastore, exec
while not app.config.exit.is_set():
update_handler = None
watch = None
processing_exception = None # Reset at start of each iteration to prevent state bleeding
try:
# Efficient blocking via run_in_executor (no polling overhead!)
@@ -119,7 +119,7 @@ async def async_update_worker(worker_id, q, notification_q, app, datastore, exec
# to prevent race condition with wait_for_all_checks()
fetch_start_time = round(time.time())
try:
if uuid in list(datastore.data['watching'].keys()) and datastore.data['watching'][uuid].get('url'):
changed_detected = False
@@ -136,8 +136,6 @@ async def async_update_worker(worker_id, q, notification_q, app, datastore, exec
logger.info(f"Worker {worker_id} processing watch UUID {uuid} Priority {queued_item_data.priority} URL {watch['url']}")
try:
# Retrieve signal by name to ensure thread-safe access across worker threads
watch_check_update = signal('watch_check_update')
watch_check_update.send(watch_uuid=uuid)
# Processor is what we are using for detecting the "Change"
@@ -156,9 +154,6 @@ async def async_update_worker(worker_id, q, notification_q, app, datastore, exec
update_handler = processor_module.perform_site_check(datastore=datastore,
watch_uuid=uuid)
# Allow plugins to modify/wrap the update_handler
update_handler = apply_update_handler_alter(update_handler, watch, datastore)
update_signal = signal('watch_small_status_comment')
update_signal.send(watch_uuid=uuid, status="Fetching page..")
@@ -281,9 +276,6 @@ async def async_update_worker(worker_id, q, notification_q, app, datastore, exec
# Yes fine, so nothing todo, don't continue to process.
process_changedetection_results = False
changed_detected = False
logger.debug(f'[{uuid}] - checksumFromPreviousCheckWasTheSame - Checksum from previous check was the same, nothing todo here.')
# Reset the edited flag since we successfully completed the check
watch.reset_watch_edited_flag()
except content_fetchers_exceptions.BrowserConnectError as e:
datastore.update_watch(uuid=uuid,
@@ -386,7 +378,7 @@ async def async_update_worker(worker_id, q, notification_q, app, datastore, exec
if not datastore.data['watching'].get(uuid):
continue
update_obj['content-type'] = str(update_handler.fetcher.get_all_headers().get('content-type', '') or "").lower()
update_obj['content-type'] = update_handler.fetcher.get_all_headers().get('content-type', '').lower()
if not watch.get('ignore_status_codes'):
update_obj['consecutive_filter_failures'] = 0
@@ -400,8 +392,6 @@ async def async_update_worker(worker_id, q, notification_q, app, datastore, exec
logger.debug(f"Processing watch UUID: {uuid} - xpath_data length returned {len(update_handler.xpath_data) if update_handler and update_handler.xpath_data else 'empty.'}")
if update_handler and process_changedetection_results:
try:
# Reset the edited flag BEFORE update_watch (which calls watch.update() and would set it again)
watch.reset_watch_edited_flag()
datastore.update_watch(uuid=uuid, update_obj=update_obj)
if changed_detected or not watch.history_n:
@@ -449,22 +439,8 @@ async def async_update_worker(worker_id, q, notification_q, app, datastore, exec
logger.exception(f"Worker {worker_id} full exception details:")
datastore.update_watch(uuid=uuid, update_obj={'last_error': str(e)})
# Always record attempt count
count = watch.get('check_count', 0) + 1
final_updates = {'fetch_time': round(time.time() - fetch_start_time, 3),
'check_count': count,
}
# Record server header
try:
server_header = str(update_handler.fetcher.get_all_headers().get('server', '') or "").strip().lower()[:255]
if server_header:
final_updates['remote_server_reply'] = server_header
except Exception as e:
server_header = None
pass
if update_handler: # Could be none or empty if the processor was not found
# Always record page title (used in notifications, and can change even when the content is the same)
if update_obj.get('content-type') and 'html' in update_obj.get('content-type'):
@@ -473,23 +449,32 @@ async def async_update_worker(worker_id, q, notification_q, app, datastore, exec
if page_title:
page_title = page_title.strip()[:2000]
logger.debug(f"UUID: {uuid} Page <title> is '{page_title}'")
final_updates['page_title'] = page_title
datastore.update_watch(uuid=uuid, update_obj={'page_title': page_title})
except Exception as e:
logger.exception(f"Worker {worker_id} full exception details:")
logger.warning(f"UUID: {uuid} Exception when extracting <title> - {str(e)}")
# Record server header
try:
server_header = update_handler.fetcher.headers.get('server', '').strip().lower()[:255]
datastore.update_watch(uuid=uuid, update_obj={'remote_server_reply': server_header})
except Exception as e:
pass
# Store favicon if necessary
if update_handler.fetcher.favicon_blob and update_handler.fetcher.favicon_blob.get('base64'):
watch.bump_favicon(url=update_handler.fetcher.favicon_blob.get('url'),
favicon_base_64=update_handler.fetcher.favicon_blob.get('base64')
)
datastore.update_watch(uuid=uuid, update_obj=final_updates)
datastore.update_watch(uuid=uuid, update_obj={'fetch_time': round(time.time() - fetch_start_time, 3),
'check_count': count})
# NOW clear fetcher content - after all processing is complete
# This is the last point where we need the fetcher data
if update_handler and hasattr(update_handler, 'fetcher') and update_handler.fetcher:
update_handler.fetcher.clear_content()
logger.debug(f"Cleared fetcher content for UUID {uuid}")
# Explicitly delete update_handler to free all references
if update_handler:
@@ -501,8 +486,6 @@ async def async_update_worker(worker_id, q, notification_q, app, datastore, exec
gc.collect()
except Exception as e:
# Store the processing exception for plugin finalization hook
processing_exception = e
logger.error(f"Worker {worker_id} unexpected error processing {uuid}: {e}")
logger.exception(f"Worker {worker_id} full exception details:")
@@ -514,11 +497,6 @@ async def async_update_worker(worker_id, q, notification_q, app, datastore, exec
finally:
# Always cleanup - this runs whether there was an exception or not
if uuid:
# Capture references for plugin finalize hook BEFORE cleanup
# (cleanup may delete these variables, but plugins need the original references)
finalize_handler = update_handler # Capture now, before cleanup deletes it
finalize_watch = watch # Capture now, before any modifications
# Call quit() as backup (Puppeteer/Playwright have internal cleanup, but this acts as safety net)
try:
if update_handler and hasattr(update_handler, 'fetcher') and update_handler.fetcher:
@@ -528,6 +506,12 @@ async def async_update_worker(worker_id, q, notification_q, app, datastore, exec
logger.exception(f"Worker {worker_id} full exception details:")
try:
# Release UUID from processing (thread-safe)
worker_pool.release_uuid_from_processing(uuid, worker_id=worker_id)
# Send completion signal
if watch:
watch_check_update.send(watch_uuid=watch['uuid'])
# Clean up all memory references BEFORE garbage collection
if update_handler:
@@ -551,37 +535,7 @@ async def async_update_worker(worker_id, q, notification_q, app, datastore, exec
logger.error(f"Worker {worker_id} error during cleanup: {cleanup_error}")
logger.exception(f"Worker {worker_id} full exception details:")
# Call plugin finalization hook after all cleanup is done
# Use captured references from before cleanup
try:
apply_update_finalize(
update_handler=finalize_handler,
watch=finalize_watch,
datastore=datastore,
processing_exception=processing_exception
)
except Exception as finalize_error:
logger.error(f"Worker {worker_id} error in finalize hook: {finalize_error}")
logger.exception(f"Worker {worker_id} full exception details:")
finally:
# Clean up captured references to allow immediate garbage collection
del finalize_handler
del finalize_watch
# Release UUID from processing AFTER all cleanup and hooks complete (thread-safe)
# This ensures wait_for_all_checks() waits for finalize hooks to complete
try:
worker_pool.release_uuid_from_processing(uuid, worker_id=worker_id)
except Exception as release_error:
logger.error(f"Worker {worker_id} error releasing UUID: {release_error}")
logger.exception(f"Worker {worker_id} full exception details:")
finally:
# Send completion signal - retrieve by name to ensure thread-safe access
if watch:
watch_check_update = signal('watch_check_update')
watch_check_update.send(watch_uuid=watch['uuid'])
del (uuid)
del(uuid)
# Brief pause before continuing to avoid tight error loops (only on error)
if 'e' in locals():
+62 -367
View File
@@ -28,7 +28,7 @@ info:
For example: `x-api-key: YOUR_API_KEY`
version: 0.1.6
version: 0.1.4
contact:
name: ChangeDetection.io
url: https://github.com/dgtlmoon/changedetection.io
@@ -126,22 +126,13 @@ components:
WatchBase:
type: object
properties:
uuid:
type: string
format: uuid
description: Unique identifier
readOnly: true
date_created:
type: [integer, 'null']
description: Unix timestamp of creation
readOnly: true
url:
type: string
format: uri
description: URL to monitor for changes
maxLength: 5000
title:
type: [string, 'null']
type: string
description: Custom title for the web page change monitor (watch), not to be confused with page_title
maxLength: 5000
tag:
@@ -165,61 +156,56 @@ components:
description: HTTP method to use
fetch_backend:
type: string
description: |
Backend to use for fetching content. Common values:
- `system` (default) - Use the system-wide default fetcher
- `html_requests` - Fast requests-based fetcher
- `html_webdriver` - Browser-based fetcher (Playwright/Puppeteer)
- `extra_browser_*` - Custom browser configurations (if configured)
- Plugin-provided fetchers (if installed)
pattern: '^(system|html_requests|html_webdriver|extra_browser_.+)$'
default: system
enum: [html_requests, html_webdriver]
description: Backend to use for fetching content
headers:
type: object
additionalProperties:
type: string
description: HTTP headers to include in requests
body:
type: [string, 'null']
type: string
description: HTTP request body
maxLength: 5000
proxy:
type: [string, 'null']
type: string
description: Proxy configuration
maxLength: 5000
ignore_status_codes:
type: [boolean, 'null']
description: Ignore HTTP status code errors (boolean or null)
webdriver_delay:
type: [integer, 'null']
type: integer
description: Delay in seconds for webdriver
webdriver_js_execute_code:
type: [string, 'null']
type: string
description: JavaScript code to execute
maxLength: 5000
time_between_check:
type: object
properties:
weeks:
type: [integer, 'null']
type: integer
minimum: 0
maximum: 52000
nullable: true
days:
type: [integer, 'null']
type: integer
minimum: 0
maximum: 365000
nullable: true
hours:
type: [integer, 'null']
type: integer
minimum: 0
maximum: 8760000
nullable: true
minutes:
type: [integer, 'null']
type: integer
minimum: 0
maximum: 525600000
nullable: true
seconds:
type: [integer, 'null']
type: integer
minimum: 0
maximum: 31536000000
nullable: true
description: Time intervals between checks. All fields must be non-negative. At least one non-zero value required when not using default settings.
time_between_check_use_default:
type: boolean
@@ -233,11 +219,11 @@ components:
maxItems: 100
description: Notification URLs for this web page change monitor (watch). Maximum 100 URLs.
notification_title:
type: [string, 'null']
type: string
description: Custom notification title
maxLength: 5000
notification_body:
type: [string, 'null']
type: string
description: Custom notification body
maxLength: 5000
notification_format:
@@ -245,7 +231,7 @@ components:
enum: ['text', 'html', 'htmlcolor', 'markdown', 'System default']
description: Format for notifications
track_ldjson_price_data:
type: [boolean, 'null']
type: boolean
description: Whether to track JSON-LD price data
browser_steps:
type: array
@@ -253,14 +239,17 @@ components:
type: object
properties:
operation:
type: [string, 'null']
type: string
maxLength: 5000
nullable: true
selector:
type: [string, 'null']
type: string
maxLength: 5000
nullable: true
optional_value:
type: [string, 'null']
type: string
maxLength: 5000
nullable: true
required: [operation, selector, optional_value]
additionalProperties: false
maxItems: 100
@@ -271,197 +260,16 @@ components:
default: text_json_diff
description: Optional processor mode to use for change detection. Defaults to `text_json_diff` if not specified.
# Content Filtering
include_filters:
type: array
items:
type: string
maxLength: 5000
maxItems: 100
description: CSS/XPath selectors to extract specific content from the page
subtractive_selectors:
type: array
items:
type: string
maxLength: 5000
maxItems: 100
description: CSS/XPath selectors to remove content from the page
ignore_text:
type: array
items:
type: string
maxLength: 5000
maxItems: 100
description: Text patterns to ignore in change detection
trigger_text:
type: array
items:
type: string
maxLength: 5000
maxItems: 100
description: Text/regex patterns that must be present to trigger a change
text_should_not_be_present:
type: array
items:
type: string
maxLength: 5000
maxItems: 100
description: Text that should NOT be present (triggers alert if found)
extract_text:
type: array
items:
type: string
maxLength: 5000
maxItems: 100
description: Regex patterns to extract specific text after filtering
# Text Processing
trim_text_whitespace:
type: boolean
default: false
description: Strip leading/trailing whitespace from text
sort_text_alphabetically:
type: boolean
default: false
description: Sort lines alphabetically before comparison
remove_duplicate_lines:
type: boolean
default: false
description: Remove duplicate lines from content
check_unique_lines:
type: boolean
default: false
description: Compare against all history for unique lines
strip_ignored_lines:
type: [boolean, 'null']
description: Remove lines matching ignore patterns
# Change Detection Filters
filter_text_added:
type: boolean
default: true
description: Include added text in change detection
filter_text_removed:
type: boolean
default: true
description: Include removed text in change detection
filter_text_replaced:
type: boolean
default: true
description: Include replaced text in change detection
# Restock/Price Detection
in_stock_only:
type: boolean
default: true
description: Only trigger on in-stock transitions (restock_diff processor)
follow_price_changes:
type: boolean
default: true
description: Monitor and track price changes (restock_diff processor)
price_change_threshold_percent:
type: [number, 'null']
description: Minimum price change percentage to trigger notification
has_ldjson_price_data:
type: [boolean, 'null']
description: Whether page has LD-JSON price data (auto-detected)
readOnly: true
# Notifications
notification_screenshot:
type: boolean
default: false
description: Include screenshot in notifications (if supported by notification URL)
filter_failure_notification_send:
type: boolean
default: true
description: Send notification when filters fail to match content
# History & Display
use_page_title_in_list:
type: [boolean, 'null']
description: Display page title in watch list (null = use system default)
history_snapshot_max_length:
type: [integer, 'null']
minimum: 1
maximum: 1000
description: Maximum number of history snapshots to keep (null = use system default)
# Scheduling
time_schedule_limit:
type: object
description: Weekly schedule limiting when checks can run
properties:
enabled:
type: boolean
default: false
monday:
$ref: '#/components/schemas/DaySchedule'
tuesday:
$ref: '#/components/schemas/DaySchedule'
wednesday:
$ref: '#/components/schemas/DaySchedule'
thursday:
$ref: '#/components/schemas/DaySchedule'
friday:
$ref: '#/components/schemas/DaySchedule'
saturday:
$ref: '#/components/schemas/DaySchedule'
sunday:
$ref: '#/components/schemas/DaySchedule'
# Conditions (advanced logic)
conditions:
type: array
items:
type: object
properties:
field:
type: string
description: Field to check (e.g., 'page_filtered_text', 'page_title')
operator:
type: string
description: Comparison operator (e.g., 'contains_regex', 'equals', 'not_equals')
value:
type: string
description: Value to compare against
required: [field, operator, value]
maxItems: 100
description: Array of condition rules for change detection logic (empty array when not set)
conditions_match_logic:
type: string
enum: ['ALL', 'ANY']
default: 'ALL'
description: Logic operator - ALL (match all conditions) or ANY (match any condition)
DaySchedule:
type: object
properties:
enabled:
type: boolean
default: true
start_time:
type: string
pattern: '^([0-1]?[0-9]|2[0-3]):[0-5][0-9]$'
default: '00:00'
description: Start time in HH:MM format
duration:
type: object
properties:
hours:
type: string
pattern: '^[0-9]+$'
default: '24'
minutes:
type: string
pattern: '^[0-9]+$'
default: '00'
Watch:
allOf:
- $ref: '#/components/schemas/WatchBase'
- type: object
properties:
uuid:
type: string
format: uuid
description: Unique identifier for the web page change monitor (watch)
readOnly: true
last_checked:
type: integer
description: Unix timestamp of last check
@@ -470,10 +278,9 @@ components:
type: integer
description: Unix timestamp of last change
readOnly: true
x-computed: true
last_error:
type: [string, boolean, 'null']
description: Last error message (false when no error, string when error occurred, null if not checked yet)
type: string
description: Last error message
readOnly: true
last_viewed:
type: integer
@@ -484,61 +291,6 @@ components:
format: string
description: The watch URL rendered in case of any Jinja2 markup, always use this for listing.
readOnly: true
x-computed: true
page_title:
type: [string, 'null']
description: HTML <title> tag extracted from the page
readOnly: true
check_count:
type: integer
description: Total number of checks performed
readOnly: true
fetch_time:
type: number
description: Duration of last fetch in seconds
readOnly: true
previous_md5:
type: [string, boolean]
description: MD5 hash of previous content (false if not set)
readOnly: true
previous_md5_before_filters:
type: [string, boolean]
description: MD5 hash before filters applied (false if not set)
readOnly: true
consecutive_filter_failures:
type: integer
description: Counter for consecutive filter match failures
readOnly: true
last_notification_error:
type: [string, 'null']
description: Last notification error message
readOnly: true
notification_alert_count:
type: integer
description: Number of notifications sent
readOnly: true
content-type:
type: [string, 'null']
description: Content-Type from last fetch
readOnly: true
remote_server_reply:
type: [string, 'null']
description: Server header from last response
readOnly: true
browser_steps_last_error_step:
type: [integer, 'null']
description: Last browser step that caused an error
readOnly: true
viewed:
type: [integer, boolean]
description: Computed property - true if watch has been viewed, false otherwise (deprecated, use last_viewed instead)
readOnly: true
x-computed: true
history_n:
type: integer
description: Number of history snapshots available
readOnly: true
x-computed: true
CreateWatch:
allOf:
@@ -549,45 +301,34 @@ components:
UpdateWatch:
allOf:
- $ref: '#/components/schemas/WatchBase' # Extends WatchBase for user-settable fields
- $ref: '#/components/schemas/WatchBase'
- type: object
properties:
last_viewed:
type: integer
description: Unix timestamp in seconds of the last time the watch was viewed. Setting it to a value higher than `last_changed` in the "Update watch" endpoint marks the watch as viewed.
minimum: 0
# Note: ReadOnly and @property fields are filtered out in the backend before update
# We don't use unevaluatedProperties:false here to allow roundtrip GET/PUT workflows
# where the response includes computed fields that should be silently ignored
Tag:
allOf:
- $ref: '#/components/schemas/WatchBase'
- type: object
properties:
overrides_watch:
type: [boolean, 'null']
description: |
Whether this tag's settings override watch settings for all watches in this tag/group.
- true: Tag settings override watch settings
- false: Tag settings do not override (watches use their own settings)
- null: Not decided yet / inherit default behavior
# Future: Aggregated statistics from all watches with this tag
# check_count:
# type: integer
# description: Sum of check_count from all watches with this tag
# readOnly: true
# x-computed: true
# last_checked:
# type: integer
# description: Most recent last_checked timestamp from all watches with this tag
# readOnly: true
# x-computed: true
# last_changed:
# type: integer
# description: Most recent last_changed timestamp from all watches with this tag
# readOnly: true
# x-computed: true
type: object
properties:
uuid:
type: string
format: uuid
description: Unique identifier for the tag
readOnly: true
title:
type: string
description: Tag title
maxLength: 5000
notification_urls:
type: array
items:
type: string
description: Default notification URLs for web page change monitors (watches) with this tag
notification_muted:
type: boolean
description: Whether notifications are muted for this tag
CreateTag:
allOf:
@@ -1762,92 +1503,46 @@ paths:
post:
operationId: importWatches
tags: [Import]
summary: Import watch URLs with configuration
description: |
Import a list of URLs to monitor with optional watch configuration. Accepts line-separated URLs in request body.
**Configuration via Query Parameters:**
You can pass ANY watch configuration field as query parameters to apply settings to all imported watches.
All parameters from the Watch schema are supported (processor, fetch_backend, notification_urls, etc.).
**Special Parameters:**
- `tag` / `tag_uuids` - Assign tags to imported watches
- `proxy` - Use specific proxy for imported watches
- `dedupe` - Skip duplicate URLs (default: true)
**Type Conversion:**
- Booleans: `true`, `false`, `1`, `0`, `yes`, `no`
- Arrays: Comma-separated or JSON format (`[item1,item2]`)
- Objects: JSON format (`{"key":"value"}`)
- Numbers: Parsed as int or float
summary: Import watch URLs
description: Import a list of URLs to monitor. Accepts line-separated URLs in request body.
x-code-samples:
- lang: 'curl'
source: |
# Basic import
curl -X POST "http://localhost:5000/api/v1/import" \
-H "x-api-key: YOUR_API_KEY" \
-H "Content-Type: text/plain" \
-d $'https://example.com\nhttps://example.org\nhttps://example.net'
# Import with processor and fetch backend
curl -X POST "http://localhost:5000/api/v1/import?processor=restock_diff&fetch_backend=html_webdriver" \
-H "x-api-key: YOUR_API_KEY" \
-H "Content-Type: text/plain" \
-d $'https://example.com\nhttps://example.org'
# Import with multiple settings
curl -X POST "http://localhost:5000/api/v1/import?processor=restock_diff&paused=true&tag=production" \
-H "x-api-key: YOUR_API_KEY" \
-H "Content-Type: text/plain" \
-d $'https://example.com'
- lang: 'Python'
source: |
import requests
headers = {
'x-api-key': 'YOUR_API_KEY',
'Content-Type': 'text/plain'
}
# Basic import
urls = 'https://example.com\nhttps://example.org\nhttps://example.net'
response = requests.post('http://localhost:5000/api/v1/import',
response = requests.post('http://localhost:5000/api/v1/import',
headers=headers, data=urls)
print(response.json())
# Import with configuration
params = {
'processor': 'restock_diff',
'fetch_backend': 'html_webdriver',
'paused': 'false',
'tag': 'production'
}
response = requests.post('http://localhost:5000/api/v1/import',
headers=headers, params=params, data=urls)
print(response.json())
parameters:
- name: tag_uuids
in: query
description: Tag UUID(s) to apply to imported watches (comma-separated for multiple)
description: Tag UUID to apply to imported web page change monitors (watches)
schema:
type: string
example: "550e8400-e29b-41d4-a716-446655440000"
- name: tag
in: query
description: Tag name to apply to imported watches
description: Tag name to apply to imported web page change monitors (watches)
schema:
type: string
example: "production"
- name: proxy
in: query
description: Proxy key to use for imported watches
description: Proxy key to use for imported web page change monitors (watches)
schema:
type: string
example: "proxy1"
- name: dedupe
in: query
description: Skip duplicate URLs (default true)
description: Remove duplicate URLs (default true)
schema:
type: boolean
default: true
+62 -435
View File
File diff suppressed because one or more lines are too long
+16 -9
View File
@@ -1,21 +1,23 @@
# eventlet>=0.38.0 # Removed - replaced with threading mode for better Python 3.12+ compatibility
feedgen~=1.0
feedparser~=6.0 # For parsing RSS/Atom feeds
flask-compress
# 0.6.3 included compatibility fix for werkzeug 3.x (2.x had deprecation of url handlers)
flask-login>=0.6.3
flask-paginate
flask-socketio>=5.6.1,<6 # Re #3910
flask>=3.1,<4
flask_cors # For the Chrome extension to operate
flask_expects_json~=1.7
flask_restful
flask_cors # For the Chrome extension to operate
# janus # No longer needed - using pure threading.Queue for multi-loop support
flask_wtf~=1.2
flask~=3.1
flask-socketio~=5.6.0
python-socketio~=5.16.0
python-engineio~=4.13.0
inscriptis~=2.2
python-engineio>=4.9.0,<5
python-socketio>=5.11.0,<6
pytz
timeago~=1.0
validators~=0.35
werkzeug==3.1.6
# Set these versions together to avoid a RequestsDependencyWarning
# >= 2.26 also adds Brotli support if brotli is installed
@@ -97,8 +99,12 @@ pytest ~=9.0
pytest-flask ~=1.3
pytest-mock ~=3.15
# Anything 4.0 and up but not 5.0
jsonschema ~= 4.26
# OpenAPI validation support
openapi-core[flask] ~= 0.22
openapi-core[flask] >= 0.19.0
loguru
@@ -120,7 +126,8 @@ greenlet >= 3.0.3
# Default SOCKETIO_MODE=threading is recommended for better compatibility
gevent
referencing # Don't pin — jsonschema-path (required by openapi-core>=0.18) caps referencing<0.37.0, so pinning 0.37.0 forces openapi-core back to 0.17.2. Revisit once jsonschema-path>=0.3.5 relaxes the cap.
# Pinned or it causes problems with flask_expects_json which seems unmaintained
referencing==0.35.1
# For conditions
panzi-json-logic