Compare commits

..

1 Commits

Author SHA1 Message Date
dgtlmoon 71eb1daf70 Ability to limit total number of watches with env var PAGE_WATCH_LIMIT 2026-02-02 11:12:25 +01:00
132 changed files with 2649 additions and 8454 deletions
@@ -37,29 +37,10 @@ jobs:
${{ runner.os }}-pip-py${{ env.PYTHON_VERSION }}-
${{ runner.os }}-pip-
- name: Get current date for cache key
id: date
run: echo "date=$(date +'%Y-%m-%d')" >> $GITHUB_OUTPUT
- name: Set up Docker Buildx
uses: docker/setup-buildx-action@v3
- name: Build changedetection.io container for testing under Python ${{ env.PYTHON_VERSION }}
uses: docker/build-push-action@v6
with:
context: ./
file: ./Dockerfile
build-args: |
PYTHON_VERSION=${{ env.PYTHON_VERSION }}
LOGGER_LEVEL=TRACE
tags: test-changedetectionio
load: true
cache-from: type=gha,scope=build-${{ github.ref_name }}-py${{ env.PYTHON_VERSION }}-${{ hashFiles('requirements.txt', 'Dockerfile') }}-${{ steps.date.outputs.date }}
cache-to: type=gha,mode=max,scope=build-${{ github.ref_name }}-py${{ env.PYTHON_VERSION }}-${{ hashFiles('requirements.txt', 'Dockerfile') }}-${{ steps.date.outputs.date }}
- name: Verify build
run: |
echo "---- Built for Python ${{ env.PYTHON_VERSION }} -----"
echo "---- Building for Python ${{ env.PYTHON_VERSION }} -----"
docker build --build-arg PYTHON_VERSION=${{ env.PYTHON_VERSION }} --build-arg LOGGER_LEVEL=TRACE -t test-changedetectionio .
docker run test-changedetectionio bash -c 'pip list'
- name: We should be Python ${{ env.PYTHON_VERSION }} ...
@@ -103,7 +84,7 @@ jobs:
docker run test-changedetectionio bash -c 'python3 -m unittest changedetectionio.tests.unit.test_watch_model'
docker run test-changedetectionio bash -c 'python3 -m unittest changedetectionio.tests.unit.test_jinja2_security'
docker run test-changedetectionio bash -c 'python3 -m unittest changedetectionio.tests.unit.test_semver'
docker run test-changedetectionio bash -c 'python3 -m unittest changedetectionio.tests.unit.test_html_to_text'
docker run test-changedetectionio bash -c 'python3 -m unittest changedetectionio.tests.unit.test_html_to_text'
# Basic pytest tests with ancillary services
basic-tests:
@@ -395,29 +376,6 @@ jobs:
cd changedetectionio
./run_custom_browser_url_tests.sh
processor-plugin-tests:
runs-on: ubuntu-latest
needs: build
timeout-minutes: 20
env:
PYTHON_VERSION: ${{ inputs.python-version }}
steps:
- uses: actions/checkout@v6
- name: Download Docker image artifact
uses: actions/download-artifact@v7
with:
name: test-changedetectionio-${{ env.PYTHON_VERSION }}
path: /tmp
- name: Load Docker image
run: |
docker load -i /tmp/test-changedetectionio.tar
- name: Basic processor plugin registration and checks
run: |
docker run -e EXTRA_PACKAGES=changedetection.io-osint-processor test-changedetectionio bash -c 'cd changedetectionio;pytest -vvv -s tests/plugins/test_processor.py::test_check_plugin_processor'
# Container startup tests
container-tests:
runs-on: ubuntu-latest
@@ -516,142 +474,3 @@ jobs:
exit 1
fi
docker rm sig-test
# Upgrade path test
upgrade-path-test:
runs-on: ubuntu-latest
needs: build
timeout-minutes: 25
env:
PYTHON_VERSION: ${{ inputs.python-version }}
steps:
- uses: actions/checkout@v6
with:
fetch-depth: 0 # Fetch all history and tags for upgrade testing
- name: Set up Python ${{ env.PYTHON_VERSION }}
uses: actions/setup-python@v6
with:
python-version: ${{ env.PYTHON_VERSION }}
- name: Check upgrade works without error
run: |
echo "=== Testing upgrade path from 0.49.1 to ${{ github.ref_name }} (${{ github.sha }}) ==="
# Checkout old version and create datastore
git checkout 0.49.1
python3 -m venv .venv
source .venv/bin/activate
pip install -r requirements.txt
pip install 'pyOpenSSL>=23.2.0'
echo "=== Running version 0.49.1 to create datastore ==="
python3 ./changedetection.py -C -d /tmp/data &
APP_PID=$!
# Wait for app to be ready
echo "Waiting for 0.49.1 to be ready..."
sleep 6
# Extract API key from datastore (0.49.1 uses url-watches.json)
API_KEY=$(jq -r '.settings.application.api_access_token // empty' /tmp/data/url-watches.json)
echo "API Key: ${API_KEY:0:8}..."
# Create a watch with tag "github-group-test" via API
echo "Creating test watch with tag via API..."
curl -X POST "http://127.0.0.1:5000/api/v1/watch" \
-H "x-api-key: ${API_KEY}" \
-H "Content-Type: application/json" \
--show-error --fail \
--retry 6 --retry-delay 1 --retry-connrefused \
-d '{
"url": "https://example.com/upgrade-test",
"tag": "github-group-test"
}'
echo "✓ Created watch with tag 'github-group-test'"
# Create a specific test URL watch
echo "Creating test URL watch via API..."
curl -X POST "http://127.0.0.1:5000/api/v1/watch" \
-H "x-api-key: ${API_KEY}" \
-H "Content-Type: application/json" \
--show-error --fail \
-d '{
"url": "http://localhost/test.txt"
}'
echo "✓ Created watch for 'http://localhost/test.txt' in version 0.49.1"
# Stop the old version gracefully
kill $APP_PID
wait $APP_PID || true
echo "✓ Version 0.49.1 stopped"
# Upgrade to current version (use commit SHA since we're in detached HEAD)
echo "Upgrading to commit ${{ github.sha }}"
git checkout ${{ github.sha }}
pip install -r requirements.txt
echo "=== Running current version (commit ${{ github.sha }}) with old datastore (testing mode) ==="
TESTING_SHUTDOWN_AFTER_DATASTORE_LOAD=1 python3 ./changedetection.py -d /tmp/data > /tmp/upgrade-test.log 2>&1
echo "=== Upgrade test output ==="
cat /tmp/upgrade-test.log
echo "✓ Datastore upgraded successfully"
# Now start the current version normally to verify the tag survived
echo "=== Starting current version to verify tag exists after upgrade ==="
timeout 20 python3 ./changedetection.py -d /tmp/data > /tmp/ui-test.log 2>&1 &
APP_PID=$!
# Wait for app to be ready and fetch UI
echo "Waiting for current version to be ready..."
sleep 5
curl --retry 6 --retry-delay 1 --retry-connrefused --silent http://127.0.0.1:5000 > /tmp/ui-output.html
# Verify tag exists in UI
if grep -q "github-group-test" /tmp/ui-output.html; then
echo "✓ Tag 'github-group-test' found in UI after upgrade"
else
echo "ERROR: Tag 'github-group-test' not found in UI after upgrade"
echo "=== UI Output ==="
cat /tmp/ui-output.html
echo "=== App Log ==="
cat /tmp/ui-test.log
kill $APP_PID || true
exit 1
fi
# Verify test URL exists in UI
if grep -q "http://localhost/test.txt" /tmp/ui-output.html; then
echo "✓ Watch URL 'http://localhost/test.txt' found in UI after upgrade"
else
echo "ERROR: Watch URL 'http://localhost/test.txt' not found in UI after upgrade"
echo "=== UI Output ==="
cat /tmp/ui-output.html
echo "=== App Log ==="
cat /tmp/ui-test.log
kill $APP_PID || true
exit 1
fi
# Cleanup
kill $APP_PID || true
wait $APP_PID || true
echo ""
echo "✓✓✓ Upgrade test passed: 0.49.1 → ${{ github.ref_name }} ✓✓✓"
echo " - Commit: ${{ github.sha }}"
echo " - Datastore migrated successfully"
echo " - Tag 'github-group-test' survived upgrade"
echo " - Watch URL 'http://localhost/test.txt' survived upgrade"
echo "✓ Upgrade test passed: 0.49.1 → ${{ github.ref_name }}"
- name: Upload upgrade test logs
if: always()
uses: actions/upload-artifact@v6
with:
name: upgrade-test-logs-py${{ env.PYTHON_VERSION }}
path: /tmp/upgrade-test.log
-1
View File
@@ -29,4 +29,3 @@ test-datastore/
# Memory consumption log
test-memory.log
tests/logs/
-9
View File
@@ -138,15 +138,6 @@ ENV LOGGER_LEVEL="$LOGGER_LEVEL"
ENV LC_ALL=en_US.UTF-8
WORKDIR /app
# Copy and set up entrypoint script for installing extra packages
COPY docker-entrypoint.sh /docker-entrypoint.sh
RUN chmod +x /docker-entrypoint.sh
# Set entrypoint to handle EXTRA_PACKAGES env var
ENTRYPOINT ["/docker-entrypoint.sh"]
# Default command (can be overridden in docker-compose.yml)
CMD ["python", "./changedetection.py", "-d", "/datastore"]
+30 -25
View File
@@ -2,7 +2,7 @@
# Read more https://github.com/dgtlmoon/changedetection.io/wiki
# Semver means never use .01, or 00. Should be .1.
__version__ = '0.53.1'
__version__ = '0.52.9'
from changedetectionio.strtobool import strtobool
from json.decoder import JSONDecodeError
@@ -102,8 +102,8 @@ def sigshutdown_handler(_signo, _stack_frame):
# Shutdown workers and queues immediately
try:
from changedetectionio import worker_pool
worker_pool.shutdown_workers()
from changedetectionio import worker_handler
worker_handler.shutdown_workers()
except Exception as e:
logger.error(f"Error shutting down workers: {str(e)}")
@@ -112,9 +112,9 @@ def sigshutdown_handler(_signo, _stack_frame):
from changedetectionio.flask_app import update_q, notification_q
update_q.close()
notification_q.close()
logger.debug("Queues closed successfully")
logger.debug("Janus queues closed successfully")
except Exception as e:
logger.critical(f"CRITICAL: Failed to close queues: {e}")
logger.critical(f"CRITICAL: Failed to close janus queues: {e}")
# Shutdown socketio server fast
from changedetectionio.flask_app import socketio_server
@@ -124,9 +124,13 @@ def sigshutdown_handler(_signo, _stack_frame):
except Exception as e:
logger.error(f"Error shutting down Socket.IO server: {str(e)}")
# With immediate persistence, all data is already saved
logger.success('All data already persisted (immediate commits enabled).')
# Save data quickly - force immediate save using abstract method
try:
datastore.force_save_all()
logger.success('Fast sync to storage complete.')
except Exception as e:
logger.error(f"Error syncing to storage: {str(e)}")
sys.exit()
def print_help():
@@ -182,6 +186,7 @@ def main():
from changedetectionio.flask_app import changedetection_app
datastore_path = None
do_cleanup = False
# Set a default logger level
logger_level = 'DEBUG'
include_default_watches = True
@@ -264,7 +269,7 @@ def main():
i += 1
try:
opts, args = getopt.getopt(cleaned_argv[1:], "6Csd:h:p:l:P:", "port")
opts, args = getopt.getopt(cleaned_argv[1:], "6Ccsd:h:p:l:P:", "port")
except getopt.GetoptError as e:
print_help()
print(f'Error: {e}')
@@ -292,6 +297,10 @@ def main():
if opt == '-d':
datastore_path = arg
# Cleanup (remove text files that arent in the index)
if opt == '-c':
do_cleanup = True
# Create the datadir if it doesnt exist
if opt == '-C':
create_datastore_dir = True
@@ -371,15 +380,7 @@ def main():
# Dont' start if the JSON DB looks corrupt
logger.critical(f"ERROR: JSON DB or Proxy List JSON at '{app_config['datastore_path']}' appears to be corrupt, aborting.")
logger.critical(str(e))
sys.exit(1)
# Testing mode: Exit cleanly after datastore initialization (for CI/CD upgrade tests)
if os.environ.get('TESTING_SHUTDOWN_AFTER_DATASTORE_LOAD'):
logger.success(f"TESTING MODE: Datastore loaded successfully from {app_config['datastore_path']}")
logger.success(f"TESTING MODE: Schema version: {datastore.data['settings']['application'].get('schema_version', 'unknown')}")
logger.success(f"TESTING MODE: Loaded {len(datastore.data['watching'])} watches")
logger.success("TESTING MODE: Exiting cleanly (TESTING_SHUTDOWN_AFTER_DATASTORE_LOAD is set)")
sys.exit(0)
return
# Apply all_paused setting if specified via CLI
if all_paused is not None:
@@ -414,12 +415,12 @@ def main():
# This must happen AFTER app initialization so update_q is available
if batch_mode and added_watch_uuids:
from changedetectionio.flask_app import update_q
from changedetectionio import queuedWatchMetaData, worker_pool
from changedetectionio import queuedWatchMetaData, worker_handler
logger.info(f"Batch mode: Queuing {len(added_watch_uuids)} newly added watches")
for watch_uuid in added_watch_uuids:
try:
worker_pool.queue_item_async_safe(
worker_handler.queue_item_async_safe(
update_q,
queuedWatchMetaData.PrioritizedItem(priority=1, item={'uuid': watch_uuid})
)
@@ -431,7 +432,7 @@ def main():
# This must happen AFTER app initialization so update_q is available
if recheck_watches is not None:
from changedetectionio.flask_app import update_q
from changedetectionio import queuedWatchMetaData, worker_pool
from changedetectionio import queuedWatchMetaData, worker_handler
watches_to_queue = []
if recheck_watches == 'all':
@@ -453,7 +454,7 @@ def main():
for watch_uuid in watches_to_queue:
if watch_uuid in datastore.data['watching']:
try:
worker_pool.queue_item_async_safe(
worker_handler.queue_item_async_safe(
update_q,
queuedWatchMetaData.PrioritizedItem(priority=1, item={'uuid': watch_uuid})
)
@@ -515,7 +516,7 @@ def main():
for watch_uuid in watches_to_queue:
if watch_uuid in datastore.data['watching']:
try:
worker_pool.queue_item_async_safe(
worker_handler.queue_item_async_safe(
update_q,
queuedWatchMetaData.PrioritizedItem(priority=1, item={'uuid': watch_uuid})
)
@@ -548,7 +549,7 @@ def main():
logger.info(f"Batch mode: Waiting for iteration {current_iteration}/{total_iterations} to complete...")
# Use the shared wait_for_all_checks function
completed = worker_pool.wait_for_all_checks(update_q, timeout=300)
completed = worker_handler.wait_for_all_checks(update_q, timeout=300)
if not completed:
logger.warning(f"Batch mode: Iteration {current_iteration} timed out after 300 seconds")
@@ -605,6 +606,10 @@ def main():
else:
logger.info("SIGUSR1 handler only registered on Linux, skipped.")
# Go into cleanup mode
if do_cleanup:
datastore.remove_unused_snapshots()
app.config['datastore_path'] = datastore_path
@@ -613,7 +618,7 @@ def main():
return dict(right_sticky="v{}".format(datastore.data['version_tag']),
new_version_available=app.config['NEW_VERSION_AVAILABLE'],
has_password=datastore.data['settings']['application']['password'] != False,
socket_io_enabled=datastore.data['settings']['application'].get('ui', {}).get('socket_io_enabled', True),
socket_io_enabled=datastore.data['settings']['application']['ui'].get('socket_io_enabled', True),
all_paused=datastore.data['settings']['application'].get('all_paused', False),
all_muted=datastore.data['settings']['application'].get('all_muted', False)
)
+7 -169
View File
@@ -4,10 +4,6 @@ from flask import request
from functools import wraps
from . import auth, validate_openapi_request
from ..validate_url import is_safe_valid_url
import json
# Number of URLs above which import switches to background processing
IMPORT_SWITCH_TO_BACKGROUND_THRESHOLD = 20
def default_content_type(content_type='text/plain'):
@@ -23,76 +19,6 @@ def default_content_type(content_type='text/plain'):
return decorator
def convert_query_param_to_type(value, schema_property):
"""
Convert a query parameter string to the appropriate type based on schema definition.
Args:
value: String value from query parameter
schema_property: Schema property definition with 'type' or 'anyOf' field
Returns:
Converted value in the appropriate type
Supports both OpenAPI 3.1 formats:
- type: [string, 'null'] (array format)
- anyOf: [{type: string}, {type: null}] (anyOf format)
"""
prop_type = schema_property.get('type')
# Handle OpenAPI 3.1 type arrays: type: [string, 'null']
if isinstance(prop_type, list):
# Use the first non-null type from the array
for t in prop_type:
if t != 'null':
prop_type = t
break
else:
prop_type = None
# Handle anyOf schemas (older format)
elif 'anyOf' in schema_property:
# Use the first non-null type from anyOf
for option in schema_property['anyOf']:
if option.get('type') and option.get('type') != 'null':
prop_type = option.get('type')
break
else:
prop_type = None
# Handle array type (e.g., notification_urls)
if prop_type == 'array':
# Support both comma-separated and JSON array format
if value.startswith('['):
try:
return json.loads(value)
except json.JSONDecodeError:
return [v.strip() for v in value.split(',')]
return [v.strip() for v in value.split(',')]
# Handle object type (e.g., time_between_check, headers)
elif prop_type == 'object':
try:
return json.loads(value)
except json.JSONDecodeError:
raise ValueError(f"Invalid JSON object for field: {value}")
# Handle boolean type
elif prop_type == 'boolean':
return strtobool(value)
# Handle integer type
elif prop_type == 'integer':
return int(value)
# Handle number type (float)
elif prop_type == 'number':
return float(value)
# Default: return as string
return value
class Import(Resource):
def __init__(self, **kwargs):
# datastore is a black box dependency
@@ -102,128 +28,40 @@ class Import(Resource):
@default_content_type('text/plain') #3547 #3542
@validate_openapi_request('importWatches')
def post(self):
"""Import a list of watched URLs with optional watch configuration."""
from . import get_watch_schema_properties
# Special parameters that are NOT watch configuration
special_params = {'tag', 'tag_uuids', 'dedupe', 'proxy'}
"""Import a list of watched URLs."""
extras = {}
# Handle special 'proxy' parameter
if request.args.get('proxy'):
plist = self.datastore.proxy_list
if not request.args.get('proxy') in plist:
proxy_list_str = ', '.join(plist) if plist else 'none configured'
return f"Invalid proxy choice, currently supported proxies are '{proxy_list_str}'", 400
return "Invalid proxy choice, currently supported proxies are '{}'".format(', '.join(plist)), 400
else:
extras['proxy'] = request.args.get('proxy')
# Handle special 'dedupe' parameter
dedupe = strtobool(request.args.get('dedupe', 'true'))
# Handle special 'tag' and 'tag_uuids' parameters
tags = request.args.get('tag')
tag_uuids = request.args.get('tag_uuids')
if tag_uuids:
tag_uuids = tag_uuids.split(',')
# Extract ALL other query parameters as watch configuration
# Get schema from OpenAPI spec (replaces old schema_create_watch)
schema_properties = get_watch_schema_properties()
for param_name, param_value in request.args.items():
# Skip special parameters
if param_name in special_params:
continue
# Skip if not in schema (unknown parameter)
if param_name not in schema_properties:
return f"Unknown watch configuration parameter: {param_name}", 400
# Convert to appropriate type based on schema
try:
converted_value = convert_query_param_to_type(param_value, schema_properties[param_name])
extras[param_name] = converted_value
except (ValueError, json.JSONDecodeError) as e:
return f"Invalid value for parameter '{param_name}': {str(e)}", 400
# Validate processor if provided
if 'processor' in extras:
from changedetectionio.processors import available_processors
available = [p[0] for p in available_processors()]
if extras['processor'] not in available:
return f"Invalid processor '{extras['processor']}'. Available processors: {', '.join(available)}", 400
# Validate fetch_backend if provided
if 'fetch_backend' in extras:
from changedetectionio.content_fetchers import available_fetchers
available = [f[0] for f in available_fetchers()]
# Also allow 'system' and extra_browser_* patterns
is_valid = (
extras['fetch_backend'] == 'system' or
extras['fetch_backend'] in available or
extras['fetch_backend'].startswith('extra_browser_')
)
if not is_valid:
return f"Invalid fetch_backend '{extras['fetch_backend']}'. Available: system, {', '.join(available)}", 400
# Validate notification_urls if provided
if 'notification_urls' in extras:
from wtforms import ValidationError
from changedetectionio.api.Notifications import validate_notification_urls
try:
validate_notification_urls(extras['notification_urls'])
except ValidationError as e:
return f"Invalid notification_urls: {str(e)}", 400
urls = request.get_data().decode('utf8').splitlines()
# Clean and validate URLs upfront
urls_to_import = []
added = []
for url in urls:
url = url.strip()
if not len(url):
continue
# Validate URL
# If hosts that only contain alphanumerics are allowed ("localhost" for example)
if not is_safe_valid_url(url):
return f"Invalid or unsupported URL - {url}", 400
# Check for duplicates if dedupe is enabled
if dedupe and self.datastore.url_exists(url):
continue
urls_to_import.append(url)
new_uuid = self.datastore.add_watch(url=url, extras=extras, tag=tags, tag_uuids=tag_uuids)
added.append(new_uuid)
# For small imports, process synchronously for immediate feedback
if len(urls_to_import) < IMPORT_SWITCH_TO_BACKGROUND_THRESHOLD:
added = []
for url in urls_to_import:
new_uuid = self.datastore.add_watch(url=url, extras=extras, tag=tags, tag_uuids=tag_uuids)
added.append(new_uuid)
return added, 200
# For large imports (>= 20), process in background thread
else:
import threading
from loguru import logger
def import_watches_background():
"""Background thread to import watches - discarded after completion."""
try:
added_count = 0
for url in urls_to_import:
try:
self.datastore.add_watch(url=url, extras=extras, tag=tags, tag_uuids=tag_uuids)
added_count += 1
except Exception as e:
logger.error(f"Error importing URL {url}: {e}")
logger.info(f"Background import complete: {added_count} watches created")
except Exception as e:
logger.error(f"Error in background import: {e}")
# Start background thread and return immediately
thread = threading.Thread(target=import_watches_background, daemon=True, name="ImportWatches-Background")
thread.start()
return {'status': f'Importing {len(urls_to_import)} URLs in background', 'count': len(urls_to_import)}, 202
return added
+7 -2
View File
@@ -1,6 +1,8 @@
from flask_expects_json import expects_json
from flask_restful import Resource, abort
from flask import request
from . import auth, validate_openapi_request
from . import schema_create_notification_urls, schema_delete_notification_urls
class Notifications(Resource):
def __init__(self, **kwargs):
@@ -20,6 +22,7 @@ class Notifications(Resource):
@auth.check_token
@validate_openapi_request('addNotifications')
@expects_json(schema_create_notification_urls)
def post(self):
"""Create Notification URLs."""
@@ -47,6 +50,7 @@ class Notifications(Resource):
@auth.check_token
@validate_openapi_request('replaceNotifications')
@expects_json(schema_create_notification_urls)
def put(self):
"""Replace Notification URLs."""
json_data = request.get_json()
@@ -63,12 +67,13 @@ class Notifications(Resource):
clean_urls = [url.strip() for url in notification_urls if isinstance(url, str)]
self.datastore.data['settings']['application']['notification_urls'] = clean_urls
self.datastore.commit()
self.datastore.needs_write = True
return {'notification_urls': clean_urls}, 200
@auth.check_token
@validate_openapi_request('deleteNotifications')
@expects_json(schema_delete_notification_urls)
def delete(self):
"""Delete Notification URLs."""
@@ -90,7 +95,7 @@ class Notifications(Resource):
abort(400, message="No matching notification URLs found.")
self.datastore.data['settings']['application']['notification_urls'] = notification_urls
self.datastore.commit()
self.datastore.needs_write = True
return 'OK', 204
+18 -82
View File
@@ -1,5 +1,6 @@
from changedetectionio import queuedWatchMetaData
from changedetectionio import worker_pool
from changedetectionio import worker_handler
from flask_expects_json import expects_json
from flask_restful import abort, Resource
from loguru import logger
@@ -7,7 +8,8 @@ import threading
from flask import request
from . import auth
from . import validate_openapi_request
# Import schemas from __init__.py
from . import schema_tag, schema_create_tag, schema_update_tag, validate_openapi_request
class Tag(Resource):
@@ -22,7 +24,8 @@ class Tag(Resource):
@validate_openapi_request('getTag')
def get(self, uuid):
"""Get data for a single tag/group, toggle notification muting, or recheck all."""
tag = self.datastore.data['settings']['application']['tags'].get(uuid)
from copy import deepcopy
tag = deepcopy(self.datastore.data['settings']['application']['tags'].get(uuid))
if not tag:
abort(404, message=f'No tag exists with the UUID of {uuid}')
@@ -39,7 +42,7 @@ class Tag(Resource):
# If less than 20 watches, queue synchronously for immediate feedback
if len(watches_to_queue) < 20:
for watch_uuid in watches_to_queue:
worker_pool.queue_item_async_safe(self.update_q, queuedWatchMetaData.PrioritizedItem(priority=1, item={'uuid': watch_uuid}))
worker_handler.queue_item_async_safe(self.update_q, queuedWatchMetaData.PrioritizedItem(priority=1, item={'uuid': watch_uuid}))
return {'status': f'OK, queued {len(watches_to_queue)} watches for rechecking'}, 200
else:
# 20+ watches - queue in background thread to avoid blocking API response
@@ -47,7 +50,7 @@ class Tag(Resource):
"""Background thread to queue watches - discarded after completion."""
try:
for watch_uuid in watches_to_queue:
worker_pool.queue_item_async_safe(self.update_q, queuedWatchMetaData.PrioritizedItem(priority=1, item={'uuid': watch_uuid}))
worker_handler.queue_item_async_safe(self.update_q, queuedWatchMetaData.PrioritizedItem(priority=1, item={'uuid': watch_uuid}))
logger.info(f"Background queueing complete for tag {tag['uuid']}: {len(watches_to_queue)} watches queued")
except Exception as e:
logger.error(f"Error in background queueing for tag {tag['uuid']}: {e}")
@@ -59,33 +62,13 @@ class Tag(Resource):
return {'status': f'OK, queueing {len(watches_to_queue)} watches in background'}, 202
if request.args.get('muted', '') == 'muted':
tag['notification_muted'] = True
tag.commit()
self.datastore.data['settings']['application']['tags'][uuid]['notification_muted'] = True
return "OK", 200
elif request.args.get('muted', '') == 'unmuted':
tag['notification_muted'] = False
tag.commit()
self.datastore.data['settings']['application']['tags'][uuid]['notification_muted'] = False
return "OK", 200
# Filter out Watch-specific runtime fields that don't apply to Tags (yet)
# TODO: Future enhancement - aggregate these values from all Watches that have this tag:
# - check_count: sum of all watches' check_count
# - last_checked: most recent last_checked from all watches
# - last_changed: most recent last_changed from all watches
# - consecutive_filter_failures: count of watches with failures
# - etc.
# These come from watch_base inheritance but currently have no meaningful value for Tags
watch_only_fields = {
'browser_steps_last_error_step', 'check_count', 'consecutive_filter_failures',
'content-type', 'fetch_time', 'last_changed', 'last_checked', 'last_error',
'last_notification_error', 'last_viewed', 'notification_alert_count',
'page_title', 'previous_md5', 'remote_server_reply'
}
# Create clean tag dict without Watch-specific fields
clean_tag = {k: v for k, v in tag.items() if k not in watch_only_fields}
return clean_tag
return tag
@auth.check_token
@validate_openapi_request('deleteTag')
@@ -96,95 +79,48 @@ class Tag(Resource):
# Delete the tag, and any tag reference
del self.datastore.data['settings']['application']['tags'][uuid]
# Delete tag.json file if it exists
import os
tag_dir = os.path.join(self.datastore.datastore_path, uuid)
tag_json = os.path.join(tag_dir, "tag.json")
if os.path.exists(tag_json):
try:
os.unlink(tag_json)
logger.info(f"Deleted tag.json for tag {uuid}")
except Exception as e:
logger.error(f"Failed to delete tag.json for tag {uuid}: {e}")
# Remove tag from all watches
for watch_uuid, watch in self.datastore.data['watching'].items():
if watch.get('tags') and uuid in watch['tags']:
watch['tags'].remove(uuid)
watch.commit()
return 'OK', 204
@auth.check_token
@validate_openapi_request('updateTag')
@expects_json(schema_update_tag)
def put(self, uuid):
"""Update tag information."""
tag = self.datastore.data['settings']['application']['tags'].get(uuid)
if not tag:
abort(404, message='No tag exists with the UUID of {}'.format(uuid))
# Make a mutable copy of request.json for modification
json_data = dict(request.json)
# Validate notification_urls if provided
if 'notification_urls' in json_data:
if 'notification_urls' in request.json:
from wtforms import ValidationError
from changedetectionio.api.Notifications import validate_notification_urls
try:
notification_urls = json_data.get('notification_urls', [])
notification_urls = request.json.get('notification_urls', [])
validate_notification_urls(notification_urls)
except ValidationError as e:
return str(e), 400
# Filter out readOnly fields (extracted from OpenAPI spec Tag schema)
# These are system-managed fields that should never be user-settable
from . import get_readonly_tag_fields
readonly_fields = get_readonly_tag_fields()
# Tag model inherits from watch_base but has no @property attributes of its own
# So we only need to filter readOnly fields
for field in readonly_fields:
json_data.pop(field, None)
# Validate remaining fields - reject truly unknown fields
# Get valid fields from Tag schema
from . import get_tag_schema_properties
valid_fields = set(get_tag_schema_properties().keys())
# Check for unknown fields
unknown_fields = set(json_data.keys()) - valid_fields
if unknown_fields:
return f"Unknown field(s): {', '.join(sorted(unknown_fields))}", 400
tag.update(json_data)
tag.commit()
# Clear checksums for all watches using this tag to force reprocessing
# Tag changes affect inherited configuration
cleared_count = self.datastore.clear_checksums_for_tag(uuid)
logger.info(f"Tag {uuid} updated via API, cleared {cleared_count} watch checksums")
tag.update(request.json)
self.datastore.needs_write_urgent = True
return "OK", 200
@auth.check_token
@validate_openapi_request('createTag')
# Only cares for {'title': 'xxxx'}
def post(self):
"""Create a single tag/group."""
json_data = request.get_json()
title = json_data.get("title",'').strip()
# Validate that only valid fields are provided
# Get valid fields from Tag schema
from . import get_tag_schema_properties
valid_fields = set(get_tag_schema_properties().keys())
# Check for unknown fields
unknown_fields = set(json_data.keys()) - valid_fields
if unknown_fields:
return f"Unknown field(s): {', '.join(sorted(unknown_fields))}", 400
new_uuid = self.datastore.add_tag(title=title)
if new_uuid:
+86 -82
View File
@@ -6,13 +6,15 @@ from changedetectionio.favicon_utils import get_favicon_mime_type
from . import auth
from changedetectionio import queuedWatchMetaData, strtobool
from changedetectionio import worker_pool
from changedetectionio import worker_handler
from flask import request, make_response, send_from_directory
from flask_expects_json import expects_json
from flask_restful import abort, Resource
from loguru import logger
import copy
from . import validate_openapi_request, get_readonly_watch_fields
# Import schemas from __init__.py
from . import schema, schema_create_watch, schema_update_watch, validate_openapi_request
from ..notification import valid_notification_formats
from ..notification.handler import newline_re
@@ -64,46 +66,47 @@ class Watch(Resource):
@validate_openapi_request('getWatch')
def get(self, uuid):
"""Get information about a single watch, recheck, pause, or mute."""
# Get watch reference first (for pause/mute operations)
watch_obj = self.datastore.data['watching'].get(uuid)
if not watch_obj:
import time
from copy import deepcopy
watch = None
# Retry up to 20 times if dict is being modified
# With sleep(0), this is fast: ~200µs best case, ~20ms worst case under heavy load
for attempt in range(20):
try:
watch = deepcopy(self.datastore.data['watching'].get(uuid))
break
except RuntimeError:
# Dict changed during deepcopy, retry after yielding to scheduler
# sleep(0) releases GIL and yields - no fixed delay, just lets other threads run
if attempt < 19: # Don't yield on last attempt
time.sleep(0) # Yield to scheduler (microseconds, not milliseconds)
if not watch:
abort(404, message='No watch exists with the UUID of {}'.format(uuid))
# Create a dict copy for JSON response (with lock for thread safety)
# This is much faster than deepcopy and doesn't copy the datastore reference
# WARNING: dict() is a SHALLOW copy - nested dicts are shared with original!
# Only safe because we only ADD scalar properties (line 97-101), never modify nested dicts
# If you need to modify nested dicts, use: from copy import deepcopy; watch = deepcopy(dict(watch_obj))
with self.datastore.lock:
watch = dict(watch_obj)
if request.args.get('recheck'):
worker_pool.queue_item_async_safe(self.update_q, queuedWatchMetaData.PrioritizedItem(priority=1, item={'uuid': uuid}))
worker_handler.queue_item_async_safe(self.update_q, queuedWatchMetaData.PrioritizedItem(priority=1, item={'uuid': uuid}))
return "OK", 200
if request.args.get('paused', '') == 'paused':
watch_obj.pause()
watch_obj.commit()
self.datastore.data['watching'].get(uuid).pause()
return "OK", 200
elif request.args.get('paused', '') == 'unpaused':
watch_obj.unpause()
watch_obj.commit()
self.datastore.data['watching'].get(uuid).unpause()
return "OK", 200
if request.args.get('muted', '') == 'muted':
watch_obj.mute()
watch_obj.commit()
self.datastore.data['watching'].get(uuid).mute()
return "OK", 200
elif request.args.get('muted', '') == 'unmuted':
watch_obj.unmute()
watch_obj.commit()
self.datastore.data['watching'].get(uuid).unmute()
return "OK", 200
# Return without history, get that via another API call
# Properties are not returned as a JSON, so add the required props manually
watch['history_n'] = watch_obj.history_n
watch['history_n'] = watch.history_n
# attr .last_changed will check for the last written text snapshot on change
watch['last_changed'] = watch_obj.last_changed
watch['viewed'] = watch_obj.viewed
watch['link'] = watch_obj.link,
watch['last_changed'] = watch.last_changed
watch['viewed'] = watch.viewed
watch['link'] = watch.link,
return watch
@@ -119,6 +122,7 @@ class Watch(Resource):
@auth.check_token
@validate_openapi_request('updateWatch')
@expects_json(schema_update_watch)
def put(self, uuid):
"""Update watch information."""
watch = self.datastore.data['watching'].get(uuid)
@@ -165,48 +169,58 @@ class Watch(Resource):
# Handle processor-config-* fields separately (save to JSON, not datastore)
from changedetectionio import processors
processor_config_data = {}
regular_data = {}
# Make a mutable copy of request.json for modification
json_data = dict(request.json)
# Extract and remove processor config fields from json_data
processor_config_data = processors.extract_processor_config_from_form_data(json_data)
# Filter out readOnly fields (extracted from OpenAPI spec Watch schema)
# These are system-managed fields that should never be user-settable
readonly_fields = get_readonly_watch_fields()
# Also filter out @property attributes (computed/derived values from the model)
# These are not stored and should be ignored in PUT requests
from changedetectionio.model.Watch import model as WatchModel
property_fields = WatchModel.get_property_names()
# Combine both sets of fields to ignore
fields_to_ignore = readonly_fields | property_fields
# Remove all ignored fields from update data
for field in fields_to_ignore:
json_data.pop(field, None)
# Validate remaining fields - reject truly unknown fields
# Get valid fields from WatchBase schema
from . import get_watch_schema_properties
valid_fields = set(get_watch_schema_properties().keys())
# Also allow last_viewed (explicitly defined in UpdateWatch schema)
valid_fields.add('last_viewed')
# Check for unknown fields
unknown_fields = set(json_data.keys()) - valid_fields
if unknown_fields:
return f"Unknown field(s): {', '.join(sorted(unknown_fields))}", 400
for key, value in request.json.items():
if key.startswith('processor_config_'):
config_key = key.replace('processor_config_', '')
if value: # Only save non-empty values
processor_config_data[config_key] = value
else:
regular_data[key] = value
# Update watch with regular (non-processor-config) fields
watch.update(json_data)
watch.commit()
watch.update(regular_data)
# Save processor config to JSON file
processors.save_processor_config(self.datastore, uuid, processor_config_data)
# Save processor config to JSON file if any config data exists
if processor_config_data:
try:
processor_name = request.json.get('processor', watch.get('processor'))
if processor_name:
# Create a processor instance to access config methods
from changedetectionio.processors import difference_detection_processor
processor_instance = difference_detection_processor(self.datastore, uuid)
# Use processor name as filename so each processor keeps its own config
config_filename = f'{processor_name}.json'
processor_instance.update_extra_watch_config(config_filename, processor_config_data)
logger.debug(f"API: Saved processor config to {config_filename}: {processor_config_data}")
# Call optional edit_hook if processor has one
try:
import importlib
edit_hook_module_name = f'changedetectionio.processors.{processor_name}.edit_hook'
try:
edit_hook = importlib.import_module(edit_hook_module_name)
logger.debug(f"API: Found edit_hook module for {processor_name}")
if hasattr(edit_hook, 'on_config_save'):
logger.info(f"API: Calling edit_hook.on_config_save for {processor_name}")
# Call hook and get updated config
updated_config = edit_hook.on_config_save(watch, processor_config_data, self.datastore)
# Save updated config back to file
processor_instance.update_extra_watch_config(config_filename, updated_config)
logger.info(f"API: Edit hook updated config: {updated_config}")
else:
logger.debug(f"API: Edit hook module found but no on_config_save function")
except ModuleNotFoundError:
logger.debug(f"API: No edit_hook module for processor {processor_name} (this is normal)")
except Exception as hook_error:
logger.error(f"API: Edit hook error (non-fatal): {hook_error}", exc_info=True)
except Exception as e:
logger.error(f"API: Failed to save processor config: {e}")
return "OK", 200
@@ -400,10 +414,10 @@ class WatchFavicon(Resource):
favicon_filename = watch.get_favicon_filename()
if favicon_filename:
# Use cached MIME type detection
filepath = os.path.join(watch.data_dir, favicon_filename)
filepath = os.path.join(watch.watch_data_dir, favicon_filename)
mime = get_favicon_mime_type(filepath)
response = make_response(send_from_directory(watch.data_dir, favicon_filename))
response = make_response(send_from_directory(watch.watch_data_dir, favicon_filename))
response.headers['Content-type'] = mime
response.headers['Cache-Control'] = 'max-age=300, must-revalidate' # Cache for 5 minutes, then revalidate
return response
@@ -419,6 +433,7 @@ class CreateWatch(Resource):
@auth.check_token
@validate_openapi_request('createWatch')
@expects_json(schema_create_watch)
def post(self):
"""Create a single watch."""
@@ -449,14 +464,8 @@ class CreateWatch(Resource):
except ValidationError as e:
return str(e), 400
# Handle processor-config-* fields separately (save to JSON, not watch)
from changedetectionio import processors
extras = copy.deepcopy(json_data)
# Extract and remove processor config fields from extras
processor_config_data = processors.extract_processor_config_from_form_data(extras)
# Because we renamed 'tag' to 'tags' but don't want to change the API (can do this in v2 of the API)
tags = None
if extras.get('tag'):
@@ -466,13 +475,9 @@ class CreateWatch(Resource):
del extras['url']
new_uuid = self.datastore.add_watch(url=url, extras=extras, tag=tags)
# Save processor config to separate JSON file
if new_uuid and processor_config_data:
processors.save_processor_config(self.datastore, new_uuid, processor_config_data)
if new_uuid:
# Dont queue because the scheduler will check that it hasnt been checked before anyway
# worker_pool.queue_item_async_safe(self.update_q, queuedWatchMetaData.PrioritizedItem(priority=1, item={'uuid': new_uuid}))
# worker_handler.queue_item_async_safe(self.update_q, queuedWatchMetaData.PrioritizedItem(priority=1, item={'uuid': new_uuid}))
return {'uuid': new_uuid}, 201
else:
# Check if it was a limit issue
@@ -506,7 +511,6 @@ class CreateWatch(Resource):
'last_error': watch['last_error'],
'link': watch.link,
'page_title': watch['page_title'],
'tags': [*tags], # Unpack dict keys to list (can't use list() since variable named 'list')
'title': watch['title'],
'url': watch['url'],
'viewed': watch.viewed
@@ -520,7 +524,7 @@ class CreateWatch(Resource):
if len(watches_to_queue) < 20:
# Get already queued/running UUIDs once (efficient)
queued_uuids = set(self.update_q.get_queued_uuids())
running_uuids = set(worker_pool.get_running_uuids())
running_uuids = set(worker_handler.get_running_uuids())
# Filter out watches that are already queued or running
watches_to_queue_filtered = [
@@ -530,7 +534,7 @@ class CreateWatch(Resource):
# Queue only the filtered watches
for uuid in watches_to_queue_filtered:
worker_pool.queue_item_async_safe(self.update_q, queuedWatchMetaData.PrioritizedItem(priority=1, item={'uuid': uuid}))
worker_handler.queue_item_async_safe(self.update_q, queuedWatchMetaData.PrioritizedItem(priority=1, item={'uuid': uuid}))
# Provide feedback about skipped watches
skipped_count = len(watches_to_queue) - len(watches_to_queue_filtered)
@@ -542,7 +546,7 @@ class CreateWatch(Resource):
# 20+ watches - queue in background thread to avoid blocking API response
# Capture queued/running state before background thread
queued_uuids = set(self.update_q.get_queued_uuids())
running_uuids = set(worker_pool.get_running_uuids())
running_uuids = set(worker_handler.get_running_uuids())
def queue_all_watches_background():
"""Background thread to queue all watches - discarded after completion."""
@@ -552,7 +556,7 @@ class CreateWatch(Resource):
for uuid in watches_to_queue:
# Check if already queued or running (state captured at start)
if uuid not in queued_uuids and uuid not in running_uuids:
worker_pool.queue_item_async_safe(self.update_q, queuedWatchMetaData.PrioritizedItem(priority=1, item={'uuid': uuid}))
worker_handler.queue_item_async_safe(self.update_q, queuedWatchMetaData.PrioritizedItem(priority=1, item={'uuid': uuid}))
queued_count += 1
else:
skipped_count += 1
+29 -83
View File
@@ -1,6 +1,33 @@
import copy
import functools
from flask import request, abort
from loguru import logger
from . import api_schema
from ..model import watch_base
# Build a JSON Schema atleast partially based on our Watch model
watch_base_config = watch_base()
schema = api_schema.build_watch_json_schema(watch_base_config)
schema_create_watch = copy.deepcopy(schema)
schema_create_watch['required'] = ['url']
del schema_create_watch['properties']['last_viewed']
schema_update_watch = copy.deepcopy(schema)
schema_update_watch['additionalProperties'] = False
# Tag schema is also based on watch_base since Tag inherits from it
schema_tag = copy.deepcopy(schema)
schema_create_tag = copy.deepcopy(schema_tag)
schema_create_tag['required'] = ['title']
schema_update_tag = copy.deepcopy(schema_tag)
schema_update_tag['additionalProperties'] = False
schema_notification_urls = copy.deepcopy(schema)
schema_create_notification_urls = copy.deepcopy(schema_notification_urls)
schema_create_notification_urls['required'] = ['notification_urls']
schema_delete_notification_urls = copy.deepcopy(schema_notification_urls)
schema_delete_notification_urls['required'] = ['notification_urls']
@functools.cache
def get_openapi_spec():
@@ -19,79 +46,6 @@ def get_openapi_spec():
_openapi_spec = OpenAPI.from_dict(spec_dict)
return _openapi_spec
@functools.cache
def get_openapi_schema_dict():
"""
Get the raw OpenAPI spec dictionary for schema access.
Used by Import endpoint to validate and convert query parameters.
Returns the YAML dict directly (not the OpenAPI object).
"""
import os
import yaml
spec_path = os.path.join(os.path.dirname(__file__), '../../docs/api-spec.yaml')
if not os.path.exists(spec_path):
spec_path = os.path.join(os.path.dirname(__file__), '../docs/api-spec.yaml')
with open(spec_path, 'r', encoding='utf-8') as f:
return yaml.safe_load(f)
@functools.cache
def _resolve_schema_properties(schema_name):
"""
Generic helper to resolve schema properties, including allOf inheritance.
Args:
schema_name: Name of the schema (e.g., 'WatchBase', 'Watch', 'Tag')
Returns:
dict: All properties including inherited ones from $ref schemas
"""
spec_dict = get_openapi_schema_dict()
schema = spec_dict['components']['schemas'].get(schema_name, {})
properties = {}
# Handle allOf (schema inheritance)
if 'allOf' in schema:
for item in schema['allOf']:
# Resolve $ref to parent schema
if '$ref' in item:
ref_path = item['$ref'].split('/')[-1]
ref_schema = spec_dict['components']['schemas'].get(ref_path, {})
properties.update(ref_schema.get('properties', {}))
# Add schema-specific properties
if 'properties' in item:
properties.update(item['properties'])
else:
# Direct properties (no inheritance)
properties = schema.get('properties', {})
return properties
@functools.cache
def get_watch_schema_properties():
"""
Extract watch schema properties from OpenAPI spec for Import endpoint.
Returns WatchBase properties (all writable Watch fields).
"""
return _resolve_schema_properties('WatchBase')
# Import readonly field utilities from shared module (avoids circular dependencies with model layer)
from changedetectionio.model.schema_utils import get_readonly_watch_fields, get_readonly_tag_fields
@functools.cache
def get_tag_schema_properties():
"""
Extract Tag schema properties from OpenAPI spec.
Returns WatchBase properties + Tag-specific properties (overrides_watch).
"""
return _resolve_schema_properties('Tag')
def validate_openapi_request(operation_id):
"""Decorator to validate incoming requests against OpenAPI spec."""
def decorator(f):
@@ -110,16 +64,8 @@ def validate_openapi_request(operation_id):
if result.errors:
error_details = []
for error in result.errors:
# Extract detailed schema errors from __cause__
if hasattr(error, '__cause__') and hasattr(error.__cause__, 'schema_errors'):
for schema_error in error.__cause__.schema_errors:
field = '.'.join(str(p) for p in schema_error.path) if schema_error.path else 'body'
msg = schema_error.message if hasattr(schema_error, 'message') else str(schema_error)
error_details.append(f"{field}: {msg}")
else:
error_details.append(str(error))
logger.error(f"API Call - Validation failed: {'; '.join(error_details)}")
raise BadRequest(f"Validation failed: {'; '.join(error_details)}")
error_details.append(str(error))
raise BadRequest(f"OpenAPI validation failed: {error_details}")
except BadRequest:
# Re-raise BadRequest exceptions (validation failures)
raise
+162
View File
@@ -0,0 +1,162 @@
# Responsible for building the storage dict into a set of rules ("JSON Schema") acceptable via the API
# Probably other ways to solve this when the backend switches to some ORM
from changedetectionio.notification import valid_notification_formats
def build_time_between_check_json_schema():
# Setup time between check schema
schema_properties_time_between_check = {
"type": "object",
"additionalProperties": False,
"properties": {}
}
for p in ['weeks', 'days', 'hours', 'minutes', 'seconds']:
schema_properties_time_between_check['properties'][p] = {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
]
}
return schema_properties_time_between_check
def build_watch_json_schema(d):
# Base JSON schema
schema = {
'type': 'object',
'properties': {},
}
for k, v in d.items():
# @todo 'integer' is not covered here because its almost always for internal usage
if isinstance(v, type(None)):
schema['properties'][k] = {
"anyOf": [
{"type": "null"},
]
}
elif isinstance(v, list):
schema['properties'][k] = {
"anyOf": [
{"type": "array",
# Always is an array of strings, like text or regex or something
"items": {
"type": "string",
"maxLength": 5000
}
},
]
}
elif isinstance(v, bool):
schema['properties'][k] = {
"anyOf": [
{"type": "boolean"},
]
}
elif isinstance(v, str):
schema['properties'][k] = {
"anyOf": [
{"type": "string",
"maxLength": 5000},
]
}
# Can also be a string (or None by default above)
for v in ['body',
'notification_body',
'notification_format',
'notification_title',
'proxy',
'tag',
'title',
'webdriver_js_execute_code'
]:
schema['properties'][v]['anyOf'].append({'type': 'string', "maxLength": 5000})
for v in ['last_viewed']:
schema['properties'][v] = {
"type": "integer",
"description": "Unix timestamp in seconds of the last time the watch was viewed.",
"minimum": 0
}
# None or Boolean
schema['properties']['track_ldjson_price_data']['anyOf'].append({'type': 'boolean'})
schema['properties']['method'] = {"type": "string",
"enum": ["GET", "POST", "DELETE", "PUT"]
}
schema['properties']['fetch_backend']['anyOf'].append({"type": "string",
"enum": ["html_requests", "html_webdriver"]
})
schema['properties']['processor'] = {"anyOf": [
{"type": "string", "enum": ["restock_diff", "text_json_diff"]},
{"type": "null"}
]}
# All headers must be key/value type dict
schema['properties']['headers'] = {
"type": "object",
"patternProperties": {
# Should always be a string:string type value
".*": {"type": "string"},
}
}
schema['properties']['notification_format'] = {'type': 'string',
'enum': list(valid_notification_formats.keys())
}
# Stuff that shouldn't be available but is just state-storage
for v in ['previous_md5', 'last_error', 'has_ldjson_price_data', 'previous_md5_before_filters', 'uuid']:
del schema['properties'][v]
schema['properties']['webdriver_delay']['anyOf'].append({'type': 'integer'})
schema['properties']['time_between_check'] = build_time_between_check_json_schema()
schema['properties']['time_between_check_use_default'] = {
"type": "boolean",
"default": True,
"description": "Whether to use global settings for time between checks - defaults to true if not set"
}
schema['properties']['browser_steps'] = {
"anyOf": [
{
"type": "array",
"items": {
"type": "object",
"properties": {
"operation": {
"type": ["string", "null"],
"maxLength": 5000 # Allows null and any string up to 5000 chars (including "")
},
"selector": {
"type": ["string", "null"],
"maxLength": 5000
},
"optional_value": {
"type": ["string", "null"],
"maxLength": 5000
}
},
"required": ["operation", "selector", "optional_value"],
"additionalProperties": False # No extra keys allowed
}
},
{"type": "null"}, # Allows null for `browser_steps`
{"type": "array", "maxItems": 0} # Allows empty array []
]
}
# headers ?
return schema
@@ -3,9 +3,7 @@ from .processors.exceptions import ProcessorException
import changedetectionio.content_fetchers.exceptions as content_fetchers_exceptions
from changedetectionio.processors.text_json_diff.processor import FilterNotFoundInResponse
from changedetectionio import html_tools
from changedetectionio import worker_pool
from changedetectionio.flask_app import watch_check_update
from changedetectionio.queuedWatchMetaData import PrioritizedItem
import asyncio
import importlib
@@ -48,33 +46,19 @@ async def async_update_worker(worker_id, q, notification_q, app, datastore, exec
jobs_processed = 0
start_time = time.time()
# Log thread name for debugging
import threading
thread_name = threading.current_thread().name
logger.info(f"Starting async worker {worker_id} on thread '{thread_name}' (max_jobs={max_jobs}, max_runtime={max_runtime_seconds}s)")
logger.info(f"Starting async worker {worker_id} (max_jobs={max_jobs}, max_runtime={max_runtime_seconds}s)")
while not app.config.exit.is_set():
update_handler = None
watch = None
try:
# Efficient blocking via run_in_executor (no polling overhead!)
# Worker blocks in threading.Queue.get() which uses Condition.wait()
# Executor must be sized to match worker count (see worker_pool.py: 50 threads default)
# Single timeout (no double-timeout wrapper) = no race condition
queued_item_data = await q.async_get(executor=executor, timeout=1.0)
# CRITICAL: Claim UUID immediately after getting from queue to prevent race condition
# in wait_for_all_checks() which checks qsize() and running_uuids separately
uuid = queued_item_data.item.get('uuid')
if not worker_pool.claim_uuid_for_processing(uuid, worker_id):
# Already being processed - re-queue and continue
logger.trace(f"Worker {worker_id} detected UUID {uuid} already processing during claim - deferring")
await asyncio.sleep(DEFER_SLEEP_TIME_ALREADY_QUEUED)
deferred_priority = max(1000, queued_item_data.priority * 10)
deferred_item = PrioritizedItem(priority=deferred_priority, item=queued_item_data.item)
worker_pool.queue_item_async_safe(q, deferred_item, silent=True)
continue
# Use sync interface via run_in_executor since each worker has its own event loop
loop = asyncio.get_event_loop()
queued_item_data = await asyncio.wait_for(
loop.run_in_executor(executor, q.get, True, 1.0), # block=True, timeout=1.0
timeout=1.5
)
except asyncio.TimeoutError:
# No jobs available - check if we should restart based on time while idle
@@ -83,17 +67,6 @@ async def async_update_worker(worker_id, q, notification_q, app, datastore, exec
logger.info(f"Worker {worker_id} idle and reached max runtime ({runtime:.0f}s), restarting")
return "restart"
continue
except RuntimeError as e:
# Handle executor shutdown gracefully - this is expected during shutdown
if "cannot schedule new futures after shutdown" in str(e):
# Executor shut down - exit gracefully without logging in pytest
if not IN_PYTEST:
logger.debug(f"Worker {worker_id} detected executor shutdown, exiting")
break
# Other RuntimeError - log and continue
logger.error(f"Worker {worker_id} runtime error: {e}")
await asyncio.sleep(0.1)
continue
except Exception as e:
# Handle expected Empty exception from queue timeout
import queue
@@ -115,8 +88,26 @@ async def async_update_worker(worker_id, q, notification_q, app, datastore, exec
await asyncio.sleep(0.1)
continue
# UUID already claimed above immediately after getting from queue
# to prevent race condition with wait_for_all_checks()
uuid = queued_item_data.item.get('uuid')
# RACE CONDITION FIX: Atomically claim this UUID for processing
from changedetectionio import worker_handler
from changedetectionio.queuedWatchMetaData import PrioritizedItem
# Try to claim the UUID atomically - prevents duplicate processing
if not worker_handler.claim_uuid_for_processing(uuid, worker_id):
# Already being processed by another worker
logger.trace(f"Worker {worker_id} detected UUID {uuid} already being processed - deferring")
# Sleep to avoid tight loop and give the other worker time to finish
await asyncio.sleep(DEFER_SLEEP_TIME_ALREADY_QUEUED)
# Re-queue with lower priority so it gets checked again after current processing finishes
deferred_priority = max(1000, queued_item_data.priority * 10)
deferred_item = PrioritizedItem(priority=deferred_priority, item=queued_item_data.item)
worker_handler.queue_item_async_safe(q, deferred_item, silent=True)
logger.debug(f"Worker {worker_id} re-queued UUID {uuid} for subsequent check")
continue
fetch_start_time = round(time.time())
@@ -142,14 +133,11 @@ async def async_update_worker(worker_id, q, notification_q, app, datastore, exec
processor = watch.get('processor', 'text_json_diff')
# Init a new 'difference_detection_processor'
# Use get_processor_module() to support both built-in and plugin processors
from changedetectionio.processors import get_processor_module
processor_module = get_processor_module(processor)
if not processor_module:
error_msg = f"Processor module '{processor}' not found."
logger.error(error_msg)
raise ModuleNotFoundError(error_msg)
try:
processor_module = importlib.import_module(f"changedetectionio.processors.{processor}.processor")
except ModuleNotFoundError as e:
print(f"Processor module '{processor}' not found.")
raise e
update_handler = processor_module.perform_site_check(datastore=datastore,
watch_uuid=uuid)
@@ -236,7 +224,6 @@ async def async_update_worker(worker_id, q, notification_q, app, datastore, exec
except FilterNotFoundInResponse as e:
if not datastore.data['watching'].get(uuid):
continue
logger.debug(f"Received FilterNotFoundInResponse exception for {uuid}")
err_text = "Warning, no filters were found, no change detection ran - Did the page change layout? update your Visual Filter if necessary."
datastore.update_watch(uuid=uuid, update_obj={'last_error': err_text})
@@ -256,19 +243,17 @@ async def async_update_worker(worker_id, q, notification_q, app, datastore, exec
c += 1
# Send notification if we reached the threshold?
threshold = datastore.data['settings']['application'].get('filter_failure_notification_threshold_attempts', 0)
logger.debug(f"FilterNotFoundInResponse - Filter for {uuid} not found, consecutive_filter_failures: {c} of threshold {threshold}")
logger.debug(f"Filter for {uuid} not found, consecutive_filter_failures: {c} of threshold {threshold}")
if c >= threshold:
if not watch.get('notification_muted'):
logger.debug(f"FilterNotFoundInResponse - Sending filter failed notification for {uuid}")
logger.debug(f"Sending filter failed notification for {uuid}")
await send_filter_failure_notification(uuid, notification_q, datastore)
c = 0
logger.debug(f"FilterNotFoundInResponse - Reset filter failure count back to zero")
else:
logger.debug(f"FilterNotFoundInResponse - {c} of threshold {threshold}..")
logger.debug(f"Reset filter failure count back to zero")
datastore.update_watch(uuid=uuid, update_obj={'consecutive_filter_failures': c})
else:
logger.trace(f"FilterNotFoundInResponse - {uuid} - filter_failure_notification_send not enabled, skipping")
logger.trace(f"{uuid} - filter_failure_notification_send not enabled, skipping")
process_changedetection_results = False
@@ -276,9 +261,6 @@ async def async_update_worker(worker_id, q, notification_q, app, datastore, exec
# Yes fine, so nothing todo, don't continue to process.
process_changedetection_results = False
changed_detected = False
logger.debug(f'[{uuid}] - checksumFromPreviousCheckWasTheSame - Checksum from previous check was the same, nothing todo here.')
# Reset the edited flag since we successfully completed the check
watch.reset_watch_edited_flag()
except content_fetchers_exceptions.BrowserConnectError as e:
datastore.update_watch(uuid=uuid,
@@ -371,9 +353,8 @@ async def async_update_worker(worker_id, q, notification_q, app, datastore, exec
logger.error(f"Exception (BrowserStepsInUnsupportedFetcher) reached processing watch UUID: {uuid}")
except Exception as e:
import traceback
logger.error(f"Worker {worker_id} exception processing watch UUID: {uuid}")
logger.exception(f"Worker {worker_id} full exception details:")
logger.error(str(e))
datastore.update_watch(uuid=uuid, update_obj={'last_error': "Exception: " + str(e)})
process_changedetection_results = False
@@ -381,7 +362,7 @@ async def async_update_worker(worker_id, q, notification_q, app, datastore, exec
if not datastore.data['watching'].get(uuid):
continue
update_obj['content-type'] = str(update_handler.fetcher.get_all_headers().get('content-type', '') or "").lower()
update_obj['content-type'] = update_handler.fetcher.get_all_headers().get('content-type', '').lower()
if not watch.get('ignore_status_codes'):
update_obj['consecutive_filter_failures'] = 0
@@ -392,11 +373,9 @@ async def async_update_worker(worker_id, q, notification_q, app, datastore, exec
if not datastore.data['watching'].get(uuid):
continue
logger.debug(f"Processing watch UUID: {uuid} - xpath_data length returned {len(update_handler.xpath_data) if update_handler and update_handler.xpath_data else 'empty.'}")
if update_handler and process_changedetection_results:
logger.debug(f"Processing watch UUID: {uuid} - xpath_data length returned {len(update_handler.xpath_data) if update_handler.xpath_data else 'empty.'}")
if process_changedetection_results:
try:
# Reset the edited flag BEFORE update_watch (which calls watch.update() and would set it again)
watch.reset_watch_edited_flag()
datastore.update_watch(uuid=uuid, update_obj=update_obj)
if changed_detected or not watch.history_n:
@@ -439,69 +418,64 @@ async def async_update_worker(worker_id, q, notification_q, app, datastore, exec
await send_content_changed_notification(uuid, notification_q, datastore)
except Exception as e:
logger.critical(f"Worker {worker_id} exception in process_changedetection_results")
logger.exception(f"Worker {worker_id} full exception details:")
logger.critical(str(e))
datastore.update_watch(uuid=uuid, update_obj={'last_error': str(e)})
# Always record attempt count
count = watch.get('check_count', 0) + 1
final_updates = {'fetch_time': round(time.time() - fetch_start_time, 3),
'check_count': count,
}
# Always record page title (used in notifications, and can change even when the content is the same)
if update_obj.get('content-type') and 'html' in update_obj.get('content-type'):
try:
page_title = html_tools.extract_title(data=update_handler.fetcher.content)
if page_title:
page_title = page_title.strip()[:2000]
logger.debug(f"UUID: {uuid} Page <title> is '{page_title}'")
datastore.update_watch(uuid=uuid, update_obj={'page_title': page_title})
except Exception as e:
logger.warning(f"UUID: {uuid} Exception when extracting <title> - {str(e)}")
# Record server header
try:
server_header = str(update_handler.fetcher.get_all_headers().get('server', '') or "").strip().lower()[:255]
if server_header:
final_updates['remote_server_reply'] = server_header
server_header = update_handler.fetcher.headers.get('server', '').strip().lower()[:255]
datastore.update_watch(uuid=uuid, update_obj={'remote_server_reply': server_header})
except Exception as e:
server_header = None
pass
if update_handler: # Could be none or empty if the processor was not found
# Always record page title (used in notifications, and can change even when the content is the same)
if update_obj.get('content-type') and 'html' in update_obj.get('content-type'):
try:
page_title = html_tools.extract_title(data=update_handler.fetcher.content)
if page_title:
page_title = page_title.strip()[:2000]
logger.debug(f"UUID: {uuid} Page <title> is '{page_title}'")
final_updates['page_title'] = page_title
except Exception as e:
logger.exception(f"Worker {worker_id} full exception details:")
logger.warning(f"UUID: {uuid} Exception when extracting <title> - {str(e)}")
# Store favicon if necessary
if update_handler.fetcher.favicon_blob and update_handler.fetcher.favicon_blob.get('base64'):
watch.bump_favicon(url=update_handler.fetcher.favicon_blob.get('url'),
favicon_base_64=update_handler.fetcher.favicon_blob.get('base64')
)
datastore.update_watch(uuid=uuid, update_obj={'fetch_time': round(time.time() - fetch_start_time, 3),
'check_count': count})
# NOW clear fetcher content - after all processing is complete
# This is the last point where we need the fetcher data
if update_handler and hasattr(update_handler, 'fetcher') and update_handler.fetcher:
update_handler.fetcher.clear_content()
logger.debug(f"Cleared fetcher content for UUID {uuid}")
# Store favicon if necessary
if update_handler.fetcher.favicon_blob and update_handler.fetcher.favicon_blob.get('base64'):
watch.bump_favicon(url=update_handler.fetcher.favicon_blob.get('url'),
favicon_base_64=update_handler.fetcher.favicon_blob.get('base64')
)
# Explicitly delete update_handler to free all references
if update_handler:
del update_handler
update_handler = None
datastore.update_watch(uuid=uuid, update_obj=final_updates)
# NOW clear fetcher content - after all processing is complete
# This is the last point where we need the fetcher data
if update_handler and hasattr(update_handler, 'fetcher') and update_handler.fetcher:
update_handler.fetcher.clear_content()
# Explicitly delete update_handler to free all references
if update_handler:
del update_handler
update_handler = None
# Force garbage collection
# Force aggressive memory cleanup after clearing
import gc
gc.collect()
try:
import ctypes
ctypes.CDLL('libc.so.6').malloc_trim(0)
except Exception:
pass
except Exception as e:
logger.error(f"Worker {worker_id} unexpected error processing {uuid}: {e}")
logger.exception(f"Worker {worker_id} full exception details:")
logger.error(f"Worker {worker_id} traceback:", exc_info=True)
# Also update the watch with error information
if datastore and uuid in datastore.data['watching']:
datastore.update_watch(uuid=uuid, update_obj={'last_error': f"Worker error: {str(e)}"})
@@ -509,43 +483,49 @@ async def async_update_worker(worker_id, q, notification_q, app, datastore, exec
finally:
# Always cleanup - this runs whether there was an exception or not
if uuid:
# Call quit() as backup (Puppeteer/Playwright have internal cleanup, but this acts as safety net)
try:
if update_handler and hasattr(update_handler, 'fetcher') and update_handler.fetcher:
await update_handler.fetcher.quit(watch=watch)
except Exception as e:
logger.error(f"Exception while cleaning/quit after calling browser: {e}")
logger.exception(f"Worker {worker_id} full exception details:")
try:
# Release UUID from processing (thread-safe)
worker_pool.release_uuid_from_processing(uuid, worker_id=worker_id)
worker_handler.release_uuid_from_processing(uuid, worker_id=worker_id)
# Send completion signal
if watch:
#logger.info(f"Worker {worker_id} sending completion signal for UUID {watch['uuid']}")
watch_check_update.send(watch_uuid=watch['uuid'])
# Clean up all memory references BEFORE garbage collection
# Explicitly clean up update_handler and all its references
if update_handler:
# Clear fetcher content using the proper method
if hasattr(update_handler, 'fetcher') and update_handler.fetcher:
update_handler.fetcher.clear_content()
# Clear processor references
if hasattr(update_handler, 'content_processor'):
update_handler.content_processor = None
del update_handler
update_handler = None
# Clear large content variables
# Clear local contents variable if it still exists
if 'contents' in locals():
del contents
# Force garbage collection after all references are cleared
# Note: We don't set watch = None here because:
# 1. watch is just a local reference to datastore.data['watching'][uuid]
# 2. Setting it to None doesn't affect the datastore
# 3. GC can't collect the object anyway (still referenced by datastore)
# 4. It would just cause confusion
# Force garbage collection after cleanup
import gc
gc.collect()
logger.debug(f"Worker {worker_id} completed watch {uuid} in {time.time()-fetch_start_time:.2f}s")
except Exception as cleanup_error:
logger.error(f"Worker {worker_id} error during cleanup: {cleanup_error}")
logger.exception(f"Worker {worker_id} full exception details:")
del(uuid)
@@ -47,7 +47,7 @@ def create_backup(datastore_path, watches: dict):
# Add any data in the watch data directory.
for uuid, w in watches.items():
for f in Path(w.data_dir).glob('*'):
for f in Path(w.watch_data_dir).glob('*'):
zipObj.write(f,
# Use the full path to access the file, but make the file 'relative' in the Zip.
arcname=os.path.join(f.parts[-2], f.parts[-1]),
@@ -102,7 +102,8 @@ def construct_blueprint(datastore: ChangeDetectionStore):
flash(gettext("Maximum number of backups reached, please remove some"), "error")
return redirect(url_for('backups.index'))
# With immediate persistence, all data is already saved
# Be sure we're written fresh - force immediate save using abstract method
datastore.force_save_all()
zip_thread = threading.Thread(
target=create_backup,
args=(datastore.datastore_path, datastore.data.get("watching")),
@@ -3,10 +3,10 @@
{% from '_helpers.html' import render_simple_field, render_field %}
<div class="edit-form">
<div class="box-wrap inner">
<h2>{{ _('Backups') }}</h2>
<h4>{{ _('Backups') }}</h4>
{% if backup_running %}
<p>
<span class="spinner"></span>&nbsp;<strong>{{ _('A backup is running!') }}</strong>
<strong>{{ _('A backup is running!') }}</strong>
</p>
{% endif %}
<p>
@@ -174,7 +174,7 @@ def construct_blueprint(datastore: ChangeDetectionStore):
browser_steps_blueprint = Blueprint('browser_steps', __name__, template_folder="templates")
async def start_browsersteps_session(watch_uuid):
from changedetectionio.browser_steps import browser_steps
from . import browser_steps
import time
from playwright.async_api import async_playwright
@@ -238,6 +238,7 @@ def construct_blueprint(datastore: ChangeDetectionStore):
@browser_steps_blueprint.route("/browsersteps_start_session", methods=['GET'])
def browsersteps_start_session():
# A new session was requested, return sessionID
import asyncio
import uuid
browsersteps_session_id = str(uuid.uuid4())
watch_uuid = request.args.get('uuid')
@@ -284,8 +285,8 @@ def construct_blueprint(datastore: ChangeDetectionStore):
watch = datastore.data['watching'].get(uuid)
filename = f"step_before-{step_n}.jpeg" if request.args.get('type', '') == 'before' else f"step_{step_n}.jpeg"
if step_n and watch and os.path.isfile(os.path.join(watch.data_dir, filename)):
response = make_response(send_from_directory(directory=watch.data_dir, path=filename))
if step_n and watch and os.path.isfile(os.path.join(watch.watch_data_dir, filename)):
response = make_response(send_from_directory(directory=watch.watch_data_dir, path=filename))
response.headers['Content-type'] = 'image/jpeg'
response.headers['Cache-Control'] = 'no-cache, no-store, must-revalidate'
response.headers['Pragma'] = 'no-cache'
@@ -300,10 +301,11 @@ def construct_blueprint(datastore: ChangeDetectionStore):
@browser_steps_blueprint.route("/browsersteps_update", methods=['POST'])
def browsersteps_ui_update():
import base64
import playwright._impl._errors
from changedetectionio.blueprint.browser_steps import browser_steps
remaining = 0
remaining =0
uuid = request.args.get('uuid')
goto_website_url_first_step = request.args.get('goto_website_url_first_step')
browsersteps_session_id = request.args.get('browsersteps_session_id')
@@ -314,33 +316,33 @@ def construct_blueprint(datastore: ChangeDetectionStore):
return make_response('No session exists under that ID', 500)
is_last_step = False
# @todo - should always be an existing session
if goto_website_url_first_step:
logger.debug("Going to site (requested automatically before stepping)..")
step_operation = "Goto site"
step_selector = None
step_optional_value = None
else:
# Actions - step/apply/etc, do the thing and return state
if request.method == 'POST':
# @todo - should always be an existing session
step_operation = request.form.get('operation')
step_selector = request.form.get('selector')
step_optional_value = request.form.get('optional_value')
is_last_step = strtobool(request.form.get('is_last_step'))
try:
# Run the async call_action method in the dedicated browser steps event loop
run_async_in_browser_loop(
browsersteps_sessions[browsersteps_session_id]['browserstepper'].call_action(
action_name=step_operation,
selector=step_selector,
optional_value=step_optional_value
try:
# Run the async call_action method in the dedicated browser steps event loop
run_async_in_browser_loop(
browsersteps_sessions[browsersteps_session_id]['browserstepper'].call_action(
action_name=step_operation,
selector=step_selector,
optional_value=step_optional_value
)
)
)
except Exception as e:
logger.error(f"Exception when calling step operation {step_operation} {str(e)}")
# Try to find something of value to give back to the user
return make_response(str(e).splitlines()[0], 401)
except Exception as e:
logger.error(f"Exception when calling step operation {step_operation} {str(e)}")
# Try to find something of value to give back to the user
return make_response(str(e).splitlines()[0], 401)
# if not this_session.page:
# cleanup_playwright_session()
# return make_response('Browser session ran out of time :( Please reload this page.', 401)
# Screenshots and other info only needed on requesting a step (POST)
try:
@@ -348,7 +350,7 @@ def construct_blueprint(datastore: ChangeDetectionStore):
(screenshot, xpath_data) = run_async_in_browser_loop(
browsersteps_sessions[browsersteps_session_id]['browserstepper'].get_current_state()
)
if is_last_step:
watch = datastore.data['watching'].get(uuid)
u = browsersteps_sessions[browsersteps_session_id]['browserstepper'].page.url
@@ -8,17 +8,6 @@ from changedetectionio.content_fetchers import SCREENSHOT_MAX_HEIGHT_DEFAULT
from changedetectionio.content_fetchers.base import manage_user_agent
from changedetectionio.jinja2_custom import render as jinja_render
def browser_steps_get_valid_steps(browser_steps: list):
if browser_steps is not None and len(browser_steps):
valid_steps = list(filter(
lambda s: (s['operation'] and len(s['operation']) and s['operation'] != 'Choose one'),browser_steps))
# Just incase they selected Goto site by accident with older JS
if valid_steps and valid_steps[0]['operation'] == 'Goto site':
del(valid_steps[0])
return valid_steps
return []
# Two flags, tell the JS which of the "Selector" or "Value" field should be enabled in the front end
@@ -14,7 +14,7 @@ def construct_blueprint(datastore: ChangeDetectionStore, update_q, queuedWatchMe
from changedetectionio import forms
#
if request.method == 'POST':
# from changedetectionio import worker_pool
# from changedetectionio import worker_handler
from changedetectionio.blueprint.imports.importer import (
import_url_list,
@@ -26,13 +26,12 @@ def construct_blueprint(datastore: ChangeDetectionStore, update_q, queuedWatchMe
# URL List import
if request.values.get('urls') and len(request.values.get('urls').strip()):
# Import and push into the queue for immediate update check
from changedetectionio import processors
importer_handler = import_url_list()
importer_handler.run(data=request.values.get('urls'), flash=flash, datastore=datastore, processor=request.values.get('processor', processors.get_default_processor()))
importer_handler.run(data=request.values.get('urls'), flash=flash, datastore=datastore, processor=request.values.get('processor', 'text_json_diff'))
logger.debug(f"Imported {len(importer_handler.new_uuids)} new UUIDs")
# Dont' add to queue because scheduler can see that they haven't been checked and will add them to the queue
# for uuid in importer_handler.new_uuids:
# worker_pool.queue_item_async_safe(update_q, queuedWatchMetaData.PrioritizedItem(priority=1, item={'uuid': uuid}))
# worker_handler.queue_item_async_safe(update_q, queuedWatchMetaData.PrioritizedItem(priority=1, item={'uuid': uuid}))
if len(importer_handler.remaining_data) == 0:
return redirect(url_for('watchlist.index'))
@@ -46,7 +45,7 @@ def construct_blueprint(datastore: ChangeDetectionStore, update_q, queuedWatchMe
d_importer.run(data=request.values.get('distill-io'), flash=flash, datastore=datastore)
# Dont' add to queue because scheduler can see that they haven't been checked and will add them to the queue
# for uuid in importer_handler.new_uuids:
# worker_pool.queue_item_async_safe(update_q, queuedWatchMetaData.PrioritizedItem(priority=1, item={'uuid': uuid}))
# worker_handler.queue_item_async_safe(update_q, queuedWatchMetaData.PrioritizedItem(priority=1, item={'uuid': uuid}))
# XLSX importer
@@ -71,7 +70,7 @@ def construct_blueprint(datastore: ChangeDetectionStore, update_q, queuedWatchMe
# Dont' add to queue because scheduler can see that they haven't been checked and will add them to the queue
# for uuid in importer_handler.new_uuids:
# worker_pool.queue_item_async_safe(update_q, queuedWatchMetaData.PrioritizedItem(priority=1, item={'uuid': uuid}))
# worker_handler.queue_item_async_safe(update_q, queuedWatchMetaData.PrioritizedItem(priority=1, item={'uuid': uuid}))
# Could be some remaining, or we could be on GET
@@ -4,7 +4,7 @@ from flask import Blueprint, flash, redirect, url_for
from flask_login import login_required
from changedetectionio.store import ChangeDetectionStore
from changedetectionio import queuedWatchMetaData
from changedetectionio import worker_pool
from changedetectionio import worker_handler
from queue import PriorityQueue
PRICE_DATA_TRACK_ACCEPT = 'accepted'
@@ -20,15 +20,13 @@ def construct_blueprint(datastore: ChangeDetectionStore, update_q: PriorityQueue
datastore.data['watching'][uuid]['track_ldjson_price_data'] = PRICE_DATA_TRACK_ACCEPT
datastore.data['watching'][uuid]['processor'] = 'restock_diff'
datastore.data['watching'][uuid].clear_watch()
datastore.data['watching'][uuid].commit()
worker_pool.queue_item_async_safe(update_q, queuedWatchMetaData.PrioritizedItem(priority=1, item={'uuid': uuid}))
worker_handler.queue_item_async_safe(update_q, queuedWatchMetaData.PrioritizedItem(priority=1, item={'uuid': uuid}))
return redirect(url_for("watchlist.index"))
@login_required
@price_data_follower_blueprint.route("/<string:uuid>/reject", methods=['GET'])
def reject(uuid):
datastore.data['watching'][uuid]['track_ldjson_price_data'] = PRICE_DATA_TRACK_REJECT
datastore.data['watching'][uuid].commit()
return redirect(url_for("watchlist.index"))
@@ -37,8 +37,6 @@ def construct_single_watch_routes(rss_blueprint, datastore):
rss_content_format = datastore.data['settings']['application'].get('rss_content_format')
if uuid == 'first':
uuid = list(datastore.data['watching'].keys()).pop()
# Get the watch by UUID
watch = datastore.data['watching'].get(uuid)
if not watch:
@@ -1,9 +1,8 @@
import os
from copy import deepcopy
from datetime import datetime, timedelta
from datetime import datetime
from zoneinfo import ZoneInfo, available_timezones
import secrets
import time
import flask_login
from flask import Blueprint, render_template, request, redirect, url_for, flash
from flask_babel import gettext
@@ -75,21 +74,16 @@ def construct_blueprint(datastore: ChangeDetectionStore):
del (app_update['password'])
datastore.data['settings']['application'].update(app_update)
# Handle dynamic worker count adjustment
old_worker_count = datastore.data['settings']['requests'].get('workers', 1)
new_worker_count = form.data['requests'].get('workers', 1)
datastore.data['settings']['requests'].update(form.data['requests'])
datastore.commit()
# Clear all checksums to force reprocessing with new settings
# Global settings can affect watch behavior (filters, rendering, etc.)
datastore.clear_all_last_checksums()
# Adjust worker count if it changed
if new_worker_count != old_worker_count:
from changedetectionio import worker_pool
from changedetectionio import worker_handler
from changedetectionio.flask_app import update_q, notification_q, app, datastore as ds
# Check CPU core availability and warn if worker count is high
@@ -98,7 +92,7 @@ def construct_blueprint(datastore: ChangeDetectionStore):
flash(gettext("Warning: Worker count ({}) is close to or exceeds available CPU cores ({})").format(
new_worker_count, cpu_count), 'warning')
result = worker_pool.adjust_async_worker_count(
result = worker_handler.adjust_async_worker_count(
new_count=new_worker_count,
update_q=update_q,
notification_q=notification_q,
@@ -115,11 +109,13 @@ def construct_blueprint(datastore: ChangeDetectionStore):
if not os.getenv("SALTED_PASS", False) and len(form.application.form.password.encrypted_password):
datastore.data['settings']['application']['password'] = form.application.form.password.encrypted_password
datastore.commit()
datastore.needs_write_urgent = True
flash(gettext("Password protection enabled."), 'notice')
flask_login.logout_user()
return redirect(url_for('watchlist.index'))
datastore.needs_write_urgent = True
# Also save plugin settings from the same form submission
plugin_tabs_list = get_plugin_settings_tabs()
for tab in plugin_tabs_list:
@@ -147,9 +143,6 @@ def construct_blueprint(datastore: ChangeDetectionStore):
active_plugins = get_active_plugins()
python_version = f"{sys.version_info.major}.{sys.version_info.minor}.{sys.version_info.micro}"
# Calculate uptime in seconds
uptime_seconds = time.time() - datastore.start_time
# Get plugin settings tabs and instantiate forms
plugin_tabs = get_plugin_settings_tabs()
plugin_forms = {}
@@ -168,7 +161,6 @@ def construct_blueprint(datastore: ChangeDetectionStore):
active_plugins=active_plugins,
api_key=datastore.data['settings']['application'].get('api_access_token'),
python_version=python_version,
uptime_seconds=uptime_seconds,
available_timezones=sorted(available_timezones()),
emailprefix=os.getenv('NOTIFICATION_MAIL_BUTTON_PREFIX', False),
extra_notification_token_placeholder_info=datastore.get_unique_notification_token_placeholders_available(),
@@ -189,7 +181,7 @@ def construct_blueprint(datastore: ChangeDetectionStore):
def settings_reset_api_key():
secret = secrets.token_hex(16)
datastore.data['settings']['application']['api_access_token'] = secret
datastore.commit()
datastore.needs_write_urgent = True
flash(gettext("API Key was regenerated."))
return redirect(url_for('settings.settings_page')+'#api')
@@ -206,7 +198,7 @@ def construct_blueprint(datastore: ChangeDetectionStore):
def toggle_all_paused():
current_state = datastore.data['settings']['application'].get('all_paused', False)
datastore.data['settings']['application']['all_paused'] = not current_state
datastore.commit()
datastore.needs_write_urgent = True
if datastore.data['settings']['application']['all_paused']:
flash(gettext("Automatic scheduling paused - checks will not be queued."), 'notice')
@@ -220,7 +212,7 @@ def construct_blueprint(datastore: ChangeDetectionStore):
def toggle_all_muted():
current_state = datastore.data['settings']['application'].get('all_muted', False)
datastore.data['settings']['application']['all_muted'] = not current_state
datastore.commit()
datastore.needs_write_urgent = True
if datastore.data['settings']['application']['all_muted']:
flash(gettext("All notifications muted."), 'notice')
@@ -25,7 +25,7 @@
<li class="tab"><a href="#ui-options">{{ _('UI Options') }}</a></li>
<li class="tab"><a href="#api">{{ _('API') }}</a></li>
<li class="tab"><a href="#rss">{{ _('RSS') }}</a></li>
<li class="tab"><a href="{{ url_for('backups.index') }}">{{ _('Backups') }}</a></li>
<li class="tab"><a href="{{ url_for('backups.index') }}" class="pure-menu-link">{{ _('Backups') }}</a></li>
<li class="tab"><a href="#timedate">{{ _('Time & Date') }}</a></li>
<li class="tab"><a href="#proxies">{{ _('CAPTCHA & Proxies') }}</a></li>
{% if plugin_tabs %}
@@ -59,14 +59,6 @@
{{ _('Set to') }} <strong>0</strong> {{ _('to disable') }}
</span>
</div>
<div class="pure-control-group">
{{ render_field(form.application.form.history_snapshot_max_length, class="history_snapshot_max_length") }}
<span class="pure-form-message-inline">{{ _('Limit collection of history snapshots for each watch to this number of history items.') }}
<br>
{{ _('Set to empty to disable / no limit') }}
</span>
</div>
<div class="pure-control-group">
{% if not hide_remove_pass %}
{% if current_user.is_authenticated %}
@@ -394,7 +386,6 @@ nav
{% endfor %}
{% endif %}
<div class="tab-pane-inner" id="info">
<p><strong>{{ _('Uptime:') }}</strong> {{ uptime_seconds|format_duration }}</p>
<p><strong>{{ _('Python version:') }}</strong> {{ python_version }}</p>
<p><strong>{{ _('Plugins active:') }}</strong></p>
{% if active_plugins %}
+7 -40
View File
@@ -57,10 +57,8 @@ def construct_blueprint(datastore: ChangeDetectionStore):
@tags_blueprint.route("/mute/<string:uuid>", methods=['GET'])
@login_optionally_required
def mute(uuid):
tag = datastore.data['settings']['application']['tags'].get(uuid)
if tag:
tag['notification_muted'] = not tag['notification_muted']
tag.commit()
if datastore.data['settings']['application']['tags'].get(uuid):
datastore.data['settings']['application']['tags'][uuid]['notification_muted'] = not datastore.data['settings']['application']['tags'][uuid]['notification_muted']
return redirect(url_for('tags.tags_overview_page'))
@tags_blueprint.route("/delete/<string:uuid>", methods=['GET'])
@@ -70,17 +68,6 @@ def construct_blueprint(datastore: ChangeDetectionStore):
if datastore.data['settings']['application']['tags'].get(uuid):
del datastore.data['settings']['application']['tags'][uuid]
# Delete tag.json file if it exists
import os
tag_dir = os.path.join(datastore.datastore_path, uuid)
tag_json = os.path.join(tag_dir, "tag.json")
if os.path.exists(tag_json):
try:
os.unlink(tag_json)
logger.info(f"Deleted tag.json for tag {uuid}")
except Exception as e:
logger.error(f"Failed to delete tag.json for tag {uuid}: {e}")
# Remove tag from all watches in background thread to avoid blocking
def remove_tag_background(tag_uuid):
"""Background thread to remove tag from watches - discarded after completion."""
@@ -89,7 +76,6 @@ def construct_blueprint(datastore: ChangeDetectionStore):
for watch_uuid, watch in datastore.data['watching'].items():
if watch.get('tags') and tag_uuid in watch['tags']:
watch['tags'].remove(tag_uuid)
watch.commit()
removed_count += 1
logger.info(f"Background: Tag {tag_uuid} removed from {removed_count} watches")
except Exception as e:
@@ -112,7 +98,6 @@ def construct_blueprint(datastore: ChangeDetectionStore):
for watch_uuid, watch in datastore.data['watching'].items():
if watch.get('tags') and tag_uuid in watch['tags']:
watch['tags'].remove(tag_uuid)
watch.commit()
unlinked_count += 1
logger.info(f"Background: Tag {tag_uuid} unlinked from {unlinked_count} watches")
except Exception as e:
@@ -127,17 +112,6 @@ def construct_blueprint(datastore: ChangeDetectionStore):
@tags_blueprint.route("/delete_all", methods=['GET'])
@login_optionally_required
def delete_all():
# Delete all tag.json files
import os
for tag_uuid in list(datastore.data['settings']['application']['tags'].keys()):
tag_dir = os.path.join(datastore.datastore_path, tag_uuid)
tag_json = os.path.join(tag_dir, "tag.json")
if os.path.exists(tag_json):
try:
os.unlink(tag_json)
except Exception as e:
logger.error(f"Failed to delete tag.json for tag {tag_uuid}: {e}")
# Clear all tags from settings immediately
datastore.data['settings']['application']['tags'] = {}
@@ -148,7 +122,6 @@ def construct_blueprint(datastore: ChangeDetectionStore):
try:
for watch_uuid, watch in datastore.data['watching'].items():
watch['tags'] = []
watch.commit()
cleared_count += 1
logger.info(f"Background: Cleared tags from {cleared_count} watches")
except Exception as e:
@@ -229,10 +202,10 @@ def construct_blueprint(datastore: ChangeDetectionStore):
if uuid == 'first':
uuid = list(datastore.data['settings']['application']['tags'].keys()).pop()
tag = datastore.data['settings']['application']['tags'].get(uuid)
default = datastore.data['settings']['application']['tags'].get(uuid)
form = group_restock_settings_form(formdata=request.form if request.method == 'POST' else None,
data=tag,
data=default,
extra_notification_tokens=datastore.get_unique_notification_tokens_available()
)
# @todo subclass form so validation works
@@ -241,15 +214,9 @@ def construct_blueprint(datastore: ChangeDetectionStore):
# flash(','.join(l), 'error')
# return redirect(url_for('tags.form_tag_edit_submit', uuid=uuid))
tag.update(form.data)
tag['processor'] = 'restock_diff'
tag.commit()
# Clear checksums for all watches using this tag to force reprocessing
# Tag changes affect inherited configuration
cleared_count = datastore.clear_checksums_for_tag(uuid)
logger.info(f"Tag {uuid} updated, cleared {cleared_count} watch checksums")
datastore.data['settings']['application']['tags'][uuid].update(form.data)
datastore.data['settings']['application']['tags'][uuid]['processor'] = 'restock_diff'
datastore.needs_write_urgent = True
flash(gettext("Updated"))
return redirect(url_for('tags.tags_overview_page'))
+23 -22
View File
@@ -10,7 +10,7 @@ from changedetectionio.blueprint.ui.notification import construct_blueprint as c
from changedetectionio.blueprint.ui.views import construct_blueprint as construct_views_blueprint
from changedetectionio.blueprint.ui import diff, preview
def _handle_operations(op, uuids, datastore, worker_pool, update_q, queuedWatchMetaData, watch_check_update, extra_data=None, emit_flash=True):
def _handle_operations(op, uuids, datastore, worker_handler, update_q, queuedWatchMetaData, watch_check_update, extra_data=None, emit_flash=True):
from flask import request, flash
if op == 'delete':
@@ -24,7 +24,7 @@ def _handle_operations(op, uuids, datastore, worker_pool, update_q, queuedWatchM
for uuid in uuids:
if datastore.data['watching'].get(uuid):
datastore.data['watching'][uuid]['paused'] = True
datastore.data['watching'][uuid].commit()
datastore.mark_watch_dirty(uuid)
if emit_flash:
flash(gettext("{} watches paused").format(len(uuids)))
@@ -32,7 +32,7 @@ def _handle_operations(op, uuids, datastore, worker_pool, update_q, queuedWatchM
for uuid in uuids:
if datastore.data['watching'].get(uuid):
datastore.data['watching'][uuid.strip()]['paused'] = False
datastore.data['watching'][uuid].commit()
datastore.mark_watch_dirty(uuid)
if emit_flash:
flash(gettext("{} watches unpaused").format(len(uuids)))
@@ -47,7 +47,7 @@ def _handle_operations(op, uuids, datastore, worker_pool, update_q, queuedWatchM
for uuid in uuids:
if datastore.data['watching'].get(uuid):
datastore.data['watching'][uuid]['notification_muted'] = True
datastore.data['watching'][uuid].commit()
datastore.mark_watch_dirty(uuid)
if emit_flash:
flash(gettext("{} watches muted").format(len(uuids)))
@@ -55,7 +55,7 @@ def _handle_operations(op, uuids, datastore, worker_pool, update_q, queuedWatchM
for uuid in uuids:
if datastore.data['watching'].get(uuid):
datastore.data['watching'][uuid]['notification_muted'] = False
datastore.data['watching'][uuid].commit()
datastore.mark_watch_dirty(uuid)
if emit_flash:
flash(gettext("{} watches un-muted").format(len(uuids)))
@@ -63,7 +63,7 @@ def _handle_operations(op, uuids, datastore, worker_pool, update_q, queuedWatchM
for uuid in uuids:
if datastore.data['watching'].get(uuid):
# Recheck and require a full reprocessing
worker_pool.queue_item_async_safe(update_q, queuedWatchMetaData.PrioritizedItem(priority=1, item={'uuid': uuid}))
worker_handler.queue_item_async_safe(update_q, queuedWatchMetaData.PrioritizedItem(priority=1, item={'uuid': uuid}))
if emit_flash:
flash(gettext("{} watches queued for rechecking").format(len(uuids)))
@@ -71,7 +71,7 @@ def _handle_operations(op, uuids, datastore, worker_pool, update_q, queuedWatchM
for uuid in uuids:
if datastore.data['watching'].get(uuid):
datastore.data['watching'][uuid]["last_error"] = False
datastore.data['watching'][uuid].commit()
datastore.mark_watch_dirty(uuid)
if emit_flash:
flash(gettext("{} watches errors cleared").format(len(uuids)))
@@ -92,7 +92,6 @@ def _handle_operations(op, uuids, datastore, worker_pool, update_q, queuedWatchM
datastore.data['watching'][uuid]['notification_body'] = None
datastore.data['watching'][uuid]['notification_urls'] = []
datastore.data['watching'][uuid]['notification_format'] = USE_SYSTEM_DEFAULT_NOTIFICATION_FORMAT_FOR_WATCH
datastore.data['watching'][uuid].commit()
if emit_flash:
flash(gettext("{} watches set to use default notification settings").format(len(uuids)))
@@ -108,7 +107,6 @@ def _handle_operations(op, uuids, datastore, worker_pool, update_q, queuedWatchM
datastore.data['watching'][uuid]['tags'] = []
datastore.data['watching'][uuid]['tags'].append(tag_uuid)
datastore.data['watching'][uuid].commit()
if emit_flash:
flash(gettext("{} watches were tagged").format(len(uuids)))
@@ -116,7 +114,7 @@ def _handle_operations(op, uuids, datastore, worker_pool, update_q, queuedWatchM
for uuid in uuids:
watch_check_update.send(watch_uuid=uuid)
def construct_blueprint(datastore: ChangeDetectionStore, update_q, worker_pool, queuedWatchMetaData, watch_check_update):
def construct_blueprint(datastore: ChangeDetectionStore, update_q, worker_handler, queuedWatchMetaData, watch_check_update):
ui_blueprint = Blueprint('ui', __name__, template_folder="templates")
# Register the edit blueprint
@@ -224,14 +222,14 @@ def construct_blueprint(datastore: ChangeDetectionStore, update_q, worker_pool,
@login_optionally_required
def form_delete():
uuid = request.args.get('uuid')
# More for testing, possible to return the first/only
if uuid == 'first':
uuid = list(datastore.data['watching'].keys()).pop()
if uuid != 'all' and not uuid in datastore.data['watching'].keys():
flash(gettext('The watch by UUID {} does not exist.').format(uuid), 'error')
return redirect(url_for('watchlist.index'))
# More for testing, possible to return the first/only
if uuid == 'first':
uuid = list(datastore.data['watching'].keys()).pop()
datastore.delete(uuid)
flash(gettext('Deleted.'))
@@ -241,14 +239,14 @@ def construct_blueprint(datastore: ChangeDetectionStore, update_q, worker_pool,
@login_optionally_required
def form_clone():
uuid = request.args.get('uuid')
# More for testing, possible to return the first/only
if uuid == 'first':
uuid = list(datastore.data['watching'].keys()).pop()
new_uuid = datastore.clone(uuid)
if not datastore.data['watching'].get(uuid).get('paused'):
worker_pool.queue_item_async_safe(update_q, queuedWatchMetaData.PrioritizedItem(priority=5, item={'uuid': new_uuid}))
worker_handler.queue_item_async_safe(update_q, queuedWatchMetaData.PrioritizedItem(priority=5, item={'uuid': new_uuid}))
flash(gettext('Cloned, you are editing the new watch.'))
@@ -264,10 +262,10 @@ def construct_blueprint(datastore: ChangeDetectionStore, update_q, worker_pool,
if uuid:
# Single watch - check if already queued or running
if worker_pool.is_watch_running(uuid) or uuid in update_q.get_queued_uuids():
if worker_handler.is_watch_running(uuid) or uuid in update_q.get_queued_uuids():
flash(gettext("Watch is already queued or being checked."))
else:
worker_pool.queue_item_async_safe(update_q, queuedWatchMetaData.PrioritizedItem(priority=1, item={'uuid': uuid}))
worker_handler.queue_item_async_safe(update_q, queuedWatchMetaData.PrioritizedItem(priority=1, item={'uuid': uuid}))
flash(gettext("Queued 1 watch for rechecking."))
else:
# Multiple watches - first count how many need to be queued
@@ -286,7 +284,7 @@ def construct_blueprint(datastore: ChangeDetectionStore, update_q, worker_pool,
if len(watches_to_queue) < 20:
# Get already queued/running UUIDs once (efficient)
queued_uuids = set(update_q.get_queued_uuids())
running_uuids = set(worker_pool.get_running_uuids())
running_uuids = set(worker_handler.get_running_uuids())
# Filter out watches that are already queued or running
watches_to_queue_filtered = []
@@ -296,7 +294,7 @@ def construct_blueprint(datastore: ChangeDetectionStore, update_q, worker_pool,
# Queue only the filtered watches
for watch_uuid in watches_to_queue_filtered:
worker_pool.queue_item_async_safe(update_q, queuedWatchMetaData.PrioritizedItem(priority=1, item={'uuid': watch_uuid}))
worker_handler.queue_item_async_safe(update_q, queuedWatchMetaData.PrioritizedItem(priority=1, item={'uuid': watch_uuid}))
# Provide feedback about skipped watches
skipped_count = len(watches_to_queue) - len(watches_to_queue_filtered)
@@ -312,7 +310,7 @@ def construct_blueprint(datastore: ChangeDetectionStore, update_q, worker_pool,
# 20+ watches - queue in background thread to avoid blocking HTTP response
# Capture queued/running state before background thread
queued_uuids = set(update_q.get_queued_uuids())
running_uuids = set(worker_pool.get_running_uuids())
running_uuids = set(worker_handler.get_running_uuids())
def queue_watches_background():
"""Background thread to queue watches - discarded after completion."""
@@ -322,7 +320,7 @@ def construct_blueprint(datastore: ChangeDetectionStore, update_q, worker_pool,
for watch_uuid in watches_to_queue:
# Check if already queued or running (state captured at start)
if watch_uuid not in queued_uuids and watch_uuid not in running_uuids:
worker_pool.queue_item_async_safe(update_q, queuedWatchMetaData.PrioritizedItem(priority=1, item={'uuid': watch_uuid}))
worker_handler.queue_item_async_safe(update_q, queuedWatchMetaData.PrioritizedItem(priority=1, item={'uuid': watch_uuid}))
queued_count += 1
else:
skipped_count += 1
@@ -351,7 +349,7 @@ def construct_blueprint(datastore: ChangeDetectionStore, update_q, worker_pool,
extra_data=extra_data,
queuedWatchMetaData=queuedWatchMetaData,
uuids=uuids,
worker_pool=worker_pool,
worker_handler=worker_handler,
update_q=update_q,
watch_check_update=watch_check_update,
op=op,
@@ -369,6 +367,9 @@ def construct_blueprint(datastore: ChangeDetectionStore, update_q, worker_pool,
import json
from copy import deepcopy
# more for testing
if uuid == 'first':
uuid = list(datastore.data['watching'].keys()).pop()
# copy it to memory as trim off what we dont need (history)
watch = deepcopy(datastore.data['watching'].get(uuid))
+82 -70
View File
@@ -83,6 +83,7 @@ def construct_blueprint(datastore: ChangeDetectionStore):
If a processor doesn't have a difference module, falls back to text_json_diff.
"""
# More for testing, possible to return the first/only
if uuid == 'first':
uuid = list(datastore.data['watching'].keys()).pop()
@@ -100,21 +101,23 @@ def construct_blueprint(datastore: ChangeDetectionStore):
# Get the processor type for this watch
processor_name = watch.get('processor', 'text_json_diff')
# Try to get the processor's difference module (works for both built-in and plugin processors)
from changedetectionio.processors import get_processor_submodule
processor_module = get_processor_submodule(processor_name, 'difference')
try:
# Try to import the processor's difference module
processor_module = importlib.import_module(f'changedetectionio.processors.{processor_name}.difference')
# Call the processor's render() function
if processor_module and hasattr(processor_module, 'render'):
return processor_module.render(
watch=watch,
datastore=datastore,
request=request,
url_for=url_for,
render_template=render_template,
flash=flash,
redirect=redirect
)
# Call the processor's render() function
if hasattr(processor_module, 'render'):
return processor_module.render(
watch=watch,
datastore=datastore,
request=request,
url_for=url_for,
render_template=render_template,
flash=flash,
redirect=redirect
)
except (ImportError, ModuleNotFoundError) as e:
logger.warning(f"Processor {processor_name} does not have a difference module, falling back to text_json_diff: {e}")
# Fallback: if processor doesn't have difference module, use text_json_diff as default
from changedetectionio.processors.text_json_diff.difference import render as default_render
@@ -141,10 +144,10 @@ def construct_blueprint(datastore: ChangeDetectionStore):
Each processor implements processors/{type}/extract.py::render_form()
If a processor doesn't have an extract module, falls back to text_json_diff.
"""
# More for testing, possible to return the first/only
if uuid == 'first':
uuid = list(datastore.data['watching'].keys()).pop()
try:
watch = datastore.data['watching'][uuid]
except KeyError:
@@ -154,21 +157,23 @@ def construct_blueprint(datastore: ChangeDetectionStore):
# Get the processor type for this watch
processor_name = watch.get('processor', 'text_json_diff')
# Try to get the processor's extract module (works for both built-in and plugin processors)
from changedetectionio.processors import get_processor_submodule
processor_module = get_processor_submodule(processor_name, 'extract')
try:
# Try to import the processor's extract module
processor_module = importlib.import_module(f'changedetectionio.processors.{processor_name}.extract')
# Call the processor's render_form() function
if processor_module and hasattr(processor_module, 'render_form'):
return processor_module.render_form(
watch=watch,
datastore=datastore,
request=request,
url_for=url_for,
render_template=render_template,
flash=flash,
redirect=redirect
)
# Call the processor's render_form() function
if hasattr(processor_module, 'render_form'):
return processor_module.render_form(
watch=watch,
datastore=datastore,
request=request,
url_for=url_for,
render_template=render_template,
flash=flash,
redirect=redirect
)
except (ImportError, ModuleNotFoundError) as e:
logger.warning(f"Processor {processor_name} does not have an extract module, falling back to base extractor: {e}")
# Fallback: if processor doesn't have extract module, use base processors.extract as default
from changedetectionio.processors.extract import render_form as default_render_form
@@ -195,7 +200,7 @@ def construct_blueprint(datastore: ChangeDetectionStore):
Each processor implements processors/{type}/extract.py::process_extraction()
If a processor doesn't have an extract module, falls back to text_json_diff.
"""
# More for testing, possible to return the first/only
if uuid == 'first':
uuid = list(datastore.data['watching'].keys()).pop()
@@ -208,22 +213,24 @@ def construct_blueprint(datastore: ChangeDetectionStore):
# Get the processor type for this watch
processor_name = watch.get('processor', 'text_json_diff')
# Try to get the processor's extract module (works for both built-in and plugin processors)
from changedetectionio.processors import get_processor_submodule
processor_module = get_processor_submodule(processor_name, 'extract')
try:
# Try to import the processor's extract module
processor_module = importlib.import_module(f'changedetectionio.processors.{processor_name}.extract')
# Call the processor's process_extraction() function
if processor_module and hasattr(processor_module, 'process_extraction'):
return processor_module.process_extraction(
watch=watch,
datastore=datastore,
request=request,
url_for=url_for,
make_response=make_response,
send_from_directory=send_from_directory,
flash=flash,
redirect=redirect
)
# Call the processor's process_extraction() function
if hasattr(processor_module, 'process_extraction'):
return processor_module.process_extraction(
watch=watch,
datastore=datastore,
request=request,
url_for=url_for,
make_response=make_response,
send_from_directory=send_from_directory,
flash=flash,
redirect=redirect
)
except (ImportError, ModuleNotFoundError) as e:
logger.warning(f"Processor {processor_name} does not have an extract module, falling back to base extractor: {e}")
# Fallback: if processor doesn't have extract module, use base processors.extract as default
from changedetectionio.processors.extract import process_extraction as default_process_extraction
@@ -260,7 +267,7 @@ def construct_blueprint(datastore: ChangeDetectionStore):
- /diff/{uuid}/processor-asset/after
- /diff/{uuid}/processor-asset/rendered_diff
"""
# More for testing, possible to return the first/only
if uuid == 'first':
uuid = list(datastore.data['watching'].keys()).pop()
@@ -273,33 +280,38 @@ def construct_blueprint(datastore: ChangeDetectionStore):
# Get the processor type for this watch
processor_name = watch.get('processor', 'text_json_diff')
# Try to get the processor's difference module (works for both built-in and plugin processors)
from changedetectionio.processors import get_processor_submodule
processor_module = get_processor_submodule(processor_name, 'difference')
try:
# Try to import the processor's difference module
processor_module = importlib.import_module(f'changedetectionio.processors.{processor_name}.difference')
# Call the processor's get_asset() function
if processor_module and hasattr(processor_module, 'get_asset'):
result = processor_module.get_asset(
asset_name=asset_name,
watch=watch,
datastore=datastore,
request=request
)
# Call the processor's get_asset() function
if hasattr(processor_module, 'get_asset'):
result = processor_module.get_asset(
asset_name=asset_name,
watch=watch,
datastore=datastore,
request=request
)
if result is None:
if result is None:
from flask import abort
abort(404, description=f"Asset '{asset_name}' not found")
binary_data, content_type, cache_control = result
response = make_response(binary_data)
response.headers['Content-Type'] = content_type
if cache_control:
response.headers['Cache-Control'] = cache_control
return response
else:
logger.warning(f"Processor {processor_name} does not implement get_asset()")
from flask import abort
abort(404, description=f"Asset '{asset_name}' not found")
abort(404, description=f"Processor '{processor_name}' does not support assets")
binary_data, content_type, cache_control = result
response = make_response(binary_data)
response.headers['Content-Type'] = content_type
if cache_control:
response.headers['Cache-Control'] = cache_control
return response
else:
logger.warning(f"Processor {processor_name} does not implement get_asset()")
except (ImportError, ModuleNotFoundError) as e:
logger.warning(f"Processor {processor_name} does not have a difference module: {e}")
from flask import abort
abort(404, description=f"Processor '{processor_name}' does not support assets")
abort(404, description=f"Processor '{processor_name}' not found")
return diff_blueprint
+68 -89
View File
@@ -9,7 +9,7 @@ from jinja2 import Environment, FileSystemLoader
from changedetectionio.store import ChangeDetectionStore
from changedetectionio.auth_decorator import login_optionally_required
from changedetectionio.time_handler import is_within_schedule
from changedetectionio import worker_pool
from changedetectionio import worker_handler
def construct_blueprint(datastore: ChangeDetectionStore, update_q, queuedWatchMetaData):
edit_blueprint = Blueprint('ui_edit', __name__, template_folder="../ui/templates")
@@ -26,17 +26,18 @@ def construct_blueprint(datastore: ChangeDetectionStore, update_q, queuedWatchMe
# https://wtforms.readthedocs.io/en/3.0.x/forms/#wtforms.form.Form.populate_obj ?
def edit_page(uuid):
from changedetectionio import forms
from changedetectionio.browser_steps.browser_steps import browser_step_ui_config
from changedetectionio.blueprint.browser_steps.browser_steps import browser_step_ui_config
from changedetectionio import processors
import importlib
if uuid == 'first':
uuid = list(datastore.data['watching'].keys()).pop()
# More for testing, possible to return the first/only
if not datastore.data['watching'].keys():
flash(gettext("No watches to edit"), "error")
return redirect(url_for('watchlist.index'))
if uuid == 'first':
uuid = list(datastore.data['watching'].keys()).pop()
if not uuid in datastore.data['watching']:
flash(gettext("No watch with the UUID {} found.").format(uuid), "error")
return redirect(url_for('watchlist.index'))
@@ -71,13 +72,8 @@ def construct_blueprint(datastore: ChangeDetectionStore, update_q, queuedWatchMe
processor_name = datastore.data['watching'][uuid].get('processor', '')
processor_classes = next((tpl for tpl in processors.find_processors() if tpl[1] == processor_name), None)
if not processor_classes:
flash(gettext("Could not load '{}' processor, processor plugin might be missing. Please select a different processor.").format(processor_name), 'error')
# Fall back to default processor so user can still edit and change processor
processor_classes = next((tpl for tpl in processors.find_processors() if tpl[1] == 'text_json_diff'), None)
if not processor_classes:
# If even text_json_diff is missing, something is very wrong
flash(gettext("Could not load '{}' processor, processor plugin might be missing.").format(processor_name), 'error')
return redirect(url_for('watchlist.index'))
flash(gettext("Cannot load the edit form for processor/plugin '{}', plugin missing?").format(processor_classes[1]), 'error')
return redirect(url_for('watchlist.index'))
parent_module = processors.get_parent_module(processor_classes[0])
@@ -154,10 +150,58 @@ def construct_blueprint(datastore: ChangeDetectionStore, update_q, queuedWatchMe
extra_update_obj['time_between_check'] = form.time_between_check.data
# Handle processor-config-* fields separately (save to JSON, not datastore)
# IMPORTANT: These must NOT be saved to url-watches.json, only to the processor-specific JSON file
processor_config_data = processors.extract_processor_config_from_form_data(form.data)
processors.save_processor_config(datastore, uuid, processor_config_data)
# Handle processor-config-* fields separately (save to JSON, not datastore)
processor_config_data = {}
fields_to_remove = []
for field_name, field_value in form.data.items():
if field_name.startswith('processor_config_'):
config_key = field_name.replace('processor_config_', '')
if field_value: # Only save non-empty values
processor_config_data[config_key] = field_value
fields_to_remove.append(field_name)
# Save processor config to JSON file if any config data exists
if processor_config_data:
try:
processor_name = form.data.get('processor')
# Create a processor instance to access config methods
processor_instance = processors.difference_detection_processor(datastore, uuid)
# Use processor name as filename so each processor keeps its own config
config_filename = f'{processor_name}.json'
processor_instance.update_extra_watch_config(config_filename, processor_config_data)
logger.debug(f"Saved processor config to {config_filename}: {processor_config_data}")
# Call optional edit_hook if processor has one
try:
# Try to import the edit_hook module from the processor package
import importlib
edit_hook_module_name = f'changedetectionio.processors.{processor_name}.edit_hook'
try:
edit_hook = importlib.import_module(edit_hook_module_name)
logger.debug(f"Found edit_hook module for {processor_name}")
if hasattr(edit_hook, 'on_config_save'):
logger.info(f"Calling edit_hook.on_config_save for {processor_name}")
watch_obj = datastore.data['watching'][uuid]
# Call hook and get updated config
updated_config = edit_hook.on_config_save(watch_obj, processor_config_data, datastore)
# Save updated config back to file
processor_instance.update_extra_watch_config(config_filename, updated_config)
logger.info(f"Edit hook updated config: {updated_config}")
else:
logger.debug(f"Edit hook module found but no on_config_save function")
except ModuleNotFoundError:
logger.debug(f"No edit_hook module for processor {processor_name} (this is normal)")
except Exception as hook_error:
logger.error(f"Edit hook error (non-fatal): {hook_error}", exc_info=True)
except Exception as e:
logger.error(f"Failed to save processor config: {e}")
# Remove processor-config-* fields from form.data before updating datastore
for field_name in fields_to_remove:
form.data.pop(field_name, None)
# Ignore text
form_ignore_text = form.ignore_text.data
@@ -197,11 +241,7 @@ def construct_blueprint(datastore: ChangeDetectionStore, update_q, queuedWatchMe
# Recast it if need be to right data Watch handler
watch_class = processors.get_custom_watch_obj_for_processor(form.data.get('processor'))
datastore.data['watching'][uuid] = watch_class(datastore_path=datastore.datastore_path, __datastore=datastore.data, default=datastore.data['watching'][uuid])
# Save the watch immediately
datastore.data['watching'][uuid].commit()
datastore.data['watching'][uuid] = watch_class(datastore_path=datastore.datastore_path, default=datastore.data['watching'][uuid])
flash(gettext("Updated watch - unpaused!") if request.args.get('unpause_on_save') else gettext("Updated watch."))
# Cleanup any browsersteps session for this watch
@@ -211,6 +251,10 @@ def construct_blueprint(datastore: ChangeDetectionStore, update_q, queuedWatchMe
except Exception as e:
logger.debug(f"Error cleaning up browsersteps session: {e}")
# Re #286 - We wait for syncing new data to disk in another thread every 60 seconds
# But in the case something is added we should save straight away
datastore.needs_write_urgent = True
# Do not queue on edit if its not within the time range
# @todo maybe it should never queue anyway on edit...
@@ -239,7 +283,7 @@ def construct_blueprint(datastore: ChangeDetectionStore, update_q, queuedWatchMe
#############################
if not datastore.data['watching'][uuid].get('paused') and is_in_schedule:
# Queue the watch for immediate recheck, with a higher priority
worker_pool.queue_item_async_safe(update_q, queuedWatchMetaData.PrioritizedItem(priority=1, item={'uuid': uuid}))
worker_handler.queue_item_async_safe(update_q, queuedWatchMetaData.PrioritizedItem(priority=1, item={'uuid': uuid}))
# Diff page [edit] link should go back to diff page
if request.args.get("next") and request.args.get("next") == 'diff':
@@ -267,17 +311,10 @@ def construct_blueprint(datastore: ChangeDetectionStore, update_q, queuedWatchMe
# Get fetcher capabilities instead of hardcoded logic
capabilities = get_fetcher_capabilities(watch, datastore)
# Add processor capabilities from module
capabilities['supports_visual_selector'] = getattr(parent_module, 'supports_visual_selector', False)
capabilities['supports_text_filters_and_triggers'] = getattr(parent_module, 'supports_text_filters_and_triggers', False)
capabilities['supports_text_filters_and_triggers_elements'] = getattr(parent_module, 'supports_text_filters_and_triggers_elements', False)
capabilities['supports_request_type'] = getattr(parent_module, 'supports_request_type', False)
app_rss_token = datastore.data['settings']['application'].get('rss_access_token'),
c = [f"processor-{watch.get('processor')}"]
if worker_pool.is_watch_running(uuid):
if worker_handler.is_watch_running(uuid):
c.append('checking-now')
template_args = {
@@ -334,12 +371,10 @@ def construct_blueprint(datastore: ChangeDetectionStore, update_q, queuedWatchMe
from flask import send_file
import brotli
if uuid == 'first':
uuid = list(datastore.data['watching'].keys()).pop()
watch = datastore.data['watching'].get(uuid)
if watch and watch.history.keys() and os.path.isdir(watch.data_dir):
if watch and watch.history.keys() and os.path.isdir(watch.watch_data_dir):
latest_filename = list(watch.history.keys())[-1]
html_fname = os.path.join(watch.data_dir, f"{latest_filename}.html.br")
html_fname = os.path.join(watch.watch_data_dir, f"{latest_filename}.html.br")
with open(html_fname, 'rb') as f:
if html_fname.endswith('.br'):
# Read and decompress the Brotli file
@@ -354,65 +389,12 @@ def construct_blueprint(datastore: ChangeDetectionStore, update_q, queuedWatchMe
# Return a 500 error
abort(500)
@edit_blueprint.route("/edit/<string:uuid>/get-data-package", methods=['GET'])
@login_optionally_required
def watch_get_data_package(uuid):
"""Download all data for a single watch as a zip file"""
from io import BytesIO
from flask import send_file
import zipfile
from pathlib import Path
import datetime
watch = datastore.data['watching'].get(uuid)
if not watch:
abort(404)
# Create zip in memory
memory_file = BytesIO()
with zipfile.ZipFile(memory_file, 'w',
compression=zipfile.ZIP_DEFLATED,
compresslevel=8) as zipObj:
# Add the watch's JSON file if it exists
watch_json_path = os.path.join(watch.data_dir, 'watch.json')
if os.path.isfile(watch_json_path):
zipObj.write(watch_json_path,
arcname=os.path.join(uuid, 'watch.json'),
compress_type=zipfile.ZIP_DEFLATED,
compresslevel=8)
# Add all files in the watch data directory
if os.path.isdir(watch.data_dir):
for f in Path(watch.data_dir).glob('*'):
if f.is_file() and f.name != 'watch.json': # Skip watch.json since we already added it
zipObj.write(f,
arcname=os.path.join(uuid, f.name),
compress_type=zipfile.ZIP_DEFLATED,
compresslevel=8)
# Seek to beginning of file
memory_file.seek(0)
# Generate filename with timestamp
timestamp = datetime.datetime.now().strftime("%Y%m%d%H%M%S")
filename = f"watch-data-{uuid[:8]}-{timestamp}.zip"
return send_file(memory_file,
as_attachment=True,
download_name=filename,
mimetype='application/zip')
# Ajax callback
@edit_blueprint.route("/edit/<string:uuid>/preview-rendered", methods=['POST'])
@login_optionally_required
def watch_get_preview_rendered(uuid):
'''For when viewing the "preview" of the rendered text from inside of Edit'''
from flask import jsonify
if uuid == 'first':
uuid = list(datastore.data['watching'].keys()).pop()
from changedetectionio.processors.text_json_diff import prepare_filter_prevew
result = prepare_filter_prevew(watch_uuid=uuid, form_data=request.form, datastore=datastore)
return jsonify(result)
@@ -436,9 +418,6 @@ def construct_blueprint(datastore: ChangeDetectionStore, update_q, queuedWatchMe
s = re.sub(r'[0-9]+', r'\\d+', s)
datastore.data["watching"][uuid]['ignore_text'].append('/' + s + '/')
# Save the updated ignore_text
datastore.data["watching"][uuid].commit()
return f"<a href={url_for('ui.ui_preview.preview_page', uuid=uuid)}>Click to preview</a>"
return edit_blueprint
+50 -38
View File
@@ -26,9 +26,10 @@ def construct_blueprint(datastore: ChangeDetectionStore):
Each processor implements processors/{type}/preview.py::render()
If a processor doesn't have a preview module, falls back to default text preview.
"""
# More for testing, possible to return the first/only
if uuid == 'first':
uuid = list(datastore.data['watching'].keys()).pop()
try:
watch = datastore.data['watching'][uuid]
except KeyError:
@@ -38,21 +39,24 @@ def construct_blueprint(datastore: ChangeDetectionStore):
# Get the processor type for this watch
processor_name = watch.get('processor', 'text_json_diff')
# Try to get the processor's preview module (works for both built-in and plugin processors)
from changedetectionio.processors import get_processor_submodule
processor_module = get_processor_submodule(processor_name, 'preview')
try:
# Try to import the processor's preview module
import importlib
processor_module = importlib.import_module(f'changedetectionio.processors.{processor_name}.preview')
# Call the processor's render() function
if processor_module and hasattr(processor_module, 'render'):
return processor_module.render(
watch=watch,
datastore=datastore,
request=request,
url_for=url_for,
render_template=render_template,
flash=flash,
redirect=redirect
)
# Call the processor's render() function
if hasattr(processor_module, 'render'):
return processor_module.render(
watch=watch,
datastore=datastore,
request=request,
url_for=url_for,
render_template=render_template,
flash=flash,
redirect=redirect
)
except (ImportError, ModuleNotFoundError) as e:
logger.debug(f"Processor {processor_name} does not have a preview module, using default preview: {e}")
# Fallback: if processor doesn't have preview module, use default text preview
content = []
@@ -146,8 +150,10 @@ def construct_blueprint(datastore: ChangeDetectionStore):
"""
from flask import make_response
# More for testing, possible to return the first/only
if uuid == 'first':
uuid = list(datastore.data['watching'].keys()).pop()
try:
watch = datastore.data['watching'][uuid]
except KeyError:
@@ -157,33 +163,39 @@ def construct_blueprint(datastore: ChangeDetectionStore):
# Get the processor type for this watch
processor_name = watch.get('processor', 'text_json_diff')
# Try to get the processor's preview module (works for both built-in and plugin processors)
from changedetectionio.processors import get_processor_submodule
processor_module = get_processor_submodule(processor_name, 'preview')
try:
# Try to import the processor's preview module
import importlib
processor_module = importlib.import_module(f'changedetectionio.processors.{processor_name}.preview')
# Call the processor's get_asset() function
if processor_module and hasattr(processor_module, 'get_asset'):
result = processor_module.get_asset(
asset_name=asset_name,
watch=watch,
datastore=datastore,
request=request
)
# Call the processor's get_asset() function
if hasattr(processor_module, 'get_asset'):
result = processor_module.get_asset(
asset_name=asset_name,
watch=watch,
datastore=datastore,
request=request
)
if result is None:
if result is None:
from flask import abort
abort(404, description=f"Asset '{asset_name}' not found")
binary_data, content_type, cache_control = result
response = make_response(binary_data)
response.headers['Content-Type'] = content_type
if cache_control:
response.headers['Cache-Control'] = cache_control
return response
else:
logger.warning(f"Processor {processor_name} does not implement get_asset()")
from flask import abort
abort(404, description=f"Asset '{asset_name}' not found")
abort(404, description=f"Processor '{processor_name}' does not support assets")
binary_data, content_type, cache_control = result
response = make_response(binary_data)
response.headers['Content-Type'] = content_type
if cache_control:
response.headers['Cache-Control'] = cache_control
return response
else:
logger.warning(f"Processor {processor_name} does not implement get_asset()")
except (ImportError, ModuleNotFoundError) as e:
logger.warning(f"Processor {processor_name} does not have a preview module: {e}")
from flask import abort
abort(404, description=f"Processor '{processor_name}' does not support assets")
abort(404, description=f"Processor '{processor_name}' not found")
return preview_blueprint
@@ -45,19 +45,14 @@
<div class="tabs collapsable">
<ul>
<li class="tab"><a href="#general">{{ _('General') }}</a></li>
{% if capabilities.supports_request_type %}
<li class="tab"><a href="#request">{{ _('Request') }}</a></li>
{% endif %}
{% if extra_tab_content %}
<li class="tab"><a href="#extras_tab">{{ extra_tab_content }}</a></li>
{% endif %}
{% if capabilities.supports_browser_steps %}
<li class="tab"><a id="browsersteps-tab" href="#browser-steps">{{ _('Browser Steps') }}</a></li>
{% endif %}
{% if capabilities.supports_visual_selector %}
<!-- should goto extra forms? -->
{% if watch['processor'] == 'text_json_diff' or watch['processor'] == 'image_ssim_diff' %}
<li class="tab"><a id="visualselector-tab" href="#visualselector">{{ _('Visual Filter Selector') }}</a></li>
{% endif %}
{% if capabilities.supports_text_filters_and_triggers %}
<li class="tab" id="filters-and-triggers-tab"><a href="#filters-and-triggers">{{ _('Filters & Triggers') }}</a></li>
<li class="tab" id="conditions-tab"><a href="#conditions">{{ _('Conditions') }}</a></li>
{% endif %}
@@ -115,20 +110,12 @@
{{ _('Sends a notification when the filter can no longer be seen on the page, good for knowing when the page changed and your filter will not work anymore.') }}
</span>
</div>
<div class="pure-control-group">
{{ render_field(form.history_snapshot_max_length, class="history_snapshot_max_length") }}
<span class="pure-form-message-inline">{{ _('Limit collection of history snapshots for each watch to this number of history items.') }}
<br>
{{ _('Set to empty to use system settings default') }}
</span>
</div>
<div class="pure-control-group">
{{ render_ternary_field(form.use_page_title_in_list) }}
</div>
</fieldset>
</div>
{% if capabilities.supports_request_type %}
<div class="tab-pane-inner" id="request">
<div class="pure-control-group inline-radio">
{{ render_field(form.fetch_backend, class="fetch-backend") }}
@@ -216,7 +203,6 @@ Math: {{ 1 + 1 }}") }}
</div>
</fieldset>
</div>
{% endif %}
<div class="tab-pane-inner" id="browser-steps">
{% if capabilities.supports_browser_steps %}
@@ -297,7 +283,8 @@ Math: {{ 1 + 1 }}") }}
</fieldset>
</div>
{% if capabilities.supports_text_filters_and_triggers %}
{% if watch['processor'] == 'text_json_diff' or watch['processor'] == 'image_ssim_diff' %}
<div class="tab-pane-inner" id="conditions">
<script>
const verify_condition_rule_url="{{url_for('conditions.verify_condition_single_rule', watch_uuid=uuid)}}";
@@ -316,9 +303,7 @@ Math: {{ 1 + 1 }}") }}
<span id="activate-text-preview" class="pure-button pure-button-primary button-xsmall">{{ _('Activate preview') }}</span>
<div>
<div id="edit-text-filter">
{% if capabilities.supports_text_filters_and_triggers_elements %}
<div class="pure-control-group" id="pro-tips">
<div class="pure-control-group" id="pro-tips">
<strong>{{ _('Pro-tips:') }}</strong><br>
<ul>
<li>
@@ -329,8 +314,8 @@ Math: {{ 1 + 1 }}") }}
</li>
</ul>
</div>
{% include "edit/include_subtract.html" %}
{% endif %}
<div class="text-filtering border-fieldset">
<fieldset class="pure-group" id="text-filtering-type-options">
<h3>{{ _('Text filtering') }}</h3>
@@ -389,7 +374,7 @@ Math: {{ 1 + 1 }}") }}
{{ extra_form_content|safe }}
</div>
{% endif %}
{% if capabilities.supports_visual_selector %}
{% if watch['processor'] == 'text_json_diff' or watch['processor'] == 'image_ssim_diff' %}
<div class="tab-pane-inner visual-selector-ui" id="visualselector">
<img class="beta-logo" src="{{url_for('static_content', group='images', filename='beta-logo.png')}}" alt="New beta functionality">
@@ -401,7 +386,7 @@ Math: {{ 1 + 1 }}") }}
{{ _('The Visual Selector tool lets you select the') }} <i>{{ _('text') }}</i> {{ _('elements that will be used for the change detection. It automatically fills-in the filters in the "CSS/JSONPath/JQ/XPath Filters" box of the') }} <a href="#filters-and-triggers">{{ _('Filters & Triggers') }}</a> {{ _('tab. Use') }} <strong>{{ _('Shift+Click') }}</strong> {{ _('to select multiple items.') }}
</span>
{% if watch['processor'] == 'image_ssim_diff' %} {# @todo, integrate with image_ssim_diff selector better, use some extra form ? #}
{% if watch['processor'] == 'image_ssim_diff' %}
<div id="selection-mode-controls" style="margin: 10px 0; padding: 10px; background: var(--color-background-tab); border-radius: 5px;">
<label style="font-weight: 600; margin-right: 15px;">{{ _('Selection Mode:') }}</label>
<label style="margin-right: 15px;">
@@ -488,7 +473,6 @@ Math: {{ 1 + 1 }}") }}
{% if watch.history_n %}
<p>
<a href="{{url_for('ui.ui_edit.watch_get_latest_html', uuid=uuid)}}" class="pure-button button-small">{{ _('Download latest HTML snapshot') }}</a>
<a href="{{url_for('ui.ui_edit.watch_get_data_package', uuid=uuid)}}" class="pure-button button-small">{{ _('Download watch data package') }}</a>
</p>
{% endif %}
+5 -6
View File
@@ -2,7 +2,7 @@ from flask import Blueprint, request, redirect, url_for, flash
from flask_babel import gettext
from changedetectionio.store import ChangeDetectionStore
from changedetectionio.auth_decorator import login_optionally_required
from changedetectionio import worker_pool
from changedetectionio import worker_handler
def construct_blueprint(datastore: ChangeDetectionStore, update_q, queuedWatchMetaData, watch_check_update):
@@ -24,9 +24,8 @@ def construct_blueprint(datastore: ChangeDetectionStore, update_q, queuedWatchMe
flash(gettext('Warning, URL {} already exists').format(url), "notice")
add_paused = request.form.get('edit_and_watch_submit_button') != None
from changedetectionio import processors
processor = request.form.get('processor', processors.get_default_processor())
new_uuid = datastore.add_watch(url=url, tag=request.form.get('tags','').strip(), extras={'paused': add_paused, 'processor': processor})
processor = request.form.get('processor', 'text_json_diff')
new_uuid = datastore.add_watch(url=url, tag=request.form.get('tags').strip(), extras={'paused': add_paused, 'processor': processor})
if new_uuid:
if add_paused:
@@ -34,9 +33,9 @@ def construct_blueprint(datastore: ChangeDetectionStore, update_q, queuedWatchMe
return redirect(url_for('ui.ui_edit.edit_page', uuid=new_uuid, unpause_on_save=1, tag=request.args.get('tag')))
else:
# Straight into the queue.
worker_pool.queue_item_async_safe(update_q, queuedWatchMetaData.PrioritizedItem(priority=1, item={'uuid': new_uuid}))
worker_handler.queue_item_async_safe(update_q, queuedWatchMetaData.PrioritizedItem(priority=1, item={'uuid': new_uuid}))
flash(gettext("Watch added."))
return redirect(url_for('watchlist.index', tag=request.args.get('tag','')))
return views_blueprint
return views_blueprint
@@ -39,7 +39,7 @@ def construct_blueprint(datastore: ChangeDetectionStore, update_q, queuedWatchMe
elif op == 'mute':
datastore.data['watching'][uuid].toggle_mute()
datastore.data['watching'][uuid].commit()
datastore.needs_write = True
return redirect(url_for('watchlist.index', tag = active_tag_uuid))
# Sort by last_changed and add the uuid which is usually the key..
@@ -1,9 +1,5 @@
{%- extends 'base.html' -%}
{%- block content -%}
{%- set tips = [
_("Changedetection.io can monitor more than just web-pages! See our plugins!") ~ ' <a href="https://changedetection.io/plugins">' ~ _('More info') ~ '</a>',
_("You can also add 'shared' watches.") ~ ' <a href="https://github.com/dgtlmoon/changedetection.io/wiki/Sharing-a-Watch">' ~ _('More info') ~ '</a>'
] -%}
{%- from '_helpers.html' import render_simple_field, render_field, render_nolabel_field, sort_by_title -%}
<script src="{{url_for('static_content', group='js', filename='jquery-3.6.0.min.js')}}"></script>
<script src="{{url_for('static_content', group='js', filename='watch-overview.js')}}" defer></script>
@@ -14,46 +10,6 @@
// Initialize Feather icons after the page loads
document.addEventListener('DOMContentLoaded', function() {
feather.replace();
// Intersection Observer for lazy loading favicons
// Only load favicon images when they enter the viewport
if ('IntersectionObserver' in window) {
const faviconObserver = new IntersectionObserver((entries, observer) => {
entries.forEach(entry => {
if (entry.isIntersecting) {
const img = entry.target;
const src = img.getAttribute('data-src');
if (src) {
// Load the actual favicon
img.src = src;
img.removeAttribute('data-src');
}
// Stop observing this image
observer.unobserve(img);
}
});
}, {
// Start loading slightly before the image enters viewport
rootMargin: '50px',
threshold: 0.01
});
// Observe all lazy favicon images
document.querySelectorAll('.lazy-favicon').forEach(img => {
faviconObserver.observe(img);
});
} else {
// Fallback for older browsers: load all favicons immediately
document.querySelectorAll('.lazy-favicon').forEach(img => {
const src = img.getAttribute('data-src');
if (src) {
img.src = src;
img.removeAttribute('data-src');
}
});
}
});
</script>
<style>
@@ -113,9 +69,7 @@ html[data-darkmode="true"] .watch-tag-list.tag-{{ class_name }} {
</div>
</fieldset>
<span style="color:#eee; font-size: 80%;">
<strong>Tip: </strong> {{ tips | random | safe }}
</span>
<span style="color:#eee; font-size: 80%;"><img alt="{{ _('Create a shareable link') }}" style="height: 1em;display:inline-block;" src="{{url_for('static_content', group='images', filename='spread-white.svg')}}" > {{ _("Tip: You can also add 'shared' watches.") }} <a href="https://github.com/dgtlmoon/changedetection.io/wiki/Sharing-a-Watch">{{ _('More info') }}</a></span>
</form>
</div>
<div class="box">
@@ -246,38 +200,28 @@ html[data-darkmode="true"] .watch-tag-list.tag-{{ class_name }} {
<td class="title-col inline">
<div class="flex-wrapper">
{% if 'favicons_enabled' not in ui_settings or ui_settings['favicons_enabled'] %}
<div>
{# Intersection Observer lazy loading: store real URL in data-src, load only when visible in viewport #}
<img alt="Favicon thumbnail"
class="favicon lazy-favicon"
loading="lazy"
decoding="async"
fetchpriority="low"
{% if favicon %}
data-src="{{url_for('static_content', group='favicon', filename=watch.uuid)}}"
{% endif %}
src='data:image/svg+xml;utf8,%3Csvg xmlns="http://www.w3.org/2000/svg" width="7.087" height="7.087" viewBox="0 0 7.087 7.087"%3E%3Ccircle cx="3.543" cy="3.543" r="3.279" stroke="%23e1e1e1" stroke-width="0.45" fill="none" opacity="0.74"/%3E%3C/svg%3E'>
<div>{# A page might have hundreds of these images, set IMG options for lazy loading, don't set SRC if we dont have it so it doesnt fetch the placeholder' #}
<img alt="Favicon thumbnail" class="favicon" loading="lazy" decoding="async" fetchpriority="low" {% if favicon %} src="{{url_for('static_content', group='favicon', filename=watch.uuid)}}" {% else %} src='data:image/svg+xml;utf8,%3Csvg xmlns="http://www.w3.org/2000/svg" width="7.087" height="7.087" viewBox="0 0 7.087 7.087"%3E%3Ccircle cx="3.543" cy="3.543" r="3.279" stroke="%23e1e1e1" stroke-width="0.45" fill="none" opacity="0.74"/%3E%3C/svg%3E' {% endif %} >
</div>
{% endif %}
<div>
{%- if watch['processor'] and watch['processor'] in processor_badge_texts -%}
<span class="processor-badge processor-badge-{{ watch['processor'] }}" title="{{ processor_descriptions.get(watch['processor'], watch['processor']) }}">{{ processor_badge_texts[watch['processor']] }}</span>
{%- endif -%}
<span class="watch-title">
{% if system_use_url_watchlist or watch.get('use_page_title_in_list') %}
{{ watch.label }}
{% else %}
{{ watch.get('title') or watch.link }}
{% endif %}
<a class="external" target="_blank" rel="noopener" href="{{ watch.link.replace('source:','') }}">&nbsp;</a>
</span>
<span class="watch-title">
{% if system_use_url_watchlist or watch.get('use_page_title_in_list') %}
{{ watch.label }}
{% else %}
{{ watch.get('title') or watch.link }}
{% endif %}
<a class="external" target="_blank" rel="noopener" href="{{ watch.link.replace('source:','') }}">&nbsp;</a>
</span>
<div class="error-text" style="display:none;">{{ watch.compile_error_texts(has_proxies=datastore.proxy_list)|safe }}</div>
{%- if watch['processor'] == 'text_json_diff' -%}
{%- if watch['has_ldjson_price_data'] and not watch['track_ldjson_price_data'] -%}
<div class="ldjson-price-track-offer">Switch to Restock & Price watch mode? <a href="{{url_for('price_data_follower.accept', uuid=watch.uuid)}}" class="pure-button button-xsmall">Yes</a> <a href="{{url_for('price_data_follower.reject', uuid=watch.uuid)}}" class="">No</a></div>
{%- endif -%}
{%- endif -%}
{%- if watch['processor'] and watch['processor'] in processor_badge_texts -%}
<span class="processor-badge processor-badge-{{ watch['processor'] }}" title="{{ processor_descriptions.get(watch['processor'], watch['processor']) }}">{{ processor_badge_texts[watch['processor']] }}</span>
{%- endif -%}
{%- for watch_tag_uuid, watch_tag in datastore.get_all_tags_for_watch(watch['uuid']).items() -%}
<a href="{{url_for('watchlist.index', tag=watch_tag_uuid) }}" class="watch-tag-list tag-{{ watch_tag.title|sanitize_tag_class }}">{{ watch_tag.title }}</a>
{%- endfor -%}
+18 -3
View File
@@ -38,6 +38,7 @@ def manage_user_agent(headers, current_ua=''):
return None
class Fetcher():
browser_connection_is_custom = None
browser_connection_url = None
@@ -162,16 +163,30 @@ class Fetcher():
"""
return {k.lower(): v for k, v in self.headers.items()}
def browser_steps_get_valid_steps(self):
if self.browser_steps is not None and len(self.browser_steps):
valid_steps = list(filter(
lambda s: (s['operation'] and len(s['operation']) and s['operation'] != 'Choose one'),
self.browser_steps))
# Just incase they selected Goto site by accident with older JS
if valid_steps and valid_steps[0]['operation'] == 'Goto site':
del(valid_steps[0])
return valid_steps
return None
async def iterate_browser_steps(self, start_url=None):
from changedetectionio.browser_steps.browser_steps import steppable_browser_interface, browser_steps_get_valid_steps
from changedetectionio.blueprint.browser_steps.browser_steps import steppable_browser_interface
from playwright._impl._errors import TimeoutError, Error
from changedetectionio.jinja2_custom import render as jinja_render
step_n = 0
if self.browser_steps:
if self.browser_steps is not None and len(self.browser_steps):
interface = steppable_browser_interface(start_url=start_url)
interface.page = self.page
valid_steps = browser_steps_get_valid_steps(self.browser_steps)
valid_steps = self.browser_steps_get_valid_steps()
for step in valid_steps:
step_n += 1
@@ -1,4 +1,3 @@
import asyncio
import gc
import json
import os
@@ -295,7 +294,7 @@ class fetcher(Fetcher):
self.page.on("console", lambda msg: logger.debug(f"Playwright console: Watch URL: {url} {msg.type}: {msg.text} {msg.args}"))
# Re-use as much code from browser steps as possible so its the same
from changedetectionio.browser_steps.browser_steps import steppable_browser_interface
from changedetectionio.blueprint.browser_steps.browser_steps import steppable_browser_interface
browsersteps_interface = steppable_browser_interface(start_url=url)
browsersteps_interface.page = self.page
@@ -350,7 +349,12 @@ class fetcher(Fetcher):
if self.status_code != 200 and not ignore_status_codes:
screenshot = await capture_full_page_async(self.page, screenshot_format=self.screenshot_format, watch_uuid=watch_uuid, lock_viewport_elements=self.lock_viewport_elements)
# Finally block will handle cleanup
# Cleanup before raising to prevent memory leak
await self.page.close()
await context.close()
await browser.close()
# Force garbage collection to release Playwright resources immediately
gc.collect()
raise Non200ErrorCodeReceived(url=url, status_code=self.status_code, screenshot=screenshot)
if not empty_pages_are_a_change and len((await self.page.content()).strip()) == 0:
@@ -362,11 +366,16 @@ class fetcher(Fetcher):
# Wrap remaining operations in try/finally to ensure cleanup
try:
# Run Browser Steps here
if self.browser_steps:
if self.browser_steps_get_valid_steps():
try:
await self.iterate_browser_steps(start_url=url)
except BrowserStepsStepException:
# Finally block will handle cleanup
try:
await context.close()
await browser.close()
except Exception as e:
# Fine, could be messy situation
pass
raise
await self.page.wait_for_timeout(extra_wait * 1000)
@@ -415,40 +424,35 @@ class fetcher(Fetcher):
raise ScreenshotUnavailable(url=url, status_code=self.status_code)
finally:
# Clean up resources properly with timeouts to prevent hanging
# Request garbage collection one more time before closing
try:
if hasattr(self, 'page') and self.page:
await self.page.request_gc()
await asyncio.wait_for(self.page.close(), timeout=5.0)
logger.debug(f"Successfully closed page for {url}")
except asyncio.TimeoutError:
logger.warning(f"Timed out closing page for {url} (5s)")
except Exception as e:
logger.warning(f"Error closing page for {url}: {e}")
finally:
self.page = None
await self.page.request_gc()
except:
pass
# Clean up resources properly
try:
await self.page.request_gc()
except:
pass
try:
if context:
await asyncio.wait_for(context.close(), timeout=5.0)
logger.debug(f"Successfully closed context for {url}")
except asyncio.TimeoutError:
logger.warning(f"Timed out closing context for {url} (5s)")
except Exception as e:
logger.warning(f"Error closing context for {url}: {e}")
finally:
context = None
await self.page.close()
except:
pass
self.page = None
try:
if browser:
await asyncio.wait_for(browser.close(), timeout=5.0)
logger.debug(f"Successfully closed browser connection for {url}")
except asyncio.TimeoutError:
logger.warning(f"Timed out closing browser connection for {url} (5s)")
except Exception as e:
logger.warning(f"Error closing browser for {url}: {e}")
finally:
browser = None
await context.close()
except:
pass
context = None
try:
await browser.close()
except:
pass
browser = None
# Force Python GC to release Playwright resources immediately
# Playwright objects can have circular references that delay cleanup
+17 -50
View File
@@ -1,5 +1,4 @@
import asyncio
import gc
import json
import os
import websockets.exceptions
@@ -222,36 +221,19 @@ class fetcher(Fetcher):
self.browser_connection_url += f"{r}--proxy-server={proxy_url}"
async def quit(self, watch=None):
watch_uuid = watch.get('uuid') if watch else 'unknown'
# Close page
try:
if hasattr(self, 'page') and self.page:
await asyncio.wait_for(self.page.close(), timeout=5.0)
logger.debug(f"[{watch_uuid}] Page closed successfully")
except asyncio.TimeoutError:
logger.warning(f"[{watch_uuid}] Timed out closing page (5s)")
await self.page.close()
del self.page
except Exception as e:
logger.warning(f"[{watch_uuid}] Error closing page: {e}")
finally:
self.page = None
pass
# Close browser connection
try:
if hasattr(self, 'browser') and self.browser:
await asyncio.wait_for(self.browser.close(), timeout=5.0)
logger.debug(f"[{watch_uuid}] Browser closed successfully")
except asyncio.TimeoutError:
logger.warning(f"[{watch_uuid}] Timed out closing browser (5s)")
await self.browser.close()
del self.browser
except Exception as e:
logger.warning(f"[{watch_uuid}] Error closing browser: {e}")
finally:
self.browser = None
pass
logger.info(f"[{watch_uuid}] Cleanup puppeteer complete")
# Force garbage collection to release resources
gc.collect()
logger.info("Cleanup puppeteer complete.")
async def fetch_page(self,
current_include_filters,
@@ -281,11 +263,9 @@ class fetcher(Fetcher):
# Connect directly using the specified browser_ws_endpoint
# @todo timeout
try:
logger.debug(f"[{watch_uuid}] Connecting to browser at {self.browser_connection_url}")
self.browser = await pyppeteer_instance.connect(browserWSEndpoint=self.browser_connection_url,
ignoreHTTPSErrors=True
)
logger.debug(f"[{watch_uuid}] Browser connected successfully")
except websockets.exceptions.InvalidStatusCode as e:
raise BrowserConnectError(msg=f"Error while trying to connect the browser, Code {e.status_code} (check your access, whitelist IP, password etc)")
except websockets.exceptions.InvalidURI:
@@ -294,18 +274,7 @@ class fetcher(Fetcher):
raise BrowserConnectError(msg=f"Error connecting to the browser - Exception '{str(e)}'")
# more reliable is to just request a new page
try:
logger.debug(f"[{watch_uuid}] Creating new page")
self.page = await self.browser.newPage()
logger.debug(f"[{watch_uuid}] Page created successfully")
except Exception as e:
logger.error(f"[{watch_uuid}] Failed to create new page: {e}")
# Browser is connected but page creation failed - must cleanup browser
try:
await asyncio.wait_for(self.browser.close(), timeout=3.0)
except Exception as cleanup_error:
logger.error(f"[{watch_uuid}] Failed to cleanup browser after page creation failure: {cleanup_error}")
raise
self.page = await self.browser.newPage()
# Add console handler to capture console.log from favicon fetcher
#self.page.on('console', lambda msg: logger.debug(f"Browser console [{msg.type}]: {msg.text}"))
@@ -374,12 +343,6 @@ class fetcher(Fetcher):
w = extra_wait - 2 if extra_wait > 4 else 2
logger.debug(f"Waiting {w} seconds before calling Page.stopLoading...")
await asyncio.sleep(w)
# Check if page still exists (might have been closed due to error during sleep)
if not self.page or not hasattr(self.page, '_client'):
logger.debug("Page already closed, skipping stopLoading")
return
logger.debug("Issuing stopLoading command...")
await self.page._client.send('Page.stopLoading')
logger.debug("stopLoading command sent!")
@@ -405,9 +368,7 @@ class fetcher(Fetcher):
asyncio.create_task(handle_frame_navigation())
response = await self.page.goto(url, timeout=0)
await asyncio.sleep(1 + extra_wait)
# Check if page still exists before sending command
if self.page and hasattr(self.page, '_client'):
await self.page._client.send('Page.stopLoading')
await self.page._client.send('Page.stopLoading')
if response:
break
@@ -456,7 +417,7 @@ class fetcher(Fetcher):
# Run Browser Steps here
# @todo not yet supported, we switch to playwright in this case
# if self.browser_steps:
# if self.browser_steps_get_valid_steps():
# self.iterate_browser_steps()
@@ -476,9 +437,15 @@ class fetcher(Fetcher):
logger.debug(f"Screenshot format {self.screenshot_format}")
self.screenshot = await capture_full_page(page=self.page, screenshot_format=self.screenshot_format, watch_uuid=watch_uuid, lock_viewport_elements=self.lock_viewport_elements)
# Force garbage collection - pyppeteer base64 decode creates temporary buffers
# Force aggressive memory cleanup - pyppeteer base64 decode creates temporary buffers
import gc
gc.collect()
# Release C-level memory from base64 decode back to OS
try:
import ctypes
ctypes.CDLL('libc.so.6').malloc_trim(0)
except Exception:
pass
self.xpath_data = await self.page.evaluate(XPATH_ELEMENT_JS, {
"visualselector_xpath_selectors": visualselector_xpath_selectors,
"max_height": MAX_TOTAL_HEIGHT
+5 -25
View File
@@ -3,7 +3,7 @@ import hashlib
import os
import re
import asyncio
from functools import partial
from changedetectionio import strtobool
from changedetectionio.content_fetchers.exceptions import BrowserStepsInUnsupportedFetcher, EmptyReply, Non200ErrorCodeReceived
from changedetectionio.content_fetchers.base import Fetcher
@@ -36,7 +36,7 @@ class fetcher(Fetcher):
import requests
from requests.exceptions import ProxyError, ConnectionError, RequestException
if self.browser_steps:
if self.browser_steps_get_valid_steps():
raise BrowserStepsInUnsupportedFetcher(url=url)
proxies = {}
@@ -55,26 +55,6 @@ class fetcher(Fetcher):
session = requests.Session()
# Configure retry adapter for low-level network errors only
# Retries connection timeouts, read timeouts, connection resets - not HTTP status codes
# Especially helpful in parallel test execution when servers are slow/overloaded
# Configurable via REQUESTS_RETRY_MAX_COUNT (default: 3 attempts)
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
max_retries = int(os.getenv("REQUESTS_RETRY_MAX_COUNT", "6"))
retry_strategy = Retry(
total=max_retries,
connect=max_retries, # Retry connection timeouts
read=max_retries, # Retry read timeouts
status=0, # Don't retry on HTTP status codes
backoff_factor=0.5, # Wait 0.3s, 0.6s, 1.2s between retries
allowed_methods=["HEAD", "GET", "OPTIONS", "POST"],
raise_on_status=False
)
adapter = HTTPAdapter(max_retries=retry_strategy)
session.mount("http://", adapter)
session.mount("https://", adapter)
if strtobool(os.getenv('ALLOW_FILE_URI', 'false')) and url.startswith('file://'):
from requests_file import FileAdapter
@@ -162,11 +142,10 @@ class fetcher(Fetcher):
watch_uuid=None,
):
"""Async wrapper that runs the synchronous requests code in a thread pool"""
loop = asyncio.get_event_loop()
# Run the synchronous _run_sync in a thread pool to avoid blocking the event loop
# Retry logic is handled by requests' HTTPAdapter (see _run_sync for configuration)
await loop.run_in_executor(
None, # Use default ThreadPoolExecutor
lambda: self._run_sync(
@@ -184,6 +163,7 @@ class fetcher(Fetcher):
)
async def quit(self, watch=None):
# In case they switched to `requests` fetcher from something else
# Then the screenshot could be old, in any case, it's not used here.
# REMOVE_REQUESTS_OLD_SCREENSHOTS - Mainly used for testing
+27 -88
View File
@@ -14,7 +14,7 @@ from pathlib import Path
from changedetectionio.strtobool import strtobool
from threading import Event
from changedetectionio.queue_handlers import RecheckPriorityQueue, NotificationQueue
from changedetectionio import worker_pool
from changedetectionio import worker_handler
from flask import (
Flask,
@@ -70,17 +70,13 @@ socketio_server = None
# Enable CORS, especially useful for the Chrome extension to operate from anywhere
CORS(app)
# Flask-Compress handles HTTP compression, Socket.IO compression disabled to prevent memory leak.
# There's also a bug between flask compress and socketio that causes some kind of slow memory leak
# It's better to use compression on your reverse proxy (nginx etc) instead.
if strtobool(os.getenv("FLASK_ENABLE_COMPRESSION")):
app.config['COMPRESS_MIN_SIZE'] = 2096
app.config['COMPRESS_MIMETYPES'] = ['text/html', 'text/css', 'text/javascript', 'application/json', 'application/javascript', 'image/svg+xml']
# Use gzip only - smaller memory footprint than zstd/brotli (4-8KB vs 200-500KB contexts)
app.config['COMPRESS_ALGORITHM'] = ['gzip']
# Super handy for compressing large BrowserSteps responses and others
# Flask-Compress handles HTTP compression, Socket.IO compression disabled to prevent memory leak
compress = FlaskCompress()
app.config['COMPRESS_MIN_SIZE'] = 2096
app.config['COMPRESS_MIMETYPES'] = ['text/html', 'text/css', 'text/javascript', 'application/json', 'application/javascript', 'image/svg+xml']
# Use gzip only - smaller memory footprint than zstd/brotli (4-8KB vs 200-500KB contexts)
app.config['COMPRESS_ALGORITHM'] = ['gzip']
compress.init_app(app)
app.config['TEMPLATES_AUTO_RELOAD'] = False
@@ -199,7 +195,7 @@ def _jinja2_filter_format_number_locale(value: float) -> str:
@app.template_global('is_checking_now')
def _watch_is_checking_now(watch_obj, format="%Y-%m-%d %H:%M:%S"):
return worker_pool.is_watch_running(watch_obj['uuid'])
return worker_handler.is_watch_running(watch_obj['uuid'])
@app.template_global('get_watch_queue_position')
def _get_watch_queue_position(watch_obj):
@@ -210,13 +206,13 @@ def _get_watch_queue_position(watch_obj):
@app.template_global('get_current_worker_count')
def _get_current_worker_count():
"""Get the current number of operational workers"""
return worker_pool.get_worker_count()
return worker_handler.get_worker_count()
@app.template_global('get_worker_status_info')
def _get_worker_status_info():
"""Get detailed worker status information for display"""
status = worker_pool.get_worker_status()
running_uuids = worker_pool.get_running_uuids()
status = worker_handler.get_worker_status()
running_uuids = worker_handler.get_running_uuids()
return {
'count': status['worker_count'],
@@ -270,47 +266,6 @@ def _jinja2_filter_seconds_precise(timestamp):
return format(int(time.time()-timestamp), ',d')
@app.template_filter('format_duration')
def _jinja2_filter_format_duration(seconds):
"""Format a duration in seconds into human readable string like '5 days, 3 hours, 30 minutes'"""
from datetime import timedelta
if not seconds or seconds < 0:
return gettext('0 seconds')
td = timedelta(seconds=int(seconds))
# Calculate components
years = td.days // 365
remaining_days = td.days % 365
months = remaining_days // 30
remaining_days = remaining_days % 30
weeks = remaining_days // 7
days = remaining_days % 7
hours = td.seconds // 3600
minutes = (td.seconds % 3600) // 60
secs = td.seconds % 60
# Build parts list
parts = []
if years > 0:
parts.append(f"{years} {gettext('year') if years == 1 else gettext('years')}")
if months > 0:
parts.append(f"{months} {gettext('month') if months == 1 else gettext('months')}")
if weeks > 0:
parts.append(f"{weeks} {gettext('week') if weeks == 1 else gettext('weeks')}")
if days > 0:
parts.append(f"{days} {gettext('day') if days == 1 else gettext('days')}")
if hours > 0:
parts.append(f"{hours} {gettext('hour') if hours == 1 else gettext('hours')}")
if minutes > 0:
parts.append(f"{minutes} {gettext('minute') if minutes == 1 else gettext('minutes')}")
if secs > 0 or not parts:
parts.append(f"{secs} {gettext('second') if secs == 1 else gettext('seconds')}")
return ", ".join(parts)
@app.template_filter('fetcher_status_icons')
def _jinja2_filter_fetcher_status_icons(fetcher_name):
"""Get status icon HTML for a given fetcher.
@@ -712,14 +667,8 @@ def changedetection_app(config=None, datastore_o=None):
def static_content(group, filename):
from flask import make_response
import re
# Strict sanitization: only allow a-z, 0-9, and underscore (blocks .. and other traversal)
group = re.sub(r'[^a-z0-9_-]+', '', group.lower())
filename = filename
# Additional safety: reject if sanitization resulted in empty strings
if not group or not filename:
abort(404)
group = re.sub(r'[^\w.-]+', '', group.lower())
filename = re.sub(r'[^\w.-]+', '', filename.lower())
if group == 'screenshot':
# Could be sensitive, follow password requirements
@@ -754,10 +703,10 @@ def changedetection_app(config=None, datastore_o=None):
favicon_filename = watch.get_favicon_filename()
if favicon_filename:
# Use cached MIME type detection
filepath = os.path.join(watch.data_dir, favicon_filename)
filepath = os.path.join(watch.watch_data_dir, favicon_filename)
mime = get_favicon_mime_type(filepath)
response = make_response(send_from_directory(watch.data_dir, favicon_filename))
response = make_response(send_from_directory(watch.watch_data_dir, favicon_filename))
response.headers['Content-type'] = mime
response.headers['Cache-Control'] = 'max-age=300, must-revalidate' # Cache for 5 minutes, then revalidate
return response
@@ -852,13 +801,13 @@ def changedetection_app(config=None, datastore_o=None):
# watchlist UI buttons etc
import changedetectionio.blueprint.ui as ui
app.register_blueprint(ui.construct_blueprint(datastore, update_q, worker_pool, queuedWatchMetaData, watch_check_update))
app.register_blueprint(ui.construct_blueprint(datastore, update_q, worker_handler, queuedWatchMetaData, watch_check_update))
import changedetectionio.blueprint.watchlist as watchlist
app.register_blueprint(watchlist.construct_blueprint(datastore=datastore, update_q=update_q, queuedWatchMetaData=queuedWatchMetaData), url_prefix='')
# Initialize Socket.IO server conditionally based on settings
socket_io_enabled = datastore.data['settings']['application'].get('ui', {}).get('socket_io_enabled', True)
socket_io_enabled = datastore.data['settings']['application']['ui'].get('socket_io_enabled', True)
if socket_io_enabled and app.config.get('batch_mode'):
socket_io_enabled = False
if socket_io_enabled:
@@ -889,10 +838,10 @@ def changedetection_app(config=None, datastore_o=None):
expected_workers = int(os.getenv("FETCH_WORKERS", datastore.data['settings']['requests']['workers']))
# Get basic status
status = worker_pool.get_worker_status()
status = worker_handler.get_worker_status()
# Perform health check
health_result = worker_pool.check_worker_health(
health_result = worker_handler.check_worker_health(
expected_count=expected_workers,
update_q=update_q,
notification_q=notification_q,
@@ -956,24 +905,14 @@ def changedetection_app(config=None, datastore_o=None):
# Can be overridden by ENV or use the default settings
n_workers = int(os.getenv("FETCH_WORKERS", datastore.data['settings']['requests']['workers']))
logger.info(f"Starting {n_workers} workers during app initialization")
worker_pool.start_workers(n_workers, update_q, notification_q, app, datastore)
worker_handler.start_workers(n_workers, update_q, notification_q, app, datastore)
# Skip background threads in batch mode (just process queue and exit)
batch_mode = app.config.get('batch_mode', False)
if not batch_mode:
# @todo handle ctrl break
ticker_thread = threading.Thread(target=ticker_thread_check_time_launch_checks, daemon=True, name="TickerThread-ScheduleChecker").start()
# Start configurable number of notification workers (default 1)
notification_workers = int(os.getenv("NOTIFICATION_WORKERS", "1"))
for i in range(notification_workers):
threading.Thread(
target=notification_runner,
args=(i,),
daemon=True,
name=f"NotificationRunner-{i}"
).start()
logger.info(f"Started {notification_workers} notification worker(s)")
threading.Thread(target=notification_runner, daemon=True, name="NotificationRunner").start()
in_pytest = "pytest" in sys.modules or "PYTEST_CURRENT_TEST" in os.environ
# Check for new release version, but not when running in test/build or pytest
@@ -1015,14 +954,14 @@ def check_for_new_version():
app.config.exit.wait(86400)
def notification_runner(worker_id=0):
def notification_runner():
global notification_debug_log
from datetime import datetime
import json
with app.app_context():
while not app.config.exit.is_set():
try:
# Multiple workers can run concurrently (configurable via NOTIFICATION_WORKERS)
# At the moment only one thread runs (single runner)
n_object = notification_q.get(block=False)
except queue.Empty:
app.config.exit.wait(1)
@@ -1048,7 +987,7 @@ def notification_runner(worker_id=0):
sent_obj = process_notification(n_object, datastore)
except Exception as e:
logger.error(f"Notification worker {worker_id} - Watch URL: {n_object['watch_url']} Error {str(e)}")
logger.error(f"Watch URL: {n_object['watch_url']} Error {str(e)}")
# UUID wont be present when we submit a 'test' from the global settings
if 'uuid' in n_object:
@@ -1089,7 +1028,7 @@ def ticker_thread_check_time_launch_checks():
now = time.time()
if now - last_health_check > 60:
expected_workers = int(os.getenv("FETCH_WORKERS", datastore.data['settings']['requests']['workers']))
health_result = worker_pool.check_worker_health(
health_result = worker_handler.check_worker_health(
expected_count=expected_workers,
update_q=update_q,
notification_q=notification_q,
@@ -1108,7 +1047,7 @@ def ticker_thread_check_time_launch_checks():
continue
# Get a list of watches by UUID that are currently fetching data
running_uuids = worker_pool.get_running_uuids()
running_uuids = worker_handler.get_running_uuids()
# Build set of queued UUIDs once for O(1) lookup instead of O(n) per watch
queued_uuids = {q_item.item['uuid'] for q_item in update_q.queue}
@@ -1214,7 +1153,7 @@ def ticker_thread_check_time_launch_checks():
priority = int(time.time())
# Into the queue with you
queued_successfully = worker_pool.queue_item_async_safe(update_q,
queued_successfully = worker_handler.queue_item_async_safe(update_q,
queuedWatchMetaData.PrioritizedItem(priority=priority,
item={'uuid': uuid})
)
+9 -9
View File
@@ -7,6 +7,8 @@ from flask_babel import lazy_gettext as _l, gettext
from changedetectionio.blueprint.rss import RSS_FORMAT_TYPES, RSS_TEMPLATE_TYPE_OPTIONS, RSS_TEMPLATE_HTML_DEFAULT
from changedetectionio.conditions.form import ConditionFormRow
from changedetectionio.notification_service import NotificationContextData
from changedetectionio.processors.image_ssim_diff import SCREENSHOT_COMPARISON_THRESHOLD_OPTIONS, \
SCREENSHOT_COMPARISON_THRESHOLD_OPTIONS_DEFAULT
from changedetectionio.strtobool import strtobool
from changedetectionio import processors
@@ -35,7 +37,7 @@ from changedetectionio.widgets import TernaryNoneBooleanField
# default
# each select <option data-enabled="enabled-0-0"
from changedetectionio.browser_steps.browser_steps import browser_step_ui_config
from changedetectionio.blueprint.browser_steps.browser_steps import browser_step_ui_config
from changedetectionio import html_tools, content_fetchers
@@ -492,6 +494,7 @@ class ValidateJinja2Template(object):
Validates that a {token} is from a valid set
"""
def __call__(self, form, field):
from changedetectionio import notification
from changedetectionio.jinja2_custom import create_jinja_env
from jinja2 import BaseLoader, TemplateSyntaxError, UndefinedError
from jinja2.meta import find_undeclared_variables
@@ -727,7 +730,7 @@ class quickWatchForm(Form):
url = fields.URLField(_l('URL'), validators=[validateURL()])
tags = StringTagUUID(_l('Group tag'), validators=[validators.Optional()])
watch_submit_button = SubmitField(_l('Watch'), render_kw={"class": "pure-button pure-button-primary"})
processor = RadioField(_l('Processor'), choices=lambda: processors.available_processors(), default=processors.get_default_processor)
processor = RadioField(_l('Processor'), choices=lambda: processors.available_processors(), default="text_json_diff")
edit_and_watch_submit_button = SubmitField(_l('Edit > Watch'), render_kw={"class": "pure-button pure-button-primary"})
@@ -746,7 +749,7 @@ class commonSettingsForm(Form):
notification_format = SelectField(_l('Notification format'), choices=list(valid_notification_formats.items()))
notification_title = StringField(_l('Notification Title'), default='ChangeDetection.io Notification - {{ watch_url }}', validators=[validators.Optional(), ValidateJinja2Template()])
notification_urls = StringListField(_l('Notification URL List'), validators=[validators.Optional(), ValidateAppRiseServers(), ValidateJinja2Template()])
processor = RadioField( label=_l("Processor - What do you want to achieve?"), choices=lambda: processors.available_processors(), default=processors.get_default_processor)
processor = RadioField( label=_l("Processor - What do you want to achieve?"), choices=lambda: processors.available_processors(), default="text_json_diff")
scheduler_timezone_default = StringField(_l("Default timezone for watch check scheduler"), render_kw={"list": "timezones"}, validators=[validateTimeZoneName()])
webdriver_delay = IntegerField(_l('Wait seconds before extracting text'), validators=[validators.Optional(), validators.NumberRange(min=1, message=_l("Should contain one or more seconds"))])
@@ -760,7 +763,7 @@ class commonSettingsForm(Form):
class importForm(Form):
processor = RadioField(_l('Processor'), choices=lambda: processors.available_processors(), default=processors.get_default_processor)
processor = RadioField(_l('Processor'), choices=lambda: processors.available_processors(), default="text_json_diff")
urls = TextAreaField(_l('URLs'))
xlsx_file = FileField(_l('Upload .xlsx file'), validators=[FileAllowed(['xlsx'], _l('Must be .xlsx file!'))])
file_mapping = SelectField(_l('File mapping'), [validators.DataRequired()], choices={('wachete', 'Wachete mapping'), ('custom','Custom mapping')})
@@ -817,7 +820,8 @@ class processor_text_json_diff_form(commonSettingsForm):
filter_text_removed = BooleanField(_l('Removed lines'), default=True)
trigger_text = StringListField(_l('Keyword triggers - Trigger/wait for text'), [validators.Optional(), ValidateListRegex()])
browser_steps = FieldList(FormField(SingleBrowserStep), min_entries=10)
if os.getenv("PLAYWRIGHT_DRIVER_URL"):
browser_steps = FieldList(FormField(SingleBrowserStep), min_entries=10)
text_should_not_be_present = StringListField(_l('Block change-detection while text matches'), [validators.Optional(), ValidateListRegex()])
webdriver_js_execute_code = TextAreaField(_l('Execute JavaScript before change detection'), render_kw={"rows": "5"}, validators=[validators.Optional()])
@@ -833,8 +837,6 @@ class processor_text_json_diff_form(commonSettingsForm):
conditions = FieldList(FormField(ConditionFormRow), min_entries=1) # Add rule logic here
use_page_title_in_list = TernaryNoneBooleanField(_l('Use page <title> in list'), default=None)
history_snapshot_max_length = IntegerField(_l('Number of history items per watch to keep'), render_kw={"style": "width: 5em;"}, validators=[validators.Optional(), validators.NumberRange(min=2)])
def extra_tab_content(self):
return None
@@ -1032,8 +1034,6 @@ class globalSettingsApplicationForm(commonSettingsForm):
render_kw={"style": "width: 5em;"},
validators=[validators.NumberRange(min=0,
message=_l("Should contain zero or more attempts"))])
history_snapshot_max_length = IntegerField(_l('Number of history items per watch to keep'), render_kw={"style": "width: 5em;"}, validators=[validators.Optional(), validators.NumberRange(min=2)])
ui = FormField(globalSettingsApplicationUIForm)
@@ -52,13 +52,7 @@ def render(template_str, **args: t.Any) -> str:
return output[:JINJA2_MAX_RETURN_PAYLOAD_SIZE]
def render_fully_escaped(content):
"""
Escape HTML content safely.
MEMORY LEAK FIX: Use markupsafe.escape() directly instead of creating
Jinja2 environments (was causing 1M+ compilations per page load).
Simpler, faster, and no concerns about environment state.
"""
from markupsafe import escape
return str(escape(content))
env = jinja2.sandbox.ImmutableSandboxedEnvironment(autoescape=True)
template = env.from_string("{{ some_html|e }}")
return template.render(some_html=content)
+1 -2
View File
@@ -29,7 +29,7 @@ class model(dict):
'proxy': None, # Preferred proxy connection
'time_between_check': {'weeks': None, 'days': None, 'hours': 3, 'minutes': None, 'seconds': None},
'timeout': int(getenv("DEFAULT_SETTINGS_REQUESTS_TIMEOUT", "45")), # Default 45 seconds
'workers': int(getenv("DEFAULT_SETTINGS_REQUESTS_WORKERS", "5")), # Number of threads, lower is better for slow connections
'workers': int(getenv("DEFAULT_SETTINGS_REQUESTS_WORKERS", "10")), # Number of threads, lower is better for slow connections
'default_ua': {
'html_requests': getenv("DEFAULT_SETTINGS_HEADERS_USERAGENT", DEFAULT_SETTINGS_HEADERS_USERAGENT),
'html_webdriver': None,
@@ -46,7 +46,6 @@ class model(dict):
'filter_failure_notification_threshold_attempts': _FILTER_FAILURE_THRESHOLD_ATTEMPTS_DEFAULT,
'global_ignore_text': [], # List of text to ignore when calculating the comparison checksum
'global_subtractive_selectors': [],
'history_snapshot_max_length': None,
'ignore_whitespace': True,
'ignore_status_codes': False, #@todo implement, as ternary.
'ssim_threshold': '0.96', # Default SSIM threshold for screenshot comparison
+1 -43
View File
@@ -1,48 +1,10 @@
"""
Tag/Group domain model for organizing and overriding watch settings.
ARCHITECTURE NOTE: Configuration Override Hierarchy
===================================================
Tags can override Watch settings when overrides_watch=True.
Current implementation requires manual checking in processors:
for tag_uuid in watch.get('tags'):
tag = datastore['settings']['application']['tags'][tag_uuid]
if tag.get('overrides_watch'):
restock_settings = tag.get('restock_settings', {})
break
With Pydantic, this would be automatic via chain resolution:
Watch Tag (first with overrides_watch) Global
See: Watch.py model docstring for full Pydantic architecture explanation
See: processors/restock_diff/processor.py:184-192 for current manual implementation
"""
from changedetectionio.model import watch_base
from changedetectionio.model.persistence import EntityPersistenceMixin
class model(EntityPersistenceMixin, watch_base):
"""
Tag domain model - groups watches and can override their settings.
Tags inherit from watch_base to reuse all the same fields as Watch.
When overrides_watch=True, tag settings take precedence over watch settings
for all watches in this tag/group.
Fields:
overrides_watch (bool): If True, this tag's settings override watch settings
title (str): Display name for this tag/group
uuid (str): Unique identifier
... (all fields from watch_base can be set as tag-level overrides)
Resolution order when overrides_watch=True:
Watch.field Tag.field (if overrides_watch) Global.field
"""
class model(watch_base):
def __init__(self, *arg, **kw):
# Parent class (watch_base) handles __datastore and __datastore_path
super(model, self).__init__(*arg, **kw)
self['overrides_watch'] = kw.get('default', {}).get('overrides_watch')
@@ -50,7 +12,3 @@ class model(EntityPersistenceMixin, watch_base):
if kw.get('default'):
self.update(kw['default'])
del kw['default']
# _save_to_disk() method provided by EntityPersistenceMixin
# commit() and _get_commit_data() methods inherited from watch_base
# Tag uses default _get_commit_data() (includes all keys)
+69 -304
View File
@@ -1,37 +1,9 @@
"""
Watch domain model for change detection monitoring.
ARCHITECTURE NOTE: Configuration Override Hierarchy
===================================================
This module implements Watch objects that inherit from dict (technical debt).
The dream architecture would use Pydantic for:
1. CHAIN RESOLUTION (Watch Tag Global Settings)
- Current: Manual resolution scattered across codebase
- Future: @computed_field properties with automatic resolution
- Examples: resolved_fetch_backend, resolved_restock_settings, etc.
2. DATABASE BACKEND ABSTRACTION
- Current: Domain model tightly coupled to file-based JSON storage
- Future: Domain model (Pydantic) separate from persistence layer
- Enables: Easy migration to PostgreSQL, MongoDB, etc.
3. TYPE SAFETY & VALIDATION
- Current: Dict access with no compile-time checks
- Future: Type hints, IDE autocomplete, validation at boundaries
See class model docstring for detailed explanation and examples.
See: processors/restock_diff/processor.py:184-192 for manual resolution example
"""
from blinker import signal
from changedetectionio.validate_url import is_safe_valid_url
from changedetectionio.strtobool import strtobool
from changedetectionio.jinja2_custom import render as jinja_render
from . import watch_base
from .persistence import EntityPersistenceMixin
import os
import re
from pathlib import Path
@@ -41,7 +13,7 @@ from .. import jinja2_custom as safe_jinja
from ..html_tools import TRANSLATE_WHITESPACE_TABLE
FAVICON_RESAVE_THRESHOLD_SECONDS=86400
BROTLI_COMPRESS_SIZE_THRESHOLD = int(os.getenv('SNAPSHOT_BROTLI_COMPRESSION_THRESHOLD', 1024*20))
BROTLI_COMPRESS_SIZE_THRESHOLD = int(os.getenv('SNAPSHOT_BROTLI_COMPRESSION_THRESHOLD', 1024))
minimum_seconds_recheck_time = int(os.getenv('MINIMUM_SECONDS_RECHECK_TIME', 3))
mtable = {'seconds': 1, 'minutes': 60, 'hours': 3600, 'days': 86400, 'weeks': 86400 * 7}
@@ -128,112 +100,17 @@ def _brotli_save(contents, filepath, mode=None, fallback_uncompressed=False):
raise Exception(f"Brotli compression failed for {filepath}: {e}")
class model(EntityPersistenceMixin, watch_base):
"""
Watch domain model for monitoring URL changes.
Inherits from watch_base (which inherits dict) - see watch_base docstring for field documentation.
## Configuration Override Hierarchy (Chain Resolution)
The dream architecture uses a 3-level resolution chain:
Watch settings Tag/Group settings Global settings
Current implementation is MANUAL (see processor.py:184-192 for example):
- Processors manually check watch.get('field')
- Then loop through watch.tags to find first tag with overrides_watch=True
- Finally fall back to datastore['settings']['application']['field']
FUTURE: Pydantic-based chain resolution would enable:
```python
# Instead of manual resolution in every processor:
restock_settings = watch.get('restock_settings', {})
for tag_uuid in watch.get('tags'):
tag = datastore['settings']['application']['tags'][tag_uuid]
if tag.get('overrides_watch'):
restock_settings = tag.get('restock_settings', {})
break
# Clean computed properties with automatic resolution:
@computed_field
def resolved_restock_settings(self) -> dict:
if self.restock_settings:
return self.restock_settings
for tag_uuid in self.tags:
tag = self._datastore.get_tag(tag_uuid)
if tag.overrides_watch and tag.restock_settings:
return tag.restock_settings
return self._datastore.settings.restock_settings or {}
# Usage: watch.resolved_restock_settings (automatic, type-safe, tested once)
```
Benefits of Pydantic migration:
1. Single source of truth for resolution logic (not scattered across processors)
2. Type safety + IDE autocomplete (watch.resolved_fetch_backend vs dict navigation)
3. Database backend abstraction (domain model separate from persistence)
4. Automatic validation at boundaries
5. Self-documenting via type hints
6. Easy to test resolution independently
Resolution chain examples that would benefit:
- fetch_backend: watch tag global (see get_fetch_backend property)
- notification_urls: watch tag global
- time_between_check: watch global (see threshold_seconds)
- restock_settings: watch tag (see processors/restock_diff/processor.py:184-192)
- history_snapshot_max_length: watch global (see save_history_blob:550-556)
- All processor_config_* settings could use tag overrides
## Database Backend Abstraction with Pydantic
Current: Watch inherits dict, tightly coupled to file-based JSON storage
Future: Domain model (Watch) separate from persistence layer
```python
# Domain model (database-agnostic)
class Watch(BaseModel):
uuid: str
url: str
# ... validation, business logic
# Pluggable backends
class DataStoreBackend(ABC):
def save_watch(self, watch: Watch): ...
def load_watch(self, uuid: str) -> Watch: ...
# Implementations: FileBackend, MongoBackend, PostgresBackend, etc.
```
This would enable:
- Easy migration between storage backends (file postgres mongodb)
- Pydantic handles serialization/deserialization automatically
- Domain logic stays clean (no storage concerns in Watch methods)
## Migration Path
Given existing codebase, incremental migration recommended:
1. Create Pydantic models alongside existing dict-based models
2. Add .to_pydantic() / .from_pydantic() bridge methods
3. Gradually migrate code to use Pydantic models
4. Remove dict inheritance once migration complete
See: watch_base docstring for technical debt discussion
See: processors/restock_diff/processor.py:184-192 for manual resolution example
See: Watch.py:550-556 for nested dict navigation that would become watch.resolved_*
"""
class model(watch_base):
__newest_history_key = None
__history_n = 0
jitter_seconds = 0
def __init__(self, *arg, **kw):
# Validate __datastore before calling parent (Watch requires it)
if not kw.get('__datastore'):
raise ValueError("Watch object requires '__datastore' reference - cannot access global settings without it")
# Parent class (watch_base) handles __datastore and __datastore_path
self.__datastore_path = kw.get('datastore_path')
if kw.get('datastore_path'):
del kw['datastore_path']
super(model, self).__init__(*arg, **kw)
if kw.get('default'):
self.update(kw['default'])
del kw['default']
@@ -244,9 +121,6 @@ class model(EntityPersistenceMixin, watch_base):
# Be sure the cached timestamp is ready
bump = self.history
# Note: __deepcopy__, __getstate__, and __setstate__ are inherited from watch_base
# This prevents memory leaks by sharing __datastore reference instead of copying it
@property
def viewed(self):
# Don't return viewed when last_viewed is 0 and newest_key is 0
@@ -259,6 +133,11 @@ class model(EntityPersistenceMixin, watch_base):
def has_unviewed(self):
return int(self.newest_history_key) > int(self['last_viewed']) and self.__history_n >= 2
def ensure_data_dir_exists(self):
if not os.path.isdir(self.watch_data_dir):
logger.debug(f"> Creating data dir {self.watch_data_dir}")
os.mkdir(self.watch_data_dir)
@property
def link(self):
@@ -314,8 +193,7 @@ class model(EntityPersistenceMixin, watch_base):
# JSON Data, Screenshots, Textfiles (history index and snapshots), HTML in the future etc
# But preserve processor config files (they're configuration, not history data)
# Use glob not rglob here for safety.
for item in pathlib.Path(str(self.data_dir)).glob("*.*"):
for item in pathlib.Path(str(self.watch_data_dir)).rglob("*.*"):
# Skip processor config files
if item.name in processor_config_files:
continue
@@ -335,6 +213,7 @@ class model(EntityPersistenceMixin, watch_base):
'last_notification_error': False,
'last_viewed': 0,
'previous_md5': False,
'previous_md5_before_filters': False,
'remote_server_reply': None,
'track_ldjson_price_data': None
})
@@ -351,30 +230,8 @@ class model(EntityPersistenceMixin, watch_base):
@property
def get_fetch_backend(self):
"""
Get the fetch backend for this watch with special case handling.
CHAIN RESOLUTION OPPORTUNITY:
Currently returns watch.fetch_backend directly, but doesn't implement
Watch Tag Global resolution chain. With Pydantic:
@computed_field
def resolved_fetch_backend(self) -> str:
# Special case: PDFs always use html_requests
if self.is_pdf:
return 'html_requests'
# Watch override
if self.fetch_backend and self.fetch_backend != 'system':
return self.fetch_backend
# Tag override (first tag with overrides_watch=True wins)
for tag_uuid in self.tags:
tag = self._datastore.get_tag(tag_uuid)
if tag.overrides_watch and tag.fetch_backend:
return tag.fetch_backend
# Global default
return self._datastore.settings.fetch_backend
Like just using the `fetch_backend` key but there could be some logic
:return:
"""
# Maybe also if is_image etc?
# This is because chrome/playwright wont render the PDF in the browser and we will just fetch it and use pdf2html to see the text.
@@ -385,16 +242,10 @@ class model(EntityPersistenceMixin, watch_base):
@property
def is_pdf(self):
url = str(self.get("url") or "").lower()
content_type = str(self.get("content-type") or "").lower()
if content_type in ("none", "null", ""):
content_type = ""
return (
url.endswith(".pdf")
or content_type.split(";")[0].strip() == "application/pdf"
)
# content_type field is set in the future
# https://github.com/dgtlmoon/changedetection.io/issues/1392
# Not sure the best logic here
return self.get('url', '').lower().endswith('.pdf') or 'pdf' in self.get('content_type', '').lower()
@property
def label(self):
@@ -429,11 +280,11 @@ class model(EntityPersistenceMixin, watch_base):
tmp_history = {}
# In the case we are only using the watch for processing without history
if not self.data_dir:
if not self.watch_data_dir:
return []
# Read the history file as a dict
fname = os.path.join(self.data_dir, self.history_index_filename)
fname = os.path.join(self.watch_data_dir, self.history_index_filename)
if os.path.isfile(fname):
logger.debug(f"Reading watch history index for {self.get('uuid')}")
with open(fname, "r", encoding='utf-8') as f:
@@ -446,13 +297,13 @@ class model(EntityPersistenceMixin, watch_base):
# Cross-platform: check for any path separator (works on Windows and Unix)
if os.sep not in v and '/' not in v and '\\' not in v:
# Relative filename only, no path separators
v = os.path.join(self.data_dir, v)
v = os.path.join(self.watch_data_dir, v)
else:
# It's possible that they moved the datadir on older versions
# So the snapshot exists but is in a different path
# Cross-platform: use os.path.basename instead of split('/')
snapshot_fname = os.path.basename(v)
proposed_new_path = os.path.join(self.data_dir, snapshot_fname)
proposed_new_path = os.path.join(self.watch_data_dir, snapshot_fname)
if not os.path.exists(v) and os.path.exists(proposed_new_path):
v = proposed_new_path
@@ -469,7 +320,7 @@ class model(EntityPersistenceMixin, watch_base):
@property
def has_history(self):
fname = os.path.join(self.data_dir, self.history_index_filename)
fname = os.path.join(self.watch_data_dir, self.history_index_filename)
return os.path.isfile(fname)
@property
@@ -572,49 +423,16 @@ class model(EntityPersistenceMixin, watch_base):
with open(filepath, 'r', encoding='utf-8', errors='ignore') as f:
return f.read()
def _write_atomic(self, dest, data, mode='wb'):
def _write_atomic(self, dest, data):
"""Write data atomically to dest using a temp file"""
import tempfile
with tempfile.NamedTemporaryFile(mode, delete=False, dir=self.data_dir) as tmp:
tmp.write(data)
tmp.flush()
os.fsync(tmp.fileno())
tmp_path = tmp.name
os.replace(tmp_path, dest)
def history_trim(self, newest_n_items):
from pathlib import Path
import gc
# Sort by timestamp (key)
sorted_items = sorted(self.history.items(), key=lambda x: int(x[0]))
keep_part = dict(sorted_items[-newest_n_items:])
delete_part = dict(sorted_items[:-newest_n_items])
logger.info( f"[{self.get('uuid')}] Trimming history to most recent {newest_n_items} items, keeping {len(keep_part)} items deleting {len(delete_part)} items.")
if delete_part:
for item in delete_part.items():
try:
Path(item[1]).unlink(missing_ok=True)
except Exception as e:
logger.critical(f"{str(e)}")
finally:
logger.debug(f"[{self.get('uuid')}] Deleted {item[1]} history snapshot")
try:
dest = os.path.join(self.data_dir, self.history_index_filename)
output = "\r\n".join(
f"{k},{Path(v).name}"
for k, v in keep_part.items()
)+"\r\n"
self._write_atomic(dest=dest, data=output, mode='w')
except Exception as e:
logger.critical(f"{str(e)}")
finally:
logger.debug(f"[{self.get('uuid')}] Updated history index {dest}")
# reimport
bump = self.history
gc.collect()
if not os.path.exists(dest):
import tempfile
with tempfile.NamedTemporaryFile('wb', delete=False, dir=self.watch_data_dir) as tmp:
tmp.write(data)
tmp.flush()
os.fsync(tmp.fileno())
tmp_path = tmp.name
os.replace(tmp_path, dest)
# Save some text file to the appropriate path and bump the history
# result_obj from fetch_site_status.run()
@@ -623,6 +441,7 @@ class model(EntityPersistenceMixin, watch_base):
logger.trace(f"{self.get('uuid')} - Updating {self.history_index_filename} with timestamp {timestamp}")
self.ensure_data_dir_exists()
skip_brotli = strtobool(os.getenv('DISABLE_BROTLI_TEXT_SNAPSHOT', 'False'))
# Binary data - detect file type and save without compression
@@ -640,7 +459,7 @@ class model(EntityPersistenceMixin, watch_base):
ext = 'bin'
snapshot_fname = f"{snapshot_id}.{ext}"
dest = os.path.join(self.data_dir, snapshot_fname)
dest = os.path.join(self.watch_data_dir, snapshot_fname)
self._write_atomic(dest, contents)
logger.trace(f"Saved binary snapshot as {snapshot_fname} ({len(contents)} bytes)")
@@ -650,7 +469,7 @@ class model(EntityPersistenceMixin, watch_base):
# Compressed text
import brotli
snapshot_fname = f"{snapshot_id}.txt.br"
dest = os.path.join(self.data_dir, snapshot_fname)
dest = os.path.join(self.watch_data_dir, snapshot_fname)
if not os.path.exists(dest):
try:
@@ -661,16 +480,16 @@ class model(EntityPersistenceMixin, watch_base):
logger.error(f"{self.get('uuid')} - Brotli compression failed: {e}")
# Fallback to uncompressed
snapshot_fname = f"{snapshot_id}.txt"
dest = os.path.join(self.data_dir, snapshot_fname)
dest = os.path.join(self.watch_data_dir, snapshot_fname)
self._write_atomic(dest, contents.encode('utf-8'))
else:
# Plain text
snapshot_fname = f"{snapshot_id}.txt"
dest = os.path.join(self.data_dir, snapshot_fname)
dest = os.path.join(self.watch_data_dir, snapshot_fname)
self._write_atomic(dest, contents.encode('utf-8'))
# Append to history.txt atomically
index_fname = os.path.join(self.data_dir, self.history_index_filename)
index_fname = os.path.join(self.watch_data_dir, self.history_index_filename)
index_line = f"{timestamp},{snapshot_fname}\n"
with open(index_fname, 'a', encoding='utf-8') as f:
@@ -682,17 +501,6 @@ class model(EntityPersistenceMixin, watch_base):
self.__newest_history_key = timestamp
self.__history_n += 1
# MANUAL CHAIN RESOLUTION: Watch → Global
# With Pydantic, this would become: maxlen = watch.resolved_history_snapshot_max_length
# @computed_field def resolved_history_snapshot_max_length(self) -> Optional[int]:
# if self.history_snapshot_max_length: return self.history_snapshot_max_length
# if tag := self._get_override_tag(): return tag.history_snapshot_max_length
# return self._datastore.settings.history_snapshot_max_length
maxlen = self.get('history_snapshot_max_length') or self.get_global_setting('application', 'history_snapshot_max_length')
if maxlen and self.__history_n and self.__history_n > maxlen:
self.history_trim(newest_n_items=maxlen)
# @todo bump static cache of the last timestamp so we dont need to examine the file to set a proper ''viewed'' status
return snapshot_fname
@@ -745,7 +553,7 @@ class model(EntityPersistenceMixin, watch_base):
return not local_lines.issubset(existing_history)
def get_screenshot(self):
fname = os.path.join(self.data_dir, "last-screenshot.png")
fname = os.path.join(self.watch_data_dir, "last-screenshot.png")
if os.path.isfile(fname):
return fname
@@ -760,7 +568,7 @@ class model(EntityPersistenceMixin, watch_base):
if not favicon_fname:
return True
try:
fname = next(iter(glob.glob(os.path.join(self.data_dir, "favicon.*"))), None)
fname = next(iter(glob.glob(os.path.join(self.watch_data_dir, "favicon.*"))), None)
logger.trace(f"Favicon file maybe found at {fname}")
if os.path.isfile(fname):
file_age = int(time.time() - os.path.getmtime(fname))
@@ -793,7 +601,7 @@ class model(EntityPersistenceMixin, watch_base):
base = "favicon"
extension = "ico"
fname = os.path.join(self.data_dir, f"favicon.{extension}")
fname = os.path.join(self.watch_data_dir, f"favicon.{extension}")
try:
# validate=True makes sure the string only contains valid base64 chars
@@ -805,11 +613,6 @@ class model(EntityPersistenceMixin, watch_base):
try:
with open(fname, 'wb') as f:
f.write(decoded)
# Invalidate favicon filename cache
if hasattr(self, '_favicon_filename_cache'):
delattr(self, '_favicon_filename_cache')
# A signal that could trigger the socket server to update the browser also
watch_check_update = signal('watch_favicon_bump')
if watch_check_update:
@@ -826,32 +629,20 @@ class model(EntityPersistenceMixin, watch_base):
Find any favicon.* file in the current working directory
and return the contents of the newest one.
MEMORY LEAK FIX: Cache the result to avoid repeated glob.glob() operations.
glob.glob() causes millions of fnmatch allocations when called for every watch on page load.
Returns:
str: Basename of the newest favicon file, or None if not found.
bytes: Contents of the newest favicon file, or None if not found.
"""
# Check cache first (prevents 26M+ allocations from repeated glob operations)
cache_key = '_favicon_filename_cache'
if hasattr(self, cache_key):
return getattr(self, cache_key)
import glob
# Search for all favicon.* files
files = glob.glob(os.path.join(self.data_dir, "favicon.*"))
files = glob.glob(os.path.join(self.watch_data_dir, "favicon.*"))
if not files:
result = None
else:
# Find the newest by modification time
newest_file = max(files, key=os.path.getmtime)
result = os.path.basename(newest_file)
return None
# Cache the result
setattr(self, cache_key, result)
return result
# Find the newest by modification time
newest_file = max(files, key=os.path.getmtime)
return os.path.basename(newest_file)
def get_screenshot_as_thumbnail(self, max_age=3200):
"""Return path to a square thumbnail of the most recent screenshot.
@@ -867,7 +658,7 @@ class model(EntityPersistenceMixin, watch_base):
import os
import time
thumbnail_path = os.path.join(self.data_dir, "thumbnail.jpeg")
thumbnail_path = os.path.join(self.watch_data_dir, "thumbnail.jpeg")
top_trim = 500 # Pixels from top of screenshot to use
screenshot_path = self.get_screenshot()
@@ -918,7 +709,7 @@ class model(EntityPersistenceMixin, watch_base):
return None
def __get_file_ctime(self, filename):
fname = os.path.join(self.data_dir, filename)
fname = os.path.join(self.watch_data_dir, filename)
if os.path.isfile(fname):
return int(os.path.getmtime(fname))
return False
@@ -943,9 +734,14 @@ class model(EntityPersistenceMixin, watch_base):
def snapshot_error_screenshot_ctime(self):
return self.__get_file_ctime('last-error-screenshot.png')
@property
def watch_data_dir(self):
# The base dir of the watch data
return os.path.join(self.__datastore_path, self['uuid']) if self.__datastore_path else None
def get_error_text(self):
"""Return the text saved from a previous request that resulted in a non-200 error"""
fname = os.path.join(self.data_dir, "last-error.txt")
fname = os.path.join(self.watch_data_dir, "last-error.txt")
if os.path.isfile(fname):
with open(fname, 'r', encoding='utf-8') as f:
return f.read()
@@ -953,7 +749,7 @@ class model(EntityPersistenceMixin, watch_base):
def get_error_snapshot(self):
"""Return path to the screenshot that resulted in a non-200 error"""
fname = os.path.join(self.data_dir, "last-error-screenshot.png")
fname = os.path.join(self.watch_data_dir, "last-error-screenshot.png")
if os.path.isfile(fname):
return fname
return False
@@ -977,37 +773,6 @@ class model(EntityPersistenceMixin, watch_base):
def toggle_mute(self):
self['notification_muted'] ^= True
def _get_commit_data(self):
"""
Prepare watch data for commit.
Excludes processor_config_* keys (stored in separate files).
Normalizes browser_steps to empty list if no meaningful steps.
"""
import copy
# Get base snapshot with lock
lock = self._datastore.lock if self._datastore and hasattr(self._datastore, 'lock') else None
if lock:
with lock:
snapshot = dict(self)
else:
snapshot = dict(self)
# Exclude processor config keys (stored separately)
watch_dict = {k: copy.deepcopy(v) for k, v in snapshot.items() if not k.startswith('processor_config_')}
# Normalize browser_steps: if no meaningful steps, save as empty list
if not self.has_browser_steps:
watch_dict['browser_steps'] = []
return watch_dict
# _save_to_disk() method provided by EntityPersistenceMixin
# commit() method inherited from watch_base
def extra_notification_token_values(self):
# Used for providing extra tokens
# return {'widget': 555}
@@ -1037,7 +802,7 @@ class model(EntityPersistenceMixin, watch_base):
if not csv_writer:
# A file on the disk can be transferred much faster via flask than a string reply
csv_output_filename = f"report-{self.get('uuid')}.csv"
f = open(os.path.join(self.data_dir, csv_output_filename), 'w')
f = open(os.path.join(self.watch_data_dir, csv_output_filename), 'w')
# @todo some headers in the future
#fieldnames = ['Epoch seconds', 'Date']
csv_writer = csv.writer(f,
@@ -1079,7 +844,7 @@ class model(EntityPersistenceMixin, watch_base):
def save_error_text(self, contents):
self.ensure_data_dir_exists()
target_path = os.path.join(self.data_dir, "last-error.txt")
target_path = os.path.join(self.watch_data_dir, "last-error.txt")
with open(target_path, 'w', encoding='utf-8') as f:
f.write(contents)
@@ -1088,9 +853,9 @@ class model(EntityPersistenceMixin, watch_base):
import zlib
if as_error:
target_path = os.path.join(str(self.data_dir), "elements-error.deflate")
target_path = os.path.join(str(self.watch_data_dir), "elements-error.deflate")
else:
target_path = os.path.join(str(self.data_dir), "elements.deflate")
target_path = os.path.join(str(self.watch_data_dir), "elements.deflate")
self.ensure_data_dir_exists()
@@ -1105,9 +870,9 @@ class model(EntityPersistenceMixin, watch_base):
def save_screenshot(self, screenshot: bytes, as_error=False):
if as_error:
target_path = os.path.join(self.data_dir, "last-error-screenshot.png")
target_path = os.path.join(self.watch_data_dir, "last-error-screenshot.png")
else:
target_path = os.path.join(self.data_dir, "last-screenshot.png")
target_path = os.path.join(self.watch_data_dir, "last-screenshot.png")
self.ensure_data_dir_exists()
@@ -1118,7 +883,7 @@ class model(EntityPersistenceMixin, watch_base):
def get_last_fetched_text_before_filters(self):
import brotli
filepath = os.path.join(self.data_dir, 'last-fetched.br')
filepath = os.path.join(self.watch_data_dir, 'last-fetched.br')
if not os.path.isfile(filepath) or os.path.getsize(filepath) == 0:
# If a previous attempt doesnt yet exist, just snarf the previous snapshot instead
@@ -1133,13 +898,13 @@ class model(EntityPersistenceMixin, watch_base):
def save_last_text_fetched_before_filters(self, contents):
import brotli
filepath = os.path.join(self.data_dir, 'last-fetched.br')
filepath = os.path.join(self.watch_data_dir, 'last-fetched.br')
_brotli_save(contents, filepath, mode=brotli.MODE_TEXT, fallback_uncompressed=False)
def save_last_fetched_html(self, timestamp, contents):
self.ensure_data_dir_exists()
snapshot_fname = f"{timestamp}.html.br"
filepath = os.path.join(self.data_dir, snapshot_fname)
filepath = os.path.join(self.watch_data_dir, snapshot_fname)
_brotli_save(contents, filepath, mode=None, fallback_uncompressed=True)
self._prune_last_fetched_html_snapshots()
@@ -1147,7 +912,7 @@ class model(EntityPersistenceMixin, watch_base):
import brotli
snapshot_fname = f"{timestamp}.html.br"
filepath = os.path.join(self.data_dir, snapshot_fname)
filepath = os.path.join(self.watch_data_dir, snapshot_fname)
if os.path.isfile(filepath):
with open(filepath, 'rb') as f:
return (brotli.decompress(f.read()).decode('utf-8'))
@@ -1162,7 +927,7 @@ class model(EntityPersistenceMixin, watch_base):
for index, timestamp in enumerate(dates):
snapshot_fname = f"{timestamp}.html.br"
filepath = os.path.join(self.data_dir, snapshot_fname)
filepath = os.path.join(self.watch_data_dir, snapshot_fname)
# Keep only the first 2
if index > 1 and os.path.isfile(filepath):
@@ -1173,7 +938,7 @@ class model(EntityPersistenceMixin, watch_base):
def get_browsersteps_available_screenshots(self):
"For knowing which screenshots are available to show the user in BrowserSteps UI"
available = []
for f in Path(self.data_dir).glob('step_before-*.jpeg'):
for f in Path(self.watch_data_dir).glob('step_before-*.jpeg'):
step_n=re.search(r'step_before-(\d+)', f.name)
if step_n:
available.append(step_n.group(1))
+3 -533
View File
@@ -2,175 +2,12 @@ import os
import uuid
from changedetectionio import strtobool
from .persistence import EntityPersistenceMixin, _determine_entity_type
__all__ = ['EntityPersistenceMixin', 'watch_base']
from ..browser_steps.browser_steps import browser_steps_get_valid_steps
USE_SYSTEM_DEFAULT_NOTIFICATION_FORMAT_FOR_WATCH = 'System default'
CONDITIONS_MATCH_LOGIC_DEFAULT = 'ALL'
class watch_base(dict):
"""
Base watch domain model (inherits from dict for backward compatibility).
WARNING: This class inherits from dict, which violates proper encapsulation.
Dict inheritance is legacy technical debt that should be refactored to a proper
domain model (e.g., Pydantic BaseModel) for better type safety and validation.
TODO: Migrate to Pydantic BaseModel for:
- Type safety and IDE autocomplete
- Automatic validation
- Clear separation between domain model and serialization
- Database backend abstraction (file postgres mongodb)
- Configuration override chain resolution (Watch Tag Global)
- Immutability options
- Better testing
- USE https://docs.pydantic.dev/latest/integrations/datamodel_code_generator TO BUILD THE MODEL FROM THE API-SPEC!!!
CHAIN RESOLUTION ARCHITECTURE:
The dream is a 3-level override hierarchy:
Watch settings Tag/Group settings Global settings
Current implementation: MANUAL resolution scattered across codebase
- Processors manually check watch.get('field')
- Loop through tags to find overrides_watch=True
- Fall back to datastore['settings']['application']['field']
Pydantic implementation: AUTOMATIC resolution via @computed_field
- Single source of truth for each setting's resolution logic
- Type-safe, testable, self-documenting
- Example: watch.resolved_fetch_backend (instead of nested dict navigation)
See: Watch.py model docstring for detailed Pydantic architecture plan
See: Tag.py model docstring for tag override explanation
See: processors/restock_diff/processor.py:184-192 for current manual example
Core Fields:
uuid (str): Unique identifier for this watch (auto-generated)
url (str): Target URL to monitor for changes
title (str|None): Custom display name (overrides page_title if set)
page_title (str|None): Title extracted from <title> tag of monitored page
tags (List[str]): List of tag UUIDs for categorization
tag (str): DEPRECATED - Old single-tag system, use tags instead
Check Configuration:
processor (str): Processor type ('text_json_diff', 'restock_diff', etc.)
fetch_backend (str): Fetcher to use ('system', 'html_requests', 'playwright', etc.)
method (str): HTTP method ('GET', 'POST', etc.)
headers (dict): Custom HTTP headers to send
proxy (str|None): Preferred proxy server
paused (bool): Whether change detection is paused
Scheduling:
time_between_check (dict): Check interval {'weeks': int, 'days': int, 'hours': int, 'minutes': int, 'seconds': int}
time_between_check_use_default (bool): Use global default interval if True
time_schedule_limit (dict): Weekly schedule limiting when checks can run
Structure: {
'enabled': bool,
'monday/tuesday/.../sunday': {
'enabled': bool,
'start_time': str ('HH:MM'),
'duration': {'hours': str, 'minutes': str}
}
}
Content Filtering:
include_filters (List[str]): CSS/XPath selectors to extract content
subtractive_selectors (List[str]): Selectors to remove from content
ignore_text (List[str]): Text patterns to ignore in change detection
trigger_text (List[str]): Text/regex that must be present to trigger change
text_should_not_be_present (List[str]): Text that should NOT be present
extract_text (List[str]): Regex patterns to extract specific text after filtering
Text Processing:
trim_text_whitespace (bool): Strip leading/trailing whitespace
sort_text_alphabetically (bool): Sort lines alphabetically before comparison
remove_duplicate_lines (bool): Remove duplicate lines
check_unique_lines (bool): Compare against all history for unique lines
strip_ignored_lines (bool|None): Remove lines matching ignore patterns
Change Detection Filters:
filter_text_added (bool): Include added text in change detection
filter_text_removed (bool): Include removed text in change detection
filter_text_replaced (bool): Include replaced text in change detection
Browser Automation:
browser_steps (List[dict]): Browser automation steps for JS-heavy sites
browser_steps_last_error_step (int|None): Last step that caused error
webdriver_delay (int|None): Seconds to wait after page load
webdriver_js_execute_code (str|None): JavaScript to execute before extraction
Restock Detection:
in_stock_only (bool): Only trigger on in-stock transitions
follow_price_changes (bool): Monitor price changes
has_ldjson_price_data (bool|None): Whether page has LD-JSON price data
track_ldjson_price_data (str|None): Track LD-JSON price data ('ACCEPT', 'REJECT', None)
price_change_threshold_percent (float|None): Minimum price change % to trigger
Notifications:
notification_urls (List[str]): Apprise URLs for notifications
notification_title (str|None): Custom notification title template
notification_body (str|None): Custom notification body template
notification_format (str): Notification format (e.g., 'System default', 'Text', 'HTML')
notification_muted (bool): Disable notifications for this watch
notification_screenshot (bool): Include screenshot in notifications
notification_alert_count (int): Number of notifications sent
last_notification_error (str|None): Last notification error message
body (str|None): DEPRECATED? Legacy notification body field
filter_failure_notification_send (bool): Send notification on filter failures
History & State:
date_created (int|None): Unix timestamp of watch creation
last_checked (int): Unix timestamp of last check
last_viewed (int): History snapshot key of last user view
last_error (str|bool): Last error message or False if no error
check_count (int): Total number of checks performed
fetch_time (float): Duration of last fetch in seconds
consecutive_filter_failures (int): Counter for consecutive filter match failures
previous_md5 (str|bool): MD5 hash of previous content
history_snapshot_max_length (int|None): Max history snapshots to keep (None = use global)
Conditions:
conditions (dict): Custom conditions for change detection logic
conditions_match_logic (str): Logic operator ('ALL', 'ANY') for conditions
Metadata:
content-type (str|None): Content-Type from last fetch
remote_server_reply (str|None): Server header from last response
ignore_status_codes (List[int]|None): HTTP status codes to ignore
use_page_title_in_list (bool|None): Display page title in watch list (None = use system default)
Instance Attributes (not serialized):
__datastore: Reference to parent DataStore (set externally after creation)
data_dir: Filesystem path for this watch's data directory
Notes:
- Many fields default to None to distinguish "not set" from "set to default"
- When field is None, system-level defaults are used
- Processor-specific configs (e.g., processor_config_*) are NOT stored in watch.json
They are stored in separate {processor_name}.json files
- This class is used for both Watch and Tag objects (tags reuse the structure)
"""
def __init__(self, *arg, **kw):
# Store datastore reference (common to Watch and Tag)
# Use single underscore to avoid name mangling issues in subclasses
self._datastore = kw.get('__datastore')
if kw.get('__datastore'):
del kw['__datastore']
# Store datastore_path (common to Watch and Tag)
self._datastore_path = kw.get('datastore_path')
if kw.get('datastore_path'):
del kw['datastore_path']
# IMPORTANT: Don't initialize __watch_was_edited yet!
# We'll initialize it AFTER the initial update() call below
# This prevents marking the watch as edited during initialization
self.update({
# Custom notification content
# Re #110, so then if this is set to None, we know to use the default value instead
@@ -179,7 +16,7 @@ class watch_base(dict):
'body': None,
'browser_steps': [],
'browser_steps_last_error_step': None,
'conditions' : [],
'conditions' : {},
'conditions_match_logic': CONDITIONS_MATCH_LOGIC_DEFAULT,
'check_count': 0,
'check_unique_lines': False, # On change-detected, compare against all history if its something new
@@ -195,7 +32,6 @@ class watch_base(dict):
'filter_text_replaced': True,
'follow_price_changes': True,
'has_ldjson_price_data': None,
'history_snapshot_max_length': None,
'headers': {}, # Extra headers to send
'ignore_text': [], # List of text to ignore when calculating the comparison checksum
'ignore_status_codes': None,
@@ -216,6 +52,7 @@ class watch_base(dict):
'page_title': None, # <title> from the page
'paused': False,
'previous_md5': False,
'previous_md5_before_filters': False, # Used for skipping changedetection entirely
'processor': 'text_json_diff', # could be restock_diff or others from .processors
'price_change_threshold_percent': None,
'proxy': None, # Preferred proxy connection
@@ -301,372 +138,5 @@ class watch_base(dict):
super(watch_base, self).__init__(*arg, **kw)
# Check if we're being initialized from an existing watch object
# that has was_edited=True, so we can preserve the flag
preserve_edited_flag = False
if self.get('default'):
# When creating a new watch object from an existing one (e.g., changing processor),
# preserve the was_edited flag if it was True
default_watch = self.get('default')
if hasattr(default_watch, 'was_edited') and default_watch.was_edited:
preserve_edited_flag = True
del self['default']
# NOW initialize the edited flag after all initial setup is complete
# This ensures initialization doesn't trigger the edited flag
# But preserve it if the source watch had it set to True
self.__watch_was_edited = preserve_edited_flag
def _mark_field_as_edited(self, key):
"""
Helper to mark a field as edited if it's writable.
Internal method used by __setitem__, update(), pop(), etc.
"""
# Don't track edits during initial load or if already edited
if not hasattr(self, '_watch_base__watch_was_edited'):
return
if self.__watch_was_edited:
return # Already marked as edited
# Import from shared schema utilities (no circular dependency)
from .schema_utils import get_readonly_watch_fields
readonly_fields = get_readonly_watch_fields()
# Additional system-managed fields not in OpenAPI spec (yet)
# These are set by processors/workers and should not trigger edited flag
additional_system_fields = {
'last_check_status', # Set by processors
'restock', # Set by restock processor
'last_viewed', # Set by mark_all_viewed endpoint
}
# Only mark as edited if this is a user-writable field
if key not in readonly_fields and key not in additional_system_fields:
self.__watch_was_edited = True
def __setitem__(self, key, value):
"""
Override dict.__setitem__ to track when writable watch fields are modified.
This enables skipping reprocessing when:
1. HTML content is unchanged (checksumFromPreviousCheckWasTheSame)
2. AND watch configuration was not edited
Only sets the edited flag when field is NOT in readonly_fields (from OpenAPI spec).
"""
# Set the value first (always)
super().__setitem__(key, value)
# Mark as edited if writable field
self._mark_field_as_edited(key)
def __delitem__(self, key):
"""Override dict.__delitem__ to track deletions of writable fields."""
super().__delitem__(key)
self._mark_field_as_edited(key)
def update(self, *args, **kwargs):
if args and args[0].get('browser_steps'):
args[0]['browser_steps'] = browser_steps_get_valid_steps(args[0].get('browser_steps'))
"""Override dict.update() to track modifications to writable fields."""
# Call parent update first
super().update(*args, **kwargs)
# Mark as edited for any writable fields that were updated
# Handle both update(dict) and update(key=value) forms
if args:
for key in args[0].keys():
self._mark_field_as_edited(key)
for key in kwargs.keys():
self._mark_field_as_edited(key)
def pop(self, key, *args):
"""Override dict.pop() to track removal of writable fields."""
result = super().pop(key, *args)
self._mark_field_as_edited(key)
return result
def setdefault(self, key, default=None):
"""Override dict.setdefault() to track modifications to writable fields."""
# Only marks as edited if key didn't exist (i.e., a new value was set)
existed = key in self
result = super().setdefault(key, default)
if not existed:
self._mark_field_as_edited(key)
return result
@property
def was_edited(self):
"""
Check if watch configuration was edited since last processing.
Returns:
bool: True if writable fields were modified, False otherwise
"""
return getattr(self, '_watch_base__watch_was_edited', False)
def reset_watch_edited_flag(self):
"""
Reset the watch edited flag after successful processing.
Call this after processing completes to allow future content-only change detection.
"""
self.__watch_was_edited = False
@classmethod
def get_property_names(cls):
"""
Get all @property attribute names from this model class using introspection.
This discovers computed/derived properties that are not stored in the datastore.
These properties should be filtered out during PUT/POST requests.
Returns:
frozenset: Immutable set of @property attribute names from the model class
"""
import functools
# Create a cached version if it doesn't exist
if not hasattr(cls, '_cached_get_property_names'):
@functools.cache
def _get_props():
properties = set()
# Use introspection to find all @property attributes
for name in dir(cls):
# Skip private/magic attributes
if name.startswith('_'):
continue
try:
attr = getattr(cls, name)
# Check if it's a property descriptor
if isinstance(attr, property):
properties.add(name)
except (AttributeError, TypeError):
continue
return frozenset(properties)
cls._cached_get_property_names = _get_props
return cls._cached_get_property_names()
def __deepcopy__(self, memo):
"""
Custom deepcopy for all watch_base subclasses (Watch, Tag, etc.).
CRITICAL FIX: Prevents copying large reference objects like __datastore
which would cause exponential memory growth when Watch objects are deepcopied.
This is called by:
- api/Watch.py:76 (API endpoint)
- api/Tags.py:28 (Tags API)
- processors/base.py:26 (EVERY processor run)
- store/__init__.py:544 (clone watch)
- And other locations
"""
from copy import deepcopy
# Create new instance without calling __init__
cls = self.__class__
new_obj = cls.__new__(cls)
memo[id(self)] = new_obj
# Copy the dict data (all the settings)
for key, value in self.items():
new_obj[key] = deepcopy(value, memo)
# Copy instance attributes dynamically
# This handles Watch-specific attrs (like __datastore) and any future subclass attrs
for attr_name in dir(self):
# Skip methods, special attrs, and dict keys
if attr_name.startswith('_') and not attr_name.startswith('__'):
# This catches _model__datastore, _model__history_n, etc.
try:
attr_value = getattr(self, attr_name)
# Special handling: Share references to large objects instead of copying
# Examples: _datastore, __datastore, __app_reference, __global_settings, etc.
if (attr_name == '_datastore' or
attr_name.endswith('__datastore') or
attr_name.endswith('__app')):
# Share the reference (don't copy!) to prevent memory leaks
setattr(new_obj, attr_name, attr_value)
# Skip cache attributes - let them regenerate on demand
elif 'cache' in attr_name.lower():
pass # Don't copy caches
# Copy regular instance attributes
elif not callable(attr_value):
setattr(new_obj, attr_name, attr_value)
except AttributeError:
pass # Attribute doesn't exist in this instance
return new_obj
def __getstate__(self):
"""
Custom pickle serialization for all watch_base subclasses.
Excludes large reference objects (like __datastore) from serialization.
"""
# Get the dict data
state = dict(self)
# Collect instance attributes (excluding methods and large references)
instance_attrs = {}
for attr_name in dir(self):
if attr_name.startswith('_') and not attr_name.startswith('__'):
try:
attr_value = getattr(self, attr_name)
# Exclude large reference objects and caches from serialization
if not (attr_name == '_datastore' or
attr_name.endswith('__datastore') or
attr_name.endswith('__app') or
'cache' in attr_name.lower() or
callable(attr_value)):
instance_attrs[attr_name] = attr_value
except AttributeError:
pass
if instance_attrs:
state['__instance_metadata__'] = instance_attrs
return state
def __setstate__(self, state):
"""
Custom pickle deserialization for all watch_base subclasses.
WARNING: Large reference objects (like __datastore) are NOT restored!
Caller must restore these references after unpickling if needed.
"""
# Extract metadata
metadata = state.pop('__instance_metadata__', {})
# Restore dict data
self.update(state)
# Restore instance attributes
for attr_name, attr_value in metadata.items():
setattr(self, attr_name, attr_value)
@property
def data_dir(self):
"""
The base directory for this watch/tag data (property, computed from UUID).
Common property for both Watch and Tag objects.
Returns path like: /datastore/{uuid}/
"""
return os.path.join(self._datastore_path, self['uuid']) if self._datastore_path else None
def ensure_data_dir_exists(self):
"""
Create the data directory if it doesn't exist.
Common method for both Watch and Tag objects.
"""
from loguru import logger
if not os.path.isdir(self.data_dir):
logger.debug(f"> Creating data dir {self.data_dir}")
os.mkdir(self.data_dir)
def get_global_setting(self, *path):
"""
Get a setting from the global datastore configuration.
Args:
*path: Path to the setting (e.g., 'application', 'history_snapshot_max_length')
Returns:
The setting value, or None if not found
Example:
maxlen = self.get_global_setting('application', 'history_snapshot_max_length')
"""
if not self._datastore:
return None
try:
value = self._datastore['settings']
for key in path:
value = value[key]
return value
except (KeyError, TypeError):
return None
def _get_commit_data(self):
"""
Prepare data for commit (can be overridden by subclasses).
Returns:
dict: Data to serialize (filtered as needed by subclass)
"""
import copy
# Acquire datastore lock to prevent concurrent modifications during copy
lock = self._datastore.lock if self._datastore and hasattr(self._datastore, 'lock') else None
if lock:
with lock:
snapshot = dict(self)
else:
snapshot = dict(self)
# Deep copy snapshot (slower, but done outside lock to minimize contention)
# Subclasses can override to filter keys (e.g., Watch excludes processor_config_*)
return {k: copy.deepcopy(v) for k, v in snapshot.items()}
def _save_to_disk(self, data_dict, uuid):
"""
Save data to disk (must be implemented by subclasses).
Args:
data_dict: Dictionary to save
uuid: UUID for logging
Raises:
NotImplementedError: If subclass doesn't implement
"""
raise NotImplementedError("Subclass must implement _save_to_disk()")
def commit(self):
"""
Save this watch/tag immediately to disk using atomic write.
Common commit logic for Watch and Tag objects.
Subclasses override _get_commit_data() and _save_to_disk() for specifics.
Fire-and-forget: Logs errors but does not raise exceptions.
Data remains in memory even if save fails, so next commit will retry.
"""
from loguru import logger
if not self.data_dir:
entity_type = self.__class__.__name__
logger.error(f"Cannot commit {entity_type} {self.get('uuid')} without datastore_path")
return
uuid = self.get('uuid')
if not uuid:
entity_type = self.__class__.__name__
logger.error(f"Cannot commit {entity_type} without UUID")
return
# Get data from subclass (may filter keys)
try:
data_dict = self._get_commit_data()
except Exception as e:
logger.error(f"Failed to prepare commit data for {uuid}: {e}")
return
# Save to disk via subclass implementation
try:
# Determine entity type from module name (Watch.py -> watch, Tag.py -> tag)
entity_type = _determine_entity_type(self.__class__)
filename = f"{entity_type}.json"
self._save_to_disk(data_dict, uuid)
logger.debug(f"Committed {entity_type} {uuid} to {uuid}/{filename}")
except Exception as e:
logger.error(f"Failed to commit {uuid}: {e}")
del self['default']
-84
View File
@@ -1,84 +0,0 @@
"""
Entity persistence mixin for Watch and Tag models.
Provides file-based persistence using atomic writes.
"""
import functools
import inspect
@functools.lru_cache(maxsize=None)
def _determine_entity_type(cls):
"""
Determine entity type from class hierarchy (cached at class level).
Args:
cls: The class to inspect
Returns:
str: Entity type ('watch', 'tag', etc.)
Raises:
ValueError: If entity type cannot be determined
"""
for base_class in inspect.getmro(cls):
module_name = base_class.__module__
if module_name.startswith('changedetectionio.model.'):
# Get last part after dot: "changedetectionio.model.Watch" -> "watch"
return module_name.split('.')[-1].lower()
raise ValueError(
f"Cannot determine entity type for {cls.__module__}.{cls.__name__}. "
f"Entity must inherit from a class in changedetectionio.model (Watch or Tag)."
)
class EntityPersistenceMixin:
"""
Mixin providing file persistence for watch_base subclasses (Watch, Tag, etc.).
This mixin provides the _save_to_disk() method required by watch_base.commit().
It automatically determines the correct filename and size limits based on class hierarchy.
Usage:
class model(EntityPersistenceMixin, watch_base): # in Watch.py
pass
class model(EntityPersistenceMixin, watch_base): # in Tag.py
pass
"""
def _save_to_disk(self, data_dict, uuid):
"""
Save entity to disk using atomic write.
Implements the abstract method required by watch_base.commit().
Automatically determines filename and size limits from class hierarchy.
Args:
data_dict: Dictionary to save
uuid: UUID for logging
Raises:
ValueError: If entity type cannot be determined from class hierarchy
"""
# Import here to avoid circular dependency
from changedetectionio.store.file_saving_datastore import save_entity_atomic
# Determine entity type (cached at class level, not instance level)
entity_type = _determine_entity_type(self.__class__)
# Set filename and size limits based on entity type
filename = f'{entity_type}.json'
max_size_mb = 10 if entity_type == 'watch' else 1
# Save using generic function
save_entity_atomic(
self.data_dir,
uuid,
data_dict,
filename=filename,
entity_type=entity_type,
max_size_mb=max_size_mb
)
-92
View File
@@ -1,92 +0,0 @@
"""
Schema utilities for Watch and Tag models.
Provides functions to extract readonly fields and properties from OpenAPI spec.
Shared by both the model layer and API layer to avoid circular dependencies.
"""
import functools
@functools.cache
def get_openapi_schema_dict():
"""
Get the raw OpenAPI spec dictionary for schema access.
Returns the YAML dict directly (not the OpenAPI object).
"""
import os
import yaml
spec_path = os.path.join(os.path.dirname(__file__), '../../docs/api-spec.yaml')
if not os.path.exists(spec_path):
spec_path = os.path.join(os.path.dirname(__file__), '../docs/api-spec.yaml')
with open(spec_path, 'r', encoding='utf-8') as f:
return yaml.safe_load(f)
@functools.cache
def _resolve_readonly_fields(schema_name):
"""
Generic helper to resolve readOnly fields, including allOf inheritance.
Args:
schema_name: Name of the schema (e.g., 'Watch', 'Tag')
Returns:
frozenset: All readOnly field names including inherited ones
"""
spec_dict = get_openapi_schema_dict()
schema = spec_dict['components']['schemas'].get(schema_name, {})
readonly_fields = set()
# Handle allOf (schema inheritance)
if 'allOf' in schema:
for item in schema['allOf']:
# Resolve $ref to parent schema
if '$ref' in item:
ref_path = item['$ref'].split('/')[-1]
ref_schema = spec_dict['components']['schemas'].get(ref_path, {})
if 'properties' in ref_schema:
for field_name, field_def in ref_schema['properties'].items():
if field_def.get('readOnly') is True:
readonly_fields.add(field_name)
# Check schema-specific properties
if 'properties' in item:
for field_name, field_def in item['properties'].items():
if field_def.get('readOnly') is True:
readonly_fields.add(field_name)
else:
# Direct properties (no inheritance)
if 'properties' in schema:
for field_name, field_def in schema['properties'].items():
if field_def.get('readOnly') is True:
readonly_fields.add(field_name)
return frozenset(readonly_fields)
@functools.cache
def get_readonly_watch_fields():
"""
Extract readOnly field names from Watch schema in OpenAPI spec.
Returns readOnly fields from WatchBase (uuid, date_created) + Watch-specific readOnly fields.
Used by:
- model/watch_base.py: Track when writable fields are edited
- api/Watch.py: Filter readonly fields from PUT requests
"""
return _resolve_readonly_fields('Watch')
@functools.cache
def get_readonly_tag_fields():
"""
Extract readOnly field names from Tag schema in OpenAPI spec.
Returns readOnly fields from WatchBase (uuid, date_created) + Tag-specific readOnly fields.
"""
return _resolve_readonly_fields('Tag')
-24
View File
@@ -105,30 +105,6 @@ class ChangeDetectionSpec:
"""
pass
@hookspec
def register_processor(self):
"""Register an external processor plugin.
External packages can implement this hook to register custom processors
that will be discovered alongside built-in processors.
Returns:
dict or None: Dictionary with processor information:
{
'processor_name': str, # Machine name (e.g., 'osint_recon')
'processor_module': module, # Module containing processor.py
'processor_class': class, # The perform_site_check class
'metadata': { # Optional metadata
'name': str, # Display name
'description': str, # Description
'processor_weight': int,# Sort weight (lower = higher priority)
'list_badge_text': str, # Badge text for UI
}
}
Return None if this plugin doesn't provide a processor
"""
pass
# Set up Plugin Manager
plugin_manager = pluggy.PluginManager(PLUGIN_NAMESPACE)
+34 -240
View File
@@ -1,6 +1,6 @@
from functools import lru_cache
from loguru import logger
from flask_babel import gettext, get_locale
from flask_babel import gettext
import importlib
import inspect
import os
@@ -17,11 +17,9 @@ def find_sub_packages(package_name):
return [name for _, name, is_pkg in pkgutil.iter_modules(package.__path__) if is_pkg]
@lru_cache(maxsize=1)
def find_processors():
"""
Find all subclasses of DifferenceDetectionProcessor in the specified package.
Results are cached to avoid repeated discovery.
:param package_name: The name of the package to scan for processor modules.
:return: A list of (module, class) tuples.
@@ -48,23 +46,6 @@ def find_processors():
except (ModuleNotFoundError, ImportError) as e:
logger.warning(f"Failed to import module {module_name}: {e} (find_processors())")
# Discover plugin processors via pluggy
try:
from changedetectionio.pluggy_interface import plugin_manager
plugin_results = plugin_manager.hook.register_processor()
for result in plugin_results:
if result and isinstance(result, dict):
processor_module = result.get('processor_module')
processor_name = result.get('processor_name')
if processor_module and processor_name:
processors.append((processor_module, processor_name))
plugin_path = getattr(processor_module, '__file__', 'unknown location')
logger.info(f"Registered plugin processor: {processor_name} from {plugin_path}")
except Exception as e:
logger.warning(f"Error loading plugin processors: {e}")
return processors
@@ -116,138 +97,54 @@ def find_processor_module(processor_name):
return None
def get_processor_module(processor_name):
def available_processors():
"""
Get the actual processor module (with perform_site_check class) by name.
Works for both built-in and plugin processors.
Args:
processor_name: Processor machine name (e.g., 'text_json_diff', 'osint_recon')
Returns:
module: The processor module containing perform_site_check, or None if not found
Get a list of processors by name and description for the UI elements.
Can be filtered via ALLOWED_PROCESSORS environment variable (comma-separated list).
:return: A list :)
"""
processor_classes = find_processors()
processor_tuple = next((tpl for tpl in processor_classes if tpl[1] == processor_name), None)
if processor_tuple:
# Return the actual processor module (first element of tuple)
return processor_tuple[0]
return None
def get_processor_submodule(processor_name, submodule_name):
"""
Get an optional submodule from a processor (e.g., 'difference', 'extract', 'preview').
Works for both built-in and plugin processors.
Args:
processor_name: Processor machine name (e.g., 'text_json_diff', 'osint_recon')
submodule_name: Name of the submodule (e.g., 'difference', 'extract', 'preview')
Returns:
module: The submodule if it exists, or None if not found
"""
processor_classes = find_processors()
processor_tuple = next((tpl for tpl in processor_classes if tpl[1] == processor_name), None)
if not processor_tuple:
return None
processor_module = processor_tuple[0]
parent_module = get_parent_module(processor_module)
if not parent_module:
return None
# Try to import the submodule
try:
# For built-in processors: changedetectionio.processors.text_json_diff.difference
# For plugin processors: changedetectionio_osint.difference
parent_module_name = parent_module.__name__
submodule_full_name = f"{parent_module_name}.{submodule_name}"
return importlib.import_module(submodule_full_name)
except (ModuleNotFoundError, ImportError):
return None
@lru_cache(maxsize=1)
def get_plugin_processor_metadata():
"""Get metadata from plugin processors."""
metadata = {}
try:
from changedetectionio.pluggy_interface import plugin_manager
plugin_results = plugin_manager.hook.register_processor()
for result in plugin_results:
if result and isinstance(result, dict):
processor_name = result.get('processor_name')
meta = result.get('metadata', {})
if processor_name:
metadata[processor_name] = meta
except Exception as e:
logger.warning(f"Error getting plugin processor metadata: {e}")
return metadata
@lru_cache(maxsize=32)
def _available_processors_cached(locale_str):
"""
Internal cached function that includes locale in cache key.
This ensures translations are cached per-language instead of globally.
:param locale_str: The locale string (e.g., 'en', 'it', 'zh')
:return: A list of tuples (processor_name, translated_description, weight)
"""
processor_classes = find_processors()
# Check if DISABLED_PROCESSORS env var is set
disabled_processors_env = os.getenv('DISABLED_PROCESSORS', 'image_ssim_diff').strip()
disabled_processors = []
if disabled_processors_env:
# Check if ALLOWED_PROCESSORS env var is set
# For now we disable it, need to make a deploy with lots of new code and this will be an overload
allowed_processors_env = os.getenv('ALLOWED_PROCESSORS', 'text_json_diff, restock_diff').strip()
allowed_processors = None
if allowed_processors_env:
# Parse comma-separated list and strip whitespace
disabled_processors = [p.strip() for p in disabled_processors_env.split(',') if p.strip()]
logger.info(f"DISABLED_PROCESSORS set, disabling: {disabled_processors}")
allowed_processors = [p.strip() for p in allowed_processors_env.split(',') if p.strip()]
logger.info(f"ALLOWED_PROCESSORS set, filtering to: {allowed_processors}")
available = []
plugin_metadata = get_plugin_processor_metadata()
for module, sub_package_name in processor_classes:
# Skip disabled processors
if sub_package_name in disabled_processors:
logger.debug(f"Skipping processor '{sub_package_name}' (in DISABLED_PROCESSORS)")
# Filter by allowed processors if set
if allowed_processors and sub_package_name not in allowed_processors:
logger.debug(f"Skipping processor '{sub_package_name}' (not in ALLOWED_PROCESSORS)")
continue
# Check if this is a plugin processor
if sub_package_name in plugin_metadata:
meta = plugin_metadata[sub_package_name]
description = gettext(meta.get('name', sub_package_name))
# Plugin processors start from weight 10 to separate them from built-in processors
weight = 100 + meta.get('processor_weight', 0)
# Try to get the 'name' attribute from the processor module first
if hasattr(module, 'name'):
description = gettext(module.name)
else:
# Try to get the 'name' attribute from the processor module first
if hasattr(module, 'name'):
description = gettext(module.name)
# Fall back to processor_description from parent module's __init__.py
parent_module = get_parent_module(module)
if parent_module and hasattr(parent_module, 'processor_description'):
description = gettext(parent_module.processor_description)
else:
# Fall back to processor_description from parent module's __init__.py
parent_module = get_parent_module(module)
if parent_module and hasattr(parent_module, 'processor_description'):
description = gettext(parent_module.processor_description)
else:
# Final fallback to a readable name
description = sub_package_name.replace('_', ' ').title()
# Final fallback to a readable name
description = sub_package_name.replace('_', ' ').title()
# Get weight for sorting (lower weight = higher in list)
weight = 0 # Default weight for processors without explicit weight
# Get weight for sorting (lower weight = higher in list)
weight = 0 # Default weight for processors without explicit weight
# Check processor module itself first
if hasattr(module, 'processor_weight'):
weight = module.processor_weight
else:
# Fall back to parent module (package __init__.py)
parent_module = get_parent_module(module)
if parent_module and hasattr(parent_module, 'processor_weight'):
weight = parent_module.processor_weight
# Check processor module itself first
if hasattr(module, 'processor_weight'):
weight = module.processor_weight
else:
# Fall back to parent module (package __init__.py)
parent_module = get_parent_module(module)
if parent_module and hasattr(parent_module, 'processor_weight'):
weight = parent_module.processor_weight
available.append((sub_package_name, description, weight))
@@ -257,36 +154,6 @@ def _available_processors_cached(locale_str):
# Return as tuples without weight (for backwards compatibility)
return [(name, desc) for name, desc, weight in available]
def available_processors():
"""
Get a list of processors by name and description for the UI elements.
Can be filtered via DISABLED_PROCESSORS environment variable (comma-separated list).
This function delegates to a locale-aware cached version to ensure translations
are cached per-language instead of globally.
:return: A list of tuples (processor_name, translated_description)
"""
# Get current locale and use it as cache key
# Convert Babel Locale object to string for use as cache key
locale = get_locale()
locale_str = str(locale) if locale else 'en'
return _available_processors_cached(locale_str)
def get_default_processor():
"""
Get the default processor to use when none is specified.
Returns the first available processor based on weight (lowest weight = highest priority).
This ensures forms auto-select a valid processor even when DISABLED_PROCESSORS filters the list.
:return: The processor name string (e.g., 'text_json_diff')
"""
available = available_processors()
if available:
return available[0][0] # Return the processor name from first tuple
return 'text_json_diff' # Fallback if somehow no processors are available
def get_processor_badge_texts():
"""
@@ -412,76 +279,3 @@ def get_processor_badge_css():
return '\n\n'.join(css_rules)
def save_processor_config(datastore, watch_uuid, config_data):
"""
Save processor-specific configuration to JSON file.
This is a shared helper function used by both the UI edit form and API endpoints
to consistently handle processor configuration storage.
Args:
datastore: The application datastore instance
watch_uuid: UUID of the watch
config_data: Dictionary of configuration data to save (with processor_config_* prefix removed)
Returns:
bool: True if saved successfully, False otherwise
"""
if not config_data:
return True
try:
from changedetectionio.processors.base import difference_detection_processor
# Get processor name from watch
watch = datastore.data['watching'].get(watch_uuid)
if not watch:
logger.error(f"Cannot save processor config: watch {watch_uuid} not found")
return False
processor_name = watch.get('processor', 'text_json_diff')
# Create a processor instance to access config methods
processor_instance = difference_detection_processor(datastore, watch_uuid)
# Use processor name as filename so each processor keeps its own config
config_filename = f'{processor_name}.json'
processor_instance.update_extra_watch_config(config_filename, config_data)
logger.debug(f"Saved processor config to {config_filename}: {config_data}")
return True
except Exception as e:
logger.error(f"Failed to save processor config: {e}")
return False
def extract_processor_config_from_form_data(form_data):
"""
Extract processor_config_* fields from form data and return separate dicts.
This is a shared helper function used by both the UI edit form and API endpoints
to consistently handle processor configuration extraction.
IMPORTANT: This function modifies form_data in-place by removing processor_config_* fields.
Args:
form_data: Dictionary of form data (will be modified in-place)
Returns:
dict: Dictionary of processor config data (with processor_config_* prefix removed)
"""
processor_config_data = {}
# Use list() to create a copy of keys since we're modifying the dict
for field_name in list(form_data.keys()):
if field_name.startswith('processor_config_'):
config_key = field_name.replace('processor_config_', '')
# Save all values (including empty strings) to allow explicit clearing of settings
processor_config_data[config_key] = form_data[field_name]
# Remove from form_data to prevent it from reaching datastore
del form_data[field_name]
return processor_config_data
+9 -85
View File
@@ -1,7 +1,5 @@
import re
import hashlib
from changedetectionio.browser_steps.browser_steps import browser_steps_get_valid_steps
from changedetectionio.content_fetchers.base import Fetcher
from changedetectionio.strtobool import strtobool
from copy import deepcopy
@@ -21,80 +19,14 @@ class difference_detection_processor():
xpath_data = None
preferred_proxy = None
screenshot_format = SCREENSHOT_FORMAT_JPEG
last_raw_content_checksum = None
def __init__(self, datastore, watch_uuid):
self.datastore = datastore
self.watch_uuid = watch_uuid
# Create a stable snapshot of the watch for processing
# Why deepcopy?
# 1. Prevents "dict changed during iteration" errors if watch is modified during processing
# 2. Preserves Watch object with properties (.link, .is_pdf, etc.) - can't use dict()
# 3. Safe now: Watch.__deepcopy__() shares datastore ref (no memory leak) but copies dict data
self.watch = deepcopy(self.datastore.data['watching'].get(watch_uuid))
# Generic fetcher that should be extended (requests, playwright etc)
self.fetcher = Fetcher()
# Load the last raw content checksum from file
self.read_last_raw_content_checksum()
def update_last_raw_content_checksum(self, checksum):
"""
Save the raw content MD5 checksum to file.
This is used for skip logic - avoid reprocessing if raw HTML unchanged.
"""
if not checksum:
return
watch = self.datastore.data['watching'].get(self.watch_uuid)
if not watch:
return
data_dir = watch.data_dir
if not data_dir:
return
watch.ensure_data_dir_exists()
checksum_file = os.path.join(data_dir, 'last-checksum.txt')
try:
with open(checksum_file, 'w', encoding='utf-8') as f:
f.write(checksum)
self.last_raw_content_checksum = checksum
except IOError as e:
logger.warning(f"Failed to write checksum file for {self.watch_uuid}: {e}")
def read_last_raw_content_checksum(self):
"""
Read the last raw content MD5 checksum from file.
Returns None if file doesn't exist (first run) or can't be read.
"""
watch = self.datastore.data['watching'].get(self.watch_uuid)
if not watch:
self.last_raw_content_checksum = None
return
data_dir = watch.data_dir
if not data_dir:
self.last_raw_content_checksum = None
return
checksum_file = os.path.join(data_dir, 'last-checksum.txt')
if not os.path.isfile(checksum_file):
self.last_raw_content_checksum = None
return
try:
with open(checksum_file, 'r', encoding='utf-8') as f:
self.last_raw_content_checksum = f.read().strip()
except IOError as e:
logger.warning(f"Failed to read checksum file for {self.watch_uuid}: {e}")
self.last_raw_content_checksum = None
async def call_browser(self, preferred_proxy_id=None):
from requests.structures import CaseInsensitiveDict
@@ -171,7 +103,7 @@ class difference_detection_processor():
)
if self.watch.has_browser_steps:
self.fetcher.browser_steps = browser_steps_get_valid_steps(self.watch.get('browser_steps', []))
self.fetcher.browser_steps = self.watch.get('browser_steps', [])
self.fetcher.browser_steps_screenshot_path = os.path.join(self.datastore.datastore_path, self.watch.get('uuid'))
# Tweak the base config with the per-watch ones
@@ -254,12 +186,12 @@ class difference_detection_processor():
import os
watch = self.datastore.data['watching'].get(self.watch_uuid)
data_dir = watch.data_dir
watch_data_dir = watch.watch_data_dir
if not data_dir:
if not watch_data_dir:
return {}
filepath = os.path.join(data_dir, filename)
filepath = os.path.join(watch_data_dir, filename)
if not os.path.isfile(filepath):
return {}
@@ -284,16 +216,16 @@ class difference_detection_processor():
import os
watch = self.datastore.data['watching'].get(self.watch_uuid)
data_dir = watch.data_dir
watch_data_dir = watch.watch_data_dir
if not data_dir:
logger.warning(f"Cannot save extra watch config {filename}: no data_dir")
if not watch_data_dir:
logger.warning(f"Cannot save extra watch config {filename}: no watch_data_dir")
return
# Ensure directory exists
watch.ensure_data_dir_exists()
filepath = os.path.join(data_dir, filename)
filepath = os.path.join(watch_data_dir, filename)
try:
# If merge is enabled, read existing data first
@@ -318,16 +250,8 @@ class difference_detection_processor():
except IOError as e:
logger.error(f"Failed to write extra watch config {filename}: {e}")
def get_raw_document_checksum(self):
checksum = None
if self.fetcher.content:
checksum = hashlib.md5(self.fetcher.content.encode('utf-8')).hexdigest()
return checksum
@abstractmethod
def run_changedetection(self, watch, force_reprocess=False):
def run_changedetection(self, watch):
update_obj = {'last_notification_error': False, 'last_error': False}
some_data = 'xxxxx'
update_obj["previous_md5"] = hashlib.md5(some_data.encode('utf-8')).hexdigest()
@@ -12,13 +12,6 @@ processor_description = "Visual/Screenshot change detection (Fast)"
processor_name = "image_ssim_diff"
processor_weight = 2 # Lower weight = appears at top, heavier weight = appears lower (bottom)
# Processor capabilities
supports_visual_selector = True
supports_browser_steps = True
supports_text_filters_and_triggers = False
supports_text_filters_and_triggers_elements = False
supports_request_type = True
PROCESSOR_CONFIG_NAME = f"{Path(__file__).parent.name}.json"
# Subprocess timeout settings
@@ -414,7 +414,7 @@ def render(watch, datastore, request, url_for, render_template, flash, redirect)
# Load historical data if available (for charts/visualization)
comparison_data = {}
comparison_config_path = os.path.join(watch.data_dir, "visual_comparison_data.json")
comparison_config_path = os.path.join(watch.watch_data_dir, "visual_comparison_data.json")
if os.path.isfile(comparison_config_path):
try:
with open(comparison_config_path, 'r') as f:
@@ -90,7 +90,7 @@ def on_config_save(watch, processor_config, datastore):
processor_config['auto_track_region'] = False
# Remove old template file if exists
template_path = os.path.join(watch.data_dir, CROPPED_IMAGE_TEMPLATE_FILENAME)
template_path = os.path.join(watch.watch_data_dir, CROPPED_IMAGE_TEMPLATE_FILENAME)
if os.path.exists(template_path):
os.remove(template_path)
logger.debug(f"Removed old template file: {template_path}")
@@ -30,7 +30,7 @@ class perform_site_check(difference_detection_processor):
# Override to use PNG format for better image comparison (JPEG compression creates noise)
screenshot_format = SCREENSHOT_FORMAT_PNG
def run_changedetection(self, watch, force_reprocess=False):
def run_changedetection(self, watch):
"""
Perform screenshot comparison using OpenCV subprocess handler.
@@ -4,13 +4,6 @@ from changedetectionio.model.Watch import model as BaseWatch
from typing import Union
import re
# Processor capabilities
supports_visual_selector = True
supports_browser_steps = True
supports_text_filters_and_triggers = True
supports_text_filters_and_triggers_elements = True
supports_request_type = True
class Restock(dict):
def parse_currency(self, raw_value: str) -> Union[float, None]:
@@ -2,7 +2,6 @@ from ..base import difference_detection_processor
from ..exceptions import ProcessorException
from . import Restock
from loguru import logger
from changedetectionio.content_fetchers.exceptions import checksumFromPreviousCheckWasTheSame
import urllib3
import time
@@ -57,259 +56,6 @@ def _deduplicate_prices(data):
return list(unique_data)
# =============================================================================
# MEMORY MANAGEMENT: Why We Use Multiprocessing (Linux Only)
# =============================================================================
#
# The get_itemprop_availability() function uses 'extruct' to parse HTML metadata
# (JSON-LD, microdata, OpenGraph, etc). Extruct internally uses lxml, which wraps
# libxml2 - a C library that allocates memory at the C level.
#
# Memory Leak Problem:
# --------------------
# 1. lxml's document_fromstring() creates thousands of Python objects backed by
# C-level allocations (nodes, attributes, text content)
# 2. Python's garbage collector can mark these objects as collectible, but
# cannot force the OS to reclaim the actual C-level memory
# 3. malloc/free typically doesn't return memory to OS - it just marks it as
# "free in the process address space"
# 4. With repeated parsing of large HTML (5MB+ pages), memory accumulates even
# after Python GC runs
#
# Why Multiprocessing Fixes This:
# --------------------------------
# When a subprocess exits, the OS forcibly reclaims ALL memory including C-level
# allocations that Python GC couldn't release. This ensures clean memory state
# after each extraction.
#
# Performance Impact:
# -------------------
# - Memray analysis showed 1.2M document_fromstring allocations per page
# - Without subprocess: memory grows by ~50-500MB per parse and lingers
# - With subprocess: ~35MB overhead but forces full cleanup after each run
# - Trade-off: 35MB resource_tracker vs 500MB+ accumulated leak = much better at scale
#
# References:
# -----------
# - lxml memory issues: https://medium.com/devopss-hole/python-lxml-memory-leak-b8d0b1000dc7
# - libxml2 caching behavior: https://www.mail-archive.com/lxml@python.org/msg00026.html
# - GC limitations with C extensions: https://benbernardblog.com/tracking-down-a-freaky-python-memory-leak-part-2/
#
# Additional Context:
# -------------------
# - jsonpath_ng (used to query the parsed data) is pure Python and doesn't leak
# - The leak is specifically from lxml's document parsing, not the JSONPath queries
# - Linux-only because multiprocessing spawn is well-tested there; other platforms
# use direct call as fallback
#
# Alternative Solution (Future Optimization):
# -------------------------------------------
# This entire problem could be avoided by using regex to extract just the machine
# data blocks (JSON-LD, microdata, OpenGraph tags) BEFORE parsing with lxml:
#
# 1. Use regex to extract <script type="application/ld+json">...</script> blocks
# 2. Use regex to extract <meta property="og:*"> tags
# 3. Use regex to find itemprop/itemtype attributes and their containing elements
# 4. Parse ONLY those extracted snippets instead of the entire HTML document
#
# Benefits:
# - Avoids parsing 5MB of HTML when we only need a few KB of metadata
# - Eliminates the lxml memory leak entirely
# - Faster extraction (regex is much faster than DOM parsing)
# - No subprocess overhead needed
#
# Trade-offs:
# - Regex for HTML is brittle (comments, CDATA, edge cases)
# - Microdata extraction would be complex (need to track element boundaries)
# - Would need extensive testing to ensure we don't miss valid data
# - extruct is battle-tested; regex solution would need similar maturity
#
# For now, the subprocess approach is safer and leverages existing extruct code.
# =============================================================================
def _extract_itemprop_availability_worker(pipe_conn):
"""
Subprocess worker for itemprop extraction (Linux memory management).
Uses spawn multiprocessing to isolate extruct/lxml memory allocations.
When the subprocess exits, the OS reclaims ALL memory including lxml's
C-level allocations that Python's GC cannot release.
Args:
pipe_conn: Pipe connection to receive HTML and send result
"""
import json
import gc
html_content = None
result_data = None
try:
# Receive HTML as raw bytes (no pickle)
html_bytes = pipe_conn.recv_bytes()
html_content = html_bytes.decode('utf-8')
# Explicitly delete html_bytes to free memory
del html_bytes
gc.collect()
# Perform extraction in subprocess (uses extruct/lxml)
result_data = get_itemprop_availability(html_content)
# Convert Restock object to dict for JSON serialization
result = {
'success': True,
'data': dict(result_data) if result_data else {}
}
pipe_conn.send_bytes(json.dumps(result).encode('utf-8'))
# Clean up before exit
del result_data, html_content, result
gc.collect()
except MoreThanOnePriceFound:
# Serialize the specific exception type
result = {
'success': False,
'exception_type': 'MoreThanOnePriceFound'
}
pipe_conn.send_bytes(json.dumps(result).encode('utf-8'))
except Exception as e:
# Serialize other exceptions
result = {
'success': False,
'exception_type': type(e).__name__,
'exception_message': str(e)
}
pipe_conn.send_bytes(json.dumps(result).encode('utf-8'))
finally:
# Final cleanup before subprocess exits
# Variables may already be deleted in try block, so use try/except
try:
del html_content
except (NameError, UnboundLocalError):
pass
try:
del result_data
except (NameError, UnboundLocalError):
pass
gc.collect()
pipe_conn.close()
def extract_itemprop_availability_safe(html_content) -> Restock:
"""
Extract itemprop availability with hybrid approach for memory efficiency.
Strategy (fastest to slowest, least to most memory):
1. Try pure Python extraction (JSON-LD, OpenGraph, microdata) - covers 80%+ of cases
2. Fall back to extruct with subprocess isolation on Linux for complex cases
Args:
html_content: HTML string to parse
Returns:
Restock: Extracted availability data
Raises:
MoreThanOnePriceFound: When multiple prices detected
Other exceptions: From extruct/parsing
"""
import platform
# Step 1: Try pure Python extraction first (fast, no lxml, no memory leak)
try:
from .pure_python_extractor import extract_metadata_pure_python, query_price_availability
logger.trace("Attempting pure Python metadata extraction (no lxml)")
extracted_data = extract_metadata_pure_python(html_content)
price_data = query_price_availability(extracted_data)
# If we got price AND availability, we're done!
if price_data.get('price') and price_data.get('availability'):
result = Restock(price_data)
logger.debug(f"Pure Python extraction successful: {dict(result)}")
return result
# If we got some data but not everything, still try extruct for completeness
if price_data.get('price') or price_data.get('availability'):
logger.debug(f"Pure Python extraction partial: {price_data}, will try extruct for completeness")
except Exception as e:
logger.debug(f"Pure Python extraction failed: {e}, falling back to extruct")
# Step 2: Fall back to extruct (uses lxml, needs subprocess on Linux)
logger.trace("Falling back to extruct (lxml-based) with subprocess isolation")
# Only use subprocess isolation on Linux
# Other platforms may have issues with spawn or don't need the aggressive memory management
if platform.system() == 'Linux':
import multiprocessing
import json
import gc
try:
ctx = multiprocessing.get_context('spawn')
parent_conn, child_conn = ctx.Pipe()
p = ctx.Process(target=_extract_itemprop_availability_worker, args=(child_conn,))
p.start()
# Send HTML as raw bytes (no pickle)
html_bytes = html_content.encode('utf-8')
parent_conn.send_bytes(html_bytes)
# Explicitly delete html_bytes copy immediately after sending
del html_bytes
gc.collect()
# Receive result as JSON
result_bytes = parent_conn.recv_bytes()
result = json.loads(result_bytes.decode('utf-8'))
# Wait for subprocess to complete
p.join()
# Close pipes
parent_conn.close()
child_conn.close()
# Clean up all subprocess-related objects
del p, parent_conn, child_conn, result_bytes
gc.collect()
# Handle result or re-raise exception
if result['success']:
# Reconstruct Restock object from dict
restock_obj = Restock(result['data'])
# Clean up result dict
del result
gc.collect()
return restock_obj
else:
# Re-raise the exception that occurred in subprocess
exception_type = result['exception_type']
exception_msg = result.get('exception_message', '')
del result
gc.collect()
if exception_type == 'MoreThanOnePriceFound':
raise MoreThanOnePriceFound()
else:
raise Exception(f"{exception_type}: {exception_msg}")
except Exception as e:
# If multiprocessing itself fails, log and fall back to direct call
logger.warning(f"Subprocess extraction failed: {e}, falling back to direct call")
gc.collect()
return get_itemprop_availability(html_content)
else:
# Non-Linux: direct call (no subprocess overhead needed)
return get_itemprop_availability(html_content)
# should return Restock()
# add casting?
def get_itemprop_availability(html_content) -> Restock:
@@ -404,37 +150,22 @@ class perform_site_check(difference_detection_processor):
screenshot = None
xpath_data = None
def run_changedetection(self, watch, force_reprocess=False):
def run_changedetection(self, watch):
import hashlib
if not watch:
raise Exception("Watch no longer exists.")
current_raw_document_checksum = self.get_raw_document_checksum()
# Skip processing only if BOTH conditions are true:
# 1. HTML content unchanged (checksum matches last saved checksum)
# 2. Watch configuration was not edited (including trigger_text, filters, etc.)
# The was_edited flag handles all watch configuration changes, so we don't need
# separate checks for trigger_text or other processing rules.
if (not force_reprocess and
not watch.was_edited and
self.last_raw_content_checksum and
self.last_raw_content_checksum == current_raw_document_checksum):
raise checksumFromPreviousCheckWasTheSame()
# Unset any existing notification error
update_obj = {'last_notification_error': False, 'last_error': False, 'restock': Restock()}
self.screenshot = self.fetcher.screenshot
self.xpath_data = self.fetcher.xpath_data
# Track the content type (readonly field, doesn't trigger was_edited)
update_obj['content-type'] = self.fetcher.headers.get('Content-Type', '') # Use hyphen (matches OpenAPI spec)
# Track the content type
update_obj['content_type'] = self.fetcher.headers.get('Content-Type', '')
update_obj["last_check_status"] = self.fetcher.get_last_status_code()
# Save the raw content checksum to file (processor implementation detail, not watch config)
self.update_last_raw_content_checksum(current_raw_document_checksum)
# Only try to process restock information (like scraping for keywords) if the page was actually rendered correctly.
# Otherwise it will assume "in stock" because nothing suggesting the opposite was found
from ...html_tools import html_to_text
@@ -462,18 +193,18 @@ class perform_site_check(difference_detection_processor):
itemprop_availability = {}
multiple_prices_found = False
# Try built-in extraction first, this will scan metadata in the HTML
# On Linux, this runs in a subprocess to prevent lxml/extruct memory leaks
try:
itemprop_availability = extract_itemprop_availability_safe(self.fetcher.content)
itemprop_availability = get_itemprop_availability(self.fetcher.content)
except MoreThanOnePriceFound as e:
# Don't raise immediately - let plugins try to handle this case
# Plugins might be able to determine which price is correct
logger.warning(f"Built-in detection found multiple prices on {watch.get('url')}, will try plugin override")
multiple_prices_found = True
itemprop_availability = {}
# Add the real data
raise ProcessorException(message="Cannot run, more than one price detected, this plugin is only for product pages with ONE product, try the content-change detection mode.",
url=watch.get('url'),
status_code=self.fetcher.get_last_status_code(),
screenshot=self.fetcher.screenshot,
xpath_data=self.fetcher.xpath_data
)
# If built-in extraction didn't get both price AND availability, try plugin override
# Only check plugin if this watch is using a fetcher that might provide better data
@@ -485,21 +216,9 @@ class perform_site_check(difference_detection_processor):
from changedetectionio.pluggy_interface import get_itemprop_availability_from_plugin
fetcher_name = watch.get('fetch_backend', 'html_requests')
# Resolve 'system' to the actual fetcher being used
# This allows plugins to work even when watch uses "system settings default"
if fetcher_name == 'system':
# Get the actual fetcher that was used (from self.fetcher)
# Fetcher class name gives us the actual backend (e.g., 'html_requests', 'html_webdriver')
actual_fetcher = type(self.fetcher).__name__
if 'html_requests' in actual_fetcher.lower():
fetcher_name = 'html_requests'
elif 'webdriver' in actual_fetcher.lower() or 'playwright' in actual_fetcher.lower():
fetcher_name = 'html_webdriver'
logger.debug(f"Resolved 'system' fetcher to actual fetcher: {fetcher_name}")
# Try plugin override - plugins can decide if they support this fetcher
if fetcher_name:
logger.debug(f"Calling extra plugins for getting item price/availability (fetcher: {fetcher_name})")
# Only try plugin override if not using system default (which might be anything)
if fetcher_name and fetcher_name != 'system':
logger.debug("Calling extra plugins for getting item price/availability")
plugin_availability = get_itemprop_availability_from_plugin(self.fetcher.content, fetcher_name, self.fetcher, watch.link)
if plugin_availability:
@@ -514,16 +233,6 @@ class perform_site_check(difference_detection_processor):
if not plugin_availability:
logger.debug("No item price/availability from plugins")
# If we had multiple prices and plugins also failed, NOW raise the exception
if multiple_prices_found and not itemprop_availability.get('price'):
raise ProcessorException(
message="Cannot run, more than one price detected, this plugin is only for product pages with ONE product, try the content-change detection mode.",
url=watch.get('url'),
status_code=self.fetcher.get_last_status_code(),
screenshot=self.fetcher.screenshot,
xpath_data=self.fetcher.xpath_data
)
# Something valid in get_itemprop_availability() by scraping metadata ?
if itemprop_availability.get('price') or itemprop_availability.get('availability'):
# Store for other usage
@@ -1,286 +0,0 @@
"""
Pure Python metadata extractor - no lxml, no memory leaks.
This module provides a fast, memory-efficient alternative to extruct for common
e-commerce metadata extraction. It handles:
- JSON-LD (covers 80%+ of modern sites)
- OpenGraph meta tags
- Basic microdata attributes
Uses Python's built-in html.parser instead of lxml/libxml2, avoiding C-level
memory allocation issues. For edge cases, the main processor can fall back to
extruct (with subprocess isolation on Linux).
"""
from html.parser import HTMLParser
import json
import re
from loguru import logger
class JSONLDExtractor(HTMLParser):
"""
Extract JSON-LD structured data from HTML.
Finds all <script type="application/ld+json"> tags and parses their content.
Handles multiple JSON-LD blocks on the same page.
"""
def __init__(self):
super().__init__()
self.in_jsonld = False
self.data = [] # List of all parsed JSON-LD objects
self.current_script = []
def handle_starttag(self, tag, attrs):
if tag == 'script':
# Check if this is a JSON-LD script tag
for attr, value in attrs:
if attr == 'type' and value == 'application/ld+json':
self.in_jsonld = True
self.current_script = []
break
def handle_data(self, data):
if self.in_jsonld:
self.current_script.append(data)
def handle_endtag(self, tag):
if tag == 'script' and self.in_jsonld:
# Parse the accumulated script content
script_content = ''.join(self.current_script)
if script_content.strip():
try:
# Parse JSON (handles both objects and arrays)
parsed = json.loads(script_content)
if isinstance(parsed, list):
self.data.extend(parsed)
else:
self.data.append(parsed)
except json.JSONDecodeError as e:
logger.debug(f"Failed to parse JSON-LD: {e}")
pass
self.in_jsonld = False
self.current_script = []
class OpenGraphExtractor(HTMLParser):
"""
Extract OpenGraph meta tags from HTML.
Finds <meta property="og:*"> tags commonly used for social media sharing.
"""
def __init__(self):
super().__init__()
self.og_data = {}
def handle_starttag(self, tag, attrs):
if tag == 'meta':
attrs_dict = dict(attrs)
prop = attrs_dict.get('property', '')
# Extract OpenGraph properties
if prop.startswith('og:'):
content = attrs_dict.get('content', '')
if content:
self.og_data[prop] = content
class MicrodataExtractor(HTMLParser):
"""
Extract basic microdata attributes from HTML.
Finds elements with itemprop attributes. This is a simplified extractor
that doesn't handle nested itemscope/itemtype hierarchies - for complex
cases, use extruct as fallback.
"""
def __init__(self):
super().__init__()
self.microdata = {}
self.current_itemprop = None
def handle_starttag(self, tag, attrs):
attrs_dict = dict(attrs)
if 'itemprop' in attrs_dict:
itemprop = attrs_dict['itemprop']
# Price/currency/availability can be in content/href attributes
if itemprop == 'price':
if 'content' in attrs_dict:
self.microdata['price'] = attrs_dict['content']
else:
self.current_itemprop = 'price'
elif itemprop == 'priceCurrency':
if 'content' in attrs_dict:
self.microdata['currency'] = attrs_dict['content']
else:
self.current_itemprop = 'priceCurrency'
elif itemprop == 'availability':
# Can be in href (link) or content (meta)
if 'href' in attrs_dict:
self.microdata['availability'] = attrs_dict['href']
elif 'content' in attrs_dict:
self.microdata['availability'] = attrs_dict['content']
else:
self.current_itemprop = 'availability'
def handle_data(self, data):
# Capture text content for itemprop elements
if self.current_itemprop == 'price':
# Try to extract numeric price from text
try:
price_text = re.sub(r'[^\d.]', '', data.strip())
if price_text:
self.microdata['price'] = float(price_text)
except ValueError:
pass
elif self.current_itemprop == 'priceCurrency':
currency = data.strip()
if currency:
self.microdata['currency'] = currency
elif self.current_itemprop == 'availability':
availability = data.strip()
if availability:
self.microdata['availability'] = availability
def handle_endtag(self, tag):
# Reset current itemprop after closing tag
self.current_itemprop = None
def extract_metadata_pure_python(html_content):
"""
Extract structured metadata from HTML using pure Python parsers.
Returns a dict with three keys:
- 'json-ld': List of parsed JSON-LD objects
- 'opengraph': Dict of OpenGraph properties
- 'microdata': Dict of microdata properties
Args:
html_content: HTML string to parse
Returns:
dict: Extracted metadata in three formats
"""
result = {
'json-ld': [],
'opengraph': {},
'microdata': {}
}
# Extract JSON-LD
try:
jsonld_extractor = JSONLDExtractor()
jsonld_extractor.feed(html_content)
result['json-ld'] = jsonld_extractor.data
logger.trace(f"Pure Python: Found {len(jsonld_extractor.data)} JSON-LD blocks")
except Exception as e:
logger.debug(f"JSON-LD extraction failed: {e}")
# Extract OpenGraph
try:
og_extractor = OpenGraphExtractor()
og_extractor.feed(html_content)
result['opengraph'] = og_extractor.og_data
if result['opengraph']:
logger.trace(f"Pure Python: Found {len(og_extractor.og_data)} OpenGraph tags")
except Exception as e:
logger.debug(f"OpenGraph extraction failed: {e}")
# Extract Microdata
try:
microdata_extractor = MicrodataExtractor()
microdata_extractor.feed(html_content)
result['microdata'] = microdata_extractor.microdata
if result['microdata']:
logger.trace(f"Pure Python: Found microdata: {result['microdata']}")
except Exception as e:
logger.debug(f"Microdata extraction failed: {e}")
return result
def query_price_availability(extracted_data):
"""
Query extracted metadata for price and availability information.
Uses jsonpath_ng to query JSON-LD data (same approach as extruct).
Falls back to OpenGraph and microdata if JSON-LD doesn't have the data.
Args:
extracted_data: Dict from extract_metadata_pure_python()
Returns:
dict: {'price': float, 'currency': str, 'availability': str}
"""
from jsonpath_ng import parse
result = {}
# 1. Try JSON-LD first (most reliable and common)
for data in extracted_data.get('json-ld', []):
try:
# Use jsonpath to find price/availability anywhere in the structure
price_parse = parse('$..(price|Price)')
availability_parse = parse('$..(availability|Availability)')
currency_parse = parse('$..(priceCurrency|currency|priceCurrency)')
price_results = [m.value for m in price_parse.find(data)]
if price_results and not result.get('price'):
# Handle various price formats
price_val = price_results[0]
if isinstance(price_val, (int, float)):
result['price'] = float(price_val)
elif isinstance(price_val, str):
# Extract numeric value from string
try:
result['price'] = float(re.sub(r'[^\d.]', '', price_val))
except ValueError:
pass
avail_results = [m.value for m in availability_parse.find(data)]
if avail_results and not result.get('availability'):
result['availability'] = str(avail_results[0])
curr_results = [m.value for m in currency_parse.find(data)]
if curr_results and not result.get('currency'):
result['currency'] = str(curr_results[0])
# If we found price, this JSON-LD block is good
if result.get('price'):
logger.debug(f"Pure Python: Found price data in JSON-LD: {result}")
break
except Exception as e:
logger.debug(f"Error querying JSON-LD: {e}")
continue
# 2. Try OpenGraph if JSON-LD didn't provide everything
og_data = extracted_data.get('opengraph', {})
if not result.get('price') and 'og:price:amount' in og_data:
try:
result['price'] = float(og_data['og:price:amount'])
except ValueError:
pass
if not result.get('currency') and 'og:price:currency' in og_data:
result['currency'] = og_data['og:price:currency']
if not result.get('availability') and 'og:availability' in og_data:
result['availability'] = og_data['og:availability']
# 3. Use microdata as last resort
microdata = extracted_data.get('microdata', {})
if not result.get('price') and 'price' in microdata:
result['price'] = microdata['price']
if not result.get('currency') and 'currency' in microdata:
result['currency'] = microdata['currency']
if not result.get('availability') and 'availability' in microdata:
result['availability'] = microdata['availability']
return result
@@ -1,11 +1,5 @@
from loguru import logger
# Processor capabilities
supports_visual_selector = True
supports_browser_steps = True
supports_text_filters_and_triggers = True
supports_text_filters_and_triggers_elements = True
supports_request_type = True
from loguru import logger
@@ -17,8 +11,7 @@ def _task(watch, update_handler):
try:
# The slow process (we run 2 of these in parallel)
# Always force reprocess for preview - we want to show the filtered content regardless of checksums
changed_detected, update_obj, text_after_filter = update_handler.run_changedetection(watch=watch, force_reprocess=True)
changed_detected, update_obj, text_after_filter = update_handler.run_changedetection(watch=watch)
except FilterNotFoundInResponse as e:
text_after_filter = f"Filter not found in HTML: {str(e)}"
except ReplyWithContentButNoText as e:
@@ -56,7 +49,7 @@ def prepare_filter_prevew(datastore, watch_uuid, form_data):
tmp_watch = deepcopy(datastore.data['watching'].get(watch_uuid))
if tmp_watch and tmp_watch.history and os.path.isdir(tmp_watch.data_dir):
if tmp_watch and tmp_watch.history and os.path.isdir(tmp_watch.watch_data_dir):
# Splice in the temporary stuff from the form
form = forms.processor_text_json_diff_form(formdata=form_data if request.method == 'POST' else None,
data=form_data
@@ -65,11 +58,11 @@ def prepare_filter_prevew(datastore, watch_uuid, form_data):
# Only update vars that came in via the AJAX post
p = {k: v for k, v in form.data.items() if k in form_data.keys()}
tmp_watch.update(p)
blank_watch_no_filters = watch_model(datastore_path=datastore.datastore_path, __datastore=datastore.data)
blank_watch_no_filters = watch_model()
blank_watch_no_filters['url'] = tmp_watch.get('url')
latest_filename = next(reversed(tmp_watch.history))
html_fname = os.path.join(tmp_watch.data_dir, f"{latest_filename}.html.br")
html_fname = os.path.join(tmp_watch.watch_data_dir, f"{latest_filename}.html.br")
with open(html_fname, 'rb') as f:
decompressed_data = brotli.decompress(f.read()).decode('utf-8') if html_fname.endswith('.br') else f.read().decode('utf-8')
@@ -7,7 +7,6 @@ import re
import urllib3
from changedetectionio.conditions import execute_ruleset_against_all_plugins
from changedetectionio.content_fetchers.exceptions import checksumFromPreviousCheckWasTheSame
from ..base import difference_detection_processor
from changedetectionio.html_tools import PERL_STYLE_REGEX, cdata_in_document_to_text, TRANSLATE_WHITESPACE_TABLE
from changedetectionio import html_tools, content_fetchers
@@ -369,24 +368,12 @@ class ChecksumCalculator:
# (set_proxy_from_list)
class perform_site_check(difference_detection_processor):
def run_changedetection(self, watch, force_reprocess=False):
def run_changedetection(self, watch):
changed_detected = False
if not watch:
raise Exception("Watch no longer exists.")
current_raw_document_checksum = self.get_raw_document_checksum()
# Skip processing only if BOTH conditions are true:
# 1. HTML content unchanged (checksum matches last saved checksum)
# 2. Watch configuration was not edited (including trigger_text, filters, etc.)
# The was_edited flag handles all watch configuration changes, so we don't need
# separate checks for trigger_text or other processing rules.
if (not force_reprocess and
not watch.was_edited and
self.last_raw_content_checksum and
self.last_raw_content_checksum == current_raw_document_checksum):
raise checksumFromPreviousCheckWasTheSame()
# Initialize components
filter_config = FilterConfig(watch, self.datastore)
content_processor = ContentProcessor(self.fetcher, watch, filter_config, self.datastore)
@@ -404,11 +391,9 @@ class perform_site_check(difference_detection_processor):
self.screenshot = self.fetcher.screenshot
self.xpath_data = self.fetcher.xpath_data
# Track the content type (readonly field, doesn't trigger was_edited)
update_obj['content-type'] = ctype_header # Use hyphen (matches OpenAPI spec and watch_base default)
# Save the raw content checksum to file (processor implementation detail, not watch config)
self.update_last_raw_content_checksum(current_raw_document_checksum)
# Track the content type and checksum before filters
update_obj['content_type'] = ctype_header
update_obj['previous_md5_before_filters'] = hashlib.md5(self.fetcher.content.encode('utf-8')).hexdigest()
# === CONTENT PREPROCESSING ===
# Avoid creating unnecessary intermediate string copies by reassigning only when needed
+141 -228
View File
@@ -5,57 +5,51 @@ import heapq
import queue
import threading
# Janus is no longer required - we use pure threading.Queue for multi-loop support
# try:
# import janus
# except ImportError:
# pass # Not needed anymore
try:
import janus
except ImportError:
logger.critical(f"CRITICAL: janus library is required. Install with: pip install janus")
raise
class RecheckPriorityQueue:
"""
Thread-safe priority queue supporting multiple async event loops.
ARCHITECTURE:
- Multiple async workers, each with its own event loop in its own thread
- Hybrid sync/async design for maximum scalability
- Sync interface for ticker thread (threading.Queue)
- Async interface for workers (asyncio.Event - NO executor threads!)
SCALABILITY:
- Scales to 100-200+ workers without executor thread exhaustion
- Async workers wait on asyncio.Event (pure coroutines, no threads)
- Sync callers use threading.Queue (backward compatible)
WHY NOT JANUS:
- Janus binds to ONE event loop at creation time
- Our architecture has 15+ workers, each with separate event loops
- Workers in different threads/loops cannot share janus async interface
WHY NOT RUN_IN_EXECUTOR:
- With 200 workers, run_in_executor() would block 200 threads
- Exhausts ThreadPoolExecutor, starves Flask HTTP handlers
- Pure async approach uses 0 threads while waiting
Ultra-reliable priority queue using janus for async/sync bridging.
CRITICAL DESIGN NOTE: Both sync_q and async_q are required because:
- sync_q: Used by Flask routes, ticker threads, and other synchronous code
- async_q: Used by async workers (the actual fetchers/processors) and coroutines
DO NOT REMOVE EITHER INTERFACE - they bridge different execution contexts:
- Synchronous code (Flask, threads) cannot use async methods without blocking
- Async code cannot use sync methods without blocking the event loop
- janus provides the only safe bridge between these two worlds
Attempting to unify to async-only would require:
- Converting all Flask routes to async (major breaking change)
- Using asyncio.run() in sync contexts (causes deadlocks)
- Thread-pool wrapping (adds complexity and overhead)
Minimal implementation focused on reliability:
- Pure janus for sync/async bridge
- Thread-safe priority ordering
- Bulletproof error handling with critical logging
"""
def __init__(self, maxsize: int = 0):
try:
import asyncio
# Sync interface: threading.Queue for ticker thread and Flask routes
self._notification_queue = queue.Queue(maxsize=maxsize if maxsize > 0 else 0)
self._janus_queue = janus.Queue(maxsize=maxsize)
# BOTH interfaces required - see class docstring for why
self.sync_q = self._janus_queue.sync_q # Flask routes, ticker thread
self.async_q = self._janus_queue.async_q # Async workers
# Priority storage - thread-safe
self._priority_items = []
self._lock = threading.RLock()
# No event signaling needed - pure polling approach
# Workers check queue every 50ms (latency acceptable: 0-500ms)
# Scales to 1000+ workers: each sleeping worker = ~4KB coroutine, not thread
# Signals for UI updates
self.queue_length_signal = signal('queue_length')
logger.debug("RecheckPriorityQueue initialized successfully")
except Exception as e:
logger.critical(f"CRITICAL: Failed to initialize RecheckPriorityQueue: {str(e)}")
@@ -64,48 +58,38 @@ class RecheckPriorityQueue:
# SYNC INTERFACE (for ticker thread)
def put(self, item, block: bool = True, timeout: Optional[float] = None):
"""Thread-safe sync put with priority ordering"""
logger.trace(f"RecheckQueue.put() called for item: {self._get_item_uuid(item)}, block={block}, timeout={timeout}")
try:
# CRITICAL: Add to both priority storage AND notification queue atomically
# to prevent desynchronization where item exists but no notification
# Add to priority storage
with self._lock:
heapq.heappush(self._priority_items, item)
# Add notification - use blocking with timeout for safety
# Notification queue is unlimited size, so should never block in practice
# but timeout ensures we detect any unexpected issues (deadlock, etc)
try:
self._notification_queue.put(True, block=True, timeout=5.0)
except Exception as notif_e:
# Notification failed - MUST remove from priority_items to keep in sync
# This prevents "Priority queue inconsistency" errors in get()
logger.critical(f"CRITICAL: Notification queue put failed, removing from priority_items: {notif_e}")
self._priority_items.remove(item)
heapq.heapify(self._priority_items)
raise # Re-raise to be caught by outer exception handler
# Signal emission after successful queue - log but don't fail the operation
# Item is already safely queued, so signal failure shouldn't affect queue state
try:
self._emit_put_signals(item)
except Exception as signal_e:
logger.error(f"Failed to emit put signals but item queued successfully: {signal_e}")
# Notify via janus sync queue
self.sync_q.put(True, block=block, timeout=timeout)
# Emit signals
self._emit_put_signals(item)
logger.trace(f"Successfully queued item: {self._get_item_uuid(item)}")
return True
except Exception as e:
logger.critical(f"CRITICAL: Failed to put item {self._get_item_uuid(item)}: {type(e).__name__}: {str(e)}")
# Item should have been cleaned up in the inner try/except if notification failed
logger.critical(f"CRITICAL: Failed to put item {self._get_item_uuid(item)}: {str(e)}")
# Remove from priority storage if janus put failed
try:
with self._lock:
if item in self._priority_items:
self._priority_items.remove(item)
heapq.heapify(self._priority_items)
except Exception as cleanup_e:
logger.critical(f"CRITICAL: Failed to cleanup after put failure: {str(e)}")
return False
def get(self, block: bool = True, timeout: Optional[float] = None):
"""Thread-safe sync get with priority ordering"""
logger.trace(f"RecheckQueue.get() called, block={block}, timeout={timeout}")
import queue as queue_module
import queue
try:
# Wait for notification (this doesn't return the actual item, just signals availability)
self._notification_queue.get(block=block, timeout=timeout)
# Wait for notification
self.sync_q.get(block=block, timeout=timeout)
# Get highest priority item
with self._lock:
@@ -114,91 +98,69 @@ class RecheckPriorityQueue:
raise Exception("Priority queue inconsistency")
item = heapq.heappop(self._priority_items)
# Signal emission after successful retrieval - log but don't lose the item
# Item is already retrieved, so signal failure shouldn't affect queue state
try:
self._emit_get_signals()
except Exception as signal_e:
logger.error(f"Failed to emit get signals but item retrieved successfully: {signal_e}")
# Emit signals
self._emit_get_signals()
logger.trace(f"RecheckQueue.get() successfully retrieved item: {self._get_item_uuid(item)}")
return item
except queue_module.Empty:
# Queue is empty with timeout - expected behavior
logger.trace(f"RecheckQueue.get() timed out - queue is empty (timeout={timeout})")
raise # noqa
except Exception as e:
# Re-raise without logging - caller (worker) will handle and log appropriately
logger.trace(f"RecheckQueue.get() failed with exception: {type(e).__name__}: {str(e)}")
raise
# ASYNC INTERFACE (for workers)
async def async_put(self, item, executor=None):
"""Async put with priority ordering - uses thread pool to avoid blocking
Args:
item: Item to add to queue
executor: Optional ThreadPoolExecutor. If None, uses default pool.
"""
logger.trace(f"RecheckQueue.async_put() called for item: {self._get_item_uuid(item)}, executor={executor}")
import asyncio
try:
# Use run_in_executor to call sync put without blocking event loop
loop = asyncio.get_event_loop()
result = await loop.run_in_executor(
executor, # Use provided executor or default
lambda: self.put(item, block=True, timeout=5.0)
)
logger.trace(f"RecheckQueue.async_put() successfully queued item: {self._get_item_uuid(item)}")
return result
except Exception as e:
logger.critical(f"CRITICAL: Failed to async put item {self._get_item_uuid(item)}: {str(e)}")
return False
async def async_get(self, executor=None, timeout=1.0):
"""
Efficient async get using executor for blocking call.
HYBRID APPROACH: Best of both worlds
- Uses run_in_executor for efficient blocking (no polling overhead)
- Single timeout (no double-timeout race condition)
- Scales well: executor sized to match worker count
With FETCH_WORKERS=10: 10 threads blocked max (acceptable)
With FETCH_WORKERS=200: Need executor with 200+ threads (see worker_pool.py)
Args:
executor: ThreadPoolExecutor (sized to match worker count)
timeout: Maximum time to wait in seconds
Returns:
Item from queue
Raises:
queue.Empty: If timeout expires with no item available
"""
logger.trace(f"RecheckQueue.async_get() called, timeout={timeout}")
import asyncio
try:
# Use run_in_executor to call sync get efficiently
# No outer asyncio.wait_for wrapper = no double timeout issue!
loop = asyncio.get_event_loop()
item = await loop.run_in_executor(
executor,
lambda: self.get(block=True, timeout=timeout)
)
logger.trace(f"RecheckQueue.async_get() successfully retrieved item: {self._get_item_uuid(item)}")
logger.debug(f"Successfully retrieved item: {self._get_item_uuid(item)}")
return item
except queue.Empty:
logger.trace(f"RecheckQueue.async_get() timed out - queue is empty")
# Queue is empty with timeout - expected behavior, re-raise without logging
raise
except Exception as e:
logger.critical(f"CRITICAL: Failed to async get item from queue: {type(e).__name__}: {str(e)}")
# Re-raise without logging - caller (worker) will handle and log appropriately
raise
# ASYNC INTERFACE (for workers)
async def async_put(self, item):
"""Pure async put with priority ordering"""
try:
# Add to priority storage
with self._lock:
heapq.heappush(self._priority_items, item)
# Notify via janus async queue
await self.async_q.put(True)
# Emit signals
self._emit_put_signals(item)
logger.debug(f"Successfully async queued item: {self._get_item_uuid(item)}")
return True
except Exception as e:
logger.critical(f"CRITICAL: Failed to async put item {self._get_item_uuid(item)}: {str(e)}")
# Remove from priority storage if janus put failed
try:
with self._lock:
if item in self._priority_items:
self._priority_items.remove(item)
heapq.heapify(self._priority_items)
except Exception as cleanup_e:
logger.critical(f"CRITICAL: Failed to cleanup after async put failure: {str(e)}")
return False
async def async_get(self):
"""Pure async get with priority ordering"""
try:
# Wait for notification
await self.async_q.get()
# Get highest priority item
with self._lock:
if not self._priority_items:
logger.critical(f"CRITICAL: Async queue notification received but no priority items available")
raise Exception("Priority queue inconsistency")
item = heapq.heappop(self._priority_items)
# Emit signals
self._emit_get_signals()
logger.debug(f"Successfully async retrieved item: {self._get_item_uuid(item)}")
return item
except Exception as e:
logger.critical(f"CRITICAL: Failed to async get item from queue: {str(e)}")
raise
# UTILITY METHODS
@@ -224,35 +186,10 @@ class RecheckPriorityQueue:
logger.critical(f"CRITICAL: Failed to get queued UUIDs: {str(e)}")
return []
def clear(self):
"""Clear all items from both priority storage and notification queue"""
try:
with self._lock:
# Clear priority items
self._priority_items.clear()
# Drain all notifications to prevent stale notifications
# This is critical for test cleanup to prevent queue desynchronization
drained = 0
while not self._notification_queue.empty():
try:
self._notification_queue.get_nowait()
drained += 1
except queue.Empty:
break
if drained > 0:
logger.debug(f"Cleared queue: removed {drained} notifications")
return True
except Exception as e:
logger.critical(f"CRITICAL: Failed to clear queue: {str(e)}")
return False
def close(self):
"""Close the queue"""
"""Close the janus queue"""
try:
# Nothing to close for threading.Queue
self._janus_queue.close()
logger.debug("RecheckPriorityQueue closed successfully")
except Exception as e:
logger.critical(f"CRITICAL: Failed to close RecheckPriorityQueue: {str(e)}")
@@ -384,7 +321,7 @@ class RecheckPriorityQueue:
except Exception:
pass
return 'unknown'
def _emit_put_signals(self, item):
"""Emit signals when item is added"""
try:
@@ -393,14 +330,14 @@ class RecheckPriorityQueue:
watch_check_update = signal('watch_check_update')
if watch_check_update:
watch_check_update.send(watch_uuid=item.item['uuid'])
# Queue length signal
if self.queue_length_signal:
self.queue_length_signal.send(length=self.qsize())
except Exception as e:
logger.critical(f"CRITICAL: Failed to emit put signals: {str(e)}")
def _emit_get_signals(self):
"""Emit signals when item is removed"""
try:
@@ -426,11 +363,12 @@ class NotificationQueue:
def __init__(self, maxsize: int = 0, datastore=None):
try:
# Use pure threading.Queue to avoid event loop binding issues
self._notification_queue = queue.Queue(maxsize=maxsize if maxsize > 0 else 0)
self._janus_queue = janus.Queue(maxsize=maxsize)
# BOTH interfaces required - see class docstring for why
self.sync_q = self._janus_queue.sync_q # Flask routes, threads
self.async_q = self._janus_queue.async_q # Async workers
self.notification_event_signal = signal('notification_event')
self.datastore = datastore # For checking all_muted setting
self._lock = threading.RLock()
logger.debug("NotificationQueue initialized successfully")
except Exception as e:
logger.critical(f"CRITICAL: Failed to initialize NotificationQueue: {str(e)}")
@@ -442,97 +380,72 @@ class NotificationQueue:
def put(self, item: Dict[str, Any], block: bool = True, timeout: Optional[float] = None):
"""Thread-safe sync put with signal emission"""
logger.trace(f"NotificationQueue.put() called for item: {item.get('uuid', 'unknown')}, block={block}, timeout={timeout}")
try:
# Check if all notifications are muted
if self.datastore and self.datastore.data['settings']['application'].get('all_muted', False):
logger.debug(f"Notification blocked - all notifications are muted: {item.get('uuid', 'unknown')}")
return False
with self._lock:
self._notification_queue.put(item, block=block, timeout=timeout)
self.sync_q.put(item, block=block, timeout=timeout)
self._emit_notification_signal(item)
logger.trace(f"NotificationQueue.put() successfully queued notification: {item.get('uuid', 'unknown')}")
logger.debug(f"Successfully queued notification: {item.get('uuid', 'unknown')}")
return True
except Exception as e:
logger.critical(f"CRITICAL: Failed to put notification {item.get('uuid', 'unknown')}: {str(e)}")
return False
async def async_put(self, item: Dict[str, Any], executor=None):
"""Async put with signal emission - uses thread pool
Args:
item: Notification item to queue
executor: Optional ThreadPoolExecutor
"""
logger.trace(f"NotificationQueue.async_put() called for item: {item.get('uuid', 'unknown')}, executor={executor}")
import asyncio
async def async_put(self, item: Dict[str, Any]):
"""Pure async put with signal emission"""
try:
# Check if all notifications are muted
if self.datastore and self.datastore.data['settings']['application'].get('all_muted', False):
logger.debug(f"Notification blocked - all notifications are muted: {item.get('uuid', 'unknown')}")
return False
loop = asyncio.get_event_loop()
await loop.run_in_executor(executor, lambda: self.put(item, block=True, timeout=5.0))
logger.trace(f"NotificationQueue.async_put() successfully queued notification: {item.get('uuid', 'unknown')}")
await self.async_q.put(item)
self._emit_notification_signal(item)
logger.debug(f"Successfully async queued notification: {item.get('uuid', 'unknown')}")
return True
except Exception as e:
logger.critical(f"CRITICAL: Failed to async put notification {item.get('uuid', 'unknown')}: {str(e)}")
return False
def get(self, block: bool = True, timeout: Optional[float] = None):
"""Thread-safe sync get"""
logger.trace(f"NotificationQueue.get() called, block={block}, timeout={timeout}")
try:
with self._lock:
item = self._notification_queue.get(block=block, timeout=timeout)
logger.trace(f"NotificationQueue.get() retrieved item: {item.get('uuid', 'unknown') if isinstance(item, dict) else 'unknown'}")
return item
return self.sync_q.get(block=block, timeout=timeout)
except queue.Empty as e:
logger.trace(f"NotificationQueue.get() timed out - queue is empty (timeout={timeout})")
raise e
except Exception as e:
logger.critical(f"CRITICAL: Failed to get notification: {type(e).__name__}: {str(e)}")
logger.critical(f"CRITICAL: Failed to get notification: {str(e)}")
raise e
async def async_get(self, executor=None):
"""Async get - uses thread pool
Args:
executor: Optional ThreadPoolExecutor
"""
logger.trace(f"NotificationQueue.async_get() called, executor={executor}")
import asyncio
async def async_get(self):
"""Pure async get"""
try:
loop = asyncio.get_event_loop()
item = await loop.run_in_executor(executor, lambda: self.get(block=True, timeout=1.0))
logger.trace(f"NotificationQueue.async_get() retrieved item: {item.get('uuid', 'unknown') if isinstance(item, dict) else 'unknown'}")
return item
return await self.async_q.get()
except queue.Empty as e:
logger.trace(f"NotificationQueue.async_get() timed out - queue is empty")
raise e
except Exception as e:
logger.critical(f"CRITICAL: Failed to async get notification: {type(e).__name__}: {str(e)}")
logger.critical(f"CRITICAL: Failed to async get notification: {str(e)}")
raise e
def qsize(self) -> int:
"""Get current queue size"""
try:
with self._lock:
return self._notification_queue.qsize()
return self.sync_q.qsize()
except Exception as e:
logger.critical(f"CRITICAL: Failed to get notification queue size: {str(e)}")
return 0
def empty(self) -> bool:
"""Check if queue is empty"""
return self.qsize() == 0
def close(self):
"""Close the queue"""
"""Close the janus queue"""
try:
# Nothing to close for threading.Queue
self._janus_queue.close()
logger.debug("NotificationQueue closed successfully")
except Exception as e:
logger.critical(f"CRITICAL: Failed to close NotificationQueue: {str(e)}")
+2 -2
View File
@@ -37,9 +37,9 @@ def register_watch_operation_handlers(socketio, datastore):
# Import here to avoid circular imports
from changedetectionio.flask_app import update_q
from changedetectionio import queuedWatchMetaData
from changedetectionio import worker_pool
from changedetectionio import worker_handler
worker_pool.queue_item_async_safe(update_q, queuedWatchMetaData.PrioritizedItem(priority=1, item={'uuid': uuid}))
worker_handler.queue_item_async_safe(update_q, queuedWatchMetaData.PrioritizedItem(priority=1, item={'uuid': uuid}))
logger.info(f"Socket.IO: Queued recheck for watch {uuid}")
else:
emit('operation_result', {'success': False, 'error': f'Unknown operation: {op}'})
+4 -4
View File
@@ -145,10 +145,10 @@ def handle_watch_update(socketio, **kwargs):
# Emit the watch update to all connected clients
from changedetectionio.flask_app import update_q
from changedetectionio.flask_app import _jinja2_filter_datetime
from changedetectionio import worker_pool
from changedetectionio import worker_handler
# Get list of watches that are currently running
running_uuids = worker_pool.get_running_uuids()
running_uuids = worker_handler.get_running_uuids()
# Get list of watches in the queue (efficient single-lock method)
queue_list = update_q.get_queued_uuids()
@@ -252,7 +252,7 @@ def init_socketio(app, datastore):
def event_checkbox_operations(data):
from changedetectionio.blueprint.ui import _handle_operations
from changedetectionio import queuedWatchMetaData
from changedetectionio import worker_pool
from changedetectionio import worker_handler
from changedetectionio.flask_app import update_q, watch_check_update
import threading
@@ -268,7 +268,7 @@ def init_socketio(app, datastore):
uuids=data.get('uuids'),
datastore=datastore,
extra_data=data.get('extra_data'),
worker_pool=worker_pool,
worker_handler=worker_handler,
update_q=update_q,
queuedWatchMetaData=queuedWatchMetaData,
watch_check_update=watch_check_update,
+2 -6
View File
@@ -10,7 +10,6 @@
set -e
SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
rm tests/logs/* -f
# Since theres no curl installed lets roll with python3
check_sanity() {
@@ -65,21 +64,18 @@ data_sanity_test
echo "-------------------- Running rest of tests in parallel -------------------------------"
# REMOVE_REQUESTS_OLD_SCREENSHOTS disabled so that we can write a screenshot and send it in test_notifications.py without a real browser
FETCH_WORKERS=2 REMOVE_REQUESTS_OLD_SCREENSHOTS=false \
REMOVE_REQUESTS_OLD_SCREENSHOTS=false \
pytest tests/test_*.py \
-n 8 \
-n 30 \
--dist=load \
-vvv \
-s \
--capture=no \
-k "not test_queue_system" \
--log-cli-level=DEBUG \
--log-cli-format="%(asctime)s [%(process)d] [%(levelname)s] %(name)s: %(message)s"
echo "---------------------------- DONE parallel test ---------------------------------------"
FETCH_WORKERS=20 pytest -vvv -s tests/test_queue_handler.py
echo "RUNNING WITH BASE_URL SET"
# Now re-run some tests with BASE_URL enabled
+80 -64
View File
@@ -17,6 +17,8 @@ $(document).ready(function () {
set_scale();
});
// Should always be disabled
$('#browser_steps-0-operation option[value="Goto site"]').prop("selected", "selected");
$('#browser_steps-0-operation').attr('disabled', 'disabled');
$('#browsersteps-click-start').click(function () {
$("#browsersteps-click-start").fadeOut();
@@ -43,6 +45,12 @@ $(document).ready(function () {
browsersteps_session_id = false;
apply_buttons_disabled = false;
ctx.clearRect(0, 0, c.width, c.height);
set_first_gotosite_disabled();
}
function set_first_gotosite_disabled() {
$('#browser_steps >li:first-child select').val('Goto site').attr('disabled', 'disabled');
$('#browser_steps >li:first-child').css('opacity', '0.5');
}
// Show seconds remaining until the browser interface needs to restart the session
@@ -235,54 +243,14 @@ $(document).ready(function () {
ctx.fill();
}
// Reusable AJAX function for browser step operations
function executeBrowserStep(url, data = {}) {
$('#browser-steps-ui .loader .spinner').fadeIn();
apply_buttons_disabled = true;
$('ul#browser_steps li .control .apply').css('opacity', 0.5);
$("#browsersteps-img").css('opacity', 0.65);
return $.ajax({
method: "POST",
url: url,
data: data,
statusCode: {
400: function () {
alert("There was a problem processing the request, please reload the page.");
$("#loading-status-text").hide();
$('#browser-steps-ui .loader .spinner').fadeOut();
},
401: function (data) {
alert(data.responseText);
$("#loading-status-text").hide();
$('#browser-steps-ui .loader .spinner').fadeOut();
}
}
}).done(function (data) {
xpath_data = data.xpath_data;
$('#browsersteps-img').attr('src', data.screenshot);
$('#browser-steps-ui .loader .spinner').fadeOut();
apply_buttons_disabled = false;
$("#browsersteps-img").css('opacity', 1);
$('ul#browser_steps li .control .apply').css('opacity', 1);
$("#loading-status-text").hide();
}).fail(function (data) {
console.log(data);
if (data.responseText && data.responseText.includes("Browser session expired")) {
disable_browsersteps_ui();
}
apply_buttons_disabled = false;
$("#loading-status-text").hide();
$('ul#browser_steps li .control .apply').css('opacity', 1);
$("#browsersteps-img").css('opacity', 1);
});
}
function start() {
console.log("Starting browser-steps UI");
browsersteps_session_id = false;
// @todo This setting of the first one should be done at the datalayer but wtforms doesnt wanna play nice
$('#browser_steps >li:first-child').removeClass('empty');
set_first_gotosite_disabled();
$('#browser-steps-ui .loader .spinner').show();
// Request a new session
$('.clear,.remove', $('#browser_steps >li:first-child')).hide();
$.ajax({
type: "GET",
url: browser_steps_start_url,
@@ -299,12 +267,11 @@ $(document).ready(function () {
}).done(function (data) {
$("#loading-status-text").fadeIn();
browsersteps_session_id = data.browsersteps_session_id;
// This should trigger 'Goto site'
console.log("Got startup response, requesting Goto-Site (first) step fake click");
$('#browser_steps >li:first-child .apply').click();
browser_interface_seconds_remaining = 500;
// Request goto_site operation
executeBrowserStep(
browser_steps_sync_url + "&browsersteps_session_id=" + browsersteps_session_id + "&goto_website_url_first_step=true"
);
set_first_gotosite_disabled();
}).fail(function (data) {
console.log(data);
alert('There was an error communicating with the server.');
@@ -313,6 +280,7 @@ $(document).ready(function () {
}
function disable_browsersteps_ui() {
set_first_gotosite_disabled();
$("#browser-steps-ui").css('opacity', '0.3');
$('#browsersteps-selector-canvas').off("mousemove mousedown click");
}
@@ -360,13 +328,16 @@ $(document).ready(function () {
// Add the extra buttons to the steps
$('ul#browser_steps li').each(function (i) {
var s = '<div class="control">' + '<a data-step-index=' + i + ' class="pure-button button-secondary button-green button-xsmall apply" >Apply</a>&nbsp;';
s += `<a data-step-index="${i}" class="pure-button button-secondary button-xsmall clear" >Clear</a>&nbsp;` +
`<a data-step-index="${i}" class="pure-button button-secondary button-red button-xsmall remove" >Remove</a>`;
if (i > 0) {
// The first step never gets these (Goto-site)
s += `<a data-step-index="${i}" class="pure-button button-secondary button-xsmall clear" >Clear</a>&nbsp;` +
`<a data-step-index="${i}" class="pure-button button-secondary button-red button-xsmall remove" >Remove</a>`;
// if a screenshot is available
if (browser_steps_available_screenshots.includes(i.toString())) {
var d = (browser_steps_last_error_step === i+1) ? 'before' : 'after';
s += `&nbsp;<a data-step-index="${i}" class="pure-button button-secondary button-xsmall show-screenshot" title="Show screenshot from last run" data-type="${d}">Pic</a>&nbsp;`;
// if a screenshot is available
if (browser_steps_available_screenshots.includes(i.toString())) {
var d = (browser_steps_last_error_step === i+1) ? 'before' : 'after';
s += `&nbsp;<a data-step-index="${i}" class="pure-button button-secondary button-xsmall show-screenshot" title="Show screenshot from last run" data-type="${d}">Pic</a>&nbsp;`;
}
}
s += '</div>';
$(this).append(s)
@@ -405,35 +376,80 @@ $(document).ready(function () {
});
$('ul#browser_steps li .control .apply').click(function (event) {
// sequential requests @todo refactor
if (apply_buttons_disabled) {
return;
}
var current_data = $(event.currentTarget).closest('li');
$('#browser-steps-ui .loader .spinner').fadeIn();
apply_buttons_disabled = true;
$('ul#browser_steps li .control .apply').css('opacity', 0.5);
$("#browsersteps-img").css('opacity', 0.65);
var is_last_step = 0;
var step_n = $(event.currentTarget).data('step-index');
// Determine if this is the last configured step
var is_last_step = 0;
// On the last step, we should also be getting data ready for the visual selector
$('ul#browser_steps li select').each(function (i) {
if ($(this).val() !== 'Choose one') {
is_last_step += 1;
}
});
is_last_step = (is_last_step == (step_n + 1));
if (is_last_step == (step_n + 1)) {
is_last_step = true;
} else {
is_last_step = false;
}
console.log("Requesting step via POST " + $("select[id$='operation']", current_data).first().val());
// Execute the browser step
executeBrowserStep(
browser_steps_sync_url + "&browsersteps_session_id=" + browsersteps_session_id,
{
// POST the currently clicked step form widget back and await response, redraw
$.ajax({
method: "POST",
url: browser_steps_sync_url + "&browsersteps_session_id=" + browsersteps_session_id,
data: {
'operation': $("select[id$='operation']", current_data).first().val(),
'selector': $("input[id$='selector']", current_data).first().val(),
'optional_value': $("input[id$='optional_value']", current_data).first().val(),
'step_n': step_n,
'is_last_step': is_last_step
},
statusCode: {
400: function () {
// More than likely the CSRF token was lost when the server restarted
alert("There was a problem processing the request, please reload the page.");
$("#loading-status-text").hide();
$('#browser-steps-ui .loader .spinner').fadeOut();
},
401: function (data) {
// More than likely the CSRF token was lost when the server restarted
alert(data.responseText);
$("#loading-status-text").hide();
$('#browser-steps-ui .loader .spinner').fadeOut();
}
}
);
}).done(function (data) {
// it should return the new state (selectors available and screenshot)
xpath_data = data.xpath_data;
$('#browsersteps-img').attr('src', data.screenshot);
$('#browser-steps-ui .loader .spinner').fadeOut();
apply_buttons_disabled = false;
$("#browsersteps-img").css('opacity', 1);
$('ul#browser_steps li .control .apply').css('opacity', 1);
$("#loading-status-text").hide();
set_first_gotosite_disabled();
}).fail(function (data) {
console.log(data);
if (data.responseText.includes("Browser session expired")) {
disable_browsersteps_ui();
}
apply_buttons_disabled = false;
$("#loading-status-text").hide();
$('ul#browser_steps li .control .apply').css('opacity', 1);
$("#browsersteps-img").css('opacity', 1);
});
});
$('ul#browser_steps li .control .show-screenshot').click(function (element) {
+1 -2
View File
@@ -184,8 +184,7 @@ $(document).ready(function() {
}
// If it's a button in a form, submit the form
else if ($element.is('button')) {
// Use requestSubmit() to include the button's name/value in the form data
$element.closest('form')[0].requestSubmit($element[0]);
$element.closest('form').submit();
}
}
};
+156 -253
View File
@@ -9,14 +9,18 @@ from flask import (
)
from flask_babel import gettext
from ..model import App, Watch
from copy import deepcopy
from ..blueprint.rss import RSS_CONTENT_FORMAT_DEFAULT
from ..html_tools import TRANSLATE_WHITESPACE_TABLE
from ..model import App, Watch, USE_SYSTEM_DEFAULT_NOTIFICATION_FORMAT_FOR_WATCH
from copy import deepcopy, copy
from os import path, unlink
from threading import Lock
import json
import os
import re
import secrets
import sys
import threading
import time
import uuid as uuid_builder
from loguru import logger
@@ -31,10 +35,12 @@ except ImportError:
HAS_ORJSON = False
from ..processors import get_custom_watch_obj_for_processor
from ..processors.restock_diff import Restock
# Import the base class and helpers
from .file_saving_datastore import FileSavingDataStore, load_all_watches, load_all_tags, save_json_atomic
from .file_saving_datastore import FileSavingDataStore, load_all_watches, save_watch_atomic, save_json_atomic
from .updates import DatastoreUpdatesMixin
from .legacy_loader import has_legacy_datastore
# Because the server will run as a daemon and wont know the URL for notification links when firing off a notification
BASE_URL_NOT_SET_TEXT = '("Base URL" not set - see settings - notifications)'
@@ -55,7 +61,9 @@ class ChangeDetectionStore(DatastoreUpdatesMixin, FileSavingDataStore):
# Should only be active for docker
# logging.basicConfig(filename='/dev/stdout', level=logging.INFO)
self.datastore_path = datastore_path
self.needs_write = False
self.start_time = time.time()
self.stop_thread = False
self.save_version_copy_json_db(version_tag)
self.reload_state(datastore_path=datastore_path, include_default_watches=include_default_watches, version_tag=version_tag)
@@ -77,7 +85,7 @@ class ChangeDetectionStore(DatastoreUpdatesMixin, FileSavingDataStore):
logger.info(f"Backing up changedetection.json due to new version to '{db_path_version_backup}'.")
copyfile(db_path, db_path_version_backup)
def _load_settings(self, filename="changedetection.json"):
def _load_settings(self):
"""
Load settings from storage.
@@ -86,7 +94,7 @@ class ChangeDetectionStore(DatastoreUpdatesMixin, FileSavingDataStore):
Returns:
dict: Settings data loaded from storage
"""
changedetection_json = os.path.join(self.datastore_path, filename)
changedetection_json = os.path.join(self.datastore_path, "changedetection.json")
logger.info(f"Loading settings from {changedetection_json}")
@@ -121,63 +129,29 @@ class ChangeDetectionStore(DatastoreUpdatesMixin, FileSavingDataStore):
if 'application' in settings_data['settings']:
self.__data['settings']['application'].update(settings_data['settings']['application'])
# More or less for the old format which had this data in the one url-watches.json
# cant hurt to leave it here,
if 'watching' in settings_data:
self.__data['watching'].update(settings_data['watching'])
def _rehydrate_tags(self):
"""Rehydrate tag entities from stored data into Tag objects with restock_diff processor."""
from ..model import Tag
"""Rehydrate tag entities from stored data."""
for uuid, tag in self.__data['settings']['application']['tags'].items():
# Force processor to restock_diff for override functionality (technical debt)
tag['processor'] = 'restock_diff'
self.__data['settings']['application']['tags'][uuid] = Tag.model(
datastore_path=self.datastore_path,
__datastore=self.__data,
default=tag
self.__data['settings']['application']['tags'][uuid] = self.rehydrate_entity(
uuid, tag, processor_override='restock_diff'
)
logger.info(f"Tag: {uuid} {tag['title']}")
def _rehydrate_watches(self):
"""Rehydrate watch entities from stored data (converts dicts to Watch objects)."""
watch_count = len(self.__data.get('watching', {}))
if watch_count == 0:
return
logger.info(f"Rehydrating {watch_count} watches...")
watching_rehydrated = {}
for uuid, watch_dict in self.__data.get('watching', {}).items():
if isinstance(watch_dict, dict):
watching_rehydrated[uuid] = self.rehydrate_entity(uuid, watch_dict)
else:
logger.error(f"Watch UUID {uuid} already rehydrated")
self.__data['watching'] = watching_rehydrated
logger.success(f"Rehydrated {watch_count} watches into Watch objects")
def _load_state(self, main_settings_filename="changedetection.json"):
def _load_state(self):
"""
Load complete datastore state from storage.
Orchestrates loading of settings, watches, and tags using polymorphic methods.
Orchestrates loading of settings and watches using polymorphic methods.
"""
# Load settings
settings_data = self._load_settings(filename=main_settings_filename)
settings_data = self._load_settings()
self._apply_settings(settings_data)
# Load watches, scan them from the disk
# Load watches (polymorphic - parent class method)
self._load_watches()
self._rehydrate_watches()
# Load tags from individual tag.json files
# These will override any tags in settings (migration path)
self._load_tags()
# Rehydrate any remaining tags from settings (legacy/fallback)
# Rehydrate tags
self._rehydrate_tags()
def reload_state(self, datastore_path, include_default_watches, version_tag):
@@ -192,15 +166,12 @@ class ChangeDetectionStore(DatastoreUpdatesMixin, FileSavingDataStore):
"""
logger.info(f"Datastore path is '{datastore_path}'")
# CRITICAL: Update datastore_path (was using old path from __init__)
self.datastore_path = datastore_path
# Initialize data structure
self.__data = App.model()
self.json_store_path = os.path.join(self.datastore_path, "changedetection.json")
# Base definition for all watchers (deepcopy part of #569)
self.generic_definition = deepcopy(Watch.model(datastore_path=datastore_path, __datastore=self.__data, default={}))
self.generic_definition = deepcopy(Watch.model(datastore_path=datastore_path, default={}))
# Load build SHA if available (Docker deployments)
if path.isfile('changedetectionio/source.txt'):
@@ -209,75 +180,82 @@ class ChangeDetectionStore(DatastoreUpdatesMixin, FileSavingDataStore):
# Check if datastore already exists
changedetection_json = os.path.join(self.datastore_path, "changedetection.json")
changedetection_json_old_schema = os.path.join(self.datastore_path, "url-watches.json")
if os.path.exists(changedetection_json):
# Run schema updates if needed
# Pass current schema version from loaded datastore (defaults to 0 if not set)
# Load existing datastore (changedetection.json + watch.json files)
logger.info("Loading existing datastore")
self._load_state()
current_schema = self.data['settings']['application'].get('schema_version', 0)
self.run_updates(current_schema_version=current_schema)
try:
self._load_state()
except Exception as e:
logger.critical(f"Failed to load datastore: {e}")
raise
# Legacy datastore detected - trigger migration, even works if the schema is much before the migration step.
elif os.path.exists(changedetection_json_old_schema):
logger.critical(f"Legacy datastore detected at {changedetection_json_old_schema}, loading and running updates")
self._load_state(main_settings_filename="url-watches.json")
# update 26 will load the whole old config from disk to __data
# Run schema updates if needed
# Pass current schema version from loaded datastore (defaults to 0 if not set)
current_schema = self.__data['settings']['application'].get('schema_version', 0)
self.run_updates(current_schema_version=current_schema)
# Probably tags were also shifted to disk and many other changes, so best to reload here.
self._load_state()
else:
# No datastore yet - check if this is a fresh install or legacy migration
self.init_fresh_install(include_default_watches=include_default_watches,
version_tag=version_tag)
# Maybe they copied a bunch of watch subdirs across too
self._load_state()
# Generate app_guid FIRST (required for all operations)
if "pytest" in sys.modules or "PYTEST_CURRENT_TEST" in os.environ:
self.__data['app_guid'] = "test-" + str(uuid_builder.uuid4())
else:
self.__data['app_guid'] = str(uuid_builder.uuid4())
def init_fresh_install(self, include_default_watches, version_tag):
# Generate app_guid FIRST (required for all operations)
if "pytest" in sys.modules or "PYTEST_CURRENT_TEST" in os.environ:
self.__data['app_guid'] = "test-" + str(uuid_builder.uuid4())
else:
self.__data['app_guid'] = str(uuid_builder.uuid4())
# Generate RSS access token
self.__data['settings']['application']['rss_access_token'] = secrets.token_hex(16)
# Generate RSS access token
self.__data['settings']['application']['rss_access_token'] = secrets.token_hex(16)
# Generate API access token
self.__data['settings']['application']['api_access_token'] = secrets.token_hex(16)
# Generate API access token
self.__data['settings']['application']['api_access_token'] = secrets.token_hex(16)
logger.warning(f"No datastore found, creating new datastore at {self.datastore_path}")
# Check if legacy datastore exists (url-watches.json)
if has_legacy_datastore(self.datastore_path):
# Legacy datastore detected - trigger migration
logger.critical(f"Legacy datastore detected at {self.datastore_path}/url-watches.json")
logger.critical("Migration will be triggered via update_26")
# Set schema version to latest (no updates needed)
latest_update_available = self.get_updates_available().pop()
logger.info(f"Marking fresh install to schema version {latest_update_available}")
self.__data['settings']['application']['schema_version'] = latest_update_available
# Load the legacy datastore to get its schema_version
from .legacy_loader import load_legacy_format
legacy_path = os.path.join(self.datastore_path, "url-watches.json")
with open(legacy_path) as f:
self.__data = json.load(f)
# Add default watches if requested
if include_default_watches:
self.add_watch(
url='https://news.ycombinator.com/',
tag='Tech news',
extras={'fetch_backend': 'html_requests'}
)
self.add_watch(
url='https://changedetection.io/CHANGELOG.txt',
tag='changedetection.io',
extras={'fetch_backend': 'html_requests'}
)
if not self.__data:
raise Exception("Failed to load legacy datastore from url-watches.json")
# Create changedetection.json immediately
try:
self._save_settings()
logger.info("Created changedetection.json for new datastore")
except Exception as e:
logger.error(f"Failed to create initial changedetection.json: {e}")
# update_26 will load the legacy data again and migrate to new format
# Only run updates AFTER the legacy schema version (e.g., if legacy is at 25, only run 26+)
self.run_updates()
else:
# Fresh install - create new datastore
logger.critical(f"No datastore found, creating new datastore at {self.datastore_path}")
# Set schema version to latest (no updates needed)
updates_available = self.get_updates_available()
self.__data['settings']['application']['schema_version'] = updates_available.pop() if updates_available else 26
# Add default watches if requested
if include_default_watches:
self.add_watch(
url='https://news.ycombinator.com/',
tag='Tech news',
extras={'fetch_backend': 'html_requests'}
)
self.add_watch(
url='https://changedetection.io/CHANGELOG.txt',
tag='changedetection.io',
extras={'fetch_backend': 'html_requests'}
)
# Create changedetection.json immediately
try:
self._save_settings()
logger.info("Created changedetection.json for new datastore")
except Exception as e:
logger.error(f"Failed to create initial changedetection.json: {e}")
# Set version tag
self.__data['version_tag'] = version_tag
@@ -291,19 +269,19 @@ class ChangeDetectionStore(DatastoreUpdatesMixin, FileSavingDataStore):
self.__data['app_guid'] = "test-" + str(uuid_builder.uuid4())
else:
self.__data['app_guid'] = str(uuid_builder.uuid4())
self.commit()
self.mark_settings_dirty()
# Ensure RSS access token exists
if not self.__data['settings']['application'].get('rss_access_token'):
secret = secrets.token_hex(16)
self.__data['settings']['application']['rss_access_token'] = secret
self.commit()
self.mark_settings_dirty()
# Ensure API access token exists
if not self.__data['settings']['application'].get('api_access_token'):
secret = secrets.token_hex(16)
self.__data['settings']['application']['api_access_token'] = secret
self.commit()
self.mark_settings_dirty()
# Handle password reset lockfile
password_reset_lockfile = os.path.join(self.datastore_path, "removepassword.lock")
@@ -311,6 +289,9 @@ class ChangeDetectionStore(DatastoreUpdatesMixin, FileSavingDataStore):
self.remove_password()
unlink(password_reset_lockfile)
# Start the background save thread
self.start_save_thread()
def rehydrate_entity(self, uuid, entity, processor_override=None):
"""Set the dict back to the dict Watch object"""
entity['uuid'] = uuid
@@ -324,7 +305,7 @@ class ChangeDetectionStore(DatastoreUpdatesMixin, FileSavingDataStore):
if entity.get('processor') != 'text_json_diff':
logger.trace(f"Loading Watch object '{watch_class.__module__}.{watch_class.__name__}' for UUID {uuid}")
entity = watch_class(datastore_path=self.datastore_path, __datastore=self.__data, default=entity)
entity = watch_class(datastore_path=self.datastore_path, default=entity)
return entity
# ============================================================================
@@ -343,22 +324,13 @@ class ChangeDetectionStore(DatastoreUpdatesMixin, FileSavingDataStore):
"""
Build settings data structure for saving.
Tags behavior depends on schema version:
- Before update_28 (schema < 28): Tags saved in settings for migration
- After update_28 (schema >= 28): Tags excluded from settings (in individual files)
Returns:
dict: Settings data ready for serialization
"""
import copy
# Deep copy settings to avoid modifying the original
settings_copy = copy.deepcopy(self.__data['settings'])
return {
'note': 'Settings file - watches are in {uuid}/watch.json, tags are in {uuid}/tag.json',
'app_guid': self.__data.get('app_guid'),
'settings': settings_copy,
'note': 'Settings file - watches are stored in individual {uuid}/watch.json files',
'app_guid': self.__data['app_guid'],
'settings': self.__data['settings'],
'build_sha': self.__data.get('build_sha'),
'version_tag': self.__data.get('version_tag')
}
@@ -376,7 +348,7 @@ class ChangeDetectionStore(DatastoreUpdatesMixin, FileSavingDataStore):
"""
settings_data = self._build_settings_data()
changedetection_json = os.path.join(self.datastore_path, "changedetection.json")
save_json_atomic(changedetection_json, settings_data, label="settings")
save_json_atomic(changedetection_json, settings_data, label="settings", max_size_mb=10)
def _load_watches(self):
"""
@@ -386,45 +358,22 @@ class ChangeDetectionStore(DatastoreUpdatesMixin, FileSavingDataStore):
Implementation of abstract method from FileSavingDataStore.
Delegates to helper function and stores results in internal data structure.
"""
# Store loaded data
# @note this will also work for the old legacy format because self.__data['watching'] should already have them loaded by this point.
self.__data['watching'].update(load_all_watches(
watching, watch_hashes = load_all_watches(
self.datastore_path,
self.rehydrate_entity
))
logger.debug(f"Loaded {len(self.__data['watching'])} watches")
def _load_tags(self):
"""
Load all tags from storage.
File backend implementation: reads individual tag.json files.
Tags loaded from files override any tags in settings (migration path).
"""
from ..model import Tag
def rehydrate_tag(uuid, entity_dict):
"""Rehydrate tag as Tag object with forced restock_diff processor."""
entity_dict['uuid'] = uuid
entity_dict['processor'] = 'restock_diff' # Force processor for override functionality
return Tag.model(
datastore_path=self.datastore_path,
__datastore=self.__data,
default=entity_dict
)
tags = load_all_tags(
self.datastore_path,
rehydrate_tag
self.rehydrate_entity,
self._compute_hash
)
# Override settings tags with loaded tags
# This ensures tag.json files take precedence over settings
if tags:
self.__data['settings']['application']['tags'].update(tags)
logger.info(f"Loaded {len(tags)} tags from individual tag.json files")
# Store loaded data
self.__data['watching'] = watching
self._watch_hashes = watch_hashes
# Verify all watches have hashes
missing_hashes = [uuid for uuid in watching.keys() if uuid not in watch_hashes]
if missing_hashes:
logger.error(f"WARNING: {len(missing_hashes)} watches missing hashes after load: {missing_hashes[:5]}")
else:
logger.debug(f"All {len(watching)} watches have valid hashes")
def _delete_watch(self, uuid):
"""
@@ -448,7 +397,7 @@ class ChangeDetectionStore(DatastoreUpdatesMixin, FileSavingDataStore):
def set_last_viewed(self, uuid, timestamp):
logger.debug(f"Setting watch UUID: {uuid} last viewed to {int(timestamp)}")
self.data['watching'][uuid].update({'last_viewed': int(timestamp)})
self.data['watching'][uuid].commit()
self.mark_watch_dirty(uuid)
watch_check_update = signal('watch_check_update')
if watch_check_update:
@@ -456,79 +405,7 @@ class ChangeDetectionStore(DatastoreUpdatesMixin, FileSavingDataStore):
def remove_password(self):
self.__data['settings']['application']['password'] = False
self.commit()
def clear_all_last_checksums(self):
"""
Delete all last-checksum.txt files to force reprocessing of all watches.
This should be called when global settings change, since watches inherit
configuration and need to reprocess even if their individual watch dict
hasn't been modified.
Note: We delete the checksum file rather than setting was_edited=True because:
- was_edited is not persisted across restarts
- File deletion ensures reprocessing works across app restarts
"""
deleted_count = 0
for uuid in self.__data['watching'].keys():
watch = self.__data['watching'][uuid]
if watch.data_dir:
checksum_file = os.path.join(watch.data_dir, 'last-checksum.txt')
if os.path.isfile(checksum_file):
try:
os.remove(checksum_file)
deleted_count += 1
logger.debug(f"Cleared checksum for watch {uuid}")
except OSError as e:
logger.warning(f"Failed to delete checksum file for {uuid}: {e}")
logger.info(f"Cleared {deleted_count} checksum files to force reprocessing")
return deleted_count
def clear_checksums_for_tag(self, tag_uuid):
"""
Delete last-checksum.txt files for all watches using a specific tag.
This should be called when a tag configuration is edited, since watches
inherit tag settings and need to reprocess.
Args:
tag_uuid: UUID of the tag that was modified
Returns:
int: Number of checksum files deleted
"""
deleted_count = 0
for uuid, watch in self.__data['watching'].items():
if watch.get('tags') and tag_uuid in watch['tags']:
if watch.data_dir:
checksum_file = os.path.join(watch.data_dir, 'last-checksum.txt')
if os.path.isfile(checksum_file):
try:
os.remove(checksum_file)
deleted_count += 1
logger.debug(f"Cleared checksum for watch {uuid} (tag {tag_uuid})")
except OSError as e:
logger.warning(f"Failed to delete checksum file for {uuid}: {e}")
logger.info(f"Cleared {deleted_count} checksum files for tag {tag_uuid}")
return deleted_count
def commit(self):
"""
Save settings immediately to disk using atomic write.
Uses atomic write pattern (temp file + rename) for crash safety.
Fire-and-forget: Logs errors but does not raise exceptions.
Settings data remains in memory even if save fails, so next commit will retry.
"""
try:
self._save_settings()
logger.debug("Committed settings")
except Exception as e:
logger.error(f"Failed to commit settings: {e}")
self.mark_settings_dirty()
def update_watch(self, uuid, update_obj):
@@ -547,8 +424,7 @@ class ChangeDetectionStore(DatastoreUpdatesMixin, FileSavingDataStore):
self.__data['watching'][uuid].update(update_obj)
# Immediate save
self.__data['watching'][uuid].commit()
self.mark_watch_dirty(uuid)
@property
def threshold_seconds(self):
@@ -609,6 +485,10 @@ class ChangeDetectionStore(DatastoreUpdatesMixin, FileSavingDataStore):
except Exception as e:
logger.error(f"Failed to delete watch {watch_uuid} from storage: {e}")
# Clean up tracking data
self._watch_hashes.pop(watch_uuid, None)
self._dirty_watches.discard(watch_uuid)
# Send delete signal
watch_delete_signal = signal('watch_deleted')
if watch_delete_signal:
@@ -630,19 +510,21 @@ class ChangeDetectionStore(DatastoreUpdatesMixin, FileSavingDataStore):
# Remove from watching dict
del self.data['watching'][uuid]
# Clean up tracking data
self._watch_hashes.pop(uuid, None)
self._dirty_watches.discard(uuid)
# Send delete signal
watch_delete_signal = signal('watch_deleted')
if watch_delete_signal:
watch_delete_signal.send(watch_uuid=uuid)
self.needs_write_urgent = True
# Clone a watch by UUID
def clone(self, uuid):
url = self.data['watching'][uuid].get('url')
# No need to deepcopy here - add_watch() will deepcopy extras anyway (line 569)
# Just pass a dict copy (with lock for thread safety)
# NOTE: dict() is shallow copy but safe since add_watch() deepcopies it
with self.lock:
extras = dict(self.data['watching'][uuid])
extras = deepcopy(self.data['watching'][uuid])
new_uuid = self.add_watch(url=url, extras=extras)
watch = self.data['watching'][new_uuid]
return new_uuid
@@ -659,7 +541,7 @@ class ChangeDetectionStore(DatastoreUpdatesMixin, FileSavingDataStore):
# Remove a watchs data but keep the entry (URL etc)
def clear_watch_history(self, uuid):
self.__data['watching'][uuid].clear_watch()
self.__data['watching'][uuid].commit()
self.needs_write_urgent = True
def add_watch(self, url, tag='', extras=None, tag_uuids=None, save_immediately=True):
@@ -754,7 +636,7 @@ class ChangeDetectionStore(DatastoreUpdatesMixin, FileSavingDataStore):
# If the processor also has its own Watch implementation
watch_class = get_custom_watch_obj_for_processor(apply_extras.get('processor'))
new_watch = watch_class(datastore_path=self.datastore_path, __datastore=self.__data, url=url)
new_watch = watch_class(datastore_path=self.datastore_path, url=url)
new_uuid = new_watch.get('uuid')
@@ -772,9 +654,16 @@ class ChangeDetectionStore(DatastoreUpdatesMixin, FileSavingDataStore):
self.__data['watching'][new_uuid] = new_watch
if save_immediately:
# Save immediately using commit
new_watch.commit()
logger.debug(f"Saved new watch {new_uuid}")
# Save immediately using polymorphic method
try:
self.save_watch(new_uuid, force=True)
logger.debug(f"Saved new watch {new_uuid}")
except Exception as e:
logger.error(f"Failed to save new watch {new_uuid}: {e}")
# Mark dirty for retry
self.mark_watch_dirty(new_uuid)
else:
self.mark_watch_dirty(new_uuid)
logger.debug(f"Added '{url}'")
@@ -809,6 +698,25 @@ class ChangeDetectionStore(DatastoreUpdatesMixin, FileSavingDataStore):
# Old sync_to_json and save_datastore methods removed - now handled by FileSavingDataStore parent class
# Go through the datastore path and remove any snapshots that are not mentioned in the index
# This usually is not used, but can be handy.
def remove_unused_snapshots(self):
logger.info("Removing snapshots from datastore that are not in the index..")
index = []
for uuid in self.data['watching']:
for id in self.data['watching'][uuid].history:
index.append(self.data['watching'][uuid].history[str(id)])
import pathlib
# Only in the sub-directories
for uuid in self.data['watching']:
for item in pathlib.Path(self.datastore_path).rglob(uuid + "/*.txt"):
if not str(item) in index:
logger.info(f"Removing {item}")
unlink(item)
@property
def proxy_list(self):
proxy_list = {}
@@ -900,7 +808,7 @@ class ChangeDetectionStore(DatastoreUpdatesMixin, FileSavingDataStore):
if watch:
# In /datastore/xyz-xyz/headers.txt
filepath = os.path.join(watch.data_dir, 'headers.txt')
filepath = os.path.join(watch.watch_data_dir, 'headers.txt')
try:
if os.path.isfile(filepath):
headers.update(parse_headers_from_text_file(filepath))
@@ -947,21 +855,16 @@ class ChangeDetectionStore(DatastoreUpdatesMixin, FileSavingDataStore):
# So we use the same model as a Watch
with self.lock:
from ..model import Tag
new_tag = Tag.model(
datastore_path=self.datastore_path,
__datastore=self.__data,
default={
'title': title.strip(),
'date_created': int(time.time())
}
)
new_tag = Tag.model(datastore_path=self.datastore_path, default={
'title': title.strip(),
'date_created': int(time.time())
})
new_uuid = new_tag.get('uuid')
self.__data['settings']['application']['tags'][new_uuid] = new_tag
# Save tag to its own tag.json file instead of settings
new_tag.commit()
self.mark_settings_dirty()
return new_uuid
def get_all_tags_for_watch(self, uuid):
@@ -1078,7 +981,7 @@ class ChangeDetectionStore(DatastoreUpdatesMixin, FileSavingDataStore):
notification_urls.append(notification_url)
self.__data['settings']['application']['notification_urls'] = notification_urls
self.commit()
self.mark_settings_dirty()
return notification_url
# Schema update methods moved to store/updates.py (DatastoreUpdatesMixin)
+17
View File
@@ -81,3 +81,20 @@ class DataStore(ABC):
"""
pass
@abstractmethod
def force_save_all(self):
"""
Force immediate synchronous save of all data to storage.
This is the abstract method for forcing a complete save.
Different backends implement this differently:
- File backend: Mark all watches/settings dirty, then save
- Redis backend: SAVE command or pipeline flush
- SQL backend: COMMIT transaction
Used by:
- Backup creation (ensure everything is saved before backup)
- Shutdown (ensure all changes are persisted)
- Manual save operations
"""
pass
+554 -165
View File
@@ -1,17 +1,22 @@
"""
File-based datastore with individual watch persistence and immediate commits.
File-based datastore with individual watch persistence and dirty tracking.
This module provides the FileSavingDataStore abstract class that implements:
- Individual watch.json file persistence
- Immediate commit-based persistence (watch.commit(), datastore.commit())
- Hash-based change detection (only save what changed)
- Periodic audit scan (catches unmarked changes)
- Background save thread with batched parallel saves
- Atomic file writes safe for NFS/NAS
"""
import glob
import hashlib
import json
import os
import tempfile
import time
from concurrent.futures import ThreadPoolExecutor, as_completed
from threading import Thread
from loguru import logger
from .base import DataStore
@@ -29,6 +34,19 @@ except ImportError:
# Set to True for mission-critical deployments requiring crash consistency
FORCE_FSYNC_DATA_IS_CRITICAL = bool(strtobool(os.getenv('FORCE_FSYNC_DATA_IS_CRITICAL', 'False')))
# Save interval configuration: How often the background thread saves dirty items
# Default 10 seconds - increase for less frequent saves, decrease for more frequent
DATASTORE_SCAN_DIRTY_SAVE_INTERVAL_SECONDS = int(os.getenv('DATASTORE_SCAN_DIRTY_SAVE_INTERVAL_SECONDS', '10'))
# Rolling audit configuration: Scans a fraction of watches each cycle
# Default: Run audit every 10s, split into 5 shards
# Full audit completes every 50s (10s × 5 shards)
# With 56k watches: 56k / 5 = ~11k watches per cycle (~60ms vs 316ms for all)
# Handles dynamic watch count - recalculates shard boundaries each cycle
DATASTORE_AUDIT_INTERVAL_SECONDS = int(os.getenv('DATASTORE_AUDIT_INTERVAL_SECONDS', '10'))
DATASTORE_AUDIT_SHARDS = int(os.getenv('DATASTORE_AUDIT_SHARDS', '5'))
# ============================================================================
# Helper Functions for Atomic File Operations
# ============================================================================
@@ -43,9 +61,6 @@ def save_json_atomic(file_path, data_dict, label="file", max_size_mb=10):
- Size validation
- Proper error handling
Thread safety: Caller must hold datastore.lock to prevent concurrent modifications.
Multi-process safety: Not supported - run only one app instance per datastore.
Args:
file_path: Full path to target JSON file
data_dict: Dictionary to serialize
@@ -175,37 +190,23 @@ def save_json_atomic(file_path, data_dict, label="file", max_size_mb=10):
raise e
def save_entity_atomic(entity_dir, uuid, entity_dict, filename, entity_type, max_size_mb):
"""
Save an entity (watch/tag) to disk using atomic write pattern.
Generic function for saving any watch_base subclass (Watch, Tag, etc.).
Args:
entity_dir: Directory for this entity (e.g., /datastore/{uuid})
uuid: Entity UUID (for logging)
entity_dict: Dictionary representation of the entity
filename: JSON filename (e.g., 'watch.json', 'tag.json')
entity_type: Type label for logging (e.g., 'watch', 'tag')
max_size_mb: Maximum allowed file size in MB
Raises:
ValueError: If serialized data exceeds max_size_mb
OSError: If disk is full (ENOSPC) or other I/O error
"""
entity_json = os.path.join(entity_dir, filename)
save_json_atomic(entity_json, entity_dict, label=f"{entity_type} {uuid}", max_size_mb=max_size_mb)
def save_watch_atomic(watch_dir, uuid, watch_dict):
"""
Save a watch to disk using atomic write pattern.
Convenience wrapper around save_entity_atomic for watches.
Kept for backwards compatibility.
"""
save_entity_atomic(watch_dir, uuid, watch_dict, "watch.json", "watch", max_size_mb=10)
Convenience wrapper around save_json_atomic for watches.
Args:
watch_dir: Directory for this watch (e.g., /datastore/{uuid})
uuid: Watch UUID (for logging)
watch_dict: Dictionary representation of the watch
Raises:
ValueError: If serialized data exceeds 10MB (indicates bug or corruption)
OSError: If disk is full (ENOSPC) or other I/O error
"""
watch_json = os.path.join(watch_dir, "watch.json")
save_json_atomic(watch_json, watch_dict, label=f"watch {uuid}", max_size_mb=10)
def load_watch_from_file(watch_json, uuid, rehydrate_entity_func):
@@ -218,7 +219,8 @@ def load_watch_from_file(watch_json, uuid, rehydrate_entity_func):
rehydrate_entity_func: Function to convert dict to Watch object
Returns:
Watch object or None if failed
Tuple of (Watch object, raw_data_dict) or (None, None) if failed
The raw_data_dict is needed to compute the hash before rehydration
"""
try:
# Check file size before reading
@@ -231,7 +233,7 @@ def load_watch_from_file(watch_json, uuid, rehydrate_entity_func):
f"File: {watch_json}. This indicates a bug or data corruption. "
f"Watch will be skipped."
)
return None
return None, None
if HAS_ORJSON:
with open(watch_json, 'rb') as f:
@@ -240,9 +242,15 @@ def load_watch_from_file(watch_json, uuid, rehydrate_entity_func):
with open(watch_json, 'r', encoding='utf-8') as f:
watch_data = json.load(f)
# Rehydrate and return watch object
if watch_data.get('time_schedule_limit'):
del watch_data['time_schedule_limit']
if watch_data.get('time_between_check'):
del watch_data['time_between_check']
# Return both the raw data and the rehydrated watch
# Raw data is needed to compute hash before rehydration changes anything
watch_obj = rehydrate_entity_func(uuid, watch_data)
return watch_obj
return watch_obj, watch_data
except json.JSONDecodeError as e:
logger.critical(
@@ -250,7 +258,7 @@ def load_watch_from_file(watch_json, uuid, rehydrate_entity_func):
f"File: {watch_json}. Error: {e}. "
f"Watch will be skipped and may need manual recovery from backup."
)
return None
return None, None
except ValueError as e:
# orjson raises ValueError for invalid JSON
if "invalid json" in str(e).lower() or HAS_ORJSON:
@@ -259,18 +267,18 @@ def load_watch_from_file(watch_json, uuid, rehydrate_entity_func):
f"File: {watch_json}. Error: {e}. "
f"Watch will be skipped and may need manual recovery from backup."
)
return None
return None, None
# Re-raise if it's not a JSON parsing error
raise
except FileNotFoundError:
logger.error(f"Watch file not found: {watch_json} for watch {uuid}")
return None
return None, None
except Exception as e:
logger.error(f"Failed to load watch {uuid} from {watch_json}: {e}")
return None
return None, None
def load_all_watches(datastore_path, rehydrate_entity_func):
def load_all_watches(datastore_path, rehydrate_entity_func, compute_hash_func):
"""
Load all watches from individual watch.json files.
@@ -281,17 +289,21 @@ def load_all_watches(datastore_path, rehydrate_entity_func):
Args:
datastore_path: Path to the datastore directory
rehydrate_entity_func: Function to convert dict to Watch object
compute_hash_func: Function to compute hash from raw watch dict
Returns:
Dictionary of uuid -> Watch object
Tuple of (watching_dict, hashes_dict)
- watching_dict: uuid -> Watch object
- hashes_dict: uuid -> hash string (computed from raw data)
"""
start_time = time.time()
logger.info("Loading watches from individual watch.json files...")
watching = {}
watch_hashes = {}
if not os.path.exists(datastore_path):
return watching
return watching, watch_hashes
# Find all watch.json files using glob (faster than manual directory traversal)
glob_start = time.time()
@@ -307,9 +319,12 @@ def load_all_watches(datastore_path, rehydrate_entity_func):
for watch_json in watch_files:
# Extract UUID from path: /datastore/{uuid}/watch.json
uuid_dir = os.path.basename(os.path.dirname(watch_json))
watch = load_watch_from_file(watch_json, uuid_dir, rehydrate_entity_func)
if watch:
watch, raw_data = load_watch_from_file(watch_json, uuid_dir, rehydrate_entity_func)
if watch and raw_data:
watching[uuid_dir] = watch
# Compute hash from rehydrated Watch object (as dict) to match how we compute on save
# This ensures hash matches what audit will compute from dict(watch)
watch_hashes[uuid_dir] = compute_hash_func(dict(watch))
loaded += 1
if loaded % 100 == 0:
@@ -329,123 +344,7 @@ def load_all_watches(datastore_path, rehydrate_entity_func):
else:
logger.info(f"Loaded {loaded} watches from disk in {elapsed:.2f}s ({loaded/elapsed:.0f} watches/sec)")
return watching
def load_tag_from_file(tag_json, uuid, rehydrate_entity_func):
"""
Load a tag from its JSON file.
Args:
tag_json: Path to the tag.json file
uuid: Tag UUID
rehydrate_entity_func: Function to convert dict to Tag object
Returns:
Tag object or None if failed
"""
try:
# Check file size before reading
file_size = os.path.getsize(tag_json)
MAX_TAG_SIZE = 1 * 1024 * 1024 # 1MB
if file_size > MAX_TAG_SIZE:
logger.critical(
f"CORRUPTED TAG DATA: Tag {uuid} file is unexpectedly large: "
f"{file_size / 1024 / 1024:.2f}MB (max: {MAX_TAG_SIZE / 1024 / 1024}MB). "
f"File: {tag_json}. This indicates a bug or data corruption. "
f"Tag will be skipped."
)
return None
if HAS_ORJSON:
with open(tag_json, 'rb') as f:
tag_data = orjson.loads(f.read())
else:
with open(tag_json, 'r', encoding='utf-8') as f:
tag_data = json.load(f)
tag_data['processor'] = 'restock_diff'
# Rehydrate tag (convert dict to Tag object)
# processor_override is set inside the rehydration function
tag_obj = rehydrate_entity_func(uuid, tag_data)
return tag_obj
except json.JSONDecodeError as e:
logger.critical(
f"CORRUPTED TAG DATA: Failed to parse JSON for tag {uuid}. "
f"File: {tag_json}. Error: {e}. "
f"Tag will be skipped and may need manual recovery from backup."
)
return None
except ValueError as e:
# orjson raises ValueError for invalid JSON
if "invalid json" in str(e).lower() or HAS_ORJSON:
logger.critical(
f"CORRUPTED TAG DATA: Failed to parse JSON for tag {uuid}. "
f"File: {tag_json}. Error: {e}. "
f"Tag will be skipped and may need manual recovery from backup."
)
return None
# Re-raise if it's not a JSON parsing error
raise
except FileNotFoundError:
logger.debug(f"Tag file not found: {tag_json} for tag {uuid}")
return None
except Exception as e:
logger.error(f"Failed to load tag {uuid} from {tag_json}: {e}")
return None
def load_all_tags(datastore_path, rehydrate_entity_func):
"""
Load all tags from individual tag.json files.
Tags are stored separately from settings in {uuid}/tag.json files.
Args:
datastore_path: Path to the datastore directory
rehydrate_entity_func: Function to convert dict to Tag object
Returns:
Dictionary of uuid -> Tag object
"""
logger.info("Loading tags from individual tag.json files...")
tags = {}
if not os.path.exists(datastore_path):
return tags
# Find all tag.json files using glob
tag_files = glob.glob(os.path.join(datastore_path, "*", "tag.json"))
total = len(tag_files)
if total == 0:
logger.debug("No tag.json files found")
return tags
logger.debug(f"Found {total} tag.json files")
loaded = 0
failed = 0
for tag_json in tag_files:
# Extract UUID from path: /datastore/{uuid}/tag.json
uuid_dir = os.path.basename(os.path.dirname(tag_json))
tag = load_tag_from_file(tag_json, uuid_dir, rehydrate_entity_func)
if tag:
tags[uuid_dir] = tag
loaded += 1
else:
# load_tag_from_file already logged the specific error
failed += 1
if failed > 0:
logger.warning(f"Loaded {loaded} tags, {failed} tags FAILED to load")
else:
logger.info(f"Loaded {loaded} tags from disk")
return tags
return watching, watch_hashes
# ============================================================================
@@ -454,20 +353,151 @@ def load_all_tags(datastore_path, rehydrate_entity_func):
class FileSavingDataStore(DataStore):
"""
Abstract datastore that provides file persistence with immediate commits.
Abstract datastore that provides file persistence with change tracking.
Features:
- Individual watch.json files (one per watch)
- Immediate persistence via watch.commit() and datastore.commit()
- Atomic file writes for crash safety
- Dirty tracking: Only saves items that have changed
- Hash-based change detection: Prevents unnecessary writes
- Background save thread: Non-blocking persistence
- Two-tier urgency: Standard (60s) and urgent (immediate) saves
Subclasses must implement:
- rehydrate_entity(): Convert dict to Watch object
- Access to internal __data structure for watch management
"""
needs_write = False
needs_write_urgent = False
stop_thread = False
# Change tracking
_dirty_watches = set() # Watch UUIDs that need saving
_dirty_settings = False # Settings changed
_watch_hashes = {} # UUID -> SHA256 hash for change detection
# Health monitoring
_last_save_time = 0 # Timestamp of last successful save
_last_audit_time = 0 # Timestamp of last audit scan
_save_cycle_count = 0 # Number of save cycles completed
_total_saves = 0 # Total watches saved (lifetime)
_save_errors = 0 # Total save errors (lifetime)
_audit_count = 0 # Number of audit scans completed
_audit_found_changes = 0 # Total unmarked changes found by audits
_audit_shard_index = 0 # Current shard being audited (rolling audit)
def __init__(self):
super().__init__()
self.save_data_thread = None
self._last_save_time = time.time()
self._last_audit_time = time.time()
def mark_watch_dirty(self, uuid):
"""
Mark a watch as needing save.
Args:
uuid: Watch UUID
"""
with self.lock:
self._dirty_watches.add(uuid)
dirty_count = len(self._dirty_watches)
# Backpressure detection - warn if dirty set grows too large
if dirty_count > 1000:
logger.critical(
f"BACKPRESSURE WARNING: {dirty_count} watches pending save! "
f"Save thread may not be keeping up with write rate. "
f"This could indicate disk I/O bottleneck or save thread failure."
)
elif dirty_count > 500:
logger.warning(
f"Dirty watch count high: {dirty_count} watches pending save. "
f"Monitoring for potential backpressure."
)
self.needs_write = True
def mark_settings_dirty(self):
"""Mark settings as needing save."""
with self.lock:
self._dirty_settings = True
self.needs_write = True
def _compute_hash(self, watch_dict):
"""
Compute SHA256 hash of watch for change detection.
Args:
watch_dict: Dictionary representation of watch
Returns:
Hex string of SHA256 hash
"""
# Use orjson for deterministic serialization if available
if HAS_ORJSON:
json_bytes = orjson.dumps(watch_dict, option=orjson.OPT_SORT_KEYS)
else:
json_str = json.dumps(watch_dict, sort_keys=True, ensure_ascii=False)
json_bytes = json_str.encode('utf-8')
return hashlib.sha256(json_bytes).hexdigest()
def save_watch(self, uuid, force=False, watch_dict=None, current_hash=None):
"""
Save a single watch if it has changed (polymorphic method).
Args:
uuid: Watch UUID
force: If True, skip hash check and save anyway
watch_dict: Pre-computed watch dictionary (optimization)
current_hash: Pre-computed hash (optimization)
Returns:
True if saved, False if skipped (unchanged)
"""
if not self._watch_exists(uuid):
logger.warning(f"Cannot save watch {uuid} - does not exist")
return False
# Get watch dict if not provided
if watch_dict is None:
watch_dict = self._get_watch_dict(uuid)
# Compute hash if not provided
if current_hash is None:
current_hash = self._compute_hash(watch_dict)
# Skip save if unchanged (unless forced)
if not force and current_hash == self._watch_hashes.get(uuid):
return False
try:
self._save_watch(uuid, watch_dict)
self._watch_hashes[uuid] = current_hash
logger.debug(f"Saved watch {uuid}")
return True
except Exception as e:
logger.error(f"Failed to save watch {uuid}: {e}")
raise
def _save_watch(self, uuid, watch_dict):
"""
Save a single watch to storage (polymorphic).
Backend-specific implementation. Subclasses override for different storage:
- File backend: Writes to {uuid}/watch.json
- Redis backend: SET watch:{uuid}
- SQL backend: UPDATE watches WHERE uuid=?
Args:
uuid: Watch UUID
watch_dict: Dictionary representation of watch
"""
# Default file implementation
watch_dir = os.path.join(self.datastore_path, uuid)
save_watch_atomic(watch_dir, uuid, watch_dict)
def _save_settings(self):
"""
@@ -480,7 +510,6 @@ class FileSavingDataStore(DataStore):
"""
raise NotImplementedError("Subclass must implement _save_settings")
def _load_watches(self):
"""
Load all watches from storage (polymorphic).
@@ -506,4 +535,364 @@ class FileSavingDataStore(DataStore):
"""
raise NotImplementedError("Subclass must implement _delete_watch")
def _save_dirty_items(self):
"""
Save dirty watches and settings.
This is the core optimization: instead of saving the entire datastore,
we only save watches that were marked dirty and settings if changed.
"""
start_time = time.time()
# Capture dirty sets under lock
with self.lock:
dirty_watches = list(self._dirty_watches)
dirty_settings = self._dirty_settings
self._dirty_watches.clear()
self._dirty_settings = False
if not dirty_watches and not dirty_settings:
return
logger.trace(f"Saving {len(dirty_watches)} dirty watches, settings_dirty={dirty_settings}")
# Save each dirty watch using the polymorphic save method
saved_count = 0
error_count = 0
skipped_unchanged = 0
# Process in batches of 50, using thread pool for parallel saves
BATCH_SIZE = 50
MAX_WORKERS = 20 # Number of parallel save threads
def save_single_watch(uuid):
"""Helper function for thread pool execution."""
try:
# Check if watch still exists (might have been deleted)
if not self._watch_exists(uuid):
# Watch was deleted, remove hash
self._watch_hashes.pop(uuid, None)
return {'status': 'deleted', 'uuid': uuid}
# Pre-check hash to avoid unnecessary save_watch() calls
watch_dict = self._get_watch_dict(uuid)
current_hash = self._compute_hash(watch_dict)
if current_hash == self._watch_hashes.get(uuid):
# Watch hasn't actually changed, skip
return {'status': 'unchanged', 'uuid': uuid}
# Pass pre-computed values to avoid redundant serialization/hashing
if self.save_watch(uuid, force=True, watch_dict=watch_dict, current_hash=current_hash):
return {'status': 'saved', 'uuid': uuid}
else:
return {'status': 'skipped', 'uuid': uuid}
except Exception as e:
logger.error(f"Error saving watch {uuid}: {e}")
return {'status': 'error', 'uuid': uuid, 'error': e}
# Process dirty watches in batches
for batch_start in range(0, len(dirty_watches), BATCH_SIZE):
batch = dirty_watches[batch_start:batch_start + BATCH_SIZE]
batch_num = (batch_start // BATCH_SIZE) + 1
total_batches = (len(dirty_watches) + BATCH_SIZE - 1) // BATCH_SIZE
if len(dirty_watches) > BATCH_SIZE:
logger.trace(f"Save batch {batch_num}/{total_batches} ({len(batch)} watches)")
# Use thread pool to save watches in parallel
with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
# Submit all save tasks
future_to_uuid = {executor.submit(save_single_watch, uuid): uuid for uuid in batch}
# Collect results as they complete
for future in as_completed(future_to_uuid):
result = future.result()
status = result['status']
if status == 'saved':
saved_count += 1
elif status == 'unchanged':
skipped_unchanged += 1
elif status == 'error':
error_count += 1
# Re-mark for retry
with self.lock:
self._dirty_watches.add(result['uuid'])
# 'deleted' and 'skipped' don't need special handling
# Save settings if changed
if dirty_settings:
try:
self._save_settings()
logger.debug("Saved settings")
except Exception as e:
logger.error(f"Failed to save settings: {e}")
error_count += 1
with self.lock:
self._dirty_settings = True
# Update metrics
elapsed = time.time() - start_time
self._save_cycle_count += 1
self._total_saves += saved_count
self._save_errors += error_count
self._last_save_time = time.time()
# Log performance metrics
if saved_count > 0:
avg_time_per_watch = (elapsed / saved_count) * 1000 # milliseconds
skipped_msg = f", {skipped_unchanged} unchanged" if skipped_unchanged > 0 else ""
parallel_msg = f" [parallel: {MAX_WORKERS} workers]" if saved_count > 1 else ""
logger.info(
f"Successfully saved {saved_count} watches in {elapsed:.2f}s "
f"(avg {avg_time_per_watch:.1f}ms per watch{skipped_msg}){parallel_msg}. "
f"Total: {self._total_saves} saves, {self._save_errors} errors (lifetime)"
)
elif skipped_unchanged > 0:
logger.debug(f"Save cycle: {skipped_unchanged} watches verified unchanged (hash match), nothing saved")
if error_count > 0:
logger.error(f"Save cycle completed with {error_count} errors")
self.needs_write = False
self.needs_write_urgent = False
def _watch_exists(self, uuid):
"""
Check if watch exists. Subclass must implement.
Args:
uuid: Watch UUID
Returns:
bool
"""
raise NotImplementedError("Subclass must implement _watch_exists")
def _get_watch_dict(self, uuid):
"""
Get watch as dictionary. Subclass must implement.
Args:
uuid: Watch UUID
Returns:
Dictionary representation of watch
"""
raise NotImplementedError("Subclass must implement _get_watch_dict")
def _audit_all_watches(self):
"""
Rolling audit: Scans a fraction of watches to detect unmarked changes.
Instead of scanning ALL watches at once, this scans 1/N shards per cycle.
The shard rotates each cycle, completing a full audit every N cycles.
Handles dynamic watch count - recalculates shard boundaries each cycle,
so newly added watches will be audited in subsequent cycles.
Benefits:
- Lower CPU per cycle (56k / 5 = ~11k watches vs all 56k)
- More frequent audits overall (every 50s vs every 10s)
- Spreads load evenly across time
"""
audit_start = time.time()
# Get list of all watch UUIDs (read-only, no lock needed)
try:
all_uuids = list(self.data['watching'].keys())
except (KeyError, AttributeError, RuntimeError):
# Data structure not ready or being modified
return
if not all_uuids:
return
total_watches = len(all_uuids)
# Calculate this cycle's shard boundaries
# Example: 56,278 watches / 5 shards = 11,255 watches per shard
# Shard 0: [0:11255], Shard 1: [11255:22510], etc.
shard_size = (total_watches + DATASTORE_AUDIT_SHARDS - 1) // DATASTORE_AUDIT_SHARDS
start_idx = self._audit_shard_index * shard_size
end_idx = min(start_idx + shard_size, total_watches)
# Handle wrap-around (shouldn't happen normally, but defensive)
if start_idx >= total_watches:
self._audit_shard_index = 0
start_idx = 0
end_idx = min(shard_size, total_watches)
# Audit only this shard's watches
shard_uuids = all_uuids[start_idx:end_idx]
changes_found = 0
errors = 0
for uuid in shard_uuids:
try:
# Get current watch dict and compute hash
watch_dict = self._get_watch_dict(uuid)
current_hash = self._compute_hash(watch_dict)
stored_hash = self._watch_hashes.get(uuid)
# If hash changed and not already marked dirty, mark it
if current_hash != stored_hash:
with self.lock:
if uuid not in self._dirty_watches:
self._dirty_watches.add(uuid)
changes_found += 1
logger.warning(
f"Audit detected unmarked change in watch {uuid[:8]}... current {current_hash:8} stored hash {stored_hash[:8]}"
f"(hash changed but not marked dirty)"
)
self.needs_write = True
except Exception as e:
errors += 1
logger.trace(f"Audit error for watch {uuid[:8]}...: {e}")
audit_elapsed = (time.time() - audit_start) * 1000 # milliseconds
# Advance to next shard (wrap around after last shard)
self._audit_shard_index = (self._audit_shard_index + 1) % DATASTORE_AUDIT_SHARDS
# Update metrics
self._audit_count += 1
self._audit_found_changes += changes_found
self._last_audit_time = time.time()
if changes_found > 0:
logger.warning(
f"Audit shard {self._audit_shard_index}/{DATASTORE_AUDIT_SHARDS} found {changes_found} "
f"unmarked changes in {len(shard_uuids)}/{total_watches} watches ({audit_elapsed:.1f}ms)"
)
else:
logger.trace(
f"Audit shard {self._audit_shard_index}/{DATASTORE_AUDIT_SHARDS}: "
f"{len(shard_uuids)}/{total_watches} watches checked, 0 changes ({audit_elapsed:.1f}ms)"
)
def save_datastore(self):
"""
Background thread that periodically saves dirty items and audits watches.
Runs two independent cycles:
1. Save dirty items every DATASTORE_SCAN_DIRTY_SAVE_INTERVAL_SECONDS (default 10s)
2. Rolling audit: every DATASTORE_AUDIT_INTERVAL_SECONDS (default 10s)
- Scans 1/DATASTORE_AUDIT_SHARDS watches per cycle (default 1/5)
- Full audit completes every 50s (10s × 5 shards)
- Automatically handles new/deleted watches
Uses 0.5s sleep intervals for responsiveness to urgent saves.
"""
while True:
if self.stop_thread:
# Graceful shutdown: flush any remaining dirty items before stopping
if self.needs_write or self._dirty_watches or self._dirty_settings:
logger.warning("Datastore save thread stopping - flushing remaining dirty items...")
try:
self._save_dirty_items()
logger.info("Graceful shutdown complete - all data saved")
except Exception as e:
logger.critical(f"FAILED to save dirty items during shutdown: {e}")
else:
logger.info("Datastore save thread stopping - no dirty items")
return
# Check if it's time to run audit scan (every N seconds)
if time.time() - self._last_audit_time >= DATASTORE_AUDIT_INTERVAL_SECONDS:
try:
self._audit_all_watches()
except Exception as e:
logger.error(f"Error in audit cycle: {e}")
# Save dirty items if needed
if self.needs_write or self.needs_write_urgent:
try:
self._save_dirty_items()
except Exception as e:
logger.error(f"Error in save cycle: {e}")
# Timer with early break for urgent saves
# Each iteration is 0.5 seconds, so iterations = DATASTORE_SCAN_DIRTY_SAVE_INTERVAL_SECONDS * 2
for i in range(DATASTORE_SCAN_DIRTY_SAVE_INTERVAL_SECONDS * 2):
time.sleep(0.5)
if self.stop_thread or self.needs_write_urgent:
break
def start_save_thread(self):
"""Start the background save thread."""
if not self.save_data_thread or not self.save_data_thread.is_alive():
self.save_data_thread = Thread(target=self.save_datastore, daemon=True, name="DatastoreSaver")
self.save_data_thread.start()
logger.info("Datastore save thread started")
def force_save_all(self):
"""
Force immediate synchronous save of all changes to storage.
File backend implementation of the abstract force_save_all() method.
Marks all watches and settings as dirty, then saves immediately.
Used by:
- Backup creation (ensure everything is saved before backup)
- Shutdown (ensure all changes are persisted)
- Manual save operations
"""
logger.info("Force saving all data to storage...")
# Mark everything as dirty to ensure complete save
for uuid in self.data['watching'].keys():
self.mark_watch_dirty(uuid)
self.mark_settings_dirty()
# Save immediately (synchronous)
self._save_dirty_items()
logger.success("All data saved to storage")
def get_health_status(self):
"""
Get datastore health status for monitoring.
Returns:
dict with health metrics and status
"""
now = time.time()
time_since_last_save = now - self._last_save_time
with self.lock:
dirty_count = len(self._dirty_watches)
is_thread_alive = self.save_data_thread and self.save_data_thread.is_alive()
# Determine health status
if not is_thread_alive:
status = "CRITICAL"
message = "Save thread is DEAD"
elif time_since_last_save > 300: # 5 minutes
status = "WARNING"
message = f"No save activity for {time_since_last_save:.0f}s"
elif dirty_count > 1000:
status = "WARNING"
message = f"High backpressure: {dirty_count} watches pending"
elif self._save_errors > 0 and (self._save_errors / max(self._total_saves, 1)) > 0.01:
status = "WARNING"
message = f"High error rate: {self._save_errors} errors"
else:
status = "HEALTHY"
message = "Operating normally"
return {
"status": status,
"message": message,
"thread_alive": is_thread_alive,
"dirty_watches": dirty_count,
"dirty_settings": self._dirty_settings,
"last_save_seconds_ago": int(time_since_last_save),
"save_cycles": self._save_cycle_count,
"total_saves": self._total_saves,
"total_errors": self._save_errors,
"error_rate_percent": round((self._save_errors / max(self._total_saves, 1)) * 100, 2)
}
+66
View File
@@ -0,0 +1,66 @@
"""
Legacy format loader for url-watches.json.
Provides functions to detect and load from the legacy monolithic JSON format.
Used during migration (update_26) to transition to individual watch.json files.
"""
import os
import json
from loguru import logger
# Try to import orjson for faster JSON serialization
try:
import orjson
HAS_ORJSON = True
except ImportError:
HAS_ORJSON = False
def has_legacy_datastore(datastore_path):
"""
Check if a legacy url-watches.json file exists.
This is used by update_26 to determine if migration is needed.
Args:
datastore_path: Path to datastore directory
Returns:
bool: True if url-watches.json exists
"""
url_watches_json = os.path.join(datastore_path, "url-watches.json")
return os.path.exists(url_watches_json)
def load_legacy_format(json_store_path):
"""
Load datastore from legacy url-watches.json format.
Args:
json_store_path: Full path to url-watches.json file
Returns:
dict: Loaded datastore data with 'watching', 'settings', etc.
None: If file doesn't exist or loading failed
"""
logger.info(f"Loading from legacy format: {json_store_path}")
if not os.path.isfile(json_store_path):
logger.warning(f"Legacy file not found: {json_store_path}")
return None
try:
if HAS_ORJSON:
with open(json_store_path, 'rb') as f:
data = orjson.loads(f.read())
else:
with open(json_store_path, 'r', encoding='utf-8') as f:
data = json.load(f)
logger.info(f"Loaded {len(data.get('watching', {}))} watches from legacy format")
return data
except Exception as e:
logger.error(f"Failed to load legacy format: {e}")
return None
+81 -120
View File
@@ -16,18 +16,12 @@ import time
from loguru import logger
from copy import deepcopy
# Try to import orjson for faster JSON serialization
try:
import orjson
HAS_ORJSON = True
except ImportError:
HAS_ORJSON = False
from ..html_tools import TRANSLATE_WHITESPACE_TABLE
from ..processors.restock_diff import Restock
from ..blueprint.rss import RSS_CONTENT_FORMAT_DEFAULT
from ..model import USE_SYSTEM_DEFAULT_NOTIFICATION_FORMAT_FOR_WATCH
from .file_saving_datastore import save_watch_atomic
def create_backup_tarball(datastore_path, update_number):
"""
@@ -35,7 +29,6 @@ def create_backup_tarball(datastore_path, update_number):
Includes:
- All {uuid}/watch.json files
- All {uuid}/tag.json files
- changedetection.json (settings, if it exists)
- url-watches.json (legacy format, if it exists)
- Directory structure preserved
@@ -51,7 +44,7 @@ def create_backup_tarball(datastore_path, update_number):
To restore from a backup:
cd /path/to/datastore
tar -xzf before-update-N-timestamp.tar.gz
This will restore all watch.json and tag.json files and settings to their pre-update state.
This will restore all watch.json files and settings to their pre-update state.
"""
timestamp = int(time.time())
backup_filename = f"before-update-{update_number}-{timestamp}.tar.gz"
@@ -73,10 +66,9 @@ def create_backup_tarball(datastore_path, update_number):
tar.add(url_watches_json, arcname="url-watches.json")
logger.debug("Added url-watches.json to backup")
# Backup all watch/tag directories with their JSON files
# Backup all watch directories with their watch.json files
# This preserves the UUID directory structure
watch_count = 0
tag_count = 0
for entry in os.listdir(datastore_path):
entry_path = os.path.join(datastore_path, entry)
@@ -88,22 +80,17 @@ def create_backup_tarball(datastore_path, update_number):
if entry.startswith('.') or entry.startswith('before-update-'):
continue
# Backup watch.json if exists
# Check if this directory has a watch.json (indicates it's a watch UUID directory)
watch_json = os.path.join(entry_path, "watch.json")
if os.path.isfile(watch_json):
# Add the watch.json file preserving directory structure
tar.add(watch_json, arcname=f"{entry}/watch.json")
watch_count += 1
if watch_count % 100 == 0:
logger.debug(f"Backed up {watch_count} watch.json files...")
# Backup tag.json if exists
tag_json = os.path.join(entry_path, "tag.json")
if os.path.isfile(tag_json):
tar.add(tag_json, arcname=f"{entry}/tag.json")
tag_count += 1
logger.success(f"Backup created: {backup_filename} ({watch_count} watches from disk, {tag_count} tags from disk)")
logger.success(f"Backup created: {backup_filename} ({watch_count} watches)")
return backup_path
except Exception as e:
@@ -143,7 +130,6 @@ class DatastoreUpdatesMixin:
return updates_available
def run_updates(self, current_schema_version=None):
import sys
"""
Run all pending schema updates sequentially.
@@ -161,29 +147,12 @@ class DatastoreUpdatesMixin:
2. For each update > current schema version:
- Create backup of datastore
- Run update method
- Update schema version and commit settings
- Commit all watches and tags
- Update schema version
- Mark settings and watches dirty
3. If any update fails, stop processing
4. All changes saved via individual .commit() calls
4. Save all changes immediately
"""
updates_available = self.get_updates_available()
if self.data.get('watching'):
test_watch = self.data['watching'].get(next(iter(self.data.get('watching', {}))))
from ..model.Watch import model
if not isinstance(test_watch, model):
import sys
logger.critical("Cannot run updates! Watch structure must be re-hydrated back to a Watch model object!")
sys.exit(1)
if self.data['settings']['application'].get('tags',{}):
test_tag = self.data['settings']['application'].get('tags',{}).get(next(iter(self.data['settings']['application'].get('tags',{}))))
from ..model.Tag import model as tag_model
if not isinstance(test_tag, tag_model):
import sys
logger.critical("Cannot run updates! Watch tag/group structure must be re-hydrated back to a Tag model object!")
sys.exit(1)
# Determine current schema version
if current_schema_version is None:
@@ -199,7 +168,7 @@ class DatastoreUpdatesMixin:
latest_update = updates_available[-1] if updates_available else 0
logger.info(f"No schema version found and no watches exist - assuming fresh install, setting schema_version to {latest_update}")
self.data['settings']['application']['schema_version'] = latest_update
self.commit()
self.mark_settings_dirty()
return # No updates needed for fresh install
else:
# Has watches but no schema version - likely old datastore, run all updates
@@ -225,15 +194,31 @@ class DatastoreUpdatesMixin:
try:
update_method = getattr(self, f"update_{update_n}")()
except Exception as e:
logger.critical(f"Error while trying update_{update_n}")
logger.exception(e)
sys.exit(1)
logger.error(f"Error while trying update_{update_n}")
logger.error(e)
# Don't run any more updates
return
else:
# Bump the version
# Bump the version, important
self.data['settings']['application']['schema_version'] = update_n
self.commit()
self.mark_settings_dirty()
logger.success(f"Update {update_n} completed")
# CRITICAL: Mark all watches as dirty so changes are persisted
# Most updates modify watches, and in the new individual watch.json structure,
# we need to ensure those changes are saved
logger.info(f"Marking all {len(self.data['watching'])} watches as dirty after update_{update_n} (so that it saves them to disk)")
for uuid in self.data['watching'].keys():
self.mark_watch_dirty(uuid)
# Save changes immediately after each update (more resilient than batching)
logger.critical(f"Saving all changes after update_{update_n}")
try:
self._save_dirty_items()
logger.success(f"Update {update_n} changes saved successfully")
except Exception as e:
logger.error(f"Failed to save update_{update_n} changes: {e}")
# Don't raise - update already ran, but changes might not be persisted
# The update will try to run again on next startup
# Track which updates ran
updates_ran.append(update_n)
@@ -483,14 +468,6 @@ class DatastoreUpdatesMixin:
del self.data['watching'][uuid]['extract_title_as_title']
if self.data['settings']['application'].get('extract_title_as_title'):
# Ensure 'ui' key exists (defensive for edge cases where base_config merge didn't happen)
if 'ui' not in self.data['settings']['application']:
self.data['settings']['application']['ui'] = {
'use_page_title_in_list': True,
'open_diff_in_new_tab': True,
'socket_io_enabled': True,
'favicons_enabled': True
}
self.data['settings']['application']['ui']['use_page_title_in_list'] = self.data['settings']['application'].get('extract_title_as_title')
def update_21(self):
@@ -578,6 +555,27 @@ class DatastoreUpdatesMixin:
logger.critical("COPY-based migration: url-watches.json will remain intact for rollback")
logger.critical("=" * 80)
# Check if already migrated
changedetection_json = os.path.join(self.datastore_path, "changedetection.json")
if os.path.exists(changedetection_json):
logger.info("Migration already completed (changedetection.json exists), skipping")
return
# Check if we need to load legacy data
from .legacy_loader import has_legacy_datastore, load_legacy_format
if not has_legacy_datastore(self.datastore_path):
logger.info("No legacy datastore found, nothing to migrate")
return
# Load legacy data from url-watches.json
logger.critical("Loading legacy datastore from url-watches.json...")
legacy_path = os.path.join(self.datastore_path, "url-watches.json")
legacy_data = load_legacy_format(legacy_path)
if not legacy_data:
raise Exception("Failed to load legacy datastore from url-watches.json")
# Populate settings from legacy data
logger.info("Populating settings from legacy data...")
watch_count = len(self.data['watching'])
@@ -589,7 +587,9 @@ class DatastoreUpdatesMixin:
saved_count = 0
for uuid, watch in self.data['watching'].items():
try:
watch.commit()
watch_dict = dict(watch)
watch_dir = os.path.join(self.datastore_path, uuid)
save_watch_atomic(watch_dir, uuid, watch_dict)
saved_count += 1
if saved_count % 100 == 0:
@@ -635,20 +635,36 @@ class DatastoreUpdatesMixin:
# Phase 4: Verify settings file exists
logger.critical("Phase 4/4: Verifying changedetection.json exists...")
changedetection_json_new_schema=os.path.join(self.datastore_path, "changedetection.json")
if not os.path.isfile(changedetection_json_new_schema):
import sys
logger.critical("Migration failed, changedetection.json not found after update ran!")
sys.exit(1)
if not os.path.isfile(changedetection_json):
raise Exception(
"Migration failed: changedetection.json not found after save. "
"url-watches.json remains intact, safe to retry."
)
logger.critical("Phase 4 complete: Verified changedetection.json exists")
# Success! Now reload from new format
logger.critical("Reloading datastore from new format...")
# write it to disk, it will be saved without ['watching'] in the JSON db because we find it from disk glob
self._save_settings()
self._load_state() # Includes load_watches
logger.success("Datastore reloaded from new format successfully")
# Verify all watches have hashes after migration
missing_hashes = [uuid for uuid in self.data['watching'].keys() if uuid not in self._watch_hashes]
if missing_hashes:
logger.error(f"WARNING: {len(missing_hashes)} watches missing hashes after migration: {missing_hashes[:5]}")
else:
logger.success(f"All {len(self.data['watching'])} watches have valid hashes after migration")
# Set schema version to latest available update
# This prevents re-running updates and re-marking all watches as dirty
updates_available = self.get_updates_available()
latest_schema = updates_available[-1] if updates_available else 26
self.data['settings']['application']['schema_version'] = latest_schema
self.mark_settings_dirty()
logger.info(f"Set schema_version to {latest_schema} (migration complete, all watches already saved)")
logger.critical("=" * 80)
logger.critical("MIGRATION COMPLETED SUCCESSFULLY!")
logger.critical("=" * 80)
@@ -667,59 +683,4 @@ class DatastoreUpdatesMixin:
logger.info("")
def update_26(self):
self.migrate_legacy_db_format()
def update_28(self):
"""
Migrate tags to individual tag.json files.
Tags are currently saved only in changedetection.json (settings).
This migration ALSO saves them to individual {uuid}/tag.json files,
similar to how watches are stored (dual storage).
Benefits:
- Allows atomic tag updates without rewriting entire settings
- Enables independent tag versioning/backup
- Maintains backwards compatibility (tags stay in settings too)
"""
# Force save as tag.json (not watch.json) even if object is corrupted
logger.critical("=" * 80)
logger.critical("Running migration: Individual tag persistence (update_28)")
logger.critical("Creating individual tag.json files")
logger.critical("=" * 80)
tags = self.data['settings']['application'].get('tags', {})
tag_count = len(tags)
if tag_count == 0:
logger.info("No tags found, skipping migration")
return
logger.info(f"Migrating {tag_count} tags to individual tag.json files...")
saved_count = 0
failed_count = 0
for uuid, tag_data in tags.items():
try:
tag_data.commit()
saved_count += 1
if saved_count % 10 == 0:
logger.info(f" Progress: {saved_count}/{tag_count} tags migrated...")
except Exception as e:
logger.error(f"Failed to save tag {uuid} ({tag_data.get('title', 'unknown')}): {e}")
failed_count += 1
if failed_count > 0:
logger.warning(f"Migration complete: {saved_count} tags saved, {failed_count} tags FAILED")
else:
logger.success(f"Migration complete: {saved_count} tags saved to individual tag.json files")
# Tags remain in settings for backwards compatibility AND easy access
# On next load, _load_tags() will read from tag.json files and merge with settings
logger.info("Tags saved to both settings AND individual tag.json files")
logger.info("Future tag edits will update both locations (dual storage)")
logger.critical("=" * 80)
self.migrate_legacy_db_format()
+7 -157
View File
@@ -9,11 +9,6 @@ from changedetectionio import store
import os
import sys
# CRITICAL: Set short timeout for tests to prevent 45-second hangs
# When test server is slow/unresponsive, workers fail fast instead of holding UUIDs for 45s
# This prevents exponential priority growth from repeated deferrals (priority × 10 each defer)
os.environ['DEFAULT_SETTINGS_REQUESTS_TIMEOUT'] = '5'
from changedetectionio.flask_app import init_app_secret, changedetection_app
from changedetectionio.tests.util import live_server_setup, new_live_server_setup
@@ -34,93 +29,6 @@ def reportlog(pytestconfig):
logger.remove(handler_id)
@pytest.fixture(autouse=True)
def per_test_log_file(request):
"""Create a separate log file for each test function with pytest output."""
import re
# Create logs directory if it doesn't exist
log_dir = os.path.join(os.path.dirname(__file__), "logs")
os.makedirs(log_dir, exist_ok=True)
# Generate log filename from test name and worker ID (for parallel runs)
test_name = request.node.name
# Sanitize test name - replace unsafe characters with underscores
# Keep only alphanumeric, dash, underscore, and period
safe_test_name = re.sub(r'[^\w\-.]', '_', test_name)
# Limit length to avoid filesystem issues (max 200 chars)
if len(safe_test_name) > 200:
# Keep first 150 chars + hash of full name + last 30 chars
import hashlib
name_hash = hashlib.md5(test_name.encode()).hexdigest()[:8]
safe_test_name = f"{safe_test_name[:150]}_{name_hash}_{safe_test_name[-30:]}"
worker_id = os.environ.get('PYTEST_XDIST_WORKER', 'master')
log_file = os.path.join(log_dir, f"{safe_test_name}_{worker_id}.log")
# Add file handler for this test with TRACE level
handler_id = logger.add(
log_file,
format="{time:YYYY-MM-DD HH:mm:ss.SSS} | {level: <8} | {process} | {name}:{function}:{line} - {message}",
level="TRACE",
mode="w", # Overwrite if exists
enqueue=True # Thread-safe
)
logger.info(f"=== Starting test: {test_name} (worker: {worker_id}) ===")
logger.info(f"Test location: {request.node.nodeid}")
yield
# Capture test outcome (PASSED/FAILED/SKIPPED/ERROR)
outcome = "UNKNOWN"
exc_info = None
stdout = None
stderr = None
if hasattr(request.node, 'rep_call'):
outcome = request.node.rep_call.outcome.upper()
if request.node.rep_call.failed:
exc_info = request.node.rep_call.longreprtext
# Capture stdout/stderr from call phase
if hasattr(request.node.rep_call, 'sections'):
for section_name, section_content in request.node.rep_call.sections:
if 'stdout' in section_name.lower():
stdout = section_content
elif 'stderr' in section_name.lower():
stderr = section_content
elif hasattr(request.node, 'rep_setup'):
if request.node.rep_setup.failed:
outcome = "SETUP_FAILED"
exc_info = request.node.rep_setup.longreprtext
logger.info(f"=== Test Result: {outcome} ===")
if exc_info:
logger.error(f"=== Test Failure Details ===\n{exc_info}")
if stdout:
logger.info(f"=== Captured stdout ===\n{stdout}")
if stderr:
logger.warning(f"=== Captured stderr ===\n{stderr}")
logger.info(f"=== Finished test: {test_name} ===")
logger.remove(handler_id)
@pytest.hookimpl(tryfirst=True, hookwrapper=True)
def pytest_runtest_makereport(item, call):
"""Hook to capture test results and attach to the test node."""
outcome = yield
rep = outcome.get_result()
# Store report on the test node for access in fixtures
setattr(item, f"rep_{rep.when}", rep)
@pytest.fixture
def environment(mocker):
"""Mock arrow.now() to return a fixed datetime for testing jinja2 time extension."""
@@ -257,56 +165,9 @@ def prepare_test_function(live_server, datastore_path):
except:
break
# Add test helper methods to the app for worker management
def set_workers(count):
"""Set the number of workers for testing - brutal shutdown, no delays"""
from changedetectionio import worker_pool
from changedetectionio.flask_app import update_q, notification_q
current_count = worker_pool.get_worker_count()
# Special case: Setting to 0 means shutdown all workers brutally
if count == 0:
logger.debug(f"Brutally shutting down all {current_count} workers")
worker_pool.shutdown_workers()
return {
'status': 'success',
'message': f'Shutdown all {current_count} workers',
'previous_count': current_count,
'current_count': 0
}
# Adjust worker count (no delays, no verification)
result = worker_pool.adjust_async_worker_count(
count,
update_q=update_q,
notification_q=notification_q,
app=live_server.app,
datastore=datastore
)
return result
def check_all_workers_alive(expected_count):
"""Check that all expected workers are alive"""
from changedetectionio import worker_pool
from changedetectionio.flask_app import update_q, notification_q
result = worker_pool.check_worker_health(
expected_count,
update_q=update_q,
notification_q=notification_q,
app=live_server.app,
datastore=datastore
)
assert result['status'] == 'healthy', f"Workers not healthy: {result['message']}"
return result
# Attach helper methods to app for easy test access
live_server.app.set_workers = set_workers
live_server.app.check_all_workers_alive = check_all_workers_alive
# Prevent background thread from writing during cleanup/reload
datastore.needs_write = False
datastore.needs_write_urgent = False
# CRITICAL: Clean up any files from previous tests
# This ensures a completely clean directory
@@ -331,7 +192,6 @@ def prepare_test_function(live_server, datastore_path):
# Cleanup: Clear watches and queue after test
try:
from changedetectionio.flask_app import update_q
from pathlib import Path
# Clear the queue to prevent leakage to next test
while not update_q.empty():
@@ -341,18 +201,7 @@ def prepare_test_function(live_server, datastore_path):
break
datastore.data['watching'] = {}
# Delete any old watch metadata JSON files
base_path = Path(datastore.datastore_path).resolve()
max_depth = 2
for file in base_path.rglob("*.json"):
# Calculate depth relative to base path
depth = len(file.relative_to(base_path).parts) - 1
if depth <= max_depth and file.is_file():
file.unlink()
datastore.needs_write = True
except Exception as e:
logger.warning(f"Error during datastore cleanup: {e}")
@@ -413,8 +262,8 @@ def app(request, datastore_path):
# Shutdown workers gracefully before loguru cleanup
try:
from changedetectionio import worker_pool
worker_pool.shutdown_workers()
from changedetectionio import worker_handler
worker_handler.shutdown_workers()
except Exception:
pass
@@ -462,3 +311,4 @@ def app(request, datastore_path):
yield app
@@ -1,41 +0,0 @@
import time
from flask import url_for
from changedetectionio.tests.util import wait_for_all_checks
def test_check_plugin_processor(client, live_server, measure_memory_usage, datastore_path):
# requires os-int intelligence plugin installed (first basic one we test with)
res = client.get(url_for("watchlist.index"))
assert b'OSINT Reconnaissance' in res.data, "Must have the OSINT plugin installed at test time"
assert b'<input checked id="processor-0" name="processor" type="radio" value="text_json_diff">' in res.data, "But the first text_json_diff processor should always be selected by default in quick watch form"
res = client.post(
url_for("ui.ui_views.form_quick_watch_add"),
data={"url": 'http://127.0.0.1', "tags": '', 'processor': 'osint_recon'},
follow_redirects=True
)
assert b"Watch added" in res.data
client.get(url_for("ui.form_watch_checknow"), follow_redirects=True)
wait_for_all_checks(client)
res = client.get(
url_for("ui.ui_preview.preview_page", uuid="first"),
follow_redirects=True
)
assert b'Target: http://127.0.0.1' in res.data
assert b'DNSKEY Records' in res.data
wait_for_all_checks(client)
# Now change it to something that doesnt exist
uuid = next(iter(live_server.app.config['DATASTORE'].data['watching']))
live_server.app.config['DATASTORE'].data['watching'][uuid]['processor'] = "now_missing"
client.get(url_for("ui.form_watch_checknow"), follow_redirects=True)
wait_for_all_checks(client)
res = client.get(url_for("watchlist.index"))
assert b"Exception: Processor module" in res.data and b'now_missing' in res.data, f'Should register that the plugin is missing for {uuid}'
+3 -445
View File
@@ -328,68 +328,6 @@ def test_api_simple(client, live_server, measure_memory_usage, datastore_path):
)
assert len(res.json) == 0, "Watch list should be empty"
def test_roundtrip_API(client, live_server, measure_memory_usage, datastore_path):
"""
Test the full round trip, this way we test the default Model fits back into OpenAPI spec
:param client:
:param live_server:
:param measure_memory_usage:
:param datastore_path:
:return:
"""
api_key = live_server.app.config['DATASTORE'].data['settings']['application'].get('api_access_token')
set_original_response(datastore_path=datastore_path)
test_url = url_for('test_endpoint', _external=True)
# Create new
res = client.post(
url_for("createwatch"),
data=json.dumps({"url": test_url}),
headers={'content-type': 'application/json', 'x-api-key': api_key},
follow_redirects=True
)
assert res.status_code == 201
uuid = res.json.get('uuid')
# Now fetch it and send it back
res = client.get(
url_for("watch", uuid=uuid),
headers={'x-api-key': api_key}
)
watch=res.json
# Be sure that 'readOnly' values are never updated in the real watch
watch['last_changed'] = 454444444444
watch['date_created'] = 454444444444
# HTTP PUT ( UPDATE an existing watch )
res = client.put(
url_for("watch", uuid=uuid),
headers={'x-api-key': api_key, 'content-type': 'application/json'},
data=json.dumps(watch),
)
if res.status_code != 200:
print(f"\n=== PUT failed with {res.status_code} ===")
print(f"Error: {res.data}")
assert res.status_code == 200, "HTTP PUT update was sent OK"
res = client.get(
url_for("watch", uuid=uuid),
headers={'x-api-key': api_key}
)
last_changed = res.json.get('last_changed')
assert last_changed != 454444444444
assert last_changed != "454444444444"
date_created = res.json.get('date_created')
assert date_created != 454444444444
assert date_created != "454444444444"
def test_access_denied(client, live_server, measure_memory_usage, datastore_path):
# `config_api_token_enabled` Should be On by default
res = client.get(
@@ -463,9 +401,6 @@ def test_api_watch_PUT_update(client, live_server, measure_memory_usage, datasto
follow_redirects=True
)
if res.status_code != 201:
print(f"\n=== POST createwatch failed with {res.status_code} ===")
print(f"Response: {res.data}")
assert res.status_code == 201
wait_for_all_checks(client)
@@ -529,12 +464,8 @@ def test_api_watch_PUT_update(client, live_server, measure_memory_usage, datasto
)
assert res.status_code == 400, "Should get error 400 when we give a field that doesnt exist"
# Backend validation now rejects unknown fields with a clear error message
assert (b'Unknown field' in res.data or
b'Additional properties are not allowed' in res.data or
b'Unevaluated properties are not allowed' in res.data or
b'does not match any of the regexes' in res.data), \
"Should reject unknown fields with validation error"
# Message will come from `flask_expects_json`
assert b'Additional properties are not allowed' in res.data
# Try a XSS URL
@@ -555,7 +486,6 @@ def test_api_import(client, live_server, measure_memory_usage, datastore_path):
api_key = live_server.app.config['DATASTORE'].data['settings']['application'].get('api_access_token')
# Test 1: Basic import with tag
res = client.post(
url_for("import") + "?tag=import-test",
data='https://website1.com\r\nhttps://website2.com',
@@ -574,239 +504,6 @@ def test_api_import(client, live_server, measure_memory_usage, datastore_path):
res = client.get(url_for('tags.tags_overview_page'))
assert b'import-test' in res.data
# Test 2: Import with watch configuration fields (issue #3845)
# Test string field (include_filters), boolean (paused), and processor
import urllib.parse
params = urllib.parse.urlencode({
'tag': 'config-test',
'include_filters': 'div.content',
'paused': 'true',
'processor': 'text_json_diff',
'title': 'Imported with Config'
})
res = client.post(
url_for("import") + "?" + params,
data='https://website3.com',
headers={'x-api-key': api_key},
follow_redirects=True
)
assert res.status_code == 200
assert len(res.json) == 1
uuid = res.json[0]
# Verify the configuration was applied
watch = live_server.app.config['DATASTORE'].data['watching'][uuid]
assert watch['include_filters'] == ['div.content'], "include_filters should be set as array"
assert watch['paused'] == True, "paused should be True"
assert watch['processor'] == 'text_json_diff', "processor should be set"
assert watch['title'] == 'Imported with Config', "title should be set"
# Test 3: Import with array field (notification_urls) - using valid Apprise format
params = urllib.parse.urlencode({
'tag': 'notification-test',
'notification_urls': 'mailto://test@example.com,mailto://admin@example.com'
})
res = client.post(
url_for("import") + "?" + params,
data='https://website4.com',
headers={'x-api-key': api_key},
follow_redirects=True
)
assert res.status_code == 200
uuid = res.json[0]
watch = live_server.app.config['DATASTORE'].data['watching'][uuid]
assert isinstance(watch['notification_urls'], list), "notification_urls must be stored as a list"
assert len(watch['notification_urls']) == 2, "notification_urls should have 2 entries"
assert 'mailto://test@example.com' in watch['notification_urls'], "notification_urls should contain first email"
assert 'mailto://admin@example.com' in watch['notification_urls'], "notification_urls should contain second email"
# Test 4: Import with object field (time_between_check)
import json
time_config = json.dumps({"hours": 2, "minutes": 30})
params = urllib.parse.urlencode({
'tag': 'schedule-test',
'time_between_check': time_config
})
res = client.post(
url_for("import") + "?" + params,
data='https://website5.com',
headers={'x-api-key': api_key},
follow_redirects=True
)
assert res.status_code == 200
uuid = res.json[0]
watch = live_server.app.config['DATASTORE'].data['watching'][uuid]
assert watch['time_between_check']['hours'] == 2, "time_between_check hours should be 2"
assert watch['time_between_check']['minutes'] == 30, "time_between_check minutes should be 30"
# Test 5: Import with invalid processor (should fail)
res = client.post(
url_for("import") + "?processor=invalid_processor",
data='https://website6.com',
headers={'x-api-key': api_key},
follow_redirects=True
)
assert res.status_code == 400, "Should reject invalid processor"
assert b"Invalid processor" in res.data, "Error message should mention invalid processor"
# Test 6: Import with invalid field (should fail)
res = client.post(
url_for("import") + "?unknown_field=value",
data='https://website7.com',
headers={'x-api-key': api_key},
follow_redirects=True
)
assert res.status_code == 400, "Should reject unknown field"
assert b"Unknown watch configuration parameter" in res.data, "Error message should mention unknown parameter"
# Test 7: Import with complex nested array (browser_steps) - array of objects
browser_steps = json.dumps([
{"operation": "wait", "selector": "5", "optional_value": ""},
{"operation": "click", "selector": "button.submit", "optional_value": ""}
])
params = urllib.parse.urlencode({
'tag': 'browser-test',
'browser_steps': browser_steps
})
res = client.post(
url_for("import") + "?" + params,
data='https://website8.com',
headers={'x-api-key': api_key},
follow_redirects=True
)
assert res.status_code == 200, "Should accept browser_steps array"
uuid = res.json[0]
watch = live_server.app.config['DATASTORE'].data['watching'][uuid]
assert len(watch['browser_steps']) == 2, "Should have 2 browser steps"
assert watch['browser_steps'][0]['operation'] == 'wait', "First step should be wait"
assert watch['browser_steps'][1]['operation'] == 'click', "Second step should be click"
assert watch['browser_steps'][1]['selector'] == 'button.submit', "Second step selector should be button.submit"
# Cleanup
delete_all_watches(client)
def test_api_import_small_synchronous(client, live_server, measure_memory_usage, datastore_path):
"""Test that small imports (< threshold) are processed synchronously"""
from changedetectionio.api.Import import IMPORT_SWITCH_TO_BACKGROUND_THRESHOLD
api_key = live_server.app.config['DATASTORE'].data['settings']['application'].get('api_access_token')
# Use local test endpoint to avoid network delays
test_url_base = url_for('test_endpoint', _external=True)
# Create URLs: threshold - 1 to stay under limit
num_urls = min(5, IMPORT_SWITCH_TO_BACKGROUND_THRESHOLD - 1) # Use small number for faster test
urls = '\n'.join([f'{test_url_base}?id=small-{i}' for i in range(num_urls)])
# Import small batch
res = client.post(
url_for("import") + "?tag=small-test",
data=urls,
headers={'x-api-key': api_key},
follow_redirects=True
)
# Should return 200 OK with UUID list (synchronous)
assert res.status_code == 200, f"Should return 200 for small imports, got {res.status_code}"
assert isinstance(res.json, list), "Response should be a list of UUIDs"
assert len(res.json) == num_urls, f"Should return {num_urls} UUIDs, got {len(res.json)}"
# Verify all watches were created immediately
for uuid in res.json:
assert uuid in live_server.app.config['DATASTORE'].data['watching'], \
f"Watch {uuid} should exist immediately after synchronous import"
print(f"\n✓ Successfully created {num_urls} watches synchronously")
def test_api_import_large_background(client, live_server, measure_memory_usage, datastore_path):
"""Test that large imports (>= threshold) are processed in background thread"""
from changedetectionio.api.Import import IMPORT_SWITCH_TO_BACKGROUND_THRESHOLD
import time
api_key = live_server.app.config['DATASTORE'].data['settings']['application'].get('api_access_token')
# Use local test endpoint to avoid network delays
test_url_base = url_for('test_endpoint', _external=True)
# Create URLs: threshold + 10 to trigger background processing
num_urls = IMPORT_SWITCH_TO_BACKGROUND_THRESHOLD + 10
urls = '\n'.join([f'{test_url_base}?id=bulk-{i}' for i in range(num_urls)])
# Import large batch
res = client.post(
url_for("import") + "?tag=bulk-test",
data=urls,
headers={'x-api-key': api_key},
follow_redirects=True
)
# Should return 202 Accepted (background processing)
assert res.status_code == 202, f"Should return 202 for large imports, got {res.status_code}"
assert b"background" in res.data.lower(), "Response should mention background processing"
# Extract expected count from response
response_json = res.json
assert 'count' in response_json, "Response should include count"
assert response_json['count'] == num_urls, f"Count should be {num_urls}, got {response_json['count']}"
# Wait for background thread to complete (with timeout)
max_wait = 10 # seconds
wait_interval = 0.5
elapsed = 0
watches_created = 0
while elapsed < max_wait:
time.sleep(wait_interval)
elapsed += wait_interval
# Count how many watches have been created
watches_created = len([
uuid for uuid, watch in live_server.app.config['DATASTORE'].data['watching'].items()
if 'id=bulk-' in watch['url']
])
if watches_created == num_urls:
break
# Verify all watches were created
assert watches_created == num_urls, \
f"Expected {num_urls} watches to be created, but found {watches_created} after {elapsed}s"
# Verify watches have correct configuration
bulk_watches = [
watch for watch in live_server.app.config['DATASTORE'].data['watching'].values()
if 'id=bulk-' in watch['url']
]
assert len(bulk_watches) == num_urls, "All bulk watches should exist"
# Check that they have the correct tag
datastore = live_server.app.config['DATASTORE']
# Get UUIDs of bulk watches by filtering the datastore keys
bulk_watch_uuids = [
uuid for uuid, watch in live_server.app.config['DATASTORE'].data['watching'].items()
if 'id=bulk-' in watch['url']
]
for watch_uuid in bulk_watch_uuids:
tags = datastore.get_all_tags_for_watch(uuid=watch_uuid)
tag_names = [t['title'] for t in tags.values()]
assert 'bulk-test' in tag_names, f"Watch {watch_uuid} should have 'bulk-test' tag"
print(f"\n✓ Successfully created {num_urls} watches in background (took {elapsed}s)")
def test_api_conflict_UI_password(client, live_server, measure_memory_usage, datastore_path):
@@ -933,9 +630,7 @@ def test_api_url_validation(client, live_server, measure_memory_usage, datastore
)
assert res.status_code == 400, "Updating watch URL to null should fail"
# Accept either OpenAPI validation error or our custom validation error
assert (b'URL cannot be null' in res.data or
b'Validation failed' in res.data or
b'validation error' in res.data.lower())
assert b'URL cannot be null' in res.data or b'OpenAPI validation failed' in res.data or b'validation error' in res.data.lower()
# Test 8: UPDATE to empty string URL should fail
res = client.put(
@@ -1022,140 +717,3 @@ def test_api_url_validation(client, live_server, measure_memory_usage, datastore
headers={'x-api-key': api_key},
)
delete_all_watches(client)
def test_api_time_between_check_validation(client, live_server, measure_memory_usage, datastore_path):
"""
Test that time_between_check validation works correctly:
- When time_between_check_use_default is false, at least one time value must be > 0
- Values must be valid integers
"""
import json
from flask import url_for
api_key = live_server.app.config['DATASTORE'].data['settings']['application'].get('api_access_token')
# Test 1: time_between_check_use_default=false with NO time_between_check should fail
res = client.post(
url_for("createwatch"),
data=json.dumps({
"url": "https://example.com",
"time_between_check_use_default": False
}),
headers={'content-type': 'application/json', 'x-api-key': api_key},
)
assert res.status_code == 400, "Should fail when time_between_check_use_default=false with no time_between_check"
assert b"At least one time interval" in res.data, "Error message should mention time interval requirement"
# Test 2: time_between_check_use_default=false with ALL zeros should fail
res = client.post(
url_for("createwatch"),
data=json.dumps({
"url": "https://example.com",
"time_between_check_use_default": False,
"time_between_check": {
"weeks": 0,
"days": 0,
"hours": 0,
"minutes": 0,
"seconds": 0
}
}),
headers={'content-type': 'application/json', 'x-api-key': api_key},
)
assert res.status_code == 400, "Should fail when all time values are 0"
assert b"At least one time interval" in res.data, "Error message should mention time interval requirement"
# Test 3: time_between_check_use_default=false with NULL values should fail
res = client.post(
url_for("createwatch"),
data=json.dumps({
"url": "https://example.com",
"time_between_check_use_default": False,
"time_between_check": {
"weeks": None,
"days": None,
"hours": None,
"minutes": None,
"seconds": None
}
}),
headers={'content-type': 'application/json', 'x-api-key': api_key},
)
assert res.status_code == 400, "Should fail when all time values are null"
assert b"At least one time interval" in res.data, "Error message should mention time interval requirement"
# Test 4: time_between_check_use_default=false with valid hours should succeed
res = client.post(
url_for("createwatch"),
data=json.dumps({
"url": "https://example.com",
"time_between_check_use_default": False,
"time_between_check": {
"hours": 2
}
}),
headers={'content-type': 'application/json', 'x-api-key': api_key},
)
assert res.status_code == 201, "Should succeed with valid hours value"
uuid1 = res.json.get('uuid')
# Test 5: time_between_check_use_default=false with valid minutes should succeed
res = client.post(
url_for("createwatch"),
data=json.dumps({
"url": "https://example2.com",
"time_between_check_use_default": False,
"time_between_check": {
"minutes": 30
}
}),
headers={'content-type': 'application/json', 'x-api-key': api_key},
)
assert res.status_code == 201, "Should succeed with valid minutes value"
uuid2 = res.json.get('uuid')
# Test 6: time_between_check_use_default=true (or missing) with no time_between_check should succeed (uses defaults)
res = client.post(
url_for("createwatch"),
data=json.dumps({
"url": "https://example3.com",
"time_between_check_use_default": True
}),
headers={'content-type': 'application/json', 'x-api-key': api_key},
)
assert res.status_code == 201, "Should succeed when using default settings"
uuid3 = res.json.get('uuid')
# Test 7: Default behavior (no time_between_check_use_default field) should use defaults and succeed
res = client.post(
url_for("createwatch"),
data=json.dumps({
"url": "https://example4.com"
}),
headers={'content-type': 'application/json', 'x-api-key': api_key},
)
assert res.status_code == 201, "Should succeed with default behavior (using global settings)"
uuid4 = res.json.get('uuid')
# Test 8: Verify integer type validation - string should fail (OpenAPI validation)
res = client.post(
url_for("createwatch"),
data=json.dumps({
"url": "https://example5.com",
"time_between_check_use_default": False,
"time_between_check": {
"hours": "not_a_number"
}
}),
headers={'content-type': 'application/json', 'x-api-key': api_key},
)
assert res.status_code == 400, "Should fail when time value is not an integer"
assert b"Validation failed" in res.data or b"not of type" in res.data, "Should mention validation/type error"
# Cleanup
for uuid in [uuid1, uuid2, uuid3, uuid4]:
client.delete(
url_for("watch", uuid=uuid),
headers={'x-api-key': api_key},
)
@@ -107,7 +107,7 @@ def test_watch_notification_urls_validation(client, live_server, measure_memory_
headers={'content-type': 'application/json', 'x-api-key': api_key}
)
assert res.status_code == 400, "Should reject non-list notification_urls"
assert b"Validation failed" in res.data or b"is not of type" in res.data
assert b"OpenAPI validation failed" in res.data or b"Request body validation error" in res.data
# Test 6: Verify original URLs are preserved after failed update
res = client.get(
@@ -159,7 +159,7 @@ def test_tag_notification_urls_validation(client, live_server, measure_memory_us
headers={'content-type': 'application/json', 'x-api-key': api_key}
)
assert res.status_code == 400, "Should reject non-list notification_urls"
assert b"Validation failed" in res.data or b"is not of type" in res.data
assert b"OpenAPI validation failed" in res.data or b"Request body validation error" in res.data
# Test 4: Verify original URLs are preserved after failed update
tag = datastore.data['settings']['application']['tags'][tag_uuid]
+7 -19
View File
@@ -9,7 +9,7 @@ by testing various scenarios that should trigger validation errors.
import time
import json
from flask import url_for
from .util import live_server_setup, wait_for_all_checks, delete_all_watches
from .util import live_server_setup, wait_for_all_checks
def test_openapi_validation_invalid_content_type_on_create_watch(client, live_server, measure_memory_usage, datastore_path):
@@ -26,8 +26,7 @@ def test_openapi_validation_invalid_content_type_on_create_watch(client, live_se
# Should get 400 error due to OpenAPI validation failure
assert res.status_code == 400, f"Expected 400 but got {res.status_code}"
assert b"Validation failed" in res.data, "Should contain validation error message"
delete_all_watches(client)
assert b"OpenAPI validation failed" in res.data, "Should contain OpenAPI validation error message"
def test_openapi_validation_missing_required_field_create_watch(client, live_server, measure_memory_usage, datastore_path):
@@ -44,8 +43,7 @@ def test_openapi_validation_missing_required_field_create_watch(client, live_ser
# Should get 400 error due to missing required field
assert res.status_code == 400, f"Expected 400 but got {res.status_code}"
assert b"Validation failed" in res.data, "Should contain validation error message"
delete_all_watches(client)
assert b"OpenAPI validation failed" in res.data, "Should contain OpenAPI validation error message"
def test_openapi_validation_invalid_field_in_request_body(client, live_server, measure_memory_usage, datastore_path):
@@ -82,10 +80,7 @@ def test_openapi_validation_invalid_field_in_request_body(client, live_server, m
# Should get 400 error due to invalid field (this will be caught by internal validation)
# Note: This tests the flow where OpenAPI validation passes but internal validation catches it
assert res.status_code == 400, f"Expected 400 but got {res.status_code}"
# Backend validation now returns "Unknown field(s):" message
assert b"Unknown field" in res.data, \
"Should contain validation error about unknown fields"
delete_all_watches(client)
assert b"Additional properties are not allowed" in res.data, "Should contain validation error about additional properties"
def test_openapi_validation_import_wrong_content_type(client, live_server, measure_memory_usage, datastore_path):
@@ -102,8 +97,7 @@ def test_openapi_validation_import_wrong_content_type(client, live_server, measu
# Should get 400 error due to content-type mismatch
assert res.status_code == 400, f"Expected 400 but got {res.status_code}"
assert b"Validation failed" in res.data, "Should contain validation error message"
delete_all_watches(client)
assert b"OpenAPI validation failed" in res.data, "Should contain OpenAPI validation error message"
def test_openapi_validation_import_correct_content_type_succeeds(client, live_server, measure_memory_usage, datastore_path):
@@ -121,7 +115,6 @@ def test_openapi_validation_import_correct_content_type_succeeds(client, live_se
# Should succeed
assert res.status_code == 200, f"Expected 200 but got {res.status_code}"
assert len(res.json) == 2, "Should import 2 URLs"
delete_all_watches(client)
def test_openapi_validation_get_requests_bypass_validation(client, live_server, measure_memory_usage, datastore_path):
@@ -146,7 +139,6 @@ def test_openapi_validation_get_requests_bypass_validation(client, live_server,
# Should return JSON with watch list (empty in this case)
assert isinstance(res.json, dict), "Should return JSON dictionary for watch list"
delete_all_watches(client)
def test_openapi_validation_create_tag_missing_required_title(client, live_server, measure_memory_usage, datastore_path):
@@ -163,14 +155,11 @@ def test_openapi_validation_create_tag_missing_required_title(client, live_serve
# Should get 400 error due to missing required field
assert res.status_code == 400, f"Expected 400 but got {res.status_code}"
assert b"Validation failed" in res.data, "Should contain validation error message"
delete_all_watches(client)
assert b"OpenAPI validation failed" in res.data, "Should contain OpenAPI validation error message"
def test_openapi_validation_watch_update_allows_partial_updates(client, live_server, measure_memory_usage, datastore_path):
"""Test that watch updates allow partial updates without requiring all fields (positive test)."""
#xxx
api_key = live_server.app.config['DATASTORE'].data['settings']['application'].get('api_access_token')
# First create a valid watch
@@ -207,5 +196,4 @@ def test_openapi_validation_watch_update_allows_partial_updates(client, live_ser
)
assert res.status_code == 200
assert res.json.get('title') == 'Updated Title Only', "Title should be updated"
assert res.json.get('url') == 'https://example.com', "URL should remain unchanged"
delete_all_watches(client)
assert res.json.get('url') == 'https://example.com', "URL should remain unchanged"
+3 -66
View File
@@ -18,7 +18,7 @@ def test_api_tags_listing(client, live_server, measure_memory_usage, datastore_p
url_for("tags"),
headers={'x-api-key': api_key}
)
assert res.get_data(as_text=True).strip() == "{}", "Should be empty list"
assert res.text.strip() == "{}", "Should be empty list"
assert res.status_code == 200
res = client.post(
@@ -36,7 +36,7 @@ def test_api_tags_listing(client, live_server, measure_memory_usage, datastore_p
headers={'x-api-key': api_key}
)
assert res.status_code == 200
assert new_tag_uuid in res.get_data(as_text=True)
assert new_tag_uuid in res.text
assert res.json[new_tag_uuid]['title'] == tag_title
assert res.json[new_tag_uuid]['notification_muted'] == False
@@ -118,16 +118,6 @@ def test_api_tags_listing(client, live_server, measure_memory_usage, datastore_p
assert res.status_code == 200
assert new_tag_uuid in res.json.get('tags', [])
# Test that tags are returned when listing ALL watches (issue #3854)
res = client.get(
url_for("createwatch"), # GET /api/v1/watch - list all watches
headers={'x-api-key': api_key}
)
assert res.status_code == 200
assert watch_uuid in res.json, "Watch should be in the list"
assert 'tags' in res.json[watch_uuid], "Tags field should be present in watch list"
assert new_tag_uuid in res.json[watch_uuid]['tags'], "Tag UUID should be in tags array"
# Check recheck by tag
before_check_time = live_server.app.config['DATASTORE'].data['watching'][watch_uuid].get('last_checked')
time.sleep(1)
@@ -158,7 +148,7 @@ def test_api_tags_listing(client, live_server, measure_memory_usage, datastore_p
headers={'x-api-key': api_key}
)
assert res.status_code == 200
assert new_tag_uuid not in res.get_data(as_text=True)
assert new_tag_uuid not in res.text
# Verify tag was removed from watch
res = client.get(
@@ -176,57 +166,4 @@ def test_api_tags_listing(client, live_server, measure_memory_usage, datastore_p
assert res.status_code == 204
def test_roundtrip_API(client, live_server, measure_memory_usage, datastore_path):
"""
Test the full round trip, this way we test the default Model fits back into OpenAPI spec
:param client:
:param live_server:
:param measure_memory_usage:
:param datastore_path:
:return:
"""
api_key = live_server.app.config['DATASTORE'].data['settings']['application'].get('api_access_token')
set_original_response(datastore_path=datastore_path)
res = client.post(
url_for("tag"),
data=json.dumps({"title": "My tag title"}),
headers={'content-type': 'application/json', 'x-api-key': api_key}
)
assert res.status_code == 201
uuid = res.json.get('uuid')
# Now fetch it and send it back
res = client.get(
url_for("tag", uuid=uuid),
headers={'x-api-key': api_key}
)
tag = res.json
# Only test with date_created (readOnly field that should be filtered out)
# last_changed is Watch-specific and doesn't apply to Tags
tag['date_created'] = 454444444444
# HTTP PUT ( UPDATE an existing watch )
res = client.put(
url_for("tag", uuid=uuid),
headers={'x-api-key': api_key, 'content-type': 'application/json'},
data=json.dumps(tag),
)
if res.status_code != 200:
print(f"\n=== PUT failed with {res.status_code} ===")
print(f"Error: {res.data}")
assert res.status_code == 200, "HTTP PUT update was sent OK"
# Verify readOnly fields like date_created cannot be overridden
res = client.get(
url_for("tag", uuid=uuid),
headers={'x-api-key': api_key}
)
date_created = res.json.get('date_created')
assert date_created != 454444444444, "ReadOnly date_created should not be updateable"
assert date_created != "454444444444", "ReadOnly date_created should not be updateable"
@@ -2,7 +2,7 @@
import time
from flask import url_for
from .util import live_server_setup, extract_UUID_from_client, wait_for_all_checks, delete_all_watches
from .util import live_server_setup, extract_UUID_from_client, wait_for_all_checks
import os
@@ -116,7 +116,7 @@ def test_check_ldjson_price_autodetect(client, live_server, measure_memory_usage
# And not this cause its not the ld-json
assert b"So let's see what happens" not in res.data
delete_all_watches(client)
client.get(url_for("ui.form_delete", uuid="all"), follow_redirects=True)
##########################################################################################
# And we shouldnt see the offer
@@ -131,7 +131,7 @@ def test_check_ldjson_price_autodetect(client, live_server, measure_memory_usage
assert b'ldjson-price-track-offer' not in res.data
##########################################################################################
delete_all_watches(client)
client.get(url_for("ui.form_delete", uuid="all"), follow_redirects=True)
def _test_runner_check_bad_format_ignored(live_server, client, has_ldjson_price_data):
@@ -147,7 +147,7 @@ def _test_runner_check_bad_format_ignored(live_server, client, has_ldjson_price_
##########################################################################################
delete_all_watches(client)
client.get(url_for("ui.form_delete", uuid="all"), follow_redirects=True)
def test_bad_ldjson_is_correctly_ignored(client, live_server, measure_memory_usage, datastore_path):
+3 -1
View File
@@ -6,6 +6,8 @@ from flask import url_for
from .util import set_original_response, set_modified_response, live_server_setup, wait_for_all_checks, extract_rss_token_from_UI, \
extract_UUID_from_client, delete_all_watches
sleep_time_for_fetch_thread = 3
# Basic test to check inscriptus is not adding return line chars, basically works etc
def test_inscriptus():
@@ -412,4 +414,4 @@ def test_plaintext_even_if_xml_content_and_can_apply_filters(client, live_server
assert b'Abonnementen bijwerken' in res.data
assert b'&lt;foobar' not in res.data
res = delete_all_watches(client)
res = client.get(url_for("ui.form_delete", uuid="all"), follow_redirects=True)
+4 -42
View File
@@ -54,11 +54,11 @@ def test_backup(client, live_server, measure_memory_usage, datastore_path):
backup = ZipFile(io.BytesIO(res.data))
l = backup.namelist()
# Check for UUID-based txt files (history, snapshot, and last-checksum)
# Check for UUID-based txt files (history and snapshot)
uuid4hex_txt = re.compile('^[a-f0-9]{8}-?[a-f0-9]{4}-?4[a-f0-9]{3}-?[89ab][a-f0-9]{3}-?[a-f0-9]{12}.*txt', re.I)
txt_files = list(filter(uuid4hex_txt.match, l))
# Should be three txt files in the archive (history, snapshot, and last-checksum)
assert len(txt_files) == 3
# Should be two txt files in the archive (history and the snapshot)
assert len(txt_files) == 2
# Check for watch.json files (new format)
uuid4hex_json = re.compile('^[a-f0-9]{8}-?[a-f0-9]{4}-?4[a-f0-9]{3}-?[89ab][a-f0-9]{3}-?[a-f0-9]{12}/watch\.json$', re.I)
@@ -75,42 +75,4 @@ def test_backup(client, live_server, measure_memory_usage, datastore_path):
follow_redirects=True
)
assert b'No backups found.' in res.data
def test_watch_data_package_download(client, live_server, measure_memory_usage, datastore_path):
"""Test downloading a single watch's data as a zip package"""
import os
set_original_response(datastore_path=datastore_path)
uuid = client.application.config.get('DATASTORE').add_watch(url=url_for('test_endpoint', _external=True))
client.get(url_for("ui.form_watch_checknow"), follow_redirects=True)
wait_for_all_checks(client)
# Download the watch data package
res = client.get(url_for("ui.ui_edit.watch_get_data_package", uuid=uuid))
# Should get the right zip content type
assert res.content_type == "application/zip"
# Should be PK/ZIP stream (PKzip header)
assert res.data[:2] == b'PK', "File should start with PK (PKzip header)"
assert res.data.count(b'PK') >= 2, "Should have multiple PK markers (zip file structure)"
# Verify zip contents
backup = ZipFile(io.BytesIO(res.data))
files = backup.namelist()
# Should have files in a UUID directory
assert any(uuid in f for f in files), f"Files should be in UUID directory: {files}"
# Should contain watch.json
watch_json_path = f"{uuid}/watch.json"
assert watch_json_path in files, f"Should contain watch.json, got: {files}"
# Should contain history/snapshot files
uuid4hex_txt = re.compile(f'^{re.escape(uuid)}/.*\\.txt', re.I)
txt_files = list(filter(uuid4hex_txt.match, files))
assert len(txt_files) > 0, f"Should have at least one .txt file (history/snapshot), got: {files}"
assert b'No backups found.' in res.data
@@ -6,7 +6,7 @@ from .util import (
set_original_response,
set_modified_response,
live_server_setup,
wait_for_all_checks, delete_all_watches
wait_for_all_checks
)
from loguru import logger
@@ -104,7 +104,7 @@ def run_socketio_watch_update_test(client, live_server, password_mode="", datast
assert watch.has_unviewed, "The watch was not marked as unviewed after content change"
# Clean up
delete_all_watches(client)
client.get(url_for("ui.form_delete", uuid="all"), follow_redirects=True)
def test_everything(live_server, client, measure_memory_usage, datastore_path):
@@ -1,661 +0,0 @@
#!/usr/bin/env python3
"""
Tests for immediate commit-based persistence system.
Tests cover:
- Watch.commit() persistence to disk
- Concurrent commit safety (race conditions)
- Processor config separation
- Data loss prevention (settings, tags, watch modifications)
"""
import json
import os
import threading
import time
from flask import url_for
from .util import wait_for_all_checks
# ==============================================================================
# 2. Commit() Persistence Tests
# ==============================================================================
def test_watch_commit_persists_to_disk(client, live_server):
"""Test that watch.commit() actually writes to watch.json immediately"""
datastore = client.application.config.get('DATASTORE')
# Create a watch
uuid = datastore.add_watch(url='http://example.com', extras={'title': 'Original Title'})
watch = datastore.data['watching'][uuid]
# Modify and commit
watch['title'] = 'Modified Title'
watch['paused'] = True
watch.commit()
# Read directly from disk (bypass datastore cache)
watch_json_path = os.path.join(watch.data_dir, 'watch.json')
assert os.path.exists(watch_json_path), "watch.json should exist on disk"
with open(watch_json_path, 'r') as f:
disk_data = json.load(f)
assert disk_data['title'] == 'Modified Title', "Title should be persisted to disk"
assert disk_data['paused'] == True, "Paused state should be persisted to disk"
assert disk_data['uuid'] == uuid, "UUID should match"
def test_watch_commit_survives_reload(client, live_server):
"""Test that committed changes survive datastore reload"""
from changedetectionio.store import ChangeDetectionStore
datastore = client.application.config.get('DATASTORE')
datastore_path = datastore.datastore_path
# Create and modify a watch
uuid = datastore.add_watch(url='http://example.com', extras={'title': 'Test Watch'})
watch = datastore.data['watching'][uuid]
watch['title'] = 'Persisted Title'
watch['paused'] = True
watch['tags'] = ['tag-1', 'tag-2']
watch.commit()
# Simulate app restart - create new datastore instance
datastore2 = ChangeDetectionStore(datastore_path=datastore_path)
datastore2.reload_state(
datastore_path=datastore_path,
include_default_watches=False,
version_tag='test'
)
# Check data survived
assert uuid in datastore2.data['watching'], "Watch should exist after reload"
reloaded_watch = datastore2.data['watching'][uuid]
assert reloaded_watch['title'] == 'Persisted Title', "Title should survive reload"
assert reloaded_watch['paused'] == True, "Paused state should survive reload"
assert reloaded_watch['tags'] == ['tag-1', 'tag-2'], "Tags should survive reload"
def test_watch_commit_atomic_on_crash(client, live_server):
"""Test that atomic writes prevent corruption (temp file pattern)"""
datastore = client.application.config.get('DATASTORE')
uuid = datastore.add_watch(url='http://example.com', extras={'title': 'Original'})
watch = datastore.data['watching'][uuid]
# First successful commit
watch['title'] = 'First Save'
watch.commit()
# Verify watch.json exists and is valid
watch_json_path = os.path.join(watch.data_dir, 'watch.json')
with open(watch_json_path, 'r') as f:
data = json.load(f) # Should not raise JSONDecodeError
assert data['title'] == 'First Save'
# Second commit - even if interrupted, original file should be intact
# (atomic write uses temp file + rename, so original is never corrupted)
watch['title'] = 'Second Save'
watch.commit()
with open(watch_json_path, 'r') as f:
data = json.load(f)
assert data['title'] == 'Second Save'
def test_multiple_watches_commit_independently(client, live_server):
"""Test that committing one watch doesn't affect others"""
datastore = client.application.config.get('DATASTORE')
# Create multiple watches
uuid1 = datastore.add_watch(url='http://example1.com', extras={'title': 'Watch 1'})
uuid2 = datastore.add_watch(url='http://example2.com', extras={'title': 'Watch 2'})
uuid3 = datastore.add_watch(url='http://example3.com', extras={'title': 'Watch 3'})
watch1 = datastore.data['watching'][uuid1]
watch2 = datastore.data['watching'][uuid2]
watch3 = datastore.data['watching'][uuid3]
# Modify and commit only watch2
watch2['title'] = 'Modified Watch 2'
watch2['paused'] = True
watch2.commit()
# Read all from disk
def read_watch_json(uuid):
watch = datastore.data['watching'][uuid]
path = os.path.join(watch.data_dir, 'watch.json')
with open(path, 'r') as f:
return json.load(f)
data1 = read_watch_json(uuid1)
data2 = read_watch_json(uuid2)
data3 = read_watch_json(uuid3)
# Only watch2 should have changes
assert data1['title'] == 'Watch 1', "Watch 1 should be unchanged"
assert data1['paused'] == False, "Watch 1 should not be paused"
assert data2['title'] == 'Modified Watch 2', "Watch 2 should be modified"
assert data2['paused'] == True, "Watch 2 should be paused"
assert data3['title'] == 'Watch 3', "Watch 3 should be unchanged"
assert data3['paused'] == False, "Watch 3 should not be paused"
# ==============================================================================
# 3. Concurrency/Race Condition Tests
# ==============================================================================
def test_concurrent_watch_commits_dont_corrupt(client, live_server):
"""Test that simultaneous commits to same watch don't corrupt JSON"""
datastore = client.application.config.get('DATASTORE')
uuid = datastore.add_watch(url='http://example.com', extras={'title': 'Test'})
watch = datastore.data['watching'][uuid]
errors = []
def modify_and_commit(field, value):
try:
watch[field] = value
watch.commit()
except Exception as e:
errors.append(e)
# Run 10 concurrent commits
threads = []
for i in range(10):
t = threading.Thread(target=modify_and_commit, args=('title', f'Title {i}'))
threads.append(t)
t.start()
for t in threads:
t.join()
# Should not have any errors
assert len(errors) == 0, f"Expected no errors, got: {errors}"
# JSON file should still be valid (not corrupted)
watch_json_path = os.path.join(watch.data_dir, 'watch.json')
with open(watch_json_path, 'r') as f:
data = json.load(f) # Should not raise JSONDecodeError
assert data['uuid'] == uuid, "UUID should still be correct"
assert 'Title' in data['title'], "Title should contain 'Title'"
def test_concurrent_modifications_during_commit(client, live_server):
"""Test that modifying watch during commit doesn't cause RuntimeError"""
datastore = client.application.config.get('DATASTORE')
uuid = datastore.add_watch(url='http://example.com', extras={'title': 'Test'})
watch = datastore.data['watching'][uuid]
errors = []
stop_flag = threading.Event()
def keep_modifying():
"""Continuously modify watch"""
try:
i = 0
while not stop_flag.is_set():
watch['title'] = f'Title {i}'
watch['paused'] = i % 2 == 0
i += 1
time.sleep(0.001)
except Exception as e:
errors.append(('modifier', e))
def keep_committing():
"""Continuously commit watch"""
try:
for _ in range(20):
watch.commit()
time.sleep(0.005)
except Exception as e:
errors.append(('committer', e))
# Start concurrent modification and commits
modifier = threading.Thread(target=keep_modifying)
committer = threading.Thread(target=keep_committing)
modifier.start()
committer.start()
committer.join()
stop_flag.set()
modifier.join()
# Should not have RuntimeError from dict changing during iteration
runtime_errors = [e for source, e in errors if isinstance(e, RuntimeError)]
assert len(runtime_errors) == 0, f"Should not have RuntimeError, got: {runtime_errors}"
def test_datastore_lock_protects_commit_snapshot(client, live_server):
"""Test that datastore.lock prevents race conditions during deepcopy"""
datastore = client.application.config.get('DATASTORE')
uuid = datastore.add_watch(url='http://example.com', extras={'title': 'Test'})
watch = datastore.data['watching'][uuid]
# Add some complex nested data
watch['browser_steps'] = [
{'operation': 'click', 'selector': '#foo'},
{'operation': 'wait', 'seconds': 5}
]
errors = []
commits_succeeded = [0]
def rapid_commits():
try:
for i in range(50):
watch['title'] = f'Title {i}'
watch.commit()
commits_succeeded[0] += 1
time.sleep(0.001)
except Exception as e:
errors.append(e)
# Multiple threads doing rapid commits
threads = [threading.Thread(target=rapid_commits) for _ in range(3)]
for t in threads:
t.start()
for t in threads:
t.join()
assert len(errors) == 0, f"Expected no errors, got: {errors}"
assert commits_succeeded[0] == 150, f"Expected 150 commits, got {commits_succeeded[0]}"
# Final JSON should be valid
watch_json_path = os.path.join(watch.data_dir, 'watch.json')
with open(watch_json_path, 'r') as f:
data = json.load(f)
assert data['uuid'] == uuid
# ==============================================================================
# 4. Processor Config Separation Tests
# ==============================================================================
def test_processor_config_never_in_watch_json(client, live_server):
"""Test that processor_config_* fields are filtered out of watch.json"""
datastore = client.application.config.get('DATASTORE')
uuid = datastore.add_watch(
url='http://example.com',
extras={
'title': 'Test Watch',
'processor': 'restock_diff'
}
)
watch = datastore.data['watching'][uuid]
# Try to set processor config fields (these should be filtered during commit)
watch['processor_config_price_threshold'] = 10.0
watch['processor_config_some_setting'] = 'value'
watch['processor_config_another'] = {'nested': 'data'}
watch.commit()
# Read watch.json from disk
watch_json_path = os.path.join(watch.data_dir, 'watch.json')
with open(watch_json_path, 'r') as f:
data = json.load(f)
# Verify processor_config_* fields are NOT in watch.json
for key in data.keys():
assert not key.startswith('processor_config_'), \
f"Found {key} in watch.json - processor configs should be in separate file!"
# Normal fields should still be there
assert data['title'] == 'Test Watch'
assert data['processor'] == 'restock_diff'
def test_api_post_saves_processor_config_separately(client, live_server):
"""Test that API POST saves processor configs to {processor}.json"""
import json
from changedetectionio.processors import extract_processor_config_from_form_data
# Get API key
api_key = live_server.app.config['DATASTORE'].data['settings']['application'].get('api_access_token')
# Create watch via API with processor config
response = client.post(
url_for("createwatch"),
data=json.dumps({
'url': 'http://example.com',
'processor': 'restock_diff',
'processor_config_price_threshold': 10.0,
'processor_config_in_stock_only': True
}),
headers={'content-type': 'application/json', 'x-api-key': api_key}
)
assert response.status_code in (200, 201), f"Expected 200/201, got {response.status_code}"
uuid = response.json.get('uuid')
assert uuid, "Should return UUID"
datastore = client.application.config.get('DATASTORE')
watch = datastore.data['watching'][uuid]
# Check that processor config file exists
processor_config_path = os.path.join(watch.data_dir, 'restock_diff.json')
assert os.path.exists(processor_config_path), "Processor config file should exist"
with open(processor_config_path, 'r') as f:
config = json.load(f)
# Verify fields are saved WITHOUT processor_config_ prefix
assert config.get('price_threshold') == 10.0, "Should have price_threshold (no prefix)"
assert config.get('in_stock_only') == True, "Should have in_stock_only (no prefix)"
assert 'processor_config_price_threshold' not in config, "Should NOT have prefixed keys"
def test_api_put_saves_processor_config_separately(client, live_server):
"""Test that API PUT updates processor configs in {processor}.json"""
import json
datastore = client.application.config.get('DATASTORE')
# Get API key
api_key = live_server.app.config['DATASTORE'].data['settings']['application'].get('api_access_token')
# Create watch
uuid = datastore.add_watch(
url='http://example.com',
extras={'processor': 'restock_diff'}
)
# Update via API with processor config
response = client.put(
url_for("watch", uuid=uuid),
data=json.dumps({
'processor_config_price_threshold': 15.0,
'processor_config_min_stock': 5
}),
headers={'content-type': 'application/json', 'x-api-key': api_key}
)
# PUT might return different status codes, 200 or 204 are both OK
assert response.status_code in (200, 204), f"Expected 200/204, got {response.status_code}: {response.data}"
watch = datastore.data['watching'][uuid]
# Check processor config file
processor_config_path = os.path.join(watch.data_dir, 'restock_diff.json')
assert os.path.exists(processor_config_path), "Processor config file should exist"
with open(processor_config_path, 'r') as f:
config = json.load(f)
assert config.get('price_threshold') == 15.0, "Should have updated price_threshold"
assert config.get('min_stock') == 5, "Should have min_stock"
def test_ui_edit_saves_processor_config_separately(client, live_server):
"""Test that processor_config_* fields never appear in watch.json (even from UI)"""
datastore = client.application.config.get('DATASTORE')
# Create watch
uuid = datastore.add_watch(
url='http://example.com',
extras={'processor': 'text_json_diff', 'title': 'Test'}
)
watch = datastore.data['watching'][uuid]
# Simulate someone accidentally trying to set processor_config fields directly
watch['processor_config_should_not_save'] = 'test_value'
watch['processor_config_another_field'] = 123
watch['normal_field'] = 'this_should_save'
watch.commit()
# Check watch.json has NO processor_config_* fields (main point of this test)
watch_json_path = os.path.join(watch.data_dir, 'watch.json')
with open(watch_json_path, 'r') as f:
watch_data = json.load(f)
for key in watch_data.keys():
assert not key.startswith('processor_config_'), \
f"Found {key} in watch.json - processor configs should be filtered during commit"
# Verify normal fields still save
assert watch_data['normal_field'] == 'this_should_save', "Normal fields should save"
assert watch_data['title'] == 'Test', "Original fields should still be there"
def test_browser_steps_normalized_to_empty_list(client, live_server):
"""Test that meaningless browser_steps are normalized to [] during commit"""
datastore = client.application.config.get('DATASTORE')
uuid = datastore.add_watch(url='http://example.com')
watch = datastore.data['watching'][uuid]
# Set browser_steps to meaningless values
watch['browser_steps'] = [
{'operation': 'Choose one', 'selector': ''},
{'operation': 'Goto site', 'selector': ''},
{'operation': '', 'selector': '#foo'}
]
watch.commit()
# Read from disk
watch_json_path = os.path.join(watch.data_dir, 'watch.json')
with open(watch_json_path, 'r') as f:
data = json.load(f)
# Should be normalized to empty list
assert data['browser_steps'] == [], "Meaningless browser_steps should be normalized to []"
# ==============================================================================
# 5. Data Loss Prevention Tests
# ==============================================================================
def test_settings_persist_after_update(client, live_server):
"""Test that settings updates are committed and survive restart"""
from changedetectionio.store import ChangeDetectionStore
datastore = client.application.config.get('DATASTORE')
datastore_path = datastore.datastore_path
# Update settings directly (bypass form validation issues)
datastore.data['settings']['application']['empty_pages_are_a_change'] = True
datastore.data['settings']['application']['fetch_backend'] = 'html_requests'
datastore.data['settings']['requests']['time_between_check']['minutes'] = 120
datastore.commit()
# Simulate restart
datastore2 = ChangeDetectionStore(datastore_path=datastore_path)
datastore2.reload_state(
datastore_path=datastore_path,
include_default_watches=False,
version_tag='test'
)
# Verify settings survived
assert datastore2.data['settings']['application']['empty_pages_are_a_change'] == True, "empty_pages_are_a_change should persist"
assert datastore2.data['settings']['application']['fetch_backend'] == 'html_requests', "fetch_backend should persist"
assert datastore2.data['settings']['requests']['time_between_check']['minutes'] == 120, "time_between_check should persist"
def test_tag_mute_persists(client, live_server):
"""Test that tag mute/unmute operations persist"""
from changedetectionio.store import ChangeDetectionStore
datastore = client.application.config.get('DATASTORE')
datastore_path = datastore.datastore_path
# Add a tag
tag_uuid = datastore.add_tag('Test Tag')
# Mute the tag
response = client.get(url_for("tags.mute", uuid=tag_uuid))
assert response.status_code == 302 # Redirect
# Verify muted in memory
assert datastore.data['settings']['application']['tags'][tag_uuid]['notification_muted'] == True
# Simulate restart
datastore2 = ChangeDetectionStore(datastore_path=datastore_path)
datastore2.reload_state(
datastore_path=datastore_path,
include_default_watches=False,
version_tag='test'
)
# Verify mute state survived
assert tag_uuid in datastore2.data['settings']['application']['tags']
assert datastore2.data['settings']['application']['tags'][tag_uuid]['notification_muted'] == True
def test_tag_delete_removes_from_watches(client, live_server):
"""Test that deleting a tag removes it from all watches"""
datastore = client.application.config.get('DATASTORE')
# Create a tag
tag_uuid = datastore.add_tag('Test Tag')
# Create watches with this tag
uuid1 = datastore.add_watch(url='http://example1.com')
uuid2 = datastore.add_watch(url='http://example2.com')
uuid3 = datastore.add_watch(url='http://example3.com')
watch1 = datastore.data['watching'][uuid1]
watch2 = datastore.data['watching'][uuid2]
watch3 = datastore.data['watching'][uuid3]
watch1['tags'] = [tag_uuid]
watch1.commit()
watch2['tags'] = [tag_uuid, 'other-tag']
watch2.commit()
# watch3 has no tags
# Delete the tag
response = client.get(url_for("tags.delete", uuid=tag_uuid))
assert response.status_code == 302
# Wait for background thread to complete
time.sleep(1)
# Tag should be removed from settings
assert tag_uuid not in datastore.data['settings']['application']['tags']
# Tag should be removed from watches and persisted
def check_watch_tags(uuid):
watch = datastore.data['watching'][uuid]
watch_json_path = os.path.join(watch.data_dir, 'watch.json')
with open(watch_json_path, 'r') as f:
return json.load(f)['tags']
assert tag_uuid not in check_watch_tags(uuid1), "Tag should be removed from watch1"
assert tag_uuid not in check_watch_tags(uuid2), "Tag should be removed from watch2"
assert 'other-tag' in check_watch_tags(uuid2), "Other tags should remain in watch2"
assert check_watch_tags(uuid3) == [], "Watch3 should still have empty tags"
def test_watch_pause_unpause_persists(client, live_server):
"""Test that pause/unpause operations commit and persist"""
datastore = client.application.config.get('DATASTORE')
# Get API key
api_key = live_server.app.config['DATASTORE'].data['settings']['application'].get('api_access_token')
uuid = datastore.add_watch(url='http://example.com')
watch = datastore.data['watching'][uuid]
# Pause via API
response = client.get(url_for("watch", uuid=uuid, paused='paused'), headers={'x-api-key': api_key})
assert response.status_code == 200
# Check persisted to disk
watch_json_path = os.path.join(watch.data_dir, 'watch.json')
with open(watch_json_path, 'r') as f:
data = json.load(f)
assert data['paused'] == True, "Pause should be persisted"
# Unpause
response = client.get(url_for("watch", uuid=uuid, paused='unpaused'), headers={'x-api-key': api_key})
assert response.status_code == 200
with open(watch_json_path, 'r') as f:
data = json.load(f)
assert data['paused'] == False, "Unpause should be persisted"
def test_watch_mute_unmute_persists(client, live_server):
"""Test that mute/unmute operations commit and persist"""
datastore = client.application.config.get('DATASTORE')
# Get API key
api_key = live_server.app.config['DATASTORE'].data['settings']['application'].get('api_access_token')
uuid = datastore.add_watch(url='http://example.com')
watch = datastore.data['watching'][uuid]
# Mute via API
response = client.get(url_for("watch", uuid=uuid, muted='muted'), headers={'x-api-key': api_key})
assert response.status_code == 200
# Check persisted to disk
watch_json_path = os.path.join(watch.data_dir, 'watch.json')
with open(watch_json_path, 'r') as f:
data = json.load(f)
assert data['notification_muted'] == True, "Mute should be persisted"
# Unmute
response = client.get(url_for("watch", uuid=uuid, muted='unmuted'), headers={'x-api-key': api_key})
assert response.status_code == 200
with open(watch_json_path, 'r') as f:
data = json.load(f)
assert data['notification_muted'] == False, "Unmute should be persisted"
def test_ui_watch_edit_persists_all_fields(client, live_server):
"""Test that UI watch edit form persists all modified fields"""
from changedetectionio.store import ChangeDetectionStore
datastore = client.application.config.get('DATASTORE')
datastore_path = datastore.datastore_path
# Create watch
uuid = datastore.add_watch(url='http://example.com')
# Edit via UI with multiple field changes
response = client.post(
url_for("ui.ui_edit.edit_page", uuid=uuid),
data={
'url': 'http://updated-example.com',
'title': 'Updated Watch Title',
'time_between_check-hours': '2',
'time_between_check-minutes': '30',
'include_filters': '#content',
'fetch_backend': 'html_requests',
'method': 'POST',
'ignore_text': 'Advertisement\nTracking'
},
follow_redirects=True
)
assert b"Updated watch" in response.data or b"Saved" in response.data
# Simulate restart
datastore2 = ChangeDetectionStore(datastore_path=datastore_path)
datastore2.reload_state(
datastore_path=datastore_path,
include_default_watches=False,
version_tag='test'
)
# Verify all fields survived
watch = datastore2.data['watching'][uuid]
assert watch['url'] == 'http://updated-example.com'
assert watch['title'] == 'Updated Watch Title'
assert watch['time_between_check']['hours'] == 2
assert watch['time_between_check']['minutes'] == 30
assert watch['fetch_backend'] == 'html_requests'
assert watch['method'] == 'POST'
+20 -5
View File
@@ -69,7 +69,7 @@ def test_conditions_with_text_and_number(client, live_server, measure_memory_usa
# 1. The page filtered text must contain "5" (first digit of value)
# 2. The extracted number should be >= 20 and <= 100
res = client.post(
url_for("ui.ui_edit.edit_page", uuid=uuid),
url_for("ui.ui_edit.edit_page", uuid="first"),
data={
"url": test_url,
"fetch_backend": "html_requests",
@@ -110,20 +110,25 @@ def test_conditions_with_text_and_number(client, live_server, measure_memory_usa
wait_for_all_checks(client)
client.get(url_for("ui.mark_all_viewed"), follow_redirects=True)
time.sleep(1)
time.sleep(0.2)
wait_for_all_checks(client)
# Case 1
set_number_in_range_response(datastore_path=datastore_path, number="70.5")
client.get(url_for("ui.form_watch_checknow"), follow_redirects=True)
wait_for_all_checks(client)
time.sleep(2)
# 75 is > 20 and < 100 and contains "5"
res = client.get(url_for("watchlist.index"))
assert b'has-unread-changes' in res.data
# Case 2: Change with one condition violated
# Number out of range (150) but contains '5'
client.get(url_for("ui.mark_all_viewed"), follow_redirects=True)
time.sleep(0.2)
set_number_out_of_range_response(datastore_path=datastore_path, number="150.5")
@@ -149,6 +154,7 @@ def test_condition_validate_rule_row(client, live_server, measure_memory_usage,
client.get(url_for("ui.form_watch_checknow"), follow_redirects=True)
wait_for_all_checks(client)
uuid = next(iter(live_server.app.config['DATASTORE'].data['watching']))
# the front end submits the current form state which should override the watch in a temporary copy
res = client.post(
@@ -189,8 +195,12 @@ def test_condition_validate_rule_row(client, live_server, measure_memory_usage,
)
assert res.status_code == 200
assert b'false' in res.data
# cleanup for the next
client.get(
url_for("ui.form_delete", uuid="all"),
follow_redirects=True
)
delete_all_watches(client)
# If there was only a change in the whitespacing, then we shouldnt have a change detected
@@ -220,12 +230,17 @@ def test_wordcount_conditions_plugin(client, live_server, measure_memory_usage,
# Check it saved
res = client.get(
url_for("ui.ui_edit.edit_page", uuid=uuid),
url_for("ui.ui_edit.edit_page", uuid="first"),
)
# Assert the word count is counted correctly
assert b'<td>13</td>' in res.data
delete_all_watches(client)
# cleanup for the next
client.get(
url_for("ui.form_delete", uuid="all"),
follow_redirects=True
)
# If there was only a change in the whitespacing, then we shouldnt have a change detected
def test_lev_conditions_plugin(client, live_server, measure_memory_usage, datastore_path):
+9 -6
View File
@@ -71,19 +71,22 @@ def test_include_filters_output():
# Tests the whole stack works with the CSS Filter
def test_check_markup_include_filters_restriction(client, live_server, measure_memory_usage, datastore_path):
sleep_time_for_fetch_thread = 3
include_filters = "#sametext"
set_original_response(datastore_path=datastore_path)
# Give the endpoint time to spin up
time.sleep(1)
# Add our URL to the import page
test_url = url_for('test_endpoint', _external=True)
uuid = client.application.config.get('DATASTORE').add_watch(url=test_url)
client.get(url_for("ui.form_watch_checknow"), follow_redirects=True)
wait_for_all_checks(client)
# Give the thread time to pick it up
time.sleep(sleep_time_for_fetch_thread)
# Goto the edit page, add our ignore text
# Add our URL to the import page
@@ -100,15 +103,15 @@ def test_check_markup_include_filters_restriction(client, live_server, measure_m
)
assert bytes(include_filters.encode('utf-8')) in res.data
wait_for_all_checks(client)
# Give the thread time to pick it up
time.sleep(sleep_time_for_fetch_thread)
# Make a change
set_modified_response(datastore_path=datastore_path)
# Trigger a check
client.get(url_for("ui.form_watch_checknow"), follow_redirects=True)
wait_for_all_checks(client)
# Give the thread time to pick it up
time.sleep(sleep_time_for_fetch_thread)
# It should have 'has-unread-changes' still
# Because it should be looking at only that 'sametext' id
@@ -64,7 +64,6 @@ def test_DNS_errors(client, live_server, measure_memory_usage, datastore_path):
follow_redirects=True
)
assert b"1 Imported" in res.data
res = client.get(url_for("ui.form_watch_checknow"), follow_redirects=True)
# Give the thread time to pick it up
wait_for_all_checks(client)
@@ -80,7 +79,7 @@ def test_DNS_errors(client, live_server, measure_memory_usage, datastore_path):
)
assert found_name_resolution_error
# Should always record that we tried
assert "just now".encode('utf-8') in res.data or 'seconds ago'.encode('utf-8') in res.data
assert bytes("just now".encode('utf-8')) in res.data
delete_all_watches(client)
# Re 1513
@@ -6,6 +6,10 @@ from urllib.request import urlopen
from .util import set_original_response, set_modified_response, live_server_setup, wait_for_all_checks
import os
sleep_time_for_fetch_thread = 3
def test_check_extract_text_from_diff(client, live_server, measure_memory_usage, datastore_path):
import time
with open(os.path.join(datastore_path, "endpoint-content.txt"), "w") as f:
@@ -1,7 +1,7 @@
import os
import time
from flask import url_for
from .util import set_original_response, wait_for_all_checks, wait_for_notification_endpoint_output, delete_all_watches
from .util import set_original_response, wait_for_all_checks, wait_for_notification_endpoint_output
from ..notification import valid_notification_formats
@@ -118,10 +118,8 @@ def run_filter_test(client, live_server, content_filter, app_notification_format
res = client.get(url_for("watchlist.index"))
assert b'Warning, no filters were found' in res.data
assert not os.path.isfile(notification_file)
time.sleep(2)
wait_for_all_checks(client)
time.sleep(1)
wait_for_all_checks(client)
assert live_server.app.config['DATASTORE'].data['watching'][uuid]['consecutive_filter_failures'] == 5
time.sleep(2)
@@ -180,7 +178,6 @@ def run_filter_test(client, live_server, content_filter, app_notification_format
follow_redirects=True
)
os.unlink(notification_file)
delete_all_watches(client)
def test_check_include_filters_failure_notification(client, live_server, measure_memory_usage, datastore_path):
@@ -188,12 +185,10 @@ def test_check_include_filters_failure_notification(client, live_server, measure
run_filter_test(client=client, live_server=live_server, content_filter='#nope-doesnt-exist', app_notification_format=valid_notification_formats.get('htmlcolor'), datastore_path=datastore_path)
# Check markup send conversion didnt affect plaintext preference
run_filter_test(client=client, live_server=live_server, content_filter='#nope-doesnt-exist', app_notification_format=valid_notification_formats.get('text'), datastore_path=datastore_path)
delete_all_watches(client)
def test_check_xpath_filter_failure_notification(client, live_server, measure_memory_usage, datastore_path):
# # live_server_setup(live_server) # Setup on conftest per function
run_filter_test(client=client, live_server=live_server, content_filter='//*[@id="nope-doesnt-exist"]', app_notification_format=valid_notification_formats.get('htmlcolor'), datastore_path=datastore_path)
delete_all_watches(client)
# Test that notification is never sent
@@ -202,4 +197,3 @@ def test_basic_markup_from_text(client, live_server, measure_memory_usage, datas
from ..notification.handler import markup_text_links_to_html
x = markup_text_links_to_html("hello https://google.com")
assert 'a href' in x
delete_all_watches(client)
+1 -144
View File
@@ -5,8 +5,6 @@ from flask import url_for
from .util import live_server_setup, wait_for_all_checks, extract_rss_token_from_UI, get_UUID_for_tag_name, extract_UUID_from_client, delete_all_watches
import os
from ..store import ChangeDetectionStore
# def test_setup(client, live_server, measure_memory_usage, datastore_path):
# live_server_setup(live_server) # Setup on conftest per function
@@ -168,8 +166,7 @@ def test_tag_add_in_ui(client, live_server, measure_memory_usage, datastore_path
delete_all_watches(client)
def test_group_tag_notification(client, live_server, measure_memory_usage, datastore_path):
delete_all_watches(client)
set_original_response(datastore_path=datastore_path)
test_url = url_for('test_endpoint', _external=True)
@@ -476,143 +473,3 @@ the {test} appeared before. {test in res.data[:n]=}
n += t_index + len(test)
delete_all_watches(client)
def test_tag_json_persistence(client, live_server, measure_memory_usage, datastore_path):
"""
Test that tags are saved to individual tag.json files and loaded correctly.
This test verifies the update_27 tag storage refactoring:
- Tags are saved to {uuid}/tag.json files
- Tags persist across datastore restarts
- Tag edits write to tag.json
- Tag deletion removes tag.json file
"""
import json
datastore = client.application.config.get('DATASTORE')
# 1. Create a tag
res = client.post(
url_for("tags.form_tag_add"),
data={"name": "persistence-test-tag"},
follow_redirects=True
)
assert b"Tag added" in res.data
tag_uuid = get_UUID_for_tag_name(client, name="persistence-test-tag")
assert tag_uuid, "Tag UUID should exist"
# 2. Verify tag.json file was created
tag_json_path = os.path.join(datastore_path, tag_uuid, "tag.json")
assert os.path.exists(tag_json_path), f"tag.json should exist at {tag_json_path}"
# 3. Verify tag.json contains correct data
with open(tag_json_path, 'r') as f:
tag_data = json.load(f)
assert tag_data['title'] == 'persistence-test-tag'
assert tag_data['uuid'] == tag_uuid
assert 'date_created' in tag_data
# 4. Edit the tag
res = client.post(
url_for("tags.form_tag_edit_submit", uuid=tag_uuid),
data={
"name": "persistence-test-tag",
"notification_muted": True,
"include_filters": '#test-filter'
},
follow_redirects=True
)
assert b"Updated" in res.data
# 5. Verify tag.json was updated
with open(tag_json_path, 'r') as f:
tag_data = json.load(f)
assert tag_data['notification_muted'] == True
assert '#test-filter' in tag_data.get('include_filters', [])
# 5a. Verify tag is NOT in changedetection.json (tags should be in tag.json only)
changedetection_json_path = os.path.join(datastore_path, "changedetection.json")
with open(changedetection_json_path, 'r') as f:
settings_data = json.load(f)
# Tags dict should be empty in settings (all tags are in individual files)
assert settings_data['settings']['application']['tags'] == {}, \
"Tags should NOT be saved to changedetection.json (should be empty dict)"
# 6. Simulate restart - reload datastore
datastore2 = ChangeDetectionStore(datastore_path=datastore_path, include_default_watches=False, version_tag='test')
# 7. Verify tag was loaded from tag.json
assert tag_uuid in datastore2.data['settings']['application']['tags']
loaded_tag = datastore2.data['settings']['application']['tags'][tag_uuid]
assert loaded_tag['title'] == 'persistence-test-tag'
assert loaded_tag['notification_muted'] == True
assert '#test-filter' in loaded_tag.get('include_filters', [])
# 8. Delete the tag via API
res = client.get(url_for("tags.delete", uuid=tag_uuid), follow_redirects=True)
assert b"Tag deleted" in res.data
# 9. Verify tag.json file was deleted
assert not os.path.exists(tag_json_path), f"tag.json should be deleted at {tag_json_path}"
# 10. Verify tag is removed from settings
assert tag_uuid not in datastore.data['settings']['application']['tags']
delete_all_watches(client)
def test_tag_json_migration_update_27(client, live_server, measure_memory_usage, datastore_path):
"""
Test that update_27 migration correctly moves tags to individual files.
This simulates a pre-update_27 datastore and verifies migration works.
"""
import json
# 1. Create multiple tags
tag_names = ['migration-tag-1', 'migration-tag-2', 'migration-tag-3']
tag_uuids = []
for tag_name in tag_names:
res = client.post(
url_for("tags.form_tag_add"),
data={"name": tag_name},
follow_redirects=True
)
assert b"Tag added" in res.data
tag_uuid = get_UUID_for_tag_name(client, name=tag_name)
tag_uuids.append(tag_uuid)
# 2. Verify all tag.json files exist (update_27 already ran during add_tag)
for tag_uuid in tag_uuids:
tag_json_path = os.path.join(datastore_path, tag_uuid, "tag.json")
assert os.path.exists(tag_json_path), f"tag.json should exist for {tag_uuid}"
# 2a. Verify tags are NOT in changedetection.json
changedetection_json_path = os.path.join(datastore_path, "changedetection.json")
with open(changedetection_json_path, 'r') as f:
settings_data = json.load(f)
assert settings_data['settings']['application']['tags'] == {}, \
"Tags should NOT be in changedetection.json after migration"
# 3. Simulate restart
datastore2 = ChangeDetectionStore(datastore_path=datastore_path, include_default_watches=False, version_tag='test')
# 4. Verify all tags loaded from tag.json files
for idx, tag_uuid in enumerate(tag_uuids):
assert tag_uuid in datastore2.data['settings']['application']['tags']
loaded_tag = datastore2.data['settings']['application']['tags'][tag_uuid]
assert loaded_tag['title'] == tag_names[idx]
# Cleanup
res = client.get(url_for("tags.delete_all"), follow_redirects=True)
assert b'All tags deleted' in res.data
# Verify all tag.json files were deleted
for tag_uuid in tag_uuids:
tag_json_path = os.path.join(datastore_path, tag_uuid, "tag.json")
assert not os.path.exists(tag_json_path), f"tag.json should be deleted for {tag_uuid}"
delete_all_watches(client)
@@ -106,7 +106,7 @@ def test_consistent_history(client, live_server, measure_memory_usage, datastore
# Find the snapshot one
for fname in files_in_watch_dir:
if fname != 'history.txt' and fname != 'watch.json' and fname != 'last-checksum.txt' and 'html' not in fname:
if fname != 'history.txt' and fname != 'watch.json' and 'html' not in fname:
if strtobool(os.getenv("TEST_WITH_BROTLI")):
assert fname.endswith('.br'), "Forced TEST_WITH_BROTLI then it should be a .br filename"
@@ -123,18 +123,11 @@ def test_consistent_history(client, live_server, measure_memory_usage, datastore
assert json_obj['watching'][w]['title'], "Watch should have a title set"
assert contents.startswith(watch_title + "x"), f"Snapshot contents in file {fname} should start with '{watch_title}x', got '{contents}'"
# With new format, we have watch.json, so 4 files minimum
# Note: last-checksum.txt may or may not exist - it gets cleared by settings changes,
# and this test changes settings before checking files
# This assertion should be AFTER the loop, not inside it
# With new format, we also have watch.json, so 4 files total
if os.path.exists(changedetection_json):
# 4 required files: watch.json, html.br, history.txt, extracted text snapshot
# last-checksum.txt is optional (cleared by settings changes in this test)
assert len(files_in_watch_dir) >= 4 and len(files_in_watch_dir) <= 5, f"Should be 4-5 files in the dir with new format (last-checksum.txt is optional). Found {len(files_in_watch_dir)}: {files_in_watch_dir}"
assert len(files_in_watch_dir) == 4, "Should be four files in the dir with new format: watch.json, html.br snapshot, history.txt and the extracted text snapshot"
else:
# 3 required files: html.br, history.txt, extracted text snapshot
# last-checksum.txt is optional
assert len(files_in_watch_dir) >= 3 and len(files_in_watch_dir) <= 4, f"Should be 3-4 files in the dir with legacy format (last-checksum.txt is optional). Found {len(files_in_watch_dir)}: {files_in_watch_dir}"
assert len(files_in_watch_dir) == 3, "Should be just three files in the dir with legacy format: html.br snapshot, history.txt and the extracted text snapshot"
# Check that 'default' Watch vars aren't accidentally being saved
if os.path.exists(changedetection_json):
@@ -149,8 +142,6 @@ def test_consistent_history(client, live_server, measure_memory_usage, datastore
assert '"default"' not in f.read(), "'default' probably shouldnt be here, it came from when the 'default' Watch vars were accidently being saved"
delete_all_watches(client)
def test_check_text_history_view(client, live_server, measure_memory_usage, datastore_path):
with open(os.path.join(datastore_path, "endpoint-content.txt"), "w") as f:
@@ -171,7 +162,7 @@ def test_check_text_history_view(client, live_server, measure_memory_usage, data
client.get(url_for("ui.form_watch_checknow"), follow_redirects=True)
wait_for_all_checks(client)
res = client.get(url_for("ui.ui_diff.diff_history_page", uuid=uuid))
res = client.get(url_for("ui.ui_diff.diff_history_page", uuid="first"))
assert b'test-one' in res.data
assert b'test-two' in res.data
@@ -189,86 +180,3 @@ def test_check_text_history_view(client, live_server, measure_memory_usage, data
assert b'test-one' not in res.data
delete_all_watches(client)
def test_history_trim_global_only(client, live_server, measure_memory_usage, datastore_path):
# Add our URL to the import page
test_url = url_for('test_endpoint', _external=True)
uuid = None
limit = 3
for i in range(0, 10):
with open(os.path.join(datastore_path, "endpoint-content.txt"), "w") as f:
f.write(f"<html>test {i}</html>")
if not uuid:
uuid = client.application.config.get('DATASTORE').add_watch(url=test_url)
client.get(url_for("ui.form_watch_checknow"), follow_redirects=True)
wait_for_all_checks(client)
if i ==8:
watch = live_server.app.config['DATASTORE'].data['watching'][uuid]
history_n = len(list(watch.history.keys()))
logger.debug(f"History length should be at limit {limit} and it is {history_n}")
assert history_n == limit
if i == 6:
res = client.post(
url_for("settings.settings_page"),
data={"application-history_snapshot_max_length": limit},
follow_redirects=True
)
# It will need to detect one more change to start trimming it, which is really at 'start of 7'
assert b'Settings updated' in res.data
delete_all_watches(client)
def test_history_trim_global_override_in_watch(client, live_server, measure_memory_usage, datastore_path):
# Add our URL to the import page
test_url = url_for('test_endpoint', _external=True)
uuid = None
limit = 3
res = client.post(
url_for("settings.settings_page"),
data={"application-history_snapshot_max_length": 10000},
follow_redirects=True
)
# It will need to detect one more change to start trimming it, which is really at 'start of 7'
assert b'Settings updated' in res.data
for i in range(0, 10):
with open(os.path.join(datastore_path, "endpoint-content.txt"), "w") as f:
f.write(f"<html>test {i}</html>")
if not uuid:
uuid = client.application.config.get('DATASTORE').add_watch(url=test_url)
res = client.post(
url_for("ui.ui_edit.edit_page", uuid="first"),
data={"include_filters": "", "url": test_url, "tags": "", "headers": "", 'fetch_backend': "html_requests",
"time_between_check_use_default": "y", "history_snapshot_max_length": str(limit)},
follow_redirects=True
)
assert b"Updated watch." in res.data
wait_for_all_checks(client)
client.get(url_for("ui.form_watch_checknow"), follow_redirects=True)
wait_for_all_checks(client)
if i == 8:
watch = live_server.app.config['DATASTORE'].data['watching'][uuid]
history_n = len(list(watch.history.keys()))
logger.debug(f"History length should be at limit {limit} and it is {history_n}")
assert history_n == limit
if i == 6:
res = client.post(
url_for("settings.settings_page"),
data={"application-history_snapshot_max_length": limit},
follow_redirects=True
)
# It will need to detect one more change to start trimming it, which is really at 'start of 7'
assert b'Settings updated' in res.data
delete_all_watches(client)
@@ -41,6 +41,7 @@ def set_modified_ignore_response(datastore_path):
def test_render_anchor_tag_content_true(client, live_server, measure_memory_usage, datastore_path):
"""Testing that the link changes are detected when
render_anchor_tag_content setting is set to true"""
sleep_time_for_fetch_thread = 3
# Give the endpoint time to spin up
time.sleep(1)
@@ -40,7 +40,10 @@ def set_some_changed_response(datastore_path):
def test_normal_page_check_works_with_ignore_status_code(client, live_server, measure_memory_usage, datastore_path):
from loguru import logger
# Give the endpoint time to spin up
time.sleep(1)
set_original_response(datastore_path=datastore_path)
@@ -59,47 +62,27 @@ def test_normal_page_check_works_with_ignore_status_code(client, live_server, me
# Add our URL to the import page
test_url = url_for('test_endpoint', _external=True)
uuid = client.application.config.get('DATASTORE').add_watch(url=test_url)
logger.info(f"TEST: First check - queuing UUID {uuid}")
client.get(url_for("ui.form_watch_checknow"), follow_redirects=True)
logger.info(f"TEST: Waiting for first check to complete")
wait_result = wait_for_all_checks(client)
logger.info(f"TEST: First check wait completed: {wait_result}")
# Check history after first check
watch = client.application.config.get('DATASTORE').data['watching'][uuid]
logger.info(f"TEST: After first check - history count: {len(watch.history.keys())}")
wait_for_all_checks(client)
set_some_changed_response(datastore_path=datastore_path)
wait_for_all_checks(client)
# Trigger a check
logger.info(f"TEST: Second check - queuing UUID {uuid}")
client.get(url_for("ui.form_watch_checknow"), follow_redirects=True)
logger.info(f"TEST: Waiting for second check to complete")
wait_result = wait_for_all_checks(client)
logger.info(f"TEST: Second check wait completed: {wait_result}")
# Check history after second check
watch = client.application.config.get('DATASTORE').data['watching'][uuid]
logger.info(f"TEST: After second check - history count: {len(watch.history.keys())}")
logger.info(f"TEST: Watch history keys: {list(watch.history.keys())}")
# Give the thread time to pick it up
wait_for_all_checks(client)
# It should report nothing found (no new 'has-unread-changes' class)
res = client.get(url_for("watchlist.index"))
if b'has-unread-changes' not in res.data:
logger.error(f"TEST FAILED: has-unread-changes not found in response")
logger.error(f"TEST: Watch last_error: {watch.get('last_error')}")
logger.error(f"TEST: Watch last_checked: {watch.get('last_checked')}")
assert b'has-unread-changes' in res.data
assert b'/test-endpoint' in res.data
# Tests the whole stack works with staus codes ignored
def test_403_page_check_works_with_ignore_status_code(client, live_server, measure_memory_usage, datastore_path):
sleep_time_for_fetch_thread = 3
set_original_response(datastore_path=datastore_path)
@@ -111,7 +94,8 @@ def test_403_page_check_works_with_ignore_status_code(client, live_server, measu
uuid = client.application.config.get('DATASTORE').add_watch(url=test_url)
client.get(url_for("ui.form_watch_checknow"), follow_redirects=True)
wait_for_all_checks(client)
# Give the thread time to pick it up
time.sleep(sleep_time_for_fetch_thread)
# Goto the edit page, check our ignore option
# Add our URL to the import page
@@ -2,9 +2,10 @@
import time
from flask import url_for
from . util import live_server_setup
import os
from .util import live_server_setup, delete_all_watches, wait_for_all_checks
# Should be the same as set_original_ignore_response(datastore_path=datastore_path) but with a little more whitespacing
@@ -49,7 +50,10 @@ def set_original_ignore_response(datastore_path):
# If there was only a change in the whitespacing, then we shouldnt have a change detected
def test_check_ignore_whitespace(client, live_server, measure_memory_usage, datastore_path):
sleep_time_for_fetch_thread = 3
# Give the endpoint time to spin up
time.sleep(1)
set_original_ignore_response(datastore_path=datastore_path)
@@ -70,17 +74,17 @@ def test_check_ignore_whitespace(client, live_server, measure_memory_usage, data
uuid = client.application.config.get('DATASTORE').add_watch(url=test_url)
client.get(url_for("ui.form_watch_checknow"), follow_redirects=True)
wait_for_all_checks(client)
time.sleep(sleep_time_for_fetch_thread)
# Trigger a check
client.get(url_for("ui.form_watch_checknow"), follow_redirects=True)
set_original_ignore_response_but_with_whitespace(datastore_path)
wait_for_all_checks(client)
time.sleep(sleep_time_for_fetch_thread)
# Trigger a check
client.get(url_for("ui.form_watch_checknow"), follow_redirects=True)
# Give the thread time to pick it up
wait_for_all_checks(client)
time.sleep(sleep_time_for_fetch_thread)
# It should report nothing found (no new 'has-unread-changes' class)
res = client.get(url_for("watchlist.index"))
+1 -1
View File
@@ -82,7 +82,7 @@ def test_import_distillio(client, live_server, measure_memory_usage, datastore_p
# Give the endpoint time to spin up
time.sleep(1)
delete_all_watches(client)
client.get(url_for("ui.form_delete", uuid="all"), follow_redirects=True)
res = client.post(
url_for("imports.import_page"),
data={
@@ -224,7 +224,6 @@ def check_json_filter(json_filter, client, live_server, datastore_path):
set_original_response(datastore_path=datastore_path)
delete_all_watches(client)
# Add our URL to the import page
test_url = url_for('test_endpoint', content_type="application/json", _external=True)
uuid = client.application.config.get('DATASTORE').add_watch(url=test_url, extras={"include_filters": json_filter.splitlines()})
@@ -298,17 +297,14 @@ def check_json_filter_bool_val(json_filter, client, live_server, datastore_path)
def test_check_jsonpath_filter_bool_val(client, live_server, measure_memory_usage, datastore_path):
check_json_filter_bool_val("json:$['available']", client, live_server, datastore_path=datastore_path)
delete_all_watches(client)
def test_check_jq_filter_bool_val(client, live_server, measure_memory_usage, datastore_path):
if jq_support:
check_json_filter_bool_val("jq:.available", client, live_server, datastore_path=datastore_path)
delete_all_watches(client)
def test_check_jqraw_filter_bool_val(client, live_server, measure_memory_usage, datastore_path):
if jq_support:
check_json_filter_bool_val("jq:.available", client, live_server, datastore_path=datastore_path)
delete_all_watches(client)
# Re #265 - Extended JSON selector test
# Stuff to consider here
@@ -456,17 +452,14 @@ def test_correct_header_detect(client, live_server, measure_memory_usage, datast
def test_check_jsonpath_ext_filter(client, live_server, measure_memory_usage, datastore_path):
check_json_ext_filter('json:$[?(@.status==Sold)]', client, live_server, datastore_path=datastore_path)
delete_all_watches(client)
def test_check_jq_ext_filter(client, live_server, measure_memory_usage, datastore_path):
if jq_support:
check_json_ext_filter('jq:.[] | select(.status | contains("Sold"))', client, live_server, datastore_path=datastore_path)
delete_all_watches(client)
def test_check_jqraw_ext_filter(client, live_server, measure_memory_usage, datastore_path):
if jq_support:
check_json_ext_filter('jq:.[] | select(.status | contains("Sold"))', client, live_server, datastore_path=datastore_path)
delete_all_watches(client)
def test_jsonpath_BOM_utf8(client, live_server, measure_memory_usage, datastore_path):
from .. import html_tools
@@ -477,6 +470,5 @@ def test_jsonpath_BOM_utf8(client, live_server, measure_memory_usage, datastore_
# See that we can find the second <script> one, which is not broken, and matches our filter
text = html_tools.extract_json_as_string(json_str, "json:$.name")
assert text == '"José"'
delete_all_watches(client)
+8 -2
View File
@@ -313,8 +313,14 @@ def test_notification_custom_endpoint_and_jinja2(client, live_server, measure_me
# Add a watch and trigger a HTTP POST
test_url = url_for('test_endpoint', _external=True)
watch_uuid = client.application.config.get('DATASTORE').add_watch(url=test_url, tag="nice one")
res = client.get(url_for("ui.form_watch_checknow"), follow_redirects=True)
res = client.post(
url_for("ui.ui_views.form_quick_watch_add"),
data={"url": test_url, "tags": 'nice one'},
follow_redirects=True
)
assert b"Watch added" in res.data
watch_uuid = next(iter(live_server.app.config['DATASTORE'].data['watching']))
wait_for_all_checks(client)
set_modified_response(datastore_path=datastore_path)
@@ -1,7 +1,7 @@
import os
import time
from flask import url_for
from .util import set_original_response, set_modified_response, live_server_setup, wait_for_all_checks, delete_all_watches
from .util import set_original_response, set_modified_response, live_server_setup, wait_for_all_checks
import logging
def test_check_notification_error_handling(client, live_server, measure_memory_usage, datastore_path):
@@ -81,4 +81,4 @@ def test_check_notification_error_handling(client, live_server, measure_memory_u
os.unlink(os.path.join(datastore_path, "notification.txt"))
assert 'xxxxx' in notification_submission
delete_all_watches(client)
client.get(url_for("ui.form_delete", uuid="all"), follow_redirects=True)
@@ -1,52 +0,0 @@
import os
import time
from flask import url_for
from .util import set_original_response, wait_for_all_checks, wait_for_notification_endpoint_output
from ..notification import valid_notification_formats
from loguru import logger
def test_queue_system(client, live_server, measure_memory_usage, datastore_path):
"""Test that multiple workers can process queue concurrently without blocking each other"""
# (pytest) Werkzeug's threaded server uses ThreadPoolExecutor with a default limit of around 40 threads (or min(32, os.cpu_count() + 4)).
items = os.cpu_count() +3
delay = 10
# Auto-queue is off here.
live_server.app.config['DATASTORE'].data['settings']['application']['all_paused'] = True
test_urls = [
f"{url_for('test_endpoint', _external=True)}?delay={delay}&id={i}&content=hello+test+content+{i}"
for i in range(0, items)
]
# Import 30 URLs to queue
res = client.post(
url_for("imports.import_page"),
data={"urls": "\r\n".join(test_urls)},
follow_redirects=True
)
assert f"{items} Imported".encode('utf-8') in res.data
client.application.set_workers(items)
start = time.time()
res = client.get(url_for("ui.form_watch_checknow"), follow_redirects=True)
time.sleep(delay/2)
# Verify all workers are idle (no UUIDs being processed)
from changedetectionio import worker_pool
running_uuids = worker_pool.get_running_uuids()
logger.debug( f"Should be atleast some workers running - {len(running_uuids)} UUIDs still being processed: {running_uuids}")
assert len(running_uuids) != 0, f"Should be atleast some workers running - {len(running_uuids)} UUIDs still being processed: {running_uuids}"
wait_for_all_checks(client)
# all workers should be done in less than say 10 seconds (they take time to 'see' something is in the queue too)
total_time = (time.time() - start)
logger.debug(f"All workers finished {items} items in less than {delay} seconds per job. {total_time}s total")
# if there was a bug in queue handler not running parallel, this would blow out to items*delay seconds
assert total_time < delay + 10, f"All workers finished {items} items in less than {delay} seconds per job, total time {total_time}s"
# Verify all workers are idle (no UUIDs being processed)
from changedetectionio import worker_pool
running_uuids = worker_pool.get_running_uuids()
assert len(running_uuids) == 0, f"Expected all workers to be idle, but {len(running_uuids)} UUIDs still being processed: {running_uuids}"
+8 -9
View File
@@ -17,12 +17,12 @@ def test_headers_in_request(client, live_server, measure_memory_usage, datastore
test_url = test_url.replace('localhost', 'changedet')
# Add the test URL twice, we will check
uuidA = client.application.config.get('DATASTORE').add_watch(url=test_url)
uuid = client.application.config.get('DATASTORE').add_watch(url=test_url)
client.get(url_for("ui.form_watch_checknow"), follow_redirects=True)
wait_for_all_checks(client)
uuidB = client.application.config.get('DATASTORE').add_watch(url=test_url)
uuid = client.application.config.get('DATASTORE').add_watch(url=test_url)
client.get(url_for("ui.form_watch_checknow"), follow_redirects=True)
wait_for_all_checks(client)
@@ -31,7 +31,7 @@ def test_headers_in_request(client, live_server, measure_memory_usage, datastore
# Add some headers to a request
res = client.post(
url_for("ui.ui_edit.edit_page", uuid=uuidA),
url_for("ui.ui_edit.edit_page", uuid="first"),
data={
"url": test_url,
"tags": "",
@@ -42,14 +42,13 @@ def test_headers_in_request(client, live_server, measure_memory_usage, datastore
)
assert b"Updated watch." in res.data
client.get(url_for("ui.form_watch_checknow"), follow_redirects=True)
# Give the thread time to pick up the first version
wait_for_all_checks(client)
# The service should echo back the request headers
res = client.get(
url_for("ui.ui_preview.preview_page", uuid=uuidA),
url_for("ui.ui_preview.preview_page", uuid="first"),
follow_redirects=True
)
@@ -93,7 +92,7 @@ def test_body_in_request(client, live_server, measure_memory_usage, datastore_pa
# add the first 'version'
res = client.post(
url_for("ui.ui_edit.edit_page", uuid=uuid),
url_for("ui.ui_edit.edit_page", uuid="first"),
data={
"url": test_url,
"tags": "",
@@ -111,7 +110,7 @@ def test_body_in_request(client, live_server, measure_memory_usage, datastore_pa
body_value = 'Test Body Value {{ 1+1 }}'
body_value_formatted = 'Test Body Value 2'
res = client.post(
url_for("ui.ui_edit.edit_page", uuid=uuid),
url_for("ui.ui_edit.edit_page", uuid="first"),
data={
"url": test_url,
"tags": "",
@@ -127,7 +126,7 @@ def test_body_in_request(client, live_server, measure_memory_usage, datastore_pa
# The service should echo back the body
res = client.get(
url_for("ui.ui_preview.preview_page", uuid=uuid),
url_for("ui.ui_preview.preview_page", uuid="first"),
follow_redirects=True
)
@@ -158,7 +157,7 @@ def test_body_in_request(client, live_server, measure_memory_usage, datastore_pa
# Attempt to add a body with a GET method
res = client.post(
url_for("ui.ui_edit.edit_page", uuid=uuid),
url_for("ui.ui_edit.edit_page", uuid="first"),
data={
"url": test_url,
"tags": "",
@@ -236,7 +236,6 @@ def test_restock_itemprop_with_tag(client, live_server, measure_memory_usage, da
}
_run_test_minmax_limit(client, extra_watch_edit_form=extras,datastore_path=datastore_path)
delete_all_watches(client)
@@ -389,10 +388,9 @@ def test_change_with_notification_values(client, live_server, measure_memory_usa
os.unlink(os.path.join(datastore_path, "notification.txt"))
uuid = next(iter(live_server.app.config['DATASTORE'].data['watching']))
res = client.post(url_for("ui.ui_notification.ajax_callback_send_notification_test", watch_uuid=uuid), data={}, follow_redirects=True)
wait_for_notification_endpoint_output(datastore_path=datastore_path)
time.sleep(5)
assert os.path.isfile(os.path.join(datastore_path, "notification.txt")), "Notification received"
delete_all_watches(client)
def test_data_sanity(client, live_server, measure_memory_usage, datastore_path):
@@ -408,7 +406,6 @@ def test_data_sanity(client, live_server, measure_memory_usage, datastore_path):
follow_redirects=True
)
client.get(url_for("ui.form_watch_checknow"), follow_redirects=True)
wait_for_all_checks(client)
res = client.get(url_for("watchlist.index"))
@@ -420,7 +417,6 @@ def test_data_sanity(client, live_server, measure_memory_usage, datastore_path):
data={"url": test_url2, "tags": 'restock tests', 'processor': 'restock_diff'},
follow_redirects=True
)
client.get(url_for("ui.form_watch_checknow"), follow_redirects=True)
wait_for_all_checks(client)
res = client.get(url_for("watchlist.index"))
assert str(res.data.decode()).count("950.95") == 1, "Price should only show once (for the watch added, no other watches yet)"
@@ -466,4 +462,3 @@ def test_special_prop_examples(client, live_server, measure_memory_usage, datast
assert b'ception' not in res.data
assert b'155.55' in res.data
delete_all_watches(client)
+1 -1
View File
@@ -107,7 +107,7 @@ def test_rss_and_token(client, live_server, measure_memory_usage, datastore_path
assert b"Access denied, bad token" not in res.data
assert b"Random content" in res.data
delete_all_watches(client)
client.get(url_for("ui.form_delete", uuid="all"), follow_redirects=True)
def test_basic_cdata_rss_markup(client, live_server, measure_memory_usage, datastore_path):
@@ -23,7 +23,6 @@ def test_rss_feed_empty(client, live_server, measure_memory_usage, datastore_pat
)
assert res.status_code == 400
assert b'does not have enough history snapshots to show' in res.data
delete_all_watches(client)
def test_rss_single_watch_order(client, live_server, measure_memory_usage, datastore_path):
"""
-100
View File
@@ -24,29 +24,6 @@ def set_original_response(datastore_path):
f.write(test_return_data)
return None
def test_favicon(client, live_server, measure_memory_usage, datastore_path):
# Attempt to fetch it, make sure that works
SVG_BASE64 = 'PHN2ZyB4bWxucz0iaHR0cDovL3d3dy53My5vcmcvMjAwMC9zdmciIHZpZXdCb3g9IjAgMCAxIDEiLz4='
uuid = client.application.config.get('DATASTORE').add_watch(url='https://localhost')
live_server.app.config['DATASTORE'].data['watching'][uuid].bump_favicon(url="favicon-set-type.svg",
favicon_base_64=SVG_BASE64
)
res = client.get(url_for('static_content', group='favicon', filename=uuid))
assert res.status_code == 200
assert len(res.data) > 10
res = client.get(url_for('static_content', group='..', filename='__init__.py'))
assert res.status_code != 200
res = client.get(url_for('static_content', group='.', filename='../__init__.py'))
assert res.status_code != 200
# Traverse by filename protection
res = client.get(url_for('static_content', group='js', filename='../styles/styles.css'))
assert res.status_code != 200
def test_bad_access(client, live_server, measure_memory_usage, datastore_path):
res = client.post(
@@ -501,80 +478,3 @@ def test_logout_with_redirect(client, live_server, measure_memory_usage, datasto
# Cleanup
del client.application.config['DATASTORE'].data['settings']['application']['password']
def test_static_directory_traversal(client, live_server, measure_memory_usage, datastore_path):
"""
Test that the static file serving route properly blocks directory traversal attempts.
This tests the fix for GHSA-9jj8-v89v-xjvw (CVE pending).
The vulnerability was in /static/<group>/<filename> where the sanitization regex
allowed dots, enabling "../" traversal to read application source files.
The fix changed the regex from r'[^\w.-]+' to r'[^a-z0-9_]+' which blocks dots.
"""
# Test 1: Direct .. traversal attempt (URL-encoded)
res = client.get(
"/static/%2e%2e/flask_app.py",
follow_redirects=False
)
# Should be blocked (404 or 403)
assert res.status_code in [404, 403], f"Expected 404/403, got {res.status_code}"
# Should NOT contain application source code
assert b"def static_content" not in res.data
assert b"changedetection_app" not in res.data
# Test 2: Direct .. traversal attempt (unencoded)
res = client.get(
"/static/../flask_app.py",
follow_redirects=False
)
assert res.status_code in [404, 403], f"Expected 404/403, got {res.status_code}"
assert b"def static_content" not in res.data
# Test 3: Multiple dots traversal
res = client.get(
"/static/..../flask_app.py",
follow_redirects=False
)
assert res.status_code in [404, 403], f"Expected 404/403, got {res.status_code}"
assert b"def static_content" not in res.data
# Test 4: Try to access other application files
for filename in ["__init__.py", "datastore.py", "store.py"]:
res = client.get(
f"/static/%2e%2e/{filename}",
follow_redirects=False
)
assert res.status_code in [404, 403], f"File {filename} should be blocked"
# Should not contain Python code indicators
assert b"import" not in res.data or b"# Test" in res.data # Allow "1 Imported" etc
# Test 5: Verify legitimate static files still work
# Note: We can't test actual files without knowing what exists,
# but we can verify the sanitization doesn't break valid groups
res = client.get(
"/static/images/test.png", # Will 404 if file doesn't exist, but won't traverse
follow_redirects=False
)
# Should get 404 (file not found) not 403 (blocked)
# This confirms the group name "images" is valid
assert res.status_code == 404
# Test 6: Ensure hyphens and dots are blocked in group names
res = client.get(
"/static/../../../etc/passwd",
follow_redirects=False
)
assert res.status_code in [404, 403]
assert b"root:" not in res.data
# Test 7: Test that underscores still work (they're allowed)
res = client.get(
"/static/visual_selector_data/test.json",
follow_redirects=False
)
# visual_selector_data is a real group, but requires auth
# Should get 403 (not authenticated) or 404 (file not found), not a path traversal
assert res.status_code in [403, 404]
@@ -1,208 +0,0 @@
#!/usr/bin/env python3
"""
Test that changing global settings or tag configurations forces reprocessing.
When settings or tag configurations change, all affected watches need to
reprocess even if their content hasn't changed, because configuration affects
the processing result.
"""
import os
import time
from flask import url_for
from .util import wait_for_all_checks
def test_settings_change_forces_reprocess(client, live_server, measure_memory_usage, datastore_path):
"""
Test that changing global settings clears all checksums to force reprocessing.
"""
# Setup test content
test_html = """<html>
<body>
<p>Test content that stays the same</p>
</body>
</html>
"""
with open(os.path.join(datastore_path, "endpoint-content.txt"), "w") as f:
f.write(test_html)
test_url = url_for('test_endpoint', _external=True)
# Add two watches
datastore = client.application.config.get('DATASTORE')
uuid1 = datastore.add_watch(url=test_url, extras={'title': 'Watch 1'})
uuid2 = datastore.add_watch(url=test_url, extras={'title': 'Watch 2'})
# Unpause watches
datastore.data['watching'][uuid1]['paused'] = False
datastore.data['watching'][uuid2]['paused'] = False
# First check - establishes baseline
client.get(url_for("ui.form_watch_checknow"), follow_redirects=True)
wait_for_all_checks(client)
# Verify checksum files were created
checksum1 = os.path.join(datastore_path, uuid1, 'last-checksum.txt')
checksum2 = os.path.join(datastore_path, uuid2, 'last-checksum.txt')
assert os.path.isfile(checksum1), "First check should create checksum file for watch 1"
assert os.path.isfile(checksum2), "First check should create checksum file for watch 2"
# Change global settings (any setting will do)
res = client.post(
url_for("settings.settings_page"),
data={
"application-empty_pages_are_a_change": "",
"requests-time_between_check-minutes": 180,
'application-fetch_backend': "html_requests"
},
follow_redirects=True
)
assert b"Settings updated." in res.data
# Give it a moment to process
time.sleep(0.5)
# Verify ALL checksum files were deleted
assert not os.path.isfile(checksum1), "Settings change should delete checksum for watch 1"
assert not os.path.isfile(checksum2), "Settings change should delete checksum for watch 2"
# Next check should reprocess (not skip) and recreate checksums
client.get(url_for("ui.form_watch_checknow"), follow_redirects=True)
wait_for_all_checks(client)
# Verify checksum files were recreated
assert os.path.isfile(checksum1), "Reprocessing should recreate checksum file for watch 1"
assert os.path.isfile(checksum2), "Reprocessing should recreate checksum file for watch 2"
print("✓ Settings change forces reprocessing of all watches")
def test_tag_change_forces_reprocess(client, live_server, measure_memory_usage, datastore_path):
"""
Test that changing a tag configuration clears checksums only for watches with that tag.
"""
# Setup test content
test_html = """<html>
<body>
<p>Test content that stays the same</p>
</body>
</html>
"""
with open(os.path.join(datastore_path, "endpoint-content.txt"), "w") as f:
f.write(test_html)
test_url = url_for('test_endpoint', _external=True)
# Create a tag
datastore = client.application.config.get('DATASTORE')
tag_uuid = datastore.add_tag('Test Tag')
# Add watches - one with tag, one without
uuid_with_tag = datastore.add_watch(url=test_url, extras={'title': 'Watch With Tag', 'tags': [tag_uuid]})
uuid_without_tag = datastore.add_watch(url=test_url, extras={'title': 'Watch Without Tag'})
# Unpause watches
datastore.data['watching'][uuid_with_tag]['paused'] = False
datastore.data['watching'][uuid_without_tag]['paused'] = False
# First check - establishes baseline
client.get(url_for("ui.form_watch_checknow"), follow_redirects=True)
wait_for_all_checks(client)
# Verify checksum files were created
checksum_with = os.path.join(datastore_path, uuid_with_tag, 'last-checksum.txt')
checksum_without = os.path.join(datastore_path, uuid_without_tag, 'last-checksum.txt')
assert os.path.isfile(checksum_with), "First check should create checksum for tagged watch"
assert os.path.isfile(checksum_without), "First check should create checksum for untagged watch"
# Edit the tag (change notification_muted as an example)
tag = datastore.data['settings']['application']['tags'][tag_uuid]
res = client.post(
url_for("tags.form_tag_edit_submit", uuid=tag_uuid),
data={
'title': 'Test Tag',
'notification_muted': 'y',
'overrides_watch': 'n'
},
follow_redirects=True
)
assert b"Updated" in res.data
# Give it a moment to process
time.sleep(0.5)
# Verify ONLY the tagged watch's checksum was deleted
assert not os.path.isfile(checksum_with), "Tag change should delete checksum for watch WITH tag"
assert os.path.isfile(checksum_without), "Tag change should NOT delete checksum for watch WITHOUT tag"
# Next check should reprocess tagged watch and recreate its checksum
client.get(url_for("ui.form_watch_checknow"), follow_redirects=True)
wait_for_all_checks(client)
# Verify tagged watch's checksum was recreated
assert os.path.isfile(checksum_with), "Reprocessing should recreate checksum for tagged watch"
assert os.path.isfile(checksum_without), "Untagged watch should still have its checksum"
print("✓ Tag change forces reprocessing only for watches with that tag")
def test_tag_change_via_api_forces_reprocess(client, live_server, measure_memory_usage, datastore_path):
"""
Test that updating a tag via API also clears checksums for affected watches.
"""
# Setup test content
test_html = """<html>
<body>
<p>Test content</p>
</body>
</html>
"""
with open(os.path.join(datastore_path, "endpoint-content.txt"), "w") as f:
f.write(test_html)
test_url = url_for('test_endpoint', _external=True)
# Create a tag
datastore = client.application.config.get('DATASTORE')
tag_uuid = datastore.add_tag('API Test Tag')
# Add watch with tag
uuid_with_tag = datastore.add_watch(url=test_url, extras={'title': 'API Watch'})
datastore.data['watching'][uuid_with_tag]['paused'] = False
datastore.data['watching'][uuid_with_tag]['tags'] = [tag_uuid]
datastore.data['watching'][uuid_with_tag].commit()
# First check
client.get(url_for("ui.form_watch_checknow"), follow_redirects=True)
wait_for_all_checks(client)
# Verify checksum exists
checksum_file = os.path.join(datastore_path, uuid_with_tag, 'last-checksum.txt')
assert os.path.isfile(checksum_file), "First check should create checksum file"
# Update tag via API
res = client.put(
f'/api/v1/tag/{tag_uuid}',
json={'notification_muted': True},
headers={'x-api-key': datastore.data['settings']['application']['api_access_token']}
)
assert res.status_code == 200, f"API call failed with status {res.status_code}: {res.data}"
# Give it more time for async operations
time.sleep(1.0)
# Debug: Check if checksum still exists
if os.path.isfile(checksum_file):
# Read checksum to see if it changed
with open(checksum_file, 'r') as f:
checksum_content = f.read()
print(f"Checksum still exists: {checksum_content}")
# Verify checksum was deleted
assert not os.path.isfile(checksum_file), "API tag update should delete checksum"
print("✓ Tag update via API forces reprocessing")
+8 -8
View File
@@ -6,6 +6,9 @@ from urllib.request import urlopen
from .util import set_original_response, set_modified_response, live_server_setup, delete_all_watches
import re
sleep_time_for_fetch_thread = 3
def test_share_watch(client, live_server, measure_memory_usage, datastore_path):
set_original_response(datastore_path=datastore_path)
@@ -21,20 +24,20 @@ def test_share_watch(client, live_server, measure_memory_usage, datastore_path):
# Goto the edit page, add our ignore text
# Add our URL to the import page
res = client.post(
url_for("ui.ui_edit.edit_page", uuid=uuid),
url_for("ui.ui_edit.edit_page", uuid="first"),
data={"include_filters": include_filters, "url": test_url, "tags": "", "headers": "", 'fetch_backend': "html_requests", "time_between_check_use_default": "y"},
follow_redirects=True
)
assert b"Updated watch." in res.data
# Check it saved
res = client.get(
url_for("ui.ui_edit.edit_page", uuid=uuid),
url_for("ui.ui_edit.edit_page", uuid="first"),
)
assert bytes(include_filters.encode('utf-8')) in res.data
# click share the link
res = client.get(
url_for("ui.form_share_put_watch", uuid=uuid),
url_for("ui.form_share_put_watch", uuid="first"),
follow_redirects=True
)
@@ -60,16 +63,13 @@ def test_share_watch(client, live_server, measure_memory_usage, datastore_path):
# Now hit edit, we should see what we expect
# that the import fetched the meta-data
uuids = list(client.application.config.get('DATASTORE').data['watching'])
assert uuids, "It saved/imported and created a new URL from the share"
# Check it saved
res = client.get(
url_for("ui.ui_edit.edit_page", uuid=uuids[0]),
url_for("ui.ui_edit.edit_page", uuid="first"),
)
assert bytes(include_filters.encode('utf-8')) in res.data
# Check it saved the URL
res = client.get(url_for("watchlist.index"))
assert bytes(test_url.encode('utf-8')) in res.data
delete_all_watches(client)
+2 -4
View File
@@ -6,6 +6,7 @@ from urllib.request import urlopen
from .util import set_original_response, set_modified_response, live_server_setup, wait_for_all_checks
from ..diff import ADDED_STYLE
sleep_time_for_fetch_thread = 3
def test_check_basic_change_detection_functionality_source(client, live_server, measure_memory_usage, datastore_path):
set_original_response(datastore_path=datastore_path)
@@ -71,10 +72,7 @@ def test_check_ignore_elements(client, live_server, measure_memory_usage, datast
follow_redirects=True
)
client.get(url_for("ui.form_watch_checknow"), follow_redirects=True)
wait_for_all_checks(client)
time.sleep(sleep_time_for_fetch_thread)
res = client.get(
url_for("ui.ui_preview.preview_page", uuid="first"),

Some files were not shown because too many files have changed in this diff Show More