Compare commits

...

16 Commits

Author SHA1 Message Date
dgtlmoon
fffcc9af39 WIP
Some checks failed
Publish Python 🐍distribution 📦 to PyPI and TestPyPI / Build distribution 📦 (push) Has been cancelled
ChangeDetection.io App Test / lint-code (push) Has been cancelled
Publish Python 🐍distribution 📦 to PyPI and TestPyPI / Test the built package works basically. (push) Has been cancelled
Publish Python 🐍distribution 📦 to PyPI and TestPyPI / Publish Python 🐍 distribution 📦 to PyPI (push) Has been cancelled
ChangeDetection.io App Test / test-application-3-10 (push) Has been cancelled
ChangeDetection.io App Test / test-application-3-11 (push) Has been cancelled
ChangeDetection.io App Test / test-application-3-12 (push) Has been cancelled
ChangeDetection.io App Test / test-application-3-13 (push) Has been cancelled
2026-02-13 15:44:54 +01:00
dgtlmoon
961901c594 WIP 2026-02-13 15:15:34 +01:00
dgtlmoon
340421ea36 Minor cache 2026-02-13 14:58:23 +01:00
dgtlmoon
f29c4c8f5f WIP 2026-02-13 14:54:34 +01:00
dgtlmoon
9702b6c8a1 Tweak message 2026-02-13 14:52:34 +01:00
dgtlmoon
798fc21f1c WIP 2026-02-13 14:50:23 +01:00
dgtlmoon
0c6931c07c WIP 2026-02-13 14:40:43 +01:00
dgtlmoon
60ed2a26ea WIP 2026-02-13 14:28:56 +01:00
dgtlmoon
490ca0a663 WIP 2026-02-13 11:41:55 +01:00
dgtlmoon
10c9df288a WIP 2026-02-13 11:24:17 +01:00
dgtlmoon
f54725d292 Increase test coverage 2026-02-13 09:18:28 +01:00
dgtlmoon
acf9e4a1e6 Remove flask_expects_json 2026-02-13 09:10:31 +01:00
dgtlmoon
7ddc0f9be0 Sync API Spec with base model 2026-02-13 09:10:04 +01:00
dgtlmoon
20f11c5c4a Improve error logging 2026-02-13 08:49:09 +01:00
dgtlmoon
4bc01aca8d Price tracker - Use a more memory efficient price scraper, use subprocess on linux for cleaner memory management. (#3864)
Some checks failed
Build and push containers / metadata (push) Has been cancelled
Build and push containers / build-push-containers (push) Has been cancelled
Publish Python 🐍distribution 📦 to PyPI and TestPyPI / Build distribution 📦 (push) Has been cancelled
ChangeDetection.io Container Build Test / Build linux/amd64 (alpine) (push) Has been cancelled
ChangeDetection.io Container Build Test / Build linux/arm64 (alpine) (push) Has been cancelled
ChangeDetection.io Container Build Test / Build linux/amd64 (main) (push) Has been cancelled
ChangeDetection.io Container Build Test / Build linux/arm/v7 (main) (push) Has been cancelled
ChangeDetection.io Container Build Test / Build linux/arm/v8 (main) (push) Has been cancelled
ChangeDetection.io Container Build Test / Build linux/arm64 (main) (push) Has been cancelled
ChangeDetection.io App Test / lint-code (push) Has been cancelled
Publish Python 🐍distribution 📦 to PyPI and TestPyPI / Test the built package works basically. (push) Has been cancelled
Publish Python 🐍distribution 📦 to PyPI and TestPyPI / Publish Python 🐍 distribution 📦 to PyPI (push) Has been cancelled
ChangeDetection.io App Test / test-application-3-10 (push) Has been cancelled
ChangeDetection.io App Test / test-application-3-11 (push) Has been cancelled
ChangeDetection.io App Test / test-application-3-12 (push) Has been cancelled
ChangeDetection.io App Test / test-application-3-13 (push) Has been cancelled
CodeQL / Analyze (javascript) (push) Has been cancelled
CodeQL / Analyze (python) (push) Has been cancelled
2026-02-11 17:21:08 +01:00
dgtlmoon
ef41dd304c Refactoring upgrade path (#3861) 2026-02-11 16:13:08 +01:00
24 changed files with 2090 additions and 596 deletions

View File

@@ -103,7 +103,7 @@ jobs:
docker run test-changedetectionio bash -c 'python3 -m unittest changedetectionio.tests.unit.test_watch_model' docker run test-changedetectionio bash -c 'python3 -m unittest changedetectionio.tests.unit.test_watch_model'
docker run test-changedetectionio bash -c 'python3 -m unittest changedetectionio.tests.unit.test_jinja2_security' docker run test-changedetectionio bash -c 'python3 -m unittest changedetectionio.tests.unit.test_jinja2_security'
docker run test-changedetectionio bash -c 'python3 -m unittest changedetectionio.tests.unit.test_semver' docker run test-changedetectionio bash -c 'python3 -m unittest changedetectionio.tests.unit.test_semver'
docker run test-changedetectionio bash -c 'python3 -m unittest changedetectionio.tests.unit.test_html_to_text' docker run test-changedetectionio bash -c 'python3 -m unittest changedetectionio.tests.unit.test_html_to_text'
# Basic pytest tests with ancillary services # Basic pytest tests with ancillary services
basic-tests: basic-tests:
@@ -516,3 +516,142 @@ jobs:
exit 1 exit 1
fi fi
docker rm sig-test docker rm sig-test
# Upgrade path test
# Creates a datastore with release OLD_VERSION, then loads it with the commit under
# test and checks that a tagged watch and a plain watch are still visible in the UI.
upgrade-path-test:
  runs-on: ubuntu-latest
  needs: build
  timeout-minutes: 25
  env:
    PYTHON_VERSION: ${{ inputs.python-version }}
    # Release used to create the "old" datastore
    OLD_VERSION: '0.49.1'
    # Expression values are handed to the script through the environment instead of
    # being interpolated into the shell text, so a crafted ref name cannot inject commands.
    TARGET_REF: ${{ github.ref_name }}
    TARGET_SHA: ${{ github.sha }}
  steps:
    - uses: actions/checkout@v6
      with:
        fetch-depth: 0  # Fetch all history and tags for upgrade testing
    - name: Set up Python ${{ env.PYTHON_VERSION }}
      uses: actions/setup-python@v6
      with:
        python-version: ${{ env.PYTHON_VERSION }}
    - name: Check upgrade works without error
      run: |
        echo "=== Testing upgrade path from ${OLD_VERSION} to ${TARGET_REF} (${TARGET_SHA}) ==="
        # Checkout old version and create datastore
        git checkout "${OLD_VERSION}"
        python3 -m venv .venv
        source .venv/bin/activate
        pip install -r requirements.txt
        pip install 'pyOpenSSL>=23.2.0'
        echo "=== Running version ${OLD_VERSION} to create datastore ==="
        python3 ./changedetection.py -C -d /tmp/data &
        APP_PID=$!
        # Wait for the old version to write its datastore (it uses url-watches.json),
        # polling instead of relying on a fixed sleep
        echo "Waiting for ${OLD_VERSION} to be ready..."
        API_KEY=""
        for _ in $(seq 1 30); do
          if [ -s /tmp/data/url-watches.json ]; then
            # The file may still be mid-write; a jq parse error just means "try again"
            API_KEY=$(jq -r '.settings.application.api_access_token // empty' /tmp/data/url-watches.json 2>/dev/null || true)
            if [ -n "${API_KEY}" ]; then
              break
            fi
          fi
          sleep 1
        done
        if [ -z "${API_KEY}" ]; then
          echo "ERROR: No API key found in /tmp/data/url-watches.json"
          kill "$APP_PID" || true
          exit 1
        fi
        echo "API Key: ${API_KEY:0:8}..."
        # Create a watch with tag "github-group-test" via API
        echo "Creating test watch with tag via API..."
        curl -X POST "http://127.0.0.1:5000/api/v1/watch" \
          -H "x-api-key: ${API_KEY}" \
          -H "Content-Type: application/json" \
          --show-error --fail \
          --retry 6 --retry-delay 1 --retry-connrefused \
          -d '{
            "url": "https://example.com/upgrade-test",
            "tag": "github-group-test"
          }'
        echo "✓ Created watch with tag 'github-group-test'"
        # Create a specific test URL watch
        echo "Creating test URL watch via API..."
        curl -X POST "http://127.0.0.1:5000/api/v1/watch" \
          -H "x-api-key: ${API_KEY}" \
          -H "Content-Type: application/json" \
          --show-error --fail \
          -d '{
            "url": "http://localhost/test.txt"
          }'
        echo "✓ Created watch for 'http://localhost/test.txt' in version ${OLD_VERSION}"
        # Stop the old version gracefully
        kill "$APP_PID"
        wait "$APP_PID" || true
        echo "✓ Version ${OLD_VERSION} stopped"
        # Upgrade to current version (use commit SHA since we're in detached HEAD)
        echo "Upgrading to commit ${TARGET_SHA}"
        git checkout "${TARGET_SHA}"
        pip install -r requirements.txt
        echo "=== Running current version (commit ${TARGET_SHA}) with old datastore (testing mode) ==="
        # Show the log even when the migration run fails, then fail the step
        if ! TESTING_SHUTDOWN_AFTER_DATASTORE_LOAD=1 python3 ./changedetection.py -d /tmp/data > /tmp/upgrade-test.log 2>&1; then
          echo "ERROR: Datastore upgrade run failed"
          echo "=== Upgrade test output ==="
          cat /tmp/upgrade-test.log
          exit 1
        fi
        echo "=== Upgrade test output ==="
        cat /tmp/upgrade-test.log
        echo "✓ Datastore upgraded successfully"
        # Now start the current version normally to verify the tag survived
        echo "=== Starting current version to verify tag exists after upgrade ==="
        timeout 20 python3 ./changedetection.py -d /tmp/data > /tmp/ui-test.log 2>&1 &
        APP_PID=$!
        # Wait for app to be ready and fetch UI
        echo "Waiting for current version to be ready..."
        sleep 5
        curl --retry 6 --retry-delay 1 --retry-connrefused --silent http://127.0.0.1:5000 > /tmp/ui-output.html
        # Verify tag exists in UI
        if grep -q "github-group-test" /tmp/ui-output.html; then
          echo "✓ Tag 'github-group-test' found in UI after upgrade"
        else
          echo "ERROR: Tag 'github-group-test' not found in UI after upgrade"
          echo "=== UI Output ==="
          cat /tmp/ui-output.html
          echo "=== App Log ==="
          cat /tmp/ui-test.log
          kill "$APP_PID" || true
          exit 1
        fi
        # Verify test URL exists in UI
        if grep -q "http://localhost/test.txt" /tmp/ui-output.html; then
          echo "✓ Watch URL 'http://localhost/test.txt' found in UI after upgrade"
        else
          echo "ERROR: Watch URL 'http://localhost/test.txt' not found in UI after upgrade"
          echo "=== UI Output ==="
          cat /tmp/ui-output.html
          echo "=== App Log ==="
          cat /tmp/ui-test.log
          kill "$APP_PID" || true
          exit 1
        fi
        # Cleanup
        kill "$APP_PID" || true
        wait "$APP_PID" || true
        echo ""
        echo "✓✓✓ Upgrade test passed: ${OLD_VERSION} → ${TARGET_REF} ✓✓✓"
        echo " - Commit: ${TARGET_SHA}"
        echo " - Datastore migrated successfully"
        echo " - Tag 'github-group-test' survived upgrade"
        echo " - Watch URL 'http://localhost/test.txt' survived upgrade"
    - name: Upload upgrade test logs
      if: always()
      uses: actions/upload-artifact@v6
      with:
        name: upgrade-test-logs-py${{ env.PYTHON_VERSION }}
        # Keep both the migration log and the UI verification log for debugging
        path: |
          /tmp/upgrade-test.log
          /tmp/ui-test.log

View File

@@ -371,7 +371,15 @@ def main():
# Dont' start if the JSON DB looks corrupt # Dont' start if the JSON DB looks corrupt
logger.critical(f"ERROR: JSON DB or Proxy List JSON at '{app_config['datastore_path']}' appears to be corrupt, aborting.") logger.critical(f"ERROR: JSON DB or Proxy List JSON at '{app_config['datastore_path']}' appears to be corrupt, aborting.")
logger.critical(str(e)) logger.critical(str(e))
return sys.exit(1)
# Testing mode: Exit cleanly after datastore initialization (for CI/CD upgrade tests)
if os.environ.get('TESTING_SHUTDOWN_AFTER_DATASTORE_LOAD'):
logger.success(f"TESTING MODE: Datastore loaded successfully from {app_config['datastore_path']}")
logger.success(f"TESTING MODE: Schema version: {datastore.data['settings']['application'].get('schema_version', 'unknown')}")
logger.success(f"TESTING MODE: Loaded {len(datastore.data['watching'])} watches")
logger.success("TESTING MODE: Exiting cleanly (TESTING_SHUTDOWN_AFTER_DATASTORE_LOAD is set)")
sys.exit(0)
# Apply all_paused setting if specified via CLI # Apply all_paused setting if specified via CLI
if all_paused is not None: if all_paused is not None:

View File

@@ -2,7 +2,7 @@ from changedetectionio.strtobool import strtobool
from flask_restful import abort, Resource from flask_restful import abort, Resource
from flask import request from flask import request
from functools import wraps from functools import wraps
from . import auth, validate_openapi_request, schema_create_watch from . import auth, validate_openapi_request
from ..validate_url import is_safe_valid_url from ..validate_url import is_safe_valid_url
import json import json
@@ -33,9 +33,25 @@ def convert_query_param_to_type(value, schema_property):
Returns: Returns:
Converted value in the appropriate type Converted value in the appropriate type
Supports both OpenAPI 3.1 formats:
- type: [string, 'null'] (array format)
- anyOf: [{type: string}, {type: null}] (anyOf format)
""" """
# Handle anyOf schemas (extract the first type) prop_type = schema_property.get('type')
if 'anyOf' in schema_property:
# Handle OpenAPI 3.1 type arrays: type: [string, 'null']
if isinstance(prop_type, list):
# Use the first non-null type from the array
for t in prop_type:
if t != 'null':
prop_type = t
break
else:
prop_type = None
# Handle anyOf schemas (older format)
elif 'anyOf' in schema_property:
# Use the first non-null type from anyOf # Use the first non-null type from anyOf
for option in schema_property['anyOf']: for option in schema_property['anyOf']:
if option.get('type') and option.get('type') != 'null': if option.get('type') and option.get('type') != 'null':
@@ -43,8 +59,6 @@ def convert_query_param_to_type(value, schema_property):
break break
else: else:
prop_type = None prop_type = None
else:
prop_type = schema_property.get('type')
# Handle array type (e.g., notification_urls) # Handle array type (e.g., notification_urls)
if prop_type == 'array': if prop_type == 'array':
@@ -89,7 +103,7 @@ class Import(Resource):
@validate_openapi_request('importWatches') @validate_openapi_request('importWatches')
def post(self): def post(self):
"""Import a list of watched URLs with optional watch configuration.""" """Import a list of watched URLs with optional watch configuration."""
from . import get_watch_schema_properties
# Special parameters that are NOT watch configuration # Special parameters that are NOT watch configuration
special_params = {'tag', 'tag_uuids', 'dedupe', 'proxy'} special_params = {'tag', 'tag_uuids', 'dedupe', 'proxy'}
@@ -115,7 +129,8 @@ class Import(Resource):
tag_uuids = tag_uuids.split(',') tag_uuids = tag_uuids.split(',')
# Extract ALL other query parameters as watch configuration # Extract ALL other query parameters as watch configuration
schema_properties = schema_create_watch.get('properties', {}) # Get schema from OpenAPI spec (replaces old schema_create_watch)
schema_properties = get_watch_schema_properties()
for param_name, param_value in request.args.items(): for param_name, param_value in request.args.items():
# Skip special parameters # Skip special parameters
if param_name in special_params: if param_name in special_params:

View File

@@ -1,8 +1,6 @@
from flask_expects_json import expects_json
from flask_restful import Resource, abort from flask_restful import Resource, abort
from flask import request from flask import request
from . import auth, validate_openapi_request from . import auth, validate_openapi_request
from . import schema_create_notification_urls, schema_delete_notification_urls
class Notifications(Resource): class Notifications(Resource):
def __init__(self, **kwargs): def __init__(self, **kwargs):
@@ -22,7 +20,6 @@ class Notifications(Resource):
@auth.check_token @auth.check_token
@validate_openapi_request('addNotifications') @validate_openapi_request('addNotifications')
@expects_json(schema_create_notification_urls)
def post(self): def post(self):
"""Create Notification URLs.""" """Create Notification URLs."""
@@ -50,7 +47,6 @@ class Notifications(Resource):
@auth.check_token @auth.check_token
@validate_openapi_request('replaceNotifications') @validate_openapi_request('replaceNotifications')
@expects_json(schema_create_notification_urls)
def put(self): def put(self):
"""Replace Notification URLs.""" """Replace Notification URLs."""
json_data = request.get_json() json_data = request.get_json()
@@ -73,7 +69,6 @@ class Notifications(Resource):
@auth.check_token @auth.check_token
@validate_openapi_request('deleteNotifications') @validate_openapi_request('deleteNotifications')
@expects_json(schema_delete_notification_urls)
def delete(self): def delete(self):
"""Delete Notification URLs.""" """Delete Notification URLs."""

View File

@@ -1,6 +1,5 @@
from changedetectionio import queuedWatchMetaData from changedetectionio import queuedWatchMetaData
from changedetectionio import worker_pool from changedetectionio import worker_pool
from flask_expects_json import expects_json
from flask_restful import abort, Resource from flask_restful import abort, Resource
from loguru import logger from loguru import logger
@@ -8,8 +7,7 @@ import threading
from flask import request from flask import request
from . import auth from . import auth
# Import schemas from __init__.py from . import validate_openapi_request
from . import schema_tag, schema_create_tag, schema_update_tag, validate_openapi_request
class Tag(Resource): class Tag(Resource):
@@ -69,7 +67,25 @@ class Tag(Resource):
tag.commit() tag.commit()
return "OK", 200 return "OK", 200
return tag # Filter out Watch-specific runtime fields that don't apply to Tags (yet)
# TODO: Future enhancement - aggregate these values from all Watches that have this tag:
# - check_count: sum of all watches' check_count
# - last_checked: most recent last_checked from all watches
# - last_changed: most recent last_changed from all watches
# - consecutive_filter_failures: count of watches with failures
# - etc.
# These come from watch_base inheritance but currently have no meaningful value for Tags
watch_only_fields = {
'browser_steps_last_error_step', 'check_count', 'consecutive_filter_failures',
'content-type', 'fetch_time', 'last_changed', 'last_checked', 'last_error',
'last_notification_error', 'last_viewed', 'notification_alert_count',
'page_title', 'previous_md5', 'previous_md5_before_filters', 'remote_server_reply'
}
# Create clean tag dict without Watch-specific fields
clean_tag = {k: v for k, v in tag.items() if k not in watch_only_fields}
return clean_tag
@auth.check_token @auth.check_token
@validate_openapi_request('deleteTag') @validate_openapi_request('deleteTag')
@@ -102,24 +118,46 @@ class Tag(Resource):
@auth.check_token @auth.check_token
@validate_openapi_request('updateTag') @validate_openapi_request('updateTag')
@expects_json(schema_update_tag)
def put(self, uuid): def put(self, uuid):
"""Update tag information.""" """Update tag information."""
tag = self.datastore.data['settings']['application']['tags'].get(uuid) tag = self.datastore.data['settings']['application']['tags'].get(uuid)
if not tag: if not tag:
abort(404, message='No tag exists with the UUID of {}'.format(uuid)) abort(404, message='No tag exists with the UUID of {}'.format(uuid))
# Make a mutable copy of request.json for modification
json_data = dict(request.json)
# Validate notification_urls if provided # Validate notification_urls if provided
if 'notification_urls' in request.json: if 'notification_urls' in json_data:
from wtforms import ValidationError from wtforms import ValidationError
from changedetectionio.api.Notifications import validate_notification_urls from changedetectionio.api.Notifications import validate_notification_urls
try: try:
notification_urls = request.json.get('notification_urls', []) notification_urls = json_data.get('notification_urls', [])
validate_notification_urls(notification_urls) validate_notification_urls(notification_urls)
except ValidationError as e: except ValidationError as e:
return str(e), 400 return str(e), 400
tag.update(request.json) # Filter out readOnly fields (extracted from OpenAPI spec Tag schema)
# These are system-managed fields that should never be user-settable
from . import get_readonly_tag_fields
readonly_fields = get_readonly_tag_fields()
# Tag model inherits from watch_base but has no @property attributes of its own
# So we only need to filter readOnly fields
for field in readonly_fields:
json_data.pop(field, None)
# Validate remaining fields - reject truly unknown fields
# Get valid fields from Tag schema
from . import get_tag_schema_properties
valid_fields = set(get_tag_schema_properties().keys())
# Check for unknown fields
unknown_fields = set(json_data.keys()) - valid_fields
if unknown_fields:
return f"Unknown field(s): {', '.join(sorted(unknown_fields))}", 400
tag.update(json_data)
tag.commit() tag.commit()
return "OK", 200 return "OK", 200
@@ -127,13 +165,21 @@ class Tag(Resource):
@auth.check_token @auth.check_token
@validate_openapi_request('createTag') @validate_openapi_request('createTag')
# Only cares for {'title': 'xxxx'}
def post(self): def post(self):
"""Create a single tag/group.""" """Create a single tag/group."""
json_data = request.get_json() json_data = request.get_json()
title = json_data.get("title",'').strip() title = json_data.get("title",'').strip()
# Validate that only valid fields are provided
# Get valid fields from Tag schema
from . import get_tag_schema_properties
valid_fields = set(get_tag_schema_properties().keys())
# Check for unknown fields
unknown_fields = set(json_data.keys()) - valid_fields
if unknown_fields:
return f"Unknown field(s): {', '.join(sorted(unknown_fields))}", 400
new_uuid = self.datastore.add_tag(title=title) new_uuid = self.datastore.add_tag(title=title)
if new_uuid: if new_uuid:

View File

@@ -8,13 +8,11 @@ from . import auth
from changedetectionio import queuedWatchMetaData, strtobool from changedetectionio import queuedWatchMetaData, strtobool
from changedetectionio import worker_pool from changedetectionio import worker_pool
from flask import request, make_response, send_from_directory from flask import request, make_response, send_from_directory
from flask_expects_json import expects_json
from flask_restful import abort, Resource from flask_restful import abort, Resource
from loguru import logger from loguru import logger
import copy import copy
# Import schemas from __init__.py from . import validate_openapi_request, get_readonly_watch_fields
from . import schema, schema_create_watch, schema_update_watch, validate_openapi_request
from ..notification import valid_notification_formats from ..notification import valid_notification_formats
from ..notification.handler import newline_re from ..notification.handler import newline_re
@@ -121,7 +119,6 @@ class Watch(Resource):
@auth.check_token @auth.check_token
@validate_openapi_request('updateWatch') @validate_openapi_request('updateWatch')
@expects_json(schema_update_watch)
def put(self, uuid): def put(self, uuid):
"""Update watch information.""" """Update watch information."""
watch = self.datastore.data['watching'].get(uuid) watch = self.datastore.data['watching'].get(uuid)
@@ -175,6 +172,35 @@ class Watch(Resource):
# Extract and remove processor config fields from json_data # Extract and remove processor config fields from json_data
processor_config_data = processors.extract_processor_config_from_form_data(json_data) processor_config_data = processors.extract_processor_config_from_form_data(json_data)
# Filter out readOnly fields (extracted from OpenAPI spec Watch schema)
# These are system-managed fields that should never be user-settable
readonly_fields = get_readonly_watch_fields()
# Also filter out @property attributes (computed/derived values from the model)
# These are not stored and should be ignored in PUT requests
from changedetectionio.model.Watch import model as WatchModel
property_fields = WatchModel.get_property_names()
# Combine both sets of fields to ignore
fields_to_ignore = readonly_fields | property_fields
# Remove all ignored fields from update data
for field in fields_to_ignore:
json_data.pop(field, None)
# Validate remaining fields - reject truly unknown fields
# Get valid fields from WatchBase schema
from . import get_watch_schema_properties
valid_fields = set(get_watch_schema_properties().keys())
# Also allow last_viewed (explicitly defined in UpdateWatch schema)
valid_fields.add('last_viewed')
# Check for unknown fields
unknown_fields = set(json_data.keys()) - valid_fields
if unknown_fields:
return f"Unknown field(s): {', '.join(sorted(unknown_fields))}", 400
# Update watch with regular (non-processor-config) fields # Update watch with regular (non-processor-config) fields
watch.update(json_data) watch.update(json_data)
watch.commit() watch.commit()
@@ -393,7 +419,6 @@ class CreateWatch(Resource):
@auth.check_token @auth.check_token
@validate_openapi_request('createWatch') @validate_openapi_request('createWatch')
@expects_json(schema_create_watch)
def post(self): def post(self):
"""Create a single watch.""" """Create a single watch."""

View File

@@ -1,41 +1,6 @@
import copy
import functools import functools
from flask import request, abort from flask import request, abort
from loguru import logger from loguru import logger
from . import api_schema
from ..model import watch_base
# Build a JSON Schema atleast partially based on our Watch model
watch_base_config = watch_base()
schema = api_schema.build_watch_json_schema(watch_base_config)
schema_create_watch = copy.deepcopy(schema)
schema_create_watch['required'] = ['url']
del schema_create_watch['properties']['last_viewed']
# Allow processor_config_* fields (handled separately in endpoint)
schema_create_watch['patternProperties'] = {
'^processor_config_': {'type': ['string', 'number', 'boolean', 'object', 'array', 'null']}
}
schema_update_watch = copy.deepcopy(schema)
schema_update_watch['additionalProperties'] = False
# Allow processor_config_* fields (handled separately in endpoint)
schema_update_watch['patternProperties'] = {
'^processor_config_': {'type': ['string', 'number', 'boolean', 'object', 'array', 'null']}
}
# Tag schema is also based on watch_base since Tag inherits from it
schema_tag = copy.deepcopy(schema)
schema_create_tag = copy.deepcopy(schema_tag)
schema_create_tag['required'] = ['title']
schema_update_tag = copy.deepcopy(schema_tag)
schema_update_tag['additionalProperties'] = False
schema_notification_urls = copy.deepcopy(schema)
schema_create_notification_urls = copy.deepcopy(schema_notification_urls)
schema_create_notification_urls['required'] = ['notification_urls']
schema_delete_notification_urls = copy.deepcopy(schema_notification_urls)
schema_delete_notification_urls['required'] = ['notification_urls']
@functools.cache @functools.cache
def get_openapi_spec(): def get_openapi_spec():
@@ -54,6 +19,134 @@ def get_openapi_spec():
_openapi_spec = OpenAPI.from_dict(spec_dict) _openapi_spec = OpenAPI.from_dict(spec_dict)
return _openapi_spec return _openapi_spec
@functools.cache
def get_openapi_schema_dict():
    """
    Load the OpenAPI spec YAML file and return it as a plain dictionary.

    The spec is looked up at '../../docs/api-spec.yaml' relative to this module's
    directory; when that path does not exist, '../docs/api-spec.yaml' is used instead.
    The parsed YAML is returned as-is (not wrapped in an OpenAPI object), and the
    result is memoised by functools.cache so the file is read once per process.
    """
    import os
    import yaml

    primary = os.path.join(os.path.dirname(__file__), '../../docs/api-spec.yaml')
    if os.path.exists(primary):
        chosen_path = primary
    else:
        # Fall back to the alternative location of the docs directory
        chosen_path = os.path.join(os.path.dirname(__file__), '../docs/api-spec.yaml')

    with open(chosen_path, 'r', encoding='utf-8') as handle:
        return yaml.safe_load(handle)
@functools.cache
def _resolve_schema_properties(schema_name):
    """
    Resolve the properties of a named component schema, following allOf inheritance.

    Args:
        schema_name: Name of the schema under components/schemas
                     (e.g., 'WatchBase', 'Watch', 'Tag')

    Returns:
        dict: Property name -> property definition. Includes properties inherited
        through `allOf` entries (both `$ref` parents and inline `properties`), resolved
        to any depth, plus `properties` declared next to `allOf` on the schema itself.
        Later entries override earlier ones. An unknown schema name yields an empty dict.

    The returned dict is always a new top-level dict, never the spec's own `properties`
    mapping, so adding/removing keys on the result cannot alter the cached spec.
    The result itself is cached, so callers should still treat it as read-only.
    """
    schemas = get_openapi_schema_dict()['components']['schemas']

    def collect(node, visiting):
        merged = {}
        for item in node.get('allOf', ()):
            ref = item.get('$ref')
            if ref:
                # '#/components/schemas/Name' -> 'Name'
                parent_name = ref.split('/')[-1]
                # `visiting` guards against schemas that reference each other in a cycle
                if parent_name not in visiting:
                    merged.update(collect(schemas.get(parent_name, {}), visiting | {parent_name}))
            # Inline part of the allOf entry (its own properties / nested allOf)
            merged.update(collect(item, visiting))
        # Properties declared directly on this schema win over inherited ones
        merged.update(node.get('properties') or {})
        return merged

    return collect(schemas.get(schema_name, {}), frozenset({schema_name}))
@functools.cache
def _resolve_readonly_fields(schema_name):
    """
    Collect the names of readOnly properties for a named component schema.

    Args:
        schema_name: Name of the schema under components/schemas (e.g., 'Watch', 'Tag')

    Returns:
        frozenset: Names of every property whose definition has `readOnly` set to True.
        When the schema uses `allOf`, each entry contributes the readOnly properties of
        its `$ref` parent schema and of its own inline `properties`; otherwise only the
        schema's direct `properties` are inspected.
    """
    all_schemas = get_openapi_schema_dict()['components']['schemas']
    target = all_schemas.get(schema_name, {})

    def flagged(holder):
        # Names of the properties in `holder` explicitly marked readOnly: true
        if 'properties' not in holder:
            return set()
        return {
            name
            for name, definition in holder['properties'].items()
            if definition.get('readOnly') is True
        }

    # No inheritance: only the schema's own properties matter
    if 'allOf' not in target:
        return frozenset(flagged(target))

    collected = set()
    for part in target['allOf']:
        if '$ref' in part:
            # '#/components/schemas/Name' -> 'Name'
            parent_name = part['$ref'].split('/')[-1]
            collected |= flagged(all_schemas.get(parent_name, {}))
        collected |= flagged(part)
    return frozenset(collected)
@functools.cache
def get_watch_schema_properties():
    """
    Return the property definitions of the 'WatchBase' schema in the OpenAPI spec.

    The mapping (property name -> definition) is resolved once and then served
    from the functools cache.
    """
    watch_base_properties = _resolve_schema_properties('WatchBase')
    return watch_base_properties
@functools.cache
def get_readonly_watch_fields():
    """
    Return the readOnly field names of the 'Watch' schema in the OpenAPI spec.

    The result is a frozenset covering the schema's own and inherited (allOf)
    readOnly properties; it is resolved once and then served from the functools cache.
    """
    readonly_names = _resolve_readonly_fields('Watch')
    return readonly_names
@functools.cache
def get_tag_schema_properties():
    """
    Return the property definitions of the 'Tag' schema in the OpenAPI spec.

    Includes properties the Tag schema inherits through allOf as well as its own;
    the mapping is resolved once and then served from the functools cache.
    """
    tag_properties = _resolve_schema_properties('Tag')
    return tag_properties
@functools.cache
def get_readonly_tag_fields():
    """
    Return the readOnly field names of the 'Tag' schema in the OpenAPI spec.

    The result is a frozenset covering the schema's own and inherited (allOf)
    readOnly properties; it is resolved once and then served from the functools cache.
    """
    readonly_names = _resolve_readonly_fields('Tag')
    return readonly_names
def validate_openapi_request(operation_id): def validate_openapi_request(operation_id):
"""Decorator to validate incoming requests against OpenAPI spec.""" """Decorator to validate incoming requests against OpenAPI spec."""
def decorator(f): def decorator(f):
@@ -72,8 +165,16 @@ def validate_openapi_request(operation_id):
if result.errors: if result.errors:
error_details = [] error_details = []
for error in result.errors: for error in result.errors:
error_details.append(str(error)) # Extract detailed schema errors from __cause__
raise BadRequest(f"OpenAPI validation failed: {error_details}") if hasattr(error, '__cause__') and hasattr(error.__cause__, 'schema_errors'):
for schema_error in error.__cause__.schema_errors:
field = '.'.join(str(p) for p in schema_error.path) if schema_error.path else 'body'
msg = schema_error.message if hasattr(schema_error, 'message') else str(schema_error)
error_details.append(f"{field}: {msg}")
else:
error_details.append(str(error))
logger.error(f"API Call - Validation failed: {'; '.join(error_details)}")
raise BadRequest(f"Validation failed: {'; '.join(error_details)}")
except BadRequest: except BadRequest:
# Re-raise BadRequest exceptions (validation failures) # Re-raise BadRequest exceptions (validation failures)
raise raise

View File

@@ -1,162 +0,0 @@
# Responsible for building the storage dict into a set of rules ("JSON Schema") acceptable via the API
# Probably other ways to solve this when the backend switches to some ORM
from changedetectionio.notification import valid_notification_formats
def build_time_between_check_json_schema():
    """
    Build the JSON Schema for the `time_between_check` object.

    Returns:
        dict: An object schema that forbids additional properties and declares one
        property per time unit (weeks, days, hours, minutes, seconds), each of which
        accepts either an integer or null. A fresh dict is built on every call.
    """
    time_units = ('weeks', 'days', 'hours', 'minutes', 'seconds')
    return {
        "type": "object",
        "additionalProperties": False,
        "properties": {
            unit: {"anyOf": [{"type": "integer"}, {"type": "null"}]}
            for unit in time_units
        },
    }
def build_watch_json_schema(d):
    """
    Build a JSON Schema for the watch API from a dict of default watch values.

    Args:
        d: Mapping of watch field name -> default value. The default's Python type
           selects the initial schema for the field (None, list, bool or str);
           defaults of any other type get no entry at this stage.

    Returns:
        dict: An object schema whose `properties` start from the type-derived entries
        and are then widened, replaced or removed by the fixed per-field rules below.

    Raises:
        KeyError: if `d` did not produce an entry for a field that the fixed rules
        extend or delete (e.g. 'body', 'fetch_backend', 'webdriver_delay', 'uuid').
    """
    def text_schema():
        # Plain string, capped at 5000 characters
        return {"type": "string", "maxLength": 5000}

    def nullable_text_schema():
        # Allows null and any string up to 5000 chars (including "")
        return {"type": ["string", "null"], "maxLength": 5000}

    props = {}
    schema = {
        'type': 'object',
        'properties': props,
    }

    # Derive a starting schema from the type of each default value.
    # @todo 'integer' is not covered here because it's almost always for internal usage
    for field_name, default in d.items():
        if default is None:
            alternatives = [{"type": "null"}]
        elif isinstance(default, list):
            # Always is an array of strings, like text or regex or something
            alternatives = [{"type": "array", "items": text_schema()}]
        elif isinstance(default, bool):
            alternatives = [{"type": "boolean"}]
        elif isinstance(default, str):
            alternatives = [text_schema()]
        else:
            continue
        props[field_name] = {"anyOf": alternatives}

    # Can also be a string (or None by default above)
    also_string = (
        'body',
        'notification_body',
        'notification_format',
        'notification_title',
        'proxy',
        'tag',
        'title',
        'webdriver_js_execute_code',
    )
    for field_name in also_string:
        props[field_name]['anyOf'].append(text_schema())

    props['last_viewed'] = {
        "type": "integer",
        "description": "Unix timestamp in seconds of the last time the watch was viewed.",
        "minimum": 0
    }

    # None or Boolean
    props['track_ldjson_price_data']['anyOf'].append({'type': 'boolean'})

    props['method'] = {
        "type": "string",
        "enum": ["GET", "POST", "DELETE", "PUT"]
    }

    props['fetch_backend']['anyOf'].append({
        "type": "string",
        "enum": ["html_requests", "html_webdriver"]
    })

    props['processor'] = {
        "anyOf": [
            {"type": "string", "enum": ["restock_diff", "text_json_diff"]},
            {"type": "null"}
        ]
    }

    # All headers must be key/value type dict
    props['headers'] = {
        "type": "object",
        "patternProperties": {
            # Should always be a string:string type value
            ".*": {"type": "string"},
        }
    }

    # Replaces the type-derived entry: only the known notification formats are valid
    props['notification_format'] = {
        'type': 'string',
        'enum': list(valid_notification_formats.keys())
    }

    # Stuff that shouldn't be available but is just state-storage
    state_only = ('previous_md5', 'last_error', 'has_ldjson_price_data', 'previous_md5_before_filters', 'uuid')
    for field_name in state_only:
        del props[field_name]

    props['webdriver_delay']['anyOf'].append({'type': 'integer'})

    props['time_between_check'] = build_time_between_check_json_schema()

    props['time_between_check_use_default'] = {
        "type": "boolean",
        "default": True,
        "description": "Whether to use global settings for time between checks - defaults to true if not set"
    }

    browser_step_item = {
        "type": "object",
        "properties": {
            "operation": nullable_text_schema(),
            "selector": nullable_text_schema(),
            "optional_value": nullable_text_schema()
        },
        "required": ["operation", "selector", "optional_value"],
        "additionalProperties": False  # No extra keys allowed
    }
    props['browser_steps'] = {
        "anyOf": [
            {"type": "array", "items": browser_step_item},
            {"type": "null"},  # Allows null for `browser_steps`
            {"type": "array", "maxItems": 0}  # Allows empty array []
        ]
    }

    # headers ?
    return schema

View File

@@ -20,11 +20,9 @@ See: Watch.py model docstring for full Pydantic architecture explanation
See: processors/restock_diff/processor.py:184-192 for current manual implementation See: processors/restock_diff/processor.py:184-192 for current manual implementation
""" """
import os
from changedetectionio.model import watch_base from changedetectionio.model import watch_base
from changedetectionio.model.persistence import EntityPersistenceMixin from changedetectionio.model.persistence import EntityPersistenceMixin
class model(EntityPersistenceMixin, watch_base): class model(EntityPersistenceMixin, watch_base):
""" """
Tag domain model - groups watches and can override their settings. Tag domain model - groups watches and can override their settings.

View File

@@ -2,7 +2,7 @@ import os
import uuid import uuid
from changedetectionio import strtobool from changedetectionio import strtobool
from .persistence import EntityPersistenceMixin from .persistence import EntityPersistenceMixin, _determine_entity_type
__all__ = ['EntityPersistenceMixin', 'watch_base'] __all__ = ['EntityPersistenceMixin', 'watch_base']
@@ -26,6 +26,7 @@ class watch_base(dict):
- Configuration override chain resolution (Watch → Tag → Global) - Configuration override chain resolution (Watch → Tag → Global)
- Immutability options - Immutability options
- Better testing - Better testing
- USE https://docs.pydantic.dev/latest/integrations/datamodel_code_generator TO BUILD THE MODEL FROM THE API-SPEC!!!
CHAIN RESOLUTION ARCHITECTURE: CHAIN RESOLUTION ARCHITECTURE:
The dream is a 3-level override hierarchy: The dream is a 3-level override hierarchy:
@@ -173,7 +174,7 @@ class watch_base(dict):
'body': None, 'body': None,
'browser_steps': [], 'browser_steps': [],
'browser_steps_last_error_step': None, 'browser_steps_last_error_step': None,
'conditions' : {}, 'conditions' : [],
'conditions_match_logic': CONDITIONS_MATCH_LOGIC_DEFAULT, 'conditions_match_logic': CONDITIONS_MATCH_LOGIC_DEFAULT,
'check_count': 0, 'check_count': 0,
'check_unique_lines': False, # On change-detected, compare against all history if its something new 'check_unique_lines': False, # On change-detected, compare against all history if its something new
@@ -299,6 +300,42 @@ class watch_base(dict):
if self.get('default'): if self.get('default'):
del self['default'] del self['default']
@classmethod
def get_property_names(cls):
    """
    Get all @property attribute names from this model class using introspection.

    This discovers computed/derived properties that are not stored in the datastore.
    These properties should be filtered out during PUT/POST requests.

    The result is cached per class. The cache is looked up in the class's OWN
    ``__dict__`` and not with ``hasattr()``: ``hasattr()`` also finds a cache that
    was created on a parent class, which would make every subclass report the
    parent's property set instead of its own.

    Returns:
        frozenset: Immutable set of @property attribute names from the model class
    """
    cache_attr = '_cached_property_names'
    cached = cls.__dict__.get(cache_attr)
    if cached is None:
        found = set()
        # Use introspection to find all @property attributes (inherited ones included)
        for name in dir(cls):
            # Skip private/magic attributes
            if name.startswith('_'):
                continue
            try:
                attr = getattr(cls, name)
            except (AttributeError, TypeError):
                continue
            # Accessed on the class (not an instance), a property yields the descriptor itself
            if isinstance(attr, property):
                found.add(name)
        cached = frozenset(found)
        # Stored on this exact class so subclasses compute and keep their own set
        setattr(cls, cache_attr, cached)
    return cached
def __deepcopy__(self, memo): def __deepcopy__(self, memo):
""" """
Custom deepcopy for all watch_base subclasses (Watch, Tag, etc.). Custom deepcopy for all watch_base subclasses (Watch, Tag, etc.).
@@ -511,10 +548,8 @@ class watch_base(dict):
# Save to disk via subclass implementation # Save to disk via subclass implementation
try: try:
# Determine entity type from module name (Watch.py -> watch, Tag.py -> tag) # Determine entity type from module name (Watch.py -> watch, Tag.py -> tag)
from changedetectionio.model.persistence import _determine_entity_type
entity_type = _determine_entity_type(self.__class__) entity_type = _determine_entity_type(self.__class__)
filename = f"{entity_type}.json" filename = f"{entity_type}.json"
self._save_to_disk(data_dict, uuid) self._save_to_disk(data_dict, uuid)
logger.debug(f"Committed {entity_type} {uuid} to {uuid}/{filename}") logger.debug(f"Committed {entity_type} {uuid} to {uuid}/{filename}")
except Exception as e: except Exception as e:

View File

@@ -56,6 +56,259 @@ def _deduplicate_prices(data):
return list(unique_data) return list(unique_data)
# =============================================================================
# MEMORY MANAGEMENT: Why We Use Multiprocessing (Linux Only)
# =============================================================================
#
# The get_itemprop_availability() function uses 'extruct' to parse HTML metadata
# (JSON-LD, microdata, OpenGraph, etc). Extruct internally uses lxml, which wraps
# libxml2 - a C library that allocates memory at the C level.
#
# Memory Leak Problem:
# --------------------
# 1. lxml's document_fromstring() creates thousands of Python objects backed by
# C-level allocations (nodes, attributes, text content)
# 2. Python's garbage collector can mark these objects as collectible, but
# cannot force the OS to reclaim the actual C-level memory
# 3. malloc/free typically doesn't return memory to OS - it just marks it as
# "free in the process address space"
# 4. With repeated parsing of large HTML (5MB+ pages), memory accumulates even
# after Python GC runs
#
# Why Multiprocessing Fixes This:
# --------------------------------
# When a subprocess exits, the OS forcibly reclaims ALL memory including C-level
# allocations that Python GC couldn't release. This ensures clean memory state
# after each extraction.
#
# Performance Impact:
# -------------------
# - Memray analysis showed 1.2M document_fromstring allocations per page
# - Without subprocess: memory grows by ~50-500MB per parse and lingers
# - With subprocess: ~35MB overhead but forces full cleanup after each run
# - Trade-off: 35MB resource_tracker vs 500MB+ accumulated leak = much better at scale
#
# References:
# -----------
# - lxml memory issues: https://medium.com/devopss-hole/python-lxml-memory-leak-b8d0b1000dc7
# - libxml2 caching behavior: https://www.mail-archive.com/lxml@python.org/msg00026.html
# - GC limitations with C extensions: https://benbernardblog.com/tracking-down-a-freaky-python-memory-leak-part-2/
#
# Additional Context:
# -------------------
# - jsonpath_ng (used to query the parsed data) is pure Python and doesn't leak
# - The leak is specifically from lxml's document parsing, not the JSONPath queries
# - Linux-only because multiprocessing spawn is well-tested there; other platforms
# use direct call as fallback
#
# Alternative Solution (Future Optimization):
# -------------------------------------------
# This entire problem could be avoided by using regex to extract just the machine
# data blocks (JSON-LD, microdata, OpenGraph tags) BEFORE parsing with lxml:
#
# 1. Use regex to extract <script type="application/ld+json">...</script> blocks
# 2. Use regex to extract <meta property="og:*"> tags
# 3. Use regex to find itemprop/itemtype attributes and their containing elements
# 4. Parse ONLY those extracted snippets instead of the entire HTML document
#
# Benefits:
# - Avoids parsing 5MB of HTML when we only need a few KB of metadata
# - Eliminates the lxml memory leak entirely
# - Faster extraction (regex is much faster than DOM parsing)
# - No subprocess overhead needed
#
# Trade-offs:
# - Regex for HTML is brittle (comments, CDATA, edge cases)
# - Microdata extraction would be complex (need to track element boundaries)
# - Would need extensive testing to ensure we don't miss valid data
# - extruct is battle-tested; regex solution would need similar maturity
#
# For now, the subprocess approach is safer and leverages existing extruct code.
# =============================================================================
def _extract_itemprop_availability_worker(pipe_conn):
    """
    Subprocess entry point that runs the itemprop extraction on behalf of the parent.

    Protocol (raw bytes in both directions, nothing is pickled):
      1. Receive the HTML document as UTF-8 bytes.
      2. Run get_itemprop_availability() on it.
      3. Send back a UTF-8 JSON object:
           {'success': True, 'data': {...}} on success, or
           {'success': False, 'exception_type': ..., ['exception_message': ...]} on failure.

    Args:
        pipe_conn: Pipe connection to receive HTML and send result
    """
    import json
    import gc

    def _reply(payload):
        # Every answer goes back as UTF-8 encoded JSON
        pipe_conn.send_bytes(json.dumps(payload).encode('utf-8'))

    html_content = None
    result_data = None
    try:
        # Receive HTML as raw bytes (no pickle)
        raw = pipe_conn.recv_bytes()
        html_content = raw.decode('utf-8')
        # Drop the undecoded copy straight away
        del raw
        gc.collect()

        # Perform extraction in subprocess (uses extruct/lxml)
        result_data = get_itemprop_availability(html_content)

        # The extraction result is flattened to a plain dict so it can be JSON encoded
        _reply({
            'success': True,
            'data': dict(result_data) if result_data else {}
        })

        # Release the big objects before leaving the try block
        html_content = None
        result_data = None
        gc.collect()
    except MoreThanOnePriceFound:
        # Reported by name so the parent can raise the same exception type
        _reply({
            'success': False,
            'exception_type': 'MoreThanOnePriceFound'
        })
    except Exception as e:
        # Any other failure is reported as type name + message
        _reply({
            'success': False,
            'exception_type': type(e).__name__,
            'exception_message': str(e)
        })
    finally:
        # Final cleanup before the subprocess exits
        html_content = None
        result_data = None
        gc.collect()
        pipe_conn.close()
def _run_itemprop_extraction_subprocess(html_content):
    """Run the extruct extraction in a spawned subprocess and return the worker's decoded JSON reply (a dict)."""
    import json
    import multiprocessing

    ctx = multiprocessing.get_context('spawn')
    parent_conn, child_conn = ctx.Pipe()
    p = ctx.Process(target=_extract_itemprop_availability_worker, args=(child_conn,))
    try:
        p.start()
        # The child now owns its own handle to this end of the pipe. Closing the parent's
        # copy means recv_bytes() below raises EOFError if the child dies without replying,
        # instead of blocking forever on a pipe the parent itself keeps open.
        child_conn.close()

        # Send HTML as raw bytes (no pickle); the temporary bytes object is released right after
        parent_conn.send_bytes(html_content.encode('utf-8'))

        # Receive result as JSON
        return json.loads(parent_conn.recv_bytes().decode('utf-8'))
    finally:
        # Always release the pipe and reap the process, also on error paths
        parent_conn.close()
        child_conn.close()  # no-op when already closed above
        if p.pid is not None:  # join() is only valid for a process that was started
            p.join()


def extract_itemprop_availability_safe(html_content) -> Restock:
    """
    Extract itemprop availability with hybrid approach for memory efficiency.

    Strategy (fastest to slowest, least to most memory):
    1. Try pure Python extraction (JSON-LD, OpenGraph, microdata). When it yields both a
       price and an availability the result is returned immediately.
    2. Otherwise fall back to extruct: in a spawned subprocess on Linux, by direct call
       on every other platform.

    Failure handling on Linux:
    - The worker reported MoreThanOnePriceFound: it is re-raised here as-is. It is NOT
      treated as a subprocess failure, so the document is not parsed a second time
      in-process just to raise the same exception again.
    - The worker reported any other error, or the subprocess machinery itself failed
      (spawn error, broken pipe, child died without replying): a warning is logged and
      get_itemprop_availability() is called directly, so the caller sees the original
      exception type if the extraction really cannot succeed.

    Args:
        html_content: HTML string to parse

    Returns:
        Restock: Extracted availability data

    Raises:
        MoreThanOnePriceFound: When multiple prices detected
        Other exceptions: From extruct/parsing
    """
    import platform

    # Step 1: Try pure Python extraction first (fast, no lxml, no memory leak)
    try:
        from .pure_python_extractor import extract_metadata_pure_python, query_price_availability

        logger.trace("Attempting pure Python metadata extraction (no lxml)")
        extracted_data = extract_metadata_pure_python(html_content)
        price_data = query_price_availability(extracted_data)

        # If we got price AND availability, we're done!
        if price_data.get('price') and price_data.get('availability'):
            result = Restock(price_data)
            logger.debug(f"Pure Python extraction successful: {dict(result)}")
            return result

        # If we got some data but not everything, still try extruct for completeness
        if price_data.get('price') or price_data.get('availability'):
            logger.debug(f"Pure Python extraction partial: {price_data}, will try extruct for completeness")
    except Exception as e:
        logger.debug(f"Pure Python extraction failed: {e}, falling back to extruct")

    # Step 2: Fall back to extruct (uses lxml, needs subprocess on Linux)
    logger.trace("Falling back to extruct (lxml-based) with subprocess isolation")

    # Only use subprocess isolation on Linux
    # Other platforms may have issues with spawn or don't need the aggressive memory management
    if platform.system() != 'Linux':
        # Non-Linux: direct call (no subprocess overhead needed)
        return get_itemprop_availability(html_content)

    try:
        reply = _run_itemprop_extraction_subprocess(html_content)
        if reply['success']:
            # Reconstruct Restock object from dict
            return Restock(reply['data'])
        exception_type = reply['exception_type']
        exception_msg = reply.get('exception_message', '')
    except Exception as e:
        # If multiprocessing itself fails, log and fall back to direct call
        logger.warning(f"Subprocess extraction failed: {e}, falling back to direct call")
        return get_itemprop_availability(html_content)

    # Raised outside the try block above on purpose: this is a real extraction outcome
    # that the caller handles, not an infrastructure failure to recover from.
    if exception_type == 'MoreThanOnePriceFound':
        raise MoreThanOnePriceFound()

    # Any other worker-side error: retry in-process so the genuine exception (if any) surfaces
    logger.warning(f"Subprocess extraction failed: {exception_type}: {exception_msg}, falling back to direct call")
    return get_itemprop_availability(html_content)
# should return Restock() # should return Restock()
# add casting? # add casting?
def get_itemprop_availability(html_content) -> Restock: def get_itemprop_availability(html_content) -> Restock:
@@ -196,8 +449,9 @@ class perform_site_check(difference_detection_processor):
multiple_prices_found = False multiple_prices_found = False
# Try built-in extraction first, this will scan metadata in the HTML # Try built-in extraction first, this will scan metadata in the HTML
# On Linux, this runs in a subprocess to prevent lxml/extruct memory leaks
try: try:
itemprop_availability = get_itemprop_availability(self.fetcher.content) itemprop_availability = extract_itemprop_availability_safe(self.fetcher.content)
except MoreThanOnePriceFound as e: except MoreThanOnePriceFound as e:
# Don't raise immediately - let plugins try to handle this case # Don't raise immediately - let plugins try to handle this case
# Plugins might be able to determine which price is correct # Plugins might be able to determine which price is correct

View File

@@ -0,0 +1,286 @@
"""
Pure Python metadata extractor - no lxml, no memory leaks.
This module provides a fast, memory-efficient alternative to extruct for common
e-commerce metadata extraction. It handles:
- JSON-LD (covers 80%+ of modern sites)
- OpenGraph meta tags
- Basic microdata attributes
Uses Python's built-in html.parser instead of lxml/libxml2, avoiding C-level
memory allocation issues. For edge cases, the main processor can fall back to
extruct (with subprocess isolation on Linux).
"""
from html.parser import HTMLParser
import json
import re
from loguru import logger
class JSONLDExtractor(HTMLParser):
    """
    Extract JSON-LD structured data from HTML.

    Finds all <script type="application/ld+json"> tags and parses their content.
    Handles multiple JSON-LD blocks on the same page.

    The ``type`` attribute is compared as a media type: case-insensitively, ignoring
    surrounding whitespace and any parameters after ``;`` (for example
    ``Application/LD+JSON; charset=utf-8`` is accepted).

    Attributes:
        data: List of all parsed JSON-LD objects, in document order. A block whose
              top-level value is a JSON array contributes each of its items.
    """

    JSONLD_MEDIA_TYPE = 'application/ld+json'

    def __init__(self):
        super().__init__()
        self.in_jsonld = False
        self.data = []  # List of all parsed JSON-LD objects
        self.current_script = []

    @classmethod
    def _is_jsonld_type(cls, value):
        """Return True when a <script type="..."> value denotes JSON-LD."""
        # html.parser reports a valueless attribute (<script type>) as None
        if not value:
            return False
        media_type = value.split(';', 1)[0]
        return media_type.strip().lower() == cls.JSONLD_MEDIA_TYPE

    def handle_starttag(self, tag, attrs):
        if tag != 'script':
            return
        # Check if this is a JSON-LD script tag
        if any(attr == 'type' and self._is_jsonld_type(value) for attr, value in attrs):
            self.in_jsonld = True
            self.current_script = []

    def handle_data(self, data):
        # Script text may arrive in several chunks; collect until the closing tag
        if self.in_jsonld:
            self.current_script.append(data)

    def handle_endtag(self, tag):
        if tag != 'script' or not self.in_jsonld:
            return

        # Parse the accumulated script content
        script_content = ''.join(self.current_script)
        if script_content.strip():
            try:
                # Parse JSON (handles both objects and arrays)
                parsed = json.loads(script_content)
                if isinstance(parsed, list):
                    self.data.extend(parsed)
                else:
                    self.data.append(parsed)
            except json.JSONDecodeError as e:
                # A broken block is skipped; other blocks on the page are still collected
                logger.debug(f"Failed to parse JSON-LD: {e}")

        self.in_jsonld = False
        self.current_script = []
class OpenGraphExtractor(HTMLParser):
    """
    Extract OpenGraph meta tags from HTML.

    Finds <meta property="og:*"> tags commonly used for social media sharing.

    Attributes:
        og_data: Dict mapping each ``og:*`` property name to its non-empty ``content``
                 value. When a property occurs more than once, the last one wins.
    """

    def __init__(self):
        super().__init__()
        self.og_data = {}

    def handle_starttag(self, tag, attrs):
        if tag != 'meta':
            return

        attrs_dict = dict(attrs)
        # html.parser reports a valueless attribute (<meta property>) as None, so the
        # dict default alone is not enough to guarantee a string here
        prop = attrs_dict.get('property') or ''

        # Extract OpenGraph properties
        if not prop.startswith('og:'):
            return

        content = attrs_dict.get('content')
        if content:
            self.og_data[prop] = content
class MicrodataExtractor(HTMLParser):
    """
    Extract basic microdata attributes from HTML.

    Finds elements with itemprop attributes. This is a simplified extractor
    that doesn't handle nested itemscope/itemtype hierarchies - for complex
    cases, use extruct as fallback.

    Recognised itemprops and where their value is taken from:
        price          -> ``content`` attribute (stored verbatim), else element text (stored as float)
        priceCurrency  -> ``content`` attribute, else element text; stored under 'currency'
        availability   -> ``href`` attribute, else ``content`` attribute, else element text

    Attributes:
        microdata: Dict with any of the keys 'price', 'currency', 'availability'.
                   A later occurrence overwrites an earlier one.
    """

    # Elements that never have text content nor a closing tag. Waiting for text after one
    # of these would never be cancelled by handle_endtag(), so the next unrelated text
    # node on the page would be captured as the value.
    VOID_ELEMENTS = frozenset({
        'area', 'base', 'br', 'col', 'embed', 'hr', 'img',
        'input', 'link', 'meta', 'param', 'source', 'track', 'wbr',
    })

    def __init__(self):
        super().__init__()
        self.microdata = {}
        self.current_itemprop = None

    def _capture_text_of(self, tag, itemprop):
        """Arm text capture for ``itemprop``, unless ``tag`` cannot contain text."""
        if tag not in self.VOID_ELEMENTS:
            self.current_itemprop = itemprop

    def handle_starttag(self, tag, attrs):
        attrs_dict = dict(attrs)
        if 'itemprop' not in attrs_dict:
            return

        itemprop = attrs_dict['itemprop']

        # Price/currency/availability can be in content/href attributes
        if itemprop == 'price':
            if 'content' in attrs_dict:
                self.microdata['price'] = attrs_dict['content']
            else:
                self._capture_text_of(tag, 'price')

        elif itemprop == 'priceCurrency':
            if 'content' in attrs_dict:
                self.microdata['currency'] = attrs_dict['content']
            else:
                self._capture_text_of(tag, 'priceCurrency')

        elif itemprop == 'availability':
            # Can be in href (link) or content (meta)
            if 'href' in attrs_dict:
                self.microdata['availability'] = attrs_dict['href']
            elif 'content' in attrs_dict:
                self.microdata['availability'] = attrs_dict['content']
            else:
                self._capture_text_of(tag, 'availability')

    def handle_data(self, data):
        # Capture text content for itemprop elements
        if self.current_itemprop == 'price':
            # Keep only digits and dots, then try to read the remainder as a number
            try:
                price_text = re.sub(r'[^\d.]', '', data.strip())
                if price_text:
                    self.microdata['price'] = float(price_text)
            except ValueError:
                # e.g. "1.2.3" or "." - not a usable price, leave any earlier value alone
                pass

        elif self.current_itemprop == 'priceCurrency':
            currency = data.strip()
            if currency:
                self.microdata['currency'] = currency

        elif self.current_itemprop == 'availability':
            availability = data.strip()
            if availability:
                self.microdata['availability'] = availability

    def handle_endtag(self, tag):
        # Reset current itemprop after closing tag
        self.current_itemprop = None
def extract_metadata_pure_python(html_content):
    """
    Extract structured metadata from HTML using pure Python parsers.

    Each format is extracted by its own parser in its own try block, so a failure in
    one extractor is only logged and leaves that format's entry empty; the others are
    still returned.

    Every parser is close()d after feed(): HTMLParser may keep the tail of the input
    buffered while waiting for more data, and close() is what forces that remainder
    to be processed.

    Returns a dict with three keys:
    - 'json-ld': List of parsed JSON-LD objects
    - 'opengraph': Dict of OpenGraph properties
    - 'microdata': Dict of microdata properties

    Args:
        html_content: HTML string to parse

    Returns:
        dict: Extracted metadata in three formats
    """
    result = {
        'json-ld': [],
        'opengraph': {},
        'microdata': {}
    }

    def _parse_fully(parser):
        # feed() + close() so nothing is left unprocessed in the parser's buffer
        parser.feed(html_content)
        parser.close()
        return parser

    # Extract JSON-LD
    try:
        jsonld_extractor = _parse_fully(JSONLDExtractor())
        result['json-ld'] = jsonld_extractor.data
        logger.trace(f"Pure Python: Found {len(jsonld_extractor.data)} JSON-LD blocks")
    except Exception as e:
        logger.debug(f"JSON-LD extraction failed: {e}")

    # Extract OpenGraph
    try:
        og_extractor = _parse_fully(OpenGraphExtractor())
        result['opengraph'] = og_extractor.og_data
        if result['opengraph']:
            logger.trace(f"Pure Python: Found {len(og_extractor.og_data)} OpenGraph tags")
    except Exception as e:
        logger.debug(f"OpenGraph extraction failed: {e}")

    # Extract Microdata
    try:
        microdata_extractor = _parse_fully(MicrodataExtractor())
        result['microdata'] = microdata_extractor.microdata
        if result['microdata']:
            logger.trace(f"Pure Python: Found microdata: {result['microdata']}")
    except Exception as e:
        logger.debug(f"Microdata extraction failed: {e}")

    return result
def _coerce_price(value):
    """Convert a raw price (number or string such as "$1,299.00") to float; None when it is not usable."""
    if isinstance(value, (int, float)):
        return float(value)
    if isinstance(value, str):
        try:
            # Keep only digits and dots, e.g. "$1,299.00" -> "1299.00"
            return float(re.sub(r'[^\d.]', '', value))
        except ValueError:
            return None
    return None


def query_price_availability(extracted_data):
    """
    Query extracted metadata for price and availability information.

    Sources are consulted in order and a later source only fills in fields that an
    earlier one did not provide:
      1. JSON-LD, queried with jsonpath_ng (same approach as extruct). Blocks are
         scanned in order until one has yielded a price.
      2. OpenGraph ('og:price:amount', 'og:price:currency', 'og:availability').
      3. Microdata ('price', 'currency', 'availability').

    jsonpath_ng is only imported, and its expressions only compiled (once per call),
    when there is JSON-LD data to query.

    Args:
        extracted_data: Dict from extract_metadata_pure_python()

    Returns:
        dict: Any of {'price': float, 'currency': str, 'availability': str}; a key is
              absent when no source provided a usable value for it.
    """
    result = {}

    # 1. Try JSON-LD first (most reliable and common)
    jsonld_blocks = extracted_data.get('json-ld', [])
    if jsonld_blocks:
        from jsonpath_ng import parse

        # Compiled once, outside the loop: the expressions do not depend on the block
        price_parse = parse('$..(price|Price)')
        availability_parse = parse('$..(availability|Availability)')
        currency_parse = parse('$..(priceCurrency|currency)')

        for data in jsonld_blocks:
            try:
                # Use jsonpath to find price/availability anywhere in the structure
                price_results = [m.value for m in price_parse.find(data)]
                if price_results and not result.get('price'):
                    # Handle various price formats (number, or string with currency symbols)
                    price = _coerce_price(price_results[0])
                    if price is not None:
                        result['price'] = price

                avail_results = [m.value for m in availability_parse.find(data)]
                if avail_results and not result.get('availability'):
                    result['availability'] = str(avail_results[0])

                curr_results = [m.value for m in currency_parse.find(data)]
                if curr_results and not result.get('currency'):
                    result['currency'] = str(curr_results[0])

                # If we found price, this JSON-LD block is good
                if result.get('price'):
                    logger.debug(f"Pure Python: Found price data in JSON-LD: {result}")
                    break

            except Exception as e:
                logger.debug(f"Error querying JSON-LD: {e}")
                continue

    # 2. Try OpenGraph if JSON-LD didn't provide everything
    og_data = extracted_data.get('opengraph', {})
    if not result.get('price') and 'og:price:amount' in og_data:
        try:
            result['price'] = float(og_data['og:price:amount'])
        except ValueError:
            pass
    if not result.get('currency') and 'og:price:currency' in og_data:
        result['currency'] = og_data['og:price:currency']
    if not result.get('availability') and 'og:availability' in og_data:
        result['availability'] = og_data['og:availability']

    # 3. Use microdata as last resort
    microdata = extracted_data.get('microdata', {})
    if not result.get('price') and 'price' in microdata:
        # The microdata price may be a raw attribute string; normalise it to float like
        # the other sources, and drop it when it is not a number at all
        price = _coerce_price(microdata['price'])
        if price is not None:
            result['price'] = price
    if not result.get('currency') and 'currency' in microdata:
        result['currency'] = microdata['currency']
    if not result.get('availability') and 'availability' in microdata:
        result['availability'] = microdata['availability']

    return result

View File

@@ -33,9 +33,8 @@ except ImportError:
from ..processors import get_custom_watch_obj_for_processor from ..processors import get_custom_watch_obj_for_processor
# Import the base class and helpers # Import the base class and helpers
from .file_saving_datastore import FileSavingDataStore, load_all_watches, load_all_tags, save_watch_atomic, save_tag_atomic, save_json_atomic from .file_saving_datastore import FileSavingDataStore, load_all_watches, load_all_tags, save_json_atomic
from .updates import DatastoreUpdatesMixin from .updates import DatastoreUpdatesMixin
from .legacy_loader import has_legacy_datastore
# Because the server will run as a daemon and wont know the URL for notification links when firing off a notification # Because the server will run as a daemon and wont know the URL for notification links when firing off a notification
BASE_URL_NOT_SET_TEXT = '("Base URL" not set - see settings - notifications)' BASE_URL_NOT_SET_TEXT = '("Base URL" not set - see settings - notifications)'
@@ -78,7 +77,7 @@ class ChangeDetectionStore(DatastoreUpdatesMixin, FileSavingDataStore):
logger.info(f"Backing up changedetection.json due to new version to '{db_path_version_backup}'.") logger.info(f"Backing up changedetection.json due to new version to '{db_path_version_backup}'.")
copyfile(db_path, db_path_version_backup) copyfile(db_path, db_path_version_backup)
def _load_settings(self): def _load_settings(self, filename="changedetection.json"):
""" """
Load settings from storage. Load settings from storage.
@@ -87,7 +86,7 @@ class ChangeDetectionStore(DatastoreUpdatesMixin, FileSavingDataStore):
Returns: Returns:
dict: Settings data loaded from storage dict: Settings data loaded from storage
""" """
changedetection_json = os.path.join(self.datastore_path, "changedetection.json") changedetection_json = os.path.join(self.datastore_path, filename)
logger.info(f"Loading settings from {changedetection_json}") logger.info(f"Loading settings from {changedetection_json}")
@@ -122,6 +121,11 @@ class ChangeDetectionStore(DatastoreUpdatesMixin, FileSavingDataStore):
if 'application' in settings_data['settings']: if 'application' in settings_data['settings']:
self.__data['settings']['application'].update(settings_data['settings']['application']) self.__data['settings']['application'].update(settings_data['settings']['application'])
# More or less for the old format which had this data in the one url-watches.json
# cant hurt to leave it here,
if 'watching' in settings_data:
self.__data['watching'].update(settings_data['watching'])
def _rehydrate_tags(self): def _rehydrate_tags(self):
"""Rehydrate tag entities from stored data into Tag objects with restock_diff processor.""" """Rehydrate tag entities from stored data into Tag objects with restock_diff processor."""
from ..model import Tag from ..model import Tag
@@ -146,23 +150,28 @@ class ChangeDetectionStore(DatastoreUpdatesMixin, FileSavingDataStore):
logger.info(f"Rehydrating {watch_count} watches...") logger.info(f"Rehydrating {watch_count} watches...")
watching_rehydrated = {} watching_rehydrated = {}
for uuid, watch_dict in self.__data.get('watching', {}).items(): for uuid, watch_dict in self.__data.get('watching', {}).items():
watching_rehydrated[uuid] = self.rehydrate_entity(uuid, watch_dict) if isinstance(watch_dict, dict):
watching_rehydrated[uuid] = self.rehydrate_entity(uuid, watch_dict)
else:
logger.error(f"Watch UUID {uuid} already rehydrated")
self.__data['watching'] = watching_rehydrated self.__data['watching'] = watching_rehydrated
logger.success(f"Rehydrated {watch_count} watches into Watch objects") logger.success(f"Rehydrated {watch_count} watches into Watch objects")
def _load_state(self): def _load_state(self, main_settings_filename="changedetection.json"):
""" """
Load complete datastore state from storage. Load complete datastore state from storage.
Orchestrates loading of settings, watches, and tags using polymorphic methods. Orchestrates loading of settings, watches, and tags using polymorphic methods.
""" """
# Load settings # Load settings
settings_data = self._load_settings() settings_data = self._load_settings(filename=main_settings_filename)
self._apply_settings(settings_data) self._apply_settings(settings_data)
# Load watches (polymorphic - parent class method) # Load watches, scan them from the disk
self._load_watches() self._load_watches()
self._rehydrate_watches()
# Load tags from individual tag.json files # Load tags from individual tag.json files
# These will override any tags in settings (migration path) # These will override any tags in settings (migration path)
@@ -200,112 +209,73 @@ class ChangeDetectionStore(DatastoreUpdatesMixin, FileSavingDataStore):
# Check if datastore already exists # Check if datastore already exists
changedetection_json = os.path.join(self.datastore_path, "changedetection.json") changedetection_json = os.path.join(self.datastore_path, "changedetection.json")
changedetection_json_old_schema = os.path.join(self.datastore_path, "url-watches.json")
if os.path.exists(changedetection_json): if os.path.exists(changedetection_json):
# Load existing datastore (changedetection.json + watch.json files)
logger.info("Loading existing datastore")
try:
self._load_state()
except Exception as e:
logger.critical(f"Failed to load datastore: {e}")
raise
# Run schema updates if needed # Run schema updates if needed
# Pass current schema version from loaded datastore (defaults to 0 if not set) # Pass current schema version from loaded datastore (defaults to 0 if not set)
# Load existing datastore (changedetection.json + watch.json files)
logger.info("Loading existing datastore")
self._load_state()
current_schema = self.data['settings']['application'].get('schema_version', 0)
self.run_updates(current_schema_version=current_schema)
# Legacy datastore detected - trigger migration, even works if the schema is much before the migration step.
elif os.path.exists(changedetection_json_old_schema):
logger.critical(f"Legacy datastore detected at {changedetection_json_old_schema}, loading and running updates")
self._load_state(main_settings_filename="url-watches.json")
# update 26 will load the whole old config from disk to __data
current_schema = self.__data['settings']['application'].get('schema_version', 0) current_schema = self.__data['settings']['application'].get('schema_version', 0)
self.run_updates(current_schema_version=current_schema) self.run_updates(current_schema_version=current_schema)
# Probably tags were also shifted to disk and many other changes, so best to reload here.
self._load_state()
else: else:
# No datastore yet - check if this is a fresh install or legacy migration # No datastore yet - check if this is a fresh install or legacy migration
# Generate app_guid FIRST (required for all operations) self.init_fresh_install(include_default_watches=include_default_watches,
if "pytest" in sys.modules or "PYTEST_CURRENT_TEST" in os.environ: version_tag=version_tag)
self.__data['app_guid'] = "test-" + str(uuid_builder.uuid4())
else:
self.__data['app_guid'] = str(uuid_builder.uuid4())
# Generate RSS access token def init_fresh_install(self, include_default_watches, version_tag):
self.__data['settings']['application']['rss_access_token'] = secrets.token_hex(16) # Generate app_guid FIRST (required for all operations)
if "pytest" in sys.modules or "PYTEST_CURRENT_TEST" in os.environ:
self.__data['app_guid'] = "test-" + str(uuid_builder.uuid4())
else:
self.__data['app_guid'] = str(uuid_builder.uuid4())
# Generate API access token # Generate RSS access token
self.__data['settings']['application']['api_access_token'] = secrets.token_hex(16) self.__data['settings']['application']['rss_access_token'] = secrets.token_hex(16)
# Check if legacy datastore exists (url-watches.json) # Generate API access token
if has_legacy_datastore(self.datastore_path): self.__data['settings']['application']['api_access_token'] = secrets.token_hex(16)
# Legacy datastore detected - trigger migration logger.warning(f"No datastore found, creating new datastore at {self.datastore_path}")
logger.critical(f"Legacy datastore detected at {self.datastore_path}/url-watches.json")
logger.critical("Migration will be triggered via update_26")
# Load the legacy datastore # Set schema version to latest (no updates needed)
from .legacy_loader import load_legacy_format latest_update_available = self.get_updates_available().pop()
legacy_path = os.path.join(self.datastore_path, "url-watches.json") logger.info(f"Marking fresh install to schema version {latest_update_available}")
legacy_data = load_legacy_format(legacy_path) self.__data['settings']['application']['schema_version'] = latest_update_available
if not legacy_data: # Add default watches if requested
raise Exception("Failed to load legacy datastore from url-watches.json") if include_default_watches:
self.add_watch(
url='https://news.ycombinator.com/',
tag='Tech news',
extras={'fetch_backend': 'html_requests'}
)
self.add_watch(
url='https://changedetection.io/CHANGELOG.txt',
tag='changedetection.io',
extras={'fetch_backend': 'html_requests'}
)
# Merge legacy data with base_config defaults (preserves new fields like 'ui') # Create changedetection.json immediately
# self.__data already has App.model() defaults from line 190 try:
logger.info("Merging legacy data with base_config defaults...") self._save_settings()
logger.info("Created changedetection.json for new datastore")
# Apply top-level fields from legacy data except Exception as e:
if 'app_guid' in legacy_data: logger.error(f"Failed to create initial changedetection.json: {e}")
self.__data['app_guid'] = legacy_data['app_guid']
if 'build_sha' in legacy_data:
self.__data['build_sha'] = legacy_data['build_sha']
if 'version_tag' in legacy_data:
self.__data['version_tag'] = legacy_data['version_tag']
# Apply watching data (complete replacement as these are user's watches)
if 'watching' in legacy_data:
self.__data['watching'] = legacy_data['watching']
# Merge settings sections (preserves base_config defaults for missing fields)
if 'settings' in legacy_data:
if 'headers' in legacy_data['settings']:
self.__data['settings']['headers'].update(legacy_data['settings']['headers'])
if 'requests' in legacy_data['settings']:
self.__data['settings']['requests'].update(legacy_data['settings']['requests'])
if 'application' in legacy_data['settings']:
# CRITICAL: Use .update() to merge, not replace
# This preserves new fields like 'ui' that exist in base_config
self.__data['settings']['application'].update(legacy_data['settings']['application'])
# CRITICAL: Rehydrate watches from dicts into Watch objects
# This ensures watches have their methods available during migration
self._rehydrate_watches()
# update_26 will save watches to individual files and create changedetection.json
# Next startup will load from new format normally
self.run_updates()
else:
# Fresh install - create new datastore
logger.warning(f"No datastore found, creating new datastore at {self.datastore_path}")
# Set schema version to latest (no updates needed)
updates_available = self.get_updates_available()
self.__data['settings']['application']['schema_version'] = updates_available.pop() if updates_available else 26
# Add default watches if requested
if include_default_watches:
self.add_watch(
url='https://news.ycombinator.com/',
tag='Tech news',
extras={'fetch_backend': 'html_requests'}
)
self.add_watch(
url='https://changedetection.io/CHANGELOG.txt',
tag='changedetection.io',
extras={'fetch_backend': 'html_requests'}
)
# Create changedetection.json immediately
try:
self._save_settings()
logger.info("Created changedetection.json for new datastore")
except Exception as e:
logger.error(f"Failed to create initial changedetection.json: {e}")
# Set version tag # Set version tag
self.__data['version_tag'] = version_tag self.__data['version_tag'] = version_tag
@@ -383,17 +353,9 @@ class ChangeDetectionStore(DatastoreUpdatesMixin, FileSavingDataStore):
# Deep copy settings to avoid modifying the original # Deep copy settings to avoid modifying the original
settings_copy = copy.deepcopy(self.__data['settings']) settings_copy = copy.deepcopy(self.__data['settings'])
# Only exclude tags if we've already migrated them to individual files (schema >= 28)
# This ensures update_28 can migrate tags from settings
schema_version = self.__data['settings']['application'].get('schema_version', 0)
if schema_version >= 28:
# Tags are in individual tag.json files, don't save to settings
settings_copy['application']['tags'] = {}
# else: keep tags in settings for update_28 migration
return { return {
'note': 'Settings file - watches are in {uuid}/watch.json, tags are in {uuid}/tag.json', 'note': 'Settings file - watches are in {uuid}/watch.json, tags are in {uuid}/tag.json',
'app_guid': self.__data['app_guid'], 'app_guid': self.__data.get('app_guid'),
'settings': settings_copy, 'settings': settings_copy,
'build_sha': self.__data.get('build_sha'), 'build_sha': self.__data.get('build_sha'),
'version_tag': self.__data.get('version_tag') 'version_tag': self.__data.get('version_tag')
@@ -422,15 +384,14 @@ class ChangeDetectionStore(DatastoreUpdatesMixin, FileSavingDataStore):
Implementation of abstract method from FileSavingDataStore. Implementation of abstract method from FileSavingDataStore.
Delegates to helper function and stores results in internal data structure. Delegates to helper function and stores results in internal data structure.
""" """
watching = load_all_watches(
self.datastore_path,
self.rehydrate_entity
)
# Store loaded data # Store loaded data
self.__data['watching'] = watching # @note this will also work for the old legacy format because self.__data['watching'] should already have them loaded by this point.
self.__data['watching'].update(load_all_watches(
logger.debug(f"Loaded {len(watching)} watches") self.datastore_path,
self.rehydrate_entity
))
logger.debug(f"Loaded {len(self.__data['watching'])} watches")
def _load_tags(self): def _load_tags(self):
""" """

View File

@@ -207,15 +207,6 @@ def save_watch_atomic(watch_dir, uuid, watch_dict):
save_entity_atomic(watch_dir, uuid, watch_dict, "watch.json", "watch", max_size_mb=10) save_entity_atomic(watch_dir, uuid, watch_dict, "watch.json", "watch", max_size_mb=10)
def save_tag_atomic(tag_dir, uuid, tag_dict):
    """
    Persist a single tag to disk via the shared atomic-write helper.

    Backwards-compatibility shim: everything is delegated to
    save_entity_atomic, pinned to the tag-specific file name ("tag.json"),
    entity type ("tag") and max_size_mb=1.
    """
    save_entity_atomic(
        tag_dir,
        uuid,
        tag_dict,
        filename="tag.json",
        entity_type="tag",
        max_size_mb=1,
    )
def load_watch_from_file(watch_json, uuid, rehydrate_entity_func): def load_watch_from_file(watch_json, uuid, rehydrate_entity_func):
""" """

View File

@@ -1,66 +0,0 @@
"""
Legacy format loader for url-watches.json.
Provides functions to detect and load from the legacy monolithic JSON format.
Used during migration (update_26) to transition to individual watch.json files.
"""
import os
import json
from loguru import logger
# Try to import orjson for faster JSON serialization
try:
import orjson
HAS_ORJSON = True
except ImportError:
HAS_ORJSON = False
def has_legacy_datastore(datastore_path):
    """
    Report whether a legacy url-watches.json is present in the datastore directory.

    Args:
        datastore_path: Path to the datastore directory.

    Returns:
        bool: True when ``<datastore_path>/url-watches.json`` exists
        (checked with os.path.exists), otherwise False.
    """
    return os.path.exists(os.path.join(datastore_path, "url-watches.json"))
def load_legacy_format(json_store_path):
    """
    Read a legacy monolithic url-watches.json file and return its parsed content.

    Args:
        json_store_path: Full path to the url-watches.json file.

    Returns:
        dict: Parsed datastore data ('watching', 'settings', etc.).
        None: If the path is not a regular file, or if reading/parsing
              (or counting the watches for the log line) raised.
    """
    logger.info(f"Loading from legacy format: {json_store_path}")

    if not os.path.isfile(json_store_path):
        logger.warning(f"Legacy file not found: {json_store_path}")
        return None

    # orjson parses raw bytes; the stdlib json module reads decoded text.
    open_kwargs = {'mode': 'rb'} if HAS_ORJSON else {'mode': 'r', 'encoding': 'utf-8'}

    try:
        with open(json_store_path, **open_kwargs) as handle:
            parsed = orjson.loads(handle.read()) if HAS_ORJSON else json.load(handle)
        logger.info(f"Loaded {len(parsed.get('watching', {}))} watches from legacy format")
    except Exception as err:
        # Best effort by design: callers treat None as "could not load".
        logger.error(f"Failed to load legacy format: {err}")
        return None

    return parsed

View File

@@ -16,12 +16,18 @@ import time
from loguru import logger from loguru import logger
from copy import deepcopy from copy import deepcopy
# Try to import orjson for faster JSON serialization
try:
import orjson
HAS_ORJSON = True
except ImportError:
HAS_ORJSON = False
from ..html_tools import TRANSLATE_WHITESPACE_TABLE from ..html_tools import TRANSLATE_WHITESPACE_TABLE
from ..processors.restock_diff import Restock from ..processors.restock_diff import Restock
from ..blueprint.rss import RSS_CONTENT_FORMAT_DEFAULT from ..blueprint.rss import RSS_CONTENT_FORMAT_DEFAULT
from ..model import USE_SYSTEM_DEFAULT_NOTIFICATION_FORMAT_FOR_WATCH from ..model import USE_SYSTEM_DEFAULT_NOTIFICATION_FORMAT_FOR_WATCH
from .file_saving_datastore import save_watch_atomic
def create_backup_tarball(datastore_path, update_number): def create_backup_tarball(datastore_path, update_number):
""" """
@@ -97,7 +103,7 @@ def create_backup_tarball(datastore_path, update_number):
tar.add(tag_json, arcname=f"{entry}/tag.json") tar.add(tag_json, arcname=f"{entry}/tag.json")
tag_count += 1 tag_count += 1
logger.success(f"Backup created: {backup_filename} ({watch_count} watches, {tag_count} tags)") logger.success(f"Backup created: {backup_filename} ({watch_count} watches from disk, {tag_count} tags from disk)")
return backup_path return backup_path
except Exception as e: except Exception as e:
@@ -137,6 +143,7 @@ class DatastoreUpdatesMixin:
return updates_available return updates_available
def run_updates(self, current_schema_version=None): def run_updates(self, current_schema_version=None):
import sys
""" """
Run all pending schema updates sequentially. Run all pending schema updates sequentially.
@@ -160,6 +167,23 @@ class DatastoreUpdatesMixin:
4. All changes saved via individual .commit() calls 4. All changes saved via individual .commit() calls
""" """
updates_available = self.get_updates_available() updates_available = self.get_updates_available()
if self.data.get('watching'):
test_watch = self.data['watching'].get(next(iter(self.data.get('watching', {}))))
from ..model.Watch import model
if not isinstance(test_watch, model):
import sys
logger.critical("Cannot run updates! Watch structure must be re-hydrated back to a Watch model object!")
sys.exit(1)
if self.data['settings']['application'].get('tags',{}):
test_tag = self.data['settings']['application'].get('tags',{}).get(next(iter(self.data['settings']['application'].get('tags',{}))))
from ..model.Tag import model as tag_model
if not isinstance(test_tag, tag_model):
import sys
logger.critical("Cannot run updates! Watch tag/group structure must be re-hydrated back to a Tag model object!")
sys.exit(1)
# Determine current schema version # Determine current schema version
if current_schema_version is None: if current_schema_version is None:
@@ -201,10 +225,9 @@ class DatastoreUpdatesMixin:
try: try:
update_method = getattr(self, f"update_{update_n}")() update_method = getattr(self, f"update_{update_n}")()
except Exception as e: except Exception as e:
logger.error(f"Error while trying update_{update_n}") logger.critical(f"Error while trying update_{update_n}")
logger.error(e) logger.exception(e)
# Don't run any more updates sys.exit(1)
return
else: else:
# Bump the version # Bump the version
self.data['settings']['application']['schema_version'] = update_n self.data['settings']['application']['schema_version'] = update_n
@@ -555,27 +578,6 @@ class DatastoreUpdatesMixin:
logger.critical("COPY-based migration: url-watches.json will remain intact for rollback") logger.critical("COPY-based migration: url-watches.json will remain intact for rollback")
logger.critical("=" * 80) logger.critical("=" * 80)
# Check if already migrated
changedetection_json = os.path.join(self.datastore_path, "changedetection.json")
if os.path.exists(changedetection_json):
logger.info("Migration already completed (changedetection.json exists), skipping")
return
# Check if we need to load legacy data
from .legacy_loader import has_legacy_datastore, load_legacy_format
if not has_legacy_datastore(self.datastore_path):
logger.info("No legacy datastore found, nothing to migrate")
return
# Load legacy data from url-watches.json
logger.critical("Loading legacy datastore from url-watches.json...")
legacy_path = os.path.join(self.datastore_path, "url-watches.json")
legacy_data = load_legacy_format(legacy_path)
if not legacy_data:
raise Exception("Failed to load legacy datastore from url-watches.json")
# Populate settings from legacy data # Populate settings from legacy data
logger.info("Populating settings from legacy data...") logger.info("Populating settings from legacy data...")
watch_count = len(self.data['watching']) watch_count = len(self.data['watching'])
@@ -587,9 +589,7 @@ class DatastoreUpdatesMixin:
saved_count = 0 saved_count = 0
for uuid, watch in self.data['watching'].items(): for uuid, watch in self.data['watching'].items():
try: try:
watch_dict = dict(watch) watch.commit()
watch_dir = os.path.join(self.datastore_path, uuid)
save_watch_atomic(watch_dir, uuid, watch_dict)
saved_count += 1 saved_count += 1
if saved_count % 100 == 0: if saved_count % 100 == 0:
@@ -635,18 +635,19 @@ class DatastoreUpdatesMixin:
# Phase 4: Verify settings file exists # Phase 4: Verify settings file exists
logger.critical("Phase 4/4: Verifying changedetection.json exists...") logger.critical("Phase 4/4: Verifying changedetection.json exists...")
changedetection_json_new_schema=os.path.join(self.datastore_path, "changedetection.json")
if not os.path.isfile(changedetection_json_new_schema):
import sys
logger.critical("Migration failed, changedetection.json not found after update ran!")
sys.exit(1)
if not os.path.isfile(changedetection_json):
raise Exception(
"Migration failed: changedetection.json not found after save. "
"url-watches.json remains intact, safe to retry."
)
logger.critical("Phase 4 complete: Verified changedetection.json exists") logger.critical("Phase 4 complete: Verified changedetection.json exists")
# Success! Now reload from new format # Success! Now reload from new format
logger.critical("Reloading datastore from new format...") logger.critical("Reloading datastore from new format...")
self._load_state() # Includes load_watches # write it to disk, it will be saved without ['watching'] in the JSON db because we find it from disk glob
self._save_settings()
logger.success("Datastore reloaded from new format successfully") logger.success("Datastore reloaded from new format successfully")
logger.critical("=" * 80) logger.critical("=" * 80)
logger.critical("MIGRATION COMPLETED SUCCESSFULLY!") logger.critical("MIGRATION COMPLETED SUCCESSFULLY!")
@@ -681,9 +682,11 @@ class DatastoreUpdatesMixin:
- Enables independent tag versioning/backup - Enables independent tag versioning/backup
- Maintains backwards compatibility (tags stay in settings too) - Maintains backwards compatibility (tags stay in settings too)
""" """
# Force save as tag.json (not watch.json) even if object is corrupted
logger.critical("=" * 80) logger.critical("=" * 80)
logger.critical("Running migration: Individual tag persistence (update_28)") logger.critical("Running migration: Individual tag persistence (update_28)")
logger.critical("Creating individual tag.json files (tags remain in settings too)") logger.critical("Creating individual tag.json files")
logger.critical("=" * 80) logger.critical("=" * 80)
tags = self.data['settings']['application'].get('tags', {}) tags = self.data['settings']['application'].get('tags', {})
@@ -700,27 +703,8 @@ class DatastoreUpdatesMixin:
for uuid, tag_data in tags.items(): for uuid, tag_data in tags.items():
try: try:
# Force save as tag.json (not watch.json) even if object is corrupted tag_data.commit()
from changedetectionio.store.file_saving_datastore import save_entity_atomic
import os
tag_dir = os.path.join(self.datastore_path, uuid)
os.makedirs(tag_dir, exist_ok=True)
# Convert to dict if it's an object
tag_dict = dict(tag_data) if hasattr(tag_data, '__iter__') else tag_data
# Save explicitly as tag.json
save_entity_atomic(
tag_dir,
uuid,
tag_dict,
filename='tag.json',
entity_type='tag',
max_size_mb=1
)
saved_count += 1 saved_count += 1
if saved_count % 10 == 0: if saved_count % 10 == 0:
logger.info(f" Progress: {saved_count}/{tag_count} tags migrated...") logger.info(f" Progress: {saved_count}/{tag_count} tags migrated...")
@@ -737,5 +721,5 @@ class DatastoreUpdatesMixin:
# On next load, _load_tags() will read from tag.json files and merge with settings # On next load, _load_tags() will read from tag.json files and merge with settings
logger.info("Tags saved to both settings AND individual tag.json files") logger.info("Tags saved to both settings AND individual tag.json files")
logger.info("Future tag edits will update both locations (dual storage)") logger.info("Future tag edits will update both locations (dual storage)")
logger.critical("=" * 80)
logger.critical("=" * 80)

View File

@@ -328,6 +328,68 @@ def test_api_simple(client, live_server, measure_memory_usage, datastore_path):
) )
assert len(res.json) == 0, "Watch list should be empty" assert len(res.json) == 0, "Watch list should be empty"
def test_roundtrip_API(client, live_server, measure_memory_usage, datastore_path):
    """
    Full API round trip: create a watch, GET it, PUT the fetched document back.

    This checks that the default watch model, as returned by the API, is accepted
    again by the update endpoint, and that values overwritten in the PUT body for
    'last_changed' and 'date_created' do not end up in the stored watch.
    """
    api_key = live_server.app.config['DATASTORE'].data['settings']['application'].get('api_access_token')
    auth_headers = {'x-api-key': api_key}
    json_headers = {'content-type': 'application/json', 'x-api-key': api_key}
    # Sentinel that must never be persisted into the read-only fields
    bogus_timestamp = 454444444444
    read_only_fields = ('last_changed', 'date_created')

    set_original_response(datastore_path=datastore_path)
    test_url = url_for('test_endpoint', _external=True)

    # Create new
    created = client.post(
        url_for("createwatch"),
        data=json.dumps({"url": test_url}),
        headers=json_headers,
        follow_redirects=True
    )
    assert created.status_code == 201
    uuid = created.json.get('uuid')
    watch_url = url_for("watch", uuid=uuid)

    # Fetch the watch exactly as the API serialises it
    watch = client.get(watch_url, headers=auth_headers).json

    # Tamper with the 'readOnly' values before sending the document back
    for field in read_only_fields:
        watch[field] = bogus_timestamp

    # HTTP PUT ( UPDATE an existing watch )
    updated = client.put(
        watch_url,
        headers=json_headers,
        data=json.dumps(watch),
    )
    if updated.status_code != 200:
        print(f"\n=== PUT failed with {updated.status_code} ===")
        print(f"Error: {updated.data}")
    assert updated.status_code == 200, "HTTP PUT update was sent OK"

    # Re-read and confirm neither field took the tampered value (as int or as string)
    refreshed = client.get(watch_url, headers=auth_headers)
    for field in read_only_fields:
        stored_value = refreshed.json.get(field)
        assert stored_value != bogus_timestamp
        assert stored_value != str(bogus_timestamp)
def test_access_denied(client, live_server, measure_memory_usage, datastore_path): def test_access_denied(client, live_server, measure_memory_usage, datastore_path):
# `config_api_token_enabled` Should be On by default # `config_api_token_enabled` Should be On by default
res = client.get( res = client.get(
@@ -401,6 +463,9 @@ def test_api_watch_PUT_update(client, live_server, measure_memory_usage, datasto
follow_redirects=True follow_redirects=True
) )
if res.status_code != 201:
print(f"\n=== POST createwatch failed with {res.status_code} ===")
print(f"Response: {res.data}")
assert res.status_code == 201 assert res.status_code == 201
wait_for_all_checks(client) wait_for_all_checks(client)
@@ -464,11 +529,12 @@ def test_api_watch_PUT_update(client, live_server, measure_memory_usage, datasto
) )
assert res.status_code == 400, "Should get error 400 when we give a field that doesnt exist" assert res.status_code == 400, "Should get error 400 when we give a field that doesnt exist"
# Message will come from `flask_expects_json` # Backend validation now rejects unknown fields with a clear error message
# With patternProperties for processor_config_*, the error message format changed slightly assert (b'Unknown field' in res.data or
assert (b'Additional properties are not allowed' in res.data or b'Additional properties are not allowed' in res.data or
b'Unevaluated properties are not allowed' in res.data or
b'does not match any of the regexes' in res.data), \ b'does not match any of the regexes' in res.data), \
"Should reject unknown fields with schema validation error" "Should reject unknown fields with validation error"
# Try a XSS URL # Try a XSS URL
@@ -553,6 +619,8 @@ def test_api_import(client, live_server, measure_memory_usage, datastore_path):
assert res.status_code == 200 assert res.status_code == 200
uuid = res.json[0] uuid = res.json[0]
watch = live_server.app.config['DATASTORE'].data['watching'][uuid] watch = live_server.app.config['DATASTORE'].data['watching'][uuid]
assert isinstance(watch['notification_urls'], list), "notification_urls must be stored as a list"
assert len(watch['notification_urls']) == 2, "notification_urls should have 2 entries"
assert 'mailto://test@example.com' in watch['notification_urls'], "notification_urls should contain first email" assert 'mailto://test@example.com' in watch['notification_urls'], "notification_urls should contain first email"
assert 'mailto://admin@example.com' in watch['notification_urls'], "notification_urls should contain second email" assert 'mailto://admin@example.com' in watch['notification_urls'], "notification_urls should contain second email"
@@ -599,6 +667,34 @@ def test_api_import(client, live_server, measure_memory_usage, datastore_path):
assert res.status_code == 400, "Should reject unknown field" assert res.status_code == 400, "Should reject unknown field"
assert b"Unknown watch configuration parameter" in res.data, "Error message should mention unknown parameter" assert b"Unknown watch configuration parameter" in res.data, "Error message should mention unknown parameter"
# Test 7: Import with complex nested array (browser_steps) - array of objects
browser_steps = json.dumps([
{"operation": "wait", "selector": "5", "optional_value": ""},
{"operation": "click", "selector": "button.submit", "optional_value": ""}
])
params = urllib.parse.urlencode({
'tag': 'browser-test',
'browser_steps': browser_steps
})
res = client.post(
url_for("import") + "?" + params,
data='https://website8.com',
headers={'x-api-key': api_key},
follow_redirects=True
)
assert res.status_code == 200, "Should accept browser_steps array"
uuid = res.json[0]
watch = live_server.app.config['DATASTORE'].data['watching'][uuid]
assert len(watch['browser_steps']) == 2, "Should have 2 browser steps"
assert watch['browser_steps'][0]['operation'] == 'wait', "First step should be wait"
assert watch['browser_steps'][1]['operation'] == 'click', "Second step should be click"
assert watch['browser_steps'][1]['selector'] == 'button.submit', "Second step selector should be button.submit"
# Cleanup
delete_all_watches(client)
def test_api_import_small_synchronous(client, live_server, measure_memory_usage, datastore_path): def test_api_import_small_synchronous(client, live_server, measure_memory_usage, datastore_path):
"""Test that small imports (< threshold) are processed synchronously""" """Test that small imports (< threshold) are processed synchronously"""
@@ -837,7 +933,9 @@ def test_api_url_validation(client, live_server, measure_memory_usage, datastore
) )
assert res.status_code == 400, "Updating watch URL to null should fail" assert res.status_code == 400, "Updating watch URL to null should fail"
# Accept either OpenAPI validation error or our custom validation error # Accept either OpenAPI validation error or our custom validation error
assert b'URL cannot be null' in res.data or b'OpenAPI validation failed' in res.data or b'validation error' in res.data.lower() assert (b'URL cannot be null' in res.data or
b'Validation failed' in res.data or
b'validation error' in res.data.lower())
# Test 8: UPDATE to empty string URL should fail # Test 8: UPDATE to empty string URL should fail
res = client.put( res = client.put(
@@ -924,3 +1022,140 @@ def test_api_url_validation(client, live_server, measure_memory_usage, datastore
headers={'x-api-key': api_key}, headers={'x-api-key': api_key},
) )
delete_all_watches(client) delete_all_watches(client)
def test_api_time_between_check_validation(client, live_server, measure_memory_usage, datastore_path):
    """
    Exercise the createwatch endpoint's time_between_check validation:
    - With time_between_check_use_default set to false, at least one interval value must be > 0
    - Interval values must be valid integers
    - With the default enabled (or the flag omitted), no interval is required
    """
    import json
    from flask import url_for

    api_key = live_server.app.config['DATASTORE'].data['settings']['application'].get('api_access_token')
    json_headers = {'content-type': 'application/json', 'x-api-key': api_key}
    time_units = ("weeks", "days", "hours", "minutes", "seconds")

    def create_watch(payload):
        # One POST to the createwatch endpoint with the given JSON body
        return client.post(
            url_for("createwatch"),
            data=json.dumps(payload),
            headers=json_headers,
        )

    # Tests 1-3: custom interval requested, but missing / all zero / all null -> rejected
    rejected_cases = [
        (
            {"url": "https://example.com", "time_between_check_use_default": False},
            "Should fail when time_between_check_use_default=false with no time_between_check",
        ),
        (
            {
                "url": "https://example.com",
                "time_between_check_use_default": False,
                "time_between_check": dict.fromkeys(time_units, 0),
            },
            "Should fail when all time values are 0",
        ),
        (
            {
                "url": "https://example.com",
                "time_between_check_use_default": False,
                "time_between_check": dict.fromkeys(time_units, None),
            },
            "Should fail when all time values are null",
        ),
    ]
    for payload, expectation in rejected_cases:
        res = create_watch(payload)
        assert res.status_code == 400, expectation
        assert b"At least one time interval" in res.data, "Error message should mention time interval requirement"

    # Tests 4-7: a valid custom interval, or falling back to the defaults -> accepted
    accepted_cases = [
        (
            {
                "url": "https://example.com",
                "time_between_check_use_default": False,
                "time_between_check": {"hours": 2},
            },
            "Should succeed with valid hours value",
        ),
        (
            {
                "url": "https://example2.com",
                "time_between_check_use_default": False,
                "time_between_check": {"minutes": 30},
            },
            "Should succeed with valid minutes value",
        ),
        (
            {"url": "https://example3.com", "time_between_check_use_default": True},
            "Should succeed when using default settings",
        ),
        (
            {"url": "https://example4.com"},
            "Should succeed with default behavior (using global settings)",
        ),
    ]
    created_uuids = []
    for payload, expectation in accepted_cases:
        res = create_watch(payload)
        assert res.status_code == 201, expectation
        created_uuids.append(res.json.get('uuid'))

    # Test 8: Verify integer type validation - string should fail (OpenAPI validation)
    res = create_watch({
        "url": "https://example5.com",
        "time_between_check_use_default": False,
        "time_between_check": {"hours": "not_a_number"},
    })
    assert res.status_code == 400, "Should fail when time value is not an integer"
    assert b"Validation failed" in res.data or b"not of type" in res.data, "Should mention validation/type error"

    # Cleanup: remove only the watches that were actually created above
    for uuid in created_uuids:
        client.delete(
            url_for("watch", uuid=uuid),
            headers={'x-api-key': api_key},
        )

View File

@@ -107,7 +107,7 @@ def test_watch_notification_urls_validation(client, live_server, measure_memory_
headers={'content-type': 'application/json', 'x-api-key': api_key} headers={'content-type': 'application/json', 'x-api-key': api_key}
) )
assert res.status_code == 400, "Should reject non-list notification_urls" assert res.status_code == 400, "Should reject non-list notification_urls"
assert b"OpenAPI validation failed" in res.data or b"Request body validation error" in res.data assert b"Validation failed" in res.data or b"is not of type" in res.data
# Test 6: Verify original URLs are preserved after failed update # Test 6: Verify original URLs are preserved after failed update
res = client.get( res = client.get(
@@ -159,7 +159,7 @@ def test_tag_notification_urls_validation(client, live_server, measure_memory_us
headers={'content-type': 'application/json', 'x-api-key': api_key} headers={'content-type': 'application/json', 'x-api-key': api_key}
) )
assert res.status_code == 400, "Should reject non-list notification_urls" assert res.status_code == 400, "Should reject non-list notification_urls"
assert b"OpenAPI validation failed" in res.data or b"Request body validation error" in res.data assert b"Validation failed" in res.data or b"is not of type" in res.data
# Test 4: Verify original URLs are preserved after failed update # Test 4: Verify original URLs are preserved after failed update
tag = datastore.data['settings']['application']['tags'][tag_uuid] tag = datastore.data['settings']['application']['tags'][tag_uuid]

View File

@@ -26,7 +26,7 @@ def test_openapi_validation_invalid_content_type_on_create_watch(client, live_se
# Should get 400 error due to OpenAPI validation failure # Should get 400 error due to OpenAPI validation failure
assert res.status_code == 400, f"Expected 400 but got {res.status_code}" assert res.status_code == 400, f"Expected 400 but got {res.status_code}"
assert b"OpenAPI validation failed" in res.data, "Should contain OpenAPI validation error message" assert b"Validation failed" in res.data, "Should contain validation error message"
def test_openapi_validation_missing_required_field_create_watch(client, live_server, measure_memory_usage, datastore_path): def test_openapi_validation_missing_required_field_create_watch(client, live_server, measure_memory_usage, datastore_path):
@@ -43,7 +43,7 @@ def test_openapi_validation_missing_required_field_create_watch(client, live_ser
# Should get 400 error due to missing required field # Should get 400 error due to missing required field
assert res.status_code == 400, f"Expected 400 but got {res.status_code}" assert res.status_code == 400, f"Expected 400 but got {res.status_code}"
assert b"OpenAPI validation failed" in res.data, "Should contain OpenAPI validation error message" assert b"Validation failed" in res.data, "Should contain validation error message"
def test_openapi_validation_invalid_field_in_request_body(client, live_server, measure_memory_usage, datastore_path): def test_openapi_validation_invalid_field_in_request_body(client, live_server, measure_memory_usage, datastore_path):
@@ -80,10 +80,9 @@ def test_openapi_validation_invalid_field_in_request_body(client, live_server, m
# Should get 400 error due to invalid field (this will be caught by internal validation) # Should get 400 error due to invalid field (this will be caught by internal validation)
# Note: This tests the flow where OpenAPI validation passes but internal validation catches it # Note: This tests the flow where OpenAPI validation passes but internal validation catches it
assert res.status_code == 400, f"Expected 400 but got {res.status_code}" assert res.status_code == 400, f"Expected 400 but got {res.status_code}"
# With patternProperties for processor_config_*, the error message format changed slightly # Backend validation now returns "Unknown field(s):" message
assert (b"Additional properties are not allowed" in res.data or assert b"Unknown field" in res.data, \
b"does not match any of the regexes" in res.data), \ "Should contain validation error about unknown fields"
"Should contain validation error about additional/invalid properties"
def test_openapi_validation_import_wrong_content_type(client, live_server, measure_memory_usage, datastore_path): def test_openapi_validation_import_wrong_content_type(client, live_server, measure_memory_usage, datastore_path):
@@ -100,7 +99,7 @@ def test_openapi_validation_import_wrong_content_type(client, live_server, measu
# Should get 400 error due to content-type mismatch # Should get 400 error due to content-type mismatch
assert res.status_code == 400, f"Expected 400 but got {res.status_code}" assert res.status_code == 400, f"Expected 400 but got {res.status_code}"
assert b"OpenAPI validation failed" in res.data, "Should contain OpenAPI validation error message" assert b"Validation failed" in res.data, "Should contain validation error message"
def test_openapi_validation_import_correct_content_type_succeeds(client, live_server, measure_memory_usage, datastore_path): def test_openapi_validation_import_correct_content_type_succeeds(client, live_server, measure_memory_usage, datastore_path):
@@ -158,7 +157,7 @@ def test_openapi_validation_create_tag_missing_required_title(client, live_serve
# Should get 400 error due to missing required field # Should get 400 error due to missing required field
assert res.status_code == 400, f"Expected 400 but got {res.status_code}" assert res.status_code == 400, f"Expected 400 but got {res.status_code}"
assert b"OpenAPI validation failed" in res.data, "Should contain OpenAPI validation error message" assert b"Validation failed" in res.data, "Should contain validation error message"
def test_openapi_validation_watch_update_allows_partial_updates(client, live_server, measure_memory_usage, datastore_path): def test_openapi_validation_watch_update_allows_partial_updates(client, live_server, measure_memory_usage, datastore_path):

View File

@@ -176,4 +176,57 @@ def test_api_tags_listing(client, live_server, measure_memory_usage, datastore_p
assert res.status_code == 204 assert res.status_code == 204
def test_roundtrip_API(client, live_server, measure_memory_usage, datastore_path):
    """
    Full API round trip for a tag: create it, GET it, then PUT the fetched
    document straight back.

    This way we test that the default Model, exactly as returned by GET, fits
    back into the OpenAPI spec on update, and that a readOnly field
    (date_created) supplied by the client does not overwrite the stored value.
    """
    api_key = live_server.app.config['DATASTORE'].data['settings']['application'].get('api_access_token')
    json_headers = {'content-type': 'application/json', 'x-api-key': api_key}

    # Value the client tries to force into the readOnly field.
    bogus_date_created = 454444444444

    set_original_response(datastore_path=datastore_path)

    # Create the tag
    res = client.post(
        url_for("tag"),
        data=json.dumps({"title": "My tag title"}),
        headers=json_headers
    )
    assert res.status_code == 201
    tag_uuid = res.json.get('uuid')
    assert tag_uuid, "Tag creation should return the new tag's uuid"

    # Now fetch it and send it back
    res = client.get(
        url_for("tag", uuid=tag_uuid),
        headers={'x-api-key': api_key}
    )
    # Fail here with a clear message rather than with a TypeError on the
    # item assignment below when the fetch did not return a tag document.
    assert res.status_code == 200, f"Expected 200 fetching the tag but got {res.status_code}"
    tag = res.json

    # Only test with date_created (readOnly field that should be filtered out)
    # last_changed is Watch-specific and doesn't apply to Tags
    tag['date_created'] = bogus_date_created

    # HTTP PUT ( UPDATE the existing tag )
    res = client.put(
        url_for("tag", uuid=tag_uuid),
        headers=json_headers,
        data=json.dumps(tag),
    )
    # Carry the response body in the assertion message so a failure is
    # self-explanatory in the pytest report.
    assert res.status_code == 200, f"HTTP PUT update failed with {res.status_code}: {res.data}"

    # Verify readOnly fields like date_created cannot be overridden
    res = client.get(
        url_for("tag", uuid=tag_uuid),
        headers={'x-api-key': api_key}
    )
    assert res.status_code == 200, f"Expected 200 re-fetching the tag but got {res.status_code}"
    date_created = res.json.get('date_created')
    # Check both the integer and its string form, since the stored type is not asserted here.
    assert date_created != bogus_date_created, "ReadOnly date_created should not be updateable"
    assert date_created != str(bogus_date_created), "ReadOnly date_created should not be updateable"

View File

@@ -5,6 +5,8 @@ from flask import url_for
from .util import live_server_setup, wait_for_all_checks, extract_rss_token_from_UI, get_UUID_for_tag_name, extract_UUID_from_client, delete_all_watches from .util import live_server_setup, wait_for_all_checks, extract_rss_token_from_UI, get_UUID_for_tag_name, extract_UUID_from_client, delete_all_watches
import os import os
from ..store import ChangeDetectionStore
# def test_setup(client, live_server, measure_memory_usage, datastore_path): # def test_setup(client, live_server, measure_memory_usage, datastore_path):
# live_server_setup(live_server) # Setup on conftest per function # live_server_setup(live_server) # Setup on conftest per function
@@ -487,7 +489,6 @@ def test_tag_json_persistence(client, live_server, measure_memory_usage, datasto
- Tag deletion removes tag.json file - Tag deletion removes tag.json file
""" """
import json import json
from changedetectionio.store import ChangeDetectionStore
datastore = client.application.config.get('DATASTORE') datastore = client.application.config.get('DATASTORE')
@@ -569,9 +570,6 @@ def test_tag_json_migration_update_27(client, live_server, measure_memory_usage,
This simulates a pre-update_27 datastore and verifies migration works. This simulates a pre-update_27 datastore and verifies migration works.
""" """
import json import json
from changedetectionio.store import ChangeDetectionStore
datastore = client.application.config.get('DATASTORE')
# 1. Create multiple tags # 1. Create multiple tags
tag_names = ['migration-tag-1', 'migration-tag-2', 'migration-tag-3'] tag_names = ['migration-tag-1', 'migration-tag-2', 'migration-tag-3']

View File

@@ -28,7 +28,7 @@ info:
For example: `x-api-key: YOUR_API_KEY` For example: `x-api-key: YOUR_API_KEY`
version: 0.1.5 version: 0.1.6
contact: contact:
name: ChangeDetection.io name: ChangeDetection.io
url: https://github.com/dgtlmoon/changedetection.io url: https://github.com/dgtlmoon/changedetection.io
@@ -126,13 +126,22 @@ components:
WatchBase: WatchBase:
type: object type: object
properties: properties:
uuid:
type: string
format: uuid
description: Unique identifier
readOnly: true
date_created:
type: [integer, 'null']
description: Unix timestamp of creation
readOnly: true
url: url:
type: string type: string
format: uri format: uri
description: URL to monitor for changes description: URL to monitor for changes
maxLength: 5000 maxLength: 5000
title: title:
type: string type: [string, 'null']
description: Custom title for the web page change monitor (watch), not to be confused with page_title description: Custom title for the web page change monitor (watch), not to be confused with page_title
maxLength: 5000 maxLength: 5000
tag: tag:
@@ -156,56 +165,61 @@ components:
description: HTTP method to use description: HTTP method to use
fetch_backend: fetch_backend:
type: string type: string
enum: [html_requests, html_webdriver] description: |
description: Backend to use for fetching content Backend to use for fetching content. Common values:
- `system` (default) - Use the system-wide default fetcher
- `html_requests` - Fast requests-based fetcher
- `html_webdriver` - Browser-based fetcher (Playwright/Puppeteer)
- `extra_browser_*` - Custom browser configurations (if configured)
- Plugin-provided fetchers (if installed)
pattern: '^(system|html_requests|html_webdriver|extra_browser_.+)$'
default: system
headers: headers:
type: object type: object
additionalProperties: additionalProperties:
type: string type: string
description: HTTP headers to include in requests description: HTTP headers to include in requests
body: body:
type: string type: [string, 'null']
description: HTTP request body description: HTTP request body
maxLength: 5000 maxLength: 5000
proxy: proxy:
type: string type: [string, 'null']
description: Proxy configuration description: Proxy configuration
maxLength: 5000 maxLength: 5000
ignore_status_codes:
type: [boolean, 'null']
description: Ignore HTTP status code errors (boolean or null)
webdriver_delay: webdriver_delay:
type: integer type: [integer, 'null']
description: Delay in seconds for webdriver description: Delay in seconds for webdriver
webdriver_js_execute_code: webdriver_js_execute_code:
type: string type: [string, 'null']
description: JavaScript code to execute description: JavaScript code to execute
maxLength: 5000 maxLength: 5000
time_between_check: time_between_check:
type: object type: object
properties: properties:
weeks: weeks:
type: integer type: [integer, 'null']
minimum: 0 minimum: 0
maximum: 52000 maximum: 52000
nullable: true
days: days:
type: integer type: [integer, 'null']
minimum: 0 minimum: 0
maximum: 365000 maximum: 365000
nullable: true
hours: hours:
type: integer type: [integer, 'null']
minimum: 0 minimum: 0
maximum: 8760000 maximum: 8760000
nullable: true
minutes: minutes:
type: integer type: [integer, 'null']
minimum: 0 minimum: 0
maximum: 525600000 maximum: 525600000
nullable: true
seconds: seconds:
type: integer type: [integer, 'null']
minimum: 0 minimum: 0
maximum: 31536000000 maximum: 31536000000
nullable: true
description: Time intervals between checks. All fields must be non-negative. At least one non-zero value required when not using default settings. description: Time intervals between checks. All fields must be non-negative. At least one non-zero value required when not using default settings.
time_between_check_use_default: time_between_check_use_default:
type: boolean type: boolean
@@ -219,11 +233,11 @@ components:
maxItems: 100 maxItems: 100
description: Notification URLs for this web page change monitor (watch). Maximum 100 URLs. description: Notification URLs for this web page change monitor (watch). Maximum 100 URLs.
notification_title: notification_title:
type: string type: [string, 'null']
description: Custom notification title description: Custom notification title
maxLength: 5000 maxLength: 5000
notification_body: notification_body:
type: string type: [string, 'null']
description: Custom notification body description: Custom notification body
maxLength: 5000 maxLength: 5000
notification_format: notification_format:
@@ -231,7 +245,7 @@ components:
enum: ['text', 'html', 'htmlcolor', 'markdown', 'System default'] enum: ['text', 'html', 'htmlcolor', 'markdown', 'System default']
description: Format for notifications description: Format for notifications
track_ldjson_price_data: track_ldjson_price_data:
type: boolean type: [boolean, 'null']
description: Whether to track JSON-LD price data description: Whether to track JSON-LD price data
browser_steps: browser_steps:
type: array type: array
@@ -239,17 +253,14 @@ components:
type: object type: object
properties: properties:
operation: operation:
type: string type: [string, 'null']
maxLength: 5000 maxLength: 5000
nullable: true
selector: selector:
type: string type: [string, 'null']
maxLength: 5000 maxLength: 5000
nullable: true
optional_value: optional_value:
type: string type: [string, 'null']
maxLength: 5000 maxLength: 5000
nullable: true
required: [operation, selector, optional_value] required: [operation, selector, optional_value]
additionalProperties: false additionalProperties: false
maxItems: 100 maxItems: 100
@@ -260,16 +271,197 @@ components:
default: text_json_diff default: text_json_diff
description: Optional processor mode to use for change detection. Defaults to `text_json_diff` if not specified. description: Optional processor mode to use for change detection. Defaults to `text_json_diff` if not specified.
# Content Filtering
include_filters:
type: array
items:
type: string
maxLength: 5000
maxItems: 100
description: CSS/XPath selectors to extract specific content from the page
subtractive_selectors:
type: array
items:
type: string
maxLength: 5000
maxItems: 100
description: CSS/XPath selectors to remove content from the page
ignore_text:
type: array
items:
type: string
maxLength: 5000
maxItems: 100
description: Text patterns to ignore in change detection
trigger_text:
type: array
items:
type: string
maxLength: 5000
maxItems: 100
description: Text/regex patterns that must be present to trigger a change
text_should_not_be_present:
type: array
items:
type: string
maxLength: 5000
maxItems: 100
description: Text that should NOT be present (triggers alert if found)
extract_text:
type: array
items:
type: string
maxLength: 5000
maxItems: 100
description: Regex patterns to extract specific text after filtering
# Text Processing
trim_text_whitespace:
type: boolean
default: false
description: Strip leading/trailing whitespace from text
sort_text_alphabetically:
type: boolean
default: false
description: Sort lines alphabetically before comparison
remove_duplicate_lines:
type: boolean
default: false
description: Remove duplicate lines from content
check_unique_lines:
type: boolean
default: false
description: Compare against all history for unique lines
strip_ignored_lines:
type: [boolean, 'null']
description: Remove lines matching ignore patterns
# Change Detection Filters
filter_text_added:
type: boolean
default: true
description: Include added text in change detection
filter_text_removed:
type: boolean
default: true
description: Include removed text in change detection
filter_text_replaced:
type: boolean
default: true
description: Include replaced text in change detection
# Restock/Price Detection
in_stock_only:
type: boolean
default: true
description: Only trigger on in-stock transitions (restock_diff processor)
follow_price_changes:
type: boolean
default: true
description: Monitor and track price changes (restock_diff processor)
price_change_threshold_percent:
type: [number, 'null']
description: Minimum price change percentage to trigger notification
has_ldjson_price_data:
type: [boolean, 'null']
description: Whether page has LD-JSON price data (auto-detected)
readOnly: true
# Notifications
notification_screenshot:
type: boolean
default: false
description: Include screenshot in notifications (if supported by notification URL)
filter_failure_notification_send:
type: boolean
default: true
description: Send notification when filters fail to match content
# History & Display
use_page_title_in_list:
type: [boolean, 'null']
description: Display page title in watch list (null = use system default)
history_snapshot_max_length:
type: [integer, 'null']
minimum: 1
maximum: 1000
description: Maximum number of history snapshots to keep (null = use system default)
# Scheduling
time_schedule_limit:
type: object
description: Weekly schedule limiting when checks can run
properties:
enabled:
type: boolean
default: false
monday:
$ref: '#/components/schemas/DaySchedule'
tuesday:
$ref: '#/components/schemas/DaySchedule'
wednesday:
$ref: '#/components/schemas/DaySchedule'
thursday:
$ref: '#/components/schemas/DaySchedule'
friday:
$ref: '#/components/schemas/DaySchedule'
saturday:
$ref: '#/components/schemas/DaySchedule'
sunday:
$ref: '#/components/schemas/DaySchedule'
# Conditions (advanced logic)
conditions:
type: array
items:
type: object
properties:
field:
type: string
description: Field to check (e.g., 'page_filtered_text', 'page_title')
operator:
type: string
description: Comparison operator (e.g., 'contains_regex', 'equals', 'not_equals')
value:
type: string
description: Value to compare against
required: [field, operator, value]
maxItems: 100
description: Array of condition rules for change detection logic (empty array when not set)
conditions_match_logic:
type: string
enum: ['ALL', 'ANY']
default: 'ALL'
description: Logic operator - ALL (match all conditions) or ANY (match any condition)
# Per-day entry of the weekly `time_schedule_limit` object (referenced once
# for each of monday..sunday).
DaySchedule:
  type: object
  properties:
    enabled:
      type: boolean
      default: true
    start_time:
      type: string
      # Note: the pattern also accepts a single-digit hour (e.g. `9:30`),
      # not only zero-padded `09:30`.
      pattern: '^([0-1]?[0-9]|2[0-3]):[0-5][0-9]$'
      default: '00:00'
      description: Start time in HH:MM format
    duration:
      type: object
      properties:
        # Both duration parts are digit-only *strings*, not integers.
        hours:
          type: string
          pattern: '^[0-9]+$'
          default: '24'
        minutes:
          type: string
          pattern: '^[0-9]+$'
          default: '00'
Watch: Watch:
allOf: allOf:
- $ref: '#/components/schemas/WatchBase' - $ref: '#/components/schemas/WatchBase'
- type: object - type: object
properties: properties:
uuid:
type: string
format: uuid
description: Unique identifier for the web page change monitor (watch)
readOnly: true
last_checked: last_checked:
type: integer type: integer
description: Unix timestamp of last check description: Unix timestamp of last check
@@ -278,9 +470,10 @@ components:
type: integer type: integer
description: Unix timestamp of last change description: Unix timestamp of last change
readOnly: true readOnly: true
x-computed: true
last_error: last_error:
type: string type: [string, boolean, 'null']
description: Last error message description: Last error message (false when no error, string when error occurred, null if not checked yet)
readOnly: true readOnly: true
last_viewed: last_viewed:
type: integer type: integer
@@ -291,6 +484,61 @@ components:
format: string format: string
description: The watch URL rendered in case of any Jinja2 markup, always use this for listing. description: The watch URL rendered in case of any Jinja2 markup, always use this for listing.
readOnly: true readOnly: true
x-computed: true
page_title:
type: [string, 'null']
description: HTML <title> tag extracted from the page
readOnly: true
check_count:
type: integer
description: Total number of checks performed
readOnly: true
fetch_time:
type: number
description: Duration of last fetch in seconds
readOnly: true
previous_md5:
type: [string, boolean]
description: MD5 hash of previous content (false if not set)
readOnly: true
previous_md5_before_filters:
type: [string, boolean]
description: MD5 hash before filters applied (false if not set)
readOnly: true
consecutive_filter_failures:
type: integer
description: Counter for consecutive filter match failures
readOnly: true
last_notification_error:
type: [string, 'null']
description: Last notification error message
readOnly: true
notification_alert_count:
type: integer
description: Number of notifications sent
readOnly: true
content-type:
type: [string, 'null']
description: Content-Type from last fetch
readOnly: true
remote_server_reply:
type: [string, 'null']
description: Server header from last response
readOnly: true
browser_steps_last_error_step:
type: [integer, 'null']
description: Last browser step that caused an error
readOnly: true
viewed:
type: [integer, boolean]
description: Computed property - true if watch has been viewed, false otherwise (deprecated, use last_viewed instead)
readOnly: true
x-computed: true
history_n:
type: integer
description: Number of history snapshots available
readOnly: true
x-computed: true
CreateWatch: CreateWatch:
allOf: allOf:
@@ -301,34 +549,45 @@ components:
UpdateWatch: UpdateWatch:
allOf: allOf:
- $ref: '#/components/schemas/WatchBase' - $ref: '#/components/schemas/WatchBase' # Extends WatchBase for user-settable fields
- type: object - type: object
properties: properties:
last_viewed: last_viewed:
type: integer type: integer
description: Unix timestamp in seconds of the last time the watch was viewed. Setting it to a value higher than `last_changed` in the "Update watch" endpoint marks the watch as viewed. description: Unix timestamp in seconds of the last time the watch was viewed. Setting it to a value higher than `last_changed` in the "Update watch" endpoint marks the watch as viewed.
minimum: 0 minimum: 0
# Note: ReadOnly and @property fields are filtered out in the backend before update
# We don't use unevaluatedProperties:false here to allow roundtrip GET/PUT workflows
# where the response includes computed fields that should be silently ignored
Tag: Tag:
type: object allOf:
properties: - $ref: '#/components/schemas/WatchBase'
uuid: - type: object
type: string properties:
format: uuid overrides_watch:
description: Unique identifier for the tag type: [boolean, 'null']
readOnly: true description: |
title: Whether this tag's settings override watch settings for all watches in this tag/group.
type: string - true: Tag settings override watch settings
description: Tag title - false: Tag settings do not override (watches use their own settings)
maxLength: 5000 - null: Not decided yet / inherit default behavior
notification_urls: # Future: Aggregated statistics from all watches with this tag
type: array # check_count:
items: # type: integer
type: string # description: Sum of check_count from all watches with this tag
description: Default notification URLs for web page change monitors (watches) with this tag # readOnly: true
notification_muted: # x-computed: true
type: boolean # last_checked:
description: Whether notifications are muted for this tag # type: integer
# description: Most recent last_checked timestamp from all watches with this tag
# readOnly: true
# x-computed: true
# last_changed:
# type: integer
# description: Most recent last_changed timestamp from all watches with this tag
# readOnly: true
# x-computed: true
CreateTag: CreateTag:
allOf: allOf:

File diff suppressed because one or more lines are too long

View File

@@ -5,7 +5,6 @@ flask-compress
# 0.6.3 included compatibility fix for werkzeug 3.x (2.x had deprecation of url handlers) # 0.6.3 included compatibility fix for werkzeug 3.x (2.x had deprecation of url handlers)
flask-login>=0.6.3 flask-login>=0.6.3
flask-paginate flask-paginate
flask_expects_json~=1.7
flask_restful flask_restful
flask_cors # For the Chrome extension to operate flask_cors # For the Chrome extension to operate
# janus # No longer needed - using pure threading.Queue for multi-loop support # janus # No longer needed - using pure threading.Queue for multi-loop support
@@ -126,8 +125,8 @@ greenlet >= 3.0.3
# Default SOCKETIO_MODE=threading is recommended for better compatibility # Default SOCKETIO_MODE=threading is recommended for better compatibility
gevent gevent
# Pinned or it causes problems with flask_expects_json which seems unmaintained # Previously pinned for flask_expects_json (removed 2026-02). Unpinning for now.
referencing==0.35.1 referencing
# For conditions # For conditions
panzi-json-logic panzi-json-logic