Compare commits

..

8 Commits

Author SHA1 Message Date
dgtlmoon 04e22f1b0a Be sure Batch Mode cant run when flask mode is running
Publish Python 🐍distribution 📦 to PyPI and TestPyPI / Build distribution 📦 (push) Has been cancelled
ChangeDetection.io App Test / lint-code (push) Has been cancelled
Publish Python 🐍distribution 📦 to PyPI and TestPyPI / Test the built package works basically. (push) Has been cancelled
Publish Python 🐍distribution 📦 to PyPI and TestPyPI / Publish Python 🐍 distribution 📦 to PyPI (push) Has been cancelled
ChangeDetection.io App Test / test-application-3-10 (push) Has been cancelled
ChangeDetection.io App Test / test-application-3-11 (push) Has been cancelled
ChangeDetection.io App Test / test-application-3-12 (push) Has been cancelled
ChangeDetection.io App Test / test-application-3-13 (push) Has been cancelled
2026-01-24 13:42:21 +01:00
dgtlmoon 2a0e8cbf6c Repeat checks 2026-01-24 13:40:38 +01:00
dgtlmoon 901d69af42 Better cross-worker watch UUID management 2026-01-24 13:29:20 +01:00
dgtlmoon 528c305928 setup tweaks 2026-01-24 13:13:10 +01:00
dgtlmoon a4357c7bb7 fi ximports 2026-01-24 13:13:00 +01:00
dgtlmoon 75db43fc09 batch mode fixes 2026-01-23 12:14:44 +01:00
dgtlmoon f8c6c62107 Adding CLI options 2026-01-23 11:37:06 +01:00
dgtlmoon 4523918752 Adding memory info stats to output log on app shutdown 2026-01-23 10:34:02 +01:00
75 changed files with 1993 additions and 4636 deletions
+2 -2
View File
@@ -61,8 +61,8 @@ jobs:
# --- API test ---
# This also means that the docs/api-spec.yml was shipped and could be read
test -f /tmp/changedetection.json
API_KEY=$(jq -r '.. | .api_access_token? // empty' /tmp/changedetection.json)
test -f /tmp/url-watches.json
API_KEY=$(jq -r '.. | .api_access_token? // empty' /tmp/url-watches.json)
echo Test API KEY is $API_KEY
curl -X POST "http://127.0.0.1:10000/api/v1/watch" \
-H "x-api-key: ${API_KEY}" \
@@ -37,29 +37,10 @@ jobs:
${{ runner.os }}-pip-py${{ env.PYTHON_VERSION }}-
${{ runner.os }}-pip-
- name: Get current date for cache key
id: date
run: echo "date=$(date +'%Y-%m-%d')" >> $GITHUB_OUTPUT
- name: Set up Docker Buildx
uses: docker/setup-buildx-action@v3
- name: Build changedetection.io container for testing under Python ${{ env.PYTHON_VERSION }}
uses: docker/build-push-action@v6
with:
context: ./
file: ./Dockerfile
build-args: |
PYTHON_VERSION=${{ env.PYTHON_VERSION }}
LOGGER_LEVEL=TRACE
tags: test-changedetectionio
load: true
cache-from: type=gha,scope=build-${{ github.ref_name }}-py${{ env.PYTHON_VERSION }}-${{ hashFiles('requirements.txt', 'Dockerfile') }}-${{ steps.date.outputs.date }}
cache-to: type=gha,mode=max,scope=build-${{ github.ref_name }}-py${{ env.PYTHON_VERSION }}-${{ hashFiles('requirements.txt', 'Dockerfile') }}-${{ steps.date.outputs.date }}
- name: Verify build
run: |
echo "---- Built for Python ${{ env.PYTHON_VERSION }} -----"
echo "---- Building for Python ${{ env.PYTHON_VERSION }} -----"
docker build --build-arg PYTHON_VERSION=${{ env.PYTHON_VERSION }} --build-arg LOGGER_LEVEL=TRACE -t test-changedetectionio .
docker run test-changedetectionio bash -c 'pip list'
- name: We should be Python ${{ env.PYTHON_VERSION }} ...
@@ -395,29 +376,6 @@ jobs:
cd changedetectionio
./run_custom_browser_url_tests.sh
processor-plugin-tests:
runs-on: ubuntu-latest
needs: build
timeout-minutes: 20
env:
PYTHON_VERSION: ${{ inputs.python-version }}
steps:
- uses: actions/checkout@v6
- name: Download Docker image artifact
uses: actions/download-artifact@v7
with:
name: test-changedetectionio-${{ env.PYTHON_VERSION }}
path: /tmp
- name: Load Docker image
run: |
docker load -i /tmp/test-changedetectionio.tar
- name: Basic processor plugin registration and checks
run: |
docker run -e EXTRA_PACKAGES=changedetection.io-osint-processor test-changedetectionio bash -c 'cd changedetectionio;pytest -vvv -s tests/plugins/test_processor.py::test_check_plugin_processor'
# Container startup tests
container-tests:
runs-on: ubuntu-latest
-1
View File
@@ -29,4 +29,3 @@ test-datastore/
# Memory consumption log
test-memory.log
tests/logs/
-9
View File
@@ -138,15 +138,6 @@ ENV LOGGER_LEVEL="$LOGGER_LEVEL"
ENV LC_ALL=en_US.UTF-8
WORKDIR /app
# Copy and set up entrypoint script for installing extra packages
COPY docker-entrypoint.sh /docker-entrypoint.sh
RUN chmod +x /docker-entrypoint.sh
# Set entrypoint to handle EXTRA_PACKAGES env var
ENTRYPOINT ["/docker-entrypoint.sh"]
# Default command (can be overridden in docker-compose.yml)
CMD ["python", "./changedetection.py", "-d", "/datastore"]
-1
View File
@@ -9,7 +9,6 @@ recursive-include changedetectionio/notification *
recursive-include changedetectionio/processors *
recursive-include changedetectionio/realtime *
recursive-include changedetectionio/static *
recursive-include changedetectionio/store *
recursive-include changedetectionio/templates *
recursive-include changedetectionio/tests *
recursive-include changedetectionio/translations *
+76 -72
View File
@@ -102,8 +102,8 @@ def sigshutdown_handler(_signo, _stack_frame):
# Shutdown workers and queues immediately
try:
from changedetectionio import worker_pool
worker_pool.shutdown_workers()
from changedetectionio import worker_handler
worker_handler.shutdown_workers()
except Exception as e:
logger.error(f"Error shutting down workers: {str(e)}")
@@ -124,57 +124,52 @@ def sigshutdown_handler(_signo, _stack_frame):
except Exception as e:
logger.error(f"Error shutting down Socket.IO server: {str(e)}")
# Save data quickly - force immediate save using abstract method
# Save data quickly
try:
datastore.force_save_all()
logger.success('Fast sync to storage complete.')
datastore.sync_to_json()
logger.success('Fast sync to disk complete.')
except Exception as e:
logger.error(f"Error syncing to storage: {str(e)}")
logger.error(f"Error syncing to disk: {str(e)}")
sys.exit()
def print_help():
    """Write the command line usage summary to stdout, one line per entry.

    The text is grouped into sections (standard options, adding URLs on
    startup, recheck on startup, batch mode); an empty entry produces the
    blank separator line between sections. Nothing is returned.
    """
    # Kept as data so the help text can be read top-to-bottom exactly as
    # it is emitted; each entry becomes one print() call.
    help_lines = (
        'Usage: changedetection.py [options]',
        '',
        'Standard options:',
        ' -s SSL enable',
        ' -h HOST Listen host (default: 0.0.0.0)',
        ' -p PORT Listen port (default: 5000)',
        ' -d PATH Datastore path',
        ' -l LEVEL Log level (TRACE, DEBUG, INFO, SUCCESS, WARNING, ERROR, CRITICAL)',
        ' -c Cleanup unused snapshots',
        ' -C Create datastore directory if it doesn\'t exist',
        ' -P true/false Set all watches paused (true) or active (false)',
        '',
        'Add URLs on startup:',
        ' -u URL Add URL to watch (can be used multiple times)',
        ' -u0 \'JSON\' Set options for first -u URL (e.g. \'{"processor":"text_json_diff"}\')',
        ' -u1 \'JSON\' Set options for second -u URL (0-indexed)',
        ' -u2 \'JSON\' Set options for third -u URL, etc.',
        ' Available options: processor, fetch_backend, headers, method, etc.',
        ' See model/Watch.py for all available options',
        '',
        'Recheck on startup:',
        ' -r all Queue all watches for recheck on startup',
        ' -r UUID,... Queue specific watches (comma-separated UUIDs)',
        ' -r all N Queue all watches, wait for completion, repeat N times',
        ' -r UUID,... N Queue specific watches, wait for completion, repeat N times',
        '',
        'Batch mode:',
        ' -b Run in batch mode (process queue then exit)',
        ' Useful for CI/CD, cron jobs, or one-time checks',
        ' NOTE: Batch mode checks if Flask is running and aborts if port is in use',
        ' Use -p PORT to specify a different port if needed',
        '',
    )
    for help_line in help_lines:
        print(help_line)
def main():
global datastore
global app
# Early help/version check before any initialization
if '--help' in sys.argv or '-help' in sys.argv:
print_help()
print('Usage: changedetection.py [options]')
print('')
print('Standard options:')
print(' -s SSL enable')
print(' -h HOST Listen host (default: 0.0.0.0)')
print(' -p PORT Listen port (default: 5000)')
print(' -d PATH Datastore path')
print(' -l LEVEL Log level (TRACE, DEBUG, INFO, SUCCESS, WARNING, ERROR, CRITICAL)')
print(' -c Cleanup unused snapshots')
print(' -C Create datastore directory if it doesn\'t exist')
print('')
print('Add URLs on startup:')
print(' -u URL Add URL to watch (can be used multiple times)')
print(' -u0 \'JSON\' Set options for first -u URL (e.g. \'{"processor":"text_json_diff"}\')')
print(' -u1 \'JSON\' Set options for second -u URL (0-indexed)')
print(' -u2 \'JSON\' Set options for third -u URL, etc.')
print(' Available options: processor, fetch_backend, headers, method, etc.')
print(' See model/Watch.py for all available options')
print('')
print('Recheck on startup:')
print(' -r all Queue all watches for recheck on startup')
print(' -r UUID,... Queue specific watches (comma-separated UUIDs)')
print(' -r all N Queue all watches, wait for completion, repeat N times')
print(' -r UUID,... N Queue specific watches, wait for completion, repeat N times')
print('')
print('Batch mode:')
print(' -b Run in batch mode (process queue then exit)')
print(' Useful for CI/CD, cron jobs, or one-time checks')
print(' NOTE: Batch mode checks if Flask is running and aborts if port is in use')
print(' Use -p PORT to specify a different port if needed')
print('')
sys.exit(0)
if '--version' in sys.argv or '-v' in sys.argv:
@@ -190,7 +185,6 @@ def main():
# Set a default logger level
logger_level = 'DEBUG'
include_default_watches = True
all_paused = None # None means don't change, True/False to set
host = os.environ.get("LISTEN_HOST", "0.0.0.0").strip()
port = int(os.environ.get('PORT', 5000))
@@ -269,9 +263,39 @@ def main():
i += 1
try:
opts, args = getopt.getopt(cleaned_argv[1:], "6Ccsd:h:p:l:P:", "port")
opts, args = getopt.getopt(cleaned_argv[1:], "6Ccsd:h:p:l:", "port")
except getopt.GetoptError as e:
print_help()
print('Usage: changedetection.py [options]')
print('')
print('Standard options:')
print(' -s SSL enable')
print(' -h HOST Listen host (default: 0.0.0.0)')
print(' -p PORT Listen port (default: 5000)')
print(' -d PATH Datastore path')
print(' -l LEVEL Log level (TRACE, DEBUG, INFO, SUCCESS, WARNING, ERROR, CRITICAL)')
print(' -c Cleanup unused snapshots')
print(' -C Create datastore directory if it doesn\'t exist')
print('')
print('Add URLs on startup:')
print(' -u URL Add URL to watch (can be used multiple times)')
print(' -u0 \'JSON\' Set options for first -u URL (e.g. \'{"processor":"text_json_diff"}\')')
print(' -u1 \'JSON\' Set options for second -u URL (0-indexed)')
print(' -u2 \'JSON\' Set options for third -u URL, etc.')
print(' Available options: processor, fetch_backend, headers, method, etc.')
print(' See model/Watch.py for all available options')
print('')
print('Recheck on startup:')
print(' -r all Queue all watches for recheck on startup')
print(' -r UUID,... Queue specific watches (comma-separated UUIDs)')
print(' -r all N Queue all watches, wait for completion, repeat N times')
print(' -r UUID,... N Queue specific watches, wait for completion, repeat N times')
print('')
print('Batch mode:')
print(' -b Run in batch mode (process queue then exit)')
print(' Useful for CI/CD, cron jobs, or one-time checks')
print(' NOTE: Batch mode checks if Flask is running and aborts if port is in use')
print(' Use -p PORT to specify a different port if needed')
print('')
print(f'Error: {e}')
sys.exit(2)
@@ -308,14 +332,6 @@ def main():
if opt == '-l':
logger_level = int(arg) if arg.isdigit() else arg.upper()
if opt == '-P':
try:
all_paused = bool(strtobool(arg))
except ValueError:
print(f'Error: Invalid value for -P option: {arg}')
print('Expected: true, false, yes, no, 1, or 0')
sys.exit(2)
# If URLs are provided, don't include default watches
if urls_to_add:
include_default_watches = False
@@ -382,11 +398,6 @@ def main():
logger.critical(str(e))
return
# Apply all_paused setting if specified via CLI
if all_paused is not None:
datastore.data['settings']['application']['all_paused'] = all_paused
logger.info(f"Setting all watches paused: {all_paused}")
# Inject datastore into plugins that need access to settings
from changedetectionio.pluggy_interface import inject_datastore_into_plugins
inject_datastore_into_plugins(datastore)
@@ -415,12 +426,12 @@ def main():
# This must happen AFTER app initialization so update_q is available
if batch_mode and added_watch_uuids:
from changedetectionio.flask_app import update_q
from changedetectionio import queuedWatchMetaData, worker_pool
from changedetectionio import queuedWatchMetaData, worker_handler
logger.info(f"Batch mode: Queuing {len(added_watch_uuids)} newly added watches")
for watch_uuid in added_watch_uuids:
try:
worker_pool.queue_item_async_safe(
worker_handler.queue_item_async_safe(
update_q,
queuedWatchMetaData.PrioritizedItem(priority=1, item={'uuid': watch_uuid})
)
@@ -432,7 +443,7 @@ def main():
# This must happen AFTER app initialization so update_q is available
if recheck_watches is not None:
from changedetectionio.flask_app import update_q
from changedetectionio import queuedWatchMetaData, worker_pool
from changedetectionio import queuedWatchMetaData, worker_handler
watches_to_queue = []
if recheck_watches == 'all':
@@ -454,7 +465,7 @@ def main():
for watch_uuid in watches_to_queue:
if watch_uuid in datastore.data['watching']:
try:
worker_pool.queue_item_async_safe(
worker_handler.queue_item_async_safe(
update_q,
queuedWatchMetaData.PrioritizedItem(priority=1, item={'uuid': watch_uuid})
)
@@ -516,7 +527,7 @@ def main():
for watch_uuid in watches_to_queue:
if watch_uuid in datastore.data['watching']:
try:
worker_pool.queue_item_async_safe(
worker_handler.queue_item_async_safe(
update_q,
queuedWatchMetaData.PrioritizedItem(priority=1, item={'uuid': watch_uuid})
)
@@ -549,7 +560,7 @@ def main():
logger.info(f"Batch mode: Waiting for iteration {current_iteration}/{total_iterations} to complete...")
# Use the shared wait_for_all_checks function
completed = worker_pool.wait_for_all_checks(update_q, timeout=300)
completed = worker_handler.wait_for_all_checks(update_q, timeout=300)
if not completed:
logger.warning(f"Batch mode: Iteration {current_iteration} timed out after 300 seconds")
@@ -642,14 +653,7 @@ def main():
if os.getenv('USE_X_SETTINGS'):
logger.info("USE_X_SETTINGS is ENABLED")
from werkzeug.middleware.proxy_fix import ProxyFix
app.wsgi_app = ProxyFix(
app.wsgi_app,
x_for=1, # X-Forwarded-For (client IP)
x_proto=1, # X-Forwarded-Proto (http/https)
x_host=1, # X-Forwarded-Host (original host)
x_port=1, # X-Forwarded-Port (original port)
x_prefix=1 # X-Forwarded-Prefix (URL prefix)
)
app.wsgi_app = ProxyFix(app.wsgi_app, x_prefix=1, x_host=1)
# In batch mode, skip starting the HTTP server - just keep workers running
+3 -13
View File
@@ -1,5 +1,5 @@
from changedetectionio import queuedWatchMetaData
from changedetectionio import worker_pool
from changedetectionio import worker_handler
from flask_expects_json import expects_json
from flask_restful import abort, Resource
from loguru import logger
@@ -42,7 +42,7 @@ class Tag(Resource):
# If less than 20 watches, queue synchronously for immediate feedback
if len(watches_to_queue) < 20:
for watch_uuid in watches_to_queue:
worker_pool.queue_item_async_safe(self.update_q, queuedWatchMetaData.PrioritizedItem(priority=1, item={'uuid': watch_uuid}))
worker_handler.queue_item_async_safe(self.update_q, queuedWatchMetaData.PrioritizedItem(priority=1, item={'uuid': watch_uuid}))
return {'status': f'OK, queued {len(watches_to_queue)} watches for rechecking'}, 200
else:
# 20+ watches - queue in background thread to avoid blocking API response
@@ -50,7 +50,7 @@ class Tag(Resource):
"""Background thread to queue watches - discarded after completion."""
try:
for watch_uuid in watches_to_queue:
worker_pool.queue_item_async_safe(self.update_q, queuedWatchMetaData.PrioritizedItem(priority=1, item={'uuid': watch_uuid}))
worker_handler.queue_item_async_safe(self.update_q, queuedWatchMetaData.PrioritizedItem(priority=1, item={'uuid': watch_uuid}))
logger.info(f"Background queueing complete for tag {tag['uuid']}: {len(watches_to_queue)} watches queued")
except Exception as e:
logger.error(f"Error in background queueing for tag {tag['uuid']}: {e}")
@@ -96,16 +96,6 @@ class Tag(Resource):
if not tag:
abort(404, message='No tag exists with the UUID of {}'.format(uuid))
# Validate notification_urls if provided
if 'notification_urls' in request.json:
from wtforms import ValidationError
from changedetectionio.api.Notifications import validate_notification_urls
try:
notification_urls = request.json.get('notification_urls', [])
validate_notification_urls(notification_urls)
except ValidationError as e:
return str(e), 400
tag.update(request.json)
self.datastore.needs_write_urgent = True
+55 -45
View File
@@ -6,7 +6,7 @@ from changedetectionio.favicon_utils import get_favicon_mime_type
from . import auth
from changedetectionio import queuedWatchMetaData, strtobool
from changedetectionio import worker_pool
from changedetectionio import worker_handler
from flask import request, make_response, send_from_directory
from flask_expects_json import expects_json
from flask_restful import abort, Resource
@@ -85,7 +85,7 @@ class Watch(Resource):
abort(404, message='No watch exists with the UUID of {}'.format(uuid))
if request.args.get('recheck'):
worker_pool.queue_item_async_safe(self.update_q, queuedWatchMetaData.PrioritizedItem(priority=1, item={'uuid': uuid}))
worker_handler.queue_item_async_safe(self.update_q, queuedWatchMetaData.PrioritizedItem(priority=1, item={'uuid': uuid}))
return "OK", 200
if request.args.get('paused', '') == 'paused':
self.datastore.data['watching'].get(uuid).pause()
@@ -140,16 +140,6 @@ class Watch(Resource):
if validation_error:
return validation_error, 400
# Validate notification_urls if provided
if 'notification_urls' in request.json:
from wtforms import ValidationError
from changedetectionio.api.Notifications import validate_notification_urls
try:
notification_urls = request.json.get('notification_urls', [])
validate_notification_urls(notification_urls)
except ValidationError as e:
return str(e), 400
# XSS etc protection - validate URL if it's being updated
if 'url' in request.json:
new_url = request.json.get('url')
@@ -169,18 +159,58 @@ class Watch(Resource):
# Handle processor-config-* fields separately (save to JSON, not datastore)
from changedetectionio import processors
processor_config_data = {}
regular_data = {}
# Make a mutable copy of request.json for modification
json_data = dict(request.json)
# Extract and remove processor config fields from json_data
processor_config_data = processors.extract_processor_config_from_form_data(json_data)
for key, value in request.json.items():
if key.startswith('processor_config_'):
config_key = key.replace('processor_config_', '')
if value: # Only save non-empty values
processor_config_data[config_key] = value
else:
regular_data[key] = value
# Update watch with regular (non-processor-config) fields
watch.update(json_data)
watch.update(regular_data)
# Save processor config to JSON file
processors.save_processor_config(self.datastore, uuid, processor_config_data)
# Save processor config to JSON file if any config data exists
if processor_config_data:
try:
processor_name = request.json.get('processor', watch.get('processor'))
if processor_name:
# Create a processor instance to access config methods
from changedetectionio.processors import difference_detection_processor
processor_instance = difference_detection_processor(self.datastore, uuid)
# Use processor name as filename so each processor keeps its own config
config_filename = f'{processor_name}.json'
processor_instance.update_extra_watch_config(config_filename, processor_config_data)
logger.debug(f"API: Saved processor config to {config_filename}: {processor_config_data}")
# Call optional edit_hook if processor has one
try:
import importlib
edit_hook_module_name = f'changedetectionio.processors.{processor_name}.edit_hook'
try:
edit_hook = importlib.import_module(edit_hook_module_name)
logger.debug(f"API: Found edit_hook module for {processor_name}")
if hasattr(edit_hook, 'on_config_save'):
logger.info(f"API: Calling edit_hook.on_config_save for {processor_name}")
# Call hook and get updated config
updated_config = edit_hook.on_config_save(watch, processor_config_data, self.datastore)
# Save updated config back to file
processor_instance.update_extra_watch_config(config_filename, updated_config)
logger.info(f"API: Edit hook updated config: {updated_config}")
else:
logger.debug(f"API: Edit hook module found but no on_config_save function")
except ModuleNotFoundError:
logger.debug(f"API: No edit_hook module for processor {processor_name} (this is normal)")
except Exception as hook_error:
logger.error(f"API: Edit hook error (non-fatal): {hook_error}", exc_info=True)
except Exception as e:
logger.error(f"API: Failed to save processor config: {e}")
return "OK", 200
@@ -414,16 +444,6 @@ class CreateWatch(Resource):
if validation_error:
return validation_error, 400
# Validate notification_urls if provided
if 'notification_urls' in json_data:
from wtforms import ValidationError
from changedetectionio.api.Notifications import validate_notification_urls
try:
notification_urls = json_data.get('notification_urls', [])
validate_notification_urls(notification_urls)
except ValidationError as e:
return str(e), 400
extras = copy.deepcopy(json_data)
# Because we renamed 'tag' to 'tags' but don't want to change the API (can do this in v2 of the API)
@@ -437,19 +457,9 @@ class CreateWatch(Resource):
new_uuid = self.datastore.add_watch(url=url, extras=extras, tag=tags)
if new_uuid:
# Dont queue because the scheduler will check that it hasnt been checked before anyway
# worker_pool.queue_item_async_safe(self.update_q, queuedWatchMetaData.PrioritizedItem(priority=1, item={'uuid': new_uuid}))
# worker_handler.queue_item_async_safe(self.update_q, queuedWatchMetaData.PrioritizedItem(priority=1, item={'uuid': new_uuid}))
return {'uuid': new_uuid}, 201
else:
# Check if it was a limit issue
page_watch_limit = os.getenv('PAGE_WATCH_LIMIT')
if page_watch_limit:
try:
page_watch_limit = int(page_watch_limit)
current_watch_count = len(self.datastore.data['watching'])
if current_watch_count >= page_watch_limit:
return f"Watch limit reached ({current_watch_count}/{page_watch_limit} watches). Cannot add more watches.", 429
except ValueError:
pass
return "Invalid or unsupported URL", 400
@auth.check_token
@@ -484,7 +494,7 @@ class CreateWatch(Resource):
if len(watches_to_queue) < 20:
# Get already queued/running UUIDs once (efficient)
queued_uuids = set(self.update_q.get_queued_uuids())
running_uuids = set(worker_pool.get_running_uuids())
running_uuids = set(worker_handler.get_running_uuids())
# Filter out watches that are already queued or running
watches_to_queue_filtered = [
@@ -494,7 +504,7 @@ class CreateWatch(Resource):
# Queue only the filtered watches
for uuid in watches_to_queue_filtered:
worker_pool.queue_item_async_safe(self.update_q, queuedWatchMetaData.PrioritizedItem(priority=1, item={'uuid': uuid}))
worker_handler.queue_item_async_safe(self.update_q, queuedWatchMetaData.PrioritizedItem(priority=1, item={'uuid': uuid}))
# Provide feedback about skipped watches
skipped_count = len(watches_to_queue) - len(watches_to_queue_filtered)
@@ -506,7 +516,7 @@ class CreateWatch(Resource):
# 20+ watches - queue in background thread to avoid blocking API response
# Capture queued/running state before background thread
queued_uuids = set(self.update_q.get_queued_uuids())
running_uuids = set(worker_pool.get_running_uuids())
running_uuids = set(worker_handler.get_running_uuids())
def queue_all_watches_background():
"""Background thread to queue all watches - discarded after completion."""
@@ -516,7 +526,7 @@ class CreateWatch(Resource):
for uuid in watches_to_queue:
# Check if already queued or running (state captured at start)
if uuid not in queued_uuids and uuid not in running_uuids:
worker_pool.queue_item_async_safe(self.update_q, queuedWatchMetaData.PrioritizedItem(priority=1, item={'uuid': uuid}))
worker_handler.queue_item_async_safe(self.update_q, queuedWatchMetaData.PrioritizedItem(priority=1, item={'uuid': uuid}))
queued_count += 1
else:
skipped_count += 1
@@ -3,9 +3,7 @@ from .processors.exceptions import ProcessorException
import changedetectionio.content_fetchers.exceptions as content_fetchers_exceptions
from changedetectionio.processors.text_json_diff.processor import FilterNotFoundInResponse
from changedetectionio import html_tools
from changedetectionio import worker_pool
from changedetectionio.flask_app import watch_check_update
from changedetectionio.queuedWatchMetaData import PrioritizedItem
import asyncio
import importlib
@@ -48,33 +46,19 @@ async def async_update_worker(worker_id, q, notification_q, app, datastore, exec
jobs_processed = 0
start_time = time.time()
# Log thread name for debugging
import threading
thread_name = threading.current_thread().name
logger.info(f"Starting async worker {worker_id} on thread '{thread_name}' (max_jobs={max_jobs}, max_runtime={max_runtime_seconds}s)")
logger.info(f"Starting async worker {worker_id} (max_jobs={max_jobs}, max_runtime={max_runtime_seconds}s)")
while not app.config.exit.is_set():
update_handler = None
watch = None
try:
# Efficient blocking via run_in_executor (no polling overhead!)
# Worker blocks in threading.Queue.get() which uses Condition.wait()
# Executor must be sized to match worker count (see worker_pool.py: 50 threads default)
# Single timeout (no double-timeout wrapper) = no race condition
queued_item_data = await q.async_get(executor=executor, timeout=1.0)
# CRITICAL: Claim UUID immediately after getting from queue to prevent race condition
# in wait_for_all_checks() which checks qsize() and running_uuids separately
uuid = queued_item_data.item.get('uuid')
if not worker_pool.claim_uuid_for_processing(uuid, worker_id):
# Already being processed - re-queue and continue
logger.trace(f"Worker {worker_id} detected UUID {uuid} already processing during claim - deferring")
await asyncio.sleep(DEFER_SLEEP_TIME_ALREADY_QUEUED)
deferred_priority = max(1000, queued_item_data.priority * 10)
deferred_item = PrioritizedItem(priority=deferred_priority, item=queued_item_data.item)
worker_pool.queue_item_async_safe(q, deferred_item, silent=True)
continue
# Use sync interface via run_in_executor since each worker has its own event loop
loop = asyncio.get_event_loop()
queued_item_data = await asyncio.wait_for(
loop.run_in_executor(executor, q.get, True, 1.0), # block=True, timeout=1.0
timeout=1.5
)
except asyncio.TimeoutError:
# No jobs available - check if we should restart based on time while idle
@@ -83,17 +67,6 @@ async def async_update_worker(worker_id, q, notification_q, app, datastore, exec
logger.info(f"Worker {worker_id} idle and reached max runtime ({runtime:.0f}s), restarting")
return "restart"
continue
except RuntimeError as e:
# Handle executor shutdown gracefully - this is expected during shutdown
if "cannot schedule new futures after shutdown" in str(e):
# Executor shut down - exit gracefully without logging in pytest
if not IN_PYTEST:
logger.debug(f"Worker {worker_id} detected executor shutdown, exiting")
break
# Other RuntimeError - log and continue
logger.error(f"Worker {worker_id} runtime error: {e}")
await asyncio.sleep(0.1)
continue
except Exception as e:
# Handle expected Empty exception from queue timeout
import queue
@@ -115,8 +88,26 @@ async def async_update_worker(worker_id, q, notification_q, app, datastore, exec
await asyncio.sleep(0.1)
continue
# UUID already claimed above immediately after getting from queue
# to prevent race condition with wait_for_all_checks()
uuid = queued_item_data.item.get('uuid')
# RACE CONDITION FIX: Atomically claim this UUID for processing
from changedetectionio import worker_handler
from changedetectionio.queuedWatchMetaData import PrioritizedItem
# Try to claim the UUID atomically - prevents duplicate processing
if not worker_handler.claim_uuid_for_processing(uuid, worker_id):
# Already being processed by another worker
logger.trace(f"Worker {worker_id} detected UUID {uuid} already being processed - deferring")
# Sleep to avoid tight loop and give the other worker time to finish
await asyncio.sleep(DEFER_SLEEP_TIME_ALREADY_QUEUED)
# Re-queue with lower priority so it gets checked again after current processing finishes
deferred_priority = max(1000, queued_item_data.priority * 10)
deferred_item = PrioritizedItem(priority=deferred_priority, item=queued_item_data.item)
worker_handler.queue_item_async_safe(q, deferred_item, silent=True)
logger.debug(f"Worker {worker_id} re-queued UUID {uuid} for subsequent check")
continue
fetch_start_time = round(time.time())
@@ -142,14 +133,11 @@ async def async_update_worker(worker_id, q, notification_q, app, datastore, exec
processor = watch.get('processor', 'text_json_diff')
# Init a new 'difference_detection_processor'
# Use get_processor_module() to support both built-in and plugin processors
from changedetectionio.processors import get_processor_module
processor_module = get_processor_module(processor)
if not processor_module:
error_msg = f"Processor module '{processor}' not found."
logger.error(error_msg)
raise ModuleNotFoundError(error_msg)
try:
processor_module = importlib.import_module(f"changedetectionio.processors.{processor}.processor")
except ModuleNotFoundError as e:
print(f"Processor module '{processor}' not found.")
raise e
update_handler = processor_module.perform_site_check(datastore=datastore,
watch_uuid=uuid)
@@ -236,7 +224,6 @@ async def async_update_worker(worker_id, q, notification_q, app, datastore, exec
except FilterNotFoundInResponse as e:
if not datastore.data['watching'].get(uuid):
continue
logger.debug(f"Received FilterNotFoundInResponse exception for {uuid}")
err_text = "Warning, no filters were found, no change detection ran - Did the page change layout? update your Visual Filter if necessary."
datastore.update_watch(uuid=uuid, update_obj={'last_error': err_text})
@@ -256,19 +243,17 @@ async def async_update_worker(worker_id, q, notification_q, app, datastore, exec
c += 1
# Send notification if we reached the threshold?
threshold = datastore.data['settings']['application'].get('filter_failure_notification_threshold_attempts', 0)
logger.debug(f"FilterNotFoundInResponse - Filter for {uuid} not found, consecutive_filter_failures: {c} of threshold {threshold}")
logger.debug(f"Filter for {uuid} not found, consecutive_filter_failures: {c} of threshold {threshold}")
if c >= threshold:
if not watch.get('notification_muted'):
logger.debug(f"FilterNotFoundInResponse - Sending filter failed notification for {uuid}")
logger.debug(f"Sending filter failed notification for {uuid}")
await send_filter_failure_notification(uuid, notification_q, datastore)
c = 0
logger.debug(f"FilterNotFoundInResponse - Reset filter failure count back to zero")
else:
logger.debug(f"FilterNotFoundInResponse - {c} of threshold {threshold}..")
logger.debug(f"Reset filter failure count back to zero")
datastore.update_watch(uuid=uuid, update_obj={'consecutive_filter_failures': c})
else:
logger.trace(f"FilterNotFoundInResponse - {uuid} - filter_failure_notification_send not enabled, skipping")
logger.trace(f"{uuid} - filter_failure_notification_send not enabled, skipping")
process_changedetection_results = False
@@ -368,10 +353,8 @@ async def async_update_worker(worker_id, q, notification_q, app, datastore, exec
logger.error(f"Exception (BrowserStepsInUnsupportedFetcher) reached processing watch UUID: {uuid}")
except Exception as e:
import traceback
logger.error(f"Worker {worker_id} exception processing watch UUID: {uuid}")
logger.error(str(e))
logger.error(traceback.format_exc())
datastore.update_watch(uuid=uuid, update_obj={'last_error': "Exception: " + str(e)})
process_changedetection_results = False
@@ -390,8 +373,8 @@ async def async_update_worker(worker_id, q, notification_q, app, datastore, exec
if not datastore.data['watching'].get(uuid):
continue
logger.debug(f"Processing watch UUID: {uuid} - xpath_data length returned {len(update_handler.xpath_data) if update_handler and update_handler.xpath_data else 'empty.'}")
if update_handler and process_changedetection_results:
logger.debug(f"Processing watch UUID: {uuid} - xpath_data length returned {len(update_handler.xpath_data) if update_handler.xpath_data else 'empty.'}")
if process_changedetection_results:
try:
datastore.update_watch(uuid=uuid, update_obj=update_obj)
@@ -441,44 +424,44 @@ async def async_update_worker(worker_id, q, notification_q, app, datastore, exec
# Always record attempt count
count = watch.get('check_count', 0) + 1
if update_handler: # Could be none or empty if the processor was not found
# Always record page title (used in notifications, and can change even when the content is the same)
if update_obj.get('content-type') and 'html' in update_obj.get('content-type'):
try:
page_title = html_tools.extract_title(data=update_handler.fetcher.content)
if page_title:
page_title = page_title.strip()[:2000]
logger.debug(f"UUID: {uuid} Page <title> is '{page_title}'")
datastore.update_watch(uuid=uuid, update_obj={'page_title': page_title})
except Exception as e:
logger.warning(f"UUID: {uuid} Exception when extracting <title> - {str(e)}")
# Record server header
# Always record page title (used in notifications, and can change even when the content is the same)
if update_obj.get('content-type') and 'html' in update_obj.get('content-type'):
try:
server_header = update_handler.fetcher.headers.get('server', '').strip().lower()[:255]
datastore.update_watch(uuid=uuid, update_obj={'remote_server_reply': server_header})
page_title = html_tools.extract_title(data=update_handler.fetcher.content)
if page_title:
page_title = page_title.strip()[:2000]
logger.debug(f"UUID: {uuid} Page <title> is '{page_title}'")
datastore.update_watch(uuid=uuid, update_obj={'page_title': page_title})
except Exception as e:
pass
logger.warning(f"UUID: {uuid} Exception when extracting <title> - {str(e)}")
# Store favicon if necessary
if update_handler.fetcher.favicon_blob and update_handler.fetcher.favicon_blob.get('base64'):
watch.bump_favicon(url=update_handler.fetcher.favicon_blob.get('url'),
favicon_base_64=update_handler.fetcher.favicon_blob.get('base64')
)
# Record server header
try:
server_header = update_handler.fetcher.headers.get('server', '').strip().lower()[:255]
datastore.update_watch(uuid=uuid, update_obj={'remote_server_reply': server_header})
except Exception as e:
pass
datastore.update_watch(uuid=uuid, update_obj={'fetch_time': round(time.time() - fetch_start_time, 3),
'check_count': count})
# Store favicon if necessary
if update_handler.fetcher.favicon_blob and update_handler.fetcher.favicon_blob.get('base64'):
watch.bump_favicon(url=update_handler.fetcher.favicon_blob.get('url'),
favicon_base_64=update_handler.fetcher.favicon_blob.get('base64')
)
# NOW clear fetcher content - after all processing is complete
# This is the last point where we need the fetcher data
if update_handler and hasattr(update_handler, 'fetcher') and update_handler.fetcher:
update_handler.fetcher.clear_content()
logger.debug(f"Cleared fetcher content for UUID {uuid}")
datastore.update_watch(uuid=uuid, update_obj={'fetch_time': round(time.time() - fetch_start_time, 3),
'check_count': count})
# Explicitly delete update_handler to free all references
if update_handler:
del update_handler
update_handler = None
# NOW clear fetcher content - after all processing is complete
# This is the last point where we need the fetcher data
if update_handler and hasattr(update_handler, 'fetcher') and update_handler.fetcher:
update_handler.fetcher.clear_content()
logger.debug(f"Cleared fetcher content for UUID {uuid}")
# Explicitly delete update_handler to free all references
if update_handler:
del update_handler
update_handler = None
# Force aggressive memory cleanup after clearing
import gc
@@ -490,9 +473,6 @@ async def async_update_worker(worker_id, q, notification_q, app, datastore, exec
pass
except Exception as e:
import traceback
logger.error(traceback.format_exc())
logger.error(f"Worker {worker_id} unexpected error processing {uuid}: {e}")
logger.error(f"Worker {worker_id} traceback:", exc_info=True)
@@ -510,7 +490,7 @@ async def async_update_worker(worker_id, q, notification_q, app, datastore, exec
logger.error(f"Exception while cleaning/quit after calling browser: {e}")
try:
# Release UUID from processing (thread-safe)
worker_pool.release_uuid_from_processing(uuid, worker_id=worker_id)
worker_handler.release_uuid_from_processing(uuid, worker_id=worker_id)
# Send completion signal
if watch:
@@ -27,23 +27,11 @@ def create_backup(datastore_path, watches: dict):
compression=zipfile.ZIP_DEFLATED,
compresslevel=8) as zipObj:
# Add the settings file (supports both formats)
# New format: changedetection.json
changedetection_json = os.path.join(datastore_path, "changedetection.json")
if os.path.isfile(changedetection_json):
zipObj.write(changedetection_json, arcname="changedetection.json")
logger.debug("Added changedetection.json to backup")
# Add the index
zipObj.write(os.path.join(datastore_path, "url-watches.json"), arcname="url-watches.json")
# Legacy format: url-watches.json (for backward compatibility)
url_watches_json = os.path.join(datastore_path, "url-watches.json")
if os.path.isfile(url_watches_json):
zipObj.write(url_watches_json, arcname="url-watches.json")
logger.debug("Added url-watches.json to backup")
# Add the flask app secret (if it exists)
secret_file = os.path.join(datastore_path, "secret.txt")
if os.path.isfile(secret_file):
zipObj.write(secret_file, arcname="secret.txt")
# Add the flask app secret
zipObj.write(os.path.join(datastore_path, "secret.txt"), arcname="secret.txt")
# Add any data in the watch data directory.
for uuid, w in watches.items():
@@ -102,8 +90,8 @@ def construct_blueprint(datastore: ChangeDetectionStore):
flash(gettext("Maximum number of backups reached, please remove some"), "error")
return redirect(url_for('backups.index'))
# Be sure we're written fresh - force immediate save using abstract method
datastore.force_save_all()
# Be sure we're written fresh
datastore.sync_to_json()
zip_thread = threading.Thread(
target=create_backup,
args=(datastore.datastore_path, datastore.data.get("watching")),
@@ -14,7 +14,7 @@ def construct_blueprint(datastore: ChangeDetectionStore, update_q, queuedWatchMe
from changedetectionio import forms
#
if request.method == 'POST':
# from changedetectionio import worker_pool
# from changedetectionio import worker_handler
from changedetectionio.blueprint.imports.importer import (
import_url_list,
@@ -26,13 +26,12 @@ def construct_blueprint(datastore: ChangeDetectionStore, update_q, queuedWatchMe
# URL List import
if request.values.get('urls') and len(request.values.get('urls').strip()):
# Import and push into the queue for immediate update check
from changedetectionio import processors
importer_handler = import_url_list()
importer_handler.run(data=request.values.get('urls'), flash=flash, datastore=datastore, processor=request.values.get('processor', processors.get_default_processor()))
importer_handler.run(data=request.values.get('urls'), flash=flash, datastore=datastore, processor=request.values.get('processor', 'text_json_diff'))
logger.debug(f"Imported {len(importer_handler.new_uuids)} new UUIDs")
# Don't add to the queue here: the scheduler can see that they haven't been checked and will add them to the queue
# for uuid in importer_handler.new_uuids:
# worker_pool.queue_item_async_safe(update_q, queuedWatchMetaData.PrioritizedItem(priority=1, item={'uuid': uuid}))
# worker_handler.queue_item_async_safe(update_q, queuedWatchMetaData.PrioritizedItem(priority=1, item={'uuid': uuid}))
if len(importer_handler.remaining_data) == 0:
return redirect(url_for('watchlist.index'))
@@ -46,7 +45,7 @@ def construct_blueprint(datastore: ChangeDetectionStore, update_q, queuedWatchMe
d_importer.run(data=request.values.get('distill-io'), flash=flash, datastore=datastore)
# Don't add to the queue here: the scheduler can see that they haven't been checked and will add them to the queue
# for uuid in importer_handler.new_uuids:
# worker_pool.queue_item_async_safe(update_q, queuedWatchMetaData.PrioritizedItem(priority=1, item={'uuid': uuid}))
# worker_handler.queue_item_async_safe(update_q, queuedWatchMetaData.PrioritizedItem(priority=1, item={'uuid': uuid}))
# XLSX importer
@@ -71,7 +70,7 @@ def construct_blueprint(datastore: ChangeDetectionStore, update_q, queuedWatchMe
# Don't add to the queue here: the scheduler can see that they haven't been checked and will add them to the queue
# for uuid in importer_handler.new_uuids:
# worker_pool.queue_item_async_safe(update_q, queuedWatchMetaData.PrioritizedItem(priority=1, item={'uuid': uuid}))
# worker_handler.queue_item_async_safe(update_q, queuedWatchMetaData.PrioritizedItem(priority=1, item={'uuid': uuid}))
# Could be some remaining, or we could be on GET
@@ -62,7 +62,7 @@ class import_url_list(Importer):
extras = None
if processor:
extras = {'processor': processor}
new_uuid = datastore.add_watch(url=url.strip(), tag=tags, save_immediately=False, extras=extras)
new_uuid = datastore.add_watch(url=url.strip(), tag=tags, write_to_disk_now=False, extras=extras)
if new_uuid:
# Straight into the queue.
@@ -129,7 +129,7 @@ class import_distill_io_json(Importer):
new_uuid = datastore.add_watch(url=d['uri'].strip(),
tag=",".join(d.get('tags', [])),
extras=extras,
save_immediately=False)
write_to_disk_now=False)
if new_uuid:
# Straight into the queue.
@@ -204,7 +204,7 @@ class import_xlsx_wachete(Importer):
new_uuid = datastore.add_watch(url=data['url'].strip(),
extras=extras,
tag=data.get('folder'),
save_immediately=False)
write_to_disk_now=False)
if new_uuid:
# Straight into the queue.
self.new_uuids.append(new_uuid)
@@ -287,7 +287,7 @@ class import_xlsx_custom(Importer):
new_uuid = datastore.add_watch(url=url,
extras=extras,
tag=tags,
save_immediately=False)
write_to_disk_now=False)
if new_uuid:
# Straight into the queue.
self.new_uuids.append(new_uuid)
@@ -4,7 +4,7 @@ from flask import Blueprint, flash, redirect, url_for
from flask_login import login_required
from changedetectionio.store import ChangeDetectionStore
from changedetectionio import queuedWatchMetaData
from changedetectionio import worker_pool
from changedetectionio import worker_handler
from queue import PriorityQueue
PRICE_DATA_TRACK_ACCEPT = 'accepted'
@@ -20,7 +20,7 @@ def construct_blueprint(datastore: ChangeDetectionStore, update_q: PriorityQueue
datastore.data['watching'][uuid]['track_ldjson_price_data'] = PRICE_DATA_TRACK_ACCEPT
datastore.data['watching'][uuid]['processor'] = 'restock_diff'
datastore.data['watching'][uuid].clear_watch()
worker_pool.queue_item_async_safe(update_q, queuedWatchMetaData.PrioritizedItem(priority=1, item={'uuid': uuid}))
worker_handler.queue_item_async_safe(update_q, queuedWatchMetaData.PrioritizedItem(priority=1, item={'uuid': uuid}))
return redirect(url_for("watchlist.index"))
@login_required
@@ -37,8 +37,6 @@ def construct_single_watch_routes(rss_blueprint, datastore):
rss_content_format = datastore.data['settings']['application'].get('rss_content_format')
if uuid == 'first':
uuid = list(datastore.data['watching'].keys()).pop()
# Get the watch by UUID
watch = datastore.data['watching'].get(uuid)
if not watch:
@@ -83,7 +83,7 @@ def construct_blueprint(datastore: ChangeDetectionStore):
# Adjust worker count if it changed
if new_worker_count != old_worker_count:
from changedetectionio import worker_pool
from changedetectionio import worker_handler
from changedetectionio.flask_app import update_q, notification_q, app, datastore as ds
# Check CPU core availability and warn if worker count is high
@@ -92,7 +92,7 @@ def construct_blueprint(datastore: ChangeDetectionStore):
flash(gettext("Warning: Worker count ({}) is close to or exceeds available CPU cores ({})").format(
new_worker_count, cpu_count), 'warning')
result = worker_pool.adjust_async_worker_count(
result = worker_handler.adjust_async_worker_count(
new_count=new_worker_count,
update_q=update_q,
notification_q=notification_q,
@@ -80,16 +80,6 @@
{{ render_checkbox_field(form.application.form.empty_pages_are_a_change) }}
<span class="pure-form-message-inline">{{ _('When a request returns no content, or the HTML does not contain any text, is this considered a change?') }}</span>
</div>
{% if form.requests.proxy %}
<div>
<br>
<div class="inline-radio">
{{ render_field(form.requests.form.proxy, class="fetch-backend-proxy") }}
<span class="pure-form-message-inline">{{ _('Choose a default proxy for all watches') }}</span>
</div>
</div>
{% endif %}
</fieldset>
</div>
@@ -350,6 +340,15 @@ nav
{{ render_fieldlist_with_inline_errors(form.requests.form.extra_proxies) }}
<span class="pure-form-message-inline">{{ _('"Name" will be used for selecting the proxy in the Watch Edit settings') }}</span><br>
<span class="pure-form-message-inline">{{ _('SOCKS5 proxies with authentication are only supported with \'plain requests\' fetcher, for other fetchers you should whitelist the IP access instead') }}</span>
{% if form.requests.proxy %}
<div>
<br>
<div class="inline-radio">
{{ render_field(form.requests.form.proxy, class="fetch-backend-proxy") }}
<span class="pure-form-message-inline">{{ _('Choose a default proxy for all watches') }}</span>
</div>
</div>
{% endif %}
</div>
<div class="pure-control-group" id="extra-browsers-setting">
<p>
+18 -20
View File
@@ -10,7 +10,7 @@ from changedetectionio.blueprint.ui.notification import construct_blueprint as c
from changedetectionio.blueprint.ui.views import construct_blueprint as construct_views_blueprint
from changedetectionio.blueprint.ui import diff, preview
def _handle_operations(op, uuids, datastore, worker_pool, update_q, queuedWatchMetaData, watch_check_update, extra_data=None, emit_flash=True):
def _handle_operations(op, uuids, datastore, worker_handler, update_q, queuedWatchMetaData, watch_check_update, extra_data=None, emit_flash=True):
from flask import request, flash
if op == 'delete':
@@ -24,7 +24,6 @@ def _handle_operations(op, uuids, datastore, worker_pool, update_q, queuedWatchM
for uuid in uuids:
if datastore.data['watching'].get(uuid):
datastore.data['watching'][uuid]['paused'] = True
datastore.mark_watch_dirty(uuid)
if emit_flash:
flash(gettext("{} watches paused").format(len(uuids)))
@@ -32,7 +31,6 @@ def _handle_operations(op, uuids, datastore, worker_pool, update_q, queuedWatchM
for uuid in uuids:
if datastore.data['watching'].get(uuid):
datastore.data['watching'][uuid.strip()]['paused'] = False
datastore.mark_watch_dirty(uuid)
if emit_flash:
flash(gettext("{} watches unpaused").format(len(uuids)))
@@ -47,7 +45,6 @@ def _handle_operations(op, uuids, datastore, worker_pool, update_q, queuedWatchM
for uuid in uuids:
if datastore.data['watching'].get(uuid):
datastore.data['watching'][uuid]['notification_muted'] = True
datastore.mark_watch_dirty(uuid)
if emit_flash:
flash(gettext("{} watches muted").format(len(uuids)))
@@ -55,7 +52,6 @@ def _handle_operations(op, uuids, datastore, worker_pool, update_q, queuedWatchM
for uuid in uuids:
if datastore.data['watching'].get(uuid):
datastore.data['watching'][uuid]['notification_muted'] = False
datastore.mark_watch_dirty(uuid)
if emit_flash:
flash(gettext("{} watches un-muted").format(len(uuids)))
@@ -63,7 +59,7 @@ def _handle_operations(op, uuids, datastore, worker_pool, update_q, queuedWatchM
for uuid in uuids:
if datastore.data['watching'].get(uuid):
# Recheck and require a full reprocessing
worker_pool.queue_item_async_safe(update_q, queuedWatchMetaData.PrioritizedItem(priority=1, item={'uuid': uuid}))
worker_handler.queue_item_async_safe(update_q, queuedWatchMetaData.PrioritizedItem(priority=1, item={'uuid': uuid}))
if emit_flash:
flash(gettext("{} watches queued for rechecking").format(len(uuids)))
@@ -71,7 +67,6 @@ def _handle_operations(op, uuids, datastore, worker_pool, update_q, queuedWatchM
for uuid in uuids:
if datastore.data['watching'].get(uuid):
datastore.data['watching'][uuid]["last_error"] = False
datastore.mark_watch_dirty(uuid)
if emit_flash:
flash(gettext("{} watches errors cleared").format(len(uuids)))
@@ -114,7 +109,7 @@ def _handle_operations(op, uuids, datastore, worker_pool, update_q, queuedWatchM
for uuid in uuids:
watch_check_update.send(watch_uuid=uuid)
def construct_blueprint(datastore: ChangeDetectionStore, update_q, worker_pool, queuedWatchMetaData, watch_check_update):
def construct_blueprint(datastore: ChangeDetectionStore, update_q, worker_handler, queuedWatchMetaData, watch_check_update):
ui_blueprint = Blueprint('ui', __name__, template_folder="templates")
# Register the edit blueprint
@@ -222,14 +217,14 @@ def construct_blueprint(datastore: ChangeDetectionStore, update_q, worker_pool,
@login_optionally_required
def form_delete():
uuid = request.args.get('uuid')
# More for testing, possible to return the first/only
if uuid == 'first':
uuid = list(datastore.data['watching'].keys()).pop()
if uuid != 'all' and not uuid in datastore.data['watching'].keys():
flash(gettext('The watch by UUID {} does not exist.').format(uuid), 'error')
return redirect(url_for('watchlist.index'))
# More for testing, possible to return the first/only
if uuid == 'first':
uuid = list(datastore.data['watching'].keys()).pop()
datastore.delete(uuid)
flash(gettext('Deleted.'))
@@ -239,14 +234,14 @@ def construct_blueprint(datastore: ChangeDetectionStore, update_q, worker_pool,
@login_optionally_required
def form_clone():
uuid = request.args.get('uuid')
# More for testing, possible to return the first/only
if uuid == 'first':
uuid = list(datastore.data['watching'].keys()).pop()
new_uuid = datastore.clone(uuid)
if not datastore.data['watching'].get(uuid).get('paused'):
worker_pool.queue_item_async_safe(update_q, queuedWatchMetaData.PrioritizedItem(priority=5, item={'uuid': new_uuid}))
worker_handler.queue_item_async_safe(update_q, queuedWatchMetaData.PrioritizedItem(priority=5, item={'uuid': new_uuid}))
flash(gettext('Cloned, you are editing the new watch.'))
@@ -262,10 +257,10 @@ def construct_blueprint(datastore: ChangeDetectionStore, update_q, worker_pool,
if uuid:
# Single watch - check if already queued or running
if worker_pool.is_watch_running(uuid) or uuid in update_q.get_queued_uuids():
if worker_handler.is_watch_running(uuid) or uuid in update_q.get_queued_uuids():
flash(gettext("Watch is already queued or being checked."))
else:
worker_pool.queue_item_async_safe(update_q, queuedWatchMetaData.PrioritizedItem(priority=1, item={'uuid': uuid}))
worker_handler.queue_item_async_safe(update_q, queuedWatchMetaData.PrioritizedItem(priority=1, item={'uuid': uuid}))
flash(gettext("Queued 1 watch for rechecking."))
else:
# Multiple watches - first count how many need to be queued
@@ -284,7 +279,7 @@ def construct_blueprint(datastore: ChangeDetectionStore, update_q, worker_pool,
if len(watches_to_queue) < 20:
# Get already queued/running UUIDs once (efficient)
queued_uuids = set(update_q.get_queued_uuids())
running_uuids = set(worker_pool.get_running_uuids())
running_uuids = set(worker_handler.get_running_uuids())
# Filter out watches that are already queued or running
watches_to_queue_filtered = []
@@ -294,7 +289,7 @@ def construct_blueprint(datastore: ChangeDetectionStore, update_q, worker_pool,
# Queue only the filtered watches
for watch_uuid in watches_to_queue_filtered:
worker_pool.queue_item_async_safe(update_q, queuedWatchMetaData.PrioritizedItem(priority=1, item={'uuid': watch_uuid}))
worker_handler.queue_item_async_safe(update_q, queuedWatchMetaData.PrioritizedItem(priority=1, item={'uuid': watch_uuid}))
# Provide feedback about skipped watches
skipped_count = len(watches_to_queue) - len(watches_to_queue_filtered)
@@ -310,7 +305,7 @@ def construct_blueprint(datastore: ChangeDetectionStore, update_q, worker_pool,
# 20+ watches - queue in background thread to avoid blocking HTTP response
# Capture queued/running state before background thread
queued_uuids = set(update_q.get_queued_uuids())
running_uuids = set(worker_pool.get_running_uuids())
running_uuids = set(worker_handler.get_running_uuids())
def queue_watches_background():
"""Background thread to queue watches - discarded after completion."""
@@ -320,7 +315,7 @@ def construct_blueprint(datastore: ChangeDetectionStore, update_q, worker_pool,
for watch_uuid in watches_to_queue:
# Check if already queued or running (state captured at start)
if watch_uuid not in queued_uuids and watch_uuid not in running_uuids:
worker_pool.queue_item_async_safe(update_q, queuedWatchMetaData.PrioritizedItem(priority=1, item={'uuid': watch_uuid}))
worker_handler.queue_item_async_safe(update_q, queuedWatchMetaData.PrioritizedItem(priority=1, item={'uuid': watch_uuid}))
queued_count += 1
else:
skipped_count += 1
@@ -349,7 +344,7 @@ def construct_blueprint(datastore: ChangeDetectionStore, update_q, worker_pool,
extra_data=extra_data,
queuedWatchMetaData=queuedWatchMetaData,
uuids=uuids,
worker_pool=worker_pool,
worker_handler=worker_handler,
update_q=update_q,
watch_check_update=watch_check_update,
op=op,
@@ -367,6 +362,9 @@ def construct_blueprint(datastore: ChangeDetectionStore, update_q, worker_pool,
import json
from copy import deepcopy
# more for testing
if uuid == 'first':
uuid = list(datastore.data['watching'].keys()).pop()
# Copy it into memory and trim off what we don't need (history)
watch = deepcopy(datastore.data['watching'].get(uuid))
+82 -70
View File
@@ -83,6 +83,7 @@ def construct_blueprint(datastore: ChangeDetectionStore):
If a processor doesn't have a difference module, falls back to text_json_diff.
"""
# More for testing, possible to return the first/only
if uuid == 'first':
uuid = list(datastore.data['watching'].keys()).pop()
@@ -100,21 +101,23 @@ def construct_blueprint(datastore: ChangeDetectionStore):
# Get the processor type for this watch
processor_name = watch.get('processor', 'text_json_diff')
# Try to get the processor's difference module (works for both built-in and plugin processors)
from changedetectionio.processors import get_processor_submodule
processor_module = get_processor_submodule(processor_name, 'difference')
try:
# Try to import the processor's difference module
processor_module = importlib.import_module(f'changedetectionio.processors.{processor_name}.difference')
# Call the processor's render() function
if processor_module and hasattr(processor_module, 'render'):
return processor_module.render(
watch=watch,
datastore=datastore,
request=request,
url_for=url_for,
render_template=render_template,
flash=flash,
redirect=redirect
)
# Call the processor's render() function
if hasattr(processor_module, 'render'):
return processor_module.render(
watch=watch,
datastore=datastore,
request=request,
url_for=url_for,
render_template=render_template,
flash=flash,
redirect=redirect
)
except (ImportError, ModuleNotFoundError) as e:
logger.warning(f"Processor {processor_name} does not have a difference module, falling back to text_json_diff: {e}")
# Fallback: if processor doesn't have difference module, use text_json_diff as default
from changedetectionio.processors.text_json_diff.difference import render as default_render
@@ -141,10 +144,10 @@ def construct_blueprint(datastore: ChangeDetectionStore):
Each processor implements processors/{type}/extract.py::render_form()
If a processor doesn't have an extract module, falls back to the base processors.extract module.
"""
# More for testing, possible to return the first/only
if uuid == 'first':
uuid = list(datastore.data['watching'].keys()).pop()
try:
watch = datastore.data['watching'][uuid]
except KeyError:
@@ -154,21 +157,23 @@ def construct_blueprint(datastore: ChangeDetectionStore):
# Get the processor type for this watch
processor_name = watch.get('processor', 'text_json_diff')
# Try to get the processor's extract module (works for both built-in and plugin processors)
from changedetectionio.processors import get_processor_submodule
processor_module = get_processor_submodule(processor_name, 'extract')
try:
# Try to import the processor's extract module
processor_module = importlib.import_module(f'changedetectionio.processors.{processor_name}.extract')
# Call the processor's render_form() function
if processor_module and hasattr(processor_module, 'render_form'):
return processor_module.render_form(
watch=watch,
datastore=datastore,
request=request,
url_for=url_for,
render_template=render_template,
flash=flash,
redirect=redirect
)
# Call the processor's render_form() function
if hasattr(processor_module, 'render_form'):
return processor_module.render_form(
watch=watch,
datastore=datastore,
request=request,
url_for=url_for,
render_template=render_template,
flash=flash,
redirect=redirect
)
except (ImportError, ModuleNotFoundError) as e:
logger.warning(f"Processor {processor_name} does not have an extract module, falling back to base extractor: {e}")
# Fallback: if processor doesn't have extract module, use base processors.extract as default
from changedetectionio.processors.extract import render_form as default_render_form
@@ -195,7 +200,7 @@ def construct_blueprint(datastore: ChangeDetectionStore):
Each processor implements processors/{type}/extract.py::process_extraction()
If a processor doesn't have an extract module, falls back to the base processors.extract module.
"""
# More for testing, possible to return the first/only
if uuid == 'first':
uuid = list(datastore.data['watching'].keys()).pop()
@@ -208,22 +213,24 @@ def construct_blueprint(datastore: ChangeDetectionStore):
# Get the processor type for this watch
processor_name = watch.get('processor', 'text_json_diff')
# Try to get the processor's extract module (works for both built-in and plugin processors)
from changedetectionio.processors import get_processor_submodule
processor_module = get_processor_submodule(processor_name, 'extract')
try:
# Try to import the processor's extract module
processor_module = importlib.import_module(f'changedetectionio.processors.{processor_name}.extract')
# Call the processor's process_extraction() function
if processor_module and hasattr(processor_module, 'process_extraction'):
return processor_module.process_extraction(
watch=watch,
datastore=datastore,
request=request,
url_for=url_for,
make_response=make_response,
send_from_directory=send_from_directory,
flash=flash,
redirect=redirect
)
# Call the processor's process_extraction() function
if hasattr(processor_module, 'process_extraction'):
return processor_module.process_extraction(
watch=watch,
datastore=datastore,
request=request,
url_for=url_for,
make_response=make_response,
send_from_directory=send_from_directory,
flash=flash,
redirect=redirect
)
except (ImportError, ModuleNotFoundError) as e:
logger.warning(f"Processor {processor_name} does not have an extract module, falling back to base extractor: {e}")
# Fallback: if processor doesn't have extract module, use base processors.extract as default
from changedetectionio.processors.extract import process_extraction as default_process_extraction
@@ -260,7 +267,7 @@ def construct_blueprint(datastore: ChangeDetectionStore):
- /diff/{uuid}/processor-asset/after
- /diff/{uuid}/processor-asset/rendered_diff
"""
# More for testing, possible to return the first/only
if uuid == 'first':
uuid = list(datastore.data['watching'].keys()).pop()
@@ -273,33 +280,38 @@ def construct_blueprint(datastore: ChangeDetectionStore):
# Get the processor type for this watch
processor_name = watch.get('processor', 'text_json_diff')
# Try to get the processor's difference module (works for both built-in and plugin processors)
from changedetectionio.processors import get_processor_submodule
processor_module = get_processor_submodule(processor_name, 'difference')
try:
# Try to import the processor's difference module
processor_module = importlib.import_module(f'changedetectionio.processors.{processor_name}.difference')
# Call the processor's get_asset() function
if processor_module and hasattr(processor_module, 'get_asset'):
result = processor_module.get_asset(
asset_name=asset_name,
watch=watch,
datastore=datastore,
request=request
)
# Call the processor's get_asset() function
if hasattr(processor_module, 'get_asset'):
result = processor_module.get_asset(
asset_name=asset_name,
watch=watch,
datastore=datastore,
request=request
)
if result is None:
if result is None:
from flask import abort
abort(404, description=f"Asset '{asset_name}' not found")
binary_data, content_type, cache_control = result
response = make_response(binary_data)
response.headers['Content-Type'] = content_type
if cache_control:
response.headers['Cache-Control'] = cache_control
return response
else:
logger.warning(f"Processor {processor_name} does not implement get_asset()")
from flask import abort
abort(404, description=f"Asset '{asset_name}' not found")
abort(404, description=f"Processor '{processor_name}' does not support assets")
binary_data, content_type, cache_control = result
response = make_response(binary_data)
response.headers['Content-Type'] = content_type
if cache_control:
response.headers['Cache-Control'] = cache_control
return response
else:
logger.warning(f"Processor {processor_name} does not implement get_asset()")
except (ImportError, ModuleNotFoundError) as e:
logger.warning(f"Processor {processor_name} does not have a difference module: {e}")
from flask import abort
abort(404, description=f"Processor '{processor_name}' does not support assets")
abort(404, description=f"Processor '{processor_name}' not found")
return diff_blueprint
+60 -28
View File
@@ -9,7 +9,7 @@ from jinja2 import Environment, FileSystemLoader
from changedetectionio.store import ChangeDetectionStore
from changedetectionio.auth_decorator import login_optionally_required
from changedetectionio.time_handler import is_within_schedule
from changedetectionio import worker_pool
from changedetectionio import worker_handler
def construct_blueprint(datastore: ChangeDetectionStore, update_q, queuedWatchMetaData):
edit_blueprint = Blueprint('ui_edit', __name__, template_folder="../ui/templates")
@@ -30,13 +30,14 @@ def construct_blueprint(datastore: ChangeDetectionStore, update_q, queuedWatchMe
from changedetectionio import processors
import importlib
if uuid == 'first':
uuid = list(datastore.data['watching'].keys()).pop()
# More for testing, possible to return the first/only
if not datastore.data['watching'].keys():
flash(gettext("No watches to edit"), "error")
return redirect(url_for('watchlist.index'))
if uuid == 'first':
uuid = list(datastore.data['watching'].keys()).pop()
if not uuid in datastore.data['watching']:
flash(gettext("No watch with the UUID {} found.").format(uuid), "error")
return redirect(url_for('watchlist.index'))
@@ -71,13 +72,8 @@ def construct_blueprint(datastore: ChangeDetectionStore, update_q, queuedWatchMe
processor_name = datastore.data['watching'][uuid].get('processor', '')
processor_classes = next((tpl for tpl in processors.find_processors() if tpl[1] == processor_name), None)
if not processor_classes:
flash(gettext("Could not load '{}' processor, processor plugin might be missing. Please select a different processor.").format(processor_name), 'error')
# Fall back to default processor so user can still edit and change processor
processor_classes = next((tpl for tpl in processors.find_processors() if tpl[1] == 'text_json_diff'), None)
if not processor_classes:
# If even text_json_diff is missing, something is very wrong
flash(gettext("Could not load '{}' processor, processor plugin might be missing.").format(processor_name), 'error')
return redirect(url_for('watchlist.index'))
flash(gettext("Cannot load the edit form for processor/plugin '{}', plugin missing?").format(processor_classes[1]), 'error')
return redirect(url_for('watchlist.index'))
parent_module = processors.get_parent_module(processor_classes[0])
@@ -154,10 +150,58 @@ def construct_blueprint(datastore: ChangeDetectionStore, update_q, queuedWatchMe
extra_update_obj['time_between_check'] = form.time_between_check.data
# Handle processor-config-* fields separately (save to JSON, not datastore)
# IMPORTANT: These must NOT be saved to url-watches.json, only to the processor-specific JSON file
processor_config_data = processors.extract_processor_config_from_form_data(form.data)
processors.save_processor_config(datastore, uuid, processor_config_data)
# Handle processor-config-* fields separately (save to JSON, not datastore)
processor_config_data = {}
fields_to_remove = []
for field_name, field_value in form.data.items():
if field_name.startswith('processor_config_'):
config_key = field_name.replace('processor_config_', '')
if field_value: # Only save non-empty values
processor_config_data[config_key] = field_value
fields_to_remove.append(field_name)
# Save processor config to JSON file if any config data exists
if processor_config_data:
try:
processor_name = form.data.get('processor')
# Create a processor instance to access config methods
processor_instance = processors.difference_detection_processor(datastore, uuid)
# Use processor name as filename so each processor keeps its own config
config_filename = f'{processor_name}.json'
processor_instance.update_extra_watch_config(config_filename, processor_config_data)
logger.debug(f"Saved processor config to {config_filename}: {processor_config_data}")
# Call optional edit_hook if processor has one
try:
# Try to import the edit_hook module from the processor package
import importlib
edit_hook_module_name = f'changedetectionio.processors.{processor_name}.edit_hook'
try:
edit_hook = importlib.import_module(edit_hook_module_name)
logger.debug(f"Found edit_hook module for {processor_name}")
if hasattr(edit_hook, 'on_config_save'):
logger.info(f"Calling edit_hook.on_config_save for {processor_name}")
watch_obj = datastore.data['watching'][uuid]
# Call hook and get updated config
updated_config = edit_hook.on_config_save(watch_obj, processor_config_data, datastore)
# Save updated config back to file
processor_instance.update_extra_watch_config(config_filename, updated_config)
logger.info(f"Edit hook updated config: {updated_config}")
else:
logger.debug(f"Edit hook module found but no on_config_save function")
except ModuleNotFoundError:
logger.debug(f"No edit_hook module for processor {processor_name} (this is normal)")
except Exception as hook_error:
logger.error(f"Edit hook error (non-fatal): {hook_error}", exc_info=True)
except Exception as e:
logger.error(f"Failed to save processor config: {e}")
# Remove processor-config-* fields from form.data before updating datastore
for field_name in fields_to_remove:
form.data.pop(field_name, None)
# Ignore text
form_ignore_text = form.ignore_text.data
@@ -239,7 +283,7 @@ def construct_blueprint(datastore: ChangeDetectionStore, update_q, queuedWatchMe
#############################
if not datastore.data['watching'][uuid].get('paused') and is_in_schedule:
# Queue the watch for immediate recheck, with a higher priority
worker_pool.queue_item_async_safe(update_q, queuedWatchMetaData.PrioritizedItem(priority=1, item={'uuid': uuid}))
worker_handler.queue_item_async_safe(update_q, queuedWatchMetaData.PrioritizedItem(priority=1, item={'uuid': uuid}))
# Diff page [edit] link should go back to diff page
if request.args.get("next") and request.args.get("next") == 'diff':
@@ -267,17 +311,10 @@ def construct_blueprint(datastore: ChangeDetectionStore, update_q, queuedWatchMe
# Get fetcher capabilities instead of hardcoded logic
capabilities = get_fetcher_capabilities(watch, datastore)
# Add processor capabilities from module
capabilities['supports_visual_selector'] = getattr(parent_module, 'supports_visual_selector', False)
capabilities['supports_text_filters_and_triggers'] = getattr(parent_module, 'supports_text_filters_and_triggers', False)
capabilities['supports_text_filters_and_triggers_elements'] = getattr(parent_module, 'supports_text_filters_and_triggers_elements', False)
capabilities['supports_request_type'] = getattr(parent_module, 'supports_request_type', False)
app_rss_token = datastore.data['settings']['application'].get('rss_access_token'),
c = [f"processor-{watch.get('processor')}"]
if worker_pool.is_watch_running(uuid):
if worker_handler.is_watch_running(uuid):
c.append('checking-now')
template_args = {
@@ -334,8 +371,6 @@ def construct_blueprint(datastore: ChangeDetectionStore, update_q, queuedWatchMe
from flask import send_file
import brotli
if uuid == 'first':
uuid = list(datastore.data['watching'].keys()).pop()
watch = datastore.data['watching'].get(uuid)
if watch and watch.history.keys() and os.path.isdir(watch.watch_data_dir):
latest_filename = list(watch.history.keys())[-1]
@@ -360,9 +395,6 @@ def construct_blueprint(datastore: ChangeDetectionStore, update_q, queuedWatchMe
def watch_get_preview_rendered(uuid):
'''For when viewing the "preview" of the rendered text from inside of Edit'''
from flask import jsonify
if uuid == 'first':
uuid = list(datastore.data['watching'].keys()).pop()
from changedetectionio.processors.text_json_diff import prepare_filter_prevew
result = prepare_filter_prevew(watch_uuid=uuid, form_data=request.form, datastore=datastore)
return jsonify(result)
+50 -38
View File
@@ -26,9 +26,10 @@ def construct_blueprint(datastore: ChangeDetectionStore):
Each processor implements processors/{type}/preview.py::render()
If a processor doesn't have a preview module, falls back to default text preview.
"""
# More for testing, possible to return the first/only
if uuid == 'first':
uuid = list(datastore.data['watching'].keys()).pop()
try:
watch = datastore.data['watching'][uuid]
except KeyError:
@@ -38,21 +39,24 @@ def construct_blueprint(datastore: ChangeDetectionStore):
# Get the processor type for this watch
processor_name = watch.get('processor', 'text_json_diff')
# Try to get the processor's preview module (works for both built-in and plugin processors)
from changedetectionio.processors import get_processor_submodule
processor_module = get_processor_submodule(processor_name, 'preview')
try:
# Try to import the processor's preview module
import importlib
processor_module = importlib.import_module(f'changedetectionio.processors.{processor_name}.preview')
# Call the processor's render() function
if processor_module and hasattr(processor_module, 'render'):
return processor_module.render(
watch=watch,
datastore=datastore,
request=request,
url_for=url_for,
render_template=render_template,
flash=flash,
redirect=redirect
)
# Call the processor's render() function
if hasattr(processor_module, 'render'):
return processor_module.render(
watch=watch,
datastore=datastore,
request=request,
url_for=url_for,
render_template=render_template,
flash=flash,
redirect=redirect
)
except (ImportError, ModuleNotFoundError) as e:
logger.debug(f"Processor {processor_name} does not have a preview module, using default preview: {e}")
# Fallback: if processor doesn't have preview module, use default text preview
content = []
@@ -146,8 +150,10 @@ def construct_blueprint(datastore: ChangeDetectionStore):
"""
from flask import make_response
# More for testing, possible to return the first/only
if uuid == 'first':
uuid = list(datastore.data['watching'].keys()).pop()
try:
watch = datastore.data['watching'][uuid]
except KeyError:
@@ -157,33 +163,39 @@ def construct_blueprint(datastore: ChangeDetectionStore):
# Get the processor type for this watch
processor_name = watch.get('processor', 'text_json_diff')
# Try to get the processor's preview module (works for both built-in and plugin processors)
from changedetectionio.processors import get_processor_submodule
processor_module = get_processor_submodule(processor_name, 'preview')
try:
# Try to import the processor's preview module
import importlib
processor_module = importlib.import_module(f'changedetectionio.processors.{processor_name}.preview')
# Call the processor's get_asset() function
if processor_module and hasattr(processor_module, 'get_asset'):
result = processor_module.get_asset(
asset_name=asset_name,
watch=watch,
datastore=datastore,
request=request
)
# Call the processor's get_asset() function
if hasattr(processor_module, 'get_asset'):
result = processor_module.get_asset(
asset_name=asset_name,
watch=watch,
datastore=datastore,
request=request
)
if result is None:
if result is None:
from flask import abort
abort(404, description=f"Asset '{asset_name}' not found")
binary_data, content_type, cache_control = result
response = make_response(binary_data)
response.headers['Content-Type'] = content_type
if cache_control:
response.headers['Cache-Control'] = cache_control
return response
else:
logger.warning(f"Processor {processor_name} does not implement get_asset()")
from flask import abort
abort(404, description=f"Asset '{asset_name}' not found")
abort(404, description=f"Processor '{processor_name}' does not support assets")
binary_data, content_type, cache_control = result
response = make_response(binary_data)
response.headers['Content-Type'] = content_type
if cache_control:
response.headers['Cache-Control'] = cache_control
return response
else:
logger.warning(f"Processor {processor_name} does not implement get_asset()")
except (ImportError, ModuleNotFoundError) as e:
logger.warning(f"Processor {processor_name} does not have a preview module: {e}")
from flask import abort
abort(404, description=f"Processor '{processor_name}' does not support assets")
abort(404, description=f"Processor '{processor_name}' not found")
return preview_blueprint
@@ -45,19 +45,14 @@
<div class="tabs collapsable">
<ul>
<li class="tab"><a href="#general">{{ _('General') }}</a></li>
{% if capabilities.supports_request_type %}
<li class="tab"><a href="#request">{{ _('Request') }}</a></li>
{% endif %}
{% if extra_tab_content %}
<li class="tab"><a href="#extras_tab">{{ extra_tab_content }}</a></li>
{% endif %}
{% if capabilities.supports_browser_steps %}
<li class="tab"><a id="browsersteps-tab" href="#browser-steps">{{ _('Browser Steps') }}</a></li>
{% endif %}
{% if capabilities.supports_visual_selector %}
<!-- should goto extra forms? -->
{% if watch['processor'] == 'text_json_diff' or watch['processor'] == 'image_ssim_diff' %}
<li class="tab"><a id="visualselector-tab" href="#visualselector">{{ _('Visual Filter Selector') }}</a></li>
{% endif %}
{% if capabilities.supports_text_filters_and_triggers %}
<li class="tab" id="filters-and-triggers-tab"><a href="#filters-and-triggers">{{ _('Filters & Triggers') }}</a></li>
<li class="tab" id="conditions-tab"><a href="#conditions">{{ _('Conditions') }}</a></li>
{% endif %}
@@ -121,7 +116,6 @@
</fieldset>
</div>
{% if capabilities.supports_request_type %}
<div class="tab-pane-inner" id="request">
<div class="pure-control-group inline-radio">
{{ render_field(form.fetch_backend, class="fetch-backend") }}
@@ -209,7 +203,6 @@ Math: {{ 1 + 1 }}") }}
</div>
</fieldset>
</div>
{% endif %}
<div class="tab-pane-inner" id="browser-steps">
{% if capabilities.supports_browser_steps %}
@@ -290,7 +283,8 @@ Math: {{ 1 + 1 }}") }}
</fieldset>
</div>
{% if capabilities.supports_text_filters_and_triggers %}
{% if watch['processor'] == 'text_json_diff' or watch['processor'] == 'image_ssim_diff' %}
<div class="tab-pane-inner" id="conditions">
<script>
const verify_condition_rule_url="{{url_for('conditions.verify_condition_single_rule', watch_uuid=uuid)}}";
@@ -309,9 +303,7 @@ Math: {{ 1 + 1 }}") }}
<span id="activate-text-preview" class="pure-button pure-button-primary button-xsmall">{{ _('Activate preview') }}</span>
<div>
<div id="edit-text-filter">
{% if capabilities.supports_text_filters_and_triggers_elements %}
<div class="pure-control-group" id="pro-tips">
<div class="pure-control-group" id="pro-tips">
<strong>{{ _('Pro-tips:') }}</strong><br>
<ul>
<li>
@@ -322,8 +314,8 @@ Math: {{ 1 + 1 }}") }}
</li>
</ul>
</div>
{% include "edit/include_subtract.html" %}
{% endif %}
<div class="text-filtering border-fieldset">
<fieldset class="pure-group" id="text-filtering-type-options">
<h3>{{ _('Text filtering') }}</h3>
@@ -382,7 +374,7 @@ Math: {{ 1 + 1 }}") }}
{{ extra_form_content|safe }}
</div>
{% endif %}
{% if capabilities.supports_visual_selector %}
{% if watch['processor'] == 'text_json_diff' or watch['processor'] == 'image_ssim_diff' %}
<div class="tab-pane-inner visual-selector-ui" id="visualselector">
<img class="beta-logo" src="{{url_for('static_content', group='images', filename='beta-logo.png')}}" alt="New beta functionality">
@@ -394,7 +386,7 @@ Math: {{ 1 + 1 }}") }}
{{ _('The Visual Selector tool lets you select the') }} <i>{{ _('text') }}</i> {{ _('elements that will be used for the change detection. It automatically fills-in the filters in the "CSS/JSONPath/JQ/XPath Filters" box of the') }} <a href="#filters-and-triggers">{{ _('Filters & Triggers') }}</a> {{ _('tab. Use') }} <strong>{{ _('Shift+Click') }}</strong> {{ _('to select multiple items.') }}
</span>
{% if watch['processor'] == 'image_ssim_diff' %} {# @todo, integrate with image_ssim_diff selector better, use some extra form ? #}
{% if watch['processor'] == 'image_ssim_diff' %}
<div id="selection-mode-controls" style="margin: 10px 0; padding: 10px; background: var(--color-background-tab); border-radius: 5px;">
<label style="font-weight: 600; margin-right: 15px;">{{ _('Selection Mode:') }}</label>
<label style="margin-right: 15px;">
+3 -4
View File
@@ -2,7 +2,7 @@ from flask import Blueprint, request, redirect, url_for, flash
from flask_babel import gettext
from changedetectionio.store import ChangeDetectionStore
from changedetectionio.auth_decorator import login_optionally_required
from changedetectionio import worker_pool
from changedetectionio import worker_handler
def construct_blueprint(datastore: ChangeDetectionStore, update_q, queuedWatchMetaData, watch_check_update):
@@ -24,8 +24,7 @@ def construct_blueprint(datastore: ChangeDetectionStore, update_q, queuedWatchMe
flash(gettext('Warning, URL {} already exists').format(url), "notice")
add_paused = request.form.get('edit_and_watch_submit_button') != None
from changedetectionio import processors
processor = request.form.get('processor', processors.get_default_processor())
processor = request.form.get('processor', 'text_json_diff')
new_uuid = datastore.add_watch(url=url, tag=request.form.get('tags').strip(), extras={'paused': add_paused, 'processor': processor})
if new_uuid:
@@ -34,7 +33,7 @@ def construct_blueprint(datastore: ChangeDetectionStore, update_q, queuedWatchMe
return redirect(url_for('ui.ui_edit.edit_page', uuid=new_uuid, unpause_on_save=1, tag=request.args.get('tag')))
else:
# Straight into the queue.
worker_pool.queue_item_async_safe(update_q, queuedWatchMetaData.PrioritizedItem(priority=1, item={'uuid': new_uuid}))
worker_handler.queue_item_async_safe(update_q, queuedWatchMetaData.PrioritizedItem(priority=1, item={'uuid': new_uuid}))
flash(gettext("Watch added."))
return redirect(url_for('watchlist.index', tag=request.args.get('tag','')))
@@ -1,9 +1,5 @@
{%- extends 'base.html' -%}
{%- block content -%}
{%- set tips = [
_("Changedetection.io can monitor more than just web-pages! See our plugins!") ~ ' <a href="https://changedetection.io/plugins">' ~ _('More info') ~ '</a>',
_("You can also add 'shared' watches.") ~ ' <a href="https://github.com/dgtlmoon/changedetection.io/wiki/Sharing-a-Watch">' ~ _('More info') ~ '</a>'
] -%}
{%- from '_helpers.html' import render_simple_field, render_field, render_nolabel_field, sort_by_title -%}
<script src="{{url_for('static_content', group='js', filename='jquery-3.6.0.min.js')}}"></script>
<script src="{{url_for('static_content', group='js', filename='watch-overview.js')}}" defer></script>
@@ -73,9 +69,7 @@ html[data-darkmode="true"] .watch-tag-list.tag-{{ class_name }} {
</div>
</fieldset>
<span style="color:#eee; font-size: 80%;">
<strong>Tip: </strong> {{ tips | random | safe }}
</span>
<span style="color:#eee; font-size: 80%;"><img alt="{{ _('Create a shareable link') }}" style="height: 1em;display:inline-block;" src="{{url_for('static_content', group='images', filename='spread-white.svg')}}" > {{ _("Tip: You can also add 'shared' watches.") }} <a href="https://github.com/dgtlmoon/changedetection.io/wiki/Sharing-a-Watch">{{ _('More info') }}</a></span>
</form>
</div>
<div class="box">
@@ -211,26 +205,25 @@ html[data-darkmode="true"] .watch-tag-list.tag-{{ class_name }} {
</div>
{% endif %}
<div>
{%- if watch['processor'] and watch['processor'] in processor_badge_texts -%}
<span class="processor-badge processor-badge-{{ watch['processor'] }}" title="{{ processor_descriptions.get(watch['processor'], watch['processor']) }}">{{ processor_badge_texts[watch['processor']] }}</span>
{%- endif -%}
<span class="watch-title">
{% if system_use_url_watchlist or watch.get('use_page_title_in_list') %}
{{ watch.label }}
{% else %}
{{ watch.get('title') or watch.link }}
{% endif %}
<a class="external" target="_blank" rel="noopener" href="{{ watch.link.replace('source:','') }}">&nbsp;</a>
</span>
<span class="watch-title">
{% if system_use_url_watchlist or watch.get('use_page_title_in_list') %}
{{ watch.label }}
{% else %}
{{ watch.get('title') or watch.link }}
{% endif %}
<a class="external" target="_blank" rel="noopener" href="{{ watch.link.replace('source:','') }}">&nbsp;</a>
</span>
<div class="error-text" style="display:none;">{{ watch.compile_error_texts(has_proxies=datastore.proxy_list)|safe }}</div>
{%- if watch['processor'] == 'text_json_diff' -%}
{%- if watch['has_ldjson_price_data'] and not watch['track_ldjson_price_data'] -%}
<div class="ldjson-price-track-offer">Switch to Restock & Price watch mode? <a href="{{url_for('price_data_follower.accept', uuid=watch.uuid)}}" class="pure-button button-xsmall">Yes</a> <a href="{{url_for('price_data_follower.reject', uuid=watch.uuid)}}" class="">No</a></div>
{%- endif -%}
{%- endif -%}
{%- if watch['processor'] and watch['processor'] in processor_badge_texts -%}
<span class="processor-badge processor-badge-{{ watch['processor'] }}" title="{{ processor_descriptions.get(watch['processor'], watch['processor']) }}">{{ processor_badge_texts[watch['processor']] }}</span>
{%- endif -%}
{%- for watch_tag_uuid, watch_tag in datastore.get_all_tags_for_watch(watch['uuid']).items() -%}
<a href="{{url_for('watchlist.index', tag=watch_tag_uuid) }}" class="watch-tag-list tag-{{ watch_tag.title|sanitize_tag_class }}">{{ watch_tag.title }}</a>
<span class="watch-tag-list tag-{{ watch_tag.title|sanitize_tag_class }}">{{ watch_tag.title }}</span>
{%- endfor -%}
</div>
<div class="status-icons">
@@ -8,9 +8,7 @@ from loguru import logger
from changedetectionio.content_fetchers import SCREENSHOT_MAX_HEIGHT_DEFAULT, visualselector_xpath_selectors, \
SCREENSHOT_SIZE_STITCH_THRESHOLD, SCREENSHOT_MAX_TOTAL_HEIGHT, XPATH_ELEMENT_JS, INSTOCK_DATA_JS, FAVICON_FETCHER_JS
from changedetectionio.content_fetchers.base import Fetcher, manage_user_agent
from changedetectionio.content_fetchers.exceptions import PageUnloadable, Non200ErrorCodeReceived, EmptyReply, ScreenshotUnavailable, \
BrowserStepsStepException
from changedetectionio.content_fetchers.exceptions import PageUnloadable, Non200ErrorCodeReceived, EmptyReply, ScreenshotUnavailable
async def capture_full_page_async(page, screenshot_format='JPEG', watch_uuid=None, lock_viewport_elements=False):
import os
@@ -367,16 +365,7 @@ class fetcher(Fetcher):
try:
# Run Browser Steps here
if self.browser_steps_get_valid_steps():
try:
await self.iterate_browser_steps(start_url=url)
except BrowserStepsStepException:
try:
await context.close()
await browser.close()
except Exception as e:
# Fine, could be messy situation
pass
raise
await self.iterate_browser_steps(start_url=url)
await self.page.wait_for_timeout(extra_wait * 1000)
@@ -418,11 +407,19 @@ class fetcher(Fetcher):
# Force aggressive memory cleanup - screenshots are large and base64 decode creates temporary buffers
await self.page.request_gc()
gc.collect()
# Release C-level memory from base64 decode back to OS
try:
import ctypes
ctypes.CDLL('libc.so.6').malloc_trim(0)
except Exception:
pass
except ScreenshotUnavailable:
# Re-raise screenshot unavailable exceptions
raise
except Exception as e:
# It's likely the screenshot was too long/big and something crashed
raise ScreenshotUnavailable(url=url, status_code=self.status_code)
finally:
# Request garbage collection one more time before closing
try:
+2 -23
View File
@@ -55,26 +55,6 @@ class fetcher(Fetcher):
session = requests.Session()
# Configure retry adapter for low-level network errors only
# Retries connection timeouts, read timeouts, connection resets - not HTTP status codes
# Especially helpful in parallel test execution when servers are slow/overloaded
# Configurable via REQUESTS_RETRY_MAX_COUNT (default: 6 attempts)
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
max_retries = int(os.getenv("REQUESTS_RETRY_MAX_COUNT", "6"))
retry_strategy = Retry(
total=max_retries,
connect=max_retries, # Retry connection timeouts
read=max_retries, # Retry read timeouts
status=0, # Don't retry on HTTP status codes
backoff_factor=0.5, # Exponential backoff between retries, scaled by this factor
allowed_methods=["HEAD", "GET", "OPTIONS", "POST"],
raise_on_status=False
)
adapter = HTTPAdapter(max_retries=retry_strategy)
session.mount("http://", adapter)
session.mount("https://", adapter)
if strtobool(os.getenv('ALLOW_FILE_URI', 'false')) and url.startswith('file://'):
from requests_file import FileAdapter
@@ -162,11 +142,10 @@ class fetcher(Fetcher):
watch_uuid=None,
):
"""Async wrapper that runs the synchronous requests code in a thread pool"""
loop = asyncio.get_event_loop()
# Run the synchronous _run_sync in a thread pool to avoid blocking the event loop
# Retry logic is handled by requests' HTTPAdapter (see _run_sync for configuration)
await loop.run_in_executor(
None, # Use default ThreadPoolExecutor
lambda: self._run_sync(
+16 -26
View File
@@ -14,7 +14,7 @@ from pathlib import Path
from changedetectionio.strtobool import strtobool
from threading import Event
from changedetectionio.queue_handlers import RecheckPriorityQueue, NotificationQueue
from changedetectionio import worker_pool
from changedetectionio import worker_handler
from flask import (
Flask,
@@ -195,7 +195,7 @@ def _jinja2_filter_format_number_locale(value: float) -> str:
@app.template_global('is_checking_now')
def _watch_is_checking_now(watch_obj, format="%Y-%m-%d %H:%M:%S"):
return worker_pool.is_watch_running(watch_obj['uuid'])
return worker_handler.is_watch_running(watch_obj['uuid'])
@app.template_global('get_watch_queue_position')
def _get_watch_queue_position(watch_obj):
@@ -206,13 +206,13 @@ def _get_watch_queue_position(watch_obj):
@app.template_global('get_current_worker_count')
def _get_current_worker_count():
"""Get the current number of operational workers"""
return worker_pool.get_worker_count()
return worker_handler.get_worker_count()
@app.template_global('get_worker_status_info')
def _get_worker_status_info():
"""Get detailed worker status information for display"""
status = worker_pool.get_worker_status()
running_uuids = worker_pool.get_running_uuids()
status = worker_handler.get_worker_status()
running_uuids = worker_handler.get_running_uuids()
return {
'count': status['worker_count'],
@@ -801,7 +801,7 @@ def changedetection_app(config=None, datastore_o=None):
# watchlist UI buttons etc
import changedetectionio.blueprint.ui as ui
app.register_blueprint(ui.construct_blueprint(datastore, update_q, worker_pool, queuedWatchMetaData, watch_check_update))
app.register_blueprint(ui.construct_blueprint(datastore, update_q, worker_handler, queuedWatchMetaData, watch_check_update))
import changedetectionio.blueprint.watchlist as watchlist
app.register_blueprint(watchlist.construct_blueprint(datastore=datastore, update_q=update_q, queuedWatchMetaData=queuedWatchMetaData), url_prefix='')
@@ -838,10 +838,10 @@ def changedetection_app(config=None, datastore_o=None):
expected_workers = int(os.getenv("FETCH_WORKERS", datastore.data['settings']['requests']['workers']))
# Get basic status
status = worker_pool.get_worker_status()
status = worker_handler.get_worker_status()
# Perform health check
health_result = worker_pool.check_worker_health(
health_result = worker_handler.check_worker_health(
expected_count=expected_workers,
update_q=update_q,
notification_q=notification_q,
@@ -905,24 +905,14 @@ def changedetection_app(config=None, datastore_o=None):
# Can be overridden by ENV or use the default settings
n_workers = int(os.getenv("FETCH_WORKERS", datastore.data['settings']['requests']['workers']))
logger.info(f"Starting {n_workers} workers during app initialization")
worker_pool.start_workers(n_workers, update_q, notification_q, app, datastore)
worker_handler.start_workers(n_workers, update_q, notification_q, app, datastore)
# Skip background threads in batch mode (just process queue and exit)
batch_mode = app.config.get('batch_mode', False)
if not batch_mode:
# @todo handle ctrl break
ticker_thread = threading.Thread(target=ticker_thread_check_time_launch_checks, daemon=True, name="TickerThread-ScheduleChecker").start()
# Start configurable number of notification workers (default 1)
notification_workers = int(os.getenv("NOTIFICATION_WORKERS", "1"))
for i in range(notification_workers):
threading.Thread(
target=notification_runner,
args=(i,),
daemon=True,
name=f"NotificationRunner-{i}"
).start()
logger.info(f"Started {notification_workers} notification worker(s)")
threading.Thread(target=notification_runner, daemon=True, name="NotificationRunner").start()
in_pytest = "pytest" in sys.modules or "PYTEST_CURRENT_TEST" in os.environ
# Check for new release version, but not when running in test/build or pytest
@@ -964,14 +954,14 @@ def check_for_new_version():
app.config.exit.wait(86400)
def notification_runner(worker_id=0):
def notification_runner():
global notification_debug_log
from datetime import datetime
import json
with app.app_context():
while not app.config.exit.is_set():
try:
# Multiple workers can run concurrently (configurable via NOTIFICATION_WORKERS)
# At the moment only one thread runs (single runner)
n_object = notification_q.get(block=False)
except queue.Empty:
app.config.exit.wait(1)
@@ -997,7 +987,7 @@ def notification_runner(worker_id=0):
sent_obj = process_notification(n_object, datastore)
except Exception as e:
logger.error(f"Notification worker {worker_id} - Watch URL: {n_object['watch_url']} Error {str(e)}")
logger.error(f"Watch URL: {n_object['watch_url']} Error {str(e)}")
# UUID won't be present when we submit a 'test' from the global settings
if 'uuid' in n_object:
@@ -1038,7 +1028,7 @@ def ticker_thread_check_time_launch_checks():
now = time.time()
if now - last_health_check > 60:
expected_workers = int(os.getenv("FETCH_WORKERS", datastore.data['settings']['requests']['workers']))
health_result = worker_pool.check_worker_health(
health_result = worker_handler.check_worker_health(
expected_count=expected_workers,
update_q=update_q,
notification_q=notification_q,
@@ -1057,7 +1047,7 @@ def ticker_thread_check_time_launch_checks():
continue
# Get a list of watches by UUID that are currently fetching data
running_uuids = worker_pool.get_running_uuids()
running_uuids = worker_handler.get_running_uuids()
# Build set of queued UUIDs once for O(1) lookup instead of O(n) per watch
queued_uuids = {q_item.item['uuid'] for q_item in update_q.queue}
@@ -1163,7 +1153,7 @@ def ticker_thread_check_time_launch_checks():
priority = int(time.time())
# Into the queue with you
queued_successfully = worker_pool.queue_item_async_safe(update_q,
queued_successfully = worker_handler.queue_item_async_safe(update_q,
queuedWatchMetaData.PrioritizedItem(priority=priority,
item={'uuid': uuid})
)
+3 -3
View File
@@ -730,7 +730,7 @@ class quickWatchForm(Form):
url = fields.URLField(_l('URL'), validators=[validateURL()])
tags = StringTagUUID(_l('Group tag'), validators=[validators.Optional()])
watch_submit_button = SubmitField(_l('Watch'), render_kw={"class": "pure-button pure-button-primary"})
processor = RadioField(_l('Processor'), choices=lambda: processors.available_processors(), default=processors.get_default_processor)
processor = RadioField(_l('Processor'), choices=lambda: processors.available_processors(), default="text_json_diff")
edit_and_watch_submit_button = SubmitField(_l('Edit > Watch'), render_kw={"class": "pure-button pure-button-primary"})
@@ -749,7 +749,7 @@ class commonSettingsForm(Form):
notification_format = SelectField(_l('Notification format'), choices=list(valid_notification_formats.items()))
notification_title = StringField(_l('Notification Title'), default='ChangeDetection.io Notification - {{ watch_url }}', validators=[validators.Optional(), ValidateJinja2Template()])
notification_urls = StringListField(_l('Notification URL List'), validators=[validators.Optional(), ValidateAppRiseServers(), ValidateJinja2Template()])
processor = RadioField( label=_l("Processor - What do you want to achieve?"), choices=lambda: processors.available_processors(), default=processors.get_default_processor)
processor = RadioField( label=_l("Processor - What do you want to achieve?"), choices=lambda: processors.available_processors(), default="text_json_diff")
scheduler_timezone_default = StringField(_l("Default timezone for watch check scheduler"), render_kw={"list": "timezones"}, validators=[validateTimeZoneName()])
webdriver_delay = IntegerField(_l('Wait seconds before extracting text'), validators=[validators.Optional(), validators.NumberRange(min=1, message=_l("Should contain one or more seconds"))])
@@ -763,7 +763,7 @@ class commonSettingsForm(Form):
class importForm(Form):
processor = RadioField(_l('Processor'), choices=lambda: processors.available_processors(), default=processors.get_default_processor)
processor = RadioField(_l('Processor'), choices=lambda: processors.available_processors(), default="text_json_diff")
urls = TextAreaField(_l('URLs'))
xlsx_file = FileField(_l('Upload .xlsx file'), validators=[FileAllowed(['xlsx'], _l('Must be .xlsx file!'))])
file_mapping = SelectField(_l('File mapping'), [validators.DataRequired()], choices={('wachete', 'Wachete mapping'), ('custom','Custom mapping')})
+1 -1
View File
@@ -29,7 +29,7 @@ class model(dict):
'proxy': None, # Preferred proxy connection
'time_between_check': {'weeks': None, 'days': None, 'hours': 3, 'minutes': None, 'seconds': None},
'timeout': int(getenv("DEFAULT_SETTINGS_REQUESTS_TIMEOUT", "45")), # Default 45 seconds
'workers': int(getenv("DEFAULT_SETTINGS_REQUESTS_WORKERS", "5")), # Number of threads, lower is better for slow connections
'workers': int(getenv("DEFAULT_SETTINGS_REQUESTS_WORKERS", "10")), # Number of threads, lower is better for slow connections
'default_ua': {
'html_requests': getenv("DEFAULT_SETTINGS_HEADERS_USERAGENT", DEFAULT_SETTINGS_HEADERS_USERAGENT),
'html_webdriver': None,
-24
View File
@@ -105,30 +105,6 @@ class ChangeDetectionSpec:
"""
pass
@hookspec
def register_processor(self):
    """Hook through which an external package contributes a custom processor.

    Implementations return a description of their processor so that it can
    be picked up together with the built-in processors.

    Returns:
        dict or None: A dictionary describing the processor:
            {
                'processor_name': str,       # Machine name (e.g., 'osint_recon')
                'processor_module': module,  # Module containing processor.py
                'processor_class': class,    # The perform_site_check class
                'metadata': {                # Optional metadata
                    'name': str,             # Display name
                    'description': str,      # Description
                    'processor_weight': int, # Sort weight (lower = higher priority)
                    'list_badge_text': str,  # Badge text for UI
                }
            }
        Return None when the plugin does not provide a processor.
    """
    pass
# Set up Plugin Manager
plugin_manager = pluggy.PluginManager(PLUGIN_NAMESPACE)
+30 -219
View File
@@ -17,11 +17,9 @@ def find_sub_packages(package_name):
return [name for _, name, is_pkg in pkgutil.iter_modules(package.__path__) if is_pkg]
@lru_cache(maxsize=1)
def find_processors():
"""
Find all subclasses of DifferenceDetectionProcessor in the specified package.
Results are cached to avoid repeated discovery.
:param package_name: The name of the package to scan for processor modules.
:return: A list of (module, class) tuples.
@@ -48,23 +46,6 @@ def find_processors():
except (ModuleNotFoundError, ImportError) as e:
logger.warning(f"Failed to import module {module_name}: {e} (find_processors())")
# Discover plugin processors via pluggy
try:
from changedetectionio.pluggy_interface import plugin_manager
plugin_results = plugin_manager.hook.register_processor()
for result in plugin_results:
if result and isinstance(result, dict):
processor_module = result.get('processor_module')
processor_name = result.get('processor_name')
if processor_module and processor_name:
processors.append((processor_module, processor_name))
plugin_path = getattr(processor_module, '__file__', 'unknown location')
logger.info(f"Registered plugin processor: {processor_name} from {plugin_path}")
except Exception as e:
logger.warning(f"Error loading plugin processors: {e}")
return processors
@@ -116,137 +97,54 @@ def find_processor_module(processor_name):
return None
def get_processor_module(processor_name):
    """
    Look up the processor module (the one holding perform_site_check) by name.

    Handles built-in and plugin processors alike, since both are returned
    by find_processors().

    Args:
        processor_name: Processor machine name (e.g., 'text_json_diff', 'osint_recon')

    Returns:
        module: The matching processor module, or None when no processor has that name
    """
    # Each entry is a (module, name) tuple; the first name match wins.
    for entry in find_processors():
        if entry[1] == processor_name:
            return entry[0]

    return None
def get_processor_submodule(processor_name, submodule_name):
    """
    Import an optional submodule belonging to a processor.

    Typical submodules are 'difference', 'extract' and 'preview'. The lookup
    is the same for built-in and plugin processors.

    Args:
        processor_name: Processor machine name (e.g., 'text_json_diff', 'osint_recon')
        submodule_name: Name of the submodule (e.g., 'difference', 'extract', 'preview')

    Returns:
        module: The imported submodule, or None when the processor, its parent
        package, or the submodule itself cannot be found
    """
    # Locate the (module, name) entry for this processor; first match wins.
    match = None
    for entry in find_processors():
        if entry[1] == processor_name:
            match = entry
            break
    if not match:
        return None

    package = get_parent_module(match[0])
    if not package:
        return None

    # Built-in processors resolve to e.g. changedetectionio.processors.text_json_diff.difference,
    # plugin processors to e.g. changedetectionio_osint.difference
    dotted_name = f"{package.__name__}.{submodule_name}"
    try:
        return importlib.import_module(dotted_name)
    except ImportError:
        # ModuleNotFoundError is a subclass of ImportError, so this covers both
        return None
@lru_cache(maxsize=1)
def get_plugin_processor_metadata():
    """Collect the 'metadata' dict of every plugin processor, keyed by processor name.

    The result is cached, so the register_processor hook is only invoked by
    the first call. Any failure is logged as a warning and whatever was
    gathered up to that point is returned.
    """
    collected = {}
    try:
        from changedetectionio.pluggy_interface import plugin_manager

        for entry in plugin_manager.hook.register_processor():
            # Plugins may return None (or anything that is not a dict) - ignore those
            if not (entry and isinstance(entry, dict)):
                continue
            name = entry.get('processor_name')
            meta = entry.get('metadata', {})
            if name:
                collected[name] = meta
    except Exception as e:
        logger.warning(f"Error getting plugin processor metadata: {e}")
    return collected
def available_processors():
"""
Get a list of processors by name and description for the UI elements.
Can be filtered via DISABLED_PROCESSORS environment variable (comma-separated list).
Can be filtered via ALLOWED_PROCESSORS environment variable (comma-separated list).
:return: A list :)
"""
processor_classes = find_processors()
# Check if DISABLED_PROCESSORS env var is set
disabled_processors_env = os.getenv('DISABLED_PROCESSORS', 'image_ssim_diff').strip()
disabled_processors = []
if disabled_processors_env:
# Check if ALLOWED_PROCESSORS env var is set
# For now we disable it, need to make a deploy with lots of new code and this will be an overload
allowed_processors_env = os.getenv('ALLOWED_PROCESSORS', 'text_json_diff, restock_diff').strip()
allowed_processors = None
if allowed_processors_env:
# Parse comma-separated list and strip whitespace
disabled_processors = [p.strip() for p in disabled_processors_env.split(',') if p.strip()]
logger.info(f"DISABLED_PROCESSORS set, disabling: {disabled_processors}")
allowed_processors = [p.strip() for p in allowed_processors_env.split(',') if p.strip()]
logger.info(f"ALLOWED_PROCESSORS set, filtering to: {allowed_processors}")
available = []
plugin_metadata = get_plugin_processor_metadata()
for module, sub_package_name in processor_classes:
# Skip disabled processors
if sub_package_name in disabled_processors:
logger.debug(f"Skipping processor '{sub_package_name}' (in DISABLED_PROCESSORS)")
# Filter by allowed processors if set
if allowed_processors and sub_package_name not in allowed_processors:
logger.debug(f"Skipping processor '{sub_package_name}' (not in ALLOWED_PROCESSORS)")
continue
# Check if this is a plugin processor
if sub_package_name in plugin_metadata:
meta = plugin_metadata[sub_package_name]
description = gettext(meta.get('name', sub_package_name))
# Plugin processors start from weight 100 to separate them from built-in processors
weight = 100 + meta.get('processor_weight', 0)
# Try to get the 'name' attribute from the processor module first
if hasattr(module, 'name'):
description = gettext(module.name)
else:
# Try to get the 'name' attribute from the processor module first
if hasattr(module, 'name'):
description = gettext(module.name)
# Fall back to processor_description from parent module's __init__.py
parent_module = get_parent_module(module)
if parent_module and hasattr(parent_module, 'processor_description'):
description = gettext(parent_module.processor_description)
else:
# Fall back to processor_description from parent module's __init__.py
parent_module = get_parent_module(module)
if parent_module and hasattr(parent_module, 'processor_description'):
description = gettext(parent_module.processor_description)
else:
# Final fallback to a readable name
description = sub_package_name.replace('_', ' ').title()
# Final fallback to a readable name
description = sub_package_name.replace('_', ' ').title()
# Get weight for sorting (lower weight = higher in list)
weight = 0 # Default weight for processors without explicit weight
# Get weight for sorting (lower weight = higher in list)
weight = 0 # Default weight for processors without explicit weight
# Check processor module itself first
if hasattr(module, 'processor_weight'):
weight = module.processor_weight
else:
# Fall back to parent module (package __init__.py)
parent_module = get_parent_module(module)
if parent_module and hasattr(parent_module, 'processor_weight'):
weight = parent_module.processor_weight
# Check processor module itself first
if hasattr(module, 'processor_weight'):
weight = module.processor_weight
else:
# Fall back to parent module (package __init__.py)
parent_module = get_parent_module(module)
if parent_module and hasattr(parent_module, 'processor_weight'):
weight = parent_module.processor_weight
available.append((sub_package_name, description, weight))
@@ -257,20 +155,6 @@ def available_processors():
return [(name, desc) for name, desc, weight in available]
def get_default_processor():
    """
    Pick the processor to use when the caller did not specify one.

    available_processors() returns (name, description) tuples already ordered
    by weight, so the first entry is the highest-priority processor that is
    still enabled after any filtering applied there.

    :return: The processor name string (e.g., 'text_json_diff')
    """
    candidates = available_processors()
    # Fall back to the stock text processor if nothing at all is available
    return candidates[0][0] if candidates else 'text_json_diff'
def get_processor_badge_texts():
"""
Get a dictionary mapping processor names to their list_badge_text values.
@@ -395,76 +279,3 @@ def get_processor_badge_css():
return '\n\n'.join(css_rules)
def save_processor_config(datastore, watch_uuid, config_data):
    """
    Persist processor-specific settings for one watch to a JSON file.

    Shared by the UI edit form and the API endpoints so both store processor
    configuration the same way.

    Args:
        datastore: The application datastore instance
        watch_uuid: UUID of the watch
        config_data: Dictionary of configuration data to save (with processor_config_* prefix removed)

    Returns:
        bool: True when saved (or when there was nothing to save), False if the
        watch does not exist or saving raised an exception
    """
    # An empty/None config is not an error - there is simply nothing to write
    if not config_data:
        return True

    try:
        from changedetectionio.processors.base import difference_detection_processor

        target_watch = datastore.data['watching'].get(watch_uuid)
        if not target_watch:
            logger.error(f"Cannot save processor config: watch {watch_uuid} not found")
            return False

        active_processor = target_watch.get('processor', 'text_json_diff')

        # A processor instance is needed to reach the extra-config helpers
        handler = difference_detection_processor(datastore, watch_uuid)

        # One file per processor name, so each processor keeps its own config
        config_filename = f'{active_processor}.json'
        handler.update_extra_watch_config(config_filename, config_data)
        logger.debug(f"Saved processor config to {config_filename}: {config_data}")
    except Exception as e:
        logger.error(f"Failed to save processor config: {e}")
        return False

    return True
def extract_processor_config_from_form_data(form_data):
    """
    Split the processor_config_* fields out of a form-data dictionary.

    Shared by the UI edit form and the API endpoints so both extract processor
    configuration the same way.

    IMPORTANT: form_data is modified in-place - every processor_config_* field
    is removed from it so that it never reaches the datastore.

    Args:
        form_data: Dictionary of form data (will be modified in-place)

    Returns:
        dict: The extracted settings keyed by field name with the leading
        'processor_config_' prefix removed. Empty values are kept so that a
        setting can be cleared explicitly.
    """
    prefix = 'processor_config_'
    processor_config_data = {}

    # Iterate over a snapshot of the keys because entries are removed while looping
    for field_name in list(form_data):
        if not field_name.startswith(prefix):
            continue
        # Strip only the leading prefix. str.replace() would also delete any
        # later occurrence of the marker inside the field name and corrupt the key.
        config_key = field_name[len(prefix):]
        processor_config_data[config_key] = form_data.pop(field_name)

    return processor_config_data
@@ -12,13 +12,6 @@ processor_description = "Visual/Screenshot change detection (Fast)"
processor_name = "image_ssim_diff"
processor_weight = 2 # Lower weight = appears at top, heavier weight = appears lower (bottom)
# Processor capabilities
supports_visual_selector = True
supports_browser_steps = True
supports_text_filters_and_triggers = False
supports_text_filters_and_triggers_elements = False
supports_request_type = True
PROCESSOR_CONFIG_NAME = f"{Path(__file__).parent.name}.json"
# Subprocess timeout settings
@@ -4,13 +4,6 @@ from changedetectionio.model.Watch import model as BaseWatch
from typing import Union
import re
# Processor capabilities
supports_visual_selector = True
supports_browser_steps = True
supports_text_filters_and_triggers = True
supports_text_filters_and_triggers_elements = True
supports_request_type = True
class Restock(dict):
def parse_currency(self, raw_value: str) -> Union[float, None]:
@@ -1,11 +1,5 @@
from loguru import logger
# Processor capabilities
supports_visual_selector = True
supports_browser_steps = True
supports_text_filters_and_triggers = True
supports_text_filters_and_triggers_elements = True
supports_request_type = True
from loguru import logger
+141 -228
View File
@@ -5,57 +5,51 @@ import heapq
import queue
import threading
# Janus is no longer required - we use pure threading.Queue for multi-loop support
# try:
# import janus
# except ImportError:
# pass # Not needed anymore
try:
import janus
except ImportError:
logger.critical(f"CRITICAL: janus library is required. Install with: pip install janus")
raise
class RecheckPriorityQueue:
"""
Thread-safe priority queue supporting multiple async event loops.
ARCHITECTURE:
- Multiple async workers, each with its own event loop in its own thread
- Hybrid sync/async design for maximum scalability
- Sync interface for ticker thread (threading.Queue)
- Async interface for workers (asyncio.Event - NO executor threads!)
SCALABILITY:
- Scales to 100-200+ workers without executor thread exhaustion
- Async workers wait on asyncio.Event (pure coroutines, no threads)
- Sync callers use threading.Queue (backward compatible)
WHY NOT JANUS:
- Janus binds to ONE event loop at creation time
- Our architecture has 15+ workers, each with separate event loops
- Workers in different threads/loops cannot share janus async interface
WHY NOT RUN_IN_EXECUTOR:
- With 200 workers, run_in_executor() would block 200 threads
- Exhausts ThreadPoolExecutor, starves Flask HTTP handlers
- Pure async approach uses 0 threads while waiting
Ultra-reliable priority queue using janus for async/sync bridging.
CRITICAL DESIGN NOTE: Both sync_q and async_q are required because:
- sync_q: Used by Flask routes, ticker threads, and other synchronous code
- async_q: Used by async workers (the actual fetchers/processors) and coroutines
DO NOT REMOVE EITHER INTERFACE - they bridge different execution contexts:
- Synchronous code (Flask, threads) cannot use async methods without blocking
- Async code cannot use sync methods without blocking the event loop
- janus provides the only safe bridge between these two worlds
Attempting to unify to async-only would require:
- Converting all Flask routes to async (major breaking change)
- Using asyncio.run() in sync contexts (causes deadlocks)
- Thread-pool wrapping (adds complexity and overhead)
Minimal implementation focused on reliability:
- Pure janus for sync/async bridge
- Thread-safe priority ordering
- Bulletproof error handling with critical logging
"""
def __init__(self, maxsize: int = 0):
try:
import asyncio
# Sync interface: threading.Queue for ticker thread and Flask routes
self._notification_queue = queue.Queue(maxsize=maxsize if maxsize > 0 else 0)
self._janus_queue = janus.Queue(maxsize=maxsize)
# BOTH interfaces required - see class docstring for why
self.sync_q = self._janus_queue.sync_q # Flask routes, ticker thread
self.async_q = self._janus_queue.async_q # Async workers
# Priority storage - thread-safe
self._priority_items = []
self._lock = threading.RLock()
# No event signaling needed - pure polling approach
# Workers check queue every 50ms (latency acceptable: 0-500ms)
# Scales to 1000+ workers: each sleeping worker = ~4KB coroutine, not thread
# Signals for UI updates
self.queue_length_signal = signal('queue_length')
logger.debug("RecheckPriorityQueue initialized successfully")
except Exception as e:
logger.critical(f"CRITICAL: Failed to initialize RecheckPriorityQueue: {str(e)}")
@@ -64,48 +58,38 @@ class RecheckPriorityQueue:
# SYNC INTERFACE (for ticker thread)
def put(self, item, block: bool = True, timeout: Optional[float] = None):
"""Thread-safe sync put with priority ordering"""
logger.trace(f"RecheckQueue.put() called for item: {self._get_item_uuid(item)}, block={block}, timeout={timeout}")
try:
# CRITICAL: Add to both priority storage AND notification queue atomically
# to prevent desynchronization where item exists but no notification
# Add to priority storage
with self._lock:
heapq.heappush(self._priority_items, item)
# Add notification - use blocking with timeout for safety
# Notification queue is unlimited size, so should never block in practice
# but timeout ensures we detect any unexpected issues (deadlock, etc)
try:
self._notification_queue.put(True, block=True, timeout=5.0)
except Exception as notif_e:
# Notification failed - MUST remove from priority_items to keep in sync
# This prevents "Priority queue inconsistency" errors in get()
logger.critical(f"CRITICAL: Notification queue put failed, removing from priority_items: {notif_e}")
self._priority_items.remove(item)
heapq.heapify(self._priority_items)
raise # Re-raise to be caught by outer exception handler
# Signal emission after successful queue - log but don't fail the operation
# Item is already safely queued, so signal failure shouldn't affect queue state
try:
self._emit_put_signals(item)
except Exception as signal_e:
logger.error(f"Failed to emit put signals but item queued successfully: {signal_e}")
# Notify via janus sync queue
self.sync_q.put(True, block=block, timeout=timeout)
# Emit signals
self._emit_put_signals(item)
logger.trace(f"Successfully queued item: {self._get_item_uuid(item)}")
return True
except Exception as e:
logger.critical(f"CRITICAL: Failed to put item {self._get_item_uuid(item)}: {type(e).__name__}: {str(e)}")
# Item should have been cleaned up in the inner try/except if notification failed
logger.critical(f"CRITICAL: Failed to put item {self._get_item_uuid(item)}: {str(e)}")
# Remove from priority storage if janus put failed
try:
with self._lock:
if item in self._priority_items:
self._priority_items.remove(item)
heapq.heapify(self._priority_items)
except Exception as cleanup_e:
logger.critical(f"CRITICAL: Failed to cleanup after put failure: {str(e)}")
return False
def get(self, block: bool = True, timeout: Optional[float] = None):
"""Thread-safe sync get with priority ordering"""
logger.trace(f"RecheckQueue.get() called, block={block}, timeout={timeout}")
import queue as queue_module
import queue
try:
# Wait for notification (this doesn't return the actual item, just signals availability)
self._notification_queue.get(block=block, timeout=timeout)
# Wait for notification
self.sync_q.get(block=block, timeout=timeout)
# Get highest priority item
with self._lock:
@@ -114,91 +98,69 @@ class RecheckPriorityQueue:
raise Exception("Priority queue inconsistency")
item = heapq.heappop(self._priority_items)
# Signal emission after successful retrieval - log but don't lose the item
# Item is already retrieved, so signal failure shouldn't affect queue state
try:
self._emit_get_signals()
except Exception as signal_e:
logger.error(f"Failed to emit get signals but item retrieved successfully: {signal_e}")
# Emit signals
self._emit_get_signals()
logger.trace(f"RecheckQueue.get() successfully retrieved item: {self._get_item_uuid(item)}")
return item
except queue_module.Empty:
# Queue is empty with timeout - expected behavior
logger.trace(f"RecheckQueue.get() timed out - queue is empty (timeout={timeout})")
raise # noqa
except Exception as e:
# Re-raise without logging - caller (worker) will handle and log appropriately
logger.trace(f"RecheckQueue.get() failed with exception: {type(e).__name__}: {str(e)}")
raise
# ASYNC INTERFACE (for workers)
async def async_put(self, item, executor=None):
"""Async put with priority ordering - uses thread pool to avoid blocking
Args:
item: Item to add to queue
executor: Optional ThreadPoolExecutor. If None, uses default pool.
"""
logger.trace(f"RecheckQueue.async_put() called for item: {self._get_item_uuid(item)}, executor={executor}")
import asyncio
try:
# Use run_in_executor to call sync put without blocking event loop
loop = asyncio.get_event_loop()
result = await loop.run_in_executor(
executor, # Use provided executor or default
lambda: self.put(item, block=True, timeout=5.0)
)
logger.trace(f"RecheckQueue.async_put() successfully queued item: {self._get_item_uuid(item)}")
return result
except Exception as e:
logger.critical(f"CRITICAL: Failed to async put item {self._get_item_uuid(item)}: {str(e)}")
return False
async def async_get(self, executor=None, timeout=1.0):
"""
Efficient async get using executor for blocking call.
HYBRID APPROACH: Best of both worlds
- Uses run_in_executor for efficient blocking (no polling overhead)
- Single timeout (no double-timeout race condition)
- Scales well: executor sized to match worker count
With FETCH_WORKERS=10: 10 threads blocked max (acceptable)
With FETCH_WORKERS=200: Need executor with 200+ threads (see worker_pool.py)
Args:
executor: ThreadPoolExecutor (sized to match worker count)
timeout: Maximum time to wait in seconds
Returns:
Item from queue
Raises:
queue.Empty: If timeout expires with no item available
"""
logger.trace(f"RecheckQueue.async_get() called, timeout={timeout}")
import asyncio
try:
# Use run_in_executor to call sync get efficiently
# No outer asyncio.wait_for wrapper = no double timeout issue!
loop = asyncio.get_event_loop()
item = await loop.run_in_executor(
executor,
lambda: self.get(block=True, timeout=timeout)
)
logger.trace(f"RecheckQueue.async_get() successfully retrieved item: {self._get_item_uuid(item)}")
logger.debug(f"Successfully retrieved item: {self._get_item_uuid(item)}")
return item
except queue.Empty:
logger.trace(f"RecheckQueue.async_get() timed out - queue is empty")
# Queue is empty with timeout - expected behavior, re-raise without logging
raise
except Exception as e:
logger.critical(f"CRITICAL: Failed to async get item from queue: {type(e).__name__}: {str(e)}")
# Re-raise without logging - caller (worker) will handle and log appropriately
raise
# ASYNC INTERFACE (for workers)
async def async_put(self, item):
"""Pure async put with priority ordering"""
try:
# Add to priority storage
with self._lock:
heapq.heappush(self._priority_items, item)
# Notify via janus async queue
await self.async_q.put(True)
# Emit signals
self._emit_put_signals(item)
logger.debug(f"Successfully async queued item: {self._get_item_uuid(item)}")
return True
except Exception as e:
logger.critical(f"CRITICAL: Failed to async put item {self._get_item_uuid(item)}: {str(e)}")
# Remove from priority storage if janus put failed
try:
with self._lock:
if item in self._priority_items:
self._priority_items.remove(item)
heapq.heapify(self._priority_items)
except Exception as cleanup_e:
logger.critical(f"CRITICAL: Failed to cleanup after async put failure: {str(e)}")
return False
async def async_get(self):
"""Pure async get with priority ordering"""
try:
# Wait for notification
await self.async_q.get()
# Get highest priority item
with self._lock:
if not self._priority_items:
logger.critical(f"CRITICAL: Async queue notification received but no priority items available")
raise Exception("Priority queue inconsistency")
item = heapq.heappop(self._priority_items)
# Emit signals
self._emit_get_signals()
logger.debug(f"Successfully async retrieved item: {self._get_item_uuid(item)}")
return item
except Exception as e:
logger.critical(f"CRITICAL: Failed to async get item from queue: {str(e)}")
raise
# UTILITY METHODS
@@ -224,35 +186,10 @@ class RecheckPriorityQueue:
logger.critical(f"CRITICAL: Failed to get queued UUIDs: {str(e)}")
return []
def clear(self):
"""Clear all items from both priority storage and notification queue"""
try:
with self._lock:
# Clear priority items
self._priority_items.clear()
# Drain all notifications to prevent stale notifications
# This is critical for test cleanup to prevent queue desynchronization
drained = 0
while not self._notification_queue.empty():
try:
self._notification_queue.get_nowait()
drained += 1
except queue.Empty:
break
if drained > 0:
logger.debug(f"Cleared queue: removed {drained} notifications")
return True
except Exception as e:
logger.critical(f"CRITICAL: Failed to clear queue: {str(e)}")
return False
def close(self):
"""Close the queue"""
"""Close the janus queue"""
try:
# Nothing to close for threading.Queue
self._janus_queue.close()
logger.debug("RecheckPriorityQueue closed successfully")
except Exception as e:
logger.critical(f"CRITICAL: Failed to close RecheckPriorityQueue: {str(e)}")
@@ -384,7 +321,7 @@ class RecheckPriorityQueue:
except Exception:
pass
return 'unknown'
def _emit_put_signals(self, item):
"""Emit signals when item is added"""
try:
@@ -393,14 +330,14 @@ class RecheckPriorityQueue:
watch_check_update = signal('watch_check_update')
if watch_check_update:
watch_check_update.send(watch_uuid=item.item['uuid'])
# Queue length signal
if self.queue_length_signal:
self.queue_length_signal.send(length=self.qsize())
except Exception as e:
logger.critical(f"CRITICAL: Failed to emit put signals: {str(e)}")
def _emit_get_signals(self):
"""Emit signals when item is removed"""
try:
@@ -426,11 +363,12 @@ class NotificationQueue:
def __init__(self, maxsize: int = 0, datastore=None):
try:
# Use pure threading.Queue to avoid event loop binding issues
self._notification_queue = queue.Queue(maxsize=maxsize if maxsize > 0 else 0)
self._janus_queue = janus.Queue(maxsize=maxsize)
# BOTH interfaces required - see class docstring for why
self.sync_q = self._janus_queue.sync_q # Flask routes, threads
self.async_q = self._janus_queue.async_q # Async workers
self.notification_event_signal = signal('notification_event')
self.datastore = datastore # For checking all_muted setting
self._lock = threading.RLock()
logger.debug("NotificationQueue initialized successfully")
except Exception as e:
logger.critical(f"CRITICAL: Failed to initialize NotificationQueue: {str(e)}")
@@ -442,97 +380,72 @@ class NotificationQueue:
def put(self, item: Dict[str, Any], block: bool = True, timeout: Optional[float] = None):
"""Thread-safe sync put with signal emission"""
logger.trace(f"NotificationQueue.put() called for item: {item.get('uuid', 'unknown')}, block={block}, timeout={timeout}")
try:
# Check if all notifications are muted
if self.datastore and self.datastore.data['settings']['application'].get('all_muted', False):
logger.debug(f"Notification blocked - all notifications are muted: {item.get('uuid', 'unknown')}")
return False
with self._lock:
self._notification_queue.put(item, block=block, timeout=timeout)
self.sync_q.put(item, block=block, timeout=timeout)
self._emit_notification_signal(item)
logger.trace(f"NotificationQueue.put() successfully queued notification: {item.get('uuid', 'unknown')}")
logger.debug(f"Successfully queued notification: {item.get('uuid', 'unknown')}")
return True
except Exception as e:
logger.critical(f"CRITICAL: Failed to put notification {item.get('uuid', 'unknown')}: {str(e)}")
return False
async def async_put(self, item: Dict[str, Any], executor=None):
"""Async put with signal emission - uses thread pool
Args:
item: Notification item to queue
executor: Optional ThreadPoolExecutor
"""
logger.trace(f"NotificationQueue.async_put() called for item: {item.get('uuid', 'unknown')}, executor={executor}")
import asyncio
async def async_put(self, item: Dict[str, Any]):
"""Pure async put with signal emission"""
try:
# Check if all notifications are muted
if self.datastore and self.datastore.data['settings']['application'].get('all_muted', False):
logger.debug(f"Notification blocked - all notifications are muted: {item.get('uuid', 'unknown')}")
return False
loop = asyncio.get_event_loop()
await loop.run_in_executor(executor, lambda: self.put(item, block=True, timeout=5.0))
logger.trace(f"NotificationQueue.async_put() successfully queued notification: {item.get('uuid', 'unknown')}")
await self.async_q.put(item)
self._emit_notification_signal(item)
logger.debug(f"Successfully async queued notification: {item.get('uuid', 'unknown')}")
return True
except Exception as e:
logger.critical(f"CRITICAL: Failed to async put notification {item.get('uuid', 'unknown')}: {str(e)}")
return False
def get(self, block: bool = True, timeout: Optional[float] = None):
"""Thread-safe sync get"""
logger.trace(f"NotificationQueue.get() called, block={block}, timeout={timeout}")
try:
with self._lock:
item = self._notification_queue.get(block=block, timeout=timeout)
logger.trace(f"NotificationQueue.get() retrieved item: {item.get('uuid', 'unknown') if isinstance(item, dict) else 'unknown'}")
return item
return self.sync_q.get(block=block, timeout=timeout)
except queue.Empty as e:
logger.trace(f"NotificationQueue.get() timed out - queue is empty (timeout={timeout})")
raise e
except Exception as e:
logger.critical(f"CRITICAL: Failed to get notification: {type(e).__name__}: {str(e)}")
logger.critical(f"CRITICAL: Failed to get notification: {str(e)}")
raise e
async def async_get(self, executor=None):
"""Async get - uses thread pool
Args:
executor: Optional ThreadPoolExecutor
"""
logger.trace(f"NotificationQueue.async_get() called, executor={executor}")
import asyncio
async def async_get(self):
"""Pure async get"""
try:
loop = asyncio.get_event_loop()
item = await loop.run_in_executor(executor, lambda: self.get(block=True, timeout=1.0))
logger.trace(f"NotificationQueue.async_get() retrieved item: {item.get('uuid', 'unknown') if isinstance(item, dict) else 'unknown'}")
return item
return await self.async_q.get()
except queue.Empty as e:
logger.trace(f"NotificationQueue.async_get() timed out - queue is empty")
raise e
except Exception as e:
logger.critical(f"CRITICAL: Failed to async get notification: {type(e).__name__}: {str(e)}")
logger.critical(f"CRITICAL: Failed to async get notification: {str(e)}")
raise e
def qsize(self) -> int:
"""Get current queue size"""
try:
with self._lock:
return self._notification_queue.qsize()
return self.sync_q.qsize()
except Exception as e:
logger.critical(f"CRITICAL: Failed to get notification queue size: {str(e)}")
return 0
def empty(self) -> bool:
"""Check if queue is empty"""
return self.qsize() == 0
def close(self):
"""Close the queue"""
"""Close the janus queue"""
try:
# Nothing to close for threading.Queue
self._janus_queue.close()
logger.debug("NotificationQueue closed successfully")
except Exception as e:
logger.critical(f"CRITICAL: Failed to close NotificationQueue: {str(e)}")
+2 -2
View File
@@ -37,9 +37,9 @@ def register_watch_operation_handlers(socketio, datastore):
# Import here to avoid circular imports
from changedetectionio.flask_app import update_q
from changedetectionio import queuedWatchMetaData
from changedetectionio import worker_pool
from changedetectionio import worker_handler
worker_pool.queue_item_async_safe(update_q, queuedWatchMetaData.PrioritizedItem(priority=1, item={'uuid': uuid}))
worker_handler.queue_item_async_safe(update_q, queuedWatchMetaData.PrioritizedItem(priority=1, item={'uuid': uuid}))
logger.info(f"Socket.IO: Queued recheck for watch {uuid}")
else:
emit('operation_result', {'success': False, 'error': f'Unknown operation: {op}'})
+4 -4
View File
@@ -145,10 +145,10 @@ def handle_watch_update(socketio, **kwargs):
# Emit the watch update to all connected clients
from changedetectionio.flask_app import update_q
from changedetectionio.flask_app import _jinja2_filter_datetime
from changedetectionio import worker_pool
from changedetectionio import worker_handler
# Get list of watches that are currently running
running_uuids = worker_pool.get_running_uuids()
running_uuids = worker_handler.get_running_uuids()
# Get list of watches in the queue (efficient single-lock method)
queue_list = update_q.get_queued_uuids()
@@ -252,7 +252,7 @@ def init_socketio(app, datastore):
def event_checkbox_operations(data):
from changedetectionio.blueprint.ui import _handle_operations
from changedetectionio import queuedWatchMetaData
from changedetectionio import worker_pool
from changedetectionio import worker_handler
from changedetectionio.flask_app import update_q, watch_check_update
import threading
@@ -268,7 +268,7 @@ def init_socketio(app, datastore):
uuids=data.get('uuids'),
datastore=datastore,
extra_data=data.get('extra_data'),
worker_pool=worker_pool,
worker_handler=worker_handler,
update_q=update_q,
queuedWatchMetaData=queuedWatchMetaData,
watch_check_update=watch_check_update,
+2 -6
View File
@@ -10,7 +10,6 @@
set -e
SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
rm tests/logs/* -f
# Since theres no curl installed lets roll with python3
check_sanity() {
@@ -65,21 +64,18 @@ data_sanity_test
echo "-------------------- Running rest of tests in parallel -------------------------------"
# REMOVE_REQUESTS_OLD_SCREENSHOTS disabled so that we can write a screenshot and send it in test_notifications.py without a real browser
FETCH_WORKERS=2 REMOVE_REQUESTS_OLD_SCREENSHOTS=false \
REMOVE_REQUESTS_OLD_SCREENSHOTS=false \
pytest tests/test_*.py \
-n 18 \
-n 30 \
--dist=load \
-vvv \
-s \
--capture=no \
-k "not test_queue_system" \
--log-cli-level=DEBUG \
--log-cli-format="%(asctime)s [%(process)d] [%(levelname)s] %(name)s: %(message)s"
echo "---------------------------- DONE parallel test ---------------------------------------"
FETCH_WORKERS=20 pytest -vvv -s tests/test_queue_handler.py
echo "RUNNING WITH BASE_URL SET"
# Now re-run some tests with BASE_URL enabled
@@ -222,19 +222,6 @@ code {
color: var(--color-white);
background: var(--color-text-watch-tag-list);
@extend .inline-tag;
/* Remove default anchor styling when used as links */
text-decoration: none;
&:hover {
text-decoration: none;
opacity: 0.8;
cursor: pointer;
}
&:visited {
color: var(--color-white);
}
}
@media (min-width: 768px) {
File diff suppressed because it is too large Load Diff
-991
View File
@@ -1,991 +0,0 @@
import shutil
from changedetectionio.strtobool import strtobool
from changedetectionio.validate_url import is_safe_valid_url
from flask import (
flash
)
from flask_babel import gettext
from ..blueprint.rss import RSS_CONTENT_FORMAT_DEFAULT
from ..html_tools import TRANSLATE_WHITESPACE_TABLE
from ..model import App, Watch, USE_SYSTEM_DEFAULT_NOTIFICATION_FORMAT_FOR_WATCH
from copy import deepcopy, copy
from os import path, unlink
from threading import Lock
import json
import os
import re
import secrets
import sys
import threading
import time
import uuid as uuid_builder
from loguru import logger
from blinker import signal
# Try to import orjson for faster JSON serialization
try:
import orjson
HAS_ORJSON = True
except ImportError:
HAS_ORJSON = False
from ..processors import get_custom_watch_obj_for_processor
from ..processors.restock_diff import Restock
# Import the base class and helpers
from .file_saving_datastore import FileSavingDataStore, load_all_watches, save_watch_atomic, save_json_atomic
from .updates import DatastoreUpdatesMixin
from .legacy_loader import has_legacy_datastore
# Because the server runs as a daemon, it won't know the URL to use for notification links when firing off a notification
BASE_URL_NOT_SET_TEXT = '("Base URL" not set - see settings - notifications)'
def dictfilt(x, y):
    """
    Return a new dict holding only the entries of ``x`` whose key appears in ``y``.

    Args:
        x: Mapping to filter.
        y: Iterable of keys to keep (keys missing from ``x`` are ignored).

    Returns:
        dict: ``{key: x[key]}`` for every key of ``x`` that is also in ``y``,
        in the iteration order of ``x``.
    """
    # Build the lookup set once instead of once per key of ``x``
    wanted = set(y)
    return {key: x[key] for key in x if key in wanted}
# Is there an existing library to ensure some data store (JSON etc) is in sync with CRUD methods?
# Open a github issue if you know something :)
# https://stackoverflow.com/questions/6190468/how-to-trigger-function-on-value-change
class ChangeDetectionStore(DatastoreUpdatesMixin, FileSavingDataStore):
__version_check = True
def __init__(self, datastore_path="/datastore", include_default_watches=True, version_tag="0.0.0"):
    """
    Set up the store, back up the settings file for this version, then load state.

    Args:
        datastore_path: Directory holding changedetection.json and the per-watch data.
        include_default_watches: Passed through to reload_state().
        version_tag: Application version string; used for the backup file name
            and passed through to reload_state().
    """
    super().__init__()

    # Runtime bookkeeping
    self.start_time = time.time()
    self.stop_thread = False
    self.needs_write = False

    # Must be set before the backup/load steps below, both of which read it
    self.datastore_path = datastore_path

    self.save_version_copy_json_db(version_tag)
    self.reload_state(
        datastore_path=datastore_path,
        include_default_watches=include_default_watches,
        version_tag=version_tag,
    )
def save_version_copy_json_db(self, version_tag):
    """
    Keep a one-off, version-tagged copy of changedetection.json.

    The copy is written next to the original as ``changedetection-<version>.json``,
    where every run of non-digit characters in ``version_tag`` is replaced by ``-``.
    Nothing happens when changedetection.json does not exist or when a copy
    for this version is already present.

    Args:
        version_tag: Application version string, e.g. "0.50.1".
    """
    settings_json = os.path.join(self.datastore_path, "changedetection.json")
    version_text = re.sub(r'\D+', '-', version_tag)
    backup_json = os.path.join(self.datastore_path, f"changedetection-{version_text}.json")

    if os.path.isfile(backup_json) or not os.path.isfile(settings_json):
        return

    logger.info(f"Backing up changedetection.json due to new version to '{backup_json}'.")
    shutil.copyfile(settings_json, backup_json)
def _load_settings(self):
    """
    Read and parse changedetection.json from the datastore directory.

    Uses orjson when it was importable (file read as bytes), otherwise the
    standard json module (file read as UTF-8 text).

    Returns:
        The parsed contents of changedetection.json.
    """
    settings_path = os.path.join(self.datastore_path, "changedetection.json")
    logger.info(f"Loading settings from {settings_path}")

    if not HAS_ORJSON:
        with open(settings_path, 'r', encoding='utf-8') as fh:
            return json.load(fh)

    with open(settings_path, 'rb') as fh:
        return orjson.loads(fh.read())
def _apply_settings(self, settings_data):
"""
Apply loaded settings data to internal data structure.
Args:
settings_data: Dictionary loaded from changedetection.json
"""
# Apply top-level fields
if 'app_guid' in settings_data:
self.__data['app_guid'] = settings_data['app_guid']
if 'build_sha' in settings_data:
self.__data['build_sha'] = settings_data['build_sha']
if 'version_tag' in settings_data:
self.__data['version_tag'] = settings_data['version_tag']
# Apply settings sections
if 'settings' in settings_data:
if 'headers' in settings_data['settings']:
self.__data['settings']['headers'].update(settings_data['settings']['headers'])
if 'requests' in settings_data['settings']:
self.__data['settings']['requests'].update(settings_data['settings']['requests'])
if 'application' in settings_data['settings']:
self.__data['settings']['application'].update(settings_data['settings']['application'])
def _rehydrate_tags(self):
    """Replace every stored tag with the object returned by rehydrate_entity()."""
    tags = self.__data['settings']['application']['tags']
    for tag_uuid, stored_tag in tags.items():
        # Only existing keys are reassigned, so the dict size is stable while iterating
        tags[tag_uuid] = self.rehydrate_entity(tag_uuid, stored_tag, processor_override='restock_diff')
        logger.info(f"Tag: {tag_uuid} {stored_tag['title']}")
def _load_state(self):
"""
Load complete datastore state from storage.
Orchestrates loading of settings and watches using polymorphic methods.
"""
# Load settings
settings_data = self._load_settings()
self._apply_settings(settings_data)
# Load watches (polymorphic - parent class method)
self._load_watches()
# Rehydrate tags
self._rehydrate_tags()
def reload_state(self, datastore_path, include_default_watches, version_tag):
"""
Load datastore from storage or create new one.
Supports two scenarios:
1. NEW format: changedetection.json exists -> load and run updates if needed
2. EMPTY: No changedetection.json -> create new OR trigger migration from legacy
Note: Legacy url-watches.json migration happens in update_26, not here.

Args:
datastore_path: Directory of the datastore; also stored on self.datastore_path.
include_default_watches: When creating a fresh datastore, add the two default watches.
version_tag: Application version string, stored under 'version_tag'.

Side effects visible in this method: resets the in-memory data structure,
may create changedetection.json, removes 'removepassword.lock' if present
and starts the background save thread.
"""
logger.info(f"Datastore path is '{datastore_path}'")
# CRITICAL: Update datastore_path (was using old path from __init__)
self.datastore_path = datastore_path
# Initialize data structure
self.__data = App.model()
self.json_store_path = os.path.join(self.datastore_path, "changedetection.json")
# Base definition for all watchers (deepcopy part of #569)
self.generic_definition = deepcopy(Watch.model(datastore_path=datastore_path, default={}))
# Load build SHA if available (Docker deployments)
# NOTE: this path is relative to the current working directory, not to the datastore
if path.isfile('changedetectionio/source.txt'):
with open('changedetectionio/source.txt') as f:
self.__data['build_sha'] = f.read()
# Check if datastore already exists
changedetection_json = os.path.join(self.datastore_path, "changedetection.json")
if os.path.exists(changedetection_json):
# Load existing datastore (changedetection.json + watch.json files)
logger.info("Loading existing datastore")
try:
self._load_state()
except Exception as e:
# Logged for visibility, then re-raised unchanged
logger.critical(f"Failed to load datastore: {e}")
raise
# Run schema updates if needed
# Pass current schema version from loaded datastore (defaults to 0 if not set)
current_schema = self.__data['settings']['application'].get('schema_version', 0)
self.run_updates(current_schema_version=current_schema)
else:
# No datastore yet - check if this is a fresh install or legacy migration
# Generate app_guid FIRST (required for all operations)
# GUIDs created while running under pytest are prefixed with "test-"
if "pytest" in sys.modules or "PYTEST_CURRENT_TEST" in os.environ:
self.__data['app_guid'] = "test-" + str(uuid_builder.uuid4())
else:
self.__data['app_guid'] = str(uuid_builder.uuid4())
# Generate RSS access token
self.__data['settings']['application']['rss_access_token'] = secrets.token_hex(16)
# Generate API access token
self.__data['settings']['application']['api_access_token'] = secrets.token_hex(16)
# Check if legacy datastore exists (url-watches.json)
if has_legacy_datastore(self.datastore_path):
# Legacy datastore detected - trigger migration
logger.critical(f"Legacy datastore detected at {self.datastore_path}/url-watches.json")
logger.critical("Migration will be triggered via update_26")
# Load the legacy datastore to get its schema_version
# NOTE(review): load_legacy_format is imported here but not used in this method
from .legacy_loader import load_legacy_format
legacy_path = os.path.join(self.datastore_path, "url-watches.json")
# NOTE(review): this assignment replaces the whole in-memory structure, including the
# app_guid and access tokens generated just above, with the contents of the legacy file
with open(legacy_path) as f:
self.__data = json.load(f)
if not self.__data:
raise Exception("Failed to load legacy datastore from url-watches.json")
# update_26 will load the legacy data again and migrate to new format
# Only run updates AFTER the legacy schema version (e.g., if legacy is at 25, only run 26+)
self.run_updates()
else:
# Fresh install - create new datastore
logger.critical(f"No datastore found, creating new datastore at {self.datastore_path}")
# Set schema version to latest (no updates needed)
# Falls back to 26 when get_updates_available() returns nothing;
# presumably pop() yields the highest update number - confirm against get_updates_available()
updates_available = self.get_updates_available()
self.__data['settings']['application']['schema_version'] = updates_available.pop() if updates_available else 26
# Add default watches if requested
if include_default_watches:
self.add_watch(
url='https://news.ycombinator.com/',
tag='Tech news',
extras={'fetch_backend': 'html_requests'}
)
self.add_watch(
url='https://changedetection.io/CHANGELOG.txt',
tag='changedetection.io',
extras={'fetch_backend': 'html_requests'}
)
# Create changedetection.json immediately
# A failure here is logged only; startup continues
try:
self._save_settings()
logger.info("Created changedetection.json for new datastore")
except Exception as e:
logger.error(f"Failed to create initial changedetection.json: {e}")
# Set version tag
self.__data['version_tag'] = version_tag
# Validate proxies.json if it exists
_ = self.proxy_list # Just to test parsing
# Ensure app_guid exists (for datastores loaded from existing files)
if 'app_guid' not in self.__data:
if "pytest" in sys.modules or "PYTEST_CURRENT_TEST" in os.environ:
self.__data['app_guid'] = "test-" + str(uuid_builder.uuid4())
else:
self.__data['app_guid'] = str(uuid_builder.uuid4())
self.mark_settings_dirty()
# Ensure RSS access token exists
if not self.__data['settings']['application'].get('rss_access_token'):
secret = secrets.token_hex(16)
self.__data['settings']['application']['rss_access_token'] = secret
self.mark_settings_dirty()
# Ensure API access token exists
if not self.__data['settings']['application'].get('api_access_token'):
secret = secrets.token_hex(16)
self.__data['settings']['application']['api_access_token'] = secret
self.mark_settings_dirty()
# Handle password reset lockfile
# If 'removepassword.lock' exists in the datastore directory, clear the password and delete the lockfile
password_reset_lockfile = os.path.join(self.datastore_path, "removepassword.lock")
if path.isfile(password_reset_lockfile):
self.remove_password()
unlink(password_reset_lockfile)
# Start the background save thread
self.start_save_thread()
def rehydrate_entity(self, uuid, entity, processor_override=None):
    """
    Turn a plain stored dict back into its Watch-style object.

    Args:
        uuid: UUID written into ``entity['uuid']``.
        entity: Stored dict; it is modified in place ('uuid', and 'processor'
            when an override is given) before being wrapped.
        processor_override: When set, selects the class and is also written
            into ``entity['processor']``.

    Returns:
        Instance of the class returned by get_custom_watch_obj_for_processor(),
        built with ``default=entity``.
    """
    entity['uuid'] = uuid

    processor_name = processor_override if processor_override else entity.get('processor')
    watch_class = get_custom_watch_obj_for_processor(processor_name)
    if processor_override:
        entity['processor'] = processor_override

    # Only trace the non-default processors
    if entity.get('processor') != 'text_json_diff':
        logger.trace(f"Loading Watch object '{watch_class.__module__}.{watch_class.__name__}' for UUID {uuid}")

    return watch_class(datastore_path=self.datastore_path, default=entity)
# ============================================================================
# FileSavingDataStore Abstract Method Implementations
# ============================================================================
def _watch_exists(self, uuid):
"""Check if watch exists in datastore."""
return uuid in self.__data['watching']
def _get_watch_dict(self, uuid):
"""Get watch as dictionary."""
return dict(self.__data['watching'][uuid])
def _build_settings_data(self):
"""
Build settings data structure for saving.
Returns:
dict: Settings data ready for serialization
"""
return {
'note': 'Settings file - watches are stored in individual {uuid}/watch.json files',
'app_guid': self.__data['app_guid'],
'settings': self.__data['settings'],
'build_sha': self.__data.get('build_sha'),
'version_tag': self.__data.get('version_tag')
}
def _save_settings(self):
    """
    Write the current settings to changedetection.json in the datastore directory.

    Delegates the actual write to the save_json_atomic helper, with
    ``label="settings"`` and ``max_size_mb=10``.

    Raises:
        OSError: If disk is full or other I/O error
    """
    target_path = os.path.join(self.datastore_path, "changedetection.json")
    save_json_atomic(target_path, self._build_settings_data(), label="settings", max_size_mb=10)
def _load_watches(self):
    """
    Load every watch via load_all_watches() and store the result.

    The returned watches replace the in-memory 'watching' table and the
    returned hashes replace ``self._watch_hashes``. Any watch without a hash
    is reported in the log (first five UUIDs shown).
    """
    loaded_watches, loaded_hashes = load_all_watches(
        self.datastore_path,
        self.rehydrate_entity,
        self._compute_hash,
    )
    self.__data['watching'] = loaded_watches
    self._watch_hashes = loaded_hashes

    # Sanity check: every loaded watch should have come back with a hash
    unhashed = [watch_uuid for watch_uuid in loaded_watches if watch_uuid not in loaded_hashes]
    if not unhashed:
        logger.debug(f"All {len(loaded_watches)} watches have valid hashes")
        return
    logger.error(f"WARNING: {len(unhashed)} watches missing hashes after load: {unhashed[:5]}")
def _delete_watch(self, uuid):
"""
Delete a watch from storage.
File backend implementation: deletes entire {uuid}/ directory recursively.
Implementation of abstract method from FileSavingDataStore.
Args:
uuid: Watch UUID to delete
"""
watch_dir = os.path.join(self.datastore_path, uuid)
if os.path.exists(watch_dir):
shutil.rmtree(watch_dir)
logger.info(f"Deleted watch directory: {watch_dir}")
# ============================================================================
# Watch Management Methods
# ============================================================================
def set_last_viewed(self, uuid, timestamp):
    """
    Record when a watch was last viewed and notify listeners.

    Args:
        uuid: Watch UUID.
        timestamp: Time of viewing; stored truncated to an int.
    """
    viewed_at = int(timestamp)
    logger.debug(f"Setting watch UUID: {uuid} last viewed to {viewed_at}")

    self.data['watching'][uuid].update({'last_viewed': viewed_at})
    self.mark_watch_dirty(uuid)

    # Tell anything subscribed to 'watch_check_update' that this watch changed
    update_signal = signal('watch_check_update')
    if update_signal:
        update_signal.send(watch_uuid=uuid)
def remove_password(self):
    """Set the application password to False and flag the settings for saving."""
    app_settings = self.__data['settings']['application']
    app_settings['password'] = False
    self.mark_settings_dirty()
def update_watch(self, uuid, update_obj):
    """
    Merge ``update_obj`` into an existing watch and flag the watch for saving.

    Keys whose default in ``self.generic_definition`` is a dict are merged
    into the watch's existing nested dict; every other key is overwritten.
    The caller's ``update_obj`` is left untouched.

    Args:
        uuid: Watch UUID. Silently ignored when no such watch exists.
        update_obj: Dictionary of fields to update; ``None`` or empty means
            "no field changes".
    """
    # It's possible that the watch could be deleted before update
    if not self.__data['watching'].get(uuid):
        return

    # Shallow copy: nested keys are popped below and must not vanish from the caller's dict
    pending = dict(update_obj) if update_obj else {}

    with self.lock:
        watch = self.__data['watching'][uuid]
        # A plain dict merge (or |=) would replace nested structures wholesale and lose data,
        # so dict-valued fields are merged one level down.
        for dict_key, default_value in self.generic_definition.items():
            if isinstance(default_value, dict) and dict_key in pending:
                watch[dict_key].update(pending.pop(dict_key))
        watch.update(pending)

    self.mark_watch_dirty(uuid)
@property
def threshold_seconds(self):
    """
    Total of the global 'time_between_check' setting.

    Each unit listed in ``Watch.mtable`` that has a truthy configured amount
    contributes ``amount * Watch.mtable[unit]``.
    """
    total = 0
    for unit, factor in Watch.mtable.items():
        amount = self.__data['settings']['requests']['time_between_check'].get(unit)
        if not amount:
            continue
        total += amount * factor
    return total
@property
def unread_changes_count(self):
    """Number of watches with at least two history entries whose ``viewed`` equals False."""
    return sum(
        1
        for watch in self.__data['watching'].values()
        # '== False' is deliberate: a viewed value of None must not count as unread
        if watch.history_n >= 2 and watch.viewed == False
    )
@property
def data(self):
    """
    Return the live in-memory data structure (not a copy).

    Every access also refreshes ``settings.application.active_base_url``:
    the configured 'base_url' if set, else the BASE_URL environment variable,
    else BASE_URL_NOT_SET_TEXT, stripped of surrounding double quotes and spaces.
    """
    app_settings = self.__data['settings']['application']

    # Re #152, fall back to the env BASE_URL if not overridden in settings.
    # Re #148 - some people have just {{ base_url }} in the body or title, which may break some
    # notification services like 'Join', so always provide at least something obvious.
    active_base_url = app_settings.get('base_url') or os.getenv('BASE_URL') or BASE_URL_NOT_SET_TEXT

    app_settings['active_base_url'] = active_base_url.strip('" ')
    return self.__data
# Delete a single watch by UUID, or every watch when uuid == 'all'
def delete(self, uuid):
"""
Delete a watch by UUID.
Uses abstracted storage method for backend-agnostic deletion.
Supports 'all' to delete all watches (mainly for testing).

For each deleted watch: its storage is removed via _delete_watch() (a failure
there is logged and does not stop the deletion), its entries in
_watch_hashes / _dirty_watches are dropped and the 'watch_deleted' signal is sent.
Afterwards needs_write_urgent is set to True.

Args:
uuid: Watch UUID to delete, or 'all' to delete all watches
"""
with self.lock:
if uuid == 'all':
# Delete all watches - capture UUIDs first before modifying dict
all_uuids = list(self.__data['watching'].keys())
for watch_uuid in all_uuids:
# Delete from storage using polymorphic method
try:
self._delete_watch(watch_uuid)
except Exception as e:
logger.error(f"Failed to delete watch {watch_uuid} from storage: {e}")
# Clean up tracking data
self._watch_hashes.pop(watch_uuid, None)
self._dirty_watches.discard(watch_uuid)
# Send delete signal
watch_delete_signal = signal('watch_deleted')
if watch_delete_signal:
watch_delete_signal.send(watch_uuid=watch_uuid)
# Clear the dict
self.__data['watching'] = {}
# Mainly used for testing to allow all items to flush before running next test
# NOTE: this blocks the caller for one second on every 'all' delete
time.sleep(1)
else:
# Delete single watch from storage using polymorphic method
try:
self._delete_watch(uuid)
except Exception as e:
logger.error(f"Failed to delete watch {uuid} from storage: {e}")
# Remove from watching dict
# NOTE(review): no existence check - presumably raises KeyError for an unknown uuid, after the storage delete above was already attempted
del self.data['watching'][uuid]
# Clean up tracking data
self._watch_hashes.pop(uuid, None)
self._dirty_watches.discard(uuid)
# Send delete signal
watch_delete_signal = signal('watch_deleted')
if watch_delete_signal:
watch_delete_signal.send(watch_uuid=uuid)
self.needs_write_urgent = True
# Clone a watch by UUID
def clone(self, uuid):
    """
    Create a new watch from a deep copy of an existing one.

    Args:
        uuid: UUID of the watch to copy. Raises KeyError if it does not exist.

    Returns:
        Whatever add_watch() returns: the UUID of the new watch, or its
        falsy failure value when the watch could not be added.
    """
    source_watch = self.data['watching'][uuid]
    # deepcopy so the clone never shares nested structures with the source watch
    return self.add_watch(url=source_watch.get('url'), extras=deepcopy(source_watch))
def url_exists(self, url):
    """Return True when any watch has this URL, compared case-insensitively."""
    # Linear scan over all watches; a URL-keyed index would avoid this
    return any(
        watch['url'].lower() == url.lower()
        for watch in self.data['watching'].values()
    )
# Remove a watch's data but keep the entry (URL etc)
def clear_watch_history(self, uuid):
    """Call clear_watch() on the watch and request an urgent write."""
    watch = self.__data['watching'][uuid]
    watch.clear_watch()
    self.needs_write_urgent = True
def add_watch(self, url, tag='', extras=None, tag_uuids=None, save_immediately=True):
    """
    Create a new watch and register it in the datastore.

    Args:
        url: URL to watch. A ``https://changedetection.io/share/`` link is
            fetched first and its permitted attributes are applied.
        tag: Comma-separated tag names; each is created (or looked up) via add_tag().
        extras: Extra watch attributes. Deep-copied, never modified.
        tag_uuids: Tag UUIDs to attach directly.
        save_immediately: Save the watch right away instead of only marking it dirty.

    Returns:
        The UUID of the new watch on success.
        False when the metadata of a share link could not be fetched.
        None when the URL is rejected or PAGE_WATCH_LIMIT is reached.
    """
    if extras is None:
        extras = {}

    # Incase these are copied across, assume it's a reference and deepcopy()
    apply_extras = deepcopy(extras)
    apply_extras['tags'] = apply_extras.get('tags') or []

    # Was it a share link? try to fetch the data
    if url.startswith("https://changedetection.io/share/"):
        import requests
        try:
            r = requests.request(method="GET",
                                 url=url,
                                 # So we know to return the JSON instead of the human-friendly "help" page
                                 headers={'App-Guid': self.__data['app_guid']},
                                 timeout=5.0)  # 5 second timeout to prevent blocking
            res = r.json()

            # List of permissible attributes we accept from the wild internet
            for k in [
                'body',
                'browser_steps',
                'css_filter',
                'extract_text',
                'headers',
                'ignore_text',
                'include_filters',
                'method',
                'paused',
                'previous_md5',
                'processor',
                'subtractive_selectors',
                'tag',
                'tags',
                'text_should_not_be_present',
                'title',
                'trigger_text',
                'url',
                'use_page_title_in_list',
                'webdriver_js_execute_code',
            ]:
                if res.get(k):
                    if k != 'css_filter':
                        apply_extras[k] = res[k]
                    else:
                        # We renamed the field and made it a list
                        apply_extras['include_filters'] = [res['css_filter']]

        except Exception as e:
            logger.error(f"Error fetching metadata for shared watch link {url} {str(e)}")
            flash(gettext("Error fetching metadata for {}").format(url), 'error')
            return False

    if not is_safe_valid_url(url):
        flash(gettext('Watch protocol is not permitted or invalid URL format'), 'error')
        return None

    # Check PAGE_WATCH_LIMIT if set
    page_watch_limit = os.getenv('PAGE_WATCH_LIMIT')
    if page_watch_limit:
        try:
            page_watch_limit = int(page_watch_limit)
            current_watch_count = len(self.__data['watching'])
            if current_watch_count >= page_watch_limit:
                logger.error(f"Watch limit reached: {current_watch_count}/{page_watch_limit} watches. Cannot add {url}")
                flash(gettext("Watch limit reached ({}/{} watches). Cannot add more watches.").format(current_watch_count, page_watch_limit), 'error')
                return None
        except ValueError:
            logger.warning(f"Invalid PAGE_WATCH_LIMIT value: {page_watch_limit}, ignoring limit check")

    if tag and isinstance(tag, str):
        # Then it's probably a string of the actual tag by name, split and add it
        for tag_title in tag.split(','):
            tag_uuid = self.add_tag(tag_title)
            # add_tag() returns False for a blank name (e.g. "a,,b" or a trailing comma);
            # that must never end up in the watch's list of tag UUIDs
            if tag_uuid:
                apply_extras['tags'].append(tag_uuid)

    # Or if UUIDs given directly
    if tag_uuids:
        for t in tag_uuids:
            apply_extras['tags'] = list(set(apply_extras['tags'] + [t.strip()]))

    # Make any uuids unique
    if apply_extras.get('tags'):
        apply_extras['tags'] = list(set(apply_extras.get('tags')))

    # If the processor also has its own Watch implementation
    watch_class = get_custom_watch_obj_for_processor(apply_extras.get('processor'))
    new_watch = watch_class(datastore_path=self.datastore_path, url=url)

    new_uuid = new_watch.get('uuid')
    logger.debug(f"Adding URL '{url}' - {new_uuid}")

    # Per-watch state must never be inherited from the extras (e.g. when cloning)
    for k in ['uuid', 'history', 'last_checked', 'last_changed', 'newest_history_key', 'previous_md5', 'viewed']:
        if k in apply_extras:
            del apply_extras[k]

    if not apply_extras.get('date_created'):
        apply_extras['date_created'] = int(time.time())

    new_watch.update(apply_extras)
    new_watch.ensure_data_dir_exists()
    self.__data['watching'][new_uuid] = new_watch

    if save_immediately:
        # Save immediately using polymorphic method
        try:
            self.save_watch(new_uuid, force=True)
            logger.debug(f"Saved new watch {new_uuid}")
        except Exception as e:
            logger.error(f"Failed to save new watch {new_uuid}: {e}")
            # Mark dirty for retry
            self.mark_watch_dirty(new_uuid)
    else:
        self.mark_watch_dirty(new_uuid)

    logger.debug(f"Added '{url}'")
    return new_uuid
def _watch_resource_exists(self, watch_uuid, resource_name):
"""
Check if a watch-related resource exists.
File backend implementation: checks if file exists in watch directory.
Args:
watch_uuid: Watch UUID
resource_name: Name of resource (e.g., "last-screenshot.png")
Returns:
bool: True if resource exists
"""
resource_path = os.path.join(self.datastore_path, watch_uuid, resource_name)
return path.isfile(resource_path)
def visualselector_data_is_ready(self, watch_uuid):
    """
    Check if visual selector data (screenshot + elements) is ready.

    Returns:
        bool: True if both "last-screenshot.png" and "elements.deflate" exist for the watch
    """
    required_resources = ("last-screenshot.png", "elements.deflate")
    # Build the full list first so both resources are always checked
    present = [self._watch_resource_exists(watch_uuid, name) for name in required_resources]
    return all(present)
# Old sync_to_json and save_datastore methods removed - now handled by FileSavingDataStore parent class
# Go through the datastore path and remove any snapshots that are not mentioned in the index
# This usually is not used, but can be handy.
def remove_unused_snapshots(self):
    """
    Delete ``*.txt`` files under a watch's directory that no history entry refers to.

    A file is kept when its path, as a string, equals one of the values in
    any watch's ``history``; every other match of ``<uuid>/*.txt`` below the
    datastore path is unlinked.
    """
    import pathlib

    logger.info("Removing snapshots from datastore that are not in the index..")

    watches = self.data['watching']

    # Everything the history index still refers to; a set keeps the per-file check O(1).
    # Assumes history values are hashable (they are compared against str paths) - confirm
    indexed = set()
    for watch_uuid in watches:
        history = watches[watch_uuid].history
        for history_key in history:
            indexed.add(history[str(history_key)])

    # Only in the sub-directories
    datastore_root = pathlib.Path(self.datastore_path)
    for watch_uuid in watches:
        for item in datastore_root.rglob(watch_uuid + "/*.txt"):
            if str(item) not in indexed:
                logger.info(f"Removing {item}")
                unlink(item)
@property
def proxy_list(self):
    """
    Build the table of available proxies, keyed by proxy id.

    Sources, in order: ``proxies.json`` in the datastore directory (if present),
    then the 'extra_proxies' entries from the request settings that have both
    a name and a URL. A "no-proxy" entry is appended when at least one proxy
    exists and ENABLE_NO_PROXY_OPTION (default 'True') is enabled.

    Returns:
        dict of ``{key: {'label': ..., 'url': ...}}``, or None when there are no proxies.
    """
    proxies = {}
    proxies_json = os.path.join(self.datastore_path, 'proxies.json')

    # Load from external config file
    if path.isfile(proxies_json):
        if HAS_ORJSON:
            # orjson.loads() expects UTF-8 encoded bytes #3611
            with open(proxies_json, 'rb') as f:
                proxies = orjson.loads(f.read())
        else:
            with open(proxies_json, encoding='utf-8') as f:
                proxies = json.load(f)

    # Mapping from UI config if available
    ui_proxies = self.data['settings']['requests'].get('extra_proxies')
    if ui_proxies:
        # NOTE(review): the counter in the key is never advanced (it was incremented by 0),
        # so every UI proxy key is "ui-0<name>". An increment of 1 looks intended, but changing
        # it would rename keys that a watch's 'proxy' setting is matched against elsewhere.
        ui_index = 0
        for entry in ui_proxies:
            if entry.get('proxy_name') and entry.get('proxy_url'):
                key = "ui-" + str(ui_index) + entry.get('proxy_name')
                proxies[key] = {'label': entry.get('proxy_name'), 'url': entry.get('proxy_url')}

    if proxies and strtobool(os.getenv('ENABLE_NO_PROXY_OPTION', 'True')):
        proxies["no-proxy"] = {'label': "No proxy", 'url': ''}

    if not len(proxies):
        return None
    return proxies
def get_preferred_proxy_for_watch(self, uuid):
    """
    Returns the preferred proxy by ID key

    Resolution order: the watch's own 'proxy' setting if it names an available
    proxy, then the system-wide 'proxy' request setting, then the first
    available proxy. An unknown watch UUID is treated as a watch without a
    proxy preference.

    :param uuid: UUID
    :return: proxy "key" id, or None when no proxies are configured or the
             watch explicitly selected "no-proxy" (and that option is enabled)
    """
    # proxy_list is rebuilt (and proxies.json re-read) on every access, so read it once
    proxies = self.proxy_list
    if proxies is None:
        return None

    watch = self.data['watching'].get(uuid)
    watch_proxy = watch.get('proxy') if watch else None

    if strtobool(os.getenv('ENABLE_NO_PROXY_OPTION', 'True')) and watch_proxy == "no-proxy":
        return None

    # If it's a valid one
    if watch_proxy and watch_proxy in proxies:
        return watch_proxy

    # not valid (including None), try the system one
    system_proxy_id = self.data['settings']['requests'].get('proxy')
    # Is not None and exists
    if proxies.get(system_proxy_id):
        return system_proxy_id

    # Fallback - Did not resolve anything, or doesnt exist, use the first available
    return next(iter(proxies))
@property
def has_extra_headers_file(self):
    """True when a ``headers.txt`` file exists directly in the datastore directory."""
    return os.path.isfile(os.path.join(self.datastore_path, 'headers.txt'))
def get_all_base_headers(self):
    """Return a new dict with the global request headers from the app settings (empty if none)."""
    # Global app settings
    return dict(self.data['settings'].get('headers', {}))
def get_all_headers_in_textfile_for_watch(self, uuid):
    """
    Collect headers from the optional headers text files that apply to a watch.

    Files are read in this order, later files overriding earlier keys:
    ``{datastore}/headers.txt``, then ``{watch data dir}/headers.txt``, then
    ``{datastore}/headers-<tagtitle>.txt`` for each tag of the watch, where the
    tag title has every non-word character and underscore removed and is
    lower-cased. Unreadable files are logged and skipped.

    Returns:
        dict: merged headers (empty when no file exists)
    """
    from ..model.App import parse_headers_from_text_file

    collected = {}

    def merge_from(filepath):
        # Best effort: a broken file must not prevent the other files from applying
        try:
            if os.path.isfile(filepath):
                collected.update(parse_headers_from_text_file(filepath))
        except Exception as e:
            logger.error(f"ERROR reading headers.txt at {filepath} {str(e)}")

    # Global in /datastore/headers.txt
    merge_from(os.path.join(self.datastore_path, 'headers.txt'))

    watch = self.data['watching'].get(uuid)
    if watch:
        # In /datastore/xyz-xyz/headers.txt
        merge_from(os.path.join(watch.watch_data_dir, 'headers.txt'))

        # In /datastore/tag-name.txt
        for tag in self.get_all_tags_for_watch(uuid=uuid).values():
            fname = "headers-" + re.sub(r'[\W_]', '', tag.get('title')).lower().strip() + ".txt"
            merge_from(os.path.join(self.datastore_path, fname))

    return collected
def get_tag_overrides_for_watch(self, uuid, attr):
    """
    Concatenate the values of ``attr`` from every tag attached to a watch.

    Tags that lack ``attr`` or hold a falsy value for it are skipped.

    Returns:
        list: all items of each tag's ``attr`` value, in tag order
    """
    collected = []
    tags = self.get_all_tags_for_watch(uuid=uuid)
    if not tags:
        return collected

    for tag in tags.values():
        if attr in tag and tag[attr]:
            collected.extend(tag[attr])
    return collected
def add_tag(self, title):
    """
    Return the UUID of the tag with this title, creating the tag if needed.

    Titles are compared stripped and case-insensitively.

    Returns:
        The tag UUID, or False when the title is blank.
    """
    wanted = title.strip().lower()
    logger.debug(f">>> Adding new tag - '{wanted}'")
    if not wanted:
        return False

    # If name exists, return that
    existing_tags = self.__data['settings']['application'].get('tags', {})
    for tag_uuid, tag in existing_tags.items():
        if tag.get('title', '').lower().strip() == wanted:
            logger.warning(f"Tag '{title}' already exists, skipping creation.")
            return tag_uuid

    # Eventually almost everything todo with a watch will apply as a Tag
    # So we use the same model as a Watch
    with self.lock:
        from ..model import Tag
        tag_defaults = {
            'title': title.strip(),
            'date_created': int(time.time())
        }
        new_tag = Tag.model(datastore_path=self.datastore_path, default=tag_defaults)
        new_uuid = new_tag.get('uuid')
        self.__data['settings']['application']['tags'][new_uuid] = new_tag

    self.mark_settings_dirty()
    return new_uuid
def get_all_tags_for_watch(self, uuid):
    """
    Return ``{tag_uuid: tag}`` for every known tag listed in the watch's 'tags'.

    This should be in Watch model but Watch doesn't have access to datastore, not sure how to solve that yet.

    Returns:
        dict: full tag info keyed by UUID; empty when the watch does not exist
    """
    watch = self.data['watching'].get(uuid)
    if not watch:
        return {}

    all_tags = self.__data['settings']['application']['tags']
    return dictfilt(all_tags, watch.get('tags', []))
@property
def extra_browsers(self):
    """
    List the configured extra browsers as ``(id, name)`` tuples.

    Only entries with both a 'browser_name' and a 'browser_connection_url'
    are included; the id is the name prefixed with "extra_browser_".
    """
    configured = self.__data['settings']['requests'].get('extra_browsers', [])
    return [
        ("extra_browser_" + browser['browser_name'], browser['browser_name'])
        for browser in configured
        if browser.get('browser_name') and browser.get('browser_connection_url')
    ]
def tag_exists_by_name(self, tag_name):
    """
    Find a tag by title, compared case-insensitively.

    Returns:
        The first tag whose 'title' matches ``tag_name``, or None.
    """
    for tag in self.__data['settings']['application']['tags'].values():
        if tag.get('title', '').lower() == tag_name.lower():
            return tag
    return None
def any_watches_have_processor_by_name(self, processor_name):
    """Return True when at least one watch has 'processor' equal to ``processor_name``."""
    return any(
        watch.get('processor') == processor_name
        for watch in self.data['watching'].values()
    )
def search_watches_for_url(self, query, tag_limit=None, partial=False):
    """Search watches by URL, title, or error messages

    The query is lower-cased and stripped before comparison; watch values are
    lower-cased only.

    Args:
        query (str): Search term to match against watch URLs, titles, and error messages
        tag_limit (str, optional): Optional tag name to limit search results.
            A name that matches no existing tag yields no results.
        partial: (bool, optional): sub-string matching

    Returns:
        list: List of UUIDs of watches that match the search criteria
    """
    query = query.lower().strip()

    required_tag_uuid = None
    if tag_limit:
        tag = self.tag_exists_by_name(tag_limit)
        if tag is None:
            # No watch can carry a tag that does not exist
            return []
        required_tag_uuid = tag.get('uuid')

    def is_match(candidate):
        return query in candidate if partial else query == candidate

    matching_uuids = []
    for uuid, watch in self.data['watching'].items():
        # Filter by tag if requested
        if tag_limit and required_tag_uuid not in watch.get('tags', []):
            continue

        # Search in URL, title, or error messages
        title = watch.get('title')
        last_error = watch.get('last_error')
        if ((title and is_match(title.lower()))
                or is_match(watch.get('url', '').lower())
                or (last_error and is_match(last_error.lower()))):
            matching_uuids.append(uuid)

    return matching_uuids
def get_unique_notification_tokens_available(self):
    """
    Merge the extra notification token values offered by the watches.

    Only the first watch seen for each distinct 'processor' value is asked.

    Returns:
        dict: combined result of each asked watch's extra_notification_token_values()
    """
    tokens = {}
    processors_seen = set()
    for watch in self.__data['watching'].values():
        processor = watch.get('processor')
        if processor in processors_seen:
            continue
        tokens.update(watch.extra_notification_token_values())
        processors_seen.add(processor)
    return tokens
def get_unique_notification_token_placeholders_available(self):
    """
    Collect the descriptions of the extra notification tokens offered by the watches.

    Mirrors get_unique_notification_tokens_available(): only the first watch
    seen for each distinct 'processor' value is asked.

    Returns:
        list: concatenated results of each asked watch's extra_notification_token_placeholder_info()
    """
    placeholders = []
    processors_seen = set()
    for watch in self.__data['watching'].values():
        processor = watch.get('processor')
        if processor in processors_seen:
            continue
        placeholders += watch.extra_notification_token_placeholder_info()
        processors_seen.add(processor)
    return placeholders
def add_notification_url(self, notification_url):
    """
    Add a notification URL to the application settings if it is not already present.

    Args:
        notification_url: The notification URL to register

    Returns:
        The notification_url that was passed in (whether or not it was newly added)
    """
    logger.debug(f">>> Adding new notification_url - '{notification_url}'")

    # Cheap unlocked pre-check so the common "already there" case never takes the lock
    app_settings = self.data['settings']['application']
    if notification_url in app_settings.get('notification_urls', []):
        return notification_url

    with self.lock:
        locked_settings = self.__data['settings']['application']
        current_urls = locked_settings.get('notification_urls', [])

        # Re-check under the lock in case another thread added it meanwhile
        if notification_url in current_urls:
            return notification_url

        # Append and update the datastore
        current_urls.append(notification_url)
        locked_settings['notification_urls'] = current_urls

    # NOTE(review): invoked after the lock is released - mark_settings_dirty()
    # may take self.lock itself, which would deadlock on a non-reentrant Lock.
    self.mark_settings_dirty()
    return notification_url
# Schema update methods moved to store/updates.py (DatastoreUpdatesMixin)
# This includes: get_updates_available(), run_updates(), and update_1() through update_26()
-100
View File
@@ -1,100 +0,0 @@
"""
Base classes for the datastore.
This module defines the abstract interfaces that all datastore implementations must follow.
"""
from abc import ABC, abstractmethod
from threading import Lock
from loguru import logger
class DataStore(ABC):
    """
    Abstract interface shared by every datastore implementation.

    Concrete datastores must provide:
    - Loading and saving of data
    - Watch management (add / update / delete)
    - Access to the underlying data structure
    - A forced, synchronous save
    """

    # Declared at class level, so it is shared unless an instance/subclass rebinds it
    lock = Lock()
    datastore_path = None

    @abstractmethod
    def reload_state(self, datastore_path, include_default_watches, version_tag):
        """
        Load data from persistent storage.

        Args:
            datastore_path: Path to the datastore directory
            include_default_watches: Whether to create default watches if none exist
            version_tag: Application version string
        """

    @abstractmethod
    def add_watch(self, url, **kwargs):
        """
        Add a new watch.

        Args:
            url: URL to watch
            **kwargs: Additional watch parameters

        Returns:
            UUID of the created watch
        """

    @abstractmethod
    def update_watch(self, uuid, update_obj):
        """
        Update an existing watch.

        Args:
            uuid: Watch UUID
            update_obj: Dictionary of fields to update
        """

    @abstractmethod
    def delete(self, uuid):
        """
        Delete a watch.

        Args:
            uuid: Watch UUID to delete
        """

    @property
    @abstractmethod
    def data(self):
        """
        Access to the underlying data structure.

        Returns:
            Dictionary containing all datastore data
        """

    @abstractmethod
    def force_save_all(self):
        """
        Force an immediate, synchronous save of all data to storage.

        Each backend decides how to do this, for example:
        - File backend: Mark all watches/settings dirty, then save
        - Redis backend: SAVE command or pipeline flush
        - SQL backend: COMMIT transaction

        Used by:
        - Backup creation (ensure everything is saved before backup)
        - Shutdown (ensure all changes are persisted)
        - Manual save operations
        """
@@ -1,898 +0,0 @@
"""
File-based datastore with individual watch persistence and dirty tracking.
This module provides the FileSavingDataStore abstract class that implements:
- Individual watch.json file persistence
- Hash-based change detection (only save what changed)
- Periodic audit scan (catches unmarked changes)
- Background save thread with batched parallel saves
- Atomic file writes safe for NFS/NAS
"""
import glob
import hashlib
import json
import os
import tempfile
import time
from concurrent.futures import ThreadPoolExecutor, as_completed
from threading import Thread
from loguru import logger
from .base import DataStore
from .. import strtobool
# Try to import orjson for faster JSON serialization
try:
import orjson
HAS_ORJSON = True
except ImportError:
HAS_ORJSON = False
# Fsync configuration: Force file data to disk for crash safety
# Default False to match legacy behavior (write-and-rename without fsync)
# Set to True for mission-critical deployments requiring crash consistency
FORCE_FSYNC_DATA_IS_CRITICAL = bool(strtobool(os.getenv('FORCE_FSYNC_DATA_IS_CRITICAL', 'False')))
# Save interval configuration: How often the background thread saves dirty items
# Default 10 seconds - increase for less frequent saves, decrease for more frequent
DATASTORE_SCAN_DIRTY_SAVE_INTERVAL_SECONDS = int(os.getenv('DATASTORE_SCAN_DIRTY_SAVE_INTERVAL_SECONDS', '10'))
# Rolling audit configuration: Scans a fraction of watches each cycle
# Default: Run audit every 10s, split into 5 shards
# Full audit completes every 50s (10s × 5 shards)
# With 56k watches: 56k / 5 = ~11k watches per cycle (~60ms vs 316ms for all)
# Handles dynamic watch count - recalculates shard boundaries each cycle
DATASTORE_AUDIT_INTERVAL_SECONDS = int(os.getenv('DATASTORE_AUDIT_INTERVAL_SECONDS', '10'))
DATASTORE_AUDIT_SHARDS = int(os.getenv('DATASTORE_AUDIT_SHARDS', '5'))
# ============================================================================
# Helper Functions for Atomic File Operations
# ============================================================================
def save_json_atomic(file_path, data_dict, label="file", max_size_mb=10):
    """
    Save JSON data to disk using an atomic write pattern.

    Generic helper for saving any JSON data (settings, watches, etc.) with:
    - Atomic write (temp file in the same directory + os.replace)
    - Optional file fsync (only when FORCE_FSYNC_DATA_IS_CRITICAL is set)
    - Directory fsync, only when the target file did not exist beforehand
    - Size validation before anything is written
    - Temp file cleanup on any failure

    Args:
        file_path: Full path to target JSON file
        data_dict: Dictionary to serialize
        label: Human-readable label for error messages (e.g., "watch", "settings")
        max_size_mb: Maximum allowed file size in MB

    Raises:
        ValueError: If serialized data exceeds max_size_mb
        OSError: If disk is full (ENOSPC), quota exceeded (EDQUOT) or other I/O error
    """
    import errno  # local import so this block does not depend on a file-level import

    # Check if file already exists (before we start writing)
    # Directory fsync only needed for NEW files to persist the filename
    file_exists = os.path.exists(file_path)

    # A bare filename has an empty dirname; os.makedirs('') would raise, so use '.'
    parent_dir = os.path.dirname(file_path) or '.'
    os.makedirs(parent_dir, exist_ok=True)

    # Serialize and validate BEFORE creating the temp file, so a serialization
    # or size failure never leaves anything on disk to clean up.
    if HAS_ORJSON:
        data = orjson.dumps(data_dict, option=orjson.OPT_INDENT_2)
    else:
        data = json.dumps(data_dict, indent=2, ensure_ascii=False).encode('utf-8')

    max_size = max_size_mb * 1024 * 1024
    data_size = len(data)
    if data_size > max_size:
        raise ValueError(
            f"{label.capitalize()} data is unexpectedly large: {data_size / 1024 / 1024:.2f}MB "
            f"(max: {max_size_mb}MB). This indicates a bug or data corruption."
        )

    # Create temp file in same directory (required for NFS atomicity)
    fd, temp_path = tempfile.mkstemp(
        suffix='.tmp',
        prefix='json-',
        dir=parent_dir,
        text=False
    )
    fd_closed = False

    try:
        # os.write() may write fewer bytes than requested; loop until the whole
        # payload is on the descriptor so a truncated file is never renamed into place.
        view = memoryview(data)
        written = 0
        while written < data_size:
            written += os.write(fd, view[written:])

        # Optional fsync: Force file data to disk for crash safety
        # Only if FORCE_FSYNC_DATA_IS_CRITICAL=True (default: False, matches legacy behavior)
        if FORCE_FSYNC_DATA_IS_CRITICAL:
            os.fsync(fd)

        os.close(fd)
        fd_closed = True

        # Atomic rename
        os.replace(temp_path, file_path)

        # Sync directory to ensure filename metadata is durable.
        # Only needed for NEW files - existing files already have a directory entry.
        if not file_exists:
            try:
                dir_fd = os.open(parent_dir, os.O_RDONLY)
                try:
                    os.fsync(dir_fd)
                finally:
                    os.close(dir_fd)
            except (OSError, AttributeError):
                # Windows doesn't support fsync on directories
                pass

    except Exception as e:
        # Best-effort cleanup of the descriptor and temp file; cleanup failures
        # must not mask the original error.
        if not fd_closed:
            try:
                os.close(fd)
            except OSError:
                pass
        try:
            os.unlink(temp_path)
        except OSError:
            # Already gone (e.g. the rename succeeded) or cannot be removed
            pass

        if not isinstance(e, OSError):
            raise

        # Provide helpful error messages
        if e.errno == errno.ENOSPC:
            raise OSError(f"Disk full: Cannot save {label}") from e
        elif e.errno == getattr(errno, 'EDQUOT', 122):
            raise OSError(f"Disk quota exceeded: Cannot save {label}") from e
        else:
            raise OSError(f"I/O error saving {label}: {e}") from e
def save_watch_atomic(watch_dir, uuid, watch_dict):
    """
    Persist one watch as "watch.json" inside its own directory, atomically.

    Thin convenience wrapper that delegates to save_json_atomic().

    Args:
        watch_dir: Directory for this watch (e.g., /datastore/{uuid})
        uuid: Watch UUID (only used to label error messages)
        watch_dict: Dictionary representation of the watch

    Raises:
        ValueError: If serialized data exceeds 10MB (indicates bug or corruption)
        OSError: If disk is full (ENOSPC) or other I/O error
    """
    target_path = os.path.join(watch_dir, "watch.json")
    save_json_atomic(
        target_path,
        watch_dict,
        label=f"watch {uuid}",
        max_size_mb=10,
    )
def load_watch_from_file(watch_json, uuid, rehydrate_entity_func):
    """
    Read a single watch.json file and turn it into a Watch object.

    Args:
        watch_json: Path to the watch.json file
        uuid: Watch UUID
        rehydrate_entity_func: Function to convert dict to Watch object

    Returns:
        Tuple of (Watch object, raw_data_dict), or (None, None) when the file is
        too large, missing, unparseable, or any other error occurs while loading.
    """
    size_limit = 10 * 1024 * 1024  # 10MB

    def _parse_failure_message(err):
        # Shared text for both JSON-parse failure handlers below
        return (
            f"CORRUPTED WATCH DATA: Failed to parse JSON for watch {uuid}. "
            f"File: {watch_json}. Error: {err}. "
            f"Watch will be skipped and may need manual recovery from backup."
        )

    try:
        # Refuse to read files that are implausibly large
        bytes_on_disk = os.path.getsize(watch_json)
        if bytes_on_disk > size_limit:
            logger.critical(
                f"CORRUPTED WATCH DATA: Watch {uuid} file is unexpectedly large: "
                f"{bytes_on_disk / 1024 / 1024:.2f}MB (max: {size_limit / 1024 / 1024}MB). "
                f"File: {watch_json}. This indicates a bug or data corruption. "
                f"Watch will be skipped."
            )
            return None, None

        if not HAS_ORJSON:
            with open(watch_json, 'r', encoding='utf-8') as fh:
                watch_data = json.load(fh)
        else:
            with open(watch_json, 'rb') as fh:
                watch_data = orjson.loads(fh.read())

        # Drop these two keys when they hold a truthy value
        for stale_key in ('time_schedule_limit', 'time_between_check'):
            if watch_data.get(stale_key):
                del watch_data[stale_key]

        # Hand back the raw dict alongside the rehydrated watch
        return rehydrate_entity_func(uuid, watch_data), watch_data

    except json.JSONDecodeError as e:
        logger.critical(_parse_failure_message(e))
        return None, None
    except ValueError as e:
        # orjson raises ValueError for invalid JSON
        if HAS_ORJSON or "invalid json" in str(e).lower():
            logger.critical(_parse_failure_message(e))
            return None, None
        # Not a JSON parsing problem - let it propagate
        raise
    except FileNotFoundError:
        logger.error(f"Watch file not found: {watch_json} for watch {uuid}")
        return None, None
    except Exception as e:
        logger.error(f"Failed to load watch {uuid} from {watch_json}: {e}")
        return None, None
def load_all_watches(datastore_path, rehydrate_entity_func, compute_hash_func):
    """
    Load all watches from individual watch.json files.

    SYNCHRONOUS loading: returns only after every watch.json found under
    datastore_path has been processed. Progress is logged every 100 watches.

    Args:
        datastore_path: Path to the datastore directory
        rehydrate_entity_func: Function to convert dict to Watch object
        compute_hash_func: Function to compute a hash from a watch dict

    Returns:
        Tuple of (watching_dict, hashes_dict)
        - watching_dict: uuid -> Watch object
        - hashes_dict: uuid -> hash string (computed from dict(watch))
    """
    start_time = time.time()
    logger.info("Loading watches from individual watch.json files...")

    watching = {}
    watch_hashes = {}

    if not os.path.exists(datastore_path):
        return watching, watch_hashes

    # Find all watch.json files: {datastore_path}/{uuid}/watch.json
    glob_start = time.time()
    watch_files = glob.glob(os.path.join(datastore_path, "*", "watch.json"))
    glob_time = time.time() - glob_start

    total = len(watch_files)
    logger.debug(f"Found {total} watch.json files in {glob_time:.3f}s")

    loaded = 0
    failed = 0

    for watch_json in watch_files:
        # Extract UUID from path: /datastore/{uuid}/watch.json
        uuid_dir = os.path.basename(os.path.dirname(watch_json))
        watch, raw_data = load_watch_from_file(watch_json, uuid_dir, rehydrate_entity_func)

        if watch and raw_data:
            watching[uuid_dir] = watch
            # Hash the rehydrated Watch (as dict) so it matches what save/audit
            # later compute from dict(watch)
            watch_hashes[uuid_dir] = compute_hash_func(dict(watch))
            loaded += 1

            if loaded % 100 == 0:
                logger.info(f"Loaded {loaded}/{total} watches...")
        else:
            failed += 1

    elapsed = time.time() - start_time
    # The clock may not advance at all on coarse timers (e.g. an empty datastore),
    # so guard the rate calculation instead of raising ZeroDivisionError at startup.
    watches_per_sec = (loaded / elapsed) if elapsed > 0 else 0.0

    if failed > 0:
        logger.critical(
            f"LOAD COMPLETE: {loaded} watches loaded successfully, "
            f"{failed} watches FAILED to load (corrupted or invalid) "
            f"in {elapsed:.2f}s ({watches_per_sec:.0f} watches/sec)"
        )
    else:
        logger.info(f"Loaded {loaded} watches from disk in {elapsed:.2f}s ({watches_per_sec:.0f} watches/sec)")

    return watching, watch_hashes
# ============================================================================
# FileSavingDataStore Class
# ============================================================================
class FileSavingDataStore(DataStore):
"""
Abstract datastore that provides file persistence with change tracking.
Features:
- Individual watch.json files (one per watch)
- Dirty tracking: Only saves items that have changed
- Hash-based change detection: Prevents unnecessary writes
- Background save thread: Non-blocking persistence
- Two-tier urgency: Standard (every DATASTORE_SCAN_DIRTY_SAVE_INTERVAL_SECONDS, default 10s) and urgent (immediate) saves
Subclasses must implement:
- rehydrate_entity(): Convert dict to Watch object
- Access to internal __data structure for watch management
"""
needs_write = False
needs_write_urgent = False
stop_thread = False
# Change tracking
_dirty_watches = set() # Watch UUIDs that need saving
_dirty_settings = False # Settings changed
_watch_hashes = {} # UUID -> SHA256 hash for change detection
# Health monitoring
_last_save_time = 0 # Timestamp of last successful save
_last_audit_time = 0 # Timestamp of last audit scan
_save_cycle_count = 0 # Number of save cycles completed
_total_saves = 0 # Total watches saved (lifetime)
_save_errors = 0 # Total save errors (lifetime)
_audit_count = 0 # Number of audit scans completed
_audit_found_changes = 0 # Total unmarked changes found by audits
_audit_shard_index = 0 # Current shard being audited (rolling audit)
def __init__(self):
    """
    Initialise per-instance save state.

    The class body declares _dirty_watches (set) and _watch_hashes (dict) as
    class attributes; without rebinding them here every instance would mutate
    the same shared objects. Values already set on the instance are kept.
    """
    super().__init__()

    # Give this instance its own mutable containers (keep any already assigned)
    instance_state = vars(self)
    instance_state.setdefault('_dirty_watches', set())
    instance_state.setdefault('_watch_hashes', {})

    self.save_data_thread = None
    self._last_save_time = time.time()
    self._last_audit_time = time.time()
def mark_watch_dirty(self, uuid):
    """
    Flag a watch so the next save cycle writes it out.

    Also logs a warning (> 500 pending) or critical message (> 1000 pending)
    when the number of pending watches grows large.

    Args:
        uuid: Watch UUID
    """
    with self.lock:
        self._dirty_watches.add(uuid)
        pending = len(self._dirty_watches)

        # Backpressure detection - complain if the dirty set grows too large
        if pending > 1000:
            logger.critical(
                f"BACKPRESSURE WARNING: {pending} watches pending save! "
                f"Save thread may not be keeping up with write rate. "
                f"This could indicate disk I/O bottleneck or save thread failure."
            )
        elif pending > 500:
            logger.warning(
                f"Dirty watch count high: {pending} watches pending save. "
                f"Monitoring for potential backpressure."
            )

    self.needs_write = True
def mark_settings_dirty(self):
    """
    Flag the settings so the next save cycle writes them out.

    Sets _dirty_settings while holding self.lock, then raises needs_write.
    """
    with self.lock:
        self._dirty_settings = True

    self.needs_write = True
def _compute_hash(self, watch_dict):
    """
    Fingerprint a watch dictionary for change detection.

    The dictionary is serialized to JSON with sorted keys (orjson when
    available, otherwise the stdlib json module) and hashed with SHA256.

    Args:
        watch_dict: Dictionary representation of watch

    Returns:
        Hex string of SHA256 hash
    """
    if not HAS_ORJSON:
        serialized = json.dumps(watch_dict, sort_keys=True, ensure_ascii=False).encode('utf-8')
    else:
        serialized = orjson.dumps(watch_dict, option=orjson.OPT_SORT_KEYS)

    digest = hashlib.sha256()
    digest.update(serialized)
    return digest.hexdigest()
def save_watch(self, uuid, force=False, watch_dict=None, current_hash=None):
    """
    Persist one watch, skipping the write when its hash has not changed.

    Args:
        uuid: Watch UUID
        force: If True, skip hash check and save anyway
        watch_dict: Pre-computed watch dictionary (optimization)
        current_hash: Pre-computed hash (optimization)

    Returns:
        True if saved, False if the watch does not exist or was unchanged

    Raises:
        Exception: Whatever the backend _save_watch() raised (logged, then re-raised)
    """
    if not self._watch_exists(uuid):
        logger.warning(f"Cannot save watch {uuid} - does not exist")
        return False

    # Fall back to computing whatever the caller did not pre-compute
    payload = self._get_watch_dict(uuid) if watch_dict is None else watch_dict
    digest = self._compute_hash(payload) if current_hash is None else current_hash

    # Only write when forced or when the content hash differs from the stored one
    should_write = force or digest != self._watch_hashes.get(uuid)
    if not should_write:
        return False

    try:
        self._save_watch(uuid, payload)
        self._watch_hashes[uuid] = digest
        logger.debug(f"Saved watch {uuid}")
        return True
    except Exception as e:
        logger.error(f"Failed to save watch {uuid}: {e}")
        raise
def _save_watch(self, uuid, watch_dict):
    """
    Write a single watch to storage (polymorphic hook).

    This default implementation is the file backend: the watch is written
    to {datastore_path}/{uuid}/watch.json via save_watch_atomic().
    Subclasses may override it for another storage, e.g.:
    - Redis backend: SET watch:{uuid}
    - SQL backend: UPDATE watches WHERE uuid=?

    Args:
        uuid: Watch UUID
        watch_dict: Dictionary representation of watch
    """
    save_watch_atomic(os.path.join(self.datastore_path, uuid), uuid, watch_dict)
def _save_settings(self):
"""
Save settings to storage (polymorphic).
Subclasses must implement for their backend.
- File: changedetection.json
- Redis: SET settings
- SQL: UPDATE settings table
"""
raise NotImplementedError("Subclass must implement _save_settings")
def _load_watches(self):
"""
Load all watches from storage (polymorphic).
Subclasses must implement for their backend.
- File: Read individual watch.json files
- Redis: SCAN watch:* keys
- SQL: SELECT * FROM watches
"""
raise NotImplementedError("Subclass must implement _load_watches")
def _delete_watch(self, uuid):
"""
Delete a watch from storage (polymorphic).
Subclasses must implement for their backend.
- File: Delete {uuid}/ directory recursively
- Redis: DEL watch:{uuid}
- SQL: DELETE FROM watches WHERE uuid=?
Args:
uuid: Watch UUID to delete
"""
raise NotImplementedError("Subclass must implement _delete_watch")
def _save_dirty_items(self):
"""
Save dirty watches and settings.
This is the core optimization: instead of saving the entire datastore,
we only save watches that were marked dirty and settings if changed.
"""
start_time = time.time()
# Capture dirty sets under lock
with self.lock:
dirty_watches = list(self._dirty_watches)
dirty_settings = self._dirty_settings
self._dirty_watches.clear()
self._dirty_settings = False
if not dirty_watches and not dirty_settings:
return
logger.trace(f"Saving {len(dirty_watches)} dirty watches, settings_dirty={dirty_settings}")
# Save each dirty watch using the polymorphic save method
saved_count = 0
error_count = 0
skipped_unchanged = 0
# Process in batches of 50, using thread pool for parallel saves
BATCH_SIZE = 50
MAX_WORKERS = 20 # Number of parallel save threads
def save_single_watch(uuid):
"""Helper function for thread pool execution."""
try:
# Check if watch still exists (might have been deleted)
if not self._watch_exists(uuid):
# Watch was deleted, remove hash
self._watch_hashes.pop(uuid, None)
return {'status': 'deleted', 'uuid': uuid}
# Pre-check hash to avoid unnecessary save_watch() calls
watch_dict = self._get_watch_dict(uuid)
current_hash = self._compute_hash(watch_dict)
if current_hash == self._watch_hashes.get(uuid):
# Watch hasn't actually changed, skip
return {'status': 'unchanged', 'uuid': uuid}
# Pass pre-computed values to avoid redundant serialization/hashing
if self.save_watch(uuid, force=True, watch_dict=watch_dict, current_hash=current_hash):
return {'status': 'saved', 'uuid': uuid}
else:
return {'status': 'skipped', 'uuid': uuid}
except Exception as e:
logger.error(f"Error saving watch {uuid}: {e}")
return {'status': 'error', 'uuid': uuid, 'error': e}
# Process dirty watches in batches
for batch_start in range(0, len(dirty_watches), BATCH_SIZE):
batch = dirty_watches[batch_start:batch_start + BATCH_SIZE]
batch_num = (batch_start // BATCH_SIZE) + 1
total_batches = (len(dirty_watches) + BATCH_SIZE - 1) // BATCH_SIZE
if len(dirty_watches) > BATCH_SIZE:
logger.trace(f"Save batch {batch_num}/{total_batches} ({len(batch)} watches)")
# Use thread pool to save watches in parallel
with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
# Submit all save tasks
future_to_uuid = {executor.submit(save_single_watch, uuid): uuid for uuid in batch}
# Collect results as they complete
for future in as_completed(future_to_uuid):
result = future.result()
status = result['status']
if status == 'saved':
saved_count += 1
elif status == 'unchanged':
skipped_unchanged += 1
elif status == 'error':
error_count += 1
# Re-mark for retry
with self.lock:
self._dirty_watches.add(result['uuid'])
# 'deleted' and 'skipped' don't need special handling
# Save settings if changed
if dirty_settings:
try:
self._save_settings()
logger.debug("Saved settings")
except Exception as e:
logger.error(f"Failed to save settings: {e}")
error_count += 1
with self.lock:
self._dirty_settings = True
# Update metrics
elapsed = time.time() - start_time
self._save_cycle_count += 1
self._total_saves += saved_count
self._save_errors += error_count
self._last_save_time = time.time()
# Log performance metrics
if saved_count > 0:
avg_time_per_watch = (elapsed / saved_count) * 1000 # milliseconds
skipped_msg = f", {skipped_unchanged} unchanged" if skipped_unchanged > 0 else ""
parallel_msg = f" [parallel: {MAX_WORKERS} workers]" if saved_count > 1 else ""
logger.info(
f"Successfully saved {saved_count} watches in {elapsed:.2f}s "
f"(avg {avg_time_per_watch:.1f}ms per watch{skipped_msg}){parallel_msg}. "
f"Total: {self._total_saves} saves, {self._save_errors} errors (lifetime)"
)
elif skipped_unchanged > 0:
logger.debug(f"Save cycle: {skipped_unchanged} watches verified unchanged (hash match), nothing saved")
if error_count > 0:
logger.error(f"Save cycle completed with {error_count} errors")
self.needs_write = False
self.needs_write_urgent = False
def _watch_exists(self, uuid):
"""
Check if watch exists. Subclass must implement.
Args:
uuid: Watch UUID
Returns:
bool
"""
raise NotImplementedError("Subclass must implement _watch_exists")
def _get_watch_dict(self, uuid):
"""
Get watch as dictionary. Subclass must implement.
Args:
uuid: Watch UUID
Returns:
Dictionary representation of watch
"""
raise NotImplementedError("Subclass must implement _get_watch_dict")
def _audit_all_watches(self):
    """
    Rolling audit: scan a fraction of the watches to detect unmarked changes.

    Instead of scanning ALL watches at once, this scans one of
    DATASTORE_AUDIT_SHARDS shards per call. The shard rotates on every call,
    so a full pass over all watches takes DATASTORE_AUDIT_SHARDS calls.
    Shard boundaries are recalculated from the current watch list each call,
    so added/removed watches are handled on subsequent calls.

    A watch whose current hash differs from the stored hash (or that has no
    stored hash at all) is added to the dirty set and needs_write is raised.

    Trade-off:
    - Lower CPU per cycle (only 1/N of the watches are hashed)
    - A full audit takes N cycles instead of one
    """
    audit_start = time.time()

    # Snapshot the list of watch UUIDs
    try:
        all_uuids = list(self.data['watching'].keys())
    except (KeyError, AttributeError, RuntimeError):
        # Data structure not ready or being modified
        return

    if not all_uuids:
        return

    total_watches = len(all_uuids)

    # Guard against a zero/negative shard count coming from the environment
    shard_count = max(1, DATASTORE_AUDIT_SHARDS)

    # Example: 56,278 watches / 5 shards = 11,256 watches per shard (ceil division)
    shard_size = (total_watches + shard_count - 1) // shard_count
    start_idx = self._audit_shard_index * shard_size

    # Fewer watches than shards can push the start past the end - restart at shard 0
    if start_idx >= total_watches:
        self._audit_shard_index = 0
        start_idx = 0

    end_idx = min(start_idx + shard_size, total_watches)
    audited_shard = self._audit_shard_index  # remembered for the summary log below
    shard_uuids = all_uuids[start_idx:end_idx]

    changes_found = 0
    errors = 0

    for uuid in shard_uuids:
        try:
            watch_dict = self._get_watch_dict(uuid)
            current_hash = self._compute_hash(watch_dict)
            stored_hash = self._watch_hashes.get(uuid)

            if current_hash == stored_hash:
                continue

            with self.lock:
                newly_marked = uuid not in self._dirty_watches
                if newly_marked:
                    self._dirty_watches.add(uuid)

            # Raise the flag BEFORE logging so a marked watch is always scheduled
            self.needs_write = True

            if newly_marked:
                changes_found += 1
                # stored_hash is None for a watch that has never been hashed
                stored_label = stored_hash[:8] if stored_hash else "none"
                logger.warning(
                    f"Audit detected unmarked change in watch {uuid[:8]}... "
                    f"current {current_hash[:8]} stored hash {stored_label} "
                    f"(hash changed but not marked dirty)"
                )
        except Exception as e:
            errors += 1
            logger.trace(f"Audit error for watch {uuid[:8]}...: {e}")

    audit_elapsed = (time.time() - audit_start) * 1000  # milliseconds

    # Advance to next shard (wrap around after last shard)
    self._audit_shard_index = (self._audit_shard_index + 1) % shard_count

    # Update metrics
    self._audit_count += 1
    self._audit_found_changes += changes_found
    self._last_audit_time = time.time()

    error_msg = f", {errors} errors" if errors else ""
    if changes_found > 0:
        logger.warning(
            f"Audit shard {audited_shard + 1}/{shard_count} found {changes_found} "
            f"unmarked changes in {len(shard_uuids)}/{total_watches} watches{error_msg} ({audit_elapsed:.1f}ms)"
        )
    else:
        logger.trace(
            f"Audit shard {audited_shard + 1}/{shard_count}: "
            f"{len(shard_uuids)}/{total_watches} watches checked, 0 changes{error_msg} ({audit_elapsed:.1f}ms)"
        )
def save_datastore(self):
    """
    Body of the background thread: periodically audit watches and save dirty items.

    Each pass of the loop:
    1. Runs the rolling audit when DATASTORE_AUDIT_INTERVAL_SECONDS have
       elapsed since the last audit.
    2. Saves dirty items when needs_write or needs_write_urgent is set.
    3. Sleeps in 0.5s steps for up to DATASTORE_SCAN_DIRTY_SAVE_INTERVAL_SECONDS,
       waking early on stop_thread or needs_write_urgent.

    When stop_thread is set, any remaining dirty items are flushed once and
    the function returns.
    """
    while not self.stop_thread:
        # Check if it's time to run audit scan (every N seconds)
        seconds_since_audit = time.time() - self._last_audit_time
        if seconds_since_audit >= DATASTORE_AUDIT_INTERVAL_SECONDS:
            try:
                self._audit_all_watches()
            except Exception as e:
                logger.error(f"Error in audit cycle: {e}")

        # Save dirty items if needed
        if self.needs_write or self.needs_write_urgent:
            try:
                self._save_dirty_items()
            except Exception as e:
                logger.error(f"Error in save cycle: {e}")

        # Wait out the interval in 0.5s ticks (two ticks per second),
        # leaving early for a stop request or an urgent save
        ticks_left = DATASTORE_SCAN_DIRTY_SAVE_INTERVAL_SECONDS * 2
        while ticks_left > 0:
            ticks_left -= 1
            time.sleep(0.5)
            if self.stop_thread or self.needs_write_urgent:
                break

    # Graceful shutdown: flush any remaining dirty items before stopping
    if not (self.needs_write or self._dirty_watches or self._dirty_settings):
        logger.info("Datastore save thread stopping - no dirty items")
        return

    logger.warning("Datastore save thread stopping - flushing remaining dirty items...")
    try:
        self._save_dirty_items()
        logger.info("Graceful shutdown complete - all data saved")
    except Exception as e:
        logger.critical(f"FAILED to save dirty items during shutdown: {e}")
def start_save_thread(self):
    """
    Start the background save thread unless one is already running.

    The thread runs save_datastore() as a daemon named "DatastoreSaver".
    """
    running = self.save_data_thread
    if running and running.is_alive():
        # A live saver thread already exists - nothing to do
        return

    self.save_data_thread = Thread(
        target=self.save_datastore,
        daemon=True,
        name="DatastoreSaver",
    )
    self.save_data_thread.start()
    logger.info("Datastore save thread started")
def force_save_all(self):
    """
    Force immediate synchronous save of all changes to storage.

    File backend implementation of the abstract force_save_all() method.
    Marks all watches and the settings as dirty, then saves immediately.

    Used by:
    - Backup creation (ensure everything is saved before backup)
    - Shutdown (ensure all changes are persisted)
    - Manual save operations
    """
    logger.info("Force saving all data to storage...")

    # Snapshot the keys so a concurrent add/delete cannot break the iteration
    all_uuids = list(self.data['watching'].keys())

    # Mark every watch dirty in ONE locked step. Going through mark_watch_dirty()
    # per watch would take the lock once per watch and emit a backpressure log
    # line for every watch beyond its thresholds, although a full save is
    # intentional here and not a sign of backpressure.
    with self.lock:
        self._dirty_watches.update(all_uuids)
    self.mark_settings_dirty()

    # Save immediately (synchronous)
    self._save_dirty_items()

    logger.success("All data saved to storage")
def get_health_status(self):
    """
    Get datastore health status for monitoring.

    Status is decided in this order:
    - CRITICAL: the save thread does not exist or is not alive
    - WARNING: no save for more than 300s, more than 1000 dirty watches,
      or an error rate (errors / saves) above 1%
    - HEALTHY: otherwise

    Returns:
        dict with health metrics and status; "thread_alive" is always a bool
    """
    now = time.time()
    time_since_last_save = now - self._last_save_time

    with self.lock:
        dirty_count = len(self._dirty_watches)

    # bool() so a missing thread reports False rather than None
    is_thread_alive = bool(self.save_data_thread and self.save_data_thread.is_alive())

    # Errors per save; max() avoids dividing by zero before the first save
    error_rate = self._save_errors / max(self._total_saves, 1)

    # Determine health status
    if not is_thread_alive:
        status = "CRITICAL"
        message = "Save thread is DEAD"
    elif time_since_last_save > 300:  # 5 minutes
        status = "WARNING"
        message = f"No save activity for {time_since_last_save:.0f}s"
    elif dirty_count > 1000:
        status = "WARNING"
        message = f"High backpressure: {dirty_count} watches pending"
    elif self._save_errors > 0 and error_rate > 0.01:
        status = "WARNING"
        message = f"High error rate: {self._save_errors} errors"
    else:
        status = "HEALTHY"
        message = "Operating normally"

    return {
        "status": status,
        "message": message,
        "thread_alive": is_thread_alive,
        "dirty_watches": dirty_count,
        "dirty_settings": self._dirty_settings,
        "last_save_seconds_ago": int(time_since_last_save),
        "save_cycles": self._save_cycle_count,
        "total_saves": self._total_saves,
        "total_errors": self._save_errors,
        "error_rate_percent": round(error_rate * 100, 2)
    }
-66
View File
@@ -1,66 +0,0 @@
"""
Legacy format loader for url-watches.json.
Provides functions to detect and load from the legacy monolithic JSON format.
Used during migration (update_26) to transition to individual watch.json files.
"""
import os
import json
from loguru import logger
# Try to import orjson for faster JSON serialization
try:
import orjson
HAS_ORJSON = True
except ImportError:
HAS_ORJSON = False
def has_legacy_datastore(datastore_path):
    """
    Report whether a legacy url-watches.json file is present.

    Args:
        datastore_path: Path to datastore directory

    Returns:
        bool: True if url-watches.json exists inside datastore_path
    """
    return os.path.exists(os.path.join(datastore_path, "url-watches.json"))
def load_legacy_format(json_store_path):
    """
    Read the whole datastore from a legacy url-watches.json file.

    Args:
        json_store_path: Full path to url-watches.json file

    Returns:
        dict: The parsed JSON content
        None: If the file doesn't exist or loading failed (failure is logged)
    """
    logger.info(f"Loading from legacy format: {json_store_path}")

    if not os.path.isfile(json_store_path):
        logger.warning(f"Legacy file not found: {json_store_path}")
        return None

    try:
        # orjson parses raw bytes; the stdlib parser reads decoded text
        if not HAS_ORJSON:
            with open(json_store_path, 'r', encoding='utf-8') as fh:
                data = json.load(fh)
        else:
            with open(json_store_path, 'rb') as fh:
                data = orjson.loads(fh.read())

        watch_count = len(data.get('watching', {}))
        logger.info(f"Loaded {watch_count} watches from legacy format")
        return data
    except Exception as e:
        logger.error(f"Failed to load legacy format: {e}")
        return None
-686
View File
@@ -1,686 +0,0 @@
"""
Schema update migrations for the datastore.
This module contains all schema version upgrade methods (update_1 through update_N).
These are mixed into ChangeDetectionStore to keep the main store file focused.
IMPORTANT: Each update could be run even when they have a new install and the schema is correct.
Therefore - each `update_n` should be very careful about checking if it needs to actually run.
"""
import os
import re
import shutil
import tarfile
import time
from loguru import logger
from copy import deepcopy
from ..html_tools import TRANSLATE_WHITESPACE_TABLE
from ..processors.restock_diff import Restock
from ..blueprint.rss import RSS_CONTENT_FORMAT_DEFAULT
from ..model import USE_SYSTEM_DEFAULT_NOTIFICATION_FORMAT_FOR_WATCH
from .file_saving_datastore import save_watch_atomic
def create_backup_tarball(datastore_path, update_number):
    """
    Create a tarball backup of the entire datastore structure before running an update.

    Includes:
    - All {uuid}/watch.json files
    - changedetection.json (settings, if it exists)
    - url-watches.json (legacy format, if it exists)
    - Directory structure preserved

    Args:
        datastore_path: Path to datastore directory
        update_number: Update number being applied

    Returns:
        str: Path to created tarball, or None if backup failed

    Restoration:
        To restore from a backup:
            cd /path/to/datastore
            tar -xzf before-update-N-timestamp.tar.gz
        This will restore all watch.json files and settings to their pre-update state.
    """
    timestamp = int(time.time())
    backup_filename = f"before-update-{update_number}-{timestamp}.tar.gz"
    backup_path = os.path.join(datastore_path, backup_filename)

    try:
        logger.info(f"Creating backup tarball: {backup_filename}")

        with tarfile.open(backup_path, "w:gz") as tar:
            # Top-level stores: new-format settings first, then the legacy single file
            for store_name in ("changedetection.json", "url-watches.json"):
                store_path = os.path.join(datastore_path, store_name)
                if os.path.isfile(store_path):
                    tar.add(store_path, arcname=store_name)
                    logger.debug(f"Added {store_name} to backup")

            # Backup all watch directories with their watch.json files,
            # preserving the UUID directory structure
            watch_count = 0
            for entry in os.listdir(datastore_path):
                entry_path = os.path.join(datastore_path, entry)

                # Only directories can be watch directories
                if not os.path.isdir(entry_path):
                    continue

                # Skip hidden directories and backup directories
                if entry.startswith(('.', 'before-update-')):
                    continue

                # A watch.json inside marks this as a watch UUID directory
                watch_json = os.path.join(entry_path, "watch.json")
                if os.path.isfile(watch_json):
                    tar.add(watch_json, arcname=f"{entry}/watch.json")
                    watch_count += 1
                    if watch_count % 100 == 0:
                        logger.debug(f"Backed up {watch_count} watch.json files...")

        logger.success(f"Backup created: {backup_filename} ({watch_count} watches)")
        return backup_path

    except Exception as e:
        logger.error(f"Failed to create backup tarball: {e}")
        # Best-effort removal of a partial backup; only filesystem errors are
        # ignored so that KeyboardInterrupt/SystemExit are never swallowed here.
        if os.path.exists(backup_path):
            try:
                os.unlink(backup_path)
            except OSError:
                pass
        return None
class DatastoreUpdatesMixin:
"""
Mixin class containing all schema update methods.
This class is inherited by ChangeDetectionStore to provide schema migration functionality.
Each update_N method upgrades the schema from version N-1 to version N.
"""
def get_updates_available(self):
    """
    Discover all available update methods.

    Only bound methods whose *entire* name is ``update_<digits>`` are counted.
    A name that merely ends that way (e.g. ``pre_update_3``) is ignored, because
    run_updates() later resolves each number back to ``update_<n>`` via getattr().

    Returns:
        list: Sorted list of update version numbers (e.g., [1, 2, 3, ..., 26])
    """
    import inspect

    updates_available = []
    for name, _method in inspect.getmembers(self, predicate=inspect.ismethod):
        m = re.fullmatch(r'update_(\d+)', name)
        if m:
            updates_available.append(int(m.group(1)))

    # Numeric sort, so that update_10 comes after update_2
    updates_available.sort()
    return updates_available
def run_updates(self, current_schema_version=None):
    """
    Run all pending schema updates sequentially.

    Args:
        current_schema_version: Optional current schema version. If provided, only run updates
                                greater than this version. If None, uses the schema version from
                                the datastore. If no schema version exists in datastore and it appears
                                to be a fresh install, sets to latest update number (no updates needed).

    IMPORTANT: Any update may be run even on a new install whose schema is already correct.
    Therefore each `update_n` must check carefully whether it actually needs to do anything.

    Process:
    1. Get list of available updates
    2. For each update > current schema version:
       - Create backup of datastore
       - Run update method
       - Update schema version
       - Mark settings and watches dirty
       - Save all changes immediately
    3. If any update raises, stop processing (later updates are not attempted)
    """
    updates_available = self.get_updates_available()

    # Determine current schema version
    if current_schema_version is None:
        current_schema_version = self.data['settings']['application'].get('schema_version')

        if current_schema_version is None:
            # No schema version found - could be a fresh install or very old datastore
            if len(self.data['watching']) == 0:
                # Fresh/new config with no watches: assume it is up to date and
                # record the highest available update number
                latest_update = updates_available[-1] if updates_available else 0
                logger.info(f"No schema version found and no watches exist - assuming fresh install, setting schema_version to {latest_update}")
                self.data['settings']['application']['schema_version'] = latest_update
                self.mark_settings_dirty()
                return  # No updates needed for fresh install

            # Has watches but no schema version - likely old datastore, run all updates
            logger.warning("No schema version found but watches exist - running all updates from version 0")
            current_schema_version = 0

    logger.info(f"Current schema version: {current_schema_version}")

    for update_n in updates_available:
        if update_n <= current_schema_version:
            continue

        logger.critical(f"Applying update_{update_n}")

        # Tarball backup of the datastore before touching anything
        backup_path = create_backup_tarball(self.datastore_path, update_n)
        if backup_path:
            logger.info(f"Backup created at: {backup_path}")
        else:
            logger.warning("Backup creation failed, but continuing with update")

        try:
            getattr(self, f"update_{update_n}")()
        except Exception as e:
            logger.error(f"Error while trying update_{update_n}")
            logger.error(e)
            # Don't run any more updates
            return

        # Bump the version, important
        self.data['settings']['application']['schema_version'] = update_n
        self.mark_settings_dirty()

        # CRITICAL: Mark all watches as dirty so changes are persisted.
        # Most updates modify watches, and with individual watch.json files
        # each one has to be flagged to be written out.
        logger.info(f"Marking all {len(self.data['watching'])} watches as dirty after update_{update_n} (so that it saves them to disk)")
        for uuid in self.data['watching'].keys():
            self.mark_watch_dirty(uuid)

        # Save changes immediately after each update (more resilient than batching)
        logger.critical(f"Saving all changes after update_{update_n}")
        try:
            self._save_dirty_items()
            logger.success(f"Update {update_n} changes saved successfully")
        except Exception as e:
            # Don't raise - the update already ran in memory, only persisting failed
            logger.error(f"Failed to save update_{update_n} changes: {e}")
# ============================================================================
# Individual Update Methods
# ============================================================================
def update_1(self):
    """Move the legacy 'minutes_between_check' value into time_between_check['minutes'] for settings and each watch."""
    requests_cfg = self.data['settings']['requests']
    global_minutes = requests_cfg.get('minutes_between_check')
    if global_minutes:
        requests_cfg['time_between_check']['minutes'] = global_minutes
        # Remove the default 'hours' that is set from the model
        requests_cfg['time_between_check']['hours'] = None

    for watch in self.data['watching'].values():
        # Only upgrade individual watch time if it was set (present and truthy)
        legacy_minutes = watch.get('minutes_between_check', False)
        if legacy_minutes:
            watch['time_between_check']['minutes'] = legacy_minutes
def update_2(self):
    """
    Write each watch's in-memory history mapping out as a flat "timestamp,path" history.txt index.

    Better than SQLite because this list is only appended to, and works across NAS / NFS type setups.
    """
    # @todo test running this on a newly updated one (when this already ran)
    for uuid, watch in self.data['watching'].items():
        legacy_history = watch.get('history', False)
        if not legacy_history:
            continue

        # Keys used to be stored as str; normalise them to int while exporting
        index_lines = [
            "{},{}\n".format(int(stamp), snapshot)
            for stamp, snapshot in legacy_history.items()
        ]

        if index_lines:
            watch_dir = os.path.join(self.datastore_path, uuid)
            if os.path.exists(watch_dir):
                with open(os.path.join(watch_dir, "history.txt"), "w") as f:
                    f.writelines(index_lines)
            else:
                logger.warning(f"Datastore history directory {watch_dir} does not exist, skipping history import.")

        # No longer needed, dynamically pulled from the disk when needed.
        # Reset to an empty dict (rather than deleting) so nothing breaks if this schema runs on an earlier version.
        # In the distant future we can remove this entirely
        watch['history'] = {}
def update_3(self):
    """
    Intentional no-op, kept so the update numbering stays contiguous.

    Background: last_changed was incorrectly stored when there was no change, which confused the output list table.
    """
    # see https://github.com/dgtlmoon/changedetection.io/pull/835
    return None
def update_4(self):
    """Drop `last_changed` from every watch; that information is pulled from the history.txt index."""
    for watch in self.data['watching'].values():
        # pop() with a default tolerates watches that never had the key,
        # without a bare `except:` that would also hide unrelated errors.
        watch.pop('last_changed', None)
def update_5(self):
    """
    If the watch notification body, title look the same as the global one, unset it, so the watch defaults back to using the main settings.

    In other words - the watch notification_title and notification_body are not needed if they are the same as the default one.
    Comparison is done after applying TRANSLATE_WHITESPACE_TABLE to both sides.
    """
    app_settings = self.data['settings']['application']

    # Each field is compared against its OWN system-wide value (the title was
    # previously compared against the system body by mistake).
    system_defaults = {
        field: (app_settings.get(field) or '').translate(TRANSLATE_WHITESPACE_TABLE)
        for field in ('notification_body', 'notification_title')
    }

    for uuid, watch in self.data['watching'].items():
        try:
            for field, system_value in system_defaults.items():
                watch_value = watch.get(field, '')
                if watch_value and watch_value.translate(TRANSLATE_WHITESPACE_TABLE) == system_value:
                    # Looks the same as the default one, so unset it
                    watch[field] = None
        except Exception as e:
            # Best effort per watch: report and move on rather than abort the whole update
            logger.warning(f"update_5: skipping watch {uuid}: {e}")
            continue
def update_7(self):
    """
    Remove the header overrides that early versions hard-coded into the global settings.

    They should only apply to Requests; they are now handled in content_fetcher::html_requests
    and shouldnt be passed to Playwright/Selenium.
    """
    headers = self.data['settings']['headers']
    # These were hard-coded in early versions
    for header_name in ('User-Agent', 'Accept', 'Accept-Encoding', 'Accept-Language'):
        # Only entries with a truthy value are removed
        if headers.get(header_name):
            del headers[header_name]
def update_8(self):
    """Convert filters to a list of filters css_filter -> include_filters."""
    for watch in self.data['watching'].values():
        try:
            existing_filter = watch.get('css_filter', '')
            if existing_filter:
                watch['include_filters'] = [existing_filter]
        except (AttributeError, TypeError):
            # Malformed watch entry (not a mutable mapping): skip it, but do not
            # hide unrelated errors the way the former bare `except:` did.
            continue
def update_9(self):
    """
    Convert old static notification tokens to jinja2 tokens.

    Rewrites ``{token}`` to ``{{token}}`` in notification_body, notification_title and
    each entry of notification_urls, for every watch and for the system-wide settings.
    Text that is already ``{{token}}`` is left untouched.
    """
    # only { } not {{ or }}
    single_brace_token = r'(?<!{){(?!{)(\w+)(?<!})}(?!})'

    def convert(holder):
        # Each field is guarded by its OWN value (the system title used to be
        # guarded by the body, which skipped it or crashed on a None title).
        for field in ('notification_body', 'notification_title'):
            text = holder.get(field)
            if text:
                holder[field] = re.sub(single_brace_token, r'{{\1}}', text)

        n_urls = holder.get('notification_urls')
        if n_urls:
            for i, url in enumerate(n_urls):
                holder['notification_urls'][i] = re.sub(single_brace_token, r'{{\1}}', url)

    # Each watch - best effort, a malformed value only skips the rest of that watch
    for watch in self.data['watching'].values():
        try:
            convert(watch)
        except (AttributeError, TypeError):
            continue

    # System wide
    convert(self.data['settings']['application'])
def update_10(self):
    """Some setups may have missed the correct default, so it shows the wrong config in the UI, although it will default to system-wide."""
    for watch in self.data['watching'].values():
        try:
            if not watch.get('fetch_backend', ''):
                watch['fetch_backend'] = 'system'
        except (AttributeError, TypeError):
            # Malformed watch entry: skip it without masking unrelated errors
            # (the previous bare `except:` swallowed everything).
            continue
def update_12(self):
    """Turn each watch's comma separated 'tag' text into a 'tags' list holding whatever add_tag() returns per title."""
    for watch in self.data['watching'].values():
        legacy_tag_text = watch.get('tag')
        if not legacy_tag_text:
            continue
        # Split out and convert old tag string
        watch['tags'] = [self.add_tag(title=title) for title in legacy_tag_text.split(',')]
def update_13(self):
    """#1775 - Update 11 did not update the records correctly when adding 'date_created' values for sorting."""
    # Watches without a date_created get their position in the datastore as a stand-in value
    for position, watch in enumerate(self.data['watching'].values()):
        if not watch.get('date_created'):
            watch['date_created'] = position
    return
def update_14(self):
    """
    #1774 - protect xpath1 against migration.

    Selectors starting with '/' get an explicit 'xpath1:' prefix, and a leading
    'xpath:' prefix is rewritten to 'xpath1:'. Watches with no include_filters
    (missing, None or empty) are left untouched.
    """
    for watch in self.data["watching"].values():
        # .get() so a watch without the key is skipped instead of raising
        # KeyError and aborting the whole update run
        filters = watch.get('include_filters')
        if not filters:
            continue

        for num, selector in enumerate(filters):
            if selector.startswith('/'):
                filters[num] = 'xpath1:' + selector
            elif selector.startswith('xpath:'):
                filters[num] = selector.replace('xpath:', 'xpath1:', 1)
def update_15(self):
    """Record explicitly, per watch, whether it follows the system default check interval."""
    for watch in self.data["watching"].values():
        interval = watch['time_between_check']
        # The old (confusing) logic: "same as system settings" or "all parts unset/zero"
        # meant "use the default"; anything else is a custom interval.
        use_default = (
            interval == self.data['settings']['requests']['time_between_check']
            or all(value is None or value == 0 for value in interval.values())
        )
        watch['time_between_check_use_default'] = use_default
def update_16(self):
    """Reset 'tags' to an empty list on any watch where it is still a plain string (older installs that update_12 did not catch)."""
    for watch in self.data['watching'].values():
        if isinstance(watch.get('tags'), str):
            watch['tags'] = []
def update_17(self):
    """Replace a watch's old top-level 'in_stock' value with a Restock object stored under 'restock'."""
    for watch in self.data['watching'].values():
        if 'in_stock' not in watch:
            continue
        # Build the new object first, then drop the legacy key
        watch['restock'] = Restock({'in_stock': watch.get('in_stock')})
        del watch['in_stock']
def update_18(self):
    """
    Migrate old restock settings.

    Ensures every watch has a 'restock_settings' dict (price following enabled by
    default when it had none) and derives 'in_stock_processing' from the legacy
    'in_stock_only' flag, which is then removed.

    Safe to run more than once: an existing 'in_stock_processing' value is only
    recomputed while the legacy 'in_stock_only' key is still present, so a second
    run cannot reset an already migrated watch back to 'all_changes'.
    """
    for uuid, watch in self.data['watching'].items():
        restock_settings = watch.get('restock_settings')
        if not restock_settings:
            # So we enable price following by default
            restock_settings = {'follow_price_changes': True}
            self.data['watching'][uuid]['restock_settings'] = restock_settings

        # Migrate the old value
        if 'in_stock_only' in watch or 'in_stock_processing' not in restock_settings:
            restock_settings['in_stock_processing'] = 'in_stock_only' if watch.get('in_stock_only') else 'all_changes'

        # Clean off the legacy key, whatever its value was
        watch.pop('in_stock_only', None)
def update_19(self):
    """Compress old elements.json to elements.deflate, saving disk, this compression is pretty fast."""
    import zlib

    for uuid in self.data['watching']:
        json_path = os.path.join(self.datastore_path, uuid, "elements.json")
        if not os.path.exists(json_path):
            continue

        deflate_path = os.path.join(self.datastore_path, uuid, "elements.deflate")
        with open(json_path, "rb") as f_j, open(deflate_path, "wb") as f_d:
            logger.debug(f"Compressing {str(json_path)} to {str(deflate_path)}..")
            f_d.write(zlib.compress(f_j.read()))

        # The uncompressed original is removed once the compressed copy is written
        os.unlink(json_path)
def update_20(self):
    """Migrate extract_title_as_title to use_page_title_in_list."""
    for watch in self.data['watching'].values():
        legacy_flag = watch.get('extract_title_as_title')
        if legacy_flag:
            watch['use_page_title_in_list'] = legacy_flag
            del watch['extract_title_as_title']

    # System-wide setting: copied into the 'ui' section (the old key is left in place)
    app_settings = self.data['settings']['application']
    app_flag = app_settings.get('extract_title_as_title')
    if app_flag:
        app_settings['ui']['use_page_title_in_list'] = app_flag
def update_21(self):
    """Rename the application setting 'timezone' to 'scheduler_timezone_default' when it holds a value."""
    app_settings = self.data['settings']['application']
    legacy_timezone = app_settings.get('timezone')
    if legacy_timezone:
        app_settings['scheduler_timezone_default'] = legacy_timezone
        del app_settings['timezone']
def update_23(self):
    """
    Some notification formats got the wrong name type.

    Wherever a stored notification_format equals one of the *values* of the formats
    mapping (the display label), it is replaced by the matching *key*. Applied to the
    system settings, every watch and every tag; run once with the current mapping and
    once more with two older labels added.
    """
    from ..notification import valid_notification_formats

    def re_run(formats):
        def key_for_label(label):
            # First key whose value equals the stored label, else None
            return next((k for k, v in formats.items() if v == label), None)

        app_settings = self.data['settings']['application']

        sys_n_format = app_settings.get('notification_format')
        sys_key = key_for_label(sys_n_format)
        if sys_key:  # key of "Plain text"
            logger.success(f"['settings']['application']['notification_format'] '{sys_n_format}' -> '{sys_key}'")
            app_settings['notification_format'] = sys_key

        for uuid, watch in self.data['watching'].items():
            n_format = watch.get('notification_format')
            watch_key = key_for_label(n_format)
            if watch_key and watch_key != USE_SYSTEM_DEFAULT_NOTIFICATION_FORMAT_FOR_WATCH:  # key of "Plain text"
                logger.success(f"['watching'][{uuid}]['notification_format'] '{n_format}' -> '{watch_key}'")
                watch['notification_format'] = watch_key  # should be 'text' or whatever

        for uuid, tag in app_settings['tags'].items():
            n_format = tag.get('notification_format')
            tag_key = key_for_label(n_format)
            if tag_key and tag_key != USE_SYSTEM_DEFAULT_NOTIFICATION_FORMAT_FOR_WATCH:  # key of "Plain text"
                logger.success(
                    f"['settings']['application']['tags'][{uuid}]['notification_format'] '{n_format}' -> '{tag_key}'")
                tag['notification_format'] = tag_key  # should be 'text' or whatever

    formats = deepcopy(valid_notification_formats)
    re_run(formats)

    # And in previous versions, it was "text" instead of Plain text, Markdown instead of "Markdown to HTML"
    formats['text'] = 'Text'
    formats['markdown'] = 'Markdown'
    re_run(formats)
def update_24(self):
    """RSS types should be inline with the same names as notification types."""
    app_settings = self.data['settings']['application']
    rss_format = app_settings.get('rss_content_format')

    # Only a value mentioning 'html' (and not 'text') maps to 'htmlcolor'.
    # Unset, anything with 'text' in it (might have been 'plaintext', 'plain text' ...)
    # and anything unrecognised fall back to the default.
    if rss_format and 'text' not in rss_format and 'html' in rss_format:
        app_settings['rss_content_format'] = 'htmlcolor'
    else:
        app_settings['rss_content_format'] = RSS_CONTENT_FORMAT_DEFAULT
def update_25(self):
    """
    Different processors now hold their own history.txt.

    For every watch whose processor is not 'text_json_diff', history.txt is renamed to
    history-<processor>.txt, unless that target file already exists.
    """
    # NOTE(review): both paths are built directly under datastore_path, not under the
    # per-watch "<datastore_path>/<uuid>/" directory that other updates in this class
    # use for history.txt - confirm whether the uuid component is missing here.
    old_history_txt = os.path.join(self.datastore_path, "history.txt")

    for uuid, watch in self.data['watching'].items():
        processor = watch.get('processor')
        if processor == 'text_json_diff':
            continue

        new_history_txt = os.path.join(self.datastore_path, f"history-{processor}.txt")

        # Check the real destination path. The bare filename used before was resolved
        # against the current working directory, so an existing target was never
        # detected and could be overwritten by the move.
        if os.path.isfile(old_history_txt) and not os.path.isfile(new_history_txt):
            logger.debug(f"Renaming history index {old_history_txt} to {new_history_txt}...")
            shutil.move(old_history_txt, new_history_txt)
def migrate_legacy_db_format(self):
    """
    Migration: Individual watch persistence (COPY-based, safe rollback).

    Loads legacy url-watches.json format and migrates to:
    - {uuid}/watch.json (per watch)
    - changedetection.json (settings only)

    IMPORTANT:
    - A tarball backup (before-update-26-timestamp.tar.gz) is created before migration
    - url-watches.json is LEFT INTACT for rollback safety
    - Users can roll back by simply downgrading to the previous version
    - Or restore from tarball: tar -xzf before-update-26-*.tar.gz

    This is a dedicated migration release - users upgrade at their own pace.

    Raises:
        Exception: if the legacy file cannot be loaded, a watch cannot be saved,
            or a file expected after saving is missing. Underlying errors are
            chained as the cause.
    """
    logger.critical("=" * 80)
    logger.critical("Running migration: Individual watch persistence (update_26)")
    logger.critical("COPY-based migration: url-watches.json will remain intact for rollback")
    logger.critical("=" * 80)

    # Check if already migrated
    changedetection_json = os.path.join(self.datastore_path, "changedetection.json")
    if os.path.exists(changedetection_json):
        logger.info("Migration already completed (changedetection.json exists), skipping")
        return

    # Check if we need to load legacy data
    from .legacy_loader import has_legacy_datastore, load_legacy_format

    if not has_legacy_datastore(self.datastore_path):
        logger.info("No legacy datastore found, nothing to migrate")
        return

    # Load legacy data from url-watches.json
    logger.critical("Loading legacy datastore from url-watches.json...")
    legacy_path = os.path.join(self.datastore_path, "url-watches.json")
    legacy_data = load_legacy_format(legacy_path)

    if not legacy_data:
        raise Exception("Failed to load legacy datastore from url-watches.json")

    # NOTE(review): legacy_data is only used as a "can it be parsed" check; the
    # watches written below come from self.data - presumably already populated
    # from the same file by the caller. Confirm.
    logger.info("Populating settings from legacy data...")
    watch_count = len(self.data['watching'])
    logger.success(f"Loaded {watch_count} watches from legacy format")

    # Phase 1: Save all watches to individual files
    logger.critical(f"Phase 1/4: Saving {watch_count} watches to individual watch.json files...")

    saved_count = 0
    for uuid, watch in self.data['watching'].items():
        try:
            watch_dict = dict(watch)
            watch_dir = os.path.join(self.datastore_path, uuid)
            save_watch_atomic(watch_dir, uuid, watch_dict)
            saved_count += 1

            if saved_count % 100 == 0:
                logger.info(f" Progress: {saved_count}/{watch_count} watches migrated...")

        except Exception as e:
            logger.error(f"Failed to save watch {uuid}: {e}")
            raise Exception(
                f"Migration failed: Could not save watch {uuid}. "
                f"url-watches.json remains intact, safe to retry. Error: {e}"
            ) from e

    logger.critical(f"Phase 1 complete: Saved {saved_count} watches")

    # Phase 2: Verify all files exist
    logger.critical("Phase 2/4: Verifying all watch.json files were created...")

    missing = [
        uuid for uuid in self.data['watching'].keys()
        if not os.path.isfile(os.path.join(self.datastore_path, uuid, "watch.json"))
    ]

    if missing:
        raise Exception(
            f"Migration failed: {len(missing)} watch files missing: {missing[:5]}... "
            f"url-watches.json remains intact, safe to retry."
        )

    logger.critical(f"Phase 2 complete: Verified {watch_count} watch files")

    # Phase 3: Create new settings file
    logger.critical("Phase 3/4: Creating changedetection.json...")

    try:
        self._save_settings()
    except Exception as e:
        logger.error(f"Failed to create changedetection.json: {e}")
        raise Exception(
            f"Migration failed: Could not create changedetection.json. "
            f"url-watches.json remains intact, safe to retry. Error: {e}"
        ) from e

    # Phase 4: Verify settings file exists
    logger.critical("Phase 4/4: Verifying changedetection.json exists...")

    if not os.path.isfile(changedetection_json):
        raise Exception(
            "Migration failed: changedetection.json not found after save. "
            "url-watches.json remains intact, safe to retry."
        )

    logger.critical("Phase 4 complete: Verified changedetection.json exists")

    # Success! Now reload from new format
    logger.critical("Reloading datastore from new format...")
    self._load_state()  # Includes load_watches
    logger.success("Datastore reloaded from new format successfully")

    # Verify all watches have hashes after migration
    missing_hashes = [uuid for uuid in self.data['watching'].keys() if uuid not in self._watch_hashes]
    if missing_hashes:
        logger.error(f"WARNING: {len(missing_hashes)} watches missing hashes after migration: {missing_hashes[:5]}")
    else:
        logger.success(f"All {len(self.data['watching'])} watches have valid hashes after migration")

    # Set schema version to latest available update
    # This prevents re-running updates and re-marking all watches as dirty
    updates_available = self.get_updates_available()
    latest_schema = updates_available[-1] if updates_available else 26
    self.data['settings']['application']['schema_version'] = latest_schema
    self.mark_settings_dirty()
    logger.info(f"Set schema_version to {latest_schema} (migration complete, all watches already saved)")

    logger.critical("=" * 80)
    logger.critical("MIGRATION COMPLETED SUCCESSFULLY!")
    logger.critical("=" * 80)
    logger.info("")
    logger.info("New format:")
    logger.info(f" - {watch_count} individual watch.json files created")
    logger.info(" - changedetection.json created (settings only)")
    logger.info("")
    logger.info("Rollback safety:")
    logger.info(" - url-watches.json preserved for rollback")
    logger.info(" - To rollback: downgrade to previous version and restart")
    logger.info(" - No manual file operations needed")
    logger.info("")
    logger.info("Optional cleanup (after testing new version):")
    logger.info(f" - rm {os.path.join(self.datastore_path, 'url-watches.json')}")
    logger.info("")
def update_26(self):
    """Schema update 26: delegate to migrate_legacy_db_format()."""
    self.migrate_legacy_db_format()
+21 -43
View File
@@ -70,8 +70,8 @@ test_single_url() {
local test_id=$1
local dir="/tmp/cli-test-single-${test_id}-$$"
timeout 10 python3 changedetection.py -d "$dir" -C -u https://example.com -b &>/dev/null
# Count watch directories (UUID directories containing watch.json)
[ "$(find "$dir" -mindepth 2 -maxdepth 2 -name 'watch.json' | wc -l)" -eq 1 ]
[ -f "$dir/url-watches.json" ] && \
[ "$(python3 -c "import json; print(len(json.load(open('$dir/url-watches.json')).get('watching', {})))")" -eq 1 ]
}
test_multiple_urls() {
@@ -82,8 +82,8 @@ test_multiple_urls() {
-u https://github.com \
-u https://httpbin.org \
-b &>/dev/null
# Count watch directories (UUID directories containing watch.json)
[ "$(find "$dir" -mindepth 2 -maxdepth 2 -name 'watch.json' | wc -l)" -eq 3 ]
[ -f "$dir/url-watches.json" ] && \
[ "$(python3 -c "import json; print(len(json.load(open('$dir/url-watches.json')).get('watching', {})))")" -eq 3 ]
}
test_url_with_options() {
@@ -93,17 +93,8 @@ test_url_with_options() {
-u https://example.com \
-u0 '{"title":"Test Site","processor":"text_json_diff"}' \
-b &>/dev/null
# Check that at least one watch.json contains the title "Test Site"
python3 -c "
import json, glob, sys
watch_files = glob.glob('$dir/*/watch.json')
for wf in watch_files:
with open(wf) as f:
data = json.load(f)
if data.get('title') == 'Test Site':
sys.exit(0)
sys.exit(1)
"
[ -f "$dir/url-watches.json" ] && \
python3 -c "import json; data=json.load(open('$dir/url-watches.json')); watches=data.get('watching', {}); exit(0 if any(w.get('title')=='Test Site' for w in watches.values()) else 1)"
}
test_multiple_urls_with_options() {
@@ -115,19 +106,9 @@ test_multiple_urls_with_options() {
-u https://github.com \
-u1 '{"title":"Site Two"}' \
-b &>/dev/null
# Check that we have 2 watches and both titles are present
python3 -c "
import json, glob, sys
watch_files = glob.glob('$dir/*/watch.json')
if len(watch_files) != 2:
sys.exit(1)
titles = []
for wf in watch_files:
with open(wf) as f:
data = json.load(f)
titles.append(data.get('title'))
sys.exit(0 if 'Site One' in titles and 'Site Two' in titles else 1)
"
[ -f "$dir/url-watches.json" ] && \
[ "$(python3 -c "import json; print(len(json.load(open('$dir/url-watches.json')).get('watching', {})))")" -eq 2 ] && \
python3 -c "import json; data=json.load(open('$dir/url-watches.json')); watches=data.get('watching', {}); titles=[w.get('title') for w in watches.values()]; exit(0 if 'Site One' in titles and 'Site Two' in titles else 1)"
}
test_batch_mode_exit() {
@@ -145,24 +126,21 @@ test_batch_mode_exit() {
test_recheck_all() {
local test_id=$1
local dir="/tmp/cli-test-recheck-all-${test_id}-$$"
# Create a watch using CLI, then recheck it
timeout 10 python3 changedetection.py -d "$dir" -C -u https://example.com -b &>/dev/null
# Now recheck all watches
timeout 10 python3 changedetection.py -d "$dir" -r all -b 2>&1 | grep -q "Queuing"
mkdir -p "$dir"
cat > "$dir/url-watches.json" << 'EOF'
{"watching":{"test-uuid":{"url":"https://example.com","last_checked":0,"processor":"text_json_diff","uuid":"test-uuid"}},"settings":{"application":{"password":false}}}
EOF
timeout 10 python3 changedetection.py -d "$dir" -r all -b 2>&1 | grep -q "Queuing all"
}
test_recheck_specific() {
local test_id=$1
local dir="/tmp/cli-test-recheck-uuid-${test_id}-$$"
# Create 2 watches using CLI
timeout 12 python3 changedetection.py -d "$dir" -C \
-u https://example.com \
-u https://github.com \
-b &>/dev/null
# Get the UUIDs that were created
local uuids=$(find "$dir" -mindepth 2 -maxdepth 2 -name 'watch.json' -exec dirname {} \; | xargs -n1 basename | tr '\n' ',' | sed 's/,$//')
# Now recheck specific UUIDs
timeout 10 python3 changedetection.py -d "$dir" -r "$uuids" -b 2>&1 | grep -q "Queuing"
mkdir -p "$dir"
cat > "$dir/url-watches.json" << 'EOF'
{"watching":{"uuid-1":{"url":"https://example.com","last_checked":0,"processor":"text_json_diff","uuid":"uuid-1"},"uuid-2":{"url":"https://github.com","last_checked":0,"processor":"text_json_diff","uuid":"uuid-2"}},"settings":{"application":{"password":false}}}
EOF
timeout 10 python3 changedetection.py -d "$dir" -r uuid-1,uuid-2 -b 2>&1 | grep -q "Queuing 2 specific watches"
}
test_combined_operations() {
@@ -173,8 +151,8 @@ test_combined_operations() {
-u https://github.com \
-r all \
-b &>/dev/null
# Count watch directories (UUID directories containing watch.json)
[ "$(find "$dir" -mindepth 2 -maxdepth 2 -name 'watch.json' | wc -l)" -eq 2 ]
[ -f "$dir/url-watches.json" ] && \
[ "$(python3 -c "import json; print(len(json.load(open('$dir/url-watches.json')).get('watching', {})))")" -eq 2 ]
}
test_invalid_json() {
+3 -145
View File
@@ -9,11 +9,6 @@ from changedetectionio import store
import os
import sys
# CRITICAL: Set short timeout for tests to prevent 45-second hangs
# When test server is slow/unresponsive, workers fail fast instead of holding UUIDs for 45s
# This prevents exponential priority growth from repeated deferrals (priority × 10 each defer)
os.environ['DEFAULT_SETTINGS_REQUESTS_TIMEOUT'] = '5'
from changedetectionio.flask_app import init_app_secret, changedetection_app
from changedetectionio.tests.util import live_server_setup, new_live_server_setup
@@ -34,93 +29,6 @@ def reportlog(pytestconfig):
logger.remove(handler_id)
@pytest.fixture(autouse=True)
def per_test_log_file(request):
    """
    Give every test function its own TRACE-level log file, including the pytest outcome.

    Files are written to a "logs" directory next to this file and named
    "<sanitised test name>_<xdist worker id>.log". After the test, the outcome
    and any failure text / captured output found on the node's rep_call /
    rep_setup attributes are appended.
    """
    import re

    # Create logs directory if it doesn't exist
    logs_root = os.path.join(os.path.dirname(__file__), "logs")
    os.makedirs(logs_root, exist_ok=True)

    test_name = request.node.name

    # Keep only alphanumeric, dash, underscore, and period; everything else becomes '_'
    safe_test_name = re.sub(r'[^\w\-.]', '_', test_name)

    # Limit length to avoid filesystem issues (max 200 chars)
    if len(safe_test_name) > 200:
        import hashlib
        # First 150 chars + hash of full name + last 30 chars
        name_hash = hashlib.md5(test_name.encode()).hexdigest()[:8]
        safe_test_name = f"{safe_test_name[:150]}_{name_hash}_{safe_test_name[-30:]}"

    # Worker ID distinguishes parallel runs
    worker_id = os.environ.get('PYTEST_XDIST_WORKER', 'master')
    log_file = os.path.join(logs_root, f"{safe_test_name}_{worker_id}.log")

    # Add file handler for this test with TRACE level
    handler_id = logger.add(
        log_file,
        format="{time:YYYY-MM-DD HH:mm:ss.SSS} | {level: <8} | {process} | {name}:{function}:{line} - {message}",
        level="TRACE",
        mode="w",  # Overwrite if exists
        enqueue=True  # Thread-safe
    )

    logger.info(f"=== Starting test: {test_name} (worker: {worker_id}) ===")
    logger.info(f"Test location: {request.node.nodeid}")

    yield

    # Capture test outcome (PASSED/FAILED/SKIPPED/ERROR)
    node = request.node
    outcome = "UNKNOWN"
    exc_info = stdout = stderr = None

    if hasattr(node, 'rep_call'):
        call_report = node.rep_call
        outcome = call_report.outcome.upper()
        if call_report.failed:
            exc_info = call_report.longreprtext
            # Capture stdout/stderr from call phase
            if hasattr(call_report, 'sections'):
                for section_name, section_content in call_report.sections:
                    lowered = section_name.lower()
                    if 'stdout' in lowered:
                        stdout = section_content
                    elif 'stderr' in lowered:
                        stderr = section_content
    elif hasattr(node, 'rep_setup'):
        if node.rep_setup.failed:
            outcome = "SETUP_FAILED"
            exc_info = node.rep_setup.longreprtext

    logger.info(f"=== Test Result: {outcome} ===")

    if exc_info:
        logger.error(f"=== Test Failure Details ===\n{exc_info}")
    if stdout:
        logger.info(f"=== Captured stdout ===\n{stdout}")
    if stderr:
        logger.warning(f"=== Captured stderr ===\n{stderr}")

    logger.info(f"=== Finished test: {test_name} ===")
    logger.remove(handler_id)
@pytest.hookimpl(tryfirst=True, hookwrapper=True)
def pytest_runtest_makereport(item, call):
    """Attach the generated report to the test item as ``rep_<when>``.

    The hook yields to let the report be produced, then stores it on ``item``
    under an attribute named after the report's ``when`` value so fixtures can
    read it later.
    """
    hook_result = yield
    report = hook_result.get_result()

    attribute_name = "rep_{}".format(report.when)
    setattr(item, attribute_name, report)
@pytest.fixture
def environment(mocker):
"""Mock arrow.now() to return a fixed datetime for testing jinja2 time extension."""
@@ -257,57 +165,6 @@ def prepare_test_function(live_server, datastore_path):
except:
break
# Add test helper methods to the app for worker management
def set_workers(count):
    """Resize the worker pool to ``count`` workers for a test.

    A ``count`` of 0 shuts every worker down via ``worker_pool.shutdown_workers()``
    and returns a summary dict; any other value is passed to
    ``worker_pool.adjust_async_worker_count()`` and its result is returned as-is.
    """
    from changedetectionio import worker_pool
    from changedetectionio.flask_app import update_q, notification_q

    previous = worker_pool.get_worker_count()

    if count != 0:
        # Plain resize - delegate entirely to the pool
        return worker_pool.adjust_async_worker_count(
            count,
            update_q=update_q,
            notification_q=notification_q,
            app=live_server.app,
            datastore=datastore
        )

    # Zero requested: tear everything down
    logger.debug(f"Brutally shutting down all {previous} workers")
    worker_pool.shutdown_workers()
    summary = {
        'status': 'success',
        'message': f'Shutdown all {previous} workers',
        'previous_count': previous,
        'current_count': 0
    }
    return summary
def check_all_workers_alive(expected_count):
    """Assert that ``worker_pool.check_worker_health()`` reports 'healthy'.

    Returns the health result dict when the status is 'healthy'; otherwise the
    assertion fails with the result's ``message``.
    """
    from changedetectionio import worker_pool
    from changedetectionio.flask_app import update_q, notification_q

    pool_context = {
        'update_q': update_q,
        'notification_q': notification_q,
        'app': live_server.app,
        'datastore': datastore,
    }
    health = worker_pool.check_worker_health(expected_count, **pool_context)

    assert health['status'] == 'healthy', f"Workers not healthy: {health['message']}"
    return health
# Attach helper methods to app for easy test access
live_server.app.set_workers = set_workers
live_server.app.check_all_workers_alive = check_all_workers_alive
# Prevent background thread from writing during cleanup/reload
datastore.needs_write = False
datastore.needs_write_urgent = False
@@ -405,8 +262,8 @@ def app(request, datastore_path):
# Shutdown workers gracefully before loguru cleanup
try:
from changedetectionio import worker_pool
worker_pool.shutdown_workers()
from changedetectionio import worker_handler
worker_handler.shutdown_workers()
except Exception:
pass
@@ -454,3 +311,4 @@ def app(request, datastore_path):
yield app
@@ -1,41 +0,0 @@
import time
from flask import url_for
from changedetectionio.tests.util import wait_for_all_checks
def test_check_plugin_processor(client, live_server, measure_memory_usage, datastore_path):
    """Check a plugin-provided processor end to end, then check a missing processor is reported.

    Adds a watch using the ``osint_recon`` processor, verifies its preview
    output, then switches the watch to a processor name that does not exist and
    expects the watch list to show the resulting exception.
    """
    # The OSINT plugin must be installed for this test (it is the first basic plugin tested with)
    overview = client.get(url_for("watchlist.index"))
    assert b'OSINT Reconnaissance' in overview.data, "Must have the OSINT plugin installed at test time"
    assert b'<input checked id="processor-0" name="processor" type="radio" value="text_json_diff">' in overview.data, "But the first text_json_diff processor should always be selected by default in quick watch form"

    add_form = {"url": 'http://127.0.0.1', "tags": '', 'processor': 'osint_recon'}
    added = client.post(
        url_for("ui.ui_views.form_quick_watch_add"),
        data=add_form,
        follow_redirects=True
    )
    assert b"Watch added" in added.data

    client.get(url_for("ui.form_watch_checknow"), follow_redirects=True)
    wait_for_all_checks(client)

    preview = client.get(
        url_for("ui.ui_preview.preview_page", uuid="first"),
        follow_redirects=True
    )
    assert b'Target: http://127.0.0.1' in preview.data
    assert b'DNSKEY Records' in preview.data

    wait_for_all_checks(client)

    # Point the watch at a processor that does not exist
    uuid = next(iter(live_server.app.config['DATASTORE'].data['watching']))
    live_server.app.config['DATASTORE'].data['watching'][uuid]['processor'] = "now_missing"

    client.get(url_for("ui.form_watch_checknow"), follow_redirects=True)
    wait_for_all_checks(client)

    res = client.get(url_for("watchlist.index"))
    assert b"Exception: Processor module" in res.data and b'now_missing' in res.data, f'Should register that the plugin is missing for {uuid}'
@@ -1,166 +0,0 @@
#!/usr/bin/env python3
"""
Test notification_urls validation in Watch and Tag API endpoints.
Ensures that invalid AppRise URLs are rejected when setting notification_urls.
Valid AppRise notification URLs use specific protocols like:
- posts://example.com - POST to HTTP endpoint
- gets://example.com - GET to HTTP endpoint
- mailto://user@example.com - Email
- slack://token/channel - Slack
- discord://webhook_id/webhook_token - Discord
- etc.
Invalid notification URLs:
- https://example.com - Plain HTTPS is NOT a valid AppRise notification protocol
- ftp://example.com - FTP is NOT a valid AppRise notification protocol
- Plain URLs without proper AppRise protocol prefix
"""
from flask import url_for
import json
def test_watch_notification_urls_validation(client, live_server, measure_memory_usage, datastore_path):
    """Test that Watch PUT/POST endpoints validate notification_urls."""
    api_key = live_server.app.config['DATASTORE'].data['settings']['application'].get('api_access_token')

    def create_watch(notification_urls):
        # POST a new watch for https://example.com with the given notification URLs
        return client.post(
            url_for("createwatch"),
            data=json.dumps({
                "url": "https://example.com",
                "notification_urls": notification_urls
            }),
            headers={'content-type': 'application/json', 'x-api-key': api_key}
        )

    def update_watch(uuid, notification_urls):
        # PUT only the notification_urls field onto an existing watch
        return client.put(
            url_for("watch", uuid=uuid),
            data=json.dumps({"notification_urls": notification_urls}),
            headers={'content-type': 'application/json', 'x-api-key': api_key}
        )

    def fetch_watch(uuid):
        # GET the watch back through the API
        return client.get(
            url_for("watch", uuid=uuid),
            headers={'x-api-key': api_key}
        )

    # Test 1: Create a watch with valid notification URLs
    valid_urls = ["posts://example.com/notify1", "posts://example.com/notify2"]
    created = create_watch(valid_urls)
    assert created.status_code == 201, "Should accept valid notification URLs on watch creation"
    watch_uuid = created.json['uuid']

    # Verify the notification URLs were saved
    fetched = fetch_watch(watch_uuid)
    assert fetched.status_code == 200
    assert set(fetched.json['notification_urls']) == set(valid_urls), "Valid notification URLs should be saved"

    # Test 2: Try to create a watch with invalid notification URLs (https:// is not valid)
    rejected = create_watch(["https://example.com/webhook"])
    assert rejected.status_code == 400, "Should reject https:// notification URLs (not a valid AppRise protocol)"
    assert b"is not a valid AppRise URL" in rejected.data, "Should provide AppRise validation error message"

    # Test 2b: Also test other invalid protocols
    rejected = create_watch(["ftp://not-apprise-url"])
    assert rejected.status_code == 400, "Should reject ftp:// notification URLs"
    assert b"is not a valid AppRise URL" in rejected.data, "Should provide AppRise validation error message"

    # Test 3: Update watch with valid notification URLs
    new_valid_urls = ["posts://newserver.com"]
    updated = update_watch(watch_uuid, new_valid_urls)
    assert updated.status_code == 200, "Should accept valid notification URLs on watch update"

    # Verify the notification URLs were updated
    fetched = fetch_watch(watch_uuid)
    assert fetched.status_code == 200
    assert fetched.json['notification_urls'] == new_valid_urls, "Valid notification URLs should be updated"

    # Test 4: Try to update watch with invalid notification URLs (plain https:// not valid)
    rejected = update_watch(watch_uuid, ["https://example.com/webhook"])
    assert rejected.status_code == 400, "Should reject https:// notification URLs on watch update"
    assert b"is not a valid AppRise URL" in rejected.data, "Should provide AppRise validation error message"

    # Test 5: Update watch with non-list notification_urls (caught by OpenAPI schema validation)
    rejected = update_watch(watch_uuid, "not-a-list")
    assert rejected.status_code == 400, "Should reject non-list notification_urls"
    assert b"OpenAPI validation failed" in rejected.data or b"Request body validation error" in rejected.data

    # Test 6: Verify original URLs are preserved after failed update
    fetched = fetch_watch(watch_uuid)
    assert fetched.status_code == 200
    assert fetched.json['notification_urls'] == new_valid_urls, "URLs should remain unchanged after validation failure"
def test_tag_notification_urls_validation(client, live_server, measure_memory_usage, datastore_path):
    """Test that Tag PUT endpoint validates notification_urls."""
    from changedetectionio.model import Tag

    api_key = live_server.app.config['DATASTORE'].data['settings']['application'].get('api_access_token')
    datastore = live_server.app.config['DATASTORE']

    def update_tag(uuid, notification_urls):
        # PUT only the notification_urls field onto the tag
        return client.put(
            url_for("tag", uuid=uuid),
            data=json.dumps({"notification_urls": notification_urls}),
            headers={'content-type': 'application/json', 'x-api-key': api_key}
        )

    def stored_tag(uuid):
        # Read the tag straight from the datastore rather than through the API
        return datastore.data['settings']['application']['tags'][uuid]

    # Create a tag
    tag_uuid = datastore.add_tag(title="Test Tag")
    assert tag_uuid is not None

    # Test 1: Update tag with valid notification URLs
    valid_urls = ["posts://example.com/tag-notify"]
    accepted = update_tag(tag_uuid, valid_urls)
    assert accepted.status_code == 200, "Should accept valid notification URLs on tag update"

    # Verify the notification URLs were saved
    assert stored_tag(tag_uuid)['notification_urls'] == valid_urls, "Valid notification URLs should be saved to tag"

    # Test 2: Try to update tag with invalid notification URLs (https:// not valid)
    rejected = update_tag(tag_uuid, ["https://example.com/webhook"])
    assert rejected.status_code == 400, "Should reject https:// notification URLs on tag update"
    assert b"is not a valid AppRise URL" in rejected.data, "Should provide AppRise validation error message"

    # Test 3: Update tag with non-list notification_urls (caught by OpenAPI schema validation)
    rejected = update_tag(tag_uuid, "not-a-list")
    assert rejected.status_code == 400, "Should reject non-list notification_urls"
    assert b"OpenAPI validation failed" in rejected.data or b"Request body validation error" in rejected.data

    # Test 4: Verify original URLs are preserved after failed update
    assert stored_tag(tag_uuid)['notification_urls'] == valid_urls, "URLs should remain unchanged after validation failure"
@@ -2,7 +2,7 @@
import time
from flask import url_for
from .util import live_server_setup, extract_UUID_from_client, wait_for_all_checks, delete_all_watches
from .util import live_server_setup, extract_UUID_from_client, wait_for_all_checks
import os
@@ -116,7 +116,7 @@ def test_check_ldjson_price_autodetect(client, live_server, measure_memory_usage
# And not this cause its not the ld-json
assert b"So let's see what happens" not in res.data
delete_all_watches(client)
client.get(url_for("ui.form_delete", uuid="all"), follow_redirects=True)
##########################################################################################
# And we shouldnt see the offer
@@ -131,7 +131,7 @@ def test_check_ldjson_price_autodetect(client, live_server, measure_memory_usage
assert b'ldjson-price-track-offer' not in res.data
##########################################################################################
delete_all_watches(client)
client.get(url_for("ui.form_delete", uuid="all"), follow_redirects=True)
def _test_runner_check_bad_format_ignored(live_server, client, has_ldjson_price_data):
@@ -147,7 +147,7 @@ def _test_runner_check_bad_format_ignored(live_server, client, has_ldjson_price_
##########################################################################################
delete_all_watches(client)
client.get(url_for("ui.form_delete", uuid="all"), follow_redirects=True)
def test_bad_ldjson_is_correctly_ignored(client, live_server, measure_memory_usage, datastore_path):
+1 -1
View File
@@ -414,4 +414,4 @@ def test_plaintext_even_if_xml_content_and_can_apply_filters(client, live_server
assert b'Abonnementen bijwerken' in res.data
assert b'&lt;foobar' not in res.data
res = delete_all_watches(client)
res = client.get(url_for("ui.form_delete", uuid="all"), follow_redirects=True)
+3 -13
View File
@@ -53,21 +53,11 @@ def test_backup(client, live_server, measure_memory_usage, datastore_path):
backup = ZipFile(io.BytesIO(res.data))
l = backup.namelist()
uuid4hex = re.compile('^[a-f0-9]{8}-?[a-f0-9]{4}-?4[a-f0-9]{3}-?[89ab][a-f0-9]{3}-?[a-f0-9]{12}.*txt', re.I)
newlist = list(filter(uuid4hex.match, l)) # Read Note below
# Check for UUID-based txt files (history and snapshot)
uuid4hex_txt = re.compile('^[a-f0-9]{8}-?[a-f0-9]{4}-?4[a-f0-9]{3}-?[89ab][a-f0-9]{3}-?[a-f0-9]{12}.*txt', re.I)
txt_files = list(filter(uuid4hex_txt.match, l))
# Should be two txt files in the archive (history and the snapshot)
assert len(txt_files) == 2
# Check for watch.json files (new format)
uuid4hex_json = re.compile('^[a-f0-9]{8}-?[a-f0-9]{4}-?4[a-f0-9]{3}-?[89ab][a-f0-9]{3}-?[a-f0-9]{12}/watch\.json$', re.I)
json_files = list(filter(uuid4hex_json.match, l))
# Should be one watch.json file in the archive (the imported watch)
assert len(json_files) == 1, f"Expected 1 watch.json file, found {len(json_files)}: {json_files}"
# Check for changedetection.json (settings file)
assert 'changedetection.json' in l, "changedetection.json should be in backup"
assert len(newlist) == 2
# Get the latest one
res = client.get(
@@ -6,7 +6,7 @@ from .util import (
set_original_response,
set_modified_response,
live_server_setup,
wait_for_all_checks, delete_all_watches
wait_for_all_checks
)
from loguru import logger
@@ -104,7 +104,7 @@ def run_socketio_watch_update_test(client, live_server, password_mode="", datast
assert watch.has_unviewed, "The watch was not marked as unviewed after content change"
# Clean up
delete_all_watches(client)
client.get(url_for("ui.form_delete", uuid="all"), follow_redirects=True)
def test_everything(live_server, client, measure_memory_usage, datastore_path):
+20 -5
View File
@@ -69,7 +69,7 @@ def test_conditions_with_text_and_number(client, live_server, measure_memory_usa
# 1. The page filtered text must contain "5" (first digit of value)
# 2. The extracted number should be >= 20 and <= 100
res = client.post(
url_for("ui.ui_edit.edit_page", uuid=uuid),
url_for("ui.ui_edit.edit_page", uuid="first"),
data={
"url": test_url,
"fetch_backend": "html_requests",
@@ -110,20 +110,25 @@ def test_conditions_with_text_and_number(client, live_server, measure_memory_usa
wait_for_all_checks(client)
client.get(url_for("ui.mark_all_viewed"), follow_redirects=True)
time.sleep(1)
time.sleep(0.2)
wait_for_all_checks(client)
# Case 1
set_number_in_range_response(datastore_path=datastore_path, number="70.5")
client.get(url_for("ui.form_watch_checknow"), follow_redirects=True)
wait_for_all_checks(client)
time.sleep(2)
# 70.5 is > 20 and < 100 and contains "5"
res = client.get(url_for("watchlist.index"))
assert b'has-unread-changes' in res.data
# Case 2: Change with one condition violated
# Number out of range (150) but contains '5'
client.get(url_for("ui.mark_all_viewed"), follow_redirects=True)
time.sleep(0.2)
set_number_out_of_range_response(datastore_path=datastore_path, number="150.5")
@@ -149,6 +154,7 @@ def test_condition_validate_rule_row(client, live_server, measure_memory_usage,
client.get(url_for("ui.form_watch_checknow"), follow_redirects=True)
wait_for_all_checks(client)
uuid = next(iter(live_server.app.config['DATASTORE'].data['watching']))
# the front end submits the current form state which should override the watch in a temporary copy
res = client.post(
@@ -189,8 +195,12 @@ def test_condition_validate_rule_row(client, live_server, measure_memory_usage,
)
assert res.status_code == 200
assert b'false' in res.data
# cleanup for the next
client.get(
url_for("ui.form_delete", uuid="all"),
follow_redirects=True
)
delete_all_watches(client)
# If there was only a change in the whitespacing, then we shouldnt have a change detected
@@ -220,12 +230,17 @@ def test_wordcount_conditions_plugin(client, live_server, measure_memory_usage,
# Check it saved
res = client.get(
url_for("ui.ui_edit.edit_page", uuid=uuid),
url_for("ui.ui_edit.edit_page", uuid="first"),
)
# Assert the word count is counted correctly
assert b'<td>13</td>' in res.data
delete_all_watches(client)
# cleanup for the next
client.get(
url_for("ui.form_delete", uuid="all"),
follow_redirects=True
)
# If there was only a change in the whitespacing, then we shouldnt have a change detected
def test_lev_conditions_plugin(client, live_server, measure_memory_usage, datastore_path):
@@ -64,7 +64,6 @@ def test_DNS_errors(client, live_server, measure_memory_usage, datastore_path):
follow_redirects=True
)
assert b"1 Imported" in res.data
res = client.get(url_for("ui.form_watch_checknow"), follow_redirects=True)
# Give the thread time to pick it up
wait_for_all_checks(client)
@@ -80,7 +79,7 @@ def test_DNS_errors(client, live_server, measure_memory_usage, datastore_path):
)
assert found_name_resolution_error
# Should always record that we tried
assert "just now".encode('utf-8') in res.data or 'seconds ago'.encode('utf-8') in res.data
assert bytes("just now".encode('utf-8')) in res.data
delete_all_watches(client)
# Re 1513
@@ -1,7 +1,7 @@
import os
import time
from flask import url_for
from .util import set_original_response, wait_for_all_checks, wait_for_notification_endpoint_output, delete_all_watches
from .util import set_original_response, wait_for_all_checks, wait_for_notification_endpoint_output
from ..notification import valid_notification_formats
@@ -118,10 +118,8 @@ def run_filter_test(client, live_server, content_filter, app_notification_format
res = client.get(url_for("watchlist.index"))
assert b'Warning, no filters were found' in res.data
assert not os.path.isfile(notification_file)
time.sleep(2)
wait_for_all_checks(client)
time.sleep(1)
wait_for_all_checks(client)
assert live_server.app.config['DATASTORE'].data['watching'][uuid]['consecutive_filter_failures'] == 5
time.sleep(2)
@@ -180,7 +178,6 @@ def run_filter_test(client, live_server, content_filter, app_notification_format
follow_redirects=True
)
os.unlink(notification_file)
delete_all_watches(client)
def test_check_include_filters_failure_notification(client, live_server, measure_memory_usage, datastore_path):
@@ -188,12 +185,10 @@ def test_check_include_filters_failure_notification(client, live_server, measure
run_filter_test(client=client, live_server=live_server, content_filter='#nope-doesnt-exist', app_notification_format=valid_notification_formats.get('htmlcolor'), datastore_path=datastore_path)
# Check markup send conversion didnt affect plaintext preference
run_filter_test(client=client, live_server=live_server, content_filter='#nope-doesnt-exist', app_notification_format=valid_notification_formats.get('text'), datastore_path=datastore_path)
delete_all_watches(client)
def test_check_xpath_filter_failure_notification(client, live_server, measure_memory_usage, datastore_path):
# # live_server_setup(live_server) # Setup on conftest per function
run_filter_test(client=client, live_server=live_server, content_filter='//*[@id="nope-doesnt-exist"]', app_notification_format=valid_notification_formats.get('htmlcolor'), datastore_path=datastore_path)
delete_all_watches(client)
# Test that notification is never sent
@@ -202,4 +197,3 @@ def test_basic_markup_from_text(client, live_server, measure_memory_usage, datas
from ..notification.handler import markup_text_links_to_html
x = markup_text_links_to_html("hello https://google.com")
assert 'a href' in x
delete_all_watches(client)
+1 -2
View File
@@ -166,8 +166,7 @@ def test_tag_add_in_ui(client, live_server, measure_memory_usage, datastore_path
delete_all_watches(client)
def test_group_tag_notification(client, live_server, measure_memory_usage, datastore_path):
delete_all_watches(client)
set_original_response(datastore_path=datastore_path)
test_url = url_for('test_endpoint', _external=True)
@@ -59,29 +59,11 @@ def test_consistent_history(client, live_server, measure_memory_usage, datastore
# Wait for the sync DB save to happen
time.sleep(2)
# Check which format is being used
datastore_path = live_server.app.config['DATASTORE'].datastore_path
changedetection_json = os.path.join(datastore_path, 'changedetection.json')
url_watches_json = os.path.join(datastore_path, 'url-watches.json')
json_db_file = os.path.join(live_server.app.config['DATASTORE'].datastore_path, 'url-watches.json')
json_obj = {'watching': {}}
if os.path.exists(changedetection_json):
# New format: individual watch.json files
logger.info("Testing with new format (changedetection.json + individual watch.json)")
# Load each watch.json file
for uuid in live_server.app.config['DATASTORE'].data['watching'].keys():
watch_json_file = os.path.join(datastore_path, uuid, 'watch.json')
assert os.path.isfile(watch_json_file), f"watch.json should exist at {watch_json_file}"
with open(watch_json_file, 'r', encoding='utf-8') as f:
json_obj['watching'][uuid] = json.load(f)
else:
# Legacy format: url-watches.json
logger.info("Testing with legacy format (url-watches.json)")
with open(url_watches_json, 'r', encoding='utf-8') as f:
json_obj = json.load(f)
json_obj = None
with open(json_db_file, 'r', encoding='utf-8') as f:
json_obj = json.load(f)
# assert the right amount of watches was found in the JSON
assert len(json_obj['watching']) == len(workers), "Correct number of watches was found in the JSON"
@@ -106,7 +88,7 @@ def test_consistent_history(client, live_server, measure_memory_usage, datastore
# Find the snapshot one
for fname in files_in_watch_dir:
if fname != 'history.txt' and fname != 'watch.json' and 'html' not in fname:
if fname != 'history.txt' and 'html' not in fname:
if strtobool(os.getenv("TEST_WITH_BROTLI")):
assert fname.endswith('.br'), "Forced TEST_WITH_BROTLI then it should be a .br filename"
@@ -123,27 +105,13 @@ def test_consistent_history(client, live_server, measure_memory_usage, datastore
assert json_obj['watching'][w]['title'], "Watch should have a title set"
assert contents.startswith(watch_title + "x"), f"Snapshot contents in file {fname} should start with '{watch_title}x', got '{contents}'"
# With new format, we also have watch.json, so 4 files total
if os.path.exists(changedetection_json):
assert len(files_in_watch_dir) == 4, "Should be four files in the dir with new format: watch.json, html.br snapshot, history.txt and the extracted text snapshot"
else:
assert len(files_in_watch_dir) == 3, "Should be just three files in the dir with legacy format: html.br snapshot, history.txt and the extracted text snapshot"
assert len(files_in_watch_dir) == 3, "Should be just three files in the dir, html.br snapshot, history.txt and the extracted text snapshot"
# Check that 'default' Watch vars aren't accidentally being saved
if os.path.exists(changedetection_json):
# New format: check all individual watch.json files
for uuid in json_obj['watching'].keys():
watch_json_file = os.path.join(datastore_path, uuid, 'watch.json')
with open(watch_json_file, 'r', encoding='utf-8') as f:
assert '"default"' not in f.read(), f"'default' probably shouldnt be here in {watch_json_file}, it came from when the 'default' Watch vars were accidently being saved"
else:
# Legacy format: check url-watches.json
with open(url_watches_json, 'r', encoding='utf-8') as f:
assert '"default"' not in f.read(), "'default' probably shouldnt be here, it came from when the 'default' Watch vars were accidently being saved"
json_db_file = os.path.join(live_server.app.config['DATASTORE'].datastore_path, 'url-watches.json')
with open(json_db_file, 'r', encoding='utf-8') as f:
assert '"default"' not in f.read(), "'default' probably shouldnt be here, it came from when the 'default' Watch vars were accidently being saved"
delete_all_watches(client)
def test_check_text_history_view(client, live_server, measure_memory_usage, datastore_path):
with open(os.path.join(datastore_path, "endpoint-content.txt"), "w") as f:
@@ -164,7 +132,7 @@ def test_check_text_history_view(client, live_server, measure_memory_usage, data
client.get(url_for("ui.form_watch_checknow"), follow_redirects=True)
wait_for_all_checks(client)
res = client.get(url_for("ui.ui_diff.diff_history_page", uuid=uuid))
res = client.get(url_for("ui.ui_diff.diff_history_page", uuid="first"))
assert b'test-one' in res.data
assert b'test-two' in res.data
@@ -40,7 +40,10 @@ def set_some_changed_response(datastore_path):
def test_normal_page_check_works_with_ignore_status_code(client, live_server, measure_memory_usage, datastore_path):
from loguru import logger
# Give the endpoint time to spin up
time.sleep(1)
set_original_response(datastore_path=datastore_path)
@@ -59,41 +62,20 @@ def test_normal_page_check_works_with_ignore_status_code(client, live_server, me
# Add our URL to the import page
test_url = url_for('test_endpoint', _external=True)
uuid = client.application.config.get('DATASTORE').add_watch(url=test_url)
logger.info(f"TEST: First check - queuing UUID {uuid}")
client.get(url_for("ui.form_watch_checknow"), follow_redirects=True)
logger.info(f"TEST: Waiting for first check to complete")
wait_result = wait_for_all_checks(client)
logger.info(f"TEST: First check wait completed: {wait_result}")
# Check history after first check
watch = client.application.config.get('DATASTORE').data['watching'][uuid]
logger.info(f"TEST: After first check - history count: {len(watch.history.keys())}")
wait_for_all_checks(client)
set_some_changed_response(datastore_path=datastore_path)
wait_for_all_checks(client)
# Trigger a check
logger.info(f"TEST: Second check - queuing UUID {uuid}")
client.get(url_for("ui.form_watch_checknow"), follow_redirects=True)
logger.info(f"TEST: Waiting for second check to complete")
wait_result = wait_for_all_checks(client)
logger.info(f"TEST: Second check wait completed: {wait_result}")
# Check history after second check
watch = client.application.config.get('DATASTORE').data['watching'][uuid]
logger.info(f"TEST: After second check - history count: {len(watch.history.keys())}")
logger.info(f"TEST: Watch history keys: {list(watch.history.keys())}")
# Give the thread time to pick it up
wait_for_all_checks(client)
# It should report nothing found (no new 'has-unread-changes' class)
res = client.get(url_for("watchlist.index"))
if b'has-unread-changes' not in res.data:
logger.error(f"TEST FAILED: has-unread-changes not found in response")
logger.error(f"TEST: Watch last_error: {watch.get('last_error')}")
logger.error(f"TEST: Watch last_checked: {watch.get('last_checked')}")
assert b'has-unread-changes' in res.data
assert b'/test-endpoint' in res.data
+1 -1
View File
@@ -82,7 +82,7 @@ def test_import_distillio(client, live_server, measure_memory_usage, datastore_p
# Give the endpoint time to spin up
time.sleep(1)
delete_all_watches(client)
client.get(url_for("ui.form_delete", uuid="all"), follow_redirects=True)
res = client.post(
url_for("imports.import_page"),
data={
@@ -224,7 +224,6 @@ def check_json_filter(json_filter, client, live_server, datastore_path):
set_original_response(datastore_path=datastore_path)
delete_all_watches(client)
# Add our URL to the import page
test_url = url_for('test_endpoint', content_type="application/json", _external=True)
uuid = client.application.config.get('DATASTORE').add_watch(url=test_url, extras={"include_filters": json_filter.splitlines()})
@@ -298,17 +297,14 @@ def check_json_filter_bool_val(json_filter, client, live_server, datastore_path)
def test_check_jsonpath_filter_bool_val(client, live_server, measure_memory_usage, datastore_path):
check_json_filter_bool_val("json:$['available']", client, live_server, datastore_path=datastore_path)
delete_all_watches(client)
def test_check_jq_filter_bool_val(client, live_server, measure_memory_usage, datastore_path):
if jq_support:
check_json_filter_bool_val("jq:.available", client, live_server, datastore_path=datastore_path)
delete_all_watches(client)
def test_check_jqraw_filter_bool_val(client, live_server, measure_memory_usage, datastore_path):
if jq_support:
check_json_filter_bool_val("jq:.available", client, live_server, datastore_path=datastore_path)
delete_all_watches(client)
# Re #265 - Extended JSON selector test
# Stuff to consider here
@@ -456,17 +452,14 @@ def test_correct_header_detect(client, live_server, measure_memory_usage, datast
def test_check_jsonpath_ext_filter(client, live_server, measure_memory_usage, datastore_path):
check_json_ext_filter('json:$[?(@.status==Sold)]', client, live_server, datastore_path=datastore_path)
delete_all_watches(client)
def test_check_jq_ext_filter(client, live_server, measure_memory_usage, datastore_path):
if jq_support:
check_json_ext_filter('jq:.[] | select(.status | contains("Sold"))', client, live_server, datastore_path=datastore_path)
delete_all_watches(client)
def test_check_jqraw_ext_filter(client, live_server, measure_memory_usage, datastore_path):
if jq_support:
check_json_ext_filter('jq:.[] | select(.status | contains("Sold"))', client, live_server, datastore_path=datastore_path)
delete_all_watches(client)
def test_jsonpath_BOM_utf8(client, live_server, measure_memory_usage, datastore_path):
from .. import html_tools
@@ -477,6 +470,5 @@ def test_jsonpath_BOM_utf8(client, live_server, measure_memory_usage, datastore_
# See that we can find the second <script> one, which is not broken, and matches our filter
text = html_tools.extract_json_as_string(json_str, "json:$.name")
assert text == '"José"'
delete_all_watches(client)
+8 -2
View File
@@ -313,8 +313,14 @@ def test_notification_custom_endpoint_and_jinja2(client, live_server, measure_me
# Add a watch and trigger a HTTP POST
test_url = url_for('test_endpoint', _external=True)
watch_uuid = client.application.config.get('DATASTORE').add_watch(url=test_url, tag="nice one")
res = client.get(url_for("ui.form_watch_checknow"), follow_redirects=True)
res = client.post(
url_for("ui.ui_views.form_quick_watch_add"),
data={"url": test_url, "tags": 'nice one'},
follow_redirects=True
)
assert b"Watch added" in res.data
watch_uuid = next(iter(live_server.app.config['DATASTORE'].data['watching']))
wait_for_all_checks(client)
set_modified_response(datastore_path=datastore_path)
@@ -1,7 +1,7 @@
import os
import time
from flask import url_for
from .util import set_original_response, set_modified_response, live_server_setup, wait_for_all_checks, delete_all_watches
from .util import set_original_response, set_modified_response, live_server_setup, wait_for_all_checks
import logging
def test_check_notification_error_handling(client, live_server, measure_memory_usage, datastore_path):
@@ -81,4 +81,4 @@ def test_check_notification_error_handling(client, live_server, measure_memory_u
os.unlink(os.path.join(datastore_path, "notification.txt"))
assert 'xxxxx' in notification_submission
delete_all_watches(client)
client.get(url_for("ui.form_delete", uuid="all"), follow_redirects=True)
@@ -1,52 +0,0 @@
import os
import time
from flask import url_for
from .util import set_original_response, wait_for_all_checks, wait_for_notification_endpoint_output
from ..notification import valid_notification_formats
from loguru import logger
def test_queue_system(client, live_server, measure_memory_usage, datastore_path):
    """Test that multiple workers can process the queue concurrently without blocking each other.

    One slow watch (``delay`` seconds per fetch) is imported per worker, the
    worker count is raised to match, and a recheck is requested. The test then
    asserts that:

    * half-way through the delay at least one UUID is reported as running,
    * everything finishes in under ``delay + 10`` seconds (serial processing
      would take roughly ``items * delay`` seconds),
    * no UUID is reported as running once the checks have completed.
    """
    from changedetectionio import worker_pool

    # (pytest) Werkzeug's threaded server uses ThreadPoolExecutor with a default limit of around 40 threads (or min(32, os.cpu_count() + 4)).
    # os.cpu_count() returns None when the CPU count cannot be determined; fall back to 1
    # instead of failing with "NoneType + int".
    items = (os.cpu_count() or 1) + 3
    delay = 10

    # Auto-queue is off here.
    live_server.app.config['DATASTORE'].data['settings']['application']['all_paused'] = True

    test_urls = [
        f"{url_for('test_endpoint', _external=True)}?delay={delay}&id={i}&content=hello+test+content+{i}"
        for i in range(0, items)
    ]

    # Import one slow URL per worker into the watch list
    res = client.post(
        url_for("imports.import_page"),
        data={"urls": "\r\n".join(test_urls)},
        follow_redirects=True
    )
    assert f"{items} Imported".encode('utf-8') in res.data

    # One worker per item so that every fetch can be in flight at the same time
    client.application.set_workers(items)

    start = time.time()
    client.get(url_for("ui.form_watch_checknow"), follow_redirects=True)

    # Sample the pool half-way through the artificial fetch delay
    time.sleep(delay/2)

    # Verify that work is actually in progress (some UUIDs are being processed)
    running_uuids = worker_pool.get_running_uuids()
    logger.debug( f"Should be atleast some workers running - {len(running_uuids)} UUIDs still being processed: {running_uuids}")
    assert len(running_uuids) != 0, f"Should be atleast some workers running - {len(running_uuids)} UUIDs still being processed: {running_uuids}"

    wait_for_all_checks(client)

    # all workers should be done in less than say 10 seconds (they take time to 'see' something is in the queue too)
    total_time = (time.time() - start)
    logger.debug(f"All workers finished {items} items in less than {delay} seconds per job. {total_time}s total")
    # if there was a bug in queue handler not running parallel, this would blow out to items*delay seconds
    assert total_time < delay + 10, f"All workers finished {items} items in less than {delay} seconds per job, total time {total_time}s"

    # Verify all workers are idle (no UUIDs being processed)
    running_uuids = worker_pool.get_running_uuids()
    assert len(running_uuids) == 0, f"Expected all workers to be idle, but {len(running_uuids)} UUIDs still being processed: {running_uuids}"
+16 -25
View File
@@ -17,12 +17,12 @@ def test_headers_in_request(client, live_server, measure_memory_usage, datastore
test_url = test_url.replace('localhost', 'changedet')
# Add the test URL twice, we will check
uuidA = client.application.config.get('DATASTORE').add_watch(url=test_url)
uuid = client.application.config.get('DATASTORE').add_watch(url=test_url)
client.get(url_for("ui.form_watch_checknow"), follow_redirects=True)
wait_for_all_checks(client)
uuidB = client.application.config.get('DATASTORE').add_watch(url=test_url)
uuid = client.application.config.get('DATASTORE').add_watch(url=test_url)
client.get(url_for("ui.form_watch_checknow"), follow_redirects=True)
wait_for_all_checks(client)
@@ -31,7 +31,7 @@ def test_headers_in_request(client, live_server, measure_memory_usage, datastore
# Add some headers to a request
res = client.post(
url_for("ui.ui_edit.edit_page", uuid=uuidA),
url_for("ui.ui_edit.edit_page", uuid="first"),
data={
"url": test_url,
"tags": "",
@@ -42,14 +42,13 @@ def test_headers_in_request(client, live_server, measure_memory_usage, datastore
)
assert b"Updated watch." in res.data
client.get(url_for("ui.form_watch_checknow"), follow_redirects=True)
# Give the thread time to pick up the first version
wait_for_all_checks(client)
# The service should echo back the request headers
res = client.get(
url_for("ui.ui_preview.preview_page", uuid=uuidA),
url_for("ui.ui_preview.preview_page", uuid="first"),
follow_redirects=True
)
@@ -93,7 +92,7 @@ def test_body_in_request(client, live_server, measure_memory_usage, datastore_pa
# add the first 'version'
res = client.post(
url_for("ui.ui_edit.edit_page", uuid=uuid),
url_for("ui.ui_edit.edit_page", uuid="first"),
data={
"url": test_url,
"tags": "",
@@ -111,7 +110,7 @@ def test_body_in_request(client, live_server, measure_memory_usage, datastore_pa
body_value = 'Test Body Value {{ 1+1 }}'
body_value_formatted = 'Test Body Value 2'
res = client.post(
url_for("ui.ui_edit.edit_page", uuid=uuid),
url_for("ui.ui_edit.edit_page", uuid="first"),
data={
"url": test_url,
"tags": "",
@@ -127,7 +126,7 @@ def test_body_in_request(client, live_server, measure_memory_usage, datastore_pa
# The service should echo back the body
res = client.get(
url_for("ui.ui_preview.preview_page", uuid=uuid),
url_for("ui.ui_preview.preview_page", uuid="first"),
follow_redirects=True
)
@@ -143,14 +142,10 @@ def test_body_in_request(client, live_server, measure_memory_usage, datastore_pa
client.get(url_for("ui.form_watch_checknow"), follow_redirects=True)
wait_for_all_checks(client)
watches_with_body = 0
# Read individual watch.json files
for uuid in client.application.config.get('DATASTORE').data['watching'].keys():
watch_json_file = os.path.join(datastore_path, uuid, 'watch.json')
assert os.path.exists(watch_json_file), f"watch.json should exist at {watch_json_file}"
with open(watch_json_file, 'r', encoding='utf-8') as f:
watch_data = json.load(f)
if watch_data.get('body') == body_value:
with open(os.path.join(datastore_path, 'url-watches.json'), encoding='utf-8') as f:
app_struct = json.load(f)
for uuid in app_struct['watching']:
if app_struct['watching'][uuid]['body']==body_value:
watches_with_body += 1
# Should be only one with body set
@@ -158,7 +153,7 @@ def test_body_in_request(client, live_server, measure_memory_usage, datastore_pa
# Attempt to add a body with a GET method
res = client.post(
url_for("ui.ui_edit.edit_page", uuid=uuid),
url_for("ui.ui_edit.edit_page", uuid="first"),
data={
"url": test_url,
"tags": "",
@@ -230,14 +225,10 @@ def test_method_in_request(client, live_server, measure_memory_usage, datastore_
wait_for_all_checks(client)
watches_with_method = 0
# Read individual watch.json files
for uuid in client.application.config.get('DATASTORE').data['watching'].keys():
watch_json_file = os.path.join(datastore_path, uuid, 'watch.json')
assert os.path.exists(watch_json_file), f"watch.json should exist at {watch_json_file}"
with open(watch_json_file, 'r', encoding='utf-8') as f:
watch_data = json.load(f)
if watch_data.get('method') == 'PATCH':
with open(os.path.join(datastore_path, 'url-watches.json'), encoding='utf-8') as f:
app_struct = json.load(f)
for uuid in app_struct['watching']:
if app_struct['watching'][uuid]['method'] == 'PATCH':
watches_with_method += 1
# Should be only one with method set to PATCH
@@ -236,7 +236,6 @@ def test_restock_itemprop_with_tag(client, live_server, measure_memory_usage, da
}
_run_test_minmax_limit(client, extra_watch_edit_form=extras,datastore_path=datastore_path)
delete_all_watches(client)
@@ -389,10 +388,9 @@ def test_change_with_notification_values(client, live_server, measure_memory_usa
os.unlink(os.path.join(datastore_path, "notification.txt"))
uuid = next(iter(live_server.app.config['DATASTORE'].data['watching']))
res = client.post(url_for("ui.ui_notification.ajax_callback_send_notification_test", watch_uuid=uuid), data={}, follow_redirects=True)
wait_for_notification_endpoint_output(datastore_path=datastore_path)
time.sleep(5)
assert os.path.isfile(os.path.join(datastore_path, "notification.txt")), "Notification received"
delete_all_watches(client)
def test_data_sanity(client, live_server, measure_memory_usage, datastore_path):
@@ -408,7 +406,6 @@ def test_data_sanity(client, live_server, measure_memory_usage, datastore_path):
follow_redirects=True
)
client.get(url_for("ui.form_watch_checknow"), follow_redirects=True)
wait_for_all_checks(client)
res = client.get(url_for("watchlist.index"))
@@ -420,7 +417,6 @@ def test_data_sanity(client, live_server, measure_memory_usage, datastore_path):
data={"url": test_url2, "tags": 'restock tests', 'processor': 'restock_diff'},
follow_redirects=True
)
client.get(url_for("ui.form_watch_checknow"), follow_redirects=True)
wait_for_all_checks(client)
res = client.get(url_for("watchlist.index"))
assert str(res.data.decode()).count("950.95") == 1, "Price should only show once (for the watch added, no other watches yet)"
@@ -466,4 +462,3 @@ def test_special_prop_examples(client, live_server, measure_memory_usage, datast
assert b'ception' not in res.data
assert b'155.55' in res.data
delete_all_watches(client)
+1 -1
View File
@@ -107,7 +107,7 @@ def test_rss_and_token(client, live_server, measure_memory_usage, datastore_path
assert b"Access denied, bad token" not in res.data
assert b"Random content" in res.data
delete_all_watches(client)
client.get(url_for("ui.form_delete", uuid="all"), follow_redirects=True)
def test_basic_cdata_rss_markup(client, live_server, measure_memory_usage, datastore_path):
@@ -23,7 +23,6 @@ def test_rss_feed_empty(client, live_server, measure_memory_usage, datastore_pat
)
assert res.status_code == 400
assert b'does not have enough history snapshots to show' in res.data
delete_all_watches(client)
def test_rss_single_watch_order(client, live_server, measure_memory_usage, datastore_path):
"""
+5 -8
View File
@@ -24,20 +24,20 @@ def test_share_watch(client, live_server, measure_memory_usage, datastore_path):
# Goto the edit page, add our ignore text
# Add our URL to the import page
res = client.post(
url_for("ui.ui_edit.edit_page", uuid=uuid),
url_for("ui.ui_edit.edit_page", uuid="first"),
data={"include_filters": include_filters, "url": test_url, "tags": "", "headers": "", 'fetch_backend': "html_requests", "time_between_check_use_default": "y"},
follow_redirects=True
)
assert b"Updated watch." in res.data
# Check it saved
res = client.get(
url_for("ui.ui_edit.edit_page", uuid=uuid),
url_for("ui.ui_edit.edit_page", uuid="first"),
)
assert bytes(include_filters.encode('utf-8')) in res.data
# click share the link
res = client.get(
url_for("ui.form_share_put_watch", uuid=uuid),
url_for("ui.form_share_put_watch", uuid="first"),
follow_redirects=True
)
@@ -63,16 +63,13 @@ def test_share_watch(client, live_server, measure_memory_usage, datastore_path):
# Now hit edit, we should see what we expect
# that the import fetched the meta-data
uuids = list(client.application.config.get('DATASTORE').data['watching'])
assert uuids, "It saved/imported and created a new URL from the share"
# Check it saved
res = client.get(
url_for("ui.ui_edit.edit_page", uuid=uuids[0]),
url_for("ui.ui_edit.edit_page", uuid="first"),
)
assert bytes(include_filters.encode('utf-8')) in res.data
# Check it saved the URL
res = client.get(url_for("watchlist.index"))
assert bytes(test_url.encode('utf-8')) in res.data
delete_all_watches(client)
+1 -5
View File
@@ -25,7 +25,6 @@ def test_recheck_time_field_validation_global_settings(client, live_server, meas
assert REQUIRE_ATLEAST_ONE_TIME_PART_MESSAGE_DEFAULT.encode('utf-8') in res.data
delete_all_watches(client)
def test_recheck_time_field_validation_single_watch(client, live_server, measure_memory_usage, datastore_path):
@@ -95,7 +94,6 @@ def test_recheck_time_field_validation_single_watch(client, live_server, measure
assert b"Updated watch." in res.data
assert REQUIRE_ATLEAST_ONE_TIME_PART_WHEN_NOT_GLOBAL_DEFAULT.encode('utf-8') not in res.data
delete_all_watches(client)
def test_checkbox_open_diff_in_new_tab(client, live_server, measure_memory_usage, datastore_path):
@@ -244,7 +242,6 @@ def test_page_title_listing_behaviour(client, live_server, measure_memory_usage,
# No page title description, and 'use_page_title_in_list' is on, it should show the <title>
res = client.get(url_for("watchlist.index"))
assert b"head titlecustom html" in res.data
delete_all_watches(client)
def test_ui_viewed_unread_flag(client, live_server, measure_memory_usage, datastore_path):
@@ -286,5 +283,4 @@ def test_ui_viewed_unread_flag(client, live_server, measure_memory_usage, datast
client.get(url_for("ui.mark_all_viewed"), follow_redirects=True)
time.sleep(0.2)
res = client.get(url_for("watchlist.index"))
assert b'<span id="unread-tab-counter">0</span>' in res.data
delete_all_watches(client)
assert b'<span id="unread-tab-counter">0</span>' in res.data
+8 -10
View File
@@ -366,7 +366,7 @@ def test_check_with_prefix_include_filters(client, live_server, measure_memory_u
assert b"Some text thats the same" in res.data # in selector
assert b"Some text that will change" not in res.data # not in selector
delete_all_watches(client)
client.get(url_for("ui.form_delete", uuid="all"), follow_redirects=True)
def test_various_rules(client, live_server, measure_memory_usage, datastore_path):
@@ -423,7 +423,7 @@ def test_xpath_20(client, live_server, measure_memory_usage, datastore_path):
test_url = url_for('test_endpoint', _external=True)
res = client.post(
url_for("ui.ui_edit.edit_page", uuid=uuid),
url_for("ui.ui_edit.edit_page", uuid="first"),
data={"include_filters": "//*[contains(@class, 'sametext')]|//*[contains(@class, 'changetext')]",
"url": test_url,
"tags": "",
@@ -437,14 +437,14 @@ def test_xpath_20(client, live_server, measure_memory_usage, datastore_path):
wait_for_all_checks(client)
res = client.get(
url_for("ui.ui_preview.preview_page", uuid=uuid),
url_for("ui.ui_preview.preview_page", uuid="first"),
follow_redirects=True
)
assert b"Some text thats the same" in res.data # in selector
assert b"Some text that will change" in res.data # in selector
delete_all_watches(client)
client.get(url_for("ui.form_delete", uuid="all"), follow_redirects=True)
def test_xpath_20_function_count(client, live_server, measure_memory_usage, datastore_path):
@@ -477,7 +477,7 @@ def test_xpath_20_function_count(client, live_server, measure_memory_usage, data
assert b"246913579975308642" in res.data # in selector
delete_all_watches(client)
client.get(url_for("ui.form_delete", uuid="all"), follow_redirects=True)
def test_xpath_20_function_count2(client, live_server, measure_memory_usage, datastore_path):
@@ -501,8 +501,6 @@ def test_xpath_20_function_count2(client, live_server, measure_memory_usage, dat
)
assert b"Updated watch." in res.data
client.get(url_for("ui.form_watch_checknow"), follow_redirects=True)
wait_for_all_checks(client)
res = client.get(
@@ -512,7 +510,7 @@ def test_xpath_20_function_count2(client, live_server, measure_memory_usage, dat
assert b"246913579975308642" in res.data # in selector
delete_all_watches(client)
client.get(url_for("ui.form_delete", uuid="all"), follow_redirects=True)
def test_xpath_20_function_string_join_matches(client, live_server, measure_memory_usage, datastore_path):
@@ -546,7 +544,7 @@ def test_xpath_20_function_string_join_matches(client, live_server, measure_memo
assert b"Some text thats the samespecialconjunctionSome text that will change" in res.data # in selector
delete_all_watches(client)
client.get(url_for("ui.form_delete", uuid="all"), follow_redirects=True)
def _subtest_xpath_rss(client, datastore_path, content_type='text/html'):
@@ -584,7 +582,7 @@ def _subtest_xpath_rss(client, datastore_path, content_type='text/html'):
assert b"Lets go discount" in res.data, f"When testing for Lets go discount called with content type '{content_type}'"
assert b"Events and Announcements" not in res.data, f"When testing for Lets go discount called with content type '{content_type}'" # It should not be here because thats not our selector target
delete_all_watches(client)
client.get(url_for("ui.form_delete", uuid="all"), follow_redirects=True)
# Be sure all-in-the-wild types of RSS feeds work with xpath
def test_rss_xpath(client, live_server, measure_memory_usage, datastore_path):
+28 -104
View File
@@ -6,42 +6,6 @@ from flask import url_for
import logging
import time
import os
import threading
# Thread-safe global storage for test endpoint content
# Avoids filesystem cache issues in parallel tests
_test_endpoint_content_lock = threading.Lock()
_test_endpoint_content = {}


def write_test_file_and_sync(filepath, content, mode='w'):
    """
    Write test data to file and ensure it's synced to disk.

    The same bytes are also stored in the module-level ``_test_endpoint_content``
    dict (keyed by the file's basename, guarded by ``_test_endpoint_content_lock``)
    so readers can bypass the filesystem entirely.

    The file is always written in binary form using the UTF-8 encoding of
    ``content``. Previously text mode used the platform default encoding, so the
    file on disk could differ from the cached bytes, and passing ``bytes`` with
    the default ``mode='w'`` (or ``str`` with ``'wb'``) raised ``TypeError`` after
    the cache had already been updated.

    Args:
        filepath: Full path to file
        content: Content to write (str or bytes)
        mode: File mode ('w' for text, 'wb' for binary); only the open/truncate/
            append behaviour is taken from it, the data is written as bytes.
    """
    # Normalise to bytes once so that cache and file receive identical data
    if isinstance(content, str):
        content_bytes = content.encode('utf-8')
    else:
        content_bytes = content

    # Store in thread-safe global dict for instant access
    with _test_endpoint_content_lock:
        _test_endpoint_content[os.path.basename(filepath)] = content_bytes

    # Force the binary variant of the requested mode ('w' -> 'wb', 'wt' -> 'wb', 'a' -> 'ab')
    binary_mode = mode.replace('t', '')
    if 'b' not in binary_mode:
        binary_mode += 'b'

    # Also write to file for compatibility
    with open(filepath, binary_mode) as f:
        f.write(content_bytes)
        f.flush()  # Flush Python buffer to OS
        os.fsync(f.fileno())  # Force OS to write to disk
def set_original_response(datastore_path, extra_title=''):
test_return_data = f"""<html>
@@ -56,7 +20,8 @@ def set_original_response(datastore_path, extra_title=''):
</html>
"""
write_test_file_and_sync(os.path.join(datastore_path, "endpoint-content.txt"), test_return_data)
with open(os.path.join(datastore_path, "endpoint-content.txt"), "w") as f:
f.write(test_return_data)
return None
def set_modified_response(datastore_path):
@@ -71,7 +36,9 @@ def set_modified_response(datastore_path):
</html>
"""
write_test_file_and_sync(os.path.join(datastore_path, "endpoint-content.txt"), test_return_data)
with open(os.path.join(datastore_path, "endpoint-content.txt"), "w") as f:
f.write(test_return_data)
return None
def set_longer_modified_response(datastore_path):
test_return_data = """<html>
@@ -88,7 +55,9 @@ def set_longer_modified_response(datastore_path):
</html>
"""
write_test_file_and_sync(os.path.join(datastore_path, "endpoint-content.txt"), test_return_data)
with open(os.path.join(datastore_path, "endpoint-content.txt"), "w") as f:
f.write(test_return_data)
return None
def set_more_modified_response(datastore_path):
@@ -104,14 +73,17 @@ def set_more_modified_response(datastore_path):
</html>
"""
write_test_file_and_sync(os.path.join(datastore_path, "endpoint-content.txt"), test_return_data)
with open(os.path.join(datastore_path, "endpoint-content.txt"), "w") as f:
f.write(test_return_data)
return None
def set_empty_text_response(datastore_path):
test_return_data = """<html><body></body></html>"""
write_test_file_and_sync(os.path.join(datastore_path, "endpoint-content.txt"), test_return_data)
with open(os.path.join(datastore_path, "endpoint-content.txt"), "w") as f:
f.write(test_return_data)
return None
@@ -160,40 +132,21 @@ def extract_UUID_from_client(client):
return uuid.strip()
def delete_all_watches(client=None):
    """Delete every watch from the datastore and empty the update queue.

    Resets the datastore's change-tracking attributes, deletes each watch found
    under ``data['watching']``, drains ``update_q`` so queued work cannot leak
    into the next test, then sleeps briefly.

    Args:
        client: Flask test client whose application config holds the
            ``DATASTORE`` entry. Despite the ``None`` default it is required.
    """
    datastore = client.application.config.get('DATASTORE')

    # Change tracking
    datastore._dirty_watches = set()  # Watch UUIDs that need saving
    datastore._dirty_settings = False  # Settings changed
    datastore._watch_hashes = {}  # UUID -> SHA256 hash for change detection

    # Iterate over a snapshot of the keys; delete() is called for each of them
    for watch_uuid in list(datastore.data['watching']):
        datastore.delete(watch_uuid)

    from changedetectionio.flask_app import update_q

    # Clear the queue to prevent leakage to next test
    # Use clear() method to ensure both priority_items and notification_queue are drained
    if hasattr(update_q, 'clear'):
        update_q.clear()
    else:
        # Fallback for old implementation
        while not update_q.empty():
            try:
                update_q.get_nowait()
            except Exception:
                # Best effort: stop draining on any queue error, but do not
                # swallow KeyboardInterrupt/SystemExit as a bare except would.
                break

    time.sleep(0.2)
def wait_for_all_checks(client=None):
"""
Waits until the queue is empty and workers are idle.
Delegates to worker_pool.wait_for_all_checks for shared logic.
Delegates to worker_handler.wait_for_all_checks for shared logic.
"""
from changedetectionio.flask_app import update_q as global_update_q
from changedetectionio import worker_pool
return worker_pool.wait_for_all_checks(global_update_q, timeout=150)
from changedetectionio import worker_handler
# Use the shared wait logic from worker_handler
return worker_handler.wait_for_all_checks(global_update_q, timeout=150)
def wait_for_watch_history(client, min_history_count=2, timeout=10):
"""
@@ -242,11 +195,8 @@ def new_live_server_setup(live_server):
@live_server.app.route('/test-endpoint')
def test_endpoint():
# REMOVED: logger.debug() causes file locking between test process and Flask server process
# Flask server runs in separate multiprocessing.Process and inherited loguru tries to
# write to same log files, causing request handlers to block on file locks
# from loguru import logger
# logger.debug(f"/test-endpoint hit {request}")
from loguru import logger
logger.debug(f"/test-endpoint hit {request}")
ctype = request.args.get('content_type')
status_code = request.args.get('status_code')
content = request.args.get('content') or None
@@ -268,35 +218,15 @@ def new_live_server_setup(live_server):
resp.headers['Content-Type'] = ctype if ctype else 'text/html'
return resp
# Check thread-safe global dict first (instant, no cache issues)
# Fall back to file if not in dict (for tests that write directly)
with _test_endpoint_content_lock:
content_data = _test_endpoint_content.get("endpoint-content.txt")
if content_data is None:
# Not in global dict, read from file
datastore_path = current_app.config.get('TEST_DATASTORE_PATH', 'test-datastore')
filepath = os.path.join(datastore_path, "endpoint-content.txt")
# REMOVED: os.sync() was blocking for many seconds during parallel tests
# With -n 6+ parallel tests, heavy I/O causes os.sync() to wait for ALL
# system writes to complete, causing "Read timed out" errors
# File writes from test code are already flushed by the time workers fetch
try:
with open(filepath, "rb") as f:
content_data = f.read()
except Exception as e:
# REMOVED: logger.error() causes file locking in multiprocess context
# Just raise the exception directly for debugging
raise
resp = make_response(content_data, status_code)
if uppercase_headers:
resp.headers['CONTENT-TYPE'] = ctype if ctype else 'text/html'
else:
resp.headers['Content-Type'] = ctype if ctype else 'text/html'
return resp
# Tried using a global var here but didn't seem to work, so reading from a file instead.
datastore_path = current_app.config.get('TEST_DATASTORE_PATH', 'test-datastore')
with open(os.path.join(datastore_path, "endpoint-content.txt"), "rb") as f:
resp = make_response(f.read(), status_code)
if uppercase_headers:
resp.headers['CONTENT-TYPE'] = ctype if ctype else 'text/html'
else:
resp.headers['Content-Type'] = ctype if ctype else 'text/html'
return resp
except FileNotFoundError:
return make_response('', status_code)
@@ -371,12 +301,6 @@ def new_live_server_setup(live_server):
def test_pdf_endpoint():
datastore_path = current_app.config.get('TEST_DATASTORE_PATH', 'test-datastore')
# Force filesystem sync before reading to ensure fresh data
try:
os.sync()
except (AttributeError, PermissionError):
pass
# Tried using a global var here but didn't seem to work, so reading from a file instead.
with open(os.path.join(datastore_path, "endpoint-test.pdf"), "rb") as f:
resp = make_response(f.read(), 200)
@@ -23,14 +23,11 @@ _uuid_processing_lock = threading.Lock() # Protects currently_processing_uuids
USE_ASYNC_WORKERS = True
# Custom ThreadPoolExecutor for queue operations with named threads
# Scale executor threads to match FETCH_WORKERS (no minimum, no maximum)
# Thread naming: "QueueGetter-N" for easy debugging in thread dumps/traces
# With FETCH_WORKERS=10: 10 workers + 10 executor threads = 20 threads total
# With FETCH_WORKERS=500: 500 workers + 500 executor threads = 1000 threads total (acceptable on modern systems)
_max_executor_workers = int(os.getenv("FETCH_WORKERS", "10"))
# Scale executor threads with FETCH_WORKERS to avoid bottleneck at high concurrency
_max_executor_workers = max(50, int(os.getenv("FETCH_WORKERS", "10")))
queue_executor = ThreadPoolExecutor(
max_workers=_max_executor_workers,
thread_name_prefix="QueueGetter-" # Shows in thread dumps as "QueueGetter-0", "QueueGetter-1", etc.
thread_name_prefix="QueueGetter-"
)
@@ -85,17 +82,16 @@ class WorkerThread:
self.loop = None
def start(self):
"""Start the worker thread with descriptive name for debugging"""
"""Start the worker thread"""
self.thread = threading.Thread(
target=self.run,
daemon=True,
name=f"PageFetchAsyncUpdateWorker-{self.worker_id}" # Shows in thread dumps with worker ID
name=f"PageFetchAsyncUpdateWorker-{self.worker_id}"
)
self.thread.start()
def stop(self):
"""Stop the worker thread brutally - no waiting"""
# Try to stop the event loop if it exists
"""Stop the worker thread"""
if self.loop and self.running:
try:
# Signal the loop to stop
@@ -103,7 +99,8 @@ class WorkerThread:
except RuntimeError:
pass
# Don't wait - thread is daemon and will die when needed
if self.thread and self.thread.is_alive():
self.thread.join(timeout=2.0)
def start_async_workers(n_workers, update_q, notification_q, app, datastore):
@@ -128,7 +125,7 @@ def start_async_workers(n_workers, update_q, notification_q, app, datastore):
async def start_single_async_worker(worker_id, update_q, notification_q, app, datastore, executor=None):
"""Start a single async worker with auto-restart capability"""
from changedetectionio.worker import async_update_worker
from changedetectionio.async_update_worker import async_update_worker
# Check if we're in pytest environment - if so, be more gentle with logging
import os
@@ -340,36 +337,24 @@ def queue_item_async_safe(update_q, item, silent=False):
def shutdown_workers():
"""Shutdown all async workers brutally - no delays, no waiting"""
global worker_threads, queue_executor
"""Shutdown all async workers fast and aggressively"""
global worker_threads
# Check if we're in pytest environment - if so, be more gentle with logging
import os
in_pytest = "pytest" in os.sys.modules or "PYTEST_CURRENT_TEST" in os.environ
if not in_pytest:
logger.info("Brutal shutdown of async workers initiated...")
logger.info("Fast shutdown of async workers initiated...")
# Stop all worker event loops
# Stop all worker threads
for worker in worker_threads:
worker.stop()
# Clear immediately - threads are daemon and will die
worker_threads.clear()
# Shutdown the queue executor to prevent "cannot schedule new futures after shutdown" errors
# This must happen AFTER workers are stopped to avoid race conditions
if queue_executor:
try:
queue_executor.shutdown(wait=False)
if not in_pytest:
logger.debug("Queue executor shut down")
except Exception as e:
if not in_pytest:
logger.warning(f"Error shutting down queue executor: {e}")
if not in_pytest:
logger.info("Async workers brutal shutdown complete")
logger.info("Async workers fast shutdown complete")
@@ -484,14 +469,12 @@ def wait_for_all_checks(update_q, timeout=150):
elif time.time() - empty_since >= 0.3:
# Add small buffer for filesystem operations to complete
time.sleep(0.2)
logger.trace("wait_for_all_checks: All checks complete (queue empty, workers idle)")
return True
else:
empty_since = None
attempt += 1
logger.warning(f"wait_for_all_checks: Timeout after {timeout} attempts")
return False # Timeout
-7
View File
@@ -16,13 +16,6 @@ services:
# Log output levels: TRACE, DEBUG(default), INFO, SUCCESS, WARNING, ERROR, CRITICAL
# - LOGGER_LEVEL=TRACE
#
# Plugins! See https://changedetection.io/plugins for more plugins.
# Install additional Python packages (processor plugins, etc.)
# Example: Install the OSINT reconnaissance processor plugin
# - EXTRA_PACKAGES=changedetection.io-osint-processor
# Multiple packages can be installed by separating with spaces:
# - EXTRA_PACKAGES=changedetection.io-osint-processor another-plugin
#
#
# Uncomment below and the "sockpuppetbrowser" to use a real Chrome browser (It uses the "playwright" protocol)
# - PLAYWRIGHT_DRIVER_URL=ws://browser-sockpuppet-chrome:3000
-28
View File
@@ -1,28 +0,0 @@
#!/bin/bash
# Container entrypoint: optionally installs the pip packages listed in
# EXTRA_PACKAGES, then replaces itself with the command given as arguments.
set -e

# Install additional packages from EXTRA_PACKAGES env var
# Uses a marker file to avoid reinstalling on every container restart
INSTALLED_MARKER="/datastore/.extra_packages_installed"
CURRENT_PACKAGES="$EXTRA_PACKAGES"

if [ -n "$EXTRA_PACKAGES" ]; then
    # Check if we need to install/update packages: the marker stores the exact
    # package list that was installed last time.
    if [ ! -f "$INSTALLED_MARKER" ] || [ "$(cat "$INSTALLED_MARKER" 2>/dev/null)" != "$CURRENT_PACKAGES" ]; then
        echo "Installing extra packages: $EXTRA_PACKAGES"
        # pip is run as the `if` condition: with `set -e` a separate `$?` check
        # is never reached on failure, so the error message below was dead code.
        # $EXTRA_PACKAGES is intentionally unquoted so that a space-separated
        # list expands to one argument per package.
        if pip3 install --no-cache-dir $EXTRA_PACKAGES; then
            echo "$CURRENT_PACKAGES" > "$INSTALLED_MARKER"
            echo "Extra packages installed successfully"
        else
            echo "ERROR: Failed to install extra packages"
            exit 1
        fi
    else
        echo "Extra packages already installed: $EXTRA_PACKAGES"
    fi
fi

# Execute the main command
exec "$@"
+3 -3
View File
@@ -8,7 +8,7 @@ flask-paginate
flask_expects_json~=1.7
flask_restful
flask_cors # For the Chrome extension to operate
# janus # No longer needed - using pure threading.Queue for multi-loop support
janus # Thread-safe async/sync queue bridge
flask_wtf~=1.2
flask~=3.1
flask-socketio~=5.6.0
@@ -51,9 +51,9 @@ linkify-it-py
# - Needed for apprise/spush, and maybe others? hopefully doesnt trigger a rust compile.
# - Requires extra wheel for rPi, adds build time for arm/v8 which is not in piwheels
# Pinned to 44.x for ARM compatibility and sslyze compatibility (sslyze requires <45) and (45.x may not have pre-built ARM wheels)
# Pinned to 43.0.1 for ARM compatibility (45.x may not have pre-built ARM wheels)
# Also pinned because dependabot wants specific versions
cryptography==44.0.0
cryptography==46.0.3
# apprise mqtt https://github.com/dgtlmoon/changedetection.io/issues/315
# use any version other than 2.0.x due to https://github.com/eclipse/paho.mqtt.python/issues/814