Mirror of https://github.com/dgtlmoon/changedetection.io.git (synced 2026-02-12 01:06:04 +00:00)
Compare commits: dependabot ... master (7 commits)

| Author | SHA1 | Date |
|---|---|---|
| | 4bc01aca8d | |
| | ef41dd304c | |
| | 5726c5a0ac | |
| | 80f7decf4f | |
| | c66a29b011 | |
| | a1a2e5c5bf | |
| | 6e90a0bbd1 | |
141 .github/workflows/test-stack-reusable-workflow.yml (vendored)
@@ -103,7 +103,7 @@ jobs:
docker run test-changedetectionio bash -c 'python3 -m unittest changedetectionio.tests.unit.test_watch_model'
docker run test-changedetectionio bash -c 'python3 -m unittest changedetectionio.tests.unit.test_jinja2_security'
docker run test-changedetectionio bash -c 'python3 -m unittest changedetectionio.tests.unit.test_semver'
docker run test-changedetectionio bash -c 'python3 -m unittest changedetectionio.tests.unit.test_html_to_text'
docker run test-changedetectionio bash -c 'python3 -m unittest changedetectionio.tests.unit.test_html_to_text'

# Basic pytest tests with ancillary services
basic-tests:
@@ -516,3 +516,142 @@ jobs:
exit 1
fi
docker rm sig-test

# Upgrade path test
upgrade-path-test:
runs-on: ubuntu-latest
needs: build
timeout-minutes: 25
env:
PYTHON_VERSION: ${{ inputs.python-version }}
steps:
- uses: actions/checkout@v6
with:
fetch-depth: 0 # Fetch all history and tags for upgrade testing

- name: Set up Python ${{ env.PYTHON_VERSION }}
uses: actions/setup-python@v6
with:
python-version: ${{ env.PYTHON_VERSION }}

- name: Check upgrade works without error
run: |
echo "=== Testing upgrade path from 0.49.1 to ${{ github.ref_name }} (${{ github.sha }}) ==="

# Checkout old version and create datastore
git checkout 0.49.1
python3 -m venv .venv
source .venv/bin/activate
pip install -r requirements.txt
pip install 'pyOpenSSL>=23.2.0'

echo "=== Running version 0.49.1 to create datastore ==="
python3 ./changedetection.py -C -d /tmp/data &
APP_PID=$!

# Wait for app to be ready
echo "Waiting for 0.49.1 to be ready..."
sleep 6

# Extract API key from datastore (0.49.1 uses url-watches.json)
API_KEY=$(jq -r '.settings.application.api_access_token // empty' /tmp/data/url-watches.json)
echo "API Key: ${API_KEY:0:8}..."

# Create a watch with tag "github-group-test" via API
echo "Creating test watch with tag via API..."
curl -X POST "http://127.0.0.1:5000/api/v1/watch" \
-H "x-api-key: ${API_KEY}" \
-H "Content-Type: application/json" \
--show-error --fail \
--retry 6 --retry-delay 1 --retry-connrefused \
-d '{
"url": "https://example.com/upgrade-test",
"tag": "github-group-test"
}'

echo "✓ Created watch with tag 'github-group-test'"

# Create a specific test URL watch
echo "Creating test URL watch via API..."
curl -X POST "http://127.0.0.1:5000/api/v1/watch" \
-H "x-api-key: ${API_KEY}" \
-H "Content-Type: application/json" \
--show-error --fail \
-d '{
"url": "http://localhost/test.txt"
}'

echo "✓ Created watch for 'http://localhost/test.txt' in version 0.49.1"

# Stop the old version gracefully
kill $APP_PID
wait $APP_PID || true
echo "✓ Version 0.49.1 stopped"

# Upgrade to current version (use commit SHA since we're in detached HEAD)
echo "Upgrading to commit ${{ github.sha }}"
git checkout ${{ github.sha }}
pip install -r requirements.txt

echo "=== Running current version (commit ${{ github.sha }}) with old datastore (testing mode) ==="
TESTING_SHUTDOWN_AFTER_DATASTORE_LOAD=1 python3 ./changedetection.py -d /tmp/data > /tmp/upgrade-test.log 2>&1

echo "=== Upgrade test output ==="
cat /tmp/upgrade-test.log
echo "✓ Datastore upgraded successfully"

# Now start the current version normally to verify the tag survived
echo "=== Starting current version to verify tag exists after upgrade ==="
timeout 20 python3 ./changedetection.py -d /tmp/data > /tmp/ui-test.log 2>&1 &
APP_PID=$!

# Wait for app to be ready and fetch UI
echo "Waiting for current version to be ready..."
sleep 5
curl --retry 6 --retry-delay 1 --retry-connrefused --silent http://127.0.0.1:5000 > /tmp/ui-output.html

# Verify tag exists in UI
if grep -q "github-group-test" /tmp/ui-output.html; then
echo "✓ Tag 'github-group-test' found in UI after upgrade"
else
echo "ERROR: Tag 'github-group-test' not found in UI after upgrade"
echo "=== UI Output ==="
cat /tmp/ui-output.html
echo "=== App Log ==="
cat /tmp/ui-test.log
kill $APP_PID || true
exit 1
fi

# Verify test URL exists in UI
if grep -q "http://localhost/test.txt" /tmp/ui-output.html; then
echo "✓ Watch URL 'http://localhost/test.txt' found in UI after upgrade"
else
echo "ERROR: Watch URL 'http://localhost/test.txt' not found in UI after upgrade"
echo "=== UI Output ==="
cat /tmp/ui-output.html
echo "=== App Log ==="
cat /tmp/ui-test.log
kill $APP_PID || true
exit 1
fi

# Cleanup
kill $APP_PID || true
wait $APP_PID || true

echo ""
echo "✓✓✓ Upgrade test passed: 0.49.1 → ${{ github.ref_name }} ✓✓✓"
echo " - Commit: ${{ github.sha }}"
echo " - Datastore migrated successfully"
echo " - Tag 'github-group-test' survived upgrade"
echo " - Watch URL 'http://localhost/test.txt' survived upgrade"

echo "✓ Upgrade test passed: 0.49.1 → ${{ github.ref_name }}"

- name: Upload upgrade test logs
if: always()
uses: actions/upload-artifact@v6
with:
name: upgrade-test-logs-py${{ env.PYTHON_VERSION }}
path: /tmp/upgrade-test.log
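The upgrade test above drives the 0.49.1 instance with curl and jq. For local reproduction, a rough Python equivalent of the "extract API key, then create a tagged watch" steps might look like the following sketch; the datastore path, port and payload mirror the workflow, and nothing here is part of the shipped code:

```python
# Hedged sketch: local reproduction of the API-key extraction and watch creation
# performed by the upgrade-path test above. Assumes a 0.49.1 instance is already
# running on 127.0.0.1:5000 with its datastore in /tmp/data.
import json
import requests

# 0.49.1 keeps everything in url-watches.json, including the API access token
with open("/tmp/data/url-watches.json") as f:
    api_key = json.load(f)["settings"]["application"]["api_access_token"]

resp = requests.post(
    "http://127.0.0.1:5000/api/v1/watch",
    headers={"x-api-key": api_key, "Content-Type": "application/json"},
    json={"url": "https://example.com/upgrade-test", "tag": "github-group-test"},
    timeout=10,
)
resp.raise_for_status()  # equivalent to curl --fail
```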
@@ -371,7 +371,15 @@ def main():
# Dont' start if the JSON DB looks corrupt
logger.critical(f"ERROR: JSON DB or Proxy List JSON at '{app_config['datastore_path']}' appears to be corrupt, aborting.")
logger.critical(str(e))
return
sys.exit(1)

# Testing mode: Exit cleanly after datastore initialization (for CI/CD upgrade tests)
if os.environ.get('TESTING_SHUTDOWN_AFTER_DATASTORE_LOAD'):
logger.success(f"TESTING MODE: Datastore loaded successfully from {app_config['datastore_path']}")
logger.success(f"TESTING MODE: Schema version: {datastore.data['settings']['application'].get('schema_version', 'unknown')}")
logger.success(f"TESTING MODE: Loaded {len(datastore.data['watching'])} watches")
logger.success("TESTING MODE: Exiting cleanly (TESTING_SHUTDOWN_AFTER_DATASTORE_LOAD is set)")
sys.exit(0)

# Apply all_paused setting if specified via CLI
if all_paused is not None:
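A brief sketch of how this testing-mode gate can be exercised from a test harness, mirroring the TESTING_SHUTDOWN_AFTER_DATASTORE_LOAD=1 step in the workflow above; the entry point and paths are assumptions carried over from that workflow:

```python
# Hedged sketch: exercising the testing-mode exit shown above from a harness.
import os
import subprocess

env = dict(os.environ, TESTING_SHUTDOWN_AFTER_DATASTORE_LOAD="1")
proc = subprocess.run(
    ["python3", "./changedetection.py", "-d", "/tmp/data"],
    env=env,
    capture_output=True,
    text=True,
    timeout=120,
)
# Exit code 0 means the datastore (including any schema migration) loaded cleanly.
print(proc.returncode, proc.stdout[-500:])
```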
@@ -2,8 +2,12 @@ from changedetectionio.strtobool import strtobool
from flask_restful import abort, Resource
from flask import request
from functools import wraps
from . import auth, validate_openapi_request
from . import auth, validate_openapi_request, schema_create_watch
from ..validate_url import is_safe_valid_url
import json

# Number of URLs above which import switches to background processing
IMPORT_SWITCH_TO_BACKGROUND_THRESHOLD = 20


def default_content_type(content_type='text/plain'):
@@ -19,6 +23,62 @@ def default_content_type(content_type='text/plain'):
return decorator


def convert_query_param_to_type(value, schema_property):
"""
Convert a query parameter string to the appropriate type based on schema definition.

Args:
value: String value from query parameter
schema_property: Schema property definition with 'type' or 'anyOf' field

Returns:
Converted value in the appropriate type
"""
# Handle anyOf schemas (extract the first type)
if 'anyOf' in schema_property:
# Use the first non-null type from anyOf
for option in schema_property['anyOf']:
if option.get('type') and option.get('type') != 'null':
prop_type = option.get('type')
break
else:
prop_type = None
else:
prop_type = schema_property.get('type')

# Handle array type (e.g., notification_urls)
if prop_type == 'array':
# Support both comma-separated and JSON array format
if value.startswith('['):
try:
return json.loads(value)
except json.JSONDecodeError:
return [v.strip() for v in value.split(',')]
return [v.strip() for v in value.split(',')]

# Handle object type (e.g., time_between_check, headers)
elif prop_type == 'object':
try:
return json.loads(value)
except json.JSONDecodeError:
raise ValueError(f"Invalid JSON object for field: {value}")

# Handle boolean type
elif prop_type == 'boolean':
return strtobool(value)

# Handle integer type
elif prop_type == 'integer':
return int(value)

# Handle number type (float)
elif prop_type == 'number':
return float(value)

# Default: return as string
return value
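A short usage sketch for convert_query_param_to_type() as defined above; the schema fragments are illustrative stand-ins, not copied from the real OpenAPI schema:

```python
# Hedged usage sketch; schema fragments below are illustrative only.
convert_query_param_to_type("300", {"type": "integer"})           # -> 300
convert_query_param_to_type("1.5", {"type": "number"})            # -> 1.5
convert_query_param_to_type("true", {"type": "boolean"})          # -> truthy, via strtobool
convert_query_param_to_type("a,b,c", {"type": "array"})           # -> ['a', 'b', 'c']
convert_query_param_to_type('["a", "b"]', {"type": "array"})      # -> ['a', 'b']
convert_query_param_to_type('{"hours": 6}', {"type": "object"})   # -> {'hours': 6}
# anyOf: the first non-null type wins
convert_query_param_to_type("42", {"anyOf": [{"type": "integer"}, {"type": "null"}]})  # -> 42
```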
class Import(Resource):
def __init__(self, **kwargs):
# datastore is a black box dependency
@@ -28,40 +88,127 @@ class Import(Resource):
@default_content_type('text/plain') #3547 #3542
@validate_openapi_request('importWatches')
def post(self):
"""Import a list of watched URLs."""
"""Import a list of watched URLs with optional watch configuration."""

# Special parameters that are NOT watch configuration
special_params = {'tag', 'tag_uuids', 'dedupe', 'proxy'}

extras = {}

# Handle special 'proxy' parameter
if request.args.get('proxy'):
plist = self.datastore.proxy_list
if not request.args.get('proxy') in plist:
return "Invalid proxy choice, currently supported proxies are '{}'".format(', '.join(plist)), 400
proxy_list_str = ', '.join(plist) if plist else 'none configured'
return f"Invalid proxy choice, currently supported proxies are '{proxy_list_str}'", 400
else:
extras['proxy'] = request.args.get('proxy')

# Handle special 'dedupe' parameter
dedupe = strtobool(request.args.get('dedupe', 'true'))

# Handle special 'tag' and 'tag_uuids' parameters
tags = request.args.get('tag')
tag_uuids = request.args.get('tag_uuids')

if tag_uuids:
tag_uuids = tag_uuids.split(',')

# Extract ALL other query parameters as watch configuration
schema_properties = schema_create_watch.get('properties', {})
for param_name, param_value in request.args.items():
# Skip special parameters
if param_name in special_params:
continue

# Skip if not in schema (unknown parameter)
if param_name not in schema_properties:
return f"Unknown watch configuration parameter: {param_name}", 400

# Convert to appropriate type based on schema
try:
converted_value = convert_query_param_to_type(param_value, schema_properties[param_name])
extras[param_name] = converted_value
except (ValueError, json.JSONDecodeError) as e:
return f"Invalid value for parameter '{param_name}': {str(e)}", 400

# Validate processor if provided
if 'processor' in extras:
from changedetectionio.processors import available_processors
available = [p[0] for p in available_processors()]
if extras['processor'] not in available:
return f"Invalid processor '{extras['processor']}'. Available processors: {', '.join(available)}", 400

# Validate fetch_backend if provided
if 'fetch_backend' in extras:
from changedetectionio.content_fetchers import available_fetchers
available = [f[0] for f in available_fetchers()]
# Also allow 'system' and extra_browser_* patterns
is_valid = (
extras['fetch_backend'] == 'system' or
extras['fetch_backend'] in available or
extras['fetch_backend'].startswith('extra_browser_')
)
if not is_valid:
return f"Invalid fetch_backend '{extras['fetch_backend']}'. Available: system, {', '.join(available)}", 400

# Validate notification_urls if provided
if 'notification_urls' in extras:
from wtforms import ValidationError
from changedetectionio.api.Notifications import validate_notification_urls
try:
validate_notification_urls(extras['notification_urls'])
except ValidationError as e:
return f"Invalid notification_urls: {str(e)}", 400

urls = request.get_data().decode('utf8').splitlines()
added = []
# Clean and validate URLs upfront
urls_to_import = []
for url in urls:
url = url.strip()
if not len(url):
continue

# If hosts that only contain alphanumerics are allowed ("localhost" for example)
# Validate URL
if not is_safe_valid_url(url):
return f"Invalid or unsupported URL - {url}", 400

# Check for duplicates if dedupe is enabled
if dedupe and self.datastore.url_exists(url):
continue

new_uuid = self.datastore.add_watch(url=url, extras=extras, tag=tags, tag_uuids=tag_uuids)
added.append(new_uuid)
urls_to_import.append(url)

return added
# For small imports, process synchronously for immediate feedback
if len(urls_to_import) < IMPORT_SWITCH_TO_BACKGROUND_THRESHOLD:
added = []
for url in urls_to_import:
new_uuid = self.datastore.add_watch(url=url, extras=extras, tag=tags, tag_uuids=tag_uuids)
added.append(new_uuid)
return added, 200

# For large imports (>= 20), process in background thread
else:
import threading
from loguru import logger

def import_watches_background():
"""Background thread to import watches - discarded after completion."""
try:
added_count = 0
for url in urls_to_import:
try:
self.datastore.add_watch(url=url, extras=extras, tag=tags, tag_uuids=tag_uuids)
added_count += 1
except Exception as e:
logger.error(f"Error importing URL {url}: {e}")

logger.info(f"Background import complete: {added_count} watches created")
except Exception as e:
logger.error(f"Error in background import: {e}")

# Start background thread and return immediately
thread = threading.Thread(target=import_watches_background, daemon=True, name="ImportWatches-Background")
thread.start()

return {'status': f'Importing {len(urls_to_import)} URLs in background', 'count': len(urls_to_import)}, 202
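Taken together, the import endpoint now accepts any schema-defined watch field as a query parameter and switches to a background thread at 20 or more URLs. A hedged client-side sketch follows; the /api/v1/import path, header names and example parameter values are assumptions drawn from this diff, not verified against the shipped OpenAPI spec:

```python
# Hedged client sketch for the extended import endpoint described above.
import requests

urls = "\n".join([
    "https://example.com/page-1",
    "https://example.com/page-2",
])

resp = requests.post(
    "http://127.0.0.1:5000/api/v1/import",   # path is an assumption
    params={
        "tag": "imported",
        "dedupe": "true",
        # any schema_create_watch property may be passed; values are converted per schema type
        "time_between_check": '{"hours": 6}',
    },
    headers={"x-api-key": "YOUR-API-KEY", "Content-Type": "text/plain"},
    data=urls,
    timeout=30,
)
# Fewer than 20 URLs: synchronous, returns a list of new watch UUIDs with HTTP 200.
# 20 or more URLs: background import, returns a status dict with HTTP 202.
print(resp.status_code, resp.text)
```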
@@ -481,6 +481,7 @@ class CreateWatch(Resource):
'last_error': watch['last_error'],
'link': watch.link,
'page_title': watch['page_title'],
'tags': [*tags], # Unpack dict keys to list (can't use list() since variable named 'list')
'title': watch['title'],
'url': watch['url'],
'viewed': watch.viewed

@@ -20,11 +20,9 @@ See: Watch.py model docstring for full Pydantic architecture explanation
See: processors/restock_diff/processor.py:184-192 for current manual implementation
"""

import os
from changedetectionio.model import watch_base
from changedetectionio.model.persistence import EntityPersistenceMixin


class model(EntityPersistenceMixin, watch_base):
"""
Tag domain model - groups watches and can override their settings.

@@ -2,7 +2,7 @@ import os
import uuid

from changedetectionio import strtobool
from .persistence import EntityPersistenceMixin
from .persistence import EntityPersistenceMixin, _determine_entity_type

__all__ = ['EntityPersistenceMixin', 'watch_base']

@@ -511,10 +511,8 @@ class watch_base(dict):
# Save to disk via subclass implementation
try:
# Determine entity type from module name (Watch.py -> watch, Tag.py -> tag)
from changedetectionio.model.persistence import _determine_entity_type
entity_type = _determine_entity_type(self.__class__)
filename = f"{entity_type}.json"

self._save_to_disk(data_dict, uuid)
logger.debug(f"Committed {entity_type} {uuid} to {uuid}/{filename}")
except Exception as e:

@@ -56,6 +56,259 @@ def _deduplicate_prices(data):
return list(unique_data)


# =============================================================================
# MEMORY MANAGEMENT: Why We Use Multiprocessing (Linux Only)
# =============================================================================
#
# The get_itemprop_availability() function uses 'extruct' to parse HTML metadata
# (JSON-LD, microdata, OpenGraph, etc). Extruct internally uses lxml, which wraps
# libxml2 - a C library that allocates memory at the C level.
#
# Memory Leak Problem:
# --------------------
# 1. lxml's document_fromstring() creates thousands of Python objects backed by
#    C-level allocations (nodes, attributes, text content)
# 2. Python's garbage collector can mark these objects as collectible, but
#    cannot force the OS to reclaim the actual C-level memory
# 3. malloc/free typically doesn't return memory to OS - it just marks it as
#    "free in the process address space"
# 4. With repeated parsing of large HTML (5MB+ pages), memory accumulates even
#    after Python GC runs
#
# Why Multiprocessing Fixes This:
# --------------------------------
# When a subprocess exits, the OS forcibly reclaims ALL memory including C-level
# allocations that Python GC couldn't release. This ensures clean memory state
# after each extraction.
#
# Performance Impact:
# -------------------
# - Memray analysis showed 1.2M document_fromstring allocations per page
# - Without subprocess: memory grows by ~50-500MB per parse and lingers
# - With subprocess: ~35MB overhead but forces full cleanup after each run
# - Trade-off: 35MB resource_tracker vs 500MB+ accumulated leak = much better at scale
#
# References:
# -----------
# - lxml memory issues: https://medium.com/devopss-hole/python-lxml-memory-leak-b8d0b1000dc7
# - libxml2 caching behavior: https://www.mail-archive.com/lxml@python.org/msg00026.html
# - GC limitations with C extensions: https://benbernardblog.com/tracking-down-a-freaky-python-memory-leak-part-2/
#
# Additional Context:
# -------------------
# - jsonpath_ng (used to query the parsed data) is pure Python and doesn't leak
# - The leak is specifically from lxml's document parsing, not the JSONPath queries
# - Linux-only because multiprocessing spawn is well-tested there; other platforms
#   use direct call as fallback
#
# Alternative Solution (Future Optimization):
# -------------------------------------------
# This entire problem could be avoided by using regex to extract just the machine
# data blocks (JSON-LD, microdata, OpenGraph tags) BEFORE parsing with lxml:
#
# 1. Use regex to extract <script type="application/ld+json">...</script> blocks
# 2. Use regex to extract <meta property="og:*"> tags
# 3. Use regex to find itemprop/itemtype attributes and their containing elements
# 4. Parse ONLY those extracted snippets instead of the entire HTML document
#
# Benefits:
# - Avoids parsing 5MB of HTML when we only need a few KB of metadata
# - Eliminates the lxml memory leak entirely
# - Faster extraction (regex is much faster than DOM parsing)
# - No subprocess overhead needed
#
# Trade-offs:
# - Regex for HTML is brittle (comments, CDATA, edge cases)
# - Microdata extraction would be complex (need to track element boundaries)
# - Would need extensive testing to ensure we don't miss valid data
# - extruct is battle-tested; regex solution would need similar maturity
#
# For now, the subprocess approach is safer and leverages existing extruct code.
# =============================================================================

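As a concrete illustration of the regex pre-extraction idea sketched in the comment block above (illustrative only: the code that ships in this change uses html.parser in pure_python_extractor plus an extruct fallback, not this regex):

```python
# Minimal sketch of the "extract machine-data blocks before parsing" idea above.
# Not part of this change; the shipped code uses html.parser and extruct instead.
import json
import re

JSONLD_RE = re.compile(
    r'<script[^>]*type=["\']application/ld\+json["\'][^>]*>(.*?)</script>',
    re.IGNORECASE | re.DOTALL,
)

def extract_jsonld_blocks(html: str) -> list:
    """Return parsed JSON-LD blocks without building a full lxml DOM."""
    blocks = []
    for match in JSONLD_RE.finditer(html):
        try:
            parsed = json.loads(match.group(1))
            blocks.extend(parsed if isinstance(parsed, list) else [parsed])
        except json.JSONDecodeError:
            # Regex-on-HTML is brittle (comments, CDATA); a real parser handles these cases
            continue
    return blocks
```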
def _extract_itemprop_availability_worker(pipe_conn):
|
||||
"""
|
||||
Subprocess worker for itemprop extraction (Linux memory management).
|
||||
|
||||
Uses spawn multiprocessing to isolate extruct/lxml memory allocations.
|
||||
When the subprocess exits, the OS reclaims ALL memory including lxml's
|
||||
C-level allocations that Python's GC cannot release.
|
||||
|
||||
Args:
|
||||
pipe_conn: Pipe connection to receive HTML and send result
|
||||
"""
|
||||
import json
|
||||
import gc
|
||||
|
||||
html_content = None
|
||||
result_data = None
|
||||
|
||||
try:
|
||||
# Receive HTML as raw bytes (no pickle)
|
||||
html_bytes = pipe_conn.recv_bytes()
|
||||
html_content = html_bytes.decode('utf-8')
|
||||
|
||||
# Explicitly delete html_bytes to free memory
|
||||
del html_bytes
|
||||
gc.collect()
|
||||
|
||||
# Perform extraction in subprocess (uses extruct/lxml)
|
||||
result_data = get_itemprop_availability(html_content)
|
||||
|
||||
# Convert Restock object to dict for JSON serialization
|
||||
result = {
|
||||
'success': True,
|
||||
'data': dict(result_data) if result_data else {}
|
||||
}
|
||||
pipe_conn.send_bytes(json.dumps(result).encode('utf-8'))
|
||||
|
||||
# Clean up before exit
|
||||
del result_data, html_content, result
|
||||
gc.collect()
|
||||
|
||||
except MoreThanOnePriceFound:
|
||||
# Serialize the specific exception type
|
||||
result = {
|
||||
'success': False,
|
||||
'exception_type': 'MoreThanOnePriceFound'
|
||||
}
|
||||
pipe_conn.send_bytes(json.dumps(result).encode('utf-8'))
|
||||
|
||||
except Exception as e:
|
||||
# Serialize other exceptions
|
||||
result = {
|
||||
'success': False,
|
||||
'exception_type': type(e).__name__,
|
||||
'exception_message': str(e)
|
||||
}
|
||||
pipe_conn.send_bytes(json.dumps(result).encode('utf-8'))
|
||||
|
||||
finally:
|
||||
# Final cleanup before subprocess exits
|
||||
# Variables may already be deleted in try block, so use try/except
|
||||
try:
|
||||
del html_content
|
||||
except (NameError, UnboundLocalError):
|
||||
pass
|
||||
try:
|
||||
del result_data
|
||||
except (NameError, UnboundLocalError):
|
||||
pass
|
||||
gc.collect()
|
||||
pipe_conn.close()
|
||||
|
||||
|
||||
def extract_itemprop_availability_safe(html_content) -> Restock:
|
||||
"""
|
||||
Extract itemprop availability with hybrid approach for memory efficiency.
|
||||
|
||||
Strategy (fastest to slowest, least to most memory):
|
||||
1. Try pure Python extraction (JSON-LD, OpenGraph, microdata) - covers 80%+ of cases
|
||||
2. Fall back to extruct with subprocess isolation on Linux for complex cases
|
||||
|
||||
Args:
|
||||
html_content: HTML string to parse
|
||||
|
||||
Returns:
|
||||
Restock: Extracted availability data
|
||||
|
||||
Raises:
|
||||
MoreThanOnePriceFound: When multiple prices detected
|
||||
Other exceptions: From extruct/parsing
|
||||
"""
|
||||
import platform
|
||||
|
||||
# Step 1: Try pure Python extraction first (fast, no lxml, no memory leak)
|
||||
try:
|
||||
from .pure_python_extractor import extract_metadata_pure_python, query_price_availability
|
||||
|
||||
logger.trace("Attempting pure Python metadata extraction (no lxml)")
|
||||
extracted_data = extract_metadata_pure_python(html_content)
|
||||
price_data = query_price_availability(extracted_data)
|
||||
|
||||
# If we got price AND availability, we're done!
|
||||
if price_data.get('price') and price_data.get('availability'):
|
||||
result = Restock(price_data)
|
||||
logger.debug(f"Pure Python extraction successful: {dict(result)}")
|
||||
return result
|
||||
|
||||
# If we got some data but not everything, still try extruct for completeness
|
||||
if price_data.get('price') or price_data.get('availability'):
|
||||
logger.debug(f"Pure Python extraction partial: {price_data}, will try extruct for completeness")
|
||||
|
||||
except Exception as e:
|
||||
logger.debug(f"Pure Python extraction failed: {e}, falling back to extruct")
|
||||
|
||||
# Step 2: Fall back to extruct (uses lxml, needs subprocess on Linux)
|
||||
logger.trace("Falling back to extruct (lxml-based) with subprocess isolation")
|
||||
|
||||
# Only use subprocess isolation on Linux
|
||||
# Other platforms may have issues with spawn or don't need the aggressive memory management
|
||||
if platform.system() == 'Linux':
|
||||
import multiprocessing
|
||||
import json
|
||||
import gc
|
||||
|
||||
try:
|
||||
ctx = multiprocessing.get_context('spawn')
|
||||
parent_conn, child_conn = ctx.Pipe()
|
||||
p = ctx.Process(target=_extract_itemprop_availability_worker, args=(child_conn,))
|
||||
p.start()
|
||||
|
||||
# Send HTML as raw bytes (no pickle)
|
||||
html_bytes = html_content.encode('utf-8')
|
||||
parent_conn.send_bytes(html_bytes)
|
||||
|
||||
# Explicitly delete html_bytes copy immediately after sending
|
||||
del html_bytes
|
||||
gc.collect()
|
||||
|
||||
# Receive result as JSON
|
||||
result_bytes = parent_conn.recv_bytes()
|
||||
result = json.loads(result_bytes.decode('utf-8'))
|
||||
|
||||
# Wait for subprocess to complete
|
||||
p.join()
|
||||
|
||||
# Close pipes
|
||||
parent_conn.close()
|
||||
child_conn.close()
|
||||
|
||||
# Clean up all subprocess-related objects
|
||||
del p, parent_conn, child_conn, result_bytes
|
||||
gc.collect()
|
||||
|
||||
# Handle result or re-raise exception
|
||||
if result['success']:
|
||||
# Reconstruct Restock object from dict
|
||||
restock_obj = Restock(result['data'])
|
||||
# Clean up result dict
|
||||
del result
|
||||
gc.collect()
|
||||
return restock_obj
|
||||
else:
|
||||
# Re-raise the exception that occurred in subprocess
|
||||
exception_type = result['exception_type']
|
||||
exception_msg = result.get('exception_message', '')
|
||||
del result
|
||||
gc.collect()
|
||||
|
||||
if exception_type == 'MoreThanOnePriceFound':
|
||||
raise MoreThanOnePriceFound()
|
||||
else:
|
||||
raise Exception(f"{exception_type}: {exception_msg}")
|
||||
|
||||
except Exception as e:
|
||||
# If multiprocessing itself fails, log and fall back to direct call
|
||||
logger.warning(f"Subprocess extraction failed: {e}, falling back to direct call")
|
||||
gc.collect()
|
||||
return get_itemprop_availability(html_content)
|
||||
else:
|
||||
# Non-Linux: direct call (no subprocess overhead needed)
|
||||
return get_itemprop_availability(html_content)
|
||||
|
||||
|
||||
# should return Restock()
|
||||
# add casting?
|
||||
def get_itemprop_availability(html_content) -> Restock:
|
||||
@@ -196,8 +449,9 @@ class perform_site_check(difference_detection_processor):
|
||||
multiple_prices_found = False
|
||||
|
||||
# Try built-in extraction first, this will scan metadata in the HTML
|
||||
# On Linux, this runs in a subprocess to prevent lxml/extruct memory leaks
|
||||
try:
|
||||
itemprop_availability = get_itemprop_availability(self.fetcher.content)
|
||||
itemprop_availability = extract_itemprop_availability_safe(self.fetcher.content)
|
||||
except MoreThanOnePriceFound as e:
|
||||
# Don't raise immediately - let plugins try to handle this case
|
||||
# Plugins might be able to determine which price is correct
|
||||
|
||||
@@ -0,0 +1,286 @@
|
||||
"""
|
||||
Pure Python metadata extractor - no lxml, no memory leaks.
|
||||
|
||||
This module provides a fast, memory-efficient alternative to extruct for common
|
||||
e-commerce metadata extraction. It handles:
|
||||
- JSON-LD (covers 80%+ of modern sites)
|
||||
- OpenGraph meta tags
|
||||
- Basic microdata attributes
|
||||
|
||||
Uses Python's built-in html.parser instead of lxml/libxml2, avoiding C-level
|
||||
memory allocation issues. For edge cases, the main processor can fall back to
|
||||
extruct (with subprocess isolation on Linux).
|
||||
"""
|
||||
|
||||
from html.parser import HTMLParser
|
||||
import json
|
||||
import re
|
||||
from loguru import logger
|
||||
|
||||
|
||||
class JSONLDExtractor(HTMLParser):
|
||||
"""
|
||||
Extract JSON-LD structured data from HTML.
|
||||
|
||||
Finds all <script type="application/ld+json"> tags and parses their content.
|
||||
Handles multiple JSON-LD blocks on the same page.
|
||||
"""
|
||||
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
self.in_jsonld = False
|
||||
self.data = [] # List of all parsed JSON-LD objects
|
||||
self.current_script = []
|
||||
|
||||
def handle_starttag(self, tag, attrs):
|
||||
if tag == 'script':
|
||||
# Check if this is a JSON-LD script tag
|
||||
for attr, value in attrs:
|
||||
if attr == 'type' and value == 'application/ld+json':
|
||||
self.in_jsonld = True
|
||||
self.current_script = []
|
||||
break
|
||||
|
||||
def handle_data(self, data):
|
||||
if self.in_jsonld:
|
||||
self.current_script.append(data)
|
||||
|
||||
def handle_endtag(self, tag):
|
||||
if tag == 'script' and self.in_jsonld:
|
||||
# Parse the accumulated script content
|
||||
script_content = ''.join(self.current_script)
|
||||
if script_content.strip():
|
||||
try:
|
||||
# Parse JSON (handles both objects and arrays)
|
||||
parsed = json.loads(script_content)
|
||||
if isinstance(parsed, list):
|
||||
self.data.extend(parsed)
|
||||
else:
|
||||
self.data.append(parsed)
|
||||
except json.JSONDecodeError as e:
|
||||
logger.debug(f"Failed to parse JSON-LD: {e}")
|
||||
pass
|
||||
|
||||
self.in_jsonld = False
|
||||
self.current_script = []
|
||||
|
||||
|
||||
class OpenGraphExtractor(HTMLParser):
|
||||
"""
|
||||
Extract OpenGraph meta tags from HTML.
|
||||
|
||||
Finds <meta property="og:*"> tags commonly used for social media sharing.
|
||||
"""
|
||||
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
self.og_data = {}
|
||||
|
||||
def handle_starttag(self, tag, attrs):
|
||||
if tag == 'meta':
|
||||
attrs_dict = dict(attrs)
|
||||
prop = attrs_dict.get('property', '')
|
||||
|
||||
# Extract OpenGraph properties
|
||||
if prop.startswith('og:'):
|
||||
content = attrs_dict.get('content', '')
|
||||
if content:
|
||||
self.og_data[prop] = content
|
||||
|
||||
|
||||
class MicrodataExtractor(HTMLParser):
|
||||
"""
|
||||
Extract basic microdata attributes from HTML.
|
||||
|
||||
Finds elements with itemprop attributes. This is a simplified extractor
|
||||
that doesn't handle nested itemscope/itemtype hierarchies - for complex
|
||||
cases, use extruct as fallback.
|
||||
"""
|
||||
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
self.microdata = {}
|
||||
self.current_itemprop = None
|
||||
|
||||
def handle_starttag(self, tag, attrs):
|
||||
attrs_dict = dict(attrs)
|
||||
|
||||
if 'itemprop' in attrs_dict:
|
||||
itemprop = attrs_dict['itemprop']
|
||||
|
||||
# Price/currency/availability can be in content/href attributes
|
||||
if itemprop == 'price':
|
||||
if 'content' in attrs_dict:
|
||||
self.microdata['price'] = attrs_dict['content']
|
||||
else:
|
||||
self.current_itemprop = 'price'
|
||||
|
||||
elif itemprop == 'priceCurrency':
|
||||
if 'content' in attrs_dict:
|
||||
self.microdata['currency'] = attrs_dict['content']
|
||||
else:
|
||||
self.current_itemprop = 'priceCurrency'
|
||||
|
||||
elif itemprop == 'availability':
|
||||
# Can be in href (link) or content (meta)
|
||||
if 'href' in attrs_dict:
|
||||
self.microdata['availability'] = attrs_dict['href']
|
||||
elif 'content' in attrs_dict:
|
||||
self.microdata['availability'] = attrs_dict['content']
|
||||
else:
|
||||
self.current_itemprop = 'availability'
|
||||
|
||||
def handle_data(self, data):
|
||||
# Capture text content for itemprop elements
|
||||
if self.current_itemprop == 'price':
|
||||
# Try to extract numeric price from text
|
||||
try:
|
||||
price_text = re.sub(r'[^\d.]', '', data.strip())
|
||||
if price_text:
|
||||
self.microdata['price'] = float(price_text)
|
||||
except ValueError:
|
||||
pass
|
||||
elif self.current_itemprop == 'priceCurrency':
|
||||
currency = data.strip()
|
||||
if currency:
|
||||
self.microdata['currency'] = currency
|
||||
elif self.current_itemprop == 'availability':
|
||||
availability = data.strip()
|
||||
if availability:
|
||||
self.microdata['availability'] = availability
|
||||
|
||||
def handle_endtag(self, tag):
|
||||
# Reset current itemprop after closing tag
|
||||
self.current_itemprop = None
|
||||
|
||||
|
||||
def extract_metadata_pure_python(html_content):
|
||||
"""
|
||||
Extract structured metadata from HTML using pure Python parsers.
|
||||
|
||||
Returns a dict with three keys:
|
||||
- 'json-ld': List of parsed JSON-LD objects
|
||||
- 'opengraph': Dict of OpenGraph properties
|
||||
- 'microdata': Dict of microdata properties
|
||||
|
||||
Args:
|
||||
html_content: HTML string to parse
|
||||
|
||||
Returns:
|
||||
dict: Extracted metadata in three formats
|
||||
"""
|
||||
result = {
|
||||
'json-ld': [],
|
||||
'opengraph': {},
|
||||
'microdata': {}
|
||||
}
|
||||
|
||||
# Extract JSON-LD
|
||||
try:
|
||||
jsonld_extractor = JSONLDExtractor()
|
||||
jsonld_extractor.feed(html_content)
|
||||
result['json-ld'] = jsonld_extractor.data
|
||||
logger.trace(f"Pure Python: Found {len(jsonld_extractor.data)} JSON-LD blocks")
|
||||
except Exception as e:
|
||||
logger.debug(f"JSON-LD extraction failed: {e}")
|
||||
|
||||
# Extract OpenGraph
|
||||
try:
|
||||
og_extractor = OpenGraphExtractor()
|
||||
og_extractor.feed(html_content)
|
||||
result['opengraph'] = og_extractor.og_data
|
||||
if result['opengraph']:
|
||||
logger.trace(f"Pure Python: Found {len(og_extractor.og_data)} OpenGraph tags")
|
||||
except Exception as e:
|
||||
logger.debug(f"OpenGraph extraction failed: {e}")
|
||||
|
||||
# Extract Microdata
|
||||
try:
|
||||
microdata_extractor = MicrodataExtractor()
|
||||
microdata_extractor.feed(html_content)
|
||||
result['microdata'] = microdata_extractor.microdata
|
||||
if result['microdata']:
|
||||
logger.trace(f"Pure Python: Found microdata: {result['microdata']}")
|
||||
except Exception as e:
|
||||
logger.debug(f"Microdata extraction failed: {e}")
|
||||
|
||||
return result
|
||||
|
||||
|
||||
def query_price_availability(extracted_data):
|
||||
"""
|
||||
Query extracted metadata for price and availability information.
|
||||
|
||||
Uses jsonpath_ng to query JSON-LD data (same approach as extruct).
|
||||
Falls back to OpenGraph and microdata if JSON-LD doesn't have the data.
|
||||
|
||||
Args:
|
||||
extracted_data: Dict from extract_metadata_pure_python()
|
||||
|
||||
Returns:
|
||||
dict: {'price': float, 'currency': str, 'availability': str}
|
||||
"""
|
||||
from jsonpath_ng import parse
|
||||
|
||||
result = {}
|
||||
|
||||
# 1. Try JSON-LD first (most reliable and common)
|
||||
for data in extracted_data.get('json-ld', []):
|
||||
try:
|
||||
# Use jsonpath to find price/availability anywhere in the structure
|
||||
price_parse = parse('$..(price|Price)')
|
||||
availability_parse = parse('$..(availability|Availability)')
|
||||
currency_parse = parse('$..(priceCurrency|currency|priceCurrency)')
|
||||
|
||||
price_results = [m.value for m in price_parse.find(data)]
|
||||
if price_results and not result.get('price'):
|
||||
# Handle various price formats
|
||||
price_val = price_results[0]
|
||||
if isinstance(price_val, (int, float)):
|
||||
result['price'] = float(price_val)
|
||||
elif isinstance(price_val, str):
|
||||
# Extract numeric value from string
|
||||
try:
|
||||
result['price'] = float(re.sub(r'[^\d.]', '', price_val))
|
||||
except ValueError:
|
||||
pass
|
||||
|
||||
avail_results = [m.value for m in availability_parse.find(data)]
|
||||
if avail_results and not result.get('availability'):
|
||||
result['availability'] = str(avail_results[0])
|
||||
|
||||
curr_results = [m.value for m in currency_parse.find(data)]
|
||||
if curr_results and not result.get('currency'):
|
||||
result['currency'] = str(curr_results[0])
|
||||
|
||||
# If we found price, this JSON-LD block is good
|
||||
if result.get('price'):
|
||||
logger.debug(f"Pure Python: Found price data in JSON-LD: {result}")
|
||||
break
|
||||
|
||||
except Exception as e:
|
||||
logger.debug(f"Error querying JSON-LD: {e}")
|
||||
continue
|
||||
|
||||
# 2. Try OpenGraph if JSON-LD didn't provide everything
|
||||
og_data = extracted_data.get('opengraph', {})
|
||||
if not result.get('price') and 'og:price:amount' in og_data:
|
||||
try:
|
||||
result['price'] = float(og_data['og:price:amount'])
|
||||
except ValueError:
|
||||
pass
|
||||
if not result.get('currency') and 'og:price:currency' in og_data:
|
||||
result['currency'] = og_data['og:price:currency']
|
||||
if not result.get('availability') and 'og:availability' in og_data:
|
||||
result['availability'] = og_data['og:availability']
|
||||
|
||||
# 3. Use microdata as last resort
|
||||
microdata = extracted_data.get('microdata', {})
|
||||
if not result.get('price') and 'price' in microdata:
|
||||
result['price'] = microdata['price']
|
||||
if not result.get('currency') and 'currency' in microdata:
|
||||
result['currency'] = microdata['currency']
|
||||
if not result.get('availability') and 'availability' in microdata:
|
||||
result['availability'] = microdata['availability']
|
||||
|
||||
return result
|
||||
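A brief usage sketch of the two pure-Python helpers defined above; the HTML is a toy JSON-LD snippet, and the extracted values naturally depend on the page:

```python
# Hedged usage sketch for extract_metadata_pure_python() and query_price_availability().
html = '''
<html><head>
<script type="application/ld+json">
{"@type": "Product",
 "offers": {"price": "19.99", "priceCurrency": "EUR",
            "availability": "https://schema.org/InStock"}}
</script>
</head><body></body></html>
'''

extracted = extract_metadata_pure_python(html)
price_data = query_price_availability(extracted)
# price_data is a dict of whatever was found, typically some of: price, currency, availability
print(price_data)
```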
@@ -184,7 +184,8 @@ $(document).ready(function() {
|
||||
}
|
||||
// If it's a button in a form, submit the form
|
||||
else if ($element.is('button')) {
|
||||
$element.closest('form').submit();
|
||||
// Use requestSubmit() to include the button's name/value in the form data
|
||||
$element.closest('form')[0].requestSubmit($element[0]);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
@@ -33,9 +33,8 @@ except ImportError:
|
||||
from ..processors import get_custom_watch_obj_for_processor
|
||||
|
||||
# Import the base class and helpers
|
||||
from .file_saving_datastore import FileSavingDataStore, load_all_watches, load_all_tags, save_watch_atomic, save_tag_atomic, save_json_atomic
|
||||
from .file_saving_datastore import FileSavingDataStore, load_all_watches, load_all_tags, save_json_atomic
|
||||
from .updates import DatastoreUpdatesMixin
|
||||
from .legacy_loader import has_legacy_datastore
|
||||
|
||||
# Because the server will run as a daemon and wont know the URL for notification links when firing off a notification
|
||||
BASE_URL_NOT_SET_TEXT = '("Base URL" not set - see settings - notifications)'
|
||||
@@ -78,7 +77,7 @@ class ChangeDetectionStore(DatastoreUpdatesMixin, FileSavingDataStore):
|
||||
logger.info(f"Backing up changedetection.json due to new version to '{db_path_version_backup}'.")
|
||||
copyfile(db_path, db_path_version_backup)
|
||||
|
||||
def _load_settings(self):
|
||||
def _load_settings(self, filename="changedetection.json"):
|
||||
"""
|
||||
Load settings from storage.
|
||||
|
||||
@@ -87,7 +86,7 @@ class ChangeDetectionStore(DatastoreUpdatesMixin, FileSavingDataStore):
|
||||
Returns:
|
||||
dict: Settings data loaded from storage
|
||||
"""
|
||||
changedetection_json = os.path.join(self.datastore_path, "changedetection.json")
|
||||
changedetection_json = os.path.join(self.datastore_path, filename)
|
||||
|
||||
logger.info(f"Loading settings from {changedetection_json}")
|
||||
|
||||
@@ -122,6 +121,11 @@ class ChangeDetectionStore(DatastoreUpdatesMixin, FileSavingDataStore):
|
||||
if 'application' in settings_data['settings']:
|
||||
self.__data['settings']['application'].update(settings_data['settings']['application'])
|
||||
|
||||
# More or less for the old format which had this data in the one url-watches.json
|
||||
# cant hurt to leave it here,
|
||||
if 'watching' in settings_data:
|
||||
self.__data['watching'].update(settings_data['watching'])
|
||||
|
||||
def _rehydrate_tags(self):
|
||||
"""Rehydrate tag entities from stored data into Tag objects with restock_diff processor."""
|
||||
from ..model import Tag
|
||||
@@ -146,23 +150,28 @@ class ChangeDetectionStore(DatastoreUpdatesMixin, FileSavingDataStore):
|
||||
logger.info(f"Rehydrating {watch_count} watches...")
|
||||
watching_rehydrated = {}
|
||||
for uuid, watch_dict in self.__data.get('watching', {}).items():
|
||||
watching_rehydrated[uuid] = self.rehydrate_entity(uuid, watch_dict)
|
||||
if isinstance(watch_dict, dict):
|
||||
watching_rehydrated[uuid] = self.rehydrate_entity(uuid, watch_dict)
|
||||
else:
|
||||
logger.error(f"Watch UUID {uuid} already rehydrated")
|
||||
|
||||
self.__data['watching'] = watching_rehydrated
|
||||
logger.success(f"Rehydrated {watch_count} watches into Watch objects")
|
||||
|
||||
|
||||
def _load_state(self):
|
||||
def _load_state(self, main_settings_filename="changedetection.json"):
|
||||
"""
|
||||
Load complete datastore state from storage.
|
||||
|
||||
Orchestrates loading of settings, watches, and tags using polymorphic methods.
|
||||
"""
|
||||
# Load settings
|
||||
settings_data = self._load_settings()
|
||||
settings_data = self._load_settings(filename=main_settings_filename)
|
||||
self._apply_settings(settings_data)
|
||||
|
||||
# Load watches (polymorphic - parent class method)
|
||||
# Load watches, scan them from the disk
|
||||
self._load_watches()
|
||||
self._rehydrate_watches()
|
||||
|
||||
# Load tags from individual tag.json files
|
||||
# These will override any tags in settings (migration path)
|
||||
@@ -200,112 +209,73 @@ class ChangeDetectionStore(DatastoreUpdatesMixin, FileSavingDataStore):
|
||||
|
||||
# Check if datastore already exists
|
||||
changedetection_json = os.path.join(self.datastore_path, "changedetection.json")
|
||||
changedetection_json_old_schema = os.path.join(self.datastore_path, "url-watches.json")
|
||||
|
||||
if os.path.exists(changedetection_json):
|
||||
# Load existing datastore (changedetection.json + watch.json files)
|
||||
logger.info("Loading existing datastore")
|
||||
try:
|
||||
self._load_state()
|
||||
except Exception as e:
|
||||
logger.critical(f"Failed to load datastore: {e}")
|
||||
raise
|
||||
|
||||
# Run schema updates if needed
|
||||
# Pass current schema version from loaded datastore (defaults to 0 if not set)
|
||||
# Load existing datastore (changedetection.json + watch.json files)
|
||||
logger.info("Loading existing datastore")
|
||||
self._load_state()
|
||||
current_schema = self.data['settings']['application'].get('schema_version', 0)
|
||||
self.run_updates(current_schema_version=current_schema)
|
||||
|
||||
# Legacy datastore detected - trigger migration, even works if the schema is much before the migration step.
|
||||
elif os.path.exists(changedetection_json_old_schema):
|
||||
|
||||
logger.critical(f"Legacy datastore detected at {changedetection_json_old_schema}, loading and running updates")
|
||||
self._load_state(main_settings_filename="url-watches.json")
|
||||
# update 26 will load the whole old config from disk to __data
|
||||
current_schema = self.__data['settings']['application'].get('schema_version', 0)
|
||||
self.run_updates(current_schema_version=current_schema)
|
||||
# Probably tags were also shifted to disk and many other changes, so best to reload here.
|
||||
self._load_state()
|
||||
|
||||
else:
|
||||
# No datastore yet - check if this is a fresh install or legacy migration
|
||||
# Generate app_guid FIRST (required for all operations)
|
||||
if "pytest" in sys.modules or "PYTEST_CURRENT_TEST" in os.environ:
|
||||
self.__data['app_guid'] = "test-" + str(uuid_builder.uuid4())
|
||||
else:
|
||||
self.__data['app_guid'] = str(uuid_builder.uuid4())
|
||||
self.init_fresh_install(include_default_watches=include_default_watches,
|
||||
version_tag=version_tag)
|
||||
|
||||
# Generate RSS access token
|
||||
self.__data['settings']['application']['rss_access_token'] = secrets.token_hex(16)
|
||||
def init_fresh_install(self, include_default_watches, version_tag):
|
||||
# Generate app_guid FIRST (required for all operations)
|
||||
if "pytest" in sys.modules or "PYTEST_CURRENT_TEST" in os.environ:
|
||||
self.__data['app_guid'] = "test-" + str(uuid_builder.uuid4())
|
||||
else:
|
||||
self.__data['app_guid'] = str(uuid_builder.uuid4())
|
||||
|
||||
# Generate API access token
|
||||
self.__data['settings']['application']['api_access_token'] = secrets.token_hex(16)
|
||||
# Generate RSS access token
|
||||
self.__data['settings']['application']['rss_access_token'] = secrets.token_hex(16)
|
||||
|
||||
# Check if legacy datastore exists (url-watches.json)
|
||||
if has_legacy_datastore(self.datastore_path):
|
||||
# Legacy datastore detected - trigger migration
|
||||
logger.critical(f"Legacy datastore detected at {self.datastore_path}/url-watches.json")
|
||||
logger.critical("Migration will be triggered via update_26")
|
||||
# Generate API access token
|
||||
self.__data['settings']['application']['api_access_token'] = secrets.token_hex(16)
|
||||
logger.warning(f"No datastore found, creating new datastore at {self.datastore_path}")
|
||||
|
||||
# Load the legacy datastore
|
||||
from .legacy_loader import load_legacy_format
|
||||
legacy_path = os.path.join(self.datastore_path, "url-watches.json")
|
||||
legacy_data = load_legacy_format(legacy_path)
|
||||
# Set schema version to latest (no updates needed)
|
||||
latest_update_available = self.get_updates_available().pop()
|
||||
logger.info(f"Marking fresh install to schema version {latest_update_available}")
|
||||
self.__data['settings']['application']['schema_version'] = latest_update_available
|
||||
|
||||
if not legacy_data:
|
||||
raise Exception("Failed to load legacy datastore from url-watches.json")
|
||||
# Add default watches if requested
|
||||
if include_default_watches:
|
||||
self.add_watch(
|
||||
url='https://news.ycombinator.com/',
|
||||
tag='Tech news',
|
||||
extras={'fetch_backend': 'html_requests'}
|
||||
)
|
||||
self.add_watch(
|
||||
url='https://changedetection.io/CHANGELOG.txt',
|
||||
tag='changedetection.io',
|
||||
extras={'fetch_backend': 'html_requests'}
|
||||
)
|
||||
|
||||
# Merge legacy data with base_config defaults (preserves new fields like 'ui')
|
||||
# self.__data already has App.model() defaults from line 190
|
||||
logger.info("Merging legacy data with base_config defaults...")
|
||||
|
||||
# Apply top-level fields from legacy data
|
||||
if 'app_guid' in legacy_data:
|
||||
self.__data['app_guid'] = legacy_data['app_guid']
|
||||
if 'build_sha' in legacy_data:
|
||||
self.__data['build_sha'] = legacy_data['build_sha']
|
||||
if 'version_tag' in legacy_data:
|
||||
self.__data['version_tag'] = legacy_data['version_tag']
|
||||
|
||||
# Apply watching data (complete replacement as these are user's watches)
|
||||
if 'watching' in legacy_data:
|
||||
self.__data['watching'] = legacy_data['watching']
|
||||
|
||||
# Merge settings sections (preserves base_config defaults for missing fields)
|
||||
if 'settings' in legacy_data:
|
||||
if 'headers' in legacy_data['settings']:
|
||||
self.__data['settings']['headers'].update(legacy_data['settings']['headers'])
|
||||
if 'requests' in legacy_data['settings']:
|
||||
self.__data['settings']['requests'].update(legacy_data['settings']['requests'])
|
||||
if 'application' in legacy_data['settings']:
|
||||
# CRITICAL: Use .update() to merge, not replace
|
||||
# This preserves new fields like 'ui' that exist in base_config
|
||||
self.__data['settings']['application'].update(legacy_data['settings']['application'])
|
||||
|
||||
# CRITICAL: Rehydrate watches from dicts into Watch objects
|
||||
# This ensures watches have their methods available during migration
|
||||
self._rehydrate_watches()
|
||||
|
||||
# update_26 will save watches to individual files and create changedetection.json
|
||||
# Next startup will load from new format normally
|
||||
self.run_updates()
|
||||
# Create changedetection.json immediately
|
||||
try:
|
||||
self._save_settings()
|
||||
logger.info("Created changedetection.json for new datastore")
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to create initial changedetection.json: {e}")
|
||||
|
||||
|
||||
else:
|
||||
# Fresh install - create new datastore
|
||||
logger.warning(f"No datastore found, creating new datastore at {self.datastore_path}")
|
||||
|
||||
# Set schema version to latest (no updates needed)
|
||||
updates_available = self.get_updates_available()
|
||||
self.__data['settings']['application']['schema_version'] = updates_available.pop() if updates_available else 26
|
||||
|
||||
# Add default watches if requested
|
||||
if include_default_watches:
|
||||
self.add_watch(
|
||||
url='https://news.ycombinator.com/',
|
||||
tag='Tech news',
|
||||
extras={'fetch_backend': 'html_requests'}
|
||||
)
|
||||
self.add_watch(
|
||||
url='https://changedetection.io/CHANGELOG.txt',
|
||||
tag='changedetection.io',
|
||||
extras={'fetch_backend': 'html_requests'}
|
||||
)
|
||||
|
||||
# Create changedetection.json immediately
|
||||
try:
|
||||
self._save_settings()
|
||||
logger.info("Created changedetection.json for new datastore")
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to create initial changedetection.json: {e}")
|
||||
|
||||
# Set version tag
|
||||
self.__data['version_tag'] = version_tag
|
||||
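The hunk above reworks datastore start-up into three explicit paths. A condensed sketch of the resulting control flow follows; startup() is a hypothetical wrapper name, 'store' stands in for the ChangeDetectionStore instance, and the method names are taken from the diff:

```python
# Condensed summary (not literal code) of the start-up flow introduced above.
import os

def startup(store, include_default_watches, version_tag):
    current_json = os.path.join(store.datastore_path, "changedetection.json")
    legacy_json = os.path.join(store.datastore_path, "url-watches.json")

    if os.path.exists(current_json):
        # Current format: settings + per-UUID watch.json/tag.json, then schema updates
        store._load_state()
        store.run_updates(current_schema_version=store.data['settings']['application'].get('schema_version', 0))
    elif os.path.exists(legacy_json):
        # Legacy format: load the monolithic file, migrate (update_26 onwards), then reload
        store._load_state(main_settings_filename="url-watches.json")
        store.run_updates(current_schema_version=store.data['settings']['application'].get('schema_version', 0))
        store._load_state()
    else:
        # Fresh install: generate tokens, mark latest schema, optionally add default watches
        store.init_fresh_install(include_default_watches=include_default_watches,
                                 version_tag=version_tag)
```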
@@ -383,17 +353,9 @@ class ChangeDetectionStore(DatastoreUpdatesMixin, FileSavingDataStore):
|
||||
# Deep copy settings to avoid modifying the original
|
||||
settings_copy = copy.deepcopy(self.__data['settings'])
|
||||
|
||||
# Only exclude tags if we've already migrated them to individual files (schema >= 28)
|
||||
# This ensures update_28 can migrate tags from settings
|
||||
schema_version = self.__data['settings']['application'].get('schema_version', 0)
|
||||
if schema_version >= 28:
|
||||
# Tags are in individual tag.json files, don't save to settings
|
||||
settings_copy['application']['tags'] = {}
|
||||
# else: keep tags in settings for update_28 migration
|
||||
|
||||
return {
|
||||
'note': 'Settings file - watches are in {uuid}/watch.json, tags are in {uuid}/tag.json',
|
||||
'app_guid': self.__data['app_guid'],
|
||||
'app_guid': self.__data.get('app_guid'),
|
||||
'settings': settings_copy,
|
||||
'build_sha': self.__data.get('build_sha'),
|
||||
'version_tag': self.__data.get('version_tag')
|
||||
@@ -422,15 +384,14 @@ class ChangeDetectionStore(DatastoreUpdatesMixin, FileSavingDataStore):
|
||||
Implementation of abstract method from FileSavingDataStore.
|
||||
Delegates to helper function and stores results in internal data structure.
|
||||
"""
|
||||
watching = load_all_watches(
|
||||
self.datastore_path,
|
||||
self.rehydrate_entity
|
||||
)
|
||||
|
||||
# Store loaded data
|
||||
self.__data['watching'] = watching
|
||||
|
||||
logger.debug(f"Loaded {len(watching)} watches")
|
||||
# @note this will also work for the old legacy format because self.__data['watching'] should already have them loaded by this point.
|
||||
self.__data['watching'].update(load_all_watches(
|
||||
self.datastore_path,
|
||||
self.rehydrate_entity
|
||||
))
|
||||
logger.debug(f"Loaded {len(self.__data['watching'])} watches")
|
||||
|
||||
def _load_tags(self):
|
||||
"""
|
||||
|
||||
@@ -207,15 +207,6 @@ def save_watch_atomic(watch_dir, uuid, watch_dict):
|
||||
save_entity_atomic(watch_dir, uuid, watch_dict, "watch.json", "watch", max_size_mb=10)
|
||||
|
||||
|
||||
def save_tag_atomic(tag_dir, uuid, tag_dict):
|
||||
"""
|
||||
Save a tag to disk using atomic write pattern.
|
||||
|
||||
Convenience wrapper around save_entity_atomic for tags.
|
||||
Kept for backwards compatibility.
|
||||
"""
|
||||
save_entity_atomic(tag_dir, uuid, tag_dict, "tag.json", "tag", max_size_mb=1)
|
||||
|
||||
|
||||
def load_watch_from_file(watch_json, uuid, rehydrate_entity_func):
|
||||
"""
|
||||
|
||||
@@ -1,66 +0,0 @@
|
||||
"""
|
||||
Legacy format loader for url-watches.json.
|
||||
|
||||
Provides functions to detect and load from the legacy monolithic JSON format.
|
||||
Used during migration (update_26) to transition to individual watch.json files.
|
||||
"""
|
||||
|
||||
import os
|
||||
import json
|
||||
from loguru import logger
|
||||
|
||||
# Try to import orjson for faster JSON serialization
|
||||
try:
|
||||
import orjson
|
||||
HAS_ORJSON = True
|
||||
except ImportError:
|
||||
HAS_ORJSON = False
|
||||
|
||||
|
||||
def has_legacy_datastore(datastore_path):
|
||||
"""
|
||||
Check if a legacy url-watches.json file exists.
|
||||
|
||||
This is used by update_26 to determine if migration is needed.
|
||||
|
||||
Args:
|
||||
datastore_path: Path to datastore directory
|
||||
|
||||
Returns:
|
||||
bool: True if url-watches.json exists
|
||||
"""
|
||||
url_watches_json = os.path.join(datastore_path, "url-watches.json")
|
||||
return os.path.exists(url_watches_json)
|
||||
|
||||
|
||||
def load_legacy_format(json_store_path):
|
||||
"""
|
||||
Load datastore from legacy url-watches.json format.
|
||||
|
||||
Args:
|
||||
json_store_path: Full path to url-watches.json file
|
||||
|
||||
Returns:
|
||||
dict: Loaded datastore data with 'watching', 'settings', etc.
|
||||
None: If file doesn't exist or loading failed
|
||||
"""
|
||||
logger.info(f"Loading from legacy format: {json_store_path}")
|
||||
|
||||
if not os.path.isfile(json_store_path):
|
||||
logger.warning(f"Legacy file not found: {json_store_path}")
|
||||
return None
|
||||
|
||||
try:
|
||||
if HAS_ORJSON:
|
||||
with open(json_store_path, 'rb') as f:
|
||||
data = orjson.loads(f.read())
|
||||
else:
|
||||
with open(json_store_path, 'r', encoding='utf-8') as f:
|
||||
data = json.load(f)
|
||||
|
||||
logger.info(f"Loaded {len(data.get('watching', {}))} watches from legacy format")
|
||||
return data
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to load legacy format: {e}")
|
||||
return None
|
||||
@@ -16,12 +16,18 @@ import time
from loguru import logger
from copy import deepcopy


# Try to import orjson for faster JSON serialization
try:
    import orjson
    HAS_ORJSON = True
except ImportError:
    HAS_ORJSON = False

from ..html_tools import TRANSLATE_WHITESPACE_TABLE
from ..processors.restock_diff import Restock
from ..blueprint.rss import RSS_CONTENT_FORMAT_DEFAULT
from ..model import USE_SYSTEM_DEFAULT_NOTIFICATION_FORMAT_FOR_WATCH
from .file_saving_datastore import save_watch_atomic


def create_backup_tarball(datastore_path, update_number):
    """
@@ -97,7 +103,7 @@ def create_backup_tarball(datastore_path, update_number):
                    tar.add(tag_json, arcname=f"{entry}/tag.json")
                    tag_count += 1

        logger.success(f"Backup created: {backup_filename} ({watch_count} watches, {tag_count} tags)")
        logger.success(f"Backup created: {backup_filename} ({watch_count} watches from disk, {tag_count} tags from disk)")
        return backup_path

    except Exception as e:
@@ -137,6 +143,7 @@ class DatastoreUpdatesMixin:
        return updates_available

    def run_updates(self, current_schema_version=None):
        import sys
        """
        Run all pending schema updates sequentially.

@@ -160,6 +167,23 @@ class DatastoreUpdatesMixin:
        4. All changes saved via individual .commit() calls
        """
        updates_available = self.get_updates_available()
        if self.data.get('watching'):
            test_watch = self.data['watching'].get(next(iter(self.data.get('watching', {}))))
            from ..model.Watch import model

            if not isinstance(test_watch, model):
                import sys
                logger.critical("Cannot run updates! Watch structure must be re-hydrated back to a Watch model object!")
                sys.exit(1)

        if self.data['settings']['application'].get('tags',{}):
            test_tag = self.data['settings']['application'].get('tags',{}).get(next(iter(self.data['settings']['application'].get('tags',{}))))
            from ..model.Tag import model as tag_model

            if not isinstance(test_tag, tag_model):
                import sys
                logger.critical("Cannot run updates! Watch tag/group structure must be re-hydrated back to a Tag model object!")
                sys.exit(1)

        # Determine current schema version
        if current_schema_version is None:
@@ -201,10 +225,9 @@ class DatastoreUpdatesMixin:
            try:
                update_method = getattr(self, f"update_{update_n}")()
            except Exception as e:
                logger.error(f"Error while trying update_{update_n}")
                logger.error(e)
                # Don't run any more updates
                return
                logger.critical(f"Error while trying update_{update_n}")
                logger.exception(e)
                sys.exit(1)
            else:
                # Bump the version
                self.data['settings']['application']['schema_version'] = update_n
@@ -555,27 +578,6 @@ class DatastoreUpdatesMixin:
        logger.critical("COPY-based migration: url-watches.json will remain intact for rollback")
        logger.critical("=" * 80)

        # Check if already migrated
        changedetection_json = os.path.join(self.datastore_path, "changedetection.json")
        if os.path.exists(changedetection_json):
            logger.info("Migration already completed (changedetection.json exists), skipping")
            return

        # Check if we need to load legacy data
        from .legacy_loader import has_legacy_datastore, load_legacy_format

        if not has_legacy_datastore(self.datastore_path):
            logger.info("No legacy datastore found, nothing to migrate")
            return

        # Load legacy data from url-watches.json
        logger.critical("Loading legacy datastore from url-watches.json...")
        legacy_path = os.path.join(self.datastore_path, "url-watches.json")
        legacy_data = load_legacy_format(legacy_path)

        if not legacy_data:
            raise Exception("Failed to load legacy datastore from url-watches.json")

        # Populate settings from legacy data
        logger.info("Populating settings from legacy data...")
        watch_count = len(self.data['watching'])
@@ -587,9 +589,7 @@ class DatastoreUpdatesMixin:
        saved_count = 0
        for uuid, watch in self.data['watching'].items():
            try:
                watch_dict = dict(watch)
                watch_dir = os.path.join(self.datastore_path, uuid)
                save_watch_atomic(watch_dir, uuid, watch_dict)
                watch.commit()
                saved_count += 1

                if saved_count % 100 == 0:
@@ -635,18 +635,19 @@ class DatastoreUpdatesMixin:

        # Phase 4: Verify settings file exists
        logger.critical("Phase 4/4: Verifying changedetection.json exists...")
        changedetection_json_new_schema=os.path.join(self.datastore_path, "changedetection.json")
        if not os.path.isfile(changedetection_json_new_schema):
            import sys
            logger.critical("Migration failed, changedetection.json not found after update ran!")
            sys.exit(1)

        if not os.path.isfile(changedetection_json):
            raise Exception(
                "Migration failed: changedetection.json not found after save. "
                "url-watches.json remains intact, safe to retry."
            )

        logger.critical("Phase 4 complete: Verified changedetection.json exists")

        # Success! Now reload from new format
        logger.critical("Reloading datastore from new format...")
        self._load_state() # Includes load_watches
        # write it to disk, it will be saved without ['watching'] in the JSON db because we find it from disk glob
        self._save_settings()
        logger.success("Datastore reloaded from new format successfully")
        logger.critical("=" * 80)
        logger.critical("MIGRATION COMPLETED SUCCESSFULLY!")
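
Pulling the scattered hunks above together, the copy-based migration follows a simple shape: skip if already done, load the legacy file, write one watch.json per watch, then verify. The sketch below is a condensed outline using the names from the diff, not the method verbatim:

# Condensed outline of the copy-based migration shown above; illustrative only.
if os.path.exists(os.path.join(self.datastore_path, "changedetection.json")):
    return  # already migrated, safe to call repeatedly
if not has_legacy_datastore(self.datastore_path):
    return  # nothing to migrate

legacy_data = load_legacy_format(os.path.join(self.datastore_path, "url-watches.json"))

for uuid, watch in self.data['watching'].items():
    save_watch_atomic(os.path.join(self.datastore_path, uuid), uuid, dict(watch))
    watch.commit()  # each watch gets its own watch.json on disk

# url-watches.json is never deleted here, so a failed run can simply be retried.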
@@ -681,9 +682,11 @@ class DatastoreUpdatesMixin:
        - Enables independent tag versioning/backup
        - Maintains backwards compatibility (tags stay in settings too)
        """
        # Force save as tag.json (not watch.json) even if object is corrupted

        logger.critical("=" * 80)
        logger.critical("Running migration: Individual tag persistence (update_28)")
        logger.critical("Creating individual tag.json files (tags remain in settings too)")
        logger.critical("Creating individual tag.json files")
        logger.critical("=" * 80)

        tags = self.data['settings']['application'].get('tags', {})
@@ -700,27 +703,8 @@ class DatastoreUpdatesMixin:

        for uuid, tag_data in tags.items():
            try:
                # Force save as tag.json (not watch.json) even if object is corrupted
                from changedetectionio.store.file_saving_datastore import save_entity_atomic
                import os

                tag_dir = os.path.join(self.datastore_path, uuid)
                os.makedirs(tag_dir, exist_ok=True)

                # Convert to dict if it's an object
                tag_dict = dict(tag_data) if hasattr(tag_data, '__iter__') else tag_data

                # Save explicitly as tag.json
                save_entity_atomic(
                    tag_dir,
                    uuid,
                    tag_dict,
                    filename='tag.json',
                    entity_type='tag',
                    max_size_mb=1
                )
                tag_data.commit()
                saved_count += 1

                if saved_count % 10 == 0:
                    logger.info(f" Progress: {saved_count}/{tag_count} tags migrated...")

@@ -737,5 +721,5 @@ class DatastoreUpdatesMixin:
        # On next load, _load_tags() will read from tag.json files and merge with settings
        logger.info("Tags saved to both settings AND individual tag.json files")
        logger.info("Future tag edits will update both locations (dual storage)")
        logger.critical("=" * 80)

        logger.critical("=" * 80)
@@ -489,6 +489,7 @@ def test_api_import(client, live_server, measure_memory_usage, datastore_path):

    api_key = live_server.app.config['DATASTORE'].data['settings']['application'].get('api_access_token')

    # Test 1: Basic import with tag
    res = client.post(
        url_for("import") + "?tag=import-test",
        data='https://website1.com\r\nhttps://website2.com',
@@ -507,6 +508,209 @@ def test_api_import(client, live_server, measure_memory_usage, datastore_path):
    res = client.get(url_for('tags.tags_overview_page'))
    assert b'import-test' in res.data

    # Test 2: Import with watch configuration fields (issue #3845)
    # Test string field (include_filters), boolean (paused), and processor
    import urllib.parse
    params = urllib.parse.urlencode({
        'tag': 'config-test',
        'include_filters': 'div.content',
        'paused': 'true',
        'processor': 'text_json_diff',
        'title': 'Imported with Config'
    })

    res = client.post(
        url_for("import") + "?" + params,
        data='https://website3.com',
        headers={'x-api-key': api_key},
        follow_redirects=True
    )

    assert res.status_code == 200
    assert len(res.json) == 1
    uuid = res.json[0]

    # Verify the configuration was applied
    watch = live_server.app.config['DATASTORE'].data['watching'][uuid]
    assert watch['include_filters'] == ['div.content'], "include_filters should be set as array"
    assert watch['paused'] == True, "paused should be True"
    assert watch['processor'] == 'text_json_diff', "processor should be set"
    assert watch['title'] == 'Imported with Config', "title should be set"

    # Test 3: Import with array field (notification_urls) - using valid Apprise format
    params = urllib.parse.urlencode({
        'tag': 'notification-test',
        'notification_urls': 'mailto://test@example.com,mailto://admin@example.com'
    })

    res = client.post(
        url_for("import") + "?" + params,
        data='https://website4.com',
        headers={'x-api-key': api_key},
        follow_redirects=True
    )

    assert res.status_code == 200
    uuid = res.json[0]
    watch = live_server.app.config['DATASTORE'].data['watching'][uuid]
    assert 'mailto://test@example.com' in watch['notification_urls'], "notification_urls should contain first email"
    assert 'mailto://admin@example.com' in watch['notification_urls'], "notification_urls should contain second email"

    # Test 4: Import with object field (time_between_check)
    import json
    time_config = json.dumps({"hours": 2, "minutes": 30})
    params = urllib.parse.urlencode({
        'tag': 'schedule-test',
        'time_between_check': time_config
    })

    res = client.post(
        url_for("import") + "?" + params,
        data='https://website5.com',
        headers={'x-api-key': api_key},
        follow_redirects=True
    )

    assert res.status_code == 200
    uuid = res.json[0]
    watch = live_server.app.config['DATASTORE'].data['watching'][uuid]
    assert watch['time_between_check']['hours'] == 2, "time_between_check hours should be 2"
    assert watch['time_between_check']['minutes'] == 30, "time_between_check minutes should be 30"

    # Test 5: Import with invalid processor (should fail)
    res = client.post(
        url_for("import") + "?processor=invalid_processor",
        data='https://website6.com',
        headers={'x-api-key': api_key},
        follow_redirects=True
    )

    assert res.status_code == 400, "Should reject invalid processor"
    assert b"Invalid processor" in res.data, "Error message should mention invalid processor"

    # Test 6: Import with invalid field (should fail)
    res = client.post(
        url_for("import") + "?unknown_field=value",
        data='https://website7.com',
        headers={'x-api-key': api_key},
        follow_redirects=True
    )

    assert res.status_code == 400, "Should reject unknown field"
    assert b"Unknown watch configuration parameter" in res.data, "Error message should mention unknown parameter"


def test_api_import_small_synchronous(client, live_server, measure_memory_usage, datastore_path):
    """Test that small imports (< threshold) are processed synchronously"""
    from changedetectionio.api.Import import IMPORT_SWITCH_TO_BACKGROUND_THRESHOLD

    api_key = live_server.app.config['DATASTORE'].data['settings']['application'].get('api_access_token')

    # Use local test endpoint to avoid network delays
    test_url_base = url_for('test_endpoint', _external=True)

    # Create URLs: threshold - 1 to stay under limit
    num_urls = min(5, IMPORT_SWITCH_TO_BACKGROUND_THRESHOLD - 1) # Use small number for faster test
    urls = '\n'.join([f'{test_url_base}?id=small-{i}' for i in range(num_urls)])

    # Import small batch
    res = client.post(
        url_for("import") + "?tag=small-test",
        data=urls,
        headers={'x-api-key': api_key},
        follow_redirects=True
    )

    # Should return 200 OK with UUID list (synchronous)
    assert res.status_code == 200, f"Should return 200 for small imports, got {res.status_code}"
    assert isinstance(res.json, list), "Response should be a list of UUIDs"
    assert len(res.json) == num_urls, f"Should return {num_urls} UUIDs, got {len(res.json)}"

    # Verify all watches were created immediately
    for uuid in res.json:
        assert uuid in live_server.app.config['DATASTORE'].data['watching'], \
            f"Watch {uuid} should exist immediately after synchronous import"

    print(f"\n✓ Successfully created {num_urls} watches synchronously")


def test_api_import_large_background(client, live_server, measure_memory_usage, datastore_path):
    """Test that large imports (>= threshold) are processed in background thread"""
    from changedetectionio.api.Import import IMPORT_SWITCH_TO_BACKGROUND_THRESHOLD
    import time

    api_key = live_server.app.config['DATASTORE'].data['settings']['application'].get('api_access_token')

    # Use local test endpoint to avoid network delays
    test_url_base = url_for('test_endpoint', _external=True)

    # Create URLs: threshold + 10 to trigger background processing
    num_urls = IMPORT_SWITCH_TO_BACKGROUND_THRESHOLD + 10
    urls = '\n'.join([f'{test_url_base}?id=bulk-{i}' for i in range(num_urls)])

    # Import large batch
    res = client.post(
        url_for("import") + "?tag=bulk-test",
        data=urls,
        headers={'x-api-key': api_key},
        follow_redirects=True
    )

    # Should return 202 Accepted (background processing)
    assert res.status_code == 202, f"Should return 202 for large imports, got {res.status_code}"
    assert b"background" in res.data.lower(), "Response should mention background processing"

    # Extract expected count from response
    response_json = res.json
    assert 'count' in response_json, "Response should include count"
    assert response_json['count'] == num_urls, f"Count should be {num_urls}, got {response_json['count']}"

    # Wait for background thread to complete (with timeout)
    max_wait = 10 # seconds
    wait_interval = 0.5
    elapsed = 0
    watches_created = 0

    while elapsed < max_wait:
        time.sleep(wait_interval)
        elapsed += wait_interval

        # Count how many watches have been created
        watches_created = len([
            uuid for uuid, watch in live_server.app.config['DATASTORE'].data['watching'].items()
            if 'id=bulk-' in watch['url']
        ])

        if watches_created == num_urls:
            break

    # Verify all watches were created
    assert watches_created == num_urls, \
        f"Expected {num_urls} watches to be created, but found {watches_created} after {elapsed}s"

    # Verify watches have correct configuration
    bulk_watches = [
        watch for watch in live_server.app.config['DATASTORE'].data['watching'].values()
        if 'id=bulk-' in watch['url']
    ]

    assert len(bulk_watches) == num_urls, "All bulk watches should exist"

    # Check that they have the correct tag
    datastore = live_server.app.config['DATASTORE']
    # Get UUIDs of bulk watches by filtering the datastore keys
    bulk_watch_uuids = [
        uuid for uuid, watch in live_server.app.config['DATASTORE'].data['watching'].items()
        if 'id=bulk-' in watch['url']
    ]
    for watch_uuid in bulk_watch_uuids:
        tags = datastore.get_all_tags_for_watch(uuid=watch_uuid)
        tag_names = [t['title'] for t in tags.values()]
        assert 'bulk-test' in tag_names, f"Watch {watch_uuid} should have 'bulk-test' tag"

    print(f"\n✓ Successfully created {num_urls} watches in background (took {elapsed}s)")

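The two tests above pin down the import API contract: below the threshold the endpoint answers 200 with a plain list of new watch UUIDs, at or above it the endpoint answers 202 and finishes in the background, reporting only a 'count'. A small client-side sketch of handling both cases (base URL, port and API key are placeholders):

# Illustrative client for POST /api/v1/import; handles both response shapes.
import requests

def import_urls(urls, api_key, base="http://localhost:5000"):
    res = requests.post(f"{base}/api/v1/import",
                        headers={'x-api-key': api_key, 'Content-Type': 'text/plain'},
                        data="\n".join(urls))
    if res.status_code == 200:
        return res.json()  # synchronous import: list of new watch UUIDs
    if res.status_code == 202:
        info = res.json()  # background import: includes a 'count' of queued URLs
        print(f"Import of {info['count']} URLs continues in the background")
        return []
    res.raise_for_status()
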
def test_api_conflict_UI_password(client, live_server, measure_memory_usage, datastore_path):


@@ -18,7 +18,7 @@ def test_api_tags_listing(client, live_server, measure_memory_usage, datastore_p
        url_for("tags"),
        headers={'x-api-key': api_key}
    )
    assert res.text.strip() == "{}", "Should be empty list"
    assert res.get_data(as_text=True).strip() == "{}", "Should be empty list"
    assert res.status_code == 200

    res = client.post(
@@ -36,7 +36,7 @@ def test_api_tags_listing(client, live_server, measure_memory_usage, datastore_p
        headers={'x-api-key': api_key}
    )
    assert res.status_code == 200
    assert new_tag_uuid in res.text
    assert new_tag_uuid in res.get_data(as_text=True)
    assert res.json[new_tag_uuid]['title'] == tag_title
    assert res.json[new_tag_uuid]['notification_muted'] == False

@@ -118,6 +118,16 @@ def test_api_tags_listing(client, live_server, measure_memory_usage, datastore_p
    assert res.status_code == 200
    assert new_tag_uuid in res.json.get('tags', [])

    # Test that tags are returned when listing ALL watches (issue #3854)
    res = client.get(
        url_for("createwatch"), # GET /api/v1/watch - list all watches
        headers={'x-api-key': api_key}
    )
    assert res.status_code == 200
    assert watch_uuid in res.json, "Watch should be in the list"
    assert 'tags' in res.json[watch_uuid], "Tags field should be present in watch list"
    assert new_tag_uuid in res.json[watch_uuid]['tags'], "Tag UUID should be in tags array"

    # Check recheck by tag
    before_check_time = live_server.app.config['DATASTORE'].data['watching'][watch_uuid].get('last_checked')
    time.sleep(1)
@@ -148,7 +158,7 @@ def test_api_tags_listing(client, live_server, measure_memory_usage, datastore_p
        headers={'x-api-key': api_key}
    )
    assert res.status_code == 200
    assert new_tag_uuid not in res.text
    assert new_tag_uuid not in res.get_data(as_text=True)

    # Verify tag was removed from watch
    res = client.get(

@@ -5,6 +5,8 @@ from flask import url_for
from .util import live_server_setup, wait_for_all_checks, extract_rss_token_from_UI, get_UUID_for_tag_name, extract_UUID_from_client, delete_all_watches
import os

from ..store import ChangeDetectionStore


# def test_setup(client, live_server, measure_memory_usage, datastore_path):
#    live_server_setup(live_server) # Setup on conftest per function
@@ -487,7 +489,6 @@ def test_tag_json_persistence(client, live_server, measure_memory_usage, datasto
    - Tag deletion removes tag.json file
    """
    import json
    from changedetectionio.store import ChangeDetectionStore

    datastore = client.application.config.get('DATASTORE')

@@ -569,9 +570,6 @@ def test_tag_json_migration_update_27(client, live_server, measure_memory_usage,
    This simulates a pre-update_27 datastore and verifies migration works.
    """
    import json
    from changedetectionio.store import ChangeDetectionStore

    datastore = client.application.config.get('DATASTORE')

    # 1. Create multiple tags
    tag_names = ['migration-tag-1', 'migration-tag-2', 'migration-tag-3']

@@ -28,7 +28,7 @@ info:

    For example: `x-api-key: YOUR_API_KEY`

  version: 0.1.4
  version: 0.1.5
  contact:
    name: ChangeDetection.io
    url: https://github.com/dgtlmoon/changedetection.io
@@ -1503,46 +1503,92 @@ paths:
    post:
      operationId: importWatches
      tags: [Import]
      summary: Import watch URLs
      description: Import a list of URLs to monitor. Accepts line-separated URLs in request body.
      summary: Import watch URLs with configuration
      description: |
        Import a list of URLs to monitor with optional watch configuration. Accepts line-separated URLs in request body.

        **Configuration via Query Parameters:**

        You can pass ANY watch configuration field as query parameters to apply settings to all imported watches.
        All parameters from the Watch schema are supported (processor, fetch_backend, notification_urls, etc.).

        **Special Parameters:**
        - `tag` / `tag_uuids` - Assign tags to imported watches
        - `proxy` - Use specific proxy for imported watches
        - `dedupe` - Skip duplicate URLs (default: true)

        **Type Conversion:**
        - Booleans: `true`, `false`, `1`, `0`, `yes`, `no`
        - Arrays: Comma-separated or JSON format (`[item1,item2]`)
        - Objects: JSON format (`{"key":"value"}`)
        - Numbers: Parsed as int or float
      x-code-samples:
        - lang: 'curl'
          source: |
            # Basic import
            curl -X POST "http://localhost:5000/api/v1/import" \
              -H "x-api-key: YOUR_API_KEY" \
              -H "Content-Type: text/plain" \
              -d $'https://example.com\nhttps://example.org\nhttps://example.net'

            # Import with processor and fetch backend
            curl -X POST "http://localhost:5000/api/v1/import?processor=restock_diff&fetch_backend=html_webdriver" \
              -H "x-api-key: YOUR_API_KEY" \
              -H "Content-Type: text/plain" \
              -d $'https://example.com\nhttps://example.org'

            # Import with multiple settings
            curl -X POST "http://localhost:5000/api/v1/import?processor=restock_diff&paused=true&tag=production" \
              -H "x-api-key: YOUR_API_KEY" \
              -H "Content-Type: text/plain" \
              -d $'https://example.com'
        - lang: 'Python'
          source: |
            import requests

            headers = {
                'x-api-key': 'YOUR_API_KEY',
                'Content-Type': 'text/plain'
            }

            # Basic import
            urls = 'https://example.com\nhttps://example.org\nhttps://example.net'
            response = requests.post('http://localhost:5000/api/v1/import',
                                     headers=headers, data=urls)
            print(response.json())

            # Import with configuration
            params = {
                'processor': 'restock_diff',
                'fetch_backend': 'html_webdriver',
                'paused': 'false',
                'tag': 'production'
            }
            response = requests.post('http://localhost:5000/api/v1/import',
                                     headers=headers, params=params, data=urls)
            print(response.json())
      parameters:
        - name: tag_uuids
          in: query
          description: Tag UUID to apply to imported web page change monitors (watches)
          description: Tag UUID(s) to apply to imported watches (comma-separated for multiple)
          schema:
            type: string
          example: "550e8400-e29b-41d4-a716-446655440000"
        - name: tag
          in: query
          description: Tag name to apply to imported web page change monitors (watches)
          description: Tag name to apply to imported watches
          schema:
            type: string
          example: "production"
        - name: proxy
          in: query
          description: Proxy key to use for imported web page change monitors (watches)
          description: Proxy key to use for imported watches
          schema:
            type: string
          example: "proxy1"
        - name: dedupe
          in: query
          description: Remove duplicate URLs (default true)
          description: Skip duplicate URLs (default true)
          schema:
            type: boolean
            default: true
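
To round out the Python sample above, the documented type-conversion rules can be exercised from a client by encoding booleans, comma-separated arrays and JSON objects as query parameters. A hedged example (values are arbitrary, the endpoint URL and API key are placeholders):

# Illustrative only: exercises the documented query-parameter type conversion on /api/v1/import.
import json
import requests

params = {
    'tag': 'production',
    'paused': 'true',                                                      # boolean as 'true'/'false'/'1'/'0'
    'notification_urls': 'mailto://a@example.com,mailto://b@example.com',  # array as comma-separated values
    'time_between_check': json.dumps({"hours": 2, "minutes": 30}),         # object as JSON
}
res = requests.post('http://localhost:5000/api/v1/import',
                    headers={'x-api-key': 'YOUR_API_KEY', 'Content-Type': 'text/plain'},
                    params=params,
                    data='https://example.com\nhttps://example.org')
print(res.status_code, res.json())
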
File diff suppressed because one or more lines are too long