mirror of
https://github.com/dgtlmoon/changedetection.io.git
synced 2026-04-01 16:48:11 +00:00
Compare commits
15 Commits
memfix-lin
...
API-fields
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
fffcc9af39 | ||
|
|
961901c594 | ||
|
|
340421ea36 | ||
|
|
f29c4c8f5f | ||
|
|
9702b6c8a1 | ||
|
|
798fc21f1c | ||
|
|
0c6931c07c | ||
|
|
60ed2a26ea | ||
|
|
490ca0a663 | ||
|
|
10c9df288a | ||
|
|
f54725d292 | ||
|
|
acf9e4a1e6 | ||
|
|
7ddc0f9be0 | ||
|
|
20f11c5c4a | ||
|
|
4bc01aca8d |
@@ -2,7 +2,7 @@ from changedetectionio.strtobool import strtobool
|
||||
from flask_restful import abort, Resource
|
||||
from flask import request
|
||||
from functools import wraps
|
||||
from . import auth, validate_openapi_request, schema_create_watch
|
||||
from . import auth, validate_openapi_request
|
||||
from ..validate_url import is_safe_valid_url
|
||||
import json
|
||||
|
||||
@@ -33,9 +33,25 @@ def convert_query_param_to_type(value, schema_property):
|
||||
|
||||
Returns:
|
||||
Converted value in the appropriate type
|
||||
|
||||
Supports both OpenAPI 3.1 formats:
|
||||
- type: [string, 'null'] (array format)
|
||||
- anyOf: [{type: string}, {type: null}] (anyOf format)
|
||||
"""
|
||||
# Handle anyOf schemas (extract the first type)
|
||||
if 'anyOf' in schema_property:
|
||||
prop_type = schema_property.get('type')
|
||||
|
||||
# Handle OpenAPI 3.1 type arrays: type: [string, 'null']
|
||||
if isinstance(prop_type, list):
|
||||
# Use the first non-null type from the array
|
||||
for t in prop_type:
|
||||
if t != 'null':
|
||||
prop_type = t
|
||||
break
|
||||
else:
|
||||
prop_type = None
|
||||
|
||||
# Handle anyOf schemas (older format)
|
||||
elif 'anyOf' in schema_property:
|
||||
# Use the first non-null type from anyOf
|
||||
for option in schema_property['anyOf']:
|
||||
if option.get('type') and option.get('type') != 'null':
|
||||
@@ -43,8 +59,6 @@ def convert_query_param_to_type(value, schema_property):
|
||||
break
|
||||
else:
|
||||
prop_type = None
|
||||
else:
|
||||
prop_type = schema_property.get('type')
|
||||
|
||||
# Handle array type (e.g., notification_urls)
|
||||
if prop_type == 'array':
|
||||
@@ -89,7 +103,7 @@ class Import(Resource):
|
||||
@validate_openapi_request('importWatches')
|
||||
def post(self):
|
||||
"""Import a list of watched URLs with optional watch configuration."""
|
||||
|
||||
from . import get_watch_schema_properties
|
||||
# Special parameters that are NOT watch configuration
|
||||
special_params = {'tag', 'tag_uuids', 'dedupe', 'proxy'}
|
||||
|
||||
@@ -115,7 +129,8 @@ class Import(Resource):
|
||||
tag_uuids = tag_uuids.split(',')
|
||||
|
||||
# Extract ALL other query parameters as watch configuration
|
||||
schema_properties = schema_create_watch.get('properties', {})
|
||||
# Get schema from OpenAPI spec (replaces old schema_create_watch)
|
||||
schema_properties = get_watch_schema_properties()
|
||||
for param_name, param_value in request.args.items():
|
||||
# Skip special parameters
|
||||
if param_name in special_params:
|
||||
|
||||
@@ -1,8 +1,6 @@
|
||||
from flask_expects_json import expects_json
|
||||
from flask_restful import Resource, abort
|
||||
from flask import request
|
||||
from . import auth, validate_openapi_request
|
||||
from . import schema_create_notification_urls, schema_delete_notification_urls
|
||||
|
||||
class Notifications(Resource):
|
||||
def __init__(self, **kwargs):
|
||||
@@ -22,7 +20,6 @@ class Notifications(Resource):
|
||||
|
||||
@auth.check_token
|
||||
@validate_openapi_request('addNotifications')
|
||||
@expects_json(schema_create_notification_urls)
|
||||
def post(self):
|
||||
"""Create Notification URLs."""
|
||||
|
||||
@@ -50,7 +47,6 @@ class Notifications(Resource):
|
||||
|
||||
@auth.check_token
|
||||
@validate_openapi_request('replaceNotifications')
|
||||
@expects_json(schema_create_notification_urls)
|
||||
def put(self):
|
||||
"""Replace Notification URLs."""
|
||||
json_data = request.get_json()
|
||||
@@ -73,7 +69,6 @@ class Notifications(Resource):
|
||||
|
||||
@auth.check_token
|
||||
@validate_openapi_request('deleteNotifications')
|
||||
@expects_json(schema_delete_notification_urls)
|
||||
def delete(self):
|
||||
"""Delete Notification URLs."""
|
||||
|
||||
|
||||
@@ -1,6 +1,5 @@
|
||||
from changedetectionio import queuedWatchMetaData
|
||||
from changedetectionio import worker_pool
|
||||
from flask_expects_json import expects_json
|
||||
from flask_restful import abort, Resource
|
||||
from loguru import logger
|
||||
|
||||
@@ -8,8 +7,7 @@ import threading
|
||||
from flask import request
|
||||
from . import auth
|
||||
|
||||
# Import schemas from __init__.py
|
||||
from . import schema_tag, schema_create_tag, schema_update_tag, validate_openapi_request
|
||||
from . import validate_openapi_request
|
||||
|
||||
|
||||
class Tag(Resource):
|
||||
@@ -69,7 +67,25 @@ class Tag(Resource):
|
||||
tag.commit()
|
||||
return "OK", 200
|
||||
|
||||
return tag
|
||||
# Filter out Watch-specific runtime fields that don't apply to Tags (yet)
|
||||
# TODO: Future enhancement - aggregate these values from all Watches that have this tag:
|
||||
# - check_count: sum of all watches' check_count
|
||||
# - last_checked: most recent last_checked from all watches
|
||||
# - last_changed: most recent last_changed from all watches
|
||||
# - consecutive_filter_failures: count of watches with failures
|
||||
# - etc.
|
||||
# These come from watch_base inheritance but currently have no meaningful value for Tags
|
||||
watch_only_fields = {
|
||||
'browser_steps_last_error_step', 'check_count', 'consecutive_filter_failures',
|
||||
'content-type', 'fetch_time', 'last_changed', 'last_checked', 'last_error',
|
||||
'last_notification_error', 'last_viewed', 'notification_alert_count',
|
||||
'page_title', 'previous_md5', 'previous_md5_before_filters', 'remote_server_reply'
|
||||
}
|
||||
|
||||
# Create clean tag dict without Watch-specific fields
|
||||
clean_tag = {k: v for k, v in tag.items() if k not in watch_only_fields}
|
||||
|
||||
return clean_tag
|
||||
|
||||
@auth.check_token
|
||||
@validate_openapi_request('deleteTag')
|
||||
@@ -102,24 +118,46 @@ class Tag(Resource):
|
||||
|
||||
@auth.check_token
|
||||
@validate_openapi_request('updateTag')
|
||||
@expects_json(schema_update_tag)
|
||||
def put(self, uuid):
|
||||
"""Update tag information."""
|
||||
tag = self.datastore.data['settings']['application']['tags'].get(uuid)
|
||||
if not tag:
|
||||
abort(404, message='No tag exists with the UUID of {}'.format(uuid))
|
||||
|
||||
# Make a mutable copy of request.json for modification
|
||||
json_data = dict(request.json)
|
||||
|
||||
# Validate notification_urls if provided
|
||||
if 'notification_urls' in request.json:
|
||||
if 'notification_urls' in json_data:
|
||||
from wtforms import ValidationError
|
||||
from changedetectionio.api.Notifications import validate_notification_urls
|
||||
try:
|
||||
notification_urls = request.json.get('notification_urls', [])
|
||||
notification_urls = json_data.get('notification_urls', [])
|
||||
validate_notification_urls(notification_urls)
|
||||
except ValidationError as e:
|
||||
return str(e), 400
|
||||
|
||||
tag.update(request.json)
|
||||
# Filter out readOnly fields (extracted from OpenAPI spec Tag schema)
|
||||
# These are system-managed fields that should never be user-settable
|
||||
from . import get_readonly_tag_fields
|
||||
readonly_fields = get_readonly_tag_fields()
|
||||
|
||||
# Tag model inherits from watch_base but has no @property attributes of its own
|
||||
# So we only need to filter readOnly fields
|
||||
for field in readonly_fields:
|
||||
json_data.pop(field, None)
|
||||
|
||||
# Validate remaining fields - reject truly unknown fields
|
||||
# Get valid fields from Tag schema
|
||||
from . import get_tag_schema_properties
|
||||
valid_fields = set(get_tag_schema_properties().keys())
|
||||
|
||||
# Check for unknown fields
|
||||
unknown_fields = set(json_data.keys()) - valid_fields
|
||||
if unknown_fields:
|
||||
return f"Unknown field(s): {', '.join(sorted(unknown_fields))}", 400
|
||||
|
||||
tag.update(json_data)
|
||||
tag.commit()
|
||||
|
||||
return "OK", 200
|
||||
@@ -127,13 +165,21 @@ class Tag(Resource):
|
||||
|
||||
@auth.check_token
|
||||
@validate_openapi_request('createTag')
|
||||
# Only cares for {'title': 'xxxx'}
|
||||
def post(self):
|
||||
"""Create a single tag/group."""
|
||||
|
||||
json_data = request.get_json()
|
||||
title = json_data.get("title",'').strip()
|
||||
|
||||
# Validate that only valid fields are provided
|
||||
# Get valid fields from Tag schema
|
||||
from . import get_tag_schema_properties
|
||||
valid_fields = set(get_tag_schema_properties().keys())
|
||||
|
||||
# Check for unknown fields
|
||||
unknown_fields = set(json_data.keys()) - valid_fields
|
||||
if unknown_fields:
|
||||
return f"Unknown field(s): {', '.join(sorted(unknown_fields))}", 400
|
||||
|
||||
new_uuid = self.datastore.add_tag(title=title)
|
||||
if new_uuid:
|
||||
|
||||
@@ -8,13 +8,11 @@ from . import auth
|
||||
from changedetectionio import queuedWatchMetaData, strtobool
|
||||
from changedetectionio import worker_pool
|
||||
from flask import request, make_response, send_from_directory
|
||||
from flask_expects_json import expects_json
|
||||
from flask_restful import abort, Resource
|
||||
from loguru import logger
|
||||
import copy
|
||||
|
||||
# Import schemas from __init__.py
|
||||
from . import schema, schema_create_watch, schema_update_watch, validate_openapi_request
|
||||
from . import validate_openapi_request, get_readonly_watch_fields
|
||||
from ..notification import valid_notification_formats
|
||||
from ..notification.handler import newline_re
|
||||
|
||||
@@ -121,7 +119,6 @@ class Watch(Resource):
|
||||
|
||||
@auth.check_token
|
||||
@validate_openapi_request('updateWatch')
|
||||
@expects_json(schema_update_watch)
|
||||
def put(self, uuid):
|
||||
"""Update watch information."""
|
||||
watch = self.datastore.data['watching'].get(uuid)
|
||||
@@ -175,6 +172,35 @@ class Watch(Resource):
|
||||
# Extract and remove processor config fields from json_data
|
||||
processor_config_data = processors.extract_processor_config_from_form_data(json_data)
|
||||
|
||||
# Filter out readOnly fields (extracted from OpenAPI spec Watch schema)
|
||||
# These are system-managed fields that should never be user-settable
|
||||
readonly_fields = get_readonly_watch_fields()
|
||||
|
||||
# Also filter out @property attributes (computed/derived values from the model)
|
||||
# These are not stored and should be ignored in PUT requests
|
||||
from changedetectionio.model.Watch import model as WatchModel
|
||||
property_fields = WatchModel.get_property_names()
|
||||
|
||||
# Combine both sets of fields to ignore
|
||||
fields_to_ignore = readonly_fields | property_fields
|
||||
|
||||
# Remove all ignored fields from update data
|
||||
for field in fields_to_ignore:
|
||||
json_data.pop(field, None)
|
||||
|
||||
# Validate remaining fields - reject truly unknown fields
|
||||
# Get valid fields from WatchBase schema
|
||||
from . import get_watch_schema_properties
|
||||
valid_fields = set(get_watch_schema_properties().keys())
|
||||
|
||||
# Also allow last_viewed (explicitly defined in UpdateWatch schema)
|
||||
valid_fields.add('last_viewed')
|
||||
|
||||
# Check for unknown fields
|
||||
unknown_fields = set(json_data.keys()) - valid_fields
|
||||
if unknown_fields:
|
||||
return f"Unknown field(s): {', '.join(sorted(unknown_fields))}", 400
|
||||
|
||||
# Update watch with regular (non-processor-config) fields
|
||||
watch.update(json_data)
|
||||
watch.commit()
|
||||
@@ -393,7 +419,6 @@ class CreateWatch(Resource):
|
||||
|
||||
@auth.check_token
|
||||
@validate_openapi_request('createWatch')
|
||||
@expects_json(schema_create_watch)
|
||||
def post(self):
|
||||
"""Create a single watch."""
|
||||
|
||||
|
||||
@@ -1,41 +1,6 @@
|
||||
import copy
|
||||
import functools
|
||||
from flask import request, abort
|
||||
from loguru import logger
|
||||
from . import api_schema
|
||||
from ..model import watch_base
|
||||
|
||||
# Build a JSON Schema atleast partially based on our Watch model
|
||||
watch_base_config = watch_base()
|
||||
schema = api_schema.build_watch_json_schema(watch_base_config)
|
||||
|
||||
schema_create_watch = copy.deepcopy(schema)
|
||||
schema_create_watch['required'] = ['url']
|
||||
del schema_create_watch['properties']['last_viewed']
|
||||
# Allow processor_config_* fields (handled separately in endpoint)
|
||||
schema_create_watch['patternProperties'] = {
|
||||
'^processor_config_': {'type': ['string', 'number', 'boolean', 'object', 'array', 'null']}
|
||||
}
|
||||
|
||||
schema_update_watch = copy.deepcopy(schema)
|
||||
schema_update_watch['additionalProperties'] = False
|
||||
# Allow processor_config_* fields (handled separately in endpoint)
|
||||
schema_update_watch['patternProperties'] = {
|
||||
'^processor_config_': {'type': ['string', 'number', 'boolean', 'object', 'array', 'null']}
|
||||
}
|
||||
|
||||
# Tag schema is also based on watch_base since Tag inherits from it
|
||||
schema_tag = copy.deepcopy(schema)
|
||||
schema_create_tag = copy.deepcopy(schema_tag)
|
||||
schema_create_tag['required'] = ['title']
|
||||
schema_update_tag = copy.deepcopy(schema_tag)
|
||||
schema_update_tag['additionalProperties'] = False
|
||||
|
||||
schema_notification_urls = copy.deepcopy(schema)
|
||||
schema_create_notification_urls = copy.deepcopy(schema_notification_urls)
|
||||
schema_create_notification_urls['required'] = ['notification_urls']
|
||||
schema_delete_notification_urls = copy.deepcopy(schema_notification_urls)
|
||||
schema_delete_notification_urls['required'] = ['notification_urls']
|
||||
|
||||
@functools.cache
|
||||
def get_openapi_spec():
|
||||
@@ -54,6 +19,134 @@ def get_openapi_spec():
|
||||
_openapi_spec = OpenAPI.from_dict(spec_dict)
|
||||
return _openapi_spec
|
||||
|
||||
@functools.cache
|
||||
def get_openapi_schema_dict():
|
||||
"""
|
||||
Get the raw OpenAPI spec dictionary for schema access.
|
||||
|
||||
Used by Import endpoint to validate and convert query parameters.
|
||||
Returns the YAML dict directly (not the OpenAPI object).
|
||||
"""
|
||||
import os
|
||||
import yaml
|
||||
|
||||
spec_path = os.path.join(os.path.dirname(__file__), '../../docs/api-spec.yaml')
|
||||
if not os.path.exists(spec_path):
|
||||
spec_path = os.path.join(os.path.dirname(__file__), '../docs/api-spec.yaml')
|
||||
|
||||
with open(spec_path, 'r', encoding='utf-8') as f:
|
||||
return yaml.safe_load(f)
|
||||
|
||||
@functools.cache
|
||||
def _resolve_schema_properties(schema_name):
|
||||
"""
|
||||
Generic helper to resolve schema properties, including allOf inheritance.
|
||||
|
||||
Args:
|
||||
schema_name: Name of the schema (e.g., 'WatchBase', 'Watch', 'Tag')
|
||||
|
||||
Returns:
|
||||
dict: All properties including inherited ones from $ref schemas
|
||||
"""
|
||||
spec_dict = get_openapi_schema_dict()
|
||||
schema = spec_dict['components']['schemas'].get(schema_name, {})
|
||||
|
||||
properties = {}
|
||||
|
||||
# Handle allOf (schema inheritance)
|
||||
if 'allOf' in schema:
|
||||
for item in schema['allOf']:
|
||||
# Resolve $ref to parent schema
|
||||
if '$ref' in item:
|
||||
ref_path = item['$ref'].split('/')[-1]
|
||||
ref_schema = spec_dict['components']['schemas'].get(ref_path, {})
|
||||
properties.update(ref_schema.get('properties', {}))
|
||||
# Add schema-specific properties
|
||||
if 'properties' in item:
|
||||
properties.update(item['properties'])
|
||||
else:
|
||||
# Direct properties (no inheritance)
|
||||
properties = schema.get('properties', {})
|
||||
|
||||
return properties
|
||||
|
||||
@functools.cache
|
||||
def _resolve_readonly_fields(schema_name):
|
||||
"""
|
||||
Generic helper to resolve readOnly fields, including allOf inheritance.
|
||||
|
||||
Args:
|
||||
schema_name: Name of the schema (e.g., 'Watch', 'Tag')
|
||||
|
||||
Returns:
|
||||
frozenset: All readOnly field names including inherited ones
|
||||
"""
|
||||
spec_dict = get_openapi_schema_dict()
|
||||
schema = spec_dict['components']['schemas'].get(schema_name, {})
|
||||
|
||||
readonly_fields = set()
|
||||
|
||||
# Handle allOf (schema inheritance)
|
||||
if 'allOf' in schema:
|
||||
for item in schema['allOf']:
|
||||
# Resolve $ref to parent schema
|
||||
if '$ref' in item:
|
||||
ref_path = item['$ref'].split('/')[-1]
|
||||
ref_schema = spec_dict['components']['schemas'].get(ref_path, {})
|
||||
if 'properties' in ref_schema:
|
||||
for field_name, field_def in ref_schema['properties'].items():
|
||||
if field_def.get('readOnly') is True:
|
||||
readonly_fields.add(field_name)
|
||||
# Check schema-specific properties
|
||||
if 'properties' in item:
|
||||
for field_name, field_def in item['properties'].items():
|
||||
if field_def.get('readOnly') is True:
|
||||
readonly_fields.add(field_name)
|
||||
else:
|
||||
# Direct properties (no inheritance)
|
||||
if 'properties' in schema:
|
||||
for field_name, field_def in schema['properties'].items():
|
||||
if field_def.get('readOnly') is True:
|
||||
readonly_fields.add(field_name)
|
||||
|
||||
return frozenset(readonly_fields)
|
||||
|
||||
@functools.cache
|
||||
def get_watch_schema_properties():
|
||||
"""
|
||||
Extract watch schema properties from OpenAPI spec for Import endpoint.
|
||||
|
||||
Returns WatchBase properties (all writable Watch fields).
|
||||
"""
|
||||
return _resolve_schema_properties('WatchBase')
|
||||
|
||||
@functools.cache
|
||||
def get_readonly_watch_fields():
|
||||
"""
|
||||
Extract readOnly field names from Watch schema in OpenAPI spec.
|
||||
|
||||
Returns readOnly fields from WatchBase (uuid, date_created) + Watch-specific readOnly fields.
|
||||
"""
|
||||
return _resolve_readonly_fields('Watch')
|
||||
|
||||
@functools.cache
|
||||
def get_tag_schema_properties():
|
||||
"""
|
||||
Extract Tag schema properties from OpenAPI spec.
|
||||
|
||||
Returns WatchBase properties + Tag-specific properties (overrides_watch).
|
||||
"""
|
||||
return _resolve_schema_properties('Tag')
|
||||
|
||||
@functools.cache
|
||||
def get_readonly_tag_fields():
|
||||
"""
|
||||
Extract readOnly field names from Tag schema in OpenAPI spec.
|
||||
|
||||
Returns readOnly fields from WatchBase (uuid, date_created) + Tag-specific readOnly fields.
|
||||
"""
|
||||
return _resolve_readonly_fields('Tag')
|
||||
|
||||
def validate_openapi_request(operation_id):
|
||||
"""Decorator to validate incoming requests against OpenAPI spec."""
|
||||
def decorator(f):
|
||||
@@ -72,8 +165,16 @@ def validate_openapi_request(operation_id):
|
||||
if result.errors:
|
||||
error_details = []
|
||||
for error in result.errors:
|
||||
error_details.append(str(error))
|
||||
raise BadRequest(f"OpenAPI validation failed: {error_details}")
|
||||
# Extract detailed schema errors from __cause__
|
||||
if hasattr(error, '__cause__') and hasattr(error.__cause__, 'schema_errors'):
|
||||
for schema_error in error.__cause__.schema_errors:
|
||||
field = '.'.join(str(p) for p in schema_error.path) if schema_error.path else 'body'
|
||||
msg = schema_error.message if hasattr(schema_error, 'message') else str(schema_error)
|
||||
error_details.append(f"{field}: {msg}")
|
||||
else:
|
||||
error_details.append(str(error))
|
||||
logger.error(f"API Call - Validation failed: {'; '.join(error_details)}")
|
||||
raise BadRequest(f"Validation failed: {'; '.join(error_details)}")
|
||||
except BadRequest:
|
||||
# Re-raise BadRequest exceptions (validation failures)
|
||||
raise
|
||||
|
||||
@@ -1,162 +0,0 @@
|
||||
# Responsible for building the storage dict into a set of rules ("JSON Schema") acceptable via the API
|
||||
# Probably other ways to solve this when the backend switches to some ORM
|
||||
from changedetectionio.notification import valid_notification_formats
|
||||
|
||||
|
||||
def build_time_between_check_json_schema():
|
||||
# Setup time between check schema
|
||||
schema_properties_time_between_check = {
|
||||
"type": "object",
|
||||
"additionalProperties": False,
|
||||
"properties": {}
|
||||
}
|
||||
for p in ['weeks', 'days', 'hours', 'minutes', 'seconds']:
|
||||
schema_properties_time_between_check['properties'][p] = {
|
||||
"anyOf": [
|
||||
{
|
||||
"type": "integer"
|
||||
},
|
||||
{
|
||||
"type": "null"
|
||||
}
|
||||
]
|
||||
}
|
||||
|
||||
return schema_properties_time_between_check
|
||||
|
||||
def build_watch_json_schema(d):
|
||||
# Base JSON schema
|
||||
schema = {
|
||||
'type': 'object',
|
||||
'properties': {},
|
||||
}
|
||||
|
||||
for k, v in d.items():
|
||||
# @todo 'integer' is not covered here because its almost always for internal usage
|
||||
|
||||
if isinstance(v, type(None)):
|
||||
schema['properties'][k] = {
|
||||
"anyOf": [
|
||||
{"type": "null"},
|
||||
]
|
||||
}
|
||||
elif isinstance(v, list):
|
||||
schema['properties'][k] = {
|
||||
"anyOf": [
|
||||
{"type": "array",
|
||||
# Always is an array of strings, like text or regex or something
|
||||
"items": {
|
||||
"type": "string",
|
||||
"maxLength": 5000
|
||||
}
|
||||
},
|
||||
]
|
||||
}
|
||||
elif isinstance(v, bool):
|
||||
schema['properties'][k] = {
|
||||
"anyOf": [
|
||||
{"type": "boolean"},
|
||||
]
|
||||
}
|
||||
elif isinstance(v, str):
|
||||
schema['properties'][k] = {
|
||||
"anyOf": [
|
||||
{"type": "string",
|
||||
"maxLength": 5000},
|
||||
]
|
||||
}
|
||||
|
||||
# Can also be a string (or None by default above)
|
||||
for v in ['body',
|
||||
'notification_body',
|
||||
'notification_format',
|
||||
'notification_title',
|
||||
'proxy',
|
||||
'tag',
|
||||
'title',
|
||||
'webdriver_js_execute_code'
|
||||
]:
|
||||
schema['properties'][v]['anyOf'].append({'type': 'string', "maxLength": 5000})
|
||||
|
||||
for v in ['last_viewed']:
|
||||
schema['properties'][v] = {
|
||||
"type": "integer",
|
||||
"description": "Unix timestamp in seconds of the last time the watch was viewed.",
|
||||
"minimum": 0
|
||||
}
|
||||
|
||||
# None or Boolean
|
||||
schema['properties']['track_ldjson_price_data']['anyOf'].append({'type': 'boolean'})
|
||||
|
||||
schema['properties']['method'] = {"type": "string",
|
||||
"enum": ["GET", "POST", "DELETE", "PUT"]
|
||||
}
|
||||
|
||||
schema['properties']['fetch_backend']['anyOf'].append({"type": "string",
|
||||
"enum": ["html_requests", "html_webdriver"]
|
||||
})
|
||||
|
||||
schema['properties']['processor'] = {"anyOf": [
|
||||
{"type": "string", "enum": ["restock_diff", "text_json_diff"]},
|
||||
{"type": "null"}
|
||||
]}
|
||||
|
||||
# All headers must be key/value type dict
|
||||
schema['properties']['headers'] = {
|
||||
"type": "object",
|
||||
"patternProperties": {
|
||||
# Should always be a string:string type value
|
||||
".*": {"type": "string"},
|
||||
}
|
||||
}
|
||||
|
||||
schema['properties']['notification_format'] = {'type': 'string',
|
||||
'enum': list(valid_notification_formats.keys())
|
||||
}
|
||||
|
||||
# Stuff that shouldn't be available but is just state-storage
|
||||
for v in ['previous_md5', 'last_error', 'has_ldjson_price_data', 'previous_md5_before_filters', 'uuid']:
|
||||
del schema['properties'][v]
|
||||
|
||||
schema['properties']['webdriver_delay']['anyOf'].append({'type': 'integer'})
|
||||
|
||||
schema['properties']['time_between_check'] = build_time_between_check_json_schema()
|
||||
|
||||
schema['properties']['time_between_check_use_default'] = {
|
||||
"type": "boolean",
|
||||
"default": True,
|
||||
"description": "Whether to use global settings for time between checks - defaults to true if not set"
|
||||
}
|
||||
|
||||
schema['properties']['browser_steps'] = {
|
||||
"anyOf": [
|
||||
{
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"operation": {
|
||||
"type": ["string", "null"],
|
||||
"maxLength": 5000 # Allows null and any string up to 5000 chars (including "")
|
||||
},
|
||||
"selector": {
|
||||
"type": ["string", "null"],
|
||||
"maxLength": 5000
|
||||
},
|
||||
"optional_value": {
|
||||
"type": ["string", "null"],
|
||||
"maxLength": 5000
|
||||
}
|
||||
},
|
||||
"required": ["operation", "selector", "optional_value"],
|
||||
"additionalProperties": False # No extra keys allowed
|
||||
}
|
||||
},
|
||||
{"type": "null"}, # Allows null for `browser_steps`
|
||||
{"type": "array", "maxItems": 0} # Allows empty array []
|
||||
]
|
||||
}
|
||||
|
||||
# headers ?
|
||||
return schema
|
||||
|
||||
@@ -26,6 +26,7 @@ class watch_base(dict):
|
||||
- Configuration override chain resolution (Watch → Tag → Global)
|
||||
- Immutability options
|
||||
- Better testing
|
||||
- USE https://docs.pydantic.dev/latest/integrations/datamodel_code_generator TO BUILD THE MODEL FROM THE API-SPEC!!!
|
||||
|
||||
CHAIN RESOLUTION ARCHITECTURE:
|
||||
The dream is a 3-level override hierarchy:
|
||||
@@ -173,7 +174,7 @@ class watch_base(dict):
|
||||
'body': None,
|
||||
'browser_steps': [],
|
||||
'browser_steps_last_error_step': None,
|
||||
'conditions' : {},
|
||||
'conditions' : [],
|
||||
'conditions_match_logic': CONDITIONS_MATCH_LOGIC_DEFAULT,
|
||||
'check_count': 0,
|
||||
'check_unique_lines': False, # On change-detected, compare against all history if its something new
|
||||
@@ -299,6 +300,42 @@ class watch_base(dict):
|
||||
if self.get('default'):
|
||||
del self['default']
|
||||
|
||||
@classmethod
|
||||
def get_property_names(cls):
|
||||
"""
|
||||
Get all @property attribute names from this model class using introspection.
|
||||
|
||||
This discovers computed/derived properties that are not stored in the datastore.
|
||||
These properties should be filtered out during PUT/POST requests.
|
||||
|
||||
Returns:
|
||||
frozenset: Immutable set of @property attribute names from the model class
|
||||
"""
|
||||
import functools
|
||||
|
||||
# Create a cached version if it doesn't exist
|
||||
if not hasattr(cls, '_cached_get_property_names'):
|
||||
@functools.cache
|
||||
def _get_props():
|
||||
properties = set()
|
||||
# Use introspection to find all @property attributes
|
||||
for name in dir(cls):
|
||||
# Skip private/magic attributes
|
||||
if name.startswith('_'):
|
||||
continue
|
||||
try:
|
||||
attr = getattr(cls, name)
|
||||
# Check if it's a property descriptor
|
||||
if isinstance(attr, property):
|
||||
properties.add(name)
|
||||
except (AttributeError, TypeError):
|
||||
continue
|
||||
return frozenset(properties)
|
||||
|
||||
cls._cached_get_property_names = _get_props
|
||||
|
||||
return cls._cached_get_property_names()
|
||||
|
||||
def __deepcopy__(self, memo):
|
||||
"""
|
||||
Custom deepcopy for all watch_base subclasses (Watch, Tag, etc.).
|
||||
|
||||
@@ -56,6 +56,259 @@ def _deduplicate_prices(data):
|
||||
return list(unique_data)
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# MEMORY MANAGEMENT: Why We Use Multiprocessing (Linux Only)
|
||||
# =============================================================================
|
||||
#
|
||||
# The get_itemprop_availability() function uses 'extruct' to parse HTML metadata
|
||||
# (JSON-LD, microdata, OpenGraph, etc). Extruct internally uses lxml, which wraps
|
||||
# libxml2 - a C library that allocates memory at the C level.
|
||||
#
|
||||
# Memory Leak Problem:
|
||||
# --------------------
|
||||
# 1. lxml's document_fromstring() creates thousands of Python objects backed by
|
||||
# C-level allocations (nodes, attributes, text content)
|
||||
# 2. Python's garbage collector can mark these objects as collectible, but
|
||||
# cannot force the OS to reclaim the actual C-level memory
|
||||
# 3. malloc/free typically doesn't return memory to OS - it just marks it as
|
||||
# "free in the process address space"
|
||||
# 4. With repeated parsing of large HTML (5MB+ pages), memory accumulates even
|
||||
# after Python GC runs
|
||||
#
|
||||
# Why Multiprocessing Fixes This:
|
||||
# --------------------------------
|
||||
# When a subprocess exits, the OS forcibly reclaims ALL memory including C-level
|
||||
# allocations that Python GC couldn't release. This ensures clean memory state
|
||||
# after each extraction.
|
||||
#
|
||||
# Performance Impact:
|
||||
# -------------------
|
||||
# - Memray analysis showed 1.2M document_fromstring allocations per page
|
||||
# - Without subprocess: memory grows by ~50-500MB per parse and lingers
|
||||
# - With subprocess: ~35MB overhead but forces full cleanup after each run
|
||||
# - Trade-off: 35MB resource_tracker vs 500MB+ accumulated leak = much better at scale
|
||||
#
|
||||
# References:
|
||||
# -----------
|
||||
# - lxml memory issues: https://medium.com/devopss-hole/python-lxml-memory-leak-b8d0b1000dc7
|
||||
# - libxml2 caching behavior: https://www.mail-archive.com/lxml@python.org/msg00026.html
|
||||
# - GC limitations with C extensions: https://benbernardblog.com/tracking-down-a-freaky-python-memory-leak-part-2/
|
||||
#
|
||||
# Additional Context:
|
||||
# -------------------
|
||||
# - jsonpath_ng (used to query the parsed data) is pure Python and doesn't leak
|
||||
# - The leak is specifically from lxml's document parsing, not the JSONPath queries
|
||||
# - Linux-only because multiprocessing spawn is well-tested there; other platforms
|
||||
# use direct call as fallback
|
||||
#
|
||||
# Alternative Solution (Future Optimization):
|
||||
# -------------------------------------------
|
||||
# This entire problem could be avoided by using regex to extract just the machine
|
||||
# data blocks (JSON-LD, microdata, OpenGraph tags) BEFORE parsing with lxml:
|
||||
#
|
||||
# 1. Use regex to extract <script type="application/ld+json">...</script> blocks
|
||||
# 2. Use regex to extract <meta property="og:*"> tags
|
||||
# 3. Use regex to find itemprop/itemtype attributes and their containing elements
|
||||
# 4. Parse ONLY those extracted snippets instead of the entire HTML document
|
||||
#
|
||||
# Benefits:
|
||||
# - Avoids parsing 5MB of HTML when we only need a few KB of metadata
|
||||
# - Eliminates the lxml memory leak entirely
|
||||
# - Faster extraction (regex is much faster than DOM parsing)
|
||||
# - No subprocess overhead needed
|
||||
#
|
||||
# Trade-offs:
|
||||
# - Regex for HTML is brittle (comments, CDATA, edge cases)
|
||||
# - Microdata extraction would be complex (need to track element boundaries)
|
||||
# - Would need extensive testing to ensure we don't miss valid data
|
||||
# - extruct is battle-tested; regex solution would need similar maturity
|
||||
#
|
||||
# For now, the subprocess approach is safer and leverages existing extruct code.
|
||||
# =============================================================================
|
||||
|
||||
|
||||
def _extract_itemprop_availability_worker(pipe_conn):
|
||||
"""
|
||||
Subprocess worker for itemprop extraction (Linux memory management).
|
||||
|
||||
Uses spawn multiprocessing to isolate extruct/lxml memory allocations.
|
||||
When the subprocess exits, the OS reclaims ALL memory including lxml's
|
||||
C-level allocations that Python's GC cannot release.
|
||||
|
||||
Args:
|
||||
pipe_conn: Pipe connection to receive HTML and send result
|
||||
"""
|
||||
import json
|
||||
import gc
|
||||
|
||||
html_content = None
|
||||
result_data = None
|
||||
|
||||
try:
|
||||
# Receive HTML as raw bytes (no pickle)
|
||||
html_bytes = pipe_conn.recv_bytes()
|
||||
html_content = html_bytes.decode('utf-8')
|
||||
|
||||
# Explicitly delete html_bytes to free memory
|
||||
del html_bytes
|
||||
gc.collect()
|
||||
|
||||
# Perform extraction in subprocess (uses extruct/lxml)
|
||||
result_data = get_itemprop_availability(html_content)
|
||||
|
||||
# Convert Restock object to dict for JSON serialization
|
||||
result = {
|
||||
'success': True,
|
||||
'data': dict(result_data) if result_data else {}
|
||||
}
|
||||
pipe_conn.send_bytes(json.dumps(result).encode('utf-8'))
|
||||
|
||||
# Clean up before exit
|
||||
del result_data, html_content, result
|
||||
gc.collect()
|
||||
|
||||
except MoreThanOnePriceFound:
|
||||
# Serialize the specific exception type
|
||||
result = {
|
||||
'success': False,
|
||||
'exception_type': 'MoreThanOnePriceFound'
|
||||
}
|
||||
pipe_conn.send_bytes(json.dumps(result).encode('utf-8'))
|
||||
|
||||
except Exception as e:
|
||||
# Serialize other exceptions
|
||||
result = {
|
||||
'success': False,
|
||||
'exception_type': type(e).__name__,
|
||||
'exception_message': str(e)
|
||||
}
|
||||
pipe_conn.send_bytes(json.dumps(result).encode('utf-8'))
|
||||
|
||||
finally:
|
||||
# Final cleanup before subprocess exits
|
||||
# Variables may already be deleted in try block, so use try/except
|
||||
try:
|
||||
del html_content
|
||||
except (NameError, UnboundLocalError):
|
||||
pass
|
||||
try:
|
||||
del result_data
|
||||
except (NameError, UnboundLocalError):
|
||||
pass
|
||||
gc.collect()
|
||||
pipe_conn.close()
|
||||
|
||||
|
||||
def extract_itemprop_availability_safe(html_content) -> Restock:
|
||||
"""
|
||||
Extract itemprop availability with hybrid approach for memory efficiency.
|
||||
|
||||
Strategy (fastest to slowest, least to most memory):
|
||||
1. Try pure Python extraction (JSON-LD, OpenGraph, microdata) - covers 80%+ of cases
|
||||
2. Fall back to extruct with subprocess isolation on Linux for complex cases
|
||||
|
||||
Args:
|
||||
html_content: HTML string to parse
|
||||
|
||||
Returns:
|
||||
Restock: Extracted availability data
|
||||
|
||||
Raises:
|
||||
MoreThanOnePriceFound: When multiple prices detected
|
||||
Other exceptions: From extruct/parsing
|
||||
"""
|
||||
import platform
|
||||
|
||||
# Step 1: Try pure Python extraction first (fast, no lxml, no memory leak)
|
||||
try:
|
||||
from .pure_python_extractor import extract_metadata_pure_python, query_price_availability
|
||||
|
||||
logger.trace("Attempting pure Python metadata extraction (no lxml)")
|
||||
extracted_data = extract_metadata_pure_python(html_content)
|
||||
price_data = query_price_availability(extracted_data)
|
||||
|
||||
# If we got price AND availability, we're done!
|
||||
if price_data.get('price') and price_data.get('availability'):
|
||||
result = Restock(price_data)
|
||||
logger.debug(f"Pure Python extraction successful: {dict(result)}")
|
||||
return result
|
||||
|
||||
# If we got some data but not everything, still try extruct for completeness
|
||||
if price_data.get('price') or price_data.get('availability'):
|
||||
logger.debug(f"Pure Python extraction partial: {price_data}, will try extruct for completeness")
|
||||
|
||||
except Exception as e:
|
||||
logger.debug(f"Pure Python extraction failed: {e}, falling back to extruct")
|
||||
|
||||
# Step 2: Fall back to extruct (uses lxml, needs subprocess on Linux)
|
||||
logger.trace("Falling back to extruct (lxml-based) with subprocess isolation")
|
||||
|
||||
# Only use subprocess isolation on Linux
|
||||
# Other platforms may have issues with spawn or don't need the aggressive memory management
|
||||
if platform.system() == 'Linux':
|
||||
import multiprocessing
|
||||
import json
|
||||
import gc
|
||||
|
||||
try:
|
||||
ctx = multiprocessing.get_context('spawn')
|
||||
parent_conn, child_conn = ctx.Pipe()
|
||||
p = ctx.Process(target=_extract_itemprop_availability_worker, args=(child_conn,))
|
||||
p.start()
|
||||
|
||||
# Send HTML as raw bytes (no pickle)
|
||||
html_bytes = html_content.encode('utf-8')
|
||||
parent_conn.send_bytes(html_bytes)
|
||||
|
||||
# Explicitly delete html_bytes copy immediately after sending
|
||||
del html_bytes
|
||||
gc.collect()
|
||||
|
||||
# Receive result as JSON
|
||||
result_bytes = parent_conn.recv_bytes()
|
||||
result = json.loads(result_bytes.decode('utf-8'))
|
||||
|
||||
# Wait for subprocess to complete
|
||||
p.join()
|
||||
|
||||
# Close pipes
|
||||
parent_conn.close()
|
||||
child_conn.close()
|
||||
|
||||
# Clean up all subprocess-related objects
|
||||
del p, parent_conn, child_conn, result_bytes
|
||||
gc.collect()
|
||||
|
||||
# Handle result or re-raise exception
|
||||
if result['success']:
|
||||
# Reconstruct Restock object from dict
|
||||
restock_obj = Restock(result['data'])
|
||||
# Clean up result dict
|
||||
del result
|
||||
gc.collect()
|
||||
return restock_obj
|
||||
else:
|
||||
# Re-raise the exception that occurred in subprocess
|
||||
exception_type = result['exception_type']
|
||||
exception_msg = result.get('exception_message', '')
|
||||
del result
|
||||
gc.collect()
|
||||
|
||||
if exception_type == 'MoreThanOnePriceFound':
|
||||
raise MoreThanOnePriceFound()
|
||||
else:
|
||||
raise Exception(f"{exception_type}: {exception_msg}")
|
||||
|
||||
except Exception as e:
|
||||
# If multiprocessing itself fails, log and fall back to direct call
|
||||
logger.warning(f"Subprocess extraction failed: {e}, falling back to direct call")
|
||||
gc.collect()
|
||||
return get_itemprop_availability(html_content)
|
||||
else:
|
||||
# Non-Linux: direct call (no subprocess overhead needed)
|
||||
return get_itemprop_availability(html_content)
|
||||
|
||||
|
||||
# should return Restock()
|
||||
# add casting?
|
||||
def get_itemprop_availability(html_content) -> Restock:
|
||||
@@ -196,8 +449,9 @@ class perform_site_check(difference_detection_processor):
|
||||
multiple_prices_found = False
|
||||
|
||||
# Try built-in extraction first, this will scan metadata in the HTML
|
||||
# On Linux, this runs in a subprocess to prevent lxml/extruct memory leaks
|
||||
try:
|
||||
itemprop_availability = get_itemprop_availability(self.fetcher.content)
|
||||
itemprop_availability = extract_itemprop_availability_safe(self.fetcher.content)
|
||||
except MoreThanOnePriceFound as e:
|
||||
# Don't raise immediately - let plugins try to handle this case
|
||||
# Plugins might be able to determine which price is correct
|
||||
|
||||
@@ -0,0 +1,286 @@
|
||||
"""
|
||||
Pure Python metadata extractor - no lxml, no memory leaks.
|
||||
|
||||
This module provides a fast, memory-efficient alternative to extruct for common
|
||||
e-commerce metadata extraction. It handles:
|
||||
- JSON-LD (covers 80%+ of modern sites)
|
||||
- OpenGraph meta tags
|
||||
- Basic microdata attributes
|
||||
|
||||
Uses Python's built-in html.parser instead of lxml/libxml2, avoiding C-level
|
||||
memory allocation issues. For edge cases, the main processor can fall back to
|
||||
extruct (with subprocess isolation on Linux).
|
||||
"""
|
||||
|
||||
from html.parser import HTMLParser
|
||||
import json
|
||||
import re
|
||||
from loguru import logger
|
||||
|
||||
|
||||
class JSONLDExtractor(HTMLParser):
|
||||
"""
|
||||
Extract JSON-LD structured data from HTML.
|
||||
|
||||
Finds all <script type="application/ld+json"> tags and parses their content.
|
||||
Handles multiple JSON-LD blocks on the same page.
|
||||
"""
|
||||
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
self.in_jsonld = False
|
||||
self.data = [] # List of all parsed JSON-LD objects
|
||||
self.current_script = []
|
||||
|
||||
def handle_starttag(self, tag, attrs):
|
||||
if tag == 'script':
|
||||
# Check if this is a JSON-LD script tag
|
||||
for attr, value in attrs:
|
||||
if attr == 'type' and value == 'application/ld+json':
|
||||
self.in_jsonld = True
|
||||
self.current_script = []
|
||||
break
|
||||
|
||||
def handle_data(self, data):
|
||||
if self.in_jsonld:
|
||||
self.current_script.append(data)
|
||||
|
||||
def handle_endtag(self, tag):
|
||||
if tag == 'script' and self.in_jsonld:
|
||||
# Parse the accumulated script content
|
||||
script_content = ''.join(self.current_script)
|
||||
if script_content.strip():
|
||||
try:
|
||||
# Parse JSON (handles both objects and arrays)
|
||||
parsed = json.loads(script_content)
|
||||
if isinstance(parsed, list):
|
||||
self.data.extend(parsed)
|
||||
else:
|
||||
self.data.append(parsed)
|
||||
except json.JSONDecodeError as e:
|
||||
logger.debug(f"Failed to parse JSON-LD: {e}")
|
||||
pass
|
||||
|
||||
self.in_jsonld = False
|
||||
self.current_script = []
|
||||
|
||||
|
||||
class OpenGraphExtractor(HTMLParser):
|
||||
"""
|
||||
Extract OpenGraph meta tags from HTML.
|
||||
|
||||
Finds <meta property="og:*"> tags commonly used for social media sharing.
|
||||
"""
|
||||
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
self.og_data = {}
|
||||
|
||||
def handle_starttag(self, tag, attrs):
|
||||
if tag == 'meta':
|
||||
attrs_dict = dict(attrs)
|
||||
prop = attrs_dict.get('property', '')
|
||||
|
||||
# Extract OpenGraph properties
|
||||
if prop.startswith('og:'):
|
||||
content = attrs_dict.get('content', '')
|
||||
if content:
|
||||
self.og_data[prop] = content
|
||||
|
||||
|
||||
class MicrodataExtractor(HTMLParser):
|
||||
"""
|
||||
Extract basic microdata attributes from HTML.
|
||||
|
||||
Finds elements with itemprop attributes. This is a simplified extractor
|
||||
that doesn't handle nested itemscope/itemtype hierarchies - for complex
|
||||
cases, use extruct as fallback.
|
||||
"""
|
||||
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
self.microdata = {}
|
||||
self.current_itemprop = None
|
||||
|
||||
def handle_starttag(self, tag, attrs):
|
||||
attrs_dict = dict(attrs)
|
||||
|
||||
if 'itemprop' in attrs_dict:
|
||||
itemprop = attrs_dict['itemprop']
|
||||
|
||||
# Price/currency/availability can be in content/href attributes
|
||||
if itemprop == 'price':
|
||||
if 'content' in attrs_dict:
|
||||
self.microdata['price'] = attrs_dict['content']
|
||||
else:
|
||||
self.current_itemprop = 'price'
|
||||
|
||||
elif itemprop == 'priceCurrency':
|
||||
if 'content' in attrs_dict:
|
||||
self.microdata['currency'] = attrs_dict['content']
|
||||
else:
|
||||
self.current_itemprop = 'priceCurrency'
|
||||
|
||||
elif itemprop == 'availability':
|
||||
# Can be in href (link) or content (meta)
|
||||
if 'href' in attrs_dict:
|
||||
self.microdata['availability'] = attrs_dict['href']
|
||||
elif 'content' in attrs_dict:
|
||||
self.microdata['availability'] = attrs_dict['content']
|
||||
else:
|
||||
self.current_itemprop = 'availability'
|
||||
|
||||
def handle_data(self, data):
|
||||
# Capture text content for itemprop elements
|
||||
if self.current_itemprop == 'price':
|
||||
# Try to extract numeric price from text
|
||||
try:
|
||||
price_text = re.sub(r'[^\d.]', '', data.strip())
|
||||
if price_text:
|
||||
self.microdata['price'] = float(price_text)
|
||||
except ValueError:
|
||||
pass
|
||||
elif self.current_itemprop == 'priceCurrency':
|
||||
currency = data.strip()
|
||||
if currency:
|
||||
self.microdata['currency'] = currency
|
||||
elif self.current_itemprop == 'availability':
|
||||
availability = data.strip()
|
||||
if availability:
|
||||
self.microdata['availability'] = availability
|
||||
|
||||
def handle_endtag(self, tag):
|
||||
# Reset current itemprop after closing tag
|
||||
self.current_itemprop = None
|
||||
|
||||
|
||||
def extract_metadata_pure_python(html_content):
|
||||
"""
|
||||
Extract structured metadata from HTML using pure Python parsers.
|
||||
|
||||
Returns a dict with three keys:
|
||||
- 'json-ld': List of parsed JSON-LD objects
|
||||
- 'opengraph': Dict of OpenGraph properties
|
||||
- 'microdata': Dict of microdata properties
|
||||
|
||||
Args:
|
||||
html_content: HTML string to parse
|
||||
|
||||
Returns:
|
||||
dict: Extracted metadata in three formats
|
||||
"""
|
||||
result = {
|
||||
'json-ld': [],
|
||||
'opengraph': {},
|
||||
'microdata': {}
|
||||
}
|
||||
|
||||
# Extract JSON-LD
|
||||
try:
|
||||
jsonld_extractor = JSONLDExtractor()
|
||||
jsonld_extractor.feed(html_content)
|
||||
result['json-ld'] = jsonld_extractor.data
|
||||
logger.trace(f"Pure Python: Found {len(jsonld_extractor.data)} JSON-LD blocks")
|
||||
except Exception as e:
|
||||
logger.debug(f"JSON-LD extraction failed: {e}")
|
||||
|
||||
# Extract OpenGraph
|
||||
try:
|
||||
og_extractor = OpenGraphExtractor()
|
||||
og_extractor.feed(html_content)
|
||||
result['opengraph'] = og_extractor.og_data
|
||||
if result['opengraph']:
|
||||
logger.trace(f"Pure Python: Found {len(og_extractor.og_data)} OpenGraph tags")
|
||||
except Exception as e:
|
||||
logger.debug(f"OpenGraph extraction failed: {e}")
|
||||
|
||||
# Extract Microdata
|
||||
try:
|
||||
microdata_extractor = MicrodataExtractor()
|
||||
microdata_extractor.feed(html_content)
|
||||
result['microdata'] = microdata_extractor.microdata
|
||||
if result['microdata']:
|
||||
logger.trace(f"Pure Python: Found microdata: {result['microdata']}")
|
||||
except Exception as e:
|
||||
logger.debug(f"Microdata extraction failed: {e}")
|
||||
|
||||
return result
|
||||
|
||||
|
||||
def query_price_availability(extracted_data):
|
||||
"""
|
||||
Query extracted metadata for price and availability information.
|
||||
|
||||
Uses jsonpath_ng to query JSON-LD data (same approach as extruct).
|
||||
Falls back to OpenGraph and microdata if JSON-LD doesn't have the data.
|
||||
|
||||
Args:
|
||||
extracted_data: Dict from extract_metadata_pure_python()
|
||||
|
||||
Returns:
|
||||
dict: {'price': float, 'currency': str, 'availability': str}
|
||||
"""
|
||||
from jsonpath_ng import parse
|
||||
|
||||
result = {}
|
||||
|
||||
# 1. Try JSON-LD first (most reliable and common)
|
||||
for data in extracted_data.get('json-ld', []):
|
||||
try:
|
||||
# Use jsonpath to find price/availability anywhere in the structure
|
||||
price_parse = parse('$..(price|Price)')
|
||||
availability_parse = parse('$..(availability|Availability)')
|
||||
currency_parse = parse('$..(priceCurrency|currency|priceCurrency)')
|
||||
|
||||
price_results = [m.value for m in price_parse.find(data)]
|
||||
if price_results and not result.get('price'):
|
||||
# Handle various price formats
|
||||
price_val = price_results[0]
|
||||
if isinstance(price_val, (int, float)):
|
||||
result['price'] = float(price_val)
|
||||
elif isinstance(price_val, str):
|
||||
# Extract numeric value from string
|
||||
try:
|
||||
result['price'] = float(re.sub(r'[^\d.]', '', price_val))
|
||||
except ValueError:
|
||||
pass
|
||||
|
||||
avail_results = [m.value for m in availability_parse.find(data)]
|
||||
if avail_results and not result.get('availability'):
|
||||
result['availability'] = str(avail_results[0])
|
||||
|
||||
curr_results = [m.value for m in currency_parse.find(data)]
|
||||
if curr_results and not result.get('currency'):
|
||||
result['currency'] = str(curr_results[0])
|
||||
|
||||
# If we found price, this JSON-LD block is good
|
||||
if result.get('price'):
|
||||
logger.debug(f"Pure Python: Found price data in JSON-LD: {result}")
|
||||
break
|
||||
|
||||
except Exception as e:
|
||||
logger.debug(f"Error querying JSON-LD: {e}")
|
||||
continue
|
||||
|
||||
# 2. Try OpenGraph if JSON-LD didn't provide everything
|
||||
og_data = extracted_data.get('opengraph', {})
|
||||
if not result.get('price') and 'og:price:amount' in og_data:
|
||||
try:
|
||||
result['price'] = float(og_data['og:price:amount'])
|
||||
except ValueError:
|
||||
pass
|
||||
if not result.get('currency') and 'og:price:currency' in og_data:
|
||||
result['currency'] = og_data['og:price:currency']
|
||||
if not result.get('availability') and 'og:availability' in og_data:
|
||||
result['availability'] = og_data['og:availability']
|
||||
|
||||
# 3. Use microdata as last resort
|
||||
microdata = extracted_data.get('microdata', {})
|
||||
if not result.get('price') and 'price' in microdata:
|
||||
result['price'] = microdata['price']
|
||||
if not result.get('currency') and 'currency' in microdata:
|
||||
result['currency'] = microdata['currency']
|
||||
if not result.get('availability') and 'availability' in microdata:
|
||||
result['availability'] = microdata['availability']
|
||||
|
||||
return result
|
||||
@@ -328,6 +328,68 @@ def test_api_simple(client, live_server, measure_memory_usage, datastore_path):
|
||||
)
|
||||
assert len(res.json) == 0, "Watch list should be empty"
|
||||
|
||||
def test_roundtrip_API(client, live_server, measure_memory_usage, datastore_path):
|
||||
"""
|
||||
Test the full round trip, this way we test the default Model fits back into OpenAPI spec
|
||||
:param client:
|
||||
:param live_server:
|
||||
:param measure_memory_usage:
|
||||
:param datastore_path:
|
||||
:return:
|
||||
"""
|
||||
api_key = live_server.app.config['DATASTORE'].data['settings']['application'].get('api_access_token')
|
||||
|
||||
set_original_response(datastore_path=datastore_path)
|
||||
test_url = url_for('test_endpoint', _external=True)
|
||||
|
||||
# Create new
|
||||
res = client.post(
|
||||
url_for("createwatch"),
|
||||
data=json.dumps({"url": test_url}),
|
||||
headers={'content-type': 'application/json', 'x-api-key': api_key},
|
||||
follow_redirects=True
|
||||
)
|
||||
|
||||
assert res.status_code == 201
|
||||
uuid = res.json.get('uuid')
|
||||
|
||||
# Now fetch it and send it back
|
||||
|
||||
res = client.get(
|
||||
url_for("watch", uuid=uuid),
|
||||
headers={'x-api-key': api_key}
|
||||
)
|
||||
|
||||
watch=res.json
|
||||
|
||||
# Be sure that 'readOnly' values are never updated in the real watch
|
||||
watch['last_changed'] = 454444444444
|
||||
watch['date_created'] = 454444444444
|
||||
|
||||
# HTTP PUT ( UPDATE an existing watch )
|
||||
res = client.put(
|
||||
url_for("watch", uuid=uuid),
|
||||
headers={'x-api-key': api_key, 'content-type': 'application/json'},
|
||||
data=json.dumps(watch),
|
||||
)
|
||||
if res.status_code != 200:
|
||||
print(f"\n=== PUT failed with {res.status_code} ===")
|
||||
print(f"Error: {res.data}")
|
||||
assert res.status_code == 200, "HTTP PUT update was sent OK"
|
||||
|
||||
res = client.get(
|
||||
url_for("watch", uuid=uuid),
|
||||
headers={'x-api-key': api_key}
|
||||
)
|
||||
last_changed = res.json.get('last_changed')
|
||||
assert last_changed != 454444444444
|
||||
assert last_changed != "454444444444"
|
||||
|
||||
date_created = res.json.get('date_created')
|
||||
assert date_created != 454444444444
|
||||
assert date_created != "454444444444"
|
||||
|
||||
|
||||
def test_access_denied(client, live_server, measure_memory_usage, datastore_path):
|
||||
# `config_api_token_enabled` Should be On by default
|
||||
res = client.get(
|
||||
@@ -401,6 +463,9 @@ def test_api_watch_PUT_update(client, live_server, measure_memory_usage, datasto
|
||||
follow_redirects=True
|
||||
)
|
||||
|
||||
if res.status_code != 201:
|
||||
print(f"\n=== POST createwatch failed with {res.status_code} ===")
|
||||
print(f"Response: {res.data}")
|
||||
assert res.status_code == 201
|
||||
|
||||
wait_for_all_checks(client)
|
||||
@@ -464,11 +529,12 @@ def test_api_watch_PUT_update(client, live_server, measure_memory_usage, datasto
|
||||
)
|
||||
|
||||
assert res.status_code == 400, "Should get error 400 when we give a field that doesnt exist"
|
||||
# Message will come from `flask_expects_json`
|
||||
# With patternProperties for processor_config_*, the error message format changed slightly
|
||||
assert (b'Additional properties are not allowed' in res.data or
|
||||
# Backend validation now rejects unknown fields with a clear error message
|
||||
assert (b'Unknown field' in res.data or
|
||||
b'Additional properties are not allowed' in res.data or
|
||||
b'Unevaluated properties are not allowed' in res.data or
|
||||
b'does not match any of the regexes' in res.data), \
|
||||
"Should reject unknown fields with schema validation error"
|
||||
"Should reject unknown fields with validation error"
|
||||
|
||||
|
||||
# Try a XSS URL
|
||||
@@ -553,6 +619,8 @@ def test_api_import(client, live_server, measure_memory_usage, datastore_path):
|
||||
assert res.status_code == 200
|
||||
uuid = res.json[0]
|
||||
watch = live_server.app.config['DATASTORE'].data['watching'][uuid]
|
||||
assert isinstance(watch['notification_urls'], list), "notification_urls must be stored as a list"
|
||||
assert len(watch['notification_urls']) == 2, "notification_urls should have 2 entries"
|
||||
assert 'mailto://test@example.com' in watch['notification_urls'], "notification_urls should contain first email"
|
||||
assert 'mailto://admin@example.com' in watch['notification_urls'], "notification_urls should contain second email"
|
||||
|
||||
@@ -599,6 +667,34 @@ def test_api_import(client, live_server, measure_memory_usage, datastore_path):
|
||||
assert res.status_code == 400, "Should reject unknown field"
|
||||
assert b"Unknown watch configuration parameter" in res.data, "Error message should mention unknown parameter"
|
||||
|
||||
# Test 7: Import with complex nested array (browser_steps) - array of objects
|
||||
browser_steps = json.dumps([
|
||||
{"operation": "wait", "selector": "5", "optional_value": ""},
|
||||
{"operation": "click", "selector": "button.submit", "optional_value": ""}
|
||||
])
|
||||
params = urllib.parse.urlencode({
|
||||
'tag': 'browser-test',
|
||||
'browser_steps': browser_steps
|
||||
})
|
||||
|
||||
res = client.post(
|
||||
url_for("import") + "?" + params,
|
||||
data='https://website8.com',
|
||||
headers={'x-api-key': api_key},
|
||||
follow_redirects=True
|
||||
)
|
||||
|
||||
assert res.status_code == 200, "Should accept browser_steps array"
|
||||
uuid = res.json[0]
|
||||
watch = live_server.app.config['DATASTORE'].data['watching'][uuid]
|
||||
assert len(watch['browser_steps']) == 2, "Should have 2 browser steps"
|
||||
assert watch['browser_steps'][0]['operation'] == 'wait', "First step should be wait"
|
||||
assert watch['browser_steps'][1]['operation'] == 'click', "Second step should be click"
|
||||
assert watch['browser_steps'][1]['selector'] == 'button.submit', "Second step selector should be button.submit"
|
||||
|
||||
# Cleanup
|
||||
delete_all_watches(client)
|
||||
|
||||
|
||||
def test_api_import_small_synchronous(client, live_server, measure_memory_usage, datastore_path):
|
||||
"""Test that small imports (< threshold) are processed synchronously"""
|
||||
@@ -837,7 +933,9 @@ def test_api_url_validation(client, live_server, measure_memory_usage, datastore
|
||||
)
|
||||
assert res.status_code == 400, "Updating watch URL to null should fail"
|
||||
# Accept either OpenAPI validation error or our custom validation error
|
||||
assert b'URL cannot be null' in res.data or b'OpenAPI validation failed' in res.data or b'validation error' in res.data.lower()
|
||||
assert (b'URL cannot be null' in res.data or
|
||||
b'Validation failed' in res.data or
|
||||
b'validation error' in res.data.lower())
|
||||
|
||||
# Test 8: UPDATE to empty string URL should fail
|
||||
res = client.put(
|
||||
@@ -924,3 +1022,140 @@ def test_api_url_validation(client, live_server, measure_memory_usage, datastore
|
||||
headers={'x-api-key': api_key},
|
||||
)
|
||||
delete_all_watches(client)
|
||||
|
||||
|
||||
def test_api_time_between_check_validation(client, live_server, measure_memory_usage, datastore_path):
|
||||
"""
|
||||
Test that time_between_check validation works correctly:
|
||||
- When time_between_check_use_default is false, at least one time value must be > 0
|
||||
- Values must be valid integers
|
||||
"""
|
||||
import json
|
||||
from flask import url_for
|
||||
|
||||
api_key = live_server.app.config['DATASTORE'].data['settings']['application'].get('api_access_token')
|
||||
|
||||
# Test 1: time_between_check_use_default=false with NO time_between_check should fail
|
||||
res = client.post(
|
||||
url_for("createwatch"),
|
||||
data=json.dumps({
|
||||
"url": "https://example.com",
|
||||
"time_between_check_use_default": False
|
||||
}),
|
||||
headers={'content-type': 'application/json', 'x-api-key': api_key},
|
||||
)
|
||||
assert res.status_code == 400, "Should fail when time_between_check_use_default=false with no time_between_check"
|
||||
assert b"At least one time interval" in res.data, "Error message should mention time interval requirement"
|
||||
|
||||
# Test 2: time_between_check_use_default=false with ALL zeros should fail
|
||||
res = client.post(
|
||||
url_for("createwatch"),
|
||||
data=json.dumps({
|
||||
"url": "https://example.com",
|
||||
"time_between_check_use_default": False,
|
||||
"time_between_check": {
|
||||
"weeks": 0,
|
||||
"days": 0,
|
||||
"hours": 0,
|
||||
"minutes": 0,
|
||||
"seconds": 0
|
||||
}
|
||||
}),
|
||||
headers={'content-type': 'application/json', 'x-api-key': api_key},
|
||||
)
|
||||
assert res.status_code == 400, "Should fail when all time values are 0"
|
||||
assert b"At least one time interval" in res.data, "Error message should mention time interval requirement"
|
||||
|
||||
# Test 3: time_between_check_use_default=false with NULL values should fail
|
||||
res = client.post(
|
||||
url_for("createwatch"),
|
||||
data=json.dumps({
|
||||
"url": "https://example.com",
|
||||
"time_between_check_use_default": False,
|
||||
"time_between_check": {
|
||||
"weeks": None,
|
||||
"days": None,
|
||||
"hours": None,
|
||||
"minutes": None,
|
||||
"seconds": None
|
||||
}
|
||||
}),
|
||||
headers={'content-type': 'application/json', 'x-api-key': api_key},
|
||||
)
|
||||
assert res.status_code == 400, "Should fail when all time values are null"
|
||||
assert b"At least one time interval" in res.data, "Error message should mention time interval requirement"
|
||||
|
||||
# Test 4: time_between_check_use_default=false with valid hours should succeed
|
||||
res = client.post(
|
||||
url_for("createwatch"),
|
||||
data=json.dumps({
|
||||
"url": "https://example.com",
|
||||
"time_between_check_use_default": False,
|
||||
"time_between_check": {
|
||||
"hours": 2
|
||||
}
|
||||
}),
|
||||
headers={'content-type': 'application/json', 'x-api-key': api_key},
|
||||
)
|
||||
assert res.status_code == 201, "Should succeed with valid hours value"
|
||||
uuid1 = res.json.get('uuid')
|
||||
|
||||
# Test 5: time_between_check_use_default=false with valid minutes should succeed
|
||||
res = client.post(
|
||||
url_for("createwatch"),
|
||||
data=json.dumps({
|
||||
"url": "https://example2.com",
|
||||
"time_between_check_use_default": False,
|
||||
"time_between_check": {
|
||||
"minutes": 30
|
||||
}
|
||||
}),
|
||||
headers={'content-type': 'application/json', 'x-api-key': api_key},
|
||||
)
|
||||
assert res.status_code == 201, "Should succeed with valid minutes value"
|
||||
uuid2 = res.json.get('uuid')
|
||||
|
||||
# Test 6: time_between_check_use_default=true (or missing) with no time_between_check should succeed (uses defaults)
|
||||
res = client.post(
|
||||
url_for("createwatch"),
|
||||
data=json.dumps({
|
||||
"url": "https://example3.com",
|
||||
"time_between_check_use_default": True
|
||||
}),
|
||||
headers={'content-type': 'application/json', 'x-api-key': api_key},
|
||||
)
|
||||
assert res.status_code == 201, "Should succeed when using default settings"
|
||||
uuid3 = res.json.get('uuid')
|
||||
|
||||
# Test 7: Default behavior (no time_between_check_use_default field) should use defaults and succeed
|
||||
res = client.post(
|
||||
url_for("createwatch"),
|
||||
data=json.dumps({
|
||||
"url": "https://example4.com"
|
||||
}),
|
||||
headers={'content-type': 'application/json', 'x-api-key': api_key},
|
||||
)
|
||||
assert res.status_code == 201, "Should succeed with default behavior (using global settings)"
|
||||
uuid4 = res.json.get('uuid')
|
||||
|
||||
# Test 8: Verify integer type validation - string should fail (OpenAPI validation)
|
||||
res = client.post(
|
||||
url_for("createwatch"),
|
||||
data=json.dumps({
|
||||
"url": "https://example5.com",
|
||||
"time_between_check_use_default": False,
|
||||
"time_between_check": {
|
||||
"hours": "not_a_number"
|
||||
}
|
||||
}),
|
||||
headers={'content-type': 'application/json', 'x-api-key': api_key},
|
||||
)
|
||||
assert res.status_code == 400, "Should fail when time value is not an integer"
|
||||
assert b"Validation failed" in res.data or b"not of type" in res.data, "Should mention validation/type error"
|
||||
|
||||
# Cleanup
|
||||
for uuid in [uuid1, uuid2, uuid3, uuid4]:
|
||||
client.delete(
|
||||
url_for("watch", uuid=uuid),
|
||||
headers={'x-api-key': api_key},
|
||||
)
|
||||
|
||||
@@ -107,7 +107,7 @@ def test_watch_notification_urls_validation(client, live_server, measure_memory_
|
||||
headers={'content-type': 'application/json', 'x-api-key': api_key}
|
||||
)
|
||||
assert res.status_code == 400, "Should reject non-list notification_urls"
|
||||
assert b"OpenAPI validation failed" in res.data or b"Request body validation error" in res.data
|
||||
assert b"Validation failed" in res.data or b"is not of type" in res.data
|
||||
|
||||
# Test 6: Verify original URLs are preserved after failed update
|
||||
res = client.get(
|
||||
@@ -159,7 +159,7 @@ def test_tag_notification_urls_validation(client, live_server, measure_memory_us
|
||||
headers={'content-type': 'application/json', 'x-api-key': api_key}
|
||||
)
|
||||
assert res.status_code == 400, "Should reject non-list notification_urls"
|
||||
assert b"OpenAPI validation failed" in res.data or b"Request body validation error" in res.data
|
||||
assert b"Validation failed" in res.data or b"is not of type" in res.data
|
||||
|
||||
# Test 4: Verify original URLs are preserved after failed update
|
||||
tag = datastore.data['settings']['application']['tags'][tag_uuid]
|
||||
|
||||
@@ -26,7 +26,7 @@ def test_openapi_validation_invalid_content_type_on_create_watch(client, live_se
|
||||
|
||||
# Should get 400 error due to OpenAPI validation failure
|
||||
assert res.status_code == 400, f"Expected 400 but got {res.status_code}"
|
||||
assert b"OpenAPI validation failed" in res.data, "Should contain OpenAPI validation error message"
|
||||
assert b"Validation failed" in res.data, "Should contain validation error message"
|
||||
|
||||
|
||||
def test_openapi_validation_missing_required_field_create_watch(client, live_server, measure_memory_usage, datastore_path):
|
||||
@@ -43,7 +43,7 @@ def test_openapi_validation_missing_required_field_create_watch(client, live_ser
|
||||
|
||||
# Should get 400 error due to missing required field
|
||||
assert res.status_code == 400, f"Expected 400 but got {res.status_code}"
|
||||
assert b"OpenAPI validation failed" in res.data, "Should contain OpenAPI validation error message"
|
||||
assert b"Validation failed" in res.data, "Should contain validation error message"
|
||||
|
||||
|
||||
def test_openapi_validation_invalid_field_in_request_body(client, live_server, measure_memory_usage, datastore_path):
|
||||
@@ -80,10 +80,9 @@ def test_openapi_validation_invalid_field_in_request_body(client, live_server, m
|
||||
# Should get 400 error due to invalid field (this will be caught by internal validation)
|
||||
# Note: This tests the flow where OpenAPI validation passes but internal validation catches it
|
||||
assert res.status_code == 400, f"Expected 400 but got {res.status_code}"
|
||||
# With patternProperties for processor_config_*, the error message format changed slightly
|
||||
assert (b"Additional properties are not allowed" in res.data or
|
||||
b"does not match any of the regexes" in res.data), \
|
||||
"Should contain validation error about additional/invalid properties"
|
||||
# Backend validation now returns "Unknown field(s):" message
|
||||
assert b"Unknown field" in res.data, \
|
||||
"Should contain validation error about unknown fields"
|
||||
|
||||
|
||||
def test_openapi_validation_import_wrong_content_type(client, live_server, measure_memory_usage, datastore_path):
|
||||
@@ -100,7 +99,7 @@ def test_openapi_validation_import_wrong_content_type(client, live_server, measu
|
||||
|
||||
# Should get 400 error due to content-type mismatch
|
||||
assert res.status_code == 400, f"Expected 400 but got {res.status_code}"
|
||||
assert b"OpenAPI validation failed" in res.data, "Should contain OpenAPI validation error message"
|
||||
assert b"Validation failed" in res.data, "Should contain validation error message"
|
||||
|
||||
|
||||
def test_openapi_validation_import_correct_content_type_succeeds(client, live_server, measure_memory_usage, datastore_path):
|
||||
@@ -158,7 +157,7 @@ def test_openapi_validation_create_tag_missing_required_title(client, live_serve
|
||||
|
||||
# Should get 400 error due to missing required field
|
||||
assert res.status_code == 400, f"Expected 400 but got {res.status_code}"
|
||||
assert b"OpenAPI validation failed" in res.data, "Should contain OpenAPI validation error message"
|
||||
assert b"Validation failed" in res.data, "Should contain validation error message"
|
||||
|
||||
|
||||
def test_openapi_validation_watch_update_allows_partial_updates(client, live_server, measure_memory_usage, datastore_path):
|
||||
|
||||
@@ -176,4 +176,57 @@ def test_api_tags_listing(client, live_server, measure_memory_usage, datastore_p
|
||||
assert res.status_code == 204
|
||||
|
||||
|
||||
def test_roundtrip_API(client, live_server, measure_memory_usage, datastore_path):
|
||||
"""
|
||||
Test the full round trip, this way we test the default Model fits back into OpenAPI spec
|
||||
:param client:
|
||||
:param live_server:
|
||||
:param measure_memory_usage:
|
||||
:param datastore_path:
|
||||
:return:
|
||||
"""
|
||||
api_key = live_server.app.config['DATASTORE'].data['settings']['application'].get('api_access_token')
|
||||
|
||||
set_original_response(datastore_path=datastore_path)
|
||||
|
||||
res = client.post(
|
||||
url_for("tag"),
|
||||
data=json.dumps({"title": "My tag title"}),
|
||||
headers={'content-type': 'application/json', 'x-api-key': api_key}
|
||||
)
|
||||
assert res.status_code == 201
|
||||
|
||||
uuid = res.json.get('uuid')
|
||||
|
||||
# Now fetch it and send it back
|
||||
|
||||
res = client.get(
|
||||
url_for("tag", uuid=uuid),
|
||||
headers={'x-api-key': api_key}
|
||||
)
|
||||
|
||||
tag = res.json
|
||||
|
||||
# Only test with date_created (readOnly field that should be filtered out)
|
||||
# last_changed is Watch-specific and doesn't apply to Tags
|
||||
tag['date_created'] = 454444444444
|
||||
|
||||
# HTTP PUT ( UPDATE an existing watch )
|
||||
res = client.put(
|
||||
url_for("tag", uuid=uuid),
|
||||
headers={'x-api-key': api_key, 'content-type': 'application/json'},
|
||||
data=json.dumps(tag),
|
||||
)
|
||||
if res.status_code != 200:
|
||||
print(f"\n=== PUT failed with {res.status_code} ===")
|
||||
print(f"Error: {res.data}")
|
||||
assert res.status_code == 200, "HTTP PUT update was sent OK"
|
||||
|
||||
# Verify readOnly fields like date_created cannot be overridden
|
||||
res = client.get(
|
||||
url_for("tag", uuid=uuid),
|
||||
headers={'x-api-key': api_key}
|
||||
)
|
||||
date_created = res.json.get('date_created')
|
||||
assert date_created != 454444444444, "ReadOnly date_created should not be updateable"
|
||||
assert date_created != "454444444444", "ReadOnly date_created should not be updateable"
|
||||
|
||||
@@ -28,7 +28,7 @@ info:
|
||||
|
||||
For example: `x-api-key: YOUR_API_KEY`
|
||||
|
||||
version: 0.1.5
|
||||
version: 0.1.6
|
||||
contact:
|
||||
name: ChangeDetection.io
|
||||
url: https://github.com/dgtlmoon/changedetection.io
|
||||
@@ -126,13 +126,22 @@ components:
|
||||
WatchBase:
|
||||
type: object
|
||||
properties:
|
||||
uuid:
|
||||
type: string
|
||||
format: uuid
|
||||
description: Unique identifier
|
||||
readOnly: true
|
||||
date_created:
|
||||
type: [integer, 'null']
|
||||
description: Unix timestamp of creation
|
||||
readOnly: true
|
||||
url:
|
||||
type: string
|
||||
format: uri
|
||||
description: URL to monitor for changes
|
||||
maxLength: 5000
|
||||
title:
|
||||
type: string
|
||||
type: [string, 'null']
|
||||
description: Custom title for the web page change monitor (watch), not to be confused with page_title
|
||||
maxLength: 5000
|
||||
tag:
|
||||
@@ -156,56 +165,61 @@ components:
|
||||
description: HTTP method to use
|
||||
fetch_backend:
|
||||
type: string
|
||||
enum: [html_requests, html_webdriver]
|
||||
description: Backend to use for fetching content
|
||||
description: |
|
||||
Backend to use for fetching content. Common values:
|
||||
- `system` (default) - Use the system-wide default fetcher
|
||||
- `html_requests` - Fast requests-based fetcher
|
||||
- `html_webdriver` - Browser-based fetcher (Playwright/Puppeteer)
|
||||
- `extra_browser_*` - Custom browser configurations (if configured)
|
||||
- Plugin-provided fetchers (if installed)
|
||||
pattern: '^(system|html_requests|html_webdriver|extra_browser_.+)$'
|
||||
default: system
|
||||
headers:
|
||||
type: object
|
||||
additionalProperties:
|
||||
type: string
|
||||
description: HTTP headers to include in requests
|
||||
body:
|
||||
type: string
|
||||
type: [string, 'null']
|
||||
description: HTTP request body
|
||||
maxLength: 5000
|
||||
proxy:
|
||||
type: string
|
||||
type: [string, 'null']
|
||||
description: Proxy configuration
|
||||
maxLength: 5000
|
||||
ignore_status_codes:
|
||||
type: [boolean, 'null']
|
||||
description: Ignore HTTP status code errors (boolean or null)
|
||||
webdriver_delay:
|
||||
type: integer
|
||||
type: [integer, 'null']
|
||||
description: Delay in seconds for webdriver
|
||||
webdriver_js_execute_code:
|
||||
type: string
|
||||
type: [string, 'null']
|
||||
description: JavaScript code to execute
|
||||
maxLength: 5000
|
||||
time_between_check:
|
||||
type: object
|
||||
properties:
|
||||
weeks:
|
||||
type: integer
|
||||
type: [integer, 'null']
|
||||
minimum: 0
|
||||
maximum: 52000
|
||||
nullable: true
|
||||
days:
|
||||
type: integer
|
||||
type: [integer, 'null']
|
||||
minimum: 0
|
||||
maximum: 365000
|
||||
nullable: true
|
||||
hours:
|
||||
type: integer
|
||||
type: [integer, 'null']
|
||||
minimum: 0
|
||||
maximum: 8760000
|
||||
nullable: true
|
||||
minutes:
|
||||
type: integer
|
||||
type: [integer, 'null']
|
||||
minimum: 0
|
||||
maximum: 525600000
|
||||
nullable: true
|
||||
seconds:
|
||||
type: integer
|
||||
type: [integer, 'null']
|
||||
minimum: 0
|
||||
maximum: 31536000000
|
||||
nullable: true
|
||||
description: Time intervals between checks. All fields must be non-negative. At least one non-zero value required when not using default settings.
|
||||
time_between_check_use_default:
|
||||
type: boolean
|
||||
@@ -219,11 +233,11 @@ components:
|
||||
maxItems: 100
|
||||
description: Notification URLs for this web page change monitor (watch). Maximum 100 URLs.
|
||||
notification_title:
|
||||
type: string
|
||||
type: [string, 'null']
|
||||
description: Custom notification title
|
||||
maxLength: 5000
|
||||
notification_body:
|
||||
type: string
|
||||
type: [string, 'null']
|
||||
description: Custom notification body
|
||||
maxLength: 5000
|
||||
notification_format:
|
||||
@@ -231,7 +245,7 @@ components:
|
||||
enum: ['text', 'html', 'htmlcolor', 'markdown', 'System default']
|
||||
description: Format for notifications
|
||||
track_ldjson_price_data:
|
||||
type: boolean
|
||||
type: [boolean, 'null']
|
||||
description: Whether to track JSON-LD price data
|
||||
browser_steps:
|
||||
type: array
|
||||
@@ -239,17 +253,14 @@ components:
|
||||
type: object
|
||||
properties:
|
||||
operation:
|
||||
type: string
|
||||
type: [string, 'null']
|
||||
maxLength: 5000
|
||||
nullable: true
|
||||
selector:
|
||||
type: string
|
||||
type: [string, 'null']
|
||||
maxLength: 5000
|
||||
nullable: true
|
||||
optional_value:
|
||||
type: string
|
||||
type: [string, 'null']
|
||||
maxLength: 5000
|
||||
nullable: true
|
||||
required: [operation, selector, optional_value]
|
||||
additionalProperties: false
|
||||
maxItems: 100
|
||||
@@ -260,16 +271,197 @@ components:
|
||||
default: text_json_diff
|
||||
description: Optional processor mode to use for change detection. Defaults to `text_json_diff` if not specified.
|
||||
|
||||
# Content Filtering
|
||||
include_filters:
|
||||
type: array
|
||||
items:
|
||||
type: string
|
||||
maxLength: 5000
|
||||
maxItems: 100
|
||||
description: CSS/XPath selectors to extract specific content from the page
|
||||
subtractive_selectors:
|
||||
type: array
|
||||
items:
|
||||
type: string
|
||||
maxLength: 5000
|
||||
maxItems: 100
|
||||
description: CSS/XPath selectors to remove content from the page
|
||||
ignore_text:
|
||||
type: array
|
||||
items:
|
||||
type: string
|
||||
maxLength: 5000
|
||||
maxItems: 100
|
||||
description: Text patterns to ignore in change detection
|
||||
trigger_text:
|
||||
type: array
|
||||
items:
|
||||
type: string
|
||||
maxLength: 5000
|
||||
maxItems: 100
|
||||
description: Text/regex patterns that must be present to trigger a change
|
||||
text_should_not_be_present:
|
||||
type: array
|
||||
items:
|
||||
type: string
|
||||
maxLength: 5000
|
||||
maxItems: 100
|
||||
description: Text that should NOT be present (triggers alert if found)
|
||||
extract_text:
|
||||
type: array
|
||||
items:
|
||||
type: string
|
||||
maxLength: 5000
|
||||
maxItems: 100
|
||||
description: Regex patterns to extract specific text after filtering
|
||||
|
||||
# Text Processing
|
||||
trim_text_whitespace:
|
||||
type: boolean
|
||||
default: false
|
||||
description: Strip leading/trailing whitespace from text
|
||||
sort_text_alphabetically:
|
||||
type: boolean
|
||||
default: false
|
||||
description: Sort lines alphabetically before comparison
|
||||
remove_duplicate_lines:
|
||||
type: boolean
|
||||
default: false
|
||||
description: Remove duplicate lines from content
|
||||
check_unique_lines:
|
||||
type: boolean
|
||||
default: false
|
||||
description: Compare against all history for unique lines
|
||||
strip_ignored_lines:
|
||||
type: [boolean, 'null']
|
||||
description: Remove lines matching ignore patterns
|
||||
|
||||
# Change Detection Filters
|
||||
filter_text_added:
|
||||
type: boolean
|
||||
default: true
|
||||
description: Include added text in change detection
|
||||
filter_text_removed:
|
||||
type: boolean
|
||||
default: true
|
||||
description: Include removed text in change detection
|
||||
filter_text_replaced:
|
||||
type: boolean
|
||||
default: true
|
||||
description: Include replaced text in change detection
|
||||
|
||||
# Restock/Price Detection
|
||||
in_stock_only:
|
||||
type: boolean
|
||||
default: true
|
||||
description: Only trigger on in-stock transitions (restock_diff processor)
|
||||
follow_price_changes:
|
||||
type: boolean
|
||||
default: true
|
||||
description: Monitor and track price changes (restock_diff processor)
|
||||
price_change_threshold_percent:
|
||||
type: [number, 'null']
|
||||
description: Minimum price change percentage to trigger notification
|
||||
has_ldjson_price_data:
|
||||
type: [boolean, 'null']
|
||||
description: Whether page has LD-JSON price data (auto-detected)
|
||||
readOnly: true
|
||||
|
||||
# Notifications
|
||||
notification_screenshot:
|
||||
type: boolean
|
||||
default: false
|
||||
description: Include screenshot in notifications (if supported by notification URL)
|
||||
filter_failure_notification_send:
|
||||
type: boolean
|
||||
default: true
|
||||
description: Send notification when filters fail to match content
|
||||
|
||||
# History & Display
|
||||
use_page_title_in_list:
|
||||
type: [boolean, 'null']
|
||||
description: Display page title in watch list (null = use system default)
|
||||
history_snapshot_max_length:
|
||||
type: [integer, 'null']
|
||||
minimum: 1
|
||||
maximum: 1000
|
||||
description: Maximum number of history snapshots to keep (null = use system default)
|
||||
|
||||
# Scheduling
|
||||
time_schedule_limit:
|
||||
type: object
|
||||
description: Weekly schedule limiting when checks can run
|
||||
properties:
|
||||
enabled:
|
||||
type: boolean
|
||||
default: false
|
||||
monday:
|
||||
$ref: '#/components/schemas/DaySchedule'
|
||||
tuesday:
|
||||
$ref: '#/components/schemas/DaySchedule'
|
||||
wednesday:
|
||||
$ref: '#/components/schemas/DaySchedule'
|
||||
thursday:
|
||||
$ref: '#/components/schemas/DaySchedule'
|
||||
friday:
|
||||
$ref: '#/components/schemas/DaySchedule'
|
||||
saturday:
|
||||
$ref: '#/components/schemas/DaySchedule'
|
||||
sunday:
|
||||
$ref: '#/components/schemas/DaySchedule'
|
||||
|
||||
# Conditions (advanced logic)
|
||||
conditions:
|
||||
type: array
|
||||
items:
|
||||
type: object
|
||||
properties:
|
||||
field:
|
||||
type: string
|
||||
description: Field to check (e.g., 'page_filtered_text', 'page_title')
|
||||
operator:
|
||||
type: string
|
||||
description: Comparison operator (e.g., 'contains_regex', 'equals', 'not_equals')
|
||||
value:
|
||||
type: string
|
||||
description: Value to compare against
|
||||
required: [field, operator, value]
|
||||
maxItems: 100
|
||||
description: Array of condition rules for change detection logic (empty array when not set)
|
||||
conditions_match_logic:
|
||||
type: string
|
||||
enum: ['ALL', 'ANY']
|
||||
default: 'ALL'
|
||||
description: Logic operator - ALL (match all conditions) or ANY (match any condition)
|
||||
|
||||
DaySchedule:
|
||||
type: object
|
||||
properties:
|
||||
enabled:
|
||||
type: boolean
|
||||
default: true
|
||||
start_time:
|
||||
type: string
|
||||
pattern: '^([0-1]?[0-9]|2[0-3]):[0-5][0-9]$'
|
||||
default: '00:00'
|
||||
description: Start time in HH:MM format
|
||||
duration:
|
||||
type: object
|
||||
properties:
|
||||
hours:
|
||||
type: string
|
||||
pattern: '^[0-9]+$'
|
||||
default: '24'
|
||||
minutes:
|
||||
type: string
|
||||
pattern: '^[0-9]+$'
|
||||
default: '00'
|
||||
|
||||
Watch:
|
||||
allOf:
|
||||
- $ref: '#/components/schemas/WatchBase'
|
||||
- type: object
|
||||
properties:
|
||||
uuid:
|
||||
type: string
|
||||
format: uuid
|
||||
description: Unique identifier for the web page change monitor (watch)
|
||||
readOnly: true
|
||||
last_checked:
|
||||
type: integer
|
||||
description: Unix timestamp of last check
|
||||
@@ -278,9 +470,10 @@ components:
|
||||
type: integer
|
||||
description: Unix timestamp of last change
|
||||
readOnly: true
|
||||
x-computed: true
|
||||
last_error:
|
||||
type: string
|
||||
description: Last error message
|
||||
type: [string, boolean, 'null']
|
||||
description: Last error message (false when no error, string when error occurred, null if not checked yet)
|
||||
readOnly: true
|
||||
last_viewed:
|
||||
type: integer
|
||||
@@ -291,6 +484,61 @@ components:
|
||||
format: string
|
||||
description: The watch URL rendered in case of any Jinja2 markup, always use this for listing.
|
||||
readOnly: true
|
||||
x-computed: true
|
||||
page_title:
|
||||
type: [string, 'null']
|
||||
description: HTML <title> tag extracted from the page
|
||||
readOnly: true
|
||||
check_count:
|
||||
type: integer
|
||||
description: Total number of checks performed
|
||||
readOnly: true
|
||||
fetch_time:
|
||||
type: number
|
||||
description: Duration of last fetch in seconds
|
||||
readOnly: true
|
||||
previous_md5:
|
||||
type: [string, boolean]
|
||||
description: MD5 hash of previous content (false if not set)
|
||||
readOnly: true
|
||||
previous_md5_before_filters:
|
||||
type: [string, boolean]
|
||||
description: MD5 hash before filters applied (false if not set)
|
||||
readOnly: true
|
||||
consecutive_filter_failures:
|
||||
type: integer
|
||||
description: Counter for consecutive filter match failures
|
||||
readOnly: true
|
||||
last_notification_error:
|
||||
type: [string, 'null']
|
||||
description: Last notification error message
|
||||
readOnly: true
|
||||
notification_alert_count:
|
||||
type: integer
|
||||
description: Number of notifications sent
|
||||
readOnly: true
|
||||
content-type:
|
||||
type: [string, 'null']
|
||||
description: Content-Type from last fetch
|
||||
readOnly: true
|
||||
remote_server_reply:
|
||||
type: [string, 'null']
|
||||
description: Server header from last response
|
||||
readOnly: true
|
||||
browser_steps_last_error_step:
|
||||
type: [integer, 'null']
|
||||
description: Last browser step that caused an error
|
||||
readOnly: true
|
||||
viewed:
|
||||
type: [integer, boolean]
|
||||
description: Computed property - true if watch has been viewed, false otherwise (deprecated, use last_viewed instead)
|
||||
readOnly: true
|
||||
x-computed: true
|
||||
history_n:
|
||||
type: integer
|
||||
description: Number of history snapshots available
|
||||
readOnly: true
|
||||
x-computed: true
|
||||
|
||||
CreateWatch:
|
||||
allOf:
|
||||
@@ -301,34 +549,45 @@ components:
|
||||
|
||||
UpdateWatch:
|
||||
allOf:
|
||||
- $ref: '#/components/schemas/WatchBase'
|
||||
- $ref: '#/components/schemas/WatchBase' # Extends WatchBase for user-settable fields
|
||||
- type: object
|
||||
properties:
|
||||
last_viewed:
|
||||
type: integer
|
||||
description: Unix timestamp in seconds of the last time the watch was viewed. Setting it to a value higher than `last_changed` in the "Update watch" endpoint marks the watch as viewed.
|
||||
minimum: 0
|
||||
# Note: ReadOnly and @property fields are filtered out in the backend before update
|
||||
# We don't use unevaluatedProperties:false here to allow roundtrip GET/PUT workflows
|
||||
# where the response includes computed fields that should be silently ignored
|
||||
|
||||
Tag:
|
||||
type: object
|
||||
properties:
|
||||
uuid:
|
||||
type: string
|
||||
format: uuid
|
||||
description: Unique identifier for the tag
|
||||
readOnly: true
|
||||
title:
|
||||
type: string
|
||||
description: Tag title
|
||||
maxLength: 5000
|
||||
notification_urls:
|
||||
type: array
|
||||
items:
|
||||
type: string
|
||||
description: Default notification URLs for web page change monitors (watches) with this tag
|
||||
notification_muted:
|
||||
type: boolean
|
||||
description: Whether notifications are muted for this tag
|
||||
allOf:
|
||||
- $ref: '#/components/schemas/WatchBase'
|
||||
- type: object
|
||||
properties:
|
||||
overrides_watch:
|
||||
type: [boolean, 'null']
|
||||
description: |
|
||||
Whether this tag's settings override watch settings for all watches in this tag/group.
|
||||
- true: Tag settings override watch settings
|
||||
- false: Tag settings do not override (watches use their own settings)
|
||||
- null: Not decided yet / inherit default behavior
|
||||
# Future: Aggregated statistics from all watches with this tag
|
||||
# check_count:
|
||||
# type: integer
|
||||
# description: Sum of check_count from all watches with this tag
|
||||
# readOnly: true
|
||||
# x-computed: true
|
||||
# last_checked:
|
||||
# type: integer
|
||||
# description: Most recent last_checked timestamp from all watches with this tag
|
||||
# readOnly: true
|
||||
# x-computed: true
|
||||
# last_changed:
|
||||
# type: integer
|
||||
# description: Most recent last_changed timestamp from all watches with this tag
|
||||
# readOnly: true
|
||||
# x-computed: true
|
||||
|
||||
CreateTag:
|
||||
allOf:
|
||||
|
||||
File diff suppressed because one or more lines are too long
@@ -5,7 +5,6 @@ flask-compress
|
||||
# 0.6.3 included compatibility fix for werkzeug 3.x (2.x had deprecation of url handlers)
|
||||
flask-login>=0.6.3
|
||||
flask-paginate
|
||||
flask_expects_json~=1.7
|
||||
flask_restful
|
||||
flask_cors # For the Chrome extension to operate
|
||||
# janus # No longer needed - using pure threading.Queue for multi-loop support
|
||||
@@ -126,8 +125,8 @@ greenlet >= 3.0.3
|
||||
# Default SOCKETIO_MODE=threading is recommended for better compatibility
|
||||
gevent
|
||||
|
||||
# Pinned or it causes problems with flask_expects_json which seems unmaintained
|
||||
referencing==0.35.1
|
||||
# Previously pinned for flask_expects_json (removed 2026-02). Unpinning for now.
|
||||
referencing
|
||||
|
||||
# For conditions
|
||||
panzi-json-logic
|
||||
|
||||
Reference in New Issue
Block a user