Better support for watch API private vars

This commit is contained in:
dgtlmoon
2026-05-15 12:21:18 +02:00
parent bf5efc4c31
commit 11c728ee4a
6 changed files with 190 additions and 31 deletions
+8 -5
View File
@@ -7,7 +7,7 @@ import threading
from flask import request
from . import auth
from . import validate_openapi_request
from . import validate_openapi_request, strip_internal_api_fields
class Tag(Resource):
@@ -85,7 +85,8 @@ class Tag(Resource):
# Create clean tag dict without Watch-specific fields
clean_tag = {k: v for k, v in tag.items() if k not in watch_only_fields}
return clean_tag
# Never expose `__`-prefixed transient/internal fields
return strip_internal_api_fields(clean_tag)
@auth.check_token
@validate_openapi_request('deleteTag')
@@ -113,8 +114,9 @@ class Tag(Resource):
if not tag:
abort(404, message='No tag exists with the UUID of {}'.format(uuid))
# Make a mutable copy of request.json for modification
json_data = dict(request.json)
# Make a mutable copy of request.json for modification.
# Silently discard `__`-prefixed transient/internal keys (not part of the public schema).
json_data = strip_internal_api_fields(dict(request.json))
# Validate notification_urls if provided
if 'notification_urls' in json_data:
@@ -162,7 +164,8 @@ class Tag(Resource):
def post(self):
"""Create a single tag/group."""
json_data = request.get_json()
# Silently discard `__`-prefixed transient/internal keys (not part of the public schema).
json_data = strip_internal_api_fields(request.get_json())
title = json_data.get("title",'').strip()
# Validate that only valid fields are provided
+9 -5
View File
@@ -12,7 +12,7 @@ from flask_restful import abort, Resource
from loguru import logger
import copy
from . import validate_openapi_request, get_readonly_watch_fields
from . import validate_openapi_request, get_readonly_watch_fields, strip_internal_api_fields
from ..notification import valid_notification_formats
from ..notification.handler import newline_re
@@ -126,7 +126,8 @@ class Watch(Resource):
watch['processor_config_restock_diff'] = restock_config
watch['processor_config_restock_diff_source'] = restock_source
return watch
# Never expose `__`-prefixed transient/internal fields (e.g. __check_status)
return strip_internal_api_fields(watch)
@auth.check_token
@validate_openapi_request('deleteWatch')
@@ -187,8 +188,10 @@ class Watch(Resource):
# Handle processor-config-* fields separately (save to JSON, not datastore)
from changedetectionio import processors
# Make a mutable copy of request.json for modification
json_data = dict(request.json)
# Make a mutable copy of request.json for modification.
# Silently discard `__`-prefixed transient/internal keys — they are not part of the
# public schema and must never be writable (e.g. clients that round-trip GET → PUT).
json_data = strip_internal_api_fields(dict(request.json))
# Extract and remove processor config fields from json_data
processor_config_data = processors.extract_processor_config_from_form_data(json_data)
@@ -443,7 +446,8 @@ class CreateWatch(Resource):
def post(self):
"""Create a single watch."""
json_data = request.get_json()
# Silently discard `__`-prefixed transient/internal keys (not part of the public schema).
json_data = strip_internal_api_fields(request.get_json())
url = json_data['url'].strip()
if not is_safe_valid_url(url):
+37
View File
@@ -133,6 +133,43 @@ def get_tag_schema_properties():
"""
return _resolve_schema_properties('Tag')
def strip_private_keys(data):
"""
Remove `__`-prefixed keys from a watch/tag dict at the API boundary.
These are transient in-memory fields (e.g. `__check_status` set by the worker to
surface "Fetching page..." in the UI) and are not part of the public OpenAPI
contract. They must never appear in GET responses (otherwise a client that
round-trips GET PUT trips the unknown-field validator), and must be silently
discarded from incoming PUT/POST payloads.
Returns a new dict; the input is not mutated.
"""
if not isinstance(data, dict):
return data
return {k: v for k, v in data.items() if not (isinstance(k, str) and k.startswith('__'))}
def strip_internal_api_fields(data):
"""
Strip both `__`-prefixed keys AND system-managed fields that aren't in the public
OpenAPI spec (skip-cache hashes, LLM runtime state, processor-set status, etc.).
Use this at every public API boundary so GET responses and PUT/POST payloads agree
on what's part of the contract. The set of system-managed fields lives in
model/schema_utils.py:SYSTEM_MANAGED_NON_SPEC_FIELDS extend it there, not here.
Returns a new dict; the input is not mutated.
"""
if not isinstance(data, dict):
return data
from changedetectionio.model.schema_utils import SYSTEM_MANAGED_NON_SPEC_FIELDS
return {
k: v for k, v in data.items()
if not (isinstance(k, str) and (k.startswith('__') or k in SYSTEM_MANAGED_NON_SPEC_FIELDS))
}
def validate_openapi_request(operation_id):
"""Decorator to validate incoming requests against OpenAPI spec."""
def decorator(f):
+7 -21
View File
@@ -343,28 +343,14 @@ class watch_base(dict):
return
# Import from shared schema utilities (no circular dependency)
from .schema_utils import get_readonly_watch_fields
readonly_fields = get_readonly_watch_fields()
from .schema_utils import get_readonly_watch_fields, SYSTEM_MANAGED_NON_SPEC_FIELDS
# Additional system-managed fields not in OpenAPI spec (yet)
# These are set by processors/workers and should not trigger edited flag
additional_system_fields = {
'last_check_status', # Set by processors
'last_filter_config_hash', # Set by text_json_diff processor, internal skip-cache
'restock', # Set by restock processor
'last_viewed', # Set by mark_all_viewed endpoint
# LLM runtime fields written back by worker/evaluator
'_llm_result',
'_llm_intent',
'_llm_change_summary',
'llm_prefilter',
'llm_evaluation_cache',
'llm_last_tokens_used',
'llm_tokens_used_cumulative',
}
# Only mark as edited if this is a user-writable field
if key not in readonly_fields and key not in additional_system_fields:
# `last_viewed` is set internally by mark_all_viewed and shouldn't flag the watch as
# edited, but is not in SYSTEM_MANAGED_NON_SPEC_FIELDS because it IS user-writable via
# the UpdateWatch schema (the API path).
if (key not in get_readonly_watch_fields()
and key != 'last_viewed'
and key not in SYSTEM_MANAGED_NON_SPEC_FIELDS):
self.__watch_was_edited = True
def __setitem__(self, key, value):
+29
View File
@@ -8,6 +8,35 @@ Shared by both the model layer and API layer to avoid circular dependencies.
import functools
# Watch fields written by workers/processors that are NOT part of the public OpenAPI spec.
#
# These fields exist on a watch dict at runtime but are internal implementation details
# (skip-cache hashes, last-check status strings, LLM runtime state, etc.). Used by:
# - model/__init__.py: don't trigger the "edited" flag when these are written internally
# - api/Watch.py: strip from GET responses and silently discard from PUT/POST inputs
# so that a GET → PUT round trip doesn't trip the unknown-field validator
#
# `last_viewed` is intentionally NOT included: it's set internally by mark_all_viewed BUT
# is also explicitly writable via the UpdateWatch schema (see api/Watch.py valid_fields).
SYSTEM_MANAGED_NON_SPEC_FIELDS = frozenset({
'last_check_status', # Set by processors
'last_filter_config_hash', # text_json_diff internal skip-cache
'restock', # Set by restock processor
'_llm_result', # LLM runtime — populated by evaluator
'_llm_intent',
'_llm_change_summary',
'llm_prefilter',
'llm_evaluation_cache',
'llm_last_tokens_used',
'llm_tokens_used_cumulative',
})
def get_system_managed_non_spec_fields():
"""Return the set of internal fields not in the public OpenAPI spec."""
return SYSTEM_MANAGED_NON_SPEC_FIELDS
@functools.cache
def get_openapi_schema_dict():
"""
+100
View File
@@ -406,6 +406,106 @@ def test_roundtrip_API(client, live_server, measure_memory_usage, datastore_path
"extract_lines_containing should be persisted and returned via API"
def test_api_strips_internal_fields(client, live_server, measure_memory_usage, datastore_path):
"""
Internal/transient fields must never cross the API boundary in either direction:
1. `__`-prefixed keys (e.g. `__check_status` set by the worker for UI status)
2. System-managed fields not in the OpenAPI spec (see SYSTEM_MANAGED_NON_SPEC_FIELDS):
`last_check_status`, `last_filter_config_hash`, `_llm_*`, `llm_*`, etc.
GET responses must strip them. PUT/POST payloads must silently discard them.
Without this, a client that round-trips GET PUT trips the unknown-field validator.
"""
from changedetectionio.model.schema_utils import SYSTEM_MANAGED_NON_SPEC_FIELDS
api_key = live_server.app.config['DATASTORE'].data['settings']['application'].get('api_access_token')
datastore = live_server.app.config['DATASTORE']
set_original_response(datastore_path=datastore_path)
test_url = url_for('test_endpoint', _external=True)
# Create
res = client.post(
url_for("createwatch"),
data=json.dumps({"url": test_url}),
headers={'content-type': 'application/json', 'x-api-key': api_key},
follow_redirects=True
)
assert res.status_code == 201
watch_uuid = res.json.get('uuid')
wait_for_all_checks(client)
# Force both a transient __-prefixed and a system-managed field onto the watch,
# simulating worker/processor-set state.
watch_obj = datastore.data['watching'][watch_uuid]
watch_obj['__check_status'] = 'Fetching page..'
watch_obj['last_check_status'] = 200
watch_obj['_llm_result'] = {'summary': 'cached llm output'}
watch_obj['last_filter_config_hash'] = 'abc123'
# --- GET must strip all internal fields ---
res = client.get(
url_for("watch", uuid=watch_uuid),
headers={'x-api-key': api_key},
)
assert res.status_code == 200
assert not any(k.startswith('__') for k in res.json.keys()), \
f"No __-prefixed field should leak into API responses; got keys: {list(res.json.keys())}"
leaked_system_fields = SYSTEM_MANAGED_NON_SPEC_FIELDS & set(res.json.keys())
assert not leaked_system_fields, \
f"System-managed non-spec fields must not appear in GET response; leaked: {leaked_system_fields}"
# --- PUT must accept (and silently drop) those same internal fields ---
# This is the key round-trip property: a client should be able to PUT back what it just GET'd.
# Use the actual GET response as the payload (the realistic round-trip case).
payload = dict(res.json)
payload['__check_status'] = 'attacker-supplied value' # not in the GET, but a client could add it
payload['last_check_status'] = 999 # ditto
payload['_llm_result'] = 'attacker overwrite'
res = client.put(
url_for("watch", uuid=watch_uuid),
headers={'x-api-key': api_key, 'content-type': 'application/json'},
data=json.dumps(payload),
)
assert res.status_code == 200, \
f"PUT round-tripping GET response plus internal fields should succeed (got {res.status_code}: {res.data!r})"
# Internal fields must not have been overwritten by the PUT
assert watch_obj.get('__check_status') == 'Fetching page..', \
"PUT must not overwrite __-prefixed fields"
assert watch_obj.get('_llm_result') == {'summary': 'cached llm output'}, \
"PUT must not overwrite system-managed non-spec fields"
# --- POST must also silently discard internal fields ---
# Use unique sentinel values so we can distinguish "POST persisted my value" from
# "the worker concurrently re-set the field while processing the new watch".
attacker_check_status = 'attacker-sentinel-__check_status-9f7c'
attacker_llm_result = 'attacker-sentinel-_llm_result-9f7c'
res = client.post(
url_for("createwatch"),
data=json.dumps({
"url": test_url + "?2",
"__check_status": attacker_check_status,
"_llm_result": attacker_llm_result,
}),
headers={'content-type': 'application/json', 'x-api-key': api_key},
follow_redirects=True,
)
assert res.status_code == 201, \
f"POST with internal fields should succeed (got {res.status_code}: {res.data!r})"
new_uuid = res.json.get('uuid')
new_watch = datastore.data['watching'][new_uuid]
# If POST had persisted the attacker payload these specific sentinel values would remain.
# The worker may legitimately re-set __check_status with its own status string, that's fine.
assert new_watch.get('__check_status') != attacker_check_status, \
"POST must not persist __-prefixed fields from input"
assert new_watch.get('_llm_result') != attacker_llm_result, \
"POST must not persist system-managed fields from input"
delete_all_watches(client)
def test_access_denied(client, live_server, measure_memory_usage, datastore_path):
# `config_api_token_enabled` Should be On by default
res = client.get(