From 11c728ee4ac9f3f2da380dc67f8ce88ae7e38ffa Mon Sep 17 00:00:00 2001 From: dgtlmoon Date: Fri, 15 May 2026 12:21:18 +0200 Subject: [PATCH] Better support for watch API private vars --- changedetectionio/api/Tags.py | 13 +-- changedetectionio/api/Watch.py | 14 ++-- changedetectionio/api/__init__.py | 37 +++++++++ changedetectionio/model/__init__.py | 28 ++----- changedetectionio/model/schema_utils.py | 29 +++++++ changedetectionio/tests/test_api.py | 100 ++++++++++++++++++++++++ 6 files changed, 190 insertions(+), 31 deletions(-) diff --git a/changedetectionio/api/Tags.py b/changedetectionio/api/Tags.py index 1142cd9d..ae09dfe2 100644 --- a/changedetectionio/api/Tags.py +++ b/changedetectionio/api/Tags.py @@ -7,7 +7,7 @@ import threading from flask import request from . import auth -from . import validate_openapi_request +from . import validate_openapi_request, strip_internal_api_fields class Tag(Resource): @@ -85,7 +85,8 @@ class Tag(Resource): # Create clean tag dict without Watch-specific fields clean_tag = {k: v for k, v in tag.items() if k not in watch_only_fields} - return clean_tag + # Never expose `__`-prefixed transient/internal fields + return strip_internal_api_fields(clean_tag) @auth.check_token @validate_openapi_request('deleteTag') @@ -113,8 +114,9 @@ class Tag(Resource): if not tag: abort(404, message='No tag exists with the UUID of {}'.format(uuid)) - # Make a mutable copy of request.json for modification - json_data = dict(request.json) + # Make a mutable copy of request.json for modification. + # Silently discard `__`-prefixed transient/internal keys (not part of the public schema). + json_data = strip_internal_api_fields(dict(request.json)) # Validate notification_urls if provided if 'notification_urls' in json_data: @@ -162,7 +164,8 @@ class Tag(Resource): def post(self): """Create a single tag/group.""" - json_data = request.get_json() + # Silently discard `__`-prefixed transient/internal keys (not part of the public schema). + json_data = strip_internal_api_fields(request.get_json()) title = json_data.get("title",'').strip() # Validate that only valid fields are provided diff --git a/changedetectionio/api/Watch.py b/changedetectionio/api/Watch.py index 3ab87109..9cb4d721 100644 --- a/changedetectionio/api/Watch.py +++ b/changedetectionio/api/Watch.py @@ -12,7 +12,7 @@ from flask_restful import abort, Resource from loguru import logger import copy -from . import validate_openapi_request, get_readonly_watch_fields +from . import validate_openapi_request, get_readonly_watch_fields, strip_internal_api_fields from ..notification import valid_notification_formats from ..notification.handler import newline_re @@ -126,7 +126,8 @@ class Watch(Resource): watch['processor_config_restock_diff'] = restock_config watch['processor_config_restock_diff_source'] = restock_source - return watch + # Never expose `__`-prefixed transient/internal fields (e.g. __check_status) + return strip_internal_api_fields(watch) @auth.check_token @validate_openapi_request('deleteWatch') @@ -187,8 +188,10 @@ class Watch(Resource): # Handle processor-config-* fields separately (save to JSON, not datastore) from changedetectionio import processors - # Make a mutable copy of request.json for modification - json_data = dict(request.json) + # Make a mutable copy of request.json for modification. + # Silently discard `__`-prefixed transient/internal keys — they are not part of the + # public schema and must never be writable (e.g. clients that round-trip GET → PUT). + json_data = strip_internal_api_fields(dict(request.json)) # Extract and remove processor config fields from json_data processor_config_data = processors.extract_processor_config_from_form_data(json_data) @@ -443,7 +446,8 @@ class CreateWatch(Resource): def post(self): """Create a single watch.""" - json_data = request.get_json() + # Silently discard `__`-prefixed transient/internal keys (not part of the public schema). + json_data = strip_internal_api_fields(request.get_json()) url = json_data['url'].strip() if not is_safe_valid_url(url): diff --git a/changedetectionio/api/__init__.py b/changedetectionio/api/__init__.py index 2fe95106..98837680 100644 --- a/changedetectionio/api/__init__.py +++ b/changedetectionio/api/__init__.py @@ -133,6 +133,43 @@ def get_tag_schema_properties(): """ return _resolve_schema_properties('Tag') +def strip_private_keys(data): + """ + Remove `__`-prefixed keys from a watch/tag dict at the API boundary. + + These are transient in-memory fields (e.g. `__check_status` set by the worker to + surface "Fetching page..." in the UI) and are not part of the public OpenAPI + contract. They must never appear in GET responses (otherwise a client that + round-trips GET → PUT trips the unknown-field validator), and must be silently + discarded from incoming PUT/POST payloads. + + Returns a new dict; the input is not mutated. + """ + if not isinstance(data, dict): + return data + return {k: v for k, v in data.items() if not (isinstance(k, str) and k.startswith('__'))} + + +def strip_internal_api_fields(data): + """ + Strip both `__`-prefixed keys AND system-managed fields that aren't in the public + OpenAPI spec (skip-cache hashes, LLM runtime state, processor-set status, etc.). + + Use this at every public API boundary so GET responses and PUT/POST payloads agree + on what's part of the contract. The set of system-managed fields lives in + model/schema_utils.py:SYSTEM_MANAGED_NON_SPEC_FIELDS — extend it there, not here. + + Returns a new dict; the input is not mutated. + """ + if not isinstance(data, dict): + return data + from changedetectionio.model.schema_utils import SYSTEM_MANAGED_NON_SPEC_FIELDS + return { + k: v for k, v in data.items() + if not (isinstance(k, str) and (k.startswith('__') or k in SYSTEM_MANAGED_NON_SPEC_FIELDS)) + } + + def validate_openapi_request(operation_id): """Decorator to validate incoming requests against OpenAPI spec.""" def decorator(f): diff --git a/changedetectionio/model/__init__.py b/changedetectionio/model/__init__.py index 5ee7f40b..61d41ece 100644 --- a/changedetectionio/model/__init__.py +++ b/changedetectionio/model/__init__.py @@ -343,28 +343,14 @@ class watch_base(dict): return # Import from shared schema utilities (no circular dependency) - from .schema_utils import get_readonly_watch_fields - readonly_fields = get_readonly_watch_fields() + from .schema_utils import get_readonly_watch_fields, SYSTEM_MANAGED_NON_SPEC_FIELDS - # Additional system-managed fields not in OpenAPI spec (yet) - # These are set by processors/workers and should not trigger edited flag - additional_system_fields = { - 'last_check_status', # Set by processors - 'last_filter_config_hash', # Set by text_json_diff processor, internal skip-cache - 'restock', # Set by restock processor - 'last_viewed', # Set by mark_all_viewed endpoint - # LLM runtime fields written back by worker/evaluator - '_llm_result', - '_llm_intent', - '_llm_change_summary', - 'llm_prefilter', - 'llm_evaluation_cache', - 'llm_last_tokens_used', - 'llm_tokens_used_cumulative', - } - - # Only mark as edited if this is a user-writable field - if key not in readonly_fields and key not in additional_system_fields: + # `last_viewed` is set internally by mark_all_viewed and shouldn't flag the watch as + # edited, but is not in SYSTEM_MANAGED_NON_SPEC_FIELDS because it IS user-writable via + # the UpdateWatch schema (the API path). + if (key not in get_readonly_watch_fields() + and key != 'last_viewed' + and key not in SYSTEM_MANAGED_NON_SPEC_FIELDS): self.__watch_was_edited = True def __setitem__(self, key, value): diff --git a/changedetectionio/model/schema_utils.py b/changedetectionio/model/schema_utils.py index d30ffcfb..5f233d23 100644 --- a/changedetectionio/model/schema_utils.py +++ b/changedetectionio/model/schema_utils.py @@ -8,6 +8,35 @@ Shared by both the model layer and API layer to avoid circular dependencies. import functools +# Watch fields written by workers/processors that are NOT part of the public OpenAPI spec. +# +# These fields exist on a watch dict at runtime but are internal implementation details +# (skip-cache hashes, last-check status strings, LLM runtime state, etc.). Used by: +# - model/__init__.py: don't trigger the "edited" flag when these are written internally +# - api/Watch.py: strip from GET responses and silently discard from PUT/POST inputs +# so that a GET → PUT round trip doesn't trip the unknown-field validator +# +# `last_viewed` is intentionally NOT included: it's set internally by mark_all_viewed BUT +# is also explicitly writable via the UpdateWatch schema (see api/Watch.py valid_fields). +SYSTEM_MANAGED_NON_SPEC_FIELDS = frozenset({ + 'last_check_status', # Set by processors + 'last_filter_config_hash', # text_json_diff internal skip-cache + 'restock', # Set by restock processor + '_llm_result', # LLM runtime — populated by evaluator + '_llm_intent', + '_llm_change_summary', + 'llm_prefilter', + 'llm_evaluation_cache', + 'llm_last_tokens_used', + 'llm_tokens_used_cumulative', +}) + + +def get_system_managed_non_spec_fields(): + """Return the set of internal fields not in the public OpenAPI spec.""" + return SYSTEM_MANAGED_NON_SPEC_FIELDS + + @functools.cache def get_openapi_schema_dict(): """ diff --git a/changedetectionio/tests/test_api.py b/changedetectionio/tests/test_api.py index a9f63b2c..b1d8d19c 100644 --- a/changedetectionio/tests/test_api.py +++ b/changedetectionio/tests/test_api.py @@ -406,6 +406,106 @@ def test_roundtrip_API(client, live_server, measure_memory_usage, datastore_path "extract_lines_containing should be persisted and returned via API" +def test_api_strips_internal_fields(client, live_server, measure_memory_usage, datastore_path): + """ + Internal/transient fields must never cross the API boundary in either direction: + 1. `__`-prefixed keys (e.g. `__check_status` set by the worker for UI status) + 2. System-managed fields not in the OpenAPI spec (see SYSTEM_MANAGED_NON_SPEC_FIELDS): + `last_check_status`, `last_filter_config_hash`, `_llm_*`, `llm_*`, etc. + + GET responses must strip them. PUT/POST payloads must silently discard them. + Without this, a client that round-trips GET → PUT trips the unknown-field validator. + """ + from changedetectionio.model.schema_utils import SYSTEM_MANAGED_NON_SPEC_FIELDS + + api_key = live_server.app.config['DATASTORE'].data['settings']['application'].get('api_access_token') + datastore = live_server.app.config['DATASTORE'] + + set_original_response(datastore_path=datastore_path) + test_url = url_for('test_endpoint', _external=True) + + # Create + res = client.post( + url_for("createwatch"), + data=json.dumps({"url": test_url}), + headers={'content-type': 'application/json', 'x-api-key': api_key}, + follow_redirects=True + ) + assert res.status_code == 201 + watch_uuid = res.json.get('uuid') + + wait_for_all_checks(client) + + # Force both a transient __-prefixed and a system-managed field onto the watch, + # simulating worker/processor-set state. + watch_obj = datastore.data['watching'][watch_uuid] + watch_obj['__check_status'] = 'Fetching page..' + watch_obj['last_check_status'] = 200 + watch_obj['_llm_result'] = {'summary': 'cached llm output'} + watch_obj['last_filter_config_hash'] = 'abc123' + + # --- GET must strip all internal fields --- + res = client.get( + url_for("watch", uuid=watch_uuid), + headers={'x-api-key': api_key}, + ) + assert res.status_code == 200 + assert not any(k.startswith('__') for k in res.json.keys()), \ + f"No __-prefixed field should leak into API responses; got keys: {list(res.json.keys())}" + leaked_system_fields = SYSTEM_MANAGED_NON_SPEC_FIELDS & set(res.json.keys()) + assert not leaked_system_fields, \ + f"System-managed non-spec fields must not appear in GET response; leaked: {leaked_system_fields}" + + # --- PUT must accept (and silently drop) those same internal fields --- + # This is the key round-trip property: a client should be able to PUT back what it just GET'd. + # Use the actual GET response as the payload (the realistic round-trip case). + payload = dict(res.json) + payload['__check_status'] = 'attacker-supplied value' # not in the GET, but a client could add it + payload['last_check_status'] = 999 # ditto + payload['_llm_result'] = 'attacker overwrite' + res = client.put( + url_for("watch", uuid=watch_uuid), + headers={'x-api-key': api_key, 'content-type': 'application/json'}, + data=json.dumps(payload), + ) + assert res.status_code == 200, \ + f"PUT round-tripping GET response plus internal fields should succeed (got {res.status_code}: {res.data!r})" + + # Internal fields must not have been overwritten by the PUT + assert watch_obj.get('__check_status') == 'Fetching page..', \ + "PUT must not overwrite __-prefixed fields" + assert watch_obj.get('_llm_result') == {'summary': 'cached llm output'}, \ + "PUT must not overwrite system-managed non-spec fields" + + # --- POST must also silently discard internal fields --- + # Use unique sentinel values so we can distinguish "POST persisted my value" from + # "the worker concurrently re-set the field while processing the new watch". + attacker_check_status = 'attacker-sentinel-__check_status-9f7c' + attacker_llm_result = 'attacker-sentinel-_llm_result-9f7c' + res = client.post( + url_for("createwatch"), + data=json.dumps({ + "url": test_url + "?2", + "__check_status": attacker_check_status, + "_llm_result": attacker_llm_result, + }), + headers={'content-type': 'application/json', 'x-api-key': api_key}, + follow_redirects=True, + ) + assert res.status_code == 201, \ + f"POST with internal fields should succeed (got {res.status_code}: {res.data!r})" + new_uuid = res.json.get('uuid') + new_watch = datastore.data['watching'][new_uuid] + # If POST had persisted the attacker payload these specific sentinel values would remain. + # The worker may legitimately re-set __check_status with its own status string, that's fine. + assert new_watch.get('__check_status') != attacker_check_status, \ + "POST must not persist __-prefixed fields from input" + assert new_watch.get('_llm_result') != attacker_llm_result, \ + "POST must not persist system-managed fields from input" + + delete_all_watches(client) + + def test_access_denied(client, live_server, measure_memory_usage, datastore_path): # `config_api_token_enabled` Should be On by default res = client.get(