Compare commits

..

2 Commits

Author SHA1 Message Date
dgtlmoon
df8f61be98 Re #3833 2026-02-05 16:37:33 +01:00
dgtlmoon
bdc2916c07 New datastore message should be warning not critical 2026-02-05 16:25:22 +01:00
5 changed files with 56 additions and 130 deletions

View File

@@ -66,42 +66,47 @@ class Watch(Resource):
@validate_openapi_request('getWatch')
def get(self, uuid):
"""Get information about a single watch, recheck, pause, or mute."""
# Get watch reference first (for pause/mute operations)
watch_obj = self.datastore.data['watching'].get(uuid)
if not watch_obj:
abort(404, message='No watch exists with the UUID of {}'.format(uuid))
import time
from copy import deepcopy
watch = None
# Retry up to 20 times if dict is being modified
# With sleep(0), this is fast: ~200µs best case, ~20ms worst case under heavy load
for attempt in range(20):
try:
watch = deepcopy(self.datastore.data['watching'].get(uuid))
break
except RuntimeError:
# Dict changed during deepcopy, retry after yielding to scheduler
# sleep(0) releases GIL and yields - no fixed delay, just lets other threads run
if attempt < 19: # Don't yield on last attempt
time.sleep(0) # Yield to scheduler (microseconds, not milliseconds)
# Create a dict copy for JSON response (with lock for thread safety)
# This is much faster than deepcopy and doesn't copy the datastore reference
# WARNING: dict() is a SHALLOW copy - nested dicts are shared with original!
# Only safe because we only ADD scalar properties (line 97-101), never modify nested dicts
# If you need to modify nested dicts, use: from copy import deepcopy; watch = deepcopy(dict(watch_obj))
with self.datastore.lock:
watch = dict(watch_obj)
if not watch:
abort(404, message='No watch exists with the UUID of {}'.format(uuid))
if request.args.get('recheck'):
worker_pool.queue_item_async_safe(self.update_q, queuedWatchMetaData.PrioritizedItem(priority=1, item={'uuid': uuid}))
return "OK", 200
if request.args.get('paused', '') == 'paused':
watch_obj.pause()
self.datastore.data['watching'].get(uuid).pause()
return "OK", 200
elif request.args.get('paused', '') == 'unpaused':
watch_obj.unpause()
self.datastore.data['watching'].get(uuid).unpause()
return "OK", 200
if request.args.get('muted', '') == 'muted':
watch_obj.mute()
self.datastore.data['watching'].get(uuid).mute()
return "OK", 200
elif request.args.get('muted', '') == 'unmuted':
watch_obj.unmute()
self.datastore.data['watching'].get(uuid).unmute()
return "OK", 200
# Return without history, get that via another API call
# Properties are not returned as a JSON, so add the required props manually
watch['history_n'] = watch_obj.history_n
watch['history_n'] = watch.history_n
# attr .last_changed will check for the last written text snapshot on change
watch['last_changed'] = watch_obj.last_changed
watch['viewed'] = watch_obj.viewed
watch['link'] = watch_obj.link,
watch['last_changed'] = watch.last_changed
watch['viewed'] = watch.viewed
watch['link'] = watch.link,
return watch

View File

@@ -131,95 +131,6 @@ class model(watch_base):
# Be sure the cached timestamp is ready
bump = self.history
def __deepcopy__(self, memo):
    """
    Deep-copy this watch's dict data while SHARING its datastore reference.

    Without this hook, deepcopy(watch) would follow the datastore reference and
    copy the entire datastore (which contains all other watches) for every
    watch that gets copied.

    What the copy receives:
    - every dict entry, deep-copied
    - the cached history values, jitter_seconds and the datastore path,
      carried over by plain assignment
    - the very same datastore object as the original (not a copy)

    The favicon filename cache is intentionally not carried over; it is only
    a performance cache and is meant to be rebuilt on first access.
    """
    from copy import deepcopy

    # Build the instance without running __init__ (avoids the datastore requirement)
    klass = self.__class__
    duplicate = klass.__new__(klass)
    # Register the copy first so references back to this watch resolve to it
    memo[id(self)] = duplicate

    # The watch settings themselves
    for name, item in self.items():
        duplicate[name] = deepcopy(item, memo)

    # Instance attributes handed over as-is, in this order; the datastore is
    # deliberately last and deliberately shared rather than copied
    carried_over = (
        '_model__newest_history_key',
        '_model__history_n',
        'jitter_seconds',
        '_model__datastore_path',
        '_model__datastore',
    )
    for attr_name in carried_over:
        setattr(duplicate, attr_name, getattr(self, attr_name))

    return duplicate
def __getstate__(self):
    """
    Build the state used for pickling: the dict data plus selected attributes.

    The returned plain dict holds every entry of this watch, with the instance
    attributes worth preserving stored under the '__watch_metadata__' key.
    The datastore reference and the favicon filename cache are intentionally
    left out, so the datastore is never serialized.
    """
    # Instance attributes that should survive a pickle round trip
    preserved = {
        'newest_history_key': self._model__newest_history_key,
        'history_n': self._model__history_n,
        'jitter_seconds': self.jitter_seconds,
        'datastore_path': self._model__datastore_path,
    }

    # Plain-dict copy of the watch entries, with the metadata riding along
    snapshot = dict(self)
    snapshot['__watch_metadata__'] = preserved
    return snapshot
def __setstate__(self, state):
    """
    Custom pickle deserialization.

    WARNING: This creates a Watch without a __datastore reference!
    The caller MUST set watch._model__datastore after unpickling.

    `state` is a dict of watch entries that may carry the preserved instance
    attributes under the '__watch_metadata__' key. It is only read, never
    modified, so the caller's dict is left exactly as it was passed in.
    Metadata values that are absent fall back to None (newest history key,
    datastore path) or 0 (history count, jitter seconds).
    """
    # Pop the metadata from a shallow copy rather than from the caller's dict
    entries = dict(state)
    metadata = entries.pop('__watch_metadata__', {})

    # Restore dict data
    self.update(entries)

    # Restore instance attributes
    self._model__newest_history_key = metadata.get('newest_history_key')
    self._model__history_n = metadata.get('history_n', 0)
    self.jitter_seconds = metadata.get('jitter_seconds', 0)
    self._model__datastore_path = metadata.get('datastore_path')

    # __datastore is NOT restored - caller must set it!
    # _favicon_filename_cache is NOT restored - will regenerate on demand
    self._model__datastore = None
@property
def viewed(self):
# Don't return viewed when last_viewed is 0 and newest_key is 0

View File

@@ -23,14 +23,7 @@ class difference_detection_processor():
def __init__(self, datastore, watch_uuid):
self.datastore = datastore
self.watch_uuid = watch_uuid
# Create a stable snapshot of the watch for processing
# Why deepcopy?
# 1. Prevents "dict changed during iteration" errors if watch is modified during processing
# 2. Preserves Watch object with properties (.link, .is_pdf, etc.) - can't use dict()
# 3. Safe now: Watch.__deepcopy__() shares datastore ref (no memory leak) but copies dict data
self.watch = deepcopy(self.datastore.data['watching'].get(watch_uuid))
# Generic fetcher that should be extended (requests, playwright etc)
self.fetcher = Fetcher()

View File

@@ -193,18 +193,17 @@ class perform_site_check(difference_detection_processor):
itemprop_availability = {}
multiple_prices_found = False
# Try built-in extraction first, this will scan metadata in the HTML
try:
itemprop_availability = get_itemprop_availability(self.fetcher.content)
except MoreThanOnePriceFound as e:
# Add the real data
raise ProcessorException(message="Cannot run, more than one price detected, this plugin is only for product pages with ONE product, try the content-change detection mode.",
url=watch.get('url'),
status_code=self.fetcher.get_last_status_code(),
screenshot=self.fetcher.screenshot,
xpath_data=self.fetcher.xpath_data
)
# Don't raise immediately - let plugins try to handle this case
# Plugins might be able to determine which price is correct
logger.warning(f"Built-in detection found multiple prices on {watch.get('url')}, will try plugin override")
multiple_prices_found = True
itemprop_availability = {}
# If built-in extraction didn't get both price AND availability, try plugin override
# Only check plugin if this watch is using a fetcher that might provide better data
@@ -216,9 +215,21 @@ class perform_site_check(difference_detection_processor):
from changedetectionio.pluggy_interface import get_itemprop_availability_from_plugin
fetcher_name = watch.get('fetch_backend', 'html_requests')
# Only try plugin override if not using system default (which might be anything)
if fetcher_name and fetcher_name != 'system':
logger.debug("Calling extra plugins for getting item price/availability")
# Resolve 'system' to the actual fetcher being used
# This allows plugins to work even when watch uses "system settings default"
if fetcher_name == 'system':
# Get the actual fetcher that was used (from self.fetcher)
# Fetcher class name gives us the actual backend (e.g., 'html_requests', 'html_webdriver')
actual_fetcher = type(self.fetcher).__name__
if 'html_requests' in actual_fetcher.lower():
fetcher_name = 'html_requests'
elif 'webdriver' in actual_fetcher.lower() or 'playwright' in actual_fetcher.lower():
fetcher_name = 'html_webdriver'
logger.debug(f"Resolved 'system' fetcher to actual fetcher: {fetcher_name}")
# Try plugin override - plugins can decide if they support this fetcher
if fetcher_name:
logger.debug(f"Calling extra plugins for getting item price/availability (fetcher: {fetcher_name})")
plugin_availability = get_itemprop_availability_from_plugin(self.fetcher.content, fetcher_name, self.fetcher, watch.link)
if plugin_availability:
@@ -233,6 +244,16 @@ class perform_site_check(difference_detection_processor):
if not plugin_availability:
logger.debug("No item price/availability from plugins")
# If we had multiple prices and plugins also failed, NOW raise the exception
if multiple_prices_found and not itemprop_availability.get('price'):
raise ProcessorException(
message="Cannot run, more than one price detected, this plugin is only for product pages with ONE product, try the content-change detection mode.",
url=watch.get('url'),
status_code=self.fetcher.get_last_status_code(),
screenshot=self.fetcher.screenshot,
xpath_data=self.fetcher.xpath_data
)
# Something valid in get_itemprop_availability() by scraping metadata ?
if itemprop_availability.get('price') or itemprop_availability.get('availability'):
# Store for other usage

View File

@@ -541,11 +541,7 @@ class ChangeDetectionStore(DatastoreUpdatesMixin, FileSavingDataStore):
# Clone a watch by UUID
def clone(self, uuid):
url = self.data['watching'][uuid].get('url')
# No need to deepcopy here - add_watch() will deepcopy extras anyway (line 569)
# Just pass a dict copy (with lock for thread safety)
# NOTE: dict() is shallow copy but safe since add_watch() deepcopies it
with self.lock:
extras = dict(self.data['watching'][uuid])
extras = deepcopy(self.data['watching'][uuid])
new_uuid = self.add_watch(url=url, extras=extras)
watch = self.data['watching'][new_uuid]
return new_uuid