mirror of https://github.com/dgtlmoon/changedetection.io.git (synced 2026-02-07 14:56:02 +00:00)

Compare commits: watch-memo...watch-data (3 commits)
| Author | SHA1 | Date |
|---|---|---|
| | 559a925129 | |
| | 2349344d9e | |
| | bdc2916c07 | |
@@ -66,47 +66,42 @@ class Watch(Resource):
     @validate_openapi_request('getWatch')
     def get(self, uuid):
         """Get information about a single watch, recheck, pause, or mute."""
-        import time
-        from copy import deepcopy
-        watch = None
-        # Retry up to 20 times if dict is being modified
-        # With sleep(0), this is fast: ~200µs best case, ~20ms worst case under heavy load
-        for attempt in range(20):
-            try:
-                watch = deepcopy(self.datastore.data['watching'].get(uuid))
-                break
-            except RuntimeError:
-                # Dict changed during deepcopy, retry after yielding to scheduler
-                # sleep(0) releases GIL and yields - no fixed delay, just lets other threads run
-                if attempt < 19:  # Don't yield on last attempt
-                    time.sleep(0)  # Yield to scheduler (microseconds, not milliseconds)
-
-        if not watch:
+        # Get watch reference first (for pause/mute operations)
+        watch_obj = self.datastore.data['watching'].get(uuid)
+        if not watch_obj:
             abort(404, message='No watch exists with the UUID of {}'.format(uuid))
 
+        # Create a dict copy for JSON response (with lock for thread safety)
+        # This is much faster than deepcopy and doesn't copy the datastore reference
+        # WARNING: dict() is a SHALLOW copy - nested dicts are shared with original!
+        # Only safe because we only ADD scalar properties (line 97-101), never modify nested dicts
+        # If you need to modify nested dicts, use: from copy import deepcopy; watch = deepcopy(dict(watch_obj))
+        with self.datastore.lock:
+            watch = dict(watch_obj)
+
         if request.args.get('recheck'):
             worker_pool.queue_item_async_safe(self.update_q, queuedWatchMetaData.PrioritizedItem(priority=1, item={'uuid': uuid}))
             return "OK", 200
         if request.args.get('paused', '') == 'paused':
-            self.datastore.data['watching'].get(uuid).pause()
+            watch_obj.pause()
             return "OK", 200
         elif request.args.get('paused', '') == 'unpaused':
-            self.datastore.data['watching'].get(uuid).unpause()
+            watch_obj.unpause()
             return "OK", 200
         if request.args.get('muted', '') == 'muted':
-            self.datastore.data['watching'].get(uuid).mute()
+            watch_obj.mute()
             return "OK", 200
         elif request.args.get('muted', '') == 'unmuted':
-            self.datastore.data['watching'].get(uuid).unmute()
+            watch_obj.unmute()
             return "OK", 200
 
         # Return without history, get that via another API call
         # Properties are not returned as a JSON, so add the required props manually
-        watch['history_n'] = watch.history_n
+        watch['history_n'] = watch_obj.history_n
         # attr .last_changed will check for the last written text snapshot on change
-        watch['last_changed'] = watch.last_changed
-        watch['viewed'] = watch.viewed
-        watch['link'] = watch.link,
+        watch['last_changed'] = watch_obj.last_changed
+        watch['viewed'] = watch_obj.viewed
+        watch['link'] = watch_obj.link,
+
         return watch
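The comments in this hunk lean on the difference between a shallow and a deep copy, which is easy to get wrong. The standalone sketch below (plain Python, not changedetection.io code) shows why `dict(watch_obj)` is safe only while the handler adds new top-level keys, and why mutating a nested dict through the shallow copy would corrupt the original watch.

```python
# Minimal sketch of the shallow-vs-deep copy distinction the diff relies on.
# `original` stands in for a watch; the nested dict stands in for nested settings.
from copy import deepcopy

original = {'url': 'https://example.com', 'headers': {'cookie': 'a=1'}}

shallow = dict(original)
shallow['history_n'] = 5                # Safe: adds a NEW top-level key only
shallow['headers']['cookie'] = 'leak'   # Unsafe: the nested dict is shared...
assert original['headers']['cookie'] == 'leak'  # ...so the original mutates too

deep = deepcopy(original)
deep['headers']['cookie'] = 'isolated'  # Safe: nested structures were copied
assert original['headers']['cookie'] == 'leak'  # Original untouched this time
```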
@@ -5,6 +5,11 @@ from changedetectionio.model import watch_base
 class model(watch_base):
 
     def __init__(self, *arg, **kw):
+        # Store datastore reference (optional for Tags, but good for consistency)
+        self.__datastore = kw.get('__datastore')
+        if kw.get('__datastore'):
+            del kw['__datastore']
+
         super(model, self).__init__(*arg, **kw)
 
         self['overrides_watch'] = kw.get('default', {}).get('overrides_watch')
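Because `self.__datastore` is assigned inside a class named `model`, Python's name mangling stores it on the instance as `_model__datastore`; that is why the tests further down probe `_model__datastore` rather than `__datastore`. A minimal sketch of the mechanism, using a standalone class of the same name:

```python
# Name mangling demo: a double-underscore attribute set inside `class model`
# becomes `_model__datastore` on the instance.
class model:
    def __init__(self, ds=None):
        self.__datastore = ds  # Actually stored as self._model__datastore

m = model(ds={'watching': {}})
assert not hasattr(m, '__datastore')            # The unmangled name does not exist
assert m._model__datastore == {'watching': {}}  # The mangled name does
```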
@@ -131,6 +131,9 @@ class model(watch_base):
         # Be sure the cached timestamp is ready
         bump = self.history
 
+    # Note: __deepcopy__, __getstate__, and __setstate__ are inherited from watch_base
+    # This prevents memory leaks by sharing __datastore reference instead of copying it
+
     @property
     def viewed(self):
         # Don't return viewed when last_viewed is 0 and newest_key is 0
@@ -140,4 +140,100 @@ class watch_base(dict):
         super(watch_base, self).__init__(*arg, **kw)
 
         if self.get('default'):
             del self['default']
+
+    def __deepcopy__(self, memo):
+        """
+        Custom deepcopy for all watch_base subclasses (Watch, Tag, etc.).
+
+        CRITICAL FIX: Prevents copying large reference objects like __datastore
+        which would cause exponential memory growth when Watch objects are deepcopied.
+
+        This is called by:
+        - api/Watch.py:76 (API endpoint)
+        - api/Tags.py:28 (Tags API)
+        - processors/base.py:26 (EVERY processor run)
+        - store/__init__.py:544 (clone watch)
+        - And other locations
+        """
+        from copy import deepcopy
+
+        # Create new instance without calling __init__
+        cls = self.__class__
+        new_obj = cls.__new__(cls)
+        memo[id(self)] = new_obj
+
+        # Copy the dict data (all the settings)
+        for key, value in self.items():
+            new_obj[key] = deepcopy(value, memo)
+
+        # Copy instance attributes dynamically
+        # This handles Watch-specific attrs (like __datastore) and any future subclass attrs
+        for attr_name in dir(self):
+            # Skip methods, special attrs, and dict keys
+            if attr_name.startswith('_') and not attr_name.startswith('__'):
+                # This catches _model__datastore, _model__history_n, etc.
+                try:
+                    attr_value = getattr(self, attr_name)
+
+                    # Special handling: Share references to large objects instead of copying
+                    # Examples: __datastore, __app_reference, __global_settings, etc.
+                    if attr_name.endswith('__datastore') or attr_name.endswith('__app'):
+                        # Share the reference (don't copy!) to prevent memory leaks
+                        setattr(new_obj, attr_name, attr_value)
+                    # Skip cache attributes - let them regenerate on demand
+                    elif 'cache' in attr_name.lower():
+                        pass  # Don't copy caches
+                    # Copy regular instance attributes
+                    elif not callable(attr_value):
+                        setattr(new_obj, attr_name, attr_value)
+                except AttributeError:
+                    pass  # Attribute doesn't exist in this instance
+
+        return new_obj
+
+    def __getstate__(self):
+        """
+        Custom pickle serialization for all watch_base subclasses.
+
+        Excludes large reference objects (like __datastore) from serialization.
+        """
+        # Get the dict data
+        state = dict(self)
+
+        # Collect instance attributes (excluding methods and large references)
+        instance_attrs = {}
+        for attr_name in dir(self):
+            if attr_name.startswith('_') and not attr_name.startswith('__'):
+                try:
+                    attr_value = getattr(self, attr_name)
+                    # Exclude large reference objects and caches from serialization
+                    if not (attr_name.endswith('__datastore') or
+                            attr_name.endswith('__app') or
+                            'cache' in attr_name.lower() or
+                            callable(attr_value)):
+                        instance_attrs[attr_name] = attr_value
+                except AttributeError:
+                    pass
+
+        if instance_attrs:
+            state['__instance_metadata__'] = instance_attrs
+
+        return state
+
+    def __setstate__(self, state):
+        """
+        Custom pickle deserialization for all watch_base subclasses.
+
+        WARNING: Large reference objects (like __datastore) are NOT restored!
+        Caller must restore these references after unpickling if needed.
+        """
+        # Extract metadata
+        metadata = state.pop('__instance_metadata__', {})
+
+        # Restore dict data
+        self.update(state)
+
+        # Restore instance attributes
+        for attr_name, attr_value in metadata.items():
+            setattr(self, attr_name, attr_value)
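For readers unfamiliar with the `__deepcopy__` protocol used above: `copy.deepcopy` calls it with a `memo` dict, and registering the new object in `memo` before copying children is what breaks the watch → datastore → watch reference cycle. Below is a standalone sketch of the same share-the-heavy-reference pattern, with hypothetical names, not the project's actual classes:

```python
# Sketch: a dict subclass whose deepcopy copies its own data but shares one
# heavy reference, mirroring the watch_base.__deepcopy__ approach above.
from copy import deepcopy

class Record(dict):
    def __init__(self, shared=None, **data):
        super().__init__(**data)
        self.shared = shared  # Stand-in for the large __datastore reference

    def __deepcopy__(self, memo):
        cls = self.__class__
        new_obj = cls.__new__(cls)   # Skip __init__, like the real code
        memo[id(self)] = new_obj     # Register FIRST to break reference cycles
        for key, value in self.items():
            new_obj[key] = deepcopy(value, memo)  # Dict data: truly copied
        new_obj.shared = self.shared              # Heavy reference: shared
        return new_obj

big = {'watching': {'uuid-1': 'lots of data'}}
rec = Record(shared=big, title='a', nested={'x': 1})
dup = deepcopy(rec)
assert dup.shared is big                   # Same object, no copy made
assert dup['nested'] is not rec['nested']  # Nested data independently copied
```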
@@ -23,7 +23,14 @@ class difference_detection_processor():
     def __init__(self, datastore, watch_uuid):
         self.datastore = datastore
         self.watch_uuid = watch_uuid
 
+        # Create a stable snapshot of the watch for processing
+        # Why deepcopy?
+        # 1. Prevents "dict changed during iteration" errors if watch is modified during processing
+        # 2. Preserves Watch object with properties (.link, .is_pdf, etc.) - can't use dict()
+        # 3. Safe now: Watch.__deepcopy__() shares datastore ref (no memory leak) but copies dict data
+        self.watch = deepcopy(self.datastore.data['watching'].get(watch_uuid))
+
         # Generic fetcher that should be extended (requests, playwright etc)
         self.fetcher = Fetcher()
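Point 2 in the comment above, that `dict()` cannot replace `deepcopy` here, comes down to `@property` attributes living on the class rather than in the dict data. A small standalone illustration with a hypothetical `Watchish` class:

```python
# Why processors snapshot with deepcopy instead of dict(): a plain dict copy
# keeps the data but loses the subclass's computed properties.
from copy import deepcopy

class Watchish(dict):
    @property
    def link(self):
        return self.get('url', '').strip()

w = Watchish(url='  https://example.com  ')
assert w.link == 'https://example.com'

as_dict = dict(w)                  # Data survives, but it's a plain dict now
assert not hasattr(as_dict, 'link')

snapshot = deepcopy(w)             # Still a Watchish: .link keeps working
assert snapshot.link == 'https://example.com'
```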
@@ -248,7 +248,7 @@ class ChangeDetectionStore(DatastoreUpdatesMixin, FileSavingDataStore):
 
         else:
             # Fresh install - create new datastore
-            logger.critical(f"No datastore found, creating new datastore at {self.datastore_path}")
+            logger.warning(f"No datastore found, creating new datastore at {self.datastore_path}")
 
         # Set schema version to latest (no updates needed)
         updates_available = self.get_updates_available()
@@ -541,7 +541,11 @@ class ChangeDetectionStore(DatastoreUpdatesMixin, FileSavingDataStore):
     # Clone a watch by UUID
     def clone(self, uuid):
         url = self.data['watching'][uuid].get('url')
-        extras = deepcopy(self.data['watching'][uuid])
+        # No need to deepcopy here - add_watch() will deepcopy extras anyway (line 569)
+        # Just pass a dict copy (with lock for thread safety)
+        # NOTE: dict() is shallow copy but safe since add_watch() deepcopies it
+        with self.lock:
+            extras = dict(self.data['watching'][uuid])
         new_uuid = self.add_watch(url=url, extras=extras)
         watch = self.data['watching'][new_uuid]
         return new_uuid
@@ -872,10 +876,14 @@ class ChangeDetectionStore(DatastoreUpdatesMixin, FileSavingDataStore):
         # So we use the same model as a Watch
         with self.lock:
             from ..model import Tag
-            new_tag = Tag.model(datastore_path=self.datastore_path, default={
-                'title': title.strip(),
-                'date_created': int(time.time())
-            })
+            new_tag = Tag.model(
+                datastore_path=self.datastore_path,
+                __datastore=self.__data,
+                default={
+                    'title': title.strip(),
+                    'date_created': int(time.time())
+                }
+            )
 
             new_uuid = new_tag.get('uuid')
@@ -5,8 +5,10 @@
 
 import unittest
 import os
+import pickle
+from copy import deepcopy
 
-from changedetectionio.model import Watch
+from changedetectionio.model import Watch, Tag
 
 # mostly
 class TestDiffBuilder(unittest.TestCase):
@@ -68,5 +70,184 @@ class TestDiffBuilder(unittest.TestCase):
         p = watch.get_from_version_based_on_last_viewed
         assert p == "100", "Correct with only one history snapshot"
 
+    def test_watch_deepcopy_doesnt_copy_datastore(self):
+        """
+        CRITICAL: Ensure deepcopy(watch) shares __datastore instead of copying it.
+
+        Without this, deepcopy causes exponential memory growth:
+        - 100 watches × deepcopy each = 10,000 watch objects in memory (100²)
+        - Memory grows from 120MB → 2GB
+
+        This test prevents regressions in the __deepcopy__ implementation.
+        """
+        # Create a mock datastore with multiple watches
+        mock_datastore = {
+            'settings': {'application': {'history_snapshot_max_length': 10}},
+            'watching': {}
+        }
+
+        # Create 3 watches that all reference the same datastore
+        watches = []
+        for i in range(3):
+            watch = Watch.model(
+                __datastore=mock_datastore,
+                datastore_path='/tmp/test',
+                default={'url': f'https://example{i}.com', 'title': f'Watch {i}'}
+            )
+            mock_datastore['watching'][watch['uuid']] = watch
+            watches.append(watch)
+
+        # Test 1: Deepcopy shares the datastore reference (doesn't copy it)
+        watch_copy = deepcopy(watches[0])
+
+        self.assertIsNotNone(watch_copy._model__datastore,
+                             "__datastore should exist in copied watch")
+        self.assertIs(watch_copy._model__datastore, watches[0]._model__datastore,
+                      "__datastore should be SHARED (same object), not copied")
+        self.assertIs(watch_copy._model__datastore, mock_datastore,
+                      "__datastore should reference the original datastore")
+
+        # Test 2: Dict data is properly copied (not shared)
+        self.assertEqual(watch_copy['title'], 'Watch 0', "Dict data should be copied")
+        watch_copy['title'] = 'MODIFIED'
+        self.assertNotEqual(watches[0]['title'], 'MODIFIED',
+                            "Modifying copy should not affect original")
+
+        # Test 3: Verify no nested datastore copies in the watch dict
+        # The dict should only contain watch settings, not the datastore
+        watch_dict = dict(watch_copy)
+        self.assertNotIn('__datastore', watch_dict,
+                         "__datastore should not be in dict keys")
+        self.assertNotIn('_model__datastore', watch_dict,
+                         "_model__datastore should not be in dict keys")
+
+        # Test 4: Multiple deepcopies don't cause exponential memory growth
+        # If the datastore was copied, each copy would contain 3 watches,
+        # and those watches would contain the datastore, etc. (infinite recursion)
+        copies = []
+        for _ in range(5):
+            copies.append(deepcopy(watches[0]))
+
+        # All copies should share the same datastore
+        for copy in copies:
+            self.assertIs(copy._model__datastore, mock_datastore,
+                          "All copies should share the original datastore")
+
+    def test_watch_pickle_doesnt_serialize_datastore(self):
+        """
+        Ensure pickle/unpickle doesn't serialize __datastore.
+
+        This is important for multiprocessing and caching - we don't want
+        to serialize the entire datastore when pickling a watch.
+        """
+        mock_datastore = {
+            'settings': {'application': {}},
+            'watching': {}
+        }
+
+        watch = Watch.model(
+            __datastore=mock_datastore,
+            datastore_path='/tmp/test',
+            default={'url': 'https://example.com', 'title': 'Test Watch'}
+        )
+
+        # Pickle and unpickle
+        pickled = pickle.dumps(watch)
+        unpickled_watch = pickle.loads(pickled)
+
+        # Test 1: Watch data is preserved
+        self.assertEqual(unpickled_watch['url'], 'https://example.com',
+                         "Dict data should be preserved after pickle/unpickle")
+
+        # Test 2: __datastore is NOT serialized (attribute shouldn't exist after unpickle)
+        self.assertFalse(hasattr(unpickled_watch, '_model__datastore'),
+                         "__datastore attribute should not exist after unpickle (not serialized)")
+
+        # Test 3: Pickled data shouldn't contain the large datastore object
+        # If the datastore was serialized, the pickle size would be much larger
+        pickle_size = len(pickled)
+        # A single watch should be small (< 10KB), not include the entire datastore
+        self.assertLess(pickle_size, 10000,
+                        f"Pickled watch too large ({pickle_size} bytes) - might include datastore")
+
+    def test_tag_deepcopy_works(self):
+        """
+        Ensure Tag objects (which also inherit from watch_base) can be deepcopied.
+
+        Tags now have an optional __datastore for consistency with Watch objects.
+        """
+        mock_datastore = {
+            'settings': {'application': {}},
+            'watching': {}
+        }
+
+        # Test 1: Tag without datastore (backward compatibility)
+        tag_without_ds = Tag.model(
+            datastore_path='/tmp/test',
+            default={'title': 'Test Tag', 'overrides_watch': True}
+        )
+        tag_copy1 = deepcopy(tag_without_ds)
+        self.assertEqual(tag_copy1['title'], 'Test Tag', "Tag data should be copied")
+
+        # Test 2: Tag with datastore (new pattern for consistency)
+        tag_with_ds = Tag.model(
+            datastore_path='/tmp/test',
+            __datastore=mock_datastore,
+            default={'title': 'Test Tag With DS', 'overrides_watch': True}
+        )
+
+        # Deepcopy should work
+        tag_copy2 = deepcopy(tag_with_ds)
+
+        # Test 3: Dict data is copied
+        self.assertEqual(tag_copy2['title'], 'Test Tag With DS', "Tag data should be copied")
+
+        # Test 4: Modifications to the copy don't affect the original
+        tag_copy2['title'] = 'MODIFIED'
+        self.assertNotEqual(tag_with_ds['title'], 'MODIFIED',
+                            "Modifying copy should not affect original")
+
+        # Test 5: Tag with datastore shares it (doesn't copy it)
+        if hasattr(tag_with_ds, '_model__datastore'):
+            self.assertIs(tag_copy2._model__datastore, tag_with_ds._model__datastore,
+                          "Tag should share __datastore reference like Watch does")
+
+    def test_watch_copy_performance(self):
+        """
+        Verify that our __deepcopy__ implementation doesn't cause performance issues.
+
+        With the fix, deepcopy should be fast because we're sharing the datastore
+        instead of copying it.
+        """
+        import time
+
+        # Create a watch with a large datastore (many watches)
+        mock_datastore = {
+            'settings': {'application': {}},
+            'watching': {}
+        }
+
+        # Add 100 watches to the datastore
+        for i in range(100):
+            w = Watch.model(
+                __datastore=mock_datastore,
+                datastore_path='/tmp/test',
+                default={'url': f'https://example{i}.com'}
+            )
+            mock_datastore['watching'][w['uuid']] = w
+
+        # Time how long deepcopy takes
+        watch = list(mock_datastore['watching'].values())[0]
+
+        start = time.time()
+        for _ in range(10):
+            _ = deepcopy(watch)
+        elapsed = time.time() - start
+
+        # Should be fast (< 0.5 seconds for 10 copies)
+        # If the datastore was copied, it would take much longer
+        self.assertLess(elapsed, 0.5,
+                        f"Deepcopy too slow ({elapsed:.3f}s for 10 copies) - might be copying datastore")
+
 
 if __name__ == '__main__':
     unittest.main()
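The pickle tests above depend on the `__getstate__`/`__setstate__` pair added in `watch_base`: the heavy reference is left out of the pickled state, so the attribute simply does not exist after unpickling. A standalone sketch of that contract, under the assumption that a plain dict subclass is close enough to the real model:

```python
# Sketch of the pickle contract the tests check: drop the heavy reference at
# dump time, and leave it missing (for the caller to reattach) at load time.
import pickle

class Record(dict):
    def __init__(self, shared=None, **data):
        super().__init__(**data)
        self._shared = shared  # Stand-in for the large __datastore reference

    def __getstate__(self):
        return dict(self)  # Dict data only; self._shared is deliberately omitted

    def __setstate__(self, state):
        self.update(state)  # _shared is NOT restored - caller must reattach it

rec = Record(shared={'watching': {'lots': 'of data'}}, url='https://example.com')
restored = pickle.loads(pickle.dumps(rec))
assert restored['url'] == 'https://example.com'  # Data round-trips intact
assert not hasattr(restored, '_shared')          # Heavy reference was dropped
```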