mirror of https://github.com/dgtlmoon/changedetection.io.git
synced 2026-02-08 23:36:02 +00:00

Compare commits: master ... watch-data (1 commit)

Commit 559a925129
@@ -5,6 +5,11 @@ from changedetectionio.model import watch_base

 class model(watch_base):

     def __init__(self, *arg, **kw):
+        # Store datastore reference (optional for Tags, but good for consistency)
+        self.__datastore = kw.get('__datastore')
+        if kw.get('__datastore'):
+            del kw['__datastore']
+
         super(model, self).__init__(*arg, **kw)

         self['overrides_watch'] = kw.get('default', {}).get('overrides_watch')
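A note on the `self.__datastore` assignment above: because of Python's private-name mangling, the attribute is actually stored as `_model__datastore`, which is why the store code and the tests further down reach it under that name. A minimal sketch of the mangling (plain Python semantics, not repository code):

    class model(dict):
        def __init__(self, *arg, **kw):
            # double-underscore attribute names are mangled to _<classname>__<name>
            self.__datastore = kw.pop('__datastore', None)   # stored as _model__datastore
            super().__init__(*arg, **kw)

    m = model(__datastore={'watching': {}})          # module scope: the kwarg name is not mangled
    assert m._model__datastore == {'watching': {}}   # callers must use the mangled name
    assert not hasattr(m, '__datastore')             # the unmangled name does not exist

One caveat worth flagging: per the language reference, this mangling applies to every such identifier that textually appears inside a class body, keyword-argument names included, so a call written as `Tag.model(__datastore=...)` from within another class may have its keyword name silently rewritten.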
@@ -131,94 +131,8 @@ class model(watch_base):
         # Be sure the cached timestamp is ready
         bump = self.history

-    def __deepcopy__(self, memo):
-        """
-        Custom deepcopy that excludes __datastore to prevent memory leaks.
-
-        CRITICAL FIX: Without this, deepcopy(watch) copies the entire datastore
-        (which contains all other watches), causing exponential memory growth.
-        With 100 watches, this creates 10,000 watch objects in memory (100²).
-
-        This is called by:
-        - api/Watch.py:76 (API endpoint)
-        - processors/base.py:26 (EVERY processor run)
-        - store/__init__.py:544 (clone watch)
-        - And 4+ other locations
-        """
-        from copy import deepcopy
-
-        # Create a new instance without calling __init__ (avoids __datastore requirement)
-        cls = self.__class__
-        new_watch = cls.__new__(cls)
-        memo[id(self)] = new_watch
-
-        # Copy the dict data (all the watch settings)
-        for key, value in self.items():
-            new_watch[key] = deepcopy(value, memo)
-
-        # Copy instance attributes EXCEPT the datastore references
-        # These are cached/computed values that need to be preserved
-        new_watch._model__newest_history_key = self._model__newest_history_key
-        new_watch._model__history_n = self._model__history_n
-        new_watch.jitter_seconds = self.jitter_seconds
-
-        # Copy datastore_path (string, safe to copy)
-        new_watch._model__datastore_path = self._model__datastore_path
-
-        # CRITICAL: Share the datastore reference (don't copy it!)
-        # This is safe because we never modify the datastore through the watch
-        new_watch._model__datastore = self._model__datastore
-
-        # Do NOT copy favicon cache - let it be regenerated on demand
-        # This is just a performance cache (prevents repeated glob operations)
-        # and will be rebuilt automatically on first access
-
-        return new_watch
-
-    def __getstate__(self):
-        """
-        Custom pickle serialization that excludes __datastore.
-
-        This handles pickle/unpickle (used by multiprocessing, caching, etc.)
-        and ensures the datastore reference is never serialized.
-        """
-        # Get the dict data
-        state = dict(self)
-
-        # Add the instance attributes we want to preserve
-        state['__watch_metadata__'] = {
-            'newest_history_key': self._model__newest_history_key,
-            'history_n': self._model__history_n,
-            'jitter_seconds': self.jitter_seconds,
-            'datastore_path': self._model__datastore_path,
-        }
-
-        # NOTE: __datastore and _favicon_filename_cache are intentionally excluded
-        # Both will be regenerated/restored as needed
-        return state
-
-    def __setstate__(self, state):
-        """
-        Custom pickle deserialization.
-
-        WARNING: This creates a Watch without a __datastore reference!
-        The caller MUST set watch._model__datastore after unpickling.
-        """
-        # Extract metadata
-        metadata = state.pop('__watch_metadata__', {})
-
-        # Restore dict data
-        self.update(state)
-
-        # Restore instance attributes
-        self._model__newest_history_key = metadata.get('newest_history_key')
-        self._model__history_n = metadata.get('history_n', 0)
-        self.jitter_seconds = metadata.get('jitter_seconds', 0)
-        self._model__datastore_path = metadata.get('datastore_path')
-
-        # __datastore is NOT restored - caller must set it!
-        # _favicon_filename_cache is NOT restored - will regenerate on demand
-        self._model__datastore = None
+    # Note: __deepcopy__, __getstate__, and __setstate__ are inherited from watch_base
+    # This prevents memory leaks by sharing __datastore reference instead of copying it

     @property
     def viewed(self):
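To see why a custom `__deepcopy__` matters here, a minimal, self-contained sketch (illustrative structures only, not repository code) of the back-reference problem: each watch points at the datastore and the datastore points at every watch, so a naive deepcopy of one watch drags a copy of all of them along.

    from copy import deepcopy

    datastore = {'watching': {}}
    for i in range(100):
        # each watch keeps a back-reference to the datastore, as Watch does via __datastore
        datastore['watching'][i] = {'url': f'https://example{i}.com', 'datastore': datastore}

    copied = deepcopy(datastore['watching'][0])

    # deepcopy followed the back-reference and duplicated all 100 watches
    assert copied['datastore'] is not datastore
    assert len(copied['datastore']['watching']) == 100

Copying each of n watches once per processor run therefore costs O(n) per copy and O(n²) overall, which matches the 100 → 10,000 figure in the docstring above.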
@@ -140,4 +140,100 @@ class watch_base(dict):
         super(watch_base, self).__init__(*arg, **kw)

         if self.get('default'):
             del self['default']
+
+    def __deepcopy__(self, memo):
+        """
+        Custom deepcopy for all watch_base subclasses (Watch, Tag, etc.).
+
+        CRITICAL FIX: Prevents copying large reference objects like __datastore,
+        which would cause quadratic memory growth when Watch objects are deepcopied.
+
+        This is called by:
+        - api/Watch.py:76 (API endpoint)
+        - api/Tags.py:28 (Tags API)
+        - processors/base.py:26 (EVERY processor run)
+        - store/__init__.py:544 (clone watch)
+        - And other locations
+        """
+        from copy import deepcopy
+
+        # Create new instance without calling __init__
+        cls = self.__class__
+        new_obj = cls.__new__(cls)
+        memo[id(self)] = new_obj
+
+        # Copy the dict data (all the settings)
+        for key, value in self.items():
+            new_obj[key] = deepcopy(value, memo)
+
+        # Copy instance attributes dynamically
+        # This handles Watch-specific attrs (like __datastore) and any future subclass attrs
+        for attr_name in dir(self):
+            # Skip methods, special attrs, and dict keys
+            if attr_name.startswith('_') and not attr_name.startswith('__'):
+                # This catches _model__datastore, _model__history_n, etc.
+                try:
+                    attr_value = getattr(self, attr_name)
+
+                    # Special handling: Share references to large objects instead of copying
+                    # Examples: __datastore, __app_reference, __global_settings, etc.
+                    if attr_name.endswith('__datastore') or attr_name.endswith('__app'):
+                        # Share the reference (don't copy!) to prevent memory leaks
+                        setattr(new_obj, attr_name, attr_value)
+                    # Skip cache attributes - let them regenerate on demand
+                    elif 'cache' in attr_name.lower():
+                        pass  # Don't copy caches
+                    # Copy regular instance attributes
+                    elif not callable(attr_value):
+                        setattr(new_obj, attr_name, attr_value)
+                except AttributeError:
+                    pass  # Attribute doesn't exist in this instance
+
+        return new_obj
+
+    def __getstate__(self):
+        """
+        Custom pickle serialization for all watch_base subclasses.
+
+        Excludes large reference objects (like __datastore) from serialization.
+        """
+        # Get the dict data
+        state = dict(self)
+
+        # Collect instance attributes (excluding methods and large references)
+        instance_attrs = {}
+        for attr_name in dir(self):
+            if attr_name.startswith('_') and not attr_name.startswith('__'):
+                try:
+                    attr_value = getattr(self, attr_name)
+                    # Exclude large reference objects and caches from serialization
+                    if not (attr_name.endswith('__datastore') or
+                            attr_name.endswith('__app') or
+                            'cache' in attr_name.lower() or
+                            callable(attr_value)):
+                        instance_attrs[attr_name] = attr_value
+                except AttributeError:
+                    pass
+
+        if instance_attrs:
+            state['__instance_metadata__'] = instance_attrs
+
+        return state
+
+    def __setstate__(self, state):
+        """
+        Custom pickle deserialization for all watch_base subclasses.
+
+        WARNING: Large reference objects (like __datastore) are NOT restored!
+        Caller must restore these references after unpickling if needed.
+        """
+        # Extract metadata
+        metadata = state.pop('__instance_metadata__', {})
+
+        # Restore dict data
+        self.update(state)
+
+        # Restore instance attributes
+        for attr_name, attr_value in metadata.items():
+            setattr(self, attr_name, attr_value)
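A hedged usage sketch of the pickle contract that `__getstate__`/`__setstate__` define (the construction follows the tests further down; `/tmp/test` is a placeholder path): the datastore is never serialized, so whoever unpickles must re-attach it.

    import pickle
    from changedetectionio.model import Watch

    datastore = {'settings': {'application': {}}, 'watching': {}}
    watch = Watch.model(__datastore=datastore,
                        datastore_path='/tmp/test',
                        default={'url': 'https://example.com'})

    restored = pickle.loads(pickle.dumps(watch))    # small payload: dict data + metadata only

    # per the __setstate__ docstring, the caller must restore the shared reference
    restored._model__datastore = datastore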
@@ -876,10 +876,14 @@ class ChangeDetectionStore(DatastoreUpdatesMixin, FileSavingDataStore):
         # So we use the same model as a Watch
         with self.lock:
             from ..model import Tag
-            new_tag = Tag.model(datastore_path=self.datastore_path, default={
-                'title': title.strip(),
-                'date_created': int(time.time())
-            })
+            new_tag = Tag.model(
+                datastore_path=self.datastore_path,
+                __datastore=self.__data,
+                default={
+                    'title': title.strip(),
+                    'date_created': int(time.time())
+                }
+            )

             new_uuid = new_tag.get('uuid')

@@ -5,8 +5,10 @@

 import unittest
 import os
+import pickle
+from copy import deepcopy

-from changedetectionio.model import Watch
+from changedetectionio.model import Watch, Tag

 # mostly
 class TestDiffBuilder(unittest.TestCase):
@@ -68,5 +70,184 @@ class TestDiffBuilder(unittest.TestCase):
         p = watch.get_from_version_based_on_last_viewed
         assert p == "100", "Correct with only one history snapshot"

+    def test_watch_deepcopy_doesnt_copy_datastore(self):
+        """
+        CRITICAL: Ensure deepcopy(watch) shares __datastore instead of copying it.
+
+        Without this, deepcopy causes quadratic memory growth:
+        - 100 watches × deepcopy each = 10,000 watch objects in memory (100²)
+        - Memory grows from 120MB → 2GB
+
+        This test prevents regressions in the __deepcopy__ implementation.
+        """
+        # Create mock datastore with multiple watches
+        mock_datastore = {
+            'settings': {'application': {'history_snapshot_max_length': 10}},
+            'watching': {}
+        }
+
+        # Create 3 watches that all reference the same datastore
+        watches = []
+        for i in range(3):
+            watch = Watch.model(
+                __datastore=mock_datastore,
+                datastore_path='/tmp/test',
+                default={'url': f'https://example{i}.com', 'title': f'Watch {i}'}
+            )
+            mock_datastore['watching'][watch['uuid']] = watch
+            watches.append(watch)
+
+        # Test 1: Deepcopy shares datastore reference (doesn't copy it)
+        watch_copy = deepcopy(watches[0])
+
+        self.assertIsNotNone(watch_copy._model__datastore,
+                             "__datastore should exist in copied watch")
+        self.assertIs(watch_copy._model__datastore, watches[0]._model__datastore,
+                      "__datastore should be SHARED (same object), not copied")
+        self.assertIs(watch_copy._model__datastore, mock_datastore,
+                      "__datastore should reference the original datastore")
+
+        # Test 2: Dict data is properly copied (not shared)
+        self.assertEqual(watch_copy['title'], 'Watch 0', "Dict data should be copied")
+        watch_copy['title'] = 'MODIFIED'
+        self.assertNotEqual(watches[0]['title'], 'MODIFIED',
+                            "Modifying copy should not affect original")
+
+        # Test 3: Verify no nested datastore copies in watch dict
+        # The dict should only contain watch settings, not the datastore
+        watch_dict = dict(watch_copy)
+        self.assertNotIn('__datastore', watch_dict,
+                         "__datastore should not be in dict keys")
+        self.assertNotIn('_model__datastore', watch_dict,
+                         "_model__datastore should not be in dict keys")
+
+        # Test 4: Multiple deepcopies don't cause quadratic memory growth
+        # If the datastore were copied, each copy would contain 3 watches,
+        # and those watches would contain the datastore, etc. (infinite recursion)
+        copies = []
+        for _ in range(5):
+            copies.append(deepcopy(watches[0]))
+
+        # All copies should share the same datastore
+        for copy in copies:
+            self.assertIs(copy._model__datastore, mock_datastore,
+                          "All copies should share the original datastore")
+
+    def test_watch_pickle_doesnt_serialize_datastore(self):
+        """
+        Ensure pickle/unpickle doesn't serialize __datastore.
+
+        This is important for multiprocessing and caching - we don't want
+        to serialize the entire datastore when pickling a watch.
+        """
+        mock_datastore = {
+            'settings': {'application': {}},
+            'watching': {}
+        }
+
+        watch = Watch.model(
+            __datastore=mock_datastore,
+            datastore_path='/tmp/test',
+            default={'url': 'https://example.com', 'title': 'Test Watch'}
+        )
+
+        # Pickle and unpickle
+        pickled = pickle.dumps(watch)
+        unpickled_watch = pickle.loads(pickled)
+
+        # Test 1: Watch data is preserved
+        self.assertEqual(unpickled_watch['url'], 'https://example.com',
+                         "Dict data should be preserved after pickle/unpickle")
+
+        # Test 2: __datastore is NOT serialized (attribute shouldn't exist after unpickle)
+        self.assertFalse(hasattr(unpickled_watch, '_model__datastore'),
+                         "__datastore attribute should not exist after unpickle (not serialized)")
+
+        # Test 3: Pickled data shouldn't contain the large datastore object
+        # If the datastore were serialized, the pickle size would be much larger
+        pickle_size = len(pickled)
+        # A single watch should be small (< 10KB), not include the entire datastore
+        self.assertLess(pickle_size, 10000,
+                        f"Pickled watch too large ({pickle_size} bytes) - might include datastore")
+
+    def test_tag_deepcopy_works(self):
+        """
+        Ensure Tag objects (which also inherit from watch_base) can be deepcopied.
+
+        Tags now have an optional __datastore for consistency with Watch objects.
+        """
+        mock_datastore = {
+            'settings': {'application': {}},
+            'watching': {}
+        }
+
+        # Test 1: Tag without datastore (backward compatibility)
+        tag_without_ds = Tag.model(
+            datastore_path='/tmp/test',
+            default={'title': 'Test Tag', 'overrides_watch': True}
+        )
+        tag_copy1 = deepcopy(tag_without_ds)
+        self.assertEqual(tag_copy1['title'], 'Test Tag', "Tag data should be copied")
+
+        # Test 2: Tag with datastore (new pattern for consistency)
+        tag_with_ds = Tag.model(
+            datastore_path='/tmp/test',
+            __datastore=mock_datastore,
+            default={'title': 'Test Tag With DS', 'overrides_watch': True}
+        )
+
+        # Deepcopy should work
+        tag_copy2 = deepcopy(tag_with_ds)
+
+        # Test 3: Dict data is copied
+        self.assertEqual(tag_copy2['title'], 'Test Tag With DS', "Tag data should be copied")
+
+        # Test 4: Modifications to the copy don't affect the original
+        tag_copy2['title'] = 'MODIFIED'
+        self.assertNotEqual(tag_with_ds['title'], 'MODIFIED',
+                            "Modifying copy should not affect original")
+
+        # Test 5: Tag with datastore shares it (doesn't copy it)
+        if hasattr(tag_with_ds, '_model__datastore'):
+            self.assertIs(tag_copy2._model__datastore, tag_with_ds._model__datastore,
+                          "Tag should share __datastore reference like Watch does")
+
+    def test_watch_copy_performance(self):
+        """
+        Verify that the __deepcopy__ implementation doesn't cause performance issues.
+
+        With the fix, deepcopy should be fast because we're sharing the datastore
+        instead of copying it.
+        """
+        import time
+
+        # Create a watch with a large datastore (many watches)
+        mock_datastore = {
+            'settings': {'application': {}},
+            'watching': {}
+        }
+
+        # Add 100 watches to the datastore
+        for i in range(100):
+            w = Watch.model(
+                __datastore=mock_datastore,
+                datastore_path='/tmp/test',
+                default={'url': f'https://example{i}.com'}
+            )
+            mock_datastore['watching'][w['uuid']] = w
+
+        # Time how long deepcopy takes
+        watch = list(mock_datastore['watching'].values())[0]
+
+        start = time.time()
+        for _ in range(10):
+            _ = deepcopy(watch)
+        elapsed = time.time() - start
+
+        # Should be fast (< 0.5 seconds for 10 copies)
+        # If the datastore were copied, it would take much longer
+        self.assertLess(elapsed, 0.5,
+                        f"Deepcopy too slow ({elapsed:.3f}s for 10 copies) - might be copying datastore")
+
 if __name__ == '__main__':
     unittest.main()
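Beyond the unit tests, a rough way to observe the fix is to measure peak allocations during a single deepcopy with `tracemalloc` (a sketch under the same assumptions as the performance test above): with the shared-reference `__deepcopy__`, the peak should stay near the size of one watch rather than the whole datastore.

    import tracemalloc
    from copy import deepcopy
    from changedetectionio.model import Watch

    datastore = {'settings': {'application': {}}, 'watching': {}}
    for i in range(100):
        w = Watch.model(__datastore=datastore, datastore_path='/tmp/test',
                        default={'url': f'https://example{i}.com'})
        datastore['watching'][w['uuid']] = w

    tracemalloc.start()
    _ = deepcopy(next(iter(datastore['watching'].values())))
    _, peak = tracemalloc.get_traced_memory()
    tracemalloc.stop()
    print(f"peak allocations during one deepcopy: {peak} bytes")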