mirror of
https://github.com/dgtlmoon/changedetection.io.git
synced 2026-04-30 14:50:39 +00:00
383 lines
17 KiB
Python
383 lines
17 KiB
Python
#!/usr/bin/env python3
|
||
|
||
# run from dir above changedetectionio/ dir
|
||
# python3 -m unittest changedetectionio.tests.unit.test_notification_diff
|
||
|
||
import unittest
|
||
import os
|
||
import pickle
|
||
from copy import deepcopy
|
||
|
||
from changedetectionio.model import Watch, Tag
|
||
|
||
# mostly
|
||
class TestDiffBuilder(unittest.TestCase):
    """Unit tests for the Watch model: last-viewed version lookup,
    deepcopy/pickle handling of the shared __datastore reference, and
    deepcopy performance."""

    def test_watch_get_suggested_from_diff_timestamp(self):
        """get_from_version_based_on_last_viewed should return the newest
        snapshot timestamp at-or-before 'last_viewed', clamping sensibly at
        either end of the available history."""
        import uuid as uuid_builder
        # Create minimal mock datastore so Watch.model can be built standalone
        mock_datastore = {
            'settings': {
                'application': {}
            },
            'watching': {}
        }
        watch = Watch.model(datastore_path='/tmp', __datastore=mock_datastore, default={})
        watch.ensure_data_dir_exists()

        # Contents from the browser are always returned from the browser/requests/etc as str, str is basically UTF-16 in python
        watch.save_history_blob(contents="hello world", timestamp=100, snapshot_id=str(uuid_builder.uuid4()))
        watch.save_history_blob(contents="hello world", timestamp=105, snapshot_id=str(uuid_builder.uuid4()))
        watch.save_history_blob(contents="hello world", timestamp=109, snapshot_id=str(uuid_builder.uuid4()))
        watch.save_history_blob(contents="hello world", timestamp=112, snapshot_id=str(uuid_builder.uuid4()))
        watch.save_history_blob(contents="hello world", timestamp=115, snapshot_id=str(uuid_builder.uuid4()))
        watch.save_history_blob(contents="hello world", timestamp=117, snapshot_id=str(uuid_builder.uuid4()))

        p = watch.get_from_version_based_on_last_viewed
        assert p == "100", "Correct 'last viewed' timestamp was detected"

        watch['last_viewed'] = 110
        p = watch.get_from_version_based_on_last_viewed
        assert p == "109", "Correct 'last viewed' timestamp was detected"

        watch['last_viewed'] = 116
        p = watch.get_from_version_based_on_last_viewed
        assert p == "115", "Correct 'last viewed' timestamp was detected"

        watch['last_viewed'] = 99
        p = watch.get_from_version_based_on_last_viewed
        assert p == "100", "When the 'last viewed' timestamp is less than the oldest snapshot, return oldest"

        watch['last_viewed'] = 200
        p = watch.get_from_version_based_on_last_viewed
        assert p == "115", "When the 'last viewed' timestamp is greater than the newest snapshot, return second newest"

        watch['last_viewed'] = 109
        p = watch.get_from_version_based_on_last_viewed
        assert p == "109", "Correct when its the same time"

        # new empty one
        watch = Watch.model(datastore_path='/tmp', __datastore=mock_datastore, default={})
        p = watch.get_from_version_based_on_last_viewed
        # Fixed: PEP 8 identity comparison ('is None'), not '== None'
        assert p is None, "None when no history available"

        watch.save_history_blob(contents="hello world", timestamp=100, snapshot_id=str(uuid_builder.uuid4()))
        p = watch.get_from_version_based_on_last_viewed
        assert p == "100", "Correct with only one history snapshot"

        watch['last_viewed'] = 200
        p = watch.get_from_version_based_on_last_viewed
        assert p == "100", "Correct with only one history snapshot"

    def test_watch_deepcopy_doesnt_copy_datastore(self):
        """
        CRITICAL: Ensure deepcopy(watch) shares __datastore instead of copying it.

        Without this, deepcopy causes exponential memory growth:
        - 100 watches × deepcopy each = 10,000 watch objects in memory (100²)
        - Memory grows from 120MB → 2GB

        This test prevents regressions in the __deepcopy__ implementation.
        """
        # Create mock datastore with multiple watches
        mock_datastore = {
            'settings': {'application': {'history_snapshot_max_length': 10}},
            'watching': {}
        }

        # Create 3 watches that all reference the same datastore
        watches = []
        for i in range(3):
            watch = Watch.model(
                __datastore=mock_datastore,
                datastore_path='/tmp/test',
                default={'url': f'https://example{i}.com', 'title': f'Watch {i}'}
            )
            mock_datastore['watching'][watch['uuid']] = watch
            watches.append(watch)

        # Test 1: Deepcopy shares datastore reference (doesn't copy it)
        watch_copy = deepcopy(watches[0])

        self.assertIsNotNone(watch_copy._datastore,
                             "__datastore should exist in copied watch")
        self.assertIs(watch_copy._datastore, watches[0]._datastore,
                      "__datastore should be SHARED (same object), not copied")
        self.assertIs(watch_copy._datastore, mock_datastore,
                      "__datastore should reference the original datastore")

        # Test 2: Dict data is properly copied (not shared)
        self.assertEqual(watch_copy['title'], 'Watch 0', "Dict data should be copied")
        watch_copy['title'] = 'MODIFIED'
        self.assertNotEqual(watches[0]['title'], 'MODIFIED',
                            "Modifying copy should not affect original")

        # Test 3: Verify no nested datastore copies in watch dict
        # The dict should only contain watch settings, not the datastore
        watch_dict = dict(watch_copy)
        self.assertNotIn('__datastore', watch_dict,
                         "__datastore should not be in dict keys")
        self.assertNotIn('_model__datastore', watch_dict,
                         "_model__datastore should not be in dict keys")

        # Test 4: Multiple deepcopies don't cause exponential memory growth
        # If datastore was copied, each copy would contain 3 watches,
        # and those watches would contain the datastore, etc. (infinite recursion)
        copies = []
        for _ in range(5):
            copies.append(deepcopy(watches[0]))

        # All copies should share the same datastore
        # Fixed: loop variable renamed so it no longer shadows the 'copy' module name
        for watch_clone in copies:
            self.assertIs(watch_clone._datastore, mock_datastore,
                          "All copies should share the original datastore")

    def test_watch_pickle_doesnt_serialize_datastore(self):
        """
        Ensure pickle/unpickle doesn't serialize __datastore.

        This is important for multiprocessing and caching - we don't want
        to serialize the entire datastore when pickling a watch.
        """
        mock_datastore = {
            'settings': {'application': {}},
            'watching': {}
        }

        watch = Watch.model(
            __datastore=mock_datastore,
            datastore_path='/tmp/test',
            default={'url': 'https://example.com', 'title': 'Test Watch'}
        )

        # Pickle and unpickle
        pickled = pickle.dumps(watch)
        unpickled_watch = pickle.loads(pickled)

        # Test 1: Watch data is preserved
        self.assertEqual(unpickled_watch['url'], 'https://example.com',
                         "Dict data should be preserved after pickle/unpickle")

        # Test 2: __datastore is NOT serialized (attribute shouldn't exist after unpickle)
        self.assertFalse(hasattr(unpickled_watch, '_datastore'),
                         "__datastore attribute should not exist after unpickle (not serialized)")

        # Test 3: Pickled data shouldn't contain the large datastore object
        # If datastore was serialized, the pickle size would be much larger
        pickle_size = len(pickled)
        # A single watch should be small (< 10KB), not include entire datastore
        self.assertLess(pickle_size, 10000,
                        f"Pickled watch too large ({pickle_size} bytes) - might include datastore")

    def test_tag_deepcopy_works(self):
        """
        Ensure Tag objects (which also inherit from watch_base) can be deepcopied.

        Tags now have optional __datastore for consistency with Watch objects.
        """
        mock_datastore = {
            'settings': {'application': {}},
            'watching': {}
        }

        # Test 1: Tag without datastore (backward compatibility)
        tag_without_ds = Tag.model(
            datastore_path='/tmp/test',
            default={'title': 'Test Tag', 'overrides_watch': True}
        )
        tag_copy1 = deepcopy(tag_without_ds)
        self.assertEqual(tag_copy1['title'], 'Test Tag', "Tag data should be copied")

        # Test 2: Tag with datastore (new pattern for consistency)
        tag_with_ds = Tag.model(
            datastore_path='/tmp/test',
            __datastore=mock_datastore,
            default={'title': 'Test Tag With DS', 'overrides_watch': True}
        )

        # Deepcopy should work
        tag_copy2 = deepcopy(tag_with_ds)

        # Test 3: Dict data is copied
        self.assertEqual(tag_copy2['title'], 'Test Tag With DS', "Tag data should be copied")

        # Test 4: Modifications to copy don't affect original
        tag_copy2['title'] = 'MODIFIED'
        self.assertNotEqual(tag_with_ds['title'], 'MODIFIED',
                            "Modifying copy should not affect original")

        # Test 5: Tag with datastore shares it (doesn't copy it)
        if hasattr(tag_with_ds, '_datastore'):
            self.assertIs(tag_copy2._datastore, tag_with_ds._datastore,
                          "Tag should share __datastore reference like Watch does")

    def test_watch_copy_performance(self):
        """
        Verify that our __deepcopy__ implementation doesn't cause performance issues.

        With the fix, deepcopy should be fast because we're sharing datastore
        instead of copying it.
        """
        import time

        # Create a watch with large datastore (many watches)
        mock_datastore = {
            'settings': {'application': {}},
            'watching': {}
        }

        # Add 100 watches to the datastore
        for i in range(100):
            w = Watch.model(
                __datastore=mock_datastore,
                datastore_path='/tmp/test',
                default={'url': f'https://example{i}.com'}
            )
            mock_datastore['watching'][w['uuid']] = w

        # Time how long deepcopy takes
        watch = list(mock_datastore['watching'].values())[0]

        start = time.time()
        for _ in range(10):
            _ = deepcopy(watch)
        elapsed = time.time() - start

        # Should be fast (< 0.5 seconds for 10 copies)  -- fixed: comment now matches the assertion threshold
        # If datastore was copied, it would take much longer
        self.assertLess(elapsed, 0.5,
                        f"Deepcopy too slow ({elapsed:.3f}s for 10 copies) - might be copying datastore")
|
||
|
||
class TestLLMDiffSummaryCache(unittest.TestCase):
    """Verify the LLM diff-summary cache: entries are keyed by the
    (from_version, to_version) pair plus a short hash of the prompt text, so
    a new prompt or an unknown version pair is always a cache miss."""

    PROMPT = 'List what changed as bullet points'

    def _make_watch(self):
        # Build a throwaway Watch backed by an in-memory mock datastore.
        ds = {'settings': {'application': {}}, 'watching': {}}
        w = Watch.model(datastore_path='/tmp', __datastore=ds, default={})
        w.ensure_data_dir_exists()
        return w

    def test_returns_empty_when_no_file_exists(self):
        # No summary was ever saved, so the lookup must come back empty.
        w = self._make_watch()
        self.assertEqual(w.get_llm_diff_summary('1000', '2000', prompt=self.PROMPT), '')

    def test_save_and_retrieve(self):
        # A round-trip of the same pair + prompt returns what was stored.
        w = self._make_watch()
        w.save_llm_diff_summary('Price dropped to $199', '1000', '2000', prompt=self.PROMPT)
        self.assertEqual(
            w.get_llm_diff_summary('1000', '2000', prompt=self.PROMPT),
            'Price dropped to $199')

    def test_different_version_pairs_are_independent(self):
        # Two distinct version pairs must not clobber each other's entries.
        w = self._make_watch()
        w.save_llm_diff_summary('Summary A', '1000', '2000', prompt=self.PROMPT)
        w.save_llm_diff_summary('Summary B', '2000', '3000', prompt=self.PROMPT)
        self.assertEqual(w.get_llm_diff_summary('1000', '2000', prompt=self.PROMPT), 'Summary A')
        self.assertEqual(w.get_llm_diff_summary('2000', '3000', prompt=self.PROMPT), 'Summary B')

    def test_unknown_pair_returns_empty(self):
        # Looking up a pair that was never saved is a miss, not a crash.
        w = self._make_watch()
        w.save_llm_diff_summary('Summary A', '1000', '2000', prompt=self.PROMPT)
        self.assertEqual(w.get_llm_diff_summary('9999', '8888', prompt=self.PROMPT), '')

    def test_changed_prompt_is_a_cache_miss(self):
        """Changing the prompt must invalidate the cached summary for the same version pair."""
        w = self._make_watch()
        w.save_llm_diff_summary('Old summary', '1000', '2000', prompt='original prompt')
        # Different prompt → different hash → different filename → miss
        self.assertEqual(w.get_llm_diff_summary('1000', '2000', prompt='new different prompt'), '')

    def test_file_named_by_versions_and_prompt_hash(self):
        """Cache file must be named change-summary-{from}-to-{to}-{hash}.txt."""
        import hashlib
        w = self._make_watch()
        the_prompt = 'my summary prompt'
        w.save_llm_diff_summary('Test summary', '1776000000', '1776001000', prompt=the_prompt)
        # First 8 hex chars of the MD5 of the prompt form the filename suffix.
        digest = hashlib.md5(the_prompt.encode()).hexdigest()[:8]
        expected_path = os.path.join(
            w.data_dir,
            f'change-summary-1776000000-to-1776001000-{digest}.txt'
        )
        self.assertTrue(os.path.isfile(expected_path),
                        f"Expected cache file not found: {expected_path}")
        with open(expected_path, 'r') as fh:
            self.assertEqual(fh.read().strip(), 'Test summary')

    def test_overwrite_same_pair_and_prompt(self):
        # Re-saving the same pair + prompt replaces the earlier summary.
        w = self._make_watch()
        w.save_llm_diff_summary('First summary', '1000', '2000', prompt=self.PROMPT)
        w.save_llm_diff_summary('Updated summary', '1000', '2000', prompt=self.PROMPT)
        self.assertEqual(w.get_llm_diff_summary('1000', '2000', prompt=self.PROMPT), 'Updated summary')
|
||
|
||
|
||
class TestHistoryPathTraversal(unittest.TestCase):
    """GHSA-8757-69j2-hx56: entries in history.txt must never resolve to a
    file outside the watch's own data directory, whether written as absolute
    paths or as ../ traversal sequences."""

    def _make_watch(self):
        # Build a throwaway Watch backed by an in-memory mock datastore.
        ds = {'settings': {'application': {}}, 'watching': {}}
        w = Watch.model(datastore_path='/tmp', __datastore=ds, default={})
        w.ensure_data_dir_exists()
        return w

    def _write_history_txt(self, watch, lines):
        """Directly write raw lines to history.txt to simulate a restored backup."""
        index_path = os.path.join(watch.data_dir, watch.history_index_filename)
        with open(index_path, 'w', encoding='utf-8') as fh:
            fh.writelines(lines)

    def test_absolute_path_in_history_is_rejected(self):
        """An absolute path like /etc/passwd must not appear in history."""
        w = self._make_watch()
        self._write_history_txt(w, ['1000000000,/etc/passwd\n'])
        self.assertEqual(w.history, {}, "Absolute path entry must be rejected")

    def test_traversal_path_in_history_is_rejected(self):
        """A relative traversal path like ../../etc/passwd must not appear in history."""
        w = self._make_watch()
        self._write_history_txt(w, ['1000000000,../../etc/passwd\n'])
        self.assertEqual(w.history, {}, "Path traversal entry must be rejected")

    def test_normal_snapshot_entry_is_accepted(self):
        """A bare filename written by save_history_blob must still load correctly."""
        import uuid as uuid_builder
        w = self._make_watch()
        w.save_history_blob(contents="hello world", timestamp=1000000000, snapshot_id=str(uuid_builder.uuid4()))
        loaded = w.history
        self.assertEqual(len(loaded), 1, "Normal snapshot entry must be accepted")
        first_path = list(loaded.values())[0]
        self.assertTrue(
            first_path.startswith(w.data_dir),
            "Resolved path must be inside the watch data directory"
        )

    def test_get_history_snapshot_blocks_outside_path_directly(self):
        """get_history_snapshot(filepath=...) must raise if the path escapes data_dir."""
        w = self._make_watch()
        with self.assertRaises(PermissionError):
            w.get_history_snapshot(filepath='/etc/passwd')

    def test_get_history_snapshot_blocks_traversal_directly(self):
        """get_history_snapshot(filepath=...) must raise on ../../ traversal paths."""
        w = self._make_watch()
        escape_path = os.path.join(w.data_dir, '../../etc/passwd')
        with self.assertRaises(PermissionError):
            w.get_history_snapshot(filepath=escape_path)

    def test_resolved_path_stays_inside_data_dir(self):
        """All resolved history paths must reside within the watch's data_dir."""
        import uuid as uuid_builder
        w = self._make_watch()
        for ts in [1000000001, 1000000002, 1000000003]:
            w.save_history_blob(contents=f"content {ts}", timestamp=ts, snapshot_id=str(uuid_builder.uuid4()))
        base_dir = os.path.realpath(w.data_dir)
        for resolved in w.history.values():
            self.assertTrue(
                os.path.realpath(resolved).startswith(base_dir),
                f"Path {resolved!r} escapes the watch data directory"
            )
|
||
|
||
|
||
# Allow running this file directly (as well as via python3 -m unittest ...)
if __name__ == '__main__':
    unittest.main()
|