changedetection.io/changedetectionio/store/__init__.py

import shutil

from changedetectionio.strtobool import strtobool

from changedetectionio.validate_url import is_safe_valid_url

from flask import (
    flash
)
from flask_babel import gettext

from ..model import App, Watch
from copy import deepcopy
from os import path, unlink
import json
import os
import re
import secrets
import sys
import time
import uuid as uuid_builder
from loguru import logger
from blinker import signal

from ..model.Tags import TagsDict

# Try to import orjson for faster JSON serialization
try:
    import orjson

    HAS_ORJSON = True
except ImportError:
    HAS_ORJSON = False

from ..processors import get_custom_watch_obj_for_processor

# Import the base class and helpers
from .file_saving_datastore import FileSavingDataStore, load_all_watches, load_all_tags, save_json_atomic
from .updates import DatastoreUpdatesMixin

# Because the server will run as a daemon and wont know the URL for notification links when firing off a notification
BASE_URL_NOT_SET_TEXT = '("Base URL" not set - see settings - notifications)'

dictfilt = lambda x, y: dict([(i, x[i]) for i in x if i in set(y)])


# Is there an existing library to ensure some data store (JSON etc) is in sync with CRUD methods?
# Open a github issue if you know something :)
# https://stackoverflow.com/questions/6190468/how-to-trigger-function-on-value-change
class ChangeDetectionStore(DatastoreUpdatesMixin, FileSavingDataStore):
    __version_check = True

    def __init__(self, datastore_path="/datastore", include_default_watches=True, version_tag="0.0.0"):
        # Initialize parent class
        super().__init__()

        # Should only be active for docker
        # logging.basicConfig(filename='/dev/stdout', level=logging.INFO)
        self.datastore_path = datastore_path
        self.start_time = time.time()
        self.save_version_copy_json_db(version_tag)
        self.reload_state(datastore_path=datastore_path, include_default_watches=include_default_watches, version_tag=version_tag)

    def save_version_copy_json_db(self, version_tag):
        """
        Create version-tagged backup of changedetection.json.

        This is called on version upgrades to preserve a backup in case
        the new version has issues.
        """
        import re

        version_text = re.sub(r'\D+', '-', version_tag)
        db_path = os.path.join(self.datastore_path, "changedetection.json")
        db_path_version_backup = os.path.join(self.datastore_path, f"changedetection-{version_text}.json")

        if not os.path.isfile(db_path_version_backup) and os.path.isfile(db_path):
            from shutil import copyfile
            logger.info(f"Backing up changedetection.json due to new version to '{db_path_version_backup}'.")
            copyfile(db_path, db_path_version_backup)

    def _load_settings(self, filename="changedetection.json"):
        """
        Load settings from storage.

        File backend implementation: reads from changedetection.json

        Returns:
            dict: Settings data loaded from storage
        """
        changedetection_json = os.path.join(self.datastore_path, filename)

        logger.info(f"Loading settings from {changedetection_json}")

        if HAS_ORJSON:
            with open(changedetection_json, 'rb') as f:
                return orjson.loads(f.read())
        else:
            with open(changedetection_json, 'r', encoding='utf-8') as f:
                return json.load(f)

    def _apply_settings(self, settings_data):
        """
        Apply loaded settings data to internal data structure.

        Args:
            settings_data: Dictionary loaded from changedetection.json
        """
        # Apply top-level fields
        if 'app_guid' in settings_data:
            self.__data['app_guid'] = settings_data['app_guid']
        if 'build_sha' in settings_data:
            self.__data['build_sha'] = settings_data['build_sha']
        if 'version_tag' in settings_data:
            self.__data['version_tag'] = settings_data['version_tag']

        # Apply settings sections
        if 'settings' in settings_data:
            if 'headers' in settings_data['settings']:
                self.__data['settings']['headers'].update(settings_data['settings']['headers'])
            if 'requests' in settings_data['settings']:
                self.__data['settings']['requests'].update(settings_data['settings']['requests'])
            if 'application' in settings_data['settings']:
                self.__data['settings']['application'].update(settings_data['settings']['application'])

                # Use our Tags dict with cleanup helpers etc
                # @todo Same for Watches
                existing_tags = settings_data.get('settings', {}).get('application', {}).get('tags') or {}
                self.__data['settings']['application']['tags'] = TagsDict(existing_tags, datastore_path=self.datastore_path)

        # More or less for the old format which had this data in the one url-watches.json
        # cant hurt to leave it here,
        if 'watching' in settings_data:
            self.__data['watching'].update(settings_data['watching'])

    def _rehydrate_tags(self):
        """Rehydrate tag entities from stored data into Tag objects with restock_diff processor."""
        from ..model import Tag

        for uuid, tag in self.__data['settings']['application']['tags'].items():
            # Force processor to restock_diff for override functionality (technical debt)
            tag['processor'] = 'restock_diff'

            self.__data['settings']['application']['tags'][uuid] = Tag.model(
                datastore_path=self.datastore_path,
                __datastore=self.__data,
                default=tag
            )
            logger.info(f"Tag: {uuid} {tag['title']}")

    def _rehydrate_watches(self):
        """Rehydrate watch entities from stored data (converts dicts to Watch objects)."""
        watch_count = len(self.__data.get('watching', {}))
        if watch_count == 0:
            return

        logger.info(f"Rehydrating {watch_count} watches...")
        watching_rehydrated = {}
        for uuid, watch_dict in self.__data.get('watching', {}).items():
            if isinstance(watch_dict, dict):
                watching_rehydrated[uuid] = self.rehydrate_entity(uuid, watch_dict)
            else:
                logger.error(f"Watch UUID {uuid} already rehydrated")

        self.__data['watching'] = watching_rehydrated
        logger.success(f"Rehydrated {watch_count} watches into Watch objects")


    def _load_state(self, main_settings_filename="changedetection.json"):
        """
        Load complete datastore state from storage.

        Orchestrates loading of settings, watches, and tags using polymorphic methods.
        """
        # Load settings
        settings_data = self._load_settings(filename=main_settings_filename)
        self._apply_settings(settings_data)

        # Load watches, scan them from the disk
        self._load_watches()
        self._rehydrate_watches()

        # Load tags from individual tag.json files
        # These will override any tags in settings (migration path)
        self._load_tags()

        # Rehydrate any remaining tags from settings (legacy/fallback)
        self._rehydrate_tags()

    def reload_state(self, datastore_path, include_default_watches, version_tag):
        """
        Load datastore from storage or create new one.

        Supports two scenarios:
        1. NEW format: changedetection.json exists → load and run updates if needed
        2. EMPTY: No changedetection.json → create new OR trigger migration from legacy

        Note: Legacy url-watches.json migration happens in update_26, not here.
        """
        logger.info(f"Datastore path is '{datastore_path}'")

        # CRITICAL: Update datastore_path (was using old path from __init__)
        self.datastore_path = datastore_path

        # Initialize data structure
        self.__data = App.model(datastore_path=datastore_path)
        self.json_store_path = os.path.join(self.datastore_path, "changedetection.json")

        # Base definition for all watchers (deepcopy part of #569)
        self.generic_definition = deepcopy(Watch.model(datastore_path=datastore_path, __datastore=self.__data, default={}))

        # Load build SHA if available (Docker deployments)
        if path.isfile('changedetectionio/source.txt'):
            with open('changedetectionio/source.txt') as f:
                self.__data['build_sha'] = f.read()

        # Check if datastore already exists
        changedetection_json = os.path.join(self.datastore_path, "changedetection.json")
        changedetection_json_old_schema = os.path.join(self.datastore_path, "url-watches.json")

        if os.path.exists(changedetection_json):
            # Run schema updates if needed
            # Pass current schema version from loaded datastore (defaults to 0 if not set)
            # Load existing datastore (changedetection.json + watch.json files)
            logger.info("Loading existing datastore")
            self._load_state()
            current_schema = self.data['settings']['application'].get('schema_version', 0)
            self.run_updates(current_schema_version=current_schema)

        # Legacy datastore detected - trigger migration, even works if the schema is much before the migration step.
        elif os.path.exists(changedetection_json_old_schema):

            logger.critical(f"Legacy datastore detected at {changedetection_json_old_schema}, loading and running updates")
            self._load_state(main_settings_filename="url-watches.json")
            # update 26 will load the whole old config from disk to __data
            current_schema = self.__data['settings']['application'].get('schema_version', 0)
            self.run_updates(current_schema_version=current_schema)
            # Probably tags were also shifted to disk and many other changes, so best to reload here.
            self._load_state()

        else:
            # No datastore yet - check if this is a fresh install or legacy migration
            self.init_fresh_install(include_default_watches=include_default_watches,
                                    version_tag=version_tag)
            # Maybe they copied a bunch of watch subdirs across too
            self._load_state()

    def init_fresh_install(self, include_default_watches, version_tag):
      # Generate app_guid FIRST (required for all operations)
        if "pytest" in sys.modules or "PYTEST_CURRENT_TEST" in os.environ:
            self.__data['app_guid'] = "test-" + str(uuid_builder.uuid4())
        else:
            self.__data['app_guid'] = str(uuid_builder.uuid4())

        # Generate RSS access token
        self.__data['settings']['application']['rss_access_token'] = secrets.token_hex(16)

        # Generate API access token
        self.__data['settings']['application']['api_access_token'] = secrets.token_hex(16)
        logger.warning(f"No datastore found, creating new datastore at {self.datastore_path}")

        # Set schema version to latest (no updates needed)
        latest_update_available = self.get_updates_available().pop()
        logger.info(f"Marking fresh install to schema version {latest_update_available}")
        self.__data['settings']['application']['schema_version'] = latest_update_available

        # Add default watches if requested
        if include_default_watches:
            self.add_watch(
                url='https://news.ycombinator.com/',
                tag='Tech news',
                extras={'fetch_backend': 'html_requests'}
            )
            self.add_watch(
                url='https://changedetection.io/CHANGELOG.txt',
                tag='changedetection.io',
                extras={'fetch_backend': 'html_requests'}
            )

        # Create changedetection.json immediately
        try:
            self._save_settings()
            logger.info("Created changedetection.json for new datastore")
        except Exception as e:
            logger.error(f"Failed to create initial changedetection.json: {e}")


        # Set version tag
        self.__data['version_tag'] = version_tag

        # Validate proxies.json if it exists
        _ = self.proxy_list  # Just to test parsing

        # Ensure app_guid exists (for datastores loaded from existing files)
        if 'app_guid' not in self.__data:
            if "pytest" in sys.modules or "PYTEST_CURRENT_TEST" in os.environ:
                self.__data['app_guid'] = "test-" + str(uuid_builder.uuid4())
            else:
                self.__data['app_guid'] = str(uuid_builder.uuid4())
            self.commit()

        # Ensure RSS access token exists
        if not self.__data['settings']['application'].get('rss_access_token'):
            secret = secrets.token_hex(16)
            self.__data['settings']['application']['rss_access_token'] = secret
            self.commit()

        # Ensure API access token exists
        if not self.__data['settings']['application'].get('api_access_token'):
            secret = secrets.token_hex(16)
            self.__data['settings']['application']['api_access_token'] = secret
            self.commit()

        # Handle password reset lockfile
        password_reset_lockfile = os.path.join(self.datastore_path, "removepassword.lock")
        if path.isfile(password_reset_lockfile):
            self.remove_password()
            unlink(password_reset_lockfile)

    def rehydrate_entity(self, uuid, entity, processor_override=None):
        """Set the dict back to the dict Watch object"""
        entity['uuid'] = uuid

        if processor_override:
            watch_class = get_custom_watch_obj_for_processor(processor_override)
            entity['processor'] = processor_override
        else:
            watch_class = get_custom_watch_obj_for_processor(entity.get('processor'))

        if entity.get('processor') != 'text_json_diff':
            logger.trace(f"Loading Watch object '{watch_class.__module__}.{watch_class.__name__}' for UUID {uuid}")

        entity = watch_class(datastore_path=self.datastore_path, __datastore=self.__data, default=entity)
        return entity

    # ============================================================================
    # FileSavingDataStore Abstract Method Implementations
    # ============================================================================

    def _watch_exists(self, uuid):
        """Check if watch exists in datastore."""
        return uuid in self.__data['watching']

    def _get_watch_dict(self, uuid):
        """Get watch as dictionary."""
        return dict(self.__data['watching'][uuid])

    def _build_settings_data(self):
        """
        Build settings data structure for saving.

        Tags behavior depends on schema version:
        - Before update_28 (schema < 28): Tags saved in settings for migration
        - After update_28 (schema >= 28): Tags excluded from settings (in individual files)

        Returns:
            dict: Settings data ready for serialization
        """
        import copy

        # Deep copy settings to avoid modifying the original
        settings_copy = copy.deepcopy(self.__data['settings'])

        # Is saved as {uuid}/tag.json
        settings_copy['application']['tags'] = {}

        return {
            'note': 'Settings file - watches are in {uuid}/watch.json, tags are in {uuid}/tag.json',
            'app_guid': self.__data.get('app_guid'),
            'settings': settings_copy,
            'build_sha': self.__data.get('build_sha'),
            'version_tag': self.__data.get('version_tag')
        }

    def _save_settings(self):
        """
        Save settings to storage.

        File backend implementation: saves to changedetection.json
        Implementation of abstract method from FileSavingDataStore.
        Uses the generic save_json_atomic helper.

        Raises:
            OSError: If disk is full or other I/O error
        """
        settings_data = self._build_settings_data()
        changedetection_json = os.path.join(self.datastore_path, "changedetection.json")
        save_json_atomic(changedetection_json, settings_data, label="settings")

    def _load_watches(self):
        """
        Load all watches from storage.

        File backend implementation: reads individual watch.json files
        Implementation of abstract method from FileSavingDataStore.
        Delegates to helper function and stores results in internal data structure.
        """

        # Store loaded data
        # @note this will also work for the old legacy format because self.__data['watching'] should already have them loaded by this point.
        self.__data['watching'].update(load_all_watches(
            self.datastore_path,
            self.rehydrate_entity
        ))
        logger.debug(f"Loaded {len(self.__data['watching'])} watches")

    def _load_tags(self):
        """
        Load all tags from storage.

        File backend implementation: reads individual tag.json files.
        Tags loaded from files override any tags in settings (migration path).
        """
        from ..model import Tag

        def rehydrate_tag(uuid, entity_dict):
            """Rehydrate tag as Tag object with forced restock_diff processor."""
            entity_dict['uuid'] = uuid
            entity_dict['processor'] = 'restock_diff'  # Force processor for override functionality

            return Tag.model(
                datastore_path=self.datastore_path,
                __datastore=self.__data,
                default=entity_dict
            )

        tags = load_all_tags(
            self.datastore_path,
            rehydrate_tag
        )

        # Override settings tags with loaded tags
        # This ensures tag.json files take precedence over settings
        if tags:
            self.__data['settings']['application']['tags'].update(tags)
            logger.info(f"Loaded {len(tags)} tags from individual tag.json files")

    def _delete_watch(self, uuid):
        """
        Delete a watch from storage.

        File backend implementation: deletes entire {uuid}/ directory recursively.
        Implementation of abstract method from FileSavingDataStore.

        Args:
            uuid: Watch UUID to delete
        """
        watch_dir = os.path.join(self.datastore_path, uuid)
        if os.path.exists(watch_dir):
            shutil.rmtree(watch_dir)
            logger.info(f"Deleted watch directory: {watch_dir}")

    # ============================================================================
    # Watch Management Methods
    # ============================================================================

    def set_last_viewed(self, uuid, timestamp):
        logger.debug(f"Setting watch UUID: {uuid} last viewed to {int(timestamp)}")
        self.data['watching'][uuid].update({'last_viewed': int(timestamp)})
        self.data['watching'][uuid].commit()

        watch_check_update = signal('watch_check_update')
        if watch_check_update:
            watch_check_update.send(watch_uuid=uuid)

    def remove_password(self):
        self.__data['settings']['application']['password'] = False
        self.commit()

    def clear_all_last_checksums(self):
        """
        Delete all last-checksum.txt files to force reprocessing of all watches.

        This should be called when global settings change, since watches inherit
        configuration and need to reprocess even if their individual watch dict
        hasn't been modified.

        Note: We delete the checksum file rather than setting was_edited=True because:
        - was_edited is not persisted across restarts
        - File deletion ensures reprocessing works across app restarts
        """
        deleted_count = 0
        for uuid in self.__data['watching'].keys():
            watch = self.__data['watching'][uuid]
            if watch.data_dir:
                checksum_file = os.path.join(watch.data_dir, 'last-checksum.txt')
                if os.path.isfile(checksum_file):
                    try:
                        os.remove(checksum_file)
                        deleted_count += 1
                        logger.debug(f"Cleared checksum for watch {uuid}")
                    except OSError as e:
                        logger.warning(f"Failed to delete checksum file for {uuid}: {e}")

        logger.info(f"Cleared {deleted_count} checksum files to force reprocessing")
        return deleted_count

    def clear_checksums_for_tag(self, tag_uuid):
        """
        Delete last-checksum.txt files for all watches using a specific tag.

        This should be called when a tag configuration is edited, since watches
        inherit tag settings and need to reprocess.

        Args:
            tag_uuid: UUID of the tag that was modified

        Returns:
            int: Number of checksum files deleted
        """
        deleted_count = 0
        for uuid, watch in self.__data['watching'].items():
            if watch.get('tags') and tag_uuid in watch['tags']:
                if watch.data_dir:
                    checksum_file = os.path.join(watch.data_dir, 'last-checksum.txt')
                    if os.path.isfile(checksum_file):
                        try:
                            os.remove(checksum_file)
                            deleted_count += 1
                            logger.debug(f"Cleared checksum for watch {uuid} (tag {tag_uuid})")
                        except OSError as e:
                            logger.warning(f"Failed to delete checksum file for {uuid}: {e}")

        logger.info(f"Cleared {deleted_count} checksum files for tag {tag_uuid}")
        return deleted_count

    def commit(self):
        """
        Save settings immediately to disk using atomic write.

        Uses atomic write pattern (temp file + rename) for crash safety.

        Fire-and-forget: Logs errors but does not raise exceptions.
        Settings data remains in memory even if save fails, so next commit will retry.
        """
        try:
            self._save_settings()
            logger.debug("Committed settings")
        except Exception as e:
            logger.error(f"Failed to commit settings: {e}")

    def update_watch(self, uuid, update_obj):

        # It's possible that the watch could be deleted before update
        if not self.__data['watching'].get(uuid):
            return

        with self.lock:

            # In python 3.9 we have the |= dict operator, but that still will lose data on nested structures...
            for dict_key, d in self.generic_definition.items():
                if isinstance(d, dict):
                    if update_obj is not None and dict_key in update_obj:
                        self.__data['watching'][uuid][dict_key].update(update_obj[dict_key])
                        del (update_obj[dict_key])

            self.__data['watching'][uuid].update(update_obj)

        # Immediate save
        self.__data['watching'][uuid].commit()

    @property
    def threshold_seconds(self):
        seconds = 0
        for m, n in Watch.mtable.items():
            x = self.__data['settings']['requests']['time_between_check'].get(m)
            if x:
                seconds += x * n
        return seconds

    @property
    def unread_changes_count(self):
        unread_changes_count = 0
        for uuid, watch in self.__data['watching'].items():
            if watch.history_n >= 2 and watch.viewed == False:
                unread_changes_count += 1

        return unread_changes_count

    @property
    def data(self):
        # Re #152, Return env base_url if not overriden
        # Re #148 - Some people have just {{ base_url }} in the body or title, but this may break some notification services
        #           like 'Join', so it's always best to atleast set something obvious so that they are not broken.

        active_base_url = BASE_URL_NOT_SET_TEXT
        if self.__data['settings']['application'].get('base_url'):
            active_base_url = self.__data['settings']['application'].get('base_url')
        elif os.getenv('BASE_URL'):
            active_base_url = os.getenv('BASE_URL')

        # I looked at various ways todo the following, but in the end just copying the dict seemed simplest/most reliable
        # even given the memory tradeoff - if you know a better way.. maybe return d|self.__data.. or something
        d = self.__data
        d['settings']['application']['active_base_url'] = active_base_url.strip('" ')
        return d

    # Delete a single watch by UUID
    def delete(self, uuid):
        """
        Delete a watch by UUID.

        Uses abstracted storage method for backend-agnostic deletion.
        Supports 'all' to delete all watches (mainly for testing).

        Args:
            uuid: Watch UUID to delete, or 'all' to delete all watches
        """
        with self.lock:
            if uuid == 'all':
                # Delete all watches - capture UUIDs first before modifying dict
                all_uuids = list(self.__data['watching'].keys())

                for watch_uuid in all_uuids:
                    # Delete from storage using polymorphic method
                    try:
                        self._delete_watch(watch_uuid)
                    except Exception as e:
                        logger.error(f"Failed to delete watch {watch_uuid} from storage: {e}")

                    # Send delete signal
                    watch_delete_signal = signal('watch_deleted')
                    if watch_delete_signal:
                        watch_delete_signal.send(watch_uuid=watch_uuid)

                # Clear the dict
                self.__data['watching'] = {}

                # Mainly used for testing to allow all items to flush before running next test
                time.sleep(1)

            else:
                # Delete single watch from storage using polymorphic method
                try:
                    self._delete_watch(uuid)
                except Exception as e:
                    logger.error(f"Failed to delete watch {uuid} from storage: {e}")

                # Remove from watching dict
                del self.data['watching'][uuid]

                # Send delete signal
                watch_delete_signal = signal('watch_deleted')
                if watch_delete_signal:
                    watch_delete_signal.send(watch_uuid=uuid)

    # Clone a watch by UUID
    def clone(self, uuid):
        url = self.data['watching'][uuid].get('url')
        # No need to deepcopy here - add_watch() will deepcopy extras anyway (line 569)
        # Just pass a dict copy (with lock for thread safety)
        # NOTE: dict() is shallow copy but safe since add_watch() deepcopies it
        with self.lock:
            extras = dict(self.data['watching'][uuid])
        new_uuid = self.add_watch(url=url, extras=extras)
        watch = self.data['watching'][new_uuid]
        return new_uuid

    def url_exists(self, url):

        # Probably their should be dict...
        for watch in self.data['watching'].values():
            if watch['url'].lower() == url.lower():
                return True

        return False

    # Remove a watchs data but keep the entry (URL etc)
    def clear_watch_history(self, uuid):
        self.__data['watching'][uuid].clear_watch()
        self.__data['watching'][uuid].commit()

    def add_watch(self, url, tag='', extras=None, tag_uuids=None, save_immediately=True):

        if extras is None:
            extras = {}

        # Incase these are copied across, assume it's a reference and deepcopy()
        apply_extras = deepcopy(extras)
        apply_extras['tags'] = [] if not apply_extras.get('tags') else apply_extras.get('tags')

        # Was it a share link? try to fetch the data
        if (url.startswith("https://changedetection.io/share/")):
            import requests

            try:
                r = requests.request(method="GET",
                                     url=url,
                                     # So we know to return the JSON instead of the human-friendly "help" page
                                     headers={'App-Guid': self.__data['app_guid']},
                                     timeout=5.0)  # 5 second timeout to prevent blocking
                res = r.json()

                # List of permissible attributes we accept from the wild internet
                for k in [
                    'body',
                    'browser_steps',
                    'css_filter',
                    'extract_text',
                    'headers',
                    'ignore_text',
                    'include_filters',
                    'method',
                    'paused',
                    'previous_md5',
                    'processor',
                    'subtractive_selectors',
                    'tag',
                    'tags',
                    'text_should_not_be_present',
                    'title',
                    'trigger_text',
                    'url',
                    'use_page_title_in_list',
                    'webdriver_js_execute_code',
                ]:
                    if res.get(k):
                        if k != 'css_filter':
                            apply_extras[k] = res[k]
                        else:
                            # We renamed the field and made it a list
                            apply_extras['include_filters'] = [res['css_filter']]

            except Exception as e:
                logger.error(f"Error fetching metadata for shared watch link {url} {str(e)}")
                flash(gettext("Error fetching metadata for {}").format(url), 'error')
                return False

        if not is_safe_valid_url(url):
            from flask import has_request_context
            if has_request_context():
                flash(gettext('Watch protocol is not permitted or invalid URL format'), 'error')
            else:
                logger.error(f"add_watch: URL '{url}' is not permitted or invalid, skipping.")
            return None

        # Check PAGE_WATCH_LIMIT if set
        page_watch_limit = os.getenv('PAGE_WATCH_LIMIT')
        if page_watch_limit:
            try:
                page_watch_limit = int(page_watch_limit)
                current_watch_count = len(self.__data['watching'])
                if current_watch_count >= page_watch_limit:
                    logger.error(f"Watch limit reached: {current_watch_count}/{page_watch_limit} watches. Cannot add {url}")
                    flash(gettext("Watch limit reached ({}/{} watches). Cannot add more watches.").format(current_watch_count, page_watch_limit), 'error')
                    return None
            except ValueError:
                logger.warning(f"Invalid PAGE_WATCH_LIMIT value: {page_watch_limit}, ignoring limit check")

        if tag and type(tag) == str:
            # Then it's probably a string of the actual tag by name, split and add it
            for t in tag.split(','):
                # for each stripped tag, add tag as UUID
                for a_t in t.split(','):
                    tag_uuid = self.add_tag(a_t)
                    apply_extras['tags'].append(tag_uuid)

        # Or if UUIDs given directly
        if tag_uuids:
            for t in tag_uuids:
                apply_extras['tags'] = list(set(apply_extras['tags'] + [t.strip()]))

        # Make any uuids unique
        if apply_extras.get('tags'):
            apply_extras['tags'] = list(set(apply_extras.get('tags')))

        # If the processor also has its own Watch implementation
        watch_class = get_custom_watch_obj_for_processor(apply_extras.get('processor'))
        new_watch = watch_class(datastore_path=self.datastore_path, __datastore=self.__data, url=url)

        new_uuid = new_watch.get('uuid')

        logger.debug(f"Adding URL '{url}' - {new_uuid}")

        for k in ['uuid', 'history', 'last_checked', 'last_changed', 'newest_history_key', 'previous_md5', 'viewed']:
            if k in apply_extras:
                del apply_extras[k]

        if not apply_extras.get('date_created'):
            apply_extras['date_created'] = int(time.time())

        new_watch.update(apply_extras)
        new_watch.ensure_data_dir_exists()
        self.__data['watching'][new_uuid] = new_watch

        if save_immediately:
            # Save immediately using commit
            new_watch.commit()
            logger.debug(f"Saved new watch {new_uuid}")

        logger.debug(f"Added '{url}'")

        return new_uuid

    def _watch_resource_exists(self, watch_uuid, resource_name):
        """
        Check if a watch-related resource exists.

        File backend implementation: checks if file exists in watch directory.

        Args:
            watch_uuid: Watch UUID
            resource_name: Name of resource (e.g., "last-screenshot.png")

        Returns:
            bool: True if resource exists
        """
        resource_path = os.path.join(self.datastore_path, watch_uuid, resource_name)
        return path.isfile(resource_path)

    def visualselector_data_is_ready(self, watch_uuid):
        """
        Check if visual selector data (screenshot + elements) is ready.

        Returns:
            bool: True if both screenshot and elements data exist
        """
        has_screenshot = self._watch_resource_exists(watch_uuid, "last-screenshot.png")
        has_elements = self._watch_resource_exists(watch_uuid, "elements.deflate")
        return has_screenshot and has_elements

    # Old sync_to_json and save_datastore methods removed - now handled by FileSavingDataStore parent class

    @property
    def proxy_list(self):
        proxy_list = {}
        proxy_list_file = os.path.join(self.datastore_path, 'proxies.json')

        # Load from external config file
        if path.isfile(proxy_list_file):
            if HAS_ORJSON:
                # orjson.loads() expects UTF-8 encoded bytes #3611
                with open(os.path.join(self.datastore_path, "proxies.json"), 'rb') as f:
                    proxy_list = orjson.loads(f.read())
            else:
                with open(os.path.join(self.datastore_path, "proxies.json"), encoding='utf-8') as f:
                    proxy_list = json.load(f)

        # Mapping from UI config if available
        extras = self.data['settings']['requests'].get('extra_proxies')
        if extras:
            i = 0
            for proxy in extras:
                i += 0
                if proxy.get('proxy_name') and proxy.get('proxy_url'):
                    k = "ui-" + str(i) + proxy.get('proxy_name')
                    proxy_list[k] = {'label': proxy.get('proxy_name'), 'url': proxy.get('proxy_url')}

        if proxy_list and strtobool(os.getenv('ENABLE_NO_PROXY_OPTION', 'True')):
            proxy_list["no-proxy"] = {'label': "No proxy", 'url': ''}

        return proxy_list if len(proxy_list) else None

    def get_preferred_proxy_for_watch(self, uuid):
        """
        Returns the preferred proxy by ID key
        :param uuid: UUID
        :return: proxy "key" id
        """

        if self.proxy_list is None:
            return None

        # If it's a valid one
        watch = self.data['watching'].get(uuid)

        if strtobool(os.getenv('ENABLE_NO_PROXY_OPTION', 'True')) and watch.get('proxy') == "no-proxy":
            return None

        if watch.get('proxy') and watch.get('proxy') in list(self.proxy_list.keys()):
            return watch.get('proxy')

        # not valid (including None), try the system one
        else:
            system_proxy_id = self.data['settings']['requests'].get('proxy')
            # Is not None and exists
            if self.proxy_list.get(system_proxy_id):
                return system_proxy_id

        # Fallback - Did not resolve anything, or doesnt exist, use the first available
        if system_proxy_id is None or not self.proxy_list.get(system_proxy_id):
            first_default = list(self.proxy_list)[0]
            return first_default

        return None

    @property
    def has_extra_headers_file(self):
        filepath = os.path.join(self.datastore_path, 'headers.txt')
        return os.path.isfile(filepath)

    def get_all_base_headers(self):
        headers = {}
        # Global app settings
        headers.update(self.data['settings'].get('headers', {}))

        return headers

    def get_all_headers_in_textfile_for_watch(self, uuid):
        from ..model.App import parse_headers_from_text_file
        headers = {}

        # Global in /datastore/headers.txt
        filepath = os.path.join(self.datastore_path, 'headers.txt')
        try:
            if os.path.isfile(filepath):
                headers.update(parse_headers_from_text_file(filepath))
        except Exception as e:
            logger.error(f"ERROR reading headers.txt at {filepath} {str(e)}")

        watch = self.data['watching'].get(uuid)
        if watch:

            # In /datastore/xyz-xyz/headers.txt
            filepath = os.path.join(watch.data_dir, 'headers.txt')
            try:
                if os.path.isfile(filepath):
                    headers.update(parse_headers_from_text_file(filepath))
            except Exception as e:
                logger.error(f"ERROR reading headers.txt at {filepath} {str(e)}")

            # In /datastore/tag-name.txt
            tags = self.get_all_tags_for_watch(uuid=uuid)
            for tag_uuid, tag in tags.items():
                fname = "headers-" + re.sub(r'[\W_]', '', tag.get('title')).lower().strip() + ".txt"
                filepath = os.path.join(self.datastore_path, fname)
                try:
                    if os.path.isfile(filepath):
                        headers.update(parse_headers_from_text_file(filepath))
                except Exception as e:
                    logger.error(f"ERROR reading headers.txt at {filepath} {str(e)}")

        return headers

    def get_tag_overrides_for_watch(self, uuid, attr):
        tags = self.get_all_tags_for_watch(uuid=uuid)
        ret = []

        if tags:
            for tag_uuid, tag in tags.items():
                if attr in tag and tag[attr]:
                    ret = [*ret, *tag[attr]]

        return ret

    def add_tag(self, title):
        # If name exists, return that
        n = title.strip().lower()
        logger.debug(f">>> Adding new tag - '{n}'")
        if not n:
            return False

        for uuid, tag in self.__data['settings']['application'].get('tags', {}).items():
            if n == tag.get('title', '').lower().strip():
                logger.warning(f"Tag '{title}' already exists, skipping creation.")
                return uuid

        # Eventually almost everything todo with a watch will apply as a Tag
        # So we use the same model as a Watch
        with self.lock:
            from ..model import Tag
            new_tag = Tag.model(
                datastore_path=self.datastore_path,
                __datastore=self.__data,
                default={
                    'title': title.strip(),
                    'date_created': int(time.time())
                }
            )

            new_uuid = new_tag.get('uuid')

            self.__data['settings']['application']['tags'][new_uuid] = new_tag

        # Save tag to its own tag.json file instead of settings
        new_tag.commit()
        return new_uuid

    def get_all_tags_for_watch(self, uuid):
        """This should be in Watch model but Watch doesn't have access to datastore, not sure how to solve that yet"""
        watch = self.data['watching'].get(uuid)

        # Should return a dict of full tag info linked by UUID
        if watch:
            return dictfilt(self.__data['settings']['application']['tags'], watch.get('tags', []))

        return {}

    @property
    def extra_browsers(self):
        res = []
        p = list(filter(
            lambda s: (s.get('browser_name') and s.get('browser_connection_url')),
            self.__data['settings']['requests'].get('extra_browsers', [])))
        if p:
            for i in p:
                res.append(("extra_browser_" + i['browser_name'], i['browser_name']))

        return res

    def tag_exists_by_name(self, tag_name):
        # Check if any tag dictionary has a 'title' attribute matching the provided tag_name
        tags = self.__data['settings']['application']['tags'].values()
        return next((v for v in tags if v.get('title', '').lower() == tag_name.lower()),
                    None)

    def any_watches_have_processor_by_name(self, processor_name):
        for watch in self.data['watching'].values():
            if watch.get('processor') == processor_name:
                return True
        return False

    def search_watches_for_url(self, query, tag_limit=None, partial=False):
        """Search watches by URL, title, or error messages

        Args:
            query (str): Search term to match against watch URLs, titles, and error messages
            tag_limit (str, optional): Optional tag name to limit search results
            partial: (bool, optional): sub-string matching

        Returns:
            list: List of UUIDs of watches that match the search criteria
        """
        matching_uuids = []
        query = query.lower().strip()
        tag = self.tag_exists_by_name(tag_limit) if tag_limit else False

        for uuid, watch in self.data['watching'].items():
            # Filter by tag if requested
            if tag_limit:
                if not tag.get('uuid') in watch.get('tags', []):
                    continue

            # Search in URL, title, or error messages
            if partial:
                if ((watch.get('title') and query in watch.get('title').lower()) or
                        query in watch.get('url', '').lower() or
                        (watch.get('last_error') and query in watch.get('last_error').lower())):
                    matching_uuids.append(uuid)
            else:
                if ((watch.get('title') and query == watch.get('title').lower()) or
                        query == watch.get('url', '').lower() or
                        (watch.get('last_error') and query == watch.get('last_error').lower())):
                    matching_uuids.append(uuid)

        return matching_uuids

    def get_unique_notification_tokens_available(self):
        # Ask each type of watch if they have any extra notification token to add to the validation
        extra_notification_tokens = {}
        watch_processors_checked = set()

        for watch_uuid, watch in self.__data['watching'].items():
            processor = watch.get('processor')
            if processor not in watch_processors_checked:
                extra_notification_tokens.update(watch.extra_notification_token_values())
                watch_processors_checked.add(processor)

        return extra_notification_tokens

    def get_unique_notification_token_placeholders_available(self):
        # The actual description of the tokens, could be combined with get_unique_notification_tokens_available instead of doing this twice
        extra_notification_tokens = []
        watch_processors_checked = set()

        for watch_uuid, watch in self.__data['watching'].items():
            processor = watch.get('processor')
            if processor not in watch_processors_checked:
                extra_notification_tokens += watch.extra_notification_token_placeholder_info()
                watch_processors_checked.add(processor)

        return extra_notification_tokens

    def add_notification_url(self, notification_url):

        logger.debug(f">>> Adding new notification_url - '{notification_url}'")

        notification_urls = self.data['settings']['application'].get('notification_urls', [])

        if notification_url in notification_urls:
            return notification_url

        with self.lock:
            notification_urls = self.__data['settings']['application'].get('notification_urls', [])

            if notification_url in notification_urls:
                return notification_url

            # Append and update the datastore
            notification_urls.append(notification_url)
            self.__data['settings']['application']['notification_urls'] = notification_urls

        self.commit()
        return notification_url

    # Schema update methods moved to store/updates.py (DatastoreUpdatesMixin)
    # This includes: get_updates_available(), run_updates(), and update_1() through update_26()