Compare commits

...

3 Commits

Author SHA1 Message Date
dgtlmoon
822d1782a3 more robust str handling
2026-02-14 20:08:18 +01:00
dgtlmoon
0866a85934 Add tests
2026-02-14 18:43:18 +01:00
dgtlmoon
528ef378da Don't reprocess page watches if the content didn't change 2026-02-14 18:42:55 +01:00
20 changed files with 918 additions and 109 deletions

View File

@@ -79,7 +79,7 @@ class Tag(Resource):
'browser_steps_last_error_step', 'check_count', 'consecutive_filter_failures',
'content-type', 'fetch_time', 'last_changed', 'last_checked', 'last_error',
'last_notification_error', 'last_viewed', 'notification_alert_count',
'page_title', 'previous_md5', 'previous_md5_before_filters', 'remote_server_reply'
'page_title', 'previous_md5', 'remote_server_reply'
}
# Create clean tag dict without Watch-specific fields
@@ -160,6 +160,11 @@ class Tag(Resource):
tag.update(json_data)
tag.commit()
# Clear checksums for all watches using this tag to force reprocessing
# Tag changes affect inherited configuration
cleared_count = self.datastore.clear_checksums_for_tag(uuid)
logger.info(f"Tag {uuid} updated via API, cleared {cleared_count} watch checksums")
return "OK", 200
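For reference, a rough sketch of how a client could exercise this path from outside the app. The /api/v1/tag/<uuid> route and the x-api-key header name are taken from the tests added later in this diff; the base URL and credentials are placeholders:

import requests

# Placeholders - a hypothetical local instance and credentials
BASE = "http://localhost:5000"
API_KEY = "your-api-key"
TAG_UUID = "your-tag-uuid"

# Changing any writable tag field should clear last-checksum.txt for every
# watch carrying this tag, forcing those watches to reprocess on the next check
res = requests.put(
    f"{BASE}/api/v1/tag/{TAG_UUID}",
    json={"notification_muted": True},
    headers={"x-api-key": API_KEY},
    timeout=10,
)
assert res.status_code == 200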

View File

@@ -70,46 +70,6 @@ def _resolve_schema_properties(schema_name):
return properties
@functools.cache
def _resolve_readonly_fields(schema_name):
"""
Generic helper to resolve readOnly fields, including allOf inheritance.
Args:
schema_name: Name of the schema (e.g., 'Watch', 'Tag')
Returns:
frozenset: All readOnly field names including inherited ones
"""
spec_dict = get_openapi_schema_dict()
schema = spec_dict['components']['schemas'].get(schema_name, {})
readonly_fields = set()
# Handle allOf (schema inheritance)
if 'allOf' in schema:
for item in schema['allOf']:
# Resolve $ref to parent schema
if '$ref' in item:
ref_path = item['$ref'].split('/')[-1]
ref_schema = spec_dict['components']['schemas'].get(ref_path, {})
if 'properties' in ref_schema:
for field_name, field_def in ref_schema['properties'].items():
if field_def.get('readOnly') is True:
readonly_fields.add(field_name)
# Check schema-specific properties
if 'properties' in item:
for field_name, field_def in item['properties'].items():
if field_def.get('readOnly') is True:
readonly_fields.add(field_name)
else:
# Direct properties (no inheritance)
if 'properties' in schema:
for field_name, field_def in schema['properties'].items():
if field_def.get('readOnly') is True:
readonly_fields.add(field_name)
return frozenset(readonly_fields)
@functools.cache
def get_watch_schema_properties():
@@ -120,14 +80,8 @@ def get_watch_schema_properties():
"""
return _resolve_schema_properties('WatchBase')
@functools.cache
def get_readonly_watch_fields():
"""
Extract readOnly field names from Watch schema in OpenAPI spec.
Returns readOnly fields from WatchBase (uuid, date_created) + Watch-specific readOnly fields.
"""
return _resolve_readonly_fields('Watch')
# Import readonly field utilities from shared module (avoids circular dependencies with model layer)
from changedetectionio.model.schema_utils import get_readonly_watch_fields, get_readonly_tag_fields
@functools.cache
def get_tag_schema_properties():
@@ -138,15 +92,6 @@ def get_tag_schema_properties():
"""
return _resolve_schema_properties('Tag')
@functools.cache
def get_readonly_tag_fields():
"""
Extract readOnly field names from Tag schema in OpenAPI spec.
Returns readOnly fields from WatchBase (uuid, date_created) + Tag-specific readOnly fields.
"""
return _resolve_readonly_fields('Tag')
def validate_openapi_request(operation_id):
"""Decorator to validate incoming requests against OpenAPI spec."""
def decorator(f):

View File

@@ -83,6 +83,10 @@ def construct_blueprint(datastore: ChangeDetectionStore):
datastore.data['settings']['requests'].update(form.data['requests'])
datastore.commit()
# Clear all checksums to force reprocessing with new settings
# Global settings can affect watch behavior (filters, rendering, etc.)
datastore.clear_all_last_checksums()
# Adjust worker count if it changed
if new_worker_count != old_worker_count:
from changedetectionio import worker_pool

View File

@@ -244,6 +244,12 @@ def construct_blueprint(datastore: ChangeDetectionStore):
tag.update(form.data)
tag['processor'] = 'restock_diff'
tag.commit()
# Clear checksums for all watches using this tag to force reprocessing
# Tag changes affect inherited configuration
cleared_count = datastore.clear_checksums_for_tag(uuid)
logger.info(f"Tag {uuid} updated, cleared {cleared_count} watch checksums")
flash(gettext("Updated"))
return redirect(url_for('tags.tags_overview_page'))

View File

@@ -335,7 +335,6 @@ class model(EntityPersistenceMixin, watch_base):
'last_notification_error': False,
'last_viewed': 0,
'previous_md5': False,
'previous_md5_before_filters': False,
'remote_server_reply': None,
'track_ldjson_price_data': None
})
@@ -386,10 +385,16 @@ class model(EntityPersistenceMixin, watch_base):
@property
def is_pdf(self):
# content_type field is set in the future
# https://github.com/dgtlmoon/changedetection.io/issues/1392
# Not sure the best logic here
return self.get('url', '').lower().endswith('.pdf') or 'pdf' in self.get('content_type', '').lower()
url = str(self.get("url") or "").lower()
content_type = str(self.get("content-type") or "").lower()
if content_type in ("none", "null", ""):
content_type = ""
return (
url.endswith(".pdf")
or content_type.split(";")[0].strip() == "application/pdf"
)
@property
def label(self):
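A note on the split(";")[0] in is_pdf above: Content-Type headers frequently carry parameters after the media type, which a plain equality check would miss. A minimal illustration of the same comparison in isolation:

# Same normalisation as the is_pdf property above, reduced to a helper
def is_pdf_content_type(content_type):
    ct = str(content_type or "").lower()
    # Compare only the media type, ignoring parameters such as charset
    return ct.split(";")[0].strip() == "application/pdf"

assert is_pdf_content_type("application/pdf; charset=binary")
assert not is_pdf_content_type("text/html")
assert not is_pdf_content_type(None)  # missing header never matches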

View File

@@ -129,7 +129,6 @@ class watch_base(dict):
fetch_time (float): Duration of last fetch in seconds
consecutive_filter_failures (int): Counter for consecutive filter match failures
previous_md5 (str|bool): MD5 hash of previous content
previous_md5_before_filters (str|bool): MD5 hash before filters applied
history_snapshot_max_length (int|None): Max history snapshots to keep (None = use global)
Conditions:
@@ -166,6 +165,10 @@ class watch_base(dict):
if kw.get('datastore_path'):
del kw['datastore_path']
# IMPORTANT: Don't initialize __watch_was_edited yet!
# We'll initialize it AFTER the initial update() call below
# This prevents marking the watch as edited during initialization
self.update({
# Custom notification content
# Re #110, so then if this is set to None, we know to use the default value instead
@@ -211,7 +214,6 @@ class watch_base(dict):
'page_title': None, # <title> from the page
'paused': False,
'previous_md5': False,
'previous_md5_before_filters': False, # Used for skipping changedetection entirely
'processor': 'text_json_diff', # could be restock_diff or others from .processors
'price_change_threshold_percent': None,
'proxy': None, # Preferred proxy connection
@@ -297,9 +299,116 @@ class watch_base(dict):
super(watch_base, self).__init__(*arg, **kw)
# Check if we're being initialized from an existing watch object
# that has was_edited=True, so we can preserve the flag
preserve_edited_flag = False
if self.get('default'):
# When creating a new watch object from an existing one (e.g., changing processor),
# preserve the was_edited flag if it was True
default_watch = self.get('default')
if hasattr(default_watch, 'was_edited') and default_watch.was_edited:
preserve_edited_flag = True
del self['default']
# NOW initialize the edited flag after all initial setup is complete
# This ensures initialization doesn't trigger the edited flag
# But preserve it if the source watch had it set to True
self.__watch_was_edited = preserve_edited_flag
def _mark_field_as_edited(self, key):
"""
Helper to mark a field as edited if it's writable.
Internal method used by __setitem__, update(), pop(), etc.
"""
# Don't track edits during initial load or if already edited
if not hasattr(self, '_watch_base__watch_was_edited'):
return
if self.__watch_was_edited:
return # Already marked as edited
# Import from shared schema utilities (no circular dependency)
from .schema_utils import get_readonly_watch_fields
readonly_fields = get_readonly_watch_fields()
# Additional system-managed fields not in OpenAPI spec (yet)
# These are set by processors/workers and should not trigger edited flag
additional_system_fields = {
'last_check_status', # Set by processors
'restock', # Set by restock processor
'last_viewed', # Set by mark_all_viewed endpoint
}
# Only mark as edited if this is a user-writable field
if key not in readonly_fields and key not in additional_system_fields:
self.__watch_was_edited = True
def __setitem__(self, key, value):
"""
Override dict.__setitem__ to track when writable watch fields are modified.
This enables skipping reprocessing when:
1. HTML content is unchanged (checksumFromPreviousCheckWasTheSame)
2. AND watch configuration was not edited
Only sets the edited flag when field is NOT in readonly_fields (from OpenAPI spec).
"""
# Set the value first (always)
super().__setitem__(key, value)
# Mark as edited if writable field
self._mark_field_as_edited(key)
def __delitem__(self, key):
"""Override dict.__delitem__ to track deletions of writable fields."""
super().__delitem__(key)
self._mark_field_as_edited(key)
def update(self, *args, **kwargs):
"""Override dict.update() to track modifications to writable fields."""
# Call parent update first
super().update(*args, **kwargs)
# Mark as edited for any writable fields that were updated
# Handle both update(dict) and update(key=value) forms
if args:
for key in args[0].keys():
self._mark_field_as_edited(key)
for key in kwargs.keys():
self._mark_field_as_edited(key)
def pop(self, key, *args):
"""Override dict.pop() to track removal of writable fields."""
result = super().pop(key, *args)
self._mark_field_as_edited(key)
return result
def setdefault(self, key, default=None):
"""Override dict.setdefault() to track modifications to writable fields."""
# Only marks as edited if key didn't exist (i.e., a new value was set)
existed = key in self
result = super().setdefault(key, default)
if not existed:
self._mark_field_as_edited(key)
return result
@property
def was_edited(self):
"""
Check if watch configuration was edited since last processing.
Returns:
bool: True if writable fields were modified, False otherwise
"""
return getattr(self, '_watch_base__watch_was_edited', False)
def reset_watch_edited_flag(self):
"""
Reset the watch edited flag after successful processing.
Call this after processing completes to allow future content-only change detection.
"""
self.__watch_was_edited = False
@classmethod
def get_property_names(cls):
"""

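The mechanism added above boils down to a dict subclass that flips a flag when a writable key changes. A stripped-down sketch of the same pattern; READONLY here is a stand-in for the set resolved from the OpenAPI spec, not the real field list:

# Minimal sketch of the edit-tracking pattern used by watch_base above
READONLY = frozenset({"uuid", "date_created"})  # stand-in, not the real set

class TrackedDict(dict):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Set the flag last, so initialisation never counts as an edit
        self._edited = False

    def __setitem__(self, key, value):
        super().__setitem__(key, value)
        # Only user-writable fields flip the flag
        if getattr(self, "_edited", None) is False and key not in READONLY:
            self._edited = True

d = TrackedDict(uuid="abc", title="hello")
assert d._edited is False
d["uuid"] = "xyz"       # readonly field: flag untouched
assert d._edited is False
d["title"] = "changed"  # writable field: flag set
assert d._edited is True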
View File

@@ -0,0 +1,92 @@
"""
Schema utilities for Watch and Tag models.
Provides functions to extract readonly fields and properties from OpenAPI spec.
Shared by both the model layer and API layer to avoid circular dependencies.
"""
import functools
@functools.cache
def get_openapi_schema_dict():
"""
Get the raw OpenAPI spec dictionary for schema access.
Returns the YAML dict directly (not the OpenAPI object).
"""
import os
import yaml
spec_path = os.path.join(os.path.dirname(__file__), '../../docs/api-spec.yaml')
if not os.path.exists(spec_path):
spec_path = os.path.join(os.path.dirname(__file__), '../docs/api-spec.yaml')
with open(spec_path, 'r', encoding='utf-8') as f:
return yaml.safe_load(f)
@functools.cache
def _resolve_readonly_fields(schema_name):
"""
Generic helper to resolve readOnly fields, including allOf inheritance.
Args:
schema_name: Name of the schema (e.g., 'Watch', 'Tag')
Returns:
frozenset: All readOnly field names including inherited ones
"""
spec_dict = get_openapi_schema_dict()
schema = spec_dict['components']['schemas'].get(schema_name, {})
readonly_fields = set()
# Handle allOf (schema inheritance)
if 'allOf' in schema:
for item in schema['allOf']:
# Resolve $ref to parent schema
if '$ref' in item:
ref_path = item['$ref'].split('/')[-1]
ref_schema = spec_dict['components']['schemas'].get(ref_path, {})
if 'properties' in ref_schema:
for field_name, field_def in ref_schema['properties'].items():
if field_def.get('readOnly') is True:
readonly_fields.add(field_name)
# Check schema-specific properties
if 'properties' in item:
for field_name, field_def in item['properties'].items():
if field_def.get('readOnly') is True:
readonly_fields.add(field_name)
else:
# Direct properties (no inheritance)
if 'properties' in schema:
for field_name, field_def in schema['properties'].items():
if field_def.get('readOnly') is True:
readonly_fields.add(field_name)
return frozenset(readonly_fields)
@functools.cache
def get_readonly_watch_fields():
"""
Extract readOnly field names from Watch schema in OpenAPI spec.
Returns readOnly fields from WatchBase (uuid, date_created) + Watch-specific readOnly fields.
Used by:
- model/watch_base.py: Track when writable fields are edited
- api/Watch.py: Filter readonly fields from PUT requests
"""
return _resolve_readonly_fields('Watch')
@functools.cache
def get_readonly_tag_fields():
"""
Extract readOnly field names from Tag schema in OpenAPI spec.
Returns readOnly fields from WatchBase (uuid, date_created) + Tag-specific readOnly fields.
"""
return _resolve_readonly_fields('Tag')
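Assuming docs/api-spec.yaml marks uuid and date_created as readOnly on WatchBase, as the docstrings above state, usage looks roughly like this (the filtering idiom mirrors what the API layer does with PUT requests):

from changedetectionio.model.schema_utils import get_readonly_watch_fields

readonly = get_readonly_watch_fields()
assert isinstance(readonly, frozenset)
# Per the docstrings above, the WatchBase-inherited fields should be present
assert {"uuid", "date_created"} <= readonly

# Dropping readonly fields from user input before applying it to a watch
incoming = {"title": "New title", "uuid": "spoofed-value"}
clean = {k: v for k, v in incoming.items() if k not in readonly}
assert "uuid" not in clean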

View File

@@ -1,6 +1,6 @@
from functools import lru_cache
from loguru import logger
from flask_babel import gettext
from flask_babel import gettext, get_locale
import importlib
import inspect
import os
@@ -190,14 +190,15 @@ def get_plugin_processor_metadata():
logger.warning(f"Error getting plugin processor metadata: {e}")
return metadata
def available_processors():
"""
Get a list of processors by name and description for the UI elements.
Can be filtered via DISABLED_PROCESSORS environment variable (comma-separated list).
:return: A list :)
@lru_cache(maxsize=32)
def _available_processors_cached(locale_str):
"""
Internal cached function that includes locale in cache key.
This ensures translations are cached per-language instead of globally.
:param locale_str: The locale string (e.g., 'en', 'it', 'zh')
:return: A list of tuples (processor_name, translated_description, weight)
"""
processor_classes = find_processors()
# Check if DISABLED_PROCESSORS env var is set
@@ -256,6 +257,22 @@ def available_processors():
# Return as tuples without weight (for backwards compatibility)
return [(name, desc) for name, desc, weight in available]
def available_processors():
"""
Get a list of processors by name and description for the UI elements.
Can be filtered via DISABLED_PROCESSORS environment variable (comma-separated list).
This function delegates to a locale-aware cached version to ensure translations
are cached per-language instead of globally.
:return: A list of tuples (processor_name, translated_description)
"""
# Get current locale and use it as cache key
# Convert Babel Locale object to string for use as cache key
locale = get_locale()
locale_str = str(locale) if locale else 'en'
return _available_processors_cached(locale_str)
def get_default_processor():
"""

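The delegation pattern above in isolation - cache on a hashable locale string rather than on the Babel Locale object itself (names here are illustrative, not the project's API):

from functools import lru_cache

@lru_cache(maxsize=32)
def _greeting_cached(locale_str):
    # Stand-in for an expensive, translation-dependent build step
    return {"en": "Hello", "it": "Ciao"}.get(locale_str, "Hello")

def greeting(locale=None):
    # str() gives a hashable, stable cache key per language
    return _greeting_cached(str(locale) if locale else "en")

assert greeting("it") == "Ciao"
assert greeting() == "Hello"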
View File

@@ -19,6 +19,7 @@ class difference_detection_processor():
xpath_data = None
preferred_proxy = None
screenshot_format = SCREENSHOT_FORMAT_JPEG
last_raw_content_checksum = None
def __init__(self, datastore, watch_uuid):
self.datastore = datastore
@@ -34,6 +35,64 @@ class difference_detection_processor():
# Generic fetcher that should be extended (requests, playwright etc)
self.fetcher = Fetcher()
# Load the last raw content checksum from file
self.read_last_raw_content_checksum()
def update_last_raw_content_checksum(self, checksum):
"""
Save the raw content MD5 checksum to file.
This is used for skip logic - avoid reprocessing if raw HTML unchanged.
"""
if not checksum:
return
watch = self.datastore.data['watching'].get(self.watch_uuid)
if not watch:
return
data_dir = watch.data_dir
if not data_dir:
return
watch.ensure_data_dir_exists()
checksum_file = os.path.join(data_dir, 'last-checksum.txt')
try:
with open(checksum_file, 'w', encoding='utf-8') as f:
f.write(checksum)
self.last_raw_content_checksum = checksum
except IOError as e:
logger.warning(f"Failed to write checksum file for {self.watch_uuid}: {e}")
def read_last_raw_content_checksum(self):
"""
Read the last raw content MD5 checksum from file.
Sets self.last_raw_content_checksum to None if the file doesn't exist (first run) or can't be read.
"""
watch = self.datastore.data['watching'].get(self.watch_uuid)
if not watch:
self.last_raw_content_checksum = None
return
data_dir = watch.data_dir
if not data_dir:
self.last_raw_content_checksum = None
return
checksum_file = os.path.join(data_dir, 'last-checksum.txt')
if not os.path.isfile(checksum_file):
self.last_raw_content_checksum = None
return
try:
with open(checksum_file, 'r', encoding='utf-8') as f:
self.last_raw_content_checksum = f.read().strip()
except IOError as e:
logger.warning(f"Failed to read checksum file for {self.watch_uuid}: {e}")
self.last_raw_content_checksum = None
async def call_browser(self, preferred_proxy_id=None):
from requests.structures import CaseInsensitiveDict
@@ -257,8 +316,16 @@ class difference_detection_processor():
except IOError as e:
logger.error(f"Failed to write extra watch config {filename}: {e}")
def get_raw_document_checksum(self):
checksum = None
if self.fetcher.content:
checksum = hashlib.md5(self.fetcher.content.encode('utf-8')).hexdigest()
return checksum
@abstractmethod
def run_changedetection(self, watch):
def run_changedetection(self, watch, force_reprocess=False):
update_obj = {'last_notification_error': False, 'last_error': False}
some_data = 'xxxxx'
update_obj["previous_md5"] = hashlib.md5(some_data.encode('utf-8')).hexdigest()
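The checksum file round trip that update_last_raw_content_checksum and read_last_raw_content_checksum implement, sketched with a plain temporary directory instead of watch.data_dir:

import hashlib
import os
import tempfile

def raw_checksum(content):
    # Same hashing as get_raw_document_checksum above
    return hashlib.md5(content.encode("utf-8")).hexdigest()

data_dir = tempfile.mkdtemp()
checksum_file = os.path.join(data_dir, "last-checksum.txt")

# First fetch: no file on disk yet, so processing always runs
assert not os.path.isfile(checksum_file)
with open(checksum_file, "w", encoding="utf-8") as f:
    f.write(raw_checksum("<html>same content</html>"))

# Second fetch of identical content: the stored checksum matches, which is
# the condition that lets a processor raise checksumFromPreviousCheckWasTheSame
with open(checksum_file, "r", encoding="utf-8") as f:
    assert f.read().strip() == raw_checksum("<html>same content</html>")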

View File

@@ -30,7 +30,7 @@ class perform_site_check(difference_detection_processor):
# Override to use PNG format for better image comparison (JPEG compression creates noise)
screenshot_format = SCREENSHOT_FORMAT_PNG
def run_changedetection(self, watch):
def run_changedetection(self, watch, force_reprocess=False):
"""
Perform screenshot comparison using OpenCV subprocess handler.

View File

@@ -2,6 +2,7 @@ from ..base import difference_detection_processor
from ..exceptions import ProcessorException
from . import Restock
from loguru import logger
from changedetectionio.content_fetchers.exceptions import checksumFromPreviousCheckWasTheSame
import urllib3
import time
@@ -403,22 +404,37 @@ class perform_site_check(difference_detection_processor):
screenshot = None
xpath_data = None
def run_changedetection(self, watch):
def run_changedetection(self, watch, force_reprocess=False):
import hashlib
if not watch:
raise Exception("Watch no longer exists.")
current_raw_document_checksum = self.get_raw_document_checksum()
# Skip processing only if BOTH conditions are true:
# 1. HTML content unchanged (checksum matches last saved checksum)
# 2. Watch configuration was not edited (including trigger_text, filters, etc.)
# The was_edited flag handles all watch configuration changes, so we don't need
# separate checks for trigger_text or other processing rules.
if (not force_reprocess and
not watch.was_edited and
self.last_raw_content_checksum and
self.last_raw_content_checksum == current_raw_document_checksum):
raise checksumFromPreviousCheckWasTheSame()
# Unset any existing notification error
update_obj = {'last_notification_error': False, 'last_error': False, 'restock': Restock()}
self.screenshot = self.fetcher.screenshot
self.xpath_data = self.fetcher.xpath_data
# Track the content type
update_obj['content_type'] = self.fetcher.headers.get('Content-Type', '')
# Track the content type (readonly field, doesn't trigger was_edited)
update_obj['content-type'] = self.fetcher.headers.get('Content-Type', '') # Use hyphen (matches OpenAPI spec)
update_obj["last_check_status"] = self.fetcher.get_last_status_code()
# Save the raw content checksum to file (processor implementation detail, not watch config)
self.update_last_raw_content_checksum(current_raw_document_checksum)
# Only try to process restock information (like scraping for keywords) if the page was actually rendered correctly.
# Otherwise it will assume "in stock" because nothing suggesting the opposite was found
from ...html_tools import html_to_text
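The guard restated as a pure function, same truth table as the condition above (the text_json_diff processor later in this diff applies the identical check):

def should_skip(force_reprocess, was_edited, stored_checksum, current_checksum):
    # Skip only when nothing forces a run, the watch config was not edited,
    # and the stored raw-content checksum matches the current fetch
    return (not force_reprocess
            and not was_edited
            and bool(stored_checksum)
            and stored_checksum == current_checksum)

assert should_skip(False, False, "abc", "abc")        # unchanged: skip
assert not should_skip(True, False, "abc", "abc")     # preview forces a run
assert not should_skip(False, True, "abc", "abc")     # watch was edited
assert not should_skip(False, False, None, "abc")     # first run, no file yet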

View File

@@ -17,7 +17,8 @@ def _task(watch, update_handler):
try:
# The slow process (we run 2 of these in parallel)
changed_detected, update_obj, text_after_filter = update_handler.run_changedetection(watch=watch)
# Always force reprocess for preview - we want to show the filtered content regardless of checksums
changed_detected, update_obj, text_after_filter = update_handler.run_changedetection(watch=watch, force_reprocess=True)
except FilterNotFoundInResponse as e:
text_after_filter = f"Filter not found in HTML: {str(e)}"
except ReplyWithContentButNoText as e:

View File

@@ -7,6 +7,7 @@ import re
import urllib3
from changedetectionio.conditions import execute_ruleset_against_all_plugins
from changedetectionio.content_fetchers.exceptions import checksumFromPreviousCheckWasTheSame
from ..base import difference_detection_processor
from changedetectionio.html_tools import PERL_STYLE_REGEX, cdata_in_document_to_text, TRANSLATE_WHITESPACE_TABLE
from changedetectionio import html_tools, content_fetchers
@@ -368,12 +369,24 @@ class ChecksumCalculator:
# (set_proxy_from_list)
class perform_site_check(difference_detection_processor):
def run_changedetection(self, watch):
def run_changedetection(self, watch, force_reprocess=False):
changed_detected = False
if not watch:
raise Exception("Watch no longer exists.")
current_raw_document_checksum = self.get_raw_document_checksum()
# Skip processing only if BOTH conditions are true:
# 1. HTML content unchanged (checksum matches last saved checksum)
# 2. Watch configuration was not edited (including trigger_text, filters, etc.)
# The was_edited flag handles all watch configuration changes, so we don't need
# separate checks for trigger_text or other processing rules.
if (not force_reprocess and
not watch.was_edited and
self.last_raw_content_checksum and
self.last_raw_content_checksum == current_raw_document_checksum):
raise checksumFromPreviousCheckWasTheSame()
# Initialize components
filter_config = FilterConfig(watch, self.datastore)
content_processor = ContentProcessor(self.fetcher, watch, filter_config, self.datastore)
@@ -391,9 +404,11 @@ class perform_site_check(difference_detection_processor):
self.screenshot = self.fetcher.screenshot
self.xpath_data = self.fetcher.xpath_data
# Track the content type and checksum before filters
update_obj['content_type'] = ctype_header
update_obj['previous_md5_before_filters'] = hashlib.md5(self.fetcher.content.encode('utf-8')).hexdigest()
# Track the content type (readonly field, doesn't trigger was_edited)
update_obj['content-type'] = ctype_header # Use hyphen (matches OpenAPI spec and watch_base default)
# Save the raw content checksum to file (processor implementation detail, not watch config)
self.update_last_raw_content_checksum(current_raw_document_checksum)
# === CONTENT PREPROCESSING ===
# Avoid creating unnecessary intermediate string copies by reassigning only when needed

View File

@@ -456,6 +456,63 @@ class ChangeDetectionStore(DatastoreUpdatesMixin, FileSavingDataStore):
self.__data['settings']['application']['password'] = False
self.commit()
def clear_all_last_checksums(self):
"""
Delete all last-checksum.txt files to force reprocessing of all watches.
This should be called when global settings change, since watches inherit
configuration and need to reprocess even if their individual watch dict
hasn't been modified.
Note: We delete the checksum file rather than setting was_edited=True because:
- was_edited is not persisted across restarts
- File deletion ensures reprocessing works across app restarts
"""
deleted_count = 0
for uuid in self.__data['watching'].keys():
watch = self.__data['watching'][uuid]
if watch.data_dir:
checksum_file = os.path.join(watch.data_dir, 'last-checksum.txt')
if os.path.isfile(checksum_file):
try:
os.remove(checksum_file)
deleted_count += 1
logger.debug(f"Cleared checksum for watch {uuid}")
except OSError as e:
logger.warning(f"Failed to delete checksum file for {uuid}: {e}")
logger.info(f"Cleared {deleted_count} checksum files to force reprocessing")
return deleted_count
def clear_checksums_for_tag(self, tag_uuid):
"""
Delete last-checksum.txt files for all watches using a specific tag.
This should be called when a tag configuration is edited, since watches
inherit tag settings and need to reprocess.
Args:
tag_uuid: UUID of the tag that was modified
Returns:
int: Number of checksum files deleted
"""
deleted_count = 0
for uuid, watch in self.__data['watching'].items():
if watch.get('tags') and tag_uuid in watch['tags']:
if watch.data_dir:
checksum_file = os.path.join(watch.data_dir, 'last-checksum.txt')
if os.path.isfile(checksum_file):
try:
os.remove(checksum_file)
deleted_count += 1
logger.debug(f"Cleared checksum for watch {uuid} (tag {tag_uuid})")
except OSError as e:
logger.warning(f"Failed to delete checksum file for {uuid}: {e}")
logger.info(f"Cleared {deleted_count} checksum files for tag {tag_uuid}")
return deleted_count
def commit(self):
"""
Save settings immediately to disk using atomic write.

View File

@@ -54,11 +54,11 @@ def test_backup(client, live_server, measure_memory_usage, datastore_path):
backup = ZipFile(io.BytesIO(res.data))
l = backup.namelist()
# Check for UUID-based txt files (history and snapshot)
# Check for UUID-based txt files (history, snapshot, and last-checksum)
uuid4hex_txt = re.compile('^[a-f0-9]{8}-?[a-f0-9]{4}-?4[a-f0-9]{3}-?[89ab][a-f0-9]{3}-?[a-f0-9]{12}.*txt', re.I)
txt_files = list(filter(uuid4hex_txt.match, l))
# Should be two txt files in the archive (history and the snapshot)
assert len(txt_files) == 2
# Should be three txt files in the archive (history, snapshot, and last-checksum)
assert len(txt_files) == 3
# Check for watch.json files (new format)
uuid4hex_json = re.compile('^[a-f0-9]{8}-?[a-f0-9]{4}-?4[a-f0-9]{3}-?[89ab][a-f0-9]{3}-?[a-f0-9]{12}/watch\.json$', re.I)

View File

@@ -71,22 +71,19 @@ def test_include_filters_output():
# Tests the whole stack works with the CSS Filter
def test_check_markup_include_filters_restriction(client, live_server, measure_memory_usage, datastore_path):
sleep_time_for_fetch_thread = 3
include_filters = "#sametext"
set_original_response(datastore_path=datastore_path)
# Give the endpoint time to spin up
time.sleep(1)
# Add our URL to the import page
test_url = url_for('test_endpoint', _external=True)
uuid = client.application.config.get('DATASTORE').add_watch(url=test_url)
client.get(url_for("ui.form_watch_checknow"), follow_redirects=True)
# Give the thread time to pick it up
time.sleep(sleep_time_for_fetch_thread)
wait_for_all_checks(client)
# Goto the edit page, add our ignore text
# Add our URL to the import page
@@ -103,15 +100,15 @@ def test_check_markup_include_filters_restriction(client, live_server, measure_m
)
assert bytes(include_filters.encode('utf-8')) in res.data
# Give the thread time to pick it up
time.sleep(sleep_time_for_fetch_thread)
wait_for_all_checks(client)
# Make a change
set_modified_response(datastore_path=datastore_path)
# Trigger a check
client.get(url_for("ui.form_watch_checknow"), follow_redirects=True)
# Give the thread time to pick it up
time.sleep(sleep_time_for_fetch_thread)
wait_for_all_checks(client)
# It should have 'has-unread-changes' still
# Because it should be looking at only that 'sametext' id

View File

@@ -106,7 +106,7 @@ def test_consistent_history(client, live_server, measure_memory_usage, datastore
# Find the snapshot one
for fname in files_in_watch_dir:
if fname != 'history.txt' and fname != 'watch.json' and 'html' not in fname:
if fname != 'history.txt' and fname != 'watch.json' and fname != 'last-checksum.txt' and 'html' not in fname:
if strtobool(os.getenv("TEST_WITH_BROTLI")):
assert fname.endswith('.br'), "Forced TEST_WITH_BROTLI then it should be a .br filename"
@@ -123,11 +123,18 @@ def test_consistent_history(client, live_server, measure_memory_usage, datastore
assert json_obj['watching'][w]['title'], "Watch should have a title set"
assert contents.startswith(watch_title + "x"), f"Snapshot contents in file {fname} should start with '{watch_title}x', got '{contents}'"
# With new format, we also have watch.json, so 4 files total
# With the new format, we also have watch.json, so 4 files minimum
# Note: last-checksum.txt may or may not exist - it gets cleared by settings changes,
# and this test changes settings before checking files
# This assertion should be AFTER the loop, not inside it
if os.path.exists(changedetection_json):
assert len(files_in_watch_dir) == 4, "Should be four files in the dir with new format: watch.json, html.br snapshot, history.txt and the extracted text snapshot"
# 4 required files: watch.json, html.br, history.txt, extracted text snapshot
# last-checksum.txt is optional (cleared by settings changes in this test)
assert len(files_in_watch_dir) >= 4 and len(files_in_watch_dir) <= 5, f"Should be 4-5 files in the dir with new format (last-checksum.txt is optional). Found {len(files_in_watch_dir)}: {files_in_watch_dir}"
else:
assert len(files_in_watch_dir) == 3, "Should be just three files in the dir with legacy format: html.br snapshot, history.txt and the extracted text snapshot"
# 3 required files: html.br, history.txt, extracted text snapshot
# last-checksum.txt is optional
assert len(files_in_watch_dir) >= 3 and len(files_in_watch_dir) <= 4, f"Should be 3-4 files in the dir with legacy format (last-checksum.txt is optional). Found {len(files_in_watch_dir)}: {files_in_watch_dir}"
# Check that 'default' Watch vars aren't accidentally being saved
if os.path.exists(changedetection_json):

View File

@@ -0,0 +1,208 @@
#!/usr/bin/env python3
"""
Test that changing global settings or tag configurations forces reprocessing.
When settings or tag configurations change, all affected watches need to
reprocess even if their content hasn't changed, because configuration affects
the processing result.
"""
import os
import time
from flask import url_for
from .util import wait_for_all_checks
def test_settings_change_forces_reprocess(client, live_server, measure_memory_usage, datastore_path):
"""
Test that changing global settings clears all checksums to force reprocessing.
"""
# Setup test content
test_html = """<html>
<body>
<p>Test content that stays the same</p>
</body>
</html>
"""
with open(os.path.join(datastore_path, "endpoint-content.txt"), "w") as f:
f.write(test_html)
test_url = url_for('test_endpoint', _external=True)
# Add two watches
datastore = client.application.config.get('DATASTORE')
uuid1 = datastore.add_watch(url=test_url, extras={'title': 'Watch 1'})
uuid2 = datastore.add_watch(url=test_url, extras={'title': 'Watch 2'})
# Unpause watches
datastore.data['watching'][uuid1]['paused'] = False
datastore.data['watching'][uuid2]['paused'] = False
# First check - establishes baseline
client.get(url_for("ui.form_watch_checknow"), follow_redirects=True)
wait_for_all_checks(client)
# Verify checksum files were created
checksum1 = os.path.join(datastore_path, uuid1, 'last-checksum.txt')
checksum2 = os.path.join(datastore_path, uuid2, 'last-checksum.txt')
assert os.path.isfile(checksum1), "First check should create checksum file for watch 1"
assert os.path.isfile(checksum2), "First check should create checksum file for watch 2"
# Change global settings (any setting will do)
res = client.post(
url_for("settings.settings_page"),
data={
"application-empty_pages_are_a_change": "",
"requests-time_between_check-minutes": 180,
'application-fetch_backend': "html_requests"
},
follow_redirects=True
)
assert b"Settings updated." in res.data
# Give it a moment to process
time.sleep(0.5)
# Verify ALL checksum files were deleted
assert not os.path.isfile(checksum1), "Settings change should delete checksum for watch 1"
assert not os.path.isfile(checksum2), "Settings change should delete checksum for watch 2"
# Next check should reprocess (not skip) and recreate checksums
client.get(url_for("ui.form_watch_checknow"), follow_redirects=True)
wait_for_all_checks(client)
# Verify checksum files were recreated
assert os.path.isfile(checksum1), "Reprocessing should recreate checksum file for watch 1"
assert os.path.isfile(checksum2), "Reprocessing should recreate checksum file for watch 2"
print("✓ Settings change forces reprocessing of all watches")
def test_tag_change_forces_reprocess(client, live_server, measure_memory_usage, datastore_path):
"""
Test that changing a tag configuration clears checksums only for watches with that tag.
"""
# Setup test content
test_html = """<html>
<body>
<p>Test content that stays the same</p>
</body>
</html>
"""
with open(os.path.join(datastore_path, "endpoint-content.txt"), "w") as f:
f.write(test_html)
test_url = url_for('test_endpoint', _external=True)
# Create a tag
datastore = client.application.config.get('DATASTORE')
tag_uuid = datastore.add_tag('Test Tag')
# Add watches - one with tag, one without
uuid_with_tag = datastore.add_watch(url=test_url, extras={'title': 'Watch With Tag', 'tags': [tag_uuid]})
uuid_without_tag = datastore.add_watch(url=test_url, extras={'title': 'Watch Without Tag'})
# Unpause watches
datastore.data['watching'][uuid_with_tag]['paused'] = False
datastore.data['watching'][uuid_without_tag]['paused'] = False
# First check - establishes baseline
client.get(url_for("ui.form_watch_checknow"), follow_redirects=True)
wait_for_all_checks(client)
# Verify checksum files were created
checksum_with = os.path.join(datastore_path, uuid_with_tag, 'last-checksum.txt')
checksum_without = os.path.join(datastore_path, uuid_without_tag, 'last-checksum.txt')
assert os.path.isfile(checksum_with), "First check should create checksum for tagged watch"
assert os.path.isfile(checksum_without), "First check should create checksum for untagged watch"
# Edit the tag (change notification_muted as an example)
tag = datastore.data['settings']['application']['tags'][tag_uuid]
res = client.post(
url_for("tags.form_tag_edit_submit", uuid=tag_uuid),
data={
'title': 'Test Tag',
'notification_muted': 'y',
'overrides_watch': 'n'
},
follow_redirects=True
)
assert b"Updated" in res.data
# Give it a moment to process
time.sleep(0.5)
# Verify ONLY the tagged watch's checksum was deleted
assert not os.path.isfile(checksum_with), "Tag change should delete checksum for watch WITH tag"
assert os.path.isfile(checksum_without), "Tag change should NOT delete checksum for watch WITHOUT tag"
# Next check should reprocess tagged watch and recreate its checksum
client.get(url_for("ui.form_watch_checknow"), follow_redirects=True)
wait_for_all_checks(client)
# Verify tagged watch's checksum was recreated
assert os.path.isfile(checksum_with), "Reprocessing should recreate checksum for tagged watch"
assert os.path.isfile(checksum_without), "Untagged watch should still have its checksum"
print("✓ Tag change forces reprocessing only for watches with that tag")
def test_tag_change_via_api_forces_reprocess(client, live_server, measure_memory_usage, datastore_path):
"""
Test that updating a tag via API also clears checksums for affected watches.
"""
# Setup test content
test_html = """<html>
<body>
<p>Test content</p>
</body>
</html>
"""
with open(os.path.join(datastore_path, "endpoint-content.txt"), "w") as f:
f.write(test_html)
test_url = url_for('test_endpoint', _external=True)
# Create a tag
datastore = client.application.config.get('DATASTORE')
tag_uuid = datastore.add_tag('API Test Tag')
# Add watch with tag
uuid_with_tag = datastore.add_watch(url=test_url, extras={'title': 'API Watch'})
datastore.data['watching'][uuid_with_tag]['paused'] = False
datastore.data['watching'][uuid_with_tag]['tags'] = [tag_uuid]
datastore.data['watching'][uuid_with_tag].commit()
# First check
client.get(url_for("ui.form_watch_checknow"), follow_redirects=True)
wait_for_all_checks(client)
# Verify checksum exists
checksum_file = os.path.join(datastore_path, uuid_with_tag, 'last-checksum.txt')
assert os.path.isfile(checksum_file), "First check should create checksum file"
# Update tag via API
res = client.put(
f'/api/v1/tag/{tag_uuid}',
json={'notification_muted': True},
headers={'x-api-key': datastore.data['settings']['application']['api_access_token']}
)
assert res.status_code == 200, f"API call failed with status {res.status_code}: {res.data}"
# Give it more time for async operations
time.sleep(1.0)
# Debug: Check if checksum still exists
if os.path.isfile(checksum_file):
# Read checksum to see if it changed
with open(checksum_file, 'r') as f:
checksum_content = f.read()
print(f"Checksum still exists: {checksum_content}")
# Verify checksum was deleted
assert not os.path.isfile(checksum_file), "API tag update should delete checksum"
print("✓ Tag update via API forces reprocessing")

View File

@@ -0,0 +1,246 @@
#!/usr/bin/env python3
"""
Test the watch edited flag functionality.
This tests the private __watch_was_edited flag that tracks when writable
watch fields are modified, which prevents skipping reprocessing when the
watch configuration has changed.
"""
import os
import time
from flask import url_for
from .util import live_server_setup, wait_for_all_checks
def set_test_content(datastore_path):
"""Write test HTML content to endpoint-content.txt for test server."""
test_html = """<html>
<body>
<p>Test content for watch edited flag tests</p>
<p>This content stays the same across checks</p>
</body>
</html>
"""
with open(os.path.join(datastore_path, "endpoint-content.txt"), "w") as f:
f.write(test_html)
def test_watch_edited_flag_lifecycle(client, live_server, measure_memory_usage, datastore_path):
"""
Test the full lifecycle of the was_edited flag:
1. Flag starts False when watch is created
2. Flag becomes True when writable fields are modified
3. Flag is reset False after worker processing
4. Flag stays False when readonly fields are modified
"""
# Setup - Add a watch
test_url = url_for('test_endpoint', _external=True)
res = client.post(
url_for("ui.ui_views.form_quick_watch_add"),
data={"url": test_url, "tags": "", "edit_and_watch_submit_button": "Edit > Watch"},
follow_redirects=True
)
assert b"Watch added" in res.data or b"Updated watch" in res.data
# Get the watch UUID
datastore = client.application.config.get('DATASTORE')
uuid = list(datastore.data['watching'].keys())[0]
watch = datastore.data['watching'][uuid]
# Reset flag after initial form submission (form sets fields which trigger the flag)
watch.reset_watch_edited_flag()
# Test 1: Flag should be False after reset
assert not watch.was_edited, "Flag should be False after reset"
# Test 2: Modify a writable field (title) - flag should become True
watch['title'] = 'New Title'
assert watch.was_edited, "Flag should be True after modifying writable field 'title'"
# Test 3: Reset flag manually (simulating what worker does)
watch.reset_watch_edited_flag()
assert not watch.was_edited, "Flag should be False after reset"
# Test 4: Modify another writable field (url) - flag should become True again
watch['url'] = 'https://example.com'
assert watch.was_edited, "Flag should be True after modifying writable field 'url'"
# Test 5: Reset and modify a readonly field - flag should stay False
watch.reset_watch_edited_flag()
assert not watch.was_edited, "Flag should be False after reset"
# Modify readonly field (uuid) - should not set flag
old_uuid = watch['uuid']
watch['uuid'] = 'readonly-test-uuid'
assert not watch.was_edited, "Flag should stay False when modifying readonly field 'uuid'"
watch['uuid'] = old_uuid # Restore original
# Note: Worker reset behavior is tested in test_check_removed_line_contains_trigger
# and test_watch_edited_flag_prevents_skip
print("✓ All watch edited flag lifecycle tests passed")
def test_watch_edited_flag_dict_methods(client, live_server, measure_memory_usage, datastore_path):
"""
Test that the flag is set correctly by various dict methods:
- __setitem__ (watch['key'] = value)
- update() (watch.update({'key': value}))
- setdefault() (watch.setdefault('key', default))
- pop() (watch.pop('key'))
- __delitem__ (del watch['key'])
"""
# Setup - Add a watch
test_url = url_for('test_endpoint', _external=True)
res = client.post(
url_for("ui.ui_views.form_quick_watch_add"),
data={"url": test_url, "tags": "", "edit_and_watch_submit_button": "Edit > Watch"},
follow_redirects=True
)
datastore = client.application.config.get('DATASTORE')
uuid = list(datastore.data['watching'].keys())[0]
watch = datastore.data['watching'][uuid]
# Test __setitem__
watch.reset_watch_edited_flag()
watch['title'] = 'Test via setitem'
assert watch.was_edited, "Flag should be True after __setitem__ on writable field"
# Test update() with dict
watch.reset_watch_edited_flag()
watch.update({'title': 'Test via update dict'})
assert watch.was_edited, "Flag should be True after update() with writable field"
# Test update() with kwargs
watch.reset_watch_edited_flag()
watch.update(title='Test via update kwargs')
assert watch.was_edited, "Flag should be True after update() kwargs with writable field"
# Test setdefault() on new key
watch.reset_watch_edited_flag()
watch.setdefault('title', 'Should not be set') # Key exists, no change
assert not watch.was_edited, "Flag should stay False when setdefault() doesn't change existing key"
watch.setdefault('custom_field', 'New value') # New key
assert watch.was_edited, "Flag should be True after setdefault() creates new writable field"
# Test pop() on writable field
watch.reset_watch_edited_flag()
watch.pop('custom_field', None)
assert watch.was_edited, "Flag should be True after pop() on writable field"
# Test __delitem__ on writable field
watch.reset_watch_edited_flag()
watch['temp_field'] = 'temp'
watch.reset_watch_edited_flag() # Reset after adding
del watch['temp_field']
assert watch.was_edited, "Flag should be True after __delitem__ on writable field"
print("✓ All dict methods correctly set the flag")
def test_watch_edited_flag_prevents_skip(client, live_server, measure_memory_usage, datastore_path):
"""
Test that the was_edited flag prevents skipping reprocessing.
When watch configuration is edited, it should reprocess even if content unchanged.
After worker processing, flag should be reset and subsequent checks can skip.
"""
# Setup test content
set_test_content(datastore_path)
# Setup - Add a watch
test_url = url_for('test_endpoint', _external=True)
res = client.post(
url_for("ui.ui_views.form_quick_watch_add"),
data={"url": test_url, "tags": "", "edit_and_watch_submit_button": "Edit > Watch"},
follow_redirects=True
)
assert b"Watch added" in res.data or b"Updated watch" in res.data
datastore = client.application.config.get('DATASTORE')
uuid = list(datastore.data['watching'].keys())[0]
watch = datastore.data['watching'][uuid]
# Unpause the watch (watches are paused by default in tests)
watch['paused'] = False
# Run first check to establish baseline
client.get(url_for("ui.form_watch_checknow"), follow_redirects=True)
wait_for_all_checks(client)
# Verify first check completed successfully - checksum file should exist
checksum_file = os.path.join(datastore_path, uuid, 'last-checksum.txt')
assert os.path.isfile(checksum_file), "First check should create last-checksum.txt file"
# Reset the was_edited flag (simulating clean state after processing)
watch.reset_watch_edited_flag()
assert not watch.was_edited, "Flag should be False after reset"
# Run second check without any changes - should skip via checksumFromPreviousCheckWasTheSame
client.get(url_for("ui.form_watch_checknow"), follow_redirects=True)
wait_for_all_checks(client)
# Verify it was skipped (last_check_status should indicate skip)
# Note: The actual skip is tested in test_check_removed_line_contains_trigger
# Here we're focused on the was_edited flag interaction
# Now modify the watch - flag should become True
watch['title'] = 'Modified Title'
assert watch.was_edited, "Flag should be True after modifying watch"
# Run third check - should NOT skip because was_edited=True even though content unchanged
client.get(url_for("ui.form_watch_checknow"), follow_redirects=True)
wait_for_all_checks(client)
# After worker processing, the flag should be reset by the worker
# This reset happens in the processor's run() method after processing completes
assert not watch.was_edited, "Flag should be False after worker processing"
print("✓ was_edited flag correctly prevents skip and is reset by worker")
def test_watch_edited_flag_system_fields(client, live_server, measure_memory_usage, datastore_path):
"""
Test that system fields (readonly + additional system fields) don't trigger the flag.
"""
# Setup - Add a watch
test_url = url_for('test_endpoint', _external=True)
res = client.post(
url_for("ui.ui_views.form_quick_watch_add"),
data={"url": test_url, "tags": "", "edit_and_watch_submit_button": "Edit > Watch"},
follow_redirects=True
)
datastore = client.application.config.get('DATASTORE')
uuid = list(datastore.data['watching'].keys())[0]
watch = datastore.data['watching'][uuid]
# Test readonly fields from OpenAPI spec
readonly_fields = ['uuid', 'date_created', 'last_viewed']
for field in readonly_fields:
watch.reset_watch_edited_flag()
if field in watch:
old_value = watch[field]
watch[field] = 'modified-readonly-value'
assert not watch.was_edited, f"Flag should stay False when modifying readonly field '{field}'"
watch[field] = old_value # Restore
# Test additional system fields not in OpenAPI spec yet
system_fields = ['last_check_status']
for field in system_fields:
watch.reset_watch_edited_flag()
watch[field] = 'system-value'
assert not watch.was_edited, f"Flag should stay False when modifying system field '{field}'"
# Test that content-type (readonly per OpenAPI) doesn't trigger flag
watch.reset_watch_edited_flag()
watch['content-type'] = 'text/html'
assert not watch.was_edited, "Flag should stay False when modifying 'content-type' (readonly)"
print("✓ System fields correctly don't trigger the flag")

View File

@@ -276,6 +276,9 @@ async def async_update_worker(worker_id, q, notification_q, app, datastore, exec
# Yes fine, so nothing todo, don't continue to process.
process_changedetection_results = False
changed_detected = False
logger.debug(f'[{uuid}] - checksumFromPreviousCheckWasTheSame - Checksum from previous check was the same, nothing to do here.')
# Reset the edited flag since we successfully completed the check
watch.reset_watch_edited_flag()
except content_fetchers_exceptions.BrowserConnectError as e:
datastore.update_watch(uuid=uuid,
@@ -378,7 +381,7 @@ async def async_update_worker(worker_id, q, notification_q, app, datastore, exec
if not datastore.data['watching'].get(uuid):
continue
update_obj['content-type'] = update_handler.fetcher.get_all_headers().get('content-type', '').lower()
update_obj['content-type'] = str(update_handler.fetcher.get_all_headers().get('content-type', '') or "").lower()
if not watch.get('ignore_status_codes'):
update_obj['consecutive_filter_failures'] = 0
@@ -392,6 +395,8 @@ async def async_update_worker(worker_id, q, notification_q, app, datastore, exec
logger.debug(f"Processing watch UUID: {uuid} - xpath_data length returned {len(update_handler.xpath_data) if update_handler and update_handler.xpath_data else 'empty.'}")
if update_handler and process_changedetection_results:
try:
# Reset the edited flag BEFORE update_watch (which calls watch.update() and would set it again)
watch.reset_watch_edited_flag()
datastore.update_watch(uuid=uuid, update_obj=update_obj)
if changed_detected or not watch.history_n:
@@ -439,8 +444,22 @@ async def async_update_worker(worker_id, q, notification_q, app, datastore, exec
logger.exception(f"Worker {worker_id} full exception details:")
datastore.update_watch(uuid=uuid, update_obj={'last_error': str(e)})
# Always record attempt count
count = watch.get('check_count', 0) + 1
final_updates = {'fetch_time': round(time.time() - fetch_start_time, 3),
'check_count': count,
}
# Record server header
try:
server_header = str(update_handler.fetcher.get_all_headers().get('server', '') or "").strip().lower()[:255]
if server_header:
final_updates['remote_server_reply'] = server_header
except Exception:
server_header = None
if update_handler: # Could be none or empty if the processor was not found
# Always record page title (used in notifications, and can change even when the content is the same)
if update_obj.get('content-type') and 'html' in update_obj.get('content-type'):
@@ -449,17 +468,12 @@ async def async_update_worker(worker_id, q, notification_q, app, datastore, exec
if page_title:
page_title = page_title.strip()[:2000]
logger.debug(f"UUID: {uuid} Page <title> is '{page_title}'")
datastore.update_watch(uuid=uuid, update_obj={'page_title': page_title})
final_updates['page_title'] = page_title
except Exception as e:
logger.exception(f"Worker {worker_id} full exception details:")
logger.warning(f"UUID: {uuid} Exception when extracting <title> - {str(e)}")
# Record server header
try:
server_header = update_handler.fetcher.headers.get('server', '').strip().lower()[:255]
datastore.update_watch(uuid=uuid, update_obj={'remote_server_reply': server_header})
except Exception as e:
pass
# Store favicon if necessary
if update_handler.fetcher.favicon_blob and update_handler.fetcher.favicon_blob.get('base64'):
@@ -467,14 +481,12 @@ async def async_update_worker(worker_id, q, notification_q, app, datastore, exec
favicon_base_64=update_handler.fetcher.favicon_blob.get('base64')
)
datastore.update_watch(uuid=uuid, update_obj={'fetch_time': round(time.time() - fetch_start_time, 3),
'check_count': count})
datastore.update_watch(uuid=uuid, update_obj=final_updates)
# NOW clear fetcher content - after all processing is complete
# This is the last point where we need the fetcher data
if update_handler and hasattr(update_handler, 'fetcher') and update_handler.fetcher:
update_handler.fetcher.clear_content()
logger.debug(f"Cleared fetcher content for UUID {uuid}")
# Explicitly delete update_handler to free all references
if update_handler: