Puppeteer/Playwright - Stop elements from resizing during screenshot capture and xpath element mapping, which could lead to incorrect xpath/element-selection info

2025-12-20 23:16:44 +00:00 · 2025-12-18 17:37:12 +01:00
parent 377955eedb
commit ddeba6e4cc
9 changed files with 392 additions and 8 deletions
--- a/changedetectionio/api/Watch.py
+++ b/changedetectionio/api/Watch.py
@@ -127,7 +127,60 @@ class Watch(Resource):
        if request.json.get('url') and not is_safe_valid_url(request.json.get('url')):
            return "Invalid URL", 400

-        watch.update(request.json)
+        # Handle processor-config-* fields separately (save to JSON, not datastore)
+        from changedetectionio import processors
+        processor_config_data = {}
+        regular_data = {}
+
+        for key, value in request.json.items():
+            if key.startswith('processor_config_'):
+                config_key = key.replace('processor_config_', '')
+                if value:  # Only save non-empty values
+                    processor_config_data[config_key] = value
+            else:
+                regular_data[key] = value
+
+        # Update watch with regular (non-processor-config) fields
+        watch.update(regular_data)
+
+        # Save processor config to JSON file if any config data exists
+        if processor_config_data:
+            try:
+                processor_name = request.json.get('processor', watch.get('processor'))
+                if processor_name:
+                    # Create a processor instance to access config methods
+                    from changedetectionio.processors import difference_detection_processor
+                    processor_instance = difference_detection_processor(self.datastore, uuid)
+                    # Use processor name as filename so each processor keeps its own config
+                    config_filename = f'{processor_name}.json'
+                    processor_instance.update_extra_watch_config(config_filename, processor_config_data)
+                    logger.debug(f"API: Saved processor config to {config_filename}: {processor_config_data}")
+
+                    # Call optional edit_hook if processor has one
+                    try:
+                        import importlib
+                        edit_hook_module_name = f'changedetectionio.processors.{processor_name}.edit_hook'
+
+                        try:
+                            edit_hook = importlib.import_module(edit_hook_module_name)
+                            logger.debug(f"API: Found edit_hook module for {processor_name}")
+
+                            if hasattr(edit_hook, 'on_config_save'):
+                                logger.info(f"API: Calling edit_hook.on_config_save for {processor_name}")
+                                # Call hook and get updated config
+                                updated_config = edit_hook.on_config_save(watch, processor_config_data, self.datastore)
+                                # Save updated config back to file
+                                processor_instance.update_extra_watch_config(config_filename, updated_config)
+                                logger.info(f"API: Edit hook updated config: {updated_config}")
+                            else:
+                                logger.debug(f"API: Edit hook module found but no on_config_save function")
+                        except ModuleNotFoundError:
+                            logger.debug(f"API: No edit_hook module for processor {processor_name} (this is normal)")
+                    except Exception as hook_error:
+                        logger.error(f"API: Edit hook error (non-fatal): {hook_error}", exc_info=True)
+
+            except Exception as e:
+                logger.error(f"API: Failed to save processor config: {e}")

        return "OK", 200

--- a/changedetectionio/blueprint/ui/edit.py
+++ b/changedetectionio/blueprint/ui/edit.py
@@ -168,6 +168,32 @@ def construct_blueprint(datastore: ChangeDetectionStore, update_q, queuedWatchMe
                    config_filename = f'{processor_name}.json'
                    processor_instance.update_extra_watch_config(config_filename, processor_config_data)
                    logger.debug(f"Saved processor config to {config_filename}: {processor_config_data}")
+
+                    # Call optional edit_hook if processor has one
+                    try:
+                        # Try to import the edit_hook module from the processor package
+                        import importlib
+                        edit_hook_module_name = f'changedetectionio.processors.{processor_name}.edit_hook'
+
+                        try:
+                            edit_hook = importlib.import_module(edit_hook_module_name)
+                            logger.debug(f"Found edit_hook module for {processor_name}")
+
+                            if hasattr(edit_hook, 'on_config_save'):
+                                logger.info(f"Calling edit_hook.on_config_save for {processor_name}")
+                                watch_obj = datastore.data['watching'][uuid]
+                                # Call hook and get updated config
+                                updated_config = edit_hook.on_config_save(watch_obj, processor_config_data, datastore)
+                                # Save updated config back to file
+                                processor_instance.update_extra_watch_config(config_filename, updated_config)
+                                logger.info(f"Edit hook updated config: {updated_config}")
+                            else:
+                                logger.debug(f"Edit hook module found but no on_config_save function")
+                        except ModuleNotFoundError:
+                            logger.debug(f"No edit_hook module for processor {processor_name} (this is normal)")
+                    except Exception as hook_error:
+                        logger.error(f"Edit hook error (non-fatal): {hook_error}", exc_info=True)
+
                except Exception as e:
                    logger.error(f"Failed to save processor config: {e}")

--- a/changedetectionio/content_fetchers/playwright.py
+++ b/changedetectionio/content_fetchers/playwright.py
@@ -28,6 +28,16 @@ async def capture_full_page_async(page, screenshot_format='JPEG'):
    y = 0

    if page_height > page.viewport_size['height']:
+
+        # Lock all element dimensions BEFORE screenshot to prevent CSS media queries from resizing
+        # capture_full_page_async() changes viewport height which triggers @media (min-height) rules
+        lock_elements_js_path = os.path.join(os.path.dirname(__file__), 'res', 'lock-elements-sizing.js')
+        with open(lock_elements_js_path, 'r') as f:
+            lock_elements_js = f.read()
+        await page.evaluate(lock_elements_js)
+
+        logger.debug("Element dimensions locked before screenshot capture")
+
        if page_height < step_size:
            step_size = page_height # Incase page is bigger than default viewport but smaller than proposed step size
        logger.debug(f"Setting bigger viewport to step through large page width W{page.viewport_size['width']}xH{step_size} because page_height > viewport_size")
--- a/changedetectionio/content_fetchers/puppeteer.py
+++ b/changedetectionio/content_fetchers/puppeteer.py
@@ -50,6 +50,14 @@ async def capture_full_page(page, screenshot_format='JPEG'):
    screenshot_chunks = []
    y = 0
    if page_height > page.viewport['height']:
+        # Lock all element dimensions BEFORE screenshot to prevent CSS media queries from resizing
+        # capture_full_page() changes viewport height which triggers @media (min-height) rules
+        lock_elements_js_path = os.path.join(os.path.dirname(__file__), 'res', 'lock-elements-sizing.js')
+        with open(lock_elements_js_path, 'r') as f:
+            lock_elements_js = f.read()
+        await page.evaluate(lock_elements_js)
+        logger.debug("Element dimensions locked before screenshot capture")
+
        if page_height < step_size:
            step_size = page_height # Incase page is bigger than default viewport but smaller than proposed step size
        await page.setViewport({'width': page.viewport['width'], 'height': step_size})
@@ -222,7 +230,6 @@ class fetcher(Fetcher):
                    "height": int(match.group(2))
                })
                logger.debug(f"Puppeteer viewport size {self.page.viewport}")
-
        try:
            from pyppeteerstealth import inject_evasions_into_page
        except ImportError:
@@ -354,6 +361,11 @@ class fetcher(Fetcher):
            await self.page.evaluate(f"var include_filters=''")

        MAX_TOTAL_HEIGHT = int(os.getenv("SCREENSHOT_MAX_HEIGHT", SCREENSHOT_MAX_HEIGHT_DEFAULT))
+
+        self.content = await self.page.content
+
+        # Now take screenshot (scrolling may trigger layout changes, but measurements are already captured)
+        self.screenshot = await capture_full_page(page=self.page, screenshot_format=self.screenshot_format)
        self.xpath_data = await self.page.evaluate(XPATH_ELEMENT_JS, {
            "visualselector_xpath_selectors": visualselector_xpath_selectors,
            "max_height": MAX_TOTAL_HEIGHT
@@ -361,12 +373,9 @@ class fetcher(Fetcher):
        if not self.xpath_data:
            raise Exception(f"Content Fetcher > xPath scraper failed. Please report this URL so we can fix it :)")

+
        self.instock_data = await self.page.evaluate(INSTOCK_DATA_JS)

-        self.content = await self.page.content
-
-        self.screenshot = await capture_full_page(page=self.page, screenshot_format=self.screenshot_format)
-
        # It's good to log here in the case that the browser crashes on shutting down but we still get the data we need
        logger.success(f"Fetching '{url}' complete, closing page")
        await self.page.close()
--- a/changedetectionio/content_fetchers/res/lock-elements-sizing.js
+++ b/changedetectionio/content_fetchers/res/lock-elements-sizing.js
@@ -0,0 +1,79 @@
+/**
+ * Lock Element Dimensions for Screenshot Capture
+ *
+ * THE PROBLEM:
+ * When taking full-page screenshots of tall pages, Chrome/Puppeteer/Playwright need to:
+ * 1. Temporarily change the viewport height to a large value (e.g., 800px → 3809px)
+ * 2. Take screenshots in chunks while scrolling
+ * 3. Stitch the chunks together
+ *
+ * However, changing the viewport height triggers CSS media queries like:
+ *   @media (min-height: 860px) { .ad { height: 250px; } }
+ *
+ * This causes elements (especially ads) to resize during screenshot capture, creating a mismatch:
+ * - Screenshot shows element at NEW size (after media query triggered)
+ * - xpath element coordinates measured at OLD size (before viewport change)
+ * - Visual selector overlays don't align with screenshot
+ *
+ * EXAMPLE BUG:
+ * - Initial viewport: 1280x800, ad height: 138px, article position: 279px ✓
+ * - Viewport changes to 1280x3809 for screenshot
+ * - Media query triggers: ad expands to 250px
+ * - All content below shifts down by 112px (250-138)
+ * - Article now at position: 391px (279+112)
+ * - But xpath data says 279px → 112px mismatch! ✗
+ *
+ * THE SOLUTION:
+ * Before changing viewport, lock ALL element dimensions with !important inline styles.
+ * Inline styles with !important override media query CSS, preventing layout changes.
+ *
+ * WHAT THIS SCRIPT DOES:
+ * 1. Iterates through every element on the page
+ * 2. Captures current computed dimensions (width, height)
+ * 3. Sets inline styles with !important to freeze those dimensions
+ * 4. Replaces window.ResizeObserver with a no-op class so observers created afterwards never fire (observers that already exist are not affected)
+ * 5. When viewport changes for screenshot, media queries can't resize anything
+ * 6. Layout remains consistent → xpath coordinates match screenshot ✓
+ *
+ * USAGE:
+ * Execute this script BEFORE calling capture_full_page() / screenshot functions.
+ * The page must be fully loaded and settled at its initial viewport size.
+ * No need to restore state afterward - page is closed after screenshot.
+ *
+ * PERFORMANCE:
+ * - Iterates all DOM elements (can be 1000s on complex pages)
+ * - Typically completes in 50-200ms
+ * - One-time cost before screenshot, well worth it for coordinate accuracy
+ *
+ * @see https://github.com/dgtlmoon/changedetection.io/issues/XXXX
+ */
+
+(() => {
+    // Pass 1 (reads only): record every element's current size before any style is written.
+    // Interleaving reads and writes would force a layout recalculation for each element, and
+    // later elements would be measured after earlier ones had already been modified.
+    const measured = [];
+    document.querySelectorAll('*').forEach(el => {
+        if (!el.style) {
+            return; // No inline style object on this element - nothing can be locked
+        }
+        const computed = window.getComputedStyle(el);
+        const rect = el.getBoundingClientRect();
+        measured.push({
+            el,
+            // Only elements that actually occupy space get locked on that axis
+            height: rect.height > 0 ? computed.height : null,
+            width: rect.width > 0 ? computed.width : null,
+        });
+    });
+
+    // Pass 2 (writes only): freeze the recorded sizes with !important to override media queries
+    measured.forEach(({ el, height, width }) => {
+        if (height !== null) {
+            el.style.setProperty('height', height, 'important');
+            el.style.setProperty('min-height', height, 'important');
+            el.style.setProperty('max-height', height, 'important');
+        }
+        if (width !== null) {
+            el.style.setProperty('width', width, 'important');
+            el.style.setProperty('min-width', width, 'important');
+            el.style.setProperty('max-width', width, 'important');
+        }
+    });
+
+    // Also replace ResizeObserver with a no-op for JS-based resizing.
+    // This only affects observers constructed after this point.
+    window.ResizeObserver = class {
+        constructor() {}
+        observe() {}
+        unobserve() {}
+        disconnect() {}
+    };
+
+    console.log('✓ Element dimensions locked to prevent media query changes during screenshot');
+})();
--- a/changedetectionio/model/Watch.py
+++ b/changedetectionio/model/Watch.py
@@ -96,8 +96,17 @@ class model(watch_base):
    def clear_watch(self):
        import pathlib

+        # Get list of processor config files to preserve
+        from changedetectionio.processors import find_processors
+        processor_names = [name for cls, name in find_processors()]
+        processor_config_files = {f"{name}.json" for name in processor_names}
+
        # JSON Data, Screenshots, Textfiles (history index and snapshots), HTML in the future etc
+        # But preserve processor config files (they're configuration, not history data)
        for item in pathlib.Path(str(self.watch_data_dir)).rglob("*.*"):
+            # Skip processor config files
+            if item.name in processor_config_files:
+                continue
            os.unlink(item)

        # Force the attr to recalculate
--- a/changedetectionio/processors/init.py
+++ b/changedetectionio/processors/init.py
@@ -328,6 +328,26 @@ def get_custom_watch_obj_for_processor(processor_name):
    return watch_class


+def find_processor_module(processor_name):
+    """
+    Resolve a processor's machine name to the module that provides it.
+
+    Args:
+        processor_name: Processor machine name (e.g., 'image_ssim_diff')
+
+    Returns:
+        module: The result of get_parent_module() for the first entry from
+            find_processors() whose name matches, or None if nothing matches
+    """
+    for entry in find_processors():
+        # Index 1 of each entry holds the processor name; index 0 is what
+        # get_parent_module() receives
+        if entry[1] == processor_name:
+            # Hand back the parent module (the package containing processor.py)
+            return get_parent_module(entry[0])
+
+    return None
+
+
 def available_processors():
    """
    Get a list of processors by name and description for the UI elements.
--- a/changedetectionio/processors/image_ssim_diff/init.py
+++ b/changedetectionio/processors/image_ssim_diff/init.py
@@ -16,3 +16,6 @@ processor_weight = 2  # Lower weight = appears at top, heavier weight = appears
 DEFAULT_COMPARISON_METHOD = os.getenv('COMPARISON_METHOD', 'opencv')
 DEFAULT_COMPARISON_THRESHOLD_OPENCV = float(os.getenv('COMPARISON_THRESHOLD_OPENCV', '30'))
 DEFAULT_COMPARISON_THRESHOLD_PIXELMATCH = float(os.getenv('COMPARISON_THRESHOLD_PIXELMATCH', '10'))
+
+# Template tracking filename
+CROPPED_IMAGE_TEMPLATE_FILENAME = 'cropped_image_template.png'
--- a/changedetectionio/processors/image_ssim_diff/processor.py
+++ b/changedetectionio/processors/image_ssim_diff/processor.py
@@ -10,9 +10,10 @@ import hashlib
 import os
 import time
 from loguru import logger
+from changedetectionio import strtobool
 from changedetectionio.processors import difference_detection_processor, SCREENSHOT_FORMAT_PNG
 from changedetectionio.processors.exceptions import ProcessorException
-from . import DEFAULT_COMPARISON_METHOD, DEFAULT_COMPARISON_THRESHOLD_OPENCV, DEFAULT_COMPARISON_THRESHOLD_PIXELMATCH
+from . import DEFAULT_COMPARISON_METHOD, DEFAULT_COMPARISON_THRESHOLD_OPENCV, DEFAULT_COMPARISON_THRESHOLD_PIXELMATCH, CROPPED_IMAGE_TEMPLATE_FILENAME

 name = 'Visual/Image screenshot change detection'
 description = 'Compares screenshots using fast algorithms (OpenCV or pixelmatch), 10-100x faster than SSIM'
@@ -90,9 +91,12 @@ class perform_site_check(difference_detection_processor):
        # Automatically use the processor name from watch config as filename
        processor_name = watch.get('processor', 'default')
        config_filename = f'{processor_name}.json'
-        processor_config = self.get_extra_watch_config(config_filename)
+        processor_config = self.get_extra_watch_config(config_filename) if self.get_extra_watch_config(config_filename) else {}
        bounding_box = processor_config.get('bounding_box') if processor_config else None

+        # Template matching for tracking content movement
+        template_matching_enabled = processor_config.get('auto_track_region', False)
+
        if bounding_box:
            try:
                # Parse bounding box: "x,y,width,height"
@@ -154,6 +158,8 @@ class perform_site_check(difference_detection_processor):

        # Crop the current image if region was found (for comparison only, keep full screenshot for history)
        cropped_current_img = None
+        original_crop_region = crop_region  # Store original for template matching
+
        if crop_region:
            try:
                cropped_current_img = current_img.crop(crop_region)
@@ -194,6 +200,42 @@ class perform_site_check(difference_detection_processor):

            previous_img = Image.open(io.BytesIO(previous_screenshot_bytes))

+            # Template matching: If enabled, search for content that may have moved
+            # Check if feature is globally enabled via ENV var
+            feature_enabled = strtobool(os.getenv('ENABLE_TEMPLATE_TRACKING', 'True'))
+            # Check if auto-tracking is enabled for this specific watch (determined by feature analysis)
+            auto_track_enabled = template_matching_enabled
+
+            if feature_enabled and auto_track_enabled and original_crop_region:
+                try:
+                    # Check if template exists, if not regenerate from previous snapshot
+                    template_path = os.path.join(watch.watch_data_dir, CROPPED_IMAGE_TEMPLATE_FILENAME)
+                    if not os.path.isfile(template_path):
+                        logger.info("Template file missing, regenerating from previous snapshot")
+                        self._regenerate_template_from_snapshot(
+                            previous_img, watch, original_crop_region
+                        )
+
+                    logger.debug("Template matching enabled - searching for region movement")
+                    new_crop_region = self._find_region_with_template_matching(
+                        current_img, watch, original_crop_region, search_tolerance=0.2
+                    )
+
+                    if new_crop_region:
+                        old_region = original_crop_region
+                        crop_region = new_crop_region
+                        logger.info(f"Template matching: Region moved from {old_region} to {new_crop_region}")
+
+                        # Update cropped image with new region
+                        if cropped_current_img:
+                            cropped_current_img.close()
+                        cropped_current_img = current_img.crop(crop_region)
+                    else:
+                        logger.warning("Template matching: Could not find region, using original position")
+
+                except Exception as e:
+                    logger.warning(f"Template matching error (continuing with original position): {e}")
+
            # Crop previous image to the same region if cropping is enabled
            cropped_previous_img = None
            if crop_region:
@@ -423,3 +465,136 @@ class perform_site_check(difference_detection_processor):
            del alpha

        return changed_detected, change_percentage
+
+    def _regenerate_template_from_snapshot(self, snapshot_img, watch, bbox):
+        """
+        Regenerate template file from a snapshot (typically after 'clear data').
+
+        When user clears watch data, the template file is deleted but config remains.
+        This extracts the region from the previous/baseline snapshot and saves it
+        as the template so tracking can continue.
+
+        This is best-effort: any failure is logged and swallowed, nothing is raised.
+
+        Args:
+            snapshot_img: PIL Image to extract template from (usually previous_img)
+            watch: Watch object (to access data directory)
+            bbox: (left, top, right, bottom) bounding box coordinates
+        """
+        try:
+            left, top, right, bottom = bbox
+            width = right - left
+            height = bottom - top
+
+            # Ensure watch data directory exists
+            watch.ensure_data_dir_exists()
+            template_path = os.path.join(watch.watch_data_dir, CROPPED_IMAGE_TEMPLATE_FILENAME)
+
+            # Crop the template region
+            template = snapshot_img.crop(bbox)
+            try:
+                # Save as PNG (lossless, no compression artifacts)
+                template.save(template_path, format='PNG', optimize=True)
+            finally:
+                # Release the cropped image even when saving fails
+                template.close()
+
+            logger.info(f"Regenerated template: {template_path} ({width}x{height}px)")
+
+        except Exception as e:
+            logger.error(f"Failed to regenerate template: {e}")
+
+    def _find_region_with_template_matching(self, current_img, watch, original_bbox, search_tolerance=0.2, min_confidence=0.8):
+        """
+        Use OpenCV template matching to find where content moved on the page.
+
+        This handles cases where page layout shifts push content to different
+        pixel coordinates, but the visual content remains the same.
+
+        The saved template (CROPPED_IMAGE_TEMPLATE_FILENAME in the watch data
+        directory) is searched for inside a window around ``original_bbox`` that
+        is enlarged by ``search_tolerance`` on each side and clamped to the image.
+
+        Args:
+            current_img: PIL Image of current screenshot
+            watch: Watch object (to access template file)
+            original_bbox: (left, top, right, bottom) tuple of original region
+            search_tolerance: How far to search (0.2 = ±20% of region size)
+            min_confidence: Minimum TM_CCOEFF_NORMED score required to accept a match
+
+        Returns:
+            tuple: New (left, top, right, bottom) region, or None if the template
+            file is missing, the template does not fit inside the search window,
+            the best score is below ``min_confidence``, or matching fails
+            (failures are logged, not raised)
+
+        Raises:
+            ImportError: If cv2 or numpy cannot be imported (imports happen
+                outside the guarded section)
+        """
+        import cv2
+        import numpy as np
+
+        try:
+            # Load template from watch data directory
+            template_path = os.path.join(watch.watch_data_dir, CROPPED_IMAGE_TEMPLATE_FILENAME)
+
+            if not os.path.isfile(template_path):
+                logger.warning(f"Template file not found: {template_path}")
+                return None
+            from PIL import Image
+
+            # Copy the pixels out inside a context manager so the template file
+            # is always closed, including when a later step raises
+            with Image.open(template_path) as template_img:
+                template_array = np.array(template_img)
+
+            # Convert current screenshot to a numpy array for OpenCV
+            current_array = np.array(current_img)
+
+            # Convert to grayscale for matching
+            if len(current_array.shape) == 3:
+                current_gray = cv2.cvtColor(current_array, cv2.COLOR_RGB2GRAY)
+            else:
+                current_gray = current_array
+
+            if len(template_array.shape) == 3:
+                template_gray = cv2.cvtColor(template_array, cv2.COLOR_RGB2GRAY)
+            else:
+                template_gray = template_array
+
+            # Calculate search region
+            left, top, right, bottom = original_bbox
+            width = right - left
+            height = bottom - top
+
+            margin_x = int(width * search_tolerance)
+            margin_y = int(height * search_tolerance)
+
+            # Expand search area, clamped to the screenshot bounds
+            search_left = max(0, left - margin_x)
+            search_top = max(0, top - margin_y)
+            search_right = min(current_img.width, right + margin_x)
+            search_bottom = min(current_img.height, bottom + margin_y)
+
+            # Extract search region
+            search_region = current_gray[search_top:search_bottom, search_left:search_right]
+
+            logger.debug(f"Searching for template in region: ({search_left}, {search_top}) to ({search_right}, {search_bottom})")
+
+            # matchTemplate needs the searched image to be at least as large as the
+            # template; clamping (or a resized screenshot) can make the window smaller
+            template_h, template_w = template_gray.shape[:2]
+            region_h, region_w = search_region.shape[:2]
+            if region_h < template_h or region_w < template_w:
+                logger.warning(
+                    f"Template ({template_w}x{template_h}) does not fit in search region "
+                    f"({region_w}x{region_h}), skipping template matching"
+                )
+                return None
+
+            # Perform template matching
+            result = cv2.matchTemplate(search_region, template_gray, cv2.TM_CCOEFF_NORMED)
+            min_val, max_val, min_loc, max_loc = cv2.minMaxLoc(result)
+
+            logger.debug(f"Template matching confidence: {max_val:.2%}")
+
+            # Reject weak matches (80% confidence threshold by default)
+            if max_val < min_confidence:
+                logger.warning(f"Template match confidence too low: {max_val:.2%} (need {min_confidence:.0%})")
+                return None
+
+            # Calculate new bounding box in original image coordinates
+            match_x = search_left + max_loc[0]
+            match_y = search_top + max_loc[1]
+
+            new_bbox = (match_x, match_y, match_x + width, match_y + height)
+
+            # Calculate movement distance
+            move_x = abs(match_x - left)
+            move_y = abs(match_y - top)
+
+            logger.info(f"Template found at ({match_x}, {match_y}), "
+                       f"moved {move_x}px horizontally, {move_y}px vertically, "
+                       f"confidence: {max_val:.2%}")
+
+            return new_bbox
+
+        except Exception as e:
+            logger.error(f"Template matching error: {e}")
+            return None