Compare commits

...

1 Commits

4 changed files with 144 additions and 60 deletions

View File

@@ -71,10 +71,19 @@ class Fetcher():
supports_screenshots = False # Can capture page screenshots
supports_xpath_element_data = False # Can extract xpath element positions/data for visual selector
# Screenshot element locking - prevents layout shifts during screenshot capture
# Only needed for visual comparison (image_ssim_diff processor)
# Locks element dimensions in the first viewport to prevent headers/ads from resizing
lock_viewport_elements = False # Default: disabled for performance
# Constructor: applies optional per-instance overrides passed as keyword arguments.
# Keys read here (both optional):
#   screenshot_format      -> stored on self.screenshot_format
#   lock_viewport_elements -> stored on self.lock_viewport_elements, overriding the
#                             class-level default (False) for this instance
# A key that is absent leaves the corresponding attribute untouched; no other keys
# are read by this method.
def __init__(self, **kwargs):
if kwargs and 'screenshot_format' in kwargs:
self.screenshot_format = kwargs.get('screenshot_format')
# Allow lock_viewport_elements to be set via kwargs
if kwargs and 'lock_viewport_elements' in kwargs:
self.lock_viewport_elements = kwargs.get('lock_viewport_elements')
@classmethod
def get_status_icon_data(cls):

View File

@@ -10,18 +10,21 @@ from changedetectionio.content_fetchers import SCREENSHOT_MAX_HEIGHT_DEFAULT, vi
from changedetectionio.content_fetchers.base import Fetcher, manage_user_agent
from changedetectionio.content_fetchers.exceptions import PageUnloadable, Non200ErrorCodeReceived, EmptyReply, ScreenshotUnavailable
async def capture_full_page_async(page, screenshot_format='JPEG'):
async def capture_full_page_async(page, screenshot_format='JPEG', watch_uuid=None, lock_viewport_elements=False):
import os
import time
import multiprocessing
start = time.time()
watch_info = f"[{watch_uuid}] " if watch_uuid else ""
setup_start = time.time()
page_height = await page.evaluate("document.documentElement.scrollHeight")
page_width = await page.evaluate("document.documentElement.scrollWidth")
original_viewport = page.viewport_size
dimensions_time = time.time() - setup_start
logger.debug(f"Playwright viewport size {page.viewport_size} page height {page_height} page width {page_width}")
logger.debug(f"{watch_info}Playwright viewport size {page.viewport_size} page height {page_height} page width {page_width} (got dimensions in {dimensions_time:.2f}s)")
# Use an approach similar to puppeteer: set a larger viewport and take screenshots in chunks
step_size = SCREENSHOT_SIZE_STITCH_THRESHOLD # Size that won't cause GPU to overflow
@@ -29,25 +32,31 @@ async def capture_full_page_async(page, screenshot_format='JPEG'):
y = 0
elements_locked = False
if page_height > page.viewport_size['height']:
# Lock all element dimensions BEFORE screenshot to prevent CSS media queries from resizing
# capture_full_page_async() changes viewport height which triggers @media (min-height) rules
# Only lock viewport elements if explicitly enabled (for image_ssim_diff processor)
# This prevents headers/ads from resizing when viewport changes
if lock_viewport_elements and page_height > page.viewport_size['height']:
lock_start = time.time()
lock_elements_js_path = os.path.join(os.path.dirname(__file__), 'res', 'lock-elements-sizing.js')
with open(lock_elements_js_path, 'r') as f:
lock_elements_js = f.read()
await page.evaluate(lock_elements_js)
elements_locked = True
lock_time = time.time() - lock_start
logger.debug(f"{watch_info}Viewport element locking enabled (took {lock_time:.2f}s)")
logger.debug("Element dimensions locked before screenshot capture")
if page_height > page.viewport_size['height']:
if page_height < step_size:
step_size = page_height # Incase page is bigger than default viewport but smaller than proposed step size
logger.debug(f"Setting bigger viewport to step through large page width W{page.viewport_size['width']}xH{step_size} because page_height > viewport_size")
viewport_start = time.time()
logger.debug(f"{watch_info}Setting bigger viewport to step through large page width W{page.viewport_size['width']}xH{step_size} because page_height > viewport_size")
# Set viewport to a larger size to capture more content at once
await page.set_viewport_size({'width': page.viewport_size['width'], 'height': step_size})
viewport_time = time.time() - viewport_start
logger.debug(f"{watch_info}Viewport changed to {page.viewport_size['width']}x{step_size} (took {viewport_time:.2f}s)")
# Capture screenshots in chunks up to the max total height
capture_start = time.time()
chunk_times = []
# Use PNG for better quality (no compression artifacts), JPEG for smaller size
screenshot_type = screenshot_format.lower() if screenshot_format else 'jpeg'
# PNG should use quality 100, JPEG uses configurable quality
@@ -69,7 +78,11 @@ async def capture_full_page_async(page, screenshot_format='JPEG'):
if screenshot_type == 'jpeg':
screenshot_kwargs['quality'] = screenshot_quality
chunk_start = time.time()
screenshot_chunks.append(await page.screenshot(**screenshot_kwargs))
chunk_time = time.time() - chunk_start
chunk_times.append(chunk_time)
logger.debug(f"{watch_info}Chunk {len(screenshot_chunks)} captured in {chunk_time:.2f}s")
y += step_size
# Restore original viewport size
@@ -81,11 +94,16 @@ async def capture_full_page_async(page, screenshot_format='JPEG'):
with open(unlock_elements_js_path, 'r') as f:
unlock_elements_js = f.read()
await page.evaluate(unlock_elements_js)
logger.debug("Element dimensions unlocked after screenshot capture")
logger.debug(f"{watch_info}Element dimensions unlocked after screenshot capture")
capture_time = time.time() - capture_start
total_capture_time = sum(chunk_times)
logger.debug(f"{watch_info}All {len(screenshot_chunks)} chunks captured in {capture_time:.2f}s (total chunk time: {total_capture_time:.2f}s)")
# If we have multiple chunks, stitch them together
if len(screenshot_chunks) > 1:
logger.debug(f"Screenshot stitching {len(screenshot_chunks)} chunks together")
stitch_start = time.time()
logger.debug(f"{watch_info}Starting stitching of {len(screenshot_chunks)} chunks")
# For small number of chunks (2-3), stitch inline to avoid multiprocessing overhead
# Only use separate process for many chunks (4+) to avoid blocking the event loop
@@ -106,15 +124,22 @@ async def capture_full_page_async(page, screenshot_format='JPEG'):
del p
del parent_conn, child_conn
stitch_time = time.time() - stitch_start
total_time = time.time() - start
setup_time = total_time - capture_time - stitch_time
logger.debug(
f"Screenshot (chunked/stitched) - Page height: {page_height} Capture height: {SCREENSHOT_MAX_TOTAL_HEIGHT} - Stitched together in {time.time() - start:.2f}s")
f"{watch_info}Screenshot complete - Page height: {page_height}px, Capture height: {SCREENSHOT_MAX_TOTAL_HEIGHT}px | "
f"Setup: {setup_time:.2f}s, Capture: {capture_time:.2f}s, Stitching: {stitch_time:.2f}s, Total: {total_time:.2f}s")
# Explicit cleanup
del screenshot_chunks
screenshot_chunks = None
return screenshot
total_time = time.time() - start
setup_time = total_time - capture_time
logger.debug(
f"Screenshot Page height: {page_height} Capture height: {SCREENSHOT_MAX_TOTAL_HEIGHT} - Stitched together in {time.time() - start:.2f}s")
f"{watch_info}Screenshot complete - Page height: {page_height}px, Capture height: {SCREENSHOT_MAX_TOTAL_HEIGHT}px | "
f"Setup: {setup_time:.2f}s, Single chunk: {capture_time:.2f}s, Total: {total_time:.2f}s")
return screenshot_chunks[0]
@@ -184,7 +209,8 @@ class fetcher(Fetcher):
async def screenshot_step(self, step_n=''):
super().screenshot_step(step_n=step_n)
screenshot = await capture_full_page_async(page=self.page, screenshot_format=self.screenshot_format)
watch_uuid = getattr(self, 'watch_uuid', None)
screenshot = await capture_full_page_async(page=self.page, screenshot_format=self.screenshot_format, watch_uuid=watch_uuid, lock_viewport_elements=self.lock_viewport_elements)
# Request GC immediately after screenshot to free memory
# Screenshots can be large and browser steps take many of them
@@ -233,6 +259,7 @@ class fetcher(Fetcher):
import playwright._impl._errors
import time
self.delete_browser_steps_screenshots()
self.watch_uuid = watch_uuid # Store for use in screenshot_step
response = None
async with async_playwright() as p:
@@ -318,7 +345,7 @@ class fetcher(Fetcher):
logger.error(f"Error fetching FavIcon info {str(e)}, continuing.")
if self.status_code != 200 and not ignore_status_codes:
screenshot = await capture_full_page_async(self.page, screenshot_format=self.screenshot_format)
screenshot = await capture_full_page_async(self.page, screenshot_format=self.screenshot_format, watch_uuid=watch_uuid, lock_viewport_elements=self.lock_viewport_elements)
# Cleanup before raising to prevent memory leak
await self.page.close()
await context.close()
@@ -374,7 +401,7 @@ class fetcher(Fetcher):
# which will significantly increase the IO size between the server and client, it's recommended to use the lowest
# acceptable screenshot quality here
# The actual screenshot - this is always base64 and needs decoding! horrible! huge CPU usage
self.screenshot = await capture_full_page_async(page=self.page, screenshot_format=self.screenshot_format)
self.screenshot = await capture_full_page_async(page=self.page, screenshot_format=self.screenshot_format, watch_uuid=watch_uuid, lock_viewport_elements=self.lock_viewport_elements)
except ScreenshotUnavailable:
# Re-raise screenshot unavailable exceptions

View File

@@ -20,18 +20,21 @@ from changedetectionio.content_fetchers.exceptions import PageUnloadable, Non200
# Screenshots also travel via the ws:// (websocket) meaning that the binary data is base64 encoded
# which will significantly increase the IO size between the server and client, it's recommended to use the lowest
# acceptable screenshot quality here
async def capture_full_page(page, screenshot_format='JPEG'):
async def capture_full_page(page, screenshot_format='JPEG', watch_uuid=None, lock_viewport_elements=False):
import os
import time
import multiprocessing
start = time.time()
watch_info = f"[{watch_uuid}] " if watch_uuid else ""
setup_start = time.time()
page_height = await page.evaluate("document.documentElement.scrollHeight")
page_width = await page.evaluate("document.documentElement.scrollWidth")
original_viewport = page.viewport
dimensions_time = time.time() - setup_start
logger.debug(f"Puppeteer viewport size {page.viewport} page height {page_height} page width {page_width}")
logger.debug(f"{watch_info}Puppeteer viewport size {page.viewport} page height {page_height} page width {page_width} (got dimensions in {dimensions_time:.2f}s)")
# Bug 3 in Playwright screenshot handling
# Some bug where it gives the wrong screenshot size, but making a request with the clip set first seems to solve it
@@ -50,20 +53,35 @@ async def capture_full_page(page, screenshot_format='JPEG'):
screenshot_chunks = []
y = 0
elements_locked = False
if page_height > page.viewport['height']:
# Lock all element dimensions BEFORE screenshot to prevent CSS media queries from resizing
# capture_full_page() changes viewport height which triggers @media (min-height) rules
# Only lock viewport elements if explicitly enabled (for image_ssim_diff processor)
# This prevents headers/ads from resizing when viewport changes
if lock_viewport_elements and page_height > page.viewport['height']:
lock_start = time.time()
lock_elements_js_path = os.path.join(os.path.dirname(__file__), 'res', 'lock-elements-sizing.js')
file_read_start = time.time()
with open(lock_elements_js_path, 'r') as f:
lock_elements_js = f.read()
await page.evaluate(lock_elements_js)
elements_locked = True
logger.debug("Element dimensions locked before screenshot capture")
file_read_time = time.time() - file_read_start
evaluate_start = time.time()
await page.evaluate(lock_elements_js)
evaluate_time = time.time() - evaluate_start
elements_locked = True
lock_time = time.time() - lock_start
logger.debug(f"{watch_info}Viewport element locking enabled - File read: {file_read_time:.3f}s, Browser evaluate: {evaluate_time:.2f}s, Total: {lock_time:.2f}s")
if page_height > page.viewport['height']:
if page_height < step_size:
step_size = page_height # Incase page is bigger than default viewport but smaller than proposed step size
viewport_start = time.time()
await page.setViewport({'width': page.viewport['width'], 'height': step_size})
viewport_time = time.time() - viewport_start
logger.debug(f"{watch_info}Viewport changed to {page.viewport['width']}x{step_size} (took {viewport_time:.2f}s)")
capture_start = time.time()
chunk_times = []
while y < min(page_height, SCREENSHOT_MAX_TOTAL_HEIGHT):
# Better than scrollTo in case the page overrides it
await page.evaluate(
@@ -82,7 +100,11 @@ async def capture_full_page(page, screenshot_format='JPEG'):
if screenshot_type == 'jpeg':
screenshot_kwargs['quality'] = screenshot_quality
chunk_start = time.time()
screenshot_chunks.append(await page.screenshot(**screenshot_kwargs))
chunk_time = time.time() - chunk_start
chunk_times.append(chunk_time)
logger.debug(f"{watch_info}Chunk {len(screenshot_chunks)} captured in {chunk_time:.2f}s")
y += step_size
await page.setViewport({'width': original_viewport['width'], 'height': original_viewport['height']})
@@ -93,26 +115,38 @@ async def capture_full_page(page, screenshot_format='JPEG'):
with open(unlock_elements_js_path, 'r') as f:
unlock_elements_js = f.read()
await page.evaluate(unlock_elements_js)
logger.debug("Element dimensions unlocked after screenshot capture")
logger.debug(f"{watch_info}Element dimensions unlocked after screenshot capture")
capture_time = time.time() - capture_start
total_capture_time = sum(chunk_times)
logger.debug(f"{watch_info}All {len(screenshot_chunks)} chunks captured in {capture_time:.2f}s (total chunk time: {total_capture_time:.2f}s)")
if len(screenshot_chunks) > 1:
# Always use spawn for thread safety - consistent behavior in tests and production
from changedetectionio.content_fetchers.screenshot_handler import stitch_images_worker
logger.debug(f"Screenshot stitching {len(screenshot_chunks)} chunks together")
stitch_start = time.time()
logger.debug(f"{watch_info}Starting stitching of {len(screenshot_chunks)} chunks")
ctx = multiprocessing.get_context('spawn')
parent_conn, child_conn = ctx.Pipe()
p = ctx.Process(target=stitch_images_worker, args=(child_conn, screenshot_chunks, page_height, SCREENSHOT_MAX_TOTAL_HEIGHT))
p.start()
screenshot = parent_conn.recv_bytes()
p.join()
stitch_time = time.time() - stitch_start
total_time = time.time() - start
setup_time = total_time - capture_time - stitch_time
logger.debug(
f"Screenshot (chunked/stitched) - Page height: {page_height} Capture height: {SCREENSHOT_MAX_TOTAL_HEIGHT} - Stitched together in {time.time() - start:.2f}s")
f"{watch_info}Screenshot complete - Page height: {page_height}px, Capture height: {SCREENSHOT_MAX_TOTAL_HEIGHT}px | "
f"Setup: {setup_time:.2f}s, Capture: {capture_time:.2f}s, Stitching: {stitch_time:.2f}s, Total: {total_time:.2f}s")
screenshot_chunks = None
return screenshot
total_time = time.time() - start
setup_time = total_time - capture_time
logger.debug(
f"Screenshot Page height: {page_height} Capture height: {SCREENSHOT_MAX_TOTAL_HEIGHT} - Stitched together in {time.time() - start:.2f}s")
f"{watch_info}Screenshot complete - Page height: {page_height}px, Capture height: {SCREENSHOT_MAX_TOTAL_HEIGHT}px | "
f"Setup: {setup_time:.2f}s, Single chunk: {capture_time:.2f}s, Total: {total_time:.2f}s")
return screenshot_chunks[0]
@@ -357,7 +391,7 @@ class fetcher(Fetcher):
logger.error(f"Error fetching FavIcon info {str(e)}, continuing.")
if self.status_code != 200 and not ignore_status_codes:
screenshot = await capture_full_page(page=self.page, screenshot_format=self.screenshot_format)
screenshot = await capture_full_page(page=self.page, screenshot_format=self.screenshot_format, watch_uuid=watch_uuid, lock_viewport_elements=self.lock_viewport_elements)
raise Non200ErrorCodeReceived(url=url, status_code=self.status_code, screenshot=screenshot)
@@ -387,7 +421,7 @@ class fetcher(Fetcher):
# Now take screenshot (scrolling may trigger layout changes, but measurements are already captured)
logger.debug(f"Screenshot format {self.screenshot_format}")
self.screenshot = await capture_full_page(page=self.page, screenshot_format=self.screenshot_format)
self.screenshot = await capture_full_page(page=self.page, screenshot_format=self.screenshot_format, watch_uuid=watch_uuid, lock_viewport_elements=self.lock_viewport_elements)
self.xpath_data = await self.page.evaluate(XPATH_ELEMENT_JS, {
"visualselector_xpath_selectors": visualselector_xpath_selectors,
"max_height": MAX_TOTAL_HEIGHT

View File

@@ -1,5 +1,5 @@
/**
* Lock Element Dimensions for Screenshot Capture
* Lock Element Dimensions for Screenshot Capture (First Viewport Only)
*
* THE PROBLEM:
* When taking full-page screenshots of tall pages, Chrome/Puppeteer/Playwright need to:
@@ -10,40 +10,31 @@
* However, changing the viewport height triggers CSS media queries like:
* @media (min-height: 860px) { .ad { height: 250px; } }
*
* This causes elements (especially ads) to resize during screenshot capture, creating a mismatch:
* - Screenshot shows element at NEW size (after media query triggered)
* - xpath element coordinates measured at OLD size (before viewport change)
* - Visual selector overlays don't align with screenshot
*
* EXAMPLE BUG:
* - Initial viewport: 1280x800, ad height: 138px, article position: 279px ✓
* - Viewport changes to 1280x3809 for screenshot
* - Media query triggers: ad expands to 250px
* - All content below shifts down by 112px (250-138)
* - Article now at position: 391px (279+112)
* - But xpath data says 279px → 112px mismatch! ✗
* This causes elements (especially ads/headers) to resize during screenshot capture.
*
* THE SOLUTION:
* Before changing viewport, lock ALL element dimensions with !important inline styles.
* Inline styles with !important override media query CSS, preventing layout changes.
* Lock element dimensions in the FIRST VIEWPORT ONLY with !important inline styles.
* This prevents headers, navigation, and top ads from resizing when viewport changes.
* We only lock the visible portion because:
* - Most layout shifts happen in headers/navbars/top ads
* - Locking only visible elements is 100x+ faster (100-300 elements vs 10,000+)
* - Below-fold content shifts don't affect visual comparison accuracy
*
* WHAT THIS SCRIPT DOES:
* 1. Iterates through every element on the page
* 2. Captures current computed dimensions (width, height)
* 3. Sets inline styles with !important to freeze those dimensions
* 1. Gets current viewport height
* 2. Finds elements within first viewport (top of page to bottom of screen)
* 3. Locks their dimensions with !important inline styles
* 4. Disables ResizeObserver API (for JS-based resizing)
* 5. When viewport changes for screenshot, media queries can't resize anything
* 6. Layout remains consistent → xpath coordinates match screenshot ✓
*
* USAGE:
* Execute this script BEFORE calling capture_full_page() / screenshot functions.
* The page must be fully loaded and settled at its initial viewport size.
* No need to restore state afterward - page is closed after screenshot.
* Only enabled for image_ssim_diff processor (visual comparison).
* Default: OFF for performance.
*
* PERFORMANCE:
* - Iterates all DOM elements (can be 1000s on complex pages)
* - Typically completes in 50-200ms
* - One-time cost before screenshot, well worth it for coordinate accuracy
* - Only processes 100-300 elements (first viewport) vs 10,000+ (entire page)
* - Typically completes in 10-50ms
* - 100x+ faster than locking entire page
*
* @see https://github.com/dgtlmoon/changedetection.io/issues/XXXX
*/
@@ -52,11 +43,34 @@
// Store original styles in a global WeakMap for later restoration
window.__elementSizingRestore = new WeakMap();
// Lock ALL element dimensions to prevent media query layout changes
document.querySelectorAll('*').forEach(el => {
const computed = window.getComputedStyle(el);
const rect = el.getBoundingClientRect();
const start = performance.now();
// Get current viewport height (visible portion of page)
const viewportHeight = window.innerHeight;
// Get all elements and filter to FIRST VIEWPORT ONLY
// This dramatically reduces elements to process (100-300 vs 10,000+)
const allElements = Array.from(document.querySelectorAll('*'));
// BATCH READ PHASE: Get bounding rects and filter to viewport
const measurements = allElements.map(el => {
const rect = el.getBoundingClientRect();
const computed = window.getComputedStyle(el);
// Only lock elements in the first viewport (visible on initial page load)
// 0 <= rect.top < viewportHeight means the element's top edge starts within the visible area
const inViewport = rect.top < viewportHeight && rect.top >= 0;
const hasSize = rect.height > 0 && rect.width > 0;
return inViewport && hasSize ? { el, computed, rect } : null;
}).filter(Boolean); // Remove null entries
const elapsed = performance.now() - start;
console.log(`Locked first viewport elements: ${measurements.length} of ${allElements.length} total elements (viewport height: ${viewportHeight}px, took ${elapsed.toFixed(0)}ms)`);
// BATCH WRITE PHASE: Apply all inline styles without triggering layout
// No interleaved reads means browser can optimize style application
measurements.forEach(({el, computed, rect}) => {
// Save original inline style values BEFORE locking
const properties = ['height', 'min-height', 'max-height', 'width', 'min-width', 'max-width'];
const originalStyles = {};
@@ -89,5 +103,5 @@
disconnect() {}
};
console.log('✓ Element dimensions locked to prevent media query changes during screenshot');
console.log(`✓ Element dimensions locked (${measurements.length} elements) to prevent media query changes during screenshot`);
})();