mirror of
https://github.com/dgtlmoon/changedetection.io.git
synced 2026-04-30 14:50:39 +00:00
475 lines
22 KiB
Python
475 lines
22 KiB
Python
import asyncio
|
|
import gc
|
|
import json
|
|
import os
|
|
from urllib.parse import urlparse
|
|
|
|
from loguru import logger
|
|
|
|
from changedetectionio.content_fetchers import SCREENSHOT_MAX_HEIGHT_DEFAULT, visualselector_xpath_selectors, \
|
|
SCREENSHOT_SIZE_STITCH_THRESHOLD, SCREENSHOT_MAX_TOTAL_HEIGHT, XPATH_ELEMENT_JS, INSTOCK_DATA_JS, FAVICON_FETCHER_JS
|
|
from changedetectionio.content_fetchers.base import Fetcher, manage_user_agent
|
|
from changedetectionio.content_fetchers.exceptions import PageUnloadable, Non200ErrorCodeReceived, EmptyReply, ScreenshotUnavailable, \
|
|
BrowserStepsStepException
|
|
|
|
|
|
def _stitch_chunks_in_subprocess(screenshot_chunks, page_height):
    """Stitch screenshot chunks together inside a short-lived 'spawn' subprocess.

    This is a blocking helper: it waits on a pipe and on process exit, so call it
    from a worker thread rather than directly from the event loop.

    :param screenshot_chunks: list of encoded screenshot chunks (bytes), in capture order.
    :param page_height: page height in pixels, forwarded to the worker together with
        SCREENSHOT_MAX_TOTAL_HEIGHT.
    :return: the bytes the worker sends back (used by the caller as the stitched screenshot).
    :raises EOFError: if the worker closes its end without sending a result.
    :raises OSError: if the pipe breaks while the chunks are being sent.
    """
    import multiprocessing
    import struct

    from changedetectionio.content_fetchers.screenshot_handler import stitch_images_worker_raw_bytes

    ctx = multiprocessing.get_context('spawn')
    parent_conn, child_conn = ctx.Pipe()
    worker = ctx.Process(target=stitch_images_worker_raw_bytes, args=(child_conn, page_height, SCREENSHOT_MAX_TOTAL_HEIGHT))

    try:
        worker.start()
        # Drop the parent's copy of the child's end: while the parent holds it open, a crashed
        # worker never produces EOF and send_bytes()/recv_bytes() below could block forever.
        child_conn.close()

        # Send via raw bytes (no pickle): first the chunk count, then each chunk
        parent_conn.send_bytes(struct.pack('I', len(screenshot_chunks)))
        for chunk in screenshot_chunks:
            parent_conn.send_bytes(chunk)

        stitched = parent_conn.recv_bytes()
    finally:
        # Connection.close() is idempotent, so closing child_conn again here is safe
        parent_conn.close()
        child_conn.close()
        # pid is None when start() itself failed; join() would raise in that case
        if worker.pid is not None:
            worker.join()

    return stitched


async def capture_full_page_async(page, screenshot_format='JPEG', watch_uuid=None, lock_viewport_elements=False):
    """Capture a screenshot of `page` in viewport-sized chunks and return it as bytes.

    The page is stepped through in chunks of at most SCREENSHOT_SIZE_STITCH_THRESHOLD pixels
    (never more than SCREENSHOT_MAX_TOTAL_HEIGHT in total). When more than one chunk is
    captured the chunks are stitched in a separate subprocess.

    :param page: Playwright (async API) page to capture.
    :param screenshot_format: 'JPEG' or 'PNG' (case-insensitive); a falsy value means 'jpeg'.
        JPEG quality is read from the SCREENSHOT_QUALITY environment variable (default 72).
    :param watch_uuid: optional identifier used only as a prefix in debug log lines.
    :param lock_viewport_elements: when True and the page is taller than the viewport, run
        res/lock-elements-sizing.js before capturing and res/unlock-elements-sizing.js afterwards.
    :return: screenshot bytes - the single chunk, or the stitched result for multiple chunks.
    """
    import asyncio
    import os
    import time

    start = time.time()
    watch_info = f"[{watch_uuid}] " if watch_uuid else ""

    setup_start = time.time()
    page_height = await page.evaluate("document.documentElement.scrollHeight")
    page_width = await page.evaluate("document.documentElement.scrollWidth")
    original_viewport = page.viewport_size
    dimensions_time = time.time() - setup_start

    logger.debug(f"{watch_info}Playwright viewport size {page.viewport_size} page height {page_height} page width {page_width} (got dimensions in {dimensions_time:.2f}s)")

    # Use an approach similar to puppeteer: set a larger viewport and take screenshots in chunks
    step_size = SCREENSHOT_SIZE_STITCH_THRESHOLD  # Size that won't cause GPU to overflow
    screenshot_chunks = []
    y = 0
    elements_locked = False

    # Only lock viewport elements if explicitly enabled (for image_ssim_diff processor)
    # This prevents headers/ads from resizing when viewport changes
    if lock_viewport_elements and page_height > page.viewport_size['height']:
        lock_start = time.time()
        lock_elements_js_path = os.path.join(os.path.dirname(__file__), 'res', 'lock-elements-sizing.js')
        with open(lock_elements_js_path, 'r') as f:
            lock_elements_js = f.read()
        await page.evaluate(lock_elements_js)
        elements_locked = True
        lock_time = time.time() - lock_start
        logger.debug(f"{watch_info}Viewport element locking enabled (took {lock_time:.2f}s)")

    if page_height > page.viewport_size['height']:
        if page_height < step_size:
            step_size = page_height  # In case page is bigger than default viewport but smaller than proposed step size
        # Never set viewport taller than our max capture height - otherwise one screenshot chunk
        # captures the whole (e.g. 8098px) page even when SCREENSHOT_MAX_HEIGHT=1000
        step_size = min(step_size, SCREENSHOT_MAX_TOTAL_HEIGHT)
        viewport_start = time.time()
        logger.debug(f"{watch_info}Setting bigger viewport to step through large page width W{page.viewport_size['width']}xH{step_size} because page_height > viewport_size")
        # Set viewport to a larger size to capture more content at once
        await page.set_viewport_size({'width': page.viewport_size['width'], 'height': step_size})
        viewport_time = time.time() - viewport_start
        logger.debug(f"{watch_info}Viewport changed to {page.viewport_size['width']}x{step_size} (took {viewport_time:.2f}s)")

    # Capture screenshots in chunks up to the max total height
    capture_start = time.time()
    chunk_times = []
    # Use PNG for better quality (no compression artifacts), JPEG for smaller size
    screenshot_type = screenshot_format.lower() if screenshot_format else 'jpeg'
    # PNG should use quality 100, JPEG uses configurable quality
    screenshot_quality = 100 if screenshot_type == 'png' else int(os.getenv("SCREENSHOT_QUALITY", 72))

    # The screenshot arguments do not change between chunks, so build them once
    screenshot_kwargs = {
        'type': screenshot_type,
        'full_page': False
    }
    # Only pass quality parameter for jpeg (PNG doesn't support it in Playwright)
    if screenshot_type == 'jpeg':
        screenshot_kwargs['quality'] = screenshot_quality

    capture_limit = min(page_height, SCREENSHOT_MAX_TOTAL_HEIGHT)

    # Always take at least one chunk: a document reporting a height of 0 would otherwise
    # leave screenshot_chunks empty and fail on screenshot_chunks[0] below.
    while not screenshot_chunks or y < capture_limit:
        # Only scroll if not at the top (y > 0)
        if y > 0:
            await page.evaluate(f"window.scrollTo(0, {y})")

        # Request GC only before screenshot (not 3x per chunk)
        await page.request_gc()

        chunk_start = time.time()
        screenshot_chunks.append(await page.screenshot(**screenshot_kwargs))
        chunk_time = time.time() - chunk_start
        chunk_times.append(chunk_time)
        logger.debug(f"{watch_info}Chunk {len(screenshot_chunks)} captured in {chunk_time:.2f}s")
        y += step_size

    # Restore original viewport size
    await page.set_viewport_size({'width': original_viewport['width'], 'height': original_viewport['height']})

    # Unlock element dimensions if they were locked
    if elements_locked:
        unlock_elements_js_path = os.path.join(os.path.dirname(__file__), 'res', 'unlock-elements-sizing.js')
        with open(unlock_elements_js_path, 'r') as f:
            unlock_elements_js = f.read()
        await page.evaluate(unlock_elements_js)
        logger.debug(f"{watch_info}Element dimensions unlocked after screenshot capture")

    capture_time = time.time() - capture_start
    total_capture_time = sum(chunk_times)
    logger.debug(f"{watch_info}All {len(screenshot_chunks)} chunks captured in {capture_time:.2f}s (total chunk time: {total_capture_time:.2f}s)")

    # If we have multiple chunks, stitch them together
    if len(screenshot_chunks) > 1:
        stitch_start = time.time()
        logger.debug(f"{watch_info}Starting stitching of {len(screenshot_chunks)} chunks")

        # Always use spawn subprocess for ANY stitching (2+ chunks)
        # PIL allocates at C level and Python GC never releases it - subprocess exit forces OS to reclaim
        # Trade-off: 35MB resource_tracker vs 500MB+ PIL leak in main process
        #
        # The pipe exchange and the join() are blocking calls, so they run in the default
        # executor to keep the event loop free for other coroutines while stitching.
        loop = asyncio.get_running_loop()
        screenshot = await loop.run_in_executor(None, _stitch_chunks_in_subprocess, screenshot_chunks, page_height)

        stitch_time = time.time() - stitch_start
        total_time = time.time() - start
        setup_time = total_time - capture_time - stitch_time
        logger.debug(
            f"{watch_info}Screenshot complete - Page height: {page_height}px, Capture height: {SCREENSHOT_MAX_TOTAL_HEIGHT}px | "
            f"Setup: {setup_time:.2f}s, Capture: {capture_time:.2f}s, Stitching: {stitch_time:.2f}s, Total: {total_time:.2f}s")
        return screenshot

    total_time = time.time() - start
    setup_time = total_time - capture_time
    logger.debug(
        f"{watch_info}Screenshot complete - Page height: {page_height}px, Capture height: {SCREENSHOT_MAX_TOTAL_HEIGHT}px | "
        f"Setup: {setup_time:.2f}s, Single chunk: {capture_time:.2f}s, Total: {total_time:.2f}s")

    return screenshot_chunks[0]
|
|
|
|
class fetcher(Fetcher):
    """Content fetcher that drives a browser through Playwright, connected over CDP."""

    fetcher_description = "Playwright {}/Javascript".format(
        os.getenv("PLAYWRIGHT_BROWSER_TYPE", 'chromium').capitalize()
    )
    if os.getenv("PLAYWRIGHT_DRIVER_URL"):
        fetcher_description += " via '{}'".format(os.getenv("PLAYWRIGHT_DRIVER_URL"))

    browser_type = ''
    command_executor = ''

    # Configs for Proxy setup
    # In the ENV vars, is prefixed with "playwright_proxy_", so it is for example "playwright_proxy_server"
    playwright_proxy_settings_mappings = ['bypass', 'server', 'username', 'password']

    proxy = None

    # Capability flags
    supports_browser_steps = True
    supports_screenshots = True
    supports_xpath_element_data = True

    @classmethod
    def get_status_icon_data(cls):
        """Return Chrome browser icon data for Playwright fetcher."""
        return {
            'filename': 'google-chrome-icon.png',
            'alt': 'Using a Chrome browser',
            'title': 'Using a Chrome browser'
        }

    def __init__(self, proxy_override=None, custom_browser_connection_url=None, **kwargs):
        """Resolve the browser type, the connection URL and the proxy settings.

        :param proxy_override: optional proxy server URL; replaces any proxy configured via
            the ``playwright_proxy_*`` environment variables.
        :param custom_browser_connection_url: optional connection URL used instead of the
            PLAYWRIGHT_DRIVER_URL environment variable.
        :param kwargs: forwarded unchanged to the base ``Fetcher`` constructor.
        """
        super().__init__(**kwargs)

        self.browser_type = os.getenv("PLAYWRIGHT_BROWSER_TYPE", 'chromium').strip('"')

        if custom_browser_connection_url:
            self.browser_connection_is_custom = True
            self.browser_connection_url = custom_browser_connection_url
        else:
            # Fallback to fetching from system
            # .strip('"') is going to save someone a lot of time when they accidentally wrap the env value
            self.browser_connection_url = os.getenv("PLAYWRIGHT_DRIVER_URL", 'ws://playwright-chrome:3000').strip('"')

        # If any proxy settings are enabled, then we should setup the proxy object
        proxy_args = {}
        for k in self.playwright_proxy_settings_mappings:
            v = os.getenv('playwright_proxy_' + k, False)
            if v:
                proxy_args[k] = v.strip('"')

        if proxy_args:
            self.proxy = proxy_args

        # allow per-watch proxy selection override
        if proxy_override:
            self.proxy = {'server': proxy_override}

        if self.proxy:
            # Playwright needs separate username and password values
            parsed = urlparse(self.proxy.get('server'))
            if parsed.username:
                self.proxy['username'] = parsed.username
                self.proxy['password'] = parsed.password

    async def screenshot_step(self, step_n=''):
        """Capture the current page and, when a step directory is configured, save it as step_<n>.jpeg."""
        super().screenshot_step(step_n=step_n)
        watch_uuid = getattr(self, 'watch_uuid', None)
        screenshot = await capture_full_page_async(page=self.page, screenshot_format=self.screenshot_format, watch_uuid=watch_uuid, lock_viewport_elements=self.lock_viewport_elements)

        # Request GC immediately after screenshot to free memory
        # Screenshots can be large and browser steps take many of them
        await self.page.request_gc()

        if self.browser_steps_screenshot_path is not None:
            destination = os.path.join(self.browser_steps_screenshot_path, 'step_{}.jpeg'.format(step_n))
            logger.debug(f"Saving step screenshot to {destination}")
            with open(destination, 'wb') as f:
                f.write(screenshot)

        # Clear local reference to allow screenshot bytes to be collected
        del screenshot
        gc.collect()

    async def save_step_html(self, step_n):
        """Fetch the current page HTML and, when a step directory is configured, save it as step_<n>.html."""
        super().save_step_html(step_n=step_n)
        content = await self.page.content()

        # Request GC after getting page content
        await self.page.request_gc()

        # Same guard as screenshot_step(): without a configured directory there is nowhere to
        # write to, and os.path.join(None, ...) would raise TypeError
        if self.browser_steps_screenshot_path is not None:
            destination = os.path.join(self.browser_steps_screenshot_path, 'step_{}.html'.format(step_n))
            logger.debug(f"Saving step HTML to {destination}")
            with open(destination, 'w', encoding='utf-8') as f:
                f.write(content)

        # Clear local reference
        del content
        gc.collect()

    async def _close_browser_session(self, url, context, browser):
        """Close the page, the context and the browser connection, each with a 5 second timeout.

        Timeouts and other exceptions are logged as warnings instead of propagating, so this is
        safe to call from a ``finally`` block. ``context`` and ``browser`` may be None.
        """
        try:
            if hasattr(self, 'page') and self.page:
                await self.page.request_gc()
                await asyncio.wait_for(self.page.close(), timeout=5.0)
                logger.debug(f"Successfully closed page for {url}")
        except asyncio.TimeoutError:
            logger.warning(f"Timed out closing page for {url} (5s)")
        except Exception as e:
            logger.warning(f"Error closing page for {url}: {e}")
        finally:
            self.page = None

        try:
            if context:
                await asyncio.wait_for(context.close(), timeout=5.0)
                logger.debug(f"Successfully closed context for {url}")
        except asyncio.TimeoutError:
            logger.warning(f"Timed out closing context for {url} (5s)")
        except Exception as e:
            logger.warning(f"Error closing context for {url}: {e}")

        try:
            if browser:
                await asyncio.wait_for(browser.close(), timeout=5.0)
                logger.debug(f"Successfully closed browser connection for {url}")
        except asyncio.TimeoutError:
            logger.warning(f"Timed out closing browser connection for {url} (5s)")
        except Exception as e:
            logger.warning(f"Error closing browser for {url}: {e}")

    async def run(self,
                  fetch_favicon=True,
                  current_include_filters=None,
                  empty_pages_are_a_change=False,
                  ignore_status_codes=False,
                  is_binary=False,
                  request_body=None,
                  request_headers=None,
                  request_method=None,
                  screenshot_format=None,
                  timeout=None,
                  url=None,
                  watch_uuid=None,
                  ):
        """Load `url` in the browser and store the results on this instance.

        On success this sets ``self.headers``, ``self.status_code``, ``self.xpath_data``,
        ``self.instock_data``, ``self.content`` and ``self.screenshot`` (plus
        ``self.favicon_blob`` when `fetch_favicon` is true and the lookup succeeds).
        The page, context and browser connection are closed on every exit path.

        :param fetch_favicon: evaluate FAVICON_FETCHER_JS in the page; failures are only logged.
        :param current_include_filters: exposed to the page as the JS variable ``include_filters``.
        :param empty_pages_are_a_change: when False, empty page content raises EmptyReply.
        :param ignore_status_codes: when False, a status other than 200 raises Non200ErrorCodeReceived.
        :param request_headers: sent as extra HTTP headers and passed to manage_user_agent().
        :param url: address to load.
        :param watch_uuid: stored on the instance and used as a prefix in screenshot log lines.
        :param is_binary: not referenced by this implementation.
        :param request_body: not referenced by this implementation.
        :param request_method: not referenced by this implementation.
        :param screenshot_format: not referenced here; ``self.screenshot_format`` is used instead.
        :param timeout: not referenced here; the connection uses a fixed 60 second timeout.
        :raises EmptyReply: no response object was returned, or the content was empty.
        :raises PageUnloadable: the custom JS failed (other than by timeout) or the response had no status.
        :raises Non200ErrorCodeReceived: non-200 status; carries a screenshot of the page.
        :raises ScreenshotUnavailable: re-raised with `url` and the status code attached.
        """

        from playwright.async_api import async_playwright
        import playwright._impl._errors
        import time
        self.delete_browser_steps_screenshots()
        self.watch_uuid = watch_uuid  # Store for use in screenshot_step
        response = None

        async with async_playwright() as p:
            browser_type = getattr(p, self.browser_type)

            # Seemed to cause a connection Exception even tho I can see it connect
            # self.browser = browser_type.connect(self.command_executor, timeout=timeout*1000)
            # 60,000 connection timeout only
            browser = await browser_type.connect_over_cdp(self.browser_connection_url, timeout=60000)
            context = None

            # Everything after the connection is established runs inside try/finally so the
            # page, context and browser connection are released on every exit path,
            # including the early raises below.
            try:
                # SOCKS5 with authentication is not supported (yet)
                # https://github.com/microsoft/playwright/issues/10567

                # Set user agent to prevent Cloudflare from blocking the browser
                # Use the default one configured in the App.py model that's passed from fetch_site_status.py
                context = await browser.new_context(
                    accept_downloads=False,  # Should never be needed
                    bypass_csp=True,  # This is needed to enable JavaScript execution on GitHub and others
                    extra_http_headers=request_headers,
                    ignore_https_errors=True,
                    proxy=self.proxy,
                    service_workers=os.getenv('PLAYWRIGHT_SERVICE_WORKERS', 'allow'),  # Should be `allow` or `block` - sites like YouTube can transmit large amounts of data via Service Workers
                    user_agent=manage_user_agent(headers=request_headers),
                )

                self.page = await context.new_page()

                # Listen for all console events and handle errors
                self.page.on("console", lambda msg: logger.debug(f"Playwright console: Watch URL: {url} {msg.type}: {msg.text} {msg.args}"))

                # Re-use as much code from browser steps as possible so its the same
                from changedetectionio.browser_steps.browser_steps import steppable_browser_interface
                browsersteps_interface = steppable_browser_interface(start_url=url)
                browsersteps_interface.page = self.page

                response = await browsersteps_interface.action_goto_url(value=url)

                if response is None:
                    logger.debug("Content Fetcher > Response object from the browser communication was none")
                    raise EmptyReply(url=url, status_code=None)

                # In async_playwright, all_headers() returns a coroutine
                try:
                    self.headers = await response.all_headers()
                except TypeError:
                    # Fallback for sync version
                    self.headers = response.all_headers()

                try:
                    if self.webdriver_js_execute_code is not None and len(self.webdriver_js_execute_code):
                        await browsersteps_interface.action_execute_js(value=self.webdriver_js_execute_code, selector=None)
                except playwright._impl._errors.TimeoutError as e:
                    # This can be ok, we will try to grab what we could retrieve.
                    # The session must stay open for that: closing the context/browser here
                    # would make every following page operation fail.
                    logger.debug(f"Content Fetcher > Timeout when executing custom JS code, continuing {str(e)}")
                except Exception as e:
                    logger.debug(f"Content Fetcher > Other exception when executing custom JS code {str(e)}")
                    raise PageUnloadable(url=url, status_code=None, message=str(e))

                extra_wait = int(os.getenv("WEBDRIVER_DELAY_BEFORE_CONTENT_READY", 5)) + self.render_extract_delay
                await self.page.wait_for_timeout(extra_wait * 1000)

                try:
                    self.status_code = response.status
                except Exception as e:
                    # https://github.com/dgtlmoon/changedetection.io/discussions/2122#discussioncomment-8241962
                    logger.critical("Response from the browser/Playwright did not have a status_code! Response follows.")
                    logger.critical(response)
                    raise PageUnloadable(url=url, status_code=None, message=str(e))

                if fetch_favicon:
                    try:
                        self.favicon_blob = await self.page.evaluate(FAVICON_FETCHER_JS)
                        await self.page.request_gc()
                    except Exception as e:
                        logger.error(f"Error fetching FavIcon info {str(e)}, continuing.")

                if self.status_code != 200 and not ignore_status_codes:
                    screenshot = await capture_full_page_async(self.page, screenshot_format=self.screenshot_format, watch_uuid=watch_uuid, lock_viewport_elements=self.lock_viewport_elements)
                    # Finally block will handle cleanup
                    raise Non200ErrorCodeReceived(url=url, status_code=self.status_code, screenshot=screenshot)

                if not empty_pages_are_a_change and len((await self.page.content()).strip()) == 0:
                    logger.debug("Content Fetcher > Content was empty, empty_pages_are_a_change = False")
                    raise EmptyReply(url=url, status_code=response.status)

                try:
                    # Run Browser Steps here
                    if self.browser_steps:
                        # A BrowserStepsStepException propagates as-is; the finally block handles cleanup
                        await self.iterate_browser_steps(start_url=url)

                    await self.page.wait_for_timeout(extra_wait * 1000)

                    now = time.time()
                    # So we can find an element on the page where its selector was entered manually (maybe not xPath etc)
                    if current_include_filters is not None:
                        await self.page.evaluate("var include_filters={}".format(json.dumps(current_include_filters)))
                    else:
                        await self.page.evaluate("var include_filters=''")
                    await self.page.request_gc()

                    # request_gc before and after evaluate to free up memory
                    # @todo browsersteps etc
                    MAX_TOTAL_HEIGHT = int(os.getenv("SCREENSHOT_MAX_HEIGHT", SCREENSHOT_MAX_HEIGHT_DEFAULT))
                    self.xpath_data = await self.page.evaluate(XPATH_ELEMENT_JS, {
                        "visualselector_xpath_selectors": visualselector_xpath_selectors,
                        "max_height": MAX_TOTAL_HEIGHT
                    })
                    await self.page.request_gc()

                    self.instock_data = await self.page.evaluate(INSTOCK_DATA_JS)
                    await self.page.request_gc()

                    self.content = await self.page.content()
                    await self.page.request_gc()
                    logger.debug(f"Scrape xPath element data in browser done in {time.time() - now:.2f}s")

                    # Bug 3 in Playwright screenshot handling
                    # Some bug where it gives the wrong screenshot size, but making a request with the clip set first seems to solve it
                    # JPEG is better here because the screenshots can be very very large

                    # Screenshots also travel via the ws:// (websocket) meaning that the binary data is base64 encoded
                    # which will significantly increase the IO size between the server and client, it's recommended to use the lowest
                    # acceptable screenshot quality here
                    # The actual screenshot - this always base64 and needs decoding! horrible! huge CPU usage
                    self.screenshot = await capture_full_page_async(page=self.page, screenshot_format=self.screenshot_format, watch_uuid=watch_uuid, lock_viewport_elements=self.lock_viewport_elements)

                    # Force aggressive memory cleanup - screenshots are large and base64 decode creates temporary buffers
                    await self.page.request_gc()
                    gc.collect()

                except ScreenshotUnavailable:
                    # Re-raise screenshot unavailable exceptions
                    raise ScreenshotUnavailable(url=url, status_code=self.status_code)

            finally:
                # Clean up resources properly with timeouts to prevent hanging
                await self._close_browser_session(url=url, context=context, browser=browser)
                context = None
                browser = None

                # Force Python GC to release Playwright resources immediately
                # Playwright objects can have circular references that delay cleanup
                gc.collect()
|
|
|
|
|
|
# Plugin registration for built-in fetcher
|
|
class PlaywrightFetcherPlugin:
    """Built-in plugin wrapper that registers the Playwright fetcher."""

    def register_content_fetcher(self):
        """Return the ``(name, fetcher class)`` pair for the Playwright fetcher."""
        fetcher_name = 'html_webdriver'
        return (fetcher_name, fetcher)


# Module-level instance used for plugin registration
playwright_plugin = PlaywrightFetcherPlugin()
|
|
|
|
|
|
|