mirror of
				https://github.com/dgtlmoon/changedetection.io.git
				synced 2025-11-04 08:34:57 +00:00 
			
		
		
		
	Compare commits
	
		
			3 Commits
		
	
	
		
			update-sto
			...
			puppeteer-
		
	
	| Author | SHA1 | Date | |
|---|---|---|---|
| 
						 | 
					78bc6ae0d3 | ||
| 
						 | 
					c07ab75837 | ||
| 
						 | 
					0c7689fbd5 | 
@@ -7,13 +7,13 @@ import os
 | 
			
		||||
# Visual Selector scraper - 'Button' is there because some sites have <button>OUT OF STOCK</button>.
 | 
			
		||||
visualselector_xpath_selectors = 'div,span,form,table,tbody,tr,td,a,p,ul,li,h1,h2,h3,h4,header,footer,section,article,aside,details,main,nav,section,summary,button'
 | 
			
		||||
 | 
			
		||||
SCREENSHOT_MAX_HEIGHT_DEFAULT = 16000
 | 
			
		||||
SCREENSHOT_MAX_HEIGHT_DEFAULT = 20000
 | 
			
		||||
SCREENSHOT_DEFAULT_QUALITY = 40
 | 
			
		||||
 | 
			
		||||
# Maximum total height for the final image (When in stitch mode).
 | 
			
		||||
# We limit this to 16000px due to the huge amount of RAM that was being used
 | 
			
		||||
# Example: 16000 × 1400 × 3 = 67,200,000 bytes ≈ 64.1 MB (not including buffers in PIL etc)
 | 
			
		||||
MAX_TOTAL_HEIGHT = int(os.getenv("SCREENSHOT_MAX_HEIGHT", SCREENSHOT_MAX_HEIGHT_DEFAULT))
 | 
			
		||||
SCREENSHOT_MAX_TOTAL_HEIGHT = int(os.getenv("SCREENSHOT_MAX_HEIGHT", SCREENSHOT_MAX_HEIGHT_DEFAULT))
 | 
			
		||||
 | 
			
		||||
# The size at which we will switch to stitching method, when below this (and
 | 
			
		||||
# MAX_TOTAL_HEIGHT which can be set by a user) we will use the default
 | 
			
		||||
 
 | 
			
		||||
@@ -5,13 +5,10 @@ from urllib.parse import urlparse
 | 
			
		||||
from loguru import logger
 | 
			
		||||
 | 
			
		||||
from changedetectionio.content_fetchers import SCREENSHOT_MAX_HEIGHT_DEFAULT, visualselector_xpath_selectors, \
 | 
			
		||||
    SCREENSHOT_SIZE_STITCH_THRESHOLD, MAX_TOTAL_HEIGHT, SCREENSHOT_DEFAULT_QUALITY, XPATH_ELEMENT_JS, INSTOCK_DATA_JS
 | 
			
		||||
from changedetectionio.content_fetchers.screenshot_handler import stitch_images_worker
 | 
			
		||||
    SCREENSHOT_SIZE_STITCH_THRESHOLD, SCREENSHOT_MAX_TOTAL_HEIGHT, XPATH_ELEMENT_JS, INSTOCK_DATA_JS
 | 
			
		||||
from changedetectionio.content_fetchers.base import Fetcher, manage_user_agent
 | 
			
		||||
from changedetectionio.content_fetchers.exceptions import PageUnloadable, Non200ErrorCodeReceived, EmptyReply, ScreenshotUnavailable
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def capture_full_page(page):
 | 
			
		||||
    import os
 | 
			
		||||
    import time
 | 
			
		||||
@@ -20,84 +17,56 @@ def capture_full_page(page):
 | 
			
		||||
    start = time.time()
 | 
			
		||||
 | 
			
		||||
    page_height = page.evaluate("document.documentElement.scrollHeight")
 | 
			
		||||
    page_width = page.evaluate("document.documentElement.scrollWidth")
 | 
			
		||||
    original_viewport = page.viewport_size
 | 
			
		||||
 | 
			
		||||
    logger.debug(f"Playwright viewport size {page.viewport_size}")
 | 
			
		||||
    logger.debug(f"Playwright viewport size {page.viewport_size} page height {page_height} page width {page_width}")
 | 
			
		||||
 | 
			
		||||
    ############################################################
 | 
			
		||||
    #### SCREENSHOT FITS INTO ONE SNAPSHOT (SMALLER PAGES) #####
 | 
			
		||||
    ############################################################
 | 
			
		||||
    # Use an approach similar to puppeteer: set a larger viewport and take screenshots in chunks
 | 
			
		||||
    step_size = SCREENSHOT_SIZE_STITCH_THRESHOLD # Size that won't cause GPU to overflow
 | 
			
		||||
    screenshot_chunks = []
 | 
			
		||||
    y = 0
 | 
			
		||||
    
 | 
			
		||||
    # If page height is larger than current viewport, use a larger viewport for better capturing
 | 
			
		||||
    if page_height > page.viewport_size['height']:
 | 
			
		||||
        # Set viewport to a larger size to capture more content at once
 | 
			
		||||
        page.set_viewport_size({'width': page.viewport_size['width'], 'height': step_size})
 | 
			
		||||
 | 
			
		||||
    # Optimization to avoid unnecessary stitching if we can avoid it
 | 
			
		||||
    # Use the default screenshot method for smaller pages to take advantage
 | 
			
		||||
    # of GPU and native playwright screenshot optimizations
 | 
			
		||||
    # - No PIL needed here, no danger of memory leaks, no sub process required
 | 
			
		||||
    if (page_height < SCREENSHOT_SIZE_STITCH_THRESHOLD and page_height < MAX_TOTAL_HEIGHT ):
 | 
			
		||||
        logger.debug("Using default screenshot method")
 | 
			
		||||
    # Capture screenshots in chunks up to the max total height
 | 
			
		||||
    while y < min(page_height, SCREENSHOT_MAX_TOTAL_HEIGHT):
 | 
			
		||||
        page.request_gc()
 | 
			
		||||
        screenshot = page.screenshot(
 | 
			
		||||
        page.evaluate(f"window.scrollTo(0, {y})")
 | 
			
		||||
        page.request_gc()
 | 
			
		||||
        screenshot_chunks.append(page.screenshot(
 | 
			
		||||
            type="jpeg",
 | 
			
		||||
            quality=int(os.getenv("SCREENSHOT_QUALITY", SCREENSHOT_DEFAULT_QUALITY)),
 | 
			
		||||
            full_page=True,
 | 
			
		||||
        )
 | 
			
		||||
            full_page=False,
 | 
			
		||||
            quality=int(os.getenv("SCREENSHOT_QUALITY", 72))
 | 
			
		||||
        ))
 | 
			
		||||
        y += step_size
 | 
			
		||||
        page.request_gc()
 | 
			
		||||
        logger.debug(f"Screenshot captured in {time.time() - start:.2f}s")
 | 
			
		||||
 | 
			
		||||
    # Restore original viewport size
 | 
			
		||||
    page.set_viewport_size({'width': original_viewport['width'], 'height': original_viewport['height']})
 | 
			
		||||
 | 
			
		||||
    # If we have multiple chunks, stitch them together
 | 
			
		||||
    if len(screenshot_chunks) > 1:
 | 
			
		||||
        from changedetectionio.content_fetchers.screenshot_handler import stitch_images_worker
 | 
			
		||||
        logger.debug(f"Screenshot stitching {len(screenshot_chunks)} chunks together")
 | 
			
		||||
        parent_conn, child_conn = Pipe()
 | 
			
		||||
        p = Process(target=stitch_images_worker, args=(child_conn, screenshot_chunks, page_height, SCREENSHOT_MAX_TOTAL_HEIGHT))
 | 
			
		||||
        p.start()
 | 
			
		||||
        screenshot = parent_conn.recv_bytes()
 | 
			
		||||
        p.join()
 | 
			
		||||
        logger.debug(
 | 
			
		||||
            f"Screenshot (chunked/stitched) - Page height: {page_height} Capture height: {SCREENSHOT_MAX_TOTAL_HEIGHT} - Stitched together in {time.time() - start:.2f}s")
 | 
			
		||||
 | 
			
		||||
        screenshot_chunks = None
 | 
			
		||||
        return screenshot
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
    ###################################################################################
 | 
			
		||||
    #### CASE FOR LARGE SCREENSHOTS THAT NEED TO BE TRIMMED DUE TO MEMORY ISSUES  #####
 | 
			
		||||
    ###################################################################################
 | 
			
		||||
    # - PIL can easily allocate memory and not release it cleanly
 | 
			
		||||
    # - Fetching screenshot from playwright seems  OK
 | 
			
		||||
    # Image.new is leaky even with .close()
 | 
			
		||||
    # So lets prepare all the data chunks and farm it out to a subprocess for clean memory handling
 | 
			
		||||
 | 
			
		||||
    logger.debug(
 | 
			
		||||
        "Using stitching method for large screenshot because page height exceeds threshold"
 | 
			
		||||
    )
 | 
			
		||||
        f"Screenshot Page height: {page_height} Capture height: {SCREENSHOT_MAX_TOTAL_HEIGHT} - Stitched together in {time.time() - start:.2f}s")
 | 
			
		||||
 | 
			
		||||
    # Limit the total capture height
 | 
			
		||||
    capture_height = min(page_height, MAX_TOTAL_HEIGHT)
 | 
			
		||||
 | 
			
		||||
    # Calculate number of chunks needed using ORIGINAL viewport height
 | 
			
		||||
    num_chunks = (capture_height + page.viewport_size['height'] - 1) // page.viewport_size['height']
 | 
			
		||||
    screenshot_chunks = []
 | 
			
		||||
 | 
			
		||||
    # Track cumulative paste position
 | 
			
		||||
    y_offset = 0
 | 
			
		||||
    for _ in range(num_chunks):
 | 
			
		||||
 | 
			
		||||
        page.request_gc()
 | 
			
		||||
        page.evaluate(f"window.scrollTo(0, {y_offset})")
 | 
			
		||||
        page.request_gc()
 | 
			
		||||
        h = min(page.viewport_size['height'], capture_height - y_offset)
 | 
			
		||||
        screenshot_chunks.append(page.screenshot(
 | 
			
		||||
                type="jpeg",
 | 
			
		||||
                clip={
 | 
			
		||||
                    "x": 0,
 | 
			
		||||
                    "y": 0,
 | 
			
		||||
                    "width": page.viewport_size['width'],
 | 
			
		||||
                    "height": h,
 | 
			
		||||
                },
 | 
			
		||||
                quality=int(os.getenv("SCREENSHOT_QUALITY", SCREENSHOT_DEFAULT_QUALITY)),
 | 
			
		||||
            ))
 | 
			
		||||
 | 
			
		||||
        y_offset += h # maybe better to inspect the image here?
 | 
			
		||||
        page.request_gc()
 | 
			
		||||
 | 
			
		||||
    # PIL can leak memory in various situations, assign the work to a subprocess for totally clean handling
 | 
			
		||||
 | 
			
		||||
    parent_conn, child_conn = Pipe()
 | 
			
		||||
    p = Process(target=stitch_images_worker, args=(child_conn, screenshot_chunks, page_height, capture_height))
 | 
			
		||||
    p.start()
 | 
			
		||||
    result = parent_conn.recv_bytes()
 | 
			
		||||
    p.join()
 | 
			
		||||
 | 
			
		||||
    screenshot_chunks = None
 | 
			
		||||
    logger.debug(f"Screenshot - Page height: {page_height} Capture height: {capture_height} - Stitched together in {time.time() - start:.2f}s")
 | 
			
		||||
 | 
			
		||||
    return result
 | 
			
		||||
    return screenshot_chunks[0]
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
class fetcher(Fetcher):
 | 
			
		||||
@@ -292,6 +261,7 @@ class fetcher(Fetcher):
 | 
			
		||||
            self.page.request_gc()
 | 
			
		||||
 | 
			
		||||
            self.content = self.page.content()
 | 
			
		||||
            self.page.request_gc()
 | 
			
		||||
            logger.debug(f"Scrape xPath element data in browser done in {time.time() - now:.2f}s")
 | 
			
		||||
 | 
			
		||||
            # Bug 3 in Playwright screenshot handling
 | 
			
		||||
@@ -317,4 +287,11 @@ class fetcher(Fetcher):
 | 
			
		||||
                
 | 
			
		||||
                # Clean up resources properly
 | 
			
		||||
                context.close()
 | 
			
		||||
                context = None
 | 
			
		||||
 | 
			
		||||
                self.page.close()
 | 
			
		||||
                self.page = None
 | 
			
		||||
 | 
			
		||||
                browser.close()
 | 
			
		||||
                borwser = None
 | 
			
		||||
 | 
			
		||||
 
 | 
			
		||||
@@ -7,10 +7,11 @@ from urllib.parse import urlparse
 | 
			
		||||
from loguru import logger
 | 
			
		||||
 | 
			
		||||
from changedetectionio.content_fetchers import SCREENSHOT_MAX_HEIGHT_DEFAULT, visualselector_xpath_selectors, \
 | 
			
		||||
    SCREENSHOT_SIZE_STITCH_THRESHOLD, MAX_TOTAL_HEIGHT, SCREENSHOT_DEFAULT_QUALITY, XPATH_ELEMENT_JS, INSTOCK_DATA_JS
 | 
			
		||||
    SCREENSHOT_SIZE_STITCH_THRESHOLD, SCREENSHOT_DEFAULT_QUALITY, XPATH_ELEMENT_JS, INSTOCK_DATA_JS, \
 | 
			
		||||
    SCREENSHOT_MAX_TOTAL_HEIGHT
 | 
			
		||||
from changedetectionio.content_fetchers.base import Fetcher, manage_user_agent
 | 
			
		||||
from changedetectionio.content_fetchers.exceptions import PageUnloadable, Non200ErrorCodeReceived, EmptyReply, BrowserFetchTimedOut, BrowserConnectError
 | 
			
		||||
from changedetectionio.content_fetchers.screenshot_handler import stitch_images_worker
 | 
			
		||||
from changedetectionio.content_fetchers.exceptions import PageUnloadable, Non200ErrorCodeReceived, EmptyReply, BrowserFetchTimedOut, \
 | 
			
		||||
    BrowserConnectError
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# Bug 3 in Playwright screenshot handling
 | 
			
		||||
@@ -27,71 +28,53 @@ async def capture_full_page(page):
 | 
			
		||||
    start = time.time()
 | 
			
		||||
 | 
			
		||||
    page_height = await page.evaluate("document.documentElement.scrollHeight")
 | 
			
		||||
    page_width = await page.evaluate("document.documentElement.scrollWidth")
 | 
			
		||||
    original_viewport = page.viewport
 | 
			
		||||
 | 
			
		||||
    logger.debug(f"Puppeteer viewport size {page.viewport}")
 | 
			
		||||
    logger.debug(f"Puppeteer viewport size {page.viewport} page height {page_height} page width {page_width}")
 | 
			
		||||
 | 
			
		||||
    ############################################################
 | 
			
		||||
    #### SCREENSHOT FITS INTO ONE SNAPSHOT (SMALLER PAGES) #####
 | 
			
		||||
    ############################################################
 | 
			
		||||
    # Bug 3 in Playwright screenshot handling
 | 
			
		||||
    # Some bug where it gives the wrong screenshot size, but making a request with the clip set first seems to solve it
 | 
			
		||||
    # JPEG is better here because the screenshots can be very very large
 | 
			
		||||
 | 
			
		||||
    # Optimization to avoid unnecessary stitching if we can avoid it
 | 
			
		||||
    # Use the default screenshot method for smaller pages to take advantage
 | 
			
		||||
    # of GPU and native playwright screenshot optimizations
 | 
			
		||||
    # - No PIL needed here, no danger of memory leaks, no sub process required
 | 
			
		||||
    if (page_height < SCREENSHOT_SIZE_STITCH_THRESHOLD and page_height < MAX_TOTAL_HEIGHT ):
 | 
			
		||||
        logger.debug("Using default screenshot method")
 | 
			
		||||
        await page.evaluate(f"window.scrollTo(0, 0)")
 | 
			
		||||
        screenshot = await page.screenshot(
 | 
			
		||||
            type_="jpeg",
 | 
			
		||||
            quality=int(os.getenv("SCREENSHOT_QUALITY", SCREENSHOT_DEFAULT_QUALITY)),
 | 
			
		||||
            fullPage=True,
 | 
			
		||||
        )
 | 
			
		||||
        logger.debug(f"Screenshot captured in {time.time() - start:.2f}s")
 | 
			
		||||
    # Screenshots also travel via the ws:// (websocket) meaning that the binary data is base64 encoded
 | 
			
		||||
    # which will significantly increase the IO size between the server and client, it's recommended to use the lowest
 | 
			
		||||
    # acceptable screenshot quality here
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
    step_size = SCREENSHOT_SIZE_STITCH_THRESHOLD # Something that will not cause the GPU to overflow when taking the screenshot
 | 
			
		||||
    screenshot_chunks = []
 | 
			
		||||
    y = 0
 | 
			
		||||
    if page_height > page.viewport['height']:
 | 
			
		||||
        await page.setViewport({'width': page.viewport['width'], 'height': step_size})
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
    while y < min(page_height, SCREENSHOT_MAX_TOTAL_HEIGHT):
 | 
			
		||||
        await page.evaluate(f"window.scrollTo(0, {y})")
 | 
			
		||||
        screenshot_chunks.append(await page.screenshot(type_='jpeg',
 | 
			
		||||
                                                       fullPage=False,
 | 
			
		||||
                                                       quality=int(os.getenv("SCREENSHOT_QUALITY", 72))))
 | 
			
		||||
        y += step_size
 | 
			
		||||
 | 
			
		||||
    await page.setViewport({'width': original_viewport['width'], 'height': original_viewport['height']})
 | 
			
		||||
 | 
			
		||||
    if len(screenshot_chunks) > 1:
 | 
			
		||||
        from changedetectionio.content_fetchers.screenshot_handler import stitch_images_worker
 | 
			
		||||
        logger.debug(f"Screenshot stitching {len(screenshot_chunks)} chunks together")
 | 
			
		||||
        parent_conn, child_conn = Pipe()
 | 
			
		||||
        p = Process(target=stitch_images_worker, args=(child_conn, screenshot_chunks, page_height, SCREENSHOT_MAX_TOTAL_HEIGHT))
 | 
			
		||||
        p.start()
 | 
			
		||||
        screenshot = parent_conn.recv_bytes()
 | 
			
		||||
        p.join()
 | 
			
		||||
        logger.debug(
 | 
			
		||||
            f"Screenshot (chunked/stitched) - Page height: {page_height} Capture height: {SCREENSHOT_MAX_TOTAL_HEIGHT} - Stitched together in {time.time() - start:.2f}s")
 | 
			
		||||
 | 
			
		||||
        screenshot_chunks = None
 | 
			
		||||
        return screenshot
 | 
			
		||||
 | 
			
		||||
    ###################################################################################
 | 
			
		||||
    #### CASE FOR LARGE SCREENSHOTS THAT NEED TO BE TRIMMED DUE TO MEMORY ISSUES  #####
 | 
			
		||||
    ###################################################################################
 | 
			
		||||
    # - PIL can easily allocate memory and not release it cleanly
 | 
			
		||||
    # - Fetching screenshot from playwright seems  OK
 | 
			
		||||
    # Image.new is leaky even with .close()
 | 
			
		||||
    # So lets prepare all the data chunks and farm it out to a subprocess for clean memory handling
 | 
			
		||||
 | 
			
		||||
    logger.debug(
 | 
			
		||||
        "Using stitching method for large screenshot because page height exceeds threshold"
 | 
			
		||||
    )
 | 
			
		||||
 | 
			
		||||
    # Limit the total capture height
 | 
			
		||||
    capture_height = min(page_height, MAX_TOTAL_HEIGHT)
 | 
			
		||||
 | 
			
		||||
    # Calculate number of chunks needed using ORIGINAL viewport height
 | 
			
		||||
    num_chunks = (capture_height + page.viewport['height'] - 1) // page.viewport['height']
 | 
			
		||||
    screenshot_chunks = []
 | 
			
		||||
 | 
			
		||||
    # Track cumulative paste position
 | 
			
		||||
    y_offset = 0
 | 
			
		||||
    for _ in range(num_chunks):
 | 
			
		||||
        await page.evaluate(f"window.scrollTo(0, {y_offset})")
 | 
			
		||||
        h = min(page.viewport['height'], capture_height - y_offset)
 | 
			
		||||
        screenshot_chunks.append(await page.screenshot(
 | 
			
		||||
                type_="jpeg",
 | 
			
		||||
                quality=int(os.getenv("SCREENSHOT_QUALITY", SCREENSHOT_DEFAULT_QUALITY)),
 | 
			
		||||
            ))
 | 
			
		||||
 | 
			
		||||
        y_offset += h # maybe better to inspect the image here?
 | 
			
		||||
 | 
			
		||||
    # PIL can leak memory in various situations, assign the work to a subprocess for totally clean handling
 | 
			
		||||
 | 
			
		||||
    parent_conn, child_conn = Pipe()
 | 
			
		||||
    p = Process(target=stitch_images_worker, args=(child_conn, screenshot_chunks, page_height, capture_height))
 | 
			
		||||
    p.start()
 | 
			
		||||
    result = parent_conn.recv_bytes()
 | 
			
		||||
    p.join()
 | 
			
		||||
 | 
			
		||||
    screenshot_chunks = None
 | 
			
		||||
    logger.debug(f"Screenshot - Page height: {page_height} Capture height: {capture_height} - Stitched together in {time.time() - start:.2f}s")
 | 
			
		||||
 | 
			
		||||
    return result
 | 
			
		||||
        f"Screenshot Page height: {page_height} Capture height: {SCREENSHOT_MAX_TOTAL_HEIGHT} - Stitched together in {time.time() - start:.2f}s")
 | 
			
		||||
    return screenshot_chunks[0]
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
class fetcher(Fetcher):
 | 
			
		||||
 
 | 
			
		||||
		Reference in New Issue
	
	Block a user