mirror of
				https://github.com/dgtlmoon/changedetection.io.git
				synced 2025-10-31 14:47:21 +00:00 
			
		
		
		
	Compare commits
	
		
			3 Commits
		
	
	
		
			0.50.14
			...
			puppeteer-
		
	
	| Author | SHA1 | Date | |
|---|---|---|---|
|   | 78bc6ae0d3 | ||
|   | c07ab75837 | ||
|   | 0c7689fbd5 | 
| @@ -7,13 +7,13 @@ import os | ||||
| # Visual Selector scraper - 'Button' is there because some sites have <button>OUT OF STOCK</button>. | ||||
| visualselector_xpath_selectors = 'div,span,form,table,tbody,tr,td,a,p,ul,li,h1,h2,h3,h4,header,footer,section,article,aside,details,main,nav,section,summary,button' | ||||
|  | ||||
| SCREENSHOT_MAX_HEIGHT_DEFAULT = 16000 | ||||
| SCREENSHOT_MAX_HEIGHT_DEFAULT = 20000 | ||||
| SCREENSHOT_DEFAULT_QUALITY = 40 | ||||
|  | ||||
| # Maximum total height for the final image (When in stitch mode). | ||||
| # We limit this to 16000px due to the huge amount of RAM that was being used | ||||
| # Example: 16000 × 1400 × 3 = 67,200,000 bytes ≈ 64.1 MB (not including buffers in PIL etc) | ||||
| MAX_TOTAL_HEIGHT = int(os.getenv("SCREENSHOT_MAX_HEIGHT", SCREENSHOT_MAX_HEIGHT_DEFAULT)) | ||||
| SCREENSHOT_MAX_TOTAL_HEIGHT = int(os.getenv("SCREENSHOT_MAX_HEIGHT", SCREENSHOT_MAX_HEIGHT_DEFAULT)) | ||||
|  | ||||
| # The size at which we will switch to stitching method, when below this (and | ||||
| # MAX_TOTAL_HEIGHT which can be set by a user) we will use the default | ||||
|   | ||||
| @@ -5,13 +5,10 @@ from urllib.parse import urlparse | ||||
| from loguru import logger | ||||
|  | ||||
| from changedetectionio.content_fetchers import SCREENSHOT_MAX_HEIGHT_DEFAULT, visualselector_xpath_selectors, \ | ||||
|     SCREENSHOT_SIZE_STITCH_THRESHOLD, MAX_TOTAL_HEIGHT, SCREENSHOT_DEFAULT_QUALITY, XPATH_ELEMENT_JS, INSTOCK_DATA_JS | ||||
| from changedetectionio.content_fetchers.screenshot_handler import stitch_images_worker | ||||
|     SCREENSHOT_SIZE_STITCH_THRESHOLD, SCREENSHOT_MAX_TOTAL_HEIGHT, XPATH_ELEMENT_JS, INSTOCK_DATA_JS | ||||
| from changedetectionio.content_fetchers.base import Fetcher, manage_user_agent | ||||
| from changedetectionio.content_fetchers.exceptions import PageUnloadable, Non200ErrorCodeReceived, EmptyReply, ScreenshotUnavailable | ||||
|  | ||||
|  | ||||
|  | ||||
| def capture_full_page(page): | ||||
|     import os | ||||
|     import time | ||||
| @@ -20,84 +17,56 @@ def capture_full_page(page): | ||||
|     start = time.time() | ||||
|  | ||||
|     page_height = page.evaluate("document.documentElement.scrollHeight") | ||||
|     page_width = page.evaluate("document.documentElement.scrollWidth") | ||||
|     original_viewport = page.viewport_size | ||||
|  | ||||
|     logger.debug(f"Playwright viewport size {page.viewport_size}") | ||||
|     logger.debug(f"Playwright viewport size {page.viewport_size} page height {page_height} page width {page_width}") | ||||
|  | ||||
|     ############################################################ | ||||
|     #### SCREENSHOT FITS INTO ONE SNAPSHOT (SMALLER PAGES) ##### | ||||
|     ############################################################ | ||||
|     # Use an approach similar to puppeteer: set a larger viewport and take screenshots in chunks | ||||
|     step_size = SCREENSHOT_SIZE_STITCH_THRESHOLD # Size that won't cause GPU to overflow | ||||
|     screenshot_chunks = [] | ||||
|     y = 0 | ||||
|      | ||||
|     # If page height is larger than current viewport, use a larger viewport for better capturing | ||||
|     if page_height > page.viewport_size['height']: | ||||
|         # Set viewport to a larger size to capture more content at once | ||||
|         page.set_viewport_size({'width': page.viewport_size['width'], 'height': step_size}) | ||||
|  | ||||
|     # Optimization to avoid unnecessary stitching if we can avoid it | ||||
|     # Use the default screenshot method for smaller pages to take advantage | ||||
|     # of GPU and native playwright screenshot optimizations | ||||
|     # - No PIL needed here, no danger of memory leaks, no sub process required | ||||
|     if (page_height < SCREENSHOT_SIZE_STITCH_THRESHOLD and page_height < MAX_TOTAL_HEIGHT ): | ||||
|         logger.debug("Using default screenshot method") | ||||
|     # Capture screenshots in chunks up to the max total height | ||||
|     while y < min(page_height, SCREENSHOT_MAX_TOTAL_HEIGHT): | ||||
|         page.request_gc() | ||||
|         screenshot = page.screenshot( | ||||
|         page.evaluate(f"window.scrollTo(0, {y})") | ||||
|         page.request_gc() | ||||
|         screenshot_chunks.append(page.screenshot( | ||||
|             type="jpeg", | ||||
|             quality=int(os.getenv("SCREENSHOT_QUALITY", SCREENSHOT_DEFAULT_QUALITY)), | ||||
|             full_page=True, | ||||
|         ) | ||||
|             full_page=False, | ||||
|             quality=int(os.getenv("SCREENSHOT_QUALITY", 72)) | ||||
|         )) | ||||
|         y += step_size | ||||
|         page.request_gc() | ||||
|         logger.debug(f"Screenshot captured in {time.time() - start:.2f}s") | ||||
|  | ||||
|     # Restore original viewport size | ||||
|     page.set_viewport_size({'width': original_viewport['width'], 'height': original_viewport['height']}) | ||||
|  | ||||
|     # If we have multiple chunks, stitch them together | ||||
|     if len(screenshot_chunks) > 1: | ||||
|         from changedetectionio.content_fetchers.screenshot_handler import stitch_images_worker | ||||
|         logger.debug(f"Screenshot stitching {len(screenshot_chunks)} chunks together") | ||||
|         parent_conn, child_conn = Pipe() | ||||
|         p = Process(target=stitch_images_worker, args=(child_conn, screenshot_chunks, page_height, SCREENSHOT_MAX_TOTAL_HEIGHT)) | ||||
|         p.start() | ||||
|         screenshot = parent_conn.recv_bytes() | ||||
|         p.join() | ||||
|         logger.debug( | ||||
|             f"Screenshot (chunked/stitched) - Page height: {page_height} Capture height: {SCREENSHOT_MAX_TOTAL_HEIGHT} - Stitched together in {time.time() - start:.2f}s") | ||||
|  | ||||
|         screenshot_chunks = None | ||||
|         return screenshot | ||||
|  | ||||
|  | ||||
|  | ||||
|     ################################################################################### | ||||
|     #### CASE FOR LARGE SCREENSHOTS THAT NEED TO BE TRIMMED DUE TO MEMORY ISSUES  ##### | ||||
|     ################################################################################### | ||||
|     # - PIL can easily allocate memory and not release it cleanly | ||||
|     # - Fetching screenshot from playwright seems  OK | ||||
|     # Image.new is leaky even with .close() | ||||
|     # So lets prepare all the data chunks and farm it out to a subprocess for clean memory handling | ||||
|  | ||||
|     logger.debug( | ||||
|         "Using stitching method for large screenshot because page height exceeds threshold" | ||||
|     ) | ||||
|         f"Screenshot Page height: {page_height} Capture height: {SCREENSHOT_MAX_TOTAL_HEIGHT} - Stitched together in {time.time() - start:.2f}s") | ||||
|  | ||||
|     # Limit the total capture height | ||||
|     capture_height = min(page_height, MAX_TOTAL_HEIGHT) | ||||
|  | ||||
|     # Calculate number of chunks needed using ORIGINAL viewport height | ||||
|     num_chunks = (capture_height + page.viewport_size['height'] - 1) // page.viewport_size['height'] | ||||
|     screenshot_chunks = [] | ||||
|  | ||||
|     # Track cumulative paste position | ||||
|     y_offset = 0 | ||||
|     for _ in range(num_chunks): | ||||
|  | ||||
|         page.request_gc() | ||||
|         page.evaluate(f"window.scrollTo(0, {y_offset})") | ||||
|         page.request_gc() | ||||
|         h = min(page.viewport_size['height'], capture_height - y_offset) | ||||
|         screenshot_chunks.append(page.screenshot( | ||||
|                 type="jpeg", | ||||
|                 clip={ | ||||
|                     "x": 0, | ||||
|                     "y": 0, | ||||
|                     "width": page.viewport_size['width'], | ||||
|                     "height": h, | ||||
|                 }, | ||||
|                 quality=int(os.getenv("SCREENSHOT_QUALITY", SCREENSHOT_DEFAULT_QUALITY)), | ||||
|             )) | ||||
|  | ||||
|         y_offset += h # maybe better to inspect the image here? | ||||
|         page.request_gc() | ||||
|  | ||||
|     # PIL can leak memory in various situations, assign the work to a subprocess for totally clean handling | ||||
|  | ||||
|     parent_conn, child_conn = Pipe() | ||||
|     p = Process(target=stitch_images_worker, args=(child_conn, screenshot_chunks, page_height, capture_height)) | ||||
|     p.start() | ||||
|     result = parent_conn.recv_bytes() | ||||
|     p.join() | ||||
|  | ||||
|     screenshot_chunks = None | ||||
|     logger.debug(f"Screenshot - Page height: {page_height} Capture height: {capture_height} - Stitched together in {time.time() - start:.2f}s") | ||||
|  | ||||
|     return result | ||||
|     return screenshot_chunks[0] | ||||
|  | ||||
|  | ||||
| class fetcher(Fetcher): | ||||
| @@ -292,6 +261,7 @@ class fetcher(Fetcher): | ||||
|             self.page.request_gc() | ||||
|  | ||||
|             self.content = self.page.content() | ||||
|             self.page.request_gc() | ||||
|             logger.debug(f"Scrape xPath element data in browser done in {time.time() - now:.2f}s") | ||||
|  | ||||
|             # Bug 3 in Playwright screenshot handling | ||||
| @@ -317,4 +287,11 @@ class fetcher(Fetcher): | ||||
|                  | ||||
|                 # Clean up resources properly | ||||
|                 context.close() | ||||
|                 context = None | ||||
|  | ||||
|                 self.page.close() | ||||
|                 self.page = None | ||||
|  | ||||
|                 browser.close() | ||||
|                 borwser = None | ||||
|  | ||||
|   | ||||
| @@ -7,10 +7,11 @@ from urllib.parse import urlparse | ||||
| from loguru import logger | ||||
|  | ||||
| from changedetectionio.content_fetchers import SCREENSHOT_MAX_HEIGHT_DEFAULT, visualselector_xpath_selectors, \ | ||||
|     SCREENSHOT_SIZE_STITCH_THRESHOLD, MAX_TOTAL_HEIGHT, SCREENSHOT_DEFAULT_QUALITY, XPATH_ELEMENT_JS, INSTOCK_DATA_JS | ||||
|     SCREENSHOT_SIZE_STITCH_THRESHOLD, SCREENSHOT_DEFAULT_QUALITY, XPATH_ELEMENT_JS, INSTOCK_DATA_JS, \ | ||||
|     SCREENSHOT_MAX_TOTAL_HEIGHT | ||||
| from changedetectionio.content_fetchers.base import Fetcher, manage_user_agent | ||||
| from changedetectionio.content_fetchers.exceptions import PageUnloadable, Non200ErrorCodeReceived, EmptyReply, BrowserFetchTimedOut, BrowserConnectError | ||||
| from changedetectionio.content_fetchers.screenshot_handler import stitch_images_worker | ||||
| from changedetectionio.content_fetchers.exceptions import PageUnloadable, Non200ErrorCodeReceived, EmptyReply, BrowserFetchTimedOut, \ | ||||
|     BrowserConnectError | ||||
|  | ||||
|  | ||||
| # Bug 3 in Playwright screenshot handling | ||||
| @@ -27,71 +28,53 @@ async def capture_full_page(page): | ||||
|     start = time.time() | ||||
|  | ||||
|     page_height = await page.evaluate("document.documentElement.scrollHeight") | ||||
|     page_width = await page.evaluate("document.documentElement.scrollWidth") | ||||
|     original_viewport = page.viewport | ||||
|  | ||||
|     logger.debug(f"Puppeteer viewport size {page.viewport}") | ||||
|     logger.debug(f"Puppeteer viewport size {page.viewport} page height {page_height} page width {page_width}") | ||||
|  | ||||
|     ############################################################ | ||||
|     #### SCREENSHOT FITS INTO ONE SNAPSHOT (SMALLER PAGES) ##### | ||||
|     ############################################################ | ||||
|     # Bug 3 in Playwright screenshot handling | ||||
|     # Some bug where it gives the wrong screenshot size, but making a request with the clip set first seems to solve it | ||||
|     # JPEG is better here because the screenshots can be very very large | ||||
|  | ||||
|     # Optimization to avoid unnecessary stitching if we can avoid it | ||||
|     # Use the default screenshot method for smaller pages to take advantage | ||||
|     # of GPU and native playwright screenshot optimizations | ||||
|     # - No PIL needed here, no danger of memory leaks, no sub process required | ||||
|     if (page_height < SCREENSHOT_SIZE_STITCH_THRESHOLD and page_height < MAX_TOTAL_HEIGHT ): | ||||
|         logger.debug("Using default screenshot method") | ||||
|         await page.evaluate(f"window.scrollTo(0, 0)") | ||||
|         screenshot = await page.screenshot( | ||||
|             type_="jpeg", | ||||
|             quality=int(os.getenv("SCREENSHOT_QUALITY", SCREENSHOT_DEFAULT_QUALITY)), | ||||
|             fullPage=True, | ||||
|         ) | ||||
|         logger.debug(f"Screenshot captured in {time.time() - start:.2f}s") | ||||
|     # Screenshots also travel via the ws:// (websocket) meaning that the binary data is base64 encoded | ||||
|     # which will significantly increase the IO size between the server and client, it's recommended to use the lowest | ||||
|     # acceptable screenshot quality here | ||||
|  | ||||
|  | ||||
|     step_size = SCREENSHOT_SIZE_STITCH_THRESHOLD # Something that will not cause the GPU to overflow when taking the screenshot | ||||
|     screenshot_chunks = [] | ||||
|     y = 0 | ||||
|     if page_height > page.viewport['height']: | ||||
|         await page.setViewport({'width': page.viewport['width'], 'height': step_size}) | ||||
|  | ||||
|  | ||||
|     while y < min(page_height, SCREENSHOT_MAX_TOTAL_HEIGHT): | ||||
|         await page.evaluate(f"window.scrollTo(0, {y})") | ||||
|         screenshot_chunks.append(await page.screenshot(type_='jpeg', | ||||
|                                                        fullPage=False, | ||||
|                                                        quality=int(os.getenv("SCREENSHOT_QUALITY", 72)))) | ||||
|         y += step_size | ||||
|  | ||||
|     await page.setViewport({'width': original_viewport['width'], 'height': original_viewport['height']}) | ||||
|  | ||||
|     if len(screenshot_chunks) > 1: | ||||
|         from changedetectionio.content_fetchers.screenshot_handler import stitch_images_worker | ||||
|         logger.debug(f"Screenshot stitching {len(screenshot_chunks)} chunks together") | ||||
|         parent_conn, child_conn = Pipe() | ||||
|         p = Process(target=stitch_images_worker, args=(child_conn, screenshot_chunks, page_height, SCREENSHOT_MAX_TOTAL_HEIGHT)) | ||||
|         p.start() | ||||
|         screenshot = parent_conn.recv_bytes() | ||||
|         p.join() | ||||
|         logger.debug( | ||||
|             f"Screenshot (chunked/stitched) - Page height: {page_height} Capture height: {SCREENSHOT_MAX_TOTAL_HEIGHT} - Stitched together in {time.time() - start:.2f}s") | ||||
|  | ||||
|         screenshot_chunks = None | ||||
|         return screenshot | ||||
|  | ||||
|     ################################################################################### | ||||
|     #### CASE FOR LARGE SCREENSHOTS THAT NEED TO BE TRIMMED DUE TO MEMORY ISSUES  ##### | ||||
|     ################################################################################### | ||||
|     # - PIL can easily allocate memory and not release it cleanly | ||||
|     # - Fetching screenshot from playwright seems  OK | ||||
|     # Image.new is leaky even with .close() | ||||
|     # So lets prepare all the data chunks and farm it out to a subprocess for clean memory handling | ||||
|  | ||||
|     logger.debug( | ||||
|         "Using stitching method for large screenshot because page height exceeds threshold" | ||||
|     ) | ||||
|  | ||||
|     # Limit the total capture height | ||||
|     capture_height = min(page_height, MAX_TOTAL_HEIGHT) | ||||
|  | ||||
|     # Calculate number of chunks needed using ORIGINAL viewport height | ||||
|     num_chunks = (capture_height + page.viewport['height'] - 1) // page.viewport['height'] | ||||
|     screenshot_chunks = [] | ||||
|  | ||||
|     # Track cumulative paste position | ||||
|     y_offset = 0 | ||||
|     for _ in range(num_chunks): | ||||
|         await page.evaluate(f"window.scrollTo(0, {y_offset})") | ||||
|         h = min(page.viewport['height'], capture_height - y_offset) | ||||
|         screenshot_chunks.append(await page.screenshot( | ||||
|                 type_="jpeg", | ||||
|                 quality=int(os.getenv("SCREENSHOT_QUALITY", SCREENSHOT_DEFAULT_QUALITY)), | ||||
|             )) | ||||
|  | ||||
|         y_offset += h # maybe better to inspect the image here? | ||||
|  | ||||
|     # PIL can leak memory in various situations, assign the work to a subprocess for totally clean handling | ||||
|  | ||||
|     parent_conn, child_conn = Pipe() | ||||
|     p = Process(target=stitch_images_worker, args=(child_conn, screenshot_chunks, page_height, capture_height)) | ||||
|     p.start() | ||||
|     result = parent_conn.recv_bytes() | ||||
|     p.join() | ||||
|  | ||||
|     screenshot_chunks = None | ||||
|     logger.debug(f"Screenshot - Page height: {page_height} Capture height: {capture_height} - Stitched together in {time.time() - start:.2f}s") | ||||
|  | ||||
|     return result | ||||
|         f"Screenshot Page height: {page_height} Capture height: {SCREENSHOT_MAX_TOTAL_HEIGHT} - Stitched together in {time.time() - start:.2f}s") | ||||
|     return screenshot_chunks[0] | ||||
|  | ||||
|  | ||||
| class fetcher(Fetcher): | ||||
|   | ||||
		Reference in New Issue
	
	Block a user