diff --git a/changedetectionio/async_update_worker.py b/changedetectionio/async_update_worker.py index 4f7f4249..35d5f8fd 100644 --- a/changedetectionio/async_update_worker.py +++ b/changedetectionio/async_update_worker.py @@ -89,9 +89,8 @@ async def async_update_worker(worker_id, q, notification_q, app, datastore): processor = watch.get('processor', 'text_json_diff') # Init a new 'difference_detection_processor' - processor_module_name = f"changedetectionio.processors.{processor}.processor" try: - processor_module = importlib.import_module(processor_module_name) + processor_module = importlib.import_module(f"changedetectionio.processors.{processor}.processor") except ModuleNotFoundError as e: print(f"Processor module '{processor}' not found.") raise e diff --git a/changedetectionio/content_fetchers/base.py b/changedetectionio/content_fetchers/base.py index 351424fe..24be85e6 100644 --- a/changedetectionio/content_fetchers/base.py +++ b/changedetectionio/content_fetchers/base.py @@ -51,6 +51,7 @@ class Fetcher(): favicon_blob = None instock_data = None instock_data_js = "" + screenshot_format = None status_code = None webdriver_js_execute_code = None xpath_data = None @@ -70,6 +71,11 @@ class Fetcher(): supports_screenshots = False # Can capture page screenshots supports_xpath_element_data = False # Can extract xpath element positions/data for visual selector + def __init__(self, **kwargs): + if kwargs and 'screenshot_format' in kwargs: + self.screenshot_format = kwargs.get('screenshot_format') + + @classmethod def get_status_icon_data(cls): """Return data for status icon to display in the watch overview. 
diff --git a/changedetectionio/content_fetchers/playwright.py b/changedetectionio/content_fetchers/playwright.py index cd22c614..a2d44a65 100644 --- a/changedetectionio/content_fetchers/playwright.py +++ b/changedetectionio/content_fetchers/playwright.py @@ -9,7 +9,7 @@ from changedetectionio.content_fetchers import SCREENSHOT_MAX_HEIGHT_DEFAULT, vi from changedetectionio.content_fetchers.base import Fetcher, manage_user_agent from changedetectionio.content_fetchers.exceptions import PageUnloadable, Non200ErrorCodeReceived, EmptyReply, ScreenshotUnavailable -async def capture_full_page_async(page): +async def capture_full_page_async(page, screenshot_format='JPEG'): import os import time from multiprocessing import Process, Pipe @@ -35,6 +35,11 @@ async def capture_full_page_async(page): await page.set_viewport_size({'width': page.viewport_size['width'], 'height': step_size}) # Capture screenshots in chunks up to the max total height + # Use PNG for better quality (no compression artifacts), JPEG for smaller size + screenshot_type = screenshot_format.lower() if screenshot_format else 'jpeg' + # PNG should use quality 100, JPEG uses configurable quality + screenshot_quality = 100 if screenshot_type == 'png' else int(os.getenv("SCREENSHOT_QUALITY", 72)) + while y < min(page_height, SCREENSHOT_MAX_TOTAL_HEIGHT): # Only scroll if not at the top (y > 0) if y > 0: @@ -43,11 +48,15 @@ async def capture_full_page_async(page): # Request GC only before screenshot (not 3x per chunk) await page.request_gc() - screenshot_chunks.append(await page.screenshot( - type="jpeg", - full_page=False, - quality=int(os.getenv("SCREENSHOT_QUALITY", 72)) - )) + screenshot_kwargs = { + 'type': screenshot_type, + 'full_page': False + } + # Only pass quality parameter for jpeg (PNG doesn't support it in Playwright) + if screenshot_type == 'jpeg': + screenshot_kwargs['quality'] = screenshot_quality + + screenshot_chunks.append(await page.screenshot(**screenshot_kwargs)) y += step_size # Restore 
original viewport size @@ -116,8 +125,8 @@ class fetcher(Fetcher): 'title': 'Using a Chrome browser' } - def __init__(self, proxy_override=None, custom_browser_connection_url=None): - super().__init__() + def __init__(self, proxy_override=None, custom_browser_connection_url=None, **kwargs): + super().__init__(**kwargs) self.browser_type = os.getenv("PLAYWRIGHT_BROWSER_TYPE", 'chromium').strip('"') @@ -152,7 +161,7 @@ class fetcher(Fetcher): async def screenshot_step(self, step_n=''): super().screenshot_step(step_n=step_n) - screenshot = await capture_full_page_async(page=self.page) + screenshot = await capture_full_page_async(page=self.page, screenshot_format=self.screenshot_format) if self.browser_steps_screenshot_path is not None: @@ -178,6 +187,7 @@ class fetcher(Fetcher): request_body=None, request_headers=None, request_method=None, + screenshot_format=None, timeout=None, url=None, watch_uuid=None, @@ -272,7 +282,7 @@ class fetcher(Fetcher): logger.error(f"Error fetching FavIcon info {str(e)}, continuing.") if self.status_code != 200 and not ignore_status_codes: - screenshot = await capture_full_page_async(self.page) + screenshot = await capture_full_page_async(self.page, screenshot_format=self.screenshot_format) raise Non200ErrorCodeReceived(url=url, status_code=self.status_code, screenshot=screenshot) if not empty_pages_are_a_change and len((await self.page.content()).strip()) == 0: @@ -321,7 +331,7 @@ class fetcher(Fetcher): # acceptable screenshot quality here try: # The actual screenshot - this always base64 and needs decoding! horrible! 
huge CPU usage - self.screenshot = await capture_full_page_async(page=self.page) + self.screenshot = await capture_full_page_async(page=self.page, screenshot_format=self.screenshot_format) except Exception as e: # It's likely the screenshot was too long/big and something crashed diff --git a/changedetectionio/content_fetchers/puppeteer.py b/changedetectionio/content_fetchers/puppeteer.py index a77fd83b..c3548539 100644 --- a/changedetectionio/content_fetchers/puppeteer.py +++ b/changedetectionio/content_fetchers/puppeteer.py @@ -20,7 +20,7 @@ from changedetectionio.content_fetchers.exceptions import PageUnloadable, Non200 # Screenshots also travel via the ws:// (websocket) meaning that the binary data is base64 encoded # which will significantly increase the IO size between the server and client, it's recommended to use the lowest # acceptable screenshot quality here -async def capture_full_page(page): +async def capture_full_page(page, screenshot_format='JPEG'): import os import time from multiprocessing import Process, Pipe @@ -41,6 +41,10 @@ async def capture_full_page(page): # which will significantly increase the IO size between the server and client, it's recommended to use the lowest # acceptable screenshot quality here + # Use PNG for better quality (no compression artifacts), JPEG for smaller size + screenshot_type = screenshot_format.lower() if screenshot_format else 'jpeg' + # PNG should use quality 100, JPEG uses configurable quality + screenshot_quality = 100 if screenshot_type == 'png' else int(os.getenv("SCREENSHOT_QUALITY", 72)) step_size = SCREENSHOT_SIZE_STITCH_THRESHOLD # Something that will not cause the GPU to overflow when taking the screenshot screenshot_chunks = [] @@ -60,9 +64,15 @@ async def capture_full_page(page): y ) - screenshot_chunks.append(await page.screenshot(type_='jpeg', - fullPage=False, - quality=int(os.getenv("SCREENSHOT_QUALITY", 72)))) + screenshot_kwargs = { + 'type_': screenshot_type, + 'fullPage': False + } + # PNG 
doesn't support quality parameter in Puppeteer + if screenshot_type == 'jpeg': + screenshot_kwargs['quality'] = screenshot_quality + + screenshot_chunks.append(await page.screenshot(**screenshot_kwargs)) y += step_size await page.setViewport({'width': original_viewport['width'], 'height': original_viewport['height']}) @@ -112,8 +122,8 @@ class fetcher(Fetcher): 'title': 'Using a Chrome browser' } - def __init__(self, proxy_override=None, custom_browser_connection_url=None): - super().__init__() + def __init__(self, proxy_override=None, custom_browser_connection_url=None, **kwargs): + super().__init__(**kwargs) if custom_browser_connection_url: self.browser_connection_is_custom = True @@ -167,6 +177,7 @@ class fetcher(Fetcher): request_body, request_headers, request_method, + screenshot_format, timeout, url, watch_uuid @@ -316,7 +327,7 @@ class fetcher(Fetcher): logger.error(f"Error fetching FavIcon info {str(e)}, continuing.") if self.status_code != 200 and not ignore_status_codes: - screenshot = await capture_full_page(page=self.page) + screenshot = await capture_full_page(page=self.page, screenshot_format=self.screenshot_format) raise Non200ErrorCodeReceived(url=url, status_code=self.status_code, screenshot=screenshot) @@ -354,7 +365,7 @@ class fetcher(Fetcher): self.content = await self.page.content - self.screenshot = await capture_full_page(page=self.page) + self.screenshot = await capture_full_page(page=self.page, screenshot_format=self.screenshot_format) # It's good to log here in the case that the browser crashes on shutting down but we still get the data we need logger.success(f"Fetching '{url}' complete, closing page") @@ -375,6 +386,7 @@ class fetcher(Fetcher): request_body=None, request_headers=None, request_method=None, + screenshot_format=None, timeout=None, url=None, watch_uuid=None, @@ -394,6 +406,7 @@ class fetcher(Fetcher): request_body=request_body, request_headers=request_headers, request_method=request_method, + screenshot_format=screenshot_format, 
timeout=timeout, url=url, watch_uuid=watch_uuid, diff --git a/changedetectionio/content_fetchers/requests.py b/changedetectionio/content_fetchers/requests.py index 7bfd32c1..23f2adcc 100644 --- a/changedetectionio/content_fetchers/requests.py +++ b/changedetectionio/content_fetchers/requests.py @@ -12,8 +12,8 @@ from changedetectionio.content_fetchers.base import Fetcher class fetcher(Fetcher): fetcher_description = "Basic fast Plaintext/HTTP Client" - def __init__(self, proxy_override=None, custom_browser_connection_url=None): - super().__init__() + def __init__(self, proxy_override=None, custom_browser_connection_url=None, **kwargs): + super().__init__(**kwargs) self.proxy_override = proxy_override # browser_connection_url is none because its always 'launched locally' @@ -135,6 +135,7 @@ class fetcher(Fetcher): request_body=None, request_headers=None, request_method=None, + screenshot_format=None, timeout=None, url=None, watch_uuid=None, diff --git a/changedetectionio/content_fetchers/webdriver_selenium.py b/changedetectionio/content_fetchers/webdriver_selenium.py index 60396829..2399b712 100644 --- a/changedetectionio/content_fetchers/webdriver_selenium.py +++ b/changedetectionio/content_fetchers/webdriver_selenium.py @@ -28,8 +28,8 @@ class fetcher(Fetcher): 'title': 'Using a Chrome browser' } - def __init__(self, proxy_override=None, custom_browser_connection_url=None): - super().__init__() + def __init__(self, proxy_override=None, custom_browser_connection_url=None, **kwargs): + super().__init__(**kwargs) from urllib.parse import urlparse from selenium.webdriver.common.proxy import Proxy @@ -69,6 +69,7 @@ class fetcher(Fetcher): request_body=None, request_headers=None, request_method=None, + screenshot_format=None, timeout=None, url=None, watch_uuid=None, @@ -146,7 +147,21 @@ class fetcher(Fetcher): time.sleep(int(os.getenv("WEBDRIVER_DELAY_BEFORE_CONTENT_READY", 5)) + self.render_extract_delay) self.content = driver.page_source self.headers = {} - 
self.screenshot = driver.get_screenshot_as_png() + + # Selenium always captures as PNG, convert to JPEG if needed + screenshot_png = driver.get_screenshot_as_png() + + # Convert to JPEG if requested (for smaller file size); JPEG has no alpha channel, so flatten to RGB first + if self.screenshot_format and self.screenshot_format.upper() == 'JPEG': + from PIL import Image + import io + img = Image.open(io.BytesIO(screenshot_png)).convert('RGB') + jpeg_buffer = io.BytesIO() + img.save(jpeg_buffer, format='JPEG', quality=int(os.getenv("SCREENSHOT_QUALITY", 72))) + self.screenshot = jpeg_buffer.getvalue() + img.close() + else: + self.screenshot = screenshot_png except Exception as e: driver.quit() raise e diff --git a/changedetectionio/processors/__init__.py b/changedetectionio/processors/__init__.py index d263da69..0225394d 100644 --- a/changedetectionio/processors/__init__.py +++ b/changedetectionio/processors/__init__.py @@ -10,6 +10,9 @@ import os import pkgutil import re +SCREENSHOT_FORMAT_JPEG = 'JPEG' +SCREENSHOT_FORMAT_PNG = 'PNG' + class difference_detection_processor(): browser_steps = None @@ -19,9 +22,9 @@ class difference_detection_processor(): watch = None xpath_data = None preferred_proxy = None + screenshot_format = SCREENSHOT_FORMAT_JPEG - def __init__(self, *args, datastore, watch_uuid, **kwargs): - super().__init__(*args, **kwargs) + def __init__(self, datastore, watch_uuid): self.datastore = datastore self.watch_uuid = watch_uuid self.watch = deepcopy(self.datastore.data['watching'].get(watch_uuid)) @@ -97,7 +100,8 @@ class difference_detection_processor(): # Now call the fetcher (playwright/requests/etc) with arguments that only a fetcher would need. 
# When browser_connection_url is None, it method should default to working out whats the best defaults (os env vars etc) self.fetcher = fetcher_obj(proxy_override=proxy_url, - custom_browser_connection_url=custom_browser_connection_url + custom_browser_connection_url=custom_browser_connection_url, + screenshot_format=self.screenshot_format ) if self.watch.has_browser_steps: @@ -159,6 +163,7 @@ class difference_detection_processor(): request_body=request_body, request_headers=request_headers, request_method=request_method, + screenshot_format=self.screenshot_format, timeout=timeout, url=url, watch_uuid=self.watch_uuid, diff --git a/changedetectionio/processors/image_ssim_diff/processor.py b/changedetectionio/processors/image_ssim_diff/processor.py index c3a7f450..63711786 100644 --- a/changedetectionio/processors/image_ssim_diff/processor.py +++ b/changedetectionio/processors/image_ssim_diff/processor.py @@ -10,7 +10,7 @@ import hashlib import os import time from loguru import logger -from changedetectionio.processors import difference_detection_processor +from changedetectionio.processors import difference_detection_processor, SCREENSHOT_FORMAT_PNG from changedetectionio.processors.exceptions import ProcessorException from . import DEFAULT_COMPARISON_METHOD, DEFAULT_COMPARISON_THRESHOLD_OPENCV, DEFAULT_COMPARISON_THRESHOLD_PIXELMATCH @@ -21,6 +21,9 @@ description = 'Compares screenshots using fast algorithms (OpenCV or pixelmatch) class perform_site_check(difference_detection_processor): """Fast screenshot comparison processor.""" + # Override to use PNG format for better image comparison (JPEG compression creates noise) + screenshot_format = SCREENSHOT_FORMAT_PNG + def run_changedetection(self, watch): """ Perform screenshot comparison using OpenCV or pixelmatch.