WIP

2025-12-19 14:35:35 +00:00 · 2025-12-17 11:43:48 +01:00
parent fbd22bfafe
commit 6986c6687c
8 changed files with 82 additions and 30 deletions
--- a/changedetectionio/async_update_worker.py
+++ b/changedetectionio/async_update_worker.py
@@ -89,9 +89,8 @@ async def async_update_worker(worker_id, q, notification_q, app, datastore):
                    processor = watch.get('processor', 'text_json_diff')

                    # Init a new 'difference_detection_processor'
-                    processor_module_name = f"changedetectionio.processors.{processor}.processor"
                    try:
-                        processor_module = importlib.import_module(processor_module_name)
+                        processor_module = importlib.import_module(f"changedetectionio.processors.{processor}.processor")
                    except ModuleNotFoundError as e:
                        print(f"Processor module '{processor}' not found.")
                        raise e
--- a/changedetectionio/content_fetchers/base.py
+++ b/changedetectionio/content_fetchers/base.py
@@ -51,6 +51,7 @@ class Fetcher():
    favicon_blob = None
    instock_data = None
    instock_data_js = ""
+    screenshot_format = None
    status_code = None
    webdriver_js_execute_code = None
    xpath_data = None
@@ -70,6 +71,11 @@ class Fetcher():
    supports_screenshots = False        # Can capture page screenshots
    supports_xpath_element_data = False # Can extract xpath element positions/data for visual selector

+    def __init__(self, **kwargs):
+        # Override the class-level screenshot_format only when the caller supplies one
+        if kwargs and 'screenshot_format' in kwargs:
+            self.screenshot_format = kwargs.get('screenshot_format')
+
    @classmethod
    def get_status_icon_data(cls):
        """Return data for status icon to display in the watch overview.
--- a/changedetectionio/content_fetchers/playwright.py
+++ b/changedetectionio/content_fetchers/playwright.py
@@ -9,7 +9,7 @@ from changedetectionio.content_fetchers import SCREENSHOT_MAX_HEIGHT_DEFAULT, vi
 from changedetectionio.content_fetchers.base import Fetcher, manage_user_agent
 from changedetectionio.content_fetchers.exceptions import PageUnloadable, Non200ErrorCodeReceived, EmptyReply, ScreenshotUnavailable

-async def capture_full_page_async(page):
+async def capture_full_page_async(page, screenshot_format='JPEG'):
    import os
    import time
    from multiprocessing import Process, Pipe
@@ -35,6 +35,11 @@ async def capture_full_page_async(page):
        await page.set_viewport_size({'width': page.viewport_size['width'], 'height': step_size})

    # Capture screenshots in chunks up to the max total height
+    # Use PNG for better quality (no compression artifacts), JPEG for smaller size
+    screenshot_type = screenshot_format.lower() if screenshot_format else 'jpeg'
+    # Quality applies to JPEG only (SCREENSHOT_QUALITY, default 72); the 100 for PNG is never sent to Playwright
+    screenshot_quality = 100 if screenshot_type == 'png' else int(os.getenv("SCREENSHOT_QUALITY", 72))
+
    while y < min(page_height, SCREENSHOT_MAX_TOTAL_HEIGHT):
        # Only scroll if not at the top (y > 0)
        if y > 0:
@@ -43,11 +48,15 @@ async def capture_full_page_async(page):
        # Request GC only before screenshot (not 3x per chunk)
        await page.request_gc()

-        screenshot_chunks.append(await page.screenshot(
-            type="jpeg",
-            full_page=False,
-            quality=int(os.getenv("SCREENSHOT_QUALITY", 72))
-        ))
+        screenshot_kwargs = {
+            'type': screenshot_type,
+            'full_page': False
+        }
+        # Only pass quality parameter for jpeg (PNG doesn't support it in Playwright)
+        if screenshot_type == 'jpeg':
+            screenshot_kwargs['quality'] = screenshot_quality
+
+        screenshot_chunks.append(await page.screenshot(**screenshot_kwargs))
        y += step_size

    # Restore original viewport size
@@ -116,8 +125,8 @@ class fetcher(Fetcher):
            'title': 'Using a Chrome browser'
        }

-    def __init__(self, proxy_override=None, custom_browser_connection_url=None):
-        super().__init__()
+    def __init__(self, proxy_override=None, custom_browser_connection_url=None, **kwargs):
+        super().__init__(**kwargs)

        self.browser_type = os.getenv("PLAYWRIGHT_BROWSER_TYPE", 'chromium').strip('"')

@@ -152,7 +161,7 @@ class fetcher(Fetcher):

    async def screenshot_step(self, step_n=''):
        super().screenshot_step(step_n=step_n)
-        screenshot = await capture_full_page_async(page=self.page)
+        screenshot = await capture_full_page_async(page=self.page, screenshot_format=self.screenshot_format)


        if self.browser_steps_screenshot_path is not None:
@@ -178,6 +187,7 @@ class fetcher(Fetcher):
                  request_body=None,
                  request_headers=None,
                  request_method=None,
+                  screenshot_format=None,
                  timeout=None,
                  url=None,
                  watch_uuid=None,
@@ -272,7 +282,7 @@ class fetcher(Fetcher):
                    logger.error(f"Error fetching FavIcon info {str(e)}, continuing.")

            if self.status_code != 200 and not ignore_status_codes:
-                screenshot = await capture_full_page_async(self.page)
+                screenshot = await capture_full_page_async(self.page, screenshot_format=self.screenshot_format)
                raise Non200ErrorCodeReceived(url=url, status_code=self.status_code, screenshot=screenshot)

            if not empty_pages_are_a_change and len((await self.page.content()).strip()) == 0:
@@ -321,7 +331,7 @@ class fetcher(Fetcher):
            # acceptable screenshot quality here
            try:
                # The actual screenshot - this always base64 and needs decoding! horrible! huge CPU usage
-                self.screenshot = await capture_full_page_async(page=self.page)
+                self.screenshot = await capture_full_page_async(page=self.page, screenshot_format=self.screenshot_format)

            except Exception as e:
                # It's likely the screenshot was too long/big and something crashed
--- a/changedetectionio/content_fetchers/puppeteer.py
+++ b/changedetectionio/content_fetchers/puppeteer.py
@@ -20,7 +20,7 @@ from changedetectionio.content_fetchers.exceptions import PageUnloadable, Non200
 # Screenshots also travel via the ws:// (websocket) meaning that the binary data is base64 encoded
 # which will significantly increase the IO size between the server and client, it's recommended to use the lowest
 # acceptable screenshot quality here
-async def capture_full_page(page):
+async def capture_full_page(page, screenshot_format='JPEG'):
    import os
    import time
    from multiprocessing import Process, Pipe
@@ -41,6 +41,10 @@ async def capture_full_page(page):
    # which will significantly increase the IO size between the server and client, it's recommended to use the lowest
    # acceptable screenshot quality here

+    # Use PNG for better quality (no compression artifacts), JPEG for smaller size
+    screenshot_type = screenshot_format.lower() if screenshot_format else 'jpeg'
+    # Quality applies to JPEG only (SCREENSHOT_QUALITY, default 72); the 100 for PNG is never sent to Puppeteer
+    screenshot_quality = 100 if screenshot_type == 'png' else int(os.getenv("SCREENSHOT_QUALITY", 72))

    step_size = SCREENSHOT_SIZE_STITCH_THRESHOLD # Something that will not cause the GPU to overflow when taking the screenshot
    screenshot_chunks = []
@@ -60,9 +64,15 @@ async def capture_full_page(page):
            y
        )

-        screenshot_chunks.append(await page.screenshot(type_='jpeg',
-                                                       fullPage=False,
-                                                       quality=int(os.getenv("SCREENSHOT_QUALITY", 72))))
+        screenshot_kwargs = {
+            'type_': screenshot_type,
+            'fullPage': False
+        }
+        # PNG doesn't support quality parameter in Puppeteer
+        if screenshot_type == 'jpeg':
+            screenshot_kwargs['quality'] = screenshot_quality
+
+        screenshot_chunks.append(await page.screenshot(**screenshot_kwargs))
        y += step_size

    await page.setViewport({'width': original_viewport['width'], 'height': original_viewport['height']})
@@ -112,8 +122,8 @@ class fetcher(Fetcher):
            'title': 'Using a Chrome browser'
        }

-    def __init__(self, proxy_override=None, custom_browser_connection_url=None):
-        super().__init__()
+    def __init__(self, proxy_override=None, custom_browser_connection_url=None, **kwargs):
+        super().__init__(**kwargs)

        if custom_browser_connection_url:
            self.browser_connection_is_custom = True
@@ -167,6 +177,7 @@ class fetcher(Fetcher):
                         request_body,
                         request_headers,
                         request_method,
+                         screenshot_format,
                         timeout,
                         url,
                         watch_uuid
@@ -316,7 +327,7 @@ class fetcher(Fetcher):
                logger.error(f"Error fetching FavIcon info {str(e)}, continuing.")

        if self.status_code != 200 and not ignore_status_codes:
-            screenshot = await capture_full_page(page=self.page)
+            screenshot = await capture_full_page(page=self.page, screenshot_format=self.screenshot_format)

            raise Non200ErrorCodeReceived(url=url, status_code=self.status_code, screenshot=screenshot)

@@ -354,7 +365,7 @@ class fetcher(Fetcher):

        self.content = await self.page.content

-        self.screenshot = await capture_full_page(page=self.page)
+        self.screenshot = await capture_full_page(page=self.page, screenshot_format=self.screenshot_format)

        # It's good to log here in the case that the browser crashes on shutting down but we still get the data we need
        logger.success(f"Fetching '{url}' complete, closing page")
@@ -375,6 +386,7 @@ class fetcher(Fetcher):
                  request_body=None,
                  request_headers=None,
                  request_method=None,
+                  screenshot_format=None,
                  timeout=None,
                  url=None,
                  watch_uuid=None,
@@ -394,6 +406,7 @@ class fetcher(Fetcher):
                request_body=request_body,
                request_headers=request_headers,
                request_method=request_method,
+                screenshot_format=screenshot_format,  # forward the caller's value instead of discarding it
                timeout=timeout,
                url=url,
                watch_uuid=watch_uuid,
--- a/changedetectionio/content_fetchers/requests.py
+++ b/changedetectionio/content_fetchers/requests.py
@@ -12,8 +12,8 @@ from changedetectionio.content_fetchers.base import Fetcher
 class fetcher(Fetcher):
    fetcher_description = "Basic fast Plaintext/HTTP Client"

-    def __init__(self, proxy_override=None, custom_browser_connection_url=None):
-        super().__init__()
+    def __init__(self, proxy_override=None, custom_browser_connection_url=None, **kwargs):
+        super().__init__(**kwargs)
        self.proxy_override = proxy_override
        # browser_connection_url is none because its always 'launched locally'

@@ -135,6 +135,7 @@ class fetcher(Fetcher):
                  request_body=None,
                  request_headers=None,
                  request_method=None,
+                  screenshot_format=None,
                  timeout=None,
                  url=None,
                  watch_uuid=None,
--- a/changedetectionio/content_fetchers/webdriver_selenium.py
+++ b/changedetectionio/content_fetchers/webdriver_selenium.py
@@ -28,8 +28,8 @@ class fetcher(Fetcher):
            'title': 'Using a Chrome browser'
        }

-    def __init__(self, proxy_override=None, custom_browser_connection_url=None):
-        super().__init__()
+    def __init__(self, proxy_override=None, custom_browser_connection_url=None, **kwargs):
+        super().__init__(**kwargs)
        from urllib.parse import urlparse
        from selenium.webdriver.common.proxy import Proxy

@@ -69,6 +69,7 @@ class fetcher(Fetcher):
                  request_body=None,
                  request_headers=None,
                  request_method=None,
+                  screenshot_format=None,
                  timeout=None,
                  url=None,
                  watch_uuid=None,
@@ -146,7 +147,21 @@ class fetcher(Fetcher):
                time.sleep(int(os.getenv("WEBDRIVER_DELAY_BEFORE_CONTENT_READY", 5)) + self.render_extract_delay)
                self.content = driver.page_source
                self.headers = {}
-                self.screenshot = driver.get_screenshot_as_png()
+
+                # Selenium always captures as PNG; re-encode as JPEG only when requested
+                screenshot_png = driver.get_screenshot_as_png()
+
+                if self.screenshot_format and self.screenshot_format.upper() == 'JPEG':
+                    from PIL import Image
+                    import io
+                    # JPEG has no alpha channel: Pillow raises OSError when saving RGBA/P images
+                    # as JPEG, so normalise to RGB first. `with` closes the image even on failure.
+                    with Image.open(io.BytesIO(screenshot_png)) as img:
+                        jpeg_buffer = io.BytesIO()
+                        img.convert('RGB').save(jpeg_buffer, format='JPEG', quality=int(os.getenv("SCREENSHOT_QUALITY", 72)))
+                    self.screenshot = jpeg_buffer.getvalue()
+                else:
+                    self.screenshot = screenshot_png
            except Exception as e:
                driver.quit()
                raise e
--- a/changedetectionio/processors/init.py
+++ b/changedetectionio/processors/init.py
@@ -10,6 +10,9 @@ import os
 import pkgutil
 import re

+SCREENSHOT_FORMAT_JPEG = 'JPEG'
+SCREENSHOT_FORMAT_PNG = 'PNG'
+
 class difference_detection_processor():

    browser_steps = None
@@ -19,9 +22,9 @@ class difference_detection_processor():
    watch = None
    xpath_data = None
    preferred_proxy = None
+    screenshot_format = SCREENSHOT_FORMAT_JPEG

-    def __init__(self, *args, datastore, watch_uuid, **kwargs):
-        super().__init__(*args, **kwargs)
+    def __init__(self, datastore, watch_uuid):
        self.datastore = datastore
        self.watch_uuid = watch_uuid
        self.watch = deepcopy(self.datastore.data['watching'].get(watch_uuid))
@@ -97,7 +100,8 @@ class difference_detection_processor():
        # Now call the fetcher (playwright/requests/etc) with arguments that only a fetcher would need.
        # When browser_connection_url is None, it method should default to working out whats the best defaults (os env vars etc)
        self.fetcher = fetcher_obj(proxy_override=proxy_url,
-                                   custom_browser_connection_url=custom_browser_connection_url
+                                   custom_browser_connection_url=custom_browser_connection_url,
+                                   screenshot_format=self.screenshot_format
                                   )

        if self.watch.has_browser_steps:
@@ -159,6 +163,7 @@ class difference_detection_processor():
            request_body=request_body,
            request_headers=request_headers,
            request_method=request_method,
+            screenshot_format = self.screenshot_format,
            timeout=timeout,
            url=url,
            watch_uuid=self.watch_uuid,
--- a/changedetectionio/processors/image_ssim_diff/processor.py
+++ b/changedetectionio/processors/image_ssim_diff/processor.py
@@ -10,7 +10,7 @@ import hashlib
 import os
 import time
 from loguru import logger
-from changedetectionio.processors import difference_detection_processor
+from changedetectionio.processors import difference_detection_processor, SCREENSHOT_FORMAT_PNG
 from changedetectionio.processors.exceptions import ProcessorException
 from . import DEFAULT_COMPARISON_METHOD, DEFAULT_COMPARISON_THRESHOLD_OPENCV, DEFAULT_COMPARISON_THRESHOLD_PIXELMATCH

@@ -21,6 +21,9 @@ description = 'Compares screenshots using fast algorithms (OpenCV or pixelmatch)
 class perform_site_check(difference_detection_processor):
    """Fast screenshot comparison processor."""

+    # Override to use PNG format for better image comparison (JPEG compression creates noise)
+    screenshot_format = SCREENSHOT_FORMAT_PNG
+
    def run_changedetection(self, watch):
        """
        Perform screenshot comparison using OpenCV or pixelmatch.