Compare commits

...

7 Commits

Author SHA1 Message Date
dgtlmoon
3d2bc5049b Small safety catch 2025-04-12 18:40:15 +02:00
dgtlmoon
186016e605 Playwright + Puppeteer fix for when page is taller than viewport but less than step_size 2025-04-12 17:56:16 +02:00
dgtlmoon
3a583a4e5d Memory management - Run HTML to text in sub process, a few more cleanups to playwright (#3110) 2025-04-11 18:18:29 +02:00
dgtlmoon
cfb4decf67 UI Edit/Stats - Add levenshtein distance info, explains how "different" the last two snapshot are (#3109) 2025-04-11 17:36:29 +02:00
dgtlmoon
8067d5170b 0.49.13 2025-04-11 13:46:58 +02:00
Rob Mulder 
5551acf67d API - Added notifications API endpoints (#3103) 2025-04-11 13:43:59 +02:00
dgtlmoon
45a030bac6 Fetcher - Use bigger screenshot chunks to speed up page screenshot (#3107) 2025-04-11 13:42:50 +02:00
15 changed files with 449 additions and 146 deletions

View File

@@ -2,7 +2,7 @@
# Read more https://github.com/dgtlmoon/changedetection.io/wiki
-__version__ = '0.49.12'
+__version__ = '0.49.13'
from changedetectionio.strtobool import strtobool
from json.decoder import JSONDecodeError

View File

@@ -0,0 +1,145 @@
from flask_expects_json import expects_json
from flask_restful import abort, Resource
from flask import request
from . import auth
from . import schema_create_notification_urls, schema_delete_notification_urls

class Notifications(Resource):
    def __init__(self, **kwargs):
        # datastore is a black box dependency
        self.datastore = kwargs['datastore']

    @auth.check_token
    def get(self):
        """
        @api {get} /api/v1/notifications Return Notification URL List
        @apiDescription Return the Notification URL List from the configuration
        @apiExample {curl} Example usage:
            curl http://localhost:5000/api/v1/notifications -H"x-api-key:813031b16330fe25e3780cf0325daa45"
            HTTP/1.0 200
            {
                'notification_urls': ["notification-urls-list"]
            }
        @apiName Get
        @apiGroup Notifications
        """
        notification_urls = self.datastore.data.get('settings', {}).get('application', {}).get('notification_urls', [])
        return {
            'notification_urls': notification_urls,
        }, 200

    @auth.check_token
    @expects_json(schema_create_notification_urls)
    def post(self):
        """
        @api {post} /api/v1/notifications Create Notification URLs
        @apiDescription Add one or more notification URLs to the configuration
        @apiExample {curl} Example usage:
            curl http://localhost:5000/api/v1/notifications -H"x-api-key:813031b16330fe25e3780cf0325daa45" -H "Content-Type: application/json" -d '{"notification_urls": ["url1", "url2"]}'
        @apiName CreateBatch
        @apiGroup Notifications
        @apiSuccess (201) {Object[]} notification_urls List of added notification URLs
        @apiError (400) {String} Invalid input
        """
        json_data = request.get_json()
        notification_urls = json_data.get("notification_urls", [])

        from wtforms import ValidationError
        try:
            validate_notification_urls(notification_urls)
        except ValidationError as e:
            return str(e), 400

        added_urls = []
        for url in notification_urls:
            clean_url = url.strip()
            added_url = self.datastore.add_notification_url(clean_url)
            if added_url:
                added_urls.append(added_url)

        if not added_urls:
            return "No valid notification URLs were added", 400

        return {'notification_urls': added_urls}, 201

    @auth.check_token
    @expects_json(schema_create_notification_urls)
    def put(self):
        """
        @api {put} /api/v1/notifications Replace Notification URLs
        @apiDescription Replace all notification URLs with the provided list (can be empty)
        @apiExample {curl} Example usage:
            curl -X PUT http://localhost:5000/api/v1/notifications -H"x-api-key:813031b16330fe25e3780cf0325daa45" -H "Content-Type: application/json" -d '{"notification_urls": ["url1", "url2"]}'
        @apiName Replace
        @apiGroup Notifications
        @apiSuccess (200) {Object[]} notification_urls List of current notification URLs
        @apiError (400) {String} Invalid input
        """
        json_data = request.get_json()
        notification_urls = json_data.get("notification_urls", [])

        if not isinstance(notification_urls, list):
            return "Invalid input format", 400

        from wtforms import ValidationError
        try:
            validate_notification_urls(notification_urls)
        except ValidationError as e:
            return str(e), 400

        clean_urls = [url.strip() for url in notification_urls if isinstance(url, str)]
        self.datastore.data['settings']['application']['notification_urls'] = clean_urls
        self.datastore.needs_write = True

        return {'notification_urls': clean_urls}, 200

    @auth.check_token
    @expects_json(schema_delete_notification_urls)
    def delete(self):
        """
        @api {delete} /api/v1/notifications Delete Notification URLs
        @apiDescription Deletes one or more notification URLs from the configuration
        @apiExample {curl} Example usage:
            curl http://localhost:5000/api/v1/notifications -X DELETE -H"x-api-key:813031b16330fe25e3780cf0325daa45" -H "Content-Type: application/json" -d '{"notification_urls": ["url1", "url2"]}'
        @apiParam {String[]} notification_urls The notification URLs to delete.
        @apiName Delete
        @apiGroup Notifications
        @apiSuccess (204) {String} OK Deleted
        @apiError (400) {String} No matching notification URLs found.
        """
        json_data = request.get_json()
        urls_to_delete = json_data.get("notification_urls", [])
        if not isinstance(urls_to_delete, list):
            abort(400, message="Expected a list of notification URLs.")

        notification_urls = self.datastore.data['settings']['application'].get('notification_urls', [])
        deleted = []

        for url in urls_to_delete:
            clean_url = url.strip()
            if clean_url in notification_urls:
                notification_urls.remove(clean_url)
                deleted.append(clean_url)

        if not deleted:
            abort(400, message="No matching notification URLs found.")

        self.datastore.data['settings']['application']['notification_urls'] = notification_urls
        self.datastore.needs_write = True

        return 'OK', 204

def validate_notification_urls(notification_urls):
    from changedetectionio.forms import ValidateAppRiseServers
    validator = ValidateAppRiseServers()

    class DummyForm:
        pass

    dummy_form = DummyForm()
    field = type("Field", (object,), {"data": notification_urls, "gettext": lambda self, x: x})()
    validator(dummy_form, field)

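A quick end-to-end sketch of the new endpoints above, assuming a local instance and a placeholder API key (the path, the x-api-key header, and the payload shape are taken from the apidoc comments in this file):

import requests

BASE = "http://localhost:5000/api/v1/notifications"
HEADERS = {"x-api-key": "YOUR-API-KEY", "Content-Type": "application/json"}

# Add two AppRise-style notification URLs
requests.post(BASE, json={"notification_urls": ["posts://example.com/notify1", "posts://example.com/notify2"]}, headers=HEADERS)

# List what is configured
print(requests.get(BASE, headers=HEADERS).json())  # {'notification_urls': [...]}

# Remove one of them again (a 400 comes back if nothing matched)
requests.delete(BASE, json={"notification_urls": ["posts://example.com/notify1"]}, headers=HEADERS)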
View File

@@ -19,8 +19,15 @@ schema_create_tag['required'] = ['title']
 schema_update_tag = copy.deepcopy(schema_tag)
 schema_update_tag['additionalProperties'] = False

+schema_notification_urls = copy.deepcopy(schema)
+schema_create_notification_urls = copy.deepcopy(schema_notification_urls)
+schema_create_notification_urls['required'] = ['notification_urls']
+schema_delete_notification_urls = copy.deepcopy(schema_notification_urls)
+schema_delete_notification_urls['required'] = ['notification_urls']

 # Import all API resources
 from .Watch import Watch, WatchHistory, WatchSingleHistory, CreateWatch
 from .Tags import Tags, Tag
 from .Import import Import
 from .SystemInfo import SystemInfo
+from .Notifications import Notifications

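The base `schema` object copied here is defined earlier in this file and is not part of the diff. For orientation only, a hypothetical minimal shape that would make the copies above behave as the endpoints expect (the real base schema may differ):

import copy

# Hypothetical stand-in for the module's base schema
schema = {
    "type": "object",
    "properties": {
        "notification_urls": {"type": "array", "items": {"type": "string"}},
    },
}

schema_create_notification_urls = copy.deepcopy(schema)
schema_create_notification_urls['required'] = ['notification_urls']  # POST/PUT bodies must include the key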
View File

@@ -19,6 +19,20 @@ def construct_blueprint(datastore: ChangeDetectionStore, update_q, queuedWatchMe
         if tag_uuid in watch.get('tags', []) and (tag.get('include_filters') or tag.get('subtractive_selectors')):
             return True

+    def levenshtein_ratio_recent_history(watch):
+        try:
+            from Levenshtein import distance
+            k = list(watch.history.keys())
+            if len(k) >= 2:
+                a = watch.get_history_snapshot(timestamp=k[0])
+                b = watch.get_history_snapshot(timestamp=k[1])
+                return distance(a, b)
+        except Exception as e:
+            logger.warning(f"Unable to calc similarity {str(e)}")
+            return "Unable to calc similarity"
+
+        return ''

 @edit_blueprint.route("/edit/<string:uuid>", methods=['GET', 'POST'])
 @login_optionally_required
 # https://stackoverflow.com/questions/42984453/wtforms-populate-form-with-data-if-data-exists
@edit_blueprint.route("/edit/<string:uuid>", methods=['GET', 'POST'])
@login_optionally_required
# https://stackoverflow.com/questions/42984453/wtforms-populate-form-with-data-if-data-exists
@@ -247,14 +261,15 @@ def construct_blueprint(datastore: ChangeDetectionStore, update_q, queuedWatchMe
         'has_default_notification_urls': True if len(datastore.data['settings']['application']['notification_urls']) else False,
         'has_extra_headers_file': len(datastore.get_all_headers_in_textfile_for_watch(uuid=uuid)) > 0,
         'has_special_tag_options': _watch_has_tag_options_set(watch=watch),
-        'watch_uses_webdriver': watch_uses_webdriver,
         'jq_support': jq_support,
+        'lev_info': levenshtein_ratio_recent_history(watch),
         'playwright_enabled': os.getenv('PLAYWRIGHT_DRIVER_URL', False),
         'settings_application': datastore.data['settings']['application'],
         'timezone_default_config': datastore.data['settings']['application'].get('timezone'),
         'using_global_webdriver_wait': not default['webdriver_delay'],
         'uuid': uuid,
-        'watch': watch
+        'watch': watch,
+        'watch_uses_webdriver': watch_uses_webdriver,
     }

     included_content = None

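For intuition about the number this surfaces on the edit page: the distance counts single-character edits between the two snapshot texts. A quick check with the same library the commit adds as a dependency:

from Levenshtein import distance

# 'kitten' -> 'sitting': substitute k->s, substitute e->i, append g
print(distance("kitten", "sitting"))  # 3

# On snapshots the inputs are the full extracted page text, so a small
# content change yields a correspondingly small distance
print(distance("price: 100", "price: 120"))  # 1 (one substitution)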
View File

@@ -96,7 +96,7 @@ def execute_ruleset_against_all_plugins(current_watch_uuid: str, application_dat
     ruleset_settings = application_datastruct['watching'].get(current_watch_uuid)

-    if ruleset_settings.get("conditions"):
+    if ruleset_settings and ruleset_settings.get("conditions"):
         logic_operator = "and" if ruleset_settings.get("conditions_match_logic", "ALL") == "ALL" else "or"
         complete_rules = filter_complete_rules(ruleset_settings['conditions'])
         if complete_rules:

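Why the added guard matters, in miniature (standalone sketch, not repo code): dict.get returns None for an unknown uuid, and the old code would then raise instead of skipping the watch.

watching = {}
ruleset_settings = watching.get("deleted-uuid")  # -> None for an unknown uuid
# Old: ruleset_settings.get("conditions") raises AttributeError on None
# New: the `and` short-circuits, so the block is skipped safely
if ruleset_settings and ruleset_settings.get("conditions"):
    pass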
View File

@@ -7,13 +7,13 @@ import os
 # Visual Selector scraper - 'Button' is there because some sites have <button>OUT OF STOCK</button>.
 visualselector_xpath_selectors = 'div,span,form,table,tbody,tr,td,a,p,ul,li,h1,h2,h3,h4,header,footer,section,article,aside,details,main,nav,section,summary,button'

-SCREENSHOT_MAX_HEIGHT_DEFAULT = 16000
+SCREENSHOT_MAX_HEIGHT_DEFAULT = 20000
 SCREENSHOT_DEFAULT_QUALITY = 40

 # Maximum total height for the final image (When in stitch mode).
 # We limit this to 16000px due to the huge amount of RAM that was being used
 # Example: 16000 × 1400 × 3 = 67,200,000 bytes ≈ 64.1 MB (not including buffers in PIL etc)
-MAX_TOTAL_HEIGHT = int(os.getenv("SCREENSHOT_MAX_HEIGHT", SCREENSHOT_MAX_HEIGHT_DEFAULT))
+SCREENSHOT_MAX_TOTAL_HEIGHT = int(os.getenv("SCREENSHOT_MAX_HEIGHT", SCREENSHOT_MAX_HEIGHT_DEFAULT))

 # The size at which we will switch to stitching method, when below this (and
 # MAX_TOTAL_HEIGHT which can be set by a user) we will use the default

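Updating the comment's arithmetic for the new 20000px default, under the same assumptions (raw RGB, a roughly 1400px-wide viewport, no PIL or JPEG buffers counted):

# 20000 × 1400 × 3 = 84,000,000 bytes ≈ 80.1 MB for the stitched RGB canvas alone
print(20000 * 1400 * 3)  # 84000000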
View File

@@ -5,13 +5,10 @@ from urllib.parse import urlparse
 from loguru import logger

 from changedetectionio.content_fetchers import SCREENSHOT_MAX_HEIGHT_DEFAULT, visualselector_xpath_selectors, \
-    SCREENSHOT_SIZE_STITCH_THRESHOLD, MAX_TOTAL_HEIGHT, SCREENSHOT_DEFAULT_QUALITY, XPATH_ELEMENT_JS, INSTOCK_DATA_JS
-from changedetectionio.content_fetchers.screenshot_handler import stitch_images_worker
+    SCREENSHOT_SIZE_STITCH_THRESHOLD, SCREENSHOT_MAX_TOTAL_HEIGHT, XPATH_ELEMENT_JS, INSTOCK_DATA_JS
 from changedetectionio.content_fetchers.base import Fetcher, manage_user_agent
 from changedetectionio.content_fetchers.exceptions import PageUnloadable, Non200ErrorCodeReceived, EmptyReply, ScreenshotUnavailable

 def capture_full_page(page):
     import os
     import time
@@ -20,84 +17,61 @@ def capture_full_page(page):
     start = time.time()

     page_height = page.evaluate("document.documentElement.scrollHeight")
     page_width = page.evaluate("document.documentElement.scrollWidth")
+    original_viewport = page.viewport_size

-    logger.debug(f"Playwright viewport size {page.viewport_size}")
+    logger.debug(f"Playwright viewport size {page.viewport_size} page height {page_height} page width {page_width}")

-    ############################################################
-    #### SCREENSHOT FITS INTO ONE SNAPSHOT (SMALLER PAGES) #####
-    ############################################################
-    # Optimization to avoid unnecessary stitching if we can avoid it
-    # Use the default screenshot method for smaller pages to take advantage
-    # of GPU and native playwright screenshot optimizations
-    # - No PIL needed here, no danger of memory leaks, no sub process required
-    if (page_height < SCREENSHOT_SIZE_STITCH_THRESHOLD and page_height < MAX_TOTAL_HEIGHT ):
-        logger.debug("Using default screenshot method")
-        page.request_gc()
-        screenshot = page.screenshot(
-            type="jpeg",
-            quality=int(os.getenv("SCREENSHOT_QUALITY", SCREENSHOT_DEFAULT_QUALITY)),
-            full_page=True,
-        )
-        page.request_gc()
-        logger.debug(f"Screenshot captured in {time.time() - start:.2f}s")
-        return screenshot

+    # Use an approach similar to puppeteer: set a larger viewport and take screenshots in chunks
+    step_size = SCREENSHOT_SIZE_STITCH_THRESHOLD  # Size that won't cause GPU to overflow
+    screenshot_chunks = []
+    y = 0
+    if page_height > page.viewport_size['height']:
+        if page_height < step_size:
+            step_size = page_height  # Incase page is bigger than default viewport but smaller than proposed step size
+        logger.debug(f"Setting bigger viewport to step through large page width W{page.viewport_size['width']}xH{step_size} because page_height > viewport_size")
+        # Set viewport to a larger size to capture more content at once
+        page.set_viewport_size({'width': page.viewport_size['width'], 'height': step_size})

+    # Capture screenshots in chunks up to the max total height
+    while y < min(page_height, SCREENSHOT_MAX_TOTAL_HEIGHT):
+        page.request_gc()
+        page.evaluate(f"window.scrollTo(0, {y})")
+        page.request_gc()
+        screenshot_chunks.append(page.screenshot(
+            type="jpeg",
+            full_page=False,
+            quality=int(os.getenv("SCREENSHOT_QUALITY", 72))
+        ))
+        y += step_size
+        page.request_gc()

+    # Restore original viewport size
+    page.set_viewport_size({'width': original_viewport['width'], 'height': original_viewport['height']})

+    # If we have multiple chunks, stitch them together
+    if len(screenshot_chunks) > 1:
+        from changedetectionio.content_fetchers.screenshot_handler import stitch_images_worker
+        logger.debug(f"Screenshot stitching {len(screenshot_chunks)} chunks together")
+        parent_conn, child_conn = Pipe()
+        p = Process(target=stitch_images_worker, args=(child_conn, screenshot_chunks, page_height, SCREENSHOT_MAX_TOTAL_HEIGHT))
+        p.start()
+        screenshot = parent_conn.recv_bytes()
+        p.join()
+        logger.debug(
+            f"Screenshot (chunked/stitched) - Page height: {page_height} Capture height: {SCREENSHOT_MAX_TOTAL_HEIGHT} - Stitched together in {time.time() - start:.2f}s")
+        # Explicit cleanup
+        del screenshot_chunks
+        del p
+        del parent_conn, child_conn
+        screenshot_chunks = None
+        return screenshot

-    ###################################################################################
-    #### CASE FOR LARGE SCREENSHOTS THAT NEED TO BE TRIMMED DUE TO MEMORY ISSUES #####
-    ###################################################################################
-    # - PIL can easily allocate memory and not release it cleanly
-    # - Fetching screenshot from playwright seems OK
-    # Image.new is leaky even with .close()
-    # So lets prepare all the data chunks and farm it out to a subprocess for clean memory handling
-    logger.debug(
-        "Using stitching method for large screenshot because page height exceeds threshold"
-    )
-    # Limit the total capture height
-    capture_height = min(page_height, MAX_TOTAL_HEIGHT)
-    # Calculate number of chunks needed using ORIGINAL viewport height
-    num_chunks = (capture_height + page.viewport_size['height'] - 1) // page.viewport_size['height']
-    screenshot_chunks = []
-    # Track cumulative paste position
-    y_offset = 0
-    for _ in range(num_chunks):
-        page.request_gc()
-        page.evaluate(f"window.scrollTo(0, {y_offset})")
-        page.request_gc()
-        h = min(page.viewport_size['height'], capture_height - y_offset)
-        screenshot_chunks.append(page.screenshot(
-            type="jpeg",
-            clip={
-                "x": 0,
-                "y": 0,
-                "width": page.viewport_size['width'],
-                "height": h,
-            },
-            quality=int(os.getenv("SCREENSHOT_QUALITY", SCREENSHOT_DEFAULT_QUALITY)),
-        ))
-        y_offset += h  # maybe better to inspect the image here?
-        page.request_gc()
-    # PIL can leak memory in various situations, assign the work to a subprocess for totally clean handling
-    parent_conn, child_conn = Pipe()
-    p = Process(target=stitch_images_worker, args=(child_conn, screenshot_chunks, page_height, capture_height))
-    p.start()
-    result = parent_conn.recv_bytes()
-    p.join()
-    screenshot_chunks = None
-    logger.debug(f"Screenshot - Page height: {page_height} Capture height: {capture_height} - Stitched together in {time.time() - start:.2f}s")
-    return result

+    logger.debug(
+        f"Screenshot Page height: {page_height} Capture height: {SCREENSHOT_MAX_TOTAL_HEIGHT} - Stitched together in {time.time() - start:.2f}s")
+    return screenshot_chunks[0]
class fetcher(Fetcher):
@@ -292,6 +266,7 @@ class fetcher(Fetcher):
             self.page.request_gc()

             self.content = self.page.content()
             self.page.request_gc()
+            logger.debug(f"Scrape xPath element data in browser done in {time.time() - now:.2f}s")

             # Bug 3 in Playwright screenshot handling
@@ -316,5 +291,28 @@ class fetcher(Fetcher):
                     pass

         # Clean up resources properly
-        context.close()
-        browser.close()
+        try:
+            self.page.request_gc()
+        except:
+            pass
+
+        try:
+            self.page.close()
+        except:
+            pass
+        self.page = None
+
+        try:
+            context.close()
+        except:
+            pass
+        context = None
+
+        try:
+            browser.close()
+        except:
+            pass
+        browser = None

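The chunked path above hands the paste-and-encode work to a short-lived child process. A minimal sketch of what a worker like stitch_images_worker could look like; the real implementation lives in screenshot_handler.py and is not shown in this compare, so the PIL usage, output quality, and clipping behaviour here are assumptions:

from io import BytesIO
from multiprocessing import Pipe, Process
from PIL import Image

def stitch_worker(conn, jpeg_chunks, page_height, max_height):
    # Decode the JPEG chunks and paste them top-to-bottom onto one canvas,
    # clamped to max_height, then send the re-encoded JPEG back over the pipe
    images = [Image.open(BytesIO(c)) for c in jpeg_chunks]
    canvas = Image.new("RGB", (images[0].width, min(page_height, max_height)))
    y = 0
    for im in images:
        canvas.paste(im, (0, y))  # pastes past the bottom edge are clipped by PIL
        y += im.height
    out = BytesIO()
    canvas.save(out, format="JPEG", quality=40)
    conn.send_bytes(out.getvalue())
    conn.close()

Because the child exits after one job, whatever memory PIL held is returned to the OS when it dies, which is the point of the subprocess hop.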
View File

@@ -7,10 +7,11 @@ from urllib.parse import urlparse
 from loguru import logger

 from changedetectionio.content_fetchers import SCREENSHOT_MAX_HEIGHT_DEFAULT, visualselector_xpath_selectors, \
-    SCREENSHOT_SIZE_STITCH_THRESHOLD, MAX_TOTAL_HEIGHT, SCREENSHOT_DEFAULT_QUALITY, XPATH_ELEMENT_JS, INSTOCK_DATA_JS
+    SCREENSHOT_SIZE_STITCH_THRESHOLD, SCREENSHOT_DEFAULT_QUALITY, XPATH_ELEMENT_JS, INSTOCK_DATA_JS, \
+    SCREENSHOT_MAX_TOTAL_HEIGHT
 from changedetectionio.content_fetchers.base import Fetcher, manage_user_agent
-from changedetectionio.content_fetchers.exceptions import PageUnloadable, Non200ErrorCodeReceived, EmptyReply, BrowserFetchTimedOut, BrowserConnectError
-from changedetectionio.content_fetchers.screenshot_handler import stitch_images_worker
+from changedetectionio.content_fetchers.exceptions import PageUnloadable, Non200ErrorCodeReceived, EmptyReply, BrowserFetchTimedOut, \
+    BrowserConnectError

 # Bug 3 in Playwright screenshot handling
@@ -27,71 +28,54 @@ async def capture_full_page(page):
     start = time.time()

     page_height = await page.evaluate("document.documentElement.scrollHeight")
     page_width = await page.evaluate("document.documentElement.scrollWidth")
+    original_viewport = page.viewport

-    logger.debug(f"Puppeteer viewport size {page.viewport}")
+    logger.debug(f"Puppeteer viewport size {page.viewport} page height {page_height} page width {page_width}")

-    ############################################################
-    #### SCREENSHOT FITS INTO ONE SNAPSHOT (SMALLER PAGES) #####
-    ############################################################
-    # Bug 3 in Playwright screenshot handling
-    # Some bug where it gives the wrong screenshot size, but making a request with the clip set first seems to solve it
-    # JPEG is better here because the screenshots can be very very large
-    # Optimization to avoid unnecessary stitching if we can avoid it
-    # Use the default screenshot method for smaller pages to take advantage
-    # of GPU and native playwright screenshot optimizations
-    # - No PIL needed here, no danger of memory leaks, no sub process required
-    if (page_height < SCREENSHOT_SIZE_STITCH_THRESHOLD and page_height < MAX_TOTAL_HEIGHT ):
-        logger.debug("Using default screenshot method")
-        await page.evaluate(f"window.scrollTo(0, 0)")
-        screenshot = await page.screenshot(
-            type_="jpeg",
-            quality=int(os.getenv("SCREENSHOT_QUALITY", SCREENSHOT_DEFAULT_QUALITY)),
-            fullPage=True,
-        )
-        logger.debug(f"Screenshot captured in {time.time() - start:.2f}s")
-        return screenshot

+    # Screenshots also travel via the ws:// (websocket) meaning that the binary data is base64 encoded
+    # which will significantly increase the IO size between the server and client, it's recommended to use the lowest
+    # acceptable screenshot quality here
+    step_size = SCREENSHOT_SIZE_STITCH_THRESHOLD  # Something that will not cause the GPU to overflow when taking the screenshot
+    screenshot_chunks = []
+    y = 0
+    if page_height > page.viewport['height']:
+        if page_height < step_size:
+            step_size = page_height  # Incase page is bigger than default viewport but smaller than proposed step size
+        await page.setViewport({'width': page.viewport['width'], 'height': step_size})

+    while y < min(page_height, SCREENSHOT_MAX_TOTAL_HEIGHT):
+        await page.evaluate(f"window.scrollTo(0, {y})")
+        screenshot_chunks.append(await page.screenshot(type_='jpeg',
+                                                       fullPage=False,
+                                                       quality=int(os.getenv("SCREENSHOT_QUALITY", 72))))
+        y += step_size

+    await page.setViewport({'width': original_viewport['width'], 'height': original_viewport['height']})

+    if len(screenshot_chunks) > 1:
+        from changedetectionio.content_fetchers.screenshot_handler import stitch_images_worker
+        logger.debug(f"Screenshot stitching {len(screenshot_chunks)} chunks together")
+        parent_conn, child_conn = Pipe()
+        p = Process(target=stitch_images_worker, args=(child_conn, screenshot_chunks, page_height, SCREENSHOT_MAX_TOTAL_HEIGHT))
+        p.start()
+        screenshot = parent_conn.recv_bytes()
+        p.join()
+        logger.debug(
+            f"Screenshot (chunked/stitched) - Page height: {page_height} Capture height: {SCREENSHOT_MAX_TOTAL_HEIGHT} - Stitched together in {time.time() - start:.2f}s")
+        screenshot_chunks = None
+        return screenshot

-    ###################################################################################
-    #### CASE FOR LARGE SCREENSHOTS THAT NEED TO BE TRIMMED DUE TO MEMORY ISSUES #####
-    ###################################################################################
-    # - PIL can easily allocate memory and not release it cleanly
-    # - Fetching screenshot from playwright seems OK
-    # Image.new is leaky even with .close()
-    # So lets prepare all the data chunks and farm it out to a subprocess for clean memory handling
-    logger.debug(
-        "Using stitching method for large screenshot because page height exceeds threshold"
-    )
-    # Limit the total capture height
-    capture_height = min(page_height, MAX_TOTAL_HEIGHT)
-    # Calculate number of chunks needed using ORIGINAL viewport height
-    num_chunks = (capture_height + page.viewport['height'] - 1) // page.viewport['height']
-    screenshot_chunks = []
-    # Track cumulative paste position
-    y_offset = 0
-    for _ in range(num_chunks):
-        await page.evaluate(f"window.scrollTo(0, {y_offset})")
-        h = min(page.viewport['height'], capture_height - y_offset)
-        screenshot_chunks.append(await page.screenshot(
-            type_="jpeg",
-            quality=int(os.getenv("SCREENSHOT_QUALITY", SCREENSHOT_DEFAULT_QUALITY)),
-        ))
-        y_offset += h  # maybe better to inspect the image here?
-    # PIL can leak memory in various situations, assign the work to a subprocess for totally clean handling
-    parent_conn, child_conn = Pipe()
-    p = Process(target=stitch_images_worker, args=(child_conn, screenshot_chunks, page_height, capture_height))
-    p.start()
-    result = parent_conn.recv_bytes()
-    p.join()
-    screenshot_chunks = None
-    logger.debug(f"Screenshot - Page height: {page_height} Capture height: {capture_height} - Stitched together in {time.time() - start:.2f}s")
-    return result

+    logger.debug(
+        f"Screenshot Page height: {page_height} Capture height: {SCREENSHOT_MAX_TOTAL_HEIGHT} - Stitched together in {time.time() - start:.2f}s")
+    return screenshot_chunks[0]
class fetcher(Fetcher):

View File

@@ -33,7 +33,7 @@ from loguru import logger
from changedetectionio import __version__
from changedetectionio import queuedWatchMetaData
-from changedetectionio.api import Watch, WatchHistory, WatchSingleHistory, CreateWatch, Import, SystemInfo, Tag, Tags
+from changedetectionio.api import Watch, WatchHistory, WatchSingleHistory, CreateWatch, Import, SystemInfo, Tag, Tags, Notifications
from changedetectionio.api.Search import Search
from .time_handler import is_within_schedule
@@ -285,7 +285,8 @@ def changedetection_app(config=None, datastore_o=None):
     watch_api.add_resource(Search, '/api/v1/search',
                            resource_class_kwargs={'datastore': datastore})

+    watch_api.add_resource(Notifications, '/api/v1/notifications',
+                           resource_class_kwargs={'datastore': datastore})
@login_manager.user_loader
def user_loader(email):

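For context, resource_class_kwargs is stock Flask-RESTful: the mapping is passed into the resource's __init__, which is how the Notifications resource above receives its datastore. A standalone sketch of the same wiring (names here are illustrative, not from the repo):

from flask import Flask
from flask_restful import Api, Resource

class Hello(Resource):
    def __init__(self, **kwargs):
        self.datastore = kwargs['datastore']  # injected via resource_class_kwargs

    def get(self):
        return {'keys': list(self.datastore)}

app = Flask(__name__)
api = Api(app)
api.add_resource(Hello, '/hello', resource_class_kwargs={'datastore': {'a': 1}})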
View File

@@ -435,7 +435,9 @@ def cdata_in_document_to_text(html_content: str, render_anchor_tag_content=False
     return re.sub(pattern, repl, html_content)

-def html_to_text(html_content: str, render_anchor_tag_content=False, is_rss=False) -> str:
+def html_to_text_sub_worker(conn, html_content: str, render_anchor_tag_content=False, is_rss=False):
     from inscriptis import get_text
     from inscriptis.model.config import ParserConfig

@@ -470,9 +472,19 @@ def html_to_text(html_content: str, render_anchor_tag_content=False, is_rss=Fals
         html_content = re.sub(r'</title>', r'</h1>', html_content)

     text_content = get_text(html_content, config=parser_config)
+    conn.send(text_content)
+    conn.close()

-    return text_content
+# NOTE!! ANYTHING LIBXML, HTML5LIB ETC WILL CAUSE SOME SMALL MEMORY LEAK IN THE LOCAL "LIB" IMPLEMENTATION OUTSIDE PYTHON
+def html_to_text(html_content: str, render_anchor_tag_content=False, is_rss=False):
+    from multiprocessing import Process, Pipe
+    parent_conn, child_conn = Pipe()
+    p = Process(target=html_to_text_sub_worker, args=(child_conn, html_content, render_anchor_tag_content, is_rss))
+    p.start()
+    text = parent_conn.recv()
+    p.join()
+    return text
# Does LD+JSON exist with a @type=='product' and a .price set anywhere?
def has_ldjson_product_info(content):

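The pattern above generalizes: run the leak-prone call in a throwaway child and collect the result over a Pipe. A hedged, generic sketch (run_isolated is illustrative, not a repo helper; f must be a top-level, picklable function under the spawn start method):

from multiprocessing import Pipe, Process

def _isolated_worker(conn, f, args):
    conn.send(f(*args))  # the result travels back over the pipe
    conn.close()

def run_isolated(f, *args):
    # Run f(*args) in a child process; any native-library leak dies with it
    parent_conn, child_conn = Pipe()
    p = Process(target=_isolated_worker, args=(child_conn, f, args))
    p.start()
    result = parent_conn.recv()
    p.join()
    return result

print(run_isolated(len, "hello"))  # 5

The cost is a process start plus pickling the payload on every call, which the commit accepts in exchange for a flat memory profile in the long-lived parent.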
View File

@@ -964,3 +964,25 @@ class ChangeDetectionStore:
                    f_d.write(zlib.compress(f_j.read()))
                os.unlink(json_path)

    def add_notification_url(self, notification_url):
        logger.debug(f">>> Adding new notification_url - '{notification_url}'")

        notification_urls = self.data['settings']['application'].get('notification_urls', [])
        if notification_url in notification_urls:
            return notification_url

        with self.lock:
            notification_urls = self.__data['settings']['application'].get('notification_urls', [])
            if notification_url in notification_urls:
                return notification_url

            # Append and update the datastore
            notification_urls.append(notification_url)
            self.__data['settings']['application']['notification_urls'] = notification_urls
            self.needs_write = True

        return notification_url

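add_notification_url uses a check, lock, re-check sequence; the second lookup inside the lock is what keeps two racing API requests from appending the same URL twice. The same shape in a minimal, self-contained form (illustrative class, not repo code):

import threading

class UrlStore:
    def __init__(self):
        self.lock = threading.Lock()
        self.urls = []

    def add(self, url):
        if url in self.urls:          # cheap unlocked fast path
            return url
        with self.lock:
            if url in self.urls:      # re-check: another thread may have won the race
                return url
            self.urls.append(url)     # mutate only while holding the lock
        return url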
View File

@@ -443,6 +443,10 @@ Math: {{ 1 + 1 }}") }}
         </tr>
     </tbody>
 </table>
+<h4>Text similarity</h4>
+<p><strong>Levenshtein Distance</strong> - Last 2 snapshots: {{ lev_info }}</p>
+<p style="max-width: 80%; font-size: 80%"><strong>Levenshtein Distance</strong>: calculates the minimum number of insertions, deletions, and substitutions required to change one text into the other.</p>
 {% if watch.history_n %}
 <p>
     <a href="{{url_for('ui.ui_edit.watch_get_latest_html', uuid=uuid)}}" class="pure-button button-small">Download latest HTML snapshot</a>

View File

@@ -0,0 +1,108 @@
#!/usr/bin/env python3

import json
from flask import url_for
from .util import live_server_setup


def test_api_notifications_crud(client, live_server):
    live_server_setup(live_server)
    api_key = live_server.app.config['DATASTORE'].data['settings']['application'].get('api_access_token')

    # Confirm notifications are initially empty
    res = client.get(
        url_for("notifications"),
        headers={'x-api-key': api_key}
    )
    assert res.status_code == 200
    assert res.json == {"notification_urls": []}

    # Add notification URLs
    test_urls = ["posts://example.com/notify1", "posts://example.com/notify2"]
    res = client.post(
        url_for("notifications"),
        data=json.dumps({"notification_urls": test_urls}),
        headers={'content-type': 'application/json', 'x-api-key': api_key}
    )
    assert res.status_code == 201
    for url in test_urls:
        assert url in res.json["notification_urls"]

    # Confirm the notification URLs were added
    res = client.get(
        url_for("notifications"),
        headers={'x-api-key': api_key}
    )
    assert res.status_code == 200
    for url in test_urls:
        assert url in res.json["notification_urls"]

    # Delete one notification URL
    res = client.delete(
        url_for("notifications"),
        data=json.dumps({"notification_urls": [test_urls[0]]}),
        headers={'content-type': 'application/json', 'x-api-key': api_key}
    )
    assert res.status_code == 204

    # Confirm it was removed and the other remains
    res = client.get(
        url_for("notifications"),
        headers={'x-api-key': api_key}
    )
    assert res.status_code == 200
    assert test_urls[0] not in res.json["notification_urls"]
    assert test_urls[1] in res.json["notification_urls"]

    # Try deleting a non-existent URL
    res = client.delete(
        url_for("notifications"),
        data=json.dumps({"notification_urls": ["posts://nonexistent.com"]}),
        headers={'content-type': 'application/json', 'x-api-key': api_key}
    )
    assert res.status_code == 400

    res = client.post(
        url_for("notifications"),
        data=json.dumps({"notification_urls": test_urls}),
        headers={'content-type': 'application/json', 'x-api-key': api_key}
    )
    assert res.status_code == 201

    # Replace with a new list
    replacement_urls = ["posts://new.example.com"]
    res = client.put(
        url_for("notifications"),
        data=json.dumps({"notification_urls": replacement_urls}),
        headers={'content-type': 'application/json', 'x-api-key': api_key}
    )
    assert res.status_code == 200
    assert res.json["notification_urls"] == replacement_urls

    # Replace with an empty list
    res = client.put(
        url_for("notifications"),
        data=json.dumps({"notification_urls": []}),
        headers={'content-type': 'application/json', 'x-api-key': api_key}
    )
    assert res.status_code == 200
    assert res.json["notification_urls"] == []

    # Provide an invalid AppRise URL to trigger validation error
    invalid_urls = ["ftp://not-app-rise"]
    res = client.post(
        url_for("notifications"),
        data=json.dumps({"notification_urls": invalid_urls}),
        headers={'content-type': 'application/json', 'x-api-key': api_key}
    )
    assert res.status_code == 400
    assert "is not a valid AppRise URL." in res.data.decode()

    res = client.put(
        url_for("notifications"),
        data=json.dumps({"notification_urls": invalid_urls}),
        headers={'content-type': 'application/json', 'x-api-key': api_key}
    )
    assert res.status_code == 400
    assert "is not a valid AppRise URL." in res.data.decode()

View File

@@ -74,6 +74,11 @@ def test_check_basic_change_detection_functionality(client, live_server, measure
     res = client.get(url_for("ui.ui_edit.watch_get_latest_html", uuid=uuid))
     assert b'which has this one new line' in res.data

+    # Check the 'levenshtein' distance calc showed something useful
+    res = client.get(url_for("ui.ui_edit.edit_page", uuid=uuid))
+    assert b'Last 2 snapshots: 17' in res.data

     # Now something should be ready, indicated by having a 'unviewed' class
     res = client.get(url_for("watchlist.index"))
     assert b'unviewed' in res.data

View File

@@ -68,6 +68,8 @@ openpyxl
 jq~=1.3; python_version >= "3.8" and sys_platform == "darwin"
 jq~=1.3; python_version >= "3.8" and sys_platform == "linux"

+levenshtein

 # playwright is installed at Dockerfile build time because it's not available on all platforms
 pyppeteer-ng==2.0.0rc9