Small safety catch

Playwright + Puppeteer fix for when page is taller than viewport but less than step_size
Memory management - Run HTML to text in sub process, a few more cleanups to playwright (#3110 )
2025-11-13 04:56:39 +00:00 · 2025-04-12 18:40:15 +02:00 · 2025-04-12 17:56:16 +02:00 · 2025-04-11 18:18:29 +02:00 · 2025-04-11 17:36:29 +02:00
8 changed files with 74 additions and 14 deletions
--- a/changedetectionio/blueprint/ui/edit.py
+++ b/changedetectionio/blueprint/ui/edit.py
@@ -19,6 +19,20 @@ def construct_blueprint(datastore: ChangeDetectionStore, update_q, queuedWatchMe
            if tag_uuid in watch.get('tags', []) and (tag.get('include_filters') or tag.get('subtractive_selectors')):
                return True

+    def levenshtein_ratio_recent_history(watch):
+        """Return the Levenshtein distance between the two most recent snapshots of `watch`.
+
+        Returns '' when there are fewer than two snapshots, or the string
+        "Unable to calc similarity" when the calculation fails for any reason
+        (the value is rendered as-is on the edit page).
+        """
+        try:
+            from Levenshtein import distance
+            # NOTE(review): assumes watch.history keys are ordered oldest -> newest - confirm.
+            k = list(watch.history.keys())
+            if len(k) >= 2:
+                # The page labels this "Last 2 snapshots", so use the final two keys
+                # rather than the first two.
+                previous = watch.get_history_snapshot(timestamp=k[-2])
+                latest = watch.get_history_snapshot(timestamp=k[-1])
+                return distance(previous, latest)
+        except Exception as e:
+            # Best-effort: this is informational only and must never break the edit page.
+            logger.warning(f"Unable to calc similarity: {e}")
+            return "Unable to calc similarity"
+        return ''
+
    @edit_blueprint.route("/edit/<string:uuid>", methods=['GET', 'POST'])
    @login_optionally_required
    # https://stackoverflow.com/questions/42984453/wtforms-populate-form-with-data-if-data-exists
@@ -247,14 +261,15 @@ def construct_blueprint(datastore: ChangeDetectionStore, update_q, queuedWatchMe
                'has_default_notification_urls': True if len(datastore.data['settings']['application']['notification_urls']) else False,
                'has_extra_headers_file': len(datastore.get_all_headers_in_textfile_for_watch(uuid=uuid)) > 0,
                'has_special_tag_options': _watch_has_tag_options_set(watch=watch),
-                'watch_uses_webdriver': watch_uses_webdriver,
                'jq_support': jq_support,
+                'lev_info': levenshtein_ratio_recent_history(watch),
                'playwright_enabled': os.getenv('PLAYWRIGHT_DRIVER_URL', False),
                'settings_application': datastore.data['settings']['application'],
                'timezone_default_config': datastore.data['settings']['application'].get('timezone'),
                'using_global_webdriver_wait': not default['webdriver_delay'],
                'uuid': uuid,
-                'watch': watch
+                'watch': watch,
+                'watch_uses_webdriver': watch_uses_webdriver,
            }

            included_content = None
--- a/changedetectionio/conditions/__init__.py
+++ b/changedetectionio/conditions/__init__.py
@@ -96,7 +96,7 @@ def execute_ruleset_against_all_plugins(current_watch_uuid: str, application_dat
    
    ruleset_settings = application_datastruct['watching'].get(current_watch_uuid)

-    if ruleset_settings.get("conditions"):
+    if ruleset_settings and ruleset_settings.get("conditions"):
        logic_operator = "and" if ruleset_settings.get("conditions_match_logic", "ALL") == "ALL" else "or"
        complete_rules = filter_complete_rules(ruleset_settings['conditions'])
        if complete_rules:
--- a/changedetectionio/content_fetchers/playwright.py
+++ b/changedetectionio/content_fetchers/playwright.py
@@ -26,9 +26,11 @@ def capture_full_page(page):
    step_size = SCREENSHOT_SIZE_STITCH_THRESHOLD # Size that won't cause GPU to overflow
    screenshot_chunks = []
    y = 0
-    
-    # If page height is larger than current viewport, use a larger viewport for better capturing
+
    if page_height > page.viewport_size['height']:
+        if page_height < step_size:
+            step_size = page_height # Incase page is bigger than default viewport but smaller than proposed step size
+        logger.debug(f"Setting bigger viewport to step through large page width W{page.viewport_size['width']}xH{step_size} because page_height > viewport_size")
        # Set viewport to a larger size to capture more content at once
        page.set_viewport_size({'width': page.viewport_size['width'], 'height': step_size})

@@ -59,7 +61,10 @@ def capture_full_page(page):
        p.join()
        logger.debug(
            f"Screenshot (chunked/stitched) - Page height: {page_height} Capture height: {SCREENSHOT_MAX_TOTAL_HEIGHT} - Stitched together in {time.time() - start:.2f}s")
-
+        # Explicit cleanup
+        del screenshot_chunks
+        del p
+        del parent_conn, child_conn
        screenshot_chunks = None
        return screenshot

@@ -286,12 +291,28 @@ class fetcher(Fetcher):
                    pass
                
                # Clean up resources properly
-                context.close()
-                context = None
+                try:
+                    self.page.request_gc()
+                except:
+                    pass

-                self.page.close()
+                try:
+                    self.page.close()
+                except:
+                    pass
                self.page = None

-                browser.close()
-                borwser = None
+                try:
+                    context.close()
+                except:
+                    pass
+                context = None
+
+                try:
+                    browser.close()
+                except:
+                    pass
+                browser = None
+
+

--- a/changedetectionio/content_fetchers/puppeteer.py
+++ b/changedetectionio/content_fetchers/puppeteer.py
@@ -46,9 +46,10 @@ async def capture_full_page(page):
    screenshot_chunks = []
    y = 0
    if page_height > page.viewport['height']:
+        if page_height < step_size:
+            step_size = page_height # Incase page is bigger than default viewport but smaller than proposed step size
        await page.setViewport({'width': page.viewport['width'], 'height': step_size})

-
    while y < min(page_height, SCREENSHOT_MAX_TOTAL_HEIGHT):
        await page.evaluate(f"window.scrollTo(0, {y})")
        screenshot_chunks.append(await page.screenshot(type_='jpeg',
--- a/changedetectionio/html_tools.py
+++ b/changedetectionio/html_tools.py
@@ -435,7 +435,9 @@ def cdata_in_document_to_text(html_content: str, render_anchor_tag_content=False

    return re.sub(pattern, repl, html_content)

-def html_to_text(html_content: str, render_anchor_tag_content=False, is_rss=False) -> str:
+
+def html_to_text_sub_worker(conn, html_content: str, render_anchor_tag_content=False, is_rss=False):
+
    from inscriptis import get_text
    from inscriptis.model.config import ParserConfig

@@ -470,9 +472,19 @@ def html_to_text(html_content: str, render_anchor_tag_content=False, is_rss=Fals
        html_content = re.sub(r'</title>', r'</h1>', html_content)

    text_content = get_text(html_content, config=parser_config)
+    conn.send(text_content)
+    conn.close()

-    return text_content
+# NOTE: anything built on libxml, html5lib, etc. will cause a small memory leak in the native library implementation, outside of Python's control.
+def html_to_text(html_content: str, render_anchor_tag_content=False, is_rss=False) -> str:
+    """Convert HTML to plain text by running html_to_text_sub_worker in a child process.
+
+    The arguments are forwarded unchanged to the worker, which sends the
+    resulting text back over a pipe.
+
+    Raises:
+        EOFError: if the worker exits without sending a result.
+    """
+    from multiprocessing import Process, Pipe

+    parent_conn, child_conn = Pipe()
+    p = Process(target=html_to_text_sub_worker, args=(child_conn, html_content, render_anchor_tag_content, is_rss))
+    p.start()
+    # Drop the parent's copy of the worker's end of the pipe; while it stays open
+    # recv() can never see EOF and would block forever if the worker died early.
+    child_conn.close()
+    try:
+        text = parent_conn.recv()
+    finally:
+        # Always release the pipe and reap the worker, even when recv() raised.
+        parent_conn.close()
+        p.join()
+    return text

 # Does LD+JSON exist with a @type=='product' and a .price set anywhere?
 def has_ldjson_product_info(content):
--- a/changedetectionio/templates/edit.html
+++ b/changedetectionio/templates/edit.html
@@ -443,6 +443,10 @@ Math: {{ 1 + 1 }}") }}
                        </tr>
                        </tbody>
                    </table>
+
+                    <h4>Text similarity</h4>
+                    <p><strong>Levenshtein Distance</strong> - Last 2 snapshots: {{ lev_info }}</p>
+                    <p style="max-width: 80%; font-size: 80%"><strong>Levenshtein Distance</strong> Calculates the minimum number of insertions, deletions, and substitutions required to change one text into the other.</p>
                    {% if watch.history_n %}
                        <p>
                             <a href="{{url_for('ui.ui_edit.watch_get_latest_html', uuid=uuid)}}" class="pure-button button-small">Download latest HTML snapshot</a>
--- a/changedetectionio/tests/test_backend.py
+++ b/changedetectionio/tests/test_backend.py
@@ -74,6 +74,11 @@ def test_check_basic_change_detection_functionality(client, live_server, measure
    res = client.get(url_for("ui.ui_edit.watch_get_latest_html", uuid=uuid))
    assert b'which has this one new line' in res.data

+    # Check the 'levenshtein' distance calc showed something useful
+    res = client.get(url_for("ui.ui_edit.edit_page", uuid=uuid))
+    assert b'Last 2 snapshots: 17' in res.data
+
+
    # Now something should be ready, indicated by having a 'unviewed' class
    res = client.get(url_for("watchlist.index"))
    assert b'unviewed' in res.data
--- a/requirements.txt
+++ b/requirements.txt
@@ -68,6 +68,8 @@ openpyxl
 jq~=1.3; python_version >= "3.8" and sys_platform == "darwin"
 jq~=1.3; python_version >= "3.8" and sys_platform == "linux"

+levenshtein
+
 # playwright is installed at Dockerfile build time because it's not available on all platforms

 pyppeteer-ng==2.0.0rc9
Author	SHA1	Message	Date
dgtlmoon	3d2bc5049b	Small safety catch	2025-04-12 18:40:15 +02:00
dgtlmoon	186016e605	Playwright + Puppeteer fix for when page is taller than viewport but less than step_size	2025-04-12 17:56:16 +02:00
dgtlmoon	3a583a4e5d	Memory management - Run HTML to text in sub process, a few more cleanups to playwright (#3110 ) Some checks failed Build and push containers / metadata (push) Has been cancelled Details Build and push containers / build-push-containers (push) Has been cancelled Details Publish Python 🐍distribution 📦 to PyPI and TestPyPI / Build distribution 📦 (push) Has been cancelled Details Publish Python 🐍distribution 📦 to PyPI and TestPyPI / Test the built 📦 package works basically. (push) Has been cancelled Details Publish Python 🐍distribution 📦 to PyPI and TestPyPI / Publish Python 🐍 distribution 📦 to PyPI (push) Has been cancelled Details ChangeDetection.io Container Build Test / test-container-build (push) Has been cancelled Details ChangeDetection.io App Test / lint-code (push) Has been cancelled Details ChangeDetection.io App Test / test-application-3-10 (push) Has been cancelled Details ChangeDetection.io App Test / test-application-3-11 (push) Has been cancelled Details ChangeDetection.io App Test / test-application-3-12 (push) Has been cancelled Details ChangeDetection.io App Test / test-application-3-13 (push) Has been cancelled Details	2025-04-11 18:18:29 +02:00
dgtlmoon	cfb4decf67	UI Edit/Stats - Add levenshtein distance info, explains how "different" the last two snapshot are (#3109 )	2025-04-11 17:36:29 +02:00