Memory management - Run HTML to text in sub process, a few more cleanups to playwright (#3110)

2025-10-30 14:17:40 +00:00 · 2025-04-11 18:18:29 +02:00
parent cfb4decf67
commit 3a583a4e5d
2 changed files with 39 additions and 8 deletions
--- a/changedetectionio/content_fetchers/playwright.py
+++ b/changedetectionio/content_fetchers/playwright.py
@@ -59,7 +59,10 @@ def capture_full_page(page):
        p.join()
        logger.debug(
            f"Screenshot (chunked/stitched) - Page height: {page_height} Capture height: {SCREENSHOT_MAX_TOTAL_HEIGHT} - Stitched together in {time.time() - start:.2f}s")
-
+        # Explicit cleanup
+        del screenshot_chunks
+        del p
+        del parent_conn, child_conn
        screenshot_chunks = None
        return screenshot

@@ -286,12 +289,28 @@ class fetcher(Fetcher):
                    pass
                
                # Clean up resources properly
-                context.close()
-                context = None
+                try:
+                    self.page.request_gc()
+                except:
+                    pass

-                self.page.close()
+                try:
+                    self.page.close()
+                except:
+                    pass
                self.page = None

-                browser.close()
-                borwser = None
+                try:
+                    context.close()
+                except:
+                    pass
+                context = None
+
+                try:
+                    browser.close()
+                except:
+                    pass
+                browser = None
+
+

--- a/changedetectionio/html_tools.py
+++ b/changedetectionio/html_tools.py
@@ -435,7 +435,9 @@ def cdata_in_document_to_text(html_content: str, render_anchor_tag_content=False

    return re.sub(pattern, repl, html_content)

-def html_to_text(html_content: str, render_anchor_tag_content=False, is_rss=False) -> str:
+
+def html_to_text_sub_worker(conn, html_content: str, render_anchor_tag_content=False, is_rss=False):
+
    from inscriptis import get_text
    from inscriptis.model.config import ParserConfig

@@ -470,9 +472,19 @@ def html_to_text(html_content: str, render_anchor_tag_content=False, is_rss=Fals
        html_content = re.sub(r'</title>', r'</h1>', html_content)

    text_content = get_text(html_content, config=parser_config)
+    conn.send(text_content)
+    conn.close()

-    return text_content
+# NOTE: anything backed by libxml, html5lib, etc. leaks a small amount of memory in the native library, outside Python's control, so the conversion runs in a short-lived subprocess.
+def html_to_text(html_content: str, render_anchor_tag_content=False, is_rss=False):
+    from multiprocessing import Process, Pipe

+    parent_conn, child_conn = Pipe()
+    p = Process(target=html_to_text_sub_worker, args=(child_conn, html_content, render_anchor_tag_content, is_rss))
+    p.start()
+    text = parent_conn.recv()
+    p.join()
+    return text

 # Does LD+JSON exist with a @type=='product' and a .price set anywhere?
 def has_ldjson_product_info(content):