Mirror of https://github.com/dgtlmoon/changedetection.io.git (synced 2026-02-04 21:36:00 +00:00)

Compare commits: master...processor- (21 commits)
| SHA1 |
|---|
| 50fe822d8e |
| a7ea79e1fc |
| 9921d303de |
| 7832787b84 |
| bd5f7187f7 |
| e2f9fdb384 |
| 261f88b272 |
| 8d2e668b42 |
| c7050077be |
| 5f25e3825c |
| 041c1ad531 |
| e958acebed |
| b826d9b236 |
| bc3efbff27 |
| 063ee38099 |
| 5007b8201e |
| 7e853a4b46 |
| 4dc5301de4 |
| edf0989cd4 |
| 423b546948 |
| c1c810a79a |
@@ -26,7 +26,7 @@ def construct_blueprint(datastore: ChangeDetectionStore, update_q, queuedWatchMe
     add_paused = request.form.get('edit_and_watch_submit_button') != None
     from changedetectionio import processors
     processor = request.form.get('processor', processors.get_default_processor())
-    new_uuid = datastore.add_watch(url=url, tag=request.form.get('tags','').strip(), extras={'paused': add_paused, 'processor': processor})
+    new_uuid = datastore.add_watch(url=url, tag=request.form.get('tags').strip(), extras={'paused': add_paused, 'processor': processor})

     if new_uuid:
         if add_paused:
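Context on the one-line change above: on the `-` side, `request.form.get('tags','')` supplies a default so the chained `.strip()` is always safe, while the `+` side's `request.form.get('tags')` returns `None` whenever the form field is absent, and `None.strip()` raises `AttributeError`. A minimal sketch of the difference (the simulated form data is illustrative, not from the codebase):

```python
from werkzeug.datastructures import MultiDict

# Simulated POST body where the user never submitted a 'tags' field
form = MultiDict({'url': 'https://example.com'})

# Without a default: .get() returns None, and None has no .strip()
try:
    tag = form.get('tags').strip()
except AttributeError:
    tag = ''

# With the '' default the chain is always safe
tag = form.get('tags', '').strip()  # -> ''
```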
@@ -1,4 +1,3 @@
-import asyncio
 import gc
 import json
 import os
@@ -350,7 +349,12 @@ class fetcher(Fetcher):

             if self.status_code != 200 and not ignore_status_codes:
                 screenshot = await capture_full_page_async(self.page, screenshot_format=self.screenshot_format, watch_uuid=watch_uuid, lock_viewport_elements=self.lock_viewport_elements)
-                # Finally block will handle cleanup
+                # Cleanup before raising to prevent memory leak
+                await self.page.close()
+                await context.close()
+                await browser.close()
+                # Force garbage collection to release Playwright resources immediately
+                gc.collect()
                 raise Non200ErrorCodeReceived(url=url, status_code=self.status_code, screenshot=screenshot)

             if not empty_pages_are_a_change and len((await self.page.content()).strip()) == 0:
@@ -366,7 +370,12 @@ class fetcher(Fetcher):
                 try:
                     await self.iterate_browser_steps(start_url=url)
                 except BrowserStepsStepException:
-                    # Finally block will handle cleanup
+                    try:
+                        await context.close()
+                        await browser.close()
+                    except Exception as e:
+                        # Fine, could be messy situation
+                        pass
                     raise

             await self.page.wait_for_timeout(extra_wait * 1000)
@@ -415,40 +424,35 @@ class fetcher(Fetcher):
                 raise ScreenshotUnavailable(url=url, status_code=self.status_code)

         finally:
-            # Clean up resources properly with timeouts to prevent hanging
-            try:
-                if hasattr(self, 'page') and self.page:
-                    await self.page.request_gc()
-                    await asyncio.wait_for(self.page.close(), timeout=5.0)
-                    logger.debug(f"Successfully closed page for {url}")
-            except asyncio.TimeoutError:
-                logger.warning(f"Timed out closing page for {url} (5s)")
-            except Exception as e:
-                logger.warning(f"Error closing page for {url}: {e}")
-            finally:
-                self.page = None
+            # Request garbage collection one more time before closing
+            try:
+                await self.page.request_gc()
+            except:
+                pass
+
+            # Clean up resources properly
+            try:
+                await self.page.request_gc()
+            except:
+                pass

             try:
-                if context:
-                    await asyncio.wait_for(context.close(), timeout=5.0)
-                    logger.debug(f"Successfully closed context for {url}")
-            except asyncio.TimeoutError:
-                logger.warning(f"Timed out closing context for {url} (5s)")
-            except Exception as e:
-                logger.warning(f"Error closing context for {url}: {e}")
-            finally:
-                context = None
+                await self.page.close()
+            except:
+                pass
+            self.page = None

             try:
-                if browser:
-                    await asyncio.wait_for(browser.close(), timeout=5.0)
-                    logger.debug(f"Successfully closed browser connection for {url}")
-            except asyncio.TimeoutError:
-                logger.warning(f"Timed out closing browser connection for {url} (5s)")
-            except Exception as e:
-                logger.warning(f"Error closing browser for {url}: {e}")
-            finally:
-                browser = None
+                await context.close()
+            except:
+                pass
+            context = None
+
+            try:
+                await browser.close()
+            except:
+                pass
+            browser = None

             # Force Python GC to release Playwright resources immediately
             # Playwright objects can have circular references that delay cleanup
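The `-` side of the `finally:` hunk above wraps every `close()` in `asyncio.wait_for(..., timeout=5.0)` so a wedged browser connection cannot hang the worker, while the `+` side swallows failures with bare `except:` clauses. A minimal sketch of the timeout-guarded pattern, factored into a helper (the helper name is illustrative, not from the codebase; `loguru` is assumed for logging, matching the `logger` calls in the surrounding code):

```python
import asyncio
from loguru import logger

async def close_with_timeout(resource, label: str, timeout: float = 5.0):
    """Close a Playwright page/context/browser without risking a hang."""
    if not resource:
        return
    try:
        # wait_for cancels the pending close() if it exceeds the timeout
        await asyncio.wait_for(resource.close(), timeout=timeout)
        logger.debug(f"Successfully closed {label}")
    except asyncio.TimeoutError:
        logger.warning(f"Timed out closing {label} ({timeout}s)")
    except Exception as e:
        logger.warning(f"Error closing {label}: {e}")
```

In a `finally:` block this would be awaited once each for the page, context, and browser, with each slot set to `None` afterwards so any repeated cleanup is a no-op.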
@@ -1,5 +1,4 @@
 import asyncio
-import gc
 import json
 import os
 import websockets.exceptions
@@ -222,36 +221,19 @@ class fetcher(Fetcher):
             self.browser_connection_url += f"{r}--proxy-server={proxy_url}"

     async def quit(self, watch=None):
-        watch_uuid = watch.get('uuid') if watch else 'unknown'
-
-        # Close page
         try:
-            if hasattr(self, 'page') and self.page:
-                await asyncio.wait_for(self.page.close(), timeout=5.0)
-                logger.debug(f"[{watch_uuid}] Page closed successfully")
-        except asyncio.TimeoutError:
-            logger.warning(f"[{watch_uuid}] Timed out closing page (5s)")
+            await self.page.close()
+            del self.page
         except Exception as e:
-            logger.warning(f"[{watch_uuid}] Error closing page: {e}")
-        finally:
-            self.page = None
+            pass

-        # Close browser connection
         try:
-            if hasattr(self, 'browser') and self.browser:
-                await asyncio.wait_for(self.browser.close(), timeout=5.0)
-                logger.debug(f"[{watch_uuid}] Browser closed successfully")
-        except asyncio.TimeoutError:
-            logger.warning(f"[{watch_uuid}] Timed out closing browser (5s)")
+            await self.browser.close()
+            del self.browser
         except Exception as e:
-            logger.warning(f"[{watch_uuid}] Error closing browser: {e}")
-        finally:
-            self.browser = None
+            pass

-        logger.info(f"[{watch_uuid}] Cleanup puppeteer complete")
+        logger.info("Cleanup puppeteer complete.")

-        # Force garbage collection to release resources
-        gc.collect()
-
     async def fetch_page(self,
                          current_include_filters,
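One subtlety in the rewritten `quit()` above: the `-` side nulls the attributes (`self.page = None`) inside `finally:` blocks and guards with `hasattr(self, 'page') and self.page`, while the `+` side deletes them (`del self.page`). After `del`, a later `hasattr` check reports the attribute as missing entirely; after assigning `None`, the attribute still exists and a truthiness check is also needed. A small illustration with a stand-in class (not from the codebase):

```python
class Holder:
    def __init__(self):
        self.page = object()

a = Holder()
a.page = None               # attribute still exists, value is None
print(hasattr(a, 'page'))   # True  -> guards must also test truthiness

b = Holder()
del b.page                  # attribute removed entirely
print(hasattr(b, 'page'))   # False -> hasattr alone is a sufficient guard
```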
@@ -281,11 +263,9 @@ class fetcher(Fetcher):
         # Connect directly using the specified browser_ws_endpoint
         # @todo timeout
         try:
-            logger.debug(f"[{watch_uuid}] Connecting to browser at {self.browser_connection_url}")
             self.browser = await pyppeteer_instance.connect(browserWSEndpoint=self.browser_connection_url,
                                                             ignoreHTTPSErrors=True
                                                             )
-            logger.debug(f"[{watch_uuid}] Browser connected successfully")
         except websockets.exceptions.InvalidStatusCode as e:
             raise BrowserConnectError(msg=f"Error while trying to connect the browser, Code {e.status_code} (check your access, whitelist IP, password etc)")
         except websockets.exceptions.InvalidURI:
@@ -294,18 +274,7 @@ class fetcher(Fetcher):
             raise BrowserConnectError(msg=f"Error connecting to the browser - Exception '{str(e)}'")

         # more reliable is to just request a new page
-        try:
-            logger.debug(f"[{watch_uuid}] Creating new page")
-            self.page = await self.browser.newPage()
-            logger.debug(f"[{watch_uuid}] Page created successfully")
-        except Exception as e:
-            logger.error(f"[{watch_uuid}] Failed to create new page: {e}")
-            # Browser is connected but page creation failed - must cleanup browser
-            try:
-                await asyncio.wait_for(self.browser.close(), timeout=3.0)
-            except Exception as cleanup_error:
-                logger.error(f"[{watch_uuid}] Failed to cleanup browser after page creation failure: {cleanup_error}")
-            raise
+        self.page = await self.browser.newPage()

         # Add console handler to capture console.log from favicon fetcher
         #self.page.on('console', lambda msg: logger.debug(f"Browser console [{msg.type}]: {msg.text}"))
@@ -374,12 +343,6 @@ class fetcher(Fetcher):
                 w = extra_wait - 2 if extra_wait > 4 else 2
                 logger.debug(f"Waiting {w} seconds before calling Page.stopLoading...")
                 await asyncio.sleep(w)
-
-                # Check if page still exists (might have been closed due to error during sleep)
-                if not self.page or not hasattr(self.page, '_client'):
-                    logger.debug("Page already closed, skipping stopLoading")
-                    return
-
                 logger.debug("Issuing stopLoading command...")
                 await self.page._client.send('Page.stopLoading')
                 logger.debug("stopLoading command sent!")
@@ -405,9 +368,7 @@ class fetcher(Fetcher):
             asyncio.create_task(handle_frame_navigation())
             response = await self.page.goto(url, timeout=0)
             await asyncio.sleep(1 + extra_wait)
-            # Check if page still exists before sending command
-            if self.page and hasattr(self.page, '_client'):
-                await self.page._client.send('Page.stopLoading')
+            await self.page._client.send('Page.stopLoading')

             if response:
                 break
@@ -476,9 +437,15 @@ class fetcher(Fetcher):
         logger.debug(f"Screenshot format {self.screenshot_format}")
         self.screenshot = await capture_full_page(page=self.page, screenshot_format=self.screenshot_format, watch_uuid=watch_uuid, lock_viewport_elements=self.lock_viewport_elements)

-        # Force garbage collection - pyppeteer base64 decode creates temporary buffers
+        # Force aggressive memory cleanup - pyppeteer base64 decode creates temporary buffers
         import gc
         gc.collect()
+        # Release C-level memory from base64 decode back to OS
+        try:
+            import ctypes
+            ctypes.CDLL('libc.so.6').malloc_trim(0)
+        except Exception:
+            pass
         self.xpath_data = await self.page.evaluate(XPATH_ELEMENT_JS, {
             "visualselector_xpath_selectors": visualselector_xpath_selectors,
             "max_height": MAX_TOTAL_HEIGHT
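The `malloc_trim` addition above targets memory that `gc.collect()` alone cannot return: glibc's allocator often keeps freed heap pages in its arenas rather than handing them back to the OS, and `malloc_trim(0)` asks it to release whatever it can. The `try/except` wrapper exists because the call is glibc-specific; loading `libc.so.6` fails on musl-based images (e.g. Alpine) or non-Linux hosts. A standalone sketch of the same pattern (the function name is illustrative):

```python
import ctypes
import gc

def release_freed_memory():
    """Best-effort release of freed C-heap pages back to the OS (glibc only)."""
    gc.collect()  # drop Python-level garbage first so there is more to trim
    try:
        libc = ctypes.CDLL('libc.so.6')  # raises OSError where glibc is absent
        libc.malloc_trim(0)              # 0 = keep no extra pad at the top of the heap
    except (OSError, AttributeError):
        pass  # musl, macOS, Windows: nothing to trim this way
```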
@@ -480,9 +480,14 @@ async def async_update_worker(worker_id, q, notification_q, app, datastore, exec
                 del update_handler
                 update_handler = None

-                # Force garbage collection
+                # Force aggressive memory cleanup after clearing
                 import gc
                 gc.collect()
+                try:
+                    import ctypes
+                    ctypes.CDLL('libc.so.6').malloc_trim(0)
+                except Exception:
+                    pass

             except Exception as e:
                 import traceback
@@ -498,7 +503,6 @@ async def async_update_worker(worker_id, q, notification_q, app, datastore, exec
         finally:
             # Always cleanup - this runs whether there was an exception or not
             if uuid:
-                # Call quit() as backup (Puppeteer/Playwright have internal cleanup, but this acts as safety net)
                 try:
                     if update_handler and hasattr(update_handler, 'fetcher') and update_handler.fetcher:
                         await update_handler.fetcher.quit(watch=watch)
@@ -510,22 +514,32 @@ async def async_update_worker(worker_id, q, notification_q, app, datastore, exec

             # Send completion signal
             if watch:
+                #logger.info(f"Worker {worker_id} sending completion signal for UUID {watch['uuid']}")
                 watch_check_update.send(watch_uuid=watch['uuid'])

-            # Clean up all memory references BEFORE garbage collection
+            # Explicitly clean up update_handler and all its references
             if update_handler:
+                # Clear fetcher content using the proper method
                 if hasattr(update_handler, 'fetcher') and update_handler.fetcher:
                     update_handler.fetcher.clear_content()

+                # Clear processor references
                 if hasattr(update_handler, 'content_processor'):
                     update_handler.content_processor = None
-                del update_handler
                 update_handler = None

-            # Clear large content variables
+            # Clear local contents variable if it still exists
             if 'contents' in locals():
                 del contents

-            # Force garbage collection after all references are cleared
+            # Note: We don't set watch = None here because:
+            # 1. watch is just a local reference to datastore.data['watching'][uuid]
+            # 2. Setting it to None doesn't affect the datastore
+            # 3. GC can't collect the object anyway (still referenced by datastore)
+            # 4. It would just cause confusion
+
+            # Force garbage collection after cleanup
             import gc
             gc.collect()
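The new comment block above is making a point about reference semantics: rebinding the local name `watch` cannot free the underlying dict, because the datastore still holds a reference to it. A quick demonstration of why `watch = None` would be a no-op for memory purposes (stand-in data, not the real datastore):

```python
import sys

datastore = {'watching': {'uuid-1': {'url': 'https://example.com'}}}

watch = datastore['watching']['uuid-1']
print(sys.getrefcount(watch))           # >= 3: datastore, local name, getrefcount arg

watch = None                            # drops only the local reference
print(datastore['watching']['uuid-1'])  # still alive, still owned by the datastore
```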
@@ -70,7 +70,7 @@ lxml >=4.8.0,!=5.2.0,!=5.2.1,<7

 # XPath 2.0-3.1 support - 4.2.0 had issues, 4.1.5 stable
 # Consider updating to latest stable version periodically
-elementpath==5.1.1
+elementpath==5.1.0

 # For fast image comparison in screenshot change detection
 # opencv-python-headless is OPTIONAL (excluded from requirements.txt)