Compare commits

...

4 Commits

Author SHA1 Message Date
dgtlmoon 7375fd01b6 Also sendStop on ERR_ABORTED
Publish Python 🐍distribution 📦 to PyPI and TestPyPI / Build distribution 📦 (push) Has been cancelled
ChangeDetection.io Container Build Test / Build linux/amd64 (alpine) (push) Has been cancelled
ChangeDetection.io Container Build Test / Build linux/arm64 (alpine) (push) Has been cancelled
ChangeDetection.io Container Build Test / Build linux/amd64 (main) (push) Has been cancelled
ChangeDetection.io Container Build Test / Build linux/arm/v7 (main) (push) Has been cancelled
ChangeDetection.io Container Build Test / Build linux/arm/v8 (main) (push) Has been cancelled
ChangeDetection.io Container Build Test / Build linux/arm64 (main) (push) Has been cancelled
ChangeDetection.io App Test / lint-code (push) Has been cancelled
Publish Python 🐍distribution 📦 to PyPI and TestPyPI / Test the built package works basically. (push) Has been cancelled
Publish Python 🐍distribution 📦 to PyPI and TestPyPI / Publish Python 🐍 distribution 📦 to PyPI (push) Has been cancelled
ChangeDetection.io App Test / test-application-3-10 (push) Has been cancelled
ChangeDetection.io App Test / test-application-3-11 (push) Has been cancelled
ChangeDetection.io App Test / test-application-3-12 (push) Has been cancelled
ChangeDetection.io App Test / test-application-3-13 (push) Has been cancelled
ChangeDetection.io App Test / test-application-3-14 (push) Has been cancelled
2026-03-22 11:21:56 +01:00
dgtlmoon 5dd68468b9 Handle ERR_ABORTED caused by some redirects etc 2026-03-22 11:21:05 +01:00
dgtlmoon 6ec65e7b91 Merge branch 'master' into puppeteer-stop-hang-on-redirect 2026-03-22 11:19:30 +01:00
dgtlmoon 0ecfe19c48 Small fix for puppeteer redirect 2026-02-19 20:24:05 +01:00
@@ -386,9 +386,17 @@ class fetcher(Fetcher):
await self.page._client.send('Page.stopLoading')
logger.debug("stopLoading command sent!")
# Capture response metadata from CDP events as a fallback for when goto() times out
# (Page.stopLoading does not cause pyppeteer's goto() to resolve, so we need a hard timeout)
first_response_data = {}
async def setup_frame_handlers_on_first_response(event):
# Only trigger for the main document response
if event.get('type') == 'Document':
# Save status/headers so we can proceed even if goto() times out
resp = event.get('response', {})
first_response_data['status'] = resp.get('status', 200)
first_response_data['headers'] = resp.get('headers', {})
logger.debug("First response received, setting up frame handlers for forced page stop load.")
self.page._client.on('Page.frameStartedNavigating', lambda e: asyncio.create_task(handle_frame_navigation(e)))
self.page._client.on('Page.frameStartedLoading', lambda e: asyncio.create_task(handle_frame_navigation(e)))
@@ -405,7 +413,37 @@ class fetcher(Fetcher):
while not response:
logger.debug(f"Attempting page fetch {url} attempt {attempt}")
asyncio.create_task(handle_frame_navigation())
response = await self.page.goto(url, timeout=0)
try:
# Hard timeout safety net: Page.stopLoading CDP command does NOT cause pyppeteer's
# goto() to resolve. Iframes like reCAPTCHA v3 maintain persistent connections that
# prevent the 'load' event from ever firing, causing goto() to block forever.
# asyncio.wait_for() bounds the otherwise indefinite hang - for normal pages goto() returns naturally
# well within the timeout; for recaptcha/iframe-heavy pages it times out and we
# proceed with whatever content was loaded (first_response_data as fallback).
response = await asyncio.wait_for(self.page.goto(url, timeout=0), timeout=extra_wait + 15)
except asyncio.TimeoutError:
logger.warning(f"[{watch_uuid}] page.goto() timed out after {extra_wait + 15}s - Page.stopLoading did not resolve navigation, proceeding with content retrieved so far")
if self.page and hasattr(self.page, '_client'):
try:
await self.page._client.send('Page.stopLoading')
except Exception:
pass
break # response stays None; fall through to use first_response_data
except Exception as e:
if 'ERR_ABORTED' in str(e):
# Anti-bot JS challenges (Cloudflare, WP Simple Firewall etc) or JS-initiated
# redirects can cause Chrome to abort the original navigation. The page DOM is
# still loaded with whatever content was served, so proceed and scrape it.
logger.opt(exception=True).warning(f"[{watch_uuid}] page.goto() ERR_ABORTED for {url} - likely anti-bot challenge or JS redirect, proceeding with loaded content")
await asyncio.sleep(1 + extra_wait)
if self.page and hasattr(self.page, '_client'):
try:
await self.page._client.send('Page.stopLoading')
except Exception:
pass
break # response stays None; fall through to use first_response_data
raise
await asyncio.sleep(1 + extra_wait)
# Check if page still exists before sending command
if self.page and hasattr(self.page, '_client'):
@@ -420,7 +458,7 @@ class fetcher(Fetcher):
raise EmptyReply(url=url, status_code=None)
attempt+=1
self.headers = response.headers
self.headers = response.headers if response else first_response_data.get('headers', {})
try:
if self.webdriver_js_execute_code is not None and len(self.webdriver_js_execute_code):
@@ -432,7 +470,7 @@ class fetcher(Fetcher):
raise PageUnloadable(url=url, status_code=None, message=str(e))
try:
self.status_code = response.status
self.status_code = response.status if response else first_response_data.get('status', 200)
except Exception as e:
# https://github.com/dgtlmoon/changedetection.io/discussions/2122#discussioncomment-8241962
logger.critical(f"Response from the browser/Playwright did not have a status_code! Response follows.")
@@ -454,7 +492,7 @@ class fetcher(Fetcher):
if not empty_pages_are_a_change and len(content.strip()) == 0:
logger.error("Content Fetcher > Content was empty (empty_pages_are_a_change is False), closing browsers")
raise EmptyReply(url=url, status_code=response.status)
raise EmptyReply(url=url, status_code=self.status_code)
# Run Browser Steps here
# @todo not yet supported, we switch to playwright in this case