Also sendStop on ERR_ABORTED

Handle ERR_ABORTED caused by some redirects etc
Merge branch 'master' into puppeteer-stop-hang-on-redirect
2026-04-30 14:50:39 +00:00 · 2026-03-22 11:21:56 +01:00 · 2026-03-22 11:21:05 +01:00 · 2026-03-22 11:19:30 +01:00 · 2026-02-19 20:24:05 +01:00
1 changed file with 42 additions and 4 deletions
@@ -386,9 +386,17 @@ class fetcher(Fetcher):
            await self.page._client.send('Page.stopLoading')
            logger.debug("stopLoading command sent!")

+        # Capture response metadata from CDP events as a fallback for when goto() times out
+        # (Page.stopLoading does not cause pyppeteer's goto() to resolve, so we need a hard timeout)
+        first_response_data = {}
+
        async def setup_frame_handlers_on_first_response(event):
            # Only trigger for the main document response
            if event.get('type') == 'Document':
+                # Save status/headers so we can proceed even if goto() times out
+                resp = event.get('response', {})
+                first_response_data['status'] = resp.get('status', 200)
+                first_response_data['headers'] = resp.get('headers', {})
                logger.debug("First response received, setting up frame handlers for forced page stop load.")
                self.page._client.on('Page.frameStartedNavigating', lambda e: asyncio.create_task(handle_frame_navigation(e)))
                self.page._client.on('Page.frameStartedLoading', lambda e: asyncio.create_task(handle_frame_navigation(e)))
@@ -405,7 +413,37 @@ class fetcher(Fetcher):
        while not response:
            logger.debug(f"Attempting page fetch {url} attempt {attempt}")
            asyncio.create_task(handle_frame_navigation())
-            response = await self.page.goto(url, timeout=0)
+            try:
+                # Hard timeout safety net: Page.stopLoading CDP command does NOT cause pyppeteer's
+                # goto() to resolve. Iframes like reCAPTCHA v3 maintain persistent connections that
+                # prevent the 'load' event from ever firing, causing goto() to block forever.
+                # asyncio.wait_for() breaks the deadlock - for normal pages goto() returns naturally
+                # well within the timeout; for recaptcha/iframe-heavy pages it times out and we
+                # proceed with whatever content was loaded (first_response_data as fallback).
+                response = await asyncio.wait_for(self.page.goto(url, timeout=0), timeout=extra_wait + 15)
+            except asyncio.TimeoutError:
+                logger.warning(f"[{watch_uuid}] page.goto() timed out after {extra_wait + 15}s - Page.stopLoading did not resolve navigation, proceeding with content retrieved so far")
+                if self.page and hasattr(self.page, '_client'):
+                    try:
+                        await self.page._client.send('Page.stopLoading')
+                    except Exception:
+                        pass
+                break  # response stays None; fall through to use first_response_data
+            except Exception as e:
+                if 'ERR_ABORTED' in str(e):
+                    # Anti-bot JS challenges (Cloudflare, WP Simple Firewall etc) or JS-initiated
+                    # redirects can cause Chrome to abort the original navigation. The page DOM is
+                    # still loaded with whatever content was served, so proceed and scrape it.
+                    logger.opt(exception=True).warning(f"[{watch_uuid}] page.goto() ERR_ABORTED for {url} - likely anti-bot challenge or JS redirect, proceeding with loaded content")
+                    await asyncio.sleep(1 + extra_wait)
+                    if self.page and hasattr(self.page, '_client'):
+                        try:
+                            await self.page._client.send('Page.stopLoading')
+                        except Exception:
+                            pass
+                    break  # response stays None; fall through to use first_response_data
+                raise
+
            await asyncio.sleep(1 + extra_wait)
            # Check if page still exists before sending command
            if self.page and hasattr(self.page, '_client'):
@@ -420,7 +458,7 @@ class fetcher(Fetcher):
                raise EmptyReply(url=url, status_code=None)
            attempt+=1

-        self.headers = response.headers
+        self.headers = response.headers if response else first_response_data.get('headers', {})

        try:
            if self.webdriver_js_execute_code is not None and len(self.webdriver_js_execute_code):
@@ -432,7 +470,7 @@ class fetcher(Fetcher):
            raise PageUnloadable(url=url, status_code=None, message=str(e))

        try:
-            self.status_code = response.status
+            self.status_code = response.status if response else first_response_data.get('status', 200)
        except Exception as e:
            # https://github.com/dgtlmoon/changedetection.io/discussions/2122#discussioncomment-8241962
            logger.critical(f"Response from the browser/Playwright did not have a status_code! Response follows.")
@@ -454,7 +492,7 @@ class fetcher(Fetcher):

        if not empty_pages_are_a_change and len(content.strip()) == 0:
            logger.error("Content Fetcher > Content was empty (empty_pages_are_a_change is False), closing browsers")
-            raise EmptyReply(url=url, status_code=response.status)
+            raise EmptyReply(url=url, status_code=self.status_code)

        # Run Browser Steps here
        # @todo not yet supported, we switch to playwright in this case
Author	SHA1	Message	Date
dgtlmoon	7375fd01b6	Also sendStop on ERR_ABORTED Publish Python 🐍distribution 📦 to PyPI and TestPyPI / Build distribution 📦 (push) Has been cancelled Details ChangeDetection.io Container Build Test / Build linux/amd64 (alpine) (push) Has been cancelled Details ChangeDetection.io Container Build Test / Build linux/arm64 (alpine) (push) Has been cancelled Details ChangeDetection.io Container Build Test / Build linux/amd64 (main) (push) Has been cancelled Details ChangeDetection.io Container Build Test / Build linux/arm/v7 (main) (push) Has been cancelled Details ChangeDetection.io Container Build Test / Build linux/arm/v8 (main) (push) Has been cancelled Details ChangeDetection.io Container Build Test / Build linux/arm64 (main) (push) Has been cancelled Details ChangeDetection.io App Test / lint-code (push) Has been cancelled Details Publish Python 🐍distribution 📦 to PyPI and TestPyPI / Test the built package works basically. (push) Has been cancelled Details Publish Python 🐍distribution 📦 to PyPI and TestPyPI / Publish Python 🐍 distribution 📦 to PyPI (push) Has been cancelled Details ChangeDetection.io App Test / test-application-3-10 (push) Has been cancelled Details ChangeDetection.io App Test / test-application-3-11 (push) Has been cancelled Details ChangeDetection.io App Test / test-application-3-12 (push) Has been cancelled Details ChangeDetection.io App Test / test-application-3-13 (push) Has been cancelled Details ChangeDetection.io App Test / test-application-3-14 (push) Has been cancelled Details	2026-03-22 11:21:56 +01:00
dgtlmoon	5dd68468b9	Handle ERR_ABORTED caused by some redirects etc	2026-03-22 11:21:05 +01:00
dgtlmoon	6ec65e7b91	Merge branch 'master' into puppeteer-stop-hang-on-redirect	2026-03-22 11:19:30 +01:00
dgtlmoon	0ecfe19c48	Small fix for puppeteer redirect	2026-02-19 20:24:05 +01:00