Compare commits

...

4 Commits

4 changed files with 94 additions and 19 deletions

View File

@@ -42,7 +42,7 @@ class BrowserStepsStepTimout(Exception):
 class PageUnloadable(Exception):
-    def __init__(self, status_code, url, screenshot=False, message=False):
+    def __init__(self, status_code, url, message, screenshot=False):
         # Set this so we can use it in other parts of the app
         self.status_code = status_code
         self.url = url
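The reordered constructor makes message a required argument instead of an optional flag. A minimal sketch of the resulting exception class (the message and screenshot attribute assignments are assumed here; they sit just below the lines shown in the hunk):

    class PageUnloadable(Exception):
        def __init__(self, status_code, url, message, screenshot=False):
            # Set this so we can use it in other parts of the app
            self.status_code = status_code
            self.url = url
            self.screenshot = screenshot
            self.message = message

    # Callers now pass the reason explicitly, as in the fetcher hunks below:
    # raise PageUnloadable(url=url, status_code=None, message=str(e))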
@@ -299,23 +299,34 @@ class base_html_playwright(Fetcher):
             if len(request_headers):
                 context.set_extra_http_headers(request_headers)

-            try:
             self.page.set_default_navigation_timeout(90000)
             self.page.set_default_timeout(90000)

             # Listen for all console events and handle errors
             self.page.on("console", lambda msg: print(f"Playwright console: Watch URL: {url} {msg.type}: {msg.text} {msg.args}"))

-            # Bug - never set viewport size BEFORE page.goto
-
-            # Waits for the next navigation. Using Python context manager
-            # prevents a race condition between clicking and waiting for a navigation.
-            response = self.page.goto(url, wait_until='commit')
+            # Goto page
+            try:
                 # Wait_until = commit
                 # - `'commit'` - consider operation to be finished when network response is received and the document started loading.
                 # Better to not use any smarts from Playwright and just wait an arbitrary number of seconds
                 # This seemed to solve nearly all 'TimeoutErrors'
+                response = self.page.goto(url, wait_until='commit')
+            except playwright._impl._api_types.Error as e:
+                # Retry once - https://github.com/browserless/chrome/issues/2485
+                # Sometimes errors related to invalid cert's and other can be random
+                print ("Content Fetcher > retrying request got error - ", str(e))
+                time.sleep(1)
+                response = self.page.goto(url, wait_until='commit')
+
+            except Exception as e:
+                print ("Content Fetcher > Other exception when page.goto", str(e))
+                context.close()
+                browser.close()
+                raise PageUnloadable(url=url, status_code=None, message=str(e))
+
+            # Execute any browser steps
+            try:
                 extra_wait = int(os.getenv("WEBDRIVER_DELAY_BEFORE_CONTENT_READY", 5)) + self.render_extract_delay
                 self.page.wait_for_timeout(extra_wait * 1000)
@@ -328,17 +339,15 @@ class base_html_playwright(Fetcher):
                 # This can be ok, we will try to grab what we could retrieve
                 pass
             except Exception as e:
-                print ("other exception when page.goto")
-                print (str(e))
+                print ("Content Fetcher > Other exception when executing custom JS code", str(e))
                 context.close()
                 browser.close()
-                raise PageUnloadable(url=url, status_code=None)
+                raise PageUnloadable(url=url, status_code=None, message=str(e))

             if response is None:
                 context.close()
                 browser.close()
-                print ("response object was none")
+                print ("Content Fetcher > Response object was none")
                 raise EmptyReply(url=url, status_code=None)

             # Bug 2(?) Set the viewport size AFTER loading the page
@@ -357,7 +366,7 @@ class base_html_playwright(Fetcher):
             if len(self.page.content().strip()) == 0:
                 context.close()
                 browser.close()
-                print ("Content was empty")
+                print ("Content Fetcher > Content was empty")
                 raise EmptyReply(url=url, status_code=None)

             # Bug 2(?) Set the viewport size AFTER loading the page
@@ -502,7 +511,7 @@ class base_html_webdriver(Fetcher):
         try:
             self.driver.quit()
         except Exception as e:
-            print("Exception in chrome shutdown/quit" + str(e))
+            print("Content Fetcher > Exception in chrome shutdown/quit" + str(e))

 # "html_requests" is listed as the default fetcher in store.py!

View File

@@ -1,4 +1,5 @@
 import hashlib
+import json
 import logging
 import os
 import re
@@ -167,6 +168,14 @@ class perform_site_check():
             include_filters_rule.append("json:$")
             has_filter_rule = True

+        if is_json:
+            # Sort the JSON so we dont get false alerts when the content is just re-ordered
+            try:
+                fetcher.content = json.dumps(json.loads(fetcher.content), sort_keys=True)
+            except Exception as e:
+                # Might have just been a snippet, or otherwise bad JSON, continue
+                pass
+
         if has_filter_rule:
             json_filter_prefixes = ['json:', 'jq:']
             for filter in include_filters_rule:
@@ -174,6 +183,8 @@ class perform_site_check():
                 stripped_text_from_html += html_tools.extract_json_as_string(content=fetcher.content, json_filter=filter)
                 is_html = False
+
+
         if is_html or is_source:

             # CSS Filter, extract the HTML that matches and feed that into the existing inscriptis::get_text
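The json.dumps(json.loads(...), sort_keys=True) round-trip above normalises key order before the text diff runs, so two payloads that differ only in ordering serialise identically, while a real value change still registers. A quick standalone illustration (not project code):

    import json

    def normalise(s):
        return json.dumps(json.loads(s), sort_keys=True)

    a = '{"hello" : 123, "world": 123}'
    b = '{"world" : 123, "hello": 123}'   # same data, keys re-ordered
    c = '{"world" : 123, "hello": 124}'   # an actual value change

    assert normalise(a) == normalise(b)   # re-ordering alone is not a change
    assert normalise(a) != normalise(c)   # a changed value still shows up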

View File

@@ -174,10 +174,23 @@ if (include_filters.length) {
         }

         if (q) {
-            bbox = q.getBoundingClientRect();
-            console.log("xpath_element_scraper: Got filter element, scroll from top was "+scroll_y)
-        } else {
-            console.log("xpath_element_scraper: filter element "+f+" was not found");
+            // #1231 - IN the case XPath attribute filter is applied, we will have to traverse up and find the element.
+            if (q.hasOwnProperty('getBoundingClientRect')) {
+                bbox = q.getBoundingClientRect();
+                console.log("xpath_element_scraper: Got filter element, scroll from top was " + scroll_y)
+            } else {
+                try {
+                    // Try and see we can find its ownerElement
+                    bbox = q.ownerElement.getBoundingClientRect();
+                    console.log("xpath_element_scraper: Got filter by ownerElement element, scroll from top was " + scroll_y)
+                } catch (e) {
+                    console.log("xpath_element_scraper: error looking up ownerElement")
+                }
+            }
         }

+        if(!q) {
+            console.log("xpath_element_scraper: filter element " + f + " was not found");
+        }
+
         if (bbox && bbox['width'] > 0 && bbox['height'] > 0) {

View File

@@ -394,6 +394,48 @@ def check_json_ext_filter(json_filter, client, live_server):
     res = client.get(url_for("form_delete", uuid="all"), follow_redirects=True)
     assert b'Deleted' in res.data

+
+def test_ignore_json_order(client, live_server):
+    # A change in order shouldn't trigger a notification
+    with open("test-datastore/endpoint-content.txt", "w") as f:
+        f.write('{"hello" : 123, "world": 123}')
+
+    # Add our URL to the import page
+    test_url = url_for('test_endpoint', content_type="application/json", _external=True)
+    res = client.post(
+        url_for("import_page"),
+        data={"urls": test_url},
+        follow_redirects=True
+    )
+    assert b"1 Imported" in res.data
+    time.sleep(2)
+
+    with open("test-datastore/endpoint-content.txt", "w") as f:
+        f.write('{"world" : 123, "hello": 123}')
+
+    # Trigger a check
+    client.get(url_for("form_watch_checknow"), follow_redirects=True)
+    time.sleep(2)
+
+    res = client.get(url_for("index"))
+    assert b'unviewed' not in res.data
+
+    # Just to be sure it still works
+    with open("test-datastore/endpoint-content.txt", "w") as f:
+        f.write('{"world" : 123, "hello": 124}')
+
+    # Trigger a check
+    client.get(url_for("form_watch_checknow"), follow_redirects=True)
+    time.sleep(2)
+
+    res = client.get(url_for("index"))
+    assert b'unviewed' in res.data
+
+    res = client.get(url_for("form_delete", uuid="all"), follow_redirects=True)
+    assert b'Deleted' in res.data
+
+
 def test_check_jsonpath_ext_filter(client, live_server):
     check_json_ext_filter('json:$[?(@.status==Sold)]', client, live_server)