mirror of
https://github.com/dgtlmoon/changedetection.io.git
synced 2025-11-10 11:36:55 +00:00
Compare commits
4 Commits
ticket-123
...
fetch-reli
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
6788796788 | ||
|
|
efafc9bef8 | ||
|
|
b7a2501d64 | ||
|
|
e970fef991 |
@@ -42,7 +42,7 @@ class BrowserStepsStepTimout(Exception):
|
||||
|
||||
|
||||
class PageUnloadable(Exception):
|
||||
def __init__(self, status_code, url, screenshot=False, message=False):
|
||||
def __init__(self, status_code, url, message, screenshot=False):
|
||||
# Set this so we can use it in other parts of the app
|
||||
self.status_code = status_code
|
||||
self.url = url
|
||||
@@ -299,23 +299,34 @@ class base_html_playwright(Fetcher):
|
||||
if len(request_headers):
|
||||
context.set_extra_http_headers(request_headers)
|
||||
|
||||
try:
|
||||
self.page.set_default_navigation_timeout(90000)
|
||||
self.page.set_default_timeout(90000)
|
||||
|
||||
# Listen for all console events and handle errors
|
||||
self.page.on("console", lambda msg: print(f"Playwright console: Watch URL: {url} {msg.type}: {msg.text} {msg.args}"))
|
||||
|
||||
# Bug - never set viewport size BEFORE page.goto
|
||||
|
||||
|
||||
# Waits for the next navigation. Using Python context manager
|
||||
# prevents a race condition between clicking and waiting for a navigation.
|
||||
response = self.page.goto(url, wait_until='commit')
|
||||
# Goto page
|
||||
try:
|
||||
# Wait_until = commit
|
||||
# - `'commit'` - consider operation to be finished when network response is received and the document started loading.
|
||||
# Better to not use any smarts from Playwright and just wait an arbitrary number of seconds
|
||||
# This seemed to solve nearly all 'TimeoutErrors'
|
||||
response = self.page.goto(url, wait_until='commit')
|
||||
except playwright._impl._api_types.Error as e:
|
||||
# Retry once - https://github.com/browserless/chrome/issues/2485
|
||||
# Sometimes errors related to invalid cert's and other can be random
|
||||
print ("Content Fetcher > retrying request got error - ", str(e))
|
||||
time.sleep(1)
|
||||
response = self.page.goto(url, wait_until='commit')
|
||||
|
||||
except Exception as e:
|
||||
print ("Content Fetcher > Other exception when page.goto", str(e))
|
||||
context.close()
|
||||
browser.close()
|
||||
raise PageUnloadable(url=url, status_code=None, message=str(e))
|
||||
|
||||
# Execute any browser steps
|
||||
try:
|
||||
extra_wait = int(os.getenv("WEBDRIVER_DELAY_BEFORE_CONTENT_READY", 5)) + self.render_extract_delay
|
||||
self.page.wait_for_timeout(extra_wait * 1000)
|
||||
|
||||
@@ -328,17 +339,15 @@ class base_html_playwright(Fetcher):
|
||||
# This can be ok, we will try to grab what we could retrieve
|
||||
pass
|
||||
except Exception as e:
|
||||
print ("other exception when page.goto")
|
||||
print (str(e))
|
||||
print ("Content Fetcher > Other exception when executing custom JS code", str(e))
|
||||
context.close()
|
||||
browser.close()
|
||||
raise PageUnloadable(url=url, status_code=None)
|
||||
|
||||
raise PageUnloadable(url=url, status_code=None, message=str(e))
|
||||
|
||||
if response is None:
|
||||
context.close()
|
||||
browser.close()
|
||||
print ("response object was none")
|
||||
print ("Content Fetcher > Response object was none")
|
||||
raise EmptyReply(url=url, status_code=None)
|
||||
|
||||
# Bug 2(?) Set the viewport size AFTER loading the page
|
||||
@@ -357,7 +366,7 @@ class base_html_playwright(Fetcher):
|
||||
if len(self.page.content().strip()) == 0:
|
||||
context.close()
|
||||
browser.close()
|
||||
print ("Content was empty")
|
||||
print ("Content Fetcher > Content was empty")
|
||||
raise EmptyReply(url=url, status_code=None)
|
||||
|
||||
# Bug 2(?) Set the viewport size AFTER loading the page
|
||||
@@ -502,7 +511,7 @@ class base_html_webdriver(Fetcher):
|
||||
try:
|
||||
self.driver.quit()
|
||||
except Exception as e:
|
||||
print("Exception in chrome shutdown/quit" + str(e))
|
||||
print("Content Fetcher > Exception in chrome shutdown/quit" + str(e))
|
||||
|
||||
|
||||
# "html_requests" is listed as the default fetcher in store.py!
|
||||
|
||||
@@ -1,4 +1,5 @@
|
||||
import hashlib
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import re
|
||||
@@ -167,6 +168,14 @@ class perform_site_check():
|
||||
include_filters_rule.append("json:$")
|
||||
has_filter_rule = True
|
||||
|
||||
if is_json:
|
||||
# Sort the JSON so we dont get false alerts when the content is just re-ordered
|
||||
try:
|
||||
fetcher.content = json.dumps(json.loads(fetcher.content), sort_keys=True)
|
||||
except Exception as e:
|
||||
# Might have just been a snippet, or otherwise bad JSON, continue
|
||||
pass
|
||||
|
||||
if has_filter_rule:
|
||||
json_filter_prefixes = ['json:', 'jq:']
|
||||
for filter in include_filters_rule:
|
||||
@@ -174,6 +183,8 @@ class perform_site_check():
|
||||
stripped_text_from_html += html_tools.extract_json_as_string(content=fetcher.content, json_filter=filter)
|
||||
is_html = False
|
||||
|
||||
|
||||
|
||||
if is_html or is_source:
|
||||
|
||||
# CSS Filter, extract the HTML that matches and feed that into the existing inscriptis::get_text
|
||||
|
||||
@@ -174,10 +174,23 @@ if (include_filters.length) {
|
||||
}
|
||||
|
||||
if (q) {
|
||||
bbox = q.getBoundingClientRect();
|
||||
console.log("xpath_element_scraper: Got filter element, scroll from top was "+scroll_y)
|
||||
} else {
|
||||
console.log("xpath_element_scraper: filter element "+f+" was not found");
|
||||
// #1231 - IN the case XPath attribute filter is applied, we will have to traverse up and find the element.
|
||||
if (q.hasOwnProperty('getBoundingClientRect')) {
|
||||
bbox = q.getBoundingClientRect();
|
||||
console.log("xpath_element_scraper: Got filter element, scroll from top was " + scroll_y)
|
||||
} else {
|
||||
try {
|
||||
// Try and see we can find its ownerElement
|
||||
bbox = q.ownerElement.getBoundingClientRect();
|
||||
console.log("xpath_element_scraper: Got filter by ownerElement element, scroll from top was " + scroll_y)
|
||||
} catch (e) {
|
||||
console.log("xpath_element_scraper: error looking up ownerElement")
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if(!q) {
|
||||
console.log("xpath_element_scraper: filter element " + f + " was not found");
|
||||
}
|
||||
|
||||
if (bbox && bbox['width'] > 0 && bbox['height'] > 0) {
|
||||
|
||||
@@ -394,6 +394,48 @@ def check_json_ext_filter(json_filter, client, live_server):
|
||||
res = client.get(url_for("form_delete", uuid="all"), follow_redirects=True)
|
||||
assert b'Deleted' in res.data
|
||||
|
||||
def test_ignore_json_order(client, live_server):
|
||||
# A change in order shouldn't trigger a notification
|
||||
|
||||
with open("test-datastore/endpoint-content.txt", "w") as f:
|
||||
f.write('{"hello" : 123, "world": 123}')
|
||||
|
||||
|
||||
# Add our URL to the import page
|
||||
test_url = url_for('test_endpoint', content_type="application/json", _external=True)
|
||||
res = client.post(
|
||||
url_for("import_page"),
|
||||
data={"urls": test_url},
|
||||
follow_redirects=True
|
||||
)
|
||||
assert b"1 Imported" in res.data
|
||||
|
||||
time.sleep(2)
|
||||
|
||||
with open("test-datastore/endpoint-content.txt", "w") as f:
|
||||
f.write('{"world" : 123, "hello": 123}')
|
||||
|
||||
# Trigger a check
|
||||
client.get(url_for("form_watch_checknow"), follow_redirects=True)
|
||||
time.sleep(2)
|
||||
|
||||
res = client.get(url_for("index"))
|
||||
assert b'unviewed' not in res.data
|
||||
|
||||
# Just to be sure it still works
|
||||
with open("test-datastore/endpoint-content.txt", "w") as f:
|
||||
f.write('{"world" : 123, "hello": 124}')
|
||||
|
||||
# Trigger a check
|
||||
client.get(url_for("form_watch_checknow"), follow_redirects=True)
|
||||
time.sleep(2)
|
||||
|
||||
res = client.get(url_for("index"))
|
||||
assert b'unviewed' in res.data
|
||||
|
||||
res = client.get(url_for("form_delete", uuid="all"), follow_redirects=True)
|
||||
assert b'Deleted' in res.data
|
||||
|
||||
def test_check_jsonpath_ext_filter(client, live_server):
|
||||
check_json_ext_filter('json:$[?(@.status==Sold)]', client, live_server)
|
||||
|
||||
|
||||
Reference in New Issue
Block a user