mirror of
https://github.com/dgtlmoon/changedetection.io.git
synced 2025-11-02 23:57:22 +00:00
Compare commits
10 Commits
test-tidy
...
update-ins
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
dc33d49840 | ||
|
|
c30f96c4cd | ||
|
|
c8310b7e93 | ||
|
|
3b16b19a94 | ||
|
|
4ee9fa79e1 | ||
|
|
4b49759113 | ||
|
|
e9a9790cb0 | ||
|
|
593660e2f6 | ||
|
|
7d96b4ba83 | ||
|
|
fca40e4d5b |
@@ -18,8 +18,7 @@ def construct_blueprint(datastore: ChangeDetectionStore, update_q: PriorityQueue
|
||||
def accept(uuid):
|
||||
datastore.data['watching'][uuid]['track_ldjson_price_data'] = PRICE_DATA_TRACK_ACCEPT
|
||||
update_q.put(queuedWatchMetaData.PrioritizedItem(priority=1, item={'uuid': uuid, 'skip_when_checksum_same': False}))
|
||||
return redirect(url_for("form_watch_checknow", uuid=uuid))
|
||||
|
||||
return redirect(url_for("index"))
|
||||
|
||||
@login_required
|
||||
@price_data_follower_blueprint.route("/<string:uuid>/reject", methods=['GET'])
|
||||
|
||||
@@ -51,6 +51,7 @@ class BrowserStepsStepException(Exception):
|
||||
return
|
||||
|
||||
|
||||
# @todo - make base Exception class that announces via logger()
|
||||
class PageUnloadable(Exception):
|
||||
def __init__(self, status_code, url, message, screenshot=False):
|
||||
# Set this so we can use it in other parts of the app
|
||||
@@ -60,6 +61,10 @@ class PageUnloadable(Exception):
|
||||
self.message = message
|
||||
return
|
||||
|
||||
class BrowserStepsInUnsupportedFetcher(Exception):
|
||||
def __init__(self, url):
|
||||
self.url = url
|
||||
return
|
||||
|
||||
class EmptyReply(Exception):
|
||||
def __init__(self, status_code, url, screenshot=None):
|
||||
@@ -389,10 +394,24 @@ class base_html_playwright(Fetcher):
|
||||
raise PageUnloadable(url=url, status_code=None, message=f"Timed out connecting to browserless, retrying..")
|
||||
else:
|
||||
# 200 Here means that the communication to browserless worked only, not the page state
|
||||
if response.status_code == 200:
|
||||
try:
|
||||
x = response.json()
|
||||
except Exception as e:
|
||||
raise PageUnloadable(url=url, message="Error reading JSON response from browserless")
|
||||
|
||||
try:
|
||||
self.status_code = response.status_code
|
||||
except Exception as e:
|
||||
raise PageUnloadable(url=url, message="Error reading status_code code response from browserless")
|
||||
|
||||
self.headers = x.get('headers')
|
||||
|
||||
if self.status_code != 200 and not ignore_status_codes:
|
||||
raise Non200ErrorCodeReceived(url=url, status_code=self.status_code, page_html=x.get('content',''))
|
||||
|
||||
if self.status_code == 200:
|
||||
import base64
|
||||
|
||||
x = response.json()
|
||||
if not x.get('screenshot'):
|
||||
# https://github.com/puppeteer/puppeteer/blob/v1.0.0/docs/troubleshooting.md#tips
|
||||
# https://github.com/puppeteer/puppeteer/issues/1834
|
||||
@@ -403,16 +422,10 @@ class base_html_playwright(Fetcher):
|
||||
if not x.get('content', '').strip():
|
||||
raise EmptyReply(url=url, status_code=None)
|
||||
|
||||
if x.get('status_code', 200) != 200 and not ignore_status_codes:
|
||||
raise Non200ErrorCodeReceived(url=url, status_code=x.get('status_code', 200), page_html=x['content'])
|
||||
|
||||
self.content = x.get('content')
|
||||
self.headers = x.get('headers')
|
||||
self.instock_data = x.get('instock_data')
|
||||
self.screenshot = base64.b64decode(x.get('screenshot'))
|
||||
self.status_code = x.get('status_code')
|
||||
self.xpath_data = x.get('xpath_data')
|
||||
|
||||
else:
|
||||
# Some other error from browserless
|
||||
raise PageUnloadable(url=url, status_code=None, message=response.content.decode('utf-8'))
|
||||
@@ -703,6 +716,9 @@ class html_requests(Fetcher):
|
||||
current_include_filters=None,
|
||||
is_binary=False):
|
||||
|
||||
if self.browser_steps_get_valid_steps():
|
||||
raise BrowserStepsInUnsupportedFetcher(url=url)
|
||||
|
||||
# Make requests use a more modern looking user-agent
|
||||
if not {k.lower(): v for k, v in request_headers.items()}.get('user-agent', None):
|
||||
request_headers['User-Agent'] = os.getenv("DEFAULT_SETTINGS_HEADERS_USERAGENT",
|
||||
@@ -742,6 +758,8 @@ class html_requests(Fetcher):
|
||||
if encoding:
|
||||
r.encoding = encoding
|
||||
|
||||
self.headers = r.headers
|
||||
|
||||
if not r.content or not len(r.content):
|
||||
raise EmptyReply(url=url, status_code=r.status_code)
|
||||
|
||||
@@ -758,7 +776,7 @@ class html_requests(Fetcher):
|
||||
else:
|
||||
self.content = r.text
|
||||
|
||||
self.headers = r.headers
|
||||
|
||||
self.raw_content = r.content
|
||||
|
||||
|
||||
|
||||
@@ -409,23 +409,6 @@ def has_ldjson_product_info(content):
|
||||
x=bool(pricing_data)
|
||||
return x
|
||||
|
||||
|
||||
def workarounds_for_obfuscations(content):
|
||||
"""
|
||||
Some sites are using sneaky tactics to make prices and other information un-renderable by Inscriptis
|
||||
This could go into its own Pip package in the future, for faster updates
|
||||
"""
|
||||
|
||||
# HomeDepot.com style <span>$<!-- -->90<!-- -->.<!-- -->74</span>
|
||||
# https://github.com/weblyzard/inscriptis/issues/45
|
||||
if not content:
|
||||
return content
|
||||
|
||||
content = re.sub('<!--\s+-->', '', content)
|
||||
|
||||
return content
|
||||
|
||||
|
||||
def get_triggered_text(content, trigger_text):
|
||||
triggered_text = []
|
||||
result = strip_ignore_text(content=content,
|
||||
|
||||
@@ -45,6 +45,7 @@ base_config = {
|
||||
'last_error': False,
|
||||
'last_viewed': 0, # history key value of the last viewed via the [diff] link
|
||||
'method': 'GET',
|
||||
'notification_alert_count': 0,
|
||||
# Custom notification content
|
||||
'notification_body': None,
|
||||
'notification_format': default_notification_format_for_watch,
|
||||
@@ -56,6 +57,7 @@ base_config = {
|
||||
'previous_md5': False,
|
||||
'previous_md5_before_filters': False, # Used for skipping changedetection entirely
|
||||
'proxy': None, # Preferred proxy connection
|
||||
'remote_server_reply': None, # From 'server' reply header
|
||||
'subtractive_selectors': [],
|
||||
'tag': '', # Old system of text name for a tag, to be removed
|
||||
'tags': [], # list of UUIDs to App.Tags
|
||||
|
||||
@@ -151,7 +151,6 @@ class perform_site_check(difference_detection_processor):
|
||||
if is_html or watch.is_source_type_url:
|
||||
|
||||
# CSS Filter, extract the HTML that matches and feed that into the existing inscriptis::get_text
|
||||
self.fetcher.content = html_tools.workarounds_for_obfuscations(self.fetcher.content)
|
||||
html_content = self.fetcher.content
|
||||
|
||||
# If not JSON, and if it's not text/plain..
|
||||
|
||||
@@ -47,6 +47,7 @@ function isItemInStock() {
|
||||
'não estamos a aceitar encomendas',
|
||||
'out of stock',
|
||||
'out-of-stock',
|
||||
'prodotto esaurito',
|
||||
'produkt niedostępny',
|
||||
'sold out',
|
||||
'sold-out',
|
||||
|
||||
@@ -90,5 +90,10 @@ $(document).ready(function () {
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
$('#diff-form').on('submit', function (e) {
|
||||
if ($('select[name=from_version]').val() === $('select[name=to_version]').val()) {
|
||||
e.preventDefault();
|
||||
alert('Error - You are trying to compare the same version.');
|
||||
}
|
||||
});
|
||||
});
|
||||
|
||||
@@ -255,6 +255,7 @@ class ChangeDetectionStore:
|
||||
'last_viewed': 0,
|
||||
'previous_md5': False,
|
||||
'previous_md5_before_filters': False,
|
||||
'remote_server_reply': None,
|
||||
'track_ldjson_price_data': None,
|
||||
})
|
||||
|
||||
|
||||
@@ -13,7 +13,7 @@
|
||||
<script src="{{url_for('static_content', group='js', filename='diff-overview.js')}}" defer></script>
|
||||
|
||||
<div id="settings">
|
||||
<form class="pure-form " action="" method="GET">
|
||||
<form class="pure-form " action="" method="GET" id="diff-form">
|
||||
<fieldset>
|
||||
{% if versions|length >= 1 %}
|
||||
<strong>Compare</strong>
|
||||
|
||||
@@ -483,6 +483,10 @@ Unavailable") }}
|
||||
<td>Last fetch time</td>
|
||||
<td>{{ watch.fetch_time }}s</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>Notification alert count</td>
|
||||
<td>{{ watch.notification_alert_count }}</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
</div>
|
||||
|
||||
@@ -2,7 +2,7 @@
|
||||
|
||||
import time
|
||||
from flask import url_for
|
||||
from .util import live_server_setup
|
||||
from .util import live_server_setup, wait_for_all_checks
|
||||
|
||||
|
||||
def set_original_ignore_response():
|
||||
@@ -21,7 +21,7 @@ def set_original_ignore_response():
|
||||
def test_obfuscations(client, live_server):
|
||||
set_original_ignore_response()
|
||||
live_server_setup(live_server)
|
||||
time.sleep(1)
|
||||
|
||||
# Add our URL to the import page
|
||||
test_url = url_for('test_endpoint', _external=True)
|
||||
res = client.post(
|
||||
@@ -32,12 +32,12 @@ def test_obfuscations(client, live_server):
|
||||
assert b"1 Imported" in res.data
|
||||
|
||||
# Give the thread time to pick it up
|
||||
time.sleep(3)
|
||||
wait_for_all_checks(client)
|
||||
|
||||
# Check HTML conversion detected and workd
|
||||
res = client.get(
|
||||
url_for("preview_page", uuid="first"),
|
||||
follow_redirects=True
|
||||
)
|
||||
|
||||
# whitespace appears but it renders https://github.com/weblyzard/inscriptis/issues/45#issuecomment-1923339265
|
||||
assert b'$90.74' in res.data
|
||||
|
||||
@@ -10,7 +10,7 @@ def test_setup(live_server):
|
||||
# Hard to just add more live server URLs when one test is already running (I think)
|
||||
# So we add our test here (was in a different file)
|
||||
def test_headers_in_request(client, live_server):
|
||||
#live_server_setup(live_server)
|
||||
#ve_server_setup(live_server)
|
||||
# Add our URL to the import page
|
||||
test_url = url_for('test_headers', _external=True)
|
||||
if os.getenv('PLAYWRIGHT_DRIVER_URL'):
|
||||
@@ -70,16 +70,17 @@ def test_headers_in_request(client, live_server):
|
||||
|
||||
wait_for_all_checks(client)
|
||||
|
||||
# Re #137 - Examine the JSON index file, it should have only one set of headers entered
|
||||
# Re #137 - It should have only one set of headers entered
|
||||
watches_with_headers = 0
|
||||
with open('test-datastore/url-watches.json') as f:
|
||||
app_struct = json.load(f)
|
||||
for uuid in app_struct['watching']:
|
||||
if (len(app_struct['watching'][uuid]['headers'])):
|
||||
for k, watch in client.application.config.get('DATASTORE').data.get('watching').items():
|
||||
if (len(watch['headers'])):
|
||||
watches_with_headers += 1
|
||||
assert watches_with_headers == 1
|
||||
|
||||
# 'server' http header was automatically recorded
|
||||
for k, watch in client.application.config.get('DATASTORE').data.get('watching').items():
|
||||
assert 'custom' in watch.get('remote_server_reply') # added in util.py
|
||||
|
||||
# Should be only one with headers set
|
||||
assert watches_with_headers==1
|
||||
res = client.get(url_for("form_delete", uuid="all"), follow_redirects=True)
|
||||
assert b'Deleted' in res.data
|
||||
|
||||
|
||||
@@ -175,12 +175,16 @@ def live_server_setup(live_server):
|
||||
@live_server.app.route('/test-headers')
|
||||
def test_headers():
|
||||
|
||||
output= []
|
||||
output = []
|
||||
|
||||
for header in request.headers:
|
||||
output.append("{}:{}".format(str(header[0]),str(header[1]) ))
|
||||
output.append("{}:{}".format(str(header[0]), str(header[1])))
|
||||
|
||||
return "\n".join(output)
|
||||
content = "\n".join(output)
|
||||
|
||||
resp = make_response(content, 200)
|
||||
resp.headers['server'] = 'custom'
|
||||
return resp
|
||||
|
||||
# Just return the body in the request
|
||||
@live_server.app.route('/test-body', methods=['POST', 'GET'])
|
||||
|
||||
@@ -150,6 +150,10 @@ class update_worker(threading.Thread):
|
||||
queued = False
|
||||
if n_object and n_object.get('notification_urls'):
|
||||
queued = True
|
||||
|
||||
count = watch.get('notification_alert_count', 0) + 1
|
||||
self.datastore.update_watch(uuid=watch_uuid, update_obj={'notification_alert_count': count})
|
||||
|
||||
self.queue_notification_for_watch(notification_q=self.notification_q, n_object=n_object, watch=watch)
|
||||
|
||||
return queued
|
||||
@@ -430,6 +434,12 @@ class update_worker(threading.Thread):
|
||||
'last_check_status': e.status_code,
|
||||
'has_ldjson_price_data': None})
|
||||
process_changedetection_results = False
|
||||
except content_fetcher.BrowserStepsInUnsupportedFetcher as e:
|
||||
err_text = "This watch has Browser Steps configured and so it cannot run with the 'Basic fast Plaintext/HTTP Client', either remove the Browser Steps or select a Chrome fetcher."
|
||||
self.datastore.update_watch(uuid=uuid, update_obj={'last_error': err_text})
|
||||
process_changedetection_results = False
|
||||
logger.error(f"Exception (BrowserStepsInUnsupportedFetcher) reached processing watch UUID: {uuid}")
|
||||
|
||||
except UnableToExtractRestockData as e:
|
||||
# Usually when fetcher.instock_data returns empty
|
||||
logger.error(f"Exception (UnableToExtractRestockData) reached processing watch UUID: {uuid}")
|
||||
@@ -491,6 +501,16 @@ class update_worker(threading.Thread):
|
||||
if self.datastore.data['watching'].get(uuid):
|
||||
# Always record that we atleast tried
|
||||
count = self.datastore.data['watching'][uuid].get('check_count', 0) + 1
|
||||
|
||||
# Record the 'server' header reply, can be used for actions in the future like cloudflare/akamai workarounds
|
||||
try:
|
||||
server_header = update_handler.fetcher.headers.get('server', '').strip().lower()[:255]
|
||||
self.datastore.update_watch(uuid=uuid,
|
||||
update_obj={'remote_server_reply': server_header}
|
||||
)
|
||||
except Exception as e:
|
||||
pass
|
||||
|
||||
self.datastore.update_watch(uuid=uuid, update_obj={'fetch_time': round(time.time() - now, 3),
|
||||
'last_checked': round(time.time()),
|
||||
'check_count': count
|
||||
|
||||
@@ -94,7 +94,8 @@ services:
|
||||
#
|
||||
|
||||
# Used for fetching pages via Playwright+Chrome where you need Javascript support.
|
||||
# Note: works well but is deprecated, does not fetch full page screenshots (doesnt work with Visual Selector) and other issues
|
||||
# Note: Works well but is deprecated, does not fetch full page screenshots (doesnt work with Visual Selector)
|
||||
# Does not report status codes (200, 404, 403) and other issues
|
||||
# More information about the advantages of playwright/browserless https://www.browserless.io/blog/2023/12/13/migrating-selenium-to-playwright/
|
||||
# browser-chrome:
|
||||
# hostname: browser-chrome
|
||||
|
||||
@@ -8,7 +8,7 @@ flask_expects_json~=1.7
|
||||
flask_restful
|
||||
flask_wtf~=1.2
|
||||
flask~=2.3
|
||||
inscriptis~=2.2
|
||||
inscriptis~=2.4
|
||||
pytz
|
||||
timeago~=1.0
|
||||
validators~=0.21
|
||||
|
||||
Reference in New Issue
Block a user