Compare commits

..

17 Commits

Author SHA1 Message Date
dgtlmoon
c3731cf055 0.40.0.3 2022-12-19 12:41:52 +01:00
dgtlmoon
a287e5a86c Visual Selector - Select smallest/most precise element first, better filtering of zero size elements 2022-12-19 12:33:31 +01:00
dgtlmoon
235535c327 Fetching - Check the most overdue watch first (#1242) 2022-12-17 15:40:57 +01:00
dgtlmoon
44dc62da2d Overview list - Checkbox action "Recheck" 2022-12-16 18:35:09 +01:00
dgtlmoon
0c380c170f Playwright - Better error reporting and re-try fetch on fail once (#1238) 2022-12-16 18:06:14 +01:00
dgtlmoon
b7a2501d64 Fetching - Always sort the key order of JSON content for less false alerts (May cause an alert on upgrade, but will be better going forwards) #1219 2022-12-15 09:13:09 +01:00
dgtlmoon
e970fef991 Fetcher + VisualSelector - xPath filter with attribute filter was breaking the element finder 2022-12-14 19:06:49 +01:00
dgtlmoon
b76148a0f4 Fetcher - CPU usage - Skip processing if the previous checksum and the just fetched one was the same (#925) 2022-12-14 15:08:34 +01:00
dgtlmoon
93cc30437f Playwright+BrowserSteps - Fetch changes - Fetch simply after page starts rendering + delay seconds, disable service workers 2022-12-14 12:16:04 +01:00
dgtlmoon
6562d6e0d4 Improve ARM/rust build comment 2022-12-13 12:28:20 +01:00
dgtlmoon
6c217cc3b6 README.md - Improving JSONPath example for LD+JSON product data 2022-12-11 11:14:52 +01:00
dgtlmoon
f30cdf0674 0.40.0.2 2022-12-08 22:36:59 +01:00
dgtlmoon
14da0646a7 Price follower - Dont scan for ldjson data when 'no' was clicked on the suggestion (#1207) 2022-12-08 22:35:37 +01:00
dgtlmoon
b413cdecc7 Adding missing parts for pip build Re #1206 2022-12-08 21:54:55 +01:00
dgtlmoon
7bf52d9275 0.40.0 2022-12-08 20:09:42 +01:00
dgtlmoon
09e6624afd VisualSelector - Exclude items that are not interactable or visible 2022-12-08 20:08:41 +01:00
dgtlmoon
b58fd995b5 Automatically offer to track LD+JSON product price data (#1204) 2022-12-08 19:28:20 +01:00
22 changed files with 350 additions and 142 deletions

View File

@@ -1,7 +1,7 @@
# pip dependencies install stage
FROM python:3.8-slim as builder
# rustc compiler would be needed on ARM type devices but theres an issue with some deps not building..
# See `cryptography` pin comment in requirements.txt
ARG CRYPTOGRAPHY_DONT_BUILD_RUST=1
RUN apt-get update && apt-get install -y --no-install-recommends \
@@ -31,8 +31,7 @@ RUN pip install --target=/dependencies playwright~=1.27.1 \
# Final image stage
FROM python:3.8-slim
# Actual packages needed at runtime, usually due to the notification (apprise) backend
# rustc compiler would be needed on ARM type devices but theres an issue with some deps not building..
# See `cryptography` pin comment in requirements.txt
ARG CRYPTOGRAPHY_DONT_BUILD_RUST=1
# Re #93, #73, excluding rustc (adds another 430Mb~)

View File

@@ -1,9 +1,10 @@
recursive-include changedetectionio/api *
recursive-include changedetectionio/templates *
recursive-include changedetectionio/static *
recursive-include changedetectionio/blueprint *
recursive-include changedetectionio/model *
recursive-include changedetectionio/tests *
recursive-include changedetectionio/res *
recursive-include changedetectionio/static *
recursive-include changedetectionio/templates *
recursive-include changedetectionio/tests *
prune changedetectionio/static/package-lock.json
prune changedetectionio/static/styles/node_modules
prune changedetectionio/static/styles/package-lock.json

View File

@@ -187,11 +187,29 @@ When you enable a `json:` or `jq:` filter, you can even automatically extract an
<html>
...
<script type="application/ld+json">
{"@context":"http://schema.org","@type":"Product","name":"Nan Optipro Stage 1 Baby Formula 800g","price": 23.50 }
{
"@context":"http://schema.org/",
"@type":"Product",
"offers":{
"@type":"Offer",
"availability":"http://schema.org/InStock",
"price":"3949.99",
"priceCurrency":"USD",
"url":"https://www.newegg.com/p/3D5-000D-001T1"
},
"description":"Cobratype King Cobra Hero Desktop Gaming PC",
"name":"Cobratype King Cobra Hero Desktop Gaming PC",
"sku":"3D5-000D-001T1",
"itemCondition":"NewCondition"
}
</script>
```
`json:$.price` or `jq:.price` would give `23.50`, or you can extract the whole structure
`json:$..price` or `jq:..price` would give `3949.99`, or you can extract the whole structure (use a JSONpath test website to validate with)
The application also supports notifying you that it can follow this information automatically
## Proxy Configuration

View File

@@ -10,6 +10,7 @@ import threading
import time
import timeago
from changedetectionio import queuedWatchMetaData
from copy import deepcopy
from distutils.util import strtobool
from feedgen.feed import FeedGenerator
@@ -35,7 +36,7 @@ from flask_wtf import CSRFProtect
from changedetectionio import html_tools
from changedetectionio.api import api_v1
__version__ = '0.39.22.1'
__version__ = '0.40.0.3'
datastore = None
@@ -404,7 +405,6 @@ def changedetection_app(config=None, datastore_o=None):
sorted_watches.append(watch)
existing_tags = datastore.get_all_tags()
form = forms.quickWatchForm(request.form)
output = render_template("watch-overview.html",
form=form,
@@ -416,7 +416,7 @@ def changedetection_app(config=None, datastore_o=None):
# Don't link to hosting when we're on the hosting environment
hosted_sticky=os.getenv("SALTED_PASS", False) == False,
guid=datastore.data['app_guid'],
queued_uuids=[uuid for p,uuid in update_q.queue])
queued_uuids=[q_uuid.item['uuid'] for q_uuid in update_q.queue])
if session.get('share-link'):
@@ -596,25 +596,16 @@ def changedetection_app(config=None, datastore_o=None):
using_default_check_time = False
break
# Use the default if its the same as system wide
# Use the default if it's the same as system-wide.
if form.fetch_backend.data == datastore.data['settings']['application']['fetch_backend']:
extra_update_obj['fetch_backend'] = None
# Ignore text
form_ignore_text = form.ignore_text.data
datastore.data['watching'][uuid]['ignore_text'] = form_ignore_text
# Reset the previous_md5 so we process a new snapshot including stripping ignore text.
if form_ignore_text:
if len(datastore.data['watching'][uuid].history):
extra_update_obj['previous_md5'] = get_current_checksum_include_ignore_text(uuid=uuid)
# Reset the previous_md5 so we process a new snapshot including stripping ignore text.
if form.include_filters.data != datastore.data['watching'][uuid].get('include_filters', []):
if len(datastore.data['watching'][uuid].history):
extra_update_obj['previous_md5'] = get_current_checksum_include_ignore_text(uuid=uuid)
# Be sure proxy value is None
if datastore.proxy_list is not None and form.data['proxy'] == '':
extra_update_obj['proxy'] = None
@@ -632,7 +623,7 @@ def changedetection_app(config=None, datastore_o=None):
datastore.needs_write_urgent = True
# Queue the watch for immediate recheck, with a higher priority
update_q.put((1, uuid))
update_q.put(queuedWatchMetaData.PrioritizedItem(priority=1, item={'uuid': uuid, 'skip_when_checksum_same': False}))
# Diff page [edit] link should go back to diff page
if request.args.get("next") and request.args.get("next") == 'diff':
@@ -773,7 +764,7 @@ def changedetection_app(config=None, datastore_o=None):
importer = import_url_list()
importer.run(data=request.values.get('urls'), flash=flash, datastore=datastore)
for uuid in importer.new_uuids:
update_q.put((1, uuid))
update_q.put(queuedWatchMetaData.PrioritizedItem(priority=1, item={'uuid': uuid, 'skip_when_checksum_same': True}))
if len(importer.remaining_data) == 0:
return redirect(url_for('index'))
@@ -786,7 +777,7 @@ def changedetection_app(config=None, datastore_o=None):
d_importer = import_distill_io_json()
d_importer.run(data=request.values.get('distill-io'), flash=flash, datastore=datastore)
for uuid in d_importer.new_uuids:
update_q.put((1, uuid))
update_q.put(queuedWatchMetaData.PrioritizedItem(priority=1, item={'uuid': uuid, 'skip_when_checksum_same': True}))
@@ -1151,7 +1142,7 @@ def changedetection_app(config=None, datastore_o=None):
if not add_paused and new_uuid:
# Straight into the queue.
update_q.put((1, new_uuid))
update_q.put(queuedWatchMetaData.PrioritizedItem(priority=1, item={'uuid': new_uuid}))
flash("Watch added.")
if add_paused:
@@ -1188,7 +1179,7 @@ def changedetection_app(config=None, datastore_o=None):
uuid = list(datastore.data['watching'].keys()).pop()
new_uuid = datastore.clone(uuid)
update_q.put((5, new_uuid))
update_q.put(queuedWatchMetaData.PrioritizedItem(priority=5, item={'uuid': new_uuid, 'skip_when_checksum_same': True}))
flash('Cloned.')
return redirect(url_for('index'))
@@ -1196,7 +1187,7 @@ def changedetection_app(config=None, datastore_o=None):
@app.route("/api/checknow", methods=['GET'])
@login_required
def form_watch_checknow():
# Forced recheck will skip the 'skip if content is the same' rule (, 'reprocess_existing_data': True})))
tag = request.args.get('tag')
uuid = request.args.get('uuid')
i = 0
@@ -1205,11 +1196,9 @@ def changedetection_app(config=None, datastore_o=None):
for t in running_update_threads:
running_uuids.append(t.current_uuid)
# @todo check thread is running and skip
if uuid:
if uuid not in running_uuids:
update_q.put((1, uuid))
update_q.put(queuedWatchMetaData.PrioritizedItem(priority=1, item={'uuid': uuid, 'skip_when_checksum_same': False}))
i = 1
elif tag != None:
@@ -1217,14 +1206,14 @@ def changedetection_app(config=None, datastore_o=None):
for watch_uuid, watch in datastore.data['watching'].items():
if (tag != None and tag in watch['tag']):
if watch_uuid not in running_uuids and not datastore.data['watching'][watch_uuid]['paused']:
update_q.put((1, watch_uuid))
update_q.put(queuedWatchMetaData.PrioritizedItem(priority=1, item={'uuid': watch_uuid, 'skip_when_checksum_same': False}))
i += 1
else:
# No tag, no uuid, add everything.
for watch_uuid, watch in datastore.data['watching'].items():
if watch_uuid not in running_uuids and not datastore.data['watching'][watch_uuid]['paused']:
update_q.put((1, watch_uuid))
update_q.put(queuedWatchMetaData.PrioritizedItem(priority=1, item={'uuid': watch_uuid, 'skip_when_checksum_same': False}))
i += 1
flash("{} watches are queued for rechecking.".format(i))
return redirect(url_for('index', tag=tag))
@@ -1271,6 +1260,14 @@ def changedetection_app(config=None, datastore_o=None):
datastore.data['watching'][uuid.strip()]['notification_muted'] = False
flash("{} watches un-muted".format(len(uuids)))
elif (op == 'recheck'):
for uuid in uuids:
uuid = uuid.strip()
if datastore.data['watching'].get(uuid):
# Recheck and require a full reprocessing
update_q.put(queuedWatchMetaData.PrioritizedItem(priority=1, item={'uuid': uuid, 'skip_when_checksum_same': False}))
flash("{} watches un-muted".format(len(uuids)))
elif (op == 'notification-default'):
from changedetectionio.notification import (
default_notification_format_for_watch
@@ -1344,7 +1341,7 @@ def changedetection_app(config=None, datastore_o=None):
app.register_blueprint(browser_steps.construct_blueprint(datastore), url_prefix='/browser-steps')
import changedetectionio.blueprint.price_data_follower as price_data_follower
app.register_blueprint(price_data_follower.construct_blueprint(datastore), url_prefix='/price_data_follower')
app.register_blueprint(price_data_follower.construct_blueprint(datastore, update_q), url_prefix='/price_data_follower')
# @todo handle ctrl break
@@ -1452,7 +1449,11 @@ def ticker_thread_check_time_launch_checks():
watch_uuid_list = []
while True:
try:
watch_uuid_list = datastore.data['watching'].keys()
# Get a list of watches sorted by last_checked, [1] because it gets passed a tuple
# This is so we examine the most over-due first
for k in sorted(datastore.data['watching'].items(), key=lambda item: item[1].get('last_checked',0)):
watch_uuid_list.append(k[0])
except RuntimeError as e:
# RuntimeError: dictionary changed size during iteration
time.sleep(0.1)
@@ -1492,7 +1493,7 @@ def ticker_thread_check_time_launch_checks():
seconds_since_last_recheck = now - watch['last_checked']
if seconds_since_last_recheck >= (threshold + watch.jitter_seconds) and seconds_since_last_recheck >= recheck_time_minimum_seconds:
if not uuid in running_uuids and uuid not in [q_uuid for p,q_uuid in update_q.queue]:
if not uuid in running_uuids and uuid not in [q_uuid.item['uuid'] for q_uuid in update_q.queue]:
# Proxies can be set to have a limit on seconds between which they can be called
watch_proxy = datastore.get_preferred_proxy_for_watch(uuid=uuid)
@@ -1523,8 +1524,9 @@ def ticker_thread_check_time_launch_checks():
priority,
watch.jitter_seconds,
now - watch['last_checked']))
# Into the queue with you
update_q.put((priority, uuid))
update_q.put(queuedWatchMetaData.PrioritizedItem(priority=priority, item={'uuid': uuid, 'skip_when_checksum_same': True}))
# Reset for next time
watch.jitter_seconds = 0

View File

@@ -1,3 +1,4 @@
from changedetectionio import queuedWatchMetaData
from flask_restful import abort, Resource
from flask import request, make_response
import validators
@@ -24,7 +25,7 @@ class Watch(Resource):
abort(404, message='No watch exists with the UUID of {}'.format(uuid))
if request.args.get('recheck'):
self.update_q.put((1, uuid))
self.update_q.put(queuedWatchMetaData.PrioritizedItem(priority=1, item={'uuid': uuid, 'skip_when_checksum_same': True}))
return "OK", 200
# Return without history, get that via another API call
@@ -100,7 +101,7 @@ class CreateWatch(Resource):
extras = {'title': json_data['title'].strip()} if json_data.get('title') else {}
new_uuid = self.datastore.add_watch(url=json_data['url'].strip(), tag=tag, extras=extras)
self.update_q.put((1, new_uuid))
self.update_q.put(queuedWatchMetaData.PrioritizedItem(priority=1, item={'uuid': new_uuid, 'skip_when_checksum_same': True}))
return {'uuid': new_uuid}, 201
# Return concise list of available watches and some very basic info
@@ -118,7 +119,7 @@ class CreateWatch(Resource):
if request.args.get('recheck_all'):
for uuid in self.datastore.data['watching'].keys():
self.update_q.put((1, uuid))
self.update_q.put(queuedWatchMetaData.PrioritizedItem(priority=1, item={'uuid': uuid, 'skip_when_checksum_same': True}))
return {'status': "OK"}, 200
return list, 200

View File

@@ -75,15 +75,13 @@ class steppable_browser_interface():
def action_goto_url(self, url, optional_value):
# self.page.set_viewport_size({"width": 1280, "height": 5000})
now = time.time()
response = self.page.goto(url, timeout=0, wait_until='domcontentloaded')
print("Time to goto URL", time.time() - now)
response = self.page.goto(url, timeout=0, wait_until='commit')
# Wait_until = commit
# - `'commit'` - consider operation to be finished when network response is received and the document started loading.
# Better to not use any smarts from Playwright and just wait an arbitrary number of seconds
# This seemed to solve nearly all 'TimeoutErrors'
extra_wait = int(os.getenv("WEBDRIVER_DELAY_BEFORE_CONTENT_READY", 5))
self.page.wait_for_timeout(extra_wait * 1000)
print("Time to goto URL ", time.time() - now)
def action_click_element_containing_text(self, selector=None, value=''):
if not len(value.strip()):

View File

@@ -3,22 +3,28 @@ from distutils.util import strtobool
from flask import Blueprint, flash, redirect, url_for
from flask_login import login_required
from changedetectionio.store import ChangeDetectionStore
from changedetectionio import queuedWatchMetaData
from queue import PriorityQueue
def construct_blueprint(datastore: ChangeDetectionStore):
PRICE_DATA_TRACK_ACCEPT = 'accepted'
PRICE_DATA_TRACK_REJECT = 'rejected'
def construct_blueprint(datastore: ChangeDetectionStore, update_q: PriorityQueue):
price_data_follower_blueprint = Blueprint('price_data_follower', __name__)
@login_required
@price_data_follower_blueprint.route("/<string:uuid>/accept", methods=['GET'])
def accept(uuid):
datastore.data['watching'][uuid]['track_ldjson_price_data'] = 'accepted'
datastore.data['watching'][uuid]['track_ldjson_price_data'] = PRICE_DATA_TRACK_ACCEPT
update_q.put(queuedWatchMetaData.PrioritizedItem(priority=1, item={'uuid': uuid, 'skip_when_checksum_same': False}))
return redirect(url_for("form_watch_checknow", uuid=uuid))
@login_required
@price_data_follower_blueprint.route("/<string:uuid>/reject", methods=['GET'])
def reject(uuid):
datastore.data['watching'][uuid]['track_ldjson_price_data'] = 'rejected'
datastore.data['watching'][uuid]['track_ldjson_price_data'] = PRICE_DATA_TRACK_REJECT
return redirect(url_for("index"))

View File

@@ -23,6 +23,9 @@ class Non200ErrorCodeReceived(Exception):
self.page_text = html_tools.html_to_text(page_html)
return
class checksumFromPreviousCheckWasTheSame(Exception):
def __init__(self):
return
class JSActionExceptions(Exception):
def __init__(self, status_code, url, screenshot, message=''):
@@ -39,7 +42,7 @@ class BrowserStepsStepTimout(Exception):
class PageUnloadable(Exception):
def __init__(self, status_code, url, screenshot=False, message=False):
def __init__(self, status_code, url, message, screenshot=False):
# Set this so we can use it in other parts of the app
self.status_code = status_code
self.url = url
@@ -286,6 +289,8 @@ class base_html_playwright(Fetcher):
proxy=self.proxy,
# This is needed to enable JavaScript execution on GitHub and others
bypass_csp=True,
# Can't think why we need the service workers for our use case?
service_workers='block',
# Should never be needed
accept_downloads=False
)
@@ -294,24 +299,34 @@ class base_html_playwright(Fetcher):
if len(request_headers):
context.set_extra_http_headers(request_headers)
try:
self.page.set_default_navigation_timeout(90000)
self.page.set_default_timeout(90000)
# Listen for all console events and handle errors
self.page.on("console", lambda msg: print(f"Playwright console: Watch URL: {url} {msg.type}: {msg.text} {msg.args}"))
# Bug - never set viewport size BEFORE page.goto
# Waits for the next navigation. Using Python context manager
# prevents a race condition between clicking and waiting for a navigation.
with self.page.expect_navigation():
response = self.page.goto(url, wait_until='load')
# Goto page
try:
# Wait_until = commit
# - `'commit'` - consider operation to be finished when network response is received and the document started loading.
# Better to not use any smarts from Playwright and just wait an arbitrary number of seconds
# This seemed to solve nearly all 'TimeoutErrors'
response = self.page.goto(url, wait_until='commit')
except playwright._impl._api_types.Error as e:
# Retry once - https://github.com/browserless/chrome/issues/2485
# Sometimes errors related to invalid cert's and other can be random
print ("Content Fetcher > retrying request got error - ", str(e))
time.sleep(1)
response = self.page.goto(url, wait_until='commit')
except Exception as e:
print ("Content Fetcher > Other exception when page.goto", str(e))
context.close()
browser.close()
raise PageUnloadable(url=url, status_code=None, message=str(e))
# Execute any browser steps
try:
extra_wait = int(os.getenv("WEBDRIVER_DELAY_BEFORE_CONTENT_READY", 5)) + self.render_extract_delay
self.page.wait_for_timeout(extra_wait * 1000)
@@ -324,17 +339,15 @@ class base_html_playwright(Fetcher):
# This can be ok, we will try to grab what we could retrieve
pass
except Exception as e:
print ("other exception when page.goto")
print (str(e))
print ("Content Fetcher > Other exception when executing custom JS code", str(e))
context.close()
browser.close()
raise PageUnloadable(url=url, status_code=None)
raise PageUnloadable(url=url, status_code=None, message=str(e))
if response is None:
context.close()
browser.close()
print ("response object was none")
print ("Content Fetcher > Response object was none")
raise EmptyReply(url=url, status_code=None)
# Bug 2(?) Set the viewport size AFTER loading the page
@@ -353,7 +366,7 @@ class base_html_playwright(Fetcher):
if len(self.page.content().strip()) == 0:
context.close()
browser.close()
print ("Content was empty")
print ("Content Fetcher > Content was empty")
raise EmptyReply(url=url, status_code=None)
# Bug 2(?) Set the viewport size AFTER loading the page
@@ -498,7 +511,7 @@ class base_html_webdriver(Fetcher):
try:
self.driver.quit()
except Exception as e:
print("Exception in chrome shutdown/quit" + str(e))
print("Content Fetcher > Exception in chrome shutdown/quit" + str(e))
# "html_requests" is listed as the default fetcher in store.py!

View File

@@ -1,10 +1,13 @@
import hashlib
import json
import logging
import os
import re
import urllib3
from changedetectionio import content_fetcher, html_tools
from changedetectionio.blueprint.price_data_follower import PRICE_DATA_TRACK_ACCEPT, PRICE_DATA_TRACK_REJECT
from copy import deepcopy
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
@@ -37,8 +40,7 @@ class perform_site_check():
return regex
def run(self, uuid):
from copy import deepcopy
def run(self, uuid, skip_when_checksum_same=True):
changed_detected = False
screenshot = False # as bytes
stripped_text_from_html = ""
@@ -121,6 +123,14 @@ class perform_site_check():
self.screenshot = fetcher.screenshot
self.xpath_data = fetcher.xpath_data
# Watches added automatically in the queue manager will skip if its the same checksum as the previous run
# Saves a lot of CPU
update_obj['previous_md5_before_filters'] = hashlib.md5(fetcher.content.encode('utf-8')).hexdigest()
if skip_when_checksum_same:
if update_obj['previous_md5_before_filters'] == watch.get('previous_md5_before_filters'):
raise content_fetcher.checksumFromPreviousCheckWasTheSame()
# Fetching complete, now filters
# @todo move to class / maybe inside of fetcher abstract base?
@@ -148,8 +158,8 @@ class perform_site_check():
)
# Inject a virtual LD+JSON price tracker rule
if watch.get('track_ldjson_price_data'):
include_filters_rule.append('json:$..price')
if watch.get('track_ldjson_price_data', '') == PRICE_DATA_TRACK_ACCEPT:
include_filters_rule.append(html_tools.LD_JSON_PRODUCT_OFFER_SELECTOR)
has_filter_rule = include_filters_rule and len("".join(include_filters_rule).strip())
has_subtractive_selectors = subtractive_selectors and len(subtractive_selectors[0].strip())
@@ -158,6 +168,14 @@ class perform_site_check():
include_filters_rule.append("json:$")
has_filter_rule = True
if is_json:
# Sort the JSON so we dont get false alerts when the content is just re-ordered
try:
fetcher.content = json.dumps(json.loads(fetcher.content), sort_keys=True)
except Exception as e:
# Might have just been a snippet, or otherwise bad JSON, continue
pass
if has_filter_rule:
json_filter_prefixes = ['json:', 'jq:']
for filter in include_filters_rule:
@@ -165,6 +183,8 @@ class perform_site_check():
stripped_text_from_html += html_tools.extract_json_as_string(content=fetcher.content, json_filter=filter)
is_html = False
if is_html or is_source:
# CSS Filter, extract the HTML that matches and feed that into the existing inscriptis::get_text

View File

@@ -10,6 +10,10 @@ import re
# HTML added to be sure each result matching a filter (.example) gets converted to a new line by Inscriptis
TEXT_FILTER_LIST_LINE_SUFFIX = "<br/>"
# 'price' , 'lowPrice', 'highPrice' are usually under here
# all of those may or may not appear on different websites
LD_JSON_PRODUCT_OFFER_SELECTOR = "json:$..offers"
class JSONNotFound(ValueError):
def __init__(self, msg):
ValueError.__init__(self, msg)
@@ -166,7 +170,9 @@ def extract_json_as_string(content, json_filter, ensure_is_ldjson_info_type=None
if ensure_is_ldjson_info_type:
# Could sometimes be list, string or something else random
if isinstance(json_data, dict):
if json_data.get('@type', False) and json_data.get('@type','').lower() == ensure_is_ldjson_info_type.lower():
# If it has LD JSON 'key' @type, and @type is 'product', and something was found for the search
# (Some sites have multiple of the same ld+json @type='product', but some have the review part, some have the 'price' part)
if json_data.get('@type', False) and json_data.get('@type','').lower() == ensure_is_ldjson_info_type.lower() and stripped_text_from_html:
break
elif stripped_text_from_html:
break
@@ -259,7 +265,7 @@ def html_to_text(html_content: str, render_anchor_tag_content=False) -> str:
# Does LD+JSON exist with a @type=='product' and a .price set anywhere?
def has_ldjson_product_info(content):
try:
pricing_data = extract_json_as_string(content=content, json_filter='json:$..price', ensure_is_ldjson_info_type="product")
pricing_data = extract_json_as_string(content=content, json_filter=LD_JSON_PRODUCT_OFFER_SELECTOR, ensure_is_ldjson_info_type="product")
except JSONNotFound as e:
# Totally fine
return False

View File

@@ -14,51 +14,52 @@ from changedetectionio.notification import (
class model(dict):
__newest_history_key = None
__history_n=0
__history_n = 0
__base_config = {
#'history': {}, # Dict of timestamp and output stripped filename (removed)
#'newest_history_key': 0, (removed, taken from history.txt index)
'body': None,
'check_unique_lines': False, # On change-detected, compare against all history if its something new
'check_count': 0,
'consecutive_filter_failures': 0, # Every time the CSS/xPath filter cannot be located, reset when all is fine.
'extract_text': [], # Extract text by regex after filters
'extract_title_as_title': False,
'fetch_backend': None,
'filter_failure_notification_send': strtobool(os.getenv('FILTER_FAILURE_NOTIFICATION_SEND_DEFAULT', 'True')),
'has_ldjson_price_data': None,
'track_ldjson_price_data': None,
'headers': {}, # Extra headers to send
'ignore_text': [], # List of text to ignore when calculating the comparison checksum
'include_filters': [],
'last_checked': 0,
'last_error': False,
'last_viewed': 0, # history key value of the last viewed via the [diff] link
'method': 'GET',
# Custom notification content
'notification_body': None,
'notification_format': default_notification_format_for_watch,
'notification_muted': False,
'notification_title': None,
'notification_screenshot': False, # Include the latest screenshot if available and supported by the apprise URL
'notification_urls': [], # List of URLs to add to the notification Queue (Usually AppRise)
'paused': False,
'previous_md5': False,
'proxy': None, # Preferred proxy connection
'subtractive_selectors': [],
'tag': None,
'text_should_not_be_present': [], # Text that should not present
# Re #110, so then if this is set to None, we know to use the default value instead
# Requires setting to None on submit if it's the same as the default
# Should be all None by default, so we use the system default in this case.
'time_between_check': {'weeks': None, 'days': None, 'hours': None, 'minutes': None, 'seconds': None},
'title': None,
'trigger_text': [], # List of text or regex to wait for until a change is detected
'url': None,
'uuid': str(uuid.uuid4()),
'webdriver_delay': None,
'webdriver_js_execute_code': None, # Run before change-detection
}
# 'history': {}, # Dict of timestamp and output stripped filename (removed)
# 'newest_history_key': 0, (removed, taken from history.txt index)
'body': None,
'check_unique_lines': False, # On change-detected, compare against all history if its something new
'check_count': 0,
'consecutive_filter_failures': 0, # Every time the CSS/xPath filter cannot be located, reset when all is fine.
'extract_text': [], # Extract text by regex after filters
'extract_title_as_title': False,
'fetch_backend': None,
'filter_failure_notification_send': strtobool(os.getenv('FILTER_FAILURE_NOTIFICATION_SEND_DEFAULT', 'True')),
'has_ldjson_price_data': None,
'track_ldjson_price_data': None,
'headers': {}, # Extra headers to send
'ignore_text': [], # List of text to ignore when calculating the comparison checksum
'include_filters': [],
'last_checked': 0,
'last_error': False,
'last_viewed': 0, # history key value of the last viewed via the [diff] link
'method': 'GET',
# Custom notification content
'notification_body': None,
'notification_format': default_notification_format_for_watch,
'notification_muted': False,
'notification_title': None,
'notification_screenshot': False, # Include the latest screenshot if available and supported by the apprise URL
'notification_urls': [], # List of URLs to add to the notification Queue (Usually AppRise)
'paused': False,
'previous_md5': False,
'previous_md5_before_filters': False, # Used for skipping changedetection entirely
'proxy': None, # Preferred proxy connection
'subtractive_selectors': [],
'tag': None,
'text_should_not_be_present': [], # Text that should not present
# Re #110, so then if this is set to None, we know to use the default value instead
# Requires setting to None on submit if it's the same as the default
# Should be all None by default, so we use the system default in this case.
'time_between_check': {'weeks': None, 'days': None, 'hours': None, 'minutes': None, 'seconds': None},
'title': None,
'trigger_text': [], # List of text or regex to wait for until a change is detected
'url': None,
'uuid': str(uuid.uuid4()),
'webdriver_delay': None,
'webdriver_js_execute_code': None, # Run before change-detection
}
jitter_seconds = 0
def __init__(self, *arg, **kw):

View File

@@ -0,0 +1,10 @@
from dataclasses import dataclass, field
from typing import Any
# So that we can queue some metadata in `item`
# https://docs.python.org/3/library/queue.html#queue.PriorityQueue
#
@dataclass(order=True)
class PrioritizedItem:
priority: int
item: Any=field(compare=False)

View File

@@ -1,3 +1,6 @@
// Copyright (C) 2021 Leigh Morresi (dgtlmoon@gmail.com)
// All rights reserved.
// @file Scrape the page looking for elements of concern (%ELEMENTS%)
// http://matatk.agrip.org.uk/tests/position-and-width/
// https://stackoverflow.com/questions/26813480/when-is-element-getboundingclientrect-guaranteed-to-be-updated-accurate
@@ -81,8 +84,16 @@ var bbox;
for (var i = 0; i < elements.length; i++) {
bbox = elements[i].getBoundingClientRect();
// Forget really small ones
if (bbox['width'] < 10 && bbox['height'] < 10) {
// Exclude items that are not interactable or visible
if(elements[i].style.opacity === "0") {
continue
}
if(elements[i].style.display === "none" || elements[i].style.pointerEvents === "none" ) {
continue
}
// Skip really small ones, and where width or height ==0
if (bbox['width'] * bbox['height'] < 100) {
continue;
}
@@ -138,7 +149,6 @@ for (var i = 0; i < elements.length; i++) {
}
// Inject the current one set in the include_filters, which may be a CSS rule
// used for displaying the current one in VisualSelector, where its not one we generated.
if (include_filters.length) {
@@ -166,10 +176,23 @@ if (include_filters.length) {
}
if (q) {
bbox = q.getBoundingClientRect();
console.log("xpath_element_scraper: Got filter element, scroll from top was "+scroll_y)
} else {
console.log("xpath_element_scraper: filter element "+f+" was not found");
// #1231 - IN the case XPath attribute filter is applied, we will have to traverse up and find the element.
if (q.hasOwnProperty('getBoundingClientRect')) {
bbox = q.getBoundingClientRect();
console.log("xpath_element_scraper: Got filter element, scroll from top was " + scroll_y)
} else {
try {
// Try and see we can find its ownerElement
bbox = q.ownerElement.getBoundingClientRect();
console.log("xpath_element_scraper: Got filter by ownerElement element, scroll from top was " + scroll_y)
} catch (e) {
console.log("xpath_element_scraper: error looking up ownerElement")
}
}
}
if(!q) {
console.log("xpath_element_scraper: filter element " + f + " was not found");
}
if (bbox && bbox['width'] > 0 && bbox['height'] > 0) {
@@ -184,5 +207,9 @@ if (include_filters.length) {
}
}
// Sort the elements so we find the smallest one first, in other words, we find the smallest one matching in that area
// so that we dont select the wrapping element by mistake and be unable to select what we want
size_pos.sort((a, b) => (a.width*a.height > b.width*b.height) ? 1 : -1)
// Window.width required for proper scaling in the frontend
return {'size_pos': size_pos, 'browser_width': window.innerWidth};

View File

@@ -0,0 +1,2 @@
<?xml version="1.0" encoding="UTF-8"?>
<svg width="83.39" height="89.648" enable-background="new 0 0 122.406 122.881" version="1.1" viewBox="0 0 83.39 89.648" xml:space="preserve" xmlns="http://www.w3.org/2000/svg"><g transform="translate(5e-4 -33.234)"><path d="m44.239 42.946-39.111 39.896 34.908 34.91 39.09-39.876-1.149-34.931zm-0.91791 42.273c0.979-0.979 1.507-1.99 1.577-3.027 0.077-1.043-0.248-2.424-0.967-4.135-0.725-1.717-1.348-3.346-1.87-4.885s-0.814-3.014-0.897-4.432c-0.07-1.42 0.134-2.768 0.624-4.045 0.477-1.279 1.348-2.545 2.607-3.804 2.099-2.099 4.535-3.123 7.314-3.065 2.773 0.063 5.457 1.158 8.04 3.294l2.881 3.034c1.946 2.607 2.799 5.33 2.557 8.166-0.235 2.83-1.532 5.426-3.893 7.785l-6.296-6.297c1.291-1.291 2.035-2.531 2.238-3.727 0.191-1.197-0.165-2.252-1.081-3.168-0.821-0.82-1.717-1.195-2.69-1.139-0.967 0.064-1.908 0.547-2.817 1.457-0.922 0.922-1.393 1.914-1.412 2.977s0.306 2.416 0.973 4.064c0.661 1.652 1.24 3.25 1.736 4.801 0.496 1.553 0.782 3.035 0.858 4.445 0.076 1.426-0.127 2.787-0.591 4.104-0.477 1.316-1.336 2.596-2.588 3.848-2.125 2.125-4.522 3.186-7.212 3.18s-5.311-1.063-7.855-3.16l-3.747 3.746-2.964-2.965 3.766-3.764c-2.423-2.996-3.568-5.998-3.447-9.02 0.127-3.014 1.476-5.813 4.045-8.383l6.278 6.277c-1.412 1.412-2.175 2.799-2.277 4.16-0.108 1.367 0.414 2.627 1.571 3.783 0.839 0.84 1.755 1.26 2.741 1.242 0.985-0.017 1.92-0.47 2.798-1.347zm21.127-46.435h17.457c-0.0269 2.2368 0.69936 16.025 0.69936 16.025l0.785 23.858c0.019 0.609-0.221 1.164-0.619 1.564l5e-3 4e-3 -41.236 42.022c-0.82213 0.8378-2.175 0.83-3.004 0l-37.913-37.91c-0.83-0.83-0.83-2.176 0-3.006l41.236-42.021c0.39287-0.42671 1.502-0.53568 1.502-0.53568zm18.011 11.59c-59.392-29.687-29.696-14.843 0 0z"/></g></svg>

After

Width:  |  Height:  |  Size: 1.7 KiB

View File

@@ -1,4 +1,5 @@
// Horrible proof of concept code :)
// Copyright (C) 2021 Leigh Morresi (dgtlmoon@gmail.com)
// All rights reserved.
// yes - this is really a hack, if you are a front-ender and want to help, please get in touch!
$(document).ready(function () {
@@ -177,9 +178,10 @@ $(document).ready(function () {
// Basically, find the most 'deepest'
var found = 0;
ctx.fillStyle = 'rgba(205,0,0,0.35)';
for (var i = selector_data['size_pos'].length; i !== 0; i--) {
// Will be sorted by smallest width*height first
for (var i = 0; i <= selector_data['size_pos'].length; i++) {
// draw all of them? let them choose somehow?
var sel = selector_data['size_pos'][i - 1];
var sel = selector_data['size_pos'][i];
// If we are in a bounding-box
if (e.offsetY > sel.top * y_scale && e.offsetY < sel.top * y_scale + sel.height * y_scale
&&
@@ -195,7 +197,7 @@ $(document).ready(function () {
// no need to keep digging
// @todo or, O to go out/up, I to go in
// or double click to go up/out the selector?
current_selected_i = i - 1;
current_selected_i = i;
found += 1;
break;
}

View File

@@ -1025,4 +1025,14 @@ ul {
padding: 3px;
background-color: var(--color-background-button-green);
}
font-weight: bold;
font-style: italic;
}
.price-follow-tag-icon {
display: inline-block;
height: 0.8rem;
vertical-align: middle;
}

View File

@@ -954,7 +954,15 @@ ul {
border-radius: 3px;
white-space: nowrap; }
.ldjson-price-track-offer a.pure-button {
border-radius: 3px;
padding: 3px;
background-color: var(--color-background-button-green); }
.ldjson-price-track-offer {
font-weight: bold;
font-style: italic; }
.ldjson-price-track-offer a.pure-button {
border-radius: 3px;
padding: 3px;
background-color: var(--color-background-button-green); }
.price-follow-tag-icon {
display: inline-block;
height: 0.8rem;
vertical-align: middle; }

View File

@@ -32,6 +32,7 @@
<button class="pure-button button-secondary button-xsmall" style="font-size: 70%" name="op" value="unpause">UnPause</button>
<button class="pure-button button-secondary button-xsmall" style="font-size: 70%" name="op" value="mute">Mute</button>
<button class="pure-button button-secondary button-xsmall" style="font-size: 70%" name="op" value="unmute">UnMute</button>
<button class="pure-button button-secondary button-xsmall" style="font-size: 70%" name="op" value="recheck">Recheck</button>
<button class="pure-button button-secondary button-xsmall" style="font-size: 70%" name="op" value="notification-default">Use default notification</button>
<button class="pure-button button-secondary button-xsmall" style="background: #dd4242; font-size: 70%" name="op" value="delete">Delete</button>
</div>
@@ -88,9 +89,9 @@
</td>
<td class="title-col inline">{{watch.title if watch.title is not none and watch.title|length > 0 else watch.url}}
<a class="external" target="_blank" rel="noopener" href="{{ watch.link.replace('source:','') }}"></a>
<a class="link-spread" href="{{url_for('form_share_put_watch', uuid=watch.uuid)}}"><img style="height: 1em;display:inline-block;" src="{{url_for('static_content', group='images', filename='spread.svg')}}" class="icon icon-spread" /></a>
<a class="link-spread" href="{{url_for('form_share_put_watch', uuid=watch.uuid)}}"><img style="height: 1em;display:inline-block;" src="{{url_for('static_content', group='images', filename='spread.svg')}}" class="icon icon-spread" title="Create a link to share watch config with others" /></a>
{%if watch.fetch_backend == "html_webdriver" %}<img style="height: 1em; display:inline-block;" src="{{url_for('static_content', group='images', filename='Google-Chrome-icon.png')}}" />{% endif %}
{%if watch.fetch_backend == "html_webdriver" %}<img style="height: 1em; display:inline-block;" src="{{url_for('static_content', group='images', filename='Google-Chrome-icon.png')}}" title="Using a chrome browser" />{% endif %}
{% if watch.last_error is defined and watch.last_error != False %}
<div class="fetch-error">{{ watch.last_error }}</div>
@@ -101,8 +102,8 @@
{% if watch['has_ldjson_price_data'] and not watch['track_ldjson_price_data'] %}
<div class="ldjson-price-track-offer">Embedded price data detected, follow only price data? <a href="{{url_for('price_data_follower.accept', uuid=watch.uuid)}}" class="pure-button button-xsmall">Yes</a> <a href="{{url_for('price_data_follower.reject', uuid=watch.uuid)}}" class="">No</a></div>
{% endif %}
{% if watch['track_ldjson_price_data'] %}
<span class="tracking-ldjson-price-data">Price</span>
{% if watch['track_ldjson_price_data'] == 'accepted' %}
<span class="tracking-ldjson-price-data" title="Automatically following embedded price information"><img src="{{url_for('static_content', group='images', filename='price-tag-icon.svg')}}" class="price-follow-tag-icon"/> Price</span>
{% endif %}
{% if not active_tag %}
<span class="watch-tag-list">{{ watch.tag}}</span>

View File

@@ -4,9 +4,6 @@ import time
from flask import url_for
from .util import live_server_setup, extract_UUID_from_client, extract_api_key_from_UI
from ..html_tools import *
def set_response_with_ldjson():
test_return_data = """<html>
<body>
@@ -61,6 +58,22 @@ def set_response_with_ldjson():
f.write(test_return_data)
return None
def set_response_without_ldjson():
test_return_data = """<html>
<body>
Some initial text</br>
<p>Which is across multiple lines</p>
</br>
So let's see what happens. </br>
<div class="sametext">Some text thats the same</div>
<div class="changetext">Some text that will change</div>
</body>
</html>
"""
with open("test-datastore/endpoint-content.txt", "w") as f:
f.write(test_return_data)
return None
# actually only really used by the distll.io importer, but could be handy too
def test_check_ldjson_price_autodetect(client, live_server):
@@ -106,7 +119,28 @@ def test_check_ldjson_price_autodetect(client, live_server):
headers={'x-api-key': api_key},
)
# Should just see the price in the API reply
assert res.data == b'8097000'
# Should see this (dont know where the whitespace came from)
assert b'"highPrice": 8099900' in res.data
# And not this cause its not the ld-json
assert b"So let's see what happens" not in res.data
client.get(url_for("form_delete", uuid="all"), follow_redirects=True)
##########################################################################################
# And we shouldnt see the offer
set_response_without_ldjson()
# Add our URL to the import page
test_url = url_for('test_endpoint', _external=True)
res = client.post(
url_for("import_page"),
data={"urls": test_url},
follow_redirects=True
)
assert b"1 Imported" in res.data
time.sleep(3)
res = client.get(url_for("index"))
assert b'ldjson-price-track-offer' not in res.data
##########################################################################################
client.get(url_for("form_delete", uuid="all"), follow_redirects=True)

View File

@@ -394,6 +394,48 @@ def check_json_ext_filter(json_filter, client, live_server):
res = client.get(url_for("form_delete", uuid="all"), follow_redirects=True)
assert b'Deleted' in res.data
def test_ignore_json_order(client, live_server):
# A change in order shouldn't trigger a notification
with open("test-datastore/endpoint-content.txt", "w") as f:
f.write('{"hello" : 123, "world": 123}')
# Add our URL to the import page
test_url = url_for('test_endpoint', content_type="application/json", _external=True)
res = client.post(
url_for("import_page"),
data={"urls": test_url},
follow_redirects=True
)
assert b"1 Imported" in res.data
time.sleep(2)
with open("test-datastore/endpoint-content.txt", "w") as f:
f.write('{"world" : 123, "hello": 123}')
# Trigger a check
client.get(url_for("form_watch_checknow"), follow_redirects=True)
time.sleep(2)
res = client.get(url_for("index"))
assert b'unviewed' not in res.data
# Just to be sure it still works
with open("test-datastore/endpoint-content.txt", "w") as f:
f.write('{"world" : 123, "hello": 124}')
# Trigger a check
client.get(url_for("form_watch_checknow"), follow_redirects=True)
time.sleep(2)
res = client.get(url_for("index"))
assert b'unviewed' in res.data
res = client.get(url_for("form_delete", uuid="all"), follow_redirects=True)
assert b'Deleted' in res.data
def test_check_jsonpath_ext_filter(client, live_server):
check_json_ext_filter('json:$[?(@.status==Sold)]', client, live_server)

View File

@@ -4,6 +4,7 @@ import queue
import time
from changedetectionio import content_fetcher
from changedetectionio import queuedWatchMetaData
from changedetectionio.fetch_site_status import FilterNotFoundInResponse
# A single update worker
@@ -157,11 +158,12 @@ class update_worker(threading.Thread):
while not self.app.config.exit.is_set():
try:
priority, uuid = self.q.get(block=False)
queued_item_data = self.q.get(block=False)
except queue.Empty:
pass
else:
uuid = queued_item_data.item.get('uuid')
self.current_uuid = uuid
if uuid in list(self.datastore.data['watching'].keys()):
@@ -171,11 +173,11 @@ class update_worker(threading.Thread):
update_obj= {}
xpath_data = False
process_changedetection_results = True
print("> Processing UUID {} Priority {} URL {}".format(uuid, priority, self.datastore.data['watching'][uuid]['url']))
print("> Processing UUID {} Priority {} URL {}".format(uuid, queued_item_data.priority, self.datastore.data['watching'][uuid]['url']))
now = time.time()
try:
changed_detected, update_obj, contents = update_handler.run(uuid)
changed_detected, update_obj, contents = update_handler.run(uuid, skip_when_checksum_same=queued_item_data.item.get('skip_when_checksum_same'))
# Re #342
# In Python 3, all strings are sequences of Unicode characters. There is a bytes type that holds raw bytes.
# We then convert/.decode('utf-8') for the notification etc
@@ -241,6 +243,10 @@ class update_worker(threading.Thread):
process_changedetection_results = True
except content_fetcher.checksumFromPreviousCheckWasTheSame as e:
# Yes fine, so nothing todo
pass
except content_fetcher.BrowserStepsStepTimout as e:
if not self.datastore.data['watching'].get(uuid):

View File

@@ -29,8 +29,9 @@ apprise~=1.2.0
# apprise mqtt https://github.com/dgtlmoon/changedetection.io/issues/315
paho-mqtt
# Pinned version of cryptography otherwise
# ERROR: Could not build wheels for cryptography which use PEP 517 and cannot be installed directly
# This mainly affects some ARM builds, which unlike the other builds ignores "ARG CRYPTOGRAPHY_DONT_BUILD_RUST=1"
# so without this pinning, the newer versions on ARM will forcefully try to build rust, which results in "rust compiler not found"
# (introduced once apprise became a dep)
cryptography~=3.4
# Used for CSS filtering