Compare commits

...

11 Commits

17 changed files with 283 additions and 116 deletions

View File

@@ -10,6 +10,7 @@ import threading
 import time
 import timeago
+from changedetectionio import queuedWatchMetaData
 from copy import deepcopy
 from distutils.util import strtobool
 from feedgen.feed import FeedGenerator
@@ -35,7 +36,7 @@ from flask_wtf import CSRFProtect
 from changedetectionio import html_tools
 from changedetectionio.api import api_v1

-__version__ = '0.40.0.2'
+__version__ = '0.40.0.3'

 datastore = None
@@ -404,7 +405,6 @@ def changedetection_app(config=None, datastore_o=None):
             sorted_watches.append(watch)

         existing_tags = datastore.get_all_tags()
-
         form = forms.quickWatchForm(request.form)
         output = render_template("watch-overview.html",
                                  form=form,
@@ -416,7 +416,7 @@ def changedetection_app(config=None, datastore_o=None):
                                  # Don't link to hosting when we're on the hosting environment
                                  hosted_sticky=os.getenv("SALTED_PASS", False) == False,
                                  guid=datastore.data['app_guid'],
-                                 queued_uuids=[uuid for p,uuid in update_q.queue])
+                                 queued_uuids=[q_uuid.item['uuid'] for q_uuid in update_q.queue])

         if session.get('share-link'):
@@ -596,25 +596,16 @@ def changedetection_app(config=None, datastore_o=None):
                     using_default_check_time = False
                     break

            # Use the default if it's the same as system-wide.
            if form.fetch_backend.data == datastore.data['settings']['application']['fetch_backend']:
                extra_update_obj['fetch_backend'] = None

            # Ignore text
            form_ignore_text = form.ignore_text.data
            datastore.data['watching'][uuid]['ignore_text'] = form_ignore_text
-           # Reset the previous_md5 so we process a new snapshot including stripping ignore text.
-           if form_ignore_text:
-               if len(datastore.data['watching'][uuid].history):
-                   extra_update_obj['previous_md5'] = get_current_checksum_include_ignore_text(uuid=uuid)
-           # Reset the previous_md5 so we process a new snapshot including stripping ignore text.
-           if form.include_filters.data != datastore.data['watching'][uuid].get('include_filters', []):
-               if len(datastore.data['watching'][uuid].history):
-                   extra_update_obj['previous_md5'] = get_current_checksum_include_ignore_text(uuid=uuid)

            # Be sure proxy value is None
            if datastore.proxy_list is not None and form.data['proxy'] == '':
                extra_update_obj['proxy'] = None
@@ -632,7 +623,7 @@ def changedetection_app(config=None, datastore_o=None):
            datastore.needs_write_urgent = True

            # Queue the watch for immediate recheck, with a higher priority
-           update_q.put((1, uuid))
+           update_q.put(queuedWatchMetaData.PrioritizedItem(priority=1, item={'uuid': uuid, 'skip_when_checksum_same': False}))

            # Diff page [edit] link should go back to diff page
            if request.args.get("next") and request.args.get("next") == 'diff':
@@ -764,8 +755,11 @@ def changedetection_app(config=None, datastore_o=None):
    @login_required
    def import_page():
        remaining_urls = []
+
+       from changedetectionio import forms
+       form = forms.importForm(request.form)
        if request.method == 'POST':
-           from .importer import import_url_list, import_distill_io_json
+           from .importer import import_url_list, import_distill_io_json, import_changedetection_io_zip

            # URL List import
            if request.values.get('urls') and len(request.values.get('urls').strip()):
@@ -773,7 +767,7 @@ def changedetection_app(config=None, datastore_o=None):
                importer = import_url_list()
                importer.run(data=request.values.get('urls'), flash=flash, datastore=datastore)
                for uuid in importer.new_uuids:
-                   update_q.put((1, uuid))
+                   update_q.put(queuedWatchMetaData.PrioritizedItem(priority=1, item={'uuid': uuid, 'skip_when_checksum_same': True}))

                if len(importer.remaining_data) == 0:
                    return redirect(url_for('index'))
@@ -786,12 +780,22 @@ def changedetection_app(config=None, datastore_o=None):
                d_importer = import_distill_io_json()
                d_importer.run(data=request.values.get('distill-io'), flash=flash, datastore=datastore)
                for uuid in d_importer.new_uuids:
-                   update_q.put((1, uuid))
+                   update_q.put(queuedWatchMetaData.PrioritizedItem(priority=1, item={'uuid': uuid, 'skip_when_checksum_same': True}))
+
+           if request.files.get("backup_zip_file"):
+               if not form.validate():
+                   flash("An error occurred, please see below.", "error")
+               else:
+                   d_importer = import_changedetection_io_zip()
+                   d_importer.run(data=None, flash=flash, datastore=datastore)
+                   for uuid in d_importer.new_uuids:
+                       # Queue without priority, we will examine their own rule to find out if it should be checked
+                       update_q.put(queuedWatchMetaData.PrioritizedItem(item={'uuid': uuid, 'skip_when_checksum_same': True}))

        # Could be some remaining, or we could be on GET
        output = render_template("import.html",
+                                form=form,
                                 import_url_list_remaining="\n".join(remaining_urls),
                                 original_distill_json=''
                                 )
@@ -1151,7 +1155,7 @@ def changedetection_app(config=None, datastore_o=None):
        if not add_paused and new_uuid:
            # Straight into the queue.
-           update_q.put((1, new_uuid))
+           update_q.put(queuedWatchMetaData.PrioritizedItem(priority=1, item={'uuid': new_uuid}))
            flash("Watch added.")

        if add_paused:
@@ -1188,7 +1192,7 @@ def changedetection_app(config=None, datastore_o=None):
        uuid = list(datastore.data['watching'].keys()).pop()
        new_uuid = datastore.clone(uuid)
-       update_q.put((5, new_uuid))
+       update_q.put(queuedWatchMetaData.PrioritizedItem(priority=5, item={'uuid': new_uuid, 'skip_when_checksum_same': True}))
        flash('Cloned.')
        return redirect(url_for('index'))
@@ -1196,7 +1200,7 @@ def changedetection_app(config=None, datastore_o=None):
    @app.route("/api/checknow", methods=['GET'])
    @login_required
    def form_watch_checknow():
+       # Forced recheck will skip the 'skip if content is the same' rule
        tag = request.args.get('tag')
        uuid = request.args.get('uuid')
        i = 0
@@ -1205,11 +1209,9 @@ def changedetection_app(config=None, datastore_o=None):
        for t in running_update_threads:
            running_uuids.append(t.current_uuid)

-       # @todo check thread is running and skip
        if uuid:
            if uuid not in running_uuids:
-               update_q.put((1, uuid))
+               update_q.put(queuedWatchMetaData.PrioritizedItem(priority=1, item={'uuid': uuid, 'skip_when_checksum_same': False}))
                i = 1

        elif tag != None:
@@ -1217,14 +1219,14 @@ def changedetection_app(config=None, datastore_o=None):
            for watch_uuid, watch in datastore.data['watching'].items():
                if (tag != None and tag in watch['tag']):
                    if watch_uuid not in running_uuids and not datastore.data['watching'][watch_uuid]['paused']:
-                       update_q.put((1, watch_uuid))
+                       update_q.put(queuedWatchMetaData.PrioritizedItem(priority=1, item={'uuid': watch_uuid, 'skip_when_checksum_same': False}))
                        i += 1
        else:
            # No tag, no uuid, add everything.
            for watch_uuid, watch in datastore.data['watching'].items():
                if watch_uuid not in running_uuids and not datastore.data['watching'][watch_uuid]['paused']:
-                   update_q.put((1, watch_uuid))
+                   update_q.put(queuedWatchMetaData.PrioritizedItem(priority=1, item={'uuid': watch_uuid, 'skip_when_checksum_same': False}))
                    i += 1
        flash("{} watches are queued for rechecking.".format(i))
        return redirect(url_for('index', tag=tag))
@@ -1271,6 +1273,14 @@ def changedetection_app(config=None, datastore_o=None):
                datastore.data['watching'][uuid.strip()]['notification_muted'] = False
            flash("{} watches un-muted".format(len(uuids)))

+       elif (op == 'recheck'):
+           for uuid in uuids:
+               uuid = uuid.strip()
+               if datastore.data['watching'].get(uuid):
+                   # Recheck and require a full reprocessing
+                   update_q.put(queuedWatchMetaData.PrioritizedItem(priority=1, item={'uuid': uuid, 'skip_when_checksum_same': False}))
+           flash("{} watches queued for rechecking".format(len(uuids)))
+
        elif (op == 'notification-default'):
            from changedetectionio.notification import (
                default_notification_format_for_watch
@@ -1344,7 +1354,7 @@ def changedetection_app(config=None, datastore_o=None):
    app.register_blueprint(browser_steps.construct_blueprint(datastore), url_prefix='/browser-steps')

    import changedetectionio.blueprint.price_data_follower as price_data_follower
-   app.register_blueprint(price_data_follower.construct_blueprint(datastore), url_prefix='/price_data_follower')
+   app.register_blueprint(price_data_follower.construct_blueprint(datastore, update_q), url_prefix='/price_data_follower')

    # @todo handle ctrl break
@@ -1452,7 +1462,11 @@ def ticker_thread_check_time_launch_checks():
    watch_uuid_list = []
    while True:
        try:
-           watch_uuid_list = datastore.data['watching'].keys()
+           # Get a list of watches sorted by last_checked, [1] because it gets passed a tuple
+           # This is so we examine the most over-due first
+           for k in sorted(datastore.data['watching'].items(), key=lambda item: item[1].get('last_checked', 0)):
+               watch_uuid_list.append(k[0])
        except RuntimeError as e:
            # RuntimeError: dictionary changed size during iteration
            time.sleep(0.1)
@@ -1492,7 +1506,7 @@ def ticker_thread_check_time_launch_checks():
            seconds_since_last_recheck = now - watch['last_checked']

            if seconds_since_last_recheck >= (threshold + watch.jitter_seconds) and seconds_since_last_recheck >= recheck_time_minimum_seconds:
-               if not uuid in running_uuids and uuid not in [q_uuid for p,q_uuid in update_q.queue]:
+               if not uuid in running_uuids and uuid not in [q_uuid.item['uuid'] for q_uuid in update_q.queue]:

                    # Proxies can be set to have a limit on seconds between which they can be called
                    watch_proxy = datastore.get_preferred_proxy_for_watch(uuid=uuid)
@@ -1523,8 +1537,9 @@ def ticker_thread_check_time_launch_checks():
                        priority,
                        watch.jitter_seconds,
                        now - watch['last_checked']))
+
                    # Into the queue with you
-                   update_q.put((priority, uuid))
+                   update_q.put(queuedWatchMetaData.PrioritizedItem(priority=priority, item={'uuid': uuid, 'skip_when_checksum_same': True}))

                    # Reset for next time
                    watch.jitter_seconds = 0
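
A note on the pattern above: both the overview page and the ticker thread now read update_q.queue directly. A minimal standalone sketch of that idiom (not project code): .queue is the PriorityQueue's backing list, so it can be inspected as a best-effort, unlocked membership test without consuming any entries.

from queue import PriorityQueue

def is_already_queued(update_q: PriorityQueue, uuid: str) -> bool:
    # Peek at the heap's backing list; nothing is removed from the queue
    return uuid in [entry.item['uuid'] for entry in update_q.queue]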

View File

@@ -1,3 +1,4 @@
+from changedetectionio import queuedWatchMetaData
 from flask_restful import abort, Resource
 from flask import request, make_response
 import validators
@@ -24,7 +25,7 @@ class Watch(Resource):
            abort(404, message='No watch exists with the UUID of {}'.format(uuid))

        if request.args.get('recheck'):
-           self.update_q.put((1, uuid))
+           self.update_q.put(queuedWatchMetaData.PrioritizedItem(priority=1, item={'uuid': uuid, 'skip_when_checksum_same': True}))
            return "OK", 200

        # Return without history, get that via another API call
@@ -100,7 +101,7 @@ class CreateWatch(Resource):
        extras = {'title': json_data['title'].strip()} if json_data.get('title') else {}
        new_uuid = self.datastore.add_watch(url=json_data['url'].strip(), tag=tag, extras=extras)
-       self.update_q.put((1, new_uuid))
+       self.update_q.put(queuedWatchMetaData.PrioritizedItem(priority=1, item={'uuid': new_uuid, 'skip_when_checksum_same': True}))
        return {'uuid': new_uuid}, 201

    # Return concise list of available watches and some very basic info
@@ -118,7 +119,7 @@ class CreateWatch(Resource):
        if request.args.get('recheck_all'):
            for uuid in self.datastore.data['watching'].keys():
-               self.update_q.put((1, uuid))
+               self.update_q.put(queuedWatchMetaData.PrioritizedItem(priority=1, item={'uuid': uuid, 'skip_when_checksum_same': True}))
            return {'status': "OK"}, 200

        return list, 200

View File

@@ -75,15 +75,13 @@ class steppable_browser_interface():
    def action_goto_url(self, url, optional_value):
        # self.page.set_viewport_size({"width": 1280, "height": 5000})
        now = time.time()
-       response = self.page.goto(url, timeout=0, wait_until='domcontentloaded')
-       print("Time to goto URL", time.time() - now)
+       response = self.page.goto(url, timeout=0, wait_until='commit')

        # Wait_until = commit
        # - `'commit'` - consider operation to be finished when network response is received and the document started loading.
        # Better to not use any smarts from Playwright and just wait an arbitrary number of seconds
        # This seemed to solve nearly all 'TimeoutErrors'
-       extra_wait = int(os.getenv("WEBDRIVER_DELAY_BEFORE_CONTENT_READY", 5))
-       self.page.wait_for_timeout(extra_wait * 1000)
+       print("Time to goto URL ", time.time() - now)

    def action_click_element_containing_text(self, selector=None, value=''):
        if not len(value.strip()):
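
The change above swaps Playwright's 'domcontentloaded' wait for 'commit'. A minimal standalone sketch of the approach (sync API; the URL and the 5-second delay are placeholders): return as soon as the document starts loading, then sleep a fixed time instead of trusting load heuristics.

from playwright.sync_api import sync_playwright

with sync_playwright() as p:
    browser = p.chromium.launch()
    page = browser.new_page()
    # 'commit': done when the network response arrives and the document starts loading
    page.goto("https://example.com", timeout=0, wait_until='commit')
    page.wait_for_timeout(5 * 1000)  # arbitrary settle time instead of 'load'/'networkidle'
    print(page.title())
    browser.close()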

View File

@@ -3,11 +3,13 @@ from distutils.util import strtobool
 from flask import Blueprint, flash, redirect, url_for
 from flask_login import login_required
 from changedetectionio.store import ChangeDetectionStore
+from changedetectionio import queuedWatchMetaData
+from queue import PriorityQueue

 PRICE_DATA_TRACK_ACCEPT = 'accepted'
 PRICE_DATA_TRACK_REJECT = 'rejected'

-def construct_blueprint(datastore: ChangeDetectionStore):
+def construct_blueprint(datastore: ChangeDetectionStore, update_q: PriorityQueue):

    price_data_follower_blueprint = Blueprint('price_data_follower', __name__)
@@ -15,6 +17,7 @@ def construct_blueprint(datastore: ChangeDetectionStore):
    @price_data_follower_blueprint.route("/<string:uuid>/accept", methods=['GET'])
    def accept(uuid):
        datastore.data['watching'][uuid]['track_ldjson_price_data'] = PRICE_DATA_TRACK_ACCEPT
+       update_q.put(queuedWatchMetaData.PrioritizedItem(priority=1, item={'uuid': uuid, 'skip_when_checksum_same': False}))
        return redirect(url_for("form_watch_checknow", uuid=uuid))

View File

@@ -23,6 +23,9 @@ class Non200ErrorCodeReceived(Exception):
            self.page_text = html_tools.html_to_text(page_html)
        return

+class checksumFromPreviousCheckWasTheSame(Exception):
+   def __init__(self):
+       return

 class JSActionExceptions(Exception):
    def __init__(self, status_code, url, screenshot, message=''):
@@ -39,7 +42,7 @@ class BrowserStepsStepTimout(Exception):

 class PageUnloadable(Exception):
-   def __init__(self, status_code, url, screenshot=False, message=False):
+   def __init__(self, status_code, url, message, screenshot=False):
        # Set this so we can use it in other parts of the app
        self.status_code = status_code
        self.url = url
@@ -286,6 +289,8 @@ class base_html_playwright(Fetcher):
                proxy=self.proxy,
                # This is needed to enable JavaScript execution on GitHub and others
                bypass_csp=True,
+               # Can't think why we need the service workers for our use case?
+               service_workers='block',
                # Should never be needed
                accept_downloads=False
            )
@@ -294,24 +299,34 @@ class base_html_playwright(Fetcher):
            if len(request_headers):
                context.set_extra_http_headers(request_headers)

-           try:
-               self.page.set_default_navigation_timeout(90000)
-               self.page.set_default_timeout(90000)
+           self.page.set_default_navigation_timeout(90000)
+           self.page.set_default_timeout(90000)

-               # Listen for all console events and handle errors
-               self.page.on("console", lambda msg: print(f"Playwright console: Watch URL: {url} {msg.type}: {msg.text} {msg.args}"))
+           # Listen for all console events and handle errors
+           self.page.on("console", lambda msg: print(f"Playwright console: Watch URL: {url} {msg.type}: {msg.text} {msg.args}"))

-               # Bug - never set viewport size BEFORE page.goto
-
-               # Waits for the next navigation. Using Python context manager
-               # prevents a race condition between clicking and waiting for a navigation.
-               with self.page.expect_navigation():
-                   response = self.page.goto(url, wait_until='load')
+           # Goto page
+           try:
+               # Wait_until = commit
+               # - `'commit'` - consider operation to be finished when network response is received and the document started loading.
+               # Better to not use any smarts from Playwright and just wait an arbitrary number of seconds
+               # This seemed to solve nearly all 'TimeoutErrors'
+               response = self.page.goto(url, wait_until='commit')
+           except playwright._impl._api_types.Error as e:
+               # Retry once - https://github.com/browserless/chrome/issues/2485
+               # Sometimes errors related to invalid cert's and other can be random
+               print("Content Fetcher > retrying request got error - ", str(e))
+               time.sleep(1)
+               response = self.page.goto(url, wait_until='commit')
+           except Exception as e:
+               print("Content Fetcher > Other exception when page.goto", str(e))
+               context.close()
+               browser.close()
+               raise PageUnloadable(url=url, status_code=None, message=str(e))

+           # Execute any browser steps
+           try:
                extra_wait = int(os.getenv("WEBDRIVER_DELAY_BEFORE_CONTENT_READY", 5)) + self.render_extract_delay
                self.page.wait_for_timeout(extra_wait * 1000)
@@ -324,17 +339,15 @@ class base_html_playwright(Fetcher):
                # This can be ok, we will try to grab what we could retrieve
                pass
            except Exception as e:
-               print("other exception when page.goto")
-               print(str(e))
+               print("Content Fetcher > Other exception when executing custom JS code", str(e))
                context.close()
                browser.close()
-               raise PageUnloadable(url=url, status_code=None)
+               raise PageUnloadable(url=url, status_code=None, message=str(e))

            if response is None:
                context.close()
                browser.close()
-               print("response object was none")
+               print("Content Fetcher > Response object was none")
                raise EmptyReply(url=url, status_code=None)

            # Bug 2(?) Set the viewport size AFTER loading the page
@@ -353,7 +366,7 @@ class base_html_playwright(Fetcher):
            if len(self.page.content().strip()) == 0:
                context.close()
                browser.close()
-               print("Content was empty")
+               print("Content Fetcher > Content was empty")
                raise EmptyReply(url=url, status_code=None)

            # Bug 2(?) Set the viewport size AFTER loading the page
@@ -498,7 +511,7 @@ class base_html_webdriver(Fetcher):
        try:
            self.driver.quit()
        except Exception as e:
-           print("Exception in chrome shutdown/quit" + str(e))
+           print("Content Fetcher > Exception in chrome shutdown/quit" + str(e))

    # "html_requests" is listed as the default fetcher in store.py!

View File

@@ -1,4 +1,5 @@
 import hashlib
+import json
 import logging
 import os
 import re
@@ -6,6 +7,7 @@ import urllib3

 from changedetectionio import content_fetcher, html_tools
 from changedetectionio.blueprint.price_data_follower import PRICE_DATA_TRACK_ACCEPT, PRICE_DATA_TRACK_REJECT
+from copy import deepcopy

 urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
@@ -38,8 +40,7 @@ class perform_site_check():
        return regex

-   def run(self, uuid):
-       from copy import deepcopy
+   def run(self, uuid, skip_when_checksum_same=True):
        changed_detected = False
        screenshot = False  # as bytes
        stripped_text_from_html = ""
@@ -122,6 +123,14 @@ class perform_site_check():
            self.screenshot = fetcher.screenshot
            self.xpath_data = fetcher.xpath_data

+           # Watches added automatically in the queue manager will skip if its the same checksum as the previous run
+           # Saves a lot of CPU
+           update_obj['previous_md5_before_filters'] = hashlib.md5(fetcher.content.encode('utf-8')).hexdigest()
+           if skip_when_checksum_same:
+               if update_obj['previous_md5_before_filters'] == watch.get('previous_md5_before_filters'):
+                   raise content_fetcher.checksumFromPreviousCheckWasTheSame()
+
            # Fetching complete, now filters
            # @todo move to class / maybe inside of fetcher abstract base?
@@ -159,6 +168,14 @@ class perform_site_check():
                include_filters_rule.append("json:$")
                has_filter_rule = True

+       if is_json:
+           # Sort the JSON so we dont get false alerts when the content is just re-ordered
+           try:
+               fetcher.content = json.dumps(json.loads(fetcher.content), sort_keys=True)
+           except Exception as e:
+               # Might have just been a snippet, or otherwise bad JSON, continue
+               pass
+
        if has_filter_rule:
            json_filter_prefixes = ['json:', 'jq:']
            for filter in include_filters_rule:
@@ -166,6 +183,8 @@ class perform_site_check():
                    stripped_text_from_html += html_tools.extract_json_as_string(content=fetcher.content, json_filter=filter)
                    is_html = False

        if is_html or is_source:
            # CSS Filter, extract the HTML that matches and feed that into the existing inscriptis::get_text

View File

@@ -3,6 +3,7 @@ import re

 from wtforms import (
     BooleanField,
+    FileField,
     Form,
     IntegerField,
     RadioField,
@@ -425,6 +426,14 @@ class watchForm(commonSettingsForm):
            result = False
        return result

+
+class importForm(Form):
+   #backup_zip_file = FileField("File", validators=[validators.regexp('\.zip$', flags=re.IGNORECASE)])
+   backup_zip_file = FileField("File")
+   def validate_backup_zip_file(form, field):
+       if field.data:
+           x=1
+
 # datastore.data['settings']['requests']..
 class globalSettingsRequestForm(Form):
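
validate_backup_zip_file() is left as a stub (x=1). A hypothetical fleshed-out version (the check and message are illustrative, not from the PR) could reject non-ZIP uploads by inspecting the multipart file directly, since the upload arrives via request.files rather than request.form:

from flask import request
from wtforms import FileField, Form, ValidationError

class importForm(Form):
    backup_zip_file = FileField("File")

    def validate_backup_zip_file(form, field):
        # FileField isn't populated from request.form, so read the upload itself
        f = request.files.get('backup_zip_file')
        if f and not f.filename.lower().endswith('.zip'):
            raise ValidationError('Please upload a .zip backup file')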

View File

@@ -1,4 +1,5 @@
 from abc import ABC, abstractmethod
+from flask import request, url_for, current_app
 import time
 import validators
@@ -20,6 +21,26 @@ class Importer():
            datastore):
        pass

+
+class import_changedetection_io_zip(Importer):
+   def run(self,
+           data,
+           flash,
+           datastore,
+           ):
+       # `data` should be none, we will hit up request directly
+       import zipfile
+       import io
+
+       with zipfile.ZipFile(io.BytesIO(request.files["backup_zip_file"].read()), 'r') as zf:
+           p = zf.namelist()
+           for file in zf.namelist():
+               x=1
+
+
 class import_url_list(Importer):
    """

View File

@@ -27,7 +27,6 @@ class model(dict):
            'base_url' : None,
            'extract_title_as_title': False,
            'empty_pages_are_a_change': False,
-           'css_dark_mode': False,
            'fetch_backend': getenv("DEFAULT_FETCH_BACKEND", "html_requests"),
            'filter_failure_notification_threshold_attempts': _FILTER_FAILURE_THRESHOLD_ATTEMPTS_DEFAULT,
            'global_ignore_text': [], # List of text to ignore when calculating the comparison checksum

View File

@@ -14,51 +14,52 @@ from changedetectionio.notification import (

 class model(dict):
     __newest_history_key = None
-    __history_n=0
+    __history_n = 0

     __base_config = {
-        #'history': {}, # Dict of timestamp and output stripped filename (removed)
-        #'newest_history_key': 0, (removed, taken from history.txt index)
+        # 'history': {}, # Dict of timestamp and output stripped filename (removed)
+        # 'newest_history_key': 0, (removed, taken from history.txt index)
         'body': None,
         'check_unique_lines': False, # On change-detected, compare against all history if its something new
         'check_count': 0,
         'consecutive_filter_failures': 0, # Every time the CSS/xPath filter cannot be located, reset when all is fine.
         'extract_text': [], # Extract text by regex after filters
         'extract_title_as_title': False,
         'fetch_backend': None,
         'filter_failure_notification_send': strtobool(os.getenv('FILTER_FAILURE_NOTIFICATION_SEND_DEFAULT', 'True')),
         'has_ldjson_price_data': None,
         'track_ldjson_price_data': None,
         'headers': {}, # Extra headers to send
         'ignore_text': [], # List of text to ignore when calculating the comparison checksum
         'include_filters': [],
         'last_checked': 0,
         'last_error': False,
         'last_viewed': 0, # history key value of the last viewed via the [diff] link
         'method': 'GET',
         # Custom notification content
         'notification_body': None,
         'notification_format': default_notification_format_for_watch,
         'notification_muted': False,
         'notification_title': None,
         'notification_screenshot': False, # Include the latest screenshot if available and supported by the apprise URL
         'notification_urls': [], # List of URLs to add to the notification Queue (Usually AppRise)
         'paused': False,
         'previous_md5': False,
+        'previous_md5_before_filters': False, # Used for skipping changedetection entirely
         'proxy': None, # Preferred proxy connection
         'subtractive_selectors': [],
         'tag': None,
         'text_should_not_be_present': [], # Text that should not present
         # Re #110, so then if this is set to None, we know to use the default value instead
         # Requires setting to None on submit if it's the same as the default
         # Should be all None by default, so we use the system default in this case.
         'time_between_check': {'weeks': None, 'days': None, 'hours': None, 'minutes': None, 'seconds': None},
         'title': None,
         'trigger_text': [], # List of text or regex to wait for until a change is detected
         'url': None,
         'uuid': str(uuid.uuid4()),
         'webdriver_delay': None,
         'webdriver_js_execute_code': None, # Run before change-detection
     }

     jitter_seconds = 0

     def __init__(self, *arg, **kw):

View File

@@ -0,0 +1,10 @@
+from dataclasses import dataclass, field
+from typing import Any
+
+# So that we can queue some metadata in `item`
+# https://docs.python.org/3/library/queue.html#queue.PriorityQueue
+#
+@dataclass(order=True)
+class PrioritizedItem:
+    priority: int
+    item: Any = field(compare=False)
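
A small usage sketch (assuming the package is importable): PriorityQueue pops the lowest priority number first, and because item is declared with compare=False the dict payloads are never compared against each other; without that, two entries with equal priority would raise TypeError when the heap tried to order their dicts.

from queue import PriorityQueue
from changedetectionio import queuedWatchMetaData

q = PriorityQueue()
q.put(queuedWatchMetaData.PrioritizedItem(priority=5, item={'uuid': 'cloned'}))      # stand-in uuids
q.put(queuedWatchMetaData.PrioritizedItem(priority=1, item={'uuid': 'user-click'}))

assert q.get().item['uuid'] == 'user-click'  # priority 1 wins
assert q.get().item['uuid'] == 'cloned'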

View File

@@ -1,3 +1,6 @@
+// Copyright (C) 2021 Leigh Morresi (dgtlmoon@gmail.com)
+// All rights reserved.
+
 // @file Scrape the page looking for elements of concern (%ELEMENTS%)
 // http://matatk.agrip.org.uk/tests/position-and-width/
 // https://stackoverflow.com/questions/26813480/when-is-element-getboundingclientrect-guaranteed-to-be-updated-accurate
@@ -89,8 +92,8 @@ for (var i = 0; i < elements.length; i++) {
        continue
    }

-   // Forget really small ones
-   if (bbox['width'] < 10 && bbox['height'] < 10) {
+   // Skip really small ones, and where width or height ==0
+   if (bbox['width'] * bbox['height'] < 100) {
        continue;
    }
@@ -146,7 +149,6 @@ for (var i = 0; i < elements.length; i++) {
 }

 // Inject the current one set in the include_filters, which may be a CSS rule
 // used for displaying the current one in VisualSelector, where its not one we generated.
-
 if (include_filters.length) {
@@ -174,10 +176,23 @@ if (include_filters.length) {
        }

        if (q) {
-           bbox = q.getBoundingClientRect();
-           console.log("xpath_element_scraper: Got filter element, scroll from top was "+scroll_y)
-       } else {
-           console.log("xpath_element_scraper: filter element "+f+" was not found");
+           // #1231 - IN the case XPath attribute filter is applied, we will have to traverse up and find the element.
+           if (q.hasOwnProperty('getBoundingClientRect')) {
+               bbox = q.getBoundingClientRect();
+               console.log("xpath_element_scraper: Got filter element, scroll from top was " + scroll_y)
+           } else {
+               try {
+                   // Try and see we can find its ownerElement
+                   bbox = q.ownerElement.getBoundingClientRect();
+                   console.log("xpath_element_scraper: Got filter by ownerElement element, scroll from top was " + scroll_y)
+               } catch (e) {
+                   console.log("xpath_element_scraper: error looking up ownerElement")
+               }
+           }
+       }
+
+       if (!q) {
+           console.log("xpath_element_scraper: filter element " + f + " was not found");
        }

        if (bbox && bbox['width'] > 0 && bbox['height'] > 0) {
@@ -192,5 +207,9 @@ if (include_filters.length) {
    }
 }

+// Sort the elements so we find the smallest one first, in other words, we find the smallest one matching in that area
+// so that we dont select the wrapping element by mistake and be unable to select what we want
+size_pos.sort((a, b) => (a.width * a.height > b.width * b.height) ? 1 : -1)
+
 // Window.width required for proper scaling in the frontend
 return {'size_pos': size_pos, 'browser_width': window.innerWidth};
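
The new sort means the click handler can scan candidates front-to-back and take the first hit. The same idea reduced to a few lines of Python (the data is made up):

boxes = [
    {'selector': 'div.wrapper', 'width': 300, 'height': 200},
    {'selector': 'span.price', 'width': 80, 'height': 20},
]
boxes.sort(key=lambda b: b['width'] * b['height'])
assert boxes[0]['selector'] == 'span.price'  # smallest area first, so the innermost match wins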

View File

@@ -1,4 +1,5 @@
-// Horrible proof of concept code :)
+// Copyright (C) 2021 Leigh Morresi (dgtlmoon@gmail.com)
+// All rights reserved.
 // yes - this is really a hack, if you are a front-ender and want to help, please get in touch!

 $(document).ready(function () {
@@ -177,9 +178,10 @@ $(document).ready(function () {
            // Basically, find the most 'deepest'
            var found = 0;
            ctx.fillStyle = 'rgba(205,0,0,0.35)';
-           for (var i = selector_data['size_pos'].length; i !== 0; i--) {
+           // Will be sorted by smallest width*height first
+           for (var i = 0; i < selector_data['size_pos'].length; i++) {
                // draw all of them? let them choose somehow?
-               var sel = selector_data['size_pos'][i - 1];
+               var sel = selector_data['size_pos'][i];
                // If we are in a bounding-box
                if (e.offsetY > sel.top * y_scale && e.offsetY < sel.top * y_scale + sel.height * y_scale
                    &&
@@ -195,7 +197,7 @@ $(document).ready(function () {
                    // no need to keep digging
                    // @todo or, O to go out/up, I to go in
                    // or double click to go up/out the selector?
-                   current_selected_i = i - 1;
+                   current_selected_i = i;
                    found += 1;
                    break;
                }

View File

@@ -1,5 +1,6 @@
 {% extends 'base.html' %}
 {% block content %}
+{% from '_helpers.jinja' import render_field %}
 <script type="text/javascript" src="{{url_for('static_content', group='js', filename='tabs.js')}}" defer></script>

 <div class="edit-form monospaced-textarea">
@@ -7,11 +8,12 @@
        <ul>
            <li class="tab" id=""><a href="#url-list">URL List</a></li>
            <li class="tab"><a href="#distill-io">Distill.io</a></li>
+           <li class="tab"><a href="#changedetection-io">Changedetection.io</a></li>
        </ul>
    </div>

    <div class="box-wrap inner">
-       <form class="pure-form pure-form-aligned" action="{{url_for('import_page')}}" method="POST">
+       <form class="pure-form pure-form-aligned" action="{{url_for('import_page')}}" method="POST" enctype="multipart/form-data">
            <input type="hidden" name="csrf_token" value="{{ csrf_token() }}"/>
            <div class="tab-pane-inner" id="url-list">
                <fieldset class="pure-group">
@@ -77,6 +79,12 @@
 " rows="25">{{ original_distill_json }}</textarea>
                </fieldset>
            </div>
+           <div class="tab-pane-inner" id="changedetection-io">
+               Upload your changedetection.io backup ZIP here<br>
+               <fieldset class="pure-group">
+                   {{ render_field(form.backup_zip_file) }}
+               </fieldset>
+           </div>
            <button type="submit" class="pure-button pure-input-1-2 pure-button-primary">Import</button>
        </form>

View File

@@ -32,6 +32,7 @@
                <button class="pure-button button-secondary button-xsmall" style="font-size: 70%" name="op" value="unpause">UnPause</button>
                <button class="pure-button button-secondary button-xsmall" style="font-size: 70%" name="op" value="mute">Mute</button>
                <button class="pure-button button-secondary button-xsmall" style="font-size: 70%" name="op" value="unmute">UnMute</button>
+               <button class="pure-button button-secondary button-xsmall" style="font-size: 70%" name="op" value="recheck">Recheck</button>
                <button class="pure-button button-secondary button-xsmall" style="font-size: 70%" name="op" value="notification-default">Use default notification</button>
                <button class="pure-button button-secondary button-xsmall" style="background: #dd4242; font-size: 70%" name="op" value="delete">Delete</button>
            </div>

View File

@@ -394,6 +394,48 @@ def check_json_ext_filter(json_filter, client, live_server):
    res = client.get(url_for("form_delete", uuid="all"), follow_redirects=True)
    assert b'Deleted' in res.data

+def test_ignore_json_order(client, live_server):
+   # A change in order shouldn't trigger a notification
+   with open("test-datastore/endpoint-content.txt", "w") as f:
+       f.write('{"hello" : 123, "world": 123}')
+
+   # Add our URL to the import page
+   test_url = url_for('test_endpoint', content_type="application/json", _external=True)
+   res = client.post(
+       url_for("import_page"),
+       data={"urls": test_url},
+       follow_redirects=True
+   )
+   assert b"1 Imported" in res.data
+   time.sleep(2)
+
+   with open("test-datastore/endpoint-content.txt", "w") as f:
+       f.write('{"world" : 123, "hello": 123}')
+
+   # Trigger a check
+   client.get(url_for("form_watch_checknow"), follow_redirects=True)
+   time.sleep(2)
+
+   res = client.get(url_for("index"))
+   assert b'unviewed' not in res.data
+
+   # Just to be sure it still works
+   with open("test-datastore/endpoint-content.txt", "w") as f:
+       f.write('{"world" : 123, "hello": 124}')
+
+   # Trigger a check
+   client.get(url_for("form_watch_checknow"), follow_redirects=True)
+   time.sleep(2)
+
+   res = client.get(url_for("index"))
+   assert b'unviewed' in res.data
+
+   res = client.get(url_for("form_delete", uuid="all"), follow_redirects=True)
+   assert b'Deleted' in res.data
+
 def test_check_jsonpath_ext_filter(client, live_server):
    check_json_ext_filter('json:$[?(@.status==Sold)]', client, live_server)

View File

@@ -4,6 +4,7 @@ import queue
 import time

 from changedetectionio import content_fetcher
+from changedetectionio import queuedWatchMetaData
 from changedetectionio.fetch_site_status import FilterNotFoundInResponse

 # A single update worker
@@ -157,11 +158,12 @@ class update_worker(threading.Thread):
        while not self.app.config.exit.is_set():
            try:
-               priority, uuid = self.q.get(block=False)
+               queued_item_data = self.q.get(block=False)
            except queue.Empty:
                pass
            else:
+               uuid = queued_item_data.item.get('uuid')
                self.current_uuid = uuid

                if uuid in list(self.datastore.data['watching'].keys()):
@@ -171,11 +173,11 @@ class update_worker(threading.Thread):
                    update_obj = {}
                    xpath_data = False
                    process_changedetection_results = True
-                   print("> Processing UUID {} Priority {} URL {}".format(uuid, priority, self.datastore.data['watching'][uuid]['url']))
+                   print("> Processing UUID {} Priority {} URL {}".format(uuid, queued_item_data.priority, self.datastore.data['watching'][uuid]['url']))
                    now = time.time()

                    try:
-                       changed_detected, update_obj, contents = update_handler.run(uuid)
+                       changed_detected, update_obj, contents = update_handler.run(uuid, skip_when_checksum_same=queued_item_data.item.get('skip_when_checksum_same'))
                        # Re #342
                        # In Python 3, all strings are sequences of Unicode characters. There is a bytes type that holds raw bytes.
                        # We then convert/.decode('utf-8') for the notification etc
@@ -241,6 +243,10 @@ class update_worker(threading.Thread):
                        process_changedetection_results = True

+                   except content_fetcher.checksumFromPreviousCheckWasTheSame as e:
+                       # Yes fine, so nothing todo
+                       pass
+
                    except content_fetcher.BrowserStepsStepTimout as e:
                        if not self.datastore.data['watching'].get(uuid):
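
Taken together with the run() change in fetch_site_status above, the new except branch closes the loop. A hedged, self-contained sketch of the control flow (the checksums and uuid are stand-ins):

class checksumFromPreviousCheckWasTheSame(Exception):
    pass

def run(uuid, skip_when_checksum_same=True):
    stored_md5 = 'd41d8cd98f00b204e9800998ecf8427e'  # stand-in: checksum saved last run
    fresh_md5 = 'd41d8cd98f00b204e9800998ecf8427e'   # stand-in: checksum of the new fetch
    if skip_when_checksum_same and fresh_md5 == stored_md5:
        raise checksumFromPreviousCheckWasTheSame()
    return True, {}, b'...'

try:
    changed_detected, update_obj, contents = run('abc-123')
except checksumFromPreviousCheckWasTheSame:
    # Same content as last time, so skip all filtering and diffing work
    pass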