Compare commits

..

1 Commits

28 changed files with 3984 additions and 925 deletions

View File

@@ -3,16 +3,14 @@
![changedetection.io](https://github.com/dgtlmoon/changedetection.io/actions/workflows/test-only.yml/badge.svg?branch=master) ![changedetection.io](https://github.com/dgtlmoon/changedetection.io/actions/workflows/test-only.yml/badge.svg?branch=master)
## Web Site Change Detection, Monitoring and Notification - Self-Hosted or SaaS. ## Self-Hosted, Open Source, Change Monitoring of Web Pages
_Know when web pages change! Stay ontop of new information! get notifications when important website content changes_ _Know when web pages change! Stay ontop of new information!_
Live your data-life *pro-actively* instead of *re-actively*. Live your data-life *pro-actively* instead of *re-actively*.
Free, Open-source web page monitoring, notification and change detection. Don't have time? [**Try our $6.99/month subscription - unlimited checks and watches!**](https://lemonade.changedetection.io/start) Free, Open-source web page monitoring, notification and change detection. Don't have time? [**Try our $6.99/month subscription - unlimited checks and watches!**](https://lemonade.changedetection.io/start)
[[ Discord ]](https://discord.com/channels/1000806276256780309/1000806276873334816) [[ YouTube ]](https://www.youtube.com/channel/UCbS09q1TRf0o4N2t-WA3emQ) [[ LinkedIn ]](https://www.linkedin.com/company/changedetection-io/)
[<img src="https://raw.githubusercontent.com/dgtlmoon/changedetection.io/master/docs/screenshot.png" style="max-width:100%;" alt="Self-hosted web page change monitoring" title="Self-hosted web page change monitoring" />](https://lemonade.changedetection.io/start) [<img src="https://raw.githubusercontent.com/dgtlmoon/changedetection.io/master/docs/screenshot.png" style="max-width:100%;" alt="Self-hosted web page change monitoring" title="Self-hosted web page change monitoring" />](https://lemonade.changedetection.io/start)
@@ -35,7 +33,6 @@ Free, Open-source web page monitoring, notification and change detection. Don't
- New software releases, security advisories when you're not on their mailing list. - New software releases, security advisories when you're not on their mailing list.
- Festivals with changes - Festivals with changes
- Realestate listing changes - Realestate listing changes
- Know when your favourite whiskey is on sale, or other special deals are announced before anyone else
- COVID related news from government websites - COVID related news from government websites
- University/organisation news from their website - University/organisation news from their website
- Detect and monitor changes in JSON API responses - Detect and monitor changes in JSON API responses
@@ -121,7 +118,7 @@ See the wiki for more information https://github.com/dgtlmoon/changedetection.io
## Filters ## Filters
XPath, JSONPath and CSS support comes baked in! You can be as specific as you need, use XPath exported from various XPath element query creation tools. XPath, JSONPath and CSS support comes baked in! You can be as specific as you need, use XPath exported from various XPath element query creation tools.
(We support LXML `re:test`, `re:math` and `re:replace`.) (We support LXML re:test, re:math and re:replace.)
## Notifications ## Notifications

View File

@@ -44,7 +44,7 @@ from flask_wtf import CSRFProtect
from changedetectionio import html_tools from changedetectionio import html_tools
from changedetectionio.api import api_v1 from changedetectionio.api import api_v1
__version__ = '0.39.16' __version__ = '0.39.15'
datastore = None datastore = None
@@ -105,10 +105,9 @@ def init_app_secret(datastore_path):
# running or something similar. # running or something similar.
@app.template_filter('format_last_checked_time') @app.template_filter('format_last_checked_time')
def _jinja2_filter_datetime(watch_obj, format="%Y-%m-%d %H:%M:%S"): def _jinja2_filter_datetime(watch_obj, format="%Y-%m-%d %H:%M:%S"):
# Worker thread tells us which UUID it is currently processing. # Worker thread tells us which UUID it is currently processing.
for t in threading.enumerate(): for t in running_update_threads:
if t.name == 'update_worker' and t.current_uuid == watch_obj['uuid']: if t.current_uuid == watch_obj['uuid']:
return '<span class="loader"></span><span> Checking now</span>' return '<span class="loader"></span><span> Checking now</span>'
if watch_obj['last_checked'] == 0: if watch_obj['last_checked'] == 0:
@@ -299,7 +298,7 @@ def changedetection_app(config=None, datastore_o=None):
# Sort by last_changed and add the uuid which is usually the key.. # Sort by last_changed and add the uuid which is usually the key..
sorted_watches = [] sorted_watches = []
# @todo needs a .itemsWithTag() or something - then we can use that in Jinaj2 and throw this away # @todo needs a .itemsWithTag() or something
for uuid, watch in datastore.data['watching'].items(): for uuid, watch in datastore.data['watching'].items():
if limit_tag != None: if limit_tag != None:
@@ -362,7 +361,7 @@ def changedetection_app(config=None, datastore_o=None):
fe.pubDate(dt) fe.pubDate(dt)
response = make_response(fg.rss_str()) response = make_response(fg.rss_str())
response.headers.set('Content-Type', 'application/rss+xml;charset=utf-8') response.headers.set('Content-Type', 'application/rss+xml')
return response return response
@app.route("/", methods=['GET']) @app.route("/", methods=['GET'])
@@ -404,6 +403,8 @@ def changedetection_app(config=None, datastore_o=None):
watch['uuid'] = uuid watch['uuid'] = uuid
sorted_watches.append(watch) sorted_watches.append(watch)
sorted_watches.sort(key=lambda x: x['last_changed'], reverse=True)
existing_tags = datastore.get_all_tags() existing_tags = datastore.get_all_tags()
form = forms.quickWatchForm(request.form) form = forms.quickWatchForm(request.form)
@@ -432,9 +433,7 @@ def changedetection_app(config=None, datastore_o=None):
def ajax_callback_send_notification_test(): def ajax_callback_send_notification_test():
import apprise import apprise
from .apprise_asset import asset apobj = apprise.Apprise()
apobj = apprise.Apprise(asset=asset)
# validate URLS # validate URLS
if not len(request.form['notification_urls'].strip()): if not len(request.form['notification_urls'].strip()):
@@ -460,38 +459,37 @@ def changedetection_app(config=None, datastore_o=None):
return 'OK' return 'OK'
@app.route("/clear_history/<string:uuid>", methods=['GET']) @app.route("/scrub/<string:uuid>", methods=['GET'])
@login_required @login_required
def clear_watch_history(uuid): def scrub_watch(uuid):
try: try:
datastore.clear_watch_history(uuid) datastore.scrub_watch(uuid)
except KeyError: except KeyError:
flash('Watch not found', 'error') flash('Watch not found', 'error')
else: else:
flash("Cleared snapshot history for watch {}".format(uuid)) flash("Scrubbed watch {}".format(uuid))
return redirect(url_for('index')) return redirect(url_for('index'))
@app.route("/clear_history", methods=['GET', 'POST']) @app.route("/scrub", methods=['GET', 'POST'])
@login_required @login_required
def clear_all_history(): def scrub_page():
if request.method == 'POST': if request.method == 'POST':
confirmtext = request.form.get('confirmtext') confirmtext = request.form.get('confirmtext')
if confirmtext == 'clear': if confirmtext == 'scrub':
changes_removed = 0 changes_removed = 0
for uuid in datastore.data['watching'].keys(): for uuid in datastore.data['watching'].keys():
datastore.clear_watch_history(uuid) datastore.scrub_watch(uuid)
#TODO: KeyError not checked, as it is above
flash("Cleared snapshot history for all watches") flash("Cleared all snapshot history")
else: else:
flash('Incorrect confirmation text.', 'error') flash('Incorrect confirmation text.', 'error')
return redirect(url_for('index')) return redirect(url_for('index'))
output = render_template("clear_all_history.html") output = render_template("scrub.html")
return output return output
@@ -658,8 +656,7 @@ def changedetection_app(config=None, datastore_o=None):
current_base_url=datastore.data['settings']['application']['base_url'], current_base_url=datastore.data['settings']['application']['base_url'],
emailprefix=os.getenv('NOTIFICATION_MAIL_BUTTON_PREFIX', False), emailprefix=os.getenv('NOTIFICATION_MAIL_BUTTON_PREFIX', False),
visualselector_data_is_ready=visualselector_data_is_ready, visualselector_data_is_ready=visualselector_data_is_ready,
visualselector_enabled=visualselector_enabled, visualselector_enabled=visualselector_enabled
playwright_enabled=os.getenv('PLAYWRIGHT_DRIVER_URL', False)
) )
return output return output
@@ -835,7 +832,7 @@ def changedetection_app(config=None, datastore_o=None):
newest=newest_version_file_contents, newest=newest_version_file_contents,
previous=previous_version_file_contents, previous=previous_version_file_contents,
extra_stylesheets=extra_stylesheets, extra_stylesheets=extra_stylesheets,
versions=dates[:-1], # All except current/last versions=dates[1:],
uuid=uuid, uuid=uuid,
newest_version_timestamp=dates[-1], newest_version_timestamp=dates[-1],
current_previous_version=str(previous_version), current_previous_version=str(previous_version),
@@ -859,7 +856,7 @@ def changedetection_app(config=None, datastore_o=None):
uuid = list(datastore.data['watching'].keys()).pop() uuid = list(datastore.data['watching'].keys()).pop()
# Normally you would never reach this, because the 'preview' button is not available when there's no history # Normally you would never reach this, because the 'preview' button is not available when there's no history
# However they may try to clear snapshots and reload the page # However they may try to scrub and reload the page
if datastore.data['watching'][uuid].history_n == 0: if datastore.data['watching'][uuid].history_n == 0:
flash("Preview unavailable - No fetch/check completed or triggers not reached", "error") flash("Preview unavailable - No fetch/check completed or triggers not reached", "error")
return redirect(url_for('index')) return redirect(url_for('index'))
@@ -1214,7 +1211,6 @@ def changedetection_app(config=None, datastore_o=None):
# @todo handle ctrl break # @todo handle ctrl break
ticker_thread = threading.Thread(target=ticker_thread_check_time_launch_checks).start() ticker_thread = threading.Thread(target=ticker_thread_check_time_launch_checks).start()
threading.Thread(target=ticker_thread_job_queue_processor).start()
threading.Thread(target=notification_runner).start() threading.Thread(target=notification_runner).start()
@@ -1290,63 +1286,25 @@ def notification_runner():
# Trim the log length # Trim the log length
notification_debug_log = notification_debug_log[-100:] notification_debug_log = notification_debug_log[-100:]
# Check the queue, when a job exists, start a fresh thread of update_worker
def ticker_thread_job_queue_processor():
from changedetectionio import update_worker
n_workers = int(os.getenv("FETCH_WORKERS", datastore.data['settings']['requests']['workers']))
while not app.config.exit.is_set():
time.sleep(0.3)
# Check that some threads are free
running = 0
for t in threading.enumerate():
if t.name == 'update_worker':
running += 1
if running >= n_workers:
continue
try:
uuid = update_q.get(block=False)
except queue.Empty:
# Go back to waiting for exit and/or another entry from the queue
continue
print ("Starting a thread fetch")
try:
# Launch the update_worker thread that will handle picking items off a queue and sending them off
# in the event that playwright or others have a memory leak, this should clean it up better than gc.collect()
# (By letting it exit entirely)
update_worker.update_worker(update_q, notification_q, app, datastore, uuid).start()
except Exception as e:
print ("Error launching update_worker for UUID {}.".format(uuid))
print (str(e))
print ("Running now {}", running)
# Thread runner to check every minute, look for new watches to feed into the Queue. # Thread runner to check every minute, look for new watches to feed into the Queue.
def ticker_thread_check_time_launch_checks(): def ticker_thread_check_time_launch_checks():
import random import random
from changedetectionio import update_worker
recheck_time_minimum_seconds = int(os.getenv('MINIMUM_SECONDS_RECHECK_TIME', 20)) recheck_time_minimum_seconds = int(os.getenv('MINIMUM_SECONDS_RECHECK_TIME', 20))
print("System env MINIMUM_SECONDS_RECHECK_TIME", recheck_time_minimum_seconds) print("System env MINIMUM_SECONDS_RECHECK_TIME", recheck_time_minimum_seconds)
# Can go in its own function
# Always maintain the minimum number of threads, each thread will terminate when it has processed exactly 1 queued watch
# This is to be totally sure that they don't leak memory
# Spin up Workers that do the fetching # Spin up Workers that do the fetching
# Can be overriden by ENV or use the default settings # Can be overriden by ENV or use the default settings
n_workers = int(os.getenv("FETCH_WORKERS", datastore.data['settings']['requests']['workers']))
for _ in range(n_workers):
new_worker = update_worker.update_worker(update_q, notification_q, app, datastore)
running_update_threads.append(new_worker)
new_worker.start()
while not app.config.exit.is_set(): while not app.config.exit.is_set():
# Update our list of watches by UUID that are currently fetching data, used in the UI # Get a list of watches by UUID that are currently fetching data
running_uuids = [] running_uuids = []
for t in running_update_threads: for t in running_update_threads:
if t.current_uuid: if t.current_uuid:

View File

@@ -1,11 +0,0 @@
import apprise
# Create our AppriseAsset and populate it with some of our new values:
# https://github.com/caronc/apprise/wiki/Development_API#the-apprise-asset-object
asset = apprise.AppriseAsset(
image_url_logo='https://raw.githubusercontent.com/dgtlmoon/changedetection.io/master/changedetectionio/static/images/avatar-256x256.png'
)
asset.app_id = "changedetection.io"
asset.app_desc = "ChangeDetection.io best and simplest website monitoring and change detection"
asset.app_url = "https://changedetection.io"

View File

@@ -35,7 +35,7 @@ def main():
create_datastore_dir = False create_datastore_dir = False
for opt, arg in opts: for opt, arg in opts:
# if opt == '--clear-all-history': # if opt == '--purge':
# Remove history, the actual files you need to delete manually. # Remove history, the actual files you need to delete manually.
# for uuid, watch in datastore.data['watching'].items(): # for uuid, watch in datastore.data['watching'].items():
# watch.update({'history': {}, 'last_checked': 0, 'last_changed': 0, 'previous_md5': None}) # watch.update({'history': {}, 'last_checked': 0, 'last_changed': 0, 'previous_md5': None})

View File

@@ -46,7 +46,6 @@ class Fetcher():
headers = None headers = None
fetcher_description = "No description" fetcher_description = "No description"
webdriver_js_execute_code = None
xpath_element_js = """ xpath_element_js = """
// Include the getXpath script directly, easier than fetching // Include the getXpath script directly, easier than fetching
!function(e,n){"object"==typeof exports&&"undefined"!=typeof module?module.exports=n():"function"==typeof define&&define.amd?define(n):(e=e||self).getXPath=n()}(this,function(){return function(e){var n=e;if(n&&n.id)return'//*[@id="'+n.id+'"]';for(var o=[];n&&Node.ELEMENT_NODE===n.nodeType;){for(var i=0,r=!1,d=n.previousSibling;d;)d.nodeType!==Node.DOCUMENT_TYPE_NODE&&d.nodeName===n.nodeName&&i++,d=d.previousSibling;for(d=n.nextSibling;d;){if(d.nodeName===n.nodeName){r=!0;break}d=d.nextSibling}o.push((n.prefix?n.prefix+":":"")+n.localName+(i||r?"["+(i+1)+"]":"")),n=n.parentNode}return o.length?"/"+o.reverse().join("/"):""}}); !function(e,n){"object"==typeof exports&&"undefined"!=typeof module?module.exports=n():"function"==typeof define&&define.amd?define(n):(e=e||self).getXPath=n()}(this,function(){return function(e){var n=e;if(n&&n.id)return'//*[@id="'+n.id+'"]';for(var o=[];n&&Node.ELEMENT_NODE===n.nodeType;){for(var i=0,r=!1,d=n.previousSibling;d;)d.nodeType!==Node.DOCUMENT_TYPE_NODE&&d.nodeName===n.nodeName&&i++,d=d.previousSibling;for(d=n.nextSibling;d;){if(d.nodeName===n.nodeName){r=!0;break}d=d.nextSibling}o.push((n.prefix?n.prefix+":":"")+n.localName+(i||r?"["+(i+1)+"]":"")),n=n.parentNode}return o.length?"/"+o.reverse().join("/"):""}});
@@ -176,6 +175,7 @@ class Fetcher():
# Will be needed in the future by the VisualSelector, always get this where possible. # Will be needed in the future by the VisualSelector, always get this where possible.
screenshot = False screenshot = False
fetcher_description = "No description"
system_http_proxy = os.getenv('HTTP_PROXY') system_http_proxy = os.getenv('HTTP_PROXY')
system_https_proxy = os.getenv('HTTPS_PROXY') system_https_proxy = os.getenv('HTTPS_PROXY')
@@ -301,17 +301,11 @@ class base_html_playwright(Fetcher):
accept_downloads=False accept_downloads=False
) )
if len(request_headers):
context.set_extra_http_headers(request_headers)
page = context.new_page() page = context.new_page()
try: try:
page.set_default_navigation_timeout(90000) page.set_default_navigation_timeout(90000)
page.set_default_timeout(90000) page.set_default_timeout(90000)
# Listen for all console events and handle errors
page.on("console", lambda msg: print(f"Playwright console: Watch URL: {url} {msg.type}: {msg.text} {msg.args}"))
# Bug - never set viewport size BEFORE page.goto # Bug - never set viewport size BEFORE page.goto
# Waits for the next navigation. Using Python context manager # Waits for the next navigation. Using Python context manager
@@ -319,9 +313,6 @@ class base_html_playwright(Fetcher):
with page.expect_navigation(): with page.expect_navigation():
response = page.goto(url, wait_until='load') response = page.goto(url, wait_until='load')
if self.webdriver_js_execute_code is not None:
page.evaluate(self.webdriver_js_execute_code)
except playwright._impl._api_types.TimeoutError as e: except playwright._impl._api_types.TimeoutError as e:
context.close() context.close()
browser.close() browser.close()
@@ -453,12 +444,6 @@ class base_html_webdriver(Fetcher):
self.driver.set_window_size(1280, 1024) self.driver.set_window_size(1280, 1024)
self.driver.implicitly_wait(int(os.getenv("WEBDRIVER_DELAY_BEFORE_CONTENT_READY", 5))) self.driver.implicitly_wait(int(os.getenv("WEBDRIVER_DELAY_BEFORE_CONTENT_READY", 5)))
if self.webdriver_js_execute_code is not None:
self.driver.execute_script(self.webdriver_js_execute_code)
# Selenium doesn't automatically wait for actions as good as Playwright, so wait again
self.driver.implicitly_wait(int(os.getenv("WEBDRIVER_DELAY_BEFORE_CONTENT_READY", 5)))
self.screenshot = self.driver.get_screenshot_as_png() self.screenshot = self.driver.get_screenshot_as_png()
# @todo - how to check this? is it possible? # @todo - how to check this? is it possible?

View File

@@ -1,5 +1,4 @@
import hashlib import hashlib
import logging
import os import os
import re import re
import time import time
@@ -11,7 +10,6 @@ urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
# Some common stuff here that can be moved to a base class # Some common stuff here that can be moved to a base class
# (set_proxy_from_list)
class perform_site_check(): class perform_site_check():
def __init__(self, *args, datastore, **kwargs): def __init__(self, *args, datastore, **kwargs):
@@ -46,20 +44,6 @@ class perform_site_check():
return proxy_args return proxy_args
# Doesn't look like python supports forward slash auto enclosure in re.findall
# So convert it to inline flag "foobar(?i)" type configuration
def forward_slash_enclosed_regex_to_options(self, regex):
res = re.search(r'^/(.*?)/(\w+)$', regex, re.IGNORECASE)
if res:
regex = res.group(1)
regex += '(?{})'.format(res.group(2))
else:
regex += '(?{})'.format('i')
return regex
def run(self, uuid): def run(self, uuid):
timestamp = int(time.time()) # used for storage etc too timestamp = int(time.time()) # used for storage etc too
@@ -121,9 +105,6 @@ class perform_site_check():
elif system_webdriver_delay is not None: elif system_webdriver_delay is not None:
fetcher.render_extract_delay = system_webdriver_delay fetcher.render_extract_delay = system_webdriver_delay
if watch['webdriver_js_execute_code'] is not None and watch['webdriver_js_execute_code'].strip():
fetcher.webdriver_js_execute_code = watch['webdriver_js_execute_code']
fetcher.run(url, timeout, request_headers, request_body, request_method, ignore_status_code, watch['css_filter']) fetcher.run(url, timeout, request_headers, request_body, request_method, ignore_status_code, watch['css_filter'])
fetcher.quit() fetcher.quit()
@@ -165,9 +146,7 @@ class perform_site_check():
is_html = False is_html = False
if is_html or is_source: if is_html or is_source:
# CSS Filter, extract the HTML that matches and feed that into the existing inscriptis::get_text # CSS Filter, extract the HTML that matches and feed that into the existing inscriptis::get_text
fetcher.content = html_tools.workarounds_for_obfuscations(fetcher.content)
html_content = fetcher.content html_content = fetcher.content
# If not JSON, and if it's not text/plain.. # If not JSON, and if it's not text/plain..
@@ -230,27 +209,15 @@ class perform_site_check():
if len(extract_text) > 0: if len(extract_text) > 0:
regex_matched_output = [] regex_matched_output = []
for s_re in extract_text: for s_re in extract_text:
# incase they specified something in '/.../x' result = re.findall(s_re.encode('utf8'), stripped_text_from_html,
regex = self.forward_slash_enclosed_regex_to_options(s_re) flags=re.MULTILINE | re.DOTALL | re.LOCALE)
result = re.findall(regex.encode('utf-8'), stripped_text_from_html) if result:
regex_matched_output.append(result[0])
for l in result:
if type(l) is tuple:
#@todo - some formatter option default (between groups)
regex_matched_output += list(l) + [b'\n']
else:
# @todo - some formatter option default (between each ungrouped result)
regex_matched_output += [l] + [b'\n']
# Now we will only show what the regex matched
stripped_text_from_html = b''
text_content_before_ignored_filter = b''
if regex_matched_output: if regex_matched_output:
# @todo some formatter for presentation? stripped_text_from_html = b'\n'.join(regex_matched_output)
stripped_text_from_html = b''.join(regex_matched_output)
text_content_before_ignored_filter = stripped_text_from_html text_content_before_ignored_filter = stripped_text_from_html
# Re #133 - if we should strip whitespaces from triggering the change detected comparison # Re #133 - if we should strip whitespaces from triggering the change detected comparison
if self.datastore.data['settings']['application'].get('ignore_whitespace', False): if self.datastore.data['settings']['application'].get('ignore_whitespace', False):
fetched_md5 = hashlib.md5(stripped_text_from_html.translate(None, b'\r\n\t ')).hexdigest() fetched_md5 = hashlib.md5(stripped_text_from_html.translate(None, b'\r\n\t ')).hexdigest()
@@ -288,6 +255,9 @@ class perform_site_check():
# Looks like something changed, but did it match all the rules? # Looks like something changed, but did it match all the rules?
if blocked: if blocked:
changed_detected = False changed_detected = False
else:
update_obj["last_changed"] = timestamp
# Extract title as title # Extract title as title
if is_html: if is_html:
@@ -295,16 +265,6 @@ class perform_site_check():
if not watch['title'] or not len(watch['title']): if not watch['title'] or not len(watch['title']):
update_obj['title'] = html_tools.extract_element(find='title', html_content=fetcher.content) update_obj['title'] = html_tools.extract_element(find='title', html_content=fetcher.content)
if changed_detected:
if watch.get('check_unique_lines', False):
has_unique_lines = watch.lines_contain_something_unique_compared_to_history(lines=stripped_text_from_html.splitlines())
# One or more lines? unsure?
if not has_unique_lines:
logging.debug("check_unique_lines: UUID {} didnt have anything new setting change_detected=False".format(uuid))
changed_detected = False
else:
logging.debug("check_unique_lines: UUID {} had unique content".format(uuid))
# Always record the new checksum # Always record the new checksum
update_obj["previous_md5"] = fetched_md5 update_obj["previous_md5"] = fetched_md5

View File

@@ -340,17 +340,12 @@ class watchForm(commonSettingsForm):
body = TextAreaField('Request body', [validators.Optional()]) body = TextAreaField('Request body', [validators.Optional()])
method = SelectField('Request method', choices=valid_method, default=default_method) method = SelectField('Request method', choices=valid_method, default=default_method)
ignore_status_codes = BooleanField('Ignore status codes (process non-2xx status codes as normal)', default=False) ignore_status_codes = BooleanField('Ignore status codes (process non-2xx status codes as normal)', default=False)
check_unique_lines = BooleanField('Only trigger when new lines appear', default=False)
trigger_text = StringListField('Trigger/wait for text', [validators.Optional(), ValidateListRegex()]) trigger_text = StringListField('Trigger/wait for text', [validators.Optional(), ValidateListRegex()])
text_should_not_be_present = StringListField('Block change-detection if text matches', [validators.Optional(), ValidateListRegex()]) text_should_not_be_present = StringListField('Block change-detection if text matches', [validators.Optional(), ValidateListRegex()])
webdriver_js_execute_code = TextAreaField('Execute JavaScript before change detection', render_kw={"rows": "5"}, validators=[validators.Optional()])
save_button = SubmitField('Save', render_kw={"class": "pure-button pure-button-primary"}) save_button = SubmitField('Save', render_kw={"class": "pure-button pure-button-primary"})
save_and_preview_button = SubmitField('Save & Preview', render_kw={"class": "pure-button pure-button-primary"}) save_and_preview_button = SubmitField('Save & Preview', render_kw={"class": "pure-button pure-button-primary"})
proxy = RadioField('Proxy') proxy = RadioField('Proxy')
filter_failure_notification_send = BooleanField(
'Send a notification when the filter can no longer be found on the page', default=False)
def validate(self, **kwargs): def validate(self, **kwargs):
if not super().validate(): if not super().validate():
@@ -389,11 +384,6 @@ class globalSettingsApplicationForm(commonSettingsForm):
api_access_token_enabled = BooleanField('API access token security check enabled', default=True, validators=[validators.Optional()]) api_access_token_enabled = BooleanField('API access token security check enabled', default=True, validators=[validators.Optional()])
password = SaltyPasswordField() password = SaltyPasswordField()
filter_failure_notification_threshold_attempts = IntegerField('Number of times the filter can be missing before sending a notification',
render_kw={"style": "width: 5em;"},
validators=[validators.NumberRange(min=0,
message="Should contain zero or more attempts")])
class globalSettingsForm(Form): class globalSettingsForm(Form):
# Define these as FormFields/"sub forms", this way it matches the JSON storage # Define these as FormFields/"sub forms", this way it matches the JSON storage

View File

@@ -1,27 +1,23 @@
import json import json
import re
from typing import List from typing import List
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
from jsonpath_ng.ext import parse from jsonpath_ng.ext import parse
import re import re
from inscriptis import get_text
from inscriptis.model.config import ParserConfig
class FilterNotFoundInResponse(ValueError):
def __init__(self, msg):
ValueError.__init__(self, msg)
class JSONNotFound(ValueError): class JSONNotFound(ValueError):
def __init__(self, msg): def __init__(self, msg):
ValueError.__init__(self, msg) ValueError.__init__(self, msg)
# Given a CSS Rule, and a blob of HTML, return the blob of HTML that matches # Given a CSS Rule, and a blob of HTML, return the blob of HTML that matches
def css_filter(css_filter, html_content): def css_filter(css_filter, html_content):
soup = BeautifulSoup(html_content, "html.parser") soup = BeautifulSoup(html_content, "html.parser")
html_block = "" html_block = ""
r = soup.select(css_filter, separator="") for item in soup.select(css_filter, separator=""):
if len(html_content) > 0 and len(r) == 0:
raise FilterNotFoundInResponse(css_filter)
for item in r:
html_block += str(item) html_block += str(item)
return html_block + "\n" return html_block + "\n"
@@ -46,12 +42,8 @@ def xpath_filter(xpath_filter, html_content):
tree = html.fromstring(bytes(html_content, encoding='utf-8')) tree = html.fromstring(bytes(html_content, encoding='utf-8'))
html_block = "" html_block = ""
r = tree.xpath(xpath_filter.strip(), namespaces={'re': 'http://exslt.org/regular-expressions'}) for item in tree.xpath(xpath_filter.strip(), namespaces={'re':'http://exslt.org/regular-expressions'}):
if len(html_content) > 0 and len(r) == 0: html_block+= etree.tostring(item, pretty_print=True).decode('utf-8')+"<br/>"
raise FilterNotFoundInResponse(xpath_filter)
for item in r:
html_block += etree.tostring(item, pretty_print=True).decode('utf-8') + "<br/>"
return html_block return html_block
@@ -181,17 +173,10 @@ def strip_ignore_text(content, wordlist, mode="content"):
def html_to_text(html_content: str, render_anchor_tag_content=False) -> str: def html_to_text(html_content: str, render_anchor_tag_content=False) -> str:
import multiprocessing
from inscriptis.model.config import ParserConfig
"""Converts html string to a string with just the text. If ignoring """Converts html string to a string with just the text. If ignoring
rendering anchor tag content is enable, anchor tag content are also rendering anchor tag content is enable, anchor tag content are also
included in the text included in the text
@NOTE: HORRIBLE LXML INDUCED MEMORY LEAK WORKAROUND HERE
https://www.reddit.com/r/Python/comments/j0gl8t/psa_pythonlxml_memory_leaks_and_a_solution/
:param html_content: string with html content :param html_content: string with html content
:param render_anchor_tag_content: boolean flag indicating whether to extract :param render_anchor_tag_content: boolean flag indicating whether to extract
hyperlinks (the anchor tag content) together with text. This refers to the hyperlinks (the anchor tag content) together with text. This refers to the
@@ -212,33 +197,8 @@ def html_to_text(html_content: str, render_anchor_tag_content=False) -> str:
else: else:
parser_config = None parser_config = None
def parse_function(html_content, parser_config, results_queue):
from inscriptis import get_text
# get text and annotations via inscriptis # get text and annotations via inscriptis
text_content = get_text(html_content, config=parser_config) text_content = get_text(html_content, config=parser_config)
results_queue.put(text_content)
results_queue = multiprocessing.Queue()
parse_process = multiprocessing.Process(target=parse_function, args=(html_content, parser_config, results_queue))
parse_process.daemon = True
parse_process.start()
text_content = results_queue.get() # blocks until results are available
parse_process.terminate()
return text_content return text_content
def workarounds_for_obfuscations(content):
"""
Some sites are using sneaky tactics to make prices and other information un-renderable by Inscriptis
This could go into its own Pip package in the future, for faster updates
"""
# HomeDepot.com style <span>$<!-- -->90<!-- -->.<!-- -->74</span>
# https://github.com/weblyzard/inscriptis/issues/45
if not content:
return content
content = re.sub('<!--\s+-->', '', content)
return content

View File

@@ -1,28 +1,30 @@
from os import getenv import collections
import os
import uuid as uuid_builder
from changedetectionio.notification import ( from changedetectionio.notification import (
default_notification_body, default_notification_body,
default_notification_format, default_notification_format,
default_notification_title, default_notification_title,
) )
_FILTER_FAILURE_THRESHOLD_ATTEMPTS_DEFAULT = 6
class model(dict): class model(dict):
base_config = { base_config = {
'note': "Hello! If you change this file manually, please be sure to restart your changedetection.io instance!", 'note': "Hello! If you change this file manually, please be sure to restart your changedetection.io instance!",
'watching': {}, 'watching': {},
'settings': { 'settings': {
'headers': { 'headers': {
'User-Agent': getenv("DEFAULT_SETTINGS_HEADERS_USERAGENT", 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.66 Safari/537.36'), 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.66 Safari/537.36',
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9', 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
'Accept-Encoding': 'gzip, deflate', # No support for brolti in python requests yet. 'Accept-Encoding': 'gzip, deflate', # No support for brolti in python requests yet.
'Accept-Language': 'en-GB,en-US;q=0.9,en;' 'Accept-Language': 'en-GB,en-US;q=0.9,en;'
}, },
'requests': { 'requests': {
'timeout': int(getenv("DEFAULT_SETTINGS_REQUESTS_TIMEOUT", "45")), # Default 45 seconds 'timeout': 15, # Default 15 seconds
'time_between_check': {'weeks': None, 'days': None, 'hours': 3, 'minutes': None, 'seconds': None}, 'time_between_check': {'weeks': None, 'days': None, 'hours': 3, 'minutes': None, 'seconds': None},
'jitter_seconds': 0, 'jitter_seconds': 0,
'workers': int(getenv("DEFAULT_SETTINGS_REQUESTS_WORKERS", "10")), # Number of threads, lower is better for slow connections 'workers': 10, # Number of threads, lower is better for slow connections
'proxy': None # Preferred proxy connection 'proxy': None # Preferred proxy connection
}, },
'application': { 'application': {
@@ -31,8 +33,7 @@ class model(dict):
'base_url' : None, 'base_url' : None,
'extract_title_as_title': False, 'extract_title_as_title': False,
'empty_pages_are_a_change': False, 'empty_pages_are_a_change': False,
'fetch_backend': getenv("DEFAULT_FETCH_BACKEND", "html_requests"), 'fetch_backend': os.getenv("DEFAULT_FETCH_BACKEND", "html_requests"),
'filter_failure_notification_threshold_attempts': _FILTER_FAILURE_THRESHOLD_ATTEMPTS_DEFAULT,
'global_ignore_text': [], # List of text to ignore when calculating the comparison checksum 'global_ignore_text': [], # List of text to ignore when calculating the comparison checksum
'global_subtractive_selectors': [], 'global_subtractive_selectors': [],
'ignore_whitespace': True, 'ignore_whitespace': True,

View File

@@ -1,9 +1,7 @@
import os import os
import uuid as uuid_builder import uuid as uuid_builder
from distutils.util import strtobool
minimum_seconds_recheck_time = int(os.getenv('MINIMUM_SECONDS_RECHECK_TIME', 60)) minimum_seconds_recheck_time = int(os.getenv('MINIMUM_SECONDS_RECHECK_TIME', 60))
mtable = {'seconds': 1, 'minutes': 60, 'hours': 3600, 'days': 86400, 'weeks': 86400 * 7}
from changedetectionio.notification import ( from changedetectionio.notification import (
default_notification_body, default_notification_body,
@@ -42,20 +40,16 @@ class model(dict):
'trigger_text': [], # List of text or regex to wait for until a change is detected 'trigger_text': [], # List of text or regex to wait for until a change is detected
'text_should_not_be_present': [], # Text that should not present 'text_should_not_be_present': [], # Text that should not present
'fetch_backend': None, 'fetch_backend': None,
'filter_failure_notification_send': strtobool(os.getenv('FILTER_FAILURE_NOTIFICATION_SEND_DEFAULT', 'True')),
'consecutive_filter_failures': 0, # Every time the CSS/xPath filter cannot be located, reset when all is fine.
'extract_title_as_title': False, 'extract_title_as_title': False,
'check_unique_lines': False, # On change-detected, compare against all history if its something new
'proxy': None, # Preferred proxy connection 'proxy': None, # Preferred proxy connection
# Re #110, so then if this is set to None, we know to use the default value instead # Re #110, so then if this is set to None, we know to use the default value instead
# Requires setting to None on submit if it's the same as the default # Requires setting to None on submit if it's the same as the default
# Should be all None by default, so we use the system default in this case. # Should be all None by default, so we use the system default in this case.
'time_between_check': {'weeks': None, 'days': None, 'hours': None, 'minutes': None, 'seconds': None}, 'time_between_check': {'weeks': None, 'days': None, 'hours': None, 'minutes': None, 'seconds': None},
'webdriver_delay': None, 'webdriver_delay': None
'webdriver_js_execute_code': None, # Run before change-detection
} }
jitter_seconds = 0 jitter_seconds = 0
mtable = {'seconds': 1, 'minutes': 60, 'hours': 3600, 'days': 86400, 'weeks': 86400 * 7}
def __init__(self, *arg, **kw): def __init__(self, *arg, **kw):
import uuid import uuid
self.update(self.__base_config) self.update(self.__base_config)
@@ -164,21 +158,8 @@ class model(dict):
def threshold_seconds(self): def threshold_seconds(self):
seconds = 0 seconds = 0
for m, n in mtable.items(): for m, n in self.mtable.items():
x = self.get('time_between_check', {}).get(m, None) x = self.get('time_between_check', {}).get(m, None)
if x: if x:
seconds += x * n seconds += x * n
return seconds return seconds
# Iterate over all history texts and see if something new exists
def lines_contain_something_unique_compared_to_history(self, lines=[]):
local_lines = [l.decode('utf-8').strip().lower() for l in lines]
# Compare each lines (set) against each history text file (set) looking for something new..
for k, v in self.history.items():
alist = [line.decode('utf-8').strip().lower() for line in open(v, 'rb')]
res = set(alist) != set(local_lines)
if res:
return True
return False

View File

@@ -34,6 +34,7 @@ def process_notification(n_object, datastore):
valid_notification_formats[default_notification_format], valid_notification_formats[default_notification_format],
) )
# Insert variables into the notification content # Insert variables into the notification content
notification_parameters = create_notification_parameters(n_object, datastore) notification_parameters = create_notification_parameters(n_object, datastore)
@@ -48,9 +49,9 @@ def process_notification(n_object, datastore):
# raise it as an exception # raise it as an exception
apobjs=[] apobjs=[]
sent_objs=[] sent_objs=[]
from .apprise_asset import asset
for url in n_object['notification_urls']: for url in n_object['notification_urls']:
apobj = apprise.Apprise(debug=True, asset=asset)
apobj = apprise.Apprise(debug=True)
url = url.strip() url = url.strip()
if len(url): if len(url):
print(">> Process Notification: AppRise notifying {}".format(url)) print(">> Process Notification: AppRise notifying {}".format(url))
@@ -63,7 +64,7 @@ def process_notification(n_object, datastore):
# So if no avatar_url is specified, add one so it can be correctly calculated into the total payload # So if no avatar_url is specified, add one so it can be correctly calculated into the total payload
k = '?' if not '?' in url else '&' k = '?' if not '?' in url else '&'
if not 'avatar_url' in url and not url.startswith('mail'): if not 'avatar_url' in url:
url += k + 'avatar_url=https://raw.githubusercontent.com/dgtlmoon/changedetection.io/master/changedetectionio/static/images/avatar-256x256.png' url += k + 'avatar_url=https://raw.githubusercontent.com/dgtlmoon/changedetection.io/master/changedetectionio/static/images/avatar-256x256.png'
if url.startswith('tgram://'): if url.startswith('tgram://'):
@@ -78,21 +79,13 @@ def process_notification(n_object, datastore):
n_title = n_title[0:payload_max_size] n_title = n_title[0:payload_max_size]
n_body = n_body[0:body_limit] n_body = n_body[0:body_limit]
elif url.startswith('discord://') or url.startswith('https://discordapp.com/api/webhooks'): elif url.startswith('discord://'):
# real limit is 2000, but minus some for extra metadata # real limit is 2000, but minus some for extra metadata
payload_max_size = 1700 payload_max_size = 1700
body_limit = max(0, payload_max_size - len(n_title)) body_limit = max(0, payload_max_size - len(n_title))
n_title = n_title[0:payload_max_size] n_title = n_title[0:payload_max_size]
n_body = n_body[0:body_limit] n_body = n_body[0:body_limit]
elif url.startswith('mailto'):
# Apprise will default to HTML, so we need to override it
# So that whats' generated in n_body is in line with what is going to be sent.
# https://github.com/caronc/apprise/issues/633#issuecomment-1191449321
if not 'format=' in url and (n_format == 'text' or n_format == 'markdown'):
prefix = '?' if not '?' in url else '&'
url = "{}{}format={}".format(url, prefix, n_format)
apobj.add(url) apobj.add(url)
apobj.notify( apobj.notify(

View File

@@ -1,20 +0,0 @@
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<svg
width="18"
height="19.92"
viewBox="0 0 18 19.92"
version="1.1"
id="svg6"
xmlns="http://www.w3.org/2000/svg"
xmlns:svg="http://www.w3.org/2000/svg">
<defs
id="defs10" />
<path
d="M -3,-2 H 21 V 22 H -3 Z"
fill="none"
id="path2" />
<path
d="m 15,14.08 c -0.76,0 -1.44,0.3 -1.96,0.77 L 5.91,10.7 C 5.96,10.47 6,10.24 6,10 6,9.76 5.96,9.53 5.91,9.3 L 12.96,5.19 C 13.5,5.69 14.21,6 15,6 16.66,6 18,4.66 18,3 18,1.34 16.66,0 15,0 c -1.66,0 -3,1.34 -3,3 0,0.24 0.04,0.47 0.09,0.7 L 5.04,7.81 C 4.5,7.31 3.79,7 3,7 1.34,7 0,8.34 0,10 c 0,1.66 1.34,3 3,3 0.79,0 1.5,-0.31 2.04,-0.81 l 7.12,4.16 c -0.05,0.21 -0.08,0.43 -0.08,0.65 0,1.61 1.31,2.92 2.92,2.92 1.61,0 2.92,-1.31 2.92,-2.92 0,-1.61 -1.31,-2.92 -2.92,-2.92 z"
id="path4"
style="fill:#ffffff;fill-opacity:1" />
</svg>

Before

Width:  |  Height:  |  Size: 892 B

View File

@@ -1,30 +1,13 @@
$(document).ready(function() { $(document).ready(function() {
function toggle() { function toggle() {
if ($('input[name="fetch_backend"]:checked').val() == 'html_webdriver') { if ($('input[name="fetch_backend"]:checked').val() != 'html_requests') {
if(playwright_enabled) {
// playwright supports headers, so hide everything else
// See #664
$('#requests-override-options #request-method').hide();
$('#requests-override-options #request-body').hide();
// @todo connect this one up
$('#ignore-status-codes-option').hide();
} else {
// selenium/webdriver doesnt support anything afaik, hide it all
$('#requests-override-options').hide(); $('#requests-override-options').hide();
}
$('#webdriver-override-options').show(); $('#webdriver-override-options').show();
} else { } else {
$('#requests-override-options').show(); $('#requests-override-options').show();
$('#requests-override-options *:hidden').show();
$('#webdriver-override-options').hide(); $('#webdriver-override-options').hide();
} }
} }
$('input[name="fetch_backend"]').click(function (e) { $('input[name="fetch_backend"]').click(function (e) {
toggle(); toggle();
}); });

View File

@@ -1,3 +1 @@
node_modules node_modules
package-lock.json

File diff suppressed because it is too large Load Diff

View File

@@ -158,7 +158,8 @@ class ChangeDetectionStore:
@property @property
def threshold_seconds(self): def threshold_seconds(self):
seconds = 0 seconds = 0
for m, n in Watch.mtable.items(): mtable = {'seconds': 1, 'minutes': 60, 'hours': 3600, 'days': 86400, 'weeks': 86400 * 7}
for m, n in mtable.items():
x = self.__data['settings']['requests']['time_between_check'].get(m) x = self.__data['settings']['requests']['time_between_check'].get(m)
if x: if x:
seconds += x * n seconds += x * n
@@ -249,7 +250,7 @@ class ChangeDetectionStore:
return self.data['watching'][uuid].get(val) return self.data['watching'][uuid].get(val)
# Remove a watchs data but keep the entry (URL etc) # Remove a watchs data but keep the entry (URL etc)
def clear_watch_history(self, uuid): def scrub_watch(self, uuid):
import pathlib import pathlib
self.__data['watching'][uuid].update( self.__data['watching'][uuid].update(
@@ -297,8 +298,7 @@ class ChangeDetectionStore:
'ignore_text', 'css_filter', 'ignore_text', 'css_filter',
'subtractive_selectors', 'trigger_text', 'subtractive_selectors', 'trigger_text',
'extract_title_as_title', 'extract_text', 'extract_title_as_title', 'extract_text',
'text_should_not_be_present', 'text_should_not_be_present']:
'webdriver_js_execute_code']:
if res.get(k): if res.get(k):
apply_extras[k] = res[k] apply_extras[k] = res[k]
@@ -518,11 +518,3 @@ class ChangeDetectionStore:
# But we should set it back to a empty dict so we don't break if this schema runs on an earlier version. # But we should set it back to a empty dict so we don't break if this schema runs on an earlier version.
# In the distant future we can remove this entirely # In the distant future we can remove this entirely
self.data['watching'][uuid]['history'] = {} self.data['watching'][uuid]['history'] = {}
# We incorrectly stored last_changed when there was not a change, and then confused the output list table
def update_3(self):
for uuid, watch in self.data['watching'].items():
# Be sure it's recalculated
p = watch.history
if watch.history_n < 2:
watch['last_changed'] = 0

View File

@@ -22,7 +22,7 @@
{% if versions|length >= 1 %} {% if versions|length >= 1 %}
<label for="diff-version">Compare newest (<span id="current-v-date"></span>) with</label> <label for="diff-version">Compare newest (<span id="current-v-date"></span>) with</label>
<select id="diff-version" name="previous_version"> <select id="diff-version" name="previous_version">
{% for version in versions|reverse %} {% for version in versions %}
<option value="{{version}}" {% if version== current_previous_version %} selected="" {% endif %}> <option value="{{version}}" {% if version== current_previous_version %} selected="" {% endif %}>
{{version}} {{version}}
</option> </option>

View File

@@ -7,7 +7,6 @@
const notification_base_url="{{url_for('ajax_callback_send_notification_test')}}"; const notification_base_url="{{url_for('ajax_callback_send_notification_test')}}";
const watch_visual_selector_data_url="{{url_for('static_content', group='visual_selector_data', filename=uuid)}}"; const watch_visual_selector_data_url="{{url_for('static_content', group='visual_selector_data', filename=uuid)}}";
const screenshot_url="{{url_for('static_content', group='screenshot', filename=uuid)}}"; const screenshot_url="{{url_for('static_content', group='screenshot', filename=uuid)}}";
const playwright_enabled={% if playwright_enabled %} true {% else %} false {% endif %};
{% if emailprefix %} {% if emailprefix %}
const email_notification_prefix=JSON.parse('{{ emailprefix|tojson }}'); const email_notification_prefix=JSON.parse('{{ emailprefix|tojson }}');
@@ -25,7 +24,7 @@
<ul> <ul>
<li class="tab" id="default-tab"><a href="#general">General</a></li> <li class="tab" id="default-tab"><a href="#general">General</a></li>
<li class="tab"><a href="#request">Request</a></li> <li class="tab"><a href="#request">Request</a></li>
<li class="tab"><a id="visualselector-tab" href="#visualselector">Visual Filter Selector</a></li> <li class="tab"><a id="visualselector-tab" href="#visualselector">Visual Selector</a></li>
<li class="tab"><a href="#filters-and-triggers">Filters &amp; Triggers</a></li> <li class="tab"><a href="#filters-and-triggers">Filters &amp; Triggers</a></li>
<li class="tab"><a href="#notifications">Notifications</a></li> <li class="tab"><a href="#notifications">Notifications</a></li>
</ul> </ul>
@@ -62,12 +61,6 @@
<div class="pure-control-group"> <div class="pure-control-group">
{{ render_checkbox_field(form.extract_title_as_title) }} {{ render_checkbox_field(form.extract_title_as_title) }}
</div> </div>
<div class="pure-control-group">
{{ render_checkbox_field(form.filter_failure_notification_send) }}
<span class="pure-form-message-inline">
Sends a notification when the filter can no longer be seen on the page, good for knowing when the page changed and your filter will not work anymore.
</span>
</div>
</fieldset> </fieldset>
</div> </div>
@@ -88,39 +81,33 @@
</div> </div>
{% endif %} {% endif %}
<fieldset id="webdriver-override-options"> <fieldset id="webdriver-override-options">
<div class="pure-control-group">
{{ render_field(form.webdriver_delay) }}
<div class="pure-form-message-inline"> <div class="pure-form-message-inline">
<strong>If you're having trouble waiting for the page to be fully rendered (text missing etc), try increasing the 'wait' time here.</strong> <strong>If you're having trouble waiting for the page to be fully rendered (text missing etc), try increasing the 'wait' time here.</strong>
<br/> <br/>
This will wait <i>n</i> seconds before extracting the text. This will wait <i>n</i> seconds before extracting the text.
{% if using_global_webdriver_wait %}
<br/><strong>Using the current global default settings</strong>
{% endif %}
</div>
</div> </div>
<div class="pure-control-group"> <div class="pure-control-group">
{{ render_field(form.webdriver_js_execute_code) }} {{ render_field(form.webdriver_delay) }}
</div>
{% if using_global_webdriver_wait %}
<div class="pure-form-message-inline"> <div class="pure-form-message-inline">
Run this code before performing change detection, handy for filling in fields and other actions <a href="https://github.com/dgtlmoon/changedetection.io/wiki/Run-JavaScript-before-change-detection">More help and examples here</a> <strong>Using the current global default settings</strong>
</div>
</div> </div>
{% endif %}
</fieldset> </fieldset>
<fieldset class="pure-group" id="requests-override-options"> <fieldset class="pure-group" id="requests-override-options">
{% if not playwright_enabled %}
<div class="pure-form-message-inline"> <div class="pure-form-message-inline">
<strong>Request override is currently only used by the <i>Basic fast Plaintext/HTTP Client</i> method.</strong> <strong>Request override is currently only used by the <i>Basic fast Plaintext/HTTP Client</i> method.</strong>
</div> </div>
{% endif %} <div class="pure-control-group">
<div class="pure-control-group" id="request-method">
{{ render_field(form.method) }} {{ render_field(form.method) }}
</div> </div>
<div class="pure-control-group" id="request-headers"> <div class="pure-control-group">
{{ render_field(form.headers, rows=5, placeholder="Example {{ render_field(form.headers, rows=5, placeholder="Example
Cookie: foobar Cookie: foobar
User-Agent: wonderbra 1.0") }} User-Agent: wonderbra 1.0") }}
</div> </div>
<div class="pure-control-group" id="request-body"> <div class="pure-control-group">
{{ render_field(form.body, rows=5, placeholder="Example {{ render_field(form.body, rows=5, placeholder="Example
{ {
\"name\":\"John\", \"name\":\"John\",
@@ -128,7 +115,7 @@ User-Agent: wonderbra 1.0") }}
\"car\":null \"car\":null
}") }} }") }}
</div> </div>
<div id="ignore-status-codes-option"> <div>
{{ render_checkbox_field(form.ignore_status_codes) }} {{ render_checkbox_field(form.ignore_status_codes) }}
</div> </div>
</fieldset> </fieldset>
@@ -156,12 +143,6 @@ User-Agent: wonderbra 1.0") }}
</li> </li>
</ul> </ul>
</div> </div>
<fieldset>
<div class="pure-control-group">
{{ render_checkbox_field(form.check_unique_lines) }}
<span class="pure-form-message-inline">Good for websites that just move the content around, and you want to know when NEW content is added, compares new lines against all history for this watch.</span>
</div>
</fieldset>
<div class="pure-control-group"> <div class="pure-control-group">
{{ render_field(form.css_filter, placeholder=".class-name or #some-id, or other CSS selector rule.", {{ render_field(form.css_filter, placeholder=".class-name or #some-id, or other CSS selector rule.",
class="m-d") }} class="m-d") }}
@@ -196,7 +177,7 @@ nav
<span class="pure-form-message-inline"> <span class="pure-form-message-inline">
<ul> <ul>
<li>Each line processed separately, any line matching will be ignored (removed before creating the checksum)</li> <li>Each line processed separately, any line matching will be ignored (removed before creating the checksum)</li>
<li>Regular Expression support, wrap the entire line in forward slash <code>/regex/</code></li> <li>Regular Expression support, wrap the line in forward slash <code>/regex/</code></li>
<li>Changing this will affect the comparison checksum which may trigger an alert</li> <li>Changing this will affect the comparison checksum which may trigger an alert</li>
<li>Use the preview/show current tab to see ignores</li> <li>Use the preview/show current tab to see ignores</li>
</ul> </ul>
@@ -239,15 +220,8 @@ Unavailable") }}
{{ render_field(form.extract_text, rows=5, placeholder="\d+ online") }} {{ render_field(form.extract_text, rows=5, placeholder="\d+ online") }}
<span class="pure-form-message-inline"> <span class="pure-form-message-inline">
<ul> <ul>
<li>Extracts text in the final output (line by line) after other filters using regular expressions; <li>Extracts text in the final output after other filters using regular expressions, for example <code>\d+ online</code></li>
<ul> <li>One line per regular-expression.</li>
<li>Regular expression &dash; example <code>/reports.+?2022/i</code></li>
<li>Use <code>//(?aiLmsux))</code> type flags (more <a href="https://docs.python.org/3/library/re.html#index-15">information here</a>)<br/></li>
<li>Keyword example &dash; example <code>Out of stock</code></li>
<li>Use groups to extract just that text &dash; example <code>/reports.+?(\d+)/i</code> returns a list of years only</li>
</ul>
</li>
<li>One line per regular-expression/ string match</li>
</ul> </ul>
</span> </span>
</div> </div>
@@ -256,7 +230,7 @@ Unavailable") }}
<div class="tab-pane-inner visual-selector-ui" id="visualselector"> <div class="tab-pane-inner visual-selector-ui" id="visualselector">
<img id="beta-logo" src="{{url_for('static_content', group='images', filename='beta-logo.png')}}"> <img id="beta-logo" src="{{url_for('static_content', group='images', filename='beta-logo.png')}}">
<strong>Pro-tip:</strong> This tool is only for limiting which elements will be included on a change-detection, not for interacting with browser directly.
<fieldset> <fieldset>
<div class="pure-control-group"> <div class="pure-control-group">
{% if visualselector_enabled %} {% if visualselector_enabled %}
@@ -301,8 +275,8 @@ Unavailable") }}
<a href="{{url_for('form_delete', uuid=uuid)}}" <a href="{{url_for('form_delete', uuid=uuid)}}"
class="pure-button button-small button-error ">Delete</a> class="pure-button button-small button-error ">Delete</a>
<a href="{{url_for('clear_watch_history', uuid=uuid)}}" <a href="{{url_for('scrub_watch', uuid=uuid)}}"
class="pure-button button-small button-error ">Clear History</a> class="pure-button button-small button-error ">Scrub</a>
<a href="{{url_for('form_clone', uuid=uuid)}}" <a href="{{url_for('form_clone', uuid=uuid)}}"
class="pure-button button-small ">Create Copy</a> class="pure-button button-small ">Create Copy</a>
</div> </div>

View File

@@ -3,22 +3,22 @@
{% block content %} {% block content %}
<div class="edit-form"> <div class="edit-form">
<div class="box-wrap inner"> <div class="box-wrap inner">
<form class="pure-form pure-form-stacked" action="{{url_for('clear_all_history')}}" method="POST"> <form class="pure-form pure-form-stacked" action="{{url_for('scrub_page')}}" method="POST">
<input type="hidden" name="csrf_token" value="{{ csrf_token() }}"/> <input type="hidden" name="csrf_token" value="{{ csrf_token() }}"/>
<fieldset> <fieldset>
<div class="pure-control-group"> <div class="pure-control-group">
This will remove version history (snapshots) for ALL watches, but keep your list of URLs! <br/> This will remove ALL version snapshots/data, but keep your list of URLs. <br/>
You may like to use the <strong>BACKUP</strong> link first.<br/> You may like to use the <strong>BACKUP</strong> link first.<br/>
</div> </div>
<br/> <br/>
<div class="pure-control-group"> <div class="pure-control-group">
<label for="confirmtext">Confirmation text</label> <label for="confirmtext">Confirmation text</label>
<input type="text" id="confirmtext" required="" name="confirmtext" value="" size="10"/> <input type="text" id="confirmtext" required="" name="confirmtext" value="" size="10"/>
<span class="pure-form-message-inline">Type in the word <strong>clear</strong> to confirm that you understand.</span> <span class="pure-form-message-inline">Type in the word <strong>scrub</strong> to confirm that you understand!</span>
</div> </div>
<br/> <br/>
<div class="pure-control-group"> <div class="pure-control-group">
<button type="submit" class="pure-button pure-button-primary">Clear History!</button> <button type="submit" class="pure-button pure-button-primary">Scrub!</button>
</div> </div>
<br/> <br/>
<div class="pure-control-group"> <div class="pure-control-group">

View File

@@ -36,13 +36,7 @@
{{ render_field(form.requests.form.jitter_seconds, class="jitter_seconds") }} {{ render_field(form.requests.form.jitter_seconds, class="jitter_seconds") }}
<span class="pure-form-message-inline">Example - 3 seconds random jitter could trigger up to 3 seconds earlier or up to 3 seconds later</span> <span class="pure-form-message-inline">Example - 3 seconds random jitter could trigger up to 3 seconds earlier or up to 3 seconds later</span>
</div> </div>
<div class="pure-control-group">
{{ render_field(form.application.form.filter_failure_notification_threshold_attempts, class="filter_failure_notification_threshold_attempts") }}
<span class="pure-form-message-inline">After this many consecutive times that the CSS/xPath filter is missing, send a notification
<br/>
Set to <strong>0</strong> to disable
</span>
</div>
<div class="pure-control-group"> <div class="pure-control-group">
{% if not hide_remove_pass %} {% if not hide_remove_pass %}
{% if current_user.is_authenticated %} {% if current_user.is_authenticated %}
@@ -154,7 +148,7 @@ nav
<ul> <ul>
<li>Note: This is applied globally in addition to the per-watch rules.</li> <li>Note: This is applied globally in addition to the per-watch rules.</li>
<li>Each line processed separately, any line matching will be ignored (removed before creating the checksum)</li> <li>Each line processed separately, any line matching will be ignored (removed before creating the checksum)</li>
<li>Regular Expression support, wrap the entire line in forward slash <code>/regex/</code></li> <li>Regular Expression support, wrap the line in forward slash <code>/regex/</code></li>
<li>Changing this will affect the comparison checksum which may trigger an alert</li> <li>Changing this will affect the comparison checksum which may trigger an alert</li>
<li>Use the preview/show current tab to see ignores</li> <li>Use the preview/show current tab to see ignores</li>
</ul> </ul>
@@ -179,7 +173,7 @@ nav
<div class="pure-control-group"> <div class="pure-control-group">
{{ render_button(form.save_button) }} {{ render_button(form.save_button) }}
<a href="{{url_for('index')}}" class="pure-button button-small button-cancel">Back</a> <a href="{{url_for('index')}}" class="pure-button button-small button-cancel">Back</a>
<a href="{{url_for('clear_all_history')}}" class="pure-button button-small button-cancel">Clear Snapshot History</a> <a href="{{url_for('scrub_page')}}" class="pure-button button-small button-cancel">Delete History Snapshot Data</a>
</div> </div>
</div> </div>

View File

@@ -14,7 +14,7 @@
{{ render_simple_field(form.tag, value=active_tag if active_tag else '', placeholder="watch group") }} {{ render_simple_field(form.tag, value=active_tag if active_tag else '', placeholder="watch group") }}
<button type="submit" class="pure-button pure-button-primary">Watch</button> <button type="submit" class="pure-button pure-button-primary">Watch</button>
</fieldset> </fieldset>
<span style="color:#eee; font-size: 80%;"><img style="height: 1em;display:inline-block;" src="{{url_for('static_content', group='images', filename='spread-white.svg')}}" /> Tip: You can also add 'shared' watches. <a href="https://github.com/dgtlmoon/changedetection.io/wiki/Sharing-a-Watch">More info</a></a></span> <span style="color:#eee; font-size: 80%;"><img style="height: 1em;display:inline-block;" src="{{url_for('static_content', group='images', filename='spread.svg')}}" /> Tip: You can also add 'shared' watches. <a href="https://github.com/dgtlmoon/changedetection.io/wiki/Sharing-a-Watch">More info</a></a></span>
</form> </form>
<div> <div>
<a href="{{url_for('index')}}" class="pure-button button-tag {{'active' if not active_tag }}">All</a> <a href="{{url_for('index')}}" class="pure-button button-tag {{'active' if not active_tag }}">All</a>
@@ -40,7 +40,7 @@
<tbody> <tbody>
{% for watch in watches|sort(attribute='last_changed', reverse=True) %} {% for watch in watches %}
<tr id="{{ watch.uuid }}" <tr id="{{ watch.uuid }}"
class="{{ loop.cycle('pure-table-odd', 'pure-table-even') }} class="{{ loop.cycle('pure-table-odd', 'pure-table-even') }}
{% if watch.last_error is defined and watch.last_error != False %}error{% endif %} {% if watch.last_error is defined and watch.last_error != False %}error{% endif %}
@@ -68,7 +68,7 @@
{% endif %} {% endif %}
</td> </td>
<td class="last-checked">{{watch|format_last_checked_time|safe}}</td> <td class="last-checked">{{watch|format_last_checked_time|safe}}</td>
<td class="last-changed">{% if watch.history_n >=2 and watch.last_changed >0 %} <td class="last-changed">{% if watch.history_n >=2 and watch.last_changed %}
{{watch.last_changed|format_timestamp_timeago}} {{watch.last_changed|format_timestamp_timeago}}
{% else %} {% else %}
Not yet Not yet

View File

@@ -95,8 +95,6 @@ def test_api_simple(client, live_server):
assert watch_uuid in json.loads(res.data).keys() assert watch_uuid in json.loads(res.data).keys()
before_recheck_info = json.loads(res.data)[watch_uuid] before_recheck_info = json.loads(res.data)[watch_uuid]
assert before_recheck_info['last_checked'] != 0 assert before_recheck_info['last_checked'] != 0
#705 `last_changed` should be zero on the first check
assert before_recheck_info['last_changed'] == 0
assert before_recheck_info['title'] == 'My test URL' assert before_recheck_info['title'] == 'My test URL'
set_modified_response() set_modified_response()

View File

@@ -15,7 +15,7 @@ def set_original_response():
</br> </br>
So let's see what happens. </br> So let's see what happens. </br>
<div id="sametext">Some text thats the same</div> <div id="sametext">Some text thats the same</div>
<div class="changetext">Some text that will change</div> <div id="changetext">Some text that will change</div>
</body> </body>
</html> </html>
""" """
@@ -33,8 +33,7 @@ def set_modified_response():
</br> </br>
So let's see what happens. </br> So let's see what happens. </br>
<div id="sametext">Some text thats the same</div> <div id="sametext">Some text thats the same</div>
<div class="changetext">Some text that did change ( 1000 online <br/> 80 guests<br/> 2000 online )</div> <div id="changetext">Some text that did change ( 1000 online <br/> 80 guests)</div>
<div class="changetext">SomeCase insensitive 3456</div>
</body> </body>
</html> </html>
""" """
@@ -45,78 +44,11 @@ def set_modified_response():
return None return None
def set_multiline_response():
test_return_data = """<html>
<body>
<p>Something <br/>
across 6 billion multiple<br/>
lines
</p>
<div>aaand something lines</div>
</body>
</html>
"""
with open("test-datastore/endpoint-content.txt", "w") as f:
f.write(test_return_data)
return None
def test_setup(client, live_server):
live_server_setup(live_server)
def test_check_filter_multiline(client, live_server):
set_multiline_response()
# Add our URL to the import page
test_url = url_for('test_endpoint', _external=True)
res = client.post(
url_for("import_page"),
data={"urls": test_url},
follow_redirects=True
)
assert b"1 Imported" in res.data
time.sleep(3)
# Goto the edit page, add our ignore text
# Add our URL to the import page
res = client.post(
url_for("edit_page", uuid="first"),
data={"css_filter": '',
'extract_text': '/something.+?6 billion.+?lines/si',
"url": test_url,
"tag": "",
"headers": "",
'fetch_backend': "html_requests"
},
follow_redirects=True
)
assert b"Updated watch." in res.data
time.sleep(3)
res = client.get(
url_for("preview_page", uuid="first"),
follow_redirects=True
)
assert b'<div class="">Something' in res.data
assert b'<div class="">across 6 billion multiple' in res.data
assert b'<div class="">lines' in res.data
# but the last one, which also says 'lines' shouldnt be here (non-greedy match checking)
assert b'aaand something lines' not in res.data
def test_check_filter_and_regex_extract(client, live_server): def test_check_filter_and_regex_extract(client, live_server):
sleep_time_for_fetch_thread = 3 sleep_time_for_fetch_thread = 3
css_filter = ".changetext"
live_server_setup(live_server)
css_filter = "#changetext"
set_original_response() set_original_response()
@@ -132,7 +64,6 @@ def test_check_filter_and_regex_extract(client, live_server):
) )
assert b"1 Imported" in res.data assert b"1 Imported" in res.data
time.sleep(1)
# Trigger a check # Trigger a check
client.get(url_for("form_watch_checknow"), follow_redirects=True) client.get(url_for("form_watch_checknow"), follow_redirects=True)
@@ -144,7 +75,7 @@ def test_check_filter_and_regex_extract(client, live_server):
res = client.post( res = client.post(
url_for("edit_page", uuid="first"), url_for("edit_page", uuid="first"),
data={"css_filter": css_filter, data={"css_filter": css_filter,
'extract_text': '\d+ online\r\n\d+ guests\r\n/somecase insensitive \d+/i\r\n/somecase insensitive (345\d)/i', 'extract_text': '\d+ online\n\d+ guests',
"url": test_url, "url": test_url,
"tag": "", "tag": "",
"headers": "", "headers": "",
@@ -155,6 +86,15 @@ def test_check_filter_and_regex_extract(client, live_server):
assert b"Updated watch." in res.data assert b"Updated watch." in res.data
# Check it saved
res = client.get(
url_for("edit_page", uuid="first"),
)
assert b'\d+ online' in res.data
# Trigger a check
# client.get(url_for("form_watch_checknow"), follow_redirects=True)
# Give the thread time to pick it up # Give the thread time to pick it up
time.sleep(sleep_time_for_fetch_thread) time.sleep(sleep_time_for_fetch_thread)
@@ -180,19 +120,8 @@ def test_check_filter_and_regex_extract(client, live_server):
# Class will be blank for now because the frontend didnt apply the diff # Class will be blank for now because the frontend didnt apply the diff
assert b'<div class="">1000 online' in res.data assert b'<div class="">1000 online' in res.data
# All regex matching should be here
assert b'<div class="">2000 online' in res.data
# Both regexs should be here # Both regexs should be here
assert b'<div class="">80 guests' in res.data assert b'<div class="">80 guests' in res.data
# Regex with flag handling should be here
assert b'<div class="">SomeCase insensitive 3456' in res.data
# Singular group from /somecase insensitive (345\d)/i
assert b'<div class="">3456' in res.data
# Regex with multiline flag handling should be here
# Should not be here # Should not be here
assert b'Some text that did change' not in res.data assert b'Some text that did change' not in res.data

View File

@@ -1,134 +0,0 @@
import os
import time
import re
from flask import url_for
from .util import set_original_response, live_server_setup
from changedetectionio.model import App
def set_response_with_filter():
test_return_data = """<html>
<body>
Some initial text</br>
<p>Which is across multiple lines</p>
</br>
So let's see what happens. </br>
<div id="nope-doesnt-exist">Some text thats the same</div>
</body>
</html>
"""
with open("test-datastore/endpoint-content.txt", "w") as f:
f.write(test_return_data)
return None
def run_filter_test(client, content_filter):
# Give the endpoint time to spin up
time.sleep(1)
# Add our URL to the import page
test_url = url_for('test_endpoint', _external=True)
res = client.post(
url_for("form_watch_add"),
data={"url": test_url, "tag": ''},
follow_redirects=True
)
assert b"Watch added" in res.data
# Give the thread time to pick up the first version
time.sleep(3)
# Goto the edit page, add our ignore text
# Add our URL to the import page
url = url_for('test_notification_endpoint', _external=True)
notification_url = url.replace('http', 'json')
print(">>>> Notification URL: " + notification_url)
# Just a regular notification setting, this will be used by the special 'filter not found' notification
notification_form_data = {"notification_urls": notification_url,
"notification_title": "New ChangeDetection.io Notification - {watch_url}",
"notification_body": "BASE URL: {base_url}\n"
"Watch URL: {watch_url}\n"
"Watch UUID: {watch_uuid}\n"
"Watch title: {watch_title}\n"
"Watch tag: {watch_tag}\n"
"Preview: {preview_url}\n"
"Diff URL: {diff_url}\n"
"Snapshot: {current_snapshot}\n"
"Diff: {diff}\n"
"Diff Full: {diff_full}\n"
":-)",
"notification_format": "Text"}
notification_form_data.update({
"url": test_url,
"tag": "my tag",
"title": "my title",
"headers": "",
"css_filter": content_filter,
"fetch_backend": "html_requests"})
res = client.post(
url_for("edit_page", uuid="first"),
data=notification_form_data,
follow_redirects=True
)
assert b"Updated watch." in res.data
time.sleep(3)
# Now the notification should not exist, because we didnt reach the threshold
assert not os.path.isfile("test-datastore/notification.txt")
for i in range(0, App._FILTER_FAILURE_THRESHOLD_ATTEMPTS_DEFAULT):
res = client.get(url_for("form_watch_checknow"), follow_redirects=True)
time.sleep(3)
# We should see something in the frontend
assert b'Did the page change its layout' in res.data
# Now it should exist and contain our "filter not found" alert
assert os.path.isfile("test-datastore/notification.txt")
notification = False
with open("test-datastore/notification.txt", 'r') as f:
notification = f.read()
assert 'CSS/xPath filter was not present in the page' in notification
assert content_filter.replace('"', '\\"') in notification
# Remove it and prove that it doesnt trigger when not expected
os.unlink("test-datastore/notification.txt")
set_response_with_filter()
for i in range(0, App._FILTER_FAILURE_THRESHOLD_ATTEMPTS_DEFAULT):
client.get(url_for("form_watch_checknow"), follow_redirects=True)
time.sleep(3)
# It should have sent a notification, but..
assert os.path.isfile("test-datastore/notification.txt")
# but it should not contain the info about the failed filter
with open("test-datastore/notification.txt", 'r') as f:
notification = f.read()
assert not 'CSS/xPath filter was not present in the page' in notification
# cleanup for the next
client.get(
url_for("form_delete", uuid="all"),
follow_redirects=True
)
os.unlink("test-datastore/notification.txt")
def test_setup(live_server):
live_server_setup(live_server)
def test_check_css_filter_failure_notification(client, live_server):
set_original_response()
time.sleep(1)
run_filter_test(client, '#nope-doesnt-exist')
def test_check_xpath_filter_failure_notification(client, live_server):
set_original_response()
time.sleep(1)
run_filter_test(client, '//*[@id="nope-doesnt-exist"]')

View File

@@ -1,43 +0,0 @@
#!/usr/bin/python3
import time
from flask import url_for
from .util import live_server_setup
def set_original_ignore_response():
test_return_data = """<html>
<body>
<span>The price is</span><span>$<!-- -->90<!-- -->.<!-- -->74</span>
</body>
</html>
"""
with open("test-datastore/endpoint-content.txt", "w") as f:
f.write(test_return_data)
def test_obfuscations(client, live_server):
set_original_ignore_response()
live_server_setup(live_server)
time.sleep(1)
# Add our URL to the import page
test_url = url_for('test_endpoint', _external=True)
res = client.post(
url_for("import_page"),
data={"urls": test_url},
follow_redirects=True
)
assert b"1 Imported" in res.data
# Give the thread time to pick it up
time.sleep(3)
# Check HTML conversion detected and workd
res = client.get(
url_for("preview_page", uuid="first"),
follow_redirects=True
)
assert b'$90.74' in res.data

View File

@@ -1,104 +0,0 @@
#!/usr/bin/python3
import time
from flask import url_for
from .util import live_server_setup
def set_original_ignore_response():
test_return_data = """<html>
<body>
<p>Some initial text</p>
<p>Which is across multiple lines</p>
<p>So let's see what happens.</p>
</body>
</html>
"""
with open("test-datastore/endpoint-content.txt", "w") as f:
f.write(test_return_data)
# The same but just re-ordered the text
def set_modified_swapped_lines():
# Re-ordered and with some whitespacing, should get stripped() too.
test_return_data = """<html>
<body>
<p>Some initial text</p>
<p> So let's see what happens.</p>
<p>&nbsp;Which is across multiple lines</p>
</body>
</html>
"""
with open("test-datastore/endpoint-content.txt", "w") as f:
f.write(test_return_data)
def set_modified_with_trigger_text_response():
test_return_data = """<html>
<body>
<p>Some initial text</p>
<p>So let's see what happens.</p>
<p>and a new line!</p>
<p>Which is across multiple lines</p>
</body>
</html>
"""
with open("test-datastore/endpoint-content.txt", "w") as f:
f.write(test_return_data)
def test_unique_lines_functionality(client, live_server):
live_server_setup(live_server)
sleep_time_for_fetch_thread = 3
set_original_ignore_response()
# Give the endpoint time to spin up
time.sleep(1)
# Add our URL to the import page
test_url = url_for('test_endpoint', _external=True)
res = client.post(
url_for("import_page"),
data={"urls": test_url},
follow_redirects=True
)
assert b"1 Imported" in res.data
time.sleep(sleep_time_for_fetch_thread)
# Add our URL to the import page
res = client.post(
url_for("edit_page", uuid="first"),
data={"check_unique_lines": "y",
"url": test_url,
"fetch_backend": "html_requests"},
follow_redirects=True
)
assert b"Updated watch." in res.data
assert b'unviewed' not in res.data
# Make a change
set_modified_swapped_lines()
time.sleep(sleep_time_for_fetch_thread)
# Trigger a check
client.get(url_for("form_watch_checknow"), follow_redirects=True)
# Give the thread time to pick it up
time.sleep(sleep_time_for_fetch_thread)
# It should report nothing found (no new 'unviewed' class)
res = client.get(url_for("index"))
assert b'unviewed' not in res.data
# Now set the content which contains the new text and re-ordered existing text
set_modified_with_trigger_text_response()
client.get(url_for("form_watch_checknow"), follow_redirects=True)
time.sleep(sleep_time_for_fetch_thread)
res = client.get(url_for("index"))
assert b'unviewed' in res.data

View File

@@ -3,74 +3,38 @@ import queue
import time import time
from changedetectionio import content_fetcher from changedetectionio import content_fetcher
from changedetectionio.html_tools import FilterNotFoundInResponse
# A single update worker # A single update worker
# #
# # Requests for checking on a single site(watch) from a queue of watches
# (another process inserts watches into the queue that are time-ready for checking)
class update_worker(threading.Thread): class update_worker(threading.Thread):
current_uuid = None current_uuid = None
def __init__(self, q, notification_q, app, datastore, uuid, *args, **kwargs): def __init__(self, q, notification_q, app, datastore, *args, **kwargs):
self.q = q self.q = q
self.app = app self.app = app
self.notification_q = notification_q self.notification_q = notification_q
self.datastore = datastore self.datastore = datastore
self.current_uuid = uuid
super().__init__(*args, **kwargs) super().__init__(*args, **kwargs)
self.name = "update_worker"
def send_filter_failure_notification(self, uuid):
threshold = self.datastore.data['settings']['application'].get('filter_failure_notification_threshold_attempts')
watch = self.datastore.data['watching'].get(uuid, False)
n_object = {'notification_title': 'Changedetection.io - Alert - CSS/xPath filter was not present in the page',
'notification_body': "Your configured CSS/xPath filter of '{}' for {{watch_url}} did not appear on the page after {} attempts, did the page change layout?\n\nLink: {{base_url}}/edit/{{watch_uuid}}\n\nThanks - Your omniscient changedetection.io installation :)\n".format(
watch['css_filter'],
threshold),
'notification_format': 'text'}
if len(watch['notification_urls']):
n_object['notification_urls'] = watch['notification_urls']
elif len(self.datastore.data['settings']['application']['notification_urls']):
n_object['notification_urls'] = self.datastore.data['settings']['application']['notification_urls']
# Only prepare to notify if the rules above matched
if 'notification_urls' in n_object:
n_object.update({
'watch_url': watch['url'],
'uuid': uuid
})
self.notification_q.put(n_object)
print("Sent filter not found notification for {}".format(uuid))
# Pick one job off the list, process it threaded, exist
def run(self): def run(self):
# Go talk to the website
self.perform_site_update()
self.current_uuid = None # Done
self.q.task_done()
# Let the thread die after processing 1
# We will launch nice juicy fresh threads every time to prevent memory leaks in complex runner code (playwright etc)
print ("EXITING THREAD!")
self.app.config.exit.wait(1)
return
def perform_site_update(self):
from changedetectionio import fetch_site_status from changedetectionio import fetch_site_status
if not self.current_uuid in list(self.datastore.data['watching'].keys()): update_handler = fetch_site_status.perform_site_check(datastore=self.datastore)
return
while not self.app.config.exit.is_set():
try:
uuid = self.q.get(block=False)
except queue.Empty:
pass
else:
self.current_uuid = uuid
if uuid in list(self.datastore.data['watching'].keys()):
changed_detected = False changed_detected = False
contents = "" contents = ""
@@ -79,56 +43,44 @@ class update_worker(threading.Thread):
xpath_data = False xpath_data = False
now = time.time() now = time.time()
update_handler = fetch_site_status.perform_site_check(datastore=self.datastore)
try: try:
changed_detected, update_obj, contents, screenshot, xpath_data = update_handler.run(self.current_uuid) changed_detected, update_obj, contents, screenshot, xpath_data = update_handler.run(uuid)
# Re #342 # Re #342
# In Python 3, all strings are sequences of Unicode characters. There is a bytes type that holds raw bytes. # In Python 3, all strings are sequences of Unicode characters. There is a bytes type that holds raw bytes.
# We then convert/.decode('utf-8') for the notification etc # We then convert/.decode('utf-8') for the notification etc
if not isinstance(contents, (bytes, bytearray)): if not isinstance(contents, (bytes, bytearray)):
raise Exception("Error - returned data from the fetch handler SHOULD be bytes") raise Exception("Error - returned data from the fetch handler SHOULD be bytes")
except PermissionError as e: except PermissionError as e:
self.app.logger.error("File permission error updating", self.current_uuid, str(e)) self.app.logger.error("File permission error updating", uuid, str(e))
except content_fetcher.ReplyWithContentButNoText as e: except content_fetcher.ReplyWithContentButNoText as e:
# Totally fine, it's by choice - just continue on, nothing more to care about # Totally fine, it's by choice - just continue on, nothing more to care about
# Page had elements/content but no renderable text # Page had elements/content but no renderable text
self.datastore.update_watch(uuid=self.current_uuid, update_obj={'last_error': "Got HTML content but no text found."}) if self.datastore.data['watching'].get(uuid, False) and self.datastore.data['watching'][uuid].get('css_filter'):
except FilterNotFoundInResponse as e: self.datastore.update_watch(uuid=uuid, update_obj={'last_error': "Got HTML content but no text found (CSS / xPath Filter not found in page?)"})
err_text = "Filter '{}' not found - Did the page change its layout?".format(str(e)) else:
c = 0 self.datastore.update_watch(uuid=uuid, update_obj={'last_error': "Got HTML content but no text found."})
if self.datastore.data['watching'].get(self.current_uuid, False): pass
c = self.datastore.data['watching'][self.current_uuid].get('consecutive_filter_failures', 5)
c += 1
# Send notification if we reached the threshold?
threshold = self.datastore.data['settings']['application'].get('filter_failure_notification_threshold_attempts', 0)
print("Filter for {} not found, consecutive_filter_failures: {}".format(self.current_uuid, c))
if threshold >0 and c >= threshold:
self.send_filter_failure_notification(self.current_uuid)
c = 0
self.datastore.update_watch(uuid=self.current_uuid, update_obj={'last_error': err_text,
'consecutive_filter_failures': c})
except content_fetcher.EmptyReply as e: except content_fetcher.EmptyReply as e:
# Some kind of custom to-str handler in the exception handler that does this? # Some kind of custom to-str handler in the exception handler that does this?
err_text = "EmptyReply - try increasing 'Wait seconds before extracting text', Status Code {}".format(e.status_code) err_text = "EmptyReply - try increasing 'Wait seconds before extracting text', Status Code {}".format(e.status_code)
self.datastore.update_watch(uuid=self.current_uuid, update_obj={'last_error': err_text, self.datastore.update_watch(uuid=uuid, update_obj={'last_error': err_text,
'last_check_status': e.status_code}) 'last_check_status': e.status_code})
except content_fetcher.ScreenshotUnavailable as e: except content_fetcher.ScreenshotUnavailable as e:
err_text = "Screenshot unavailable, page did not render fully in the expected time - try increasing 'Wait seconds before extracting text'" err_text = "Screenshot unavailable, page did not render fully in the expected time - try increasing 'Wait seconds before extracting text'"
self.datastore.update_watch(uuid=self.current_uuid, update_obj={'last_error': err_text, self.datastore.update_watch(uuid=uuid, update_obj={'last_error': err_text,
'last_check_status': e.status_code}) 'last_check_status': e.status_code})
except content_fetcher.PageUnloadable as e: except content_fetcher.PageUnloadable as e:
err_text = "Page request from server didnt respond correctly" err_text = "Page request from server didnt respond correctly"
self.datastore.update_watch(uuid=self.current_uuid, update_obj={'last_error': err_text, self.datastore.update_watch(uuid=uuid, update_obj={'last_error': err_text,
'last_check_status': e.status_code}) 'last_check_status': e.status_code})
except Exception as e: except Exception as e:
self.app.logger.error("Exception reached processing watch UUID: %s - %s", self.current_uuid, str(e)) self.app.logger.error("Exception reached processing watch UUID: %s - %s", uuid, str(e))
self.datastore.update_watch(uuid=self.current_uuid, update_obj={'last_error': str(e)}) self.datastore.update_watch(uuid=uuid, update_obj={'last_error': str(e)})
else: else:
try: try:
watch = self.datastore.data['watching'][self.current_uuid] watch = self.datastore.data['watching'][uuid]
fname = "" # Saved history text filename fname = "" # Saved history text filename
# For the FIRST time we check a site, or a change detected, save the snapshot. # For the FIRST time we check a site, or a change detected, save the snapshot.
@@ -137,19 +89,16 @@ class update_worker(threading.Thread):
fname = watch.save_history_text(contents=contents, timestamp=str(round(time.time()))) fname = watch.save_history_text(contents=contents, timestamp=str(round(time.time())))
# Generally update anything interesting returned # Generally update anything interesting returned
update_obj['consecutive_filter_failures'] = 0 self.datastore.update_watch(uuid=uuid, update_obj=update_obj)
self.datastore.update_watch(uuid=self.current_uuid, update_obj=update_obj)
# A change was detected # A change was detected
if changed_detected: if changed_detected:
n_object = {} n_object = {}
print (">> Change detected in UUID {} - {}".format(self.current_uuid, watch['url'])) print (">> Change detected in UUID {} - {}".format(uuid, watch['url']))
# Notifications should only trigger on the second time (first time, we gather the initial snapshot) # Notifications should only trigger on the second time (first time, we gather the initial snapshot)
if watch.history_n >= 2: if watch.history_n >= 2:
# Atleast 2, means there really was a change print(">> Change detected in UUID {} - {}".format(uuid, watch['url']))
self.datastore.update_watch(uuid=self.current_uuid, update_obj={'last_changed': round(now)})
watch_history = watch.history watch_history = watch.history
dates = list(watch_history.keys()) dates = list(watch_history.keys())
# Theoretically it's possible that this could be just 1 long, # Theoretically it's possible that this could be just 1 long,
@@ -160,9 +109,10 @@ class update_worker(threading.Thread):
) )
prev_fname = watch_history[dates[-2]] prev_fname = watch_history[dates[-2]]
# Did it have any notification alerts to hit? # Did it have any notification alerts to hit?
if len(watch['notification_urls']): if len(watch['notification_urls']):
print(">>> Notifications queued for UUID from watch {}".format(self.current_uuid)) print(">>> Notifications queued for UUID from watch {}".format(uuid))
n_object['notification_urls'] = watch['notification_urls'] n_object['notification_urls'] = watch['notification_urls']
n_object['notification_title'] = watch['notification_title'] n_object['notification_title'] = watch['notification_title']
n_object['notification_body'] = watch['notification_body'] n_object['notification_body'] = watch['notification_body']
@@ -170,7 +120,7 @@ class update_worker(threading.Thread):
# No? maybe theres a global setting, queue them all # No? maybe theres a global setting, queue them all
elif len(self.datastore.data['settings']['application']['notification_urls']): elif len(self.datastore.data['settings']['application']['notification_urls']):
print(">>> Watch notification URLs were empty, using GLOBAL notifications for UUID: {}".format(self.current_uuid)) print(">>> Watch notification URLs were empty, using GLOBAL notifications for UUID: {}".format(uuid))
n_object['notification_urls'] = self.datastore.data['settings']['application']['notification_urls'] n_object['notification_urls'] = self.datastore.data['settings']['application']['notification_urls']
n_object['notification_title'] = self.datastore.data['settings']['application']['notification_title'] n_object['notification_title'] = self.datastore.data['settings']['application']['notification_title']
n_object['notification_body'] = self.datastore.data['settings']['application']['notification_body'] n_object['notification_body'] = self.datastore.data['settings']['application']['notification_body']
@@ -189,7 +139,7 @@ class update_worker(threading.Thread):
from changedetectionio import diff from changedetectionio import diff
n_object.update({ n_object.update({
'watch_url': watch['url'], 'watch_url': watch['url'],
'uuid': self.current_uuid, 'uuid': uuid,
'current_snapshot': contents.decode('utf-8'), 'current_snapshot': contents.decode('utf-8'),
'diff': diff.render_diff(prev_fname, fname, line_feed_sep=line_feed_sep), 'diff': diff.render_diff(prev_fname, fname, line_feed_sep=line_feed_sep),
'diff_full': diff.render_diff(prev_fname, fname, True, line_feed_sep=line_feed_sep) 'diff_full': diff.render_diff(prev_fname, fname, True, line_feed_sep=line_feed_sep)
@@ -200,18 +150,24 @@ class update_worker(threading.Thread):
except Exception as e: except Exception as e:
# Catch everything possible here, so that if a worker crashes, we don't lose it until restart! # Catch everything possible here, so that if a worker crashes, we don't lose it until restart!
print("!!!! Exception in update_worker !!!\n", e) print("!!!! Exception in update_worker !!!\n", e)
self.app.logger.error("Exception reached processing watch UUID: %s - %s", self.current_uuid, str(e)) self.app.logger.error("Exception reached processing watch UUID: %s - %s", uuid, str(e))
self.datastore.update_watch(uuid=self.current_uuid, update_obj={'last_error': str(e)}) self.datastore.update_watch(uuid=uuid, update_obj={'last_error': str(e)})
finally: finally:
# Always record that we atleast tried # Always record that we atleast tried
self.datastore.update_watch(uuid=self.current_uuid, update_obj={'fetch_time': round(time.time() - now, 3), self.datastore.update_watch(uuid=uuid, update_obj={'fetch_time': round(time.time() - now, 3),
'last_checked': round(time.time())}) 'last_checked': round(time.time())})
# Always save the screenshot if it's available # Always save the screenshot if it's available
if screenshot: if screenshot:
self.datastore.save_screenshot(watch_uuid=self.current_uuid, screenshot=screenshot) self.datastore.save_screenshot(watch_uuid=uuid, screenshot=screenshot)
if xpath_data: if xpath_data:
self.datastore.save_xpath_data(watch_uuid=self.current_uuid, data=xpath_data) self.datastore.save_xpath_data(watch_uuid=uuid, data=xpath_data)
self.current_uuid = None # Done
self.q.task_done()
# Give the CPU time to interrupt
time.sleep(0.1)
self.app.config.exit.wait(1)

View File

@@ -24,7 +24,7 @@ services:
# https://selenium-python.readthedocs.io/api.html#module-selenium.webdriver.common.proxy # https://selenium-python.readthedocs.io/api.html#module-selenium.webdriver.common.proxy
# #
# Alternative Playwright URL, do not use "'s or 's! # Alternative Playwright URL, do not use "'s or 's!
# - PLAYWRIGHT_DRIVER_URL=ws://playwright-chrome:3000/?stealth=1&--disable-web-security=true # - PLAYWRIGHT_DRIVER_URL=ws://playwright-chrome:3000/
# #
# Playwright proxy settings playwright_proxy_server, playwright_proxy_bypass, playwright_proxy_username, playwright_proxy_password # Playwright proxy settings playwright_proxy_server, playwright_proxy_bypass, playwright_proxy_username, playwright_proxy_password
# #
@@ -78,6 +78,9 @@ services:
# - SCREEN_HEIGHT=1024 # - SCREEN_HEIGHT=1024
# - SCREEN_DEPTH=16 # - SCREEN_DEPTH=16
# - ENABLE_DEBUGGER=false # - ENABLE_DEBUGGER=false
# - SCREEN_WIDTH=1280
# - SCREEN_HEIGHT=1024
# - SCREEN_DEPTH=16
# - PREBOOT_CHROME=true # - PREBOOT_CHROME=true
# - CONNECTION_TIMEOUT=300000 # - CONNECTION_TIMEOUT=300000
# - MAX_CONCURRENT_SESSIONS=10 # - MAX_CONCURRENT_SESSIONS=10