mirror of
https://github.com/dgtlmoon/changedetection.io.git
synced 2025-12-04 23:25:32 +00:00
merging v0.39.6
This commit is contained in:
@@ -91,6 +91,8 @@ docker run -d --restart always -p "127.0.0.1:5000:5000" -v datastore-volume:/dat
|
||||
```bash
|
||||
docker-compose pull && docker-compose up -d
|
||||
```
|
||||
### Filters
|
||||
XPath, JSONPath and CSS support comes baked in! You can be as specific as you need; XPath expressions exported from the various XPath element query creation tools can be used directly.
|
||||
|
||||
### Notifications
|
||||
|
||||
@@ -144,9 +146,9 @@ When you enable a `json:` filter, you can even automatically extract and parse e
|
||||
|
||||
See the wiki https://github.com/dgtlmoon/changedetection.io/wiki/Proxy-configuration
|
||||
|
||||
### RaspberriPi support?
|
||||
### Raspberry Pi support?
|
||||
|
||||
RaspberriPi and linux/arm/v6 linux/arm/v7 arm64 devices are supported!
|
||||
Raspberry Pi and linux/arm/v6 linux/arm/v7 arm64 devices are supported!
|
||||
|
||||
### Windows native support?
|
||||
|
||||
|
||||
@@ -30,7 +30,7 @@ import datetime
|
||||
import pytz
|
||||
from copy import deepcopy
|
||||
|
||||
__version__ = '0.39.5'
|
||||
__version__ = '0.39.6'
|
||||
|
||||
datastore = None
|
||||
|
||||
@@ -806,7 +806,8 @@ def changedetection_app(config=None, datastore_o=None):
|
||||
compress_type=zipfile.ZIP_DEFLATED,
|
||||
compresslevel=8)
|
||||
|
||||
return send_from_directory(datastore_o.datastore_path, backupname, as_attachment=True)
|
||||
# Send_from_directory needs to be the full absolute path
|
||||
return send_from_directory(os.path.abspath(datastore_o.datastore_path), backupname, as_attachment=True)
|
||||
|
||||
@app.route("/static/<string:group>/<string:filename>", methods=['GET'])
|
||||
def static_content(group, filename):
|
||||
|
||||
@@ -9,6 +9,12 @@ import urllib3.exceptions
|
||||
|
||||
|
||||
class EmptyReply(Exception):
    """Raised by a fetcher when the reply has no usable content.

    Attributes:
        status_code: Status code reported for the request.
        url: URL that was being fetched.
    """

    def __init__(self, status_code, url):
        # Pass the values on to Exception so that ``args`` is populated even
        # when the caller uses keyword arguments (otherwise ``args`` is empty
        # and str(e) is '').
        super().__init__(status_code, url)
        # Kept as attributes so handlers in other parts of the app can read them
        self.status_code = status_code
        self.url = url

    def __str__(self):
        """Return a readable one-line description for logs and error fields."""
        return "Empty reply from {} (status code {})".format(self.url, self.status_code)
|
||||
|
||||
class Fetcher():
|
||||
@@ -110,6 +116,8 @@ class html_webdriver(Fetcher):
|
||||
|
||||
# @todo - how to check this? is it possible?
|
||||
self.status_code = 200
|
||||
# @todo somehow we should try to get this working for WebDriver
|
||||
# raise EmptyReply(url=url, status_code=r.status_code)
|
||||
|
||||
# @todo - dom wait loaded?
|
||||
time.sleep(5)
|
||||
@@ -151,10 +159,10 @@ class html_requests(Fetcher):
|
||||
# Return bytes here
|
||||
html = r.text
|
||||
|
||||
|
||||
# @todo test this
|
||||
# @todo maybe you really want to test zero-byte return pages?
|
||||
if not r or not html or not len(html):
|
||||
raise EmptyReply(url)
|
||||
raise EmptyReply(url=url, status_code=r.status_code)
|
||||
|
||||
self.status_code = r.status_code
|
||||
self.content = html
|
||||
|
||||
@@ -58,9 +58,7 @@ class perform_site_check():
|
||||
|
||||
watch = self.datastore.data['watching'][uuid]
|
||||
|
||||
update_obj = {
|
||||
"last_checked": timestamp
|
||||
}
|
||||
update_obj = {}
|
||||
|
||||
extra_headers = self.datastore.get_val(uuid, 'headers')
|
||||
|
||||
@@ -116,15 +114,17 @@ class perform_site_check():
|
||||
if 'json:' in css_filter_rule:
|
||||
stripped_text_from_html = html_tools.extract_json_as_string(content=fetcher.content, jsonpath_filter=css_filter_rule)
|
||||
is_html = False
|
||||
else:
|
||||
# CSS Filter, extract the HTML that matches and feed that into the existing inscriptis::get_text
|
||||
stripped_text_from_html = html_tools.css_filter(css_filter=css_filter_rule, html_content=fetcher.content)
|
||||
|
||||
if is_html:
|
||||
# CSS Filter, extract the HTML that matches and feed that into the existing inscriptis::get_text
|
||||
html_content = fetcher.content
|
||||
if has_filter_rule:
|
||||
html_content = html_tools.css_filter(css_filter=css_filter_rule, html_content=fetcher.content)
|
||||
# For HTML/XML we offer xpath as an option, just start a regular xPath "/.."
|
||||
if css_filter_rule[0] == '/':
|
||||
html_content = html_tools.xpath_filter(xpath_filter=css_filter_rule, html_content=fetcher.content)
|
||||
else:
|
||||
# CSS Filter, extract the HTML that matches and feed that into the existing inscriptis::get_text
|
||||
html_content = html_tools.css_filter(css_filter=css_filter_rule, html_content=fetcher.content)
|
||||
|
||||
# get_text() via inscriptis
|
||||
stripped_text_from_html = get_text(html_content)
|
||||
|
||||
@@ -181,7 +181,7 @@ class ValidateListRegex(object):
|
||||
message = field.gettext('RegEx \'%s\' is not a valid regular expression.')
|
||||
raise ValidationError(message % (line))
|
||||
|
||||
class ValidateCSSJSONInput(object):
|
||||
class ValidateCSSJSONXPATHInput(object):
|
||||
"""
|
||||
Filter validation
|
||||
@todo CSS validator ;)
|
||||
@@ -191,6 +191,24 @@ class ValidateCSSJSONInput(object):
|
||||
self.message = message
|
||||
|
||||
def __call__(self, form, field):
|
||||
|
||||
# Nothing to see here
|
||||
if not len(field.data.strip()):
|
||||
return
|
||||
|
||||
# Does it look like XPath?
|
||||
if field.data.strip()[0] == '/':
|
||||
from lxml import html, etree
|
||||
tree = html.fromstring("<html></html>")
|
||||
|
||||
try:
|
||||
tree.xpath(field.data.strip())
|
||||
except etree.XPathEvalError as e:
|
||||
message = field.gettext('\'%s\' is not a valid XPath expression. (%s)')
|
||||
raise ValidationError(message % (field.data, str(e)))
|
||||
except:
|
||||
raise ValidationError("A system-error occurred when validating your XPath expression")
|
||||
|
||||
if 'json:' in field.data:
|
||||
from jsonpath_ng.exceptions import JsonPathParserError, JsonPathLexerError
|
||||
from jsonpath_ng.ext import parse
|
||||
@@ -202,6 +220,8 @@ class ValidateCSSJSONInput(object):
|
||||
except (JsonPathParserError, JsonPathLexerError) as e:
|
||||
message = field.gettext('\'%s\' is not a valid JSONPath expression. (%s)')
|
||||
raise ValidationError(message % (input, str(e)))
|
||||
except:
|
||||
raise ValidationError("A system-error occurred when validating your JSONPath expression")
|
||||
|
||||
# Re #265 - maybe in the future fetch the page and offer a
|
||||
# warning/notice that it's possible the rule doesn't yet match anything?
|
||||
@@ -232,7 +252,7 @@ class watchForm(commonSettingsForm):
|
||||
seconds_between_check = html5.IntegerField('Maximum time in seconds until recheck',
|
||||
[validators.Optional(), validators.NumberRange(min=1,max=59)])
|
||||
minutes_or_seconds = RadioField('Minutes or Seconds', choices=[('minutes','Minutes'),('seconds','Seconds')])
|
||||
css_filter = StringField('CSS/JSON Filter', [ValidateCSSJSONInput()])
|
||||
css_filter = StringField('CSS/JSON/XPATH Filter', [ValidateCSSJSONXPATHInput()])
|
||||
title = StringField('Title')
|
||||
|
||||
ignore_text = StringListField('Ignore Text', [ValidateListRegex()])
|
||||
|
||||
@@ -17,6 +17,20 @@ def css_filter(css_filter, html_content):
|
||||
return html_block + "\n"
|
||||
|
||||
|
||||
# Return str Utf-8 of matched rules
|
||||
def xpath_filter(xpath_filter, html_content):
    """Return everything matched by an XPath rule as a single UTF-8 string.

    Args:
        xpath_filter: XPath expression; surrounding whitespace is stripped.
        html_content: HTML document to evaluate the expression against.

    Returns:
        The serialised matches, each followed by ``<br/>``. An empty string
        when nothing matched.
    """
    from lxml import etree
    from lxml import html

    tree = html.fromstring(html_content)
    matches = tree.xpath(xpath_filter.strip())

    # Expressions such as count(...) or string(...) evaluate to a single
    # scalar (float/bool/str) instead of a list of results.
    if not isinstance(matches, list):
        matches = [matches]

    fragments = []
    for item in matches:
        if etree.iselement(item):
            fragments.append(etree.tostring(item, pretty_print=True).decode('utf-8'))
        else:
            # text() and attribute selections yield strings, which
            # etree.tostring() cannot serialise.
            fragments.append(str(item))

    return "".join(fragment + "<br/>" for fragment in fragments)
|
||||
|
||||
|
||||
# Extract/find element
|
||||
def extract_element(find='title', html_content=''):
|
||||
|
||||
|
||||
@@ -101,8 +101,10 @@ User-Agent: wonderbra 1.0") }}
|
||||
<li>CSS - Limit text to this CSS rule, only text matching this CSS rule is included.</li>
|
||||
<li>JSON - Limit text to this JSON rule, using <a href="https://pypi.org/project/jsonpath-ng/">JSONPath</a>, prefix with <b>"json:"</b>, <a
|
||||
href="https://jsonpath.com/" target="new">test your JSONPath here</a></li>
|
||||
<li>XPATH - Limit text to this XPath rule, simply start with a forward-slash, example <b>//*[contains(@class, 'sametext')]</b>, <a
|
||||
href="http://xpather.com/" target="new">test your XPath here</a></li>
|
||||
</ul>
|
||||
Please be sure that you thoroughly understand how to write CSS or JSONPath selector rules before filing an issue on GitHub! <a
|
||||
Please be sure that you thoroughly understand how to write CSS, JSONPath or XPath selector rules before filing an issue on GitHub! <a
|
||||
href="https://github.com/dgtlmoon/changedetection.io/wiki/CSS-Selector-help">here for more CSS selector help</a>.<br/>
|
||||
</span>
|
||||
</div>
|
||||
@@ -113,8 +115,11 @@ User-Agent: wonderbra 1.0") }}
|
||||
/some.regex\d{2}/ for case-INsensitive regex
|
||||
") }}
|
||||
<span class="pure-form-message-inline">
|
||||
Each line processed separately, any line matching will be ignored.<br/>
|
||||
Regular Expression support, wrap the line in forward slash <b>/regex/</b>.
|
||||
<ul>
|
||||
<li>Each line processed separately, any line matching will be ignored (removed before creating the checksum)</li>
|
||||
<li>Regular Expression support, wrap the line in forward slash <b>/regex/</b></li>
|
||||
<li>Changing this will affect the comparison checksum which may trigger an alert</li>
|
||||
</ul>
|
||||
</span>
|
||||
|
||||
</fieldset>
|
||||
|
||||
@@ -83,8 +83,13 @@
|
||||
/some.regex\d{2}/ for case-INsensitive regex
|
||||
") }}
|
||||
<span class="pure-form-message-inline">Note: This is applied globally in addition to the per-watch rules.</span><br/>
|
||||
<span class="pure-form-message-inline">Each line processed separately, any line matching will be ignored.<br/>
|
||||
Regular Expression support, wrap the line in forward slash <b>/regex/</b>.
|
||||
<span class="pure-form-message-inline">
|
||||
<ul>
|
||||
<li>Note: This is applied globally in addition to the per-watch rules.</li>
|
||||
<li>Each line processed separately, any line matching will be ignored (removed before creating the checksum)</li>
|
||||
<li>Regular Expression support, wrap the line in forward slash <b>/regex/</b></li>
|
||||
<li>Changing this will affect the comparison checksum which may trigger an alert</li>
|
||||
</ul>
|
||||
</span>
|
||||
</fieldset>
|
||||
</div>
|
||||
|
||||
38
changedetectionio/tests/test_errorhandling.py
Normal file
38
changedetectionio/tests/test_errorhandling.py
Normal file
@@ -0,0 +1,38 @@
|
||||
#!/usr/bin/python3
|
||||
|
||||
import time
|
||||
from flask import url_for
|
||||
from . util import live_server_setup
|
||||
|
||||
from ..html_tools import *
|
||||
|
||||
def test_setup(live_server):
    """Prepare the live server by delegating to live_server_setup()."""
    live_server_setup(live_server)
|
||||
|
||||
|
||||
def test_error_handler(client, live_server):
    """Check that a 403 reply is shown on the index page and is not flagged as a change."""
    # Let the live server finish starting up
    time.sleep(1)

    # Import a watch pointing at the 'test_endpoint_403_error' route
    forbidden_url = url_for('test_endpoint_403_error', _external=True)
    import_response = client.post(
        url_for("import_page"),
        data={"urls": forbidden_url},
        follow_redirects=True
    )
    assert b"1 Imported" in import_response.data

    # Queue a recheck, then wait for the worker thread to process it
    client.get(url_for("api_watch_checknow"), follow_redirects=True)
    time.sleep(3)

    index_page = client.get(url_for("index")).data
    assert b'unviewed' not in index_page
    assert b'Status Code 403' in index_page
    assert "just now".encode('utf-8') in index_page
|
||||
118
changedetectionio/tests/test_xpath_selector.py
Normal file
118
changedetectionio/tests/test_xpath_selector.py
Normal file
@@ -0,0 +1,118 @@
|
||||
#!/usr/bin/python3
|
||||
|
||||
import time
|
||||
from flask import url_for
|
||||
from . util import live_server_setup
|
||||
|
||||
from ..html_tools import *
|
||||
|
||||
def test_setup(live_server):
    """Prepare the live server by delegating to live_server_setup()."""
    live_server_setup(live_server)
|
||||
|
||||
def set_original_response():
    """Write the baseline HTML page to test-datastore/endpoint-content.txt."""
    baseline_html = """<html>
<body>
Some initial text</br>
<p>Which is across multiple lines</p>
</br>
So let's see what happens. </br>
<div class="sametext">Some text thats the same</div>
<div class="changetext">Some text that will change</div>
</body>
</html>
"""

    with open("test-datastore/endpoint-content.txt", "w") as endpoint_file:
        endpoint_file.write(baseline_html)
|
||||
|
||||
def set_modified_response():
    """Write the modified HTML page to test-datastore/endpoint-content.txt.

    Compared with the baseline page, the free text line and the "changetext"
    div differ; the "sametext" div is unchanged.
    """
    modified_html = """<html>
<body>
Some initial text</br>
<p>Which is across multiple lines</p>
</br>
So let's see what happens. THIS CHANGES AND SHOULDNT TRIGGER A CHANGE</br>
<div class="sametext">Some text thats the same</div>
<div class="changetext">Some new text</div>
</body>
</html>
"""

    with open("test-datastore/endpoint-content.txt", "w") as endpoint_file:
        endpoint_file.write(modified_html)
|
||||
|
||||
|
||||
def test_check_markup_xpath_filter_restriction(client, live_server):
    """Check that edits outside the XPath-filtered element do not mark the watch as unviewed."""
    fetch_wait_seconds = 3
    sametext_xpath = "//*[contains(@class, 'sametext')]"

    set_original_response()

    # Let the live server finish starting up
    time.sleep(1)

    # Import the test endpoint as a new watch
    test_url = url_for('test_endpoint', _external=True)
    import_response = client.post(
        url_for("import_page"),
        data={"urls": test_url},
        follow_redirects=True
    )
    assert b"1 Imported" in import_response.data

    # Run an initial check and wait for the worker thread
    client.get(url_for("api_watch_checknow"), follow_redirects=True)
    time.sleep(fetch_wait_seconds)

    # Apply the XPath rule to the watch through the edit form
    edit_form = {
        "css_filter": sametext_xpath,
        "url": test_url,
        "tag": "",
        "headers": "",
        'fetch_backend': "html_requests",
    }
    edit_response = client.post(
        url_for("edit_page", uuid="first"),
        data=edit_form,
        follow_redirects=True
    )
    assert b"Updated watch." in edit_response.data

    # Wait for the worker thread again
    time.sleep(fetch_wait_seconds)

    # Open the diff page so the watch state goes back to "viewed"
    client.get(url_for("diff_history_page", uuid="first"), follow_redirects=True)

    # Serve the modified page, recheck and wait
    set_modified_response()
    client.get(url_for("api_watch_checknow"), follow_redirects=True)
    time.sleep(fetch_wait_seconds)

    index_response = client.get(url_for("index"))
    assert b'unviewed' not in index_response.data
|
||||
|
||||
def test_xpath_validation(client, live_server):
    """Check that the edit form rejects a malformed XPath filter."""
    # Let the live server finish starting up
    time.sleep(1)

    # Import the test endpoint as a new watch
    test_url = url_for('test_endpoint', _external=True)
    import_response = client.post(
        url_for("import_page"),
        data={"urls": test_url},
        follow_redirects=True
    )
    assert b"1 Imported" in import_response.data

    # Submit a filter that starts with "/" but is not a valid XPath expression
    edit_form = {
        "css_filter": "/something horrible",
        "url": test_url,
        "tag": "",
        "headers": "",
        'fetch_backend': "html_requests",
    }
    edit_response = client.post(
        url_for("edit_page", uuid="first"),
        data=edit_form,
        follow_redirects=True
    )
    assert b"is not a valid XPath expression" in edit_response.data
|
||||
@@ -54,6 +54,13 @@ def live_server_setup(live_server):
|
||||
resp.headers['Content-Type'] = 'application/json'
|
||||
return resp
|
||||
|
||||
@live_server.app.route('/test-403')
|
||||
def test_endpoint_403_error():
|
||||
|
||||
from flask import make_response
|
||||
resp = make_response('', 403)
|
||||
return resp
|
||||
|
||||
# Just return the headers in the request
|
||||
@live_server.app.route('/test-headers')
|
||||
def test_headers():
|
||||
|
||||
@@ -39,9 +39,10 @@ class update_worker(threading.Thread):
|
||||
changed_detected = False
|
||||
contents = ""
|
||||
update_obj= {}
|
||||
now = time.time()
|
||||
|
||||
try:
|
||||
now = time.time()
|
||||
|
||||
changed_detected, update_obj, contents = update_handler.run(uuid)
|
||||
|
||||
# Re #342
|
||||
@@ -51,14 +52,13 @@ class update_worker(threading.Thread):
|
||||
raise Exception("Error - returned data from the fetch handler SHOULD be bytes")
|
||||
|
||||
|
||||
# Always record that we atleast tried
|
||||
self.datastore.update_watch(uuid=uuid, update_obj={'fetch_time': round(time.time() - now, 3)})
|
||||
|
||||
except PermissionError as e:
|
||||
self.app.logger.error("File permission error updating", uuid, str(e))
|
||||
except content_fetcher.EmptyReply as e:
|
||||
self.datastore.update_watch(uuid=uuid, update_obj={'last_error':str(e)})
|
||||
|
||||
# Some kind of custom to-str handler in the exception handler that does this?
|
||||
err_text = "EmptyReply: Status Code {}".format(e.status_code)
|
||||
self.datastore.update_watch(uuid=uuid, update_obj={'last_error': err_text,
|
||||
'last_check_status': e.status_code})
|
||||
except Exception as e:
|
||||
self.app.logger.error("Exception reached processing watch UUID:%s - %s", uuid, str(e))
|
||||
self.datastore.update_watch(uuid=uuid, update_obj={'last_error': str(e)})
|
||||
@@ -66,13 +66,14 @@ class update_worker(threading.Thread):
|
||||
else:
|
||||
try:
|
||||
watch = self.datastore.data['watching'][uuid]
|
||||
fname = "" # Saved history text filename
|
||||
|
||||
# For the FIRST time we check a site, or a change detected, save the snapshot.
|
||||
if changed_detected or not watch['last_checked']:
|
||||
# A change was detected
|
||||
fname = self.datastore.save_history_text(watch_uuid=uuid, contents=contents)
|
||||
# Should always be keyed by string(timestamp)
|
||||
self.datastore.update_watch(uuid, {"history": {str(update_obj["last_checked"]): fname}})
|
||||
self.datastore.update_watch(uuid, {"history": {str(round(time.time())): fname}})
|
||||
|
||||
# Generally update anything interesting returned
|
||||
self.datastore.update_watch(uuid=uuid, update_obj=update_obj)
|
||||
@@ -136,6 +137,11 @@ class update_worker(threading.Thread):
|
||||
# Catch everything possible here, so that if a worker crashes, we don't lose it until restart!
|
||||
print("!!!! Exception in update_worker !!!\n", e)
|
||||
|
||||
finally:
|
||||
# Always record that we at least tried
|
||||
self.datastore.update_watch(uuid=uuid, update_obj={'fetch_time': round(time.time() - now, 3),
|
||||
'last_checked': round(time.time())})
|
||||
|
||||
self.current_uuid = None # Done
|
||||
self.q.task_done()
|
||||
|
||||
|
||||
@@ -43,7 +43,8 @@ services:
|
||||
restart: unless-stopped
|
||||
|
||||
# Used for fetching pages via WebDriver+Chrome where you need Javascript support.
|
||||
# Does not work on rPi, https://github.com/dgtlmoon/changedetection.io/wiki/Fetching-pages-with-WebDriver
|
||||
# Now working on arm64 (needs testing on rPi - tested on Oracle ARM instance)
|
||||
# replace image with seleniarm/standalone-chromium:4.0.0-20211213
|
||||
|
||||
# browser-chrome:
|
||||
# hostname: browser-chrome
|
||||
|
||||
@@ -26,8 +26,11 @@ paho-mqtt
|
||||
# ERROR: Could not build wheels for cryptography which use PEP 517 and cannot be installed directly
|
||||
cryptography ~= 3.4
|
||||
|
||||
# Used for CSS filtering, replace with soupsieve and lxml for xpath
|
||||
# Used for CSS filtering
|
||||
bs4
|
||||
|
||||
# XPath filtering, lxml is required by bs4 anyway, but put it here to be safe.
|
||||
lxml
|
||||
|
||||
# 3.141 was missing socksVersion, 3.150 was not in pypi, so we try 4.1.0
|
||||
selenium ~= 4.1.0
|
||||
|
||||
Reference in New Issue
Block a user