Compare commits

...

20 Commits

Author  SHA1  Message  Date
dgtlmoon  9b036d7b19  Simple UI to see the difference and the two images  2022-02-12 23:48:10 +01:00
dgtlmoon  0761984bcd  tweaks to image diff highlighter  2022-02-12 23:37:02 +01:00
dgtlmoon  e73721a3f0  tweaking  2022-02-12 23:03:31 +01:00
dgtlmoon  86fc9d669f  Basic handler for diff rendering  2022-02-12 22:58:43 +01:00
dgtlmoon  7a66b69158  Some work around diff viewing  2022-02-12 22:48:29 +01:00
dgtlmoon  ddd7b2772d  for now dont bother renaming snapshot  2022-02-12 22:48:15 +01:00
dgtlmoon  305060f79c  Exceptions around saving snapshot were not being tracked  2022-02-12 22:46:50 +01:00
dgtlmoon  cfcf59d009  Switch store filename depending on type  2022-02-12 22:22:14 +01:00
dgtlmoon  af25b824a0  small tidyup  2022-02-12 22:13:53 +01:00
dgtlmoon  a29085fa18  check preview page shows what we expect  2022-02-12 22:13:33 +01:00
dgtlmoon  d7832d735d  Check preview page is working  2022-02-12 22:11:36 +01:00
dgtlmoon  7d1c4d7673  Allow 'trigger text' on JSON docs  2022-02-12 21:53:02 +01:00
dgtlmoon  6e00f0e025  tidy up checksum check ara  2022-02-12 21:46:23 +01:00
dgtlmoon  4f536bb559  Fix json detect bug  2022-02-12 21:40:35 +01:00
dgtlmoon  38d8aa8d28  encode to str/bytes  2022-02-12 18:26:43 +01:00
dgtlmoon  dec47d5c43  trying to resolve json cast issue  2022-02-12 18:25:25 +01:00
dgtlmoon  cec24fe2c1  Check if 'application/json; charset=utf-8'  2022-02-12 18:22:11 +01:00
dgtlmoon  f4bc0aa2ba  Not needed  2022-02-12 18:08:38 +01:00
dgtlmoon  499c4797da  More works and tests  2022-02-12 18:08:18 +01:00
dgtlmoon  9bc71d187e  Split out content type methods  2022-02-12 17:21:25 +01:00
12 changed files with 371 additions and 75 deletions

View File

@@ -695,6 +695,10 @@ def changedetection_app(config=None, datastore_o=None):
@app.route("/diff/<string:uuid>", methods=['GET'])
@login_required
def diff_history_page(uuid):
from changedetectionio import content_fetcher
newest_version_file_contents = ""
previous_version_file_contents = ""
# More for testing, possible to return the first/only
if uuid == 'first':
@@ -720,21 +724,28 @@ def changedetection_app(config=None, datastore_o=None):
# Save the current newest history as the most recently viewed
datastore.set_last_viewed(uuid, dates[0])
newest_file = watch['history'][dates[0]]
with open(newest_file, 'r') as f:
newest_version_file_contents = f.read()
previous_version = request.args.get('previous_version')
try:
previous_file = watch['history'][previous_version]
except KeyError:
# Not present, use a default value, the second one in the sorted list.
previous_file = watch['history'][dates[1]]
if ('content-type' in watch and content_fetcher.supported_binary_type(watch['content-type'])):
template = "diff-image.html"
else:
newest_file = watch['history'][dates[0]]
with open(newest_file, 'r') as f:
newest_version_file_contents = f.read()
with open(previous_file, 'r') as f:
previous_version_file_contents = f.read()
try:
previous_file = watch['history'][previous_version]
except KeyError:
# Not present, use a default value, the second one in the sorted list.
previous_file = watch['history'][dates[1]]
output = render_template("diff.html", watch_a=watch,
with open(previous_file, 'r') as f:
previous_version_file_contents = f.read()
template = "diff.html"
output = render_template(template,
watch_a=watch,
newest=newest_version_file_contents,
previous=previous_version_file_contents,
extra_stylesheets=extra_stylesheets,
@@ -751,6 +762,7 @@ def changedetection_app(config=None, datastore_o=None):
@app.route("/preview/<string:uuid>", methods=['GET'])
@login_required
def preview_page(uuid):
from changedetectionio import content_fetcher
# More for testing, possible to return the first/only
if uuid == 'first':
@@ -765,14 +777,25 @@ def changedetection_app(config=None, datastore_o=None):
return redirect(url_for('index'))
newest = list(watch['history'].keys())[-1]
with open(watch['history'][newest], 'r') as f:
content = f.readlines()
fname = watch['history'][newest]
if ('content-type' in watch and content_fetcher.supported_binary_type(watch['content-type'])):
template = "preview-image.html"
content = fname
else:
template = "preview.html"
try:
with open(fname, 'r') as f:
content = f.read()
except:
content = "Cant read {}".format(fname)
output = render_template("preview.html",
content=content,
extra_stylesheets=extra_stylesheets,
current_diff_url=watch['url'],
uuid=uuid)
uuid=uuid,
watch=watch)
return output
@app.route("/settings/notification-logs", methods=['GET'])
@@ -783,6 +806,50 @@ def changedetection_app(config=None, datastore_o=None):
logs=notification_debug_log if len(notification_debug_log) else ["No errors or warnings detected"])
return output
# render an image which contains the diff of two images
# We always compare the newest against whatever compare_date we are given
@app.route("/diff/show-image/<string:uuid>/<string:datestr>")
def show_single_image(uuid, datestr):
from flask import make_response
watch = datastore.data['watching'][uuid]
if datestr == 'None' or datestr is None:
datestr = list(watch['history'].keys())[0]
fname = watch['history'][datestr]
with open(fname, 'rb') as f:
resp = make_response(f.read())
# @todo assumption here about the type, re-encode? detect?
resp.headers['Content-Type'] = 'image/jpeg'
return resp
# render an image which contains the diff of two images
# We always compare the newest against whatever compare_date we are given
@app.route("/diff/image/<string:uuid>/<string:compare_date>")
def render_diff_image(uuid, compare_date):
from changedetectionio import image_diff
from flask import make_response
watch = datastore.data['watching'][uuid]
newest = list(watch['history'].keys())[-1]
# @todo this is weird
if compare_date == 'None' or compare_date is None:
compare_date = list(watch['history'].keys())[0]
new_img = watch['history'][newest]
prev_img = watch['history'][compare_date]
img = image_diff.render_diff(new_img, prev_img)
resp = make_response(img)
resp.headers['Content-Type'] = 'image/jpeg'
return resp
@app.route("/api/<string:uuid>/snapshot/current", methods=['GET'])
@login_required
def api_snapshot(uuid):
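
Not part of the diff: the new /diff/show-image and /diff/image routes above hard-code image/jpeg in the response, as the @todo notes. A minimal sketch of how the stored content-type could be used instead; serve_snapshot, app and watch_store are illustrative names, not from this PR.

import mimetypes
from flask import Flask, make_response, abort

app = Flask(__name__)
watch_store = {}  # stand-in for datastore.data['watching']: uuid -> {'history': {ts: path}, 'content-type': ...}

@app.route("/sketch/snapshot/<string:uuid>/<string:datestr>")
def serve_snapshot(uuid, datestr):
    watch = watch_store.get(uuid)
    if not watch or datestr not in watch['history']:
        abort(404)
    fname = watch['history'][datestr]
    with open(fname, 'rb') as f:
        resp = make_response(f.read())
    # Prefer the content-type recorded at fetch time, fall back to guessing from the filename
    guessed = watch.get('content-type') or mimetypes.guess_type(fname)[0]
    resp.headers['Content-Type'] = guessed or 'application/octet-stream'
    return resp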

View File

@@ -5,8 +5,9 @@ from selenium import webdriver
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from selenium.webdriver.common.proxy import Proxy as SeleniumProxy
from selenium.common.exceptions import WebDriverException
import urllib3.exceptions
# image/jpeg etc
supported_binary_types = ['image']
class EmptyReply(Exception):
def __init__(self, status_code, url):
@@ -51,6 +52,15 @@ class Fetcher():
# def return_diff(self, stream_a, stream_b):
# return
# Assume we dont support it as binary if its not in our list
def supported_binary_type(content_type):
# Not a binary thing we support? then use text (also used for JSON/XML etc)
# @todo - future - use regex for matching
if content_type and content_type.lower().strip().split('/')[0] not in (string.lower() for string in supported_binary_types):
return False
return True
def available_fetchers():
import inspect
from changedetectionio import content_fetcher
@@ -156,15 +166,18 @@ class html_requests(Fetcher):
verify=False)
# https://stackoverflow.com/questions/44203397/python-requests-get-returns-improperly-decoded-text-instead-of-utf-8
# Return bytes here
html = r.text
if not supported_binary_type(r.headers.get('Content-Type', '')):
content = r.text
else:
content = r.content
# @todo test this
# @todo maybe you really want to test zero-byte return pages?
if not r or not html or not len(html):
if not r or not content or not len(content):
raise EmptyReply(url=url, status_code=r.status_code)
self.status_code = r.status_code
self.content = html
self.content = content
self.headers = r.headers
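
Not part of the diff: a small sketch of the same content-type check with a couple of example calls. Note that supported_binary_type() above falls through to True when the content-type is empty or missing; this variant treats a missing header as "not binary" instead, which may or may not be the intended behaviour.

SUPPORTED_BINARY_MAJOR_TYPES = {'image'}

def is_supported_binary(content_type):
    if not content_type:
        return False
    # Compare only the major type before the '/', case-insensitively
    major_type = content_type.lower().strip().split('/')[0]
    return major_type in SUPPORTED_BINARY_MAJOR_TYPES

print(is_supported_binary('image/jpeg'))                       # True
print(is_supported_binary('application/json; charset=utf-8'))  # False
print(is_supported_binary(''))                                 # False (the helper above would return True here)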

View File

@@ -55,10 +55,13 @@ class perform_site_check():
changed_detected = False
stripped_text_from_html = ""
fetched_md5 = ""
original_content_before_filters = False
watch = self.datastore.data['watching'][uuid]
# Unset any existing notification error
update_obj = {'last_notification_error': False, 'last_error': False}
extra_headers = self.datastore.get_val(uuid, 'headers')
@@ -92,6 +95,7 @@ class perform_site_check():
fetcher = klass()
fetcher.run(url, timeout, request_headers, request_body, request_method)
# Fetching complete, now filters
# @todo move to class / maybe inside of fetcher abstract base?
@@ -101,26 +105,39 @@ class perform_site_check():
# - Do we convert to JSON?
# https://stackoverflow.com/questions/41817578/basic-method-chaining ?
# return content().textfilter().jsonextract().checksumcompare() ?
is_json = fetcher.headers.get('Content-Type', '') == 'application/json'
is_html = not is_json
update_obj['content-type'] = fetcher.headers.get('Content-Type', '').lower().strip()
# Could be 'application/json; charset=utf-8' etc
is_json = 'application/json' in update_obj['content-type']
is_text_or_html = 'text/' in update_obj['content-type'] # text/plain , text/html etc
is_binary = not is_text_or_html and content_fetcher.supported_binary_type(update_obj['content-type'])
css_filter_rule = watch['css_filter']
has_filter_rule = css_filter_rule and len(css_filter_rule.strip())
# Auto-detect application/json, make it reformat the JSON to something nice
if is_json and not has_filter_rule:
css_filter_rule = "json:$"
has_filter_rule = True
if has_filter_rule:
if 'json:' in css_filter_rule:
stripped_text_from_html = html_tools.extract_json_as_string(content=fetcher.content, jsonpath_filter=css_filter_rule)
is_html = False
##### CONVERT THE INPUT TO TEXT, EXTRACT THE PARTS THAT NEED TO BE FILTERED
if is_html:
# Dont depend on the content-type header here, maybe it's not present
if 'json:' in css_filter_rule:
is_json = True
rule = css_filter_rule.replace('json:', '')
stripped_text_from_html = html_tools.extract_json_as_string(content=fetcher.content,
jsonpath_filter=rule).encode('utf-8')
is_text_or_html = False
original_content_before_filters = stripped_text_from_html
if is_text_or_html:
# CSS Filter, extract the HTML that matches and feed that into the existing inscriptis::get_text
html_content = fetcher.content
if not fetcher.headers.get('Content-Type', '') == 'text/plain':
if 'text/plain' in update_obj['content-type']:
stripped_text_from_html = html_content
# Assume it's HTML if it's not text/plain
if not 'text/plain' in update_obj['content-type']:
if has_filter_rule:
# For HTML/XML we offer xpath as an option, just start a regular xPath "/.."
if css_filter_rule[0] == '/':
@@ -128,35 +145,52 @@ class perform_site_check():
else:
# CSS Filter, extract the HTML that matches and feed that into the existing inscriptis::get_text
html_content = html_tools.css_filter(css_filter=css_filter_rule, html_content=fetcher.content)
# get_text() via inscriptis
stripped_text_from_html = get_text(html_content)
else:
# Don't run get_text or xpath/css filters on plaintext
stripped_text_from_html = html_content
# Re #340 - return the content before the 'ignore text' was applied
text_content_before_ignored_filter = stripped_text_from_html.encode('utf-8')
# Extract title as title
if self.datastore.data['settings']['application']['extract_title_as_title'] or watch['extract_title_as_title']:
if not watch['title'] or not len(watch['title']):
update_obj['title'] = html_tools.extract_element(find='title', html_content=fetcher.content)
# Re #340 - return the content before the 'ignore text' was applied
original_content_before_filters = stripped_text_from_html.encode('utf-8')
# We rely on the actual text in the html output.. many sites have random script vars etc,
# in the future we'll implement other mechanisms.
update_obj["last_check_status"] = fetcher.get_last_status_code()
# If there's text to skip
# @todo we could abstract out the get_text() to handle this cleaner
text_to_ignore = watch.get('ignore_text', []) + self.datastore.data['settings']['application'].get('global_ignore_text', [])
if len(text_to_ignore):
stripped_text_from_html = self.strip_ignore_text(stripped_text_from_html, text_to_ignore)
else:
stripped_text_from_html = stripped_text_from_html.encode('utf8')
######## AFTER FILTERING, STRIP OUT IGNORE TEXT
if is_text_or_html:
text_to_ignore = watch.get('ignore_text', []) + self.datastore.data['settings']['application'].get('global_ignore_text', [])
if len(text_to_ignore):
stripped_text_from_html = self.strip_ignore_text(stripped_text_from_html, text_to_ignore)
else:
stripped_text_from_html = stripped_text_from_html.encode('utf8')
######## CALCULATE CHECKSUM FOR DIFF DETECTION
# Re #133 - if we should strip whitespaces from triggering the change detected comparison
if self.datastore.data['settings']['application'].get('ignore_whitespace', False):
fetched_md5 = hashlib.md5(stripped_text_from_html.translate(None, b'\r\n\t ')).hexdigest()
else:
if is_text_or_html:
if self.datastore.data['settings']['application'].get('ignore_whitespace', False):
fetched_md5 = hashlib.md5(stripped_text_from_html.translate(None, b'\r\n\t ')).hexdigest()
else:
fetched_md5 = hashlib.md5(stripped_text_from_html).hexdigest()
if is_json:
fetched_md5 = hashlib.md5(stripped_text_from_html).hexdigest()
# Goal here in the future is to be able to abstract out different content type checks into their own class
if is_binary:
# @todo - use some actual image hash here where possible, audio hash, etc etc
m = hashlib.sha256()
m.update(fetcher.content)
fetched_md5 = m.hexdigest()
original_content_before_filters = fetcher.content
# On the first run of a site, watch['previous_md5'] will be an empty string, set it the current one.
if not len(watch['previous_md5']):
watch['previous_md5'] = fetched_md5
@@ -164,36 +198,30 @@ class perform_site_check():
blocked_by_not_found_trigger_text = False
if len(watch['trigger_text']):
blocked_by_not_found_trigger_text = True
for line in watch['trigger_text']:
# Because JSON wont serialize a re.compile object
if line[0] == '/' and line[-1] == '/':
regex = re.compile(line.strip('/'), re.IGNORECASE)
# Found it? so we don't wait for it anymore
r = re.search(regex, str(stripped_text_from_html))
if r:
# Trigger text can apply to JSON parsed documents too
if is_text_or_html or is_json:
if len(watch['trigger_text']):
blocked_by_not_found_trigger_text = True
for line in watch['trigger_text']:
# Because JSON wont serialize a re.compile object
if line[0] == '/' and line[-1] == '/':
regex = re.compile(line.strip('/'), re.IGNORECASE)
# Found it? so we don't wait for it anymore
r = re.search(regex, str(stripped_text_from_html))
if r:
blocked_by_not_found_trigger_text = False
break
elif line.lower() in str(stripped_text_from_html).lower():
# We found it don't wait for it.
blocked_by_not_found_trigger_text = False
break
elif line.lower() in str(stripped_text_from_html).lower():
# We found it don't wait for it.
blocked_by_not_found_trigger_text = False
break
if not blocked_by_not_found_trigger_text and watch['previous_md5'] != fetched_md5:
changed_detected = True
update_obj["previous_md5"] = fetched_md5
update_obj["last_changed"] = timestamp
# Extract title as title
if is_html:
if self.datastore.data['settings']['application']['extract_title_as_title'] or watch['extract_title_as_title']:
if not watch['title'] or not len(watch['title']):
update_obj['title'] = html_tools.extract_element(find='title', html_content=fetcher.content)
return changed_detected, update_obj, text_content_before_ignored_filter
# original_content_before_filters is returned for saving the data to disk
return changed_detected, update_obj, original_content_before_filters
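
Not part of the diff: a condensed sketch of the per-content-type checksum logic above. md5 over the filtered text (optionally with whitespace stripped) covers text/HTML and JSON, while binary content gets a sha256 over the raw bytes until a perceptual image hash replaces it, as the @todo suggests. Function and argument names are illustrative.

import hashlib

def snapshot_checksum(content, is_binary=False, ignore_whitespace=False):
    if is_binary:
        # Raw bytes (e.g. fetcher.content for an image): plain sha256 for now
        return hashlib.sha256(content).hexdigest()
    if isinstance(content, str):
        content = content.encode('utf-8')
    if ignore_whitespace:
        content = content.translate(None, b'\r\n\t ')
    return hashlib.md5(content).hexdigest()

# Whitespace-only changes produce the same checksum when ignore_whitespace is set
print(snapshot_checksum("same  text\n", ignore_whitespace=True) ==
      snapshot_checksum("sametext", ignore_whitespace=True))   # True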

View File

@@ -0,0 +1,41 @@
# import the necessary packages
from skimage.metrics import structural_similarity as compare_ssim
import argparse
import imutils
import cv2
# From https://www.pyimagesearch.com/2017/06/19/image-difference-with-opencv-and-python/
def render_diff(fpath_imageA, fpath_imageB):
imageA = cv2.imread(fpath_imageA)
imageB = cv2.imread(fpath_imageB)
# convert the images to grayscale
grayA = cv2.cvtColor(imageA, cv2.COLOR_BGR2GRAY)
grayB = cv2.cvtColor(imageB, cv2.COLOR_BGR2GRAY)
# compute the Structural Similarity Index (SSIM) between the two
# images, ensuring that the difference image is returned
(score, diff) = compare_ssim(grayA, grayB, full=True)
diff = (diff * 255).astype("uint8")
print("SSIM: {}".format(score))
# threshold the difference image, followed by finding contours to
# obtain the regions of the two input images that differ
thresh = cv2.threshold(diff, 0, 255,
cv2.THRESH_BINARY_INV | cv2.THRESH_OTSU)[1]
cnts = cv2.findContours(thresh.copy(), cv2.RETR_EXTERNAL,
cv2.CHAIN_APPROX_SIMPLE)
cnts = imutils.grab_contours(cnts)
# loop over the contours
for c in cnts:
# compute the bounding box of the contour and then draw the
# bounding box on both input images to represent where the two
# images differ
(x, y, w, h) = cv2.boundingRect(c)
cv2.rectangle(imageA, (x, y), (x + w, y + h), (0, 0, 255), 2)
cv2.rectangle(imageB, (x, y), (x + w, y + h), (0, 0, 255), 2)
#return cv2.imencode('.jpg', imageB)[1].tobytes()
return cv2.imencode('.jpg', imageA)[1].tobytes()
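
Not part of the diff: example usage of render_diff(), assuming opencv-python, scikit-image and imutils are installed and that a.jpg and b.jpg are two snapshots of the same page. Note that compare_ssim() requires both grayscale images to have identical dimensions, so snapshots of different sizes would need resizing first.

from changedetectionio import image_diff

# JPEG bytes of the older image with changed regions boxed in red
highlighted = image_diff.render_diff('a.jpg', 'b.jpg')
with open('diff-highlighted.jpg', 'wb') as f:
    f.write(highlighted)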

View File

@@ -372,7 +372,9 @@ class ChangeDetectionStore:
if not os.path.isdir(output_path):
mkdir(output_path)
fname = "{}/{}.stripped.txt".format(output_path, uuid.uuid4())
suffix = "stripped.txt"
fname = "{}/{}.{}".format(output_path, uuid.uuid4(), suffix)
with open(fname, 'wb') as f:
f.write(contents)
f.close()
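
Not part of the diff: the hunk above introduces a suffix variable but still hard-codes "stripped.txt", while the commit message ("Switch store filename depending on type") suggests the suffix is meant to vary. A guess at how that might look; the image suffix name is hypothetical, not from this PR.

import uuid

def snapshot_filename(output_path, content_type=''):
    if content_type and content_type.lower().strip().startswith('image/'):
        suffix = "img.jpg"        # hypothetical suffix for image snapshots
    else:
        suffix = "stripped.txt"
    return "{}/{}.{}".format(output_path, uuid.uuid4(), suffix)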

View File

@@ -0,0 +1,59 @@
{% extends 'base.html' %}
{% block content %}
<div id="settings">
<h1>Differences</h1>
<form class="pure-form " action="" method="GET">
<fieldset>
{% if versions|length >= 1 %}
<label for="diff-version">Compare newest (<span id="current-v-date"></span>) with</label>
<select id="diff-version" name="previous_version">
{% for version in versions %}
<option value="{{version}}" {% if version== current_previous_version %} selected="" {% endif %}>
{{version}}
</option>
{% endfor %}
</select>
<button type="submit" class="pure-button pure-button-primary">Go</button>
{% endif %}
</fieldset>
</form>
</div>
<div id="diff-ui">
<img style="max-width: 100%" src="{{ url_for('render_diff_image', uuid=uuid, compare_date=current_previous_version) }}" />
<div>
<span style="width: 50%">
<img style="max-width: 100%" src="{{ url_for('show_single_image', uuid=uuid, datestr=newest_version_timestamp) }}" />
</span>
<span style="width: 50%">
<img style="max-width: 100%" src="{{ url_for('show_single_image', uuid=uuid, datestr=current_previous_version) }}" />
</span>
</div>
</div>
<script type="text/javascript" src="{{url_for('static_content', group='js', filename='diff.js')}}"></script>
<script defer="">
window.onload = function() {
/* Set current version date as local time in the browser also */
var current_v = document.getElementById("current-v-date");
var dateObject = new Date({{ newest_version_timestamp }}*1000);
current_v.innerHTML=dateObject.toLocaleString();
/* Convert what is options from UTC time.time() to local browser time */
var diffList=document.getElementById("diff-version");
if (typeof(diffList) != 'undefined' && diffList != null) {
for (var option of diffList.options) {
var dateObject = new Date(option.value*1000);
option.label=dateObject.toLocaleString();
}
}
}
</script>
{% endblock %}

View File

@@ -0,0 +1,13 @@
{% extends 'base.html' %}
{% block content %}
<div id="settings">
<h1>Current</h1>
</div>
<div id="diff-ui">
image goes here
</div>
{% endblock %}

View File

@@ -11,7 +11,7 @@
<tbody>
<tr>
<td id="diff-col">
<span id="result">{% for row in content %}{{row}}{% endfor %}</span>
<span id="result">{{content}}</span>
</td>
</tr>
</tbody>

View File

@@ -100,6 +100,14 @@ def test_check_basic_change_detection_functionality(client, live_server):
# It should have picked up the <title>
assert b'head title' in res.data
# be sure the HTML converter worked
res = client.get(url_for("preview_page", uuid="first"))
assert b'<html>' not in res.data
res = client.get(url_for("preview_page", uuid="first"))
assert b'Some initial text' in res.data
#
# Cleanup everything
res = client.get(url_for("api_delete", uuid="all"), follow_redirects=True)

View File

@@ -0,0 +1,56 @@
#!/usr/bin/python3
import time
import secrets
from flask import url_for
from . util import live_server_setup
def test_binary_file_change(client, live_server):
with open("test-datastore/test.bin", "wb") as f:
f.write(secrets.token_bytes())
live_server_setup(live_server)
sleep_time_for_fetch_thread = 3
# Give the endpoint time to spin up
time.sleep(1)
# Add our URL to the import page
test_url = url_for('test_binaryfile_endpoint', _external=True)
res = client.post(
url_for("import_page"),
data={"urls": test_url},
follow_redirects=True
)
assert b"1 Imported" in res.data
# Trigger a check
client.get(url_for("api_watch_checknow"), follow_redirects=True)
# Give the thread time to pick it up
time.sleep(sleep_time_for_fetch_thread)
# Trigger a check
client.get(url_for("api_watch_checknow"), follow_redirects=True)
# It should report nothing found (no new 'unviewed' class)
res = client.get(url_for("index"))
assert b'unviewed' not in res.data
assert b'/test-binary-endpoint' in res.data
# Make a change
with open("test-datastore/test.bin", "wb") as f:
f.write(secrets.token_bytes())
# Trigger a check
client.get(url_for("api_watch_checknow"), follow_redirects=True)
# Give the thread time to pick it up
time.sleep(sleep_time_for_fetch_thread)
# It should report nothing found (no new 'unviewed' class)
res = client.get(url_for("index"))
assert b'unviewed' in res.data

View File

@@ -37,6 +37,16 @@ def set_modified_response():
def live_server_setup(live_server):
@live_server.app.route('/test-binary-endpoint')
def test_binaryfile_endpoint():
from flask import make_response
# Tried using a global var here but didn't seem to work, so reading from a file instead.
with open("test-datastore/test.bin", "rb") as f:
resp = make_response(f.read())
resp.headers['Content-Type'] = 'image/jpeg'
return resp
@live_server.app.route('/test-endpoint')
def test_endpoint():

View File

@@ -42,7 +42,6 @@ class update_worker(threading.Thread):
now = time.time()
try:
changed_detected, update_obj, contents = update_handler.run(uuid)
# Re #342
@@ -135,8 +134,8 @@ class update_worker(threading.Thread):
except Exception as e:
# Catch everything possible here, so that if a worker crashes, we don't lose it until restart!
print("!!!! Exception in update_worker !!!\n", e)
self.app.logger.error("Exception reached processing watch UUID: %s - %s", uuid, str(e))
self.datastore.update_watch(uuid=uuid, update_obj={'last_error': str(e)})
finally:
# Always record that we atleast tried
self.datastore.update_watch(uuid=uuid, update_obj={'fetch_time': round(time.time() - now, 3),