mirror of
https://github.com/dgtlmoon/changedetection.io.git
synced 2025-11-06 17:46:06 +00:00
Compare commits
32 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
5e31ae86d0 | ||
|
|
ef2dd44e7e | ||
|
|
07f41782c0 | ||
|
|
d93926a8b6 | ||
|
|
7072858814 | ||
|
|
cd5c05e72a | ||
|
|
3034d17c06 | ||
|
|
3b2c8d356a | ||
|
|
711853a149 | ||
|
|
5669ae70cc | ||
|
|
084dcde410 | ||
|
|
37b070f5a0 | ||
|
|
3952f3a207 | ||
|
|
0c3d5e55ab | ||
|
|
6a102374c6 | ||
|
|
bbd99c9aa9 | ||
|
|
26c9a6e0fc | ||
|
|
c4197a5045 | ||
|
|
f1c2ece32f | ||
|
|
704b8daa6d | ||
|
|
9ec820fa97 | ||
|
|
e7e3eb36c0 | ||
|
|
801b50cb5b | ||
|
|
eecc620386 | ||
|
|
25b565d9ba | ||
|
|
7b4ed2429d | ||
|
|
4e0fb33580 | ||
|
|
4931e757b9 | ||
|
|
3e934e8f8c | ||
|
|
118814912f | ||
|
|
4013e34899 | ||
|
|
b58cf76445 |
@@ -9,6 +9,13 @@ WORKDIR /app
|
||||
|
||||
ENV PYTHONUNBUFFERED=1
|
||||
|
||||
# Attempt to store the triggered commit
|
||||
|
||||
ARG SOURCE_COMMIT
|
||||
ARG SOURCE_BRANCH
|
||||
RUN echo "commit: $SOURCE_COMMIT branch: $SOURCE_BRANCH" >/source.txt
|
||||
|
||||
|
||||
RUN [ ! -d "/datastore" ] && mkdir /datastore
|
||||
|
||||
CMD [ "python", "./backend.py" ]
|
||||
|
||||
24
README.md
24
README.md
@@ -16,18 +16,34 @@ Know when ...
|
||||
- Real estate listing changes
|
||||
|
||||
|
||||
Get monitoring now! super simple, one command!
|
||||
**Get monitoring now! Super simple, one command!**
|
||||
|
||||
```
|
||||
$ docker run -d --restart always -p "127.0.0.1:5000:5000" -v datastore-volume:/datastore dgtlmoon/changedetection.io
|
||||
```bash
|
||||
docker run -d --restart always -p "127.0.0.1:5000:5000" -v datastore-volume:/datastore --name changedetection.io dgtlmoon/changedetection.io
|
||||
```
|
||||
|
||||
Now visit http://127.0.0.1:5000 and you should be able to access the UI.
|
||||
|
||||
#### Updating to latest version
|
||||
|
||||
Highly recommended :)
|
||||
|
||||
```bash
|
||||
docker pull dgtlmoon/changedetection.io
|
||||
docker kill $(docker ps -a|grep changedetection.io|awk '{print $1}')
|
||||
docker rm $(docker ps -a|grep changedetection.io|awk '{print $1}')
|
||||
docker run -d --restart always -p "127.0.0.1:5000:5000" -v datastore-volume:/datastore --name changedetection.io dgtlmoon/changedetection.io
|
||||
```
|
||||
|
||||
### Screenshots
|
||||
|
||||

|
||||
Application running.
|
||||
|
||||

|
||||
|
||||
Examining differences in content.
|
||||
|
||||

|
||||
|
||||
### Future plans
|
||||
|
||||
|
||||
@@ -25,22 +25,27 @@ import datetime
|
||||
import timeago
|
||||
|
||||
import threading
|
||||
import queue
|
||||
|
||||
|
||||
from flask import Flask, render_template, request, send_file, send_from_directory, safe_join, abort, redirect, url_for
|
||||
|
||||
|
||||
# Local
|
||||
import store
|
||||
import fetch_site_status
|
||||
|
||||
running_update_threads = []
|
||||
ticker_thread = None
|
||||
|
||||
datastore = store.ChangeDetectionStore()
|
||||
messages = []
|
||||
extra_stylesheets = []
|
||||
running_update_threads = {}
|
||||
|
||||
update_q = queue.Queue()
|
||||
|
||||
|
||||
app = Flask(__name__, static_url_path='/static')
|
||||
app.config['STATIC_RESOURCES'] = "/app/static"
|
||||
app.config['SEND_FILE_MAX_AGE_DEFAULT'] = 0
|
||||
|
||||
# app.config['SECRET_KEY'] = 'secret!'
|
||||
|
||||
@@ -52,9 +57,9 @@ app.config['TEMPLATES_AUTO_RELOAD'] = True
|
||||
# running or something similar.
|
||||
@app.template_filter('format_last_checked_time')
|
||||
def _jinja2_filter_datetime(watch_obj, format="%Y-%m-%d %H:%M:%S"):
|
||||
global running_update_threads
|
||||
if watch_obj['uuid'] in running_update_threads:
|
||||
if running_update_threads[watch_obj['uuid']].is_alive():
|
||||
# Worker thread tells us which UUID it is currently processing.
|
||||
for t in running_update_threads:
|
||||
if t.current_uuid == watch_obj['uuid']:
|
||||
return "Checking now.."
|
||||
|
||||
if watch_obj['last_checked'] == 0:
|
||||
@@ -112,13 +117,70 @@ def main_page():
|
||||
messages = []
|
||||
return output
|
||||
|
||||
@app.route("/scrub", methods=['GET', 'POST'])
|
||||
def scrub_page():
|
||||
from pathlib import Path
|
||||
|
||||
@app.route("/edit", methods=['GET'])
|
||||
def edit_page():
|
||||
global messages
|
||||
|
||||
uuid = request.args.get('uuid')
|
||||
output = render_template("edit.html", uuid=uuid, watch=datastore.data['watching'][uuid], messages=messages)
|
||||
if request.method == 'POST':
|
||||
confirmtext = request.form.get('confirmtext')
|
||||
|
||||
if confirmtext == 'scrub':
|
||||
|
||||
for txt_file_path in Path('/datastore').rglob('*.txt'):
|
||||
os.unlink(txt_file_path)
|
||||
|
||||
for uuid, watch in datastore.data['watching'].items():
|
||||
watch['last_checked'] = 0
|
||||
watch['last_changed'] = 0
|
||||
watch['previous_md5'] = None
|
||||
watch['history'] = {}
|
||||
|
||||
datastore.needs_write = True
|
||||
messages.append({'class': 'ok', 'message': 'Cleaned all version history.'})
|
||||
else:
|
||||
messages.append({'class': 'error', 'message': 'Wrong confirm text.'})
|
||||
|
||||
return redirect(url_for('main_page'))
|
||||
|
||||
return render_template("scrub.html")
|
||||
|
||||
|
||||
@app.route("/edit", methods=['GET', 'POST'])
|
||||
def edit_page():
|
||||
global messages
|
||||
import validators
|
||||
|
||||
if request.method == 'POST':
|
||||
uuid = request.args.get('uuid')
|
||||
|
||||
url = request.form.get('url').strip()
|
||||
tag = request.form.get('tag').strip()
|
||||
|
||||
form_headers = request.form.get('headers').strip().split("\n")
|
||||
extra_headers = {}
|
||||
if form_headers:
|
||||
for header in form_headers:
|
||||
if len(header):
|
||||
parts = header.split(':', 1)
|
||||
extra_headers.update({parts[0].strip(): parts[1].strip()})
|
||||
|
||||
validators.url(url) # @todo switch to prop/attr/observer
|
||||
datastore.data['watching'][uuid].update({'url': url,
|
||||
'tag': tag,
|
||||
'headers': extra_headers})
|
||||
datastore.needs_write = True
|
||||
|
||||
messages.append({'class': 'ok', 'message': 'Updated watch.'})
|
||||
|
||||
return redirect(url_for('main_page'))
|
||||
|
||||
else:
|
||||
|
||||
uuid = request.args.get('uuid')
|
||||
output = render_template("edit.html", uuid=uuid, watch=datastore.data['watching'][uuid], messages=messages)
|
||||
|
||||
return output
|
||||
|
||||
|
||||
@@ -166,8 +228,6 @@ def import_page():
|
||||
|
||||
messages.append({'class': 'ok', 'message': "{} Imported, {} Skipped.".format(good, len(remaining_urls))})
|
||||
|
||||
launch_checks()
|
||||
|
||||
output = render_template("import.html",
|
||||
messages=messages,
|
||||
remaining="\n".join(remaining_urls)
|
||||
@@ -179,26 +239,42 @@ def import_page():
|
||||
@app.route("/diff/<string:uuid>", methods=['GET'])
|
||||
def diff_history_page(uuid):
|
||||
global messages
|
||||
global extra_stylesheets
|
||||
extra_stylesheets.append('/static/css/diff.css')
|
||||
|
||||
extra_stylesheets=['/static/css/diff.css']
|
||||
|
||||
watch = datastore.data['watching'][uuid]
|
||||
|
||||
dates = list(watch['history'].keys())
|
||||
# Convert to int, sort and back to str again
|
||||
dates = [int(i) for i in dates]
|
||||
dates.sort(reverse=True)
|
||||
dates = [str(i) for i in dates]
|
||||
|
||||
left_file_contents = right_file_contents = ""
|
||||
l_file = watch['history'][str(dates[-1])]
|
||||
with open(l_file, 'r') as f:
|
||||
left_file_contents = f.read()
|
||||
newest_file = watch['history'][dates[0]]
|
||||
with open(newest_file, 'r') as f:
|
||||
newest_version_file_contents = f.read()
|
||||
|
||||
r_file = watch['history'][str(dates[-2])]
|
||||
with open(r_file, 'r') as f:
|
||||
right_file_contents = f.read()
|
||||
previous_version = request.args.get('previous_version')
|
||||
|
||||
try:
|
||||
previous_file = watch['history'][previous_version]
|
||||
except KeyError:
|
||||
# Not present, use a default value, the second one in the sorted list.
|
||||
previous_file = watch['history'][dates[1]]
|
||||
|
||||
with open(previous_file, 'r') as f:
|
||||
previous_version_file_contents = f.read()
|
||||
|
||||
output = render_template("diff.html", watch_a=watch,
|
||||
messages=messages,
|
||||
newest=newest_version_file_contents,
|
||||
previous=previous_version_file_contents,
|
||||
extra_stylesheets=extra_stylesheets,
|
||||
versions=dates[1:],
|
||||
newest_version_timestamp=dates[0],
|
||||
current_previous_version=str(previous_version),
|
||||
current_diff_url=watch['url'])
|
||||
|
||||
output = render_template("diff.html", watch_a=watch, messages=messages, left=left_file_contents,
|
||||
right=right_file_contents, extra_stylesheets=extra_stylesheets)
|
||||
return output
|
||||
|
||||
@app.route("/favicon.ico", methods=['GET'])
|
||||
@@ -206,6 +282,54 @@ def favicon():
|
||||
return send_from_directory("/app/static/images", filename="favicon.ico")
|
||||
|
||||
|
||||
# We're good but backups are even better!
|
||||
@app.route("/backup", methods=['GET'])
|
||||
def get_backup():
|
||||
import zipfile
|
||||
from pathlib import Path
|
||||
import zlib
|
||||
|
||||
# create a ZipFile object
|
||||
backupname = "changedetection-backup-{}.zip".format(int(time.time()))
|
||||
|
||||
# We only care about UUIDS from the current index file
|
||||
uuids = list(datastore.data['watching'].keys())
|
||||
|
||||
with zipfile.ZipFile(os.path.join("/datastore", backupname), 'w', compression=zipfile.ZIP_DEFLATED,
|
||||
compresslevel=6) as zipObj:
|
||||
|
||||
# Be sure we're written fresh
|
||||
datastore.sync_to_json()
|
||||
|
||||
# Add the index
|
||||
zipObj.write(os.path.join("/datastore", "url-watches.json"))
|
||||
# Add any snapshot data we find
|
||||
for txt_file_path in Path('/datastore').rglob('*.txt'):
|
||||
parent_p = txt_file_path.parent
|
||||
if parent_p.name in uuids:
|
||||
zipObj.write(txt_file_path)
|
||||
|
||||
return send_file(os.path.join("/datastore", backupname),
|
||||
as_attachment=True,
|
||||
mimetype="application/zip",
|
||||
attachment_filename=backupname)
|
||||
|
||||
|
||||
|
||||
# A few self sanity checks, mostly for developer/bug check
|
||||
@app.route("/self-check", methods=['GET'])
|
||||
def selfcheck():
|
||||
output = "All fine"
|
||||
# In earlier versions before a single threaded write of the JSON store, sometimes histories could get mixed.
|
||||
# Could also maybe affect people who manually fiddle with their JSON store?
|
||||
for uuid, watch in datastore.data['watching'].items():
|
||||
for timestamp, path in watch['history'].items():
|
||||
# Each history snapshot should include a full path, which contains the {uuid}
|
||||
if not uuid in path:
|
||||
output = "Something weird in {}, suspected incorrect snapshot path.".format(uuid)
|
||||
|
||||
return output
|
||||
|
||||
@app.route("/static/<string:group>/<string:filename>", methods=['GET'])
|
||||
def static_content(group, filename):
|
||||
try:
|
||||
@@ -219,9 +343,11 @@ def api_watch_add():
|
||||
global messages
|
||||
|
||||
# @todo add_watch should throw a custom Exception for validation etc
|
||||
datastore.add_watch(url=request.form.get('url').strip(), tag=request.form.get('tag').strip())
|
||||
new_uuid = datastore.add_watch(url=request.form.get('url').strip(), tag=request.form.get('tag').strip())
|
||||
# Straight into the queue.
|
||||
update_q.put(new_uuid)
|
||||
|
||||
messages.append({'class': 'ok', 'message': 'Watch added.'})
|
||||
launch_checks()
|
||||
return redirect(url_for('main_page'))
|
||||
|
||||
|
||||
@@ -235,94 +361,91 @@ def api_delete():
|
||||
return redirect(url_for('main_page'))
|
||||
|
||||
|
||||
@app.route("/api/update", methods=['POST'])
|
||||
def api_update():
|
||||
global messages
|
||||
import validators
|
||||
|
||||
uuid = request.args.get('uuid')
|
||||
|
||||
url = request.form.get('url').strip()
|
||||
tag = request.form.get('tag').strip()
|
||||
|
||||
form_headers = request.form.get('headers').strip().split("\n")
|
||||
extra_headers = {}
|
||||
if form_headers:
|
||||
for header in form_headers:
|
||||
if len(header):
|
||||
parts = header.split(':', 1)
|
||||
extra_headers.update({parts[0].strip(): parts[1].strip()})
|
||||
|
||||
|
||||
|
||||
validators.url(url) #@todo switch to prop/attr/observer
|
||||
datastore.data['watching'][uuid].update({'url': url,
|
||||
'tag': tag,
|
||||
'headers':extra_headers})
|
||||
datastore.needs_write = True
|
||||
|
||||
messages.append({'class': 'ok', 'message': 'Updated watch.'})
|
||||
|
||||
return redirect(url_for('main_page'))
|
||||
|
||||
@app.route("/api/checknow", methods=['GET'])
|
||||
def api_watch_checknow():
|
||||
global messages
|
||||
|
||||
tag = request.args.get('tag')
|
||||
uuid = request.args.get('uuid')
|
||||
i=0
|
||||
|
||||
running_update_threads[uuid] = fetch_site_status.perform_site_check(uuid=uuid,
|
||||
datastore=datastore)
|
||||
running_update_threads[uuid].start()
|
||||
if uuid:
|
||||
update_q.put(uuid)
|
||||
i = 1
|
||||
|
||||
return redirect(url_for('main_page'))
|
||||
elif tag != None:
|
||||
for watch_uuid, watch in datastore.data['watching'].items():
|
||||
if (tag != None and tag in watch['tag']):
|
||||
i += 1
|
||||
update_q.put(watch_uuid)
|
||||
else:
|
||||
# No tag, no uuid, add everything.
|
||||
for watch_uuid, watch in datastore.data['watching'].items():
|
||||
i += 1
|
||||
update_q.put(watch_uuid)
|
||||
|
||||
messages.append({'class': 'ok', 'message': "{} watches are rechecking.".format(i)})
|
||||
return redirect(url_for('main_page', tag=tag))
|
||||
|
||||
|
||||
@app.route("/api/recheckall", methods=['GET'])
|
||||
def api_watch_recheckall():
|
||||
import fetch_site_status
|
||||
|
||||
global running_update_threads
|
||||
i = 0
|
||||
for uuid, watch in datastore.data['watching'].items():
|
||||
i = i + 1
|
||||
# Requests for checking on the site use a pool of thread Workers managed by a Queue.
|
||||
class Worker(threading.Thread):
|
||||
|
||||
running_update_threads[watch['uuid']] = fetch_site_status.perform_site_check(uuid=uuid,
|
||||
datastore=datastore)
|
||||
running_update_threads[watch['uuid']].start()
|
||||
current_uuid = None
|
||||
|
||||
return "{} triggered recheck of {} watches.".format(i, len(datastore.data['watching']))
|
||||
def __init__(self, q, *args, **kwargs):
|
||||
self.q = q
|
||||
super().__init__(*args, **kwargs)
|
||||
|
||||
def run(self):
|
||||
import fetch_site_status
|
||||
|
||||
# Can be used whenever, launch threads that need launching to update the stored information
|
||||
def launch_checks():
|
||||
import fetch_site_status
|
||||
global running_update_threads
|
||||
try:
|
||||
while True:
|
||||
uuid = self.q.get() # Blocking
|
||||
self.current_uuid = uuid
|
||||
|
||||
if uuid in list(datastore.data['watching'].keys()):
|
||||
update_handler = fetch_site_status.perform_site_check(uuid=uuid, datastore=datastore)
|
||||
datastore.update_watch(uuid=uuid, update_obj=update_handler.update_data)
|
||||
|
||||
minutes = datastore.data['settings']['requests']['minutes_between_check']
|
||||
for uuid, watch in datastore.data['watching'].items():
|
||||
self.current_uuid = None # Done
|
||||
self.q.task_done()
|
||||
|
||||
except KeyboardInterrupt:
|
||||
return
|
||||
|
||||
if watch['last_checked'] <= time.time() - (minutes * 60):
|
||||
running_update_threads[watch['uuid']] = fetch_site_status.perform_site_check(uuid=uuid,
|
||||
datastore=datastore)
|
||||
running_update_threads[watch['uuid']].start()
|
||||
|
||||
|
||||
# Thread runner to check every minute
|
||||
# Thread runner to check every minute, look for new watches to feed into the Queue.
|
||||
def ticker_thread_check_time_launch_checks():
|
||||
|
||||
# Spin up Workers.
|
||||
for _ in range(datastore.data['settings']['requests']['workers']):
|
||||
new_worker = Worker(update_q)
|
||||
running_update_threads.append(new_worker)
|
||||
new_worker.start()
|
||||
|
||||
# Every minute check for new UUIDs to follow up on
|
||||
while True:
|
||||
launch_checks()
|
||||
minutes = datastore.data['settings']['requests']['minutes_between_check']
|
||||
for uuid, watch in datastore.data['watching'].items():
|
||||
if watch['last_checked'] <= time.time() - (minutes * 60):
|
||||
update_q.put(uuid)
|
||||
|
||||
time.sleep(60)
|
||||
|
||||
|
||||
# Thread runner, this helps with thread/write issues when there are many operations that want to update the JSON
|
||||
# by just running periodically in one thread.
|
||||
# by just running periodically in one thread, according to python, dict updates are threadsafe.
|
||||
def save_datastore():
|
||||
while True:
|
||||
if datastore.needs_write:
|
||||
datastore.sync_to_json()
|
||||
time.sleep(5)
|
||||
try:
|
||||
while True:
|
||||
if datastore.needs_write:
|
||||
datastore.sync_to_json()
|
||||
time.sleep(5)
|
||||
|
||||
except KeyboardInterrupt:
|
||||
return
|
||||
|
||||
def main(argv):
|
||||
ssl_mode = False
|
||||
@@ -348,6 +471,7 @@ def main(argv):
|
||||
|
||||
# @todo handle ctrl break
|
||||
ticker_thread = threading.Thread(target=ticker_thread_check_time_launch_checks).start()
|
||||
|
||||
save_data_thread = threading.Thread(target=save_datastore).start()
|
||||
|
||||
# @todo finalise SSL config, but this should get you in the right direction if you need it.
|
||||
|
||||
@@ -3,8 +3,8 @@ FROM python:3.8-slim
|
||||
# https://stackoverflow.com/questions/58701233/docker-logs-erroneously-appears-empty-until-container-stops
|
||||
ENV PYTHONUNBUFFERED=1
|
||||
|
||||
COPY requirements.txt /tmp/requirements.txt
|
||||
RUN pip3 install -r /tmp/requirements.txt
|
||||
# Should be mounted from docker-compose-development.yml
|
||||
RUN pip3 install -r /requirements.txt
|
||||
|
||||
|
||||
RUN [ ! -d "/datastore" ] && mkdir /datastore
|
||||
|
||||
@@ -1,20 +0,0 @@
|
||||
aiohttp
|
||||
async-timeout
|
||||
chardet==2.3.0
|
||||
multidict
|
||||
python-engineio
|
||||
six==1.10.0
|
||||
yarl
|
||||
flask
|
||||
|
||||
eventlet
|
||||
requests
|
||||
validators
|
||||
|
||||
bleach==3.2.1
|
||||
html5lib==0.9999999 # via bleach
|
||||
timeago
|
||||
html2text
|
||||
|
||||
# @notes
|
||||
# - Dont install socketio, it interferes with flask_socketio
|
||||
@@ -1,16 +1,18 @@
|
||||
from threading import Thread
|
||||
import time
|
||||
import requests
|
||||
import hashlib
|
||||
import os
|
||||
import re
|
||||
import html2text
|
||||
# Not needed due to inscriptis being way better.
|
||||
#from urlextract import URLExtract
|
||||
from inscriptis import get_text
|
||||
|
||||
# Hmm Polymorphism datastore, thread, etc
|
||||
class perform_site_check(Thread):
|
||||
# Some common stuff here that can be moved to a base class
|
||||
class perform_site_check():
|
||||
|
||||
# New state that is set after a check
|
||||
# Return value dict
|
||||
update_obj = {}
|
||||
|
||||
|
||||
def __init__(self, *args, uuid=False, datastore, **kwargs):
|
||||
super().__init__(*args, **kwargs)
|
||||
self.timestamp = int(time.time()) # used for storage etc too
|
||||
@@ -20,6 +22,14 @@ class perform_site_check(Thread):
|
||||
self.current_md5 = datastore.get_val(uuid, 'previous_md5')
|
||||
self.output_path = "/datastore/{}".format(self.uuid)
|
||||
|
||||
self.ensure_output_path()
|
||||
self.run()
|
||||
|
||||
# Current state of what needs to be updated
|
||||
@property
|
||||
def update_data(self):
|
||||
return self.update_obj
|
||||
|
||||
def save_firefox_screenshot(self, uuid, output):
|
||||
# @todo call selenium or whatever
|
||||
return
|
||||
@@ -32,8 +42,9 @@ class perform_site_check(Thread):
|
||||
os.mkdir(self.output_path)
|
||||
|
||||
def save_response_html_output(self, output):
|
||||
# @todo maybe record a history.json, [timestamp, md5, filename]
|
||||
with open("{}/{}.txt".format(self.output_path, self.timestamp), 'w') as f:
|
||||
|
||||
# @todo Saving the original HTML can be very large, better to set as an option, these files could be important to some.
|
||||
with open("{}/{}.html".format(self.output_path, self.timestamp), 'w') as f:
|
||||
f.write(output)
|
||||
f.close()
|
||||
|
||||
@@ -53,10 +64,11 @@ class perform_site_check(Thread):
|
||||
request_headers = self.datastore.data['settings']['headers'].copy()
|
||||
request_headers.update(extra_headers)
|
||||
|
||||
print("Checking", self.url)
|
||||
#print(request_headers)
|
||||
|
||||
self.ensure_output_path()
|
||||
# https://github.com/psf/requests/issues/4525
|
||||
# Requests doesn't yet support brotli encoding, so don't put 'br' here, be totally sure that the user cannot
|
||||
# do this by accident.
|
||||
if 'Accept-Encoding' in request_headers and "br" in request_headers['Accept-Encoding']:
|
||||
request_headers['Accept-Encoding'] = request_headers['Accept-Encoding'].replace(', br', '')
|
||||
|
||||
try:
|
||||
timeout = self.datastore.data['settings']['requests']['timeout']
|
||||
@@ -73,26 +85,11 @@ class perform_site_check(Thread):
|
||||
stripped_text_from_html = get_text(r.text)
|
||||
|
||||
|
||||
# @todo This should be a config option.
|
||||
# Many websites include junk in the links, trackers, etc.. Since we are really a service all about text changes..
|
||||
|
||||
# inscriptis handles this much cleaner, probably not needed..
|
||||
# extractor = URLExtract()
|
||||
# urls = extractor.find_urls(stripped_text_from_html)
|
||||
# Remove the urls, longest first so that we dont end up chewing up bigger links with parts of smaller ones.
|
||||
# if urls:
|
||||
# urls.sort(key=len, reverse=True)
|
||||
# for url in urls:
|
||||
# # Sometimes URLExtract will consider something like 'foobar.com' as a link when that was just text.
|
||||
# if "://" in url:
|
||||
# # print ("Stripping link", url)
|
||||
# stripped_text_from_html = stripped_text_from_html.replace(url, '')
|
||||
|
||||
|
||||
|
||||
# Usually from networkIO/requests level
|
||||
except (requests.exceptions.ConnectionError, requests.exceptions.ReadTimeout) as e:
|
||||
self.datastore.update_watch(self.uuid, 'last_error', str(e))
|
||||
self.update_obj["last_error"] = str(e)
|
||||
|
||||
print(str(e))
|
||||
|
||||
except requests.exceptions.MissingSchema:
|
||||
@@ -100,35 +97,36 @@ class perform_site_check(Thread):
|
||||
|
||||
# Usually from html2text level
|
||||
except UnicodeDecodeError as e:
|
||||
self.datastore.update_watch(self.uuid, 'last_error', str(e))
|
||||
|
||||
self.update_obj["last_error"] = str(e)
|
||||
print(str(e))
|
||||
# figure out how to deal with this cleaner..
|
||||
# 'utf-8' codec can't decode byte 0xe9 in position 480: invalid continuation byte
|
||||
|
||||
else:
|
||||
# We rely on the actual text in the html output.. many sites have random script vars etc,
|
||||
# in the future we'll implement other mechanisms.
|
||||
|
||||
# We rely on the actual text in the html output.. many sites have random script vars etc
|
||||
self.datastore.update_watch(self.uuid, 'last_error', False)
|
||||
self.datastore.update_watch(self.uuid, 'last_check_status', r.status_code)
|
||||
self.update_obj["last_check_status"] = r.status_code
|
||||
self.update_obj["last_error"] = False
|
||||
|
||||
fetched_md5 = hashlib.md5(stripped_text_from_html.encode('utf-8')).hexdigest()
|
||||
|
||||
|
||||
if self.current_md5 != fetched_md5:
|
||||
|
||||
# Dont confuse people by putting last-changed, when it actually just changed from nothing..
|
||||
# Don't confuse people by updating as last-changed, when it actually just changed from None..
|
||||
if self.datastore.get_val(self.uuid, 'previous_md5') is not None:
|
||||
self.datastore.update_watch(self.uuid, 'last_changed', self.timestamp)
|
||||
self.update_obj["last_changed"] = self.timestamp
|
||||
|
||||
self.update_obj["previous_md5"] = fetched_md5
|
||||
|
||||
self.datastore.update_watch(self.uuid, 'previous_md5', fetched_md5)
|
||||
self.save_response_html_output(r.text)
|
||||
output_filepath = self.save_response_stripped_output(stripped_text_from_html)
|
||||
|
||||
# Update history with the stripped text for future reference, this will also mean we save the first
|
||||
# attempt because 'self.current_md5 != fetched_md5' (current_md5 will be None when not run)
|
||||
# need to learn more about attr/setters/getters
|
||||
history = self.datastore.get_val(self.uuid, 'history')
|
||||
history.update(dict([(str(self.timestamp), output_filepath)]))
|
||||
self.datastore.update_watch(self.uuid, 'history', history)
|
||||
timestamp = str(self.timestamp)
|
||||
self.update_obj.update({"history": {timestamp: output_filepath}})
|
||||
|
||||
self.update_obj["last_checked"] = self.timestamp
|
||||
|
||||
self.datastore.update_watch(self.uuid, 'last_checked', int(time.time()))
|
||||
pass
|
||||
|
||||
@@ -1,20 +0,0 @@
|
||||
aiohttp
|
||||
async-timeout
|
||||
chardet==2.3.0
|
||||
multidict
|
||||
python-engineio
|
||||
six==1.10.0
|
||||
yarl
|
||||
flask
|
||||
|
||||
eventlet
|
||||
requests
|
||||
validators
|
||||
|
||||
bleach==3.2.1
|
||||
html5lib==0.9999999 # via bleach
|
||||
timeago
|
||||
html2text
|
||||
|
||||
# @notes
|
||||
# - Dont install socketio, it interferes with flask_socketio
|
||||
@@ -32,12 +32,17 @@ ins {
|
||||
}
|
||||
|
||||
#settings {
|
||||
|
||||
|
||||
line-height: 2em;
|
||||
background: rgba(0,0,0,.05);
|
||||
padding: 1em;
|
||||
border-radius: 10px;
|
||||
margin-bottom: 1em;
|
||||
color: #fff;
|
||||
font-size: 80%;
|
||||
}
|
||||
#settings label {
|
||||
margin-left: 1em;
|
||||
display: inline-block;
|
||||
font-weight: normal;
|
||||
}
|
||||
|
||||
.source {
|
||||
@@ -53,7 +58,7 @@ ins {
|
||||
}
|
||||
|
||||
#diff-ui {
|
||||
background: #fff;
|
||||
background: #fff;
|
||||
padding: 2em;
|
||||
margin: 1em;
|
||||
border-radius: 5px;
|
||||
|
||||
@@ -77,12 +77,21 @@ section.content {
|
||||
white-space: nowrap;
|
||||
}
|
||||
|
||||
.watch-table .title-col a[target="_blank"]::after {
|
||||
.watch-table .title-col a[target="_blank"]::after, .current-diff-url::after {
|
||||
content: url();
|
||||
margin: 0 3px 0 5px;
|
||||
}
|
||||
|
||||
/* hotovo */
|
||||
#check-all-button {
|
||||
text-align:right;
|
||||
}
|
||||
|
||||
#check-all-button a {
|
||||
border-top-left-radius: initial;
|
||||
border-top-right-radius: initial;
|
||||
border-bottom-left-radius: 5px;
|
||||
border-bottom-right-radius: 5px;
|
||||
}
|
||||
|
||||
|
||||
body:after {
|
||||
|
||||
@@ -1,34 +1,39 @@
|
||||
import json
|
||||
import uuid as uuid_builder
|
||||
import validators
|
||||
import os.path
|
||||
from os import path
|
||||
from threading import Lock, Thread
|
||||
|
||||
|
||||
# Is there an existing library to ensure some data store (JSON etc) is in sync with CRUD methods?
|
||||
# Open a github issue if you know something :)
|
||||
# https://stackoverflow.com/questions/6190468/how-to-trigger-function-on-value-change
|
||||
class ChangeDetectionStore:
|
||||
lock = Lock()
|
||||
|
||||
def __init__(self):
|
||||
self.needs_write = False
|
||||
|
||||
self.__data = {
|
||||
'note' : "Hello! If you change this file manually, please be sure to restart your changedetection.io instance!",
|
||||
'note': "Hello! If you change this file manually, please be sure to restart your changedetection.io instance!",
|
||||
'watching': {},
|
||||
'tag': "0.23",
|
||||
'settings': {
|
||||
'headers': {
|
||||
'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.66 Safari/537.36',
|
||||
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
|
||||
'Accept-Encoding': 'gzip, deflate, br',
|
||||
'Accept-Encoding': 'gzip, deflate', # No support for brotli in python requests yet.
|
||||
'Accept-Language': 'en-GB,en-US;q=0.9,en;'
|
||||
},
|
||||
'requests': {
|
||||
'timeout': 15, # Default 15 seconds
|
||||
'minutes_between_check': 3 * 60 # Default 3 hours
|
||||
'timeout': 15, # Default 15 seconds
|
||||
'minutes_between_check': 3 * 60, # Default 3 hours
|
||||
'workers': 10 # Number of threads, lower is better for slow connections
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
# Base definition for all watchers
|
||||
self.generic_definition = {
|
||||
'url': None,
|
||||
@@ -38,10 +43,15 @@ class ChangeDetectionStore:
|
||||
'title': None,
|
||||
'previous_md5': None,
|
||||
'uuid': str(uuid_builder.uuid4()),
|
||||
'headers' : {}, # Extra headers to send
|
||||
'history' : {} # Dict of timestamp and output stripped filename
|
||||
'headers': {}, # Extra headers to send
|
||||
'history': {} # Dict of timestamp and output stripped filename
|
||||
}
|
||||
|
||||
if path.isfile('/source.txt'):
|
||||
with open('/source.txt') as f:
|
||||
# Should be set in Dockerfile to look for /source.txt , this will give us the git commit #
|
||||
# So when someone gives us a backup file to examine, we know exactly what code they were running.
|
||||
self.__data['build_sha'] = f.read()
|
||||
|
||||
try:
|
||||
with open('/datastore/url-watches.json') as json_file:
|
||||
@@ -59,7 +69,6 @@ class ChangeDetectionStore:
|
||||
if 'requests' in from_disk['settings']:
|
||||
self.__data['settings']['requests'].update(from_disk['settings']['requests'])
|
||||
|
||||
|
||||
# Reinitialise each `watching` with our generic_definition in the case that we add a new var in the future.
|
||||
# @todo pretty sure there's a Pythonic way to do this with an abstracted(?) object!
|
||||
i = 0
|
||||
@@ -77,20 +86,28 @@ class ChangeDetectionStore:
|
||||
self.add_watch(url='https://www.gov.uk/coronavirus', tag='Covid')
|
||||
self.add_watch(url='https://changedetection.io', tag='Tech news')
|
||||
|
||||
|
||||
# self.entryVariable.get()
|
||||
def update_watch(self, uuid, val, var):
|
||||
def update_watch(self, uuid, update_obj):
|
||||
|
||||
self.lock.acquire()
|
||||
|
||||
# In python 3.9 we have the |= dict operator, but that still will lose data on nested structures...
|
||||
for dict_key, d in self.generic_definition.items():
|
||||
if isinstance(d, dict) and dict_key in update_obj:
|
||||
self.__data['watching'][uuid][dict_key].update(update_obj[dict_key])
|
||||
del(update_obj[dict_key])
|
||||
|
||||
# Update with the remaining values
|
||||
self.__data['watching'][uuid].update(update_obj)
|
||||
|
||||
self.__data['watching'][uuid].update({val: var})
|
||||
self.needs_write = True
|
||||
|
||||
self.lock.release()
|
||||
|
||||
@property
|
||||
def data(self):
|
||||
return self.__data
|
||||
|
||||
def get_all_tags(self):
|
||||
tags=[]
|
||||
tags = []
|
||||
for uuid, watch in self.data['watching'].items():
|
||||
|
||||
# Support for comma separated list of tags.
|
||||
@@ -103,10 +120,11 @@ class ChangeDetectionStore:
|
||||
return tags
|
||||
|
||||
def delete(self, uuid):
|
||||
# Probably their should be dict...
|
||||
del(self.__data['watching'][uuid])
|
||||
self.needs_write = True
|
||||
|
||||
self.lock.acquire()
|
||||
del (self.__data['watching'][uuid])
|
||||
self.needs_write = True
|
||||
self.lock.release()
|
||||
|
||||
def url_exists(self, url):
|
||||
|
||||
@@ -122,26 +140,31 @@ class ChangeDetectionStore:
|
||||
return self.data['watching'][uuid].get(val)
|
||||
|
||||
def add_watch(self, url, tag):
|
||||
|
||||
# @todo deal with exception
|
||||
validators.url(url)
|
||||
self.lock.acquire()
|
||||
print("Adding", url, tag)
|
||||
# # @todo deal with exception
|
||||
# validators.url(url)
|
||||
|
||||
# @todo use a common generic version of this
|
||||
|
||||
new_uuid = str(uuid_builder.uuid4())
|
||||
_blank = self.generic_definition.copy()
|
||||
_blank.update({
|
||||
'url': url,
|
||||
'tag': tag,
|
||||
'uuid': str(uuid_builder.uuid4())
|
||||
'uuid': new_uuid
|
||||
})
|
||||
|
||||
self.data['watching'].update({_blank['uuid']: _blank})
|
||||
|
||||
self.data['watching'][new_uuid] = _blank
|
||||
self.needs_write = True
|
||||
self.lock.release()
|
||||
return new_uuid
|
||||
|
||||
def sync_to_json(self):
|
||||
print ("Saving index")
|
||||
print("Saving index")
|
||||
self.lock.acquire()
|
||||
with open('/datastore/url-watches.json', 'w') as json_file:
|
||||
json.dump(self.data, json_file, indent=4)
|
||||
self.needs_write = False
|
||||
self.lock.release()
|
||||
|
||||
# body of the constructor
|
||||
|
||||
@@ -6,10 +6,10 @@
|
||||
<meta name="description" content="Self hosted website change detection.">
|
||||
<title>Change Detection</title>
|
||||
<link rel="stylesheet" href="/static/css/pure-min.css">
|
||||
<link rel="stylesheet" href="/static/css/styles.css">
|
||||
<link rel="stylesheet" href="/static/css/styles.css?ver=1000">
|
||||
{% if extra_stylesheets %}
|
||||
{% for m in extra_stylesheets %}
|
||||
<link rel="stylesheet" href="{{ m }}">
|
||||
<link rel="stylesheet" href="{{ m }}?ver=1000">
|
||||
{% endfor %}
|
||||
{% endif %}
|
||||
</head>
|
||||
@@ -18,9 +18,15 @@
|
||||
<div class="header">
|
||||
<div class="home-menu pure-menu pure-menu-horizontal pure-menu-fixed">
|
||||
<a class="pure-menu-heading" href="/"><strong>Change</strong>Detection.io</a>
|
||||
{% if current_diff_url %}
|
||||
<a class=current-diff-url href="{{ current_diff_url }}"><span style="max-width: 30%; overflow: hidden;">{{ current_diff_url }}</span></a>
|
||||
{% endif %}
|
||||
|
||||
<ul class="pure-menu-list">
|
||||
|
||||
<li class="pure-menu-item">
|
||||
<a href="/backup" class="pure-menu-link">BACKUP</a>
|
||||
</li>
|
||||
<li class="pure-menu-item">
|
||||
<a href="/import" class="pure-menu-link">IMPORT</a>
|
||||
</li>
|
||||
|
||||
@@ -2,34 +2,58 @@
|
||||
|
||||
{% block content %}
|
||||
|
||||
<div id="diff-ui">
|
||||
<div id="settings">
|
||||
<h1>Differences</h1>
|
||||
<form class="pure-form " action="" method="GET">
|
||||
<fieldset>
|
||||
|
||||
<label for="diffWords" class="pure-checkbox">
|
||||
<input type="radio" name="diff_type" id="diffWords" value="diffWords" /> Words</label>
|
||||
<label for="diffLines" class="pure-checkbox">
|
||||
<input type="radio" name="diff_type" id="diffLines" value="diffLines" checked=""/> Lines</label>
|
||||
|
||||
<label for="diffChars" class="pure-checkbox">
|
||||
<input type="radio" name="diff_type" id="diffChars" value="diffChars"/> Chars</label>
|
||||
|
||||
{% if versions|length >= 1 %}
|
||||
<label for="diff-version">Compare newest (<span id="current-v-date"></span>) with</label>
|
||||
<select id="diff-version" name="previous_version">
|
||||
{% for version in versions %}
|
||||
<option value="{{version}}" {% if version== current_previous_version %} selected="" {% endif %}>
|
||||
{{version}}
|
||||
</option>
|
||||
{% endfor %}
|
||||
</select>
|
||||
<button type="submit" class="pure-button pure-button-primary">Go</button>
|
||||
{% endif %}
|
||||
</fieldset>
|
||||
</form>
|
||||
<del>Removed text</del>
|
||||
<ins>Inserted Text</ins>
|
||||
|
||||
<div id="settings">
|
||||
<h3>Diff</h3>
|
||||
<label><input type="radio" name="diff_type" value="diffChars"> Chars</label>
|
||||
<label><input type="radio" name="diff_type" value="diffWords" > Words</label>
|
||||
<label><input type="radio" name="diff_type" value="diffLines" checked=""> Lines</label>
|
||||
</div>
|
||||
|
||||
<div id="diff-ui">
|
||||
|
||||
<table>
|
||||
<tbody>
|
||||
<tr>
|
||||
<!-- just proof of concept copied straight from github.com/kpdecker/jsdiff -->
|
||||
<td id="a" style="display: none;">{{left}}</td>
|
||||
<td id="b" style="display: none;">{{right}}</td>
|
||||
<!-- just proof of concept copied straight from github.com/kpdecker/jsdiff -->
|
||||
<td id="a" style="display: none;">{{previous}}</td>
|
||||
<td id="b" style="display: none;">{{newest}}</td>
|
||||
<td>
|
||||
<pre id="result"></pre>
|
||||
<span id="result"></span>
|
||||
</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
Diff algorithm from the amazing <a href="https://github.com/kpdecker/jsdiff" >github.com/kpdecker/jsdiff</a>
|
||||
Diff algorithm from the amazing <a href="https://github.com/kpdecker/jsdiff">github.com/kpdecker/jsdiff</a>
|
||||
|
||||
</div>
|
||||
|
||||
<script src="/static/js/diff.js"></script>
|
||||
<script defer="">
|
||||
|
||||
|
||||
var a = document.getElementById('a');
|
||||
var b = document.getElementById('b');
|
||||
var result = document.getElementById('result');
|
||||
@@ -63,6 +87,23 @@ function changed() {
|
||||
}
|
||||
|
||||
window.onload = function() {
|
||||
|
||||
|
||||
/* Convert what is options from UTC time.time() to local browser time */
|
||||
var diffList=document.getElementById("diff-version");
|
||||
if (typeof(diffList) != 'undefined' && diffList != null) {
|
||||
for (var option of diffList.options) {
|
||||
var dateObject = new Date(option.value*1000);
|
||||
option.label=dateObject.toLocaleString();
|
||||
}
|
||||
}
|
||||
|
||||
/* Set current version date as local time in the browser also */
|
||||
var current_v = document.getElementById("current-v-date");
|
||||
var dateObject = new Date({{ newest_version_timestamp }}*1000);
|
||||
current_v.innerHTML=dateObject.toLocaleString();
|
||||
|
||||
|
||||
onDiffTypeChange(document.querySelector('#settings [name="diff_type"]:checked'));
|
||||
changed();
|
||||
};
|
||||
@@ -89,10 +130,11 @@ for (var i = 0; i < radio.length; i++) {
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
</script>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
{% endblock %}
|
||||
@@ -4,7 +4,7 @@
|
||||
<div class="edit-form">
|
||||
|
||||
|
||||
<form class="pure-form pure-form-stacked" action="/api/update?uuid={{uuid}}" method="POST">
|
||||
<form class="pure-form pure-form-stacked" action="/edit?uuid={{uuid}}" method="POST">
|
||||
<fieldset>
|
||||
<div class="pure-control-group">
|
||||
<label for="url">URL</label>
|
||||
|
||||
43
backend/templates/scrub.html
Normal file
43
backend/templates/scrub.html
Normal file
@@ -0,0 +1,43 @@
|
||||
{% extends 'base.html' %}
|
||||
|
||||
{% block content %}
|
||||
<div class="edit-form">
|
||||
|
||||
|
||||
<form class="pure-form pure-form-stacked" action="/scrub" method="POST">
|
||||
<fieldset>
|
||||
<div class="pure-control-group">
|
||||
This will remove all version snapshots/data, but keep your list of URLs. <br/>
|
||||
You may like to use the <strong>BACKUP</strong> link first.<br/>
|
||||
|
||||
Type in the word <strong>scrub</strong> to confirm that you understand!
|
||||
<br/>
|
||||
|
||||
</div>
|
||||
|
||||
<div class="pure-control-group">
|
||||
<br/>
|
||||
<label for="confirmtext">Confirm</label><br/>
|
||||
<input type="text" id="confirmtext" required="" name="confirmtext" value="" size="10"/>
|
||||
<br/>
|
||||
|
||||
|
||||
</div>
|
||||
|
||||
|
||||
<div class="pure-control-group">
|
||||
<button type="submit" class="pure-button pure-button-primary">Scrub!</button>
|
||||
</div>
|
||||
<br/>
|
||||
<div class="pure-control-group">
|
||||
<a href="/" class="pure-button button-small button-cancel">Cancel</a>
|
||||
</div>
|
||||
|
||||
|
||||
</fieldset>
|
||||
</form>
|
||||
|
||||
|
||||
</div>
|
||||
|
||||
{% endblock %}
|
||||
@@ -14,13 +14,15 @@
|
||||
</div>
|
||||
|
||||
|
||||
<br/>
|
||||
<div class="pure-control-group">
|
||||
<button type="submit" class="pure-button pure-button-primary">Save</button>
|
||||
</div>
|
||||
<br/>
|
||||
|
||||
<div class="pure-control-group">
|
||||
<a href="/" class="pure-button button-small button-cancel">Cancel</a>
|
||||
<a href="/" class="pure-button button-small button-cancel">Back</a>
|
||||
<a href="/scrub" class="pure-button button-small button-cancel">Reset all version data</a>
|
||||
</div>
|
||||
|
||||
|
||||
|
||||
@@ -26,7 +26,6 @@
|
||||
</div>
|
||||
|
||||
<div id="watch-table-wrapper">
|
||||
|
||||
<table class="pure-table pure-table-striped watch-table">
|
||||
<thead>
|
||||
<tr>
|
||||
@@ -55,7 +54,7 @@
|
||||
</td>
|
||||
<td>{{watch|format_last_checked_time}}</td>
|
||||
<td>{{watch.last_changed|format_timestamp_timeago}}</td>
|
||||
<td><a href="/api/checknow?uuid={{ watch.uuid}}" class="pure-button button-small pure-button-primary">Recheck</a>
|
||||
<td><a href="/api/checknow?uuid={{ watch.uuid}}{% if request.args.get('tag') %}&tag={{request.args.get('tag')}}{% endif %}" class="pure-button button-small pure-button-primary">Recheck</a>
|
||||
<a href="/edit?uuid={{ watch.uuid}}" class="pure-button button-small pure-button-primary">Edit</a>
|
||||
{% if watch.history|length >= 2 %}
|
||||
<a href="/diff/{{ watch.uuid}}" class="pure-button button-small pure-button-primary">Diff</a>
|
||||
@@ -67,6 +66,10 @@
|
||||
|
||||
</tbody>
|
||||
</table>
|
||||
<div id="check-all-button">
|
||||
|
||||
<a href="/api/checknow{% if active_tag%}?tag={{active_tag}}{%endif%}" class="pure-button button-tag " >Recheck all {% if active_tag%}in "{{active_tag}}"{%endif%}</a>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
{% endblock %}
|
||||
@@ -10,6 +10,7 @@ services:
|
||||
container_name: changedetection.io-dev
|
||||
volumes:
|
||||
- ./backend:/app
|
||||
- ./requirements.txt:/requirements.txt # Normally COPY'ed in the Dockerfile
|
||||
- ./datastore:/datastore
|
||||
|
||||
ports:
|
||||
|
||||
BIN
screenshot-diff.png
Normal file
BIN
screenshot-diff.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 115 KiB |
BIN
screenshot.png
BIN
screenshot.png
Binary file not shown.
|
Before Width: | Height: | Size: 297 KiB After Width: | Height: | Size: 217 KiB |
Reference in New Issue
Block a user