Compare commits

..

2 Commits

Author SHA1 Message Date
dgtlmoon
2e59f2a115 WIP 2022-05-13 12:05:52 +02:00
dgtlmoon
8735b73746 Proof of concept 2022-05-13 11:00:06 +02:00
14 changed files with 71 additions and 212 deletions

View File

@@ -605,12 +605,12 @@ def changedetection_app(config=None, datastore_o=None):
if request.method == 'POST' and not form.validate():
flash("An error occurred, please see below.", "error")
output = render_template("edit.html",
uuid=uuid,
watch=datastore.data['watching'][uuid],
form=form,
has_empty_checktime=using_default_check_time,
using_global_webdriver_wait=default['webdriver_delay'] is None,
current_base_url=datastore.data['settings']['application']['base_url'],
emailprefix=os.getenv('NOTIFICATION_MAIL_BUTTON_PREFIX', False)
)
@@ -932,6 +932,9 @@ def changedetection_app(config=None, datastore_o=None):
# Add the flask app secret
zipObj.write(os.path.join(datastore_o.datastore_path, "secret.txt"), arcname="secret.txt")
# Add the sqlite3 db
zipObj.write(os.path.join(datastore_o.datastore_path, "watch.db"), arcname="watch.db")
# Add any snapshot data we find, use the full path to access the file, but make the file 'relative' in the Zip.
for txt_file_path in Path(datastore_o.datastore_path).rglob('*.txt'):
parent_p = txt_file_path.parent

View File

@@ -3,22 +3,17 @@ import chardet
import os
import requests
import time
import urllib3.exceptions
import sys
class EmptyReply(Exception):
def __init__(self, status_code, url):
# Set this so we can use it in other parts of the app
self.status_code = status_code
self.url = url
return
pass
class ReplyWithContentButNoText(Exception):
def __init__(self, status_code, url):
# Set this so we can use it in other parts of the app
self.status_code = status_code
self.url = url
return
pass
@@ -33,9 +28,6 @@ class Fetcher():
system_http_proxy = os.getenv('HTTP_PROXY')
system_https_proxy = os.getenv('HTTPS_PROXY')
# Time ONTOP of the system defined env minimum time
render_extract_delay=0
@abstractmethod
def get_error(self):
return self.error
@@ -105,7 +97,7 @@ class base_html_playwright(Fetcher):
self.browser_type = os.getenv("PLAYWRIGHT_BROWSER_TYPE", 'chromium').strip('"')
self.command_executor = os.getenv(
"PLAYWRIGHT_DRIVER_URL",
'ws://playwright-chrome:3000'
'ws://playwright-chrome:3000/playwright'
).strip('"')
# If any proxy settings are enabled, then we should setup the proxy object
@@ -155,7 +147,7 @@ class base_html_playwright(Fetcher):
# - `'commit'` - consider operation to be finished when network response is received and the document started loading.
# Better to not use any smarts from Playwright and just wait an arbitrary number of seconds
# This seemed to solve nearly all 'TimeoutErrors'
extra_wait = int(os.getenv("WEBDRIVER_DELAY_BEFORE_CONTENT_READY", 5)) + self.render_extract_delay
extra_wait = int(os.getenv("WEBDRIVER_DELAY_BEFORE_CONTENT_READY", 5))
page.wait_for_timeout(extra_wait * 1000)
except playwright._impl._api_types.TimeoutError as e:
raise EmptyReply(url=url, status_code=None)
@@ -163,9 +155,6 @@ class base_html_playwright(Fetcher):
if response is None:
raise EmptyReply(url=url, status_code=None)
if len(page.content().strip()) == 0:
raise EmptyReply(url=url, status_code=None)
self.status_code = response.status
self.content = page.content()
self.headers = response.all_headers()
@@ -251,7 +240,7 @@ class base_html_webdriver(Fetcher):
# raise EmptyReply(url=url, status_code=r.status_code)
# @todo - dom wait loaded?
time.sleep(int(os.getenv("WEBDRIVER_DELAY_BEFORE_CONTENT_READY", 5)) + self.render_extract_delay)
time.sleep(int(os.getenv("WEBDRIVER_DELAY_BEFORE_CONTENT_READY", 5)))
self.content = self.driver.page_source
self.headers = {}
self.screenshot = self.driver.get_screenshot_as_png()

View File

@@ -97,13 +97,7 @@ class perform_site_check():
proxy_args = self.set_proxy_from_list(watch)
fetcher = klass(proxy_override=proxy_args)
# Configurable per-watch or global extra delay before extracting text (for webDriver types)
system_webdriver_delay = self.datastore.data['settings']['application'].get('webdriver_delay', None)
if watch['webdriver_delay'] is not None:
fetcher.render_extract_delay = watch['webdriver_delay']
elif system_webdriver_delay is not None:
fetcher.render_extract_delay = system_webdriver_delay
# Proxy List support
fetcher.run(url, timeout, request_headers, request_body, request_method, ignore_status_code)
# Fetching complete, now filters
@@ -184,11 +178,6 @@ class perform_site_check():
# Re #340 - return the content before the 'ignore text' was applied
text_content_before_ignored_filter = stripped_text_from_html.encode('utf-8')
# Treat pages with no renderable text content as a change? No by default
empty_pages_are_a_change = self.datastore.data['settings']['application'].get('empty_pages_are_a_change', False)
if not is_json and not empty_pages_are_a_change and len(stripped_text_from_html.strip()) == 0:
raise content_fetcher.ReplyWithContentButNoText(url=url, status_code=200)
# We rely on the actual text in the html output.. many sites have random script vars etc,
# in the future we'll implement other mechanisms.

View File

@@ -318,7 +318,6 @@ class commonSettingsForm(Form):
notification_format = SelectField('Notification format', choices=valid_notification_formats.keys(), default=default_notification_format)
fetch_backend = RadioField(u'Fetch method', choices=content_fetcher.available_fetchers(), validators=[ValidateContentFetcherIsReady()])
extract_title_as_title = BooleanField('Extract <title> from document and use as watch title', default=False)
webdriver_delay = IntegerField('Wait seconds before extracting text', validators=[validators.Optional(), validators.NumberRange(min=1, message="Should contain one or more seconds")] )
class watchForm(commonSettingsForm):
@@ -371,7 +370,6 @@ class globalSettingsApplicationForm(commonSettingsForm):
ignore_whitespace = BooleanField('Ignore whitespace')
real_browser_save_screenshot = BooleanField('Save last screenshot when using Chrome?')
removepassword_button = SubmitField('Remove password', render_kw={"class": "pure-button pure-button-primary"})
empty_pages_are_a_change = BooleanField('Treat empty pages as a change?', default=False)
render_anchor_tag_content = BooleanField('Render anchor tag content', default=False)
fetch_backend = RadioField('Fetch Method', default="html_requests", choices=content_fetcher.available_fetchers(), validators=[ValidateContentFetcherIsReady()])
password = SaltyPasswordField()

View File

@@ -30,7 +30,6 @@ class model(dict):
'password': False,
'base_url' : None,
'extract_title_as_title': False,
'empty_pages_are_a_change': False,
'fetch_backend': os.getenv("DEFAULT_FETCH_BACKEND", "html_requests"),
'global_ignore_text': [], # List of text to ignore when calculating the comparison checksum
'global_subtractive_selectors': [],
@@ -42,8 +41,7 @@ class model(dict):
'notification_body': default_notification_body,
'notification_format': default_notification_format,
'real_browser_save_screenshot': True,
'schema_version' : 0,
'webdriver_delay': None # Extra delay in seconds before extracting text
'schema_version' : 0
}
}
}

View File

@@ -27,7 +27,8 @@ class model(dict):
'headers': {}, # Extra headers to send
'body': None,
'method': 'GET',
'history': {}, # Dict of timestamp and output stripped filename
# now stored in a sqlite3 db to reduce memory usage
#'history': {}, # Dict of timestamp and output stripped filename
'ignore_text': [], # List of text to ignore when calculating the comparison checksum
# Custom notification content
'notification_urls': [], # List of URLs to add to the notification Queue (Usually AppRise)
@@ -43,8 +44,7 @@ class model(dict):
# Re #110, so then if this is set to None, we know to use the default value instead
# Requires setting to None on submit if it's the same as the default
# Should be all None by default, so we use the system default in this case.
'time_between_check': {'weeks': None, 'days': None, 'hours': None, 'minutes': None, 'seconds': None},
'webdriver_delay': None
'time_between_check': {'weeks': None, 'days': None, 'hours': None, 'minutes': None, 'seconds': None}
}
def __init__(self, *arg, **kw):

View File

@@ -1,16 +0,0 @@
$(document).ready(function() {
function toggle() {
if ($('input[name="application-fetch_backend"]:checked').val() != 'html_requests') {
$('#requests-override-options').hide();
$('#webdriver-override-options').show();
} else {
$('#requests-override-options').show();
$('#webdriver-override-options').hide();
}
}
$('input[name="application-fetch_backend"]').click(function (e) {
toggle();
});
toggle();
});

View File

@@ -2,10 +2,8 @@ $(document).ready(function() {
function toggle() {
if ($('input[name="fetch_backend"]:checked').val() != 'html_requests') {
$('#requests-override-options').hide();
$('#webdriver-override-options').show();
} else {
$('#requests-override-options').show();
$('#webdriver-override-options').hide();
}
}
$('input[name="fetch_backend"]').click(function (e) {

View File

@@ -12,9 +12,11 @@ from os import mkdir, path, unlink
from threading import Lock
import re
import requests
import sqlite3
from . model import App, Watch
# Is there an existing library to ensure some data store (JSON etc) is in sync with CRUD methods?
# Open a github issue if you know something :)
# https://stackoverflow.com/questions/6190468/how-to-trigger-function-on-value-change
@@ -32,6 +34,11 @@ class ChangeDetectionStore:
self.needs_write = False
self.datastore_path = datastore_path
self.json_store_path = "{}/url-watches.json".format(self.datastore_path)
self.datastore_path = datastore_path
#@todo - check for better options
self.__history_db_connection = sqlite3.connect("{}/watch.db".format(self.datastore_path))
self.proxy_list = None
self.stop_thread = False
@@ -70,6 +77,9 @@ class ChangeDetectionStore:
if 'application' in from_disk['settings']:
self.__data['settings']['application'].update(from_disk['settings']['application'])
# Bump the update version by running updates
self.run_updates()
# Reinitialise each `watching` with our generic_definition in the case that we add a new var in the future.
# @todo pretty sure theres a python we todo this with an abstracted(?) object!
for uuid, watch in self.__data['watching'].items():
@@ -79,6 +89,7 @@ class ChangeDetectionStore:
self.__data['watching'][uuid]['newest_history_key'] = self.get_newest_history_key(uuid)
print("Watching:", uuid, self.__data['watching'][uuid]['url'])
# First time ran, doesnt exist.
except (FileNotFoundError, json.decoder.JSONDecodeError):
if include_default_watches:
@@ -111,7 +122,6 @@ class ChangeDetectionStore:
secret = secrets.token_hex(16)
self.__data['settings']['application']['rss_access_token'] = secret
# Proxy list support - available as a selection in settings when text file is imported
# CSV list
# "name, address", or just "name"
@@ -119,8 +129,6 @@ class ChangeDetectionStore:
if path.isfile(proxy_list_file):
self.import_proxy_list(proxy_list_file)
# Bump the update version by running updates
self.run_updates()
self.needs_write = True
@@ -129,19 +137,20 @@ class ChangeDetectionStore:
# Returns the newest key, but if theres only 1 record, then it's counted as not being new, so return 0.
def get_newest_history_key(self, uuid):
if len(self.__data['watching'][uuid]['history']) == 1:
cur = self.__history_db_connection.cursor()
c = cur.execute("SELECT COUNT(*) FROM watch_history WHERE watch_uuid = :uuid", {"uuid": uuid}).fetchone()
if c and c[0] <= 1:
return 0
dates = list(self.__data['watching'][uuid]['history'].keys())
# Convert to int, sort and back to str again
# @todo replace datastore getter that does this automatically
dates = [int(i) for i in dates]
dates.sort(reverse=True)
if len(dates):
# always keyed as str
return str(dates[0])
max = cur.execute("SELECT MAX(timestamp) FROM watch_history WHERE watch_uuid = :uuid", {"uuid": uuid}).fetchone()
return max[0]
return 0
def __refresh_history_max_timestamp(self):
# select watch_uuid, max(timestamp) from watch_history group by watch_uuid;
# could be way faster
x=1
def set_last_viewed(self, uuid, timestamp):
self.data['watching'][uuid].update({'last_viewed': int(timestamp)})
@@ -186,13 +195,13 @@ class ChangeDetectionStore:
def data(self):
has_unviewed = False
for uuid, v in self.__data['watching'].items():
self.__data['watching'][uuid]['newest_history_key'] = self.get_newest_history_key(uuid)
if int(v['newest_history_key']) <= int(v['last_viewed']):
self.__data['watching'][uuid]['viewed'] = True
# self.__data['watching'][uuid]['newest_history_key'] = self.get_newest_history_key(uuid)
# if int(v['newest_history_key']) <= int(v['last_viewed']):
# self.__data['watching'][uuid]['viewed'] = True
else:
self.__data['watching'][uuid]['viewed'] = False
has_unviewed = True
# else:
# self.__data['watching'][uuid]['viewed'] = False
# has_unviewed = True
# #106 - Be sure this is None on empty string, False, None, etc
# Default var for fetch_backend
@@ -495,3 +504,31 @@ class ChangeDetectionStore:
# Only upgrade individual watch time if it was set
if watch.get('minutes_between_check', False):
self.data['watching'][uuid]['time_between_check']['minutes'] = watch['minutes_between_check']
def update_3(self):
"""Migrate storage of history data to SQLite
- No need to store the history list in memory and re-write it everytime
- I've seen memory usage grow exponentially due to having large lists of watches with long histories
- Data about 'last changed' still stored in the main JSON struct which is fine
- We don't really need this data until we query against it (like for listing other available snapshots in the diff page etc)
"""
if self.__history_db_connection:
# Create the table
self.__history_db_connection.execute("CREATE TABLE IF NOT EXISTS watch_history(id INTEGER PRIMARY KEY, watch_uuid VARCHAR(36), timestamp INT, path TEXT, snapshot_type VARCHAR(10))")
self.__history_db_connection.execute("CREATE INDEX IF NOT EXISTS `uuid` ON `watch_history` (`watch_uuid`)")
self.__history_db_connection.execute("CREATE INDEX IF NOT EXISTS `uuid_timestamp` ON `watch_history` (`watch_uuid`, `timestamp`)")
# Insert each watch history list as executemany() for faster migration
for uuid, watch in self.data['watching'].items():
history = []
if watch.get('history', False):
for d, p in watch['history'].items():
d = int(d) # Used to be keyed as str, we'll fix this now too
history.append((uuid, d, p, 'text'))
if len(history):
self.__history_db_connection.executemany("INSERT INTO watch_history (watch_uuid, timestamp, path, snapshot_type) VALUES (?,?,?,?)", history)
self.__history_db_connection.commit()
del(self.data['watching'][uuid]['history'])

View File

@@ -73,21 +73,6 @@
</span>
</div>
{% endif %}
<fieldset class="pure-group" id="webdriver-override-options">
<div class="pure-form-message-inline">
<strong>If you're having trouble waiting for the page to be fully rendered (text missing etc), try increasing the 'wait' time here.</strong>
<br/>
This will wait <i>n</i> seconds before extracting the text.
</div>
<div class="pure-control-group">
{{ render_field(form.webdriver_delay) }}
</div>
{% if using_global_webdriver_wait %}
<div class="pure-form-message-inline">
<strong>Using the current global default settings</strong>
</div>
{% endif %}
</fieldset>
<fieldset class="pure-group" id="requests-override-options">
<div class="pure-form-message-inline">
<strong>Request override is currently only used by the <i>Basic fast Plaintext/HTTP Client</i> method.</strong>

View File

@@ -12,7 +12,6 @@
<script type="text/javascript" src="{{url_for('static_content', group='js', filename='tabs.js')}}" defer></script>
<script type="text/javascript" src="{{url_for('static_content', group='js', filename='notifications.js')}}" defer></script>
<script type="text/javascript" src="{{url_for('static_content', group='js', filename='global-settings.js')}}" defer></script>
<div class="edit-form">
<div class="tabs collapsable">
<ul>
@@ -61,11 +60,6 @@
{{ render_checkbox_field(form.application.form.real_browser_save_screenshot) }}
<span class="pure-form-message-inline">When using a Chrome browser, a screenshot from the last check will be available on the Diff page</span>
</div>
<div class="pure-control-group">
{{ render_checkbox_field(form.application.form.empty_pages_are_a_change) }}
<span class="pure-form-message-inline">When a page contains HTML, but no renderable text appears (empty page), is this considered a change?</span>
</div>
{% if form.requests.proxy %}
<div class="pure-control-group inline-radio">
{{ render_field(form.requests.form.proxy, class="fetch-backend-proxy") }}
@@ -93,16 +87,6 @@
<p>The <strong>Chrome/Javascript</strong> method requires a network connection to a running WebDriver+Chrome server, set by the ENV var 'WEBDRIVER_URL'. </p>
</span>
</div>
<fieldset class="pure-group" id="webdriver-override-options">
<div class="pure-form-message-inline">
<strong>If you're having trouble waiting for the page to be fully rendered (text missing etc), try increasing the 'wait' time here.</strong>
<br/>
This will wait <i>n</i> seconds before extracting the text.
</div>
<div class="pure-control-group">
{{ render_field(form.application.form.webdriver_delay) }}
</div>
</fieldset>
</div>

View File

@@ -13,7 +13,7 @@
{{ render_simple_field(form.tag, value=active_tag if active_tag else '', placeholder="watch group") }}
<button type="submit" class="pure-button pure-button-primary">Watch</button>
</fieldset>
<span style="color:#eee; font-size: 80%;"><img style="height: 1em;display:inline-block;" src="{{url_for('static_content', group='images', filename='spread.svg')}}" /> Tip: You can also add 'shared' watches. <a href="https://github.com/dgtlmoon/changedetection.io/wiki/Sharing-a-Watch">More info</a></a></span>
<span style="color:#eee; font-size: 80%;"><img style="height: 1em;display:inline-block;" src="{{url_for('static_content', group='images', filename='spread.svg')}}" /> Tip: You can also add 'shared' watches. <a href="#">More info</a></a></span>
</form>
<div>
<a href="{{url_for('index')}}" class="pure-button button-tag {{'active' if not active_tag }}">All</a>

View File

@@ -1,102 +0,0 @@
#!/usr/bin/python3
import time
from flask import url_for
from urllib.request import urlopen
from .util import set_original_response, set_modified_response, live_server_setup
sleep_time_for_fetch_thread = 3
def set_nonrenderable_response():
test_return_data = """<html>
<head><title>modified head title</title></head>
<!-- like when some angular app was broken and doesnt render or whatever -->
<body>
</body>
</html>
"""
with open("test-datastore/endpoint-content.txt", "w") as f:
f.write(test_return_data)
return None
def test_check_basic_change_detection_functionality(client, live_server):
set_original_response()
live_server_setup(live_server)
# Add our URL to the import page
res = client.post(
url_for("import_page"),
data={"urls": url_for('test_endpoint', _external=True)},
follow_redirects=True
)
assert b"1 Imported" in res.data
time.sleep(sleep_time_for_fetch_thread)
# Do this a few times.. ensures we dont accidently set the status
for n in range(3):
client.get(url_for("api_watch_checknow"), follow_redirects=True)
# Give the thread time to pick it up
time.sleep(sleep_time_for_fetch_thread)
# It should report nothing found (no new 'unviewed' class)
res = client.get(url_for("index"))
assert b'unviewed' not in res.data
#####################
client.post(
url_for("settings_page"),
data={"application-empty_pages_are_a_change": "",
"requests-time_between_check-minutes": 180,
'application-fetch_backend': "html_requests"},
follow_redirects=True
)
# this should not trigger a change, because no good text could be converted from the HTML
set_nonrenderable_response()
client.get(url_for("api_watch_checknow"), follow_redirects=True)
# Give the thread time to pick it up
time.sleep(sleep_time_for_fetch_thread)
# It should report nothing found (no new 'unviewed' class)
res = client.get(url_for("index"))
assert b'unviewed' not in res.data
# ok now do the opposite
client.post(
url_for("settings_page"),
data={"application-empty_pages_are_a_change": "y",
"requests-time_between_check-minutes": 180,
'application-fetch_backend': "html_requests"},
follow_redirects=True
)
set_modified_response()
client.get(url_for("api_watch_checknow"), follow_redirects=True)
# Give the thread time to pick it up
time.sleep(sleep_time_for_fetch_thread)
# It should report nothing found (no new 'unviewed' class)
res = client.get(url_for("index"))
assert b'unviewed' in res.data
#
# Cleanup everything
res = client.get(url_for("api_delete", uuid="all"), follow_redirects=True)
assert b'Deleted' in res.data

View File

@@ -52,10 +52,6 @@ class update_worker(threading.Thread):
raise Exception("Error - returned data from the fetch handler SHOULD be bytes")
except PermissionError as e:
self.app.logger.error("File permission error updating", uuid, str(e))
except content_fetcher.ReplyWithContentButNoText as e:
# Totally fine, it's by choice - just continue on, nothing more to care about
# Page had elements/content but no renderable text
pass
except content_fetcher.EmptyReply as e:
# Some kind of custom to-str handler in the exception handler that does this?
err_text = "EmptyReply: Status Code {}".format(e.status_code)