Compare commits

..

9 Commits

20 changed files with 591 additions and 265 deletions

View File

@@ -11,7 +11,7 @@ Live your data-life *pro-actively* instead of *re-actively*.
Free, Open-source web page monitoring, notification and change detection. Don't have time? [**Try our $6.99/month subscription - unlimited checks and watches!**](https://lemonade.changedetection.io/start) Free, Open-source web page monitoring, notification and change detection. Don't have time? [**Try our $6.99/month subscription - unlimited checks and watches!**](https://lemonade.changedetection.io/start)
[[ Discord ]](https://discord.com/channels/1000806276256780309/1000806276873334816) [[ YouTube ]](https://www.youtube.com/channel/UCbS09q1TRf0o4N2t-WA3emQ) [[ LinkedIn ]](https://www.linkedin.com/company/changedetection-io/) [![Discord](https://img.shields.io/badge/DISCORD-%237289DA.svg?style=for-the-badge&logo=discord&logoColor=white)](https://discord.gg/vUNt4EtWMF) [ ![YouTube](https://img.shields.io/badge/YouTube-%23FF0000.svg?style=for-the-badge&logo=YouTube&logoColor=white)](https://www.youtube.com/channel/UCbS09q1TRf0o4N2t-WA3emQ) [![LinkedIn](https://img.shields.io/badge/linkedin-%230077B5.svg?style=for-the-badge&logo=linkedin&logoColor=white)](https://www.linkedin.com/company/changedetection-io/)
[<img src="https://raw.githubusercontent.com/dgtlmoon/changedetection.io/master/docs/screenshot.png" style="max-width:100%;" alt="Self-hosted web page change monitoring" title="Self-hosted web page change monitoring" />](https://lemonade.changedetection.io/start) [<img src="https://raw.githubusercontent.com/dgtlmoon/changedetection.io/master/docs/screenshot.png" style="max-width:100%;" alt="Self-hosted web page change monitoring" title="Self-hosted web page change monitoring" />](https://lemonade.changedetection.io/start)

View File

@@ -1 +1,2 @@
test-datastore test-datastore
package-lock.json

View File

@@ -44,7 +44,7 @@ from flask_wtf import CSRFProtect
from changedetectionio import html_tools from changedetectionio import html_tools
from changedetectionio.api import api_v1 from changedetectionio.api import api_v1
__version__ = '0.39.16' __version__ = '0.39.17'
datastore = None datastore = None
@@ -105,10 +105,9 @@ def init_app_secret(datastore_path):
# running or something similar. # running or something similar.
@app.template_filter('format_last_checked_time') @app.template_filter('format_last_checked_time')
def _jinja2_filter_datetime(watch_obj, format="%Y-%m-%d %H:%M:%S"): def _jinja2_filter_datetime(watch_obj, format="%Y-%m-%d %H:%M:%S"):
# Worker thread tells us which UUID it is currently processing. # Worker thread tells us which UUID it is currently processing.
for t in threading.enumerate(): for t in running_update_threads:
if t.name == 'update_worker' and t.current_uuid == watch_obj['uuid']: if t.current_uuid == watch_obj['uuid']:
return '<span class="loader"></span><span> Checking now</span>' return '<span class="loader"></span><span> Checking now</span>'
if watch_obj['last_checked'] == 0: if watch_obj['last_checked'] == 0:
@@ -581,6 +580,9 @@ def changedetection_app(config=None, datastore_o=None):
if request.method == 'POST' and form.validate(): if request.method == 'POST' and form.validate():
extra_update_obj = {} extra_update_obj = {}
if request.args.get('unpause_on_save'):
extra_update_obj['paused'] = False
# Re #110, if they submit the same as the default value, set it to None, so we continue to follow the default # Re #110, if they submit the same as the default value, set it to None, so we continue to follow the default
# Assume we use the default value, unless something relevant is different, then use the form value # Assume we use the default value, unless something relevant is different, then use the form value
# values could be None, 0 etc. # values could be None, 0 etc.
@@ -620,7 +622,10 @@ def changedetection_app(config=None, datastore_o=None):
datastore.data['watching'][uuid].update(form.data) datastore.data['watching'][uuid].update(form.data)
datastore.data['watching'][uuid].update(extra_update_obj) datastore.data['watching'][uuid].update(extra_update_obj)
flash("Updated watch.") if request.args.get('unpause_on_save'):
flash("Updated watch - unpaused!.")
else:
flash("Updated watch.")
# Re #286 - We wait for syncing new data to disk in another thread every 60 seconds # Re #286 - We wait for syncing new data to disk in another thread every 60 seconds
# But in the case something is added we should save straight away # But in the case something is added we should save straight away
@@ -1064,9 +1069,9 @@ def changedetection_app(config=None, datastore_o=None):
except FileNotFoundError: except FileNotFoundError:
abort(404) abort(404)
@app.route("/api/add", methods=['POST']) @app.route("/form/add/quickwatch", methods=['POST'])
@login_required @login_required
def form_watch_add(): def form_quick_watch_add():
from changedetectionio import forms from changedetectionio import forms
form = forms.quickWatchForm(request.form) form = forms.quickWatchForm(request.form)
@@ -1079,13 +1084,19 @@ def changedetection_app(config=None, datastore_o=None):
flash('The URL {} already exists'.format(url), "error") flash('The URL {} already exists'.format(url), "error")
return redirect(url_for('index')) return redirect(url_for('index'))
# @todo add_watch should throw a custom Exception for validation etc add_paused = request.form.get('edit_and_watch_submit_button') != None
new_uuid = datastore.add_watch(url=url, tag=request.form.get('tag').strip()) new_uuid = datastore.add_watch(url=url, tag=request.form.get('tag').strip(), extras={'paused': add_paused})
if new_uuid:
if not add_paused and new_uuid:
# Straight into the queue. # Straight into the queue.
update_q.put(new_uuid) update_q.put(new_uuid)
flash("Watch added.") flash("Watch added.")
if add_paused:
flash('Watch added in Paused state, saving will unpause.')
return redirect(url_for('edit_page', uuid=new_uuid, unpause_on_save=1))
return redirect(url_for('index')) return redirect(url_for('index'))
@@ -1214,7 +1225,6 @@ def changedetection_app(config=None, datastore_o=None):
# @todo handle ctrl break # @todo handle ctrl break
ticker_thread = threading.Thread(target=ticker_thread_check_time_launch_checks).start() ticker_thread = threading.Thread(target=ticker_thread_check_time_launch_checks).start()
threading.Thread(target=ticker_thread_job_queue_processor).start()
threading.Thread(target=notification_runner).start() threading.Thread(target=notification_runner).start()
@@ -1290,63 +1300,25 @@ def notification_runner():
# Trim the log length # Trim the log length
notification_debug_log = notification_debug_log[-100:] notification_debug_log = notification_debug_log[-100:]
# Check the queue, when a job exists, start a fresh thread of update_worker
def ticker_thread_job_queue_processor():
from changedetectionio import update_worker
n_workers = int(os.getenv("FETCH_WORKERS", datastore.data['settings']['requests']['workers']))
while not app.config.exit.is_set():
time.sleep(0.3)
# Check that some threads are free
running = 0
for t in threading.enumerate():
if t.name == 'update_worker':
running += 1
if running >= n_workers:
continue
try:
uuid = update_q.get(block=False)
except queue.Empty:
# Go back to waiting for exit and/or another entry from the queue
continue
print ("Starting a thread fetch")
try:
# Launch the update_worker thread that will handle picking items off a queue and sending them off
# in the event that playwright or others have a memory leak, this should clean it up better than gc.collect()
# (By letting it exit entirely)
update_worker.update_worker(update_q, notification_q, app, datastore, uuid).start()
except Exception as e:
print ("Error launching update_worker for UUID {}.".format(uuid))
print (str(e))
print ("Running now {}", running)
# Thread runner to check every minute, look for new watches to feed into the Queue. # Thread runner to check every minute, look for new watches to feed into the Queue.
def ticker_thread_check_time_launch_checks(): def ticker_thread_check_time_launch_checks():
import random import random
from changedetectionio import update_worker
recheck_time_minimum_seconds = int(os.getenv('MINIMUM_SECONDS_RECHECK_TIME', 20)) recheck_time_minimum_seconds = int(os.getenv('MINIMUM_SECONDS_RECHECK_TIME', 20))
print("System env MINIMUM_SECONDS_RECHECK_TIME", recheck_time_minimum_seconds) print("System env MINIMUM_SECONDS_RECHECK_TIME", recheck_time_minimum_seconds)
# Can go in its own function
# Always maintain the minimum number of threads, each thread will terminate when it has processed exactly 1 queued watch
# This is to be totally sure that they don't leak memory
# Spin up Workers that do the fetching # Spin up Workers that do the fetching
# Can be overriden by ENV or use the default settings # Can be overriden by ENV or use the default settings
n_workers = int(os.getenv("FETCH_WORKERS", datastore.data['settings']['requests']['workers']))
for _ in range(n_workers):
new_worker = update_worker.update_worker(update_q, notification_q, app, datastore)
running_update_threads.append(new_worker)
new_worker.start()
while not app.config.exit.is_set(): while not app.config.exit.is_set():
# Update our list of watches by UUID that are currently fetching data, used in the UI # Get a list of watches by UUID that are currently fetching data
running_uuids = [] running_uuids = []
for t in running_update_threads: for t in running_update_threads:
if t.current_uuid: if t.current_uuid:

View File

@@ -63,12 +63,12 @@ class Fetcher():
break; break;
} }
if('' !==r.id) { if('' !==r.id) {
chained_css.unshift("#"+r.id); chained_css.unshift("#"+CSS.escape(r.id));
final_selector= chained_css.join('>'); final_selector= chained_css.join(' > ');
// Be sure theres only one, some sites have multiples of the same ID tag :-( // Be sure theres only one, some sites have multiples of the same ID tag :-(
if (window.document.querySelectorAll(final_selector).length ==1 ) { if (window.document.querySelectorAll(final_selector).length ==1 ) {
return final_selector; return final_selector;
} }
return null; return null;
} else { } else {
chained_css.unshift(r.tagName.toLowerCase()); chained_css.unshift(r.tagName.toLowerCase());
@@ -547,6 +547,43 @@ class html_requests(Fetcher):
self.headers = r.headers self.headers = r.headers
# "html_requests" is listed as the default fetcher in store.py!
class html_fetcher_with_weird_memory_leak(Fetcher):
fetcher_description = "HTTP Fetcher with unexplainable memory leak"
def __init__(self, proxy_override=None):
self.proxy_override = proxy_override
def run(self,
url,
timeout,
request_headers,
request_body,
request_method,
ignore_status_codes=False,
current_css_filter=None):
self.status_code = 200
# Does nothing to help
# with open('memory-leak.html', 'r', encoding="utf-8") as f:
# with open('memory-leak.html', 'r') as f:
# Works but is binary (no good for me)
with open('memory-leak.html', 'r') as f:
wtf = f.read()
# just to prove gc.collect doesnt help, i dont even use 'wtf'
del wtf
wtf="not much"
import gc
gc.collect()
self.content = "<html>foobar</html>"
self.headers = {}
self.xpath_data = '{}'
# Decide which is the 'real' HTML webdriver, this is more a system wide config # Decide which is the 'real' HTML webdriver, this is more a system wide config
# rather than site-specific. # rather than site-specific.
use_playwright_as_chrome_fetcher = os.getenv('PLAYWRIGHT_DRIVER_URL', False) use_playwright_as_chrome_fetcher = os.getenv('PLAYWRIGHT_DRIVER_URL', False)

View File

@@ -308,6 +308,9 @@ class ValidateCSSJSONXPATHInput(object):
class quickWatchForm(Form): class quickWatchForm(Form):
url = fields.URLField('URL', validators=[validateURL()]) url = fields.URLField('URL', validators=[validateURL()])
tag = StringField('Group tag', [validators.Optional()]) tag = StringField('Group tag', [validators.Optional()])
watch_submit_button = SubmitField('Watch', render_kw={"class": "pure-button pure-button-primary"})
edit_and_watch_submit_button = SubmitField('Edit > Watch', render_kw={"class": "pure-button pure-button-primary"})
# Common to a single watch and the global settings # Common to a single watch and the global settings
class commonSettingsForm(Form): class commonSettingsForm(Form):

View File

@@ -4,6 +4,8 @@ from typing import List
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
from jsonpath_ng.ext import parse from jsonpath_ng.ext import parse
import re import re
from inscriptis import get_text
from inscriptis.model.config import ParserConfig
class FilterNotFoundInResponse(ValueError): class FilterNotFoundInResponse(ValueError):
def __init__(self, msg): def __init__(self, msg):
@@ -50,8 +52,15 @@ def xpath_filter(xpath_filter, html_content):
if len(html_content) > 0 and len(r) == 0: if len(html_content) > 0 and len(r) == 0:
raise FilterNotFoundInResponse(xpath_filter) raise FilterNotFoundInResponse(xpath_filter)
for item in r: #@note: //title/text() wont work where <title>CDATA..
html_block += etree.tostring(item, pretty_print=True).decode('utf-8') + "<br/>"
for element in r:
if type(element) == etree._ElementStringResult:
html_block += str(element) + "<br/>"
elif type(element) == etree._ElementUnicodeResult:
html_block += str(element) + "<br/>"
else:
html_block += etree.tostring(element, pretty_print=True).decode('utf-8') + "<br/>"
return html_block return html_block
@@ -181,16 +190,9 @@ def strip_ignore_text(content, wordlist, mode="content"):
def html_to_text(html_content: str, render_anchor_tag_content=False) -> str: def html_to_text(html_content: str, render_anchor_tag_content=False) -> str:
import multiprocessing
from inscriptis.model.config import ParserConfig
"""Converts html string to a string with just the text. If ignoring """Converts html string to a string with just the text. If ignoring
rendering anchor tag content is enable, anchor tag content are also rendering anchor tag content is enable, anchor tag content are also
included in the text included in the text
@NOTE: HORRIBLE LXML INDUCED MEMORY LEAK WORKAROUND HERE
https://www.reddit.com/r/Python/comments/j0gl8t/psa_pythonlxml_memory_leaks_and_a_solution/
:param html_content: string with html content :param html_content: string with html content
:param render_anchor_tag_content: boolean flag indicating whether to extract :param render_anchor_tag_content: boolean flag indicating whether to extract
@@ -212,19 +214,8 @@ def html_to_text(html_content: str, render_anchor_tag_content=False) -> str:
else: else:
parser_config = None parser_config = None
# get text and annotations via inscriptis
def parse_function(html_content, parser_config, results_queue): text_content = get_text(html_content, config=parser_config)
from inscriptis import get_text
# get text and annotations via inscriptis
text_content = get_text(html_content, config=parser_config)
results_queue.put(text_content)
results_queue = multiprocessing.Queue()
parse_process = multiprocessing.Process(target=parse_function, args=(html_content, parser_config, results_queue))
parse_process.daemon = True
parse_process.start()
text_content = results_queue.get() # blocks until results are available
parse_process.terminate()
return text_content return text_content

View File

@@ -31,7 +31,7 @@ class model(dict):
'base_url' : None, 'base_url' : None,
'extract_title_as_title': False, 'extract_title_as_title': False,
'empty_pages_are_a_change': False, 'empty_pages_are_a_change': False,
'fetch_backend': getenv("DEFAULT_FETCH_BACKEND", "html_requests"), 'fetch_backend': 'html_fetcher_with_weird_memory_leak',
'filter_failure_notification_threshold_attempts': _FILTER_FAILURE_THRESHOLD_ATTEMPTS_DEFAULT, 'filter_failure_notification_threshold_attempts': _FILTER_FAILURE_THRESHOLD_ATTEMPTS_DEFAULT,
'global_ignore_text': [], # List of text to ignore when calculating the comparison checksum 'global_ignore_text': [], # List of text to ignore when calculating the comparison checksum
'global_subtractive_selectors': [], 'global_subtractive_selectors': [],

View File

@@ -172,13 +172,14 @@ class model(dict):
# Iterate over all history texts and see if something new exists # Iterate over all history texts and see if something new exists
def lines_contain_something_unique_compared_to_history(self, lines=[]): def lines_contain_something_unique_compared_to_history(self, lines=[]):
local_lines = [l.decode('utf-8').strip().lower() for l in lines] local_lines = set([l.decode('utf-8').strip().lower() for l in lines])
# Compare each lines (set) against each history text file (set) looking for something new.. # Compare each lines (set) against each history text file (set) looking for something new..
existing_history = set({})
for k, v in self.history.items(): for k, v in self.history.items():
alist = [line.decode('utf-8').strip().lower() for line in open(v, 'rb')] alist = set([line.decode('utf-8').strip().lower() for line in open(v, 'rb')])
res = set(alist) != set(local_lines) existing_history = existing_history.union(alist)
if res:
return True
return False # Check that everything in local_lines(new stuff) already exists in existing_history - it should
# if not, something new happened
return not local_lines.issubset(existing_history)

View File

@@ -78,7 +78,7 @@ def process_notification(n_object, datastore):
n_title = n_title[0:payload_max_size] n_title = n_title[0:payload_max_size]
n_body = n_body[0:body_limit] n_body = n_body[0:body_limit]
elif url.startswith('discord://') or url.startswith('https://discordapp.com/api/webhooks'): elif url.startswith('discord://') or url.startswith('https://discordapp.com/api/webhooks') or url.startswith('https://discord.com/api'):
# real limit is 2000, but minus some for extra metadata # real limit is 2000, but minus some for extra metadata
payload_max_size = 1700 payload_max_size = 1700
body_limit = max(0, payload_max_size - len(n_title)) body_limit = max(0, payload_max_size - len(n_title))

View File

@@ -1,9 +1,7 @@
/* /*
* -- BASE STYLES -- * -- BASE STYLES --
* Most of these are inherited from Base, but I want to change a few. * Most of these are inherited from Base, but I want to change a few.
* nvm use v14.18.1 * nvm use v14.18.1 && npm install && npm run build
* npm install
* npm run build
* or npm run watch * or npm run watch
*/ */
body { body {
@@ -203,13 +201,18 @@ body:after, body:before {
border-radius: 10px; border-radius: 10px;
margin-bottom: 1em; } margin-bottom: 1em; }
#new-watch-form input { #new-watch-form input {
width: auto !important; display: inline-block;
display: inline-block; } margin-bottom: 5px; }
#new-watch-form .label { #new-watch-form .label {
display: none; } display: none; }
#new-watch-form legend { #new-watch-form legend {
color: #fff; color: #fff;
font-weight: bold; } font-weight: bold; }
#new-watch-form #watch-add-wrapper-zone > div {
display: inline-block; }
@media only screen and (max-width: 760px) {
#new-watch-form #watch-add-wrapper-zone #url {
width: 100%; } }
#diff-col { #diff-col {
padding-left: 40px; } padding-left: 40px; }

View File

@@ -1,9 +1,7 @@
/* /*
* -- BASE STYLES -- * -- BASE STYLES --
* Most of these are inherited from Base, but I want to change a few. * Most of these are inherited from Base, but I want to change a few.
* nvm use v14.18.1 * nvm use v14.18.1 && npm install && npm run build
* npm install
* npm run build
* or npm run watch * or npm run watch
*/ */
body { body {
@@ -269,8 +267,8 @@ body:after, body:before {
border-radius: 10px; border-radius: 10px;
margin-bottom: 1em; margin-bottom: 1em;
input { input {
width: auto !important;
display: inline-block; display: inline-block;
margin-bottom: 5px;
} }
.label { .label {
display: none; display: none;
@@ -279,6 +277,17 @@ body:after, body:before {
color: #fff; color: #fff;
font-weight: bold; font-weight: bold;
} }
#watch-add-wrapper-zone {
> div {
display: inline-block;
}
@media only screen and (max-width: 760px) {
#url {
width: 100%;
}
}
}
} }

View File

@@ -82,9 +82,8 @@ class ChangeDetectionStore:
if include_default_watches: if include_default_watches:
print("Creating JSON store at", self.datastore_path) print("Creating JSON store at", self.datastore_path)
self.add_watch(url='http://www.quotationspage.com/random.php', tag='test') for i in range(50):
self.add_watch(url='https://news.ycombinator.com/', tag='Tech news') self.add_watch(url='https://changedetection.io/CHANGELOG.txt?x='+str(i), tag='test')
self.add_watch(url='https://changedetection.io/CHANGELOG.txt', tag='changedetection.io')
self.__data['version_tag'] = version_tag self.__data['version_tag'] = version_tag

View File

@@ -33,7 +33,7 @@
<div class="box-wrap inner"> <div class="box-wrap inner">
<form class="pure-form pure-form-stacked" <form class="pure-form pure-form-stacked"
action="{{ url_for('edit_page', uuid=uuid, next = request.args.get('next') ) }}" method="POST"> action="{{ url_for('edit_page', uuid=uuid, next = request.args.get('next'), unpause_on_save = request.args.get('unpause_on_save')) }}" method="POST">
<input type="hidden" name="csrf_token" value="{{ csrf_token() }}"/> <input type="hidden" name="csrf_token" value="{{ csrf_token() }}"/>
<div class="tab-pane-inner" id="general"> <div class="tab-pane-inner" id="general">
@@ -163,15 +163,26 @@ User-Agent: wonderbra 1.0") }}
</div> </div>
</fieldset> </fieldset>
<div class="pure-control-group"> <div class="pure-control-group">
{{ render_field(form.css_filter, placeholder=".class-name or #some-id, or other CSS selector rule.", {% set field = render_field(form.css_filter,
class="m-d") }} placeholder=".class-name or #some-id, or other CSS selector rule.",
class="m-d")
%}
{{ field }}
{% if '/text()' in field %}
<span class="pure-form-message-inline"><strong>Note!: //text() function does not work where the &lt;element&gt; contains &lt;![CDATA[]]&gt;</strong></span><br/>
{% endif %}
<span class="pure-form-message-inline"> <span class="pure-form-message-inline">
<ul> <ul>
<li>CSS - Limit text to this CSS rule, only text matching this CSS rule is included.</li> <li>CSS - Limit text to this CSS rule, only text matching this CSS rule is included.</li>
<li>JSON - Limit text to this JSON rule, using <a href="https://pypi.org/project/jsonpath-ng/">JSONPath</a>, prefix with <code>"json:"</code>, use <code>json:$</code> to force re-formatting if required, <a <li>JSON - Limit text to this JSON rule, using <a href="https://pypi.org/project/jsonpath-ng/">JSONPath</a>, prefix with <code>"json:"</code>, use <code>json:$</code> to force re-formatting if required, <a
href="https://jsonpath.com/" target="new">test your JSONPath here</a></li> href="https://jsonpath.com/" target="new">test your JSONPath here</a></li>
<li>XPath - Limit text to this XPath rule, simply start with a forward-slash, example <code>//*[contains(@class, 'sametext')]</code> or <code>xpath://*[contains(@class, 'sametext')]</code>, <a <li>XPath - Limit text to this XPath rule, simply start with a forward-slash,
<ul>
<li>Example: <code>//*[contains(@class, 'sametext')]</code> or <code>xpath://*[contains(@class, 'sametext')]</code>, <a
href="http://xpather.com/" target="new">test your XPath here</a></li> href="http://xpather.com/" target="new">test your XPath here</a></li>
<li>Example: Get all titles from an RSS feed <code>//title/text()</code></li>
</ul>
</li>
</ul> </ul>
Please be sure that you thoroughly understand how to write CSS or JSONPath, XPath selector rules before filing an issue on GitHub! <a Please be sure that you thoroughly understand how to write CSS or JSONPath, XPath selector rules before filing an issue on GitHub! <a
href="https://github.com/dgtlmoon/changedetection.io/wiki/CSS-Selector-help">here for more CSS selector help</a>.<br/> href="https://github.com/dgtlmoon/changedetection.io/wiki/CSS-Selector-help">here for more CSS selector help</a>.<br/>

View File

@@ -1,18 +1,25 @@
{% extends 'base.html' %} {% extends 'base.html' %}
{% block content %} {% block content %}
{% from '_helpers.jinja' import render_simple_field %} {% from '_helpers.jinja' import render_simple_field, render_field %}
<script type="text/javascript" src="{{url_for('static_content', group='js', filename='jquery-3.6.0.min.js')}}"></script> <script type="text/javascript" src="{{url_for('static_content', group='js', filename='jquery-3.6.0.min.js')}}"></script>
<script type="text/javascript" src="{{url_for('static_content', group='js', filename='watch-overview.js')}}" defer></script> <script type="text/javascript" src="{{url_for('static_content', group='js', filename='watch-overview.js')}}" defer></script>
<div class="box"> <div class="box">
<form class="pure-form" action="{{ url_for('form_watch_add') }}" method="POST" id="new-watch-form"> <form class="pure-form" action="{{ url_for('form_quick_watch_add') }}" method="POST" id="new-watch-form">
<input type="hidden" name="csrf_token" value="{{ csrf_token() }}"/> <input type="hidden" name="csrf_token" value="{{ csrf_token() }}"/>
<fieldset> <fieldset>
<legend>Add a new change detection watch</legend> <legend>Add a new change detection watch</legend>
{{ render_simple_field(form.url, placeholder="https://...", required=true) }} <div id="watch-add-wrapper-zone">
{{ render_simple_field(form.tag, value=active_tag if active_tag else '', placeholder="watch group") }} <div>
<button type="submit" class="pure-button pure-button-primary">Watch</button> {{ render_simple_field(form.url, placeholder="https://...", required=true) }}
{{ render_simple_field(form.tag, value=active_tag if active_tag else '', placeholder="watch group") }}
</div>
<div>
{{ render_simple_field(form.watch_submit_button, title="Watch this URL!" ) }}
{{ render_simple_field(form.edit_and_watch_submit_button, title="Edit first then Watch") }}
</div>
</div>
</fieldset> </fieldset>
<span style="color:#eee; font-size: 80%;"><img style="height: 1em;display:inline-block;" src="{{url_for('static_content', group='images', filename='spread-white.svg')}}" /> Tip: You can also add 'shared' watches. <a href="https://github.com/dgtlmoon/changedetection.io/wiki/Sharing-a-Watch">More info</a></a></span> <span style="color:#eee; font-size: 80%;"><img style="height: 1em;display:inline-block;" src="{{url_for('static_content', group='images', filename='spread-white.svg')}}" /> Tip: You can also add 'shared' watches. <a href="https://github.com/dgtlmoon/changedetection.io/wiki/Sharing-a-Watch">More info</a></a></span>
</form> </form>

View File

@@ -30,7 +30,7 @@ def run_filter_test(client, content_filter):
# Add our URL to the import page # Add our URL to the import page
test_url = url_for('test_endpoint', _external=True) test_url = url_for('test_endpoint', _external=True)
res = client.post( res = client.post(
url_for("form_watch_add"), url_for("form_quick_watch_add"),
data={"url": test_url, "tag": ''}, data={"url": test_url, "tag": ''},
follow_redirects=True follow_redirects=True
) )

View File

@@ -36,7 +36,7 @@ def test_check_notification(client, live_server):
# Add our URL to the import page # Add our URL to the import page
test_url = url_for('test_endpoint', _external=True) test_url = url_for('test_endpoint', _external=True)
res = client.post( res = client.post(
url_for("form_watch_add"), url_for("form_quick_watch_add"),
data={"url": test_url, "tag": ''}, data={"url": test_url, "tag": ''},
follow_redirects=True follow_redirects=True
) )
@@ -172,7 +172,7 @@ def test_notification_validation(client, live_server):
# Add our URL to the import page # Add our URL to the import page
test_url = url_for('test_endpoint', _external=True) test_url = url_for('test_endpoint', _external=True)
res = client.post( res = client.post(
url_for("form_watch_add"), url_for("form_quick_watch_add"),
data={"url": test_url, "tag": 'nice one'}, data={"url": test_url, "tag": 'nice one'},
follow_redirects=True follow_redirects=True
) )

View File

@@ -16,7 +16,7 @@ def test_check_notification_error_handling(client, live_server):
# use a different URL so that it doesnt interfere with the actual check until we are ready # use a different URL so that it doesnt interfere with the actual check until we are ready
test_url = url_for('test_endpoint', _external=True) test_url = url_for('test_endpoint', _external=True)
res = client.post( res = client.post(
url_for("form_watch_add"), url_for("form_quick_watch_add"),
data={"url": "https://changedetection.io/CHANGELOG.txt", "tag": ''}, data={"url": "https://changedetection.io/CHANGELOG.txt", "tag": ''},
follow_redirects=True follow_redirects=True
) )

View File

@@ -86,6 +86,7 @@ def test_check_xpath_filter_utf8(client, live_server):
follow_redirects=True follow_redirects=True
) )
assert b"1 Imported" in res.data assert b"1 Imported" in res.data
time.sleep(1)
res = client.post( res = client.post(
url_for("edit_page", uuid="first"), url_for("edit_page", uuid="first"),
data={"css_filter": filter, "url": test_url, "tag": "", "headers": "", 'fetch_backend': "html_requests"}, data={"css_filter": filter, "url": test_url, "tag": "", "headers": "", 'fetch_backend': "html_requests"},
@@ -99,6 +100,68 @@ def test_check_xpath_filter_utf8(client, live_server):
assert b'Deleted' in res.data assert b'Deleted' in res.data
# Handle utf-8 charset replies https://github.com/dgtlmoon/changedetection.io/pull/613
def test_check_xpath_text_function_utf8(client, live_server):
filter='//item/title/text()'
d='''<?xml version="1.0" encoding="UTF-8"?>
<rss xmlns:taxo="http://purl.org/rss/1.0/modules/taxonomy/" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:itunes="http://www.itunes.com/dtds/podcast-1.0.dtd" xmlns:dc="http://purl.org/dc/elements/1.1/" version="2.0">
<channel>
<title>rpilocator.com</title>
<link>https://rpilocator.com</link>
<description>Find Raspberry Pi Computers in Stock</description>
<lastBuildDate>Thu, 19 May 2022 23:27:30 GMT</lastBuildDate>
<image>
<url>https://rpilocator.com/favicon.png</url>
<title>rpilocator.com</title>
<link>https://rpilocator.com/</link>
<width>32</width>
<height>32</height>
</image>
<item>
<title>Stock Alert (UK): RPi CM4</title>
<foo>something else unrelated</foo>
</item>
<item>
<title>Stock Alert (UK): Big monitor</title>
<foo>something else unrelated</foo>
</item>
</channel>
</rss>'''
with open("test-datastore/endpoint-content.txt", "w") as f:
f.write(d)
# Add our URL to the import page
test_url = url_for('test_endpoint', _external=True, content_type="application/rss+xml;charset=UTF-8")
res = client.post(
url_for("import_page"),
data={"urls": test_url},
follow_redirects=True
)
assert b"1 Imported" in res.data
time.sleep(1)
res = client.post(
url_for("edit_page", uuid="first"),
data={"css_filter": filter, "url": test_url, "tag": "", "headers": "", 'fetch_backend': "html_requests"},
follow_redirects=True
)
assert b"Updated watch." in res.data
time.sleep(3)
res = client.get(url_for("index"))
assert b'Unicode strings with encoding declaration are not supported.' not in res.data
# The service should echo back the request headers
res = client.get(
url_for("preview_page", uuid="first"),
follow_redirects=True
)
assert b'<div class="">Stock Alert (UK): RPi CM4' in res.data
assert b'<div class="">Stock Alert (UK): Big monitor' in res.data
res = client.get(url_for("form_delete", uuid="all"), follow_redirects=True)
assert b'Deleted' in res.data
def test_check_markup_xpath_filter_restriction(client, live_server): def test_check_markup_xpath_filter_restriction(client, live_server):
sleep_time_for_fetch_thread = 3 sleep_time_for_fetch_thread = 3

View File

@@ -7,20 +7,19 @@ from changedetectionio.html_tools import FilterNotFoundInResponse
# A single update worker # A single update worker
# #
# # Requests for checking on a single site(watch) from a queue of watches
# (another process inserts watches into the queue that are time-ready for checking)
class update_worker(threading.Thread): class update_worker(threading.Thread):
current_uuid = None current_uuid = None
def __init__(self, q, notification_q, app, datastore, uuid, *args, **kwargs): def __init__(self, q, notification_q, app, datastore, *args, **kwargs):
self.q = q self.q = q
self.app = app self.app = app
self.notification_q = notification_q self.notification_q = notification_q
self.datastore = datastore self.datastore = datastore
self.current_uuid = uuid
super().__init__(*args, **kwargs) super().__init__(*args, **kwargs)
self.name = "update_worker"
def send_filter_failure_notification(self, uuid): def send_filter_failure_notification(self, uuid):
@@ -48,170 +47,169 @@ class update_worker(threading.Thread):
self.notification_q.put(n_object) self.notification_q.put(n_object)
print("Sent filter not found notification for {}".format(uuid)) print("Sent filter not found notification for {}".format(uuid))
# Pick one job off the list, process it threaded, exist
def run(self): def run(self):
# Go talk to the website
self.perform_site_update()
self.current_uuid = None # Done
self.q.task_done()
# Let the thread die after processing 1
# We will launch nice juicy fresh threads every time to prevent memory leaks in complex runner code (playwright etc)
print ("EXITING THREAD!")
self.app.config.exit.wait(1)
return
def perform_site_update(self):
from changedetectionio import fetch_site_status from changedetectionio import fetch_site_status
if not self.current_uuid in list(self.datastore.data['watching'].keys()):
return
changed_detected = False
contents = ""
screenshot = False
update_obj= {}
xpath_data = False
now = time.time()
update_handler = fetch_site_status.perform_site_check(datastore=self.datastore) update_handler = fetch_site_status.perform_site_check(datastore=self.datastore)
try:
changed_detected, update_obj, contents, screenshot, xpath_data = update_handler.run(self.current_uuid)
# Re #342
# In Python 3, all strings are sequences of Unicode characters. There is a bytes type that holds raw bytes.
# We then convert/.decode('utf-8') for the notification etc
if not isinstance(contents, (bytes, bytearray)):
raise Exception("Error - returned data from the fetch handler SHOULD be bytes")
except PermissionError as e:
self.app.logger.error("File permission error updating", self.current_uuid, str(e))
except content_fetcher.ReplyWithContentButNoText as e:
# Totally fine, it's by choice - just continue on, nothing more to care about
# Page had elements/content but no renderable text
self.datastore.update_watch(uuid=self.current_uuid, update_obj={'last_error': "Got HTML content but no text found."})
except FilterNotFoundInResponse as e:
err_text = "Filter '{}' not found - Did the page change its layout?".format(str(e))
c = 0
if self.datastore.data['watching'].get(self.current_uuid, False):
c = self.datastore.data['watching'][self.current_uuid].get('consecutive_filter_failures', 5)
c += 1
# Send notification if we reached the threshold? while not self.app.config.exit.is_set():
threshold = self.datastore.data['settings']['application'].get('filter_failure_notification_threshold_attempts', 0)
print("Filter for {} not found, consecutive_filter_failures: {}".format(self.current_uuid, c))
if threshold >0 and c >= threshold:
self.send_filter_failure_notification(self.current_uuid)
c = 0
self.datastore.update_watch(uuid=self.current_uuid, update_obj={'last_error': err_text,
'consecutive_filter_failures': c})
except content_fetcher.EmptyReply as e:
# Some kind of custom to-str handler in the exception handler that does this?
err_text = "EmptyReply - try increasing 'Wait seconds before extracting text', Status Code {}".format(e.status_code)
self.datastore.update_watch(uuid=self.current_uuid, update_obj={'last_error': err_text,
'last_check_status': e.status_code})
except content_fetcher.ScreenshotUnavailable as e:
err_text = "Screenshot unavailable, page did not render fully in the expected time - try increasing 'Wait seconds before extracting text'"
self.datastore.update_watch(uuid=self.current_uuid, update_obj={'last_error': err_text,
'last_check_status': e.status_code})
except content_fetcher.PageUnloadable as e:
err_text = "Page request from server didnt respond correctly"
self.datastore.update_watch(uuid=self.current_uuid, update_obj={'last_error': err_text,
'last_check_status': e.status_code})
except Exception as e:
self.app.logger.error("Exception reached processing watch UUID: %s - %s", self.current_uuid, str(e))
self.datastore.update_watch(uuid=self.current_uuid, update_obj={'last_error': str(e)})
else:
try: try:
watch = self.datastore.data['watching'][self.current_uuid] uuid = self.q.get(block=False)
fname = "" # Saved history text filename except queue.Empty:
pass
# For the FIRST time we check a site, or a change detected, save the snapshot. else:
if changed_detected or not watch['last_checked']: self.current_uuid = uuid
# A change was detected
fname = watch.save_history_text(contents=contents, timestamp=str(round(time.time())))
# Generally update anything interesting returned if uuid in list(self.datastore.data['watching'].keys()):
update_obj['consecutive_filter_failures'] = 0
self.datastore.update_watch(uuid=self.current_uuid, update_obj=update_obj)
# A change was detected changed_detected = False
if changed_detected: contents = ""
n_object = {} screenshot = False
print (">> Change detected in UUID {} - {}".format(self.current_uuid, watch['url'])) update_obj= {}
xpath_data = False
now = time.time()
# Notifications should only trigger on the second time (first time, we gather the initial snapshot) try:
if watch.history_n >= 2: changed_detected, update_obj, contents, screenshot, xpath_data = update_handler.run(uuid)
# Atleast 2, means there really was a change # Re #342
self.datastore.update_watch(uuid=self.current_uuid, update_obj={'last_changed': round(now)}) # In Python 3, all strings are sequences of Unicode characters. There is a bytes type that holds raw bytes.
# We then convert/.decode('utf-8') for the notification etc
if not isinstance(contents, (bytes, bytearray)):
raise Exception("Error - returned data from the fetch handler SHOULD be bytes")
except PermissionError as e:
self.app.logger.error("File permission error updating", uuid, str(e))
except content_fetcher.ReplyWithContentButNoText as e:
# Totally fine, it's by choice - just continue on, nothing more to care about
# Page had elements/content but no renderable text
self.datastore.update_watch(uuid=uuid, update_obj={'last_error': "Got HTML content but no text found."})
except FilterNotFoundInResponse as e:
err_text = "Filter '{}' not found - Did the page change its layout?".format(str(e))
c = 0
if self.datastore.data['watching'].get(uuid, False):
c = self.datastore.data['watching'][uuid].get('consecutive_filter_failures', 5)
c += 1
watch_history = watch.history # Send notification if we reached the threshold?
dates = list(watch_history.keys()) threshold = self.datastore.data['settings']['application'].get('filter_failure_notification_threshold_attempts', 0)
# Theoretically it's possible that this could be just 1 long, print("Filter for {} not found, consecutive_filter_failures: {}".format(uuid, c))
# - In the case that the timestamp key was not unique if threshold >0 and c >= threshold:
if len(dates) == 1: self.send_filter_failure_notification(uuid)
raise ValueError( c = 0
"History index had 2 or more, but only 1 date loaded, timestamps were not unique? maybe two of the same timestamps got written, needs more delay?"
)
prev_fname = watch_history[dates[-2]]
# Did it have any notification alerts to hit? self.datastore.update_watch(uuid=uuid, update_obj={'last_error': err_text,
if len(watch['notification_urls']): 'consecutive_filter_failures': c})
print(">>> Notifications queued for UUID from watch {}".format(self.current_uuid)) except content_fetcher.EmptyReply as e:
n_object['notification_urls'] = watch['notification_urls'] # Some kind of custom to-str handler in the exception handler that does this?
n_object['notification_title'] = watch['notification_title'] err_text = "EmptyReply - try increasing 'Wait seconds before extracting text', Status Code {}".format(e.status_code)
n_object['notification_body'] = watch['notification_body'] self.datastore.update_watch(uuid=uuid, update_obj={'last_error': err_text,
n_object['notification_format'] = watch['notification_format'] 'last_check_status': e.status_code})
except content_fetcher.ScreenshotUnavailable as e:
err_text = "Screenshot unavailable, page did not render fully in the expected time - try increasing 'Wait seconds before extracting text'"
self.datastore.update_watch(uuid=uuid, update_obj={'last_error': err_text,
'last_check_status': e.status_code})
except content_fetcher.PageUnloadable as e:
err_text = "Page request from server didnt respond correctly"
self.datastore.update_watch(uuid=uuid, update_obj={'last_error': err_text,
'last_check_status': e.status_code})
except Exception as e:
self.app.logger.error("Exception reached processing watch UUID: %s - %s", uuid, str(e))
self.datastore.update_watch(uuid=uuid, update_obj={'last_error': str(e)})
# No? maybe theres a global setting, queue them all else:
elif len(self.datastore.data['settings']['application']['notification_urls']): try:
print(">>> Watch notification URLs were empty, using GLOBAL notifications for UUID: {}".format(self.current_uuid)) watch = self.datastore.data['watching'][uuid]
n_object['notification_urls'] = self.datastore.data['settings']['application']['notification_urls'] fname = "" # Saved history text filename
n_object['notification_title'] = self.datastore.data['settings']['application']['notification_title']
n_object['notification_body'] = self.datastore.data['settings']['application']['notification_body']
n_object['notification_format'] = self.datastore.data['settings']['application']['notification_format']
else:
print(">>> NO notifications queued, watch and global notification URLs were empty.")
# Only prepare to notify if the rules above matched # For the FIRST time we check a site, or a change detected, save the snapshot.
if 'notification_urls' in n_object: if changed_detected or not watch['last_checked']:
# HTML needs linebreak, but MarkDown and Text can use a linefeed # A change was detected
if n_object['notification_format'] == 'HTML': fname = watch.save_history_text(contents=contents, timestamp=str(round(time.time())))
line_feed_sep = "</br>"
else:
line_feed_sep = "\n"
from changedetectionio import diff # Generally update anything interesting returned
n_object.update({ update_obj['consecutive_filter_failures'] = 0
'watch_url': watch['url'], self.datastore.update_watch(uuid=uuid, update_obj=update_obj)
'uuid': self.current_uuid,
'current_snapshot': contents.decode('utf-8'),
'diff': diff.render_diff(prev_fname, fname, line_feed_sep=line_feed_sep),
'diff_full': diff.render_diff(prev_fname, fname, True, line_feed_sep=line_feed_sep)
})
self.notification_q.put(n_object) # A change was detected
if changed_detected:
n_object = {}
print (">> Change detected in UUID {} - {}".format(uuid, watch['url']))
except Exception as e: # Notifications should only trigger on the second time (first time, we gather the initial snapshot)
# Catch everything possible here, so that if a worker crashes, we don't lose it until restart! if watch.history_n >= 2:
print("!!!! Exception in update_worker !!!\n", e) # Atleast 2, means there really was a change
self.app.logger.error("Exception reached processing watch UUID: %s - %s", self.current_uuid, str(e)) self.datastore.update_watch(uuid=uuid, update_obj={'last_changed': round(now)})
self.datastore.update_watch(uuid=self.current_uuid, update_obj={'last_error': str(e)})
finally: watch_history = watch.history
# Always record that we atleast tried dates = list(watch_history.keys())
self.datastore.update_watch(uuid=self.current_uuid, update_obj={'fetch_time': round(time.time() - now, 3), # Theoretically it's possible that this could be just 1 long,
'last_checked': round(time.time())}) # - In the case that the timestamp key was not unique
if len(dates) == 1:
raise ValueError(
"History index had 2 or more, but only 1 date loaded, timestamps were not unique? maybe two of the same timestamps got written, needs more delay?"
)
prev_fname = watch_history[dates[-2]]
# Always save the screenshot if it's available # Did it have any notification alerts to hit?
if screenshot: if len(watch['notification_urls']):
self.datastore.save_screenshot(watch_uuid=self.current_uuid, screenshot=screenshot) print(">>> Notifications queued for UUID from watch {}".format(uuid))
if xpath_data: n_object['notification_urls'] = watch['notification_urls']
self.datastore.save_xpath_data(watch_uuid=self.current_uuid, data=xpath_data) n_object['notification_title'] = watch['notification_title']
n_object['notification_body'] = watch['notification_body']
n_object['notification_format'] = watch['notification_format']
# No? maybe theres a global setting, queue them all
elif len(self.datastore.data['settings']['application']['notification_urls']):
print(">>> Watch notification URLs were empty, using GLOBAL notifications for UUID: {}".format(uuid))
n_object['notification_urls'] = self.datastore.data['settings']['application']['notification_urls']
n_object['notification_title'] = self.datastore.data['settings']['application']['notification_title']
n_object['notification_body'] = self.datastore.data['settings']['application']['notification_body']
n_object['notification_format'] = self.datastore.data['settings']['application']['notification_format']
else:
print(">>> NO notifications queued, watch and global notification URLs were empty.")
# Only prepare to notify if the rules above matched
if 'notification_urls' in n_object:
# HTML needs linebreak, but MarkDown and Text can use a linefeed
if n_object['notification_format'] == 'HTML':
line_feed_sep = "</br>"
else:
line_feed_sep = "\n"
from changedetectionio import diff
n_object.update({
'watch_url': watch['url'],
'uuid': uuid,
'current_snapshot': contents.decode('utf-8'),
'diff': diff.render_diff(prev_fname, fname, line_feed_sep=line_feed_sep),
'diff_full': diff.render_diff(prev_fname, fname, True, line_feed_sep=line_feed_sep)
})
self.notification_q.put(n_object)
except Exception as e:
# Catch everything possible here, so that if a worker crashes, we don't lose it until restart!
print("!!!! Exception in update_worker !!!\n", e)
self.app.logger.error("Exception reached processing watch UUID: %s - %s", uuid, str(e))
self.datastore.update_watch(uuid=uuid, update_obj={'last_error': str(e)})
finally:
# Always record that we atleast tried
self.datastore.update_watch(uuid=uuid, update_obj={'fetch_time': round(time.time() - now, 3),
'last_checked': round(time.time())})
# Always save the screenshot if it's available
if screenshot:
self.datastore.save_screenshot(watch_uuid=uuid, screenshot=screenshot)
if xpath_data:
self.datastore.save_xpath_data(watch_uuid=uuid, data=xpath_data)
self.current_uuid = None # Done
self.q.task_done()
# Give the CPU time to interrupt
time.sleep(0.1)
self.app.config.exit.wait(1)

231
memory-leak.html Normal file

File diff suppressed because one or more lines are too long