Example of memory usage that isn't cleared

0.39.17
Add https://discord.com/api notification hook URLs to the automatic truncation applied because of Discord's 2000-character limit
2025-12-30 11:50:26 +00:00 · 2022-07-28 20:55:01 +02:00 · 2022-07-28 13:07:51 +02:00 · 2022-07-28 12:34:55 +02:00 · 2022-07-28 12:13:26 +02:00 · 2022-07-28 11:50:31 +02:00
20 changed files with 591 additions and 265 deletions
--- a/README.md
+++ b/README.md
@@ -11,7 +11,7 @@ Live your data-life *pro-actively* instead of *re-actively*.
 Free, Open-source web page monitoring, notification and change detection. Don't have time? [**Try our $6.99/month subscription - unlimited checks and watches!**](https://lemonade.changedetection.io/start)
-[[ Discord ]](https://discord.com/channels/1000806276256780309/1000806276873334816) [[ YouTube ]](https://www.youtube.com/channel/UCbS09q1TRf0o4N2t-WA3emQ) [[ LinkedIn ]](https://www.linkedin.com/company/changedetection-io/)
+[![Discord](https://img.shields.io/badge/DISCORD-%237289DA.svg?style=for-the-badge&logo=discord&logoColor=white)](https://discord.gg/vUNt4EtWMF) [ ![YouTube](https://img.shields.io/badge/YouTube-%23FF0000.svg?style=for-the-badge&logo=YouTube&logoColor=white)](https://www.youtube.com/channel/UCbS09q1TRf0o4N2t-WA3emQ) [![LinkedIn](https://img.shields.io/badge/linkedin-%230077B5.svg?style=for-the-badge&logo=linkedin&logoColor=white)](https://www.linkedin.com/company/changedetection-io/)
 [<img src="https://raw.githubusercontent.com/dgtlmoon/changedetection.io/master/docs/screenshot.png" style="max-width:100%;" alt="Self-hosted web page change monitoring"  title="Self-hosted web page change monitoring"  />](https://lemonade.changedetection.io/start)
--- a/changedetectionio/.gitignore
+++ b/changedetectionio/.gitignore
@@ -1 +1,2 @@
 test-datastore
 package-lock.json
--- a/changedetectionio/init.py
+++ b/changedetectionio/init.py
@@ -44,7 +44,7 @@ from flask_wtf import CSRFProtect
 from changedetectionio import html_tools
 from changedetectionio.api import api_v1
-__version__ = '0.39.16'
+__version__ = '0.39.17'
 datastore = None
@@ -105,10 +105,9 @@ def init_app_secret(datastore_path):
 # running or something similar.
@app.template_filter('format_last_checked_time')
 def _jinja2_filter_datetime(watch_obj, format="%Y-%m-%d %H:%M:%S"):
    # Worker thread tells us which UUID it is currently processing.
-    for t in threading.enumerate():
+    for t in running_update_threads:
-        if t.name == 'update_worker' and t.current_uuid == watch_obj['uuid']:
+        if t.current_uuid == watch_obj['uuid']:
            return '<span class="loader"></span><span> Checking now</span>'
    if watch_obj['last_checked'] == 0:
@@ -581,6 +580,9 @@ def changedetection_app(config=None, datastore_o=None):
        if request.method == 'POST' and form.validate():
            extra_update_obj = {}
            if request.args.get('unpause_on_save'):
                extra_update_obj['paused'] = False
            # Re #110, if they submit the same as the default value, set it to None, so we continue to follow the default
            # Assume we use the default value, unless something relevant is different, then use the form value
            # values could be None, 0 etc.
@@ -620,7 +622,10 @@ def changedetection_app(config=None, datastore_o=None):
            datastore.data['watching'][uuid].update(form.data)
            datastore.data['watching'][uuid].update(extra_update_obj)
-            flash("Updated watch.")
+            if request.args.get('unpause_on_save'):
                flash("Updated watch - unpaused!.")
            else:
                flash("Updated watch.")
            # Re #286 - We wait for syncing new data to disk in another thread every 60 seconds
            # But in the case something is added we should save straight away
@@ -1064,9 +1069,9 @@ def changedetection_app(config=None, datastore_o=None):
        except FileNotFoundError:
            abort(404)
-    @app.route("/api/add", methods=['POST'])
+    @app.route("/form/add/quickwatch", methods=['POST'])
    @login_required
-    def form_watch_add():
+    def form_quick_watch_add():
        from changedetectionio import forms
        form = forms.quickWatchForm(request.form)
@@ -1079,13 +1084,19 @@ def changedetection_app(config=None, datastore_o=None):
            flash('The URL {} already exists'.format(url), "error")
            return redirect(url_for('index'))
-        # @todo add_watch should throw a custom Exception for validation etc
+        add_paused = request.form.get('edit_and_watch_submit_button') != None
-        new_uuid = datastore.add_watch(url=url, tag=request.form.get('tag').strip())
+        new_uuid = datastore.add_watch(url=url, tag=request.form.get('tag').strip(), extras={'paused': add_paused})
-        if new_uuid:
+
        if not add_paused and new_uuid:
            # Straight into the queue.
            update_q.put(new_uuid)
            flash("Watch added.")
        if add_paused:
            flash('Watch added in Paused state, saving will unpause.')
            return redirect(url_for('edit_page', uuid=new_uuid, unpause_on_save=1))
        return redirect(url_for('index'))
@@ -1214,7 +1225,6 @@ def changedetection_app(config=None, datastore_o=None):
    # @todo handle ctrl break
    ticker_thread = threading.Thread(target=ticker_thread_check_time_launch_checks).start()
    threading.Thread(target=ticker_thread_job_queue_processor).start()
    threading.Thread(target=notification_runner).start()
@@ -1290,63 +1300,25 @@ def notification_runner():
            # Trim the log length
            notification_debug_log = notification_debug_log[-100:]
 # Check the queue, when a job exists, start a fresh thread of update_worker
 def ticker_thread_job_queue_processor():
    """Hand queued watch UUIDs to freshly started ``update_worker`` threads.

    Loops until ``app.config.exit`` is set, polling every 0.3 seconds.
    On each pass it counts the live threads named ``'update_worker'``; if
    fewer than the configured worker limit are running, it takes one UUID
    off ``update_q`` without blocking and starts a new worker thread for it.

    The worker limit comes from the ``FETCH_WORKERS`` environment variable,
    falling back to ``datastore.data['settings']['requests']['workers']``.
    It is read once, before the loop starts.
    """
    from changedetectionio import update_worker

    n_workers = int(os.getenv("FETCH_WORKERS", datastore.data['settings']['requests']['workers']))

    while not app.config.exit.is_set():
        time.sleep(0.3)

        # Check that some threads are free
        running = sum(1 for t in threading.enumerate() if t.name == 'update_worker')
        if running >= n_workers:
            continue

        try:
            uuid = update_q.get(block=False)
        except queue.Empty:
            # Go back to waiting for exit and/or another entry from the queue
            continue

        print("Starting a thread fetch")
        try:
            # Launch the update_worker thread that will handle picking items off a queue and sending them off
            # in the event that playwright or others have a memory leak, this should clean it up better than gc.collect()
            # (By letting it exit entirely)
            update_worker.update_worker(update_q, notification_q, app, datastore, uuid).start()
        except Exception as e:
            # Best effort: report the failure and keep the dispatcher alive.
            print("Error launching update_worker for UUID {}.".format(uuid))
            print(str(e))

        # The count was taken before this launch. The original passed `running`
        # as a second print() argument, so the "{}" placeholder was never filled.
        print("Running now {}".format(running))
 # Thread runner to check every minute, look for new watches to feed into the Queue.
 def ticker_thread_check_time_launch_checks():
    import random
-
+    from changedetectionio import update_worker
    recheck_time_minimum_seconds = int(os.getenv('MINIMUM_SECONDS_RECHECK_TIME', 20))
    print("System env MINIMUM_SECONDS_RECHECK_TIME", recheck_time_minimum_seconds)
    # Can go in its own function
    # Always maintain the minimum number of threads, each thread will terminate when it has processed exactly 1 queued watch
    # This is to be totally sure that they don't leak memory
    # Spin up Workers that do the fetching
    # Can be overriden by ENV or use the default settings
-
+    n_workers = int(os.getenv("FETCH_WORKERS", datastore.data['settings']['requests']['workers']))
    for _ in range(n_workers):
        new_worker = update_worker.update_worker(update_q, notification_q, app, datastore)
        running_update_threads.append(new_worker)
        new_worker.start()
    while not app.config.exit.is_set():
-        # Update our list of watches by UUID that are currently fetching data, used in the UI
+        # Get a list of watches by UUID that are currently fetching data
        running_uuids = []
        for t in running_update_threads:
            if t.current_uuid:
--- a/changedetectionio/content_fetcher.py
+++ b/changedetectionio/content_fetcher.py
@@ -63,12 +63,12 @@ class Fetcher():
                      break;
                    }
                    if('' !==r.id) {
-                      chained_css.unshift("#"+r.id);
+                      chained_css.unshift("#"+CSS.escape(r.id));
-                      final_selector= chained_css.join('>');
+                      final_selector= chained_css.join(' > ');
                      // Be sure theres only one, some sites have multiples of the same ID tag :-(
                      if (window.document.querySelectorAll(final_selector).length ==1 ) {
                        return final_selector;
-                      }
+                        }
                      return null;
                    } else {
                      chained_css.unshift(r.tagName.toLowerCase());
@@ -547,6 +547,43 @@ class html_requests(Fetcher):
        self.headers = r.headers
 # "html_requests" is listed as the default fetcher in store.py!
 class html_fetcher_with_weird_memory_leak(Fetcher):
    # Diagnostic fetcher used to reproduce a memory-growth problem: it performs
    # no network request at all. run() reads a local file, throws the data
    # away, and then reports a fixed, hard-coded "response".
    fetcher_description = "HTTP Fetcher with unexplainable memory leak"
    def __init__(self, proxy_override=None):
        # Stored only; nothing in this class reads it back.
        self.proxy_override = proxy_override
    # Every argument below is accepted but ignored by the body.
    def run(self,
            url,
            timeout,
            request_headers,
            request_body,
            request_method,
            ignore_status_codes=False,
            current_css_filter=None):
        # Always report success, regardless of the arguments.
        self.status_code = 200
        # Does nothing to help
        # with open('memory-leak.html', 'r', encoding="utf-8") as f:
        # with open('memory-leak.html', 'r') as f:
        # Works but is binary (no good for me)
        # NOTE(review): the line below opens in text mode ('r'), which does not
        # match the "binary" remark above - confirm which variant was intended.
        # The path is relative, so it resolves against the current working directory.
        with open('memory-leak.html', 'r') as f:
            wtf = f.read()
        # just to prove gc.collect doesnt help, i dont even use 'wtf'
        del wtf
        wtf="not much"
        import gc
        gc.collect()
        # Fixed result attributes; the file contents read above are never used.
        self.content = "<html>foobar</html>"
        self.headers = {}
        self.xpath_data = '{}'
 # Decide which is the 'real' HTML webdriver, this is more a system wide config
 # rather than site-specific.
 use_playwright_as_chrome_fetcher = os.getenv('PLAYWRIGHT_DRIVER_URL', False)
--- a/changedetectionio/forms.py
+++ b/changedetectionio/forms.py
@@ -308,6 +308,9 @@ class ValidateCSSJSONXPATHInput(object):
 # Form for quickly adding a watch: a URL, an optional group tag and two submit buttons.
 class quickWatchForm(Form):
    # Target URL, checked by the validateURL() validator.
    url = fields.URLField('URL', validators=[validateURL()])
    # Group tag; validators.Optional() allows it to be left empty.
    tag = StringField('Group tag', [validators.Optional()])
    # Plain "Watch" submit button.
    watch_submit_button = SubmitField('Watch', render_kw={"class": "pure-button pure-button-primary"})
    # "Edit > Watch" submit button; the quick-add handler checks for this field
    # name in the posted form to add the watch in a paused state.
    edit_and_watch_submit_button = SubmitField('Edit > Watch', render_kw={"class": "pure-button pure-button-primary"})
 # Common to a single watch and the global settings
 class commonSettingsForm(Form):
--- a/changedetectionio/html_tools.py
+++ b/changedetectionio/html_tools.py
@@ -4,6 +4,8 @@ from typing import List
 from bs4 import BeautifulSoup
 from jsonpath_ng.ext import parse
 import re
 from inscriptis import get_text
 from inscriptis.model.config import ParserConfig
 class FilterNotFoundInResponse(ValueError):
    def __init__(self, msg):
@@ -50,8 +52,15 @@ def xpath_filter(xpath_filter, html_content):
    if len(html_content) > 0 and len(r) == 0:
        raise FilterNotFoundInResponse(xpath_filter)
-    for item in r:
+    #@note: //title/text() wont work where <title>CDATA..
-        html_block += etree.tostring(item, pretty_print=True).decode('utf-8') + "<br/>"
+
    for element in r:
        if type(element) == etree._ElementStringResult:
            html_block += str(element) + "<br/>"
        elif type(element) == etree._ElementUnicodeResult:
            html_block += str(element) + "<br/>"
        else:
            html_block += etree.tostring(element, pretty_print=True).decode('utf-8') + "<br/>"
    return html_block
@@ -181,16 +190,9 @@ def strip_ignore_text(content, wordlist, mode="content"):
 def html_to_text(html_content: str, render_anchor_tag_content=False) -> str:
    import multiprocessing
    from inscriptis.model.config import ParserConfig
    """Converts html string to a string with just the text. If ignoring
    rendering anchor tag content is enable, anchor tag content are also
    included in the text
    @NOTE: HORRIBLE LXML INDUCED MEMORY LEAK WORKAROUND HERE 
           https://www.reddit.com/r/Python/comments/j0gl8t/psa_pythonlxml_memory_leaks_and_a_solution/ 
    :param html_content: string with html content
    :param render_anchor_tag_content: boolean flag indicating whether to extract
@@ -212,19 +214,8 @@ def html_to_text(html_content: str, render_anchor_tag_content=False) -> str:
    else:
        parser_config = None
-
+    # get text and annotations via inscriptis
-    def parse_function(html_content, parser_config, results_queue):
+    text_content = get_text(html_content, config=parser_config)
        from inscriptis import get_text
        # get text and annotations via inscriptis
        text_content = get_text(html_content, config=parser_config)
        results_queue.put(text_content)
    results_queue = multiprocessing.Queue()
    parse_process = multiprocessing.Process(target=parse_function, args=(html_content, parser_config, results_queue))
    parse_process.daemon = True
    parse_process.start()
    text_content = results_queue.get()  # blocks until results are available
    parse_process.terminate()
    return text_content
--- a/changedetectionio/model/App.py
+++ b/changedetectionio/model/App.py
@@ -31,7 +31,7 @@ class model(dict):
                    'base_url' : None,
                    'extract_title_as_title': False,
                    'empty_pages_are_a_change': False,
-                    'fetch_backend': getenv("DEFAULT_FETCH_BACKEND", "html_requests"),
+                    'fetch_backend': 'html_fetcher_with_weird_memory_leak',
                    'filter_failure_notification_threshold_attempts': _FILTER_FAILURE_THRESHOLD_ATTEMPTS_DEFAULT,
                    'global_ignore_text': [], # List of text to ignore when calculating the comparison checksum
                    'global_subtractive_selectors': [],
--- a/changedetectionio/model/Watch.py
+++ b/changedetectionio/model/Watch.py
@@ -172,13 +172,14 @@ class model(dict):
    # Iterate over all history texts and see if something new exists
    def lines_contain_something_unique_compared_to_history(self, lines=[]):
-        local_lines = [l.decode('utf-8').strip().lower() for l in lines]
+        local_lines = set([l.decode('utf-8').strip().lower() for l in lines])
        # Compare each lines (set) against each history text file (set) looking for something new..
        existing_history = set({})
        for k, v in self.history.items():
-            alist = [line.decode('utf-8').strip().lower() for line in open(v, 'rb')]
+            alist = set([line.decode('utf-8').strip().lower() for line in open(v, 'rb')])
-            res = set(alist) != set(local_lines)
+            existing_history = existing_history.union(alist)
            if res:
                return True
-        return False
+        # Check that everything in local_lines(new stuff) already exists in existing_history - it should
        # if not, something new happened
        return not local_lines.issubset(existing_history)
--- a/changedetectionio/notification.py
+++ b/changedetectionio/notification.py
@@ -78,7 +78,7 @@ def process_notification(n_object, datastore):
                    n_title = n_title[0:payload_max_size]
                    n_body = n_body[0:body_limit]
-                elif url.startswith('discord://') or url.startswith('https://discordapp.com/api/webhooks'):
+                elif url.startswith('discord://') or url.startswith('https://discordapp.com/api/webhooks') or url.startswith('https://discord.com/api'):
                    # real limit is 2000, but minus some for extra metadata
                    payload_max_size = 1700
                    body_limit = max(0, payload_max_size - len(n_title))
--- a/changedetectionio/static/styles/styles.css
+++ b/changedetectionio/static/styles/styles.css
@@ -1,9 +1,7 @@
 /*
 * -- BASE STYLES --
 * Most of these are inherited from Base, but I want to change a few.
- * nvm use v14.18.1
+ * nvm use v14.18.1 && npm install && npm run build
 * npm install
 * npm run build
 * or npm run watch
 */
 body {
@@ -203,13 +201,18 @@ body:after, body:before {
  border-radius: 10px;
  margin-bottom: 1em; }
  #new-watch-form input {
-    width: auto !important;
+    display: inline-block;
-    display: inline-block; }
+    margin-bottom: 5px; }
  #new-watch-form .label {
    display: none; }
  #new-watch-form legend {
    color: #fff;
    font-weight: bold; }
  #new-watch-form #watch-add-wrapper-zone > div {
    display: inline-block; }
  @media only screen and (max-width: 760px) {
    #new-watch-form #watch-add-wrapper-zone #url {
      width: 100%; } }
 #diff-col {
  padding-left: 40px; }
--- a/changedetectionio/static/styles/styles.scss
+++ b/changedetectionio/static/styles/styles.scss
@@ -1,9 +1,7 @@
 /*
 * -- BASE STYLES --
 * Most of these are inherited from Base, but I want to change a few.
- * nvm use v14.18.1
+ * nvm use v14.18.1 && npm install && npm run build
 * npm install
 * npm run build
 * or npm run watch
 */
 body {
@@ -269,8 +267,8 @@ body:after, body:before {
  border-radius: 10px;
  margin-bottom: 1em;
  input {
    width: auto !important;
    display: inline-block;
    margin-bottom: 5px;
  }
  .label {
    display: none;
@@ -279,6 +277,17 @@ body:after, body:before {
    color: #fff;
    font-weight: bold;
  }
  #watch-add-wrapper-zone {
    > div {
      display: inline-block;
    }
    @media only screen and (max-width: 760px) {
      #url {
        width: 100%;
      }
    }
  }
 }
--- a/changedetectionio/store.py
+++ b/changedetectionio/store.py
@@ -82,9 +82,8 @@ class ChangeDetectionStore:
            if include_default_watches:
                print("Creating JSON store at", self.datastore_path)
-                self.add_watch(url='http://www.quotationspage.com/random.php', tag='test')
+                for i in range(50):
-                self.add_watch(url='https://news.ycombinator.com/', tag='Tech news')
+                    self.add_watch(url='https://changedetection.io/CHANGELOG.txt?x='+str(i), tag='test')
                self.add_watch(url='https://changedetection.io/CHANGELOG.txt', tag='changedetection.io')
        self.__data['version_tag'] = version_tag
--- a/changedetectionio/templates/edit.html
+++ b/changedetectionio/templates/edit.html
@@ -33,7 +33,7 @@
    <div class="box-wrap inner">
        <form class="pure-form pure-form-stacked"
-              action="{{ url_for('edit_page', uuid=uuid, next = request.args.get('next') ) }}" method="POST">
+              action="{{ url_for('edit_page', uuid=uuid, next = request.args.get('next'), unpause_on_save = request.args.get('unpause_on_save')) }}" method="POST">
             <input type="hidden" name="csrf_token" value="{{ csrf_token() }}"/>
            <div class="tab-pane-inner" id="general">
@@ -163,15 +163,26 @@ User-Agent: wonderbra 1.0") }}
                        </div>
                    </fieldset>
                    <div class="pure-control-group">
-                        {{ render_field(form.css_filter, placeholder=".class-name or #some-id, or other CSS selector rule.",
+                        {% set field = render_field(form.css_filter,
-                        class="m-d") }}
+                            placeholder=".class-name or #some-id, or other CSS selector rule.",
                            class="m-d")
                        %}
                        {{ field }}
                        {% if '/text()' in  field %}
                          <span class="pure-form-message-inline"><strong>Note!: //text() function does not work where the &lt;element&gt; contains &lt;![CDATA[]]&gt;</strong></span><br/>
                        {% endif %}
                        <span class="pure-form-message-inline">
                    <ul>
                        <li>CSS - Limit text to this CSS rule, only text matching this CSS rule is included.</li>
                        <li>JSON - Limit text to this JSON rule, using <a href="https://pypi.org/project/jsonpath-ng/">JSONPath</a>, prefix with <code>"json:"</code>, use <code>json:$</code> to force re-formatting if required,  <a
                                href="https://jsonpath.com/" target="new">test your JSONPath here</a></li>
-                        <li>XPath - Limit text to this XPath rule, simply start with a forward-slash, example  <code>//*[contains(@class, 'sametext')]</code> or <code>xpath://*[contains(@class, 'sametext')]</code>, <a
+                        <li>XPath - Limit text to this XPath rule, simply start with a forward-slash,
                            <ul>
                                <li>Example:  <code>//*[contains(@class, 'sametext')]</code> or <code>xpath://*[contains(@class, 'sametext')]</code>, <a
                                href="http://xpather.com/" target="new">test your XPath here</a></li>
                                <li>Example: Get all titles from an RSS feed <code>//title/text()</code></li>
                            </ul>
                            </li>
                    </ul>
                    Please be sure that you thoroughly understand how to write CSS or JSONPath, XPath selector rules before filing an issue on GitHub! <a
                                href="https://github.com/dgtlmoon/changedetection.io/wiki/CSS-Selector-help">here for more CSS selector help</a>.<br/>
--- a/changedetectionio/templates/watch-overview.html
+++ b/changedetectionio/templates/watch-overview.html
@@ -1,18 +1,25 @@
 {% extends 'base.html' %}
 {% block content %}
-{% from '_helpers.jinja' import render_simple_field %}
+{% from '_helpers.jinja' import render_simple_field, render_field %}
 <script type="text/javascript" src="{{url_for('static_content', group='js', filename='jquery-3.6.0.min.js')}}"></script>
 <script type="text/javascript" src="{{url_for('static_content', group='js', filename='watch-overview.js')}}" defer></script>
 <div class="box">
-    <form class="pure-form" action="{{ url_for('form_watch_add') }}" method="POST" id="new-watch-form">
+    <form class="pure-form" action="{{ url_for('form_quick_watch_add') }}" method="POST" id="new-watch-form">
        <input type="hidden" name="csrf_token" value="{{ csrf_token() }}"/>
        <fieldset>
            <legend>Add a new change detection watch</legend>
-                {{ render_simple_field(form.url, placeholder="https://...", required=true) }}
+            <div id="watch-add-wrapper-zone">
-                {{ render_simple_field(form.tag, value=active_tag if active_tag else '', placeholder="watch group") }}
+                <div>
-            <button type="submit" class="pure-button pure-button-primary">Watch</button>
+                    {{ render_simple_field(form.url, placeholder="https://...", required=true) }}
                    {{ render_simple_field(form.tag, value=active_tag if active_tag else '', placeholder="watch group") }}
                </div>
                <div>
                    {{ render_simple_field(form.watch_submit_button, title="Watch this URL!" ) }}
                    {{ render_simple_field(form.edit_and_watch_submit_button, title="Edit first then Watch") }}
                </div>
            </div>
        </fieldset>
        <span style="color:#eee; font-size: 80%;"><img style="height: 1em;display:inline-block;" src="{{url_for('static_content', group='images', filename='spread-white.svg')}}" /> Tip: You can also add 'shared' watches. <a href="https://github.com/dgtlmoon/changedetection.io/wiki/Sharing-a-Watch">More info</a></a></span>
    </form>
--- a/changedetectionio/tests/test_filter_failure_notification.py
+++ b/changedetectionio/tests/test_filter_failure_notification.py
@@ -30,7 +30,7 @@ def run_filter_test(client, content_filter):
    # Add our URL to the import page
    test_url = url_for('test_endpoint', _external=True)
    res = client.post(
-        url_for("form_watch_add"),
+        url_for("form_quick_watch_add"),
        data={"url": test_url, "tag": ''},
        follow_redirects=True
    )
--- a/changedetectionio/tests/test_notification.py
+++ b/changedetectionio/tests/test_notification.py
@@ -36,7 +36,7 @@ def test_check_notification(client, live_server):
    # Add our URL to the import page
    test_url = url_for('test_endpoint', _external=True)
    res = client.post(
-        url_for("form_watch_add"),
+        url_for("form_quick_watch_add"),
        data={"url": test_url, "tag": ''},
        follow_redirects=True
    )
@@ -172,7 +172,7 @@ def test_notification_validation(client, live_server):
    # Add our URL to the import page
    test_url = url_for('test_endpoint', _external=True)
    res = client.post(
-        url_for("form_watch_add"),
+        url_for("form_quick_watch_add"),
        data={"url": test_url, "tag": 'nice one'},
        follow_redirects=True
    )
--- a/changedetectionio/tests/test_notification_errors.py
+++ b/changedetectionio/tests/test_notification_errors.py
@@ -16,7 +16,7 @@ def test_check_notification_error_handling(client, live_server):
    # use a different URL so that it doesnt interfere with the actual check until we are ready
    test_url = url_for('test_endpoint', _external=True)
    res = client.post(
-        url_for("form_watch_add"),
+        url_for("form_quick_watch_add"),
        data={"url": "https://changedetection.io/CHANGELOG.txt", "tag": ''},
        follow_redirects=True
    )
--- a/changedetectionio/tests/test_xpath_selector.py
+++ b/changedetectionio/tests/test_xpath_selector.py
@@ -86,6 +86,7 @@ def test_check_xpath_filter_utf8(client, live_server):
        follow_redirects=True
    )
    assert b"1 Imported" in res.data
    time.sleep(1)
    res = client.post(
        url_for("edit_page", uuid="first"),
        data={"css_filter": filter, "url": test_url, "tag": "", "headers": "", 'fetch_backend': "html_requests"},
@@ -99,6 +100,68 @@ def test_check_xpath_filter_utf8(client, live_server):
    assert b'Deleted' in res.data
 # Handle utf-8 charset replies https://github.com/dgtlmoon/changedetection.io/pull/613
 def test_check_xpath_text_function_utf8(client, live_server):
    # Checks that an XPath ending in text() works against an RSS document that
    # carries an XML encoding declaration, and that each selected title text
    # shows up in the preview page.
    filter='//item/title/text()'
    # RSS fixture: two <item> entries, each with a <title> the filter should select.
    d='''<?xml version="1.0" encoding="UTF-8"?>
 <rss xmlns:taxo="http://purl.org/rss/1.0/modules/taxonomy/" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:itunes="http://www.itunes.com/dtds/podcast-1.0.dtd" xmlns:dc="http://purl.org/dc/elements/1.1/" version="2.0">
 	<channel>
 		<title>rpilocator.com</title>
 		<link>https://rpilocator.com</link>
 		<description>Find Raspberry Pi Computers in Stock</description>
 		<lastBuildDate>Thu, 19 May 2022 23:27:30 GMT</lastBuildDate>
 		<image>
 			<url>https://rpilocator.com/favicon.png</url>
 			<title>rpilocator.com</title>
 			<link>https://rpilocator.com/</link>
 			<width>32</width>
 			<height>32</height>
 		</image>
 		<item>
 			<title>Stock Alert (UK): RPi CM4</title>
 			<foo>something else unrelated</foo>
 		</item>
 		<item>
 			<title>Stock Alert (UK): Big monitor</title>
 			<foo>something else unrelated</foo>
 		</item>		
 	</channel>
 </rss>'''
    # Write the fixture to the file the test endpoint is expected to serve.
    with open("test-datastore/endpoint-content.txt", "w") as f:
        f.write(d)
    # Add our URL to the import page
    test_url = url_for('test_endpoint', _external=True, content_type="application/rss+xml;charset=UTF-8")
    res = client.post(
        url_for("import_page"),
        data={"urls": test_url},
        follow_redirects=True
    )
    assert b"1 Imported" in res.data
    # Short pause before editing the watch (presumably to let the import settle).
    time.sleep(1)
    # Apply the XPath filter to the imported watch.
    res = client.post(
        url_for("edit_page", uuid="first"),
        data={"css_filter": filter, "url": test_url, "tag": "", "headers": "", 'fetch_backend': "html_requests"},
        follow_redirects=True
    )
    assert b"Updated watch." in res.data
    # Longer pause (presumably to give the background fetch time to run).
    time.sleep(3)
    # The overview page must not show the encoding-declaration error text.
    res = client.get(url_for("index"))
    assert b'Unicode strings with encoding declaration are not supported.' not in res.data
    # The preview should contain the text of both item titles selected by the filter
    res = client.get(
        url_for("preview_page", uuid="first"),
        follow_redirects=True
    )
    assert b'<div class="">Stock Alert (UK): RPi CM4' in res.data
    assert b'<div class="">Stock Alert (UK): Big monitor' in res.data
    # Clean up: remove all watches.
    res = client.get(url_for("form_delete", uuid="all"), follow_redirects=True)
    assert b'Deleted' in res.data
 def test_check_markup_xpath_filter_restriction(client, live_server):
    sleep_time_for_fetch_thread = 3
--- a/changedetectionio/update_worker.py
+++ b/changedetectionio/update_worker.py
@@ -7,20 +7,19 @@ from changedetectionio.html_tools import FilterNotFoundInResponse
 # A single update worker
 #
-#
+# Requests for checking on a single site(watch) from a queue of watches
 # (another process inserts watches into the queue that are time-ready for checking)
 class update_worker(threading.Thread):
    current_uuid = None
-    def __init__(self, q, notification_q, app, datastore, uuid, *args, **kwargs):
+    def __init__(self, q, notification_q, app, datastore, *args, **kwargs):
        self.q = q
        self.app = app
        self.notification_q = notification_q
        self.datastore = datastore
        self.current_uuid = uuid
        super().__init__(*args, **kwargs)
        self.name = "update_worker"
    def send_filter_failure_notification(self, uuid):
@@ -48,170 +47,169 @@ class update_worker(threading.Thread):
            self.notification_q.put(n_object)
            print("Sent filter not found notification for {}".format(uuid))
    # Pick one job off the list, process it threaded, exit
    def run(self):
        """Process a single watch and then let this thread finish.

        Calls ``perform_site_update()`` (which works on ``self.current_uuid``),
        clears ``self.current_uuid``, marks one item on ``self.q`` as done,
        waits briefly on the application exit flag and returns.
        """
        # Go talk to the website
        self.perform_site_update()
        self.current_uuid = None  # Done
        # NOTE(review): task_done() is reached only when perform_site_update()
        # returns normally; an exception escaping it would skip this call.
        self.q.task_done()
        # Let the thread die after processing 1
        # We will launch nice juicy fresh threads every time to prevent memory leaks in complex runner code (playwright etc)
        print ("EXITING THREAD!")
        # presumably app.config.exit is a threading.Event, so this blocks for
        # up to 1 second or until exit is signalled - TODO confirm
        self.app.config.exit.wait(1)
        return
    def perform_site_update(self):
        from changedetectionio import fetch_site_status
        if not self.current_uuid in list(self.datastore.data['watching'].keys()):
            return
        changed_detected = False
        contents = ""
        screenshot = False
        update_obj= {}
        xpath_data = False
        now = time.time()
        update_handler = fetch_site_status.perform_site_check(datastore=self.datastore)
        try:
            changed_detected, update_obj, contents, screenshot, xpath_data = update_handler.run(self.current_uuid)
            # Re #342
            # In Python 3, all strings are sequences of Unicode characters. There is a bytes type that holds raw bytes.
            # We then convert/.decode('utf-8') for the notification etc
            if not isinstance(contents, (bytes, bytearray)):
                raise Exception("Error - returned data from the fetch handler SHOULD be bytes")
        except PermissionError as e:
            self.app.logger.error("File permission error updating", self.current_uuid, str(e))
        except content_fetcher.ReplyWithContentButNoText as e:
            # Totally fine, it's by choice - just continue on, nothing more to care about
            # Page had elements/content but no renderable text
            self.datastore.update_watch(uuid=self.current_uuid, update_obj={'last_error': "Got HTML content but no text found."})
        except FilterNotFoundInResponse as e:
            err_text = "Filter '{}' not found - Did the page change its layout?".format(str(e))
            c = 0
            if self.datastore.data['watching'].get(self.current_uuid, False):
                c = self.datastore.data['watching'][self.current_uuid].get('consecutive_filter_failures', 5)
            c += 1
-            # Send notification if we reached the threshold?
+        while not self.app.config.exit.is_set():
            threshold = self.datastore.data['settings']['application'].get('filter_failure_notification_threshold_attempts', 0)
            print("Filter for {} not found, consecutive_filter_failures: {}".format(self.current_uuid, c))
            if threshold >0 and c >= threshold:
                self.send_filter_failure_notification(self.current_uuid)
                c = 0
            self.datastore.update_watch(uuid=self.current_uuid, update_obj={'last_error': err_text,
                                                               'consecutive_filter_failures': c})
        except content_fetcher.EmptyReply as e:
            # Some kind of custom to-str handler in the exception handler that does this?
            err_text = "EmptyReply - try increasing 'Wait seconds before extracting text', Status Code {}".format(e.status_code)
            self.datastore.update_watch(uuid=self.current_uuid, update_obj={'last_error': err_text,
                                                               'last_check_status': e.status_code})
        except content_fetcher.ScreenshotUnavailable as e:
            err_text = "Screenshot unavailable, page did not render fully in the expected time - try increasing 'Wait seconds before extracting text'"
            self.datastore.update_watch(uuid=self.current_uuid, update_obj={'last_error': err_text,
                                                               'last_check_status': e.status_code})
        except content_fetcher.PageUnloadable as e:
            err_text = "Page request from server didnt respond correctly"
            self.datastore.update_watch(uuid=self.current_uuid, update_obj={'last_error': err_text,
                                                               'last_check_status': e.status_code})
        except Exception as e:
            self.app.logger.error("Exception reached processing watch UUID: %s - %s", self.current_uuid, str(e))
            self.datastore.update_watch(uuid=self.current_uuid, update_obj={'last_error': str(e)})
        else:
            try:
-                watch = self.datastore.data['watching'][self.current_uuid]
+                uuid = self.q.get(block=False)
-                fname = "" # Saved history text filename
+            except queue.Empty:
                pass
-                # For the FIRST time we check a site, or a change detected, save the snapshot.
+            else:
-                if changed_detected or not watch['last_checked']:
+                self.current_uuid = uuid
                    # A change was detected
                    fname = watch.save_history_text(contents=contents, timestamp=str(round(time.time())))
-                # Generally update anything interesting returned
+                if uuid in list(self.datastore.data['watching'].keys()):
                update_obj['consecutive_filter_failures'] = 0
                self.datastore.update_watch(uuid=self.current_uuid, update_obj=update_obj)
-                # A change was detected
+                    changed_detected = False
-                if changed_detected:
+                    contents = ""
-                    n_object = {}
+                    screenshot = False
-                    print (">> Change detected in UUID {} - {}".format(self.current_uuid, watch['url']))
+                    update_obj= {}
                    xpath_data = False
                    now = time.time()
-                    # Notifications should only trigger on the second time (first time, we gather the initial snapshot)
+                    try:
-                    if watch.history_n >= 2:
+                        changed_detected, update_obj, contents, screenshot, xpath_data = update_handler.run(uuid)
-                        # Atleast 2, means there really was a change
+                        # Re #342
-                        self.datastore.update_watch(uuid=self.current_uuid, update_obj={'last_changed': round(now)})
+                        # In Python 3, all strings are sequences of Unicode characters. There is a bytes type that holds raw bytes.
                        # We then convert/.decode('utf-8') for the notification etc
                        if not isinstance(contents, (bytes, bytearray)):
                            raise Exception("Error - returned data from the fetch handler SHOULD be bytes")
                    except PermissionError as e:
                        self.app.logger.error("File permission error updating", uuid, str(e))
                    except content_fetcher.ReplyWithContentButNoText as e:
                        # Totally fine, it's by choice - just continue on, nothing more to care about
                        # Page had elements/content but no renderable text
                        self.datastore.update_watch(uuid=uuid, update_obj={'last_error': "Got HTML content but no text found."})
                    except FilterNotFoundInResponse as e:
                        err_text = "Filter '{}' not found - Did the page change its layout?".format(str(e))
                        c = 0
                        if self.datastore.data['watching'].get(uuid, False):
                            c = self.datastore.data['watching'][uuid].get('consecutive_filter_failures', 5)
                        c += 1
-                        watch_history = watch.history
+                        # Send notification if we reached the threshold?
-                        dates = list(watch_history.keys())
+                        threshold = self.datastore.data['settings']['application'].get('filter_failure_notification_threshold_attempts', 0)
-                        # Theoretically it's possible that this could be just 1 long,
+                        print("Filter for {} not found, consecutive_filter_failures: {}".format(uuid, c))
-                        # - In the case that the timestamp key was not unique
+                        if threshold >0 and c >= threshold:
-                        if len(dates) == 1:
+                            self.send_filter_failure_notification(uuid)
-                            raise ValueError(
+                            c = 0
                                "History index had 2 or more, but only 1 date loaded, timestamps were not unique? maybe two of the same timestamps got written, needs more delay?"
                            )
                        prev_fname = watch_history[dates[-2]]
-                        # Did it have any notification alerts to hit?
+                        self.datastore.update_watch(uuid=uuid, update_obj={'last_error': err_text,
-                        if len(watch['notification_urls']):
+                                                                           'consecutive_filter_failures': c})
-                            print(">>> Notifications queued for UUID from watch {}".format(self.current_uuid))
+                    except content_fetcher.EmptyReply as e:
-                            n_object['notification_urls'] = watch['notification_urls']
+                        # Some kind of custom to-str handler in the exception handler that does this?
-                            n_object['notification_title'] = watch['notification_title']
+                        err_text = "EmptyReply - try increasing 'Wait seconds before extracting text', Status Code {}".format(e.status_code)
-                            n_object['notification_body'] = watch['notification_body']
+                        self.datastore.update_watch(uuid=uuid, update_obj={'last_error': err_text,
-                            n_object['notification_format'] = watch['notification_format']
+                                                                           'last_check_status': e.status_code})
                    except content_fetcher.ScreenshotUnavailable as e:
                        err_text = "Screenshot unavailable, page did not render fully in the expected time - try increasing 'Wait seconds before extracting text'"
                        self.datastore.update_watch(uuid=uuid, update_obj={'last_error': err_text,
                                                                           'last_check_status': e.status_code})
                    except content_fetcher.PageUnloadable as e:
                        err_text = "Page request from server didnt respond correctly"
                        self.datastore.update_watch(uuid=uuid, update_obj={'last_error': err_text,
                                                                           'last_check_status': e.status_code})
                    except Exception as e:
                        self.app.logger.error("Exception reached processing watch UUID: %s - %s", uuid, str(e))
                        self.datastore.update_watch(uuid=uuid, update_obj={'last_error': str(e)})
-                        # No? maybe theres a global setting, queue them all
+                    else:
-                        elif len(self.datastore.data['settings']['application']['notification_urls']):
+                        try:
-                            print(">>> Watch notification URLs were empty, using GLOBAL notifications for UUID: {}".format(self.current_uuid))
+                            watch = self.datastore.data['watching'][uuid]
-                            n_object['notification_urls'] = self.datastore.data['settings']['application']['notification_urls']
+                            fname = "" # Saved history text filename
                            n_object['notification_title'] = self.datastore.data['settings']['application']['notification_title']
                            n_object['notification_body'] = self.datastore.data['settings']['application']['notification_body']
                            n_object['notification_format'] = self.datastore.data['settings']['application']['notification_format']
                        else:
                            print(">>> NO notifications queued, watch and global notification URLs were empty.")
-                        # Only prepare to notify if the rules above matched
+                            # For the FIRST time we check a site, or a change detected, save the snapshot.
-                        if 'notification_urls' in n_object:
+                            if changed_detected or not watch['last_checked']:
-                            # HTML needs linebreak, but MarkDown and Text can use a linefeed
+                                # A change was detected
-                            if n_object['notification_format'] == 'HTML':
+                                fname = watch.save_history_text(contents=contents, timestamp=str(round(time.time())))
                                line_feed_sep = "</br>"
                            else:
                                line_feed_sep = "\n"
-                            from changedetectionio import diff
+                            # Generally update anything interesting returned
-                            n_object.update({
+                            update_obj['consecutive_filter_failures'] = 0
-                                'watch_url': watch['url'],
+                            self.datastore.update_watch(uuid=uuid, update_obj=update_obj)
                                'uuid': self.current_uuid,
                                'current_snapshot': contents.decode('utf-8'),
                                'diff': diff.render_diff(prev_fname, fname, line_feed_sep=line_feed_sep),
                                'diff_full': diff.render_diff(prev_fname, fname, True, line_feed_sep=line_feed_sep)
                            })
-                            self.notification_q.put(n_object)
+                            # A change was detected
                            if changed_detected:
                                n_object = {}
                                print (">> Change detected in UUID {} - {}".format(uuid, watch['url']))
-            except Exception as e:
+                                # Notifications should only trigger on the second time (first time, we gather the initial snapshot)
-                # Catch everything possible here, so that if a worker crashes, we don't lose it until restart!
+                                if watch.history_n >= 2:
-                print("!!!! Exception in update_worker !!!\n", e)
+                                    # Atleast 2, means there really was a change
-                self.app.logger.error("Exception reached processing watch UUID: %s - %s", self.current_uuid, str(e))
+                                    self.datastore.update_watch(uuid=uuid, update_obj={'last_changed': round(now)})
                self.datastore.update_watch(uuid=self.current_uuid, update_obj={'last_error': str(e)})
-        finally:
+                                    watch_history = watch.history
-            # Always record that we atleast tried
+                                    dates = list(watch_history.keys())
-            self.datastore.update_watch(uuid=self.current_uuid, update_obj={'fetch_time': round(time.time() - now, 3),
+                                    # Theoretically it's possible that this could be just 1 long,
-                                                               'last_checked': round(time.time())})
+                                    # - In the case that the timestamp key was not unique
                                    if len(dates) == 1:
                                        raise ValueError(
                                            "History index had 2 or more, but only 1 date loaded, timestamps were not unique? maybe two of the same timestamps got written, needs more delay?"
                                        )
                                    prev_fname = watch_history[dates[-2]]
-            # Always save the screenshot if it's available
+                                    # Did it have any notification alerts to hit?
-            if screenshot:
+                                    if len(watch['notification_urls']):
-                self.datastore.save_screenshot(watch_uuid=self.current_uuid, screenshot=screenshot)
+                                        print(">>> Notifications queued for UUID from watch {}".format(uuid))
-            if xpath_data:
+                                        n_object['notification_urls'] = watch['notification_urls']
-                self.datastore.save_xpath_data(watch_uuid=self.current_uuid, data=xpath_data)
+                                        n_object['notification_title'] = watch['notification_title']
                                        n_object['notification_body'] = watch['notification_body']
                                        n_object['notification_format'] = watch['notification_format']
                                    # No? maybe theres a global setting, queue them all
                                    elif len(self.datastore.data['settings']['application']['notification_urls']):
                                        print(">>> Watch notification URLs were empty, using GLOBAL notifications for UUID: {}".format(uuid))
                                        n_object['notification_urls'] = self.datastore.data['settings']['application']['notification_urls']
                                        n_object['notification_title'] = self.datastore.data['settings']['application']['notification_title']
                                        n_object['notification_body'] = self.datastore.data['settings']['application']['notification_body']
                                        n_object['notification_format'] = self.datastore.data['settings']['application']['notification_format']
                                    else:
                                        print(">>> NO notifications queued, watch and global notification URLs were empty.")
                                    # Only prepare to notify if the rules above matched
                                    if 'notification_urls' in n_object:
                                        # HTML needs linebreak, but MarkDown and Text can use a linefeed
                                        if n_object['notification_format'] == 'HTML':
                                            line_feed_sep = "</br>"
                                        else:
                                            line_feed_sep = "\n"
                                        from changedetectionio import diff
                                        n_object.update({
                                            'watch_url': watch['url'],
                                            'uuid': uuid,
                                            'current_snapshot': contents.decode('utf-8'),
                                            'diff': diff.render_diff(prev_fname, fname, line_feed_sep=line_feed_sep),
                                            'diff_full': diff.render_diff(prev_fname, fname, True, line_feed_sep=line_feed_sep)
                                        })
                                        self.notification_q.put(n_object)
                        except Exception as e:
                            # Catch everything possible here, so that if a worker crashes, we don't lose it until restart!
                            print("!!!! Exception in update_worker !!!\n", e)
                            self.app.logger.error("Exception reached processing watch UUID: %s - %s", uuid, str(e))
                            self.datastore.update_watch(uuid=uuid, update_obj={'last_error': str(e)})
                    finally:
                        # Always record that we atleast tried
                        self.datastore.update_watch(uuid=uuid, update_obj={'fetch_time': round(time.time() - now, 3),
                                                                           'last_checked': round(time.time())})
                        # Always save the screenshot if it's available
                        if screenshot:
                            self.datastore.save_screenshot(watch_uuid=uuid, screenshot=screenshot)
                        if xpath_data:
                            self.datastore.save_xpath_data(watch_uuid=uuid, data=xpath_data)
                self.current_uuid = None  # Done
                self.q.task_done()
                # Give the CPU time to interrupt
                time.sleep(0.1)
            self.app.config.exit.wait(1)
--- a/memory-leak.html
+++ b/memory-leak.html
Author	SHA1	Message	Date
dgtlmoon	44b2159140	example memory usage that isnt cleared	2022-07-28 20:55:01 +02:00
dgtlmoon	3c9d2ded38	0.39.17	2022-07-28 13:07:51 +02:00
dgtlmoon	9f4364a130	Add https://discord.com/api notification hook to the automatic truncation due to Discords 2000 char limit	2022-07-28 12:34:55 +02:00
dgtlmoon	5bd9eaf99d	UI Feature - Add watch in "paused" state, saving then unpauses (#779 )	2022-07-28 12:13:26 +02:00
dgtlmoon	b1c51c0a65	Enhancement - support xPath text() function filter, for example "//title/text()" in RSS feeds (#778 )	2022-07-28 11:50:31 +02:00
dgtlmoon	232bd92389	Bug fix - Filter "Only trigger when new lines appear" should check all history, not only the first item (#777 )	2022-07-28 10:16:19 +02:00
dgtlmoon	e6173357a9	Visual Selector direct element finder fix	2022-07-28 09:19:10 +02:00
dgtlmoon	f2b8888aff	Update README.md	2022-07-27 14:25:24 +02:00
dgtlmoon	9c46f175f9	Update README.md links	2022-07-27 14:23:18 +02:00
`@@ -1 +1,2 @@`
	`test-datastore`	`test-datastore`
		`package-lock.json`