Example memory usage that isn't cleared

0.39.17
Add https://discord.com/api notification hook to the automatic truncation due to Discord's 2000-character limit
2025-12-26 09:50:31 +00:00 · 2022-07-28 20:55:01 +02:00 · 2022-07-28 13:07:51 +02:00 · 2022-07-28 12:34:55 +02:00 · 2022-07-28 12:13:26 +02:00 · 2022-07-28 11:50:31 +02:00
20 changed files with 591 additions and 265 deletions
--- a/README.md
+++ b/README.md
@@ -11,7 +11,7 @@ Live your data-life *pro-actively* instead of *re-actively*.

 Free, Open-source web page monitoring, notification and change detection. Don't have time? [**Try our $6.99/month subscription - unlimited checks and watches!**](https://lemonade.changedetection.io/start)

-[[ Discord ]](https://discord.com/channels/1000806276256780309/1000806276873334816) [[ YouTube ]](https://www.youtube.com/channel/UCbS09q1TRf0o4N2t-WA3emQ) [[ LinkedIn ]](https://www.linkedin.com/company/changedetection-io/)
+[![Discord](https://img.shields.io/badge/DISCORD-%237289DA.svg?style=for-the-badge&logo=discord&logoColor=white)](https://discord.gg/vUNt4EtWMF) [ ![YouTube](https://img.shields.io/badge/YouTube-%23FF0000.svg?style=for-the-badge&logo=YouTube&logoColor=white)](https://www.youtube.com/channel/UCbS09q1TRf0o4N2t-WA3emQ) [![LinkedIn](https://img.shields.io/badge/linkedin-%230077B5.svg?style=for-the-badge&logo=linkedin&logoColor=white)](https://www.linkedin.com/company/changedetection-io/)


 [<img src="https://raw.githubusercontent.com/dgtlmoon/changedetection.io/master/docs/screenshot.png" style="max-width:100%;" alt="Self-hosted web page change monitoring"  title="Self-hosted web page change monitoring"  />](https://lemonade.changedetection.io/start)
--- a/changedetectionio/.gitignore
+++ b/changedetectionio/.gitignore
@@ -1 +1,2 @@
 test-datastore
+package-lock.json
--- a/changedetectionio/init.py
+++ b/changedetectionio/init.py
@@ -44,7 +44,7 @@ from flask_wtf import CSRFProtect
 from changedetectionio import html_tools
 from changedetectionio.api import api_v1

-__version__ = '0.39.16'
+__version__ = '0.39.17'

 datastore = None

@@ -105,10 +105,9 @@ def init_app_secret(datastore_path):
 # running or something similar.
@app.template_filter('format_last_checked_time')
 def _jinja2_filter_datetime(watch_obj, format="%Y-%m-%d %H:%M:%S"):
-
    # Worker thread tells us which UUID it is currently processing.
-    for t in threading.enumerate():
-        if t.name == 'update_worker' and t.current_uuid == watch_obj['uuid']:
+    for t in running_update_threads:
+        if t.current_uuid == watch_obj['uuid']:
            return '<span class="loader"></span><span> Checking now</span>'

    if watch_obj['last_checked'] == 0:
@@ -581,6 +580,9 @@ def changedetection_app(config=None, datastore_o=None):
        if request.method == 'POST' and form.validate():
            extra_update_obj = {}

+            if request.args.get('unpause_on_save'):
+                extra_update_obj['paused'] = False
+
            # Re #110, if they submit the same as the default value, set it to None, so we continue to follow the default
            # Assume we use the default value, unless something relevant is different, then use the form value
            # values could be None, 0 etc.
@@ -620,7 +622,10 @@ def changedetection_app(config=None, datastore_o=None):
            datastore.data['watching'][uuid].update(form.data)
            datastore.data['watching'][uuid].update(extra_update_obj)

-            flash("Updated watch.")
+            if request.args.get('unpause_on_save'):
+                flash("Updated watch - unpaused!.")
+            else:
+                flash("Updated watch.")

            # Re #286 - We wait for syncing new data to disk in another thread every 60 seconds
            # But in the case something is added we should save straight away
@@ -1064,9 +1069,9 @@ def changedetection_app(config=None, datastore_o=None):
        except FileNotFoundError:
            abort(404)

-    @app.route("/api/add", methods=['POST'])
+    @app.route("/form/add/quickwatch", methods=['POST'])
    @login_required
-    def form_watch_add():
+    def form_quick_watch_add():
        from changedetectionio import forms
        form = forms.quickWatchForm(request.form)

@@ -1079,13 +1084,19 @@ def changedetection_app(config=None, datastore_o=None):
            flash('The URL {} already exists'.format(url), "error")
            return redirect(url_for('index'))

-        # @todo add_watch should throw a custom Exception for validation etc
-        new_uuid = datastore.add_watch(url=url, tag=request.form.get('tag').strip())
-        if new_uuid:
+        add_paused = request.form.get('edit_and_watch_submit_button') != None
+        new_uuid = datastore.add_watch(url=url, tag=request.form.get('tag').strip(), extras={'paused': add_paused})
+
+
+        if not add_paused and new_uuid:
            # Straight into the queue.
            update_q.put(new_uuid)
            flash("Watch added.")

+        if add_paused:
+            flash('Watch added in Paused state, saving will unpause.')
+            return redirect(url_for('edit_page', uuid=new_uuid, unpause_on_save=1))
+
        return redirect(url_for('index'))


@@ -1214,7 +1225,6 @@ def changedetection_app(config=None, datastore_o=None):

    # @todo handle ctrl break
    ticker_thread = threading.Thread(target=ticker_thread_check_time_launch_checks).start()
-    threading.Thread(target=ticker_thread_job_queue_processor).start()

    threading.Thread(target=notification_runner).start()

@@ -1290,63 +1300,25 @@ def notification_runner():
            # Trim the log length
            notification_debug_log = notification_debug_log[-100:]

-# Check the queue, when a job exists, start a fresh thread of update_worker
-def ticker_thread_job_queue_processor():
-
-    from changedetectionio import update_worker
-    n_workers = int(os.getenv("FETCH_WORKERS", datastore.data['settings']['requests']['workers']))
-
-    while not app.config.exit.is_set():
-        time.sleep(0.3)
-
-        # Check that some threads are free
-        running = 0
-        for t in threading.enumerate():
-            if t.name == 'update_worker':
-                running += 1
-
-        if running >= n_workers:
-            continue
-
-        try:
-            uuid = update_q.get(block=False)
-        except queue.Empty:
-            # Go back to waiting for exit and/or another entry from the queue
-            continue
-        print ("Starting a thread fetch")
-
-
-        try:
-            # Launch the update_worker thread that will handle picking items off a queue and sending them off
-            # in the event that playwright or others have a memory leak, this should clean it up better than gc.collect()
-            # (By letting it exit entirely)
-            update_worker.update_worker(update_q, notification_q, app, datastore, uuid).start()
-        except Exception as e:
-            print ("Error launching update_worker for UUID {}.".format(uuid))
-            print (str(e))
-
-        print ("Running now {}", running)
-
-
 # Thread runner to check every minute, look for new watches to feed into the Queue.
 def ticker_thread_check_time_launch_checks():
    import random
-
+    from changedetectionio import update_worker

    recheck_time_minimum_seconds = int(os.getenv('MINIMUM_SECONDS_RECHECK_TIME', 20))
    print("System env MINIMUM_SECONDS_RECHECK_TIME", recheck_time_minimum_seconds)

-    # Can go in its own function
-
-    # Always maintain the minimum number of threads, each thread will terminate when it has processed exactly 1 queued watch
-    # This is to be totally sure that they don't leak memory
    # Spin up Workers that do the fetching
    # Can be overriden by ENV or use the default settings
-
+    n_workers = int(os.getenv("FETCH_WORKERS", datastore.data['settings']['requests']['workers']))
+    for _ in range(n_workers):
+        new_worker = update_worker.update_worker(update_q, notification_q, app, datastore)
+        running_update_threads.append(new_worker)
+        new_worker.start()

    while not app.config.exit.is_set():

-        # Update our list of watches by UUID that are currently fetching data, used in the UI
+        # Get a list of watches by UUID that are currently fetching data
        running_uuids = []
        for t in running_update_threads:
            if t.current_uuid:
--- a/changedetectionio/content_fetcher.py
+++ b/changedetectionio/content_fetcher.py
@@ -63,12 +63,12 @@ class Fetcher():
                      break;
                    }
                    if('' !==r.id) {
-                      chained_css.unshift("#"+r.id);
-                      final_selector= chained_css.join('>');
+                      chained_css.unshift("#"+CSS.escape(r.id));
+                      final_selector= chained_css.join(' > ');
                      // Be sure theres only one, some sites have multiples of the same ID tag :-(
                      if (window.document.querySelectorAll(final_selector).length ==1 ) {
                        return final_selector;
-                      }
+                        }
                      return null;
                    } else {
                      chained_css.unshift(r.tagName.toLowerCase());
@@ -547,6 +547,43 @@ class html_requests(Fetcher):
        self.headers = r.headers


+# "html_requests" is listed as the default fetcher in store.py!
+class html_fetcher_with_weird_memory_leak(Fetcher):
+    fetcher_description = "HTTP Fetcher with unexplainable memory leak"
+
+    def __init__(self, proxy_override=None):
+        self.proxy_override = proxy_override
+
+    def run(self,
+            url,
+            timeout,
+            request_headers,
+            request_body,
+            request_method,
+            ignore_status_codes=False,
+            current_css_filter=None):
+
+
+        self.status_code = 200
+
+        # Does nothing to help
+        # with open('memory-leak.html', 'r', encoding="utf-8") as f:
+        # with open('memory-leak.html', 'r') as f:
+
+        # Works but is binary (no good for me)
+        with open('memory-leak.html', 'r') as f:
+            wtf = f.read()
+
+        # just to prove gc.collect doesnt help, i dont even use 'wtf'
+        del wtf
+        wtf="not much"
+        import gc
+        gc.collect()
+
+        self.content = "<html>foobar</html>"
+        self.headers = {}
+        self.xpath_data = '{}'
+
 # Decide which is the 'real' HTML webdriver, this is more a system wide config
 # rather than site-specific.
 use_playwright_as_chrome_fetcher = os.getenv('PLAYWRIGHT_DRIVER_URL', False)
--- a/changedetectionio/forms.py
+++ b/changedetectionio/forms.py
@@ -308,6 +308,9 @@ class ValidateCSSJSONXPATHInput(object):
 class quickWatchForm(Form):
    url = fields.URLField('URL', validators=[validateURL()])
    tag = StringField('Group tag', [validators.Optional()])
+    watch_submit_button = SubmitField('Watch', render_kw={"class": "pure-button pure-button-primary"})
+    edit_and_watch_submit_button = SubmitField('Edit > Watch', render_kw={"class": "pure-button pure-button-primary"})
+

 # Common to a single watch and the global settings
 class commonSettingsForm(Form):
--- a/changedetectionio/html_tools.py
+++ b/changedetectionio/html_tools.py
@@ -4,6 +4,8 @@ from typing import List
 from bs4 import BeautifulSoup
 from jsonpath_ng.ext import parse
 import re
+from inscriptis import get_text
+from inscriptis.model.config import ParserConfig

 class FilterNotFoundInResponse(ValueError):
    def __init__(self, msg):
@@ -50,8 +52,15 @@ def xpath_filter(xpath_filter, html_content):
    if len(html_content) > 0 and len(r) == 0:
        raise FilterNotFoundInResponse(xpath_filter)

-    for item in r:
-        html_block += etree.tostring(item, pretty_print=True).decode('utf-8') + "<br/>"
+    #@note: //title/text() wont work where <title>CDATA..
+
+    for element in r:
+        if type(element) == etree._ElementStringResult:
+            html_block += str(element) + "<br/>"
+        elif type(element) == etree._ElementUnicodeResult:
+            html_block += str(element) + "<br/>"
+        else:
+            html_block += etree.tostring(element, pretty_print=True).decode('utf-8') + "<br/>"

    return html_block

@@ -181,16 +190,9 @@ def strip_ignore_text(content, wordlist, mode="content"):


 def html_to_text(html_content: str, render_anchor_tag_content=False) -> str:
-    import multiprocessing
-
-    from inscriptis.model.config import ParserConfig
-
    """Converts html string to a string with just the text. If ignoring
    rendering anchor tag content is enable, anchor tag content are also
    included in the text
-    
-    @NOTE: HORRIBLE LXML INDUCED MEMORY LEAK WORKAROUND HERE 
-           https://www.reddit.com/r/Python/comments/j0gl8t/psa_pythonlxml_memory_leaks_and_a_solution/ 

    :param html_content: string with html content
    :param render_anchor_tag_content: boolean flag indicating whether to extract
@@ -212,19 +214,8 @@ def html_to_text(html_content: str, render_anchor_tag_content=False) -> str:
    else:
        parser_config = None

-
-    def parse_function(html_content, parser_config, results_queue):
-        from inscriptis import get_text
-        # get text and annotations via inscriptis
-        text_content = get_text(html_content, config=parser_config)
-        results_queue.put(text_content)
-
-    results_queue = multiprocessing.Queue()
-    parse_process = multiprocessing.Process(target=parse_function, args=(html_content, parser_config, results_queue))
-    parse_process.daemon = True
-    parse_process.start()
-    text_content = results_queue.get()  # blocks until results are available
-    parse_process.terminate()
+    # get text and annotations via inscriptis
+    text_content = get_text(html_content, config=parser_config)

    return text_content

--- a/changedetectionio/model/App.py
+++ b/changedetectionio/model/App.py
@@ -31,7 +31,7 @@ class model(dict):
                    'base_url' : None,
                    'extract_title_as_title': False,
                    'empty_pages_are_a_change': False,
-                    'fetch_backend': getenv("DEFAULT_FETCH_BACKEND", "html_requests"),
+                    'fetch_backend': 'html_fetcher_with_weird_memory_leak',
                    'filter_failure_notification_threshold_attempts': _FILTER_FAILURE_THRESHOLD_ATTEMPTS_DEFAULT,
                    'global_ignore_text': [], # List of text to ignore when calculating the comparison checksum
                    'global_subtractive_selectors': [],
--- a/changedetectionio/model/Watch.py
+++ b/changedetectionio/model/Watch.py
@@ -172,13 +172,14 @@ class model(dict):

    # Iterate over all history texts and see if something new exists
    def lines_contain_something_unique_compared_to_history(self, lines=[]):
-        local_lines = [l.decode('utf-8').strip().lower() for l in lines]
+        local_lines = set([l.decode('utf-8').strip().lower() for l in lines])

        # Compare each lines (set) against each history text file (set) looking for something new..
+        existing_history = set({})
        for k, v in self.history.items():
-            alist = [line.decode('utf-8').strip().lower() for line in open(v, 'rb')]
-            res = set(alist) != set(local_lines)
-            if res:
-                return True
+            alist = set([line.decode('utf-8').strip().lower() for line in open(v, 'rb')])
+            existing_history = existing_history.union(alist)

-        return False
+        # Check that everything in local_lines(new stuff) already exists in existing_history - it should
+        # if not, something new happened
+        return not local_lines.issubset(existing_history)
--- a/changedetectionio/notification.py
+++ b/changedetectionio/notification.py
@@ -78,7 +78,7 @@ def process_notification(n_object, datastore):
                    n_title = n_title[0:payload_max_size]
                    n_body = n_body[0:body_limit]

-                elif url.startswith('discord://') or url.startswith('https://discordapp.com/api/webhooks'):
+                elif url.startswith('discord://') or url.startswith('https://discordapp.com/api/webhooks') or url.startswith('https://discord.com/api'):
                    # real limit is 2000, but minus some for extra metadata
                    payload_max_size = 1700
                    body_limit = max(0, payload_max_size - len(n_title))
--- a/changedetectionio/static/styles/styles.css
+++ b/changedetectionio/static/styles/styles.css
@@ -1,9 +1,7 @@
 /*
 * -- BASE STYLES --
 * Most of these are inherited from Base, but I want to change a few.
- * nvm use v14.18.1
- * npm install
- * npm run build
+ * nvm use v14.18.1 && npm install && npm run build
 * or npm run watch
 */
 body {
@@ -203,13 +201,18 @@ body:after, body:before {
  border-radius: 10px;
  margin-bottom: 1em; }
  #new-watch-form input {
-    width: auto !important;
-    display: inline-block; }
+    display: inline-block;
+    margin-bottom: 5px; }
  #new-watch-form .label {
    display: none; }
  #new-watch-form legend {
    color: #fff;
    font-weight: bold; }
+  #new-watch-form #watch-add-wrapper-zone > div {
+    display: inline-block; }
+  @media only screen and (max-width: 760px) {
+    #new-watch-form #watch-add-wrapper-zone #url {
+      width: 100%; } }

 #diff-col {
  padding-left: 40px; }
--- a/changedetectionio/static/styles/styles.scss
+++ b/changedetectionio/static/styles/styles.scss
@@ -1,9 +1,7 @@
 /*
 * -- BASE STYLES --
 * Most of these are inherited from Base, but I want to change a few.
- * nvm use v14.18.1
- * npm install
- * npm run build
+ * nvm use v14.18.1 && npm install && npm run build
 * or npm run watch
 */
 body {
@@ -269,8 +267,8 @@ body:after, body:before {
  border-radius: 10px;
  margin-bottom: 1em;
  input {
-    width: auto !important;
    display: inline-block;
+    margin-bottom: 5px;
  }
  .label {
    display: none;
@@ -279,6 +277,17 @@ body:after, body:before {
    color: #fff;
    font-weight: bold;
  }
+
+  #watch-add-wrapper-zone {
+    > div {
+      display: inline-block;
+    }
+    @media only screen and (max-width: 760px) {
+      #url {
+        width: 100%;
+      }
+    }
+  }
 }


--- a/changedetectionio/store.py
+++ b/changedetectionio/store.py
@@ -82,9 +82,8 @@ class ChangeDetectionStore:
            if include_default_watches:
                print("Creating JSON store at", self.datastore_path)

-                self.add_watch(url='http://www.quotationspage.com/random.php', tag='test')
-                self.add_watch(url='https://news.ycombinator.com/', tag='Tech news')
-                self.add_watch(url='https://changedetection.io/CHANGELOG.txt', tag='changedetection.io')
+                for i in range(50):
+                    self.add_watch(url='https://changedetection.io/CHANGELOG.txt?x='+str(i), tag='test')

        self.__data['version_tag'] = version_tag

--- a/changedetectionio/templates/edit.html
+++ b/changedetectionio/templates/edit.html
@@ -33,7 +33,7 @@

    <div class="box-wrap inner">
        <form class="pure-form pure-form-stacked"
-              action="{{ url_for('edit_page', uuid=uuid, next = request.args.get('next') ) }}" method="POST">
+              action="{{ url_for('edit_page', uuid=uuid, next = request.args.get('next'), unpause_on_save = request.args.get('unpause_on_save')) }}" method="POST">
             <input type="hidden" name="csrf_token" value="{{ csrf_token() }}"/>

            <div class="tab-pane-inner" id="general">
@@ -163,15 +163,26 @@ User-Agent: wonderbra 1.0") }}
                        </div>
                    </fieldset>
                    <div class="pure-control-group">
-                        {{ render_field(form.css_filter, placeholder=".class-name or #some-id, or other CSS selector rule.",
-                        class="m-d") }}
+                        {% set field = render_field(form.css_filter,
+                            placeholder=".class-name or #some-id, or other CSS selector rule.",
+                            class="m-d")
+                        %}
+                        {{ field }}
+                        {% if '/text()' in  field %}
+                          <span class="pure-form-message-inline"><strong>Note!: //text() function does not work where the &lt;element&gt; contains &lt;![CDATA[]]&gt;</strong></span><br/>
+                        {% endif %}
                        <span class="pure-form-message-inline">
                    <ul>
                        <li>CSS - Limit text to this CSS rule, only text matching this CSS rule is included.</li>
                        <li>JSON - Limit text to this JSON rule, using <a href="https://pypi.org/project/jsonpath-ng/">JSONPath</a>, prefix with <code>"json:"</code>, use <code>json:$</code> to force re-formatting if required,  <a
                                href="https://jsonpath.com/" target="new">test your JSONPath here</a></li>
-                        <li>XPath - Limit text to this XPath rule, simply start with a forward-slash, example  <code>//*[contains(@class, 'sametext')]</code> or <code>xpath://*[contains(@class, 'sametext')]</code>, <a
+                        <li>XPath - Limit text to this XPath rule, simply start with a forward-slash,
+                            <ul>
+                                <li>Example:  <code>//*[contains(@class, 'sametext')]</code> or <code>xpath://*[contains(@class, 'sametext')]</code>, <a
                                href="http://xpather.com/" target="new">test your XPath here</a></li>
+                                <li>Example: Get all titles from an RSS feed <code>//title/text()</code></li>
+                            </ul>
+                            </li>
                    </ul>
                    Please be sure that you thoroughly understand how to write CSS or JSONPath, XPath selector rules before filing an issue on GitHub! <a
                                href="https://github.com/dgtlmoon/changedetection.io/wiki/CSS-Selector-help">here for more CSS selector help</a>.<br/>
--- a/changedetectionio/templates/watch-overview.html
+++ b/changedetectionio/templates/watch-overview.html
@@ -1,18 +1,25 @@
 {% extends 'base.html' %}
 {% block content %}
-{% from '_helpers.jinja' import render_simple_field %}
+{% from '_helpers.jinja' import render_simple_field, render_field %}
 <script type="text/javascript" src="{{url_for('static_content', group='js', filename='jquery-3.6.0.min.js')}}"></script>
 <script type="text/javascript" src="{{url_for('static_content', group='js', filename='watch-overview.js')}}" defer></script>

 <div class="box">

-    <form class="pure-form" action="{{ url_for('form_watch_add') }}" method="POST" id="new-watch-form">
+    <form class="pure-form" action="{{ url_for('form_quick_watch_add') }}" method="POST" id="new-watch-form">
        <input type="hidden" name="csrf_token" value="{{ csrf_token() }}"/>
        <fieldset>
            <legend>Add a new change detection watch</legend>
-                {{ render_simple_field(form.url, placeholder="https://...", required=true) }}
-                {{ render_simple_field(form.tag, value=active_tag if active_tag else '', placeholder="watch group") }}
-            <button type="submit" class="pure-button pure-button-primary">Watch</button>
+            <div id="watch-add-wrapper-zone">
+                <div>
+                    {{ render_simple_field(form.url, placeholder="https://...", required=true) }}
+                    {{ render_simple_field(form.tag, value=active_tag if active_tag else '', placeholder="watch group") }}
+                </div>
+                <div>
+                    {{ render_simple_field(form.watch_submit_button, title="Watch this URL!" ) }}
+                    {{ render_simple_field(form.edit_and_watch_submit_button, title="Edit first then Watch") }}
+                </div>
+            </div>
        </fieldset>
        <span style="color:#eee; font-size: 80%;"><img style="height: 1em;display:inline-block;" src="{{url_for('static_content', group='images', filename='spread-white.svg')}}" /> Tip: You can also add 'shared' watches. <a href="https://github.com/dgtlmoon/changedetection.io/wiki/Sharing-a-Watch">More info</a></a></span>
    </form>
--- a/changedetectionio/tests/test_filter_failure_notification.py
+++ b/changedetectionio/tests/test_filter_failure_notification.py
@@ -30,7 +30,7 @@ def run_filter_test(client, content_filter):
    # Add our URL to the import page
    test_url = url_for('test_endpoint', _external=True)
    res = client.post(
-        url_for("form_watch_add"),
+        url_for("form_quick_watch_add"),
        data={"url": test_url, "tag": ''},
        follow_redirects=True
    )
--- a/changedetectionio/tests/test_notification.py
+++ b/changedetectionio/tests/test_notification.py
@@ -36,7 +36,7 @@ def test_check_notification(client, live_server):
    # Add our URL to the import page
    test_url = url_for('test_endpoint', _external=True)
    res = client.post(
-        url_for("form_watch_add"),
+        url_for("form_quick_watch_add"),
        data={"url": test_url, "tag": ''},
        follow_redirects=True
    )
@@ -172,7 +172,7 @@ def test_notification_validation(client, live_server):
    # Add our URL to the import page
    test_url = url_for('test_endpoint', _external=True)
    res = client.post(
-        url_for("form_watch_add"),
+        url_for("form_quick_watch_add"),
        data={"url": test_url, "tag": 'nice one'},
        follow_redirects=True
    )
--- a/changedetectionio/tests/test_notification_errors.py
+++ b/changedetectionio/tests/test_notification_errors.py
@@ -16,7 +16,7 @@ def test_check_notification_error_handling(client, live_server):
    # use a different URL so that it doesnt interfere with the actual check until we are ready
    test_url = url_for('test_endpoint', _external=True)
    res = client.post(
-        url_for("form_watch_add"),
+        url_for("form_quick_watch_add"),
        data={"url": "https://changedetection.io/CHANGELOG.txt", "tag": ''},
        follow_redirects=True
    )
--- a/changedetectionio/tests/test_xpath_selector.py
+++ b/changedetectionio/tests/test_xpath_selector.py
@@ -86,6 +86,7 @@ def test_check_xpath_filter_utf8(client, live_server):
        follow_redirects=True
    )
    assert b"1 Imported" in res.data
+    time.sleep(1)
    res = client.post(
        url_for("edit_page", uuid="first"),
        data={"css_filter": filter, "url": test_url, "tag": "", "headers": "", 'fetch_backend': "html_requests"},
@@ -99,6 +100,68 @@ def test_check_xpath_filter_utf8(client, live_server):
    assert b'Deleted' in res.data


+# Handle utf-8 charset replies https://github.com/dgtlmoon/changedetection.io/pull/613
+def test_check_xpath_text_function_utf8(client, live_server):
+    filter='//item/title/text()'
+
+    d='''<?xml version="1.0" encoding="UTF-8"?>
+<rss xmlns:taxo="http://purl.org/rss/1.0/modules/taxonomy/" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:itunes="http://www.itunes.com/dtds/podcast-1.0.dtd" xmlns:dc="http://purl.org/dc/elements/1.1/" version="2.0">
+	<channel>
+		<title>rpilocator.com</title>
+		<link>https://rpilocator.com</link>
+		<description>Find Raspberry Pi Computers in Stock</description>
+		<lastBuildDate>Thu, 19 May 2022 23:27:30 GMT</lastBuildDate>
+		<image>
+			<url>https://rpilocator.com/favicon.png</url>
+			<title>rpilocator.com</title>
+			<link>https://rpilocator.com/</link>
+			<width>32</width>
+			<height>32</height>
+		</image>
+		<item>
+			<title>Stock Alert (UK): RPi CM4</title>
+			<foo>something else unrelated</foo>
+		</item>
+		<item>
+			<title>Stock Alert (UK): Big monitor</title>
+			<foo>something else unrelated</foo>
+		</item>		
+	</channel>
+</rss>'''
+
+    with open("test-datastore/endpoint-content.txt", "w") as f:
+        f.write(d)
+
+    # Add our URL to the import page
+    test_url = url_for('test_endpoint', _external=True, content_type="application/rss+xml;charset=UTF-8")
+    res = client.post(
+        url_for("import_page"),
+        data={"urls": test_url},
+        follow_redirects=True
+    )
+    assert b"1 Imported" in res.data
+    time.sleep(1)
+    res = client.post(
+        url_for("edit_page", uuid="first"),
+        data={"css_filter": filter, "url": test_url, "tag": "", "headers": "", 'fetch_backend': "html_requests"},
+        follow_redirects=True
+    )
+    assert b"Updated watch." in res.data
+    time.sleep(3)
+    res = client.get(url_for("index"))
+    assert b'Unicode strings with encoding declaration are not supported.' not in res.data
+
+    # The service should echo back the request headers
+    res = client.get(
+        url_for("preview_page", uuid="first"),
+        follow_redirects=True
+    )
+
+    assert b'<div class="">Stock Alert (UK): RPi CM4' in res.data
+    assert b'<div class="">Stock Alert (UK): Big monitor' in res.data
+
+    res = client.get(url_for("form_delete", uuid="all"), follow_redirects=True)
+    assert b'Deleted' in res.data

 def test_check_markup_xpath_filter_restriction(client, live_server):
    sleep_time_for_fetch_thread = 3
--- a/changedetectionio/update_worker.py
+++ b/changedetectionio/update_worker.py
@@ -7,20 +7,19 @@ from changedetectionio.html_tools import FilterNotFoundInResponse

 # A single update worker
 #
-#
+# Requests for checking on a single site(watch) from a queue of watches
+# (another process inserts watches into the queue that are time-ready for checking)
+

 class update_worker(threading.Thread):
    current_uuid = None

-    def __init__(self, q, notification_q, app, datastore, uuid, *args, **kwargs):
+    def __init__(self, q, notification_q, app, datastore, *args, **kwargs):
        self.q = q
-
        self.app = app
        self.notification_q = notification_q
        self.datastore = datastore
-        self.current_uuid = uuid
        super().__init__(*args, **kwargs)
-        self.name = "update_worker"

    def send_filter_failure_notification(self, uuid):

@@ -48,170 +47,169 @@ class update_worker(threading.Thread):
            self.notification_q.put(n_object)
            print("Sent filter not found notification for {}".format(uuid))

-    # Pick one job off the list, process it threaded, exist
    def run(self):
-        # Go talk to the website
-        self.perform_site_update()
-
-        self.current_uuid = None  # Done
-        self.q.task_done()
-
-        # Let the thread die after processing 1
-        # We will launch nice juicy fresh threads every time to prevent memory leaks in complex runner code (playwright etc)
-        print ("EXITING THREAD!")
-        self.app.config.exit.wait(1)
-        return
-
-
-
-    def perform_site_update(self):
-
        from changedetectionio import fetch_site_status

-        if not self.current_uuid in list(self.datastore.data['watching'].keys()):
-            return
-
-
-        changed_detected = False
-        contents = ""
-        screenshot = False
-        update_obj= {}
-        xpath_data = False
-        now = time.time()
-
        update_handler = fetch_site_status.perform_site_check(datastore=self.datastore)
-        try:
-            changed_detected, update_obj, contents, screenshot, xpath_data = update_handler.run(self.current_uuid)
-            # Re #342
-            # In Python 3, all strings are sequences of Unicode characters. There is a bytes type that holds raw bytes.
-            # We then convert/.decode('utf-8') for the notification etc
-            if not isinstance(contents, (bytes, bytearray)):
-                raise Exception("Error - returned data from the fetch handler SHOULD be bytes")
-        except PermissionError as e:
-            self.app.logger.error("File permission error updating", self.current_uuid, str(e))
-        except content_fetcher.ReplyWithContentButNoText as e:
-            # Totally fine, it's by choice - just continue on, nothing more to care about
-            # Page had elements/content but no renderable text
-            self.datastore.update_watch(uuid=self.current_uuid, update_obj={'last_error': "Got HTML content but no text found."})
-        except FilterNotFoundInResponse as e:
-            err_text = "Filter '{}' not found - Did the page change its layout?".format(str(e))
-            c = 0
-            if self.datastore.data['watching'].get(self.current_uuid, False):
-                c = self.datastore.data['watching'][self.current_uuid].get('consecutive_filter_failures', 5)
-            c += 1

-            # Send notification if we reached the threshold?
-            threshold = self.datastore.data['settings']['application'].get('filter_failure_notification_threshold_attempts', 0)
-            print("Filter for {} not found, consecutive_filter_failures: {}".format(self.current_uuid, c))
-            if threshold >0 and c >= threshold:
-                self.send_filter_failure_notification(self.current_uuid)
-                c = 0
+        while not self.app.config.exit.is_set():

-            self.datastore.update_watch(uuid=self.current_uuid, update_obj={'last_error': err_text,
-                                                               'consecutive_filter_failures': c})
-        except content_fetcher.EmptyReply as e:
-            # Some kind of custom to-str handler in the exception handler that does this?
-            err_text = "EmptyReply - try increasing 'Wait seconds before extracting text', Status Code {}".format(e.status_code)
-            self.datastore.update_watch(uuid=self.current_uuid, update_obj={'last_error': err_text,
-                                                               'last_check_status': e.status_code})
-        except content_fetcher.ScreenshotUnavailable as e:
-            err_text = "Screenshot unavailable, page did not render fully in the expected time - try increasing 'Wait seconds before extracting text'"
-            self.datastore.update_watch(uuid=self.current_uuid, update_obj={'last_error': err_text,
-                                                               'last_check_status': e.status_code})
-        except content_fetcher.PageUnloadable as e:
-            err_text = "Page request from server didnt respond correctly"
-            self.datastore.update_watch(uuid=self.current_uuid, update_obj={'last_error': err_text,
-                                                               'last_check_status': e.status_code})
-        except Exception as e:
-            self.app.logger.error("Exception reached processing watch UUID: %s - %s", self.current_uuid, str(e))
-            self.datastore.update_watch(uuid=self.current_uuid, update_obj={'last_error': str(e)})
-
-        else:
            try:
-                watch = self.datastore.data['watching'][self.current_uuid]
-                fname = "" # Saved history text filename
+                uuid = self.q.get(block=False)
+            except queue.Empty:
+                pass

-                # For the FIRST time we check a site, or a change detected, save the snapshot.
-                if changed_detected or not watch['last_checked']:
-                    # A change was detected
-                    fname = watch.save_history_text(contents=contents, timestamp=str(round(time.time())))
+            else:
+                self.current_uuid = uuid

-                # Generally update anything interesting returned
-                update_obj['consecutive_filter_failures'] = 0
-                self.datastore.update_watch(uuid=self.current_uuid, update_obj=update_obj)
+                if uuid in list(self.datastore.data['watching'].keys()):

-                # A change was detected
-                if changed_detected:
-                    n_object = {}
-                    print (">> Change detected in UUID {} - {}".format(self.current_uuid, watch['url']))
+                    changed_detected = False
+                    contents = ""
+                    screenshot = False
+                    update_obj= {}
+                    xpath_data = False
+                    now = time.time()

-                    # Notifications should only trigger on the second time (first time, we gather the initial snapshot)
-                    if watch.history_n >= 2:
-                        # Atleast 2, means there really was a change
-                        self.datastore.update_watch(uuid=self.current_uuid, update_obj={'last_changed': round(now)})
+                    try:
+                        changed_detected, update_obj, contents, screenshot, xpath_data = update_handler.run(uuid)
+                        # Re #342
+                        # In Python 3, all strings are sequences of Unicode characters. There is a bytes type that holds raw bytes.
+                        # We then convert/.decode('utf-8') for the notification etc
+                        if not isinstance(contents, (bytes, bytearray)):
+                            raise Exception("Error - returned data from the fetch handler SHOULD be bytes")
+                    except PermissionError as e:
+                        self.app.logger.error("File permission error updating", uuid, str(e))
+                    except content_fetcher.ReplyWithContentButNoText as e:
+                        # Totally fine, it's by choice - just continue on, nothing more to care about
+                        # Page had elements/content but no renderable text
+                        self.datastore.update_watch(uuid=uuid, update_obj={'last_error': "Got HTML content but no text found."})
+                    except FilterNotFoundInResponse as e:
+                        err_text = "Filter '{}' not found - Did the page change its layout?".format(str(e))
+                        c = 0
+                        if self.datastore.data['watching'].get(uuid, False):
+                            c = self.datastore.data['watching'][uuid].get('consecutive_filter_failures', 5)
+                        c += 1

-                        watch_history = watch.history
-                        dates = list(watch_history.keys())
-                        # Theoretically it's possible that this could be just 1 long,
-                        # - In the case that the timestamp key was not unique
-                        if len(dates) == 1:
-                            raise ValueError(
-                                "History index had 2 or more, but only 1 date loaded, timestamps were not unique? maybe two of the same timestamps got written, needs more delay?"
-                            )
-                        prev_fname = watch_history[dates[-2]]
+                        # Send notification if we reached the threshold?
+                        threshold = self.datastore.data['settings']['application'].get('filter_failure_notification_threshold_attempts', 0)
+                        print("Filter for {} not found, consecutive_filter_failures: {}".format(uuid, c))
+                        if threshold >0 and c >= threshold:
+                            self.send_filter_failure_notification(uuid)
+                            c = 0

-                        # Did it have any notification alerts to hit?
-                        if len(watch['notification_urls']):
-                            print(">>> Notifications queued for UUID from watch {}".format(self.current_uuid))
-                            n_object['notification_urls'] = watch['notification_urls']
-                            n_object['notification_title'] = watch['notification_title']
-                            n_object['notification_body'] = watch['notification_body']
-                            n_object['notification_format'] = watch['notification_format']
+                        self.datastore.update_watch(uuid=uuid, update_obj={'last_error': err_text,
+                                                                           'consecutive_filter_failures': c})
+                    except content_fetcher.EmptyReply as e:
+                        # Some kind of custom to-str handler in the exception handler that does this?
+                        err_text = "EmptyReply - try increasing 'Wait seconds before extracting text', Status Code {}".format(e.status_code)
+                        self.datastore.update_watch(uuid=uuid, update_obj={'last_error': err_text,
+                                                                           'last_check_status': e.status_code})
+                    except content_fetcher.ScreenshotUnavailable as e:
+                        err_text = "Screenshot unavailable, page did not render fully in the expected time - try increasing 'Wait seconds before extracting text'"
+                        self.datastore.update_watch(uuid=uuid, update_obj={'last_error': err_text,
+                                                                           'last_check_status': e.status_code})
+                    except content_fetcher.PageUnloadable as e:
+                        err_text = "Page request from server didnt respond correctly"
+                        self.datastore.update_watch(uuid=uuid, update_obj={'last_error': err_text,
+                                                                           'last_check_status': e.status_code})
+                    except Exception as e:
+                        self.app.logger.error("Exception reached processing watch UUID: %s - %s", uuid, str(e))
+                        self.datastore.update_watch(uuid=uuid, update_obj={'last_error': str(e)})

-                        # No? maybe theres a global setting, queue them all
-                        elif len(self.datastore.data['settings']['application']['notification_urls']):
-                            print(">>> Watch notification URLs were empty, using GLOBAL notifications for UUID: {}".format(self.current_uuid))
-                            n_object['notification_urls'] = self.datastore.data['settings']['application']['notification_urls']
-                            n_object['notification_title'] = self.datastore.data['settings']['application']['notification_title']
-                            n_object['notification_body'] = self.datastore.data['settings']['application']['notification_body']
-                            n_object['notification_format'] = self.datastore.data['settings']['application']['notification_format']
-                        else:
-                            print(">>> NO notifications queued, watch and global notification URLs were empty.")
+                    else:
+                        try:
+                            watch = self.datastore.data['watching'][uuid]
+                            fname = "" # Saved history text filename

-                        # Only prepare to notify if the rules above matched
-                        if 'notification_urls' in n_object:
-                            # HTML needs linebreak, but MarkDown and Text can use a linefeed
-                            if n_object['notification_format'] == 'HTML':
-                                line_feed_sep = "</br>"
-                            else:
-                                line_feed_sep = "\n"
+                            # For the FIRST time we check a site, or a change detected, save the snapshot.
+                            if changed_detected or not watch['last_checked']:
+                                # A change was detected
+                                fname = watch.save_history_text(contents=contents, timestamp=str(round(time.time())))

-                            from changedetectionio import diff
-                            n_object.update({
-                                'watch_url': watch['url'],
-                                'uuid': self.current_uuid,
-                                'current_snapshot': contents.decode('utf-8'),
-                                'diff': diff.render_diff(prev_fname, fname, line_feed_sep=line_feed_sep),
-                                'diff_full': diff.render_diff(prev_fname, fname, True, line_feed_sep=line_feed_sep)
-                            })
+                            # Generally update anything interesting returned
+                            update_obj['consecutive_filter_failures'] = 0
+                            self.datastore.update_watch(uuid=uuid, update_obj=update_obj)

-                            self.notification_q.put(n_object)
+                            # A change was detected
+                            if changed_detected:
+                                n_object = {}
+                                print (">> Change detected in UUID {} - {}".format(uuid, watch['url']))

-            except Exception as e:
-                # Catch everything possible here, so that if a worker crashes, we don't lose it until restart!
-                print("!!!! Exception in update_worker !!!\n", e)
-                self.app.logger.error("Exception reached processing watch UUID: %s - %s", self.current_uuid, str(e))
-                self.datastore.update_watch(uuid=self.current_uuid, update_obj={'last_error': str(e)})
+                                # Notifications should only trigger on the second time (first time, we gather the initial snapshot)
+                                if watch.history_n >= 2:
+                                    # At least 2 means there really was a change
+                                    self.datastore.update_watch(uuid=uuid, update_obj={'last_changed': round(now)})

-        finally:
-            # Always record that we atleast tried
-            self.datastore.update_watch(uuid=self.current_uuid, update_obj={'fetch_time': round(time.time() - now, 3),
-                                                               'last_checked': round(time.time())})
+                                    watch_history = watch.history
+                                    dates = list(watch_history.keys())
+                                    # Theoretically it's possible that this could be just 1 long,
+                                    # - In the case that the timestamp key was not unique
+                                    if len(dates) == 1:
+                                        raise ValueError(
+                                            "History index had 2 or more, but only 1 date loaded, timestamps were not unique? maybe two of the same timestamps got written, needs more delay?"
+                                        )
+                                    prev_fname = watch_history[dates[-2]]

-            # Always save the screenshot if it's available
-            if screenshot:
-                self.datastore.save_screenshot(watch_uuid=self.current_uuid, screenshot=screenshot)
-            if xpath_data:
-                self.datastore.save_xpath_data(watch_uuid=self.current_uuid, data=xpath_data)
+                                    # Did it have any notification alerts to hit?
+                                    if len(watch['notification_urls']):
+                                        print(">>> Notifications queued for UUID from watch {}".format(uuid))
+                                        n_object['notification_urls'] = watch['notification_urls']
+                                        n_object['notification_title'] = watch['notification_title']
+                                        n_object['notification_body'] = watch['notification_body']
+                                        n_object['notification_format'] = watch['notification_format']
+
+                                    # No? Maybe there's a global setting, queue them all
+                                    elif len(self.datastore.data['settings']['application']['notification_urls']):
+                                        print(">>> Watch notification URLs were empty, using GLOBAL notifications for UUID: {}".format(uuid))
+                                        n_object['notification_urls'] = self.datastore.data['settings']['application']['notification_urls']
+                                        n_object['notification_title'] = self.datastore.data['settings']['application']['notification_title']
+                                        n_object['notification_body'] = self.datastore.data['settings']['application']['notification_body']
+                                        n_object['notification_format'] = self.datastore.data['settings']['application']['notification_format']
+                                    else:
+                                        print(">>> NO notifications queued, watch and global notification URLs were empty.")
+
+                                    # Only prepare to notify if the rules above matched
+                                    if 'notification_urls' in n_object:
+                                        # HTML needs linebreak, but MarkDown and Text can use a linefeed
+                                        if n_object['notification_format'] == 'HTML':
+                                            line_feed_sep = "</br>"
+                                        else:
+                                            line_feed_sep = "\n"
+
+                                        from changedetectionio import diff
+                                        n_object.update({
+                                            'watch_url': watch['url'],
+                                            'uuid': uuid,
+                                            'current_snapshot': contents.decode('utf-8'),
+                                            'diff': diff.render_diff(prev_fname, fname, line_feed_sep=line_feed_sep),
+                                            'diff_full': diff.render_diff(prev_fname, fname, True, line_feed_sep=line_feed_sep)
+                                        })
+
+                                        self.notification_q.put(n_object)
+
+                        except Exception as e:
+                            # Catch everything possible here, so that if a worker crashes, we don't lose it until restart!
+                            print("!!!! Exception in update_worker !!!\n", e)
+                            self.app.logger.error("Exception reached processing watch UUID: %s - %s", uuid, str(e))
+                            self.datastore.update_watch(uuid=uuid, update_obj={'last_error': str(e)})
+
+                    finally:
+                        # Always record that we at least tried
+                        self.datastore.update_watch(uuid=uuid, update_obj={'fetch_time': round(time.time() - now, 3),
+                                                                           'last_checked': round(time.time())})
+
+                        # Always save the screenshot if it's available
+                        if screenshot:
+                            self.datastore.save_screenshot(watch_uuid=uuid, screenshot=screenshot)
+                        if xpath_data:
+                            self.datastore.save_xpath_data(watch_uuid=uuid, data=xpath_data)


+                self.current_uuid = None  # Done
+                self.q.task_done()
+
+                # Give the CPU time to interrupt
+                time.sleep(0.1)
+
+            self.app.config.exit.wait(1)
--- a/memory-leak.html
+++ b/memory-leak.html
Author	SHA1	Message	Date
dgtlmoon	44b2159140	example memory usage that isnt cleared	2022-07-28 20:55:01 +02:00
dgtlmoon	3c9d2ded38	0.39.17	2022-07-28 13:07:51 +02:00
dgtlmoon	9f4364a130	Add https://discord.com/api notification hook to the automatic truncation due to Discords 2000 char limit	2022-07-28 12:34:55 +02:00
dgtlmoon	5bd9eaf99d	UI Feature - Add watch in "paused" state, saving then unpauses (#779 )	2022-07-28 12:13:26 +02:00
dgtlmoon	b1c51c0a65	Enhancement - support xPath text() function filter, for example "//title/text()" in RSS feeds (#778 )	2022-07-28 11:50:31 +02:00
dgtlmoon	232bd92389	Bug fix - Filter "Only trigger when new lines appear" should check all history, not only the first item (#777 )	2022-07-28 10:16:19 +02:00
dgtlmoon	e6173357a9	Visual Selector direct element finder fix	2022-07-28 09:19:10 +02:00
dgtlmoon	f2b8888aff	Update README.md	2022-07-27 14:25:24 +02:00
dgtlmoon	9c46f175f9	Update README.md links	2022-07-27 14:23:18 +02:00