LXML memory leak workaround

fix status
WIP
2025-11-17 06:56:10 +00:00 · 2022-07-27 15:35:05 +02:00 · 2022-07-27 13:11:14 +02:00 · 2022-07-27 13:05:23 +02:00 · 2022-07-27 00:01:51 +02:00 · 2022-07-26 17:34:34 +02:00
21 changed files with 695 additions and 3937 deletions
--- a/README.md
+++ b/README.md
@@ -3,14 +3,16 @@
 ![changedetection.io](https://github.com/dgtlmoon/changedetection.io/actions/workflows/test-only.yml/badge.svg?branch=master)
-## Self-Hosted, Open Source, Change Monitoring of Web Pages
+## Web Site Change Detection, Monitoring and Notification - Self-Hosted or SaaS.
-_Know when web pages change! Stay ontop of new information!_ 
+_Know when web pages change! Stay on top of new information! Get notifications when important website content changes._ 
 Live your data-life *pro-actively* instead of *re-actively*.
 Free, Open-source web page monitoring, notification and change detection. Don't have time? [**Try our $6.99/month subscription - unlimited checks and watches!**](https://lemonade.changedetection.io/start)
 [[ Discord ]](https://discord.com/channels/1000806276256780309/1000806276873334816) [[ YouTube ]](https://www.youtube.com/channel/UCbS09q1TRf0o4N2t-WA3emQ) [[ LinkedIn ]](https://www.linkedin.com/company/changedetection-io/)
 [<img src="https://raw.githubusercontent.com/dgtlmoon/changedetection.io/master/docs/screenshot.png" style="max-width:100%;" alt="Self-hosted web page change monitoring"  title="Self-hosted web page change monitoring"  />](https://lemonade.changedetection.io/start)
--- a/changedetectionio/init.py
+++ b/changedetectionio/init.py
@@ -44,7 +44,7 @@ from flask_wtf import CSRFProtect
 from changedetectionio import html_tools
 from changedetectionio.api import api_v1
-__version__ = '0.39.15'
+__version__ = '0.39.16'
 datastore = None
@@ -105,9 +105,10 @@ def init_app_secret(datastore_path):
 # running or something similar.
@app.template_filter('format_last_checked_time')
 def _jinja2_filter_datetime(watch_obj, format="%Y-%m-%d %H:%M:%S"):
    # Worker thread tells us which UUID it is currently processing.
-    for t in running_update_threads:
+    for t in threading.enumerate():
-        if t.current_uuid == watch_obj['uuid']:
+        if t.name == 'update_worker' and t.current_uuid == watch_obj['uuid']:
            return '<span class="loader"></span><span> Checking now</span>'
    if watch_obj['last_checked'] == 0:
@@ -361,7 +362,7 @@ def changedetection_app(config=None, datastore_o=None):
                fe.pubDate(dt)
        response = make_response(fg.rss_str())
-        response.headers.set('Content-Type', 'application/rss+xml')
+        response.headers.set('Content-Type', 'application/rss+xml;charset=utf-8')
        return response
    @app.route("/", methods=['GET'])
@@ -1213,6 +1214,7 @@ def changedetection_app(config=None, datastore_o=None):
    # @todo handle ctrl break
    ticker_thread = threading.Thread(target=ticker_thread_check_time_launch_checks).start()
    threading.Thread(target=ticker_thread_job_queue_processor).start()
    threading.Thread(target=notification_runner).start()
@@ -1288,25 +1290,63 @@ def notification_runner():
            # Trim the log length
            notification_debug_log = notification_debug_log[-100:]
 # Check the queue, when a job exists, start a fresh thread of update_worker
 def ticker_thread_job_queue_processor():
    """Poll the update queue and start one short-lived update_worker thread per queued UUID.

    Loops until ``app.config.exit`` is set. On each pass it counts the live
    threads named 'update_worker' and, if fewer than ``n_workers`` are running,
    takes one UUID off ``update_q`` (non-blocking) and starts a fresh worker
    thread for it.

    The worker limit comes from the FETCH_WORKERS environment variable, falling
    back to the datastore's ``settings.requests.workers`` value. It is read once,
    when this function starts.
    """
    from changedetectionio import update_worker
    n_workers = int(os.getenv("FETCH_WORKERS", datastore.data['settings']['requests']['workers']))
    while not app.config.exit.is_set():
        time.sleep(0.3)
        # Check that some threads are free
        running = sum(1 for t in threading.enumerate() if t.name == 'update_worker')
        if running >= n_workers:
            continue
        try:
            uuid = update_q.get(block=False)
        except queue.Empty:
            # Go back to waiting for exit and/or another entry from the queue
            continue
        print("Starting a thread fetch")
        try:
            # Launch the update_worker thread that will handle picking items off a queue and sending them off
            # in the event that playwright or others have a memory leak, this should clean it up better than gc.collect()
            # (By letting it exit entirely)
            update_worker.update_worker(update_q, notification_q, app, datastore, uuid).start()
        except Exception as e:
            print("Error launching update_worker for UUID {}.".format(uuid))
            print(str(e))
        # 'running' was sampled before this launch attempt, so it does not include the new thread.
        print("Running now {}".format(running))
 # Thread runner to check every minute, look for new watches to feed into the Queue.
 def ticker_thread_check_time_launch_checks():
    import random
-    from changedetectionio import update_worker
+
    recheck_time_minimum_seconds = int(os.getenv('MINIMUM_SECONDS_RECHECK_TIME', 20))
    print("System env MINIMUM_SECONDS_RECHECK_TIME", recheck_time_minimum_seconds)
    # Can go in its own function
    # Always maintain the minimum number of threads, each thread will terminate when it has processed exactly 1 queued watch
    # This is to be totally sure that they don't leak memory
    # Spin up Workers that do the fetching
    # Can be overridden by ENV or use the default settings
-    n_workers = int(os.getenv("FETCH_WORKERS", datastore.data['settings']['requests']['workers']))
+
    for _ in range(n_workers):
        new_worker = update_worker.update_worker(update_q, notification_q, app, datastore)
        running_update_threads.append(new_worker)
        new_worker.start()
    while not app.config.exit.is_set():
-        # Get a list of watches by UUID that are currently fetching data
+        # Update our list of watches by UUID that are currently fetching data, used in the UI
        running_uuids = []
        for t in running_update_threads:
            if t.current_uuid:
--- a/changedetectionio/content_fetcher.py
+++ b/changedetectionio/content_fetcher.py
@@ -46,6 +46,7 @@ class Fetcher():
    headers = None
    fetcher_description = "No description"
    webdriver_js_execute_code = None
    xpath_element_js = """               
                // Include the getXpath script directly, easier than fetching
                !function(e,n){"object"==typeof exports&&"undefined"!=typeof module?module.exports=n():"function"==typeof define&&define.amd?define(n):(e=e||self).getXPath=n()}(this,function(){return function(e){var n=e;if(n&&n.id)return'//*[@id="'+n.id+'"]';for(var o=[];n&&Node.ELEMENT_NODE===n.nodeType;){for(var i=0,r=!1,d=n.previousSibling;d;)d.nodeType!==Node.DOCUMENT_TYPE_NODE&&d.nodeName===n.nodeName&&i++,d=d.previousSibling;for(d=n.nextSibling;d;){if(d.nodeName===n.nodeName){r=!0;break}d=d.nextSibling}o.push((n.prefix?n.prefix+":":"")+n.localName+(i||r?"["+(i+1)+"]":"")),n=n.parentNode}return o.length?"/"+o.reverse().join("/"):""}});
@@ -175,7 +176,6 @@ class Fetcher():
    # Will be needed in the future by the VisualSelector, always get this where possible.
    screenshot = False
    fetcher_description = "No description"
    system_http_proxy = os.getenv('HTTP_PROXY')
    system_https_proxy = os.getenv('HTTPS_PROXY')
@@ -309,13 +309,19 @@ class base_html_playwright(Fetcher):
                page.set_default_navigation_timeout(90000)
                page.set_default_timeout(90000)
-               # Bug - never set viewport size BEFORE page.goto
+                # Listen for all console events and handle errors
                page.on("console", lambda msg: print(f"Playwright console: Watch URL: {url} {msg.type}: {msg.text} {msg.args}"))
                # Bug - never set viewport size BEFORE page.goto
                # Waits for the next navigation. Using Python context manager
                # prevents a race condition between clicking and waiting for a navigation.
                with page.expect_navigation():
                    response = page.goto(url, wait_until='load')
                if self.webdriver_js_execute_code is not None:
                    page.evaluate(self.webdriver_js_execute_code)
            except playwright._impl._api_types.TimeoutError as e:
                context.close()
                browser.close()
@@ -447,6 +453,12 @@ class base_html_webdriver(Fetcher):
        self.driver.set_window_size(1280, 1024)
        self.driver.implicitly_wait(int(os.getenv("WEBDRIVER_DELAY_BEFORE_CONTENT_READY", 5)))
        if self.webdriver_js_execute_code is not None:
            self.driver.execute_script(self.webdriver_js_execute_code)
            # Selenium doesn't automatically wait for actions as good as Playwright, so wait again
            self.driver.implicitly_wait(int(os.getenv("WEBDRIVER_DELAY_BEFORE_CONTENT_READY", 5)))
        self.screenshot = self.driver.get_screenshot_as_png()
        # @todo - how to check this? is it possible?
--- a/changedetectionio/fetch_site_status.py
+++ b/changedetectionio/fetch_site_status.py
@@ -11,6 +11,7 @@ urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
 # Some common stuff here that can be moved to a base class
 # (set_proxy_from_list)
 class perform_site_check():
    def __init__(self, *args, datastore, **kwargs):
@@ -45,6 +46,20 @@ class perform_site_check():
        return proxy_args
    # Doesn't look like python supports forward slash auto enclosure in re.findall
    # So convert it to inline flag "foobar(?i)" type configuration
    def forward_slash_enclosed_regex_to_options(self, regex):
        """Convert a '/pattern/flags' style string into a Python inline-flag regex.

        :param regex: Either a forward-slash enclosed expression with trailing
            flag letters (for example ``/reports.+?2022/i``) or a plain
            keyword/expression.
        :return: The pattern prefixed with an inline flag group. Slash-enclosed
            input uses the flag letters it supplied; anything else defaults to
            case-insensitive (``(?i)``).
        """
        res = re.search(r'^/(.*?)/(\w+)$', regex, re.IGNORECASE)
        if res:
            # Global inline flags must lead the expression: placing them anywhere else
            # is deprecated since Python 3.6 and is an error from Python 3.11.
            return '(?{}){}'.format(res.group(2), res.group(1))
        # Not slash-enclosed: fall back to a case-insensitive match
        return '(?{}){}'.format('i', regex)
    def run(self, uuid):
        timestamp = int(time.time())  # used for storage etc too
@@ -106,6 +121,9 @@ class perform_site_check():
        elif system_webdriver_delay is not None:
            fetcher.render_extract_delay = system_webdriver_delay
        if watch['webdriver_js_execute_code'] is not None and watch['webdriver_js_execute_code'].strip():
            fetcher.webdriver_js_execute_code = watch['webdriver_js_execute_code']
        fetcher.run(url, timeout, request_headers, request_body, request_method, ignore_status_code, watch['css_filter'])
        fetcher.quit()
@@ -147,7 +165,9 @@ class perform_site_check():
                is_html = False
        if is_html or is_source:
            # CSS Filter, extract the HTML that matches and feed that into the existing inscriptis::get_text
            fetcher.content = html_tools.workarounds_for_obfuscations(fetcher.content)
            html_content = fetcher.content
            # If not JSON,  and if it's not text/plain..
@@ -210,15 +230,27 @@ class perform_site_check():
        if len(extract_text) > 0:
            regex_matched_output = []
            for s_re in extract_text:
-                result = re.findall(s_re.encode('utf8'), stripped_text_from_html,
+                # In case they specified something in '/.../x'
-                                    flags=re.MULTILINE | re.DOTALL | re.LOCALE)
+                regex = self.forward_slash_enclosed_regex_to_options(s_re)
-                if result:
+                result = re.findall(regex.encode('utf-8'), stripped_text_from_html)
                    regex_matched_output.append(result[0])
                for l in result:
                    if type(l) is tuple:
                        #@todo - some formatter option default (between groups)
                        regex_matched_output += list(l) + [b'\n']
                    else:
                        # @todo - some formatter option default (between each ungrouped result)
                        regex_matched_output += [l] + [b'\n']
            # Now we will only show what the regex matched
            stripped_text_from_html = b''
            text_content_before_ignored_filter = b''
            if regex_matched_output:
-                stripped_text_from_html = b'\n'.join(regex_matched_output)
+                # @todo some formatter for presentation?
                stripped_text_from_html = b''.join(regex_matched_output)
                text_content_before_ignored_filter = stripped_text_from_html
        # Re #133 - if we should strip whitespaces from triggering the change detected comparison
        if self.datastore.data['settings']['application'].get('ignore_whitespace', False):
            fetched_md5 = hashlib.md5(stripped_text_from_html.translate(None, b'\r\n\t ')).hexdigest()
--- a/changedetectionio/forms.py
+++ b/changedetectionio/forms.py
@@ -344,9 +344,13 @@ class watchForm(commonSettingsForm):
    trigger_text = StringListField('Trigger/wait for text', [validators.Optional(), ValidateListRegex()])
    text_should_not_be_present = StringListField('Block change-detection if text matches', [validators.Optional(), ValidateListRegex()])
    webdriver_js_execute_code = TextAreaField('Execute JavaScript before change detection', render_kw={"rows": "5"}, validators=[validators.Optional()])
    save_button = SubmitField('Save', render_kw={"class": "pure-button pure-button-primary"})
    save_and_preview_button = SubmitField('Save & Preview', render_kw={"class": "pure-button pure-button-primary"})
    proxy = RadioField('Proxy')
    filter_failure_notification_send = BooleanField(
        'Send a notification when the filter can no longer be found on the page', default=False)
    def validate(self, **kwargs):
        if not super().validate():
@@ -385,6 +389,11 @@ class globalSettingsApplicationForm(commonSettingsForm):
    api_access_token_enabled = BooleanField('API access token security check enabled', default=True, validators=[validators.Optional()])
    password = SaltyPasswordField()
    filter_failure_notification_threshold_attempts = IntegerField('Number of times the filter can be missing before sending a notification',
                                                                  render_kw={"style": "width: 5em;"},
                                                                  validators=[validators.NumberRange(min=0,
                                                                                                     message="Should contain zero or more attempts")])
 class globalSettingsForm(Form):
    # Define these as FormFields/"sub forms", this way it matches the JSON storage
--- a/changedetectionio/html_tools.py
+++ b/changedetectionio/html_tools.py
@@ -1,23 +1,27 @@
 import json
 import re
 from typing import List
 from bs4 import BeautifulSoup
 from jsonpath_ng.ext import parse
 import re
 from inscriptis import get_text
 from inscriptis.model.config import ParserConfig
 class FilterNotFoundInResponse(ValueError):
    def __init__(self, msg):
        ValueError.__init__(self, msg)
 class JSONNotFound(ValueError):
    def __init__(self, msg):
        ValueError.__init__(self, msg)
 # Given a CSS Rule, and a blob of HTML, return the blob of HTML that matches
 def css_filter(css_filter, html_content):
    soup = BeautifulSoup(html_content, "html.parser")
    html_block = ""
-    for item in soup.select(css_filter, separator=""):
+    r = soup.select(css_filter, separator="")
    if len(html_content) > 0 and len(r) == 0:
        raise FilterNotFoundInResponse(css_filter)
    for item in r:
        html_block += str(item)
    return html_block + "\n"
@@ -42,8 +46,12 @@ def xpath_filter(xpath_filter, html_content):
    tree = html.fromstring(bytes(html_content, encoding='utf-8'))
    html_block = ""
-    for item in tree.xpath(xpath_filter.strip(), namespaces={'re':'http://exslt.org/regular-expressions'}):
+    r = tree.xpath(xpath_filter.strip(), namespaces={'re': 'http://exslt.org/regular-expressions'})
-        html_block+= etree.tostring(item, pretty_print=True).decode('utf-8')+"<br/>"
+    if len(html_content) > 0 and len(r) == 0:
        raise FilterNotFoundInResponse(xpath_filter)
    for item in r:
        html_block += etree.tostring(item, pretty_print=True).decode('utf-8') + "<br/>"
    return html_block
@@ -173,9 +181,16 @@ def strip_ignore_text(content, wordlist, mode="content"):
 def html_to_text(html_content: str, render_anchor_tag_content=False) -> str:
    import multiprocessing
    from inscriptis.model.config import ParserConfig
    """Converts html string to a string with just the text. If ignoring
    rendering anchor tag content is enable, anchor tag content are also
    included in the text
    @NOTE: HORRIBLE LXML INDUCED MEMORY LEAK WORKAROUND HERE 
           https://www.reddit.com/r/Python/comments/j0gl8t/psa_pythonlxml_memory_leaks_and_a_solution/ 
    :param html_content: string with html content
    :param render_anchor_tag_content: boolean flag indicating whether to extract
@@ -197,8 +212,33 @@ def html_to_text(html_content: str, render_anchor_tag_content=False) -> str:
    else:
        parser_config = None
-    # get text and annotations via inscriptis
+
-    text_content = get_text(html_content, config=parser_config)
+    def parse_function(html_content, parser_config, results_queue):
        from inscriptis import get_text
        # get text and annotations via inscriptis
        text_content = get_text(html_content, config=parser_config)
        results_queue.put(text_content)
    results_queue = multiprocessing.Queue()
    parse_process = multiprocessing.Process(target=parse_function, args=(html_content, parser_config, results_queue))
    parse_process.daemon = True
    parse_process.start()
    text_content = results_queue.get()  # blocks until results are available
    parse_process.terminate()
    return text_content
 def workarounds_for_obfuscations(content):
    """
    Some sites are using sneaky tactics to make prices and other information un-renderable by Inscriptis
    This could go into its own Pip package in the future, for faster updates

    :param content: HTML text to clean up; falsy values (None, empty string) are returned unchanged
    :return: content with whitespace-only HTML comments removed
    """
    # HomeDepot.com style <span>$<!-- -->90<!-- -->.<!-- -->74</span>
    # https://github.com/weblyzard/inscriptis/issues/45
    if not content:
        return content
    # Raw string so that \s reaches the regex engine instead of being an invalid string escape
    content = re.sub(r'<!--\s+-->', '', content)
    return content
--- a/changedetectionio/model/App.py
+++ b/changedetectionio/model/App.py
@@ -1,30 +1,28 @@
-import collections
+from os import getenv
 import os
 import uuid as uuid_builder
 from changedetectionio.notification import (
    default_notification_body,
    default_notification_format,
    default_notification_title,
 )
 _FILTER_FAILURE_THRESHOLD_ATTEMPTS_DEFAULT = 6
 class model(dict):
    base_config = {
            'note': "Hello! If you change this file manually, please be sure to restart your changedetection.io instance!",
            'watching': {},
            'settings': {
                'headers': {
-                    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.66 Safari/537.36',
+                    'User-Agent': getenv("DEFAULT_SETTINGS_HEADERS_USERAGENT", 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.66 Safari/537.36'),
                    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
                    'Accept-Encoding': 'gzip, deflate',  # No support for brotli in python requests yet.
                    'Accept-Language': 'en-GB,en-US;q=0.9,en;'
                },
                'requests': {
-                    'timeout': 15,  # Default 15 seconds
+                    'timeout': int(getenv("DEFAULT_SETTINGS_REQUESTS_TIMEOUT", "45")),  # Default 45 seconds
                    'time_between_check': {'weeks': None, 'days': None, 'hours': 3, 'minutes': None, 'seconds': None},
                    'jitter_seconds': 0,
-                    'workers': 10,  # Number of threads, lower is better for slow connections
+                    'workers': int(getenv("DEFAULT_SETTINGS_REQUESTS_WORKERS", "10")),  # Number of threads, lower is better for slow connections
                    'proxy': None # Preferred proxy connection
                },
                'application': {
@@ -33,7 +31,8 @@ class model(dict):
                    'base_url' : None,
                    'extract_title_as_title': False,
                    'empty_pages_are_a_change': False,
-                    'fetch_backend': os.getenv("DEFAULT_FETCH_BACKEND", "html_requests"),
+                    'fetch_backend': getenv("DEFAULT_FETCH_BACKEND", "html_requests"),
                    'filter_failure_notification_threshold_attempts': _FILTER_FAILURE_THRESHOLD_ATTEMPTS_DEFAULT,
                    'global_ignore_text': [], # List of text to ignore when calculating the comparison checksum
                    'global_subtractive_selectors': [],
                    'ignore_whitespace': True,
--- a/changedetectionio/model/Watch.py
+++ b/changedetectionio/model/Watch.py
@@ -1,7 +1,9 @@
 import os
 import uuid as uuid_builder
 from distutils.util import strtobool
 minimum_seconds_recheck_time = int(os.getenv('MINIMUM_SECONDS_RECHECK_TIME', 60))
 mtable = {'seconds': 1, 'minutes': 60, 'hours': 3600, 'days': 86400, 'weeks': 86400 * 7}
 from changedetectionio.notification import (
    default_notification_body,
@@ -40,6 +42,8 @@ class model(dict):
            'trigger_text': [],  # List of text or regex to wait for until a change is detected
            'text_should_not_be_present': [], # Text that should not be present
            'fetch_backend': None,
            'filter_failure_notification_send': strtobool(os.getenv('FILTER_FAILURE_NOTIFICATION_SEND_DEFAULT', 'True')),
            'consecutive_filter_failures': 0, # Every time the CSS/xPath filter cannot be located, reset when all is fine.
            'extract_title_as_title': False,
            'check_unique_lines': False, # On change-detected, compare against all history if its something new
            'proxy': None, # Preferred proxy connection
@@ -47,10 +51,11 @@ class model(dict):
            # Requires setting to None on submit if it's the same as the default
            # Should be all None by default, so we use the system default in this case.
            'time_between_check': {'weeks': None, 'days': None, 'hours': None, 'minutes': None, 'seconds': None},
-            'webdriver_delay': None
+            'webdriver_delay': None,
            'webdriver_js_execute_code': None, # Run before change-detection
        }
    jitter_seconds = 0
-    mtable = {'seconds': 1, 'minutes': 60, 'hours': 3600, 'days': 86400, 'weeks': 86400 * 7}
+
    def __init__(self, *arg, **kw):
        import uuid
        self.update(self.__base_config)
@@ -159,7 +164,7 @@ class model(dict):
    def threshold_seconds(self):
        seconds = 0
-        for m, n in self.mtable.items():
+        for m, n in mtable.items():
            x = self.get('time_between_check', {}).get(m, None)
            if x:
                seconds += x * n
--- a/changedetectionio/notification.py
+++ b/changedetectionio/notification.py
@@ -34,7 +34,6 @@ def process_notification(n_object, datastore):
        valid_notification_formats[default_notification_format],
    )
    # Insert variables into the notification content
    notification_parameters = create_notification_parameters(n_object, datastore)
@@ -64,7 +63,7 @@ def process_notification(n_object, datastore):
                # So if no avatar_url is specified, add one so it can be correctly calculated into the total payload
                k = '?' if not '?' in url else '&'
-                if not 'avatar_url' in url:
+                if not 'avatar_url' in url and not url.startswith('mail'):
                    url += k + 'avatar_url=https://raw.githubusercontent.com/dgtlmoon/changedetection.io/master/changedetectionio/static/images/avatar-256x256.png'
                if url.startswith('tgram://'):
@@ -79,13 +78,21 @@ def process_notification(n_object, datastore):
                    n_title = n_title[0:payload_max_size]
                    n_body = n_body[0:body_limit]
-                elif url.startswith('discord://'):
+                elif url.startswith('discord://') or url.startswith('https://discordapp.com/api/webhooks'):
                    # real limit is 2000, but minus some for extra metadata
                    payload_max_size = 1700
                    body_limit = max(0, payload_max_size - len(n_title))
                    n_title = n_title[0:payload_max_size]
                    n_body = n_body[0:body_limit]
                elif url.startswith('mailto'):
                    # Apprise will default to HTML, so we need to override it
                    # So that what's generated in n_body is in line with what is going to be sent.
                    # https://github.com/caronc/apprise/issues/633#issuecomment-1191449321
                    if not 'format=' in url and (n_format == 'text' or n_format == 'markdown'):
                        prefix = '?' if not '?' in url else '&'
                        url = "{}{}format={}".format(url, prefix, n_format)
                apobj.add(url)
                apobj.notify(
--- a/changedetectionio/static/images/spread-white.svg
+++ b/changedetectionio/static/images/spread-white.svg
@@ -0,0 +1,20 @@
 <?xml version="1.0" encoding="UTF-8" standalone="no"?>
 <svg
   width="18"
   height="19.92"
   viewBox="0 0 18 19.92"
   version="1.1"
   id="svg6"
   xmlns="http://www.w3.org/2000/svg"
   xmlns:svg="http://www.w3.org/2000/svg">
  <defs
     id="defs10" />
  <path
     d="M -3,-2 H 21 V 22 H -3 Z"
     fill="none"
     id="path2" />
  <path
     d="m 15,14.08 c -0.76,0 -1.44,0.3 -1.96,0.77 L 5.91,10.7 C 5.96,10.47 6,10.24 6,10 6,9.76 5.96,9.53 5.91,9.3 L 12.96,5.19 C 13.5,5.69 14.21,6 15,6 16.66,6 18,4.66 18,3 18,1.34 16.66,0 15,0 c -1.66,0 -3,1.34 -3,3 0,0.24 0.04,0.47 0.09,0.7 L 5.04,7.81 C 4.5,7.31 3.79,7 3,7 1.34,7 0,8.34 0,10 c 0,1.66 1.34,3 3,3 0.79,0 1.5,-0.31 2.04,-0.81 l 7.12,4.16 c -0.05,0.21 -0.08,0.43 -0.08,0.65 0,1.61 1.31,2.92 2.92,2.92 1.61,0 2.92,-1.31 2.92,-2.92 0,-1.61 -1.31,-2.92 -2.92,-2.92 z"
     id="path4"
     style="fill:#ffffff;fill-opacity:1" />
 </svg>
--- a/changedetectionio/static/styles/.gitignore
+++ b/changedetectionio/static/styles/.gitignore
@@ -1 +1,3 @@
 node_modules
 package-lock.json
--- a/changedetectionio/static/styles/package-lock.json
+++ b/changedetectionio/static/styles/package-lock.json
--- a/changedetectionio/store.py
+++ b/changedetectionio/store.py
@@ -158,8 +158,7 @@ class ChangeDetectionStore:
    @property
    def threshold_seconds(self):
        seconds = 0
-        mtable = {'seconds': 1, 'minutes': 60, 'hours': 3600, 'days': 86400, 'weeks': 86400 * 7}
+        for m, n in Watch.mtable.items():
        for m, n in mtable.items():
            x = self.__data['settings']['requests']['time_between_check'].get(m)
            if x:
                seconds += x * n
@@ -298,7 +297,8 @@ class ChangeDetectionStore:
                          'ignore_text', 'css_filter',
                          'subtractive_selectors', 'trigger_text',
                          'extract_title_as_title', 'extract_text',
-                          'text_should_not_be_present']:
+                          'text_should_not_be_present',
                          'webdriver_js_execute_code']:
                    if res.get(k):
                        apply_extras[k] = res[k]
--- a/changedetectionio/templates/edit.html
+++ b/changedetectionio/templates/edit.html
@@ -25,7 +25,7 @@
        <ul>
            <li class="tab" id="default-tab"><a href="#general">General</a></li>
            <li class="tab"><a href="#request">Request</a></li>
-            <li class="tab"><a id="visualselector-tab" href="#visualselector">Visual Selector</a></li>
+            <li class="tab"><a id="visualselector-tab" href="#visualselector">Visual Filter Selector</a></li>
            <li class="tab"><a href="#filters-and-triggers">Filters &amp; Triggers</a></li>
            <li class="tab"><a href="#notifications">Notifications</a></li>
        </ul>
@@ -62,6 +62,12 @@
                    <div class="pure-control-group">
                        {{ render_checkbox_field(form.extract_title_as_title) }}
                    </div>
                    <div class="pure-control-group">
                        {{ render_checkbox_field(form.filter_failure_notification_send) }}
                        <span class="pure-form-message-inline">
                         Sends a notification when the filter can no longer be seen on the page, good for knowing when the page changed and your filter will not work anymore.
                        </span>
                    </div>
                </fieldset>
            </div>
@@ -88,14 +94,17 @@
                            <strong>If you're having trouble waiting for the page to be fully rendered (text missing etc), try increasing the 'wait' time here.</strong>
                            <br/>
                            This will wait <i>n</i> seconds before extracting the text.
                            {% if using_global_webdriver_wait %}
                            <br/><strong>Using the current global default settings</strong>
                            {% endif %}
                        </div>
                    </div>
-                    {% if using_global_webdriver_wait %}
+                    <div class="pure-control-group">
-                    <div class="pure-form-message-inline">
+                        {{ render_field(form.webdriver_js_execute_code) }}
-                        <strong>Using the current global default settings</strong>
+                        <div class="pure-form-message-inline">
                            Run this code before performing change detection, handy for filling in fields and other actions <a href="https://github.com/dgtlmoon/changedetection.io/wiki/Run-JavaScript-before-change-detection">More help and examples here</a>
                        </div>
                    </div>
                    {% endif %}
                </fieldset>
                <fieldset class="pure-group" id="requests-override-options">
                    {% if not playwright_enabled %}
@@ -187,7 +196,7 @@ nav
                    <span class="pure-form-message-inline">
                        <ul>
                            <li>Each line processed separately, any line matching will be ignored (removed before creating the checksum)</li>
-                            <li>Regular Expression support, wrap the line in forward slash <code>/regex/</code></li>
+                            <li>Regular Expression support, wrap the entire line in forward slash <code>/regex/</code></li>
                            <li>Changing this will affect the comparison checksum which may trigger an alert</li>
                            <li>Use the preview/show current tab to see ignores</li>
                        </ul>
@@ -230,8 +239,15 @@ Unavailable") }}
                        {{ render_field(form.extract_text, rows=5, placeholder="\d+ online") }}
                        <span class="pure-form-message-inline">
                    <ul>
-                        <li>Extracts text in the final output after other filters using regular expressions, for example <code>\d+ online</code></li>
+                        <li>Extracts text in the final output (line by line) after other filters using regular expressions;
-                        <li>One line per regular-expression.</li>
+                            <ul>
                                <li>Regular expression &dash; example <code>/reports.+?2022/i</code></li>
                                <li>Use <code>//(?aiLmsux))</code> type flags (more <a href="https://docs.python.org/3/library/re.html#index-15">information here</a>)<br/></li>
                                <li>Keyword example &dash; example <code>Out of stock</code></li>
                                <li>Use groups to extract just that text &dash; example <code>/reports.+?(\d+)/i</code> returns a list of years only</li>
                            </ul>
                        </li>
                        <li>One line per regular-expression/ string match</li>
                    </ul>
                        </span>
                    </div>
@@ -240,7 +256,7 @@ Unavailable") }}
            <div class="tab-pane-inner visual-selector-ui" id="visualselector">
                <img id="beta-logo" src="{{url_for('static_content', group='images', filename='beta-logo.png')}}">
-
+                <strong>Pro-tip:</strong> This tool is only for limiting which elements will be included on a change-detection, not for interacting with browser directly.
                <fieldset>
                    <div class="pure-control-group">
                        {% if visualselector_enabled %}
--- a/changedetectionio/templates/settings.html
+++ b/changedetectionio/templates/settings.html
@@ -36,7 +36,13 @@
                        {{ render_field(form.requests.form.jitter_seconds, class="jitter_seconds") }}
                        <span class="pure-form-message-inline">Example - 3 seconds random jitter could trigger up to 3 seconds earlier or up to 3 seconds later</span>
                    </div>
-
+                    <div class="pure-control-group">
                        {{ render_field(form.application.form.filter_failure_notification_threshold_attempts, class="filter_failure_notification_threshold_attempts") }}
                        <span class="pure-form-message-inline">After this many consecutive times that the CSS/xPath filter is missing, send a notification
                            <br/>
                        Set to <strong>0</strong> to disable
                        </span>
                    </div>
                    <div class="pure-control-group">
                        {% if not hide_remove_pass %}
                            {% if current_user.is_authenticated %}
@@ -148,7 +154,7 @@ nav
                        <ul>
                            <li>Note: This is applied globally in addition to the per-watch rules.</li>
                            <li>Each line processed separately, any line matching will be ignored (removed before creating the checksum)</li>
-                            <li>Regular Expression support, wrap the line in forward slash <code>/regex/</code></li>
+                            <li>Regular Expression support, wrap the entire line in forward slash <code>/regex/</code></li>
                            <li>Changing this will affect the comparison checksum which may trigger an alert</li>
                            <li>Use the preview/show current tab to see ignores</li>
                        </ul>
--- a/changedetectionio/templates/watch-overview.html
+++ b/changedetectionio/templates/watch-overview.html
@@ -14,7 +14,7 @@
                {{ render_simple_field(form.tag, value=active_tag if active_tag else '', placeholder="watch group") }}
            <button type="submit" class="pure-button pure-button-primary">Watch</button>
        </fieldset>
-        <span style="color:#eee; font-size: 80%;"><img style="height: 1em;display:inline-block;" src="{{url_for('static_content', group='images', filename='spread.svg')}}" /> Tip: You can also add 'shared' watches. <a href="https://github.com/dgtlmoon/changedetection.io/wiki/Sharing-a-Watch">More info</a></a></span>
+        <span style="color:#eee; font-size: 80%;"><img style="height: 1em;display:inline-block;" src="{{url_for('static_content', group='images', filename='spread-white.svg')}}" /> Tip: You can also add 'shared' watches. <a href="https://github.com/dgtlmoon/changedetection.io/wiki/Sharing-a-Watch">More info</a></span>
    </form>
    <div>
        <a href="{{url_for('index')}}" class="pure-button button-tag {{'active' if not active_tag }}">All</a>
--- a/changedetectionio/tests/test_extract_regex.py
+++ b/changedetectionio/tests/test_extract_regex.py
@@ -15,7 +15,7 @@ def set_original_response():
     </br>
     So let's see what happens.  </br>
     <div id="sametext">Some text thats the same</div>
-     <div id="changetext">Some text that will change</div>
+     <div class="changetext">Some text that will change</div>     
     </body>
     </html>
    """
@@ -33,7 +33,8 @@ def set_modified_response():
     </br>
     So let's see what happens.  </br>
     <div id="sametext">Some text thats the same</div>
-     <div id="changetext">Some text that did change ( 1000 online <br/> 80 guests)</div>
+     <div class="changetext">Some text that did change ( 1000 online <br/> 80 guests<br/>  2000 online )</div>
     <div class="changetext">SomeCase insensitive 3456</div>
     </body>
     </html>
    """
@@ -44,11 +45,78 @@ def set_modified_response():
    return None
-def test_check_filter_and_regex_extract(client, live_server):
+def set_multiline_response():
-    sleep_time_for_fetch_thread = 3
+    test_return_data = """<html>
       <body>
     <p>Something <br/>
        across 6 billion multiple<br/>
        lines
     </p>
     <div>aaand something lines</div>
     </body>
     </html>
    """
    with open("test-datastore/endpoint-content.txt", "w") as f:
        f.write(test_return_data)
    return None
 def test_setup(client, live_server):
    # Hands the live_server fixture to live_server_setup() (imported from elsewhere in this
    # module; not shown here) - presumably this must run before the tests below can reach
    # the 'test_endpoint' URL. TODO confirm against .util
    live_server_setup(live_server)
-    css_filter = "#changetext"
+
 def test_check_filter_multiline(client, live_server):
    # Checks that an 'extract_text' regex can match across several lines of the page
    # and that the non-greedy '.+?' stops at the first "lines", not the last one.
    set_multiline_response()
    # Import the test endpoint URL as a new watch
    test_url = url_for('test_endpoint', _external=True)
    res = client.post(
        url_for("import_page"),
        data={"urls": test_url},
        follow_redirects=True
    )
    assert b"1 Imported" in res.data
    # Wait for the first fetch to complete
    time.sleep(3)
    # Edit the watch: no CSS filter, only a /regex/ in extract_text.
    # The trailing 'si' are regex flags (in Python's re: s = dot matches newlines, i = ignore case)
    res = client.post(
        url_for("edit_page", uuid="first"),
        data={"css_filter": '',
              'extract_text': '/something.+?6 billion.+?lines/si',
              "url": test_url,
              "tag": "",
              "headers": "",
              'fetch_backend': "html_requests"
              },
        follow_redirects=True
    )
    assert b"Updated watch." in res.data
    # Wait again so the preview below reflects the edited watch
    time.sleep(3)
    res = client.get(
        url_for("preview_page", uuid="first"),
        follow_redirects=True
    )
    # Each line of the multi-line match is expected as its own <div> in the preview
    assert b'<div class="">Something' in res.data
    assert b'<div class="">across 6 billion multiple' in res.data
    assert b'<div class="">lines' in res.data
    # The later text that also ends in 'lines' must not be here (checks the match is non-greedy)
    assert b'aaand something lines' not in res.data
 def test_check_filter_and_regex_extract(client, live_server):
    sleep_time_for_fetch_thread = 3
    css_filter = ".changetext"
    set_original_response()
@@ -64,6 +132,7 @@ def test_check_filter_and_regex_extract(client, live_server):
    )
    assert b"1 Imported" in res.data
    time.sleep(1)
    # Trigger a check
    client.get(url_for("form_watch_checknow"), follow_redirects=True)
@@ -75,7 +144,7 @@ def test_check_filter_and_regex_extract(client, live_server):
    res = client.post(
        url_for("edit_page", uuid="first"),
        data={"css_filter": css_filter,
-              'extract_text': '\d+ online\n\d+ guests',
+              'extract_text': '\d+ online\r\n\d+ guests\r\n/somecase insensitive \d+/i\r\n/somecase insensitive (345\d)/i',
              "url": test_url,
              "tag": "",
              "headers": "",
@@ -86,15 +155,6 @@ def test_check_filter_and_regex_extract(client, live_server):
    assert b"Updated watch." in res.data
    # Check it saved
    res = client.get(
        url_for("edit_page", uuid="first"),
    )
    assert b'\d+ online' in res.data
    # Trigger a check
 #    client.get(url_for("form_watch_checknow"), follow_redirects=True)
    # Give the thread time to pick it up
    time.sleep(sleep_time_for_fetch_thread)
@@ -119,9 +179,20 @@ def test_check_filter_and_regex_extract(client, live_server):
    # Class will be blank for now because the frontend didnt apply the diff
    assert b'<div class="">1000 online' in res.data
-    
+
    # All regex matching should be here
    assert b'<div class="">2000 online' in res.data
    # Both regexs should be here
    assert b'<div class="">80 guests' in res.data
    # Regex with flag handling should be here
    assert b'<div class="">SomeCase insensitive 3456' in res.data
    # Singular group from /somecase insensitive (345\d)/i
    assert b'<div class="">3456' in res.data
    # Regex with multiline flag handling should be here
    # Should not be here
-    assert b'Some text that did change' not in res.data
+    assert b'Some text that did change' not in res.data
--- a/changedetectionio/tests/test_filter_failure_notification.py
+++ b/changedetectionio/tests/test_filter_failure_notification.py
@@ -0,0 +1,134 @@
 import os
 import time
 import re
 from flask import url_for
 from .util import set_original_response, live_server_setup
 from changedetectionio.model import App
 def set_response_with_filter():
    """Write an endpoint page that contains the ``nope-doesnt-exist`` element.

    The page is written to ``test-datastore/endpoint-content.txt``; the
    function returns ``None``.
    """
    page_html = (
        "<html>\n"
        "       <body>\n"
        "     Some initial text</br>\n"
        "     <p>Which is across multiple lines</p>\n"
        "     </br>\n"
        "     So let's see what happens.  </br>\n"
        "     <div id=\"nope-doesnt-exist\">Some text thats the same</div>     \n"
        "     </body>\n"
        "     </html>\n"
        "    "
    )
    with open("test-datastore/endpoint-content.txt", "w") as endpoint_file:
        endpoint_file.write(page_html)
 def run_filter_test(client, content_filter):
    """Shared scenario for the "filter not found" notification tests.

    Adds a watch for the test endpoint, configures it with *content_filter*
    (a CSS selector or xPath expression) plus a notification URL, then checks that:

    1. no notification file exists before the failure threshold is reached,
    2. after App._FILTER_FAILURE_THRESHOLD_ATTEMPTS_DEFAULT manual re-checks a
       notification mentioning the missing filter has been written,
    3. once the page contains the element again, a notification is written that
       does not mention the missing filter.
    """
    # Give the endpoint time to spin up
    time.sleep(1)
    # Add the test endpoint URL as a new watch
    test_url = url_for('test_endpoint', _external=True)
    res = client.post(
        url_for("form_watch_add"),
        data={"url": test_url, "tag": ''},
        follow_redirects=True
    )
    assert b"Watch added" in res.data
    # Give the thread time to pick up the first version
    time.sleep(3)
    # Build the notification URL: same endpoint address, but with the 'http' prefix
    # swapped for 'json' (presumably selects the JSON notification handler - TODO confirm)
    url = url_for('test_notification_endpoint', _external=True)
    notification_url = url.replace('http', 'json')
    print(">>>> Notification URL: " + notification_url)
    # Just a regular notification setting, this will be used by the special 'filter not found' notification
    notification_form_data = {"notification_urls": notification_url,
                              "notification_title": "New ChangeDetection.io Notification - {watch_url}",
                              "notification_body": "BASE URL: {base_url}\n"
                                                   "Watch URL: {watch_url}\n"
                                                   "Watch UUID: {watch_uuid}\n"
                                                   "Watch title: {watch_title}\n"
                                                   "Watch tag: {watch_tag}\n"
                                                   "Preview: {preview_url}\n"
                                                   "Diff URL: {diff_url}\n"
                                                   "Snapshot: {current_snapshot}\n"
                                                   "Diff: {diff}\n"
                                                   "Diff Full: {diff_full}\n"
                                                   ":-)",
                              "notification_format": "Text"}
    # Add the watch settings, including the filter under test, to the same form post
    notification_form_data.update({
        "url": test_url,
        "tag": "my tag",
        "title": "my title",
        "headers": "",
        "css_filter": content_filter,
        "fetch_backend": "html_requests"})
    res = client.post(
        url_for("edit_page", uuid="first"),
        data=notification_form_data,
        follow_redirects=True
    )
    assert b"Updated watch." in res.data
    time.sleep(3)
    # Now the notification should not exist, because we didn't reach the threshold
    assert not os.path.isfile("test-datastore/notification.txt")
    # Trigger one manual re-check per threshold attempt, waiting for each to be processed
    for i in range(0, App._FILTER_FAILURE_THRESHOLD_ATTEMPTS_DEFAULT):
        res = client.get(url_for("form_watch_checknow"), follow_redirects=True)
        time.sleep(3)
    # We should see something in the frontend ('res' is the response of the last re-check above)
    assert b'Did the page change its layout' in res.data
    # Now it should exist and contain our "filter not found" alert
    assert os.path.isfile("test-datastore/notification.txt")
    notification = False
    with open("test-datastore/notification.txt", 'r') as f:
        notification = f.read()
    assert 'CSS/xPath filter was not present in the page' in notification
    # Double quotes in the filter (the xPath case) are looked up backslash-escaped;
    # presumably because the notification is stored JSON-encoded - TODO confirm
    assert content_filter.replace('"', '\\"') in notification
    # Remove it and prove that it doesn't trigger when not expected
    os.unlink("test-datastore/notification.txt")
    # Serve a page that does contain the filtered element
    set_response_with_filter()
    for i in range(0, App._FILTER_FAILURE_THRESHOLD_ATTEMPTS_DEFAULT):
        client.get(url_for("form_watch_checknow"), follow_redirects=True)
        time.sleep(3)
    # It should have sent a notification, but..
    assert os.path.isfile("test-datastore/notification.txt")
    # but it should not contain the info about the failed filter
    with open("test-datastore/notification.txt", 'r') as f:
        notification = f.read()
    assert not 'CSS/xPath filter was not present in the page' in notification
    # Cleanup for the next test: delete all watches and the notification file
    client.get(
        url_for("form_delete", uuid="all"),
        follow_redirects=True
    )
    os.unlink("test-datastore/notification.txt")
 def test_setup(live_server):
    # Hands the live_server fixture to live_server_setup() (imported from .util; its
    # behavior is not shown here) before the notification tests below run.
    live_server_setup(live_server)
 def test_check_css_filter_failure_notification(client, live_server):
    # CSS selector variant of the filter-failure scenario.
    # Reset the endpoint content first via set_original_response() (imported from .util)
    set_original_response()
    time.sleep(1)
    run_filter_test(client, '#nope-doesnt-exist')
 def test_check_xpath_filter_failure_notification(client, live_server):
    # xPath variant of the filter-failure scenario; targets the same element id as the CSS test.
    # Reset the endpoint content first via set_original_response() (imported from .util)
    set_original_response()
    time.sleep(1)
    run_filter_test(client, '//*[@id="nope-doesnt-exist"]')
--- a/changedetectionio/tests/test_obfuscations.py
+++ b/changedetectionio/tests/test_obfuscations.py
@@ -0,0 +1,43 @@
 #!/usr/bin/python3
 import time
 from flask import url_for
 from .util import live_server_setup
 def set_original_ignore_response():
    """Write an endpoint page whose price is split up by empty HTML comments.

    The page is written to ``test-datastore/endpoint-content.txt``.
    """
    page_html = (
        "<html>\n"
        "       <body>\n"
        "     <span>The price is</span><span>$<!-- -->90<!-- -->.<!-- -->74</span>\n"
        "     </body>\n"
        "     </html>\n"
        "    "
    )
    with open("test-datastore/endpoint-content.txt", "w") as endpoint_file:
        endpoint_file.write(page_html)
 def test_obfuscations(client, live_server):
    # The endpoint page splits the price with empty HTML comments
    # ("$<!-- -->90<!-- -->.<!-- -->74"); the preview must show it joined as "$90.74".
    set_original_ignore_response()
    live_server_setup(live_server)
    time.sleep(1)
    # Add our URL to the import page
    test_url = url_for('test_endpoint', _external=True)
    res = client.post(
        url_for("import_page"),
        data={"urls": test_url},
        follow_redirects=True
    )
    assert b"1 Imported" in res.data
    # Give the thread time to pick it up
    time.sleep(3)
    # Check that the HTML-to-text conversion worked and the comments did not split the price
    res = client.get(
        url_for("preview_page", uuid="first"),
        follow_redirects=True
    )
    assert b'$90.74' in res.data
--- a/changedetectionio/update_worker.py
+++ b/changedetectionio/update_worker.py
@@ -3,173 +3,215 @@ import queue
 import time
 from changedetectionio import content_fetcher
 from changedetectionio.html_tools import FilterNotFoundInResponse
 # A single update worker
 #
-# Requests for checking on a single site(watch) from a queue of watches
+#
 # (another process inserts watches into the queue that are time-ready for checking)
 class update_worker(threading.Thread):
    current_uuid = None
-    def __init__(self, q, notification_q, app, datastore, *args, **kwargs):
+    def __init__(self, q, notification_q, app, datastore, uuid, *args, **kwargs):
        self.q = q
        self.app = app
        self.notification_q = notification_q
        self.datastore = datastore
        self.current_uuid = uuid
        super().__init__(*args, **kwargs)
        self.name = "update_worker"
    def send_filter_failure_notification(self, uuid):
        """Queue a "CSS/xPath filter was not present" alert for one watch.

        The notification URLs configured on the watch are used when present,
        otherwise the global application notification URLs. When neither is
        configured nothing is queued.

        :param uuid: UUID key of the watch in ``self.datastore.data['watching']``.
        :return: None. If the watch no longer exists the call is a no-op.
        """
        app_settings = self.datastore.data['settings']['application']
        threshold = app_settings.get('filter_failure_notification_threshold_attempts')

        watch = self.datastore.data['watching'].get(uuid)
        if watch is None:
            # The watch can disappear (e.g. deleted) while it is being checked;
            # subscripting the missing entry would raise TypeError, so bail out.
            return

        # {{...}} placeholders survive .format() as {...} so they can be filled in later
        n_object = {'notification_title': 'Changedetection.io - Alert - CSS/xPath filter was not present in the page',
                    'notification_body': "Your configured CSS/xPath filter of '{}' for {{watch_url}} did not appear on the page after {} attempts, did the page change layout?\n\nLink: {{base_url}}/edit/{{watch_uuid}}\n\nThanks - Your omniscient changedetection.io installation :)\n".format(
                        watch['css_filter'],
                        threshold),
                    'notification_format': 'text'}

        # Per-watch notification URLs take priority over the global ones
        if watch['notification_urls']:
            n_object['notification_urls'] = watch['notification_urls']
        elif app_settings['notification_urls']:
            n_object['notification_urls'] = app_settings['notification_urls']

        # Only prepare to notify if the rules above matched
        if 'notification_urls' in n_object:
            n_object.update({
                'watch_url': watch['url'],
                'uuid': uuid
            })
            self.notification_q.put(n_object)
            print("Sent filter not found notification for {}".format(uuid))
    # Pick one job off the list, process it in this thread, then exit
    def run(self):
        """Thread entry point: process the single assigned watch, then let the thread end.

        ``self.q.task_done()`` is always called, even when the site update raises,
        so the queue's unfinished-task count stays balanced; the exception itself
        still propagates.
        """
        try:
            # Go talk to the website
            self.perform_site_update()
        finally:
            self.current_uuid = None  # Done
            self.q.task_done()

        # Let the thread die after processing 1
        # We will launch nice juicy fresh threads every time to prevent memory leaks in complex runner code (playwright etc)
        print("EXITING THREAD!")
        # Waits up to 1 second on the application's exit event before returning
        self.app.config.exit.wait(1)
    def perform_site_update(self):
        from changedetectionio import fetch_site_status
        if not self.current_uuid in list(self.datastore.data['watching'].keys()):
            return
        changed_detected = False
        contents = ""
        screenshot = False
        update_obj= {}
        xpath_data = False
        now = time.time()
        update_handler = fetch_site_status.perform_site_check(datastore=self.datastore)
        try:
            changed_detected, update_obj, contents, screenshot, xpath_data = update_handler.run(self.current_uuid)
            # Re #342
            # In Python 3, all strings are sequences of Unicode characters. There is a bytes type that holds raw bytes.
            # We then convert/.decode('utf-8') for the notification etc
            if not isinstance(contents, (bytes, bytearray)):
                raise Exception("Error - returned data from the fetch handler SHOULD be bytes")
        except PermissionError as e:
            self.app.logger.error("File permission error updating", self.current_uuid, str(e))
        except content_fetcher.ReplyWithContentButNoText as e:
            # Totally fine, it's by choice - just continue on, nothing more to care about
            # Page had elements/content but no renderable text
            self.datastore.update_watch(uuid=self.current_uuid, update_obj={'last_error': "Got HTML content but no text found."})
        except FilterNotFoundInResponse as e:
            err_text = "Filter '{}' not found - Did the page change its layout?".format(str(e))
            c = 0
            if self.datastore.data['watching'].get(self.current_uuid, False):
                c = self.datastore.data['watching'][self.current_uuid].get('consecutive_filter_failures', 5)
            c += 1
-        while not self.app.config.exit.is_set():
+            # Send notification if we reached the threshold?
            threshold = self.datastore.data['settings']['application'].get('filter_failure_notification_threshold_attempts', 0)
            print("Filter for {} not found, consecutive_filter_failures: {}".format(self.current_uuid, c))
            if threshold >0 and c >= threshold:
                self.send_filter_failure_notification(self.current_uuid)
                c = 0
            self.datastore.update_watch(uuid=self.current_uuid, update_obj={'last_error': err_text,
                                                               'consecutive_filter_failures': c})
        except content_fetcher.EmptyReply as e:
            # Some kind of custom to-str handler in the exception handler that does this?
            err_text = "EmptyReply - try increasing 'Wait seconds before extracting text', Status Code {}".format(e.status_code)
            self.datastore.update_watch(uuid=self.current_uuid, update_obj={'last_error': err_text,
                                                               'last_check_status': e.status_code})
        except content_fetcher.ScreenshotUnavailable as e:
            err_text = "Screenshot unavailable, page did not render fully in the expected time - try increasing 'Wait seconds before extracting text'"
            self.datastore.update_watch(uuid=self.current_uuid, update_obj={'last_error': err_text,
                                                               'last_check_status': e.status_code})
        except content_fetcher.PageUnloadable as e:
            err_text = "Page request from server didnt respond correctly"
            self.datastore.update_watch(uuid=self.current_uuid, update_obj={'last_error': err_text,
                                                               'last_check_status': e.status_code})
        except Exception as e:
            self.app.logger.error("Exception reached processing watch UUID: %s - %s", self.current_uuid, str(e))
            self.datastore.update_watch(uuid=self.current_uuid, update_obj={'last_error': str(e)})
        else:
            try:
-                uuid = self.q.get(block=False)
+                watch = self.datastore.data['watching'][self.current_uuid]
-            except queue.Empty:
+                fname = "" # Saved history text filename
                pass
-            else:
+                # For the FIRST time we check a site, or a change detected, save the snapshot.
-                self.current_uuid = uuid
+                if changed_detected or not watch['last_checked']:
                    # A change was detected
                    fname = watch.save_history_text(contents=contents, timestamp=str(round(time.time())))
-                if uuid in list(self.datastore.data['watching'].keys()):
+                # Generally update anything interesting returned
                update_obj['consecutive_filter_failures'] = 0
                self.datastore.update_watch(uuid=self.current_uuid, update_obj=update_obj)
-                    changed_detected = False
+                # A change was detected
-                    contents = ""
+                if changed_detected:
-                    screenshot = False
+                    n_object = {}
-                    update_obj= {}
+                    print (">> Change detected in UUID {} - {}".format(self.current_uuid, watch['url']))
                    xpath_data = False
                    now = time.time()
-                    try:
+                    # Notifications should only trigger on the second time (first time, we gather the initial snapshot)
-                        changed_detected, update_obj, contents, screenshot, xpath_data = update_handler.run(uuid)
+                    if watch.history_n >= 2:
-                        # Re #342
+                        # At least 2 means there really was a change
-                        # In Python 3, all strings are sequences of Unicode characters. There is a bytes type that holds raw bytes.
+                        self.datastore.update_watch(uuid=self.current_uuid, update_obj={'last_changed': round(now)})
-                        # We then convert/.decode('utf-8') for the notification etc
+
-                        if not isinstance(contents, (bytes, bytearray)):
+                        watch_history = watch.history
-                            raise Exception("Error - returned data from the fetch handler SHOULD be bytes")
+                        dates = list(watch_history.keys())
-                    except PermissionError as e:
+                        # Theoretically it's possible that this could be just 1 long,
-                        self.app.logger.error("File permission error updating", uuid, str(e))
+                        # - In the case that the timestamp key was not unique
-                    except content_fetcher.ReplyWithContentButNoText as e:
+                        if len(dates) == 1:
-                        # Totally fine, it's by choice - just continue on, nothing more to care about
+                            raise ValueError(
-                        # Page had elements/content but no renderable text
+                                "History index had 2 or more, but only 1 date loaded, timestamps were not unique? maybe two of the same timestamps got written, needs more delay?"
-                        if self.datastore.data['watching'].get(uuid, False) and self.datastore.data['watching'][uuid].get('css_filter'):
+                            )
-                            self.datastore.update_watch(uuid=uuid, update_obj={'last_error': "Got HTML content but no text found (CSS / xPath Filter not found in page?)"})
+                        prev_fname = watch_history[dates[-2]]
                        # Did it have any notification alerts to hit?
                        if len(watch['notification_urls']):
                            print(">>> Notifications queued for UUID from watch {}".format(self.current_uuid))
                            n_object['notification_urls'] = watch['notification_urls']
                            n_object['notification_title'] = watch['notification_title']
                            n_object['notification_body'] = watch['notification_body']
                            n_object['notification_format'] = watch['notification_format']
                        # No? maybe theres a global setting, queue them all
                        elif len(self.datastore.data['settings']['application']['notification_urls']):
                            print(">>> Watch notification URLs were empty, using GLOBAL notifications for UUID: {}".format(self.current_uuid))
                            n_object['notification_urls'] = self.datastore.data['settings']['application']['notification_urls']
                            n_object['notification_title'] = self.datastore.data['settings']['application']['notification_title']
                            n_object['notification_body'] = self.datastore.data['settings']['application']['notification_body']
                            n_object['notification_format'] = self.datastore.data['settings']['application']['notification_format']
                        else:
-                            self.datastore.update_watch(uuid=uuid, update_obj={'last_error': "Got HTML content but no text found."})
+                            print(">>> NO notifications queued, watch and global notification URLs were empty.")
                        pass
                    except content_fetcher.EmptyReply as e:
                        # Some kind of custom to-str handler in the exception handler that does this?
                        err_text = "EmptyReply - try increasing 'Wait seconds before extracting text', Status Code {}".format(e.status_code)
                        self.datastore.update_watch(uuid=uuid, update_obj={'last_error': err_text,
                                                                           'last_check_status': e.status_code})
                    except content_fetcher.ScreenshotUnavailable as e:
                        err_text = "Screenshot unavailable, page did not render fully in the expected time - try increasing 'Wait seconds before extracting text'"
                        self.datastore.update_watch(uuid=uuid, update_obj={'last_error': err_text,
                                                                           'last_check_status': e.status_code})
                    except content_fetcher.PageUnloadable as e:
                        err_text = "Page request from server didnt respond correctly"
                        self.datastore.update_watch(uuid=uuid, update_obj={'last_error': err_text,
                                                                           'last_check_status': e.status_code})
-                    except Exception as e:
+                        # Only prepare to notify if the rules above matched
-                        self.app.logger.error("Exception reached processing watch UUID: %s - %s", uuid, str(e))
+                        if 'notification_urls' in n_object:
-                        self.datastore.update_watch(uuid=uuid, update_obj={'last_error': str(e)})
+                            # HTML needs linebreak, but MarkDown and Text can use a linefeed
                            if n_object['notification_format'] == 'HTML':
                                line_feed_sep = "</br>"
                            else:
                                line_feed_sep = "\n"
-                    else:
+                            from changedetectionio import diff
-                        try:
+                            n_object.update({
-                            watch = self.datastore.data['watching'][uuid]
+                                'watch_url': watch['url'],
-                            fname = "" # Saved history text filename
+                                'uuid': self.current_uuid,
                                'current_snapshot': contents.decode('utf-8'),
                                'diff': diff.render_diff(prev_fname, fname, line_feed_sep=line_feed_sep),
                                'diff_full': diff.render_diff(prev_fname, fname, True, line_feed_sep=line_feed_sep)
                            })
-                            # For the FIRST time we check a site, or a change detected, save the snapshot.
+                            self.notification_q.put(n_object)
                            if changed_detected or not watch['last_checked']:
                                # A change was detected
                                fname = watch.save_history_text(contents=contents, timestamp=str(round(time.time())))
-                            # Generally update anything interesting returned
+            except Exception as e:
-                            self.datastore.update_watch(uuid=uuid, update_obj=update_obj)
+                # Catch everything possible here, so that if a worker crashes, we don't lose it until restart!
                print("!!!! Exception in update_worker !!!\n", e)
                self.app.logger.error("Exception reached processing watch UUID: %s - %s", self.current_uuid, str(e))
                self.datastore.update_watch(uuid=self.current_uuid, update_obj={'last_error': str(e)})
-                            # A change was detected
+        finally:
-                            if changed_detected:
+            # Always record that we at least tried
-                                n_object = {}
+            self.datastore.update_watch(uuid=self.current_uuid, update_obj={'fetch_time': round(time.time() - now, 3),
-                                print (">> Change detected in UUID {} - {}".format(uuid, watch['url']))
+                                                               'last_checked': round(time.time())})
-                                # Notifications should only trigger on the second time (first time, we gather the initial snapshot)
+            # Always save the screenshot if it's available
-                                if watch.history_n >= 2:
+            if screenshot:
-                                    # Atleast 2, means there really was a change
+                self.datastore.save_screenshot(watch_uuid=self.current_uuid, screenshot=screenshot)
-                                    self.datastore.update_watch(uuid=uuid, update_obj={'last_changed': round(now)})
+            if xpath_data:
-
+                self.datastore.save_xpath_data(watch_uuid=self.current_uuid, data=xpath_data)
                                    watch_history = watch.history
                                    dates = list(watch_history.keys())
                                    # Theoretically it's possible that this could be just 1 long,
                                    # - In the case that the timestamp key was not unique
                                    if len(dates) == 1:
                                        raise ValueError(
                                            "History index had 2 or more, but only 1 date loaded, timestamps were not unique? maybe two of the same timestamps got written, needs more delay?"
                                        )
                                    prev_fname = watch_history[dates[-2]]
                                    # Did it have any notification alerts to hit?
                                    if len(watch['notification_urls']):
                                        print(">>> Notifications queued for UUID from watch {}".format(uuid))
                                        n_object['notification_urls'] = watch['notification_urls']
                                        n_object['notification_title'] = watch['notification_title']
                                        n_object['notification_body'] = watch['notification_body']
                                        n_object['notification_format'] = watch['notification_format']
                                    # No? Maybe there's a global setting, queue them all
                                    elif len(self.datastore.data['settings']['application']['notification_urls']):
                                        print(">>> Watch notification URLs were empty, using GLOBAL notifications for UUID: {}".format(uuid))
                                        n_object['notification_urls'] = self.datastore.data['settings']['application']['notification_urls']
                                        n_object['notification_title'] = self.datastore.data['settings']['application']['notification_title']
                                        n_object['notification_body'] = self.datastore.data['settings']['application']['notification_body']
                                        n_object['notification_format'] = self.datastore.data['settings']['application']['notification_format']
                                    else:
                                        print(">>> NO notifications queued, watch and global notification URLs were empty.")
                                    # Only prepare to notify if the rules above matched
                                    if 'notification_urls' in n_object:
                                        # HTML needs linebreak, but MarkDown and Text can use a linefeed
                                        if n_object['notification_format'] == 'HTML':
                                            line_feed_sep = "</br>"
                                        else:
                                            line_feed_sep = "\n"
                                        from changedetectionio import diff
                                        n_object.update({
                                            'watch_url': watch['url'],
                                            'uuid': uuid,
                                            'current_snapshot': contents.decode('utf-8'),
                                            'diff': diff.render_diff(prev_fname, fname, line_feed_sep=line_feed_sep),
                                            'diff_full': diff.render_diff(prev_fname, fname, True, line_feed_sep=line_feed_sep)
                                        })
                                        self.notification_q.put(n_object)
                        except Exception as e:
                            # Catch everything possible here, so that if a worker crashes, we don't lose it until restart!
                            print("!!!! Exception in update_worker !!!\n", e)
                            self.app.logger.error("Exception reached processing watch UUID: %s - %s", uuid, str(e))
                            self.datastore.update_watch(uuid=uuid, update_obj={'last_error': str(e)})
                    finally:
                        # Always record that we at least tried
                        self.datastore.update_watch(uuid=uuid, update_obj={'fetch_time': round(time.time() - now, 3),
                                                                           'last_checked': round(time.time())})
                        # Always save the screenshot if it's available
                        if screenshot:
                            self.datastore.save_screenshot(watch_uuid=uuid, screenshot=screenshot)
                        if xpath_data:
                            self.datastore.save_xpath_data(watch_uuid=uuid, data=xpath_data)
                self.current_uuid = None  # Done
                self.q.task_done()
                # Give the CPU time to interrupt
                time.sleep(0.1)
            self.app.config.exit.wait(1)
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -24,7 +24,7 @@ services:
  #             https://selenium-python.readthedocs.io/api.html#module-selenium.webdriver.common.proxy
  #
  #       Alternative Playwright URL, do not use "'s or 's!
-  #      - PLAYWRIGHT_DRIVER_URL=ws://playwright-chrome:3000/
+  #      - PLAYWRIGHT_DRIVER_URL=ws://playwright-chrome:3000/?stealth=1&--disable-web-security=true
  #
  #       Playwright proxy settings playwright_proxy_server, playwright_proxy_bypass, playwright_proxy_username, playwright_proxy_password
  #
@@ -78,9 +78,6 @@ services:
 #            - SCREEN_HEIGHT=1024
 #            - SCREEN_DEPTH=16
 #            - ENABLE_DEBUGGER=false
 #            - SCREEN_WIDTH=1280
 #            - SCREEN_HEIGHT=1024
 #            - SCREEN_DEPTH=16
 #            - PREBOOT_CHROME=true
 #            - CONNECTION_TIMEOUT=300000
 #            - MAX_CONCURRENT_SESSIONS=10
Author	SHA1	Message	Date
dgtlmoon	9d8558fbc9	LXML memory leak workaround	2022-07-27 15:35:05 +02:00
dgtlmoon	a7a8ba58ed	fix status	2022-07-27 13:11:14 +02:00
dgtlmoon	7823140442	WIP	2022-07-27 13:05:23 +02:00
dgtlmoon	1f27865fdf	Filter failure notification send default enable now controlled by setting Env var	2022-07-27 00:01:51 +02:00
dgtlmoon	faa42d75e0	Refactor of extract text filter - Regex, support Regex (groups) and all python regex flags via /something/aiLmsux (#773 )	2022-07-26 17:34:34 +02:00
dgtlmoon	3b6e6d85bb	Update README.md - adding LinkedIn link	2022-07-25 00:28:41 +02:00
dgtlmoon	30d6a272ce	Update README.md - Adding Discord and YouTube links	2022-07-24 23:06:42 +02:00
dgtlmoon	291700554e	Bug fix for alerting when xPath based filters are no longer present (#772 )	2022-07-23 19:39:52 +02:00
dgtlmoon	a82fad7059	Send notification when CSS/xPath filter is missing after more than 6 (configurable) attempts (#771 )	2022-07-23 17:19:00 +02:00
dgtlmoon	c2fe5ae0d1	mailto plaintext handling fix for 'plaintext' apprise integration	2022-07-23 16:55:31 +02:00
dgtlmoon	5beefdb7cc	Minor code cleanups	2022-07-23 13:18:44 +02:00
dgtlmoon	872bbba71c	Notifications - email - Correctly send plaintext notification email with plaintext header (#767 )	2022-07-21 15:22:20 +02:00
Jonathon Sisson	d578de1a35	Form text tweak - Regex clarification (#766 )	2022-07-21 10:05:59 +02:00
dgtlmoon	cdc104be10	Update README.md	2022-07-20 14:37:45 +02:00
dgtlmoon	dd0eeca056	Handle simple obfuscations - HomeDepot.com style price obfuscation (#764 )	2022-07-20 14:02:22 +02:00
dgtlmoon	a95468be08	Fixing docker-compose.yml PLAYWRIGHT_DRIVER_URL example URL	2022-07-15 20:45:29 +02:00
Brandon Wees	ace44d0e00	Notifications fix - Discord - added discord webhook base url to truncation rules (#753 ) Co-authored-by: bwees <branonwees@gmail.com>	2022-07-14 17:41:12 +02:00
dgtlmoon	ebb8b88621	Update Playwright URI Env example with stealth setting and CORS workaround (more reliable fetching)	2022-07-12 22:36:22 +02:00
dgtlmoon	12fc2200de	remove extra file	2022-07-12 22:32:20 +02:00
dgtlmoon	52d3d375ba	removing package-lock.json - not required to be in git	2022-07-10 20:29:11 +02:00
dgtlmoon	08117089e6	Share-icon cleanups	2022-07-10 20:24:49 +02:00
dgtlmoon	2ba3a6d53f	Test improvement: Extract text should return all matches	2022-07-10 20:05:48 +02:00
dgtlmoon	2f636553a9	Bug fix: RSS Feed should also announce utf-8 charset	2022-07-10 18:50:21 +02:00
Freddie Leeman	0bde48b282	Regex extract filter: Return all regex results instead of first match (#730 )	2022-07-10 15:09:10 +02:00
dgtlmoon	fae1164c0b	Ability to specify JS before running change-detection (#744 )	2022-07-10 13:56:01 +02:00
dgtlmoon	169c293143	Playwright - log console errors to output	2022-07-10 13:55:29 +02:00
dgtlmoon	46cb5cff66	UI Improvement - Clarifying "Visual Filter" tool as "Visual Selector Filter"	2022-07-10 12:51:12 +02:00
Simo Elalj	05584ea886	Use environment variables to override new watch settings defaults (user-agent, timeout, workers) (#742 )	2022-07-08 20:50:04 +02:00
marvin8	176a591357	Update docker-compose.yml - Remove duplicate environment variables from playwright-chrome sample config in docker-compose.yml (#738 )	2022-07-06 09:03:10 +02:00
dgtlmoon	15569f9592	0.39.16	2022-07-05 16:14:57 +02:00
dgtlmoon	5f9e475fe0	Fix notification apprise application name to changedetection.io #731	2022-06-30 23:11:03 +02:00
`@@ -1 +1,3 @@`
	`node_modules`	`node_modules`
		`package-lock.json`