0.39.10

Add filter to remove elements by CSS rule from HTML before change detection is run (#445)
Minor updates to filters form text
2025-11-17 06:56:10 +00:00 · 2022-03-12 17:28:30 +01:00 · 2022-03-12 13:29:30 +01:00 · 2022-03-12 11:20:43 +01:00 · 2022-03-12 08:24:51 +01:00 · 2022-03-11 18:50:02 +01:00
32 changed files with 1121 additions and 244 deletions
--- a/README.md
+++ b/README.md
@@ -15,13 +15,19 @@ Open source web page monitoring, notification and change detection.
 <img src="https://raw.githubusercontent.com/dgtlmoon/changedetection.io/master/screenshot.png" style="max-width:100%;" alt="Self-hosted web page change monitoring"  title="Self-hosted web page change monitoring"  />


-**Get your own instance now on Lemonade!**
+**Get your own private instance now! Let us host it for you!**

 [![Deploy to Lemonade](https://lemonade.changedetection.io/static/images/lemonade.svg)](https://lemonade.changedetection.io/start)

+
+[_Let us host your own private instance - we accept PayPal and Bitcoin. Support the further development of changedetection.io!_](https://lemonade.changedetection.io/start)
+
+
+
 - Automatic Updates, Automatic Backups, No Heroku "paused application", don't miss a change!
 - Javascript browser included
- Pay with Bitcoin
+- Unlimited checks and watches!
+

 #### Example use cases

@@ -99,6 +105,8 @@ See the wiki for more information https://github.com/dgtlmoon/changedetection.io
 ## Filters
 XPath, JSONPath and CSS support comes baked in! You can be as specific as you need, use XPath exported from various XPath element query creation tools.

+(We support LXML re:test, re:match and re:replace.)
+
 ## Notifications

 ChangeDetection.io supports a massive amount of notifications (including email, office365, custom APIs, etc) when a web-page has a change detected thanks to the <a href="https://github.com/caronc/apprise">apprise</a> library.
--- a/changedetectionio/init.py
+++ b/changedetectionio/init.py
@@ -36,7 +36,9 @@ from flask import (
 )
 from flask_login import login_required

-__version__ = '0.39.8'
+from changedetectionio import html_tools
+
+__version__ = '0.39.10'

 datastore = None

@@ -126,7 +128,7 @@ def _jinja2_filter_datetimestamp(timestamp, format="%Y-%m-%d %H:%M:%S"):
    # return timeago.format(timestamp, time.time())
    # return datetime.datetime.utcfromtimestamp(timestamp).strftime(format)

-
+# When nobody is logged in Flask-Login's current_user is set to an AnonymousUser object.
 class User(flask_login.UserMixin):
    id=None

@@ -135,7 +137,6 @@ class User(flask_login.UserMixin):
    def get_user(self, email="defaultuser@changedetection.io"):
        return self
    def is_authenticated(self):
-
        return True
    def is_active(self):
        return True
@@ -214,6 +215,10 @@ def changedetection_app(config=None, datastore_o=None):
            return redirect(url_for('index'))

        if request.method == 'GET':
+            if flask_login.current_user.is_authenticated:
+                flash("Already logged in")
+                return redirect(url_for("index"))
+
            output = render_template("login.html")
            return output

@@ -249,6 +254,11 @@ def changedetection_app(config=None, datastore_o=None):
        # (No password in settings or env var)
        app.config['LOGIN_DISABLED'] = datastore.data['settings']['application']['password'] == False and os.getenv("SALTED_PASS", False) == False

+        # Set the auth cookie path if we're running as X-settings/X-Forwarded-Prefix
+        if os.getenv('USE_X_SETTINGS') and 'X-Forwarded-Prefix' in request.headers:
+            app.config['REMEMBER_COOKIE_PATH'] = request.headers['X-Forwarded-Prefix']
+            app.config['SESSION_COOKIE_PATH'] = request.headers['X-Forwarded-Prefix']
+
        # For the RSS path, allow access via a token
        if request.path == '/rss' and request.args.get('token'):
            app_rss_token = datastore.data['settings']['application']['rss_access_token']
@@ -367,7 +377,10 @@ def changedetection_app(config=None, datastore_o=None):
                                 tags=existing_tags,
                                 active_tag=limit_tag,
                                 app_rss_token=datastore.data['settings']['application']['rss_access_token'],
-                                 has_unviewed=datastore.data['has_unviewed'])
+                                 has_unviewed=datastore.data['has_unviewed'],
+                                 # Don't link to hosting when we're on the hosting environment
+                                 hosted_sticky=os.getenv("SALTED_PASS", False) == False,
+                                 guid=datastore.data['app_guid'])

        return output

@@ -441,7 +454,7 @@ def changedetection_app(config=None, datastore_o=None):
                raw_content = file.read()

                handler = fetch_site_status.perform_site_check(datastore=datastore)
-                stripped_content = handler.strip_ignore_text(raw_content,
+                stripped_content = html_tools.strip_ignore_text(raw_content,
                                                             datastore.data['watching'][uuid]['ignore_text'])

                if datastore.data['settings']['application'].get('ignore_whitespace', False):
@@ -514,6 +527,7 @@ def changedetection_app(config=None, datastore_o=None):


            datastore.data['watching'][uuid]['css_filter'] = form.css_filter.data.strip()
+            datastore.data['watching'][uuid]['subtractive_selectors'] = form.subtractive_selectors.data

            # Reset the previous_md5 so we process a new snapshot including stripping ignore text.
            if form.css_filter.data.strip() != datastore.data['watching'][uuid]['css_filter']:
@@ -546,10 +560,14 @@ def changedetection_app(config=None, datastore_o=None):
                    flash('No notification URLs set, cannot send test.', 'error')

            # Diff page [edit] link should go back to diff page
-            if request.args.get("next") and request.args.get("next") == 'diff':
+            if request.args.get("next") and request.args.get("next") == 'diff' and not form.save_and_preview_button.data:
                return redirect(url_for('diff_history_page', uuid=uuid))
            else:
-                return redirect(url_for('index'))
+                if form.save_and_preview_button.data:
+                    flash('You may need to reload this page to see the new content.')
+                    return redirect(url_for('preview_page', uuid=uuid))
+                else:
+                    return redirect(url_for('index'))

        else:
            if request.method == 'POST' and not form.validate():
@@ -582,6 +600,7 @@ def changedetection_app(config=None, datastore_o=None):
        if request.method == 'GET':
            form.minutes_between_check.data = int(datastore.data['settings']['requests']['minutes_between_check'])
            form.notification_urls.data = datastore.data['settings']['application']['notification_urls']
+            form.global_subtractive_selectors.data = datastore.data['settings']['application']['global_subtractive_selectors']
            form.global_ignore_text.data = datastore.data['settings']['application']['global_ignore_text']
            form.ignore_whitespace.data = datastore.data['settings']['application']['ignore_whitespace']
            form.extract_title_as_title.data = datastore.data['settings']['application']['extract_title_as_title']
@@ -610,6 +629,7 @@ def changedetection_app(config=None, datastore_o=None):
            datastore.data['settings']['application']['notification_format'] = form.notification_format.data
            datastore.data['settings']['application']['notification_urls'] = form.notification_urls.data
            datastore.data['settings']['application']['base_url'] = form.base_url.data
+            datastore.data['settings']['application']['global_subtractive_selectors'] = form.global_subtractive_selectors.data
            datastore.data['settings']['application']['global_ignore_text'] =  form.global_ignore_text.data
            datastore.data['settings']['application']['ignore_whitespace'] = form.ignore_whitespace.data

@@ -721,8 +741,12 @@ def changedetection_app(config=None, datastore_o=None):
        # Save the current newest history as the most recently viewed
        datastore.set_last_viewed(uuid, dates[0])
        newest_file = watch['history'][dates[0]]
-        with open(newest_file, 'r') as f:
-            newest_version_file_contents = f.read()
+
+        try:
+            with open(newest_file, 'r') as f:
+                newest_version_file_contents = f.read()
+        except Exception as e:
+            newest_version_file_contents = "Unable to read {}.\n".format(newest_file)

        previous_version = request.args.get('previous_version')
        try:
@@ -731,8 +755,11 @@ def changedetection_app(config=None, datastore_o=None):
            # Not present, use a default value, the second one in the sorted list.
            previous_file = watch['history'][dates[1]]

-        with open(previous_file, 'r') as f:
-            previous_version_file_contents = f.read()
+        try:
+            with open(previous_file, 'r') as f:
+                previous_version_file_contents = f.read()
+        except Exception as e:
+            previous_version_file_contents = "Unable to read {}.\n".format(previous_file)

        output = render_template("diff.html", watch_a=watch,
                                 newest=newest_version_file_contents,
@@ -744,13 +771,16 @@ def changedetection_app(config=None, datastore_o=None):
                                 current_previous_version=str(previous_version),
                                 current_diff_url=watch['url'],
                                 extra_title=" - Diff - {}".format(watch['title'] if watch['title'] else watch['url']),
-                                 left_sticky= True )
+                                 left_sticky=True)

        return output

    @app.route("/preview/<string:uuid>", methods=['GET'])
    @login_required
    def preview_page(uuid):
+        content = []
+        ignored_line_numbers = []
+        trigger_line_numbers = []

        # More for testing, possible to return the first/only
        if uuid == 'first':
@@ -764,14 +794,51 @@ def changedetection_app(config=None, datastore_o=None):
            flash("No history found for the specified link, bad link?", "error")
            return redirect(url_for('index'))

-        newest = list(watch['history'].keys())[-1]
-        with open(watch['history'][newest], 'r') as f:
-            content = f.readlines()
+        if len(watch['history']):
+            timestamps = sorted(watch['history'].keys(), key=lambda x: int(x))
+            filename = watch['history'][timestamps[-1]]
+            try:
+                with open(filename, 'r') as f:
+                    tmp = f.readlines()
+
+                    # Get what needs to be highlighted
+                    ignore_rules = watch.get('ignore_text', []) + datastore.data['settings']['application']['global_ignore_text']
+
+                    # .readlines will keep the \n, but we will parse it here again, in the future tidy this up
+                    ignored_line_numbers = html_tools.strip_ignore_text(content="".join(tmp),
+                                                                        wordlist=ignore_rules,
+                                                                        mode='line numbers'
+                                                                        )
+
+                    trigger_line_numbers = html_tools.strip_ignore_text(content="".join(tmp),
+                                                                        wordlist=watch['trigger_text'],
+                                                                        mode='line numbers'
+                                                                        )
+                    # Prepare the classes and lines used in the template
+                    i=0
+                    for l in tmp:
+                        classes=[]
+                        i+=1
+                        if i in ignored_line_numbers:
+                            classes.append('ignored')
+                        if i in trigger_line_numbers:
+                            classes.append('triggered')
+                        content.append({'line': l, 'classes': ' '.join(classes)})
+
+
+            except Exception as e:
+                content.append({'line': "File doesnt exist or unable to read file {}".format(filename), 'classes': ''})
+        else:
+            content.append({'line': "No history found", 'classes': ''})
+

        output = render_template("preview.html",
                                 content=content,
                                 extra_stylesheets=extra_stylesheets,
+                                 ignored_line_numbers=ignored_line_numbers,
+                                 triggered_line_numbers=trigger_line_numbers,
                                 current_diff_url=watch['url'],
+                                 watch=watch,
                                 uuid=uuid)
        return output

@@ -783,6 +850,7 @@ def changedetection_app(config=None, datastore_o=None):
                                 logs=notification_debug_log if len(notification_debug_log) else ["No errors or warnings detected"])

        return output
+
    @app.route("/api/<string:uuid>/snapshot/current", methods=['GET'])
    @login_required
    def api_snapshot(uuid):
@@ -1066,22 +1134,42 @@ def ticker_thread_check_time_launch_checks():
                running_uuids.append(t.current_uuid)

        # Re #232 - Deepcopy the data incase it changes while we're iterating through it all
-        copied_datastore = deepcopy(datastore)
+        while True:
+            try:
+                copied_datastore = deepcopy(datastore)
+            except RuntimeError as e:
+                # RuntimeError: dictionary changed size during iteration
+                time.sleep(0.1)
+            else:
+                break
+
+        # Re #438 - Don't place more watches in the queue to be checked if the queue is already large
+        while update_q.qsize() >= 2000:
+            time.sleep(1)

        # Check for watches outside of the time threshold to put in the thread queue.
+        now = time.time()
+        max_system_wide = int(copied_datastore.data['settings']['requests']['minutes_between_check']) * 60
+
        for uuid, watch in copied_datastore.data['watching'].items():
+
+            # No need to do further processing if it's paused
+            if watch['paused']:
+                continue
+
            # If they supplied an individual entry minutes to threshold.
-            if 'minutes_between_check' in watch and watch['minutes_between_check'] is not None:
+            watch_minutes_between_check = watch.get('minutes_between_check', None)
+            if watch_minutes_between_check is not None:
                # Cast to int just incase
-                max_time = int(watch['minutes_between_check']) * 60
+                max_time = int(watch_minutes_between_check) * 60
            else:
                # Default system wide.
-                max_time = int(copied_datastore.data['settings']['requests']['minutes_between_check']) * 60
+                max_time = max_system_wide

-            threshold = time.time() - max_time
+            threshold = now - max_time

-            # Yeah, put it in the queue, it's more than time.
-            if not watch['paused'] and watch['last_checked'] <= threshold:
+            # Yeah, put it in the queue, it's more than time
+            if watch['last_checked'] <= threshold:
                if not uuid in running_uuids and uuid not in update_q.queue:
                    update_q.put(uuid)

--- a/changedetectionio/fetch_site_status.py
+++ b/changedetectionio/fetch_site_status.py
@@ -1,10 +1,11 @@
-import time
-from changedetectionio import content_fetcher
 import hashlib
-from inscriptis import get_text
-import urllib3
-from . import html_tools
 import re
+import time
+
+import urllib3
+from inscriptis import get_text
+
+from changedetectionio import content_fetcher, html_tools

 urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

@@ -16,40 +17,6 @@ class perform_site_check():
        super().__init__(*args, **kwargs)
        self.datastore = datastore

-    def strip_ignore_text(self, content, list_ignore_text):
-        import re
-        ignore = []
-        ignore_regex = []
-        for k in list_ignore_text:
-
-            # Is it a regex?
-            if k[0] == '/':
-                ignore_regex.append(k.strip(" /"))
-            else:
-                ignore.append(k)
-
-        output = []
-        for line in content.splitlines():
-
-            # Always ignore blank lines in this mode. (when this function gets called)
-            if len(line.strip()):
-                regex_matches = False
-
-                # if any of these match, skip
-                for regex in ignore_regex:
-                    try:
-                        if re.search(regex, line, re.IGNORECASE):
-                            regex_matches = True
-                    except Exception as e:
-                        continue
-
-                if not regex_matches and not any(skip_text in line for skip_text in ignore):
-                    output.append(line.encode('utf8'))
-
-        return "\n".encode('utf8').join(output)
-
-
-
    def run(self, uuid):
        timestamp = int(time.time())  # used for storage etc too

@@ -102,11 +69,18 @@ class perform_site_check():
            # https://stackoverflow.com/questions/41817578/basic-method-chaining ?
            # return content().textfilter().jsonextract().checksumcompare() ?

-            is_json = fetcher.headers.get('Content-Type', '') == 'application/json'
+            is_json = 'application/json' in fetcher.headers.get('Content-Type', '')
            is_html = not is_json
            css_filter_rule = watch['css_filter']
+            subtractive_selectors = watch.get(
+                "subtractive_selectors", []
+            ) + self.datastore.data["settings"]["application"].get(
+                "global_subtractive_selectors", []
+            )

            has_filter_rule = css_filter_rule and len(css_filter_rule.strip())
+            has_subtractive_selectors = subtractive_selectors and len(subtractive_selectors[0].strip())
+            
            if is_json and not has_filter_rule:
                css_filter_rule = "json:$"
                has_filter_rule = True
@@ -119,8 +93,13 @@ class perform_site_check():
            if is_html:
                # CSS Filter, extract the HTML that matches and feed that into the existing inscriptis::get_text
                html_content = fetcher.content
-                if not fetcher.headers.get('Content-Type', '') == 'text/plain':

+                # Not JSON: plaintext is passed through untouched, anything else is treated as HTML
+                if 'text/plain' in fetcher.headers.get('Content-Type', '').lower():
+                    # Don't run get_text or xpath/css filters on plaintext
+                    stripped_text_from_html = html_content
+                else:
+                    # Then we assume HTML
                    if has_filter_rule:
                        # For HTML/XML we offer xpath as an option, just start a regular xPath "/.."
                        if css_filter_rule[0] == '/':
@@ -128,12 +107,10 @@ class perform_site_check():
                        else:
                            # CSS Filter, extract the HTML that matches and feed that into the existing inscriptis::get_text
                            html_content = html_tools.css_filter(css_filter=css_filter_rule, html_content=fetcher.content)
-
+                    if has_subtractive_selectors:
+                        html_content = html_tools.element_removal(subtractive_selectors, html_content)
                    # get_text() via inscriptis
                    stripped_text_from_html = get_text(html_content)
-                else:
-                    # Don't run get_text or xpath/css filters on plaintext
-                    stripped_text_from_html = html_content

            # Re #340 - return the content before the 'ignore text' was applied
            text_content_before_ignored_filter = stripped_text_from_html.encode('utf-8')
@@ -147,7 +124,7 @@ class perform_site_check():
            # @todo we could abstract out the get_text() to handle this cleaner
            text_to_ignore = watch.get('ignore_text', []) + self.datastore.data['settings']['application'].get('global_ignore_text', [])
            if len(text_to_ignore):
-                stripped_text_from_html = self.strip_ignore_text(stripped_text_from_html, text_to_ignore)
+                stripped_text_from_html = html_tools.strip_ignore_text(stripped_text_from_html, text_to_ignore)
            else:
                stripped_text_from_html = stripped_text_from_html.encode('utf8')

@@ -165,22 +142,14 @@ class perform_site_check():
            blocked_by_not_found_trigger_text = False

            if len(watch['trigger_text']):
+                # Yeah, lets block first until something matches
                blocked_by_not_found_trigger_text = True
-                for line in watch['trigger_text']:
-                    # Because JSON wont serialize a re.compile object
-                    if line[0] == '/' and line[-1] == '/':
-                        regex = re.compile(line.strip('/'), re.IGNORECASE)
-                        # Found it? so we don't wait for it anymore
-                        r = re.search(regex, str(stripped_text_from_html))
-                        if r:
-                            blocked_by_not_found_trigger_text = False
-                            break
-
-                    elif line.lower() in str(stripped_text_from_html).lower():
-                        # We found it don't wait for it.
-                        blocked_by_not_found_trigger_text = False
-                        break
-
+                # Filter and trigger works the same, so reuse it
+                result = html_tools.strip_ignore_text(content=str(stripped_text_from_html),
+                                                      wordlist=watch['trigger_text'],
+                                                      mode="line numbers")
+                if result:
+                    blocked_by_not_found_trigger_text = False


            if not blocked_by_not_found_trigger_text and watch['previous_md5'] != fetched_md5:
--- a/changedetectionio/forms.py
+++ b/changedetectionio/forms.py
@@ -1,12 +1,30 @@
-from wtforms import Form, SelectField, RadioField, BooleanField, StringField, PasswordField, validators, IntegerField, fields, TextAreaField, \
-    Field
-from wtforms import widgets
-from wtforms.validators import ValidationError
-from wtforms.fields import html5
-from changedetectionio import content_fetcher
 import re

-from changedetectionio.notification import default_notification_format, valid_notification_formats, default_notification_body, default_notification_title
+from wtforms import (
+    BooleanField,
+    Field,
+    Form,
+    IntegerField,
+    PasswordField,
+    RadioField,
+    SelectField,
+    StringField,
+    SubmitField,
+    TextAreaField,
+    fields,
+    validators,
+    widgets,
+)
+from wtforms.fields import html5
+from wtforms.validators import ValidationError
+
+from changedetectionio import content_fetcher
+from changedetectionio.notification import (
+    default_notification_body,
+    default_notification_format,
+    default_notification_title,
+    valid_notification_formats,
+)

 valid_method = {
    'GET',
@@ -44,8 +62,8 @@ class SaltyPasswordField(StringField):
    encrypted_password = ""

    def build_password(self, password):
-        import hashlib
        import base64
+        import hashlib
        import secrets

        # Make a new salt on every new password and store it with the password
@@ -103,9 +121,10 @@ class ValidateContentFetcherIsReady(object):
        self.message = message

    def __call__(self, form, field):
-        from changedetectionio import content_fetcher
        import urllib3.exceptions

+        from changedetectionio import content_fetcher
+
        # Better would be a radiohandler that keeps a reference to each class
        if field.data is not None:
            klass = getattr(content_fetcher, field.data)
@@ -212,52 +231,69 @@ class ValidateListRegex(object):
                except re.error:
                    message = field.gettext('RegEx \'%s\' is not a valid regular expression.')
                    raise ValidationError(message % (line))
-
+              
 class ValidateCSSJSONXPATHInput(object):
    """
    Filter validation
    @todo CSS validator ;)
    """

-    def __init__(self, message=None):
+    def __init__(self, message=None, allow_xpath=True, allow_json=True):
        self.message = message
+        self.allow_xpath = allow_xpath
+        self.allow_json = allow_json

    def __call__(self, form, field):

+        if isinstance(field.data, str):
+            data = [field.data]
+        else:
+            data = field.data
+
+        for line in data:
        # Nothing to see here
-        if not len(field.data.strip()):
-            return
+            if not len(line.strip()):
+                return

-        # Does it look like XPath?
-        if field.data.strip()[0] == '/':
-            from lxml import html, etree
-            tree = html.fromstring("<html></html>")
+            # Does it look like XPath?
+            if line.strip()[0] == '/':
+                if not self.allow_xpath:
+                    raise ValidationError("XPath not permitted in this field!")
+                from lxml import etree, html
+                tree = html.fromstring("<html></html>")

-            try:
-                tree.xpath(field.data.strip())
-            except etree.XPathEvalError as e:
-                message = field.gettext('\'%s\' is not a valid XPath expression. (%s)')
-                raise ValidationError(message % (field.data, str(e)))
-            except:
-                raise ValidationError("A system-error occurred when validating your XPath expression")
+                try:
+                    tree.xpath(line.strip())
+                except etree.XPathEvalError as e:
+                    message = field.gettext('\'%s\' is not a valid XPath expression. (%s)')
+                    raise ValidationError(message % (line, str(e)))
+                except:
+                    raise ValidationError("A system-error occurred when validating your XPath expression")

-        if 'json:' in field.data:
-            from jsonpath_ng.exceptions import JsonPathParserError, JsonPathLexerError
-            from jsonpath_ng.ext import parse
+            if 'json:' in line:
+                if not self.allow_json:
+                    raise ValidationError("JSONPath not permitted in this field!")

-            input = field.data.replace('json:', '')
+                from jsonpath_ng.exceptions import (
+                    JsonPathLexerError,
+                    JsonPathParserError,
+                )
+                from jsonpath_ng.ext import parse

-            try:
-                parse(input)
-            except (JsonPathParserError, JsonPathLexerError) as e:
-                message = field.gettext('\'%s\' is not a valid JSONPath expression. (%s)')
-                raise ValidationError(message % (input, str(e)))
-            except:
-                raise ValidationError("A system-error occurred when validating your JSONPath expression")
+                input = line.replace('json:', '')

-            # Re #265 - maybe in the future fetch the page and offer a
-            # warning/notice that its possible the rule doesnt yet match anything?
+                try:
+                    parse(input)
+                except (JsonPathParserError, JsonPathLexerError) as e:
+                    message = field.gettext('\'%s\' is not a valid JSONPath expression. (%s)')
+                    raise ValidationError(message % (input, str(e)))
+                except:
+                    raise ValidationError("A system-error occurred when validating your JSONPath expression")

+                # Re #265 - maybe in the future fetch the page and offer a
+                # warning/notice that its possible the rule doesnt yet match anything?
+
+            
 class quickWatchForm(Form):
    # https://wtforms.readthedocs.io/en/2.3.x/fields/#module-wtforms.fields.html5
    # `require_tld` = False is needed even for the test harness "http://localhost:5005.." to run
@@ -282,6 +318,7 @@ class watchForm(commonSettingsForm):
    minutes_between_check = html5.IntegerField('Maximum time in minutes until recheck',
                                               [validators.Optional(), validators.NumberRange(min=1)])
    css_filter = StringField('CSS/JSON/XPATH Filter', [ValidateCSSJSONXPATHInput()])
+    subtractive_selectors = StringListField('Remove elements', [ValidateCSSJSONXPATHInput(allow_xpath=False, allow_json=False)])
    title = StringField('Title')

    ignore_text = StringListField('Ignore Text', [ValidateListRegex()])
@@ -290,6 +327,9 @@ class watchForm(commonSettingsForm):
    method = SelectField('Request Method', choices=valid_method, default=default_method)
    trigger_text = StringListField('Trigger/wait for text', [validators.Optional(), ValidateListRegex()])

+    save_button = SubmitField('Save', render_kw={"class": "pure-button pure-button-primary"})
+    save_and_preview_button = SubmitField('Save & Preview', render_kw={"class": "pure-button pure-button-primary"})
+
    def validate(self, **kwargs):
        if not super().validate():
            return False
@@ -310,5 +350,6 @@ class globalSettingsForm(commonSettingsForm):
                                               [validators.NumberRange(min=1)])
    extract_title_as_title = BooleanField('Extract <title> from document and use as watch title')
    base_url = StringField('Base URL', validators=[validators.Optional()])
+    global_subtractive_selectors = StringListField('Ignore elements', [ValidateCSSJSONXPATHInput(allow_xpath=False, allow_json=False)])
    global_ignore_text = StringListField('Ignore Text', [ValidateListRegex()])
-    ignore_whitespace = BooleanField('Ignore whitespace')
+    ignore_whitespace = BooleanField('Ignore whitespace')
--- a/changedetectionio/html_tools.py
+++ b/changedetectionio/html_tools.py
@@ -1,4 +1,7 @@
 import json
+import re
+from typing import List
+
 from bs4 import BeautifulSoup
 from jsonpath_ng.ext import parse

@@ -16,16 +19,27 @@ def css_filter(css_filter, html_content):

    return html_block + "\n"

+def subtractive_css_selector(css_selector, html_content):
+    soup = BeautifulSoup(html_content, "html.parser")
+    for item in soup.select(css_selector):
+        item.decompose()
+    return str(soup)
+
+    
+def element_removal(selectors: List[str], html_content):
+    """Joins individual filters into one css filter."""
+    selector = ",".join(selectors)
+    return subtractive_css_selector(selector, html_content)
+    

 # Return str Utf-8 of matched rules
 def xpath_filter(xpath_filter, html_content):
-    from lxml import html
-    from lxml import etree
+    from lxml import etree, html

    tree = html.fromstring(html_content)
    html_block = ""

-    for item in tree.xpath(xpath_filter.strip()):
+    for item in tree.xpath(xpath_filter.strip(), namespaces={'re':'http://exslt.org/regular-expressions'}):
        html_block+= etree.tostring(item, pretty_print=True).decode('utf-8')+"<br/>"

    return html_block
@@ -105,3 +119,50 @@ def extract_json_as_string(content, jsonpath_filter):
        return ''

    return stripped_text_from_html
+
+def strip_ignore_text(content, wordlist, mode="content"):
+    """Remove lines of `content` that match any entry of `wordlist`.
+
+    Entries starting with '/' are regular expressions (surrounding slashes and
+    spaces are stripped); all other entries are plain substrings. Both kinds
+    are matched case-insensitively, line by line. Blank lines are always dropped.
+
+    mode="content" (default) returns the kept lines joined by newlines as
+    UTF-8 bytes; mode="line numbers" returns the 1-based numbers of the
+    non-blank lines that were ignored (int list).
+    """
+    ignore = []
+    ignore_regex = []
+
+    for k in wordlist:
+        # A blank entry would crash on k[0], and as a substring it would match every line
+        if not k or not k.strip():
+            continue
+        if k.startswith('/'):
+            pattern = k.strip(" /")
+            try:
+                # Compile once up front; an invalid or empty regex is skipped rather than matching
+                if pattern:
+                    ignore_regex.append(re.compile(pattern, re.IGNORECASE))
+            except re.error:
+                continue
+        else:
+            ignore.append(k.lower())
+
+    output = []
+    ignored_line_numbers = []
+    for i, line in enumerate(content.splitlines(), start=1):
+        # Always ignore blank lines in this mode. (when this function gets called)
+        if not line.strip():
+            continue
+        line_lower = line.lower()
+        if any(r.search(line) for r in ignore_regex) or any(t in line_lower for t in ignore):
+            ignored_line_numbers.append(i)
+        else:
+            output.append(line.encode('utf8'))
+
+    # Used for finding out what to highlight
+    if mode == "line numbers":
+        return ignored_line_numbers
+
+    return b"\n".join(output)
--- a/changedetectionio/static/styles/diff.css
+++ b/changedetectionio/static/styles/diff.css
@@ -54,3 +54,19 @@ ins {
  body {
    height: 99%;
    /* Hide scroll bar in Firefox */ } }
+
+td#diff-col div {
+  text-align: justify;
+  white-space: pre-wrap; }
+
+.ignored {
+  background-color: #ccc;
+  /*  border: #0d91fa 1px solid; */
+  opacity: 0.7; }
+
+.triggered {
+  background-color: #1b98f8; }
+
+/* ignored and triggered? make it obvious error */
+.ignored.triggered {
+  background-color: #ff0000; }
--- a/changedetectionio/static/styles/diff.scss
+++ b/changedetectionio/static/styles/diff.scss
@@ -66,3 +66,23 @@ ins {
 		height: 99%; /* Hide scroll bar in Firefox */
 	}
 }
+
+td#diff-col div {
+    text-align: justify;
+    white-space: pre-wrap;
+}
+
+.ignored {
+    background-color: #ccc;
+   /*  border: #0d91fa 1px solid; */
+    opacity: 0.7;
+}
+
+.triggered {
+    background-color: #1b98f8;
+}
+
+/* ignored and triggered? make it obvious error */
+.ignored.triggered {
+  background-color: #ff0000;
+}
--- a/changedetectionio/static/styles/package.json
+++ b/changedetectionio/static/styles/package.json
@@ -4,8 +4,7 @@
  "description": "",
  "main": "index.js",
  "scripts": {
-    "build": "node-sass styles.scss diff.scss -o .",
-    "watch": "node-sass --watch styles.scss diff.scss -o ."
+    "build": "node-sass styles.scss -o .;node-sass diff.scss -o ."
  },
  "author": "",
  "license": "ISC",
--- a/changedetectionio/static/styles/styles.css
+++ b/changedetectionio/static/styles/styles.css
--- a/changedetectionio/static/styles/styles.scss
+++ b/changedetectionio/static/styles/styles.scss
@@ -45,6 +45,7 @@ section.content {
 /* table related */
 .watch-table {
  width: 100%;
+  font-size: 80%;

  tr.unviewed {
    font-weight: bold;
@@ -55,7 +56,6 @@ section.content {
  }

  td {
-    font-size: 80%;
    white-space: nowrap;
  }

@@ -107,12 +107,12 @@ section.content {

 body:after {
  content: "";
-  background: linear-gradient(130deg, #ff7a18, #af002d 41.07%, #319197 76.05%)
+  background: linear-gradient(130deg, #5ad8f7, #2f50af 41.07%, #9150bf 84.05%);
 }

 body:after, body:before {
  display: block;
-  height: 600px;
+  height: 650px;
  position: absolute;
  top: 0;
  left: 0;
@@ -125,11 +125,8 @@ body::after {
 }

 body::before {
+  // background-image set in base.html so it works with reverse proxies etc
  content: "";
-  background-image: url(/static/images/gradient-border.png);
-}
-
-body:before {
  background-size: cover
 }

@@ -265,6 +262,7 @@ body:after, body:before {
  }
  legend {
    color: #fff;
+    font-weight: bold;
  }
 }

@@ -317,11 +315,9 @@ footer {
    */
 }

-
-
 .sticky-tab {
  position: absolute;
-  top: 80px;
+  top: 60px;
  font-size: 8px;
  background: #fff;
  padding: 10px;
@@ -331,6 +327,11 @@ footer {
  &#right-sticky {
    right: 0px;
  }
+  &#hosted-sticky {
+    right: 0px;
+    top: 100px;
+    font-weight: bold;
+  }
 }

 #new-version-text a {
@@ -542,6 +543,16 @@ $form-edge-padding: 20px;
    display: block;
  }
 }
+
+// Login page: white rounded panel wrapping the login form (see login.html)
+.login-form {
+  .inner {
+    background: #fff;
+    padding: $form-edge-padding;
+    border-radius: 5px;
+  }
+}
+
 .edit-form {
  min-width: 70%;
  .tab-pane-inner {
@@ -565,5 +576,14 @@ $form-edge-padding: 20px;
    display: block;
    background: #fff;
  }
+
+  .pure-form-message-inline {
+    padding-left: 0;
+  }
 }

+ul {
+    padding-left: 1em;
+    padding-top: 0px;
+    margin-top: 4px;
+}
--- a/changedetectionio/store.py
+++ b/changedetectionio/store.py
@@ -1,15 +1,19 @@
-from os import unlink, path, mkdir
 import json
-import uuid as uuid_builder
-from threading import Lock
-from copy import deepcopy
-
 import logging
-import time
-import threading
 import os
+import threading
+import time
+import uuid as uuid_builder
+from copy import deepcopy
+from os import mkdir, path, unlink
+from threading import Lock
+
+from changedetectionio.notification import (
+    default_notification_body,
+    default_notification_format,
+    default_notification_title,
+)

-from changedetectionio.notification import default_notification_format, default_notification_body, default_notification_title

 # Is there an existing library to ensure some data store (JSON etc) is in sync with CRUD methods?
 # Open a github issue if you know something :)
@@ -46,6 +50,7 @@ class ChangeDetectionStore:
                    'extract_title_as_title': False,
                    'fetch_backend': 'html_requests',
                    'global_ignore_text': [], # List of text to ignore when calculating the comparison checksum
+                    'global_subtractive_selectors': [],
                    'ignore_whitespace': False,
                    'notification_urls': [], # Apprise URL list
                    # Custom notification content
@@ -82,6 +87,7 @@ class ChangeDetectionStore:
            'notification_body': default_notification_body,
            'notification_format': default_notification_format,
            'css_filter': "",
+            'subtractive_selectors': [],
            'trigger_text': [],  # List of text or regex to wait for until a change is detected
            'fetch_backend': None,
            'extract_title_as_title': False
@@ -144,8 +150,8 @@ class ChangeDetectionStore:
            unlink(password_reset_lockfile)

        if not 'app_guid' in self.__data:
-            import sys
            import os
+            import sys
            if "pytest" in sys.modules or "PYTEST_CURRENT_TEST" in os.environ:
                self.__data['app_guid'] = "test-" + str(uuid_builder.uuid4())
            else:
@@ -184,10 +190,6 @@ class ChangeDetectionStore:

    def update_watch(self, uuid, update_obj):

-        # Skip if 'paused' state
-        if self.__data['watching'][uuid]['paused']:
-            return
-
        with self.lock:

            # In python 3.9 we have the |= dict operator, but that still will lose data on nested structures...
@@ -398,13 +400,10 @@ class ChangeDetectionStore:
                # system was out of memory, out of RAM etc
                with open(self.json_store_path+".tmp", 'w') as json_file:
                    json.dump(data, json_file, indent=4)
-
+                os.rename(self.json_store_path+".tmp", self.json_store_path)
            except Exception as e:
                logging.error("Error writing JSON!! (Main JSON file save was skipped) : %s", str(e))

-            else:
-                os.rename(self.json_store_path+".tmp", self.json_store_path)
-
            self.needs_write = False

    # Thread runner, this helps with thread/write issues when there are many operations that want to update the JSON
@@ -437,6 +436,7 @@ class ChangeDetectionStore:
                index.append(self.data['watching'][uuid]['history'][str(id)])

        import pathlib
+
        # Only in the sub-directories
        for item in pathlib.Path(self.datastore_path).rglob("*/*txt"):
            if not str(item) in index:
--- a/changedetectionio/templates/_common_fields.jinja
+++ b/changedetectionio/templates/_common_fields.jinja
@@ -34,9 +34,8 @@
                            </div>
                            <div class="pure-controls">
                            <span class="pure-form-message-inline">
-                                These tokens can be used in the notification body and title to
-                                customise the notification text.
-                            </span>
+                                These tokens can be used in the notification body and title to customise the notification text.
+
                                <table class="pure-table" id="token-table">
                                    <thead>
                                    <tr>
@@ -88,7 +87,7 @@
                                    </tr>
                                    </tbody>
                                </table>
-                                <span class="pure-form-message-inline">
+                                <br/>
                                URLs generated by changedetection.io (such as <code>{diff_url}</code>) require the <code>BASE_URL</code> environment variable set.<br/>
                                Your <code>BASE_URL</code> var is currently "{{current_base_url}}"
                            </span>
--- a/changedetectionio/templates/_helpers.jinja
+++ b/changedetectionio/templates/_helpers.jinja
@@ -25,3 +25,6 @@
 {% endmacro %}


+{% macro render_button(field) %}
+  {{ field(**kwargs)|safe }}
+{% endmacro %}
--- a/changedetectionio/templates/base.html
+++ b/changedetectionio/templates/base.html
@@ -12,7 +12,13 @@
        <link rel="stylesheet" href="{{ m }}?ver=1000">
        {% endfor %}
    {% endif %}
+    <style>
+    body::before {
+        background-image: url({{url_for('static_content', group='images', filename='gradient-border.png')}});
+    }
+    </style>
 </head>
+
 <body>

 <div class="header">
@@ -35,13 +41,13 @@
        {% if current_user.is_authenticated or not has_password %}
            {% if not current_diff_url %}
            <li class="pure-menu-item">
-                <a href="{{ url_for('get_backup')}}" class="pure-menu-link">BACKUP</a>
+                <a href="{{ url_for('settings_page')}}" class="pure-menu-link">SETTINGS</a>
            </li>
            <li class="pure-menu-item">
                <a href="{{ url_for('import_page')}}" class="pure-menu-link">IMPORT</a>
            </li>
            <li class="pure-menu-item">
-                <a href="{{ url_for('settings_page')}}" class="pure-menu-link">SETTINGS</a>
+                <a href="{{ url_for('get_backup')}}" class="pure-menu-link">BACKUP</a>
            </li>
            {% else %}
            <li class="pure-menu-item">
@@ -68,7 +74,7 @@
        </ul>
    </div>
 </div>
-
+{% if hosted_sticky %}<div class="sticky-tab" id="hosted-sticky"><a href="https://lemonade.changedetection.io/start?ref={{guid}}">Let us host your instance!</a></div>{% endif %}
 {% if left_sticky %}<div class="sticky-tab" id="left-sticky"><a href="{{url_for('preview_page', uuid=uuid)}}">Show current snapshot</a></div> {% endif %}
 {% if right_sticky %}<div class="sticky-tab" id="right-sticky">{{ right_sticky }}</div> {% endif %}
 <section class="content">
--- a/changedetectionio/templates/diff.html
+++ b/changedetectionio/templates/diff.html
@@ -36,6 +36,7 @@
    <a onclick="next_diff();">Jump</a>
 </div>
 <div id="diff-ui">
+    <div class="tip">Pro-tip: Use <strong>show current snapshot</strong> tab to visualise what will be ignored.</div>
    <table>
        <tbody>
        <tr>
--- a/changedetectionio/templates/edit.html
+++ b/changedetectionio/templates/edit.html
@@ -1,6 +1,7 @@
 {% extends 'base.html' %}
 {% block content %}
 {% from '_helpers.jinja' import render_field %}
+{% from '_helpers.jinja' import render_button %}
 {% from '_common_fields.jinja' import render_common_settings_form %}
 <script type="text/javascript" src="{{url_for('static_content', group='js', filename='tabs.js')}}" defer></script>

@@ -57,24 +58,30 @@
                        </span>
                    </div>

+                <hr/>
                <fieldset class="pure-group">
-                                    <div class="pure-control-group">
-                    {{ render_field(form.method) }}
-                </div>
-                    <strong>Note: <i>Request Headers and Body settings are ONLY used by Basic fast Plaintext/HTTP Client fetch method.</i></strong>
-                    {{ render_field(form.headers, rows=5, placeholder="Example
+
+                    <span class="pure-form-message-inline">
+                        <strong>Request override is currently only used by the <i>Basic fast Plaintext/HTTP Client</i> method.</strong>
+                    </span>
+                    <div class="pure-control-group">
+                        {{ render_field(form.method) }}
+                    </div>
+                    <div class="pure-control-group">
+{{ render_field(form.headers, rows=5, placeholder="Example
 Cookie: foobar
 User-Agent: wonderbra 1.0") }}
-                </fieldset>
-                <div class="pure-control-group">
-                    {{ render_field(form.body, rows=5, placeholder="Example
+                    </div>
+                    <div class="pure-control-group">
+                                        {{ render_field(form.body, rows=5, placeholder="Example
 {
   \"name\":\"John\",
   \"age\":30,
   \"car\":null
 }") }}
-                </div>
-
+                    </div>
+                </fieldset>
+                <br/>
            </div>

            <div class="tab-pane-inner" id="notifications">
@@ -88,6 +95,18 @@ User-Agent: wonderbra 1.0") }}

            <div class="tab-pane-inner" id="filters-and-triggers">
                <fieldset>
+                        <div class="pure-control-group">
+                            <strong>Pro-tips:</strong><br/>
+                            <ul>
+                                <li>
+                                    Use the preview page to see your filters and triggers highlighted.
+                                </li>
+                                <li>
+                                    Some sites use JavaScript to create the content, for this you should <a href="https://github.com/dgtlmoon/changedetection.io/wiki/Fetching-pages-with-WebDriver">use the Chrome/WebDriver Fetcher</a>
+                                </li>
+                            </ul>
+                    </div>
+
                    <div class="pure-control-group">
                        {{ render_field(form.css_filter, placeholder=".class-name or #some-id, or other CSS selector rule.",
                        class="m-d") }}
@@ -96,14 +115,25 @@ User-Agent: wonderbra 1.0") }}
                        <li>CSS - Limit text to this CSS rule, only text matching this CSS rule is included.</li>
                        <li>JSON - Limit text to this JSON rule, using <a href="https://pypi.org/project/jsonpath-ng/">JSONPath</a>, prefix with <b>"json:"</b>, <a
                                href="https://jsonpath.com/" target="new">test your JSONPath here</a></li>
-                        <li>XPATH - Limit text to this XPath rule, simply start with a forward-slash, example  <b>//*[contains(@class, 'sametext')]</b>, <a
+                        <li>XPath - Limit text to this XPath rule, simply start with a forward-slash, example  <b>//*[contains(@class, 'sametext')]</b>, <a
                                href="http://xpather.com/" target="new">test your XPath here</a></li>
                    </ul>
                    Please be sure that you thoroughly understand how to write CSS or JSONPath, XPath selector rules before filing an issue on GitHub! <a
                                href="https://github.com/dgtlmoon/changedetection.io/wiki/CSS-Selector-help">here for more CSS selector help</a>.<br/>
                </span>
                    </div>
-
+                    <fieldset class="pure-group">
+                      {{ render_field(form.subtractive_selectors, rows=5, placeholder="header
+footer
+nav
+.stockticker") }}
+                      <span class="pure-form-message-inline">
+                        <ul>
+                          <li> Remove HTML element(s) by CSS selector before text conversion. </li>
+                          <li> Add multiple elements or CSS selectors per line to ignore multiple parts of the HTML. </li>
+                        </ul>
+                      </span>
+                    </fieldset>
                </fieldset>
                <fieldset class="pure-group">
                    {{ render_field(form.ignore_text, rows=5, placeholder="Some text to ignore in a line
@@ -114,6 +144,7 @@ User-Agent: wonderbra 1.0") }}
                            <li>Each line processed separately, any line matching will be ignored (removed before creating the checksum)</li>
                            <li>Regular Expression support, wrap the line in forward slash <b>/regex/</b></li>
                            <li>Changing this will affect the comparison checksum which may trigger an alert</li>
+                            <li>Use the preview/show current tab to see ignores</li>
                        </ul>
                </span>

@@ -138,7 +169,8 @@ User-Agent: wonderbra 1.0") }}
            <div id="actions">
                <div class="pure-control-group">

-                    <button type="submit" class="pure-button pure-button-primary">Save</button>
+                      {{ render_button(form.save_button) }} {{ render_button(form.save_and_preview_button) }}
+
                    <a href="{{url_for('api_delete', uuid=uuid)}}"
                       class="pure-button button-small button-error ">Delete</a>
                    <a href="{{url_for('api_clone', uuid=uuid)}}"
--- a/changedetectionio/templates/login.html
+++ b/changedetectionio/templates/login.html
@@ -1,8 +1,7 @@
 {% extends 'base.html' %}

 {% block content %}
-<div class="edit-form">
-
+<div class="login-form">
 <div class="inner">
    <form class="pure-form pure-form-stacked" action="{{url_for('login')}}" method="POST">
        <fieldset>
--- a/changedetectionio/templates/preview.html
+++ b/changedetectionio/templates/preview.html
@@ -3,19 +3,21 @@
 {% block content %}

 <div id="settings">
-    <h1>Current</h1>
+    <h1>Current - {{watch.last_checked|format_timestamp_timeago}}</h1>
 </div>

 <div id="diff-ui">
+    <span class="ignored">Grey lines are ignored</span> <span class="triggered">Blue lines are triggers</span>
    <table>
        <tbody>
        <tr>
            <td id="diff-col">
-                <span id="result">{% for row in content %}{{row}}{% endfor %}</span>
+                    {% for row in content %}
+                    <div class="{{row.classes}}">{{row.line}}</div>
+                    {% endfor %}
            </td>
        </tr>
        </tbody>
    </table>
 </div>
-
 {% endblock %}
--- a/changedetectionio/templates/settings.html
+++ b/changedetectionio/templates/settings.html
@@ -83,7 +83,18 @@
                    </span>
                    </fieldset>

-
+                    <fieldset class="pure-group">
+                      {{ render_field(form.global_subtractive_selectors, rows=5, placeholder="header
+footer
+nav
+.stockticker") }}
+                      <span class="pure-form-message-inline">
+                        <ul>
+                          <li> Remove HTML element(s) by CSS selector before text conversion. </li>
+                          <li> Add multiple elements or CSS selectors per line to ignore multiple parts of the HTML. </li>
+                        </ul>
+                      </span>
+                    </fieldset>
                    <fieldset class="pure-group">
                    {{ render_field(form.global_ignore_text, rows=5, placeholder="Some text to ignore in a line
 /some.regex\d{2}/ for case-INsensitive regex
@@ -95,6 +106,7 @@
                            <li>Each line processed separately, any line matching will be ignored (removed before creating the checksum)</li>
                            <li>Regular Expression support, wrap the line in forward slash <b>/regex/</b></li>
                            <li>Changing this will affect the comparison checksum which may trigger an alert</li>
+                            <li>Use the preview/show current tab to see ignores</li>
                        </ul>
                     </span>
                    </fieldset>
--- a/changedetectionio/tests/test_api.py
+++ b/changedetectionio/tests/test_api.py
@@ -14,7 +14,6 @@ def set_response_data(test_return_data):


 def test_snapshot_api_detects_change(client, live_server):
-
    test_return_data = "Some initial text"

    test_return_data_modified = "Some NEW nice initial text"
@@ -27,7 +26,7 @@ def test_snapshot_api_detects_change(client, live_server):
    time.sleep(1)

    # Add our URL to the import page
-    test_url = url_for('test_endpoint', _external=True)
+    test_url = url_for('test_endpoint', content_type="text/plain", _external=True)
    res = client.post(
        url_for("import_page"),
        data={"urls": test_url},
--- a/changedetectionio/tests/test_backend.py
+++ b/changedetectionio/tests/test_backend.py
@@ -7,6 +7,13 @@ from . util import set_original_response, set_modified_response, live_server_set

 sleep_time_for_fetch_thread = 3

+# Basic test to check inscriptis is not adding return line chars and basically works
+def test_inscriptus():
+    from inscriptis import get_text
+    html_content="<html><body>test!<br/>ok man</body></html>"
+    stripped_text_from_html = get_text(html_content)
+    assert stripped_text_from_html == 'test!\nok man'
+

 def test_check_basic_change_detection_functionality(client, live_server):
    set_original_response()
--- a/changedetectionio/tests/test_element_removal.py
+++ b/changedetectionio/tests/test_element_removal.py
@@ -0,0 +1,168 @@
+#!/usr/bin/python3
+
+import time
+
+from flask import url_for
+
+from ..html_tools import *
+from .util import live_server_setup
+
+
+def test_setup(live_server):
+    live_server_setup(live_server)
+
+
+def set_original_response():
+    test_return_data = """<html>
+    <header>
+    <h2>Header</h2>
+    </header>
+    <nav>
+    <ul>
+      <li><a href="#">A</a></li>
+      <li><a href="#">B</a></li>
+      <li><a href="#">C</a></li>
+    </ul>
+    </nav>
+       <body>
+     Some initial text</br>
+     <p>Which is across multiple lines</p>
+     </br>
+     So let's see what happens.  </br>
+    <div id="changetext">Some text that will change</div>
+     </body>
+    <footer>
+    <p>Footer</p>
+    </footer>
+     </html>
+    """
+
+    with open("test-datastore/endpoint-content.txt", "w") as f:
+        f.write(test_return_data)
+
+
+def set_modified_response():
+    test_return_data = """<html>
+    <header>
+    <h2>Header changed</h2>
+    </header>
+    <nav>
+    <ul>
+      <li><a href="#">A changed</a></li>
+      <li><a href="#">B</a></li>
+      <li><a href="#">C</a></li>
+    </ul>
+    </nav>
+       <body>
+     Some initial text</br>
+     <p>Which is across multiple lines</p>
+     </br>
+     So let's see what happens.  </br>
+    <div id="changetext">Some text that changes</div>
+     </body>
+    <footer>
+    <p>Footer changed</p>
+    </footer>
+     </html>
+    """
+
+    with open("test-datastore/endpoint-content.txt", "w") as f:
+        f.write(test_return_data)
+
+
+def test_element_removal_output():
+    """element_removal() should strip header/footer/nav/#changetext before inscriptis renders the text."""
+    from inscriptis import get_text
+
+    # Check text with sub-parts renders correctly
+    content = """<html>
+    <header>
+    <h2>Header</h2>
+    </header>
+    <nav>
+    <ul>
+      <li><a href="#">A</a></li>
+    </ul>
+    </nav>
+       <body>
+     Some initial text</br>
+     <p>across multiple lines</p>
+     <div id="changetext">Some text that changes</div>
+     </body>
+    <footer>
+    <p>Footer</p>
+    </footer>
+     </html>
+    """
+    html_blob = element_removal(
+        ["header", "footer", "nav", "#changetext"], html_content=content
+    )
+    text = get_text(html_blob)
+    assert (
+        text
+        == """Some initial text
+
+across multiple lines
+"""
+    )
+
+
+def test_element_removal_full(client, live_server):
+    sleep_time_for_fetch_thread = 3
+
+    set_original_response()
+
+    # Give the endpoint time to spin up
+    time.sleep(1)
+
+    # Add our URL to the import page
+    test_url = url_for("test_endpoint", _external=True)
+    res = client.post(
+        url_for("import_page"), data={"urls": test_url}, follow_redirects=True
+    )
+    assert b"1 Imported" in res.data
+
+    # Goto the edit page, add the filter data
+    # Not sure why \r needs to be added - absent of the #changetext this is not necessary
+    subtractive_selectors_data = "header\r\nfooter\r\nnav\r\n#changetext"
+    res = client.post(
+        url_for("edit_page", uuid="first"),
+        data={
+            "subtractive_selectors": subtractive_selectors_data,
+            "url": test_url,
+            "tag": "",
+            "headers": "",
+            "fetch_backend": "html_requests",
+        },
+        follow_redirects=True,
+    )
+    assert b"Updated watch." in res.data
+
+    # Check it saved
+    res = client.get(
+        url_for("edit_page", uuid="first"),
+    )
+    assert bytes(subtractive_selectors_data.encode("utf-8")) in res.data
+
+    # Trigger a check
+    client.get(url_for("api_watch_checknow"), follow_redirects=True)
+
+    # Give the thread time to pick it up
+    time.sleep(sleep_time_for_fetch_thread)
+
+    # No change yet - first check
+    res = client.get(url_for("index"))
+    assert b"unviewed" not in res.data
+
+    #  Make a change to header/footer/nav
+    set_modified_response()
+
+    # Trigger a check
+    client.get(url_for("api_watch_checknow"), follow_redirects=True)
+
+    # Give the thread time to pick it up
+    time.sleep(sleep_time_for_fetch_thread)
+
+    # There should not be an unviewed change, as changes should be removed
+    res = client.get(url_for("index"))
+    assert b"unviewed" not in res.data
--- a/changedetectionio/tests/test_ignore_regex_text.py
+++ b/changedetectionio/tests/test_ignore_regex_text.py
@@ -3,6 +3,7 @@
 import time
 from flask import url_for
 from . util import live_server_setup
+from changedetectionio import html_tools

 def test_setup(live_server):
    live_server_setup(live_server)
@@ -23,7 +24,7 @@ def test_strip_regex_text_func():
    ignore_lines = ["sometimes", "/\s\d{2,3}\s/", "/ignore-case text/"]

    fetcher = fetch_site_status.perform_site_check(datastore=False)
-    stripped_content = fetcher.strip_ignore_text(test_content, ignore_lines)
+    stripped_content = html_tools.strip_ignore_text(test_content, ignore_lines)

    assert b"but 1 lines" in stripped_content
    assert b"igNORe-cAse text" not in stripped_content
--- a/changedetectionio/tests/test_ignore_text.py
+++ b/changedetectionio/tests/test_ignore_text.py
@@ -3,6 +3,7 @@
 import time
 from flask import url_for
 from . util import live_server_setup
+from changedetectionio import html_tools

 def test_setup(live_server):
    live_server_setup(live_server)
@@ -23,7 +24,7 @@ def test_strip_text_func():
    ignore_lines = ["sometimes"]

    fetcher = fetch_site_status.perform_site_check(datastore=False)
-    stripped_content = fetcher.strip_ignore_text(test_content, ignore_lines)
+    stripped_content = html_tools.strip_ignore_text(test_content, ignore_lines)

    assert b"sometimes" not in stripped_content
    assert b"Some content" in stripped_content
@@ -52,6 +53,8 @@ def set_modified_original_ignore_response():
     <p>Which is across multiple lines</p>
     </br>
     So let's see what happens.  </br>
+     <p>new ignore stuff</p>
+     <p>blah</p>
     </body>
     </html>

@@ -67,7 +70,7 @@ def set_modified_ignore_response():
       <body>
     Some initial text</br>
     <p>Which is across multiple lines</p>
-     <P>ZZZZZ</P>
+     <P>ZZZZz</P>
     </br>
     So let's see what happens.  </br>
     </body>
@@ -82,7 +85,8 @@ def set_modified_ignore_response():
 def test_check_ignore_text_functionality(client, live_server):
    sleep_time_for_fetch_thread = 3

-    ignore_text = "XXXXX\r\nYYYYY\r\nZZZZZ"
+    # Use a mix of case in ZzZ to prove it works case-insensitive.
+    ignore_text = "XXXXX\r\nYYYYY\r\nzZzZZ\r\nnew ignore stuff"
    set_original_ignore_response()

    # Give the endpoint time to spin up
@@ -142,13 +146,25 @@ def test_check_ignore_text_functionality(client, live_server):
    assert b'unviewed' not in res.data
    assert b'/test-endpoint' in res.data

+
+
+
+
    # Just to be sure.. set a regular modified change..
    set_modified_original_ignore_response()
    client.get(url_for("api_watch_checknow"), follow_redirects=True)
    time.sleep(sleep_time_for_fetch_thread)
+
    res = client.get(url_for("index"))
    assert b'unviewed' in res.data

+    # Check the preview/highlighter, we should be able to see what we ignored, but it should be highlighted
+    # We only introduce the "modified" content that includes what we ignore so we can prove the newest version also displays
+    # at /preview
+    res = client.get(url_for("preview_page", uuid="first"))
+    # We should be able to see what we ignored
+    assert b'<div class="ignored">new ignore stuff' in res.data
+
    res = client.get(url_for("api_delete", uuid="all"), follow_redirects=True)
    assert b'Deleted' in res.data

--- a/changedetectionio/tests/test_jsonpath_selector.py
+++ b/changedetectionio/tests/test_jsonpath_selector.py
@@ -162,7 +162,7 @@ def test_check_json_without_filter(client, live_server):
    time.sleep(1)

    # Add our URL to the import page
-    test_url = url_for('test_endpoint_json', _external=True)
+    test_url = url_for('test_endpoint', content_type="application/json", _external=True)
    client.post(
        url_for("import_page"),
        data={"urls": test_url},
@@ -193,7 +193,7 @@ def test_check_json_filter(client, live_server):
    time.sleep(1)

    # Add our URL to the import page
-    test_url = url_for('test_endpoint', _external=True)
+    test_url = url_for('test_endpoint', content_type="application/json", _external=True)
    res = client.post(
        url_for("import_page"),
        data={"urls": test_url},
@@ -258,7 +258,7 @@ def test_check_json_filter_bool_val(client, live_server):
    # Give the endpoint time to spin up
    time.sleep(1)

-    test_url = url_for('test_endpoint', _external=True)
+    test_url = url_for('test_endpoint', content_type="application/json", _external=True)

    res = client.post(
        url_for("import_page"),
@@ -313,7 +313,7 @@ def test_check_json_ext_filter(client, live_server):
    time.sleep(1)

    # Add our URL to the import page
-    test_url = url_for('test_endpoint', _external=True)
+    test_url = url_for('test_endpoint', content_type="application/json", _external=True)
    res = client.post(
        url_for("import_page"),
        data={"urls": test_url},
--- a/changedetectionio/tests/test_request.py
+++ b/changedetectionio/tests/test_request.py
@@ -77,14 +77,6 @@ def test_body_in_request(client, live_server):
    # Add our URL to the import page
    test_url = url_for('test_body', _external=True)

-    # Add the test URL twice, we will check
-    res = client.post(
-        url_for("import_page"),
-        data={"urls": test_url},
-        follow_redirects=True
-    )
-    assert b"1 Imported" in res.data
-
    res = client.post(
        url_for("import_page"),
        data={"urls": test_url},
@@ -94,19 +86,6 @@ def test_body_in_request(client, live_server):

    body_value = 'Test Body Value'

-    # Attempt to add a body with a GET method
-    res = client.post(
-        url_for("edit_page", uuid="first"),
-        data={
-              "url": test_url,
-              "tag": "",
-              "method": "GET",
-              "fetch_backend": "html_requests",
-              "body": "invalid"},
-        follow_redirects=True
-    )
-    assert b"Body must be empty when Request Method is set to GET" in res.data
-
    # Add a properly formatted body with a proper method
    res = client.post(
        url_for("edit_page", uuid="first"),
@@ -120,8 +99,7 @@ def test_body_in_request(client, live_server):
    )
    assert b"Updated watch." in res.data

-    # Give the thread time to pick up the first version
-    time.sleep(5)
+    time.sleep(3)

    # The service should echo back the body
    res = client.get(
@@ -129,9 +107,20 @@ def test_body_in_request(client, live_server):
        follow_redirects=True
    )

-    # Check if body returned contains the specified data
+    # If "No history found" is shown something is wrong, a fetched snapshot should always be there by now
+    assert b"No history found" not in res.data
+    # We should see what we sent in the reply
    assert str.encode(body_value) in res.data

+    ####### data sanity checks
+    # Add the test URL a second time, then check below that only one watch has a body set
+    res = client.post(
+        url_for("import_page"),
+        data={"urls": test_url},
+        follow_redirects=True
+    )
+    assert b"1 Imported" in res.data
+
    watches_with_body = 0
    with open('test-datastore/url-watches.json') as f:
        app_struct = json.load(f)
@@ -142,6 +131,20 @@ def test_body_in_request(client, live_server):
    # Should be only one with body set
    assert watches_with_body==1

+    # Attempt to add a body with a GET method
+    res = client.post(
+        url_for("edit_page", uuid="first"),
+        data={
+              "url": test_url,
+              "tag": "",
+              "method": "GET",
+              "fetch_backend": "html_requests",
+              "body": "invalid"},
+        follow_redirects=True
+    )
+    assert b"Body must be empty when Request Method is set to GET" in res.data
+
+
 def test_method_in_request(client, live_server):
    # Add our URL to the import page
    test_url = url_for('test_method', _external=True)
--- a/changedetectionio/tests/test_trigger.py
+++ b/changedetectionio/tests/test_trigger.py
@@ -129,3 +129,8 @@ def test_trigger_functionality(client, live_server):
    time.sleep(sleep_time_for_fetch_thread)
    res = client.get(url_for("index"))
    assert b'unviewed' in res.data
+
+    # Check the preview/highlighter, we should be able to see what we triggered on, but it should be highlighted
+    res = client.get(url_for("preview_page", uuid="first"))
+    # We should be able to see what we triggered on
+    assert b'<div class="triggered">foobar' in res.data
--- a/changedetectionio/tests/test_xpath_selector.py
+++ b/changedetectionio/tests/test_xpath_selector.py
@@ -96,6 +96,7 @@ def test_check_markup_xpath_filter_restriction(client, live_server):
    res = client.get(url_for("index"))
    assert b'unviewed' not in res.data

+
 def test_xpath_validation(client, live_server):

    # Give the endpoint time to spin up
--- a/changedetectionio/tests/util.py
+++ b/changedetectionio/tests/util.py
@@ -1,5 +1,6 @@
 #!/usr/bin/python3

+from flask import make_response, request

 def set_original_response():
    test_return_data = """<html>
@@ -40,24 +41,16 @@ def live_server_setup(live_server):

    @live_server.app.route('/test-endpoint')
    def test_endpoint():
+        ctype = request.args.get('content_type')
+
        # Tried using a global var here but didn't seem to work, so reading from a file instead.
-        with open("test-datastore/endpoint-content.txt", "r") as f:
-            return f.read()
-
-    @live_server.app.route('/test-endpoint-json')
-    def test_endpoint_json():
-
-        from flask import make_response
-
        with open("test-datastore/endpoint-content.txt", "r") as f:
            resp = make_response(f.read())
-            resp.headers['Content-Type'] = 'application/json'
+            resp.headers['Content-Type'] = ctype if ctype else 'text/html'
            return resp

    @live_server.app.route('/test-403')
    def test_endpoint_403_error():
-
-        from flask import make_response
        resp = make_response('', 403)
        return resp

@@ -65,7 +58,6 @@ def live_server_setup(live_server):
    @live_server.app.route('/test-headers')
    def test_headers():

-        from flask import request
        output= []

        for header in request.headers:
@@ -76,24 +68,16 @@ def live_server_setup(live_server):
    # Just return the body in the request
    @live_server.app.route('/test-body', methods=['POST', 'GET'])
    def test_body():
-
-        from flask import request
-
        return request.data

    # Just return the verb in the request
    @live_server.app.route('/test-method', methods=['POST', 'GET', 'PATCH'])
    def test_method():
-
-        from flask import request
-
        return request.method

    # Where we POST to as a notification
    @live_server.app.route('/test_notification_endpoint', methods=['POST', 'GET'])
    def test_notification_endpoint():
-        from flask import request
-
        with open("test-datastore/notification.txt", "wb") as f:
            # Debug method, dump all POST to file also, used to prove #65
            data = request.stream.read()
@@ -107,8 +91,6 @@ def live_server_setup(live_server):
    # Just return the verb in the request
    @live_server.app.route('/test-basicauth', methods=['GET'])
    def test_basicauth_method():
-
-        from flask import request
        auth = request.authorization
        ret = " ".join([auth.username, auth.password, auth.type])
        return ret
--- a/changedetectionio/update_worker.py
+++ b/changedetectionio/update_worker.py
@@ -136,6 +136,8 @@ class update_worker(threading.Thread):
                        except Exception as e:
                            # Catch everything possible here, so that if a worker crashes, we don't lose it until restart!
                            print("!!!! Exception in update_worker !!!\n", e)
+                            self.app.logger.error("Exception reached processing watch UUID: %s - %s", uuid, str(e))
+                            self.datastore.update_watch(uuid=uuid, update_obj={'last_error': str(e)})

                    finally:
                        # Always record that we atleast tried
@@ -145,4 +147,7 @@ class update_worker(threading.Thread):
                self.current_uuid = None  # Done
                self.q.task_done()

+                # Give the CPU time to interrupt
+                time.sleep(0.1)
+
            self.app.config.exit.wait(1)
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -1,9 +1,9 @@
 version: '2'
 services:
-    changedetection.io:
+    changedetection:
      image: ghcr.io/dgtlmoon/changedetection.io
      container_name: changedetection.io
-      hostname: changedetection.io
+      hostname: changedetection
      volumes:
        - changedetection-data:/datastore

--- a/requirements.txt
+++ b/requirements.txt
@@ -3,7 +3,7 @@ flask~= 2.0
 eventlet>=0.31.0
 validators
 timeago ~=1.0
-inscriptis ~= 1.2
+inscriptis ~= 2.2
 feedgen ~= 0.9
 flask-login ~= 0.5
 pytz
@@ -17,7 +17,7 @@ wtforms ~= 2.3.3
 jsonpath-ng ~= 1.5.3

 # Notification library
-apprise ~= 0.9.6
+apprise ~= 0.9.7

 # apprise mqtt https://github.com/dgtlmoon/changedetection.io/issues/315
 paho-mqtt
@@ -34,5 +34,4 @@ lxml

 # 3.141 was missing socksVersion, 3.150 was not in pypi, so we try 4.1.0
 selenium ~= 4.1.0
-pytest ~=6.2
-pytest-flask ~=1.2
+
Author	SHA1	Message	Date
dgtlmoon	c25294ca57	0.39.10	2022-03-12 17:28:30 +01:00
Tim Loderhose	d4359c2e67	Add filter to remove elements by CSS rule from HTML before change detection is run (#445 )	2022-03-12 13:29:30 +01:00
dgtlmoon	44fc804991	Minor updates to filters form text	2022-03-12 11:20:43 +01:00
dgtlmoon	b72c9eaf62	Re #448 - Dont use changedetection.io as the container name and hostname, fix problems fetching from the real changedetection.io webserver :)	2022-03-12 08:24:51 +01:00
dgtlmoon	7ce9e4dfc2	Testing - Refactor HTTP Request Type test (#453 )	2022-03-11 18:50:02 +01:00
dgtlmoon	3cc6586695	Make table header font size the same as content	2022-03-07 13:03:59 +01:00
dgtlmoon	09204cb43f	Adjust background colours	2022-03-06 19:03:59 +01:00
dgtlmoon	a709122874	Handle the case where the visitor is already logged-in and tries to login again (#447 )	2022-03-06 18:19:05 +01:00
dgtlmoon	efbeaf9535	Make the Request Override settings easier to understand	2022-03-06 17:23:21 +01:00
dgtlmoon	1a19fba07d	Minor tweak to notification token table	2022-03-06 17:10:30 +01:00
dgtlmoon	eb9020c175	Style tweak to watch form	2022-03-06 17:05:23 +01:00
dgtlmoon	13bb44e4f8	Login form style fixes	2022-03-06 17:03:15 +01:00
dgtlmoon	47f294c23b	Upgrade apprise notification engine to 0.9.7 (important telegram fixes)	2022-03-05 13:14:14 +01:00
dgtlmoon	a4cce16188	Remove pytest from production release pip requirements	2022-03-05 13:12:15 +01:00
dgtlmoon	69aec23d1d	Style fix for background image relative to X-Forwarded-Prefix when running via reverse proxy subdirectory	2022-03-05 13:08:57 +01:00
dgtlmoon	f85ccffe0a	Merge branch 'master' of github.com:dgtlmoon/changedetection.io	2022-03-04 13:13:54 +01:00
dgtlmoon	0005131472	Re-arranging primary links so the important ones are easier to find on mobile	2022-03-04 13:06:39 +01:00
dgtlmoon	3be1f4ea44	Set authentication cookie path relative to X-Forwarded-Prefix when running via reverse proxy subdirectory (#446 )	2022-03-04 11:23:32 +01:00
dgtlmoon	46c72a7fb3	Upgrade inscriptis HTML converter to version 2.2~ (#434 )	2022-03-01 17:58:54 +01:00
dgtlmoon	96664ffb10	Better text/plain detection and refactor tests (#443 )	2022-03-01 17:50:15 +01:00
dgtlmoon	615fa2c5b2	Tweak support tabs and text (#440 )	2022-02-28 22:39:32 +01:00
dgtlmoon	fd45fcce2f	Include link to changedetection.io hosted option (#439 )	2022-02-28 15:47:59 +01:00
dgtlmoon	75ca7ec504	Improved CPU usage around the loop responsible for what sites needs to be checked	2022-02-28 15:08:51 +01:00
dgtlmoon	8b1e9f6591	Update README.md with hosting options	2022-02-26 18:42:54 +01:00
dgtlmoon	883aa968fd	0.39.9	2022-02-24 17:02:50 +01:00
dgtlmoon	3240ed2339	Minor reliability upgrade for large datasets - retry deepcopy (#436 )	2022-02-24 16:58:51 +01:00
dgtlmoon	a89ffffc76	"Recheck" button should work when entry is in paused state	2022-02-24 16:49:48 +01:00
dgtlmoon	fda93c3798	Better file exception handling on saving index JSON	2022-02-24 16:36:24 +01:00
dgtlmoon	a51c555964	Fix small issue in highlight trigger/ignore preview page with setting the background colours, add test	2022-02-23 12:30:36 +01:00
dgtlmoon	b401998030	Ensure string matching on the ignore filter is always case-INsensitive	2022-02-23 12:01:11 +01:00
dgtlmoon	014fda9058	Ability to visualise trigger and filter rules against the current snapshot on the preview page	2022-02-23 10:49:25 +01:00
dgtlmoon	dd384619e0	Update README.md	2022-02-19 13:41:54 +01:00
Michael	85715120e2	XPath RegularExpression support	2022-02-19 13:40:57 +01:00
dgtlmoon	a0e4f9b88a	better checking of JSON type	2022-02-17 18:16:47 +01:00
dgtlmoon	04bef6091e	Make system level errors from the HTTP fetchers easier to find (#421 )	2022-02-13 23:43:45 +01:00