Only try to process restock information (like scraping for keywords) if the page was actually rendered correctly.

Text filters - Adding filters "Trim whitespace" and "Remove duplicate lines"
Browser Steps - UI - Use a better flexbox layout
2025-11-30 05:13:21 +00:00 · 2024-09-19 22:13:28 +02:00 · 2024-09-18 15:45:44 +02:00 · 2024-09-18 11:26:10 +02:00 · 2024-09-17 22:43:04 +02:00 · 2024-09-17 19:06:17 +02:00
14 changed files with 199 additions and 36 deletions
--- a/changedetectionio/blueprint/browser_steps/browser_steps.py
+++ b/changedetectionio/blueprint/browser_steps/browser_steps.py
@@ -25,6 +25,7 @@ browser_step_ui_config = {'Choose one': '0 0',
                          'Click element if exists': '1 0',
                          'Click element': '1 0',
                          'Click element containing text': '0 1',
+                          'Click element containing text if exists': '0 1',
                          'Enter text in field': '1 1',
                          'Execute JS': '0 1',
 #                          'Extract text and use as filter': '1 0',
@@ -96,12 +97,24 @@ class steppable_browser_interface():
        return self.action_goto_url(value=self.start_url)

    def action_click_element_containing_text(self, selector=None, value=''):
+        logger.debug("Clicking element containing text")
        if not len(value.strip()):
            return
        elem = self.page.get_by_text(value)
        if elem.count():
            elem.first.click(delay=randint(200, 500), timeout=3000)

+    def action_click_element_containing_text_if_exists(self, selector=None, value=''):
+        """Click the first element containing the text in `value`, doing nothing when no element matches.
+
+        `selector` is not used by this step. An empty or whitespace-only `value` is a no-op.
+        """
+        logger.debug("Clicking element containing text if exists")
+        if not value.strip():
+            return
+        elem = self.page.get_by_text(value)
+        # Ask for the match count only once, the page can change between two calls so the logged
+        # number and the decision to click could otherwise disagree.
+        found = elem.count()
+        logger.debug(f"Clicking element containing text - {found} elements found")
+        if found:
+            elem.first.click(delay=randint(200, 500), timeout=3000)
+
    def action_enter_text_in_field(self, selector, value):
        if not len(selector.strip()):
            return
--- a/changedetectionio/blueprint/tags/templates/edit-tag.html
+++ b/changedetectionio/blueprint/tags/templates/edit-tag.html
@@ -89,11 +89,13 @@ xpath://body/div/span[contains(@class, 'example-class')]",
                    {{ render_field(form.subtractive_selectors, rows=5, placeholder="header
 footer
 nav
-.stockticker") }}
+.stockticker
+//*[contains(text(), 'Advertisement')]") }}
                    <span class="pure-form-message-inline">
                        <ul>
-                          <li> Remove HTML element(s) by CSS selector before text conversion. </li>
-                          <li> Add multiple elements or CSS selectors per line to ignore multiple parts of the HTML. </li>
+                          <li> Remove HTML element(s) by CSS and XPath selectors before text conversion. </li>
+                          <li> Don't paste HTML here, use only CSS and XPath selectors </li>
+                          <li> Add multiple elements, CSS or XPath selectors per line to ignore multiple parts of the HTML. </li>
                        </ul>
                      </span>
                </fieldset>
--- a/changedetectionio/forms.py
+++ b/changedetectionio/forms.py
@@ -469,7 +469,7 @@ class processor_text_json_diff_form(commonSettingsForm):

    include_filters = StringListField('CSS/JSONPath/JQ/XPath Filters', [ValidateCSSJSONXPATHInput()], default='')

-    subtractive_selectors = StringListField('Remove elements', [ValidateCSSJSONXPATHInput(allow_xpath=False, allow_json=False)])
+    subtractive_selectors = StringListField('Remove elements', [ValidateCSSJSONXPATHInput(allow_json=False)])

    extract_text = StringListField('Extract text', [ValidateListRegex()])

@@ -480,8 +480,10 @@ class processor_text_json_diff_form(commonSettingsForm):
    body = TextAreaField('Request body', [validators.Optional()])
    method = SelectField('Request method', choices=valid_method, default=default_method)
    ignore_status_codes = BooleanField('Ignore status codes (process non-2xx status codes as normal)', default=False)
-    check_unique_lines = BooleanField('Only trigger when unique lines appear', default=False)
+    check_unique_lines = BooleanField('Only trigger when unique lines appear in all history', default=False)
+    remove_duplicate_lines = BooleanField('Remove duplicate lines of text', default=False)
    sort_text_alphabetically =  BooleanField('Sort text alphabetically', default=False)
+    trim_text_whitespace = BooleanField('Trim whitespace before and after text', default=False)

    filter_text_added = BooleanField('Added lines', default=True)
    filter_text_replaced = BooleanField('Replaced/changed lines', default=True)
@@ -576,7 +578,7 @@ class globalSettingsApplicationForm(commonSettingsForm):
    empty_pages_are_a_change =  BooleanField('Treat empty pages as a change?', default=False)
    fetch_backend = RadioField('Fetch Method', default="html_requests", choices=content_fetchers.available_fetchers(), validators=[ValidateContentFetcherIsReady()])
    global_ignore_text = StringListField('Ignore Text', [ValidateListRegex()])
-    global_subtractive_selectors = StringListField('Remove elements', [ValidateCSSJSONXPATHInput(allow_xpath=False, allow_json=False)])
+    global_subtractive_selectors = StringListField('Remove elements', [ValidateCSSJSONXPATHInput(allow_json=False)])
    ignore_whitespace = BooleanField('Ignore whitespace')
    password = SaltyPasswordField()
    pager_size = IntegerField('Pager size',
--- a/changedetectionio/html_tools.py
+++ b/changedetectionio/html_tools.py
@@ -1,4 +1,5 @@
 from typing import List
+from lxml import etree
 import json
 import re

@@ -57,11 +58,26 @@ def subtractive_css_selector(css_selector, html_content):
        item.decompose()
    return str(soup)

+def subtractive_xpath_selector(xpath_selector, html_content):
+    """Remove every element matched by an XPath expression and return the resulting HTML as a string.
+
+    Only the matched elements (and their children) are removed, text that follows a removed
+    element is kept. XPath results that are not elements (strings, attributes, numbers) are ignored.
+    When there is nothing to parse the content is returned unchanged.
+    """
+    if not html_content:
+        return html_content
+
+    html_tree = etree.HTML(html_content)
+    # The parser can hand back None when the markup contains no elements at all
+    if html_tree is None:
+        return html_content
+
+    matches = html_tree.xpath(xpath_selector)
+    # Expressions such as count(...) or string(...) return a scalar instead of a node list
+    if not isinstance(matches, list):
+        matches = []
+
+    for element in matches:
+        # e.g. '//p/text()' or '//a/@href' yield strings, only real elements can be detached
+        if not etree.iselement(element):
+            continue
+        parent = element.getparent()
+        # The root element has no parent to be removed from
+        if parent is None:
+            continue
+        # lxml keeps the text that FOLLOWS an element in its .tail and remove() throws it away
+        # together with the element, so hand it over to the preceding node first.
+        if element.tail:
+            previous = element.getprevious()
+            if previous is not None:
+                previous.tail = (previous.tail or '') + element.tail
+            else:
+                parent.text = (parent.text or '') + element.tail
+        parent.remove(element)
+
+    return etree.tostring(html_tree, method="html").decode("utf-8")

 def element_removal(selectors: List[str], html_content):
-    """Joins individual filters into one css filter."""
-    selector = ",".join(selectors)
-    return subtractive_css_selector(selector, html_content)
+    """Removes elements that match a list of CSS or xPath selectors."""
+    modified_html = html_content
+    for selector in selectors:
+        if selector.startswith(('xpath:', 'xpath1:', '//')):
+            xpath_selector = selector.removeprefix('xpath:').removeprefix('xpath1:')
+            modified_html = subtractive_xpath_selector(xpath_selector, modified_html)
+        else:
+            modified_html = subtractive_css_selector(selector, modified_html)
+    return modified_html

 def elementpath_tostring(obj):
    """
--- a/changedetectionio/model/init.py
+++ b/changedetectionio/model/init.py
@@ -60,6 +60,8 @@ class watch_base(dict):
            'time_between_check_use_default': True,
            'title': None,
            'track_ldjson_price_data': None,
+            'trim_text_whitespace': False,
+            'remove_duplicate_lines': False,
            'trigger_text': [],  # List of text or regex to wait for until a change is detected
            'url': '',
            'uuid': str(uuid.uuid4()),
--- a/changedetectionio/processors/restock_diff/processor.py
+++ b/changedetectionio/processors/restock_diff/processor.py
@@ -158,6 +158,20 @@ class perform_site_check(difference_detection_processor):
        update_obj['content_type'] = self.fetcher.headers.get('Content-Type', '')
        update_obj["last_check_status"] = self.fetcher.get_last_status_code()

+        # Only try to process restock information (like scraping for keywords) if the page was actually rendered correctly.
+        # Otherwise it will assume "in stock" because nothing suggesting the opposite was found
+        from ...html_tools import html_to_text
+        text = html_to_text(self.fetcher.content)
+        logger.debug(f"Length of text after conversion: {len(text)}")
+        if not len(text):
+            from ...content_fetchers.exceptions import ReplyWithContentButNoText
+            raise ReplyWithContentButNoText(url=watch.link,
+                                            status_code=self.fetcher.get_last_status_code(),
+                                            screenshot=self.fetcher.screenshot,
+                                            html_content=self.fetcher.content,
+                                            xpath_data=self.fetcher.xpath_data
+                                            )
+
        # Which restock settings to compare against?
        restock_settings = watch.get('restock_settings', {})

--- a/changedetectionio/processors/text_json_diff/processor.py
+++ b/changedetectionio/processors/text_json_diff/processor.py
@@ -218,11 +218,19 @@ class perform_site_check(difference_detection_processor):
                            is_rss=is_rss)) #1874 activate the <title workaround hack
                        stripped_text_from_html = future.result()

-        if watch.get('sort_text_alphabetically') and stripped_text_from_html:
+
+        if watch.get('trim_text_whitespace'):
+            stripped_text_from_html = '\n'.join(line.strip() for line in stripped_text_from_html.replace("\n\n", "\n").splitlines())
+
+        if watch.get('remove_duplicate_lines'):
+            stripped_text_from_html = '\n'.join(dict.fromkeys(line.strip() for line in stripped_text_from_html.replace("\n\n", "\n").splitlines()))
+
+        if watch.get('sort_text_alphabetically'):
            # Note: Because a <p>something</p> will add an extra line feed to signify the paragraph gap
            # we end up with 'Some text\n\n', sorting will add all those extra \n at the start, so we remove them here.
-            stripped_text_from_html = stripped_text_from_html.replace('\n\n', '\n')
-            stripped_text_from_html = '\n'.join( sorted(stripped_text_from_html.splitlines(), key=lambda x: x.lower() ))
+            stripped_text_from_html = stripped_text_from_html.replace("\n\n", "\n")
+            stripped_text_from_html = '\n'.join(sorted(stripped_text_from_html.splitlines(), key=lambda x: x.lower()))
+

        # Re #340 - return the content before the 'ignore text' was applied
        text_content_before_ignored_filter = stripped_text_from_html.encode('utf-8')
@@ -304,7 +312,7 @@ class perform_site_check(difference_detection_processor):
                        for match in res:
                            regex_matched_output += [match] + [b'\n']

-            # Now we will only show what the regex matched
+            ##########################################################
            stripped_text_from_html = b''
            text_content_before_ignored_filter = b''
            if regex_matched_output:
@@ -312,6 +320,8 @@ class perform_site_check(difference_detection_processor):
                stripped_text_from_html = b''.join(regex_matched_output)
                text_content_before_ignored_filter = stripped_text_from_html

+
+
        # Re #133 - if we should strip whitespaces from triggering the change detected comparison
        if self.datastore.data['settings']['application'].get('ignore_whitespace', False):
            fetched_md5 = hashlib.md5(stripped_text_from_html.translate(None, b'\r\n\t ')).hexdigest()
--- a/changedetectionio/static/styles/scss/parts/_browser-steps.scss
+++ b/changedetectionio/static/styles/scss/parts/_browser-steps.scss
@@ -40,15 +40,29 @@
  }
 }

-#browser-steps-fieldlist {
-  height: 100%;
-  overflow-y: scroll;
-}

 #browser-steps .flex-wrapper {
  display: flex;
  flex-flow: row;
  height: 70vh;
+  font-size: 80%;
+  #browser-steps-ui {
+    flex-grow: 1;      /* Allow it to grow and fill the available space */
+    flex-shrink: 1;    /* Allow it to shrink if needed */
+    flex-basis: 0;     /* Start with 0 base width so it stretches as much as possible */
+    background-color: #eee;
+    border-radius: 5px;
+
+  }
+
+  #browser-steps-fieldlist {
+    flex-grow: 0;      /* Don't allow it to grow */
+    flex-shrink: 0;    /* Don't allow it to shrink */
+    flex-basis: auto;  /* Base width is determined by the content */
+    max-width: 400px;  /* Set a max width to prevent overflow */
+    padding-left: 1rem;
+    overflow-y: scroll;
+  }
 }

 /*  this is duplicate :( */
--- a/changedetectionio/static/styles/styles.css
+++ b/changedetectionio/static/styles/styles.css
@@ -46,14 +46,31 @@
    #browser_steps li > label {
      display: none; }

-#browser-steps-fieldlist {
-  height: 100%;
-  overflow-y: scroll; }
-
 #browser-steps .flex-wrapper {
  display: flex;
  flex-flow: row;
-  height: 70vh; }
+  height: 70vh;
+  font-size: 80%; }
+  #browser-steps .flex-wrapper #browser-steps-ui {
+    flex-grow: 1;
+    /* Allow it to grow and fill the available space */
+    flex-shrink: 1;
+    /* Allow it to shrink if needed */
+    flex-basis: 0;
+    /* Start with 0 base width so it stretches as much as possible */
+    background-color: #eee;
+    border-radius: 5px; }
+  #browser-steps .flex-wrapper #browser-steps-fieldlist {
+    flex-grow: 0;
+    /* Don't allow it to grow */
+    flex-shrink: 0;
+    /* Don't allow it to shrink */
+    flex-basis: auto;
+    /* Base width is determined by the content */
+    max-width: 400px;
+    /* Set a max width to prevent overflow */
+    padding-left: 1rem;
+    overflow-y: scroll; }

 /*  this is duplicate :( */
 #browsersteps-selector-wrapper {
@@ -1194,11 +1211,9 @@ ul {
  color: #fff;
  opacity: 0.7; }

-
 .restock-label svg {
  vertical-align: middle; }

-
 #chrome-extension-link {
  padding: 9px;
  border: 1px solid var(--color-grey-800);
--- a/changedetectionio/templates/_common_fields.html
+++ b/changedetectionio/templates/_common_fields.html
@@ -15,7 +15,7 @@
                                <strong>Tip:</strong> Use <a target=_new href="https://github.com/caronc/apprise">AppRise Notification URLs</a> for notification to just about any service! <i><a target=_new href="https://github.com/dgtlmoon/changedetection.io/wiki/Notification-configuration-notes">Please read the notification services wiki here for important configuration notes</a></i>.<br>
 </p>
                                <div data-target="#advanced-help-notifications" class="toggle-show pure-button button-tag button-xsmall">Show advanced help and tips</div>
-                              <ul style="display: none" id="advanced-help-notifications">
+                                <ul style="display: none" id="advanced-help-notifications">
                                <li><code><a target=_new href="https://github.com/caronc/apprise/wiki/Notify_discord">discord://</a></code> (or <code>https://discord.com/api/webhooks...</code>)) only supports a maximum <strong>2,000 characters</strong> of notification text, including the title.</li>
                                <li><code><a target=_new href="https://github.com/caronc/apprise/wiki/Notify_telegram">tgram://</a></code> bots can't send messages to other bots, so you should specify chat ID of non-bot user.</li>
                                <li><code><a target=_new href="https://github.com/caronc/apprise/wiki/Notify_telegram">tgram://</a></code> only supports very limited HTML and can fail when extra tags are sent, <a href="https://core.telegram.org/bots/api#html-style">read more here</a> (or use plaintext/markdown format)</li>
--- a/changedetectionio/templates/edit.html
+++ b/changedetectionio/templates/edit.html
@@ -200,7 +200,7 @@ User-Agent: wonderbra 1.0") }}
                        <div id="loading-status-text" style="display: none;">Please wait, first browser step can take a little time to load..<div class="spinner"></div></div>
                        <div class="flex-wrapper" >

-                            <div id="browser-steps-ui" class="noselect"  style="width: 100%; background-color: #eee; border-radius: 5px;">
+                            <div id="browser-steps-ui" class="noselect">

                                <div class="noselect"  id="browsersteps-selector-wrapper" style="width: 100%">
                                    <span class="loader" >
@@ -215,7 +215,7 @@ User-Agent: wonderbra 1.0") }}
                                    <canvas  class="noselect" id="browsersteps-selector-canvas" style="max-width: 100%; width: 100%;"></canvas>
                                </div>
                            </div>
-                            <div id="browser-steps-fieldlist" style="padding-left: 1em;  width: 350px; font-size: 80%;" >
+                            <div id="browser-steps-fieldlist" >
                                <span id="browser-seconds-remaining">Loading</span> <span style="font-size: 80%;"> (<a target=_new href="https://github.com/dgtlmoon/changedetection.io/pull/478/files#diff-1a79d924d1840c485238e66772391268a89c95b781d69091384cf1ea1ac146c9R4">?</a>) </span>
                                {{ render_field(form.browser_steps) }}
                            </div>
@@ -310,12 +310,13 @@ xpath://body/div/span[contains(@class, 'example-class')]",
                    {{ render_field(form.subtractive_selectors, rows=5, placeholder=has_tag_filters_extra+"header
 footer
 nav
-.stockticker") }}
+.stockticker
+//*[contains(text(), 'Advertisement')]") }}
                    <span class="pure-form-message-inline">
                        <ul>
-                          <li> Remove HTML element(s) by CSS selector before text conversion. </li>
-                          <li> Don't paste HTML here, use only CSS selectors </li>
-                          <li> Add multiple elements or CSS selectors per line to ignore multiple parts of the HTML. </li>
+                          <li> Remove HTML element(s) by CSS and XPath selectors before text conversion. </li>
+                          <li> Don't paste HTML here, use only CSS and XPath selectors </li>
+                          <li> Add multiple elements, CSS or XPath selectors per line to ignore multiple parts of the HTML. </li>
                        </ul>
                      </span>
                </fieldset>
@@ -330,11 +331,22 @@ nav
                    <span class="pure-form-message-inline">So it's always better to select <strong>Added</strong>+<strong>Replaced</strong> when you're interested in new content.</span><br>
                    <span class="pure-form-message-inline">When content is merely moved in a list, it will also trigger an <strong>addition</strong>, consider enabling <code><strong>Only trigger when unique lines appear</strong></code></span>
                </fieldset>
-
+                <fieldset class="pure-control-group">
+                    {{ render_checkbox_field(form.remove_duplicate_lines) }}
+                    <span class="pure-form-message-inline">Remove duplicate lines of text</span>
+                </fieldset>
                <fieldset class="pure-control-group">
                    {{ render_checkbox_field(form.sort_text_alphabetically) }}
                    <span class="pure-form-message-inline">Helps reduce changes detected caused by sites shuffling lines around, combine with <i>check unique lines</i> below.</span>
                </fieldset>
+                <fieldset class="pure-control-group">
+                    {{ render_checkbox_field(form.trim_text_whitespace) }}
+                    <span class="pure-form-message-inline">Remove any whitespace before and after each line of text</span>
+                </fieldset>
                <fieldset class="pure-control-group">
                    {{ render_checkbox_field(form.check_unique_lines) }}
                    <span class="pure-form-message-inline">Good for websites that just move the content around, and you want to know when NEW content is added, compares new lines against all history for this watch.</span>
--- a/changedetectionio/templates/settings.html
+++ b/changedetectionio/templates/settings.html
@@ -155,11 +155,13 @@
                      {{ render_field(form.application.form.global_subtractive_selectors, rows=5, placeholder="header
 footer
 nav
-.stockticker") }}
+.stockticker
+//*[contains(text(), 'Advertisement')]") }}
                      <span class="pure-form-message-inline">
                        <ul>
-                          <li> Remove HTML element(s) by CSS selector before text conversion. </li>
-                          <li> Add multiple elements or CSS selectors per line to ignore multiple parts of the HTML. </li>
+                          <li> Remove HTML element(s) by CSS and XPath selectors before text conversion. </li>
+                          <li> Don't paste HTML here, use only CSS and XPath selectors </li>
+                          <li> Add multiple elements, CSS or XPath selectors per line to ignore multiple parts of the HTML. </li>
                        </ul>
                      </span>
                    </fieldset>
--- a/changedetectionio/tests/test_element_removal.py
+++ b/changedetectionio/tests/test_element_removal.py
@@ -87,6 +87,9 @@ def test_element_removal_output():
     Some initial text<br>
     <p>across multiple lines</p>
     <div id="changetext">Some text that changes</div>
+     <div>Some text should be matched by xPath // selector</div>
+     <div>Some text should be matched by xPath selector</div>
+     <div>Some text should be matched by xPath1 selector</div>
     </body>
    <footer>
    <p>Footer</p>
@@ -94,7 +97,16 @@ def test_element_removal_output():
     </html>
    """
    html_blob = element_removal(
-        ["header", "footer", "nav", "#changetext"], html_content=content
+      [
+        "header",
+        "footer",
+        "nav",
+        "#changetext",
+        "//*[contains(text(), 'xPath // selector')]",
+        "xpath://*[contains(text(), 'xPath selector')]",
+        "xpath1://*[contains(text(), 'xPath1 selector')]"
+      ],
+      html_content=content
    )
    text = get_text(html_blob)
    assert (
--- a/changedetectionio/tests/test_unique_lines.py
+++ b/changedetectionio/tests/test_unique_lines.py
@@ -11,6 +11,8 @@ def set_original_ignore_response():
     <p>Some initial text</p>
     <p>Which is across multiple lines</p>
     <p>So let's see what happens.</p>
+     <p>&nbsp;  So let's see what happens.   <br> </p>
+     <p>A - sortable line</p> 
     </body>
     </html>
    """
@@ -164,5 +166,52 @@ def test_sort_lines_functionality(client, live_server, measure_memory_usage):
    assert res.data.find(b'A uppercase') < res.data.find(b'Z last')
    assert res.data.find(b'Some initial text') < res.data.find(b'Which is across multiple lines')
    
+    res = client.get(url_for("form_delete", uuid="all"), follow_redirects=True)
+    assert b'Deleted' in res.data
+
+
+def test_extra_filters(client, live_server, measure_memory_usage):
+    """With 'remove duplicate lines' and 'trim whitespace' enabled, lines that differ only by
+    surrounding whitespace collapse into one and the remaining text keeps its original order."""
+    #live_server_setup(live_server)
+
+    set_original_ignore_response()
+
+    # Add our URL to the import page
+    test_url = url_for('test_endpoint', _external=True)
+    res = client.post(
+        url_for("import_page"),
+        data={"urls": test_url},
+        follow_redirects=True
+    )
+    assert b"1 Imported" in res.data
+    wait_for_all_checks(client)
+
+    # Enable the extra text filters on the watch
+    res = client.post(
+        url_for("edit_page", uuid="first"),
+        data={"remove_duplicate_lines": "y",
+              "trim_text_whitespace": "y",
+              "sort_text_alphabetically": "",  # leave this OFF for testing
+              "url": test_url,
+              "fetch_backend": "html_requests"},
+        follow_redirects=True
+    )
+    assert b"Updated watch." in res.data
+    # Give the thread time to pick it up
+    wait_for_all_checks(client)
+    # Trigger a check
+    client.get(url_for("form_watch_checknow"), follow_redirects=True)
+
+    # Give the thread time to pick it up
+    wait_for_all_checks(client)
+
+    res = client.get(
+        url_for("preview_page", uuid="first")
+    )
+
+    # The page holds the same sentence twice (once padded with whitespace), only one copy may remain
+    assert res.data.count(b"see what happens.") == 1
+
+    # still should remain unsorted ('A - sortable line') stays at the end
+    # Both lines have to be present first, find() returns -1 for a missing line which would
+    # let the position comparison below pass without proving anything.
+    assert b'A - sortable line' in res.data
+    assert b'Which is across multiple lines' in res.data
+    assert res.data.find(b'A - sortable line') > res.data.find(b'Which is across multiple lines')
+
    res = client.get(url_for("form_delete", uuid="all"), follow_redirects=True)
    assert b'Deleted' in res.data
Author	SHA1	Message	Date
dgtlmoon	8bad0b06ff	Only try to process restock information (like scraping for keywords) if the page was actually rendered correctly.	2024-09-19 22:13:28 +02:00
dgtlmoon	e830fb2320	Text filters - Adding filters "Trim whitespace" and "Remove duplicate lines"	2024-09-18 15:45:44 +02:00
dgtlmoon	c6589ee1b4	Browser Steps - UI - Use a better flexbox layout	2024-09-18 11:26:10 +02:00
Michael McMillan	dc936a2e8a	Filters - Add support for also removing HTML elements using XPath selectors (#2632 )	2024-09-17 22:43:04 +02:00
dgtlmoon	8c1527c1ad	Update AppRise notification library to 1.9.0 (#2624 )	2024-09-17 19:06:17 +02:00
Dawid Wróbel	a5ff1cd1d7	browser_steps: add "click element containing text if exists" (#2629 )	2024-09-17 18:30:54 +02:00