undo change

Also make the user-agent detection string case-insensitive
Don't set a default user agent if none is set
2025-11-14 05:26:09 +00:00 · 2023-09-23 17:21:35 +02:00 · 2023-09-23 16:36:14 +02:00 · 2023-09-23 14:34:56 +02:00
20 changed files with 115 additions and 227 deletions
--- a/.github/workflows/codeql-analysis.yml
+++ b/.github/workflows/codeql-analysis.yml
@@ -30,11 +30,11 @@ jobs:

    steps:
    - name: Checkout repository
-      uses: actions/checkout@v4
+      uses: actions/checkout@v2

    # Initializes the CodeQL tools for scanning.
    - name: Initialize CodeQL
-      uses: github/codeql-action/init@v2
+      uses: github/codeql-action/init@v1
      with:
        languages: ${{ matrix.language }}
        # If you wish to specify custom queries, you can do so here or in a config file.
@@ -45,7 +45,7 @@ jobs:
    # Autobuild attempts to build any compiled languages  (C/C++, C#, or Java).
    # If this step fails, then you should remove it and run the build manually (see below)
    - name: Autobuild
-      uses: github/codeql-action/autobuild@v2
+      uses: github/codeql-action/autobuild@v1

    # ℹ️ Command-line programs to run using the OS shell.
    # 📚 https://git.io/JvXDl
@@ -59,4 +59,4 @@ jobs:
    #   make release

    - name: Perform CodeQL Analysis
-      uses: github/codeql-action/analyze@v2
+      uses: github/codeql-action/analyze@v1
--- a/.github/workflows/containers.yml
+++ b/.github/workflows/containers.yml
@@ -39,9 +39,9 @@ jobs:
    # Or if we are in a tagged release scenario.
    if: ${{ github.event.workflow_run.conclusion == 'success' }} || ${{ github.event.release.tag_name }} != ''
    steps:
-      - uses: actions/checkout@v4
+      - uses: actions/checkout@v2
      - name: Set up Python 3.9
-        uses: actions/setup-python@v4
+        uses: actions/setup-python@v2
        with:
          python-version: 3.9

@@ -58,27 +58,27 @@ jobs:
          echo ${{ github.ref }} > changedetectionio/tag.txt

      - name: Set up QEMU
-        uses: docker/setup-qemu-action@v3
+        uses: docker/setup-qemu-action@v1
        with:
          image: tonistiigi/binfmt:latest
          platforms: all

      - name: Login to GitHub Container Registry
-        uses: docker/login-action@v3
+        uses: docker/login-action@v1
        with:
          registry: ghcr.io
          username: ${{ github.actor }}
          password: ${{ secrets.GITHUB_TOKEN }}

      - name: Login to Docker Hub Container Registry
-        uses: docker/login-action@v3
+        uses: docker/login-action@v1
        with:
          username: ${{ secrets.DOCKER_HUB_USERNAME }}
          password: ${{ secrets.DOCKER_HUB_ACCESS_TOKEN }}

      - name: Set up Docker Buildx
        id: buildx
-        uses: docker/setup-buildx-action@v3
+        uses: docker/setup-buildx-action@v1
        with:
          install: true
          version: latest
@@ -88,7 +88,7 @@ jobs:
      - name: Build and push :dev
        id: docker_build
        if: ${{ github.ref }} == "refs/heads/master"
-        uses: docker/build-push-action@v5
+        uses: docker/build-push-action@v2
        with:
          context: ./
          file: ./Dockerfile
@@ -105,7 +105,7 @@ jobs:
      - name: Build and push :tag
        id: docker_build_tag_release
        if: github.event_name == 'release' && startsWith(github.event.release.tag_name, '0.')
-        uses: docker/build-push-action@v5
+        uses: docker/build-push-action@v2
        with:
          context: ./
          file: ./Dockerfile
@@ -125,7 +125,7 @@ jobs:
        run: echo step SHA ${{ steps.vars.outputs.sha_short }} tag ${{steps.vars.outputs.tag}} branch ${{steps.vars.outputs.branch}} digest ${{ steps.docker_build.outputs.digest }}

      - name: Cache Docker layers
-        uses: actions/cache@v3
+        uses: actions/cache@v2
        with:
          path: /tmp/.buildx-cache
          key: ${{ runner.os }}-buildx-${{ github.sha }}
--- a/.github/workflows/test-container-build.yml
+++ b/.github/workflows/test-container-build.yml
@@ -24,22 +24,22 @@ jobs:
  test-container-build:
    runs-on: ubuntu-latest
    steps:
-        - uses: actions/checkout@v4
+        - uses: actions/checkout@v2
        - name: Set up Python 3.9
-          uses: actions/setup-python@v4
+          uses: actions/setup-python@v2
          with:
            python-version: 3.9

        # Just test that the build works, some libraries won't compile on ARM/rPi etc
        - name: Set up QEMU
-          uses: docker/setup-qemu-action@v3
+          uses: docker/setup-qemu-action@v1
          with:
            image: tonistiigi/binfmt:latest
            platforms: all

        - name: Set up Docker Buildx
          id: buildx
-          uses: docker/setup-buildx-action@v3
+          uses: docker/setup-buildx-action@v1
          with:
            install: true
            version: latest
@@ -49,7 +49,7 @@ jobs:
        # Check we can still build under alpine/musl
        - name: Test that the docker containers can build (musl via alpine check)
          id: docker_build_musl
-          uses: docker/build-push-action@v5
+          uses: docker/build-push-action@v2
          with:
            context: ./
            file: ./.github/test/Dockerfile-alpine
@@ -57,7 +57,7 @@ jobs:

        - name: Test that the docker containers can build
          id: docker_build
-          uses: docker/build-push-action@v5
+          uses: docker/build-push-action@v2
          # https://github.com/docker/build-push-action#customizing
          with:
            context: ./
--- a/.github/workflows/test-only.yml
+++ b/.github/workflows/test-only.yml
@@ -7,11 +7,11 @@ jobs:
  test-application:
    runs-on: ubuntu-latest
    steps:
-      - uses: actions/checkout@v4
+      - uses: actions/checkout@v2

      # Mainly just for link/flake8
      - name: Set up Python 3.10
-        uses: actions/setup-python@v4
+        uses: actions/setup-python@v2
        with:
          python-version: '3.10'

--- a/.github/workflows/test-pip-build.yml
+++ b/.github/workflows/test-pip-build.yml
@@ -11,10 +11,10 @@ jobs:
  test-pip-build-basics:
    runs-on: ubuntu-latest
    steps:
-        - uses: actions/checkout@v4
+        - uses: actions/checkout@v2

        - name: Set up Python 3.9
-          uses: actions/setup-python@v4
+          uses: actions/setup-python@v2
          with:
            python-version: 3.9

--- a/changedetectionio/blueprint/check_proxies/init.py
+++ b/changedetectionio/blueprint/check_proxies/init.py
@@ -57,11 +57,9 @@ def construct_blueprint(datastore: ChangeDetectionStore):
                status.update({'status': 'ERROR OTHER', 'length': len(contents), 'text': f"Got empty reply with code {e.status_code} - Access denied"})
            else:
                status.update({'status': 'ERROR OTHER', 'length': len(contents) if contents else 0, 'text': f"Empty reply with code {e.status_code}, needs chrome?"})
-        except content_fetcher.ReplyWithContentButNoText as e:
-            txt = f"Got reply but with no content - Status code {e.status_code} - It's possible that the filters were found, but contained no usable text (or contained only an image)."
-            status.update({'status': 'ERROR', 'text': txt})
+
        except Exception as e:
-            status.update({'status': 'ERROR OTHER', 'length': len(contents) if contents else 0, 'text': 'Error: '+type(e).__name__+str(e)})
+            status.update({'status': 'ERROR OTHER', 'length': len(contents) if contents else 0, 'text': 'Error: '+str(e)})
        else:
            status.update({'status': 'OK', 'length': len(contents), 'text': ''})

--- a/changedetectionio/content_fetcher.py
+++ b/changedetectionio/content_fetcher.py
@@ -77,13 +77,11 @@ class ScreenshotUnavailable(Exception):


 class ReplyWithContentButNoText(Exception):
-    def __init__(self, status_code, url, screenshot=None, has_filters=False, html_content=''):
+    def __init__(self, status_code, url, screenshot=None):
        # Set this so we can use it in other parts of the app
        self.status_code = status_code
        self.url = url
        self.screenshot = screenshot
-        self.has_filters = has_filters
-        self.html_content = html_content
        return


--- a/changedetectionio/forms.py
+++ b/changedetectionio/forms.py
@@ -229,19 +229,16 @@ class ValidateJinja2Template(object):
    def __call__(self, form, field):
        from changedetectionio import notification

-        from jinja2 import Environment, BaseLoader, TemplateSyntaxError, UndefinedError
+        from jinja2 import Environment, BaseLoader, TemplateSyntaxError
        from jinja2.meta import find_undeclared_variables


        try:
            jinja2_env = Environment(loader=BaseLoader)
            jinja2_env.globals.update(notification.valid_tokens)
-
            rendered = jinja2_env.from_string(field.data).render()
        except TemplateSyntaxError as e:
            raise ValidationError(f"This is not a valid Jinja2 template: {e}") from e
-        except UndefinedError as e:
-            raise ValidationError(f"A variable or function is not defined: {e}") from e

        ast = jinja2_env.parse(field.data)
        undefined = ", ".join(find_undeclared_variables(ast))
--- a/changedetectionio/html_tools.py
+++ b/changedetectionio/html_tools.py
@@ -10,7 +10,6 @@ import re
 # HTML added to be sure each result matching a filter (.example) gets converted to a new line by Inscriptis
 TEXT_FILTER_LIST_LINE_SUFFIX = "<br>"

-PERL_STYLE_REGEX = r'^/(.*?)/([a-z]*)?$'
 # 'price' , 'lowPrice', 'highPrice' are usually under here
 # all of those may or may not appear on different websites
 LD_JSON_PRODUCT_OFFER_SELECTOR = "json:$..offers"
@@ -18,23 +17,7 @@ LD_JSON_PRODUCT_OFFER_SELECTOR = "json:$..offers"
 class JSONNotFound(ValueError):
    def __init__(self, msg):
        ValueError.__init__(self, msg)
-
-
-# Doesn't look like python supports forward slash auto enclosure in re.findall
-# So convert it to inline flag "(?i)foobar" type configuration
-def perl_style_slash_enclosed_regex_to_options(regex):
-
-    res = re.search(PERL_STYLE_REGEX, regex, re.IGNORECASE)
-
-    if res:
-        flags = res.group(2) if res.group(2) else 'i'
-        regex = f"(?{flags}){res.group(1)}"
-    else:
-        # Fall back to just ignorecase as an option
-        regex = f"(?i){regex}"
-
-    return regex
-
+        
 # Given a CSS Rule, and a blob of HTML, return the blob of HTML that matches
 def include_filters(include_filters, html_content, append_pretty_line_formatting=False):
    soup = BeautifulSoup(html_content, "html.parser")
@@ -212,14 +195,23 @@ def strip_ignore_text(content, wordlist, mode="content"):
    output = []
    ignore_text = []
    ignore_regex = []
+
    ignored_line_numbers = []

    for k in wordlist:
        # Is it a regex?
-        res = re.search(PERL_STYLE_REGEX, k, re.IGNORECASE)
-        if res:
-            ignore_regex.append(re.compile(perl_style_slash_enclosed_regex_to_options(k)))
+        x = re.search('^\/(.*)\/(.*)', k.strip())
+        if x:
+            # Slash-enclosed, so try to compile the part between the slashes as a regex
+            p = x.group(1)
+            try:
+                # @Todo python regex options can go before the regex str, but not really many of the options apply on a per-line basis
+                ignore_regex.append(re.compile(rf"{p}", re.IGNORECASE))
+            except Exception as e:
+                # Badly formed regex, treat as text
+                ignore_text.append(k.strip())
        else:
+            # Not in /regex/ form, so treat it as plain text
            ignore_text.append(k.strip())

    for line in content.splitlines():
--- a/changedetectionio/processors/text_json_diff.py
+++ b/changedetectionio/processors/text_json_diff.py
@@ -11,19 +11,17 @@ from changedetectionio import content_fetcher, html_tools
 from changedetectionio.blueprint.price_data_follower import PRICE_DATA_TRACK_ACCEPT, PRICE_DATA_TRACK_REJECT
 from copy import deepcopy
 from . import difference_detection_processor
-from ..html_tools import PERL_STYLE_REGEX

 urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

-name = 'Webpage Text/HTML, JSON and PDF changes'
-description = 'Detects all text changes where possible'

+name =  'Webpage Text/HTML, JSON and PDF changes'
+description = 'Detects all text changes where possible'

 class FilterNotFoundInResponse(ValueError):
    def __init__(self, msg):
        ValueError.__init__(self, msg)

-
 class PDFToHTMLToolNotFound(ValueError):
    def __init__(self, msg):
        ValueError.__init__(self, msg)
@@ -39,6 +37,19 @@ class perform_site_check(difference_detection_processor):
        super().__init__(*args, **kwargs)
        self.datastore = datastore

+    # Python's re module has no /pattern/flags syntax, so rewrite "/foobar/i" into the
+    # inline-flag form "(?i)foobar"; anything not slash-enclosed just gets "(?i)" prepended.
+    # Flags lead the pattern because Python 3.11+ rejects global flags placed anywhere else.
+    def forward_slash_enclosed_regex_to_options(self, regex):
+        res = re.search(r'^/(.*?)/(\w+)$', regex, re.IGNORECASE)
+
+        if res:
+            # group(1) is the pattern between the slashes, group(2) the trailing flag letters
+            regex = '(?{}){}'.format(res.group(2), res.group(1))
+        else:
+            regex = '(?i){}'.format(regex)
+        return regex
+
    def run(self, uuid, skip_when_checksum_same=True, preferred_proxy=None):
        changed_detected = False
        screenshot = False  # as bytes
@@ -124,8 +135,7 @@ class perform_site_check(difference_detection_processor):
        # requests for PDF's, images etc should be passwd the is_binary flag
        is_binary = watch.is_pdf

-        fetcher.run(url, timeout, request_headers, request_body, request_method, ignore_status_codes, watch.get('include_filters'),
-                    is_binary=is_binary)
+        fetcher.run(url, timeout, request_headers, request_body, request_method, ignore_status_codes, watch.get('include_filters'), is_binary=is_binary)
        fetcher.quit()

        self.screenshot = fetcher.screenshot
@@ -141,6 +151,7 @@ class perform_site_check(difference_detection_processor):
            if update_obj['previous_md5_before_filters'] == watch.get('previous_md5_before_filters'):
                raise content_fetcher.checksumFromPreviousCheckWasTheSame()

+
        # Fetching complete, now filters
        # @todo move to class / maybe inside of fetcher abstract base?

@@ -220,6 +231,8 @@ class perform_site_check(difference_detection_processor):
                    stripped_text_from_html += html_tools.extract_json_as_string(content=fetcher.content, json_filter=filter)
                    is_html = False

+
+
        if is_html or is_source:

            # CSS Filter, extract the HTML that matches and feed that into the existing inscriptis::get_text
@@ -270,6 +283,7 @@ class perform_site_check(difference_detection_processor):
        # Re #340 - return the content before the 'ignore text' was applied
        text_content_before_ignored_filter = stripped_text_from_html.encode('utf-8')

+
        # @todo whitespace coming from missing rtrim()?
        # stripped_text_from_html could be based on their preferences, replace the processed text with only that which they want to know about.
        # Rewrite's the processing text based on only what diff result they want to see
@@ -279,13 +293,13 @@ class perform_site_check(difference_detection_processor):
            # needs to not include (added) etc or it may get used twice
            # Replace the processed text with the preferred result
            rendered_diff = diff.render_diff(previous_version_file_contents=watch.get_last_fetched_before_filters(),
-                                             newest_version_file_contents=stripped_text_from_html,
-                                             include_equal=False,  # not the same lines
-                                             include_added=watch.get('filter_text_added', True),
-                                             include_removed=watch.get('filter_text_removed', True),
-                                             include_replaced=watch.get('filter_text_replaced', True),
-                                             line_feed_sep="\n",
-                                             include_change_type_prefix=False)
+                                                       newest_version_file_contents=stripped_text_from_html,
+                                                       include_equal=False,  # not the same lines
+                                                       include_added=watch.get('filter_text_added', True),
+                                                       include_removed=watch.get('filter_text_removed', True),
+                                                       include_replaced=watch.get('filter_text_replaced', True),
+                                                       line_feed_sep="\n",
+                                                       include_change_type_prefix=False)

            watch.save_last_fetched_before_filters(text_content_before_ignored_filter)

@@ -300,12 +314,7 @@ class perform_site_check(difference_detection_processor):
        # Treat pages with no renderable text content as a change? No by default
        empty_pages_are_a_change = self.datastore.data['settings']['application'].get('empty_pages_are_a_change', False)
        if not is_json and not empty_pages_are_a_change and len(stripped_text_from_html.strip()) == 0:
-            raise content_fetcher.ReplyWithContentButNoText(url=url,
-                                                            status_code=fetcher.get_last_status_code(),
-                                                            screenshot=screenshot,
-                                                            has_filters=has_filter_rule,
-                                                            html_content=html_content
-                                                            )
+            raise content_fetcher.ReplyWithContentButNoText(url=url, status_code=fetcher.get_last_status_code(), screenshot=screenshot)

        # We rely on the actual text in the html output.. many sites have random script vars etc,
        # in the future we'll implement other mechanisms.
@@ -326,25 +335,16 @@ class perform_site_check(difference_detection_processor):
            regex_matched_output = []
            for s_re in extract_text:
                # incase they specified something in '/.../x'
-                if re.search(PERL_STYLE_REGEX, s_re, re.IGNORECASE):
-                    regex = html_tools.perl_style_slash_enclosed_regex_to_options(s_re)
-                    result = re.findall(regex.encode('utf-8'), stripped_text_from_html)
+                regex = self.forward_slash_enclosed_regex_to_options(s_re)
+                result = re.findall(regex.encode('utf-8'), stripped_text_from_html)

-                    for l in result:
-                        if type(l) is tuple:
-                            # @todo - some formatter option default (between groups)
-                            regex_matched_output += list(l) + [b'\n']
-                        else:
-                            # @todo - some formatter option default (between each ungrouped result)
-                            regex_matched_output += [l] + [b'\n']
-                else:
-                    # Doesnt look like regex, just hunt for plaintext and return that which matches
-                    # `stripped_text_from_html` will be bytes, so we must encode s_re also to bytes
-                    r = re.compile(re.escape(s_re.encode('utf-8')), re.IGNORECASE)
-                    res = r.findall(stripped_text_from_html)
-                    if res:
-                        for match in res:
-                            regex_matched_output += [match] + [b'\n']
+                for l in result:
+                    if type(l) is tuple:
+                        # @todo - some formatter option default (between groups)
+                        regex_matched_output += list(l) + [b'\n']
+                    else:
+                        # @todo - some formatter option default (between each ungrouped result)
+                        regex_matched_output += [l] + [b'\n']

            # Now we will only show what the regex matched
            stripped_text_from_html = b''
--- a/changedetectionio/res/stock-not-in-stock.js
+++ b/changedetectionio/res/stock-not-in-stock.js
@@ -5,19 +5,14 @@ function isItemInStock() {
    'agotado',
    'artikel zurzeit vergriffen',
    'as soon as stock is available',
-    'ausverkauft', // sold out
    'available for back order',
-    'back-order or out of stock',
    'backordered',
-    'benachrichtigt mich', // notify me
    'brak na stanie',
    'brak w magazynie',
    'coming soon',
    'currently have any tickets for this',
    'currently unavailable',
-    'dostępne wkrótce',
    'en rupture de stock',
-    'ist derzeit nicht auf lager',
    'item is no longer available',
    'message if back in stock',
    'nachricht bei',
@@ -42,7 +37,6 @@ function isItemInStock() {
    'unavailable tickets',
    'we do not currently have an estimate of when this product will be back in stock.',
    'zur zeit nicht an lager',
-    '已售完',
  ];


--- a/changedetectionio/static/js/browser-steps.js
+++ b/changedetectionio/static/js/browser-steps.js
@@ -208,7 +208,7 @@ $(document).ready(function () {
            console.log(x);
            if (x && first_available.length) {
                // @todo will it let you click shit that has a layer ontop? probably not.
-                if (x['tagtype'] === 'text' || x['tagtype'] === 'number' || x['tagtype'] === 'email' || x['tagName'] === 'textarea' || x['tagtype'] === 'password' || x['tagtype'] === 'search') {
+                if (x['tagtype'] === 'text' || x['tagtype'] === 'email' || x['tagName'] === 'textarea' || x['tagtype'] === 'password' || x['tagtype'] === 'search') {
                    $('select', first_available).val('Enter text in field').change();
                    $('input[type=text]', first_available).first().val(x['xpath']);
                    $('input[placeholder="Value"]', first_available).addClass('ok').click().focus();
--- a/changedetectionio/static/styles/scss/parts/_browser-steps.scss
+++ b/changedetectionio/static/styles/scss/parts/_browser-steps.scss
@@ -44,7 +44,7 @@
 #browser-steps .flex-wrapper {
  display: flex;
  flex-flow: row;
-  height: 70vh;
+  height: 600px; /*@todo make this dynamic */
 }

 /*  this is duplicate :( */
--- a/changedetectionio/static/styles/styles.css
+++ b/changedetectionio/static/styles/styles.css
@@ -50,7 +50,8 @@
 #browser-steps .flex-wrapper {
  display: flex;
  flex-flow: row;
-  height: 70vh; }
+  height: 600px;
+  /*@todo make this dynamic */ }

 /*  this is duplicate :( */
 #browsersteps-selector-wrapper {
--- a/changedetectionio/templates/edit.html
+++ b/changedetectionio/templates/edit.html
@@ -378,16 +378,15 @@ Unavailable") }}
                        {{ render_field(form.extract_text, rows=5, placeholder="\d+ online") }}
                        <span class="pure-form-message-inline">
                    <ul>
-                        <li>Extracts text in the final output (line by line) after other filters using regular expressions or string match;
+                        <li>Extracts text in the final output (line by line) after other filters using regular expressions;
                            <ul>
                                <li>Regular expression &dash; example <code>/reports.+?2022/i</code></li>
-                                <li>Don't forget to consider the white-space at the start of a line <code>/.+?reports.+?2022/i</code></li>
                                <li>Use <code>//(?aiLmsux))</code> type flags (more <a href="https://docs.python.org/3/library/re.html#index-15">information here</a>)<br></li>
                                <li>Keyword example &dash; example <code>Out of stock</code></li>
                                <li>Use groups to extract just that text &dash; example <code>/reports.+?(\d+)/i</code> returns a list of years only</li>
                            </ul>
                        </li>
-                        <li>One line per regular-expression/string match</li>
+                        <li>One line per regular-expression/string match</li>
                    </ul>
                        </span>
                    </div>
--- a/changedetectionio/templates/watch-overview.html
+++ b/changedetectionio/templates/watch-overview.html
@@ -119,9 +119,6 @@
                            <a href="{{ url_for('settings_page', uuid=watch.uuid) }}#proxies">Try adding external proxies/locations</a>
                        
                        {% endif %}
-                        {% if 'empty result or contain only an image' in watch.last_error %}
-                            <a href="https://github.com/dgtlmoon/changedetection.io/wiki/Detecting-changes-in-images">more help here</a>.
-                        {% endif %}
                    </div>
                    {% endif %}
                    {% if watch.last_notification_error is defined and watch.last_notification_error != False %}
--- a/changedetectionio/tests/test_css_selector.py
+++ b/changedetectionio/tests/test_css_selector.py
@@ -2,7 +2,7 @@

 import time
 from flask import url_for
-from .util import live_server_setup, wait_for_all_checks
+from . util import live_server_setup

 from ..html_tools import *

@@ -176,77 +176,3 @@ def test_check_multiple_filters(client, live_server):
    assert b"Blob A" in res.data # CSS was ok
    assert b"Blob B" in res.data # xPath was ok
    assert b"Blob C" not in res.data # Should not be included
-
-# The filter exists, but did not contain anything useful
-# Mainly used when the filter contains just an IMG, this can happen when someone selects an image in the visual-selector
-# Tests fetcher can throw a "ReplyWithContentButNoText" exception after applying filter and extracting text
-def test_filter_is_empty_help_suggestion(client, live_server):
-    #live_server_setup(live_server)
-
-    include_filters = "#blob-a"
-
-    with open("test-datastore/endpoint-content.txt", "w") as f:
-        f.write("""<html><body>
-         <div id="blob-a">
-           <img src="something.jpg">
-         </div>
-         </body>
-         </html>
-        """)
-
-
-    # Add our URL to the import page
-    test_url = url_for('test_endpoint', _external=True)
-    res = client.post(
-        url_for("import_page"),
-        data={"urls": test_url},
-        follow_redirects=True
-    )
-    assert b"1 Imported" in res.data
-    wait_for_all_checks(client)
-
-    # Goto the edit page, add our ignore text
-    # Add our URL to the import page
-    res = client.post(
-        url_for("edit_page", uuid="first"),
-        data={"include_filters": include_filters,
-              "url": test_url,
-              "tags": "",
-              "headers": "",
-              'fetch_backend': "html_requests"},
-        follow_redirects=True
-    )
-    assert b"Updated watch." in res.data
-
-    wait_for_all_checks(client)
-
-
-    res = client.get(
-        url_for("index"),
-        follow_redirects=True
-    )
-
-    assert b'empty result or contain only an image' in res.data
-
-
-    ### Just an empty selector, no image
-
-    with open("test-datastore/endpoint-content.txt", "w") as f:
-        f.write("""<html><body>
-         <div id="blob-a">
-           <!-- doo doo -->
-         </div>
-         </body>
-         </html>
-        """)
-
-    res = client.get(url_for("form_watch_checknow"), follow_redirects=True)
-    wait_for_all_checks(client)
-
-    res = client.get(
-        url_for("index"),
-        follow_redirects=True
-    )
-
-    assert b'empty result or contain only an image' not in res.data
-    assert b'but contained no usable text' in res.data
--- a/changedetectionio/tests/test_extract_regex.py
+++ b/changedetectionio/tests/test_extract_regex.py
@@ -2,7 +2,7 @@

 import time
 from flask import url_for
-from .util import live_server_setup, wait_for_all_checks
+from .util import live_server_setup

 from ..html_tools import *

@@ -55,8 +55,6 @@ def set_multiline_response():
     </p>
     
     <div>aaand something lines</div>
-     <br>
-     <div>and this should be</div>
     </body>
     </html>
    """
@@ -68,10 +66,11 @@ def set_multiline_response():


 def test_setup(client, live_server):
+
    live_server_setup(live_server)

 def test_check_filter_multiline(client, live_server):
-    #live_server_setup(live_server)
+
    set_multiline_response()

    # Add our URL to the import page
@@ -83,15 +82,14 @@ def test_check_filter_multiline(client, live_server):
    )
    assert b"1 Imported" in res.data

-    wait_for_all_checks(client)
+    time.sleep(3)

    # Goto the edit page, add our ignore text
    # Add our URL to the import page
    res = client.post(
        url_for("edit_page", uuid="first"),
        data={"include_filters": '',
-              # Test a regex and a plaintext
-              'extract_text': '/something.+?6 billion.+?lines/si\r\nand this should be',
+              'extract_text': '/something.+?6 billion.+?lines/si',
              "url": test_url,
              "tags": "",
              "headers": "",
@@ -101,19 +99,13 @@ def test_check_filter_multiline(client, live_server):
    )

    assert b"Updated watch." in res.data
-    wait_for_all_checks(client)
-
-    res = client.get(url_for("index"))
-
-    # Issue 1828
-    assert b'not at the start of the expression' not in res.data
+    time.sleep(3)

    res = client.get(
        url_for("preview_page", uuid="first"),
        follow_redirects=True
    )
-    # Plaintext that doesnt look like a regex should match also
-    assert b'and this should be' in res.data
+

    assert b'<div class="">Something' in res.data
    assert b'<div class="">across 6 billion multiple' in res.data
@@ -123,11 +115,14 @@ def test_check_filter_multiline(client, live_server):
    assert b'aaand something lines' not in res.data

 def test_check_filter_and_regex_extract(client, live_server):
-    
+    sleep_time_for_fetch_thread = 3
    include_filters = ".changetext"

    set_original_response()

+    # Give the endpoint time to spin up
+    time.sleep(1)
+
    # Add our URL to the import page
    test_url = url_for('test_endpoint', _external=True)
    res = client.post(
@@ -137,15 +132,19 @@ def test_check_filter_and_regex_extract(client, live_server):
    )
    assert b"1 Imported" in res.data

+    time.sleep(1)
+    # Trigger a check
+    client.get(url_for("form_watch_checknow"), follow_redirects=True)
+
    # Give the thread time to pick it up
-    wait_for_all_checks(client)
+    time.sleep(sleep_time_for_fetch_thread)

    # Goto the edit page, add our ignore text
    # Add our URL to the import page
    res = client.post(
        url_for("edit_page", uuid="first"),
        data={"include_filters": include_filters,
-              'extract_text': '/\d+ online/\r\n/\d+ guests/\r\n/somecase insensitive \d+/i\r\n/somecase insensitive (345\d)/i\r\n/issue1828.+?2022/i',
+              'extract_text': '\d+ online\r\n\d+ guests\r\n/somecase insensitive \d+/i\r\n/somecase insensitive (345\d)/i',
              "url": test_url,
              "tags": "",
              "headers": "",
@@ -156,13 +155,8 @@ def test_check_filter_and_regex_extract(client, live_server):

    assert b"Updated watch." in res.data

-
    # Give the thread time to pick it up
-    wait_for_all_checks(client)
-
-    res = client.get(url_for("index"))
-    #issue 1828
-    assert b'not at the start of the expression' not in res.data
+    time.sleep(sleep_time_for_fetch_thread)

    #  Make a change
    set_modified_response()
@@ -170,7 +164,7 @@ def test_check_filter_and_regex_extract(client, live_server):
    # Trigger a check
    client.get(url_for("form_watch_checknow"), follow_redirects=True)
    # Give the thread time to pick it up
-    wait_for_all_checks(client)
+    time.sleep(sleep_time_for_fetch_thread)

    # It should have 'unviewed' still
    # Because it should be looking at only that 'sametext' id
--- a/changedetectionio/tests/test_trigger_regex.py
+++ b/changedetectionio/tests/test_trigger_regex.py
@@ -2,7 +2,7 @@

 import time
 from flask import url_for
-from .util import live_server_setup, wait_for_all_checks
+from . util import live_server_setup


 def set_original_ignore_response():
@@ -26,8 +26,13 @@ def test_trigger_regex_functionality(client, live_server):

    live_server_setup(live_server)

+    sleep_time_for_fetch_thread = 3
+
    set_original_ignore_response()

+    # Give the endpoint time to spin up
+    time.sleep(1)
+
    # Add our URL to the import page
    test_url = url_for('test_endpoint', _external=True)
    res = client.post(
@@ -38,7 +43,7 @@ def test_trigger_regex_functionality(client, live_server):
    assert b"1 Imported" in res.data

    # Give the thread time to pick it up
-    wait_for_all_checks(client)
+    time.sleep(sleep_time_for_fetch_thread)

    # It should report nothing found (just a new one shouldnt have anything)
    res = client.get(url_for("index"))
@@ -52,7 +57,7 @@ def test_trigger_regex_functionality(client, live_server):
              "fetch_backend": "html_requests"},
        follow_redirects=True
    )
-    wait_for_all_checks(client)
+    time.sleep(sleep_time_for_fetch_thread)
    # so that we set the state to 'unviewed' after all the edits
    client.get(url_for("diff_history_page", uuid="first"))

@@ -60,7 +65,7 @@ def test_trigger_regex_functionality(client, live_server):
        f.write("some new noise")

    client.get(url_for("form_watch_checknow"), follow_redirects=True)
-    wait_for_all_checks(client)
+    time.sleep(sleep_time_for_fetch_thread)

    # It should report nothing found (nothing should match the regex)
    res = client.get(url_for("index"))
@@ -70,7 +75,7 @@ def test_trigger_regex_functionality(client, live_server):
        f.write("regex test123<br>\nsomething 123")

    client.get(url_for("form_watch_checknow"), follow_redirects=True)
-    wait_for_all_checks(client)
+    time.sleep(sleep_time_for_fetch_thread)
    res = client.get(url_for("index"))
    assert b'unviewed' in res.data

--- a/changedetectionio/update_worker.py
+++ b/changedetectionio/update_worker.py
@@ -3,7 +3,7 @@ import threading
 import queue
 import time

-from changedetectionio import content_fetcher, html_tools
+from changedetectionio import content_fetcher
 from .processors.text_json_diff import FilterNotFoundInResponse
 from .processors.restock_diff import UnableToExtractRestockData

@@ -251,20 +251,7 @@ class update_worker(threading.Thread):
                        # Totally fine, it's by choice - just continue on, nothing more to care about
                        # Page had elements/content but no renderable text
                        # Backend (not filters) gave zero output
-                        extra_help = ""
-                        if e.has_filters:
-                            # Maybe it contains an image? offer a more helpful link
-                            has_img = html_tools.include_filters(include_filters='img',
-                                                                 html_content=e.html_content)
-                            if has_img:
-                                extra_help = ", it's possible that the filters you have give an empty result or contain only an image."
-                            else:
-                                extra_help = ", it's possible that the filters were found, but contained no usable text."
-
-                        self.datastore.update_watch(uuid=uuid, update_obj={
-                            'last_error': f"Got HTML content but no text found (With {e.status_code} reply code){extra_help}"
-                        })
-
+                        self.datastore.update_watch(uuid=uuid, update_obj={'last_error': "Got HTML content but no text found (With {} reply code).".format(e.status_code)})
                        if e.screenshot:
                            self.datastore.save_screenshot(watch_uuid=uuid, screenshot=e.screenshot)
                        process_changedetection_results = False
Author	SHA1	Message	Date
dgtlmoon	9111710c83	undo change	2023-09-23 17:21:35 +02:00
dgtlmoon	ce1d4b039c	Also make user-agent detect string case insensitive	2023-09-23 16:36:14 +02:00
dgtlmoon	e1f19a3265	Dont set user agent default if none is set	2023-09-23 14:34:56 +02:00