Merge branch 'master' into browsersteps-requests

UI - Show error/warning when trying to compare the same version
2025-10-31 14:47:21 +00:00 · 2024-02-01 10:37:21 +01:00 · 2024-02-01 10:36:43 +01:00 · 2024-02-01 10:33:03 +01:00 · 2024-02-01 00:09:27 +01:00 · 2024-01-31 23:37:23 +01:00
15 changed files with 106 additions and 39 deletions
--- a/.github/workflows/pypi-release.yml
+++ b/.github/workflows/pypi-release.yml
@@ -40,12 +40,12 @@ jobs:
        path: dist/
    - name: Test that the basic pip built package runs without error
      run: |
-        set -e
+        set -ex
        pip3 install dist/changedetection.io*.whl
        changedetection.io -d /tmp -p 10000 &
        sleep 3
-        curl http://127.0.0.1:10000/static/styles/pure-min.css >/dev/null
-        curl http://127.0.0.1:10000/ >/dev/null
+        curl --retry-connrefused --retry 6 http://127.0.0.1:10000/static/styles/pure-min.css >/dev/null
+        curl --retry-connrefused --retry 6 http://127.0.0.1:10000/ >/dev/null
        killall changedetection.io


--- a/.github/workflows/test-only.yml
+++ b/.github/workflows/test-only.yml
@@ -27,7 +27,7 @@ jobs:
        run: |
          
          docker network create changedet-network
-
+          
          # Selenium+browserless
          docker run --network changedet-network -d --hostname selenium  -p 4444:4444 --rm --shm-size="2g"  selenium/standalone-chrome:4
          docker run --network changedet-network -d --name browserless --hostname browserless -e "FUNCTION_BUILT_INS=[\"fs\",\"crypto\"]" -e "DEFAULT_LAUNCH_ARGS=[\"--window-size=1920,1080\"]" --rm  -p 3000:3000  --shm-size="2g"  browserless/chrome:1.60-chrome-stable
@@ -47,7 +47,7 @@ jobs:
          # Debug SMTP server/echo message back server
          docker run --network changedet-network -d -p 11025:11025 -p 11080:11080  --hostname mailserver test-changedetectionio  bash -c 'python changedetectionio/tests/smtp/smtp-test-server.py' 

-      - name: Test built container with pytest
+      - name: Test built container with Pytest (generally as requests/plaintext fetching)
        run: |
          # Unit tests
          echo "run test with unittest"
@@ -61,20 +61,32 @@ jobs:
          # append the docker option. e.g. '-e LOGGER_LEVEL=DEBUG'
          docker run --network changedet-network  test-changedetectionio  bash -c 'cd changedetectionio && ./run_basic_tests.sh'

-      - name: Test built container selenium+browserless/playwright
+      - name: Specific tests in built container for Selenium
        run: |
          
          # Selenium fetch
          docker run --rm -e "WEBDRIVER_URL=http://selenium:4444/wd/hub" --network changedet-network test-changedetectionio  bash -c 'cd changedetectionio;pytest tests/fetchers/test_content.py && pytest tests/test_errorhandling.py'
-          
+
+      - name: Specific tests in built container for Playwright
+        run: |         
          # Playwright/Browserless fetch
          docker run --rm -e "PLAYWRIGHT_DRIVER_URL=ws://browserless:3000" --network changedet-network test-changedetectionio  bash -c 'cd changedetectionio;pytest tests/fetchers/test_content.py && pytest tests/test_errorhandling.py && pytest tests/visualselector/test_fetch_data.py'
-          
+
+      - name: Specific tests in built container for headers and requests checks with Playwright
+        run: |                  
          # Settings headers playwright tests - Call back in from Browserless, check headers
          docker run --name "changedet" --hostname changedet --rm -e "FLASK_SERVER_NAME=changedet" -e "PLAYWRIGHT_DRIVER_URL=ws://browserless:3000?dumpio=true" --network changedet-network test-changedetectionio  bash -c 'cd changedetectionio; pytest --live-server-host=0.0.0.0  --live-server-port=5004 tests/test_request.py'
+
+      - name: Specific tests in built container for headers and requests checks with Selenium
+        run: |                  
          docker run --name "changedet" --hostname changedet --rm -e "FLASK_SERVER_NAME=changedet" -e "WEBDRIVER_URL=http://selenium:4444/wd/hub" --network changedet-network test-changedetectionio  bash -c 'cd changedetectionio; pytest --live-server-host=0.0.0.0  --live-server-port=5004 tests/test_request.py'
+
+      - name: Specific tests in built container with Playwright as Puppeteer experimental fetcher
+        run: |                  
          docker run --name "changedet" --hostname changedet --rm -e "FLASK_SERVER_NAME=changedet" -e "USE_EXPERIMENTAL_PUPPETEER_FETCH=yes" -e "PLAYWRIGHT_DRIVER_URL=ws://browserless:3000?dumpio=true" --network changedet-network test-changedetectionio  bash -c 'cd changedetectionio; pytest --live-server-host=0.0.0.0  --live-server-port=5004 tests/test_request.py'          
-          
+
+      - name: Test built container restock detection via Playwright
+        run: |                            
          # restock detection via playwright - added name=changedet here so that playwright/browserless can connect to it
          docker run --rm --name "changedet" -e "FLASK_SERVER_NAME=changedet" -e "PLAYWRIGHT_DRIVER_URL=ws://browserless:3000" --network changedet-network test-changedetectionio  bash -c 'cd changedetectionio;pytest --live-server-port=5004 --live-server-host=0.0.0.0 tests/restock/test_restock.py'

@@ -106,10 +118,10 @@ jobs:
          docker run --name test-changedetectionio -p 5556:5000  -d test-changedetectionio
          sleep 3
          # Should return 0 (no error) when grep finds it
-          curl -s http://localhost:5556 |grep -q checkbox-uuid
+          curl --retry-connrefused --retry 6  -s http://localhost:5556 |grep -q checkbox-uuid
          
          # and IPv6
-          curl -s -g -6 "http://[::1]:5556"|grep -q checkbox-uuid
+          curl --retry-connrefused --retry 6  -s -g -6 "http://[::1]:5556"|grep -q checkbox-uuid

          # Check whether TRACE log is enabled.
          # Also, check whether TRACE is came from STDERR
--- a/changedetectionio/blueprint/price_data_follower/init.py
+++ b/changedetectionio/blueprint/price_data_follower/init.py
@@ -18,8 +18,7 @@ def construct_blueprint(datastore: ChangeDetectionStore, update_q: PriorityQueue
    def accept(uuid):
        datastore.data['watching'][uuid]['track_ldjson_price_data'] = PRICE_DATA_TRACK_ACCEPT
        update_q.put(queuedWatchMetaData.PrioritizedItem(priority=1, item={'uuid': uuid, 'skip_when_checksum_same': False}))
-        return redirect(url_for("form_watch_checknow", uuid=uuid))
-
+        return redirect(url_for("index"))

    @login_required
    @price_data_follower_blueprint.route("/<string:uuid>/reject", methods=['GET'])
--- a/changedetectionio/content_fetcher.py
+++ b/changedetectionio/content_fetcher.py
@@ -51,6 +51,7 @@ class BrowserStepsStepException(Exception):
        return


+# @todo - make base Exception class that announces via logger()
 class PageUnloadable(Exception):
    def __init__(self, status_code, url, message, screenshot=False):
        # Set this so we can use it in other parts of the app
@@ -60,6 +61,10 @@ class PageUnloadable(Exception):
        self.message = message
        return

+class BrowserStepsInUnsupportedFetcher(Exception):
+    def __init__(self, url):
+        self.url = url
+        return

 class EmptyReply(Exception):
    def __init__(self, status_code, url, screenshot=None):
@@ -389,10 +394,24 @@ class base_html_playwright(Fetcher):
            raise PageUnloadable(url=url, status_code=None, message=f"Timed out connecting to browserless, retrying..")
        else:
            # 200 Here means that the communication to browserless worked only, not the page state
-            if response.status_code == 200:
+            try:
+                x = response.json()
+            except Exception as e:
+                raise PageUnloadable(url=url, message="Error reading JSON response from browserless")
+
+            try:
+                self.status_code = response.status_code
+            except Exception as e:
+                raise PageUnloadable(url=url, message="Error reading status_code code response from browserless")
+
+            self.headers = x.get('headers')
+
+            if self.status_code != 200 and not ignore_status_codes:
+                raise Non200ErrorCodeReceived(url=url, status_code=self.status_code, page_html=x.get('content',''))
+
+            if self.status_code == 200:
                import base64

-                x = response.json()
                if not x.get('screenshot'):
                    # https://github.com/puppeteer/puppeteer/blob/v1.0.0/docs/troubleshooting.md#tips
                    # https://github.com/puppeteer/puppeteer/issues/1834
@@ -403,16 +422,10 @@ class base_html_playwright(Fetcher):
                if not x.get('content', '').strip():
                    raise EmptyReply(url=url, status_code=None)

-                if x.get('status_code', 200) != 200 and not ignore_status_codes:
-                    raise Non200ErrorCodeReceived(url=url, status_code=x.get('status_code', 200), page_html=x['content'])
-
                self.content = x.get('content')
-                self.headers = x.get('headers')
                self.instock_data = x.get('instock_data')
                self.screenshot = base64.b64decode(x.get('screenshot'))
-                self.status_code = x.get('status_code')
                self.xpath_data = x.get('xpath_data')
-
            else:
                # Some other error from browserless
                raise PageUnloadable(url=url, status_code=None, message=response.content.decode('utf-8'))
@@ -703,6 +716,9 @@ class html_requests(Fetcher):
            current_include_filters=None,
            is_binary=False):

+        if self.browser_steps_get_valid_steps():
+            raise BrowserStepsInUnsupportedFetcher(url=url)
+
        # Make requests use a more modern looking user-agent
        if not {k.lower(): v for k, v in request_headers.items()}.get('user-agent', None):
            request_headers['User-Agent'] = os.getenv("DEFAULT_SETTINGS_HEADERS_USERAGENT",
@@ -742,6 +758,8 @@ class html_requests(Fetcher):
                if encoding:
                    r.encoding = encoding

+        self.headers = r.headers
+
        if not r.content or not len(r.content):
            raise EmptyReply(url=url, status_code=r.status_code)

@@ -758,7 +776,7 @@ class html_requests(Fetcher):
        else:
            self.content = r.text

-        self.headers = r.headers
+
        self.raw_content = r.content


--- a/changedetectionio/flask_app.py
+++ b/changedetectionio/flask_app.py
@@ -13,7 +13,6 @@ from threading import Event
 import datetime
 import flask_login
 from loguru import logger
-import sys
 import os
 import pytz
 import queue
@@ -317,6 +316,9 @@ def changedetection_app(config=None, datastore_o=None):

    @app.route("/rss", methods=['GET'])
    def rss():
+        from jinja2 import Environment, BaseLoader
+        jinja2_env = Environment(loader=BaseLoader)
+        now = time.time()
        # Always requires token set
        app_rss_token = datastore.data['settings']['application'].get('rss_access_token')
        rss_url_token = request.args.get('token')
@@ -380,8 +382,12 @@ def changedetection_app(config=None, datastore_o=None):
                                             include_equal=False,
                                             line_feed_sep="<br>")

-                fe.content(content="<html><body><h4>{}</h4>{}</body></html>".format(watch_title, html_diff),
-                           type='CDATA')
+                # @todo Make this configurable and also consider html-colored markup
+                # @todo User could decide if <link> goes to the diff page, or to the watch link
+                rss_template = "<html><body>\n<h4><a href=\"{{watch_url}}\">{{watch_title}}</a></h4>\n<p>{{html_diff}}</p>\n</body></html>\n"
+                content = jinja2_env.from_string(rss_template).render(watch_title=watch_title, html_diff=html_diff, watch_url=watch.link)
+
+                fe.content(content=content, type='CDATA')

                fe.guid(guid, permalink=False)
                dt = datetime.datetime.fromtimestamp(int(watch.newest_history_key))
@@ -390,6 +396,7 @@ def changedetection_app(config=None, datastore_o=None):

        response = make_response(fg.rss_str())
        response.headers.set('Content-Type', 'application/rss+xml;charset=utf-8')
+        logger.trace(f"RSS generated in {time.time() - now:.3f}s")
        return response

    @app.route("/", methods=['GET'])
--- a/changedetectionio/model/Watch.py
+++ b/changedetectionio/model/Watch.py
@@ -56,6 +56,7 @@ base_config = {
    'previous_md5': False,
    'previous_md5_before_filters': False,  # Used for skipping changedetection entirely
    'proxy': None,  # Preferred proxy connection
+    'remote_server_reply': None, # From 'server' reply header
    'subtractive_selectors': [],
    'tag': '', # Old system of text name for a tag, to be removed
    'tags': [], # list of UUIDs to App.Tags
--- a/changedetectionio/static/js/diff-overview.js
+++ b/changedetectionio/static/js/diff-overview.js
@@ -90,5 +90,10 @@ $(document).ready(function () {
        }
    }

-
+    $('#diff-form').on('submit', function (e) {
+        if ($('select[name=from_version]').val() === $('select[name=to_version]').val()) {
+            e.preventDefault();
+            alert('Error - You are trying to compare the same version.');
+        }
+    });
 });
--- a/changedetectionio/store.py
+++ b/changedetectionio/store.py
@@ -255,6 +255,7 @@ class ChangeDetectionStore:
                'last_viewed': 0,
                'previous_md5': False,
                'previous_md5_before_filters': False,
+                'remote_server_reply': None,
                'track_ldjson_price_data': None,
            })

--- a/changedetectionio/templates/diff.html
+++ b/changedetectionio/templates/diff.html
@@ -13,7 +13,7 @@
 <script src="{{url_for('static_content', group='js', filename='diff-overview.js')}}" defer></script>

 <div id="settings">
-    <form class="pure-form " action="" method="GET">
+    <form class="pure-form " action="" method="GET" id="diff-form">
        <fieldset>
            {% if versions|length >= 1 %}
                <strong>Compare</strong>
--- a/changedetectionio/templates/edit.html
+++ b/changedetectionio/templates/edit.html
@@ -401,6 +401,7 @@ Unavailable") }}
                                <li>Use <code>//(?aiLmsux))</code> type flags (more <a href="https://docs.python.org/3/library/re.html#index-15">information here</a>)<br></li>
                                <li>Keyword example &dash; example <code>Out of stock</code></li>
                                <li>Use groups to extract just that text &dash; example <code>/reports.+?(\d+)/i</code> returns a list of years only</li>
+                                <li>Example - match lines containing a keyword <code>/.*icecream.*/</code></li>
                            </ul>
                        </li>
                        <li>One line per regular-expression/string match</li>
--- a/changedetectionio/tests/test_api.py
+++ b/changedetectionio/tests/test_api.py
@@ -163,6 +163,7 @@ def test_api_simple(client, live_server):
    # Loading the most recent snapshot should force viewed to become true
    client.get(url_for("diff_history_page", uuid="first"), follow_redirects=True)

+    time.sleep(3)
    # Fetch the whole watch again, viewed should be true
    res = client.get(
        url_for("watch", uuid=watch_uuid),
--- a/changedetectionio/tests/test_request.py
+++ b/changedetectionio/tests/test_request.py
@@ -10,7 +10,7 @@ def test_setup(live_server):
 # Hard to just add more live server URLs when one test is already running (I think)
 # So we add our test here (was in a different file)
 def test_headers_in_request(client, live_server):
-    #live_server_setup(live_server)
+    #live_server_setup(live_server)
    # Add our URL to the import page
    test_url = url_for('test_headers', _external=True)
    if os.getenv('PLAYWRIGHT_DRIVER_URL'):
@@ -70,16 +70,17 @@ def test_headers_in_request(client, live_server):

    wait_for_all_checks(client)

-    # Re #137 -  Examine the JSON index file, it should have only one set of headers entered
+    # Re #137 -  It should have only one set of headers entered
    watches_with_headers = 0
-    with open('test-datastore/url-watches.json') as f:
-        app_struct = json.load(f)
-        for uuid in app_struct['watching']:
-            if (len(app_struct['watching'][uuid]['headers'])):
+    for k, watch in client.application.config.get('DATASTORE').data.get('watching').items():
+            if (len(watch['headers'])):
                watches_with_headers += 1
+    assert watches_with_headers == 1
+
+    # 'server' http header was automatically recorded
+    for k, watch in client.application.config.get('DATASTORE').data.get('watching').items():
+        assert 'custom' in watch.get('remote_server_reply') # added in util.py

-    # Should be only one with headers set
-    assert watches_with_headers==1
    res = client.get(url_for("form_delete", uuid="all"), follow_redirects=True)
    assert b'Deleted' in res.data

--- a/changedetectionio/tests/util.py
+++ b/changedetectionio/tests/util.py
@@ -175,12 +175,16 @@ def live_server_setup(live_server):
    @live_server.app.route('/test-headers')
    def test_headers():

-        output= []
+        output = []

        for header in request.headers:
-             output.append("{}:{}".format(str(header[0]),str(header[1])   ))
+            output.append("{}:{}".format(str(header[0]), str(header[1])))

-        return "\n".join(output)
+        content = "\n".join(output)
+
+        resp = make_response(content, 200)
+        resp.headers['server'] = 'custom'
+        return resp

    # Just return the body in the request
    @live_server.app.route('/test-body', methods=['POST', 'GET'])
--- a/changedetectionio/update_worker.py
+++ b/changedetectionio/update_worker.py
@@ -430,6 +430,12 @@ class update_worker(threading.Thread):
                                                                           'last_check_status': e.status_code,
                                                                           'has_ldjson_price_data': None})
                        process_changedetection_results = False
+                    except content_fetcher.BrowserStepsInUnsupportedFetcher as e:
+                        err_text = "This watch has Browser Steps configured and so it cannot run with the 'Basic fast Plaintext/HTTP Client', either remove the Browser Steps or select a Chrome fetcher."
+                        self.datastore.update_watch(uuid=uuid, update_obj={'last_error': err_text})
+                        process_changedetection_results = False
+                        logger.error(f"Exception (BrowserStepsInUnsupportedFetcher) reached processing watch UUID: {uuid}")
+
                    except UnableToExtractRestockData as e:
                        # Usually when fetcher.instock_data returns empty
                        logger.error(f"Exception (UnableToExtractRestockData) reached processing watch UUID: {uuid}")
@@ -491,6 +497,16 @@ class update_worker(threading.Thread):
                    if self.datastore.data['watching'].get(uuid):
                        # Always record that we atleast tried
                        count = self.datastore.data['watching'][uuid].get('check_count', 0) + 1
+
+                        # Record the 'server' reply header; it can be used in the future for actions such as Cloudflare/Akamai workarounds
+                        try:
+                            server_header = update_handler.fetcher.headers.get('server', '').strip().lower()[:255]
+                            self.datastore.update_watch(uuid=uuid,
+                                                        update_obj={'remote_server_reply': server_header}
+                                                        )
+                        except Exception as e:
+                            pass
+
                        self.datastore.update_watch(uuid=uuid, update_obj={'fetch_time': round(time.time() - now, 3),
                                                                           'last_checked': round(time.time()),
                                                                           'check_count': count
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -94,7 +94,8 @@ services:
 #

     # Used for fetching pages via Playwright+Chrome where you need Javascript support.
-     # Note: works well but is deprecated, does not fetch full page screenshots (doesnt work with Visual Selector) and other issues
+     # Note: Works well but is deprecated; it does not fetch full-page screenshots (so it doesn't work with Visual Selector),
+     #       does not report status codes (200, 404, 403), and has other issues.
     # More information about the advantages of playwright/browserless https://www.browserless.io/blog/2023/12/13/migrating-selenium-to-playwright/
 #    browser-chrome:
 #        hostname: browser-chrome
Author	SHA1	Message	Date
dgtlmoon	5d1ecaed94	Merge branch 'master' into browsersteps-requests	2024-02-01 10:37:21 +01:00
dgtlmoon	4b49759113	UI - Show error/warning when trying to compare the same version	2024-02-01 10:36:43 +01:00
dgtlmoon	1945a59a72	UI - Show error/warning when trying to compare the same version	2024-02-01 10:33:03 +01:00
dgtlmoon	e9a9790cb0	Fetching - Make an obvious error when using BrowserSteps with the simple text fetcher (#2145 )	2024-02-01 00:09:27 +01:00
dgtlmoon	520650e2e6	Make an obvious error when using BrowserSteps with the simple text fetcher	2024-01-31 23:37:23 +01:00
dgtlmoon	593660e2f6	Fix for switching to price-data-follower mode (when page has JSON price data), only needs to be queued once. Re #1565	2024-01-31 22:39:24 +01:00
dgtlmoon	7d96b4ba83	Fetching - Always record `server` software reply headers (will be used in the future) (#2143 )	2024-01-31 16:15:43 +01:00
dgtlmoon	fca40e4d5b	Testing - General test workflow improvements (#2144 )	2024-01-31 15:10:44 +01:00
dgtlmoon	66e2dfcead	RSS - Include link to the watched URL in the feed (#2139 #2131 and #327 )	2024-01-29 16:26:14 +01:00
dgtlmoon	bce7eb68fb	Notifications - skip empty notification URLs from being processed (#2138 )	2024-01-29 14:20:39 +01:00
dgtlmoon	93c0385119	UI - Filters & Triggers - Adding example for keyword matching in a line	2024-01-29 14:18:14 +01:00
dgtlmoon	e17f3be739	RSS - Adding performance stats	2024-01-29 13:05:11 +01:00