Merge branch 'master' into total-bytes-counter

0.46.04
Fixing restock monitor tests and tweaking docker default config example,
2026-06-14 04:42:06 +00:00 · 2024-09-05 11:27:08 +02:00 · 2024-09-04 13:55:18 +02:00 · 2024-09-02 15:11:31 +02:00 · 2024-09-02 13:21:38 +02:00 · 2024-09-01 13:07:06 +02:00
22 changed files with 185 additions and 105 deletions
@@ -2,7 +2,7 @@

 # Read more https://github.com/dgtlmoon/changedetection.io/wiki

-__version__ = '0.46.01'
+__version__ = '0.46.04'

 from changedetectionio.strtobool import strtobool
 from json.decoder import JSONDecodeError
@@ -85,7 +85,8 @@ def construct_blueprint(datastore: ChangeDetectionStore):
        browsersteps_start_session['browserstepper'] = browser_steps.browsersteps_live_ui(
            playwright_browser=browsersteps_start_session['browser'],
            proxy=proxy,
-            start_url=datastore.data['watching'][watch_uuid].get('url')
+            start_url=datastore.data['watching'][watch_uuid].get('url'),
+            headers=datastore.data['watching'][watch_uuid].get('headers')
        )

        # For test
@@ -51,6 +51,7 @@ class Fetcher():
    instock_data = None
    instock_data_js = ""
    status_code = None
+    total_bytes = None
    webdriver_js_execute_code = None
    xpath_data = None
    xpath_element_js = ""
@@ -65,8 +66,8 @@ class Fetcher():

    def __init__(self):
        import importlib.resources
-        self.xpath_element_js = importlib.resources.files("changedetectionio.content_fetchers.res").joinpath('xpath_element_scraper.js').read_text()
-        self.instock_data_js = importlib.resources.files("changedetectionio.content_fetchers.res").joinpath('stock-not-in-stock.js').read_text()
+        self.xpath_element_js = importlib.resources.files("changedetectionio.content_fetchers.res").joinpath('xpath_element_scraper.js').read_text(encoding='utf-8')
+        self.instock_data_js = importlib.resources.files("changedetectionio.content_fetchers.res").joinpath('stock-not-in-stock.js').read_text(encoding='utf-8')

    @abstractmethod
    def get_error(self):
@@ -81,7 +82,8 @@ class Fetcher():
            request_method,
            ignore_status_codes=False,
            current_include_filters=None,
-            is_binary=False):
+            is_binary=False,
+            empty_pages_are_a_change=False):
        # Should set self.error, self.status_code and self.content
        pass

@@ -83,7 +83,8 @@ class fetcher(Fetcher):
            request_method,
            ignore_status_codes=False,
            current_include_filters=None,
-            is_binary=False):
+            is_binary=False,
+            empty_pages_are_a_change=False):

        from playwright.sync_api import sync_playwright
        import playwright._impl._errors
@@ -130,7 +131,7 @@ class fetcher(Fetcher):
            if response is None:
                context.close()
                browser.close()
-                logger.debug("Content Fetcher > Response object was none")
+                logger.debug("Content Fetcher > Response object from the browser communication was none")
                raise EmptyReply(url=url, status_code=None)

            try:
@@ -166,10 +167,10 @@ class fetcher(Fetcher):

                raise Non200ErrorCodeReceived(url=url, status_code=self.status_code, screenshot=screenshot)

-            if len(self.page.content().strip()) == 0:
+            if not empty_pages_are_a_change and len(self.page.content().strip()) == 0:
+                logger.debug("Content Fetcher > Content was empty, empty_pages_are_a_change = False")
                context.close()
                browser.close()
-                logger.debug("Content Fetcher > Content was empty")
                raise EmptyReply(url=url, status_code=response.status)

            # Run Browser Steps here
@@ -75,7 +75,8 @@ class fetcher(Fetcher):
                         request_method,
                         ignore_status_codes,
                         current_include_filters,
-                         is_binary
+                         is_binary,
+                         empty_pages_are_a_change
                         ):

        from changedetectionio.content_fetchers import visualselector_xpath_selectors
@@ -153,7 +154,7 @@ class fetcher(Fetcher):
        if response is None:
            await self.page.close()
            await browser.close()
-            logger.warning("Content Fetcher > Response object was none")
+            logger.warning("Content Fetcher > Response object was none (as in, the response from the browser was empty, not just the content)")
            raise EmptyReply(url=url, status_code=None)

        self.headers = response.headers
@@ -186,10 +187,11 @@ class fetcher(Fetcher):

            raise Non200ErrorCodeReceived(url=url, status_code=self.status_code, screenshot=screenshot)
        content = await self.page.content
-        if len(content.strip()) == 0:
+
+        if not empty_pages_are_a_change and len(content.strip()) == 0:
+            logger.error("Content Fetcher > Content was empty (empty_pages_are_a_change is False), closing browsers")
            await self.page.close()
            await browser.close()
-            logger.error("Content Fetcher > Content was empty")
            raise EmptyReply(url=url, status_code=response.status)

        # Run Browser Steps here
@@ -247,7 +249,7 @@ class fetcher(Fetcher):
        await self.fetch_page(**kwargs)

    def run(self, url, timeout, request_headers, request_body, request_method, ignore_status_codes=False,
-            current_include_filters=None, is_binary=False):
+            current_include_filters=None, is_binary=False, empty_pages_are_a_change=False):

        #@todo make update_worker async which could run any of these content_fetchers within memory and time constraints
        max_time = os.getenv('PUPPETEER_MAX_PROCESSING_TIMEOUT_SECONDS', 180)
@@ -262,7 +264,8 @@ class fetcher(Fetcher):
                request_method=request_method,
                ignore_status_codes=ignore_status_codes,
                current_include_filters=current_include_filters,
-                is_binary=is_binary
+                is_binary=is_binary,
+                empty_pages_are_a_change=empty_pages_are_a_change
            ), timeout=max_time))
        except asyncio.TimeoutError:
            raise(BrowserFetchTimedOut(msg=f"Browser connected but was unable to process the page in {max_time} seconds."))
@@ -1,9 +1,8 @@
+from loguru import logger
+import chardet
 import hashlib
 import os
-
-import chardet
 import requests
-
 from changedetectionio import strtobool
 from changedetectionio.content_fetchers.exceptions import BrowserStepsInUnsupportedFetcher, EmptyReply, Non200ErrorCodeReceived
 from changedetectionio.content_fetchers.base import Fetcher
@@ -13,6 +12,27 @@ from changedetectionio.content_fetchers.base import Fetcher
 class fetcher(Fetcher):
    fetcher_description = "Basic fast Plaintext/HTTP Client"

+    def get_total_bytes_received(self, response):
+        # Calculate the size of the response content
+        content_size = len(response.content)
+        # Calculate the size of the response headers
+        headers_size = sum(len(k) + len(v) for k, v in response.headers.items()) + len(response.headers) * 4  # adding 4 for ': ' and '\r\n'
+
+        # Total bytes received
+        total_received = content_size + headers_size
+        return total_received
+
+    def get_total_bytes_transferred(self, request):
+        # Calculate the size of the request headers
+        headers_size = sum(len(k) + len(v) for k, v in request.headers.items()) + len(request.headers) * 4  # adding 4 for ': ' and '\r\n'
+
+        # Calculate the size of the request body, if any
+        body_size = len(request.body or '')
+
+        # Total bytes transferred for the request only (request headers + request body)
+        total_transferred = headers_size + body_size
+        return total_transferred
+
    def __init__(self, proxy_override=None, custom_browser_connection_url=None):
        super().__init__()
        self.proxy_override = proxy_override
@@ -26,7 +46,8 @@ class fetcher(Fetcher):
            request_method,
            ignore_status_codes=False,
            current_include_filters=None,
-            is_binary=False):
+            is_binary=False,
+            empty_pages_are_a_change=False):

        if self.browser_steps_get_valid_steps():
            raise BrowserStepsInUnsupportedFetcher(url=url)
@@ -53,13 +74,17 @@ class fetcher(Fetcher):
            session.mount('file://', FileAdapter())

        r = session.request(method=request_method,
-                            data=request_body,
+                            data=request_body.encode('utf-8') if type(request_body) is str else request_body,
                            url=url,
                            headers=request_headers,
                            timeout=timeout,
                            proxies=proxies,
                            verify=False)

+        total_received = self.get_total_bytes_received(response=r)
+        request_prepared = r.request
+        self.total_bytes = self.get_total_bytes_transferred(request_prepared) + total_received
+
        # If the response did not tell us what encoding format to expect, Then use chardet to override what `requests` thinks.
        # For example - some sites don't tell us it's utf-8, but return utf-8 content
        # This seems to not occur when using webdriver/selenium, it seems to detect the text encoding more reliably.
@@ -74,7 +99,10 @@ class fetcher(Fetcher):
        self.headers = r.headers

        if not r.content or not len(r.content):
-            raise EmptyReply(url=url, status_code=r.status_code)
+            if not empty_pages_are_a_change:
+                raise EmptyReply(url=url, status_code=r.status_code)
+            else:
+                logger.debug(f"URL {url} gave zero byte content reply with Status Code {r.status_code}, but empty_pages_are_a_change = True")

        # @todo test this
        # @todo maybe you really want to test zero-byte return pages?
@@ -75,6 +75,7 @@ function isItemInStock() {
        'vergriffen',
        'vorbestellen',
        'vorbestellung ist bald möglich',
+        'we don\'t currently have any',
        'we couldn\'t find any products that match',
        'we do not currently have an estimate of when this product will be back in stock.',
        'we don\'t know when or if this item will be back in stock.',
@@ -173,7 +174,8 @@ function isItemInStock() {
        const element = elementsToScan[i];
        // outside the 'fold' or some weird text in the heading area
        // .getBoundingClientRect() was causing a crash in chrome 119, can only be run on contentVisibility != hidden
-        if (element.getBoundingClientRect().top + window.scrollY >= vh + 150 || element.getBoundingClientRect().top + window.scrollY <= 100) {
+        // Note: there's also an automated test that places the 'out of stock' text fairly low down
+        if (element.getBoundingClientRect().top + window.scrollY >= vh + 250 || element.getBoundingClientRect().top + window.scrollY <= 100) {
            continue
        }
        elementText = "";
@@ -187,7 +189,7 @@ function isItemInStock() {
            // and these mean its out of stock
            for (const outOfStockText of outOfStockTexts) {
                if (elementText.includes(outOfStockText)) {
-                    console.log(`Selected 'Out of Stock' - found text "${outOfStockText}" - "${elementText}"`)
+                    console.log(`Selected 'Out of Stock' - found text "${outOfStockText}" - "${elementText}" - offset top ${element.getBoundingClientRect().top}, page height is ${vh}`)
                    return outOfStockText; // item is out of stock
                }
            }
@@ -164,6 +164,15 @@ visibleElementsArray.forEach(function (element) {
        }
    }

+    let label = "not-interesting" // A placeholder, the actual labels for training are done by hand for now
+
+    let text = element.textContent.trim().slice(0, 30).trim();
+    while (/\n{2,}|\t{2,}/.test(text)) {
+        text = text.replace(/\n{2,}/g, '\n').replace(/\t{2,}/g, '\t')
+    }
+
+    // Try to identify any possible currency amounts "Sale: 4000" or "Sale now 3000 Kc", can help with the training.
+    const hasDigitCurrency = (/\d/.test(text.slice(0, 6)) || /\d/.test(text.slice(-6)) ) &&  /([€£$¥₩₹]|USD|AUD|EUR|Kč|kr|SEK|,–)/.test(text) ;

    size_pos.push({
        xpath: xpath_result,
@@ -171,9 +180,16 @@ visibleElementsArray.forEach(function (element) {
        height: Math.round(bbox['height']),
        left: Math.floor(bbox['left']),
        top: Math.floor(bbox['top']) + scroll_y,
+        // tagName used by Browser Steps
        tagName: (element.tagName) ? element.tagName.toLowerCase() : '',
+        // tagtype used by Browser Steps
        tagtype: (element.tagName.toLowerCase() === 'input' && element.type) ? element.type.toLowerCase() : '',
-        isClickable: window.getComputedStyle(element).cursor == "pointer"
+        isClickable: window.getComputedStyle(element).cursor === "pointer",
+        // Used by the keras trainer
+        fontSize: window.getComputedStyle(element).getPropertyValue('font-size'),
+        fontWeight: window.getComputedStyle(element).getPropertyValue('font-weight'),
+        hasDigitCurrency: hasDigitCurrency,
+        label: label,
    });

 });
@@ -56,7 +56,8 @@ class fetcher(Fetcher):
            request_method,
            ignore_status_codes=False,
            current_include_filters=None,
-            is_binary=False):
+            is_binary=False,
+            empty_pages_are_a_change=False):

        from selenium import webdriver
        from selenium.webdriver.chrome.options import Options as ChromeOptions
@@ -1377,17 +1377,19 @@ def changedetection_app(config=None, datastore_o=None):
        import brotli

        watch = datastore.data['watching'].get(uuid)
-        if watch and os.path.isdir(watch.watch_data_dir):
-            latest_filename = list(watch.history.keys())[0]
+        if watch and watch.history.keys() and os.path.isdir(watch.watch_data_dir):
+            latest_filename = list(watch.history.keys())[-1]
            html_fname = os.path.join(watch.watch_data_dir, f"{latest_filename}.html.br")
-            if html_fname.endswith('.br'):
-                # Read and decompress the Brotli file
-                with open(html_fname, 'rb') as f:
+            with open(html_fname, 'rb') as f:
+                if html_fname.endswith('.br'):
+                    # Read and decompress the Brotli file
                    decompressed_data = brotli.decompress(f.read())
+                else:
+                    decompressed_data = f.read()

-                buffer = BytesIO(decompressed_data)
+            buffer = BytesIO(decompressed_data)

-                return send_file(buffer, as_attachment=True, download_name=f"{latest_filename}.html", mimetype='text/html')
+            return send_file(buffer, as_attachment=True, download_name=f"{latest_filename}.html", mimetype='text/html')


        # Return a 500 error
@@ -107,7 +107,7 @@ def apprise_custom_api_call_wrapper(body, title, notify_type, *args, **kwargs):

    r(results.get('url'),
      auth=auth,
-      data=body,
+      data=body.encode('utf-8') if type(body) is str else body,
      headers=headers,
      params=params
      )
@@ -26,6 +26,8 @@ class difference_detection_processor():

    def call_browser(self):
        from requests.structures import CaseInsensitiveDict
+        from changedetectionio.content_fetchers.exceptions import EmptyReply
+
        # Protect against file:// access
        if re.search(r'^file://', self.watch.get('url', '').strip(), re.IGNORECASE):
            if not strtobool(os.getenv('ALLOW_FILE_URI', 'false')):
@@ -133,8 +135,18 @@ class difference_detection_processor():
        is_binary = self.watch.is_pdf

        # And here we go! call the right browser with browser-specific settings
-        self.fetcher.run(url, timeout, request_headers, request_body, request_method, ignore_status_codes, self.watch.get('include_filters'),
-                    is_binary=is_binary)
+        empty_pages_are_a_change = self.datastore.data['settings']['application'].get('empty_pages_are_a_change', False)
+
+        self.fetcher.run(url=url,
+                         timeout=timeout,
+                         request_headers=request_headers,
+                         request_body=request_body,
+                         request_method=request_method,
+                         ignore_status_codes=ignore_status_codes,
+                         current_include_filters=self.watch.get('include_filters'),
+                         is_binary=is_binary,
+                         empty_pages_are_a_change=empty_pages_are_a_change
+                         )

        #@todo .quit here could go on close object, so we can run JS if change-detected
        self.fetcher.quit()
@@ -1,11 +1,12 @@

-from changedetectionio.model.Watch import model as BaseWatch
-import re
 from babel.numbers import parse_decimal
+from changedetectionio.model.Watch import model as BaseWatch
+from typing import Union
+import re

 class Restock(dict):

-    def parse_currency(self, raw_value: str) -> float:
+    def parse_currency(self, raw_value: str) -> Union[float, None]:
        # Clean and standardize the value (ie 1,400.00 should be 1400.00), even better would be store the whole thing as an integer.
        standardized_value = raw_value

@@ -21,8 +22,11 @@ class Restock(dict):
        # Remove any non-numeric characters except for the decimal point
        standardized_value = re.sub(r'[^\d.-]', '', standardized_value)

-        # Convert to float
-        return float(parse_decimal(standardized_value, locale='en'))
+        if standardized_value:
+            # Convert to float
+            return float(parse_decimal(standardized_value, locale='en'))
+
+        return None

    def __init__(self, *args, **kwargs):
        # Define default values
@@ -40,13 +40,16 @@ def get_itemprop_availability(html_content) -> Restock:
    import extruct
    logger.trace(f"Imported extruct module in {time.time() - now:.3f}s")

-    value = {}
    now = time.time()
+
    # Extruct is very slow, I'm wondering if some ML is going to be faster (800ms on my i7), 'rdfa' seems to be the heaviest.
-
    syntaxes = ['dublincore', 'json-ld', 'microdata', 'microformat', 'opengraph']
+    try:
+        data = extruct.extract(html_content, syntaxes=syntaxes)
+    except Exception as e:
+        logger.warning(f"Unable to extract data, document parsing with extruct failed with {type(e).__name__} - {str(e)}")
+        return Restock()

-    data = extruct.extract(html_content, syntaxes=syntaxes)
    logger.trace(f"Extruct basic extract of all metadata done in {time.time() - now:.3f}s")

    # First phase, dead simple scanning of anything that looks useful
@@ -76,7 +76,7 @@
                    </div>
                    <div class="pure-control-group">
                        {{ render_checkbox_field(form.application.form.empty_pages_are_a_change) }}
-                        <span class="pure-form-message-inline">When a page contains HTML, but no renderable text appears (empty page), is this considered a change?</span>
+                        <span class="pure-form-message-inline">When a request returns no content, or the HTML does not contain any text, is this considered a change?</span>
                    </div>
                {% if form.requests.proxy %}
                    <div class="pure-control-group inline-radio">
@@ -112,7 +112,7 @@ def test_check_add_line_contains_trigger(client, live_server, measure_memory_usa
    res = client.post(
        url_for("settings_page"),
        data={"application-notification_title": "New ChangeDetection.io Notification - {{ watch_url }}",
-              "application-notification_body": 'triggered text was -{{triggered_text}}-',
+              "application-notification_body": 'triggered text was -{{triggered_text}}- 网站监测 内容更新了',
              # https://github.com/caronc/apprise/wiki/Notify_Custom_JSON#get-parameter-manipulation
              "application-notification_urls": test_notification_url,
              "application-minutes_between_check": 180,
@@ -167,9 +167,10 @@ def test_check_add_line_contains_trigger(client, live_server, measure_memory_usa
    # Takes a moment for apprise to fire
    time.sleep(3)
    assert os.path.isfile("test-datastore/notification.txt"), "Notification fired because I can see the output file"
-    with open("test-datastore/notification.txt", 'r') as f:
-        response= f.read()
-        assert '-Oh yes please-' in response
+    with open("test-datastore/notification.txt", 'rb') as f:
+        response = f.read()
+        assert b'-Oh yes please-' in response
+        assert '网站监测 内容更新了'.encode('utf-8') in response


    res = client.get(url_for("form_delete", uuid="all"), follow_redirects=True)
@@ -69,6 +69,12 @@ def test_check_basic_change_detection_functionality(client, live_server, measure

    wait_for_all_checks(client)

+    uuid = extract_UUID_from_client(client)
+
+    # Check that 'get latest snapshot' works
+    res = client.get(url_for("watch_get_latest_html", uuid=uuid))
+    assert b'which has this one new line' in res.data
+
    # Now something should be ready, indicated by having a 'unviewed' class
    res = client.get(url_for("index"))
    assert b'unviewed' in res.data
@@ -86,7 +92,7 @@ def test_check_basic_change_detection_functionality(client, live_server, measure
    assert expected_url.encode('utf-8') in res.data

    # Following the 'diff' link, it should no longer display as 'unviewed' even after we recheck it a few times
-    res = client.get(url_for("diff_history_page", uuid="first"))
+    res = client.get(url_for("diff_history_page", uuid=uuid))
    assert b'selected=""' in res.data, "Confirm diff history page loaded"

    # Check the [preview] pulls the right one
@@ -143,18 +149,12 @@ def test_check_basic_change_detection_functionality(client, live_server, measure
    assert b'unviewed' not in res.data

    # #2458 "clear history" should make the Watch object update its status correctly when the first snapshot lands again
-    uuid = extract_UUID_from_client(client)
    client.get(url_for("clear_watch_history", uuid=uuid))
    client.get(url_for("form_watch_checknow"), follow_redirects=True)
    wait_for_all_checks(client)
    res = client.get(url_for("index"))
    assert b'preview/' in res.data

-
-    # Check the 'get latest snapshot works'
-    res = client.get(url_for("watch_get_latest_html", uuid=uuid))
-    assert b'<head><title>head title</title></head>' in res.data
-
    #
    # Cleanup everything
    res = client.get(url_for("form_delete", uuid="all"), follow_redirects=True)
@@ -1,12 +1,7 @@
 #!/usr/bin/env python3

-import time
 from flask import url_for
-from urllib.request import urlopen
-from .util import set_original_response, set_modified_response, live_server_setup
-
-sleep_time_for_fetch_thread = 3
-
+from .util import set_original_response, set_modified_response, live_server_setup, wait_for_all_checks

 def set_nonrenderable_response():
    test_return_data = """<html>
@@ -22,6 +17,13 @@ def set_nonrenderable_response():

    return None

+def set_zero_byte_response():
+
+    with open("test-datastore/endpoint-content.txt", "w") as f:
+        f.write("")
+
+    return None
+
 def test_check_basic_change_detection_functionality(client, live_server, measure_memory_usage):
    set_original_response()
    live_server_setup(live_server)
@@ -35,18 +37,11 @@ def test_check_basic_change_detection_functionality(client, live_server, measure

    assert b"1 Imported" in res.data

-    time.sleep(sleep_time_for_fetch_thread)
+    wait_for_all_checks(client)

-    # Do this a few times.. ensures we dont accidently set the status
-    for n in range(3):
-        client.get(url_for("form_watch_checknow"), follow_redirects=True)
-
-        # Give the thread time to pick it up
-        time.sleep(sleep_time_for_fetch_thread)
-
-        # It should report nothing found (no new 'unviewed' class)
-        res = client.get(url_for("index"))
-        assert b'unviewed' not in res.data
+    # It should report nothing found (no new 'unviewed' class)
+    res = client.get(url_for("index"))
+    assert b'unviewed' not in res.data


    #####################
@@ -64,7 +59,7 @@ def test_check_basic_change_detection_functionality(client, live_server, measure
    client.get(url_for("form_watch_checknow"), follow_redirects=True)

    # Give the thread time to pick it up
-    time.sleep(sleep_time_for_fetch_thread)
+    wait_for_all_checks(client)

    # It should report nothing found (no new 'unviewed' class)
    res = client.get(url_for("index"))
@@ -86,14 +81,20 @@ def test_check_basic_change_detection_functionality(client, live_server, measure
    client.get(url_for("form_watch_checknow"), follow_redirects=True)

    # Give the thread time to pick it up
-    time.sleep(sleep_time_for_fetch_thread)
+    wait_for_all_checks(client)

     # It should report a change (a new 'unviewed' class)
    res = client.get(url_for("index"))
    assert b'unviewed' in res.data
+    client.get(url_for("mark_all_viewed"), follow_redirects=True)

-
-
+    # A totally zero byte (#2528) response should also not trigger an error
+    set_zero_byte_response()
+    client.get(url_for("form_watch_checknow"), follow_redirects=True)
+    wait_for_all_checks(client)
+    res = client.get(url_for("index"))
+    assert b'unviewed' in res.data # A change should have registered because empty_pages_are_a_change is ON
+    assert b'fetch-error' not in res.data

    #
    # Cleanup everything
@@ -291,11 +291,11 @@ def test_notification_custom_endpoint_and_jinja2(client, live_server, measure_me
        data={
              "application-fetch_backend": "html_requests",
              "application-minutes_between_check": 180,
-              "application-notification_body": '{ "url" : "{{ watch_url }}", "secret": 444 }',
+              "application-notification_body": '{ "url" : "{{ watch_url }}", "secret": 444, "somebug": "网站监测 内容更新了" }',
              "application-notification_format": default_notification_format,
              "application-notification_urls": test_notification_url,
              # https://github.com/caronc/apprise/wiki/Notify_Custom_JSON#get-parameter-manipulation
-              "application-notification_title": "New ChangeDetection.io Notification - {{ watch_url }}",
+              "application-notification_title": "New ChangeDetection.io Notification - {{ watch_url }} ",
              },
        follow_redirects=True
    )
@@ -324,6 +324,7 @@ def test_notification_custom_endpoint_and_jinja2(client, live_server, measure_me
        j = json.loads(x)
        assert j['url'].startswith('http://localhost')
        assert j['secret'] == 444
+        assert j['somebug'] == '网站监测 内容更新了'

    # URL check, this will always be converted to lowercase
    assert os.path.isfile("test-datastore/notification-url.txt")
@@ -354,9 +355,10 @@ def test_notification_custom_endpoint_and_jinja2(client, live_server, measure_me
 #2510
 def test_global_send_test_notification(client, live_server, measure_memory_usage):

-
    #live_server_setup(live_server)
    set_original_response()
+    if os.path.isfile("test-datastore/notification.txt"):
+        os.unlink("test-datastore/notification.txt")

    # otherwise other settings would have already existed from previous tests in this file
    res = client.post(
@@ -364,7 +366,8 @@ def test_global_send_test_notification(client, live_server, measure_memory_usage
        data={
            "application-fetch_backend": "html_requests",
            "application-minutes_between_check": 180,
-            "application-notification_body": 'change detection is cool',
+            #1995 UTF-8 content should be encoded
+            "application-notification_body": 'change detection is cool 网站监测 内容更新了',
            "application-notification_format": default_notification_format,
            "application-notification_urls": "",
            "application-notification_title": "New ChangeDetection.io Notification - {{ watch_url }}",
@@ -399,8 +402,7 @@ def test_global_send_test_notification(client, live_server, measure_memory_usage

    with open("test-datastore/notification.txt", 'r') as f:
        x = f.read()
-        assert 'change detection is coo' in x
-
+        assert 'change detection is cool 网站监测 内容更新了' in x

    os.unlink("test-datastore/notification.txt")

@@ -420,7 +422,7 @@ def test_global_send_test_notification(client, live_server, measure_memory_usage
    with open("test-datastore/notification.txt", 'r') as f:
        x = f.read()
        # Should come from notification.py default handler when there is no notification body to pull from
-        assert 'change detection is coo' in x
+        assert 'change detection is cool 网站监测 内容更新了' in x

    client.get(
        url_for("form_delete", uuid="all"),
@@ -1,6 +1,5 @@
 from .processors.exceptions import ProcessorException
-from . import content_fetchers
-
+import changedetectionio.content_fetchers.exceptions as content_fetchers_exceptions
 from changedetectionio.processors.text_json_diff.processor import FilterNotFoundInResponse
 from changedetectionio import html_tools

@@ -301,7 +300,7 @@ class update_worker(threading.Thread):
                        self.datastore.update_watch(uuid=uuid, update_obj={'last_error': e.message})
                        process_changedetection_results = False

-                    except content_fetchers.exceptions.ReplyWithContentButNoText as e:
+                    except content_fetchers_exceptions.ReplyWithContentButNoText as e:
                        # Totally fine, it's by choice - just continue on, nothing more to care about
                        # Page had elements/content but no renderable text
                        # Backend (not filters) gave zero output
@@ -327,7 +326,7 @@ class update_worker(threading.Thread):
                            
                        process_changedetection_results = False

-                    except content_fetchers.exceptions.Non200ErrorCodeReceived as e:
+                    except content_fetchers_exceptions.Non200ErrorCodeReceived as e:
                        if e.status_code == 403:
                            err_text = "Error - 403 (Access denied) received"
                        elif e.status_code == 404:
@@ -380,23 +379,23 @@ class update_worker(threading.Thread):

                        process_changedetection_results = False

-                    except content_fetchers.exceptions.checksumFromPreviousCheckWasTheSame as e:
+                    except content_fetchers_exceptions.checksumFromPreviousCheckWasTheSame as e:
                        # Yes fine, so nothing todo, don't continue to process.
                        process_changedetection_results = False
                        changed_detected = False
-                    except content_fetchers.exceptions.BrowserConnectError as e:
+                    except content_fetchers_exceptions.BrowserConnectError as e:
                        self.datastore.update_watch(uuid=uuid,
                                                    update_obj={'last_error': e.msg
                                                                }
                                                    )
                        process_changedetection_results = False
-                    except content_fetchers.exceptions.BrowserFetchTimedOut as e:
+                    except content_fetchers_exceptions.BrowserFetchTimedOut as e:
                        self.datastore.update_watch(uuid=uuid,
                                                    update_obj={'last_error': e.msg
                                                                }
                                                    )
                        process_changedetection_results = False
-                    except content_fetchers.exceptions.BrowserStepsStepException as e:
+                    except content_fetchers_exceptions.BrowserStepsStepException as e:

                        if not self.datastore.data['watching'].get(uuid):
                            continue
@@ -438,25 +437,25 @@ class update_worker(threading.Thread):

                        process_changedetection_results = False

-                    except content_fetchers.exceptions.EmptyReply as e:
+                    except content_fetchers_exceptions.EmptyReply as e:
                        # Some kind of custom to-str handler in the exception handler that does this?
                        err_text = "EmptyReply - try increasing 'Wait seconds before extracting text', Status Code {}".format(e.status_code)
                        self.datastore.update_watch(uuid=uuid, update_obj={'last_error': err_text,
                                                                           'last_check_status': e.status_code})
                        process_changedetection_results = False
-                    except content_fetchers.exceptions.ScreenshotUnavailable as e:
+                    except content_fetchers_exceptions.ScreenshotUnavailable as e:
                        err_text = "Screenshot unavailable, page did not render fully in the expected time or page was too long - try increasing 'Wait seconds before extracting text'"
                        self.datastore.update_watch(uuid=uuid, update_obj={'last_error': err_text,
                                                                           'last_check_status': e.status_code})
                        process_changedetection_results = False
-                    except content_fetchers.exceptions.JSActionExceptions as e:
+                    except content_fetchers_exceptions.JSActionExceptions as e:
                        err_text = "Error running JS Actions - Page request - "+e.message
                        if e.screenshot:
                            watch.save_screenshot(screenshot=e.screenshot, as_error=True)
                        self.datastore.update_watch(uuid=uuid, update_obj={'last_error': err_text,
                                                                           'last_check_status': e.status_code})
                        process_changedetection_results = False
-                    except content_fetchers.exceptions.PageUnloadable as e:
+                    except content_fetchers_exceptions.PageUnloadable as e:
                        err_text = "Page request from server didnt respond correctly"
                        if e.message:
                            err_text = "{} - {}".format(err_text, e.message)
@@ -468,7 +467,7 @@ class update_worker(threading.Thread):
                                                                           'last_check_status': e.status_code,
                                                                           'has_ldjson_price_data': None})
                        process_changedetection_results = False
-                    except content_fetchers.exceptions.BrowserStepsInUnsupportedFetcher as e:
+                    except content_fetchers_exceptions.BrowserStepsInUnsupportedFetcher as e:
                        err_text = "This watch has Browser Steps configured and so it cannot run with the 'Basic fast Plaintext/HTTP Client', either remove the Browser Steps or select a Chrome fetcher."
                        self.datastore.update_watch(uuid=uuid, update_obj={'last_error': err_text})
                        process_changedetection_results = False
@@ -18,7 +18,7 @@ services:
  #
  #        Log levels are in descending order. (TRACE is the most detailed one)
  #        Log output levels: TRACE, DEBUG(default), INFO, SUCCESS, WARNING, ERROR, CRITICAL
-  #      - LOGGER_LEVEL=DEBUG
+  #      - LOGGER_LEVEL=TRACE
  #
  #       Alternative WebDriver/selenium URL, do not use "'s or 's!
  #      - WEBDRIVER_URL=http://browser-chrome:4444/wd/hub
@@ -29,8 +29,9 @@ services:
  #
  #             https://selenium-python.readthedocs.io/api.html#module-selenium.webdriver.common.proxy
  #
-  #       Alternative Playwright URL, do not use "'s or 's!
-  #      - PLAYWRIGHT_DRIVER_URL=ws://playwright-chrome:3000
+  #       Alternative target "Chrome" Playwright URL, do not use "'s or 's!
+  #       "Playwright" is a driver/library that allows changedetection to talk to a Chrome or similar browser.
+  #      - PLAYWRIGHT_DRIVER_URL=ws://sockpuppetbrowser:3000
  #
  #       Playwright proxy settings playwright_proxy_server, playwright_proxy_bypass, playwright_proxy_username, playwright_proxy_password
  #
@@ -73,10 +74,10 @@ services:
 #              condition: service_started


-     # Used for fetching pages via Playwright+Chrome where you need Javascript support.
+     # Sockpuppetbrowser is basically Chrome wrapped in an API that allows fast fetching of web pages.
     # RECOMMENDED FOR FETCHING PAGES WITH CHROME
-#    playwright-chrome:
-#        hostname: playwright-chrome
+#    sockpuppetbrowser:
+#        hostname: sockpuppetbrowser
 #        image: dgtlmoon/sockpuppetbrowser:latest
 #        cap_add:
 #            - SYS_ADMIN
@@ -35,7 +35,7 @@ dnspython==2.6.1 # related to eventlet fixes
 # jq not available on Windows so must be installed manually

 # Notification library
-apprise~=1.8.0
+apprise~=1.8.1

 # apprise mqtt https://github.com/dgtlmoon/changedetection.io/issues/315
 # and 2.0.0 https://github.com/dgtlmoon/changedetection.io/issues/2241 not yet compatible
@@ -79,8 +79,9 @@ pyppeteerstealth>=0.0.4
 pytest ~=7.2
 pytest-flask ~=1.2

-# Pin jsonschema version to prevent build errors on armv6 while rpds-py wheels aren't available (1708)
-jsonschema==4.17.3
+# Anything 4.0 and up but not 5.0
+jsonschema ~= 4.0
+

 loguru
Author	SHA1	Message	Date
dgtlmoon	92d715272a	Merge branch 'master' into total-bytes-counter	2024-09-05 11:27:08 +02:00
dgtlmoon	5b70625eaa	0.46.04	2024-09-04 13:55:18 +02:00
dgtlmoon	60d292107d	Fixing restock monitor tests and tweaking docker default config example,	2024-09-02 15:11:31 +02:00
dgtlmoon	1cb38347da	Container name should be 'sockpuppetbrowser' because its not just playwright that uses it	2024-09-02 13:21:38 +02:00
dgtlmoon	55fe2abf42	Restock/Price detection - Better catching of errors when parsing metadata documents for restock/price check (#2602 )	2024-09-01 13:07:06 +02:00
dgtlmoon	4225900ec3	Restock - updating texts and text offsets	2024-09-01 12:47:21 +02:00
dgtlmoon	1fb4342488	Build - Unpin jsonschema for faster builds (#2583 )	2024-08-22 15:02:00 +02:00
dgtlmoon	7071df061a	Price detection/scraping - Adding extra element training data (#2582 )	2024-08-22 15:01:36 +02:00
dgtlmoon	6dd1fa2b88	0.46.03	2024-08-19 17:22:13 +02:00
dgtlmoon	371f85d544	Watch 'Download last snapshot' link/button should give last, not first snapshot (#2576 )	2024-08-19 17:20:30 +02:00
dgtlmoon	932cf15e1e	Price and restock scraping - small price fix scraper (#2575 )	2024-08-19 15:47:19 +02:00
Mike Splain	bf0d410d32	Browser Steps UI - Interactive UI wasn't sending headers but was when the check ran (#2551 )	2024-08-19 10:21:05 +02:00
dgtlmoon	730f37c7ba	Set encoding type for scraper script reader (#2574 #2568 )	2024-08-19 09:17:18 +02:00
dgtlmoon	0e5261dd87	WIP	2024-07-29 16:59:17 +02:00
dgtlmoon	8a35d62e02	Handle zero-byte/empty content responses with "`[ ] Empty pages are a change`" option, the same as when the HTML doesnt render any useful text (#2530 )	2024-07-29 13:27:59 +02:00
dgtlmoon	f527744024	0.46.02	2024-07-27 20:28:04 +02:00
dgtlmoon	71c9b1273c	Adding test for #1995 UTF-8 encoding in POST request body and post:// notifications (#2525 )	2024-07-27 19:47:03 +02:00
dgtlmoon	ec68450df1	Updating Apprise notification library , Splunk/VictorOps, Africas Talking, Microsoft Power Automate / Workflows, Société Française du Radiotéléphone (SFR) Support (#2524 )	2024-07-27 14:28:57 +02:00
dgtlmoon	2fd762a783	Encode POST style requests and notifications as UTF-8 if it has no encoding/basic string (#2523 )	2024-07-27 14:27:15 +02:00