Update requirements.txt

Re #1708 unpin jsonschema
0.46.03
2025-11-21 08:56:09 +00:00 · 2024-08-22 14:17:02 +02:00 · 2024-08-22 14:13:04 +02:00 · 2024-08-19 17:22:13 +02:00 · 2024-08-19 17:20:30 +02:00 · 2024-08-19 15:47:19 +02:00
15 changed files with 108 additions and 79 deletions
--- a/changedetectionio/init.py
+++ b/changedetectionio/init.py
@@ -2,7 +2,7 @@
 # Read more https://github.com/dgtlmoon/changedetection.io/wiki
-__version__ = '0.46.02'
+__version__ = '0.46.03'
 from changedetectionio.strtobool import strtobool
 from json.decoder import JSONDecodeError
--- a/changedetectionio/blueprint/browser_steps/init.py
+++ b/changedetectionio/blueprint/browser_steps/init.py
@@ -85,7 +85,8 @@ def construct_blueprint(datastore: ChangeDetectionStore):
        browsersteps_start_session['browserstepper'] = browser_steps.browsersteps_live_ui(
            playwright_browser=browsersteps_start_session['browser'],
            proxy=proxy,
-            start_url=datastore.data['watching'][watch_uuid].get('url')
+            start_url=datastore.data['watching'][watch_uuid].get('url'),
            headers=datastore.data['watching'][watch_uuid].get('headers')
        )
        # For test
--- a/changedetectionio/content_fetchers/base.py
+++ b/changedetectionio/content_fetchers/base.py
@@ -65,8 +65,8 @@ class Fetcher():
    def __init__(self):
        import importlib.resources
-        self.xpath_element_js = importlib.resources.files("changedetectionio.content_fetchers.res").joinpath('xpath_element_scraper.js').read_text()
+        self.xpath_element_js = importlib.resources.files("changedetectionio.content_fetchers.res").joinpath('xpath_element_scraper.js').read_text(encoding='utf-8')
-        self.instock_data_js = importlib.resources.files("changedetectionio.content_fetchers.res").joinpath('stock-not-in-stock.js').read_text()
+        self.instock_data_js = importlib.resources.files("changedetectionio.content_fetchers.res").joinpath('stock-not-in-stock.js').read_text(encoding='utf-8')
    @abstractmethod
    def get_error(self):
@@ -81,7 +81,8 @@ class Fetcher():
            request_method,
            ignore_status_codes=False,
            current_include_filters=None,
-            is_binary=False):
+            is_binary=False,
            empty_pages_are_a_change=False):
        # Should set self.error, self.status_code and self.content
        pass
--- a/changedetectionio/content_fetchers/playwright.py
+++ b/changedetectionio/content_fetchers/playwright.py
@@ -83,7 +83,8 @@ class fetcher(Fetcher):
            request_method,
            ignore_status_codes=False,
            current_include_filters=None,
-            is_binary=False):
+            is_binary=False,
            empty_pages_are_a_change=False):
        from playwright.sync_api import sync_playwright
        import playwright._impl._errors
@@ -130,7 +131,7 @@ class fetcher(Fetcher):
            if response is None:
                context.close()
                browser.close()
-                logger.debug("Content Fetcher > Response object was none")
+                logger.debug("Content Fetcher > Response object from the browser communication was none")
                raise EmptyReply(url=url, status_code=None)
            try:
@@ -166,10 +167,10 @@ class fetcher(Fetcher):
                raise Non200ErrorCodeReceived(url=url, status_code=self.status_code, screenshot=screenshot)
-            if len(self.page.content().strip()) == 0:
+            if not empty_pages_are_a_change and len(self.page.content().strip()) == 0:
                logger.debug("Content Fetcher > Content was empty, empty_pages_are_a_change = False")
                context.close()
                browser.close()
                logger.debug("Content Fetcher > Content was empty")
                raise EmptyReply(url=url, status_code=response.status)
            # Run Browser Steps here
--- a/changedetectionio/content_fetchers/puppeteer.py
+++ b/changedetectionio/content_fetchers/puppeteer.py
@@ -75,7 +75,8 @@ class fetcher(Fetcher):
                         request_method,
                         ignore_status_codes,
                         current_include_filters,
-                         is_binary
+                         is_binary,
                         empty_pages_are_a_change
                         ):
        from changedetectionio.content_fetchers import visualselector_xpath_selectors
@@ -153,7 +154,7 @@ class fetcher(Fetcher):
        if response is None:
            await self.page.close()
            await browser.close()
-            logger.warning("Content Fetcher > Response object was none")
+            logger.warning("Content Fetcher > Response object was none (as in, the response from the browser was empty, not just the content)")
            raise EmptyReply(url=url, status_code=None)
        self.headers = response.headers
@@ -186,10 +187,11 @@ class fetcher(Fetcher):
            raise Non200ErrorCodeReceived(url=url, status_code=self.status_code, screenshot=screenshot)
        content = await self.page.content
-        if len(content.strip()) == 0:
+
        if not empty_pages_are_a_change and len(content.strip()) == 0:
            logger.error("Content Fetcher > Content was empty (empty_pages_are_a_change is False), closing browsers")
            await self.page.close()
            await browser.close()
            logger.error("Content Fetcher > Content was empty")
            raise EmptyReply(url=url, status_code=response.status)
        # Run Browser Steps here
@@ -247,7 +249,7 @@ class fetcher(Fetcher):
        await self.fetch_page(**kwargs)
    def run(self, url, timeout, request_headers, request_body, request_method, ignore_status_codes=False,
-            current_include_filters=None, is_binary=False):
+            current_include_filters=None, is_binary=False, empty_pages_are_a_change=False):
        #@todo make update_worker async which could run any of these content_fetchers within memory and time constraints
        max_time = os.getenv('PUPPETEER_MAX_PROCESSING_TIMEOUT_SECONDS', 180)
@@ -262,7 +264,8 @@ class fetcher(Fetcher):
                request_method=request_method,
                ignore_status_codes=ignore_status_codes,
                current_include_filters=current_include_filters,
-                is_binary=is_binary
+                is_binary=is_binary,
                empty_pages_are_a_change=empty_pages_are_a_change
            ), timeout=max_time))
        except asyncio.TimeoutError:
            raise(BrowserFetchTimedOut(msg=f"Browser connected but was unable to process the page in {max_time} seconds."))
--- a/changedetectionio/content_fetchers/requests.py
+++ b/changedetectionio/content_fetchers/requests.py
@@ -1,9 +1,8 @@
 from loguru import logger
 import chardet
 import hashlib
 import os
 import chardet
 import requests
 from changedetectionio import strtobool
 from changedetectionio.content_fetchers.exceptions import BrowserStepsInUnsupportedFetcher, EmptyReply, Non200ErrorCodeReceived
 from changedetectionio.content_fetchers.base import Fetcher
@@ -26,7 +25,8 @@ class fetcher(Fetcher):
            request_method,
            ignore_status_codes=False,
            current_include_filters=None,
-            is_binary=False):
+            is_binary=False,
            empty_pages_are_a_change=False):
        if self.browser_steps_get_valid_steps():
            raise BrowserStepsInUnsupportedFetcher(url=url)
@@ -74,7 +74,10 @@ class fetcher(Fetcher):
        self.headers = r.headers
        if not r.content or not len(r.content):
-            raise EmptyReply(url=url, status_code=r.status_code)
+            if not empty_pages_are_a_change:
                raise EmptyReply(url=url, status_code=r.status_code)
            else:
                logger.debug(f"URL {url} gave zero byte content reply with Status Code {r.status_code}, but empty_pages_are_a_change = True")
        # @todo test this
        # @todo maybe you really want to test zero-byte return pages?
--- a/changedetectionio/content_fetchers/webdriver_selenium.py
+++ b/changedetectionio/content_fetchers/webdriver_selenium.py
@@ -56,7 +56,8 @@ class fetcher(Fetcher):
            request_method,
            ignore_status_codes=False,
            current_include_filters=None,
-            is_binary=False):
+            is_binary=False,
            empty_pages_are_a_change=False):
        from selenium import webdriver
        from selenium.webdriver.chrome.options import Options as ChromeOptions
--- a/changedetectionio/flask_app.py
+++ b/changedetectionio/flask_app.py
@@ -1377,17 +1377,19 @@ def changedetection_app(config=None, datastore_o=None):
        import brotli
        watch = datastore.data['watching'].get(uuid)
-        if watch and os.path.isdir(watch.watch_data_dir):
+        if watch and watch.history.keys() and os.path.isdir(watch.watch_data_dir):
-            latest_filename = list(watch.history.keys())[0]
+            latest_filename = list(watch.history.keys())[-1]
            html_fname = os.path.join(watch.watch_data_dir, f"{latest_filename}.html.br")
-            if html_fname.endswith('.br'):
+            with open(html_fname, 'rb') as f:
-                # Read and decompress the Brotli file
+                if html_fname.endswith('.br'):
-                with open(html_fname, 'rb') as f:
+                    # Read and decompress the Brotli file
                    decompressed_data = brotli.decompress(f.read())
                else:
                    decompressed_data = f.read()
-                buffer = BytesIO(decompressed_data)
+            buffer = BytesIO(decompressed_data)
-                return send_file(buffer, as_attachment=True, download_name=f"{latest_filename}.html", mimetype='text/html')
+            return send_file(buffer, as_attachment=True, download_name=f"{latest_filename}.html", mimetype='text/html')
        # Return a 500 error
--- a/changedetectionio/processors/init.py
+++ b/changedetectionio/processors/init.py
@@ -26,6 +26,8 @@ class difference_detection_processor():
    def call_browser(self):
        from requests.structures import CaseInsensitiveDict
        from changedetectionio.content_fetchers.exceptions import EmptyReply
        # Protect against file:// access
        if re.search(r'^file://', self.watch.get('url', '').strip(), re.IGNORECASE):
            if not strtobool(os.getenv('ALLOW_FILE_URI', 'false')):
@@ -133,8 +135,18 @@ class difference_detection_processor():
        is_binary = self.watch.is_pdf
        # And here we go! call the right browser with browser-specific settings
-        self.fetcher.run(url, timeout, request_headers, request_body, request_method, ignore_status_codes, self.watch.get('include_filters'),
+        empty_pages_are_a_change = self.datastore.data['settings']['application'].get('empty_pages_are_a_change', False)
-                    is_binary=is_binary)
+
        self.fetcher.run(url=url,
                         timeout=timeout,
                         request_headers=request_headers,
                         request_body=request_body,
                         request_method=request_method,
                         ignore_status_codes=ignore_status_codes,
                         current_include_filters=self.watch.get('include_filters'),
                         is_binary=is_binary,
                         empty_pages_are_a_change=empty_pages_are_a_change
                         )
        #@todo .quit here could go on close object, so we can run JS if change-detected
        self.fetcher.quit()
--- a/changedetectionio/processors/restock_diff/init.py
+++ b/changedetectionio/processors/restock_diff/init.py
@@ -1,11 +1,12 @@
 from changedetectionio.model.Watch import model as BaseWatch
 import re
 from babel.numbers import parse_decimal
 from changedetectionio.model.Watch import model as BaseWatch
 from typing import Union
 import re
 class Restock(dict):
-    def parse_currency(self, raw_value: str) -> float:
+    def parse_currency(self, raw_value: str) -> Union[float, None]:
        # Clean and standardize the value (ie 1,400.00 should be 1400.00), even better would be store the whole thing as an integer.
        standardized_value = raw_value
@@ -21,8 +22,11 @@ class Restock(dict):
        # Remove any non-numeric characters except for the decimal point
        standardized_value = re.sub(r'[^\d.-]', '', standardized_value)
-        # Convert to float
+        if standardized_value:
-        return float(parse_decimal(standardized_value, locale='en'))
+            # Convert to float
            return float(parse_decimal(standardized_value, locale='en'))
        return None
    def __init__(self, *args, **kwargs):
        # Define default values
--- a/changedetectionio/templates/settings.html
+++ b/changedetectionio/templates/settings.html
@@ -76,7 +76,7 @@
                    </div>
                    <div class="pure-control-group">
                        {{ render_checkbox_field(form.application.form.empty_pages_are_a_change) }}
-                        <span class="pure-form-message-inline">When a page contains HTML, but no renderable text appears (empty page), is this considered a change?</span>
+                        <span class="pure-form-message-inline">When a request returns no content, or the HTML does not contain any text, is this considered a change?</span>
                    </div>
                {% if form.requests.proxy %}
                    <div class="pure-control-group inline-radio">
--- a/changedetectionio/tests/test_backend.py
+++ b/changedetectionio/tests/test_backend.py
@@ -69,6 +69,12 @@ def test_check_basic_change_detection_functionality(client, live_server, measure
    wait_for_all_checks(client)
    uuid = extract_UUID_from_client(client)
    # Check the 'get latest snapshot works'
    res = client.get(url_for("watch_get_latest_html", uuid=uuid))
    assert b'which has this one new line' in res.data
    # Now something should be ready, indicated by having a 'unviewed' class
    res = client.get(url_for("index"))
    assert b'unviewed' in res.data
@@ -86,7 +92,7 @@ def test_check_basic_change_detection_functionality(client, live_server, measure
    assert expected_url.encode('utf-8') in res.data
    # Following the 'diff' link, it should no longer display as 'unviewed' even after we recheck it a few times
-    res = client.get(url_for("diff_history_page", uuid="first"))
+    res = client.get(url_for("diff_history_page", uuid=uuid))
    assert b'selected=""' in res.data, "Confirm diff history page loaded"
    # Check the [preview] pulls the right one
@@ -143,18 +149,12 @@ def test_check_basic_change_detection_functionality(client, live_server, measure
    assert b'unviewed' not in res.data
    # #2458 "clear history" should make the Watch object update its status correctly when the first snapshot lands again
    uuid = extract_UUID_from_client(client)
    client.get(url_for("clear_watch_history", uuid=uuid))
    client.get(url_for("form_watch_checknow"), follow_redirects=True)
    wait_for_all_checks(client)
    res = client.get(url_for("index"))
    assert b'preview/' in res.data
    # Check the 'get latest snapshot works'
    res = client.get(url_for("watch_get_latest_html", uuid=uuid))
    assert b'<head><title>head title</title></head>' in res.data
    #
    # Cleanup everything
    res = client.get(url_for("form_delete", uuid="all"), follow_redirects=True)
--- a/changedetectionio/tests/test_nonrenderable_pages.py
+++ b/changedetectionio/tests/test_nonrenderable_pages.py
@@ -1,12 +1,7 @@
 #!/usr/bin/env python3
 import time
 from flask import url_for
-from urllib.request import urlopen
+from .util import set_original_response, set_modified_response, live_server_setup, wait_for_all_checks
 from .util import set_original_response, set_modified_response, live_server_setup
 sleep_time_for_fetch_thread = 3
 def set_nonrenderable_response():
    test_return_data = """<html>
@@ -22,6 +17,13 @@ def set_nonrenderable_response():
    return None
 def set_zero_byte_response():
    with open("test-datastore/endpoint-content.txt", "w") as f:
        f.write("")
    return None
 def test_check_basic_change_detection_functionality(client, live_server, measure_memory_usage):
    set_original_response()
    live_server_setup(live_server)
@@ -35,18 +37,11 @@ def test_check_basic_change_detection_functionality(client, live_server, measure
    assert b"1 Imported" in res.data
-    time.sleep(sleep_time_for_fetch_thread)
+    wait_for_all_checks(client)
-    # Do this a few times.. ensures we dont accidently set the status
+    # It should report nothing found (no new 'unviewed' class)
-    for n in range(3):
+    res = client.get(url_for("index"))
-        client.get(url_for("form_watch_checknow"), follow_redirects=True)
+    assert b'unviewed' not in res.data
        # Give the thread time to pick it up
        time.sleep(sleep_time_for_fetch_thread)
        # It should report nothing found (no new 'unviewed' class)
        res = client.get(url_for("index"))
        assert b'unviewed' not in res.data
    #####################
@@ -64,7 +59,7 @@ def test_check_basic_change_detection_functionality(client, live_server, measure
    client.get(url_for("form_watch_checknow"), follow_redirects=True)
    # Give the thread time to pick it up
-    time.sleep(sleep_time_for_fetch_thread)
+    wait_for_all_checks(client)
    # It should report nothing found (no new 'unviewed' class)
    res = client.get(url_for("index"))
@@ -86,14 +81,20 @@ def test_check_basic_change_detection_functionality(client, live_server, measure
    client.get(url_for("form_watch_checknow"), follow_redirects=True)
    # Give the thread time to pick it up
-    time.sleep(sleep_time_for_fetch_thread)
+    wait_for_all_checks(client)
    # It should report nothing found (no new 'unviewed' class)
    res = client.get(url_for("index"))
    assert b'unviewed' in res.data
    client.get(url_for("mark_all_viewed"), follow_redirects=True)
-
+    # A totally zero byte (#2528) response should also not trigger an error
-
+    set_zero_byte_response()
    client.get(url_for("form_watch_checknow"), follow_redirects=True)
    wait_for_all_checks(client)
    res = client.get(url_for("index"))
    assert b'unviewed' in res.data # A change should have registered because empty_pages_are_a_change is ON
    assert b'fetch-error' not in res.data
    #
    # Cleanup everything
--- a/changedetectionio/update_worker.py
+++ b/changedetectionio/update_worker.py
@@ -1,6 +1,5 @@
 from .processors.exceptions import ProcessorException
-from . import content_fetchers
+import changedetectionio.content_fetchers.exceptions as content_fetchers_exceptions
 from changedetectionio.processors.text_json_diff.processor import FilterNotFoundInResponse
 from changedetectionio import html_tools
@@ -301,7 +300,7 @@ class update_worker(threading.Thread):
                        self.datastore.update_watch(uuid=uuid, update_obj={'last_error': e.message})
                        process_changedetection_results = False
-                    except content_fetchers.exceptions.ReplyWithContentButNoText as e:
+                    except content_fetchers_exceptions.ReplyWithContentButNoText as e:
                        # Totally fine, it's by choice - just continue on, nothing more to care about
                        # Page had elements/content but no renderable text
                        # Backend (not filters) gave zero output
@@ -327,7 +326,7 @@ class update_worker(threading.Thread):
                        process_changedetection_results = False
-                    except content_fetchers.exceptions.Non200ErrorCodeReceived as e:
+                    except content_fetchers_exceptions.Non200ErrorCodeReceived as e:
                        if e.status_code == 403:
                            err_text = "Error - 403 (Access denied) received"
                        elif e.status_code == 404:
@@ -380,23 +379,23 @@ class update_worker(threading.Thread):
                        process_changedetection_results = False
-                    except content_fetchers.exceptions.checksumFromPreviousCheckWasTheSame as e:
+                    except content_fetchers_exceptions.checksumFromPreviousCheckWasTheSame as e:
                        # Yes fine, so nothing todo, don't continue to process.
                        process_changedetection_results = False
                        changed_detected = False
-                    except content_fetchers.exceptions.BrowserConnectError as e:
+                    except content_fetchers_exceptions.BrowserConnectError as e:
                        self.datastore.update_watch(uuid=uuid,
                                                    update_obj={'last_error': e.msg
                                                                }
                                                    )
                        process_changedetection_results = False
-                    except content_fetchers.exceptions.BrowserFetchTimedOut as e:
+                    except content_fetchers_exceptions.BrowserFetchTimedOut as e:
                        self.datastore.update_watch(uuid=uuid,
                                                    update_obj={'last_error': e.msg
                                                                }
                                                    )
                        process_changedetection_results = False
-                    except content_fetchers.exceptions.BrowserStepsStepException as e:
+                    except content_fetchers_exceptions.BrowserStepsStepException as e:
                        if not self.datastore.data['watching'].get(uuid):
                            continue
@@ -438,25 +437,25 @@ class update_worker(threading.Thread):
                        process_changedetection_results = False
-                    except content_fetchers.exceptions.EmptyReply as e:
+                    except content_fetchers_exceptions.EmptyReply as e:
                        # Some kind of custom to-str handler in the exception handler that does this?
                        err_text = "EmptyReply - try increasing 'Wait seconds before extracting text', Status Code {}".format(e.status_code)
                        self.datastore.update_watch(uuid=uuid, update_obj={'last_error': err_text,
                                                                           'last_check_status': e.status_code})
                        process_changedetection_results = False
-                    except content_fetchers.exceptions.ScreenshotUnavailable as e:
+                    except content_fetchers_exceptions.ScreenshotUnavailable as e:
                        err_text = "Screenshot unavailable, page did not render fully in the expected time or page was too long - try increasing 'Wait seconds before extracting text'"
                        self.datastore.update_watch(uuid=uuid, update_obj={'last_error': err_text,
                                                                           'last_check_status': e.status_code})
                        process_changedetection_results = False
-                    except content_fetchers.exceptions.JSActionExceptions as e:
+                    except content_fetchers_exceptions.JSActionExceptions as e:
                        err_text = "Error running JS Actions - Page request - "+e.message
                        if e.screenshot:
                            watch.save_screenshot(screenshot=e.screenshot, as_error=True)
                        self.datastore.update_watch(uuid=uuid, update_obj={'last_error': err_text,
                                                                           'last_check_status': e.status_code})
                        process_changedetection_results = False
-                    except content_fetchers.exceptions.PageUnloadable as e:
+                    except content_fetchers_exceptions.PageUnloadable as e:
                        err_text = "Page request from server didnt respond correctly"
                        if e.message:
                            err_text = "{} - {}".format(err_text, e.message)
@@ -468,7 +467,7 @@ class update_worker(threading.Thread):
                                                                           'last_check_status': e.status_code,
                                                                           'has_ldjson_price_data': None})
                        process_changedetection_results = False
-                    except content_fetchers.exceptions.BrowserStepsInUnsupportedFetcher as e:
+                    except content_fetchers_exceptions.BrowserStepsInUnsupportedFetcher as e:
                        err_text = "This watch has Browser Steps configured and so it cannot run with the 'Basic fast Plaintext/HTTP Client', either remove the Browser Steps or select a Chrome fetcher."
                        self.datastore.update_watch(uuid=uuid, update_obj={'last_error': err_text})
                        process_changedetection_results = False
--- a/requirements.txt
+++ b/requirements.txt
@@ -79,8 +79,9 @@ pyppeteerstealth>=0.0.4
 pytest ~=7.2
 pytest-flask ~=1.2
-# Pin jsonschema version to prevent build errors on armv6 while rpds-py wheels aren't available (1708)
+# Anything 4.0 and up but not 5.0
-jsonschema==4.17.3
+jsonschema ~= 4.0
 loguru
Author	SHA1	Message	Date
dgtlmoon	21c63db01f	Update requirements.txt	2024-08-22 14:17:02 +02:00
dgtlmoon	e3387a00e3	Re #1708 unpin jsonschema	2024-08-22 14:13:04 +02:00
dgtlmoon	6dd1fa2b88	0.46.03	2024-08-19 17:22:13 +02:00
dgtlmoon	371f85d544	Watch 'Download last snapshot' link/button should give last, not first snapshot (#2576)	2024-08-19 17:20:30 +02:00
dgtlmoon	932cf15e1e	Price and restock scraping - small price fix scraper (#2575)	2024-08-19 15:47:19 +02:00
Mike Splain	bf0d410d32	Browser Steps UI - Interactive UI wasn't sending headers but was when the check ran (#2551)	2024-08-19 10:21:05 +02:00
dgtlmoon	730f37c7ba	Set encoding type for scraper script reader (#2574 #2568)	2024-08-19 09:17:18 +02:00
dgtlmoon	8a35d62e02	Handle zero-byte/empty content responses with "`[ ] Empty pages are a change`" option, the same as when the HTML doesn't render any useful text (#2530)	2024-07-29 13:27:59 +02:00