Compare commits


3 Commits

Author      SHA1         Message                                              Date
dgtlmoon    78f3f2b26a   Merge branch 'master' into selenium-proxy-fix        2025-05-02 14:05:45 +02:00
dgtlmoon    535ee97ef7   Selenium proxy fixes                                 2025-05-02 10:54:01 +02:00
dgtlmoon    b2923b8c3a   Fixes to ensure proxy errors are handled correctly   2025-05-02 10:21:27 +02:00
19 changed files with 172 additions and 228 deletions

View File

@@ -2,7 +2,7 @@
# Read more https://github.com/dgtlmoon/changedetection.io/wiki
__version__ = '0.49.17'
__version__ = '0.49.15'
from changedetectionio.strtobool import strtobool
from json.decoder import JSONDecodeError

View File

@@ -186,7 +186,7 @@ class fetcher(Fetcher):
self.page = context.new_page()
# Listen for all console events and handle errors
self.page.on("console", lambda msg: logger.debug(f"Playwright console: Watch URL: {url} {msg.type}: {msg.text} {msg.args}"))
self.page.on("console", lambda msg: print(f"Playwright console: Watch URL: {url} {msg.type}: {msg.text} {msg.args}"))
# Re-use as much code from browser steps as possible so its the same
from changedetectionio.blueprint.browser_steps.browser_steps import steppable_browser_interface
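Both versions of this line hook the same Playwright console event; the only difference is whether browser-side console chatter goes to stdout via print or into the application log. In isolation the pattern looks like this (a minimal sketch assuming Playwright's sync API and loguru, which the surrounding code already uses):

    from loguru import logger
    from playwright.sync_api import sync_playwright

    with sync_playwright() as p:
        browser = p.chromium.launch()
        page = browser.new_page()
        # Forward every browser-side console event into the application log
        page.on("console", lambda msg: logger.debug(f"Playwright console: {msg.type}: {msg.text}"))
        page.goto("https://example.com")
        browser.close()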

View File

@@ -10,13 +10,16 @@ class fetcher(Fetcher):
else:
fetcher_description = "WebDriver Chrome/Javascript"
# Configs for Proxy setup
# In the ENV vars, is prefixed with "webdriver_", so it is for example "webdriver_sslProxy"
selenium_proxy_settings_mappings = ['proxyType', 'ftpProxy', 'httpProxy', 'noProxy',
'proxyAutoconfigUrl', 'sslProxy', 'autodetect',
'socksProxy', 'socksVersion', 'socksUsername', 'socksPassword']
proxy = None
proxy_url = None
def __init__(self, proxy_override=None, custom_browser_connection_url=None):
super().__init__()
from urllib.parse import urlparse
from selenium.webdriver.common.proxy import Proxy
from selenium.webdriver.common.proxy import Proxy as SeleniumProxy
# .strip('"') is going to save someone a lot of time when they accidently wrap the env value
if not custom_browser_connection_url:
@@ -25,27 +28,25 @@ class fetcher(Fetcher):
self.browser_connection_is_custom = True
self.browser_connection_url = custom_browser_connection_url
# If any proxy settings are enabled, then we should setup the proxy object
proxy_args = {}
for k in self.selenium_proxy_settings_mappings:
v = os.getenv('webdriver_' + k, False)
if v:
proxy_args[k] = v.strip('"')
##### PROXY SETUP #####
# Map back standard HTTP_ and HTTPS_PROXY to webDriver httpProxy/sslProxy
if not proxy_args.get('webdriver_httpProxy') and self.system_http_proxy:
proxy_args['httpProxy'] = self.system_http_proxy
if not proxy_args.get('webdriver_sslProxy') and self.system_https_proxy:
proxy_args['httpsProxy'] = self.system_https_proxy
proxy_sources = [
self.system_http_proxy,
self.system_https_proxy,
os.getenv('webdriver_proxySocks'),
os.getenv('webdriver_socksProxy'),
os.getenv('webdriver_proxyHttp'),
os.getenv('webdriver_httpProxy'),
os.getenv('webdriver_proxyHttps'),
os.getenv('webdriver_httpsProxy'),
os.getenv('webdriver_sslProxy'),
proxy_override, # last one should override
]
# The built in selenium proxy handling is super unreliable!!! so we just grab which ever proxy setting we can find and throw it in --proxy-server=
for k in filter(None, proxy_sources):
if not k:
continue
self.proxy_url = k.strip()
# Allows override the proxy on a per-request basis
if proxy_override is not None:
proxy_args['httpProxy'] = proxy_override
if proxy_args:
self.proxy = SeleniumProxy(raw=proxy_args)
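The replacement strategy is a precedence chain: every known proxy source is listed from lowest to highest priority, and because the loop overwrites self.proxy_url on every non-empty entry, the last one wins, which is why proxy_override sits at the end of the list. (The inner "if not k" guard is redundant after filter(None, ...), but harmless.) A standalone sketch of the idea, with the env-var list abbreviated:

    import os

    def resolve_proxy_url(system_http_proxy=None, system_https_proxy=None, proxy_override=None):
        # Listed in ascending priority; later entries override earlier ones.
        proxy_sources = [
            system_http_proxy,
            system_https_proxy,
            os.getenv('webdriver_httpProxy'),
            os.getenv('webdriver_sslProxy'),
            proxy_override,  # last one should override
        ]
        proxy_url = None
        # filter(None, ...) drops unset entries; each survivor overwrites the previous.
        for candidate in filter(None, proxy_sources):
            proxy_url = candidate.strip()
        return proxy_url

Called as resolve_proxy_url(proxy_override='http://127.0.0.1:3128'), the override wins even when the webdriver_* variables are also set.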
def run(self,
url,
@@ -58,7 +59,9 @@ class fetcher(Fetcher):
is_binary=False,
empty_pages_are_a_change=False):
from selenium import webdriver
from selenium.webdriver.chrome.options import Options as ChromeOptions
from selenium.common.exceptions import WebDriverException
# request_body, request_method unused for now, until some magic in the future happens.
options = ChromeOptions()
@@ -73,62 +76,58 @@ class fetcher(Fetcher):
for opt in CHROME_OPTIONS:
options.add_argument(opt)
# 1. proxy_config /Proxy(proxy_config) selenium object is REALLY unreliable
# 2. selenium-wire cant be used because the websocket version conflicts with pypeteer-ng
# 3. selenium only allows ONE runner at a time by default!
# 4. driver must use quit() or it will continue to block/hold the selenium process!!
options.add_argument(f"--proxy-server={self.proxy}")
if self.proxy_url:
options.add_argument(f'--proxy-server={self.proxy_url}')
from selenium.webdriver.remote.remote_connection import RemoteConnection
from selenium.webdriver.remote.webdriver import WebDriver as RemoteWebDriver
driver = None
try:
# Create the RemoteConnection and set timeout (e.g., 30 seconds)
remote_connection = RemoteConnection(
self.browser_connection_url,
)
remote_connection.set_timeout(30) # seconds
# Now create the driver with the RemoteConnection
driver = RemoteWebDriver(
command_executor=remote_connection,
options=options
)
driver.set_page_load_timeout(int(os.getenv("WEBDRIVER_PAGELOAD_TIMEOUT", 45)))
except Exception as e:
if driver:
driver.quit()
raise e
self.driver = webdriver.Remote(
command_executor=self.browser_connection_url,
options=options)
try:
driver.get(url)
self.driver.get(url)
except WebDriverException as e:
# Be sure we close the session window
self.quit()
raise
if not "--window-size" in os.getenv("CHROME_OPTIONS", ""):
driver.set_window_size(1280, 1024)
if not "--window-size" in os.getenv("CHROME_OPTIONS", ""):
self.driver.set_window_size(1280, 1024)
driver.implicitly_wait(int(os.getenv("WEBDRIVER_DELAY_BEFORE_CONTENT_READY", 5)))
self.driver.implicitly_wait(int(os.getenv("WEBDRIVER_DELAY_BEFORE_CONTENT_READY", 5)))
if self.webdriver_js_execute_code is not None:
driver.execute_script(self.webdriver_js_execute_code)
# Selenium doesn't automatically wait for actions as good as Playwright, so wait again
driver.implicitly_wait(int(os.getenv("WEBDRIVER_DELAY_BEFORE_CONTENT_READY", 5)))
if self.webdriver_js_execute_code is not None:
self.driver.execute_script(self.webdriver_js_execute_code)
# Selenium doesn't automatically wait for actions as good as Playwright, so wait again
self.driver.implicitly_wait(int(os.getenv("WEBDRIVER_DELAY_BEFORE_CONTENT_READY", 5)))
# @todo - how to check this? is it possible?
self.status_code = 200
# @todo somehow we should try to get this working for WebDriver
# raise EmptyReply(url=url, status_code=r.status_code)
# @todo - dom wait loaded?
time.sleep(int(os.getenv("WEBDRIVER_DELAY_BEFORE_CONTENT_READY", 5)) + self.render_extract_delay)
self.content = driver.page_source
self.headers = {}
self.screenshot = driver.get_screenshot_as_png()
except Exception as e:
driver.quit()
raise e
# @todo - how to check this? is it possible?
self.status_code = 200
# @todo somehow we should try to get this working for WebDriver
# raise EmptyReply(url=url, status_code=r.status_code)
driver.quit()
# @todo - dom wait loaded?
time.sleep(int(os.getenv("WEBDRIVER_DELAY_BEFORE_CONTENT_READY", 5)) + self.render_extract_delay)
self.content = self.driver.page_source
self.headers = {}
self.screenshot = self.driver.get_screenshot_as_png()
# Does the connection to the webdriver work? run a test connection.
def is_ready(self):
from selenium import webdriver
from selenium.webdriver.chrome.options import Options as ChromeOptions
self.driver = webdriver.Remote(
command_executor=self.command_executor,
options=ChromeOptions())
# driver.quit() seems to cause better exceptions
self.quit()
return True
def quit(self, watch=None):
if self.driver:
try:
self.driver.quit()
except Exception as e:
logger.debug(f"Content Fetcher > Exception in chrome shutdown/quit {str(e)}")

View File

@@ -224,37 +224,27 @@ class StringDictKeyValue(StringField):
def _value(self):
if self.data:
output = ''
for k, v in self.data.items():
output += f"{k}: {v}\r\n"
output = u''
for k in self.data.keys():
output += "{}: {}\r\n".format(k, self.data[k])
return output
else:
return ''
return u''
# incoming data processing + validation
# incoming
def process_formdata(self, valuelist):
self.data = {}
errors = []
if valuelist:
# Remove empty strings (blank lines)
cleaned = [line.strip() for line in valuelist[0].split("\n") if line.strip()]
for idx, s in enumerate(cleaned, start=1):
if ':' not in s:
errors.append(f"Line {idx} is missing a ':' separator.")
continue
parts = s.split(':', 1)
key = parts[0].strip()
value = parts[1].strip()
self.data = {}
# Remove empty strings
cleaned = list(filter(None, valuelist[0].split("\n")))
for s in cleaned:
parts = s.strip().split(':', 1)
if len(parts) == 2:
self.data.update({parts[0].strip(): parts[1].strip()})
if not key:
errors.append(f"Line {idx} has an empty key.")
if not value:
errors.append(f"Line {idx} has an empty value.")
self.data[key] = value
if errors:
raise ValidationError("Invalid input:\n" + "\n".join(errors))
else:
self.data = {}
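The rewritten process_formdata is both stricter and friendlier: blank lines are dropped, every malformed line is recorded with its 1-based line number, and all problems come back in a single ValidationError, where the older len(parts) == 2 check silently discarded lines without a ':' separator. The same logic as a standalone helper (hypothetical name, plain ValueError standing in for wtforms' ValidationError):

    def parse_header_lines(raw: str) -> dict:
        data, errors = {}, []
        # Drop blank lines; error messages use 1-based numbering of what remains
        cleaned = [line.strip() for line in raw.split("\n") if line.strip()]
        for idx, line in enumerate(cleaned, start=1):
            if ':' not in line:
                errors.append(f"Line {idx} is missing a ':' separator.")
                continue
            key, value = (part.strip() for part in line.split(':', 1))
            if not key:
                errors.append(f"Line {idx} has an empty key.")
            if not value:
                errors.append(f"Line {idx} has an empty value.")
            data[key] = value
        if errors:
            raise ValueError("Invalid input:\n" + "\n".join(errors))
        return data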
class ValidateContentFetcherIsReady(object):
"""

View File

@@ -309,10 +309,10 @@ def extract_json_as_string(content, json_filter, ensure_is_ldjson_info_type=None
soup = BeautifulSoup(content, 'html.parser')
if ensure_is_ldjson_info_type:
bs_result = soup.find_all('script', {"type": "application/ld+json"})
bs_result = soup.findAll('script', {"type": "application/ld+json"})
else:
bs_result = soup.find_all('script')
bs_result += soup.find_all('body')
bs_result = soup.findAll('script')
bs_result += soup.findAll('body')
bs_jsons = []
for result in bs_result:
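find_all and findAll are the same BeautifulSoup method (the camelCase spelling is the legacy BS3-era alias), so either spelling behaves identically; the substance of this function is the scan itself: collect every script block of type application/ld+json and attempt to parse each one. A minimal sketch:

    import json
    from bs4 import BeautifulSoup

    html = '<script type="application/ld+json">{"@type": "Product", "offers": {"price": "19.99"}}</script>'
    soup = BeautifulSoup(html, 'html.parser')
    for script in soup.find_all('script', {"type": "application/ld+json"}):
        try:
            print(json.loads(script.get_text()))
        except json.JSONDecodeError:
            continue  # not every ld+json block in the wild is valid JSON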
@@ -436,27 +436,55 @@ def cdata_in_document_to_text(html_content: str, render_anchor_tag_content=False
return re.sub(pattern, repl, html_content)
# NOTE!! ANYTHING LIBXML, HTML5LIB ETC WILL CAUSE SOME SMALL MEMORY LEAK IN THE LOCAL "LIB" IMPLEMENTATION OUTSIDE PYTHON
def html_to_text_sub_worker(conn, html_content: str, render_anchor_tag_content=False, is_rss=False):
def html_to_text(html_content: str, render_anchor_tag_content=False, is_rss=False, timeout=10) -> str:
from inscriptis import get_text
from inscriptis.model.config import ParserConfig
"""Converts html string to a string with just the text. If ignoring
rendering anchor tag content is enable, anchor tag content are also
included in the text
:param html_content: string with html content
:param render_anchor_tag_content: boolean flag indicating whether to extract
hyperlinks (the anchor tag content) together with text. This refers to the
'href' inside 'a' tags.
Anchor tag content is rendered in the following manner:
'[ text ](anchor tag content)'
:return: extracted text from the HTML
"""
# if anchor tag content flag is set to True define a config for
# extracting this content
if render_anchor_tag_content:
parser_config = ParserConfig(
annotation_rules={"a": ["hyperlink"]},
display_links=True
)
# otherwise set config to None/default
else:
parser_config = None
# RSS Mode - Inscriptis will treat `title` as something else.
# Make it as a regular block display element (//item/title)
# This is a bit of a hack - the real way it to use XSLT to convert it to HTML #1874
if is_rss:
html_content = re.sub(r'<title([\s>])', r'<h1\1', html_content)
html_content = re.sub(r'</title>', r'</h1>', html_content)
text_content = get_text(html_content, config=parser_config)
return text_content
conn.send(text_content)
conn.close()
# NOTE!! ANYTHING LIBXML, HTML5LIB ETC WILL CAUSE SOME SMALL MEMORY LEAK IN THE LOCAL "LIB" IMPLEMENTATION OUTSIDE PYTHON
def html_to_text(html_content: str, render_anchor_tag_content=False, is_rss=False):
from multiprocessing import Process, Pipe
parent_conn, child_conn = Pipe()
p = Process(target=html_to_text_sub_worker, args=(child_conn, html_content, render_anchor_tag_content, is_rss))
p.start()
text = parent_conn.recv()
p.join()
return text
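The html_to_text_sub_worker arrangement exists because of the leak called out in the NOTE comments: inscriptis sits on lxml, whose C-level allocations never fully return to the Python process, so the conversion runs in a throwaway child process and the result comes back over a Pipe; when the child exits, the OS reclaims everything. The pattern in isolation:

    from multiprocessing import Pipe, Process

    def _convert_worker(conn, html_content: str):
        # Runs in the child process; any C-level leak dies with the process
        from inscriptis import get_text
        conn.send(get_text(html_content))
        conn.close()

    def html_to_text_isolated(html_content: str) -> str:
        parent_conn, child_conn = Pipe()
        p = Process(target=_convert_worker, args=(child_conn, html_content))
        p.start()
        text = parent_conn.recv()  # blocks until the worker sends its result
        p.join()
        return text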
# Does LD+JSON exist with a @type=='product' and a .price set anywhere?
def has_ldjson_product_info(content):

View File

@@ -89,7 +89,7 @@ class difference_detection_processor():
proxy_url = self.datastore.proxy_list.get(preferred_proxy_id).get('url')
logger.debug(f"Selected proxy key '{preferred_proxy_id}' as proxy URL '{proxy_url}' for {url}")
else:
logger.debug("Skipping adding proxy data when custom Browser endpoint is specified. ")
logger.debug(f"Skipping adding proxy data when custom Browser endpoint is specified. ")
# Now call the fetcher (playwright/requests/etc) with arguments that only a fetcher would need.
# When browser_connection_url is None, it method should default to working out whats the best defaults (os env vars etc)
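This hunk, and several like it further down, change nothing but an f prefix on a string literal that contains no placeholders; flake8 flags that as F541 since the prefix does nothing there. The distinction, using this hunk's own log lines (loguru assumed, as elsewhere in the codebase):

    from loguru import logger

    preferred_proxy_id, proxy_url, url = 'proxy-one', 'http://127.0.0.1:3128', 'https://example.com'
    # Placeholders present, so the f-prefix does real work:
    logger.debug(f"Selected proxy key '{preferred_proxy_id}' as proxy URL '{proxy_url}' for {url}")
    # No placeholders, so the f-prefix is inert (flake8 F541):
    logger.debug("Skipping adding proxy data when custom Browser endpoint is specified. ")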

View File

@@ -79,7 +79,7 @@ def get_itemprop_availability(html_content) -> Restock:
# First phase, dead simple scanning of anything that looks useful
value = Restock()
if data:
logger.debug("Using jsonpath to find price/availability/etc")
logger.debug(f"Using jsonpath to find price/availability/etc")
price_parse = parse('$..(price|Price)')
pricecurrency_parse = parse('$..(pricecurrency|currency|priceCurrency )')
availability_parse = parse('$..(availability|Availability)')
@@ -110,7 +110,7 @@ def get_itemprop_availability(html_content) -> Restock:
# Second, go dig OpenGraph which is something that jsonpath_ng cant do because of the tuples and double-dots (:)
if not value.get('price') or value.get('availability'):
logger.debug("Alternatively digging through OpenGraph properties for restock/price info..")
logger.debug(f"Alternatively digging through OpenGraph properties for restock/price info..")
jsonpath_expr = parse('$..properties')
for match in jsonpath_expr.find(data):
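For reference, these parse('$..(price|Price)') calls use jsonpath-ng's extended grammar: $.. searches recursively at any depth, and (a|b) unions the upper- and lower-case spellings that appear in real-world LD+JSON. A reduced sketch, assuming jsonpath-ng is installed:

    from jsonpath_ng.ext import parse

    data = {"offers": {"price": "19.99", "priceCurrency": "USD", "availability": "InStock"}}
    matches = parse('$..price').find(data)
    if matches:
        print(matches[0].value)  # -> 19.99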

View File

@@ -15,7 +15,7 @@ def _task(watch, update_handler):
except FilterNotFoundInResponse as e:
text_after_filter = f"Filter not found in HTML: {str(e)}"
except ReplyWithContentButNoText as e:
text_after_filter = "Filter found but no text (empty result)"
text_after_filter = f"Filter found but no text (empty result)"
except Exception as e:
text_after_filter = f"Error: {str(e)}"

View File

@@ -38,9 +38,6 @@ pytest tests/test_backend.py
pytest tests/test_rss.py
pytest tests/test_unique_lines.py
# Try high concurrency
FETCH_WORKERS=130 pytest tests/test_history_consistency.py -v -l
# Check file:// will pickup a file when enabled
echo "Hello world" > /tmp/test-file.txt
ALLOW_FILE_URI=yes pytest tests/test_security.py

View File

@@ -100,7 +100,8 @@ docker run --network changedet-network \
test-changedetectionio \
bash -c 'cd changedetectionio && FAST_PUPPETEER_CHROME_FETCHER=1 PLAYWRIGHT_DRIVER_URL=ws://sockpuppetbrowser:3000 pytest tests/proxy_list/test_proxy_noconnect.py'
# Selenium
# Selenium - todo - fix proxies
docker run --network changedet-network \
-e "WEBDRIVER_URL=http://selenium:4444/wd/hub" \
test-changedetectionio \
bash -c 'cd changedetectionio && WEBDRIVER_URL=http://selenium:4444/wd/hub pytest tests/proxy_list/test_proxy_noconnect.py'
bash -c 'cd changedetectionio && pytest tests/proxy_list/test_proxy_noconnect.py'

View File

@@ -7,7 +7,7 @@ from ..util import live_server_setup, wait_for_all_checks
def do_test(client, live_server, make_test_use_extra_browser=False):
# Grep for this string in the logs?
test_url = "https://changedetection.io/ci-test.html?non-custom-default=true"
test_url = f"https://changedetection.io/ci-test.html?non-custom-default=true"
# "non-custom-default" should not appear in the custom browser connection
custom_browser_name = 'custom browser URL'
@@ -51,7 +51,7 @@ def do_test(client, live_server, make_test_use_extra_browser=False):
url_for("ui.ui_edit.edit_page", uuid="first"),
data={
# 'run_customer_browser_url_tests.sh' will search for this string to know if we hit the right browser container or not
"url": "https://changedetection.io/ci-test.html?custom-browser-search-string=1",
"url": f"https://changedetection.io/ci-test.html?custom-browser-search-string=1",
"tags": "",
"headers": "",
'fetch_backend': f"extra_browser_{custom_browser_name}",

View File

@@ -7,11 +7,6 @@ from ... import strtobool
# Just to be sure the UI outputs the right error message on proxy connection failed
# docker run -p 4444:4444 --rm --shm-size="2g" selenium/standalone-chrome:4
# PLAYWRIGHT_DRIVER_URL=ws://127.0.0.1:3000 pytest tests/proxy_list/test_proxy_noconnect.py
# FAST_PUPPETEER_CHROME_FETCHER=True PLAYWRIGHT_DRIVER_URL=ws://127.0.0.1:3000 pytest tests/proxy_list/test_proxy_noconnect.py
# WEBDRIVER_URL=http://127.0.0.1:4444/wd/hub pytest tests/proxy_list/test_proxy_noconnect.py
def test_proxy_noconnect_custom(client, live_server, measure_memory_usage):
live_server_setup(live_server)
@@ -21,48 +16,38 @@ def test_proxy_noconnect_custom(client, live_server, measure_memory_usage):
data={
"requests-time_between_check-minutes": 180,
"application-ignore_whitespace": "y",
"application-fetch_backend": 'html_webdriver' if os.getenv('PLAYWRIGHT_DRIVER_URL') or os.getenv("WEBDRIVER_URL") else 'html_requests',
"application-fetch_backend": 'html_webdriver' if os.getenv('PLAYWRIGHT_DRIVER_URL') else 'html_requests',
"requests-extra_proxies-0-proxy_name": "custom-test-proxy",
# test:awesome is set in tests/proxy_list/squid-passwords.txt
"requests-extra_proxies-0-proxy_url": "http://127.0.0.1:3128",
"requests-extra_proxies-0-proxy_url": "http://THISPROXYDOESNTEXIST:3128",
},
follow_redirects=True
)
assert b"Settings updated." in res.data
test_url = "https://changedetection.io"
res = client.post(
url_for("ui.ui_views.form_quick_watch_add"),
data={"url": test_url, "tags": '', 'edit_and_watch_submit_button': 'Edit > Watch'},
url_for("imports.import_page"),
# Because a URL wont show in squid/proxy logs due it being SSLed
# Use plain HTTP or a specific domain-name here
data={"urls": "https://changedetection.io/CHANGELOG.txt"},
follow_redirects=True
)
assert b"Watch added in Paused state, saving will unpause" in res.data
options = {
"url": test_url,
"fetch_backend": "html_webdriver" if os.getenv('PLAYWRIGHT_DRIVER_URL') or os.getenv("WEBDRIVER_URL") else "html_requests",
"proxy": "ui-0custom-test-proxy",
}
res = client.post(
url_for("ui.ui_edit.edit_page", uuid="first", unpause_on_save=1),
data=options,
follow_redirects=True
)
assert b"unpaused" in res.data
import time
assert b"1 Imported" in res.data
wait_for_all_checks(client)
# Requests default
check_string = b'Cannot connect to proxy'
res = client.get(url_for("watchlist.index"))
assert b'Page.goto: net::ERR_PROXY_CONNECTION_FAILED' in res.data
if os.getenv('PLAYWRIGHT_DRIVER_URL') or strtobool(os.getenv('FAST_PUPPETEER_CHROME_FETCHER', 'False')) or os.getenv("WEBDRIVER_URL"):
# Requests
check_string = b'Proxy connection failed?'
if os.getenv('PLAYWRIGHT_DRIVER_URL') or strtobool(os.getenv('FAST_PUPPETEER_CHROME_FETCHER', 'False')):
check_string = b'ERR_PROXY_CONNECTION_FAILED'
if os.getenv("WEBDRIVER_URL"):
check_string = b'ERR_PROXY_CONNECTION_FAILED'
res = client.get(url_for("watchlist.index"))
#with open("/tmp/debug.html", 'wb') as f:
# f.write(res.data)
assert check_string in res.data
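The closing assertion is driven by whichever fetcher backend the CI run has configured, since each one surfaces a proxy failure with different wording. Schematically (a simplified sketch that skips the strtobool parsing used above):

    import os

    def expected_proxy_error() -> bytes:
        # Plain requests backend wording
        check_string = b'Cannot connect to proxy'
        if os.getenv('PLAYWRIGHT_DRIVER_URL') or os.getenv('FAST_PUPPETEER_CHROME_FETCHER') or os.getenv('WEBDRIVER_URL'):
            # The Chromium-based backends all surface Chrome's own error code
            check_string = b'ERR_PROXY_CONNECTION_FAILED'
        return check_string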

View File

@@ -7,7 +7,7 @@ from changedetectionio.tests.util import live_server_setup, wait_for_all_checks,
def set_response():
import time
data = """<html>
data = f"""<html>
<body>
<h1>Awesome, you made it</h1>
yeah the socks request worked

View File

@@ -6,7 +6,7 @@ from changedetectionio.tests.util import live_server_setup, wait_for_all_checks
def set_response():
import time
data = """<html>
data = f"""<html>
<body>
<h1>Awesome, you made it</h1>
yeah the socks request worked

View File

@@ -10,8 +10,8 @@ from urllib.parse import urlparse, parse_qs
def test_consistent_history(client, live_server, measure_memory_usage):
live_server_setup(live_server)
workers = int(os.getenv("FETCH_WORKERS", 10))
r = range(1, 10+workers)
r = range(1, 30)
for one in r:
test_url = url_for('test_endpoint', content_type="text/html", content=str(one), _external=True)
@@ -46,10 +46,9 @@ def test_consistent_history(client, live_server, measure_memory_usage):
# assert the right amount of watches was found in the JSON
assert len(json_obj['watching']) == len(r), "Correct number of watches was found in the JSON"
i=0
# each one should have a history.txt containing just one line
for w in json_obj['watching'].keys():
i+=1
history_txt_index_file = os.path.join(live_server.app.config['DATASTORE'].datastore_path, w, 'history.txt')
assert os.path.isfile(history_txt_index_file), f"History.txt should exist where I expect it at {history_txt_index_file}"
@@ -59,8 +58,8 @@ def test_consistent_history(client, live_server, measure_memory_usage):
assert len(tmp_history) == 1, "History.txt should contain 1 line"
# Should be two files,. the history.txt , and the snapshot.txt
files_in_watch_dir = os.listdir(os.path.join(live_server.app.config['DATASTORE'].datastore_path, w))
files_in_watch_dir = os.listdir(os.path.join(live_server.app.config['DATASTORE'].datastore_path,
w))
# Find the snapshot one
for fname in files_in_watch_dir:
if fname != 'history.txt' and 'html' not in fname:
@@ -76,6 +75,7 @@ def test_consistent_history(client, live_server, measure_memory_usage):
assert len(files_in_watch_dir) == 3, "Should be just three files in the dir, html.br snapshot, history.txt and the extracted text snapshot"
json_db_file = os.path.join(live_server.app.config['DATASTORE'].datastore_path, 'url-watches.json')
with open(json_db_file, 'r') as f:
assert '"default"' not in f.read(), "'default' probably shouldnt be here, it came from when the 'default' Watch vars were accidently being saved"

View File

@@ -6,7 +6,7 @@ from changedetectionio.tests.util import live_server_setup, wait_for_all_checks,
def set_response():
data = """<html>
data = f"""<html>
<body>Awesome, you made it<br>
yeah the socks request worked<br>
something to ignore<br>

View File

@@ -424,27 +424,3 @@ def test_headers_textfile_in_request(client, live_server, measure_memory_usage):
# unlink headers.txt on start/stop
res = client.get(url_for("ui.form_delete", uuid="all"), follow_redirects=True)
assert b'Deleted' in res.data
def test_headers_validation(client, live_server):
#live_server_setup(live_server)
test_url = url_for('test_headers', _external=True)
res = client.post(
url_for("imports.import_page"),
data={"urls": test_url},
follow_redirects=True
)
assert b"1 Imported" in res.data
res = client.post(
url_for("ui.ui_edit.edit_page", uuid="first"),
data={
"url": test_url,
"fetch_backend": 'html_requests',
"headers": "User-AGent agent-from-watch\r\nsadfsadfsadfsdaf\r\n:foobar"},
follow_redirects=True
)
assert b"Line 1 is missing a &#39;:&#39; separator." in res.data
assert b"Line 3 has an empty key." in res.data

View File

@@ -126,51 +126,18 @@ def extract_UUID_from_client(client):
uuid = m.group(1)
return uuid.strip()
def wait_for_all_checks(client=None):
"""
Waits until the queue is empty and remains empty for at least `required_empty_duration` seconds,
and also ensures no running threads have `current_uuid` set.
Retries for up to `max_attempts` times, sleeping `wait_between_attempts` seconds between checks.
"""
from changedetectionio.flask_app import update_q as global_update_q, running_update_threads
# Configuration
attempt = 0
i=0
max_attempts = 60
wait_between_attempts = 2
required_empty_duration = 2
logger = logging.getLogger()
time.sleep(1.2)
empty_since = None
while attempt < max_attempts:
q_length = global_update_q.qsize()
# Check if any threads are still processing
time.sleep(1.2)
any_threads_busy = any(t.current_uuid for t in running_update_threads)
if q_length == 0 and not any_threads_busy:
if empty_since is None:
empty_since = time.time()
logger.info(f"Queue empty and no active threads at attempt {attempt}, starting empty timer...")
elif time.time() - empty_since >= required_empty_duration:
logger.info(f"Queue has been empty and threads idle for {required_empty_duration} seconds. Done waiting.")
break
else:
logger.info(f"Still waiting: queue empty and no active threads, but not yet {required_empty_duration} seconds...")
else:
if q_length != 0:
logger.info(f"Queue not empty (size={q_length}), resetting timer.")
if any_threads_busy:
busy_threads = [t.name for t in running_update_threads if t.current_uuid]
logger.info(f"Threads still busy: {busy_threads}, resetting timer.")
empty_since = None
def wait_for_all_checks(client):
# actually this is not entirely true, it can still be 'processing' but not in the queue
# Loop waiting until done..
attempt=0
# because sub-second rechecks are problematic in testing, use lots of delays
time.sleep(1)
while attempt < 60:
res = client.get(url_for("watchlist.index"))
if not b'Checking now' in res.data:
break
logging.getLogger().info("Waiting for watch-list to not say 'Checking now'.. {}".format(attempt))
time.sleep(1)
attempt += 1
time.sleep(1)
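One variant above polls the rendered watch list for a 'Checking now' badge; the other inspects the update queue and worker threads directly and adds a debounce, returning only after everything has stayed idle for a continuous quiet window. The core pattern, reduced to a sketch:

    import time

    def wait_until_quiet(is_busy, required_quiet=2, max_attempts=60, poll=1.2):
        quiet_since = None
        for _ in range(max_attempts):
            time.sleep(poll)
            if is_busy():
                quiet_since = None                  # any activity resets the timer
            elif quiet_since is None:
                quiet_since = time.time()           # start the quiet window
            elif time.time() - quiet_since >= required_quiet:
                return True                         # idle long enough, done
        return False

where is_busy would be something like: lambda: global_update_q.qsize() > 0 or any(t.current_uuid for t in running_update_threads).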

View File

@@ -32,7 +32,7 @@ dnspython==2.6.1 # related to eventlet fixes
# jq not available on Windows so must be installed manually
# Notification library
apprise==1.9.3
apprise==1.9.2
# apprise mqtt https://github.com/dgtlmoon/changedetection.io/issues/315
# use any version other than 2.0.x due to https://github.com/eclipse/paho.mqtt.python/issues/814
@@ -42,7 +42,7 @@ paho-mqtt!=2.0.*
cryptography~=42.0.8
# Used for CSS filtering
beautifulsoup4>=4.0.0
beautifulsoup4
# XPath filtering, lxml is required by bs4 anyway, but put it here to be safe.
# #2328 - 5.2.0 and 5.2.1 had extra CPU flag CFLAGS set which was not compatible on older hardware
@@ -53,7 +53,8 @@ lxml >=4.8.0,<6,!=5.2.0,!=5.2.1
# XPath 2.0-3.1 support - 4.2.0 broke something?
elementpath==4.1.5
selenium~=4.31.0
selenium==4.31.0
# https://github.com/pallets/werkzeug/issues/2985
# Maybe related to pytest?
@@ -70,7 +71,7 @@ jq~=1.3; python_version >= "3.8" and sys_platform == "linux"
# playwright is installed at Dockerfile build time because it's not available on all platforms
pyppeteer-ng==2.0.0rc10
pyppeteer-ng==2.0.0rc9
pyppeteerstealth>=0.0.4
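On the pinning styles mixed in this file: == freezes one exact version, ~=4.31.0 ('compatible release') permits patch updates but not 4.32, != carves out known-bad releases, and a bare name accepts anything. The packaging library (what pip itself uses) can confirm the semantics:

    from packaging.specifiers import SpecifierSet

    assert "4.31.5" in SpecifierSet("~=4.31.0")       # patch upgrades allowed
    assert "4.32.0" not in SpecifierSet("~=4.31.0")   # minor bumps excluded
    assert "5.2.1" not in SpecifierSet(">=4.8.0,<6,!=5.2.0,!=5.2.1")  # the lxml carve-outs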