Compare commits


3 Commits

Author      SHA1         Message                                              Date
dgtlmoon    78f3f2b26a   Merge branch 'master' into selenium-proxy-fix        2025-05-02 14:05:45 +02:00
dgtlmoon    535ee97ef7   Selenium proxy fixes                                 2025-05-02 10:54:01 +02:00
dgtlmoon    b2923b8c3a   Fixes to ensure proxy errors are handled correctly   2025-05-02 10:21:27 +02:00
19 changed files with 172 additions and 228 deletions

View File

@@ -2,7 +2,7 @@
# Read more https://github.com/dgtlmoon/changedetection.io/wiki
__version__ = '0.49.17'
__version__ = '0.49.15'
from changedetectionio.strtobool import strtobool
from json.decoder import JSONDecodeError

View File

@@ -186,7 +186,7 @@ class fetcher(Fetcher):
self.page = context.new_page()
# Listen for all console events and handle errors
self.page.on("console", lambda msg: logger.debug(f"Playwright console: Watch URL: {url} {msg.type}: {msg.text} {msg.args}"))
self.page.on("console", lambda msg: print(f"Playwright console: Watch URL: {url} {msg.type}: {msg.text} {msg.args}"))
# Re-use as much code from browser steps as possible so its the same
from changedetectionio.blueprint.browser_steps.browser_steps import steppable_browser_interface
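Both versions of this line hook the same Playwright console event; the only difference is whether browser-side console chatter goes to stdout via print or into the application log. In isolation the pattern looks like this (a minimal sketch assuming Playwright's sync API and loguru, which the surrounding code already uses):

    from loguru import logger
    from playwright.sync_api import sync_playwright

    with sync_playwright() as p:
        browser = p.chromium.launch()
        page = browser.new_page()
        # Forward every browser-side console event into the application log
        page.on("console", lambda msg: logger.debug(f"Playwright console: {msg.type}: {msg.text}"))
        page.goto("https://example.com")
        browser.close()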

View File

@@ -10,13 +10,16 @@ class fetcher(Fetcher):
else:
fetcher_description = "WebDriver Chrome/Javascript"
# Configs for Proxy setup
# In the ENV vars, is prefixed with "webdriver_", so it is for example "webdriver_sslProxy"
selenium_proxy_settings_mappings = ['proxyType', 'ftpProxy', 'httpProxy', 'noProxy',
'proxyAutoconfigUrl', 'sslProxy', 'autodetect',
'socksProxy', 'socksVersion', 'socksUsername', 'socksPassword']
proxy = None
proxy_url = None
def __init__(self, proxy_override=None, custom_browser_connection_url=None):
super().__init__()
from urllib.parse import urlparse
from selenium.webdriver.common.proxy import Proxy
from selenium.webdriver.common.proxy import Proxy as SeleniumProxy
# .strip('"') is going to save someone a lot of time when they accidently wrap the env value
if not custom_browser_connection_url:
@@ -25,27 +28,25 @@ class fetcher(Fetcher):
self.browser_connection_is_custom = True
self.browser_connection_url = custom_browser_connection_url
# If any proxy settings are enabled, then we should setup the proxy object
proxy_args = {}
for k in self.selenium_proxy_settings_mappings:
v = os.getenv('webdriver_' + k, False)
if v:
proxy_args[k] = v.strip('"')
##### PROXY SETUP #####
# Map back standard HTTP_ and HTTPS_PROXY to webDriver httpProxy/sslProxy
if not proxy_args.get('webdriver_httpProxy') and self.system_http_proxy:
proxy_args['httpProxy'] = self.system_http_proxy
if not proxy_args.get('webdriver_sslProxy') and self.system_https_proxy:
proxy_args['httpsProxy'] = self.system_https_proxy
proxy_sources = [
self.system_http_proxy,
self.system_https_proxy,
os.getenv('webdriver_proxySocks'),
os.getenv('webdriver_socksProxy'),
os.getenv('webdriver_proxyHttp'),
os.getenv('webdriver_httpProxy'),
os.getenv('webdriver_proxyHttps'),
os.getenv('webdriver_httpsProxy'),
os.getenv('webdriver_sslProxy'),
proxy_override, # last one should override
]
# The built in selenium proxy handling is super unreliable!!! so we just grab which ever proxy setting we can find and throw it in --proxy-server=
for k in filter(None, proxy_sources):
if not k:
continue
self.proxy_url = k.strip()
# Allows override the proxy on a per-request basis
if proxy_override is not None:
proxy_args['httpProxy'] = proxy_override
if proxy_args:
self.proxy = SeleniumProxy(raw=proxy_args)
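The replacement strategy is a precedence chain: every known proxy source is listed from lowest to highest priority, and because the loop overwrites self.proxy_url on every non-empty entry, the last one wins, which is why proxy_override sits at the end of the list. (The inner "if not k" guard is redundant after filter(None, ...), but harmless.) A standalone sketch of the idea, with the env-var list abbreviated:

    import os

    def resolve_proxy_url(system_http_proxy=None, system_https_proxy=None, proxy_override=None):
        # Listed in ascending priority; later entries override earlier ones.
        proxy_sources = [
            system_http_proxy,
            system_https_proxy,
            os.getenv('webdriver_httpProxy'),
            os.getenv('webdriver_sslProxy'),
            proxy_override,  # last one should override
        ]
        proxy_url = None
        # filter(None, ...) drops unset entries; each survivor overwrites the previous.
        for candidate in filter(None, proxy_sources):
            proxy_url = candidate.strip()
        return proxy_url

Called as resolve_proxy_url(proxy_override='http://127.0.0.1:3128'), the override wins even when the webdriver_* variables are also set.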
def run(self,
url,
@@ -58,7 +59,9 @@ class fetcher(Fetcher):
is_binary=False,
empty_pages_are_a_change=False):
from selenium import webdriver
from selenium.webdriver.chrome.options import Options as ChromeOptions
from selenium.common.exceptions import WebDriverException
# request_body, request_method unused for now, until some magic in the future happens.
options = ChromeOptions()
@@ -73,62 +76,58 @@ class fetcher(Fetcher):
for opt in CHROME_OPTIONS:
options.add_argument(opt)
# 1. proxy_config /Proxy(proxy_config) selenium object is REALLY unreliable
# 2. selenium-wire cant be used because the websocket version conflicts with pypeteer-ng
# 3. selenium only allows ONE runner at a time by default!
# 4. driver must use quit() or it will continue to block/hold the selenium process!!
options.add_argument(f"--proxy-server={self.proxy}")
if self.proxy_url:
options.add_argument(f'--proxy-server={self.proxy_url}')
from selenium.webdriver.remote.remote_connection import RemoteConnection
from selenium.webdriver.remote.webdriver import WebDriver as RemoteWebDriver
driver = None
try:
# Create the RemoteConnection and set timeout (e.g., 30 seconds)
remote_connection = RemoteConnection(
self.browser_connection_url,
)
remote_connection.set_timeout(30) # seconds
# Now create the driver with the RemoteConnection
driver = RemoteWebDriver(
command_executor=remote_connection,
options=options
)
driver.set_page_load_timeout(int(os.getenv("WEBDRIVER_PAGELOAD_TIMEOUT", 45)))
except Exception as e:
if driver:
driver.quit()
raise e
self.driver = webdriver.Remote(
command_executor=self.browser_connection_url,
options=options)
try:
driver.get(url)
self.driver.get(url)
except WebDriverException as e:
# Be sure we close the session window
self.quit()
raise
if not "--window-size" in os.getenv("CHROME_OPTIONS", ""):
driver.set_window_size(1280, 1024)
if not "--window-size" in os.getenv("CHROME_OPTIONS", ""):
self.driver.set_window_size(1280, 1024)
driver.implicitly_wait(int(os.getenv("WEBDRIVER_DELAY_BEFORE_CONTENT_READY", 5)))
self.driver.implicitly_wait(int(os.getenv("WEBDRIVER_DELAY_BEFORE_CONTENT_READY", 5)))
if self.webdriver_js_execute_code is not None:
driver.execute_script(self.webdriver_js_execute_code)
# Selenium doesn't automatically wait for actions as good as Playwright, so wait again
driver.implicitly_wait(int(os.getenv("WEBDRIVER_DELAY_BEFORE_CONTENT_READY", 5)))
if self.webdriver_js_execute_code is not None:
self.driver.execute_script(self.webdriver_js_execute_code)
# Selenium doesn't automatically wait for actions as good as Playwright, so wait again
self.driver.implicitly_wait(int(os.getenv("WEBDRIVER_DELAY_BEFORE_CONTENT_READY", 5)))
# @todo - how to check this? is it possible?
self.status_code = 200
# @todo somehow we should try to get this working for WebDriver
# raise EmptyReply(url=url, status_code=r.status_code)
# @todo - dom wait loaded?
time.sleep(int(os.getenv("WEBDRIVER_DELAY_BEFORE_CONTENT_READY", 5)) + self.render_extract_delay)
self.content = driver.page_source
self.headers = {}
self.screenshot = driver.get_screenshot_as_png()
except Exception as e:
driver.quit()
raise e
# @todo - how to check this? is it possible?
self.status_code = 200
# @todo somehow we should try to get this working for WebDriver
# raise EmptyReply(url=url, status_code=r.status_code)
driver.quit()
# @todo - dom wait loaded?
time.sleep(int(os.getenv("WEBDRIVER_DELAY_BEFORE_CONTENT_READY", 5)) + self.render_extract_delay)
self.content = self.driver.page_source
self.headers = {}
self.screenshot = self.driver.get_screenshot_as_png()
# Does the connection to the webdriver work? run a test connection.
def is_ready(self):
from selenium import webdriver
from selenium.webdriver.chrome.options import Options as ChromeOptions
self.driver = webdriver.Remote(
command_executor=self.command_executor,
options=ChromeOptions())
# driver.quit() seems to cause better exceptions
self.quit()
return True
def quit(self, watch=None):
if self.driver:
try:
self.driver.quit()
except Exception as e:
logger.debug(f"Content Fetcher > Exception in chrome shutdown/quit {str(e)}")

View File

@@ -224,37 +224,27 @@ class StringDictKeyValue(StringField):
def _value(self):
if self.data:
output = ''
for k, v in self.data.items():
output += f"{k}: {v}\r\n"
output = u''
for k in self.data.keys():
output += "{}: {}\r\n".format(k, self.data[k])
return output
else:
return ''
return u''
# incoming data processing + validation
# incoming
def process_formdata(self, valuelist):
self.data = {}
errors = []
if valuelist:
# Remove empty strings (blank lines)
cleaned = [line.strip() for line in valuelist[0].split("\n") if line.strip()]
for idx, s in enumerate(cleaned, start=1):
if ':' not in s:
errors.append(f"Line {idx} is missing a ':' separator.")
continue
parts = s.split(':', 1)
key = parts[0].strip()
value = parts[1].strip()
self.data = {}
# Remove empty strings
cleaned = list(filter(None, valuelist[0].split("\n")))
for s in cleaned:
parts = s.strip().split(':', 1)
if len(parts) == 2:
self.data.update({parts[0].strip(): parts[1].strip()})
if not key:
errors.append(f"Line {idx} has an empty key.")
if not value:
errors.append(f"Line {idx} has an empty value.")
self.data[key] = value
if errors:
raise ValidationError("Invalid input:\n" + "\n".join(errors))
else:
self.data = {}
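The rewritten process_formdata is both stricter and friendlier: blank lines are dropped, every malformed line is recorded with its 1-based line number, and all problems come back in a single ValidationError, where the older len(parts) == 2 check silently discarded lines without a ':' separator. The same logic as a standalone helper (hypothetical name, plain ValueError standing in for wtforms' ValidationError):

    def parse_header_lines(raw: str) -> dict:
        data, errors = {}, []
        # Drop blank lines; error messages use 1-based numbering of what remains
        cleaned = [line.strip() for line in raw.split("\n") if line.strip()]
        for idx, line in enumerate(cleaned, start=1):
            if ':' not in line:
                errors.append(f"Line {idx} is missing a ':' separator.")
                continue
            key, value = (part.strip() for part in line.split(':', 1))
            if not key:
                errors.append(f"Line {idx} has an empty key.")
            if not value:
                errors.append(f"Line {idx} has an empty value.")
            data[key] = value
        if errors:
            raise ValueError("Invalid input:\n" + "\n".join(errors))
        return data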
class ValidateContentFetcherIsReady(object):
"""

View File

@@ -309,10 +309,10 @@ def extract_json_as_string(content, json_filter, ensure_is_ldjson_info_type=None
soup = BeautifulSoup(content, 'html.parser')
if ensure_is_ldjson_info_type:
bs_result = soup.find_all('script', {"type": "application/ld+json"})
bs_result = soup.findAll('script', {"type": "application/ld+json"})
else:
bs_result = soup.find_all('script')
bs_result += soup.find_all('body')
bs_result = soup.findAll('script')
bs_result += soup.findAll('body')
bs_jsons = []
for result in bs_result:
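find_all and findAll are the same BeautifulSoup method (the camelCase spelling is the legacy BS3-era alias), so either spelling behaves identically; the substance of this function is the scan itself: collect every script block of type application/ld+json and attempt to parse each one. A minimal sketch:

    import json
    from bs4 import BeautifulSoup

    html = '<script type="application/ld+json">{"@type": "Product", "offers": {"price": "19.99"}}</script>'
    soup = BeautifulSoup(html, 'html.parser')
    for script in soup.find_all('script', {"type": "application/ld+json"}):
        try:
            print(json.loads(script.get_text()))
        except json.JSONDecodeError:
            continue  # not every ld+json block in the wild is valid JSON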
@@ -436,27 +436,55 @@ def cdata_in_document_to_text(html_content: str, render_anchor_tag_content=False
return re.sub(pattern, repl, html_content)
# NOTE!! ANYTHING LIBXML, HTML5LIB ETC WILL CAUSE SOME SMALL MEMORY LEAK IN THE LOCAL "LIB" IMPLEMENTATION OUTSIDE PYTHON
def html_to_text_sub_worker(conn, html_content: str, render_anchor_tag_content=False, is_rss=False):
def html_to_text(html_content: str, render_anchor_tag_content=False, is_rss=False, timeout=10) -> str:
from inscriptis import get_text
from inscriptis.model.config import ParserConfig
"""Converts html string to a string with just the text. If ignoring
rendering anchor tag content is enable, anchor tag content are also
included in the text
:param html_content: string with html content
:param render_anchor_tag_content: boolean flag indicating whether to extract
hyperlinks (the anchor tag content) together with text. This refers to the
'href' inside 'a' tags.
Anchor tag content is rendered in the following manner:
'[ text ](anchor tag content)'
:return: extracted text from the HTML
"""
# if anchor tag content flag is set to True define a config for
# extracting this content
if render_anchor_tag_content:
parser_config = ParserConfig(
annotation_rules={"a": ["hyperlink"]},
display_links=True
)
# otherwise set config to None/default
else:
parser_config = None
# RSS Mode - Inscriptis will treat `title` as something else.
# Make it as a regular block display element (//item/title)
# This is a bit of a hack - the real way it to use XSLT to convert it to HTML #1874
if is_rss:
html_content = re.sub(r'<title([\s>])', r'<h1\1', html_content)
html_content = re.sub(r'</title>', r'</h1>', html_content)
text_content = get_text(html_content, config=parser_config)
return text_content
conn.send(text_content)
conn.close()
# NOTE!! ANYTHING LIBXML, HTML5LIB ETC WILL CAUSE SOME SMALL MEMORY LEAK IN THE LOCAL "LIB" IMPLEMENTATION OUTSIDE PYTHON
def html_to_text(html_content: str, render_anchor_tag_content=False, is_rss=False):
from multiprocessing import Process, Pipe
parent_conn, child_conn = Pipe()
p = Process(target=html_to_text_sub_worker, args=(child_conn, html_content, render_anchor_tag_content, is_rss))
p.start()
text = parent_conn.recv()
p.join()
return text
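The html_to_text_sub_worker arrangement exists because of the leak called out in the NOTE comments: inscriptis sits on lxml, whose C-level allocations never fully return to the Python process, so the conversion runs in a throwaway child process and the result comes back over a Pipe; when the child exits, the OS reclaims everything. The pattern in isolation:

    from multiprocessing import Pipe, Process

    def _convert_worker(conn, html_content: str):
        # Runs in the child process; any C-level leak dies with the process
        from inscriptis import get_text
        conn.send(get_text(html_content))
        conn.close()

    def html_to_text_isolated(html_content: str) -> str:
        parent_conn, child_conn = Pipe()
        p = Process(target=_convert_worker, args=(child_conn, html_content))
        p.start()
        text = parent_conn.recv()  # blocks until the worker sends its result
        p.join()
        return text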
# Does LD+JSON exist with a @type=='product' and a .price set anywhere?
def has_ldjson_product_info(content):

View File

@@ -89,7 +89,7 @@ class difference_detection_processor():
proxy_url = self.datastore.proxy_list.get(preferred_proxy_id).get('url')
logger.debug(f"Selected proxy key '{preferred_proxy_id}' as proxy URL '{proxy_url}' for {url}")
else:
logger.debug("Skipping adding proxy data when custom Browser endpoint is specified. ")
logger.debug(f"Skipping adding proxy data when custom Browser endpoint is specified. ")
# Now call the fetcher (playwright/requests/etc) with arguments that only a fetcher would need.
# When browser_connection_url is None, it method should default to working out whats the best defaults (os env vars etc)
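This hunk, and several like it further down, change nothing but an f prefix on a string literal that contains no placeholders; flake8 flags that as F541 since the prefix does nothing there. The distinction, using this hunk's own log lines (loguru assumed, as elsewhere in the codebase):

    from loguru import logger

    preferred_proxy_id, proxy_url, url = 'proxy-one', 'http://127.0.0.1:3128', 'https://example.com'
    # Placeholders present, so the f-prefix does real work:
    logger.debug(f"Selected proxy key '{preferred_proxy_id}' as proxy URL '{proxy_url}' for {url}")
    # No placeholders, so the f-prefix is inert (flake8 F541):
    logger.debug("Skipping adding proxy data when custom Browser endpoint is specified. ")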

View File

@@ -79,7 +79,7 @@ def get_itemprop_availability(html_content) -> Restock:
# First phase, dead simple scanning of anything that looks useful
value = Restock()
if data:
logger.debug("Using jsonpath to find price/availability/etc")
logger.debug(f"Using jsonpath to find price/availability/etc")
price_parse = parse('$..(price|Price)')
pricecurrency_parse = parse('$..(pricecurrency|currency|priceCurrency )')
availability_parse = parse('$..(availability|Availability)')
@@ -110,7 +110,7 @@ def get_itemprop_availability(html_content) -> Restock:
# Second, go dig OpenGraph which is something that jsonpath_ng cant do because of the tuples and double-dots (:)
if not value.get('price') or value.get('availability'):
logger.debug("Alternatively digging through OpenGraph properties for restock/price info..")
logger.debug(f"Alternatively digging through OpenGraph properties for restock/price info..")
jsonpath_expr = parse('$..properties')
for match in jsonpath_expr.find(data):
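For reference, these parse('$..(price|Price)') calls use jsonpath-ng's extended grammar: $.. searches recursively at any depth, and (a|b) unions the upper- and lower-case spellings that appear in real-world LD+JSON. A reduced sketch, assuming jsonpath-ng is installed:

    from jsonpath_ng.ext import parse

    data = {"offers": {"price": "19.99", "priceCurrency": "USD", "availability": "InStock"}}
    matches = parse('$..price').find(data)
    if matches:
        print(matches[0].value)  # -> 19.99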

View File

@@ -15,7 +15,7 @@ def _task(watch, update_handler):
except FilterNotFoundInResponse as e:
text_after_filter = f"Filter not found in HTML: {str(e)}"
except ReplyWithContentButNoText as e:
text_after_filter = "Filter found but no text (empty result)"
text_after_filter = f"Filter found but no text (empty result)"
except Exception as e:
text_after_filter = f"Error: {str(e)}"

View File

@@ -38,9 +38,6 @@ pytest tests/test_backend.py
pytest tests/test_rss.py
pytest tests/test_unique_lines.py
# Try high concurrency
FETCH_WORKERS=130 pytest tests/test_history_consistency.py -v -l
# Check file:// will pickup a file when enabled
echo "Hello world" > /tmp/test-file.txt
ALLOW_FILE_URI=yes pytest tests/test_security.py

View File

@@ -100,7 +100,8 @@ docker run --network changedet-network \
test-changedetectionio \
bash -c 'cd changedetectionio && FAST_PUPPETEER_CHROME_FETCHER=1 PLAYWRIGHT_DRIVER_URL=ws://sockpuppetbrowser:3000 pytest tests/proxy_list/test_proxy_noconnect.py'
# Selenium
# Selenium - todo - fix proxies
docker run --network changedet-network \
-e "WEBDRIVER_URL=http://selenium:4444/wd/hub" \
test-changedetectionio \
bash -c 'cd changedetectionio && WEBDRIVER_URL=http://selenium:4444/wd/hub pytest tests/proxy_list/test_proxy_noconnect.py'
bash -c 'cd changedetectionio && pytest tests/proxy_list/test_proxy_noconnect.py'

View File

@@ -7,7 +7,7 @@ from ..util import live_server_setup, wait_for_all_checks
def do_test(client, live_server, make_test_use_extra_browser=False):
# Grep for this string in the logs?
test_url = "https://changedetection.io/ci-test.html?non-custom-default=true"
test_url = f"https://changedetection.io/ci-test.html?non-custom-default=true"
# "non-custom-default" should not appear in the custom browser connection
custom_browser_name = 'custom browser URL'
@@ -51,7 +51,7 @@ def do_test(client, live_server, make_test_use_extra_browser=False):
url_for("ui.ui_edit.edit_page", uuid="first"),
data={
# 'run_customer_browser_url_tests.sh' will search for this string to know if we hit the right browser container or not
"url": "https://changedetection.io/ci-test.html?custom-browser-search-string=1",
"url": f"https://changedetection.io/ci-test.html?custom-browser-search-string=1",
"tags": "",
"headers": "",
'fetch_backend': f"extra_browser_{custom_browser_name}",

View File

@@ -7,11 +7,6 @@ from ... import strtobool
# Just to be sure the UI outputs the right error message on proxy connection failed
# docker run -p 4444:4444 --rm --shm-size="2g" selenium/standalone-chrome:4
# PLAYWRIGHT_DRIVER_URL=ws://127.0.0.1:3000 pytest tests/proxy_list/test_proxy_noconnect.py
# FAST_PUPPETEER_CHROME_FETCHER=True PLAYWRIGHT_DRIVER_URL=ws://127.0.0.1:3000 pytest tests/proxy_list/test_proxy_noconnect.py
# WEBDRIVER_URL=http://127.0.0.1:4444/wd/hub pytest tests/proxy_list/test_proxy_noconnect.py
def test_proxy_noconnect_custom(client, live_server, measure_memory_usage):
live_server_setup(live_server)
@@ -21,48 +16,38 @@ def test_proxy_noconnect_custom(client, live_server, measure_memory_usage):
data={
"requests-time_between_check-minutes": 180,
"application-ignore_whitespace": "y",
"application-fetch_backend": 'html_webdriver' if os.getenv('PLAYWRIGHT_DRIVER_URL') or os.getenv("WEBDRIVER_URL") else 'html_requests',
"application-fetch_backend": 'html_webdriver' if os.getenv('PLAYWRIGHT_DRIVER_URL') else 'html_requests',
"requests-extra_proxies-0-proxy_name": "custom-test-proxy",
# test:awesome is set in tests/proxy_list/squid-passwords.txt
"requests-extra_proxies-0-proxy_url": "http://127.0.0.1:3128",
"requests-extra_proxies-0-proxy_url": "http://THISPROXYDOESNTEXIST:3128",
},
follow_redirects=True
)
assert b"Settings updated." in res.data
test_url = "https://changedetection.io"
res = client.post(
url_for("ui.ui_views.form_quick_watch_add"),
data={"url": test_url, "tags": '', 'edit_and_watch_submit_button': 'Edit > Watch'},
url_for("imports.import_page"),
# Because a URL wont show in squid/proxy logs due it being SSLed
# Use plain HTTP or a specific domain-name here
data={"urls": "https://changedetection.io/CHANGELOG.txt"},
follow_redirects=True
)
assert b"Watch added in Paused state, saving will unpause" in res.data
options = {
"url": test_url,
"fetch_backend": "html_webdriver" if os.getenv('PLAYWRIGHT_DRIVER_URL') or os.getenv("WEBDRIVER_URL") else "html_requests",
"proxy": "ui-0custom-test-proxy",
}
res = client.post(
url_for("ui.ui_edit.edit_page", uuid="first", unpause_on_save=1),
data=options,
follow_redirects=True
)
assert b"unpaused" in res.data
import time
assert b"1 Imported" in res.data
wait_for_all_checks(client)
# Requests default
check_string = b'Cannot connect to proxy'
res = client.get(url_for("watchlist.index"))
assert b'Page.goto: net::ERR_PROXY_CONNECTION_FAILED' in res.data
if os.getenv('PLAYWRIGHT_DRIVER_URL') or strtobool(os.getenv('FAST_PUPPETEER_CHROME_FETCHER', 'False')) or os.getenv("WEBDRIVER_URL"):
# Requests
check_string = b'Proxy connection failed?'
if os.getenv('PLAYWRIGHT_DRIVER_URL') or strtobool(os.getenv('FAST_PUPPETEER_CHROME_FETCHER', 'False')):
check_string = b'ERR_PROXY_CONNECTION_FAILED'
if os.getenv("WEBDRIVER_URL"):
check_string = b'ERR_PROXY_CONNECTION_FAILED'
res = client.get(url_for("watchlist.index"))
#with open("/tmp/debug.html", 'wb') as f:
# f.write(res.data)
assert check_string in res.data
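The closing assertion is driven by whichever fetcher backend the CI run has configured, since each one surfaces a proxy failure with different wording. Schematically (a simplified sketch that skips the strtobool parsing used above):

    import os

    def expected_proxy_error() -> bytes:
        # Plain requests backend wording
        check_string = b'Cannot connect to proxy'
        if os.getenv('PLAYWRIGHT_DRIVER_URL') or os.getenv('FAST_PUPPETEER_CHROME_FETCHER') or os.getenv('WEBDRIVER_URL'):
            # The Chromium-based backends all surface Chrome's own error code
            check_string = b'ERR_PROXY_CONNECTION_FAILED'
        return check_string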

View File

@@ -7,7 +7,7 @@ from changedetectionio.tests.util import live_server_setup, wait_for_all_checks,
def set_response():
import time
data = """<html>
data = f"""<html>
<body>
<h1>Awesome, you made it</h1>
yeah the socks request worked

View File

@@ -6,7 +6,7 @@ from changedetectionio.tests.util import live_server_setup, wait_for_all_checks
def set_response():
import time
data = """<html>
data = f"""<html>
<body>
<h1>Awesome, you made it</h1>
yeah the socks request worked

View File

@@ -10,8 +10,8 @@ from urllib.parse import urlparse, parse_qs
def test_consistent_history(client, live_server, measure_memory_usage):
live_server_setup(live_server)
workers = int(os.getenv("FETCH_WORKERS", 10))
r = range(1, 10+workers)
r = range(1, 30)
for one in r:
test_url = url_for('test_endpoint', content_type="text/html", content=str(one), _external=True)
@@ -46,10 +46,9 @@ def test_consistent_history(client, live_server, measure_memory_usage):
# assert the right amount of watches was found in the JSON
assert len(json_obj['watching']) == len(r), "Correct number of watches was found in the JSON"
i=0
# each one should have a history.txt containing just one line
for w in json_obj['watching'].keys():
i+=1
history_txt_index_file = os.path.join(live_server.app.config['DATASTORE'].datastore_path, w, 'history.txt')
assert os.path.isfile(history_txt_index_file), f"History.txt should exist where I expect it at {history_txt_index_file}"
@@ -59,8 +58,8 @@ def test_consistent_history(client, live_server, measure_memory_usage):
assert len(tmp_history) == 1, "History.txt should contain 1 line"
# Should be two files,. the history.txt , and the snapshot.txt
files_in_watch_dir = os.listdir(os.path.join(live_server.app.config['DATASTORE'].datastore_path, w))
files_in_watch_dir = os.listdir(os.path.join(live_server.app.config['DATASTORE'].datastore_path,
w))
# Find the snapshot one
for fname in files_in_watch_dir:
if fname != 'history.txt' and 'html' not in fname:
@@ -76,6 +75,7 @@ def test_consistent_history(client, live_server, measure_memory_usage):
assert len(files_in_watch_dir) == 3, "Should be just three files in the dir, html.br snapshot, history.txt and the extracted text snapshot"
json_db_file = os.path.join(live_server.app.config['DATASTORE'].datastore_path, 'url-watches.json')
with open(json_db_file, 'r') as f:
assert '"default"' not in f.read(), "'default' probably shouldnt be here, it came from when the 'default' Watch vars were accidently being saved"

View File

@@ -6,7 +6,7 @@ from changedetectionio.tests.util import live_server_setup, wait_for_all_checks,
def set_response():
data = """<html>
data = f"""<html>
<body>Awesome, you made it<br>
yeah the socks request worked<br>
something to ignore<br>

View File

@@ -424,27 +424,3 @@ def test_headers_textfile_in_request(client, live_server, measure_memory_usage):
# unlink headers.txt on start/stop
res = client.get(url_for("ui.form_delete", uuid="all"), follow_redirects=True)
assert b'Deleted' in res.data
def test_headers_validation(client, live_server):
#live_server_setup(live_server)
test_url = url_for('test_headers', _external=True)
res = client.post(
url_for("imports.import_page"),
data={"urls": test_url},
follow_redirects=True
)
assert b"1 Imported" in res.data
res = client.post(
url_for("ui.ui_edit.edit_page", uuid="first"),
data={
"url": test_url,
"fetch_backend": 'html_requests',
"headers": "User-AGent agent-from-watch\r\nsadfsadfsadfsdaf\r\n:foobar"},
follow_redirects=True
)
assert b"Line 1 is missing a &#39;:&#39; separator." in res.data
assert b"Line 3 has an empty key." in res.data

View File

@@ -126,51 +126,18 @@ def extract_UUID_from_client(client):
uuid = m.group(1)
return uuid.strip()
def wait_for_all_checks(client=None):
"""
Waits until the queue is empty and remains empty for at least `required_empty_duration` seconds,
and also ensures no running threads have `current_uuid` set.
Retries for up to `max_attempts` times, sleeping `wait_between_attempts` seconds between checks.
"""
from changedetectionio.flask_app import update_q as global_update_q, running_update_threads
# Configuration
attempt = 0
i=0
max_attempts = 60
wait_between_attempts = 2
required_empty_duration = 2
logger = logging.getLogger()
time.sleep(1.2)
empty_since = None
while attempt < max_attempts:
q_length = global_update_q.qsize()
# Check if any threads are still processing
time.sleep(1.2)
any_threads_busy = any(t.current_uuid for t in running_update_threads)
if q_length == 0 and not any_threads_busy:
if empty_since is None:
empty_since = time.time()
logger.info(f"Queue empty and no active threads at attempt {attempt}, starting empty timer...")
elif time.time() - empty_since >= required_empty_duration:
logger.info(f"Queue has been empty and threads idle for {required_empty_duration} seconds. Done waiting.")
break
else:
logger.info(f"Still waiting: queue empty and no active threads, but not yet {required_empty_duration} seconds...")
else:
if q_length != 0:
logger.info(f"Queue not empty (size={q_length}), resetting timer.")
if any_threads_busy:
busy_threads = [t.name for t in running_update_threads if t.current_uuid]
logger.info(f"Threads still busy: {busy_threads}, resetting timer.")
empty_since = None
def wait_for_all_checks(client):
# actually this is not entirely true, it can still be 'processing' but not in the queue
# Loop waiting until done..
attempt=0
# because sub-second rechecks are problematic in testing, use lots of delays
time.sleep(1)
while attempt < 60:
res = client.get(url_for("watchlist.index"))
if not b'Checking now' in res.data:
break
logging.getLogger().info("Waiting for watch-list to not say 'Checking now'.. {}".format(attempt))
time.sleep(1)
attempt += 1
time.sleep(1)
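One variant above polls the rendered watch list for a 'Checking now' badge; the other inspects the update queue and worker threads directly and adds a debounce, returning only after everything has stayed idle for a continuous quiet window. The core pattern, reduced to a sketch:

    import time

    def wait_until_quiet(is_busy, required_quiet=2, max_attempts=60, poll=1.2):
        quiet_since = None
        for _ in range(max_attempts):
            time.sleep(poll)
            if is_busy():
                quiet_since = None                  # any activity resets the timer
            elif quiet_since is None:
                quiet_since = time.time()           # start the quiet window
            elif time.time() - quiet_since >= required_quiet:
                return True                         # idle long enough, done
        return False

where is_busy would be something like: lambda: global_update_q.qsize() > 0 or any(t.current_uuid for t in running_update_threads).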

View File

@@ -32,7 +32,7 @@ dnspython==2.6.1 # related to eventlet fixes
# jq not available on Windows so must be installed manually
# Notification library
apprise==1.9.3
apprise==1.9.2
# apprise mqtt https://github.com/dgtlmoon/changedetection.io/issues/315
# use any version other than 2.0.x due to https://github.com/eclipse/paho.mqtt.python/issues/814
@@ -42,7 +42,7 @@ paho-mqtt!=2.0.*
cryptography~=42.0.8
# Used for CSS filtering
beautifulsoup4>=4.0.0
beautifulsoup4
# XPath filtering, lxml is required by bs4 anyway, but put it here to be safe.
# #2328 - 5.2.0 and 5.2.1 had extra CPU flag CFLAGS set which was not compatible on older hardware
@@ -53,7 +53,8 @@ lxml >=4.8.0,<6,!=5.2.0,!=5.2.1
# XPath 2.0-3.1 support - 4.2.0 broke something?
elementpath==4.1.5
selenium~=4.31.0
selenium==4.31.0
# https://github.com/pallets/werkzeug/issues/2985
# Maybe related to pytest?
@@ -70,7 +71,7 @@ jq~=1.3; python_version >= "3.8" and sys_platform == "linux"
# playwright is installed at Dockerfile build time because it's not available on all platforms
pyppeteer-ng==2.0.0rc10
pyppeteer-ng==2.0.0rc9
pyppeteerstealth>=0.0.4
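On the pinning styles mixed in this file: == freezes one exact version, ~=4.31.0 ('compatible release') permits patch updates but not 4.32, != carves out known-bad releases, and a bare name accepts anything. The packaging library (what pip itself uses) can confirm the semantics:

    from packaging.specifiers import SpecifierSet

    assert "4.31.5" in SpecifierSet("~=4.31.0")       # patch upgrades allowed
    assert "4.32.0" not in SpecifierSet("~=4.31.0")   # minor bumps excluded
    assert "5.2.1" not in SpecifierSet(">=4.8.0,<6,!=5.2.0,!=5.2.1")  # the lxml carve-outs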