Compare commits

..

3 Commits

Author SHA1 Message Date
dgtlmoon
78f3f2b26a Merge branch 'master' into selenium-proxy-fix 2025-05-02 14:05:45 +02:00
dgtlmoon
535ee97ef7 Selenium proxy fixes 2025-05-02 10:54:01 +02:00
dgtlmoon
b2923b8c3a Fixes to ensure proxy errors are handled correctly 2025-05-02 10:21:27 +02:00
8 changed files with 108 additions and 156 deletions

View File

@@ -2,7 +2,7 @@
# Read more https://github.com/dgtlmoon/changedetection.io/wiki
__version__ = '0.49.16'
__version__ = '0.49.15'
from changedetectionio.strtobool import strtobool
from json.decoder import JSONDecodeError

View File

@@ -10,13 +10,16 @@ class fetcher(Fetcher):
else:
fetcher_description = "WebDriver Chrome/Javascript"
# Configs for Proxy setup
# In the ENV vars, is prefixed with "webdriver_", so it is for example "webdriver_sslProxy"
selenium_proxy_settings_mappings = ['proxyType', 'ftpProxy', 'httpProxy', 'noProxy',
'proxyAutoconfigUrl', 'sslProxy', 'autodetect',
'socksProxy', 'socksVersion', 'socksUsername', 'socksPassword']
proxy = None
proxy_url = None
def __init__(self, proxy_override=None, custom_browser_connection_url=None):
super().__init__()
from urllib.parse import urlparse
from selenium.webdriver.common.proxy import Proxy
from selenium.webdriver.common.proxy import Proxy as SeleniumProxy
# .strip('"') is going to save someone a lot of time when they accidently wrap the env value
if not custom_browser_connection_url:
@@ -25,27 +28,25 @@ class fetcher(Fetcher):
self.browser_connection_is_custom = True
self.browser_connection_url = custom_browser_connection_url
# If any proxy settings are enabled, then we should setup the proxy object
proxy_args = {}
for k in self.selenium_proxy_settings_mappings:
v = os.getenv('webdriver_' + k, False)
if v:
proxy_args[k] = v.strip('"')
##### PROXY SETUP #####
# Map back standard HTTP_ and HTTPS_PROXY to webDriver httpProxy/sslProxy
if not proxy_args.get('webdriver_httpProxy') and self.system_http_proxy:
proxy_args['httpProxy'] = self.system_http_proxy
if not proxy_args.get('webdriver_sslProxy') and self.system_https_proxy:
proxy_args['httpsProxy'] = self.system_https_proxy
proxy_sources = [
self.system_http_proxy,
self.system_https_proxy,
os.getenv('webdriver_proxySocks'),
os.getenv('webdriver_socksProxy'),
os.getenv('webdriver_proxyHttp'),
os.getenv('webdriver_httpProxy'),
os.getenv('webdriver_proxyHttps'),
os.getenv('webdriver_httpsProxy'),
os.getenv('webdriver_sslProxy'),
proxy_override, # last one should override
]
# The built in selenium proxy handling is super unreliable!!! so we just grab which ever proxy setting we can find and throw it in --proxy-server=
for k in filter(None, proxy_sources):
if not k:
continue
self.proxy_url = k.strip()
# Allows override the proxy on a per-request basis
if proxy_override is not None:
proxy_args['httpProxy'] = proxy_override
if proxy_args:
self.proxy = SeleniumProxy(raw=proxy_args)
def run(self,
url,
@@ -58,7 +59,9 @@ class fetcher(Fetcher):
is_binary=False,
empty_pages_are_a_change=False):
from selenium import webdriver
from selenium.webdriver.chrome.options import Options as ChromeOptions
from selenium.common.exceptions import WebDriverException
# request_body, request_method unused for now, until some magic in the future happens.
options = ChromeOptions()
@@ -73,62 +76,58 @@ class fetcher(Fetcher):
for opt in CHROME_OPTIONS:
options.add_argument(opt)
# 1. proxy_config /Proxy(proxy_config) selenium object is REALLY unreliable
# 2. selenium-wire cant be used because the websocket version conflicts with pypeteer-ng
# 3. selenium only allows ONE runner at a time by default!
# 4. driver must use quit() or it will continue to block/hold the selenium process!!
options.add_argument(f"--proxy-server={self.proxy}")
if self.proxy_url:
options.add_argument(f'--proxy-server={self.proxy_url}')
from selenium.webdriver.remote.remote_connection import RemoteConnection
from selenium.webdriver.remote.webdriver import WebDriver as RemoteWebDriver
driver = None
try:
# Create the RemoteConnection and set timeout (e.g., 30 seconds)
remote_connection = RemoteConnection(
self.browser_connection_url,
)
remote_connection.set_timeout(30) # seconds
# Now create the driver with the RemoteConnection
driver = RemoteWebDriver(
command_executor=remote_connection,
options=options
)
driver.set_page_load_timeout(int(os.getenv("WEBDRIVER_PAGELOAD_TIMEOUT", 45)))
except Exception as e:
if driver:
driver.quit()
raise e
self.driver = webdriver.Remote(
command_executor=self.browser_connection_url,
options=options)
try:
driver.get(url)
self.driver.get(url)
except WebDriverException as e:
# Be sure we close the session window
self.quit()
raise
if not "--window-size" in os.getenv("CHROME_OPTIONS", ""):
driver.set_window_size(1280, 1024)
if not "--window-size" in os.getenv("CHROME_OPTIONS", ""):
self.driver.set_window_size(1280, 1024)
driver.implicitly_wait(int(os.getenv("WEBDRIVER_DELAY_BEFORE_CONTENT_READY", 5)))
self.driver.implicitly_wait(int(os.getenv("WEBDRIVER_DELAY_BEFORE_CONTENT_READY", 5)))
if self.webdriver_js_execute_code is not None:
driver.execute_script(self.webdriver_js_execute_code)
# Selenium doesn't automatically wait for actions as good as Playwright, so wait again
driver.implicitly_wait(int(os.getenv("WEBDRIVER_DELAY_BEFORE_CONTENT_READY", 5)))
if self.webdriver_js_execute_code is not None:
self.driver.execute_script(self.webdriver_js_execute_code)
# Selenium doesn't automatically wait for actions as good as Playwright, so wait again
self.driver.implicitly_wait(int(os.getenv("WEBDRIVER_DELAY_BEFORE_CONTENT_READY", 5)))
# @todo - how to check this? is it possible?
self.status_code = 200
# @todo somehow we should try to get this working for WebDriver
# raise EmptyReply(url=url, status_code=r.status_code)
# @todo - dom wait loaded?
time.sleep(int(os.getenv("WEBDRIVER_DELAY_BEFORE_CONTENT_READY", 5)) + self.render_extract_delay)
self.content = driver.page_source
self.headers = {}
self.screenshot = driver.get_screenshot_as_png()
except Exception as e:
driver.quit()
raise e
# @todo - how to check this? is it possible?
self.status_code = 200
# @todo somehow we should try to get this working for WebDriver
# raise EmptyReply(url=url, status_code=r.status_code)
driver.quit()
# @todo - dom wait loaded?
time.sleep(int(os.getenv("WEBDRIVER_DELAY_BEFORE_CONTENT_READY", 5)) + self.render_extract_delay)
self.content = self.driver.page_source
self.headers = {}
self.screenshot = self.driver.get_screenshot_as_png()
# Does the connection to the webdriver work? run a test connection.
def is_ready(self):
from selenium import webdriver
from selenium.webdriver.chrome.options import Options as ChromeOptions
self.driver = webdriver.Remote(
command_executor=self.command_executor,
options=ChromeOptions())
# driver.quit() seems to cause better exceptions
self.quit()
return True
def quit(self, watch=None):
if self.driver:
try:
self.driver.quit()
except Exception as e:
logger.debug(f"Content Fetcher > Exception in chrome shutdown/quit {str(e)}")

View File

@@ -224,37 +224,27 @@ class StringDictKeyValue(StringField):
def _value(self):
if self.data:
output = ''
for k, v in self.data.items():
output += f"{k}: {v}\r\n"
output = u''
for k in self.data.keys():
output += "{}: {}\r\n".format(k, self.data[k])
return output
else:
return ''
return u''
# incoming data processing + validation
# incoming
def process_formdata(self, valuelist):
self.data = {}
errors = []
if valuelist:
# Remove empty strings (blank lines)
cleaned = [line.strip() for line in valuelist[0].split("\n") if line.strip()]
for idx, s in enumerate(cleaned, start=1):
if ':' not in s:
errors.append(f"Line {idx} is missing a ':' separator.")
continue
parts = s.split(':', 1)
key = parts[0].strip()
value = parts[1].strip()
self.data = {}
# Remove empty strings
cleaned = list(filter(None, valuelist[0].split("\n")))
for s in cleaned:
parts = s.strip().split(':', 1)
if len(parts) == 2:
self.data.update({parts[0].strip(): parts[1].strip()})
if not key:
errors.append(f"Line {idx} has an empty key.")
if not value:
errors.append(f"Line {idx} has an empty value.")
self.data[key] = value
if errors:
raise ValidationError("Invalid input:\n" + "\n".join(errors))
else:
self.data = {}
class ValidateContentFetcherIsReady(object):
"""

View File

@@ -100,7 +100,8 @@ docker run --network changedet-network \
test-changedetectionio \
bash -c 'cd changedetectionio && FAST_PUPPETEER_CHROME_FETCHER=1 PLAYWRIGHT_DRIVER_URL=ws://sockpuppetbrowser:3000 pytest tests/proxy_list/test_proxy_noconnect.py'
# Selenium
# Selenium - todo - fix proxies
docker run --network changedet-network \
-e "WEBDRIVER_URL=http://selenium:4444/wd/hub" \
test-changedetectionio \
bash -c 'cd changedetectionio && WEBDRIVER_URL=http://selenium:4444/wd/hub pytest tests/proxy_list/test_proxy_noconnect.py'
bash -c 'cd changedetectionio && pytest tests/proxy_list/test_proxy_noconnect.py'

View File

@@ -7,11 +7,6 @@ from ... import strtobool
# Just to be sure the UI outputs the right error message on proxy connection failed
# docker run -p 4444:4444 --rm --shm-size="2g" selenium/standalone-chrome:4
# PLAYWRIGHT_DRIVER_URL=ws://127.0.0.1:3000 pytest tests/proxy_list/test_proxy_noconnect.py
# FAST_PUPPETEER_CHROME_FETCHER=True PLAYWRIGHT_DRIVER_URL=ws://127.0.0.1:3000 pytest tests/proxy_list/test_proxy_noconnect.py
# WEBDRIVER_URL=http://127.0.0.1:4444/wd/hub pytest tests/proxy_list/test_proxy_noconnect.py
def test_proxy_noconnect_custom(client, live_server, measure_memory_usage):
live_server_setup(live_server)
@@ -21,48 +16,38 @@ def test_proxy_noconnect_custom(client, live_server, measure_memory_usage):
data={
"requests-time_between_check-minutes": 180,
"application-ignore_whitespace": "y",
"application-fetch_backend": 'html_webdriver' if os.getenv('PLAYWRIGHT_DRIVER_URL') or os.getenv("WEBDRIVER_URL") else 'html_requests',
"application-fetch_backend": 'html_webdriver' if os.getenv('PLAYWRIGHT_DRIVER_URL') else 'html_requests',
"requests-extra_proxies-0-proxy_name": "custom-test-proxy",
# test:awesome is set in tests/proxy_list/squid-passwords.txt
"requests-extra_proxies-0-proxy_url": "http://127.0.0.1:3128",
"requests-extra_proxies-0-proxy_url": "http://THISPROXYDOESNTEXIST:3128",
},
follow_redirects=True
)
assert b"Settings updated." in res.data
test_url = "https://changedetection.io"
res = client.post(
url_for("ui.ui_views.form_quick_watch_add"),
data={"url": test_url, "tags": '', 'edit_and_watch_submit_button': 'Edit > Watch'},
url_for("imports.import_page"),
# Because a URL wont show in squid/proxy logs due it being SSLed
# Use plain HTTP or a specific domain-name here
data={"urls": "https://changedetection.io/CHANGELOG.txt"},
follow_redirects=True
)
assert b"Watch added in Paused state, saving will unpause" in res.data
options = {
"url": test_url,
"fetch_backend": "html_webdriver" if os.getenv('PLAYWRIGHT_DRIVER_URL') or os.getenv("WEBDRIVER_URL") else "html_requests",
"proxy": "ui-0custom-test-proxy",
}
res = client.post(
url_for("ui.ui_edit.edit_page", uuid="first", unpause_on_save=1),
data=options,
follow_redirects=True
)
assert b"unpaused" in res.data
import time
assert b"1 Imported" in res.data
wait_for_all_checks(client)
# Requests default
check_string = b'Cannot connect to proxy'
res = client.get(url_for("watchlist.index"))
assert b'Page.goto: net::ERR_PROXY_CONNECTION_FAILED' in res.data
if os.getenv('PLAYWRIGHT_DRIVER_URL') or strtobool(os.getenv('FAST_PUPPETEER_CHROME_FETCHER', 'False')) or os.getenv("WEBDRIVER_URL"):
# Requests
check_string = b'Proxy connection failed?'
if os.getenv('PLAYWRIGHT_DRIVER_URL') or strtobool(os.getenv('FAST_PUPPETEER_CHROME_FETCHER', 'False')):
check_string = b'ERR_PROXY_CONNECTION_FAILED'
if os.getenv("WEBDRIVER_URL"):
check_string = b'ERR_PROXY_CONNECTION_FAILED'
res = client.get(url_for("watchlist.index"))
#with open("/tmp/debug.html", 'wb') as f:
# f.write(res.data)
assert check_string in res.data

View File

@@ -424,27 +424,3 @@ def test_headers_textfile_in_request(client, live_server, measure_memory_usage):
# unlink headers.txt on start/stop
res = client.get(url_for("ui.form_delete", uuid="all"), follow_redirects=True)
assert b'Deleted' in res.data
def test_headers_validation(client, live_server):
#live_server_setup(live_server)
test_url = url_for('test_headers', _external=True)
res = client.post(
url_for("imports.import_page"),
data={"urls": test_url},
follow_redirects=True
)
assert b"1 Imported" in res.data
res = client.post(
url_for("ui.ui_edit.edit_page", uuid="first"),
data={
"url": test_url,
"fetch_backend": 'html_requests',
"headers": "User-AGent agent-from-watch\r\nsadfsadfsadfsdaf\r\n:foobar"},
follow_redirects=True
)
assert b"Line 1 is missing a ':' separator." in res.data
assert b"Line 3 has an empty key." in res.data

View File

@@ -136,7 +136,7 @@ def wait_for_all_checks(client):
res = client.get(url_for("watchlist.index"))
if not b'Checking now' in res.data:
break
logging.getLogger().info(f"Waiting for watch-list to not say 'Checking now'.. {attempt}")
logging.getLogger().info("Waiting for watch-list to not say 'Checking now'.. {}".format(attempt))
time.sleep(1)
attempt += 1

View File

@@ -53,7 +53,8 @@ lxml >=4.8.0,<6,!=5.2.0,!=5.2.1
# XPath 2.0-3.1 support - 4.2.0 broke something?
elementpath==4.1.5
selenium~=4.31.0
selenium==4.31.0
# https://github.com/pallets/werkzeug/issues/2985
# Maybe related to pytest?
@@ -70,7 +71,7 @@ jq~=1.3; python_version >= "3.8" and sys_platform == "linux"
# playwright is installed at Dockerfile build time because it's not available on all platforms
pyppeteer-ng==2.0.0rc10
pyppeteer-ng==2.0.0rc9
pyppeteerstealth>=0.0.4