mirror of
https://github.com/dgtlmoon/changedetection.io.git
synced 2026-01-02 13:20:21 +00:00
Compare commits
3 Commits
0.49.16
...
selenium-p
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
78f3f2b26a | ||
|
|
535ee97ef7 | ||
|
|
b2923b8c3a |
@@ -2,7 +2,7 @@
|
||||
|
||||
# Read more https://github.com/dgtlmoon/changedetection.io/wiki
|
||||
|
||||
__version__ = '0.49.16'
|
||||
__version__ = '0.49.15'
|
||||
|
||||
from changedetectionio.strtobool import strtobool
|
||||
from json.decoder import JSONDecodeError
|
||||
|
||||
@@ -10,13 +10,16 @@ class fetcher(Fetcher):
|
||||
else:
|
||||
fetcher_description = "WebDriver Chrome/Javascript"
|
||||
|
||||
# Configs for Proxy setup
|
||||
# In the ENV vars, is prefixed with "webdriver_", so it is for example "webdriver_sslProxy"
|
||||
selenium_proxy_settings_mappings = ['proxyType', 'ftpProxy', 'httpProxy', 'noProxy',
|
||||
'proxyAutoconfigUrl', 'sslProxy', 'autodetect',
|
||||
'socksProxy', 'socksVersion', 'socksUsername', 'socksPassword']
|
||||
proxy = None
|
||||
proxy_url = None
|
||||
|
||||
def __init__(self, proxy_override=None, custom_browser_connection_url=None):
|
||||
super().__init__()
|
||||
from urllib.parse import urlparse
|
||||
from selenium.webdriver.common.proxy import Proxy
|
||||
from selenium.webdriver.common.proxy import Proxy as SeleniumProxy
|
||||
|
||||
# .strip('"') is going to save someone a lot of time when they accidently wrap the env value
|
||||
if not custom_browser_connection_url:
|
||||
@@ -25,27 +28,25 @@ class fetcher(Fetcher):
|
||||
self.browser_connection_is_custom = True
|
||||
self.browser_connection_url = custom_browser_connection_url
|
||||
|
||||
# If any proxy settings are enabled, then we should setup the proxy object
|
||||
proxy_args = {}
|
||||
for k in self.selenium_proxy_settings_mappings:
|
||||
v = os.getenv('webdriver_' + k, False)
|
||||
if v:
|
||||
proxy_args[k] = v.strip('"')
|
||||
|
||||
##### PROXY SETUP #####
|
||||
# Map back standard HTTP_ and HTTPS_PROXY to webDriver httpProxy/sslProxy
|
||||
if not proxy_args.get('webdriver_httpProxy') and self.system_http_proxy:
|
||||
proxy_args['httpProxy'] = self.system_http_proxy
|
||||
if not proxy_args.get('webdriver_sslProxy') and self.system_https_proxy:
|
||||
proxy_args['httpsProxy'] = self.system_https_proxy
|
||||
|
||||
proxy_sources = [
|
||||
self.system_http_proxy,
|
||||
self.system_https_proxy,
|
||||
os.getenv('webdriver_proxySocks'),
|
||||
os.getenv('webdriver_socksProxy'),
|
||||
os.getenv('webdriver_proxyHttp'),
|
||||
os.getenv('webdriver_httpProxy'),
|
||||
os.getenv('webdriver_proxyHttps'),
|
||||
os.getenv('webdriver_httpsProxy'),
|
||||
os.getenv('webdriver_sslProxy'),
|
||||
proxy_override, # last one should override
|
||||
]
|
||||
# The built in selenium proxy handling is super unreliable!!! so we just grab which ever proxy setting we can find and throw it in --proxy-server=
|
||||
for k in filter(None, proxy_sources):
|
||||
if not k:
|
||||
continue
|
||||
self.proxy_url = k.strip()
|
||||
# Allows override the proxy on a per-request basis
|
||||
if proxy_override is not None:
|
||||
proxy_args['httpProxy'] = proxy_override
|
||||
|
||||
if proxy_args:
|
||||
self.proxy = SeleniumProxy(raw=proxy_args)
|
||||
|
||||
def run(self,
|
||||
url,
|
||||
@@ -58,7 +59,9 @@ class fetcher(Fetcher):
|
||||
is_binary=False,
|
||||
empty_pages_are_a_change=False):
|
||||
|
||||
from selenium import webdriver
|
||||
from selenium.webdriver.chrome.options import Options as ChromeOptions
|
||||
from selenium.common.exceptions import WebDriverException
|
||||
# request_body, request_method unused for now, until some magic in the future happens.
|
||||
|
||||
options = ChromeOptions()
|
||||
@@ -73,62 +76,58 @@ class fetcher(Fetcher):
|
||||
for opt in CHROME_OPTIONS:
|
||||
options.add_argument(opt)
|
||||
|
||||
# 1. proxy_config /Proxy(proxy_config) selenium object is REALLY unreliable
|
||||
# 2. selenium-wire cant be used because the websocket version conflicts with pypeteer-ng
|
||||
# 3. selenium only allows ONE runner at a time by default!
|
||||
# 4. driver must use quit() or it will continue to block/hold the selenium process!!
|
||||
options.add_argument(f"--proxy-server={self.proxy}")
|
||||
|
||||
if self.proxy_url:
|
||||
options.add_argument(f'--proxy-server={self.proxy_url}')
|
||||
|
||||
from selenium.webdriver.remote.remote_connection import RemoteConnection
|
||||
from selenium.webdriver.remote.webdriver import WebDriver as RemoteWebDriver
|
||||
driver = None
|
||||
try:
|
||||
# Create the RemoteConnection and set timeout (e.g., 30 seconds)
|
||||
remote_connection = RemoteConnection(
|
||||
self.browser_connection_url,
|
||||
)
|
||||
remote_connection.set_timeout(30) # seconds
|
||||
|
||||
# Now create the driver with the RemoteConnection
|
||||
driver = RemoteWebDriver(
|
||||
command_executor=remote_connection,
|
||||
options=options
|
||||
)
|
||||
|
||||
driver.set_page_load_timeout(int(os.getenv("WEBDRIVER_PAGELOAD_TIMEOUT", 45)))
|
||||
except Exception as e:
|
||||
if driver:
|
||||
driver.quit()
|
||||
raise e
|
||||
self.driver = webdriver.Remote(
|
||||
command_executor=self.browser_connection_url,
|
||||
options=options)
|
||||
|
||||
try:
|
||||
driver.get(url)
|
||||
self.driver.get(url)
|
||||
except WebDriverException as e:
|
||||
# Be sure we close the session window
|
||||
self.quit()
|
||||
raise
|
||||
|
||||
if not "--window-size" in os.getenv("CHROME_OPTIONS", ""):
|
||||
driver.set_window_size(1280, 1024)
|
||||
if not "--window-size" in os.getenv("CHROME_OPTIONS", ""):
|
||||
self.driver.set_window_size(1280, 1024)
|
||||
|
||||
driver.implicitly_wait(int(os.getenv("WEBDRIVER_DELAY_BEFORE_CONTENT_READY", 5)))
|
||||
self.driver.implicitly_wait(int(os.getenv("WEBDRIVER_DELAY_BEFORE_CONTENT_READY", 5)))
|
||||
|
||||
if self.webdriver_js_execute_code is not None:
|
||||
driver.execute_script(self.webdriver_js_execute_code)
|
||||
# Selenium doesn't automatically wait for actions as good as Playwright, so wait again
|
||||
driver.implicitly_wait(int(os.getenv("WEBDRIVER_DELAY_BEFORE_CONTENT_READY", 5)))
|
||||
if self.webdriver_js_execute_code is not None:
|
||||
self.driver.execute_script(self.webdriver_js_execute_code)
|
||||
# Selenium doesn't automatically wait for actions as good as Playwright, so wait again
|
||||
self.driver.implicitly_wait(int(os.getenv("WEBDRIVER_DELAY_BEFORE_CONTENT_READY", 5)))
|
||||
|
||||
# @todo - how to check this? is it possible?
|
||||
self.status_code = 200
|
||||
# @todo somehow we should try to get this working for WebDriver
|
||||
# raise EmptyReply(url=url, status_code=r.status_code)
|
||||
|
||||
# @todo - dom wait loaded?
|
||||
time.sleep(int(os.getenv("WEBDRIVER_DELAY_BEFORE_CONTENT_READY", 5)) + self.render_extract_delay)
|
||||
self.content = driver.page_source
|
||||
self.headers = {}
|
||||
self.screenshot = driver.get_screenshot_as_png()
|
||||
except Exception as e:
|
||||
driver.quit()
|
||||
raise e
|
||||
# @todo - how to check this? is it possible?
|
||||
self.status_code = 200
|
||||
# @todo somehow we should try to get this working for WebDriver
|
||||
# raise EmptyReply(url=url, status_code=r.status_code)
|
||||
|
||||
driver.quit()
|
||||
# @todo - dom wait loaded?
|
||||
time.sleep(int(os.getenv("WEBDRIVER_DELAY_BEFORE_CONTENT_READY", 5)) + self.render_extract_delay)
|
||||
self.content = self.driver.page_source
|
||||
self.headers = {}
|
||||
|
||||
self.screenshot = self.driver.get_screenshot_as_png()
|
||||
|
||||
# Does the connection to the webdriver work? run a test connection.
|
||||
def is_ready(self):
|
||||
from selenium import webdriver
|
||||
from selenium.webdriver.chrome.options import Options as ChromeOptions
|
||||
|
||||
self.driver = webdriver.Remote(
|
||||
command_executor=self.command_executor,
|
||||
options=ChromeOptions())
|
||||
|
||||
# driver.quit() seems to cause better exceptions
|
||||
self.quit()
|
||||
return True
|
||||
|
||||
def quit(self, watch=None):
|
||||
if self.driver:
|
||||
try:
|
||||
self.driver.quit()
|
||||
except Exception as e:
|
||||
logger.debug(f"Content Fetcher > Exception in chrome shutdown/quit {str(e)}")
|
||||
|
||||
@@ -224,37 +224,27 @@ class StringDictKeyValue(StringField):
|
||||
|
||||
def _value(self):
|
||||
if self.data:
|
||||
output = ''
|
||||
for k, v in self.data.items():
|
||||
output += f"{k}: {v}\r\n"
|
||||
output = u''
|
||||
for k in self.data.keys():
|
||||
output += "{}: {}\r\n".format(k, self.data[k])
|
||||
|
||||
return output
|
||||
else:
|
||||
return ''
|
||||
return u''
|
||||
|
||||
# incoming data processing + validation
|
||||
# incoming
|
||||
def process_formdata(self, valuelist):
|
||||
self.data = {}
|
||||
errors = []
|
||||
if valuelist:
|
||||
# Remove empty strings (blank lines)
|
||||
cleaned = [line.strip() for line in valuelist[0].split("\n") if line.strip()]
|
||||
for idx, s in enumerate(cleaned, start=1):
|
||||
if ':' not in s:
|
||||
errors.append(f"Line {idx} is missing a ':' separator.")
|
||||
continue
|
||||
parts = s.split(':', 1)
|
||||
key = parts[0].strip()
|
||||
value = parts[1].strip()
|
||||
self.data = {}
|
||||
# Remove empty strings
|
||||
cleaned = list(filter(None, valuelist[0].split("\n")))
|
||||
for s in cleaned:
|
||||
parts = s.strip().split(':', 1)
|
||||
if len(parts) == 2:
|
||||
self.data.update({parts[0].strip(): parts[1].strip()})
|
||||
|
||||
if not key:
|
||||
errors.append(f"Line {idx} has an empty key.")
|
||||
if not value:
|
||||
errors.append(f"Line {idx} has an empty value.")
|
||||
|
||||
self.data[key] = value
|
||||
|
||||
if errors:
|
||||
raise ValidationError("Invalid input:\n" + "\n".join(errors))
|
||||
else:
|
||||
self.data = {}
|
||||
|
||||
class ValidateContentFetcherIsReady(object):
|
||||
"""
|
||||
|
||||
@@ -100,7 +100,8 @@ docker run --network changedet-network \
|
||||
test-changedetectionio \
|
||||
bash -c 'cd changedetectionio && FAST_PUPPETEER_CHROME_FETCHER=1 PLAYWRIGHT_DRIVER_URL=ws://sockpuppetbrowser:3000 pytest tests/proxy_list/test_proxy_noconnect.py'
|
||||
|
||||
# Selenium
|
||||
# Selenium - todo - fix proxies
|
||||
docker run --network changedet-network \
|
||||
-e "WEBDRIVER_URL=http://selenium:4444/wd/hub" \
|
||||
test-changedetectionio \
|
||||
bash -c 'cd changedetectionio && WEBDRIVER_URL=http://selenium:4444/wd/hub pytest tests/proxy_list/test_proxy_noconnect.py'
|
||||
bash -c 'cd changedetectionio && pytest tests/proxy_list/test_proxy_noconnect.py'
|
||||
|
||||
@@ -7,11 +7,6 @@ from ... import strtobool
|
||||
|
||||
|
||||
# Just to be sure the UI outputs the right error message on proxy connection failed
|
||||
# docker run -p 4444:4444 --rm --shm-size="2g" selenium/standalone-chrome:4
|
||||
# PLAYWRIGHT_DRIVER_URL=ws://127.0.0.1:3000 pytest tests/proxy_list/test_proxy_noconnect.py
|
||||
# FAST_PUPPETEER_CHROME_FETCHER=True PLAYWRIGHT_DRIVER_URL=ws://127.0.0.1:3000 pytest tests/proxy_list/test_proxy_noconnect.py
|
||||
# WEBDRIVER_URL=http://127.0.0.1:4444/wd/hub pytest tests/proxy_list/test_proxy_noconnect.py
|
||||
|
||||
def test_proxy_noconnect_custom(client, live_server, measure_memory_usage):
|
||||
live_server_setup(live_server)
|
||||
|
||||
@@ -21,48 +16,38 @@ def test_proxy_noconnect_custom(client, live_server, measure_memory_usage):
|
||||
data={
|
||||
"requests-time_between_check-minutes": 180,
|
||||
"application-ignore_whitespace": "y",
|
||||
"application-fetch_backend": 'html_webdriver' if os.getenv('PLAYWRIGHT_DRIVER_URL') or os.getenv("WEBDRIVER_URL") else 'html_requests',
|
||||
"application-fetch_backend": 'html_webdriver' if os.getenv('PLAYWRIGHT_DRIVER_URL') else 'html_requests',
|
||||
"requests-extra_proxies-0-proxy_name": "custom-test-proxy",
|
||||
# test:awesome is set in tests/proxy_list/squid-passwords.txt
|
||||
"requests-extra_proxies-0-proxy_url": "http://127.0.0.1:3128",
|
||||
"requests-extra_proxies-0-proxy_url": "http://THISPROXYDOESNTEXIST:3128",
|
||||
},
|
||||
follow_redirects=True
|
||||
)
|
||||
|
||||
assert b"Settings updated." in res.data
|
||||
|
||||
test_url = "https://changedetection.io"
|
||||
res = client.post(
|
||||
url_for("ui.ui_views.form_quick_watch_add"),
|
||||
data={"url": test_url, "tags": '', 'edit_and_watch_submit_button': 'Edit > Watch'},
|
||||
url_for("imports.import_page"),
|
||||
# Because a URL wont show in squid/proxy logs due it being SSLed
|
||||
# Use plain HTTP or a specific domain-name here
|
||||
data={"urls": "https://changedetection.io/CHANGELOG.txt"},
|
||||
follow_redirects=True
|
||||
)
|
||||
|
||||
assert b"Watch added in Paused state, saving will unpause" in res.data
|
||||
|
||||
options = {
|
||||
"url": test_url,
|
||||
"fetch_backend": "html_webdriver" if os.getenv('PLAYWRIGHT_DRIVER_URL') or os.getenv("WEBDRIVER_URL") else "html_requests",
|
||||
"proxy": "ui-0custom-test-proxy",
|
||||
}
|
||||
|
||||
res = client.post(
|
||||
url_for("ui.ui_edit.edit_page", uuid="first", unpause_on_save=1),
|
||||
data=options,
|
||||
follow_redirects=True
|
||||
)
|
||||
assert b"unpaused" in res.data
|
||||
import time
|
||||
assert b"1 Imported" in res.data
|
||||
wait_for_all_checks(client)
|
||||
|
||||
# Requests default
|
||||
check_string = b'Cannot connect to proxy'
|
||||
res = client.get(url_for("watchlist.index"))
|
||||
assert b'Page.goto: net::ERR_PROXY_CONNECTION_FAILED' in res.data
|
||||
|
||||
if os.getenv('PLAYWRIGHT_DRIVER_URL') or strtobool(os.getenv('FAST_PUPPETEER_CHROME_FETCHER', 'False')) or os.getenv("WEBDRIVER_URL"):
|
||||
# Requests
|
||||
check_string = b'Proxy connection failed?'
|
||||
|
||||
if os.getenv('PLAYWRIGHT_DRIVER_URL') or strtobool(os.getenv('FAST_PUPPETEER_CHROME_FETCHER', 'False')):
|
||||
check_string = b'ERR_PROXY_CONNECTION_FAILED'
|
||||
|
||||
if os.getenv("WEBDRIVER_URL"):
|
||||
check_string = b'ERR_PROXY_CONNECTION_FAILED'
|
||||
|
||||
res = client.get(url_for("watchlist.index"))
|
||||
#with open("/tmp/debug.html", 'wb') as f:
|
||||
# f.write(res.data)
|
||||
assert check_string in res.data
|
||||
|
||||
|
||||
@@ -424,27 +424,3 @@ def test_headers_textfile_in_request(client, live_server, measure_memory_usage):
|
||||
# unlink headers.txt on start/stop
|
||||
res = client.get(url_for("ui.form_delete", uuid="all"), follow_redirects=True)
|
||||
assert b'Deleted' in res.data
|
||||
|
||||
def test_headers_validation(client, live_server):
|
||||
#live_server_setup(live_server)
|
||||
|
||||
test_url = url_for('test_headers', _external=True)
|
||||
res = client.post(
|
||||
url_for("imports.import_page"),
|
||||
data={"urls": test_url},
|
||||
follow_redirects=True
|
||||
)
|
||||
assert b"1 Imported" in res.data
|
||||
|
||||
res = client.post(
|
||||
url_for("ui.ui_edit.edit_page", uuid="first"),
|
||||
data={
|
||||
"url": test_url,
|
||||
"fetch_backend": 'html_requests',
|
||||
"headers": "User-AGent agent-from-watch\r\nsadfsadfsadfsdaf\r\n:foobar"},
|
||||
follow_redirects=True
|
||||
)
|
||||
|
||||
assert b"Line 1 is missing a ':' separator." in res.data
|
||||
assert b"Line 3 has an empty key." in res.data
|
||||
|
||||
|
||||
@@ -136,7 +136,7 @@ def wait_for_all_checks(client):
|
||||
res = client.get(url_for("watchlist.index"))
|
||||
if not b'Checking now' in res.data:
|
||||
break
|
||||
logging.getLogger().info(f"Waiting for watch-list to not say 'Checking now'.. {attempt}")
|
||||
logging.getLogger().info("Waiting for watch-list to not say 'Checking now'.. {}".format(attempt))
|
||||
time.sleep(1)
|
||||
attempt += 1
|
||||
|
||||
|
||||
@@ -53,7 +53,8 @@ lxml >=4.8.0,<6,!=5.2.0,!=5.2.1
|
||||
# XPath 2.0-3.1 support - 4.2.0 broke something?
|
||||
elementpath==4.1.5
|
||||
|
||||
selenium~=4.31.0
|
||||
selenium==4.31.0
|
||||
|
||||
|
||||
# https://github.com/pallets/werkzeug/issues/2985
|
||||
# Maybe related to pytest?
|
||||
@@ -70,7 +71,7 @@ jq~=1.3; python_version >= "3.8" and sys_platform == "linux"
|
||||
|
||||
# playwright is installed at Dockerfile build time because it's not available on all platforms
|
||||
|
||||
pyppeteer-ng==2.0.0rc10
|
||||
pyppeteer-ng==2.0.0rc9
|
||||
|
||||
pyppeteerstealth>=0.0.4
|
||||
|
||||
|
||||
Reference in New Issue
Block a user