diff --git a/.github/workflows/test-only.yml b/.github/workflows/test-only.yml index 08a3f944..3064e97d 100644 --- a/.github/workflows/test-only.yml +++ b/.github/workflows/test-only.yml @@ -30,7 +30,10 @@ jobs: # Selenium+browserless docker run --network changedet-network -d --hostname selenium -p 4444:4444 --rm --shm-size="2g" selenium/standalone-chrome:4 - docker run --network changedet-network -d --hostname browserless -e "FUNCTION_BUILT_INS=[\"fs\",\"crypto\"]" -e "DEFAULT_LAUNCH_ARGS=[\"--window-size=1920,1080\"]" --rm -p 3000:3000 --shm-size="2g" browserless/chrome:1.60-chrome-stable + docker run --network changedet-network -d --name browserless --hostname browserless -e "FUNCTION_BUILT_INS=[\"fs\",\"crypto\"]" -e "DEFAULT_LAUNCH_ARGS=[\"--window-size=1920,1080\"]" --rm -p 3000:3000 --shm-size="2g" browserless/chrome:1.60-chrome-stable + + # For accessing custom browser tests + docker run --network changedet-network -d --name browserless-custom-url --hostname browserless-custom-url -e "FUNCTION_BUILT_INS=[\"fs\",\"crypto\"]" -e "DEFAULT_LAUNCH_ARGS=[\"--window-size=1920,1080\"]" --rm --shm-size="2g" browserless/chrome:1.60-chrome-stable - name: Build changedetection.io container for testing run: | @@ -86,6 +89,12 @@ jobs: # And again with PLAYWRIGHT_DRIVER_URL=.. cd .. + - name: Test custom browser URL + run: | + cd changedetectionio + ./run_custom_browser_url_tests.sh + cd .. 
+ - name: Test changedetection.io container starts+runs basically without error run: | docker run -p 5556:5000 -d test-changedetectionio diff --git a/changedetectionio/__init__.py b/changedetectionio/__init__.py index 1d07d790..9edea3cc 100644 --- a/changedetectionio/__init__.py +++ b/changedetectionio/__init__.py @@ -614,6 +614,8 @@ def changedetection_app(config=None, datastore_o=None): # For the form widget tag uuid lookup form.tags.datastore = datastore # in _value + for p in datastore.extra_browsers: + form.fetch_backend.choices.append(p) form.fetch_backend.choices.append(("system", 'System settings default')) @@ -714,7 +716,7 @@ def changedetection_app(config=None, datastore_o=None): system_uses_webdriver = datastore.data['settings']['application']['fetch_backend'] == 'html_webdriver' is_html_webdriver = False - if (watch.get('fetch_backend') == 'system' and system_uses_webdriver) or watch.get('fetch_backend') == 'html_webdriver': + if (watch.get('fetch_backend') == 'system' and system_uses_webdriver) or watch.get('fetch_backend') == 'html_webdriver' or watch.get('fetch_backend', '').startswith('extra_browser_'): is_html_webdriver = True # Only works reliably with Playwright @@ -987,7 +989,7 @@ def changedetection_app(config=None, datastore_o=None): system_uses_webdriver = datastore.data['settings']['application']['fetch_backend'] == 'html_webdriver' is_html_webdriver = False - if (watch.get('fetch_backend') == 'system' and system_uses_webdriver) or watch.get('fetch_backend') == 'html_webdriver': + if (watch.get('fetch_backend') == 'system' and system_uses_webdriver) or watch.get('fetch_backend') == 'html_webdriver' or watch.get('fetch_backend', '').startswith('extra_browser_'): is_html_webdriver = True password_enabled_and_share_is_off = False @@ -1041,7 +1043,7 @@ def changedetection_app(config=None, datastore_o=None): is_html_webdriver = False - if (watch.get('fetch_backend') == 'system' and system_uses_webdriver) or watch.get('fetch_backend') == 
'html_webdriver': + if (watch.get('fetch_backend') == 'system' and system_uses_webdriver) or watch.get('fetch_backend') == 'html_webdriver' or watch.get('fetch_backend', '').startswith('extra_browser_'): is_html_webdriver = True # Never requested successfully, but we detected a fetch error diff --git a/changedetectionio/content_fetcher.py b/changedetectionio/content_fetcher.py index d9c14590..db5c7b99 100644 --- a/changedetectionio/content_fetcher.py +++ b/changedetectionio/content_fetcher.py @@ -96,6 +96,7 @@ class Fetcher(): content = None error = None fetcher_description = "No description" + browser_connection_url = None headers = {} status_code = None webdriver_js_execute_code = None @@ -251,14 +252,16 @@ class base_html_playwright(Fetcher): proxy = None - def __init__(self, proxy_override=None): + def __init__(self, proxy_override=None, browser_connection_url=None): super().__init__() - # .strip('"') is going to save someone a lot of time when they accidently wrap the env value + self.browser_type = os.getenv("PLAYWRIGHT_BROWSER_TYPE", 'chromium').strip('"') - self.command_executor = os.getenv( - "PLAYWRIGHT_DRIVER_URL", - 'ws://playwright-chrome:3000' - ).strip('"') + + # .strip('"') is going to save someone a lot of time when they accidently wrap the env value + if not browser_connection_url: + self.browser_connection_url = os.getenv("PLAYWRIGHT_DRIVER_URL", 'ws://playwright-chrome:3000').strip('"') + else: + self.browser_connection_url = browser_connection_url # If any proxy settings are enabled, then we should setup the proxy object proxy_args = {} @@ -444,7 +447,7 @@ class base_html_playwright(Fetcher): # Seemed to cause a connection Exception even tho I can see it connect # self.browser = browser_type.connect(self.command_executor, timeout=timeout*1000) # 60,000 connection timeout only - browser = browser_type.connect_over_cdp(self.command_executor, timeout=60000) + browser = browser_type.connect_over_cdp(self.browser_connection_url, timeout=60000) # 
SOCKS5 with authentication is not supported (yet) # https://github.com/microsoft/playwright/issues/10567 @@ -504,7 +507,11 @@ class base_html_playwright(Fetcher): self.status_code = response.status if self.status_code != 200 and not ignore_status_codes: - raise Non200ErrorCodeReceived(url=url, status_code=self.status_code) + + screenshot=self.page.screenshot(type='jpeg', full_page=True, + quality=int(os.getenv("PLAYWRIGHT_SCREENSHOT_QUALITY", 72))) + + raise Non200ErrorCodeReceived(url=url, status_code=self.status_code, screenshot=screenshot) if len(self.page.content().strip()) == 0: context.close() @@ -555,8 +562,6 @@ class base_html_webdriver(Fetcher): else: fetcher_description = "WebDriver Chrome/Javascript" - command_executor = '' - # Configs for Proxy setup # In the ENV vars, is prefixed with "webdriver_", so it is for example "webdriver_sslProxy" selenium_proxy_settings_mappings = ['proxyType', 'ftpProxy', 'httpProxy', 'noProxy', @@ -564,12 +569,15 @@ class base_html_webdriver(Fetcher): 'socksProxy', 'socksVersion', 'socksUsername', 'socksPassword'] proxy = None - def __init__(self, proxy_override=None): + def __init__(self, proxy_override=None, browser_connection_url=None): super().__init__() from selenium.webdriver.common.proxy import Proxy as SeleniumProxy # .strip('"') is going to save someone a lot of time when they accidently wrap the env value - self.command_executor = os.getenv("WEBDRIVER_URL", 'http://browser-chrome:4444/wd/hub').strip('"') + if not browser_connection_url: + self.browser_connection_url = os.getenv("WEBDRIVER_URL", 'http://browser-chrome:4444/wd/hub').strip('"') + else: + self.browser_connection_url = browser_connection_url # If any proxy settings are enabled, then we should setup the proxy object proxy_args = {} @@ -611,7 +619,7 @@ class base_html_webdriver(Fetcher): options.proxy = self.proxy self.driver = webdriver.Remote( - command_executor=self.command_executor, + command_executor=self.browser_connection_url, options=options) 
try: @@ -666,9 +674,10 @@ class base_html_webdriver(Fetcher): class html_requests(Fetcher): fetcher_description = "Basic fast Plaintext/HTTP Client" - def __init__(self, proxy_override=None): + def __init__(self, proxy_override=None, browser_connection_url=None): super().__init__() self.proxy_override = proxy_override + # browser_connection_url is none because its always 'launched locally' def run(self, url, diff --git a/changedetectionio/forms.py b/changedetectionio/forms.py index e8c35cb8..b3de842b 100644 --- a/changedetectionio/forms.py +++ b/changedetectionio/forms.py @@ -168,7 +168,9 @@ class ValidateContentFetcherIsReady(object): def __call__(self, form, field): import urllib3.exceptions from changedetectionio import content_fetcher + return +# AttributeError: module 'changedetectionio.content_fetcher' has no attribute 'extra_browser_unlocked<>ASDF213r123r' # Better would be a radiohandler that keeps a reference to each class if field.data is not None and field.data != 'system': klass = getattr(content_fetcher, field.data) @@ -496,6 +498,12 @@ class SingleExtraProxy(Form): proxy_url = StringField('Proxy URL', [validators.Optional()], render_kw={"placeholder": "socks5:// or regular proxy http://user:pass@...:3128", "size":50}) # @todo do the validation here instead +class SingleExtraBrowser(Form): + browser_name = StringField('Name', [validators.Optional()], render_kw={"placeholder": "Name"}) + browser_connection_url = StringField('Browser connection URL', [validators.Optional()], render_kw={"placeholder": "wss://brightdata... wss://oxylabs etc", "size":50}) + # @todo do the validation here instead + + # datastore.data['settings']['requests'].. 
class globalSettingsRequestForm(Form): time_between_check = FormField(TimeBetweenCheckForm) @@ -504,6 +512,7 @@ class globalSettingsRequestForm(Form): render_kw={"style": "width: 5em;"}, validators=[validators.NumberRange(min=0, message="Should contain zero or more seconds")]) extra_proxies = FieldList(FormField(SingleExtraProxy), min_entries=5) + extra_browsers = FieldList(FormField(SingleExtraBrowser), min_entries=5) def validate_extra_proxies(self, extra_validators=None): for e in self.data['extra_proxies']: diff --git a/changedetectionio/model/App.py b/changedetectionio/model/App.py index 697d0d00..1202d5db 100644 --- a/changedetectionio/model/App.py +++ b/changedetectionio/model/App.py @@ -16,6 +16,7 @@ class model(dict): }, 'requests': { 'extra_proxies': [], # Configurable extra proxies via the UI + 'extra_browsers': [], # Configurable extra browsers via the UI 'jitter_seconds': 0, 'proxy': None, # Preferred proxy connection 'time_between_check': {'weeks': None, 'days': None, 'hours': 3, 'minutes': None, 'seconds': None}, diff --git a/changedetectionio/processors/__init__.py b/changedetectionio/processors/__init__.py index d2e5ee5c..10c9138c 100644 --- a/changedetectionio/processors/__init__.py +++ b/changedetectionio/processors/__init__.py @@ -8,11 +8,12 @@ from distutils.util import strtobool class difference_detection_processor(): + browser_steps = None datastore = None fetcher = None screenshot = None + watch = None xpath_data = None - browser_steps = None def __init__(self, *args, datastore, watch_uuid, **kwargs): super().__init__(*args, **kwargs) @@ -40,6 +41,18 @@ class difference_detection_processor(): if not prefer_fetch_backend or prefer_fetch_backend == 'system': prefer_fetch_backend = self.datastore.data['settings']['application'].get('fetch_backend') + # In the case that the preferred fetcher was a browser config with custom connection URL.. 
+ # @todo - on save watch, if its extra_browser_ then it should be obvious it will use playwright (like if its requests now..) + browser_connection_url = None + if prefer_fetch_backend.startswith('extra_browser_'): + (t, key) = prefer_fetch_backend.split('extra_browser_') + connection = list( + filter(lambda s: (s['browser_name'] == key), self.datastore.data['settings']['requests'].get('extra_browsers', []))) + if connection: + prefer_fetch_backend = 'base_html_playwright' + browser_connection_url = connection[0].get('browser_connection_url') + + # Grab the right kind of 'fetcher', (playwright, requests, etc) if hasattr(content_fetcher, prefer_fetch_backend): fetcher_obj = getattr(content_fetcher, prefer_fetch_backend) @@ -54,8 +67,9 @@ class difference_detection_processor(): print(f"Using proxy Key: {preferred_proxy_id} as Proxy URL {proxy_url}") # Now call the fetcher (playwright/requests/etc) with arguments that only a fetcher would need. + # When browser_connection_url is None, the fetcher should work out the best defaults itself (os env vars etc) self.fetcher = fetcher_obj(proxy_override=proxy_url, - #browser_url_extra/configurable browser url=... 
+ browser_connection_url=browser_connection_url ) if self.watch.has_browser_steps: diff --git a/changedetectionio/run_custom_browser_url_tests.sh b/changedetectionio/run_custom_browser_url_tests.sh new file mode 100755 index 00000000..10cea9c5 --- /dev/null +++ b/changedetectionio/run_custom_browser_url_tests.sh @@ -0,0 +1,44 @@ +#!/bin/bash + +# run some tests and look if the 'custom-browser-search-string=1' connect string appeared in the correct containers + +# enable debug +set -x + +# A extra browser is configured, but we never chose to use it, so it should NOT show in the logs +docker run --rm -e "PLAYWRIGHT_DRIVER_URL=ws://browserless:3000" --network changedet-network test-changedetectionio bash -c 'cd changedetectionio;pytest tests/custom_browser_url/test_custom_browser_url.py::test_request_not_via_custom_browser_url' +docker logs browserless-custom-url &>log.txt +grep 'custom-browser-search-string=1' log.txt +if [ $? -ne 1 ] +then + echo "Saw a request in 'browserless-custom-url' container with 'custom-browser-search-string=1' when I should not" + exit 1 +fi + +docker logs browserless &>log.txt +grep 'custom-browser-search-string=1' log.txt +if [ $? -ne 1 ] +then + echo "Saw a request in 'browser' container with 'custom-browser-search-string=1' when I should not" + exit 1 +fi + +# Special connect string should appear in the custom-url container, but not in the 'default' one +docker run --rm -e "PLAYWRIGHT_DRIVER_URL=ws://browserless:3000" --network changedet-network test-changedetectionio bash -c 'cd changedetectionio;pytest tests/custom_browser_url/test_custom_browser_url.py::test_request_via_custom_browser_url' +docker logs browserless-custom-url &>log.txt +grep 'custom-browser-search-string=1' log.txt +if [ $? -ne 0 ] +then + echo "Did not see request in 'browserless-custom-url' container with 'custom-browser-search-string=1' when I should" + exit 1 +fi + +docker logs browserless &>log.txt +grep 'custom-browser-search-string=1' log.txt +if [ $? 
-ne 1 ] +then + echo "Saw a request in 'browser' container with 'custom-browser-search-string=1' when I should not" + exit 1 +fi + + diff --git a/changedetectionio/static/styles/scss/parts/_extra_browsers.scss b/changedetectionio/static/styles/scss/parts/_extra_browsers.scss new file mode 100644 index 00000000..da0204ad --- /dev/null +++ b/changedetectionio/static/styles/scss/parts/_extra_browsers.scss @@ -0,0 +1,24 @@ +ul#requests-extra_browsers { + list-style: none; + /* tidy up the table to look more "inline" */ + li { + > label { + display: none; + } + + } + + /* each proxy entry is a `table` */ + table { + tr { + display: inline; + } + } +} + +#extra-browsers-setting { + border: 1px solid var(--color-grey-800); + border-radius: 4px; + margin: 1em; + padding: 1em; +} \ No newline at end of file diff --git a/changedetectionio/static/styles/scss/parts/_extra_proxies.scss b/changedetectionio/static/styles/scss/parts/_extra_proxies.scss index 756dd9b9..ed6de397 100644 --- a/changedetectionio/static/styles/scss/parts/_extra_proxies.scss +++ b/changedetectionio/static/styles/scss/parts/_extra_proxies.scss @@ -60,3 +60,10 @@ body.proxy-check-active { padding-bottom: 1em; } + +#extra-proxies-setting { + border: 1px solid var(--color-grey-800); + border-radius: 4px; + margin: 1em; + padding: 1em; +} diff --git a/changedetectionio/static/styles/scss/styles.scss b/changedetectionio/static/styles/scss/styles.scss index 68b95337..ed98a1c6 100644 --- a/changedetectionio/static/styles/scss/styles.scss +++ b/changedetectionio/static/styles/scss/styles.scss @@ -5,6 +5,7 @@ @import "parts/_arrows"; @import "parts/_browser-steps"; @import "parts/_extra_proxies"; +@import "parts/_extra_browsers"; @import "parts/_pagination"; @import "parts/_spinners"; @import "parts/_variables"; diff --git a/changedetectionio/static/styles/styles.css b/changedetectionio/static/styles/styles.css index 3b7a87d8..c1865879 100644 --- a/changedetectionio/static/styles/styles.css +++ 
b/changedetectionio/static/styles/styles.css @@ -128,6 +128,27 @@ body.proxy-check-active #request .proxy-timing { border-radius: 4px; padding: 1em; } +#extra-proxies-setting { + border: 1px solid var(--color-grey-800); + border-radius: 4px; + margin: 1em; + padding: 1em; } + +ul#requests-extra_browsers { + list-style: none; + /* tidy up the table to look more "inline" */ + /* each proxy entry is a `table` */ } + ul#requests-extra_browsers li > label { + display: none; } + ul#requests-extra_browsers table tr { + display: inline; } + +#extra-browsers-setting { + border: 1px solid var(--color-grey-800); + border-radius: 4px; + margin: 1em; + padding: 1em; } + .pagination-page-info { color: #fff; font-size: 0.85rem; diff --git a/changedetectionio/store.py b/changedetectionio/store.py index 6306a391..c00018c4 100644 --- a/changedetectionio/store.py +++ b/changedetectionio/store.py @@ -633,6 +633,18 @@ class ChangeDetectionStore: return {} + @property + def extra_browsers(self): + res = [] + p = list(filter( + lambda s: (s.get('browser_name') and s.get('browser_connection_url')), + self.__data['settings']['requests'].get('extra_browsers', []))) + if p: + for i in p: + res.append(("extra_browser_"+i['browser_name'], i['browser_name'])) + + return res + def tag_exists_by_name(self, tag_name): return any(v.get('title', '').lower() == tag_name.lower() for k, v in self.__data['settings']['application']['tags'].items()) diff --git a/changedetectionio/templates/settings.html b/changedetectionio/templates/settings.html index 273ac561..461208f0 100644 --- a/changedetectionio/templates/settings.html +++ b/changedetectionio/templates/settings.html @@ -230,11 +230,15 @@ nav
Tip: "Residential" and "Mobile" proxy type can be more successfull than "Data Center" for blocked websites. -