mirror of
				https://github.com/dgtlmoon/changedetection.io.git
				synced 2025-10-31 14:47:21 +00:00 
			
		
		
		
	Compare commits
	
		
			14 Commits
		
	
	
		
			bugfix-del
			...
			playwright
		
	
	| Author | SHA1 | Date | |
|---|---|---|---|
|   | 895050f4a8 | ||
|   | 8ce75f40d9 | ||
|   | 5f3251a3e1 | ||
|   | 703922c369 | ||
|   | 22dda97a65 | ||
|   | 8134242b38 | ||
|   | dc8f20d104 | ||
|   | 704452322a | ||
|   | 1be1cee04d | ||
|   | c990db2bd5 | ||
|   | 25a7fd050f | ||
|   | f71545a4b0 | ||
|   | d87a8cc661 | ||
|   | 0d114f2adc | 
| @@ -20,6 +20,11 @@ COPY requirements.txt /requirements.txt | ||||
|  | ||||
| RUN pip install --target=/dependencies -r /requirements.txt | ||||
|  | ||||
| # Playwright is an alternative to Selenium | ||||
| # Excluded this package from requirements.txt to prevent arm/v6 and arm/v7 builds from failing | ||||
| RUN pip install --target=/dependencies playwright~=1.20 \ | ||||
|     || echo "WARN: Failed to install Playwright. The application can still run, but the Playwright option will be disabled." | ||||
|  | ||||
| # Final image stage | ||||
| FROM python:3.8-slim | ||||
|  | ||||
|   | ||||
| @@ -8,7 +8,7 @@ import sys | ||||
|  | ||||
| import eventlet | ||||
| import eventlet.wsgi | ||||
| from . import store, changedetection_app | ||||
| from . import store, changedetection_app, content_fetcher | ||||
| from . import __version__ | ||||
|  | ||||
| def main(): | ||||
|   | ||||
| @@ -1,13 +1,10 @@ | ||||
| from abc import ABC, abstractmethod | ||||
| import chardet | ||||
| import os | ||||
| from selenium import webdriver | ||||
| from selenium.webdriver.common.desired_capabilities import DesiredCapabilities | ||||
| from selenium.webdriver.common.proxy import Proxy as SeleniumProxy | ||||
| from selenium.common.exceptions import WebDriverException | ||||
| import requests | ||||
| import time | ||||
| import urllib3.exceptions | ||||
| import sys | ||||
|  | ||||
|  | ||||
| class EmptyReply(Exception): | ||||
| @@ -19,13 +16,15 @@ class EmptyReply(Exception): | ||||
|  | ||||
|     pass | ||||
|  | ||||
|  | ||||
| class Fetcher(): | ||||
|     error = None | ||||
|     status_code = None | ||||
|     content = None | ||||
|     headers = None | ||||
|  | ||||
|     fetcher_description ="No description" | ||||
|     # Will be needed in the future by the VisualSelector, always get this where possible. | ||||
|     screenshot = False | ||||
|     fetcher_description = "No description" | ||||
|  | ||||
|     @abstractmethod | ||||
|     def get_error(self): | ||||
| @@ -46,10 +45,6 @@ class Fetcher(): | ||||
|     def quit(self): | ||||
|         return | ||||
|  | ||||
|     @abstractmethod | ||||
|     def screenshot(self): | ||||
|         return | ||||
|  | ||||
|     @abstractmethod | ||||
|     def get_last_status_code(self): | ||||
|         return self.status_code | ||||
| @@ -59,29 +54,109 @@ class Fetcher(): | ||||
|     def is_ready(self): | ||||
|         return True | ||||
|  | ||||
|  | ||||
| #   Maybe for the future, each fetcher provides its own diff output, could be used for text, image | ||||
| #   the current one would return javascript output (as we use JS to generate the diff) | ||||
| # | ||||
| #   Returns tuple(mime_type, stream) | ||||
| #    @abstractmethod | ||||
| #    def return_diff(self, stream_a, stream_b): | ||||
| #        return | ||||
|  | ||||
| def available_fetchers(): | ||||
|         import inspect | ||||
|         from changedetectionio import content_fetcher | ||||
|         p=[] | ||||
|         for name, obj in inspect.getmembers(content_fetcher): | ||||
|             if inspect.isclass(obj): | ||||
|                 # @todo html_ is maybe better as fetcher_ or something | ||||
|                 # In this case, make sure to edit the default one in store.py and fetch_site_status.py | ||||
|                 if "html_" in name: | ||||
|                     t=tuple([name,obj.fetcher_description]) | ||||
|                     p.append(t) | ||||
|     # See the if statement at the bottom of this file for how we switch between playwright and webdriver | ||||
|     import inspect | ||||
|     p = [] | ||||
|     for name, obj in inspect.getmembers(sys.modules[__name__], inspect.isclass): | ||||
|         if inspect.isclass(obj): | ||||
|             # @todo html_ is maybe better as fetcher_ or something | ||||
|             # In this case, make sure to edit the default one in store.py and fetch_site_status.py | ||||
|             if name.startswith('html_'): | ||||
|                 t = tuple([name, obj.fetcher_description]) | ||||
|                 p.append(t) | ||||
|  | ||||
|         return p | ||||
|     return p | ||||
|  | ||||
| class html_webdriver(Fetcher): | ||||
|  | ||||
| class base_html_playwright(Fetcher): | ||||
|     fetcher_description = "Playwright {}/Javascript".format( | ||||
|         os.getenv("PLAYWRIGHT_BROWSER_TYPE", 'chromium').capitalize() | ||||
|     ) | ||||
|     if os.getenv("PLAYWRIGHT_DRIVER_URL"): | ||||
|         fetcher_description += " via '{}'".format(os.getenv("PLAYWRIGHT_DRIVER_URL")) | ||||
|  | ||||
| #    try: | ||||
| #        from playwright.sync_api import sync_playwright | ||||
| #    except ModuleNotFoundError: | ||||
| #        fetcher_enabled = False | ||||
|  | ||||
|     browser_type = '' | ||||
|     command_executor = '' | ||||
|  | ||||
|     # Configs for Proxy setup | ||||
|     # In the ENV vars, is prefixed with "playwright_proxy_", so it is for example "playwright_proxy_server" | ||||
|     playwright_proxy_settings_mappings = ['server', 'bypass', 'username', 'password'] | ||||
|  | ||||
|     proxy = None | ||||
|  | ||||
|     def __init__(self): | ||||
|         # .strip('"') is going to save someone a lot of time when they accidentally wrap the env value | ||||
|         self.browser_type = os.getenv("PLAYWRIGHT_BROWSER_TYPE", 'chromium').strip('"') | ||||
|         self.command_executor = os.getenv( | ||||
|             "PLAYWRIGHT_DRIVER_URL", | ||||
|             'ws://playwright-chrome:3000/playwright' | ||||
|         ).strip('"') | ||||
|  | ||||
|         # If any proxy settings are enabled, then we should setup the proxy object | ||||
|         proxy_args = {} | ||||
|         for k in self.playwright_proxy_settings_mappings: | ||||
|             v = os.getenv('playwright_proxy_' + k, False) | ||||
|             if v: | ||||
|                 proxy_args[k] = v.strip('"') | ||||
|  | ||||
|         if proxy_args: | ||||
|             self.proxy = proxy_args | ||||
|  | ||||
|     def run(self, | ||||
|             url, | ||||
|             timeout, | ||||
|             request_headers, | ||||
|             request_body, | ||||
|             request_method, | ||||
|             ignore_status_codes=False): | ||||
|  | ||||
|         from playwright.sync_api import sync_playwright | ||||
|  | ||||
|         with sync_playwright() as p: | ||||
|             browser_type = getattr(p, self.browser_type) | ||||
|  | ||||
|             # Seemed to cause a connection Exception even though I can see it connect | ||||
|             # self.browser = browser_type.connect(self.command_executor, timeout=timeout*1000) | ||||
|             browser = browser_type.connect_over_cdp(self.command_executor, timeout=timeout * 1000) | ||||
|  | ||||
|             # Set user agent to prevent Cloudflare from blocking the browser | ||||
|             context = browser.new_context( | ||||
|                 user_agent="Mozilla/5.0", | ||||
|                 proxy=self.proxy | ||||
|             ) | ||||
|             page = context.new_page() | ||||
|             page.set_viewport_size({"width": 1280, "height": 1024}) | ||||
|             response = page.goto(url, timeout=timeout * 1000) | ||||
|  | ||||
|             extra_wait = int(os.getenv("WEBDRIVER_DELAY_BEFORE_CONTENT_READY", 5)) | ||||
|             page.wait_for_timeout(extra_wait * 1000) | ||||
|  | ||||
|             if response is None: | ||||
|                 raise EmptyReply(url=url, status_code=None) | ||||
|  | ||||
|             self.status_code = response.status | ||||
|             self.content = page.content() | ||||
|             self.headers = response.all_headers() | ||||
|  | ||||
|             # Some bug where it gives the wrong screenshot size, but making a request with the clip set first seems to solve it | ||||
|             # JPEG is better here because the screenshots can be very very large | ||||
|             page.screenshot(type='jpeg', clip={'x': 1.0, 'y': 1.0, 'width': 1280, 'height': 1024}) | ||||
|             self.screenshot = page.screenshot(type='jpeg', full_page=True, quality=90) | ||||
|             context.close() | ||||
|             browser.close() | ||||
|  | ||||
|  | ||||
| class base_html_webdriver(Fetcher): | ||||
|     if os.getenv("WEBDRIVER_URL"): | ||||
|         fetcher_description = "WebDriver Chrome/Javascript via '{}'".format(os.getenv("WEBDRIVER_URL")) | ||||
|     else: | ||||
| @@ -94,12 +169,11 @@ class html_webdriver(Fetcher): | ||||
|     selenium_proxy_settings_mappings = ['proxyType', 'ftpProxy', 'httpProxy', 'noProxy', | ||||
|                                         'proxyAutoconfigUrl', 'sslProxy', 'autodetect', | ||||
|                                         'socksProxy', 'socksVersion', 'socksUsername', 'socksPassword'] | ||||
|  | ||||
|  | ||||
|  | ||||
|     proxy=None | ||||
|     proxy = None | ||||
|  | ||||
|     def __init__(self): | ||||
|         from selenium.webdriver.common.proxy import Proxy as SeleniumProxy | ||||
|  | ||||
|         # .strip('"') is going to save someone a lot of time when they accidentally wrap the env value | ||||
|         self.command_executor = os.getenv("WEBDRIVER_URL", 'http://browser-chrome:4444/wd/hub').strip('"') | ||||
|  | ||||
| @@ -121,6 +195,9 @@ class html_webdriver(Fetcher): | ||||
|             request_method, | ||||
|             ignore_status_codes=False): | ||||
|  | ||||
|         from selenium import webdriver | ||||
|         from selenium.webdriver.common.desired_capabilities import DesiredCapabilities | ||||
|         from selenium.common.exceptions import WebDriverException | ||||
|         # request_body, request_method unused for now, until some magic in the future happens. | ||||
|  | ||||
|         # check env for WEBDRIVER_URL | ||||
| @@ -145,9 +222,8 @@ class html_webdriver(Fetcher): | ||||
|         time.sleep(int(os.getenv("WEBDRIVER_DELAY_BEFORE_CONTENT_READY", 5))) | ||||
|         self.content = self.driver.page_source | ||||
|         self.headers = {} | ||||
|  | ||||
|     def screenshot(self): | ||||
|         return self.driver.get_screenshot_as_png() | ||||
|         self.screenshot = self.driver.get_screenshot_as_png() | ||||
|         self.quit() | ||||
|  | ||||
|     # Does the connection to the webdriver work? run a test connection. | ||||
|     def is_ready(self): | ||||
| @@ -170,6 +246,7 @@ class html_webdriver(Fetcher): | ||||
|             except Exception as e: | ||||
|                 print("Exception in chrome shutdown/quit" + str(e)) | ||||
|  | ||||
|  | ||||
| # "html_requests" is listed as the default fetcher in store.py! | ||||
| class html_requests(Fetcher): | ||||
|     fetcher_description = "Basic fast Plaintext/HTTP Client" | ||||
| @@ -183,11 +260,11 @@ class html_requests(Fetcher): | ||||
|             ignore_status_codes=False): | ||||
|  | ||||
|         r = requests.request(method=request_method, | ||||
|                          data=request_body, | ||||
|                          url=url, | ||||
|                          headers=request_headers, | ||||
|                          timeout=timeout, | ||||
|                          verify=False) | ||||
|                              data=request_body, | ||||
|                              url=url, | ||||
|                              headers=request_headers, | ||||
|                              timeout=timeout, | ||||
|                              verify=False) | ||||
|  | ||||
|         # If the response did not tell us what encoding format to expect, Then use chardet to override what `requests` thinks. | ||||
|         # For example - some sites don't tell us it's utf-8, but return utf-8 content | ||||
| @@ -207,3 +284,11 @@ class html_requests(Fetcher): | ||||
|         self.content = r.text | ||||
|         self.headers = r.headers | ||||
|  | ||||
|  | ||||
| # Decide which is the 'real' HTML webdriver, this is more a system wide config | ||||
| # rather than site-specific. | ||||
| use_playwright_as_chrome_fetcher = os.getenv('PLAYWRIGHT_DRIVER_URL', False) | ||||
| if use_playwright_as_chrome_fetcher: | ||||
|     html_webdriver = base_html_playwright | ||||
| else: | ||||
|     html_webdriver = base_html_webdriver | ||||
|   | ||||
| @@ -68,6 +68,7 @@ class perform_site_check(): | ||||
|  | ||||
|         fetcher = klass() | ||||
|         fetcher.run(url, timeout, request_headers, request_body, request_method, ignore_status_code) | ||||
|  | ||||
|         # Fetching complete, now filters | ||||
|         # @todo move to class / maybe inside of fetcher abstract base? | ||||
|  | ||||
| @@ -192,9 +193,4 @@ class perform_site_check(): | ||||
|                 if not watch['title'] or not len(watch['title']): | ||||
|                     update_obj['title'] = html_tools.extract_element(find='title', html_content=fetcher.content) | ||||
|  | ||||
|         if self.datastore.data['settings']['application'].get('real_browser_save_screenshot', True): | ||||
|             screenshot = fetcher.screenshot() | ||||
|  | ||||
|         fetcher.quit() | ||||
|  | ||||
|         return changed_detected, update_obj, text_content_before_ignored_filter, screenshot | ||||
|         return changed_detected, update_obj, text_content_before_ignored_filter, fetcher.screenshot | ||||
| @@ -23,6 +23,13 @@ services: | ||||
|   # | ||||
|   #             https://selenium-python.readthedocs.io/api.html#module-selenium.webdriver.common.proxy | ||||
|   # | ||||
|   #       Alternative Playwright URL, do not use "'s or 's! | ||||
|   #      - PLAYWRIGHT_DRIVER_URL=ws://playwright-chrome:3000/playwright | ||||
|   # | ||||
|   #       Playwright proxy settings playwright_proxy_server, playwright_proxy_bypass, playwright_proxy_username, playwright_proxy_password | ||||
|   # | ||||
|   #             https://playwright.dev/python/docs/api/class-browsertype#browser-type-launch-option-proxy | ||||
|   # | ||||
|   #        Plain requests - proxy support example. | ||||
|   #      - HTTP_PROXY=socks5h://10.10.1.10:1080 | ||||
|   #      - HTTPS_PROXY=socks5h://10.10.1.10:1080 | ||||
| @@ -58,6 +65,13 @@ services: | ||||
| #            # Workaround to avoid the browser crashing inside a docker container | ||||
| #            # See https://github.com/SeleniumHQ/docker-selenium#quick-start | ||||
| #            - /dev/shm:/dev/shm | ||||
| #        restart: unless-stopped | ||||
|  | ||||
|      # Used for fetching pages via Playwright+Chrome where you need Javascript support. | ||||
|  | ||||
| #    playwright-chrome: | ||||
| #        hostname: playwright-chrome | ||||
| #        image: browserless/chrome | ||||
| #        restart: unless-stopped | ||||
|  | ||||
| volumes: | ||||
|   | ||||
| @@ -40,3 +40,4 @@ selenium ~= 4.1.0 | ||||
| # need to revisit flask login versions | ||||
| werkzeug ~= 2.0.0 | ||||
|  | ||||
| # playwright is installed at Dockerfile build time because it's not available on all platforms | ||||
|   | ||||
		Reference in New Issue
	
	Block a user