mirror of
				https://github.com/dgtlmoon/changedetection.io.git
				synced 2025-10-31 14:47:21 +00:00 
			
		
		
		
	Compare commits
	
		
			4 Commits
		
	
	
		
			api-new-wa
			...
			raw-browse
		
	
	| Author | SHA1 | Date | |
|---|---|---|---|
|   | ec77b45e84 | ||
|   | 138f7fc59c | ||
|   | 56b768d24f | ||
|   | a61d7b4284 | 
| @@ -679,7 +679,7 @@ def changedetection_app(config=None, datastore_o=None): | ||||
|     @app.route("/settings", methods=['GET', "POST"]) | ||||
|     @login_optionally_required | ||||
|     def settings_page(): | ||||
|         from changedetectionio import content_fetcher, forms | ||||
|         from . import forms | ||||
|  | ||||
|         default = deepcopy(datastore.data['settings']) | ||||
|         if datastore.proxy_list is not None: | ||||
|   | ||||
| @@ -237,7 +237,7 @@ class browsersteps_live_ui(steppable_browser_interface): | ||||
|     def get_current_state(self): | ||||
|         """Return the screenshot and interactive elements mapping, generally always called after action_()""" | ||||
|         from pkg_resources import resource_string | ||||
|         xpath_element_js = resource_string(__name__, "../../res/xpath_element_scraper.js").decode('utf-8') | ||||
|         xpath_element_js = resource_string(__name__, "../res/xpath_element_scraper.js").decode('utf-8') | ||||
|         now = time.time() | ||||
|         self.page.wait_for_timeout(1 * 1000) | ||||
|  | ||||
| @@ -272,8 +272,8 @@ class browsersteps_live_ui(steppable_browser_interface): | ||||
|         self.page.evaluate("var include_filters=''") | ||||
|         from pkg_resources import resource_string | ||||
|         # The code that scrapes elements and makes a list of elements/size/position to click on in the VisualSelector | ||||
|         xpath_element_js = resource_string(__name__, "../../res/xpath_element_scraper.js").decode('utf-8') | ||||
|         from changedetectionio.content_fetcher import visualselector_xpath_selectors | ||||
|         xpath_element_js = resource_string(__name__, "../res/xpath_element_scraper.js").decode('utf-8') | ||||
|         from changedetectionio.fetchers import visualselector_xpath_selectors | ||||
|         xpath_element_js = xpath_element_js.replace('%ELEMENTS%', visualselector_xpath_selectors) | ||||
|         xpath_data = self.page.evaluate("async () => {" + xpath_element_js + "}") | ||||
|         screenshot = self.page.screenshot(type='jpeg', full_page=True, quality=int(os.getenv("PLAYWRIGHT_SCREENSHOT_QUALITY", 72))) | ||||
|   | ||||
| @@ -13,7 +13,7 @@ import signal | ||||
| import socket | ||||
| import sys | ||||
|  | ||||
| from . import store, changedetection_app, content_fetcher | ||||
| from . import store, changedetection_app | ||||
| from . import __version__ | ||||
|  | ||||
| # Only global so we can access it in the signal handler | ||||
|   | ||||
| @@ -1,595 +0,0 @@ | ||||
| import hashlib | ||||
| from abc import abstractmethod | ||||
| import chardet | ||||
| import json | ||||
| import logging | ||||
| import os | ||||
| import requests | ||||
| import sys | ||||
| import time | ||||
|  | ||||
| visualselector_xpath_selectors = 'div,span,form,table,tbody,tr,td,a,p,ul,li,h1,h2,h3,h4, header, footer, section, article, aside, details, main, nav, section, summary' | ||||
|  | ||||
| class Non200ErrorCodeReceived(Exception): | ||||
|     def __init__(self, status_code, url, screenshot=None, xpath_data=None, page_html=None): | ||||
|         # Set this so we can use it in other parts of the app | ||||
|         self.status_code = status_code | ||||
|         self.url = url | ||||
|         self.screenshot = screenshot | ||||
|         self.xpath_data = xpath_data | ||||
|         self.page_text = None | ||||
|  | ||||
|         if page_html: | ||||
|             from changedetectionio import html_tools | ||||
|             self.page_text = html_tools.html_to_text(page_html) | ||||
|         return | ||||
|  | ||||
| class checksumFromPreviousCheckWasTheSame(Exception): | ||||
|     def __init__(self): | ||||
|         return | ||||
|  | ||||
| class JSActionExceptions(Exception): | ||||
|     def __init__(self, status_code, url, screenshot, message=''): | ||||
|         self.status_code = status_code | ||||
|         self.url = url | ||||
|         self.screenshot = screenshot | ||||
|         self.message = message | ||||
|         return | ||||
|  | ||||
| class BrowserStepsStepTimout(Exception): | ||||
|     def __init__(self, step_n): | ||||
|         self.step_n = step_n | ||||
|         return | ||||
|  | ||||
|  | ||||
| class PageUnloadable(Exception): | ||||
|     def __init__(self, status_code, url, message, screenshot=False): | ||||
|         # Set this so we can use it in other parts of the app | ||||
|         self.status_code = status_code | ||||
|         self.url = url | ||||
|         self.screenshot = screenshot | ||||
|         self.message = message | ||||
|         return | ||||
|  | ||||
| class EmptyReply(Exception): | ||||
|     def __init__(self, status_code, url, screenshot=None): | ||||
|         # Set this so we can use it in other parts of the app | ||||
|         self.status_code = status_code | ||||
|         self.url = url | ||||
|         self.screenshot = screenshot | ||||
|         return | ||||
|  | ||||
| class ScreenshotUnavailable(Exception): | ||||
|     def __init__(self, status_code, url, page_html=None): | ||||
|         # Set this so we can use it in other parts of the app | ||||
|         self.status_code = status_code | ||||
|         self.url = url | ||||
|         if page_html: | ||||
|             from html_tools import html_to_text | ||||
|             self.page_text = html_to_text(page_html) | ||||
|         return | ||||
|  | ||||
| class ReplyWithContentButNoText(Exception): | ||||
|     def __init__(self, status_code, url, screenshot=None): | ||||
|         # Set this so we can use it in other parts of the app | ||||
|         self.status_code = status_code | ||||
|         self.url = url | ||||
|         self.screenshot = screenshot | ||||
|         return | ||||
|  | ||||
| class Fetcher(): | ||||
|     browser_steps = None | ||||
|     browser_steps_screenshot_path = None | ||||
|     content = None | ||||
|     error = None | ||||
|     fetcher_description = "No description" | ||||
|     headers = None | ||||
|     status_code = None | ||||
|     webdriver_js_execute_code = None | ||||
|     xpath_data = None | ||||
|     xpath_element_js = "" | ||||
|     instock_data = None | ||||
|     instock_data_js = "" | ||||
|  | ||||
|     # Will be needed in the future by the VisualSelector, always get this where possible. | ||||
|     screenshot = False | ||||
|     system_http_proxy = os.getenv('HTTP_PROXY') | ||||
|     system_https_proxy = os.getenv('HTTPS_PROXY') | ||||
|  | ||||
|     # Time ONTOP of the system defined env minimum time | ||||
|     render_extract_delay = 0 | ||||
|  | ||||
|     def __init__(self): | ||||
|         from pkg_resources import resource_string | ||||
|         # The code that scrapes elements and makes a list of elements/size/position to click on in the VisualSelector | ||||
|         self.xpath_element_js = resource_string(__name__, "res/xpath_element_scraper.js").decode('utf-8') | ||||
|         self.instock_data_js = resource_string(__name__, "res/stock-not-in-stock.js").decode('utf-8') | ||||
|  | ||||
|  | ||||
|     @abstractmethod | ||||
|     def get_error(self): | ||||
|         return self.error | ||||
|  | ||||
|     @abstractmethod | ||||
|     def run(self, | ||||
|             url, | ||||
|             timeout, | ||||
|             request_headers, | ||||
|             request_body, | ||||
|             request_method, | ||||
|             ignore_status_codes=False, | ||||
|             current_include_filters=None, | ||||
|             is_binary=False): | ||||
|         # Should set self.error, self.status_code and self.content | ||||
|         pass | ||||
|  | ||||
|     @abstractmethod | ||||
|     def quit(self): | ||||
|         return | ||||
|  | ||||
|     @abstractmethod | ||||
|     def get_last_status_code(self): | ||||
|         return self.status_code | ||||
|  | ||||
|     @abstractmethod | ||||
|     def screenshot_step(self, step_n): | ||||
|         return None | ||||
|  | ||||
|     @abstractmethod | ||||
|     # Return true/false if this checker is ready to run, in the case it needs todo some special config check etc | ||||
|     def is_ready(self): | ||||
|         return True | ||||
|  | ||||
|     def iterate_browser_steps(self): | ||||
|         from changedetectionio.blueprint.browser_steps.browser_steps import steppable_browser_interface | ||||
|         from playwright._impl._api_types import TimeoutError | ||||
|         from jinja2 import Environment | ||||
|         jinja2_env = Environment(extensions=['jinja2_time.TimeExtension']) | ||||
|  | ||||
|         step_n = 0 | ||||
|  | ||||
|         if self.browser_steps is not None and len(self.browser_steps): | ||||
|             interface = steppable_browser_interface() | ||||
|             interface.page = self.page | ||||
|  | ||||
|             valid_steps = filter(lambda s: (s['operation'] and len(s['operation']) and s['operation'] != 'Choose one' and s['operation'] != 'Goto site'), self.browser_steps) | ||||
|  | ||||
|             for step in valid_steps: | ||||
|                 step_n += 1 | ||||
|                 print(">> Iterating check - browser Step n {} - {}...".format(step_n, step['operation'])) | ||||
|                 self.screenshot_step("before-"+str(step_n)) | ||||
|                 self.save_step_html("before-"+str(step_n)) | ||||
|                 try: | ||||
|                     optional_value = step['optional_value'] | ||||
|                     selector = step['selector'] | ||||
|                     # Support for jinja2 template in step values, with date module added | ||||
|                     if '{%' in step['optional_value'] or '{{' in step['optional_value']: | ||||
|                         optional_value = str(jinja2_env.from_string(step['optional_value']).render()) | ||||
|                     if '{%' in step['selector'] or '{{' in step['selector']: | ||||
|                         selector = str(jinja2_env.from_string(step['selector']).render()) | ||||
|  | ||||
|                     getattr(interface, "call_action")(action_name=step['operation'], | ||||
|                                                       selector=selector, | ||||
|                                                       optional_value=optional_value) | ||||
|                     self.screenshot_step(step_n) | ||||
|                     self.save_step_html(step_n) | ||||
|                 except TimeoutError: | ||||
|                     # Stop processing here | ||||
|                     raise BrowserStepsStepTimout(step_n=step_n) | ||||
|  | ||||
|  | ||||
|  | ||||
|     # It's always good to reset these | ||||
|     def delete_browser_steps_screenshots(self): | ||||
|         import glob | ||||
|         if self.browser_steps_screenshot_path is not None: | ||||
|             dest = os.path.join(self.browser_steps_screenshot_path, 'step_*.jpeg') | ||||
|             files = glob.glob(dest) | ||||
|             for f in files: | ||||
|                 os.unlink(f) | ||||
|  | ||||
| #   Maybe for the future, each fetcher provides its own diff output, could be used for text, image | ||||
| #   the current one would return javascript output (as we use JS to generate the diff) | ||||
| # | ||||
| def available_fetchers(): | ||||
|     # See the if statement at the bottom of this file for how we switch between playwright and webdriver | ||||
|     import inspect | ||||
|     p = [] | ||||
|     for name, obj in inspect.getmembers(sys.modules[__name__], inspect.isclass): | ||||
|         if inspect.isclass(obj): | ||||
|             # @todo html_ is maybe better as fetcher_ or something | ||||
|             # In this case, make sure to edit the default one in store.py and fetch_site_status.py | ||||
|             if name.startswith('html_'): | ||||
|                 t = tuple([name, obj.fetcher_description]) | ||||
|                 p.append(t) | ||||
|  | ||||
|     return p | ||||
|  | ||||
| class base_html_playwright(Fetcher): | ||||
|     fetcher_description = "Playwright {}/Javascript".format( | ||||
|         os.getenv("PLAYWRIGHT_BROWSER_TYPE", 'chromium').capitalize() | ||||
|     ) | ||||
|     if os.getenv("PLAYWRIGHT_DRIVER_URL"): | ||||
|         fetcher_description += " via '{}'".format(os.getenv("PLAYWRIGHT_DRIVER_URL")) | ||||
|  | ||||
|     browser_type = '' | ||||
|     command_executor = '' | ||||
|  | ||||
|     # Configs for Proxy setup | ||||
|     # In the ENV vars, is prefixed with "playwright_proxy_", so it is for example "playwright_proxy_server" | ||||
|     playwright_proxy_settings_mappings = ['bypass', 'server', 'username', 'password'] | ||||
|  | ||||
|     proxy = None | ||||
|  | ||||
|     def __init__(self, proxy_override=None): | ||||
|         super().__init__() | ||||
|         # .strip('"') is going to save someone a lot of time when they accidently wrap the env value | ||||
|         self.browser_type = os.getenv("PLAYWRIGHT_BROWSER_TYPE", 'chromium').strip('"') | ||||
|         self.command_executor = os.getenv( | ||||
|             "PLAYWRIGHT_DRIVER_URL", | ||||
|             'ws://playwright-chrome:3000' | ||||
|         ).strip('"') | ||||
|  | ||||
|         # If any proxy settings are enabled, then we should setup the proxy object | ||||
|         proxy_args = {} | ||||
|         for k in self.playwright_proxy_settings_mappings: | ||||
|             v = os.getenv('playwright_proxy_' + k, False) | ||||
|             if v: | ||||
|                 proxy_args[k] = v.strip('"') | ||||
|  | ||||
|         if proxy_args: | ||||
|             self.proxy = proxy_args | ||||
|  | ||||
|         # allow per-watch proxy selection override | ||||
|         if proxy_override: | ||||
|             self.proxy = {'server': proxy_override} | ||||
|  | ||||
|         if self.proxy: | ||||
|             # Playwright needs separate username and password values | ||||
|             from urllib.parse import urlparse | ||||
|             parsed = urlparse(self.proxy.get('server')) | ||||
|             if parsed.username: | ||||
|                 self.proxy['username'] = parsed.username | ||||
|                 self.proxy['password'] = parsed.password | ||||
|  | ||||
|     def screenshot_step(self, step_n=''): | ||||
|         screenshot = self.page.screenshot(type='jpeg', full_page=True, quality=85) | ||||
|  | ||||
|         if self.browser_steps_screenshot_path is not None: | ||||
|             destination = os.path.join(self.browser_steps_screenshot_path, 'step_{}.jpeg'.format(step_n)) | ||||
|             logging.debug("Saving step screenshot to {}".format(destination)) | ||||
|             with open(destination, 'wb') as f: | ||||
|                 f.write(screenshot) | ||||
|  | ||||
|     def save_step_html(self, step_n): | ||||
|         content = self.page.content() | ||||
|         destination = os.path.join(self.browser_steps_screenshot_path, 'step_{}.html'.format(step_n)) | ||||
|         logging.debug("Saving step HTML to {}".format(destination)) | ||||
|         with open(destination, 'w') as f: | ||||
|             f.write(content) | ||||
|  | ||||
|     def run(self, | ||||
|             url, | ||||
|             timeout, | ||||
|             request_headers, | ||||
|             request_body, | ||||
|             request_method, | ||||
|             ignore_status_codes=False, | ||||
|             current_include_filters=None, | ||||
|             is_binary=False): | ||||
|  | ||||
|         from playwright.sync_api import sync_playwright | ||||
|         import playwright._impl._api_types | ||||
|  | ||||
|         self.delete_browser_steps_screenshots() | ||||
|         response = None | ||||
|         with sync_playwright() as p: | ||||
|             browser_type = getattr(p, self.browser_type) | ||||
|  | ||||
|             # Seemed to cause a connection Exception even tho I can see it connect | ||||
|             # self.browser = browser_type.connect(self.command_executor, timeout=timeout*1000) | ||||
|             # 60,000 connection timeout only | ||||
|             browser = browser_type.connect_over_cdp(self.command_executor, timeout=60000) | ||||
|  | ||||
|             # Set user agent to prevent Cloudflare from blocking the browser | ||||
|             # Use the default one configured in the App.py model that's passed from fetch_site_status.py | ||||
|             context = browser.new_context( | ||||
|                 user_agent=request_headers['User-Agent'] if request_headers.get('User-Agent') else 'Mozilla/5.0', | ||||
|                 proxy=self.proxy, | ||||
|                 # This is needed to enable JavaScript execution on GitHub and others | ||||
|                 bypass_csp=True, | ||||
|                 # Should be `allow` or `block` - sites like YouTube can transmit large amounts of data via Service Workers | ||||
|                 service_workers=os.getenv('PLAYWRIGHT_SERVICE_WORKERS', 'allow'), | ||||
|                 # Should never be needed | ||||
|                 accept_downloads=False | ||||
|             ) | ||||
|  | ||||
|             self.page = context.new_page() | ||||
|             if len(request_headers): | ||||
|                 context.set_extra_http_headers(request_headers) | ||||
|  | ||||
|                 self.page.set_default_navigation_timeout(90000) | ||||
|                 self.page.set_default_timeout(90000) | ||||
|  | ||||
|                 # Listen for all console events and handle errors | ||||
|                 self.page.on("console", lambda msg: print(f"Playwright console: Watch URL: {url} {msg.type}: {msg.text} {msg.args}")) | ||||
|  | ||||
|             # Goto page | ||||
|             try: | ||||
|                 # Wait_until = commit | ||||
|                 # - `'commit'` - consider operation to be finished when network response is received and the document started loading. | ||||
|                 # Better to not use any smarts from Playwright and just wait an arbitrary number of seconds | ||||
|                 # This seemed to solve nearly all 'TimeoutErrors' | ||||
|                 response = self.page.goto(url, wait_until='commit') | ||||
|             except playwright._impl._api_types.Error as e: | ||||
|                 # Retry once - https://github.com/browserless/chrome/issues/2485 | ||||
|                 # Sometimes errors related to invalid cert's and other can be random | ||||
|                 print ("Content Fetcher > retrying request got error - ", str(e)) | ||||
|                 time.sleep(1) | ||||
|                 response = self.page.goto(url, wait_until='commit') | ||||
|  | ||||
|             except Exception as e: | ||||
|                 print ("Content Fetcher > Other exception when page.goto", str(e)) | ||||
|                 context.close() | ||||
|                 browser.close() | ||||
|                 raise PageUnloadable(url=url, status_code=None, message=str(e)) | ||||
|  | ||||
|             # Execute any browser steps | ||||
|             try: | ||||
|                 extra_wait = int(os.getenv("WEBDRIVER_DELAY_BEFORE_CONTENT_READY", 5)) + self.render_extract_delay | ||||
|                 self.page.wait_for_timeout(extra_wait * 1000) | ||||
|  | ||||
|                 if self.webdriver_js_execute_code is not None and len(self.webdriver_js_execute_code): | ||||
|                     self.page.evaluate(self.webdriver_js_execute_code) | ||||
|  | ||||
|             except playwright._impl._api_types.TimeoutError as e: | ||||
|                 context.close() | ||||
|                 browser.close() | ||||
|                 # This can be ok, we will try to grab what we could retrieve | ||||
|                 pass | ||||
|             except Exception as e: | ||||
|                 print ("Content Fetcher > Other exception when executing custom JS code", str(e)) | ||||
|                 context.close() | ||||
|                 browser.close() | ||||
|                 raise PageUnloadable(url=url, status_code=None, message=str(e)) | ||||
|  | ||||
|             if response is None: | ||||
|                 context.close() | ||||
|                 browser.close() | ||||
|                 print ("Content Fetcher > Response object was none") | ||||
|                 raise EmptyReply(url=url, status_code=None) | ||||
|  | ||||
|             # Run Browser Steps here | ||||
|             self.iterate_browser_steps() | ||||
|  | ||||
|             extra_wait = int(os.getenv("WEBDRIVER_DELAY_BEFORE_CONTENT_READY", 5)) + self.render_extract_delay | ||||
|             time.sleep(extra_wait) | ||||
|  | ||||
|             self.content = self.page.content() | ||||
|             self.status_code = response.status | ||||
|             if len(self.page.content().strip()) == 0: | ||||
|                 context.close() | ||||
|                 browser.close() | ||||
|                 print ("Content Fetcher > Content was empty") | ||||
|                 raise EmptyReply(url=url, status_code=response.status) | ||||
|  | ||||
|             self.status_code = response.status | ||||
|             self.headers = response.all_headers() | ||||
|  | ||||
|             # So we can find an element on the page where its selector was entered manually (maybe not xPath etc) | ||||
|             if current_include_filters is not None: | ||||
|                 self.page.evaluate("var include_filters={}".format(json.dumps(current_include_filters))) | ||||
|             else: | ||||
|                 self.page.evaluate("var include_filters=''") | ||||
|  | ||||
|             self.xpath_data = self.page.evaluate("async () => {" + self.xpath_element_js.replace('%ELEMENTS%', visualselector_xpath_selectors) + "}") | ||||
|             self.instock_data = self.page.evaluate("async () => {" + self.instock_data_js + "}") | ||||
|  | ||||
|             # Bug 3 in Playwright screenshot handling | ||||
|             # Some bug where it gives the wrong screenshot size, but making a request with the clip set first seems to solve it | ||||
|             # JPEG is better here because the screenshots can be very very large | ||||
|  | ||||
|             # Screenshots also travel via the ws:// (websocket) meaning that the binary data is base64 encoded | ||||
|             # which will significantly increase the IO size between the server and client, it's recommended to use the lowest | ||||
|             # acceptable screenshot quality here | ||||
|             try: | ||||
|                 # The actual screenshot | ||||
|                 self.screenshot = self.page.screenshot(type='jpeg', full_page=True, quality=int(os.getenv("PLAYWRIGHT_SCREENSHOT_QUALITY", 72))) | ||||
|             except Exception as e: | ||||
|                 context.close() | ||||
|                 browser.close() | ||||
|                 raise ScreenshotUnavailable(url=url, status_code=None) | ||||
|  | ||||
|             context.close() | ||||
|             browser.close() | ||||
|  | ||||
| class base_html_webdriver(Fetcher): | ||||
|     if os.getenv("WEBDRIVER_URL"): | ||||
|         fetcher_description = "WebDriver Chrome/Javascript via '{}'".format(os.getenv("WEBDRIVER_URL")) | ||||
|     else: | ||||
|         fetcher_description = "WebDriver Chrome/Javascript" | ||||
|  | ||||
|     command_executor = '' | ||||
|  | ||||
|     # Configs for Proxy setup | ||||
|     # In the ENV vars, is prefixed with "webdriver_", so it is for example "webdriver_sslProxy" | ||||
|     selenium_proxy_settings_mappings = ['proxyType', 'ftpProxy', 'httpProxy', 'noProxy', | ||||
|                                         'proxyAutoconfigUrl', 'sslProxy', 'autodetect', | ||||
|                                         'socksProxy', 'socksVersion', 'socksUsername', 'socksPassword'] | ||||
|     proxy = None | ||||
|  | ||||
|     def __init__(self, proxy_override=None): | ||||
|         super().__init__() | ||||
|         from selenium.webdriver.common.proxy import Proxy as SeleniumProxy | ||||
|  | ||||
|         # .strip('"') is going to save someone a lot of time when they accidently wrap the env value | ||||
|         self.command_executor = os.getenv("WEBDRIVER_URL", 'http://browser-chrome:4444/wd/hub').strip('"') | ||||
|  | ||||
|         # If any proxy settings are enabled, then we should setup the proxy object | ||||
|         proxy_args = {} | ||||
|         for k in self.selenium_proxy_settings_mappings: | ||||
|             v = os.getenv('webdriver_' + k, False) | ||||
|             if v: | ||||
|                 proxy_args[k] = v.strip('"') | ||||
|  | ||||
|         # Map back standard HTTP_ and HTTPS_PROXY to webDriver httpProxy/sslProxy | ||||
|         if not proxy_args.get('webdriver_httpProxy') and self.system_http_proxy: | ||||
|             proxy_args['httpProxy'] = self.system_http_proxy | ||||
|         if not proxy_args.get('webdriver_sslProxy') and self.system_https_proxy: | ||||
|             proxy_args['httpsProxy'] = self.system_https_proxy | ||||
|  | ||||
|         # Allows override the proxy on a per-request basis | ||||
|         if proxy_override is not None: | ||||
|             proxy_args['httpProxy'] = proxy_override | ||||
|  | ||||
|         if proxy_args: | ||||
|             self.proxy = SeleniumProxy(raw=proxy_args) | ||||
|  | ||||
|     def run(self, | ||||
|             url, | ||||
|             timeout, | ||||
|             request_headers, | ||||
|             request_body, | ||||
|             request_method, | ||||
|             ignore_status_codes=False, | ||||
|             current_include_filters=None, | ||||
|             is_binary=False): | ||||
|  | ||||
|         from selenium import webdriver | ||||
|         from selenium.webdriver.common.desired_capabilities import DesiredCapabilities | ||||
|         from selenium.common.exceptions import WebDriverException | ||||
|         # request_body, request_method unused for now, until some magic in the future happens. | ||||
|  | ||||
|         # check env for WEBDRIVER_URL | ||||
|         self.driver = webdriver.Remote( | ||||
|             command_executor=self.command_executor, | ||||
|             desired_capabilities=DesiredCapabilities.CHROME, | ||||
|             proxy=self.proxy) | ||||
|  | ||||
|         try: | ||||
|             self.driver.get(url) | ||||
|         except WebDriverException as e: | ||||
|             # Be sure we close the session window | ||||
|             self.quit() | ||||
|             raise | ||||
|  | ||||
|         self.driver.set_window_size(1280, 1024) | ||||
|         self.driver.implicitly_wait(int(os.getenv("WEBDRIVER_DELAY_BEFORE_CONTENT_READY", 5))) | ||||
|  | ||||
|         if self.webdriver_js_execute_code is not None: | ||||
|             self.driver.execute_script(self.webdriver_js_execute_code) | ||||
|             # Selenium doesn't automatically wait for actions as good as Playwright, so wait again | ||||
|             self.driver.implicitly_wait(int(os.getenv("WEBDRIVER_DELAY_BEFORE_CONTENT_READY", 5))) | ||||
|  | ||||
|         # @todo - how to check this? is it possible? | ||||
|         self.status_code = 200 | ||||
|         # @todo somehow we should try to get this working for WebDriver | ||||
|         # raise EmptyReply(url=url, status_code=r.status_code) | ||||
|  | ||||
|         # @todo - dom wait loaded? | ||||
|         time.sleep(int(os.getenv("WEBDRIVER_DELAY_BEFORE_CONTENT_READY", 5)) + self.render_extract_delay) | ||||
|         self.content = self.driver.page_source | ||||
|         self.headers = {} | ||||
|  | ||||
|         self.screenshot = self.driver.get_screenshot_as_png() | ||||
|  | ||||
|     # Does the connection to the webdriver work? run a test connection. | ||||
|     def is_ready(self): | ||||
|         from selenium import webdriver | ||||
|         from selenium.webdriver.common.desired_capabilities import DesiredCapabilities | ||||
|  | ||||
|         self.driver = webdriver.Remote( | ||||
|             command_executor=self.command_executor, | ||||
|             desired_capabilities=DesiredCapabilities.CHROME) | ||||
|  | ||||
|         # driver.quit() seems to cause better exceptions | ||||
|         self.quit() | ||||
|         return True | ||||
|  | ||||
|     def quit(self): | ||||
|         if self.driver: | ||||
|             try: | ||||
|                 self.driver.quit() | ||||
|             except Exception as e: | ||||
|                 print("Content Fetcher > Exception in chrome shutdown/quit" + str(e)) | ||||
|  | ||||
|  | ||||
| # "html_requests" is listed as the default fetcher in store.py! | ||||
| class html_requests(Fetcher): | ||||
|     fetcher_description = "Basic fast Plaintext/HTTP Client" | ||||
|  | ||||
|     def __init__(self, proxy_override=None): | ||||
|         self.proxy_override = proxy_override | ||||
|  | ||||
|     def run(self, | ||||
|             url, | ||||
|             timeout, | ||||
|             request_headers, | ||||
|             request_body, | ||||
|             request_method, | ||||
|             ignore_status_codes=False, | ||||
|             current_include_filters=None, | ||||
|             is_binary=False): | ||||
|  | ||||
|         # Make requests use a more modern looking user-agent | ||||
|         if not 'User-Agent' in request_headers: | ||||
|             request_headers['User-Agent'] = os.getenv("DEFAULT_SETTINGS_HEADERS_USERAGENT", | ||||
|                                                       'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.66 Safari/537.36') | ||||
|  | ||||
|         proxies = {} | ||||
|  | ||||
|         # Allows override the proxy on a per-request basis | ||||
|         if self.proxy_override: | ||||
|             proxies = {'http': self.proxy_override, 'https': self.proxy_override, 'ftp': self.proxy_override} | ||||
|         else: | ||||
|             if self.system_http_proxy: | ||||
|                 proxies['http'] = self.system_http_proxy | ||||
|             if self.system_https_proxy: | ||||
|                 proxies['https'] = self.system_https_proxy | ||||
|  | ||||
|         r = requests.request(method=request_method, | ||||
|                              data=request_body, | ||||
|                              url=url, | ||||
|                              headers=request_headers, | ||||
|                              timeout=timeout, | ||||
|                              proxies=proxies, | ||||
|                              verify=False) | ||||
|  | ||||
|         # If the response did not tell us what encoding format to expect, Then use chardet to override what `requests` thinks. | ||||
|         # For example - some sites don't tell us it's utf-8, but return utf-8 content | ||||
|         # This seems to not occur when using webdriver/selenium, it seems to detect the text encoding more reliably. | ||||
|         # https://github.com/psf/requests/issues/1604 good info about requests encoding detection | ||||
|         if not is_binary: | ||||
|             # Don't run this for PDF (and requests identified as binary) takes a _long_ time | ||||
|             if not r.headers.get('content-type') or not 'charset=' in r.headers.get('content-type'): | ||||
|                 encoding = chardet.detect(r.content)['encoding'] | ||||
|                 if encoding: | ||||
|                     r.encoding = encoding | ||||
|  | ||||
|         if not r.content or not len(r.content): | ||||
|             raise EmptyReply(url=url, status_code=r.status_code) | ||||
|  | ||||
|         # @todo test this | ||||
|         # @todo maybe you really want to test zero-byte return pages? | ||||
|         if r.status_code != 200 and not ignore_status_codes: | ||||
|             # maybe check with content works? | ||||
|             raise Non200ErrorCodeReceived(url=url, status_code=r.status_code, page_html=r.text) | ||||
|  | ||||
|         self.status_code = r.status_code | ||||
|         if is_binary: | ||||
|             # Binary files just return their checksum until we add something smarter | ||||
|             self.content = hashlib.md5(r.content).hexdigest() | ||||
|         else: | ||||
|             self.content = r.text | ||||
|  | ||||
|         self.headers = r.headers | ||||
|         self.raw_content = r.content | ||||
|  | ||||
|  | ||||
| # Decide which is the 'real' HTML webdriver, this is more a system wide config | ||||
| # rather than site-specific. | ||||
| use_playwright_as_chrome_fetcher = os.getenv('PLAYWRIGHT_DRIVER_URL', False) | ||||
| if use_playwright_as_chrome_fetcher: | ||||
|     html_webdriver = base_html_playwright | ||||
| else: | ||||
|     html_webdriver = base_html_webdriver | ||||
							
								
								
									
										150
									
								
								changedetectionio/fetchers/__init__.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										150
									
								
								changedetectionio/fetchers/__init__.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,150 @@ | ||||
| from abc import abstractmethod | ||||
| import os | ||||
| from . import exceptions | ||||
|  | ||||
| visualselector_xpath_selectors = 'div,span,form,table,tbody,tr,td,a,p,ul,li,h1,h2,h3,h4, header, footer, section, article, aside, details, main, nav, section, summary' | ||||
|  | ||||
|  | ||||
| class Fetcher(): | ||||
|     browser_steps = None | ||||
|     browser_steps_screenshot_path = None | ||||
|     content = None | ||||
|     error = None | ||||
|     fetcher_description = "No description" | ||||
|     headers = None | ||||
|     status_code = None | ||||
|     webdriver_js_execute_code = None | ||||
|     xpath_data = None | ||||
|     xpath_element_js = "" | ||||
|     instock_data = None | ||||
|     instock_data_js = "" | ||||
|  | ||||
|     # Will be needed in the future by the VisualSelector, always get this where possible. | ||||
|     screenshot = False | ||||
|     system_http_proxy = os.getenv('HTTP_PROXY') | ||||
|     system_https_proxy = os.getenv('HTTPS_PROXY') | ||||
|  | ||||
|     # Time ONTOP of the system defined env minimum time | ||||
|     render_extract_delay = 0 | ||||
|  | ||||
|     def __init__(self): | ||||
|         from pkg_resources import resource_string | ||||
|         # The code that scrapes elements and makes a list of elements/size/position to click on in the VisualSelector | ||||
|         self.xpath_element_js = resource_string(__name__, "../res/xpath_element_scraper.js").decode('utf-8') | ||||
|         self.instock_data_js = resource_string(__name__, "../res/stock-not-in-stock.js").decode('utf-8') | ||||
|  | ||||
|  | ||||
|     @abstractmethod | ||||
|     def get_error(self): | ||||
|         return self.error | ||||
|  | ||||
|     @abstractmethod | ||||
|     def run(self, | ||||
|             url, | ||||
|             timeout, | ||||
|             request_headers, | ||||
|             request_body, | ||||
|             request_method, | ||||
|             ignore_status_codes=False, | ||||
|             current_include_filters=None, | ||||
|             is_binary=False): | ||||
|         # Should set self.error, self.status_code and self.content | ||||
|         pass | ||||
|  | ||||
|     @abstractmethod | ||||
|     def quit(self): | ||||
|         return | ||||
|  | ||||
|     @abstractmethod | ||||
|     def get_last_status_code(self): | ||||
|         return self.status_code | ||||
|  | ||||
|     @abstractmethod | ||||
|     def screenshot_step(self, step_n): | ||||
|         return None | ||||
|  | ||||
|     @abstractmethod | ||||
|     # Return true/false if this checker is ready to run, in the case it needs todo some special config check etc | ||||
|     def is_ready(self): | ||||
|         return True | ||||
|  | ||||
|     def iterate_browser_steps(self): | ||||
|         from changedetectionio.blueprint.browser_steps.browser_steps import steppable_browser_interface | ||||
|         from playwright._impl._api_types import TimeoutError | ||||
|         from jinja2 import Environment | ||||
|         jinja2_env = Environment(extensions=['jinja2_time.TimeExtension']) | ||||
|  | ||||
|         step_n = 0 | ||||
|  | ||||
|         if self.browser_steps is not None and len(self.browser_steps): | ||||
|             interface = steppable_browser_interface() | ||||
|             interface.page = self.page | ||||
|  | ||||
|             valid_steps = filter(lambda s: (s['operation'] and len(s['operation']) and s['operation'] != 'Choose one' and s['operation'] != 'Goto site'), self.browser_steps) | ||||
|  | ||||
|             for step in valid_steps: | ||||
|                 step_n += 1 | ||||
|                 print(">> Iterating check - browser Step n {} - {}...".format(step_n, step['operation'])) | ||||
|                 self.screenshot_step("before-"+str(step_n)) | ||||
|                 self.save_step_html("before-"+str(step_n)) | ||||
|                 try: | ||||
|                     optional_value = step['optional_value'] | ||||
|                     selector = step['selector'] | ||||
|                     # Support for jinja2 template in step values, with date module added | ||||
|                     if '{%' in step['optional_value'] or '{{' in step['optional_value']: | ||||
|                         optional_value = str(jinja2_env.from_string(step['optional_value']).render()) | ||||
|                     if '{%' in step['selector'] or '{{' in step['selector']: | ||||
|                         selector = str(jinja2_env.from_string(step['selector']).render()) | ||||
|  | ||||
|                     getattr(interface, "call_action")(action_name=step['operation'], | ||||
|                                                       selector=selector, | ||||
|                                                       optional_value=optional_value) | ||||
|                     self.screenshot_step(step_n) | ||||
|                     self.save_step_html(step_n) | ||||
|                 except TimeoutError: | ||||
|                     # Stop processing here | ||||
|                     raise exceptions.BrowserStepsStepTimout(step_n=step_n) | ||||
|  | ||||
|  | ||||
|  | ||||
|     # It's always good to reset these | ||||
|     def delete_browser_steps_screenshots(self): | ||||
|         import glob | ||||
|         if self.browser_steps_screenshot_path is not None: | ||||
|             dest = os.path.join(self.browser_steps_screenshot_path, 'step_*.jpeg') | ||||
|             files = glob.glob(dest) | ||||
|             for f in files: | ||||
|                 os.unlink(f) | ||||
|  | ||||
| #   Maybe for the future, each fetcher provides its own diff output, could be used for text, image | ||||
| #   the current one would return javascript output (as we use JS to generate the diff) | ||||
| # | ||||
|  | ||||
|  | ||||
| def available_fetchers(): | ||||
|     from . import playwright, html_requests, webdriver | ||||
|  | ||||
|     p = [] | ||||
|     p.append(tuple(['html_requests', html_requests.fetcher.fetcher_description])) | ||||
|  | ||||
|     # Prefer playwright | ||||
|     if os.getenv('PLAYWRIGHT_DRIVER_URL', False): | ||||
|         p.append(tuple(['html_webdriver', playwright.fetcher.fetcher_description])) | ||||
|  | ||||
|     elif os.getenv('WEBDRIVER_URL'): | ||||
|         p.append(tuple(['html_webdriver', webdriver.fetcher.fetcher_description])) | ||||
|  | ||||
|  | ||||
|     return p | ||||
|  | ||||
| html_webdriver = None | ||||
| # Decide which is the 'real' HTML webdriver, this is more a system wide config rather than site-specific. | ||||
| use_playwright_as_chrome_fetcher = os.getenv('PLAYWRIGHT_DRIVER_URL', False) | ||||
| if use_playwright_as_chrome_fetcher: | ||||
|     from . import playwright | ||||
|     html_webdriver = getattr(playwright, "fetcher") | ||||
|  | ||||
| else: | ||||
|     from . import webdriver | ||||
|     html_webdriver = getattr(webdriver, "fetcher") | ||||
|  | ||||
							
								
								
									
										71
									
								
								changedetectionio/fetchers/browserless.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										71
									
								
								changedetectionio/fetchers/browserless.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,71 @@ | ||||
| from . import Fetcher | ||||
| import os | ||||
| import requests | ||||
|  | ||||
|  | ||||
| # Exploit the debugging API to get screenshot and HTML without needing playwright | ||||
| # https://www.browserless.io/docs/scrape#debugging | ||||
|  | ||||
| class fetcher(Fetcher): | ||||
|     fetcher_description = "Browserless Chrome/Javascript via '{}'".format(os.getenv("BROWSERLESS_DRIVER_URL")) | ||||
|  | ||||
|     command_executor = '' | ||||
|     proxy = None | ||||
|  | ||||
|     def __init__(self, proxy_override=None, command_executor=None): | ||||
|         super().__init__() | ||||
|         self.proxy = proxy_override | ||||
|  | ||||
|     def run(self, | ||||
|             url, | ||||
|             timeout, | ||||
|             request_headers, | ||||
|             request_body, | ||||
|             request_method, | ||||
|             ignore_status_codes=False, | ||||
|             current_include_filters=None, | ||||
|             is_binary=False): | ||||
|  | ||||
|         proxy = "" | ||||
|         if self.proxy: | ||||
|             proxy = f"--proxy-server={self.proxy}" | ||||
|  | ||||
|         import json | ||||
|         r = requests.request(method='POST', | ||||
|                              data=json.dumps({ | ||||
|                                  "url": f"{url}?{proxy}", | ||||
|                                  "elements": [], | ||||
|                                  "debug": { | ||||
|                                      "screenshot": True, | ||||
|                                      "console": False, | ||||
|                                      "network": True, | ||||
|                                      "cookies": False, | ||||
|                                      "html": True | ||||
|                                  } | ||||
|                              }), | ||||
|                              url=os.getenv("BROWSERLESS_DRIVER_URL"), | ||||
|                              headers={'Content-Type': 'application/json'}, | ||||
|                              timeout=timeout, | ||||
|                              verify=False) | ||||
|  | ||||
|         # "waitFor": "() => document.querySelector('h1')" | ||||
|         #        extra_wait = int(os.getenv("WEBDRIVER_DELAY_BEFORE_CONTENT_READY", 5)) + self.render_extract_delay | ||||
|         #        self.page.wait_for_timeout(extra_wait * 1000) | ||||
|  | ||||
|         if r.status_code == 200: | ||||
|             # the basic request to browserless was OK, but how was the internal request to the site? | ||||
|             result = r.json() | ||||
|  | ||||
|             if result['debug']['network'].get('inbound') and len(result['debug']['network']['inbound']): | ||||
|                 self.status_code = result['debug']['network']['inbound'][000]['status'] | ||||
|  | ||||
|             self.content = result['debug']['html'] | ||||
|  | ||||
|             self.headers = {} | ||||
|             if result['debug'].get('screenshot'): | ||||
|                 import base64 | ||||
|                 self.screenshot = base64.b64decode(result['debug']['screenshot']) | ||||
|  | ||||
|     def is_ready(self): | ||||
|         # Try ping? | ||||
|         return os.getenv("BROWSERLESS_DRIVER_URL", False) | ||||
							
								
								
									
										66
									
								
								changedetectionio/fetchers/exceptions.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										66
									
								
								changedetectionio/fetchers/exceptions.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,66 @@ | ||||
| class Non200ErrorCodeReceived(Exception): | ||||
|     def __init__(self, status_code, url, screenshot=None, xpath_data=None, page_html=None): | ||||
|         # Set this so we can use it in other parts of the app | ||||
|         self.status_code = status_code | ||||
|         self.url = url | ||||
|         self.screenshot = screenshot | ||||
|         self.xpath_data = xpath_data | ||||
|         self.page_text = None | ||||
|  | ||||
|         if page_html: | ||||
|             from changedetectionio import html_tools | ||||
|             self.page_text = html_tools.html_to_text(page_html) | ||||
|         return | ||||
|  | ||||
| class checksumFromPreviousCheckWasTheSame(Exception): | ||||
|     def __init__(self): | ||||
|         return | ||||
|  | ||||
| class JSActionExceptions(Exception): | ||||
|     def __init__(self, status_code, url, screenshot, message=''): | ||||
|         self.status_code = status_code | ||||
|         self.url = url | ||||
|         self.screenshot = screenshot | ||||
|         self.message = message | ||||
|         return | ||||
|  | ||||
| class BrowserStepsStepTimout(Exception): | ||||
|     def __init__(self, step_n): | ||||
|         self.step_n = step_n | ||||
|         return | ||||
|  | ||||
|  | ||||
| class PageUnloadable(Exception): | ||||
|     def __init__(self, status_code, url, message, screenshot=False): | ||||
|         # Set this so we can use it in other parts of the app | ||||
|         self.status_code = status_code | ||||
|         self.url = url | ||||
|         self.screenshot = screenshot | ||||
|         self.message = message | ||||
|         return | ||||
|  | ||||
| class EmptyReply(Exception): | ||||
|     def __init__(self, status_code, url, screenshot=None): | ||||
|         # Set this so we can use it in other parts of the app | ||||
|         self.status_code = status_code | ||||
|         self.url = url | ||||
|         self.screenshot = screenshot | ||||
|         return | ||||
|  | ||||
| class ScreenshotUnavailable(Exception): | ||||
|     def __init__(self, status_code, url, page_html=None): | ||||
|         # Set this so we can use it in other parts of the app | ||||
|         self.status_code = status_code | ||||
|         self.url = url | ||||
|         if page_html: | ||||
|             from ..html_tools import html_to_text | ||||
|             self.page_text = html_to_text(page_html) | ||||
|         return | ||||
|  | ||||
| class ReplyWithContentButNoText(Exception): | ||||
|     def __init__(self, status_code, url, screenshot=None): | ||||
|         # Set this so we can use it in other parts of the app | ||||
|         self.status_code = status_code | ||||
|         self.url = url | ||||
|         self.screenshot = screenshot | ||||
|         return | ||||
							
								
								
									
										80
									
								
								changedetectionio/fetchers/html_requests.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										80
									
								
								changedetectionio/fetchers/html_requests.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,80 @@ | ||||
| from . import Fetcher | ||||
| from . import exceptions | ||||
|  | ||||
|  | ||||
| # "html_requests" is listed as the default fetcher in store.py! | ||||
| class fetcher(Fetcher): | ||||
|     fetcher_description = "Basic fast Plaintext/HTTP Client" | ||||
|  | ||||
|  | ||||
|     def __init__(self, proxy_override=None): | ||||
|         self.proxy_override = proxy_override | ||||
|  | ||||
|     def run(self, | ||||
|             url, | ||||
|             timeout, | ||||
|             request_headers, | ||||
|             request_body, | ||||
|             request_method, | ||||
|             ignore_status_codes=False, | ||||
|             current_include_filters=None, | ||||
|             is_binary=False): | ||||
|  | ||||
|         import chardet | ||||
|         import hashlib | ||||
|         import os | ||||
|         import requests | ||||
|  | ||||
|         # Make requests use a more modern looking user-agent | ||||
|         if not 'User-Agent' in request_headers: | ||||
|             request_headers['User-Agent'] = os.getenv("DEFAULT_SETTINGS_HEADERS_USERAGENT", | ||||
|                                                       'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.66 Safari/537.36') | ||||
|  | ||||
|         proxies = {} | ||||
|  | ||||
|         # Allows override the proxy on a per-request basis | ||||
|         if self.proxy_override: | ||||
|             proxies = {'http': self.proxy_override, 'https': self.proxy_override, 'ftp': self.proxy_override} | ||||
|         else: | ||||
|             if self.system_http_proxy: | ||||
|                 proxies['http'] = self.system_http_proxy | ||||
|             if self.system_https_proxy: | ||||
|                 proxies['https'] = self.system_https_proxy | ||||
|  | ||||
|         r = requests.request(method=request_method, | ||||
|                              data=request_body, | ||||
|                              url=url, | ||||
|                              headers=request_headers, | ||||
|                              timeout=timeout, | ||||
|                              proxies=proxies, | ||||
|                              verify=False) | ||||
|  | ||||
|         # If the response did not tell us what encoding format to expect, Then use chardet to override what `requests` thinks. | ||||
|         # For example - some sites don't tell us it's utf-8, but return utf-8 content | ||||
|         # This seems to not occur when using webdriver/selenium, it seems to detect the text encoding more reliably. | ||||
|         # https://github.com/psf/requests/issues/1604 good info about requests encoding detection | ||||
|         if not is_binary: | ||||
|             # Don't run this for PDF (and requests identified as binary) takes a _long_ time | ||||
|             if not r.headers.get('content-type') or not 'charset=' in r.headers.get('content-type'): | ||||
|                 encoding = chardet.detect(r.content)['encoding'] | ||||
|                 if encoding: | ||||
|                     r.encoding = encoding | ||||
|  | ||||
|         if not r.content or not len(r.content): | ||||
|             raise exceptions.EmptyReply(url=url, status_code=r.status_code) | ||||
|  | ||||
|         # @todo test this | ||||
|         # @todo maybe you really want to test zero-byte return pages? | ||||
|         if r.status_code != 200 and not ignore_status_codes: | ||||
|             # maybe check with content works? | ||||
|             raise exceptions.Non200ErrorCodeReceived(url=url, status_code=r.status_code, page_html=r.text) | ||||
|  | ||||
|         self.status_code = r.status_code | ||||
|         if is_binary: | ||||
|             # Binary files just return their checksum until we add something smarter | ||||
|             self.content = hashlib.md5(r.content).hexdigest() | ||||
|         else: | ||||
|             self.content = r.text | ||||
|  | ||||
|         self.headers = r.headers | ||||
|         self.raw_content = r.content | ||||
							
								
								
									
										208
									
								
								changedetectionio/fetchers/playwright.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										208
									
								
								changedetectionio/fetchers/playwright.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,208 @@ | ||||
| from . import Fetcher | ||||
| from . import exceptions | ||||
| from . import visualselector_xpath_selectors | ||||
|  | ||||
| import os | ||||
| import logging | ||||
| import time | ||||
|  | ||||
| class fetcher(Fetcher): | ||||
|     fetcher_description = "Playwright {}/Javascript".format( | ||||
|         os.getenv("PLAYWRIGHT_BROWSER_TYPE", 'chromium').capitalize() | ||||
|     ) | ||||
|     if os.getenv("PLAYWRIGHT_DRIVER_URL"): | ||||
|         fetcher_description += " via '{}'".format(os.getenv("PLAYWRIGHT_DRIVER_URL")) | ||||
|  | ||||
|     browser_type = '' | ||||
|     command_executor = '' | ||||
|  | ||||
|     # Configs for Proxy setup | ||||
|     # In the ENV vars, is prefixed with "playwright_proxy_", so it is for example "playwright_proxy_server" | ||||
|     playwright_proxy_settings_mappings = ['bypass', 'server', 'username', 'password'] | ||||
|  | ||||
|     proxy = None | ||||
|  | ||||
|     def __init__(self, proxy_override=None): | ||||
|         super().__init__() | ||||
|         import json | ||||
|  | ||||
|         # .strip('"') is going to save someone a lot of time when they accidently wrap the env value | ||||
|         self.browser_type = os.getenv("PLAYWRIGHT_BROWSER_TYPE", 'chromium').strip('"') | ||||
|         self.command_executor = os.getenv( | ||||
|             "PLAYWRIGHT_DRIVER_URL", | ||||
|             'ws://playwright-chrome:3000' | ||||
|         ).strip('"') | ||||
|  | ||||
|         # If any proxy settings are enabled, then we should setup the proxy object | ||||
|         proxy_args = {} | ||||
|         for k in self.playwright_proxy_settings_mappings: | ||||
|             v = os.getenv('playwright_proxy_' + k, False) | ||||
|             if v: | ||||
|                 proxy_args[k] = v.strip('"') | ||||
|  | ||||
|         if proxy_args: | ||||
|             self.proxy = proxy_args | ||||
|  | ||||
|         # allow per-watch proxy selection override | ||||
|         if proxy_override: | ||||
|             self.proxy = {'server': proxy_override} | ||||
|  | ||||
|         if self.proxy: | ||||
|             # Playwright needs separate username and password values | ||||
|             from urllib.parse import urlparse | ||||
|             parsed = urlparse(self.proxy.get('server')) | ||||
|             if parsed.username: | ||||
|                 self.proxy['username'] = parsed.username | ||||
|                 self.proxy['password'] = parsed.password | ||||
|  | ||||
|     def screenshot_step(self, step_n=''): | ||||
|         screenshot = self.page.screenshot(type='jpeg', full_page=True, quality=85) | ||||
|  | ||||
|         if self.browser_steps_screenshot_path is not None: | ||||
|             destination = os.path.join(self.browser_steps_screenshot_path, 'step_{}.jpeg'.format(step_n)) | ||||
|             logging.debug("Saving step screenshot to {}".format(destination)) | ||||
|             with open(destination, 'wb') as f: | ||||
|                 f.write(screenshot) | ||||
|  | ||||
|     def save_step_html(self, step_n): | ||||
|         content = self.page.content() | ||||
|         destination = os.path.join(self.browser_steps_screenshot_path, 'step_{}.html'.format(step_n)) | ||||
|         logging.debug("Saving step HTML to {}".format(destination)) | ||||
|         with open(destination, 'w') as f: | ||||
|             f.write(content) | ||||
|  | ||||
|     def run(self, | ||||
|             url, | ||||
|             timeout, | ||||
|             request_headers, | ||||
|             request_body, | ||||
|             request_method, | ||||
|             ignore_status_codes=False, | ||||
|             current_include_filters=None, | ||||
|             is_binary=False): | ||||
|  | ||||
|         from playwright.sync_api import sync_playwright | ||||
|         import playwright._impl._api_types | ||||
|         import json | ||||
|  | ||||
|         self.delete_browser_steps_screenshots() | ||||
|         response = None | ||||
|         with sync_playwright() as p: | ||||
|             browser_type = getattr(p, self.browser_type) | ||||
|  | ||||
|             # Seemed to cause a connection Exception even tho I can see it connect | ||||
|             # self.browser = browser_type.connect(self.command_executor, timeout=timeout*1000) | ||||
|             # 60,000 connection timeout only | ||||
|             browser = browser_type.connect_over_cdp(self.command_executor, timeout=60000) | ||||
|  | ||||
|             # Set user agent to prevent Cloudflare from blocking the browser | ||||
|             # Use the default one configured in the App.py model that's passed from fetch_site_status.py | ||||
|             context = browser.new_context( | ||||
|                 user_agent=request_headers['User-Agent'] if request_headers.get('User-Agent') else 'Mozilla/5.0', | ||||
|                 proxy=self.proxy, | ||||
|                 # This is needed to enable JavaScript execution on GitHub and others | ||||
|                 bypass_csp=True, | ||||
|                 # Should be `allow` or `block` - sites like YouTube can transmit large amounts of data via Service Workers | ||||
|                 service_workers=os.getenv('PLAYWRIGHT_SERVICE_WORKERS', 'allow'), | ||||
|                 # Should never be needed | ||||
|                 accept_downloads=False | ||||
|             ) | ||||
|  | ||||
|             self.page = context.new_page() | ||||
|             if len(request_headers): | ||||
|                 context.set_extra_http_headers(request_headers) | ||||
|  | ||||
|                 self.page.set_default_navigation_timeout(90000) | ||||
|                 self.page.set_default_timeout(90000) | ||||
|  | ||||
|                 # Listen for all console events and handle errors | ||||
|                 self.page.on("console", lambda msg: print(f"Playwright console: Watch URL: {url} {msg.type}: {msg.text} {msg.args}")) | ||||
|  | ||||
|             # Goto page | ||||
|             try: | ||||
|                 # Wait_until = commit | ||||
|                 # - `'commit'` - consider operation to be finished when network response is received and the document started loading. | ||||
|                 # Better to not use any smarts from Playwright and just wait an arbitrary number of seconds | ||||
|                 # This seemed to solve nearly all 'TimeoutErrors' | ||||
|                 response = self.page.goto(url, wait_until='commit') | ||||
|             except playwright._impl._api_types.Error as e: | ||||
|                 # Retry once - https://github.com/browserless/chrome/issues/2485 | ||||
|                 # Sometimes errors related to invalid cert's and other can be random | ||||
|                 print ("Content Fetcher > retrying request got error - ", str(e)) | ||||
|                 time.sleep(1) | ||||
|                 response = self.page.goto(url, wait_until='commit') | ||||
|  | ||||
|             except Exception as e: | ||||
|                 print ("Content Fetcher > Other exception when page.goto", str(e)) | ||||
|                 context.close() | ||||
|                 browser.close() | ||||
|                 raise exceptions.PageUnloadable(url=url, status_code=None, message=str(e)) | ||||
|  | ||||
|             # Execute any browser steps | ||||
|             try: | ||||
|                 extra_wait = int(os.getenv("WEBDRIVER_DELAY_BEFORE_CONTENT_READY", 5)) + self.render_extract_delay | ||||
|                 self.page.wait_for_timeout(extra_wait * 1000) | ||||
|  | ||||
|                 if self.webdriver_js_execute_code is not None and len(self.webdriver_js_execute_code): | ||||
|                     self.page.evaluate(self.webdriver_js_execute_code) | ||||
|  | ||||
|             except playwright._impl._api_types.TimeoutError as e: | ||||
|                 context.close() | ||||
|                 browser.close() | ||||
|                 # This can be ok, we will try to grab what we could retrieve | ||||
|                 pass | ||||
|             except Exception as e: | ||||
|                 print ("Content Fetcher > Other exception when executing custom JS code", str(e)) | ||||
|                 context.close() | ||||
|                 browser.close() | ||||
|                 raise exceptions.PageUnloadable(url=url, status_code=None, message=str(e)) | ||||
|  | ||||
|             if response is None: | ||||
|                 context.close() | ||||
|                 browser.close() | ||||
|                 print ("Content Fetcher > Response object was none") | ||||
|                 raise exceptions.EmptyReply(url=url, status_code=None) | ||||
|  | ||||
|             # Run Browser Steps here | ||||
|             self.iterate_browser_steps() | ||||
|  | ||||
|             extra_wait = int(os.getenv("WEBDRIVER_DELAY_BEFORE_CONTENT_READY", 5)) + self.render_extract_delay | ||||
|             time.sleep(extra_wait) | ||||
|  | ||||
|             self.content = self.page.content() | ||||
|             self.status_code = response.status | ||||
|             if len(self.page.content().strip()) == 0: | ||||
|                 context.close() | ||||
|                 browser.close() | ||||
|                 print ("Content Fetcher > Content was empty") | ||||
|                 raise exceptions.EmptyReply(url=url, status_code=response.status) | ||||
|  | ||||
|             self.status_code = response.status | ||||
|             self.headers = response.all_headers() | ||||
|  | ||||
|             # So we can find an element on the page where its selector was entered manually (maybe not xPath etc) | ||||
|             if current_include_filters is not None: | ||||
|                 self.page.evaluate("var include_filters={}".format(json.dumps(current_include_filters))) | ||||
|             else: | ||||
|                 self.page.evaluate("var include_filters=''") | ||||
|  | ||||
|             self.xpath_data = self.page.evaluate("async () => {" + self.xpath_element_js.replace('%ELEMENTS%', visualselector_xpath_selectors) + "}") | ||||
|             self.instock_data = self.page.evaluate("async () => {" + self.instock_data_js + "}") | ||||
|  | ||||
|             # Bug 3 in Playwright screenshot handling | ||||
|             # Some bug where it gives the wrong screenshot size, but making a request with the clip set first seems to solve it | ||||
|             # JPEG is better here because the screenshots can be very very large | ||||
|  | ||||
|             # Screenshots also travel via the ws:// (websocket) meaning that the binary data is base64 encoded | ||||
|             # which will significantly increase the IO size between the server and client, it's recommended to use the lowest | ||||
|             # acceptable screenshot quality here | ||||
|             try: | ||||
|                 # The actual screenshot | ||||
|                 self.screenshot = self.page.screenshot(type='jpeg', full_page=True, quality=int(os.getenv("PLAYWRIGHT_SCREENSHOT_QUALITY", 72))) | ||||
|             except Exception as e: | ||||
|                 context.close() | ||||
|                 browser.close() | ||||
|                 raise exceptions.ScreenshotUnavailable(url=url, status_code=None) | ||||
|  | ||||
|             context.close() | ||||
|             browser.close() | ||||
							
								
								
									
										103
									
								
								changedetectionio/fetchers/webdriver.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										103
									
								
								changedetectionio/fetchers/webdriver.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,103 @@ | ||||
| from . import Fetcher | ||||
| import os | ||||
| import time | ||||
|  | ||||
| class fetcher(Fetcher): | ||||
|     if os.getenv("WEBDRIVER_URL"): | ||||
|         fetcher_description = "WebDriver Chrome/Javascript via '{}'".format(os.getenv("WEBDRIVER_URL")) | ||||
|     else: | ||||
|         fetcher_description = "WebDriver Chrome/Javascript" | ||||
|  | ||||
|     command_executor = '' | ||||
|  | ||||
|     # Configs for Proxy setup | ||||
|     # In the ENV vars, is prefixed with "webdriver_", so it is for example "webdriver_sslProxy" | ||||
|     selenium_proxy_settings_mappings = ['proxyType', 'ftpProxy', 'httpProxy', 'noProxy', | ||||
|                                         'proxyAutoconfigUrl', 'sslProxy', 'autodetect', | ||||
|                                         'socksProxy', 'socksVersion', 'socksUsername', 'socksPassword'] | ||||
|     proxy = None | ||||
|  | ||||
|     def __init__(self, proxy_override=None, command_executor=None): | ||||
|         super().__init__() | ||||
|         from selenium.webdriver.common.proxy import Proxy as SeleniumProxy | ||||
|  | ||||
|         # .strip('"') is going to save someone a lot of time when they accidently wrap the env value | ||||
|         if command_executor: | ||||
|             self.command_executor = command_executor | ||||
|         else: | ||||
|             self.command_executor = os.getenv("WEBDRIVER_URL", 'http://browser-chrome:4444/wd/hub').strip('"') | ||||
|  | ||||
|         # If any proxy settings are enabled, then we should setup the proxy object | ||||
|         proxy_args = {} | ||||
|         for k in self.selenium_proxy_settings_mappings: | ||||
|             v = os.getenv('webdriver_' + k, False) | ||||
|             if v: | ||||
|                 proxy_args[k] = v.strip('"') | ||||
|  | ||||
|         # Map back standard HTTP_ and HTTPS_PROXY to webDriver httpProxy/sslProxy | ||||
|         if not proxy_args.get('webdriver_httpProxy') and self.system_http_proxy: | ||||
|             proxy_args['httpProxy'] = self.system_http_proxy | ||||
|         if not proxy_args.get('webdriver_sslProxy') and self.system_https_proxy: | ||||
|             proxy_args['httpsProxy'] = self.system_https_proxy | ||||
|  | ||||
|         # Allows override the proxy on a per-request basis | ||||
|         if proxy_override is not None: | ||||
|             proxy_args['httpProxy'] = proxy_override | ||||
|  | ||||
|         if proxy_args: | ||||
|             self.proxy = SeleniumProxy(raw=proxy_args) | ||||
|  | ||||
|     def run(self, | ||||
|             url, | ||||
|             timeout, | ||||
|             request_headers, | ||||
|             request_body, | ||||
|             request_method, | ||||
|             ignore_status_codes=False, | ||||
|             current_include_filters=None, | ||||
|             is_binary=False): | ||||
|  | ||||
|         from selenium import webdriver | ||||
|         from selenium.webdriver.common.desired_capabilities import DesiredCapabilities | ||||
|         from selenium.common.exceptions import WebDriverException | ||||
|         # request_body, request_method unused for now, until some magic in the future happens. | ||||
|  | ||||
|         # check env for WEBDRIVER_URL | ||||
|         self.driver = webdriver.Remote( | ||||
|             command_executor=self.command_executor, | ||||
|             desired_capabilities=DesiredCapabilities.CHROME, | ||||
|             proxy=self.proxy | ||||
|         ) | ||||
|  | ||||
|         try: | ||||
|             self.driver.get(url) | ||||
|         except WebDriverException as e: | ||||
|             # Be sure we close the session window | ||||
|             self.quit() | ||||
|             raise | ||||
|  | ||||
|         self.driver.set_window_size(1280, 1024) | ||||
|         self.driver.implicitly_wait(int(os.getenv("WEBDRIVER_DELAY_BEFORE_CONTENT_READY", 5))) | ||||
|  | ||||
|         if self.webdriver_js_execute_code is not None: | ||||
|             self.driver.execute_script(self.webdriver_js_execute_code) | ||||
|             # Selenium doesn't automatically wait for actions as good as Playwright, so wait again | ||||
|             self.driver.implicitly_wait(int(os.getenv("WEBDRIVER_DELAY_BEFORE_CONTENT_READY", 5))) | ||||
|  | ||||
|         # @todo - how to check this? is it possible? | ||||
|         self.status_code = 200 | ||||
|         # @todo somehow we should try to get this working for WebDriver | ||||
|         # raise EmptyReply(url=url, status_code=r.status_code) | ||||
|  | ||||
|         # @todo - dom wait loaded? | ||||
|         time.sleep(int(os.getenv("WEBDRIVER_DELAY_BEFORE_CONTENT_READY", 5)) + self.render_extract_delay) | ||||
|         self.content = self.driver.page_source | ||||
|         self.headers = {} | ||||
|  | ||||
|         self.screenshot = self.driver.get_screenshot_as_png() | ||||
|  | ||||
|     # Try something with requests? | ||||
|     def is_ready(self): | ||||
|         return True | ||||
|  | ||||
|  | ||||
| @@ -21,7 +21,6 @@ from wtforms.validators import ValidationError | ||||
| # each select <option data-enabled="enabled-0-0" | ||||
| from changedetectionio.blueprint.browser_steps.browser_steps import browser_step_ui_config | ||||
|  | ||||
| from changedetectionio import content_fetcher | ||||
| from changedetectionio.notification import ( | ||||
|     valid_notification_formats, | ||||
| ) | ||||
| @@ -135,30 +134,31 @@ class ValidateContentFetcherIsReady(object): | ||||
|  | ||||
|     def __call__(self, form, field): | ||||
|         import urllib3.exceptions | ||||
|         from changedetectionio import content_fetcher | ||||
|         import importlib | ||||
|  | ||||
|         # Better would be a radiohandler that keeps a reference to each class | ||||
|         if field.data is not None and field.data != 'system': | ||||
|             klass = getattr(content_fetcher, field.data) | ||||
|             some_object = klass() | ||||
|             try: | ||||
|                 ready = some_object.is_ready() | ||||
|             from . import fetchers | ||||
|             if fetchers.html_webdriver is not None: | ||||
|                 try: | ||||
|                     driver = fetchers.html_webdriver() | ||||
|                     driver.is_ready() | ||||
|  | ||||
|             except urllib3.exceptions.MaxRetryError as e: | ||||
|                 driver_url = some_object.command_executor | ||||
|                 message = field.gettext('Content fetcher \'%s\' did not respond.' % (field.data)) | ||||
|                 message += '<br>' + field.gettext( | ||||
|                     'Be sure that the selenium/webdriver runner is running and accessible via network from this container/host.') | ||||
|                 message += '<br>' + field.gettext('Did you follow the instructions in the wiki?') | ||||
|                 message += '<br><br>' + field.gettext('WebDriver Host: %s' % (driver_url)) | ||||
|                 message += '<br><a href="https://github.com/dgtlmoon/changedetection.io/wiki/Fetching-pages-with-WebDriver">Go here for more information</a>' | ||||
|                 message += '<br>'+field.gettext('Content fetcher did not respond properly, unable to use it.\n %s' % (str(e))) | ||||
|                 except urllib3.exceptions.MaxRetryError as e: | ||||
|                     driver_url = fetchers.html_webdriver.command_executor | ||||
|                     message = field.gettext('Content fetcher \'%s\' did not respond.' % (field.data)) | ||||
|                     message += '<br>' + field.gettext( | ||||
|                         'Be sure that the selenium/webdriver runner is running and accessible via network from this container/host.') | ||||
|                     message += '<br>' + field.gettext('Did you follow the instructions in the wiki?') | ||||
|                     message += '<br><br>' + field.gettext('WebDriver Host: %s' % (driver_url)) | ||||
|                     message += '<br><a href="https://github.com/dgtlmoon/changedetection.io/wiki/Fetching-pages-with-WebDriver">Go here for more information</a>' | ||||
|                     message += '<br>'+field.gettext('Content fetcher did not respond properly, unable to use it.\n %s' % (str(e))) | ||||
|  | ||||
|                 raise ValidationError(message) | ||||
|                     raise ValidationError(message) | ||||
|  | ||||
|             except Exception as e: | ||||
|                 message = field.gettext('Content fetcher \'%s\' did not respond properly, unable to use it.\n %s') | ||||
|                 raise ValidationError(message % (field.data, e)) | ||||
|                 except Exception as e: | ||||
|                     message = field.gettext('Content fetcher \'%s\' did not respond properly, unable to use it.\n %s') | ||||
|                     raise ValidationError(message % (field.data, e)) | ||||
|  | ||||
|  | ||||
| class ValidateNotificationBodyAndTitleWhenURLisSet(object): | ||||
| @@ -355,11 +355,12 @@ class quickWatchForm(Form): | ||||
|  | ||||
| # Common to a single watch and the global settings | ||||
| class commonSettingsForm(Form): | ||||
|     from .fetchers import available_fetchers | ||||
|     notification_urls = StringListField('Notification URL List', validators=[validators.Optional(), ValidateAppRiseServers()]) | ||||
|     notification_title = StringField('Notification Title', default='ChangeDetection.io Notification - {{ watch_url }}', validators=[validators.Optional(), ValidateJinja2Template()]) | ||||
|     notification_body = TextAreaField('Notification Body', default='{{ watch_url }} had a change.', validators=[validators.Optional(), ValidateJinja2Template()]) | ||||
|     notification_format = SelectField('Notification format', choices=valid_notification_formats.keys()) | ||||
|     fetch_backend = RadioField(u'Fetch Method', choices=content_fetcher.available_fetchers(), validators=[ValidateContentFetcherIsReady()]) | ||||
|     fetch_backend = RadioField(u'Fetch Method', choices=available_fetchers(), validators=[ValidateContentFetcherIsReady()]) | ||||
|     extract_title_as_title = BooleanField('Extract <title> from document and use as watch title', default=False) | ||||
|     webdriver_delay = IntegerField('Wait seconds before extracting text', validators=[validators.Optional(), validators.NumberRange(min=1, | ||||
|                                                                                                                                     message="Should contain one or more seconds")]) | ||||
| @@ -472,11 +473,11 @@ class globalSettingsRequestForm(Form): | ||||
|  | ||||
| # datastore.data['settings']['application'].. | ||||
| class globalSettingsApplicationForm(commonSettingsForm): | ||||
|  | ||||
|     from .fetchers import available_fetchers | ||||
|     api_access_token_enabled = BooleanField('API access token security check enabled', default=True, validators=[validators.Optional()]) | ||||
|     base_url = StringField('Base URL', validators=[validators.Optional()]) | ||||
|     empty_pages_are_a_change =  BooleanField('Treat empty pages as a change?', default=False) | ||||
|     fetch_backend = RadioField('Fetch Method', default="html_requests", choices=content_fetcher.available_fetchers(), validators=[ValidateContentFetcherIsReady()]) | ||||
|     fetch_backend = RadioField('Fetch Method', default="html_requests", choices=available_fetchers(), validators=[ValidateContentFetcherIsReady()]) | ||||
|     global_ignore_text = StringListField('Ignore Text', [ValidateListRegex()]) | ||||
|     global_subtractive_selectors = StringListField('Remove elements', [ValidateCSSJSONXPATHInput(allow_xpath=False, allow_json=False)]) | ||||
|     ignore_whitespace = BooleanField('Ignore whitespace') | ||||
|   | ||||
| @@ -4,8 +4,8 @@ import os | ||||
| import re | ||||
| import urllib3 | ||||
| from . import difference_detection_processor | ||||
| from changedetectionio import content_fetcher | ||||
| from copy import deepcopy | ||||
| from .. import fetchers | ||||
|  | ||||
| urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning) | ||||
|  | ||||
| @@ -61,11 +61,12 @@ class perform_site_check(difference_detection_processor): | ||||
|         if not prefer_backend or prefer_backend == 'system': | ||||
|             prefer_backend = self.datastore.data['settings']['application']['fetch_backend'] | ||||
|  | ||||
|         if hasattr(content_fetcher, prefer_backend): | ||||
|             klass = getattr(content_fetcher, prefer_backend) | ||||
|         if prefer_backend == 'html_webdriver': | ||||
|             preferred_fetcher = fetchers.html_webdriver | ||||
|         else: | ||||
|             # If the klass doesnt exist, just use a default | ||||
|             klass = getattr(content_fetcher, "html_requests") | ||||
|             from ..fetchers import html_requests | ||||
|             preferred_fetcher = html_requests | ||||
|  | ||||
|  | ||||
|         proxy_id = self.datastore.get_preferred_proxy_for_watch(uuid=uuid) | ||||
|         proxy_url = None | ||||
| @@ -73,7 +74,7 @@ class perform_site_check(difference_detection_processor): | ||||
|             proxy_url = self.datastore.proxy_list.get(proxy_id).get('url') | ||||
|             print("UUID {} Using proxy {}".format(uuid, proxy_url)) | ||||
|  | ||||
|         fetcher = klass(proxy_override=proxy_url) | ||||
|         fetcher = preferred_fetcher(proxy_override=proxy_url) | ||||
|  | ||||
|         # Configurable per-watch or global extra delay before extracting text (for webDriver types) | ||||
|         system_webdriver_delay = self.datastore.data['settings']['application'].get('webdriver_delay', None) | ||||
|   | ||||
| @@ -7,10 +7,11 @@ import os | ||||
| import re | ||||
| import urllib3 | ||||
|  | ||||
| from changedetectionio import content_fetcher, html_tools | ||||
| from changedetectionio import html_tools | ||||
| from changedetectionio.blueprint.price_data_follower import PRICE_DATA_TRACK_ACCEPT, PRICE_DATA_TRACK_REJECT | ||||
| from copy import deepcopy | ||||
| from . import difference_detection_processor | ||||
| from .. import fetchers | ||||
|  | ||||
| urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning) | ||||
|  | ||||
| @@ -101,11 +102,12 @@ class perform_site_check(difference_detection_processor): | ||||
|         if not prefer_backend or prefer_backend == 'system': | ||||
|             prefer_backend = self.datastore.data['settings']['application']['fetch_backend'] | ||||
|  | ||||
|         if hasattr(content_fetcher, prefer_backend): | ||||
|             klass = getattr(content_fetcher, prefer_backend) | ||||
|         if prefer_backend == 'html_webdriver': | ||||
|             preferred_fetcher = fetchers.html_webdriver | ||||
|         else: | ||||
|             # If the klass doesnt exist, just use a default | ||||
|             klass = getattr(content_fetcher, "html_requests") | ||||
|             from ..fetchers import html_requests | ||||
|             preferred_fetcher = html_requests | ||||
|  | ||||
|  | ||||
|         proxy_id = self.datastore.get_preferred_proxy_for_watch(uuid=uuid) | ||||
|         proxy_url = None | ||||
| @@ -113,7 +115,7 @@ class perform_site_check(difference_detection_processor): | ||||
|             proxy_url = self.datastore.proxy_list.get(proxy_id).get('url') | ||||
|             print("UUID {} Using proxy {}".format(uuid, proxy_url)) | ||||
|  | ||||
|         fetcher = klass(proxy_override=proxy_url) | ||||
|         fetcher = preferred_fetcher(proxy_override=proxy_url) | ||||
|  | ||||
|         # Configurable per-watch or global extra delay before extracting text (for webDriver types) | ||||
|         system_webdriver_delay = self.datastore.data['settings']['application'].get('webdriver_delay', None) | ||||
| @@ -147,7 +149,7 @@ class perform_site_check(difference_detection_processor): | ||||
|         update_obj['previous_md5_before_filters'] = hashlib.md5(fetcher.content.encode('utf-8')).hexdigest() | ||||
|         if skip_when_checksum_same: | ||||
|             if update_obj['previous_md5_before_filters'] == watch.get('previous_md5_before_filters'): | ||||
|                 raise content_fetcher.checksumFromPreviousCheckWasTheSame() | ||||
|                 raise fetchers.exceptions.checksumFromPreviousCheckWasTheSame() | ||||
|  | ||||
|  | ||||
|         # Fetching complete, now filters | ||||
| @@ -310,7 +312,7 @@ class perform_site_check(difference_detection_processor): | ||||
|         # Treat pages with no renderable text content as a change? No by default | ||||
|         empty_pages_are_a_change = self.datastore.data['settings']['application'].get('empty_pages_are_a_change', False) | ||||
|         if not is_json and not empty_pages_are_a_change and len(stripped_text_from_html.strip()) == 0: | ||||
|             raise content_fetcher.ReplyWithContentButNoText(url=url, status_code=fetcher.get_last_status_code(), screenshot=screenshot) | ||||
|             raise fetchers.exceptions.ReplyWithContentButNoText(url=url, status_code=fetcher.get_last_status_code(), screenshot=screenshot) | ||||
|  | ||||
|         # We rely on the actual text in the html output.. many sites have random script vars etc, | ||||
|         # in the future we'll implement other mechanisms. | ||||
|   | ||||
| @@ -3,9 +3,8 @@ import threading | ||||
| import queue | ||||
| import time | ||||
|  | ||||
| from changedetectionio import content_fetcher | ||||
| from .processors.text_json_diff import FilterNotFoundInResponse | ||||
|  | ||||
| from .fetchers import exceptions | ||||
|  | ||||
| # A single update worker | ||||
| # | ||||
| @@ -190,6 +189,7 @@ class update_worker(threading.Thread): | ||||
|                         processor = self.datastore.data['watching'][uuid].get('processor','text_json_diff') | ||||
|  | ||||
|                         # @todo some way to switch by name | ||||
|                         update_handler = None | ||||
|                         if processor == 'restock_diff': | ||||
|                             update_handler = restock_diff.perform_site_check(datastore=self.datastore) | ||||
|                         else: | ||||
| @@ -205,7 +205,7 @@ class update_worker(threading.Thread): | ||||
|                     except PermissionError as e: | ||||
|                         self.app.logger.error("File permission error updating", uuid, str(e)) | ||||
|                         process_changedetection_results = False | ||||
|                     except content_fetcher.ReplyWithContentButNoText as e: | ||||
|                     except exceptions.ReplyWithContentButNoText as e: | ||||
|                         # Totally fine, it's by choice - just continue on, nothing more to care about | ||||
|                         # Page had elements/content but no renderable text | ||||
|                         # Backend (not filters) gave zero output | ||||
| @@ -214,7 +214,7 @@ class update_worker(threading.Thread): | ||||
|                             self.datastore.save_screenshot(watch_uuid=uuid, screenshot=e.screenshot) | ||||
|                         process_changedetection_results = False | ||||
|  | ||||
|                     except content_fetcher.Non200ErrorCodeReceived as e: | ||||
|                     except exceptions.Non200ErrorCodeReceived as e: | ||||
|                         if e.status_code == 403: | ||||
|                             err_text = "Error - 403 (Access denied) received" | ||||
|                         elif e.status_code == 404: | ||||
| @@ -258,12 +258,12 @@ class update_worker(threading.Thread): | ||||
|  | ||||
|                         process_changedetection_results = False | ||||
|  | ||||
|                     except content_fetcher.checksumFromPreviousCheckWasTheSame as e: | ||||
|                     except exceptions.checksumFromPreviousCheckWasTheSame as e: | ||||
|                         # Yes fine, so nothing todo, don't continue to process. | ||||
|                         process_changedetection_results = False | ||||
|                         changed_detected = False | ||||
|  | ||||
|                     except content_fetcher.BrowserStepsStepTimout as e: | ||||
|                     except exceptions.BrowserStepsStepTimout as e: | ||||
|  | ||||
|                         if not self.datastore.data['watching'].get(uuid): | ||||
|                             continue | ||||
| @@ -288,25 +288,25 @@ class update_worker(threading.Thread): | ||||
|  | ||||
|                         process_changedetection_results = False | ||||
|  | ||||
|                     except content_fetcher.EmptyReply as e: | ||||
|                     except exceptions.EmptyReply as e: | ||||
|                         # Some kind of custom to-str handler in the exception handler that does this? | ||||
|                         err_text = "EmptyReply - try increasing 'Wait seconds before extracting text', Status Code {}".format(e.status_code) | ||||
|                         self.datastore.update_watch(uuid=uuid, update_obj={'last_error': err_text, | ||||
|                                                                            'last_check_status': e.status_code}) | ||||
|                         process_changedetection_results = False | ||||
|                     except content_fetcher.ScreenshotUnavailable as e: | ||||
|                     except exceptions.ScreenshotUnavailable as e: | ||||
|                         err_text = "Screenshot unavailable, page did not render fully in the expected time - try increasing 'Wait seconds before extracting text'" | ||||
|                         self.datastore.update_watch(uuid=uuid, update_obj={'last_error': err_text, | ||||
|                                                                            'last_check_status': e.status_code}) | ||||
|                         process_changedetection_results = False | ||||
|                     except content_fetcher.JSActionExceptions as e: | ||||
|                     except exceptions.JSActionExceptions as e: | ||||
|                         err_text = "Error running JS Actions - Page request - "+e.message | ||||
|                         if e.screenshot: | ||||
|                             self.datastore.save_screenshot(watch_uuid=uuid, screenshot=e.screenshot, as_error=True) | ||||
|                         self.datastore.update_watch(uuid=uuid, update_obj={'last_error': err_text, | ||||
|                                                                            'last_check_status': e.status_code}) | ||||
|                         process_changedetection_results = False | ||||
|                     except content_fetcher.PageUnloadable as e: | ||||
|                     except exceptions.PageUnloadable as e: | ||||
|                         err_text = "Page request from server didnt respond correctly" | ||||
|                         if e.message: | ||||
|                             err_text = "{} - {}".format(err_text, e.message) | ||||
|   | ||||
		Reference in New Issue
	
	Block a user