Mirror of https://github.com/dgtlmoon/changedetection.io.git (synced 2025-10-31 06:37:41 +00:00)

Compare commits: 14 commits, notificati...abstract-o
Commits in this comparison (SHA1):

- dfe26cb486
- ac89cd56cc
- 2b215443d7
- eb2560e77d
- ab943c0c51
- 9d99f4cb59
- 0bcbcb80f1
- b6bdc2738b
- ebc7a7e568
- d7bc2bd3f6
- 2bd32b261a
- 572a169a47
- 68d1e2736c
- 97e591fa24
```diff
@@ -40,8 +40,8 @@ def construct_blueprint(datastore: ChangeDetectionStore):
         contents = ''
         now = time.time()
         try:
-            update_handler = text_json_diff.perform_site_check(datastore=datastore)
-            changed_detected, update_obj, contents = update_handler.run(uuid, preferred_proxy=preferred_proxy, skip_when_checksum_same=False)
+            update_handler = text_json_diff.perform_site_check(datastore=datastore, watch_uuid=uuid)
+            update_handler.call_browser()
         # title, size is len contents not len xfer
         except content_fetcher.Non200ErrorCodeReceived as e:
             if e.status_code == 404:
```
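Note what disappears from this call site: `run()` used to take a `preferred_proxy` argument, while `call_browser()` takes none — proxy selection now happens inside the processor, via `get_preferred_proxy_for_watch()` as shown later in this diff. A hedged sketch of the new call shape (it assumes `run_changedetection()` is invoked somewhere below this hunk, as the worker does later in this diff):

```python
# Sketch of the new two-step call; datastore and uuid come from the blueprint context.
update_handler = text_json_diff.perform_site_check(datastore=datastore, watch_uuid=uuid)
update_handler.call_browser()  # backend, proxy and headers are resolved internally now
changed_detected, update_obj, contents = update_handler.run_changedetection(uuid, skip_when_checksum_same=False)
```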
```diff
@@ -419,11 +419,7 @@ class base_html_playwright(Fetcher):
             is_binary=False):
 
         # For now, USE_EXPERIMENTAL_PUPPETEER_FETCH is not supported by watches with BrowserSteps (for now!)
-        has_browser_steps = self.browser_steps and list(filter(
-                lambda s: (s['operation'] and len(s['operation']) and s['operation'] != 'Choose one' and s['operation'] != 'Goto site'),
-                self.browser_steps))
-
-        if not has_browser_steps and os.getenv('USE_EXPERIMENTAL_PUPPETEER_FETCH'):
+        if not self.browser_steps and os.getenv('USE_EXPERIMENTAL_PUPPETEER_FETCH'):
             if strtobool(os.getenv('USE_EXPERIMENTAL_PUPPETEER_FETCH')):
                 # Temporary backup solution until we rewrite the playwright code
                 return self.run_fetch_browserless_puppeteer(

@@ -671,6 +667,7 @@ class html_requests(Fetcher):
     fetcher_description = "Basic fast Plaintext/HTTP Client"
 
     def __init__(self, proxy_override=None):
+        super().__init__()
         self.proxy_override = proxy_override
 
     def run(self,
```
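The added `super().__init__()` matters because `html_requests` overrides `__init__`: without the call, whatever attribute setup the base `Fetcher` constructor performs is silently skipped. A minimal illustration — the base-class attributes here are assumptions for the sketch, not the real `Fetcher` internals:

```python
class Fetcher:
    def __init__(self):
        # Assumption for illustration: the base class initialises shared state here.
        self.browser_steps = None
        self.headers = {}

class html_requests(Fetcher):
    def __init__(self, proxy_override=None):
        super().__init__()  # without this line, browser_steps/headers never get set
        self.proxy_override = proxy_override

f = html_requests()
print(f.browser_steps)  # None, instead of raising AttributeError
```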
```diff
@@ -19,6 +19,7 @@ from changedetectionio.notification import (
 
 base_config = {
     'body': None,
+    'browser_steps': [],
     'browser_steps_last_error_step': None,
     'check_unique_lines': False,  # On change-detected, compare against all history if its something new
     'check_count': 0,

@@ -145,8 +146,14 @@ class model(dict):
                 flash(message, 'error')
                 return ''
 
+        if ready_url.startswith('source:'):
+            ready_url=ready_url.replace('source:', '')
         return ready_url
 
+    @property
+    def is_source_type_url(self):
+        return self.get('url', '').startswith('source:')
+
     @property
     def get_fetch_backend(self):
         """

@@ -234,6 +241,14 @@ class model(dict):
         fname = os.path.join(self.watch_data_dir, "history.txt")
         return os.path.isfile(fname)
 
+    @property
+    def has_browser_steps(self):
+        has_browser_steps = self.get('browser_steps') and list(filter(
+                lambda s: (s['operation'] and len(s['operation']) and s['operation'] != 'Choose one' and s['operation'] != 'Goto site'),
+                self.get('browser_steps')))
+
+        return  has_browser_steps
+
     # Returns the newest key, but if theres only 1 record, then it's counted as not being new, so return 0.
     @property
     def newest_history_key(self):
```
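The new `has_browser_steps` property deliberately treats placeholder operations ('Choose one', the implicit 'Goto site') as no steps at all, so a watch with only unconfigured rows doesn't trigger the browser-steps code path. A hedged illustration of the same filter as a standalone function — step keys beyond 'operation' are assumptions, and the placeholder names come from the diff:

```python
def meaningful_steps(browser_steps):
    # Same predicate as the has_browser_steps property above.
    return browser_steps and list(filter(
        lambda s: (s['operation'] and len(s['operation'])
                   and s['operation'] != 'Choose one'
                   and s['operation'] != 'Goto site'),
        browser_steps))

steps = [
    {'operation': 'Goto site'},   # implicit first step, ignored
    {'operation': 'Choose one'},  # unconfigured placeholder, ignored
    {'operation': ''},            # empty, ignored
]
assert not meaningful_steps(steps)

steps.append({'operation': 'Click element', 'selector': 'button.buy'})  # hypothetical step
assert meaningful_steps(steps)
```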
```diff
@@ -1,15 +1,108 @@
 from abc import abstractmethod
+import os
 import hashlib
+
+import re
+from changedetectionio import content_fetcher
+from copy import deepcopy
+from distutils.util import strtobool
 
 class difference_detection_processor():
 
     datastore = None
     fetcher = None
     screenshot = None
     xpath_data = None
+    browser_steps = None
 
-    def __init__(self, *args, **kwargs):
+    def __init__(self, *args, datastore, watch_uuid, **kwargs):
         super().__init__(*args, **kwargs)
+        self.datastore = datastore
+        self.watch = deepcopy(self.datastore.data['watching'].get(watch_uuid))
+
+    def call_browser(self):
+
+        # Protect against file:// access
+        if re.search(r'^file://', self.watch.get('url', '').strip(), re.IGNORECASE):
+            if not strtobool(os.getenv('ALLOW_FILE_URI', 'false')):
+                raise Exception(
+                    "file:// type access is denied for security reasons."
+                )
+
+        url = self.watch.link
+
+        # Requests, playwright, other browser via wss:// etc, fetch_extra_something
+        prefer_fetch_backend = self.watch.get('fetch_backend', 'system')
+
+        # Proxy ID "key"
+        preferred_proxy_id = self.datastore.get_preferred_proxy_for_watch(uuid=self.watch.get('uuid'))
+
+        # Pluggable content self.fetcher
+        if not prefer_fetch_backend or prefer_fetch_backend == 'system':
+            prefer_fetch_backend = self.datastore.data['settings']['application'].get('fetch_backend')
+
+        # Grab the right kind of 'fetcher', (playwright, requests, etc)
+        if hasattr(content_fetcher, prefer_fetch_backend):
+            fetcher_obj = getattr(content_fetcher, prefer_fetch_backend)
+        else:
+            # If the klass doesnt exist, just use a default
+            fetcher_obj = getattr(content_fetcher, "html_requests")
+
+
+        proxy_url = None
+        if preferred_proxy_id:
+            proxy_url = self.datastore.proxy_list.get(preferred_proxy_id).get('url')
+            print(f"Using proxy Key: {preferred_proxy_id} as Proxy URL {proxy_url}")
+
+        # Now call the fetcher (playwright/requests/etc) with arguments that only a fetcher would need.
+        self.fetcher = fetcher_obj(proxy_override=proxy_url,
+                                   #browser_url_extra/configurable browser url=...
+                                   )
+
+        if self.watch.has_browser_steps:
+            self.fetcher.browser_steps = self.watch.get('browser_steps', [])
+            self.fetcher.browser_steps_screenshot_path = os.path.join(self.datastore.datastore_path, self.watch.get('uuid'))
+
+        # Tweak the base config with the per-watch ones
+        request_headers = self.watch.get('headers', [])
+        request_headers.update(self.datastore.get_all_base_headers())
+        request_headers.update(self.datastore.get_all_headers_in_textfile_for_watch(uuid=self.watch.get('uuid')))
+
+        # https://github.com/psf/requests/issues/4525
+        # Requests doesnt yet support brotli encoding, so don't put 'br' here, be totally sure that the user cannot
+        # do this by accident.
+        if 'Accept-Encoding' in request_headers and "br" in request_headers['Accept-Encoding']:
+            request_headers['Accept-Encoding'] = request_headers['Accept-Encoding'].replace(', br', '')
+
+        timeout = self.datastore.data['settings']['requests'].get('timeout')
+
+        request_body = self.watch.get('body')
+        request_method = self.watch.get('method')
+        ignore_status_codes = self.watch.get('ignore_status_codes', False)
+
+        # Configurable per-watch or global extra delay before extracting text (for webDriver types)
+        system_webdriver_delay = self.datastore.data['settings']['application'].get('webdriver_delay', None)
+        if self.watch.get('webdriver_delay'):
+            self.fetcher.render_extract_delay = self.watch.get('webdriver_delay')
+        elif system_webdriver_delay is not None:
+            self.fetcher.render_extract_delay = system_webdriver_delay
+
+        if self.watch.get('webdriver_js_execute_code') is not None and self.watch.get('webdriver_js_execute_code').strip():
+            self.fetcher.webdriver_js_execute_code = self.watch.get('webdriver_js_execute_code')
+
+        # Requests for PDF's, images etc should be passed the is_binary flag
+        is_binary = self.watch.is_pdf
+
+        # And here we go! call the right browser with browser-specific settings
+        self.fetcher.run(url, timeout, request_headers, request_body, request_method, ignore_status_codes, self.watch.get('include_filters'),
+                    is_binary=is_binary)
+
+        #@todo .quit here could go on close object, so we can run JS if change-detected
+        self.fetcher.quit()
+
+        # After init, call run_changedetection() which will do the actual change-detection
 
     @abstractmethod
-    def run(self, uuid, skip_when_checksum_same=True, preferred_proxy=None):
+    def run_changedetection(self, uuid, skip_when_checksum_same=True):
         update_obj = {'last_notification_error': False, 'last_error': False}
         some_data = 'xxxxx'
         update_obj["previous_md5"] = hashlib.md5(some_data.encode('utf-8')).hexdigest()
```
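Beyond the restructuring, the `file://` guard is quietly hardened here. The old check, `not os.getenv('ALLOW_FILE_URI', False)`, treated any non-empty string — including the string `"false"` — as permission to proceed, while the new `strtobool(os.getenv('ALLOW_FILE_URI', 'false'))` actually parses the value (and the regex tightens from `^file` to `^file://` on a stripped URL). A minimal demonstration of the difference:

```python
import os
from distutils.util import strtobool

os.environ['ALLOW_FILE_URI'] = 'false'

# Old logic: a set-but-false env var slipped through the guard.
old_allows = bool(os.getenv('ALLOW_FILE_URI', False))               # True (non-empty string)

# New logic: the string is parsed as a boolean.
new_allows = bool(strtobool(os.getenv('ALLOW_FILE_URI', 'false')))  # False

print(old_allows, new_allows)  # True False
```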
```diff
@@ -1,10 +1,7 @@
 
 import hashlib
-import os
-import re
 import urllib3
 from . import difference_detection_processor
-from changedetectionio import content_fetcher
 from copy import deepcopy
 
 urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

@@ -22,11 +19,7 @@ class perform_site_check(difference_detection_processor):
     screenshot = None
     xpath_data = None
 
-    def __init__(self, *args, datastore, **kwargs):
-        super().__init__(*args, **kwargs)
-        self.datastore = datastore
-
-    def run(self, uuid, skip_when_checksum_same=True):
+    def run_changedetection(self, uuid, skip_when_checksum_same=True):
 
         # DeepCopy so we can be sure we don't accidently change anything by reference
         watch = deepcopy(self.datastore.data['watching'].get(uuid))

@@ -34,84 +27,24 @@ class perform_site_check(difference_detection_processor):
         if not watch:
             raise Exception("Watch no longer exists.")
 
-        # Protect against file:// access
-        if re.search(r'^file', watch.get('url', ''), re.IGNORECASE) and not os.getenv('ALLOW_FILE_URI', False):
-            raise Exception(
-                "file:// type access is denied for security reasons."
-            )
-
         # Unset any existing notification error
         update_obj = {'last_notification_error': False, 'last_error': False}
 
-        request_headers = watch.get('headers', [])
-        request_headers.update(self.datastore.get_all_base_headers())
-        request_headers.update(self.datastore.get_all_headers_in_textfile_for_watch(uuid=uuid))
-
-        # https://github.com/psf/requests/issues/4525
-        # Requests doesnt yet support brotli encoding, so don't put 'br' here, be totally sure that the user cannot
-        # do this by accident.
-        if 'Accept-Encoding' in request_headers and "br" in request_headers['Accept-Encoding']:
-            request_headers['Accept-Encoding'] = request_headers['Accept-Encoding'].replace(', br', '')
-
-        timeout = self.datastore.data['settings']['requests'].get('timeout')
-
-        url = watch.link
-
-        request_body = self.datastore.data['watching'][uuid].get('body')
-        request_method = self.datastore.data['watching'][uuid].get('method')
-        ignore_status_codes = self.datastore.data['watching'][uuid].get('ignore_status_codes', False)
-
-        # Pluggable content fetcher
-        prefer_backend = watch.get_fetch_backend
-        if not prefer_backend or prefer_backend == 'system':
-            prefer_backend = self.datastore.data['settings']['application']['fetch_backend']
-
-        if hasattr(content_fetcher, prefer_backend):
-            klass = getattr(content_fetcher, prefer_backend)
-        else:
-            # If the klass doesnt exist, just use a default
-            klass = getattr(content_fetcher, "html_requests")
-
-        proxy_id = self.datastore.get_preferred_proxy_for_watch(uuid=uuid)
-        proxy_url = None
-        if proxy_id:
-            proxy_url = self.datastore.proxy_list.get(proxy_id).get('url')
-            print("UUID {} Using proxy {}".format(uuid, proxy_url))
-
-        fetcher = klass(proxy_override=proxy_url)
-
-        # Configurable per-watch or global extra delay before extracting text (for webDriver types)
-        system_webdriver_delay = self.datastore.data['settings']['application'].get('webdriver_delay', None)
-        if watch['webdriver_delay'] is not None:
-            fetcher.render_extract_delay = watch.get('webdriver_delay')
-        elif system_webdriver_delay is not None:
-            fetcher.render_extract_delay = system_webdriver_delay
-
-        # Could be removed if requests/plaintext could also return some info?
-        if prefer_backend != 'html_webdriver':
-            raise Exception("Re-stock detection requires Chrome or compatible webdriver/playwright fetcher to work")
-
-        if watch.get('webdriver_js_execute_code') is not None and watch.get('webdriver_js_execute_code').strip():
-            fetcher.webdriver_js_execute_code = watch.get('webdriver_js_execute_code')
-
-        fetcher.run(url, timeout, request_headers, request_body, request_method, ignore_status_codes, watch.get('include_filters'))
-        fetcher.quit()
-
-        self.screenshot = fetcher.screenshot
-        self.xpath_data = fetcher.xpath_data
+        self.screenshot = self.fetcher.screenshot
+        self.xpath_data = self.fetcher.xpath_data
 
         # Track the content type
-        update_obj['content_type'] = fetcher.headers.get('Content-Type', '')
-        update_obj["last_check_status"] = fetcher.get_last_status_code()
+        update_obj['content_type'] = self.fetcher.headers.get('Content-Type', '')
+        update_obj["last_check_status"] = self.fetcher.get_last_status_code()
 
         # Main detection method
         fetched_md5 = None
-        if fetcher.instock_data:
-            fetched_md5 = hashlib.md5(fetcher.instock_data.encode('utf-8')).hexdigest()
+        if self.fetcher.instock_data:
+            fetched_md5 = hashlib.md5(self.fetcher.instock_data.encode('utf-8')).hexdigest()
             # 'Possibly in stock' comes from stock-not-in-stock.js when no string found above the fold.
-            update_obj["in_stock"] = True if fetcher.instock_data == 'Possibly in stock' else False
+            update_obj["in_stock"] = True if self.fetcher.instock_data == 'Possibly in stock' else False
         else:
-            raise UnableToExtractRestockData(status_code=fetcher.status_code)
+            raise UnableToExtractRestockData(status_code=self.fetcher.status_code)
 
         # The main thing that all this at the moment comes down to :)
         changed_detected = False

@@ -128,4 +61,4 @@ class perform_site_check(difference_detection_processor):
         # Always record the new checksum
         update_obj["previous_md5"] = fetched_md5
 
-        return changed_detected, update_obj, fetcher.instock_data.encode('utf-8')
+        return changed_detected, update_obj, self.fetcher.instock_data.encode('utf-8')
```
```diff
@@ -1,4 +1,4 @@
-# HTML to TEXT/JSON DIFFERENCE FETCHER
+# HTML to TEXT/JSON DIFFERENCE self.fetcher
 
 import hashlib
 import json

@@ -32,15 +32,10 @@ class PDFToHTMLToolNotFound(ValueError):
 # Some common stuff here that can be moved to a base class
 # (set_proxy_from_list)
 class perform_site_check(difference_detection_processor):
-    screenshot = None
-    xpath_data = None
 
-    def __init__(self, *args, datastore, **kwargs):
-        super().__init__(*args, **kwargs)
-        self.datastore = datastore
 
-    def run(self, uuid, skip_when_checksum_same=True, preferred_proxy=None):
+    def run_changedetection(self, uuid, skip_when_checksum_same=True):
         changed_detected = False
         html_content = ""
         screenshot = False  # as bytes
         stripped_text_from_html = ""

@@ -49,100 +44,25 @@ class perform_site_check(difference_detection_processor):
         if not watch:
             raise Exception("Watch no longer exists.")
 
-        # Protect against file:// access
-        if re.search(r'^file', watch.get('url', ''), re.IGNORECASE) and not os.getenv('ALLOW_FILE_URI', False):
-            raise Exception(
-                "file:// type access is denied for security reasons."
-            )
 
         # Unset any existing notification error
         update_obj = {'last_notification_error': False, 'last_error': False}
 
-        # Tweak the base config with the per-watch ones
-        request_headers = watch.get('headers', [])
-        request_headers.update(self.datastore.get_all_base_headers())
-        request_headers.update(self.datastore.get_all_headers_in_textfile_for_watch(uuid=uuid))
-
-        # https://github.com/psf/requests/issues/4525
-        # Requests doesnt yet support brotli encoding, so don't put 'br' here, be totally sure that the user cannot
-        # do this by accident.
-        if 'Accept-Encoding' in request_headers and "br" in request_headers['Accept-Encoding']:
-            request_headers['Accept-Encoding'] = request_headers['Accept-Encoding'].replace(', br', '')
-
-        timeout = self.datastore.data['settings']['requests'].get('timeout')
-
-        url = watch.link
-
-        request_body = self.datastore.data['watching'][uuid].get('body')
-        request_method = self.datastore.data['watching'][uuid].get('method')
-        ignore_status_codes = self.datastore.data['watching'][uuid].get('ignore_status_codes', False)
-
-        # source: support
-        is_source = False
-        if url.startswith('source:'):
-            url = url.replace('source:', '')
-            is_source = True
-
-        # Pluggable content fetcher
-        prefer_backend = watch.get_fetch_backend
-        if not prefer_backend or prefer_backend == 'system':
-            prefer_backend = self.datastore.data['settings']['application']['fetch_backend']
-
-        if hasattr(content_fetcher, prefer_backend):
-            klass = getattr(content_fetcher, prefer_backend)
-        else:
-            # If the klass doesnt exist, just use a default
-            klass = getattr(content_fetcher, "html_requests")
-
-        if preferred_proxy:
-            proxy_id = preferred_proxy
-        else:
-            proxy_id = self.datastore.get_preferred_proxy_for_watch(uuid=uuid)
-
-        proxy_url = None
-        if proxy_id:
-            proxy_url = self.datastore.proxy_list.get(proxy_id).get('url')
-            print("UUID {} Using proxy {}".format(uuid, proxy_url))
-
-        fetcher = klass(proxy_override=proxy_url)
-
-        # Configurable per-watch or global extra delay before extracting text (for webDriver types)
-        system_webdriver_delay = self.datastore.data['settings']['application'].get('webdriver_delay', None)
-        if watch['webdriver_delay'] is not None:
-            fetcher.render_extract_delay = watch.get('webdriver_delay')
-        elif system_webdriver_delay is not None:
-            fetcher.render_extract_delay = system_webdriver_delay
-
-        # Possible conflict
-        if prefer_backend == 'html_webdriver':
-            fetcher.browser_steps = watch.get('browser_steps', None)
-            fetcher.browser_steps_screenshot_path = os.path.join(self.datastore.datastore_path, uuid)
-
-        if watch.get('webdriver_js_execute_code') is not None and watch.get('webdriver_js_execute_code').strip():
-            fetcher.webdriver_js_execute_code = watch.get('webdriver_js_execute_code')
-
-        # requests for PDF's, images etc should be passwd the is_binary flag
-        is_binary = watch.is_pdf
-
-        fetcher.run(url, timeout, request_headers, request_body, request_method, ignore_status_codes, watch.get('include_filters'),
-                    is_binary=is_binary)
-        fetcher.quit()
-
-        self.screenshot = fetcher.screenshot
-        self.xpath_data = fetcher.xpath_data
+        self.screenshot = self.fetcher.screenshot
+        self.xpath_data = self.fetcher.xpath_data
 
         # Track the content type
-        update_obj['content_type'] = fetcher.get_all_headers().get('content-type', '').lower()
+        update_obj['content_type'] = self.fetcher.get_all_headers().get('content-type', '').lower()
 
         # Watches added automatically in the queue manager will skip if its the same checksum as the previous run
         # Saves a lot of CPU
-        update_obj['previous_md5_before_filters'] = hashlib.md5(fetcher.content.encode('utf-8')).hexdigest()
+        update_obj['previous_md5_before_filters'] = hashlib.md5(self.fetcher.content.encode('utf-8')).hexdigest()
         if skip_when_checksum_same:
             if update_obj['previous_md5_before_filters'] == watch.get('previous_md5_before_filters'):
                 raise content_fetcher.checksumFromPreviousCheckWasTheSame()
 
         # Fetching complete, now filters
         # @todo move to class / maybe inside of fetcher abstract base?
 
         # @note: I feel like the following should be in a more obvious chain system
         #  - Check filter text

@@ -151,24 +71,24 @@ class perform_site_check(difference_detection_processor):
         # https://stackoverflow.com/questions/41817578/basic-method-chaining ?
         # return content().textfilter().jsonextract().checksumcompare() ?
 
-        is_json = 'application/json' in fetcher.get_all_headers().get('content-type', '').lower()
+        is_json = 'application/json' in self.fetcher.get_all_headers().get('content-type', '').lower()
         is_html = not is_json
         is_rss = False
 
-        ctype_header = fetcher.get_all_headers().get('content-type', '').lower()
+        ctype_header = self.fetcher.get_all_headers().get('content-type', '').lower()
         # Go into RSS preprocess for converting CDATA/comment to usable text
         if any(substring in ctype_header for substring in ['application/xml', 'application/rss', 'text/xml']):
-            if '<rss' in fetcher.content[:100].lower():
-                fetcher.content = cdata_in_document_to_text(html_content=fetcher.content)
+            if '<rss' in self.fetcher.content[:100].lower():
+                self.fetcher.content = cdata_in_document_to_text(html_content=self.fetcher.content)
                 is_rss = True
 
         # source: support, basically treat it as plaintext
-        if is_source:
+        if watch.is_source_type_url:
             is_html = False
             is_json = False
 
-        inline_pdf = fetcher.get_all_headers().get('content-disposition', '') and '%PDF-1' in fetcher.content[:10]
-        if watch.is_pdf or 'application/pdf' in fetcher.get_all_headers().get('content-type', '').lower() or inline_pdf:
+        inline_pdf = self.fetcher.get_all_headers().get('content-disposition', '') and '%PDF-1' in self.fetcher.content[:10]
+        if watch.is_pdf or 'application/pdf' in self.fetcher.get_all_headers().get('content-type', '').lower() or inline_pdf:
             from shutil import which
             tool = os.getenv("PDF_TO_HTML_TOOL", "pdftohtml")
             if not which(tool):

@@ -179,18 +99,18 @@ class perform_site_check(difference_detection_processor):
                 [tool, '-stdout', '-', '-s', 'out.pdf', '-i'],
                 stdout=subprocess.PIPE,
                 stdin=subprocess.PIPE)
-            proc.stdin.write(fetcher.raw_content)
+            proc.stdin.write(self.fetcher.raw_content)
             proc.stdin.close()
-            fetcher.content = proc.stdout.read().decode('utf-8')
+            self.fetcher.content = proc.stdout.read().decode('utf-8')
             proc.wait(timeout=60)
 
             # Add a little metadata so we know if the file changes (like if an image changes, but the text is the same
             # @todo may cause problems with non-UTF8?
             metadata = "<p>Added by changedetection.io: Document checksum - {} Filesize - {} bytes</p>".format(
-                hashlib.md5(fetcher.raw_content).hexdigest().upper(),
-                len(fetcher.content))
+                hashlib.md5(self.fetcher.raw_content).hexdigest().upper(),
+                len(self.fetcher.content))
 
-            fetcher.content = fetcher.content.replace('</body>', metadata + '</body>')
+            self.fetcher.content = self.fetcher.content.replace('</body>', metadata + '</body>')
 
         # Better would be if Watch.model could access the global data also
         # and then use getattr https://docs.python.org/3/reference/datamodel.html#object.__getitem__

@@ -217,7 +137,7 @@ class perform_site_check(difference_detection_processor):
         if is_json:
             # Sort the JSON so we dont get false alerts when the content is just re-ordered
             try:
-                fetcher.content = json.dumps(json.loads(fetcher.content), sort_keys=True)
+                self.fetcher.content = json.dumps(json.loads(self.fetcher.content), sort_keys=True)
             except Exception as e:
                 # Might have just been a snippet, or otherwise bad JSON, continue
                 pass

@@ -225,22 +145,22 @@ class perform_site_check(difference_detection_processor):
         if has_filter_rule:
             for filter in include_filters_rule:
                 if any(prefix in filter for prefix in json_filter_prefixes):
-                    stripped_text_from_html += html_tools.extract_json_as_string(content=fetcher.content, json_filter=filter)
+                    stripped_text_from_html += html_tools.extract_json_as_string(content=self.fetcher.content, json_filter=filter)
                     is_html = False
 
-        if is_html or is_source:
+        if is_html or watch.is_source_type_url:
 
             # CSS Filter, extract the HTML that matches and feed that into the existing inscriptis::get_text
-            fetcher.content = html_tools.workarounds_for_obfuscations(fetcher.content)
-            html_content = fetcher.content
+            self.fetcher.content = html_tools.workarounds_for_obfuscations(self.fetcher.content)
+            html_content = self.fetcher.content
 
             # If not JSON,  and if it's not text/plain..
-            if 'text/plain' in fetcher.get_all_headers().get('content-type', '').lower():
+            if 'text/plain' in self.fetcher.get_all_headers().get('content-type', '').lower():
                 # Don't run get_text or xpath/css filters on plaintext
                 stripped_text_from_html = html_content
             else:
                 # Does it have some ld+json price data? used for easier monitoring
-                update_obj['has_ldjson_price_data'] = html_tools.has_ldjson_product_info(fetcher.content)
+                update_obj['has_ldjson_price_data'] = html_tools.has_ldjson_product_info(self.fetcher.content)
 
                 # Then we assume HTML
                 if has_filter_rule:

@@ -250,14 +170,14 @@ class perform_site_check(difference_detection_processor):
                         # For HTML/XML we offer xpath as an option, just start a regular xPath "/.."
                         if filter_rule[0] == '/' or filter_rule.startswith('xpath:'):
                             html_content += html_tools.xpath_filter(xpath_filter=filter_rule.replace('xpath:', ''),
-                                                                    html_content=fetcher.content,
-                                                                    append_pretty_line_formatting=not is_source,
+                                                                    html_content=self.fetcher.content,
+                                                                    append_pretty_line_formatting=not watch.is_source_type_url,
                                                                     is_rss=is_rss)
                         else:
                             # CSS Filter, extract the HTML that matches and feed that into the existing inscriptis::get_text
                             html_content += html_tools.include_filters(include_filters=filter_rule,
-                                                                       html_content=fetcher.content,
-                                                                       append_pretty_line_formatting=not is_source)
+                                                                       html_content=self.fetcher.content,
+                                                                       append_pretty_line_formatting=not watch.is_source_type_url)
 
                     if not html_content.strip():
                         raise FilterNotFoundInResponse(include_filters_rule)

@@ -265,7 +185,7 @@ class perform_site_check(difference_detection_processor):
                 if has_subtractive_selectors:
                     html_content = html_tools.element_removal(subtractive_selectors, html_content)
 
-                if is_source:
+                if watch.is_source_type_url:
                     stripped_text_from_html = html_content
                 else:
                     # extract text

@@ -311,7 +231,7 @@ class perform_site_check(difference_detection_processor):
         empty_pages_are_a_change = self.datastore.data['settings']['application'].get('empty_pages_are_a_change', False)
         if not is_json and not empty_pages_are_a_change and len(stripped_text_from_html.strip()) == 0:
             raise content_fetcher.ReplyWithContentButNoText(url=url,
-                                                            status_code=fetcher.get_last_status_code(),
+                                                            status_code=self.fetcher.get_last_status_code(),
                                                             screenshot=screenshot,
                                                             has_filters=has_filter_rule,
                                                             html_content=html_content

@@ -320,7 +240,7 @@ class perform_site_check(difference_detection_processor):
         # We rely on the actual text in the html output.. many sites have random script vars etc,
         # in the future we'll implement other mechanisms.
 
-        update_obj["last_check_status"] = fetcher.get_last_status_code()
+        update_obj["last_check_status"] = self.fetcher.get_last_status_code()
 
         # If there's text to skip
         # @todo we could abstract out the get_text() to handle this cleaner

@@ -408,7 +328,7 @@ class perform_site_check(difference_detection_processor):
         if is_html:
             if self.datastore.data['settings']['application'].get('extract_title_as_title') or watch['extract_title_as_title']:
                 if not watch['title'] or not len(watch['title']):
-                    update_obj['title'] = html_tools.extract_element(find='title', html_content=fetcher.content)
+                    update_obj['title'] = html_tools.extract_element(find='title', html_content=self.fetcher.content)
 
         if changed_detected:
            if watch.get('check_unique_lines', False):
```
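A side effect worth noting in these hunks: `source:` URL handling moves from a local `is_source` flag computed mid-run to the `Watch` model itself, via the `is_source_type_url` property plus the prefix-stripping added to the `link` property earlier in this diff. A small sketch of the split responsibilities, using plain strings:

```python
watch_url = 'source:https://example.com/page'

# is_source_type_url (on the Watch model): just reports the watch type.
is_source = watch_url.startswith('source:')   # True -> treat response as plaintext

# The link property: hands the fetchable URL to the fetcher.
ready_url = watch_url.replace('source:', '')  # 'https://example.com/page'
```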
```diff
@@ -24,7 +24,7 @@ def test_check_extract_text_from_diff(client, live_server):
     )
 
     assert b"1 Imported" in res.data
-    time.sleep(1)
+    wait_for_all_checks(client)
 
     # Load in 5 different numbers/changes
     last_date=""
```
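Several tests in this changeset swap a fixed `time.sleep(1)` for `wait_for_all_checks(client)`, which removes a race: on a slow CI machine a check can take longer than one second. The diff doesn't show the helper itself; a minimal sketch of what such a helper plausibly does (the polling loop and the `'Checking now'` busy-marker are assumptions, not code from the repository):

```python
import time
from flask import url_for

def wait_for_all_checks(client, timeout=30):
    """Poll the index page until no watch is still being checked (sketch)."""
    deadline = time.time() + timeout
    while time.time() < deadline:
        res = client.get(url_for("index"))
        if b'Checking now' not in res.data:  # assumed busy-marker
            return
        time.sleep(0.2)
    raise AssertionError(f"Watches were still being checked after {timeout}s")
```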
```diff
@@ -33,8 +33,6 @@ def test_strip_regex_text_func():
         "/not"
     ]
 
-
-    fetcher = fetch_site_status.perform_site_check(datastore=False)
     stripped_content = html_tools.strip_ignore_text(test_content, ignore_lines)
 
     assert b"but 1 lines" in stripped_content
```
```diff
@@ -24,7 +24,6 @@ def test_strip_text_func():
 
     ignore_lines = ["sometimes"]
 
-    fetcher = fetch_site_status.perform_site_check(datastore=False)
     stripped_content = html_tools.strip_ignore_text(test_content, ignore_lines)
 
     assert b"sometimes" not in stripped_content
```
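Both test files drop a `perform_site_check(datastore=False)` construction that was never used: `strip_ignore_text()` is a plain helper, and the new processor constructor now requires `datastore` and `watch_uuid` keyword arguments anyway, so the old line would no longer even construct. A sketch of the direct usage these tests rely on (the input text is invented here; the byte-string assertion mirrors the tests above, which suggests the helper returns bytes):

```python
from changedetectionio import html_tools

test_content = "some text\nbut sometimes we do text\n"   # hypothetical input
stripped = html_tools.strip_ignore_text(test_content, ["sometimes"])
assert b"sometimes" not in stripped
```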
```diff
@@ -80,8 +80,11 @@ def test_headers_in_request(client, live_server):
 
     # Should be only one with headers set
     assert watches_with_headers==1
+    res = client.get(url_for("form_delete", uuid="all"), follow_redirects=True)
+    assert b'Deleted' in res.data
+
 def test_body_in_request(client, live_server):
 
     # Add our URL to the import page
     test_url = url_for('test_body', _external=True)
     if os.getenv('PLAYWRIGHT_DRIVER_URL'):

@@ -170,7 +173,8 @@ def test_body_in_request(client, live_server):
         follow_redirects=True
     )
     assert b"Body must be empty when Request Method is set to GET" in res.data
-
+    res = client.get(url_for("form_delete", uuid="all"), follow_redirects=True)
+    assert b'Deleted' in res.data
 
 def test_method_in_request(client, live_server):
     # Add our URL to the import page
```
```diff
@@ -1,5 +1,5 @@
 from flask import url_for
-from . util import set_original_response, set_modified_response, live_server_setup
+from .util import set_original_response, set_modified_response, live_server_setup, wait_for_all_checks
 import time
 
 

@@ -12,6 +12,7 @@ def test_bad_access(client, live_server):
     )
 
     assert b"1 Imported" in res.data
+    wait_for_all_checks(client)
 
     # Attempt to add a body with a GET method
     res = client.post(

@@ -59,7 +60,7 @@ def test_bad_access(client, live_server):
         data={"url": 'file:///tasty/disk/drive', "tags": ''},
         follow_redirects=True
     )
-    time.sleep(1)
+    wait_for_all_checks(client)
     res = client.get(url_for("index"))
 
     assert b'file:// type access is denied for security reasons.' in res.data
```
```diff
@@ -209,6 +209,7 @@ class update_worker(threading.Thread):
         from .processors import text_json_diff, restock_diff
 
         while not self.app.config.exit.is_set():
+            update_handler = None
 
             try:
                 queued_item_data = self.q.get(block=False)

@@ -229,17 +230,35 @@ class update_worker(threading.Thread):
                     now = time.time()
 
                     try:
-                        processor = self.datastore.data['watching'][uuid].get('processor','text_json_diff')
+                        # Processor is what we are using for detecting the "Change"
+                        processor = self.datastore.data['watching'][uuid].get('processor', 'text_json_diff')
+                        # if system...
 
+                        # Abort processing when the content was the same as the last fetch
+                        skip_when_same_checksum = queued_item_data.item.get('skip_when_checksum_same')
+
 
+                        # @todo some way to switch by name
+                        # Init a new 'difference_detection_processor'
 
                         if processor == 'restock_diff':
-                            update_handler = restock_diff.perform_site_check(datastore=self.datastore)
+                            update_handler = restock_diff.perform_site_check(datastore=self.datastore,
+                                                                             watch_uuid=uuid
+                                                                             )
                         else:
                             # Used as a default and also by some tests
-                            update_handler = text_json_diff.perform_site_check(datastore=self.datastore)
+                            update_handler = text_json_diff.perform_site_check(datastore=self.datastore,
+                                                                               watch_uuid=uuid
+                                                                               )
 
+                        # Clear last errors (move to preflight func?)
+                        self.datastore.data['watching'][uuid]['browser_steps_last_error_step'] = None
-                        changed_detected, update_obj, contents = update_handler.run(uuid, skip_when_checksum_same=queued_item_data.item.get('skip_when_checksum_same'))
+
+                        update_handler.call_browser()
+
+                        changed_detected, update_obj, contents = update_handler.run_changedetection(uuid,
+                                                                                    skip_when_checksum_same=skip_when_same_checksum,
+                                                                                    )
 
                         # Re #342
                         # In Python 3, all strings are sequences of Unicode characters. There is a bytes type that holds raw bytes.

@@ -391,6 +410,9 @@ class update_worker(threading.Thread):
                         self.datastore.update_watch(uuid=uuid, update_obj={'last_error': str(e)})
                         # Other serious error
                         process_changedetection_results = False
+#                        import traceback
+#                        print(traceback.format_exc())
+
                     else:
                         # Crash protection, the watch entry could have been removed by this point (during a slow chrome fetch etc)
                         if not self.datastore.data['watching'].get(uuid):
```
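The worker still dispatches processors with an explicit if/else, and the commit leaves a "@todo some way to switch by name" marker next to it. One hedged sketch of what that TODO could look like — this is an illustration, not code from the repository:

```python
from changedetectionio.processors import restock_diff, text_json_diff

# Hypothetical registry keyed by the watch's 'processor' setting.
PROCESSORS = {
    'restock_diff': restock_diff,
    'text_json_diff': text_json_diff,  # also the fallback/default
}

def make_handler(datastore, uuid):
    name = datastore.data['watching'][uuid].get('processor', 'text_json_diff')
    module = PROCESSORS.get(name, text_json_diff)
    return module.perform_site_check(datastore=datastore, watch_uuid=uuid)
```

Because every processor now shares the `difference_detection_processor` constructor signature, adding a new detection strategy would only require registering another module exposing `perform_site_check`.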