mirror of
				https://github.com/dgtlmoon/changedetection.io.git
				synced 2025-10-30 22:27:52 +00:00 
			
		
		
		
	Compare commits
	
		
			3 Commits
		
	
	
		
			lev-test
			...
			2528-empty
		
	
	| Author | SHA1 | Date | |
|---|---|---|---|
|   | 4fdabd53fc | ||
|   | b97d34d77c | ||
|   | e12b5a6f71 | 
| @@ -81,7 +81,8 @@ class Fetcher(): | ||||
|             request_method, | ||||
|             ignore_status_codes=False, | ||||
|             current_include_filters=None, | ||||
|             is_binary=False): | ||||
|             is_binary=False, | ||||
|             empty_pages_are_a_change=False): | ||||
|         # Should set self.error, self.status_code and self.content | ||||
|         pass | ||||
|  | ||||
|   | ||||
| @@ -83,7 +83,8 @@ class fetcher(Fetcher): | ||||
|             request_method, | ||||
|             ignore_status_codes=False, | ||||
|             current_include_filters=None, | ||||
|             is_binary=False): | ||||
|             is_binary=False, | ||||
|             empty_pages_are_a_change=False): | ||||
|  | ||||
|         from playwright.sync_api import sync_playwright | ||||
|         import playwright._impl._errors | ||||
| @@ -130,7 +131,7 @@ class fetcher(Fetcher): | ||||
|             if response is None: | ||||
|                 context.close() | ||||
|                 browser.close() | ||||
|                 logger.debug("Content Fetcher > Response object was none") | ||||
|                 logger.debug("Content Fetcher > Response object from the browser communication was none") | ||||
|                 raise EmptyReply(url=url, status_code=None) | ||||
|  | ||||
|             try: | ||||
| @@ -166,10 +167,10 @@ class fetcher(Fetcher): | ||||
|  | ||||
|                 raise Non200ErrorCodeReceived(url=url, status_code=self.status_code, screenshot=screenshot) | ||||
|  | ||||
|             if len(self.page.content().strip()) == 0: | ||||
|             if not empty_pages_are_a_change and len(self.page.content().strip()) == 0: | ||||
|                 logger.debug("Content Fetcher > Content was empty, empty_pages_are_a_change = False") | ||||
|                 context.close() | ||||
|                 browser.close() | ||||
|                 logger.debug("Content Fetcher > Content was empty") | ||||
|                 raise EmptyReply(url=url, status_code=response.status) | ||||
|  | ||||
|             # Run Browser Steps here | ||||
|   | ||||
| @@ -75,7 +75,8 @@ class fetcher(Fetcher): | ||||
|                          request_method, | ||||
|                          ignore_status_codes, | ||||
|                          current_include_filters, | ||||
|                          is_binary | ||||
|                          is_binary, | ||||
|                          empty_pages_are_a_change | ||||
|                          ): | ||||
|  | ||||
|         from changedetectionio.content_fetchers import visualselector_xpath_selectors | ||||
| @@ -153,7 +154,7 @@ class fetcher(Fetcher): | ||||
|         if response is None: | ||||
|             await self.page.close() | ||||
|             await browser.close() | ||||
|             logger.warning("Content Fetcher > Response object was none") | ||||
|             logger.warning("Content Fetcher > Response object was none (as in, the response from the browser was empty, not just the content)") | ||||
|             raise EmptyReply(url=url, status_code=None) | ||||
|  | ||||
|         self.headers = response.headers | ||||
| @@ -186,10 +187,11 @@ class fetcher(Fetcher): | ||||
|  | ||||
|             raise Non200ErrorCodeReceived(url=url, status_code=self.status_code, screenshot=screenshot) | ||||
|         content = await self.page.content | ||||
|         if len(content.strip()) == 0: | ||||
|  | ||||
|         if not empty_pages_are_a_change and len(content.strip()) == 0: | ||||
|             logger.error("Content Fetcher > Content was empty (empty_pages_are_a_change is False), closing browsers") | ||||
|             await self.page.close() | ||||
|             await browser.close() | ||||
|             logger.error("Content Fetcher > Content was empty") | ||||
|             raise EmptyReply(url=url, status_code=response.status) | ||||
|  | ||||
|         # Run Browser Steps here | ||||
| @@ -247,7 +249,7 @@ class fetcher(Fetcher): | ||||
|         await self.fetch_page(**kwargs) | ||||
|  | ||||
|     def run(self, url, timeout, request_headers, request_body, request_method, ignore_status_codes=False, | ||||
|             current_include_filters=None, is_binary=False): | ||||
|             current_include_filters=None, is_binary=False, empty_pages_are_a_change=False): | ||||
|  | ||||
|         #@todo make update_worker async which could run any of these content_fetchers within memory and time constraints | ||||
|         max_time = os.getenv('PUPPETEER_MAX_PROCESSING_TIMEOUT_SECONDS', 180) | ||||
| @@ -262,7 +264,8 @@ class fetcher(Fetcher): | ||||
|                 request_method=request_method, | ||||
|                 ignore_status_codes=ignore_status_codes, | ||||
|                 current_include_filters=current_include_filters, | ||||
|                 is_binary=is_binary | ||||
|                 is_binary=is_binary, | ||||
|                 empty_pages_are_a_change=empty_pages_are_a_change | ||||
|             ), timeout=max_time)) | ||||
|         except asyncio.TimeoutError: | ||||
|             raise(BrowserFetchTimedOut(msg=f"Browser connected but was unable to process the page in {max_time} seconds.")) | ||||
|   | ||||
| @@ -1,9 +1,8 @@ | ||||
| from loguru import logger | ||||
| import chardet | ||||
| import hashlib | ||||
| import os | ||||
|  | ||||
| import chardet | ||||
| import requests | ||||
|  | ||||
| from changedetectionio import strtobool | ||||
| from changedetectionio.content_fetchers.exceptions import BrowserStepsInUnsupportedFetcher, EmptyReply, Non200ErrorCodeReceived | ||||
| from changedetectionio.content_fetchers.base import Fetcher | ||||
| @@ -26,7 +25,8 @@ class fetcher(Fetcher): | ||||
|             request_method, | ||||
|             ignore_status_codes=False, | ||||
|             current_include_filters=None, | ||||
|             is_binary=False): | ||||
|             is_binary=False, | ||||
|             empty_pages_are_a_change=False): | ||||
|  | ||||
|         if self.browser_steps_get_valid_steps(): | ||||
|             raise BrowserStepsInUnsupportedFetcher(url=url) | ||||
| @@ -74,7 +74,10 @@ class fetcher(Fetcher): | ||||
|         self.headers = r.headers | ||||
|  | ||||
|         if not r.content or not len(r.content): | ||||
|             raise EmptyReply(url=url, status_code=r.status_code) | ||||
|             if not empty_pages_are_a_change: | ||||
|                 raise EmptyReply(url=url, status_code=r.status_code) | ||||
|             else: | ||||
|                 logger.debug(f"URL {url} gave zero byte content reply with Status Code {r.status_code}, but empty_pages_are_a_change = True") | ||||
|  | ||||
|         # @todo test this | ||||
|         # @todo maybe you really want to test zero-byte return pages? | ||||
|   | ||||
| @@ -56,7 +56,8 @@ class fetcher(Fetcher): | ||||
|             request_method, | ||||
|             ignore_status_codes=False, | ||||
|             current_include_filters=None, | ||||
|             is_binary=False): | ||||
|             is_binary=False, | ||||
|             empty_pages_are_a_change=False): | ||||
|  | ||||
|         from selenium import webdriver | ||||
|         from selenium.webdriver.chrome.options import Options as ChromeOptions | ||||
|   | ||||
| @@ -26,6 +26,8 @@ class difference_detection_processor(): | ||||
|  | ||||
|     def call_browser(self): | ||||
|         from requests.structures import CaseInsensitiveDict | ||||
|         from changedetectionio.content_fetchers.exceptions import EmptyReply | ||||
|  | ||||
|         # Protect against file:// access | ||||
|         if re.search(r'^file://', self.watch.get('url', '').strip(), re.IGNORECASE): | ||||
|             if not strtobool(os.getenv('ALLOW_FILE_URI', 'false')): | ||||
| @@ -133,8 +135,18 @@ class difference_detection_processor(): | ||||
|         is_binary = self.watch.is_pdf | ||||
|  | ||||
|         # And here we go! call the right browser with browser-specific settings | ||||
|         self.fetcher.run(url, timeout, request_headers, request_body, request_method, ignore_status_codes, self.watch.get('include_filters'), | ||||
|                     is_binary=is_binary) | ||||
|         empty_pages_are_a_change = self.datastore.data['settings']['application'].get('empty_pages_are_a_change', False) | ||||
|  | ||||
|         self.fetcher.run(url=url, | ||||
|                          timeout=timeout, | ||||
|                          request_headers=request_headers, | ||||
|                          request_body=request_body, | ||||
|                          request_method=request_method, | ||||
|                          ignore_status_codes=ignore_status_codes, | ||||
|                          current_include_filters=self.watch.get('include_filters'), | ||||
|                          is_binary=is_binary, | ||||
|                          empty_pages_are_a_change=empty_pages_are_a_change | ||||
|                          ) | ||||
|  | ||||
|         #@todo .quit here could go on close object, so we can run JS if change-detected | ||||
|         self.fetcher.quit() | ||||
|   | ||||
| @@ -76,7 +76,7 @@ | ||||
|                     </div> | ||||
|                     <div class="pure-control-group"> | ||||
|                         {{ render_checkbox_field(form.application.form.empty_pages_are_a_change) }} | ||||
|                         <span class="pure-form-message-inline">When a page contains HTML, but no renderable text appears (empty page), is this considered a change?</span> | ||||
|                         <span class="pure-form-message-inline">When a request returns no content, or the HTML does not contain any text, is this considered a change?</span> | ||||
|                     </div> | ||||
|                 {% if form.requests.proxy %} | ||||
|                     <div class="pure-control-group inline-radio"> | ||||
|   | ||||
| @@ -1,12 +1,7 @@ | ||||
| #!/usr/bin/env python3 | ||||
|  | ||||
| import time | ||||
| from flask import url_for | ||||
| from urllib.request import urlopen | ||||
| from .util import set_original_response, set_modified_response, live_server_setup | ||||
|  | ||||
| sleep_time_for_fetch_thread = 3 | ||||
|  | ||||
| from .util import set_original_response, set_modified_response, live_server_setup, wait_for_all_checks | ||||
|  | ||||
| def set_nonrenderable_response(): | ||||
|     test_return_data = """<html> | ||||
| @@ -22,6 +17,13 @@ def set_nonrenderable_response(): | ||||
|  | ||||
|     return None | ||||
|  | ||||
| def set_zero_byte_response(): | ||||
|  | ||||
|     with open("test-datastore/endpoint-content.txt", "w") as f: | ||||
|         f.write("") | ||||
|  | ||||
|     return None | ||||
|  | ||||
| def test_check_basic_change_detection_functionality(client, live_server, measure_memory_usage): | ||||
|     set_original_response() | ||||
|     live_server_setup(live_server) | ||||
| @@ -35,18 +37,11 @@ def test_check_basic_change_detection_functionality(client, live_server, measure | ||||
|  | ||||
|     assert b"1 Imported" in res.data | ||||
|  | ||||
|     time.sleep(sleep_time_for_fetch_thread) | ||||
|     wait_for_all_checks(client) | ||||
|  | ||||
|     # Do this a few times.. ensures we dont accidently set the status | ||||
|     for n in range(3): | ||||
|         client.get(url_for("form_watch_checknow"), follow_redirects=True) | ||||
|  | ||||
|         # Give the thread time to pick it up | ||||
|         time.sleep(sleep_time_for_fetch_thread) | ||||
|  | ||||
|         # It should report nothing found (no new 'unviewed' class) | ||||
|         res = client.get(url_for("index")) | ||||
|         assert b'unviewed' not in res.data | ||||
|     # It should report nothing found (no new 'unviewed' class) | ||||
|     res = client.get(url_for("index")) | ||||
|     assert b'unviewed' not in res.data | ||||
|  | ||||
|  | ||||
|     ##################### | ||||
| @@ -64,7 +59,7 @@ def test_check_basic_change_detection_functionality(client, live_server, measure | ||||
|     client.get(url_for("form_watch_checknow"), follow_redirects=True) | ||||
|  | ||||
|     # Give the thread time to pick it up | ||||
|     time.sleep(sleep_time_for_fetch_thread) | ||||
|     wait_for_all_checks(client) | ||||
|  | ||||
|     # It should report nothing found (no new 'unviewed' class) | ||||
|     res = client.get(url_for("index")) | ||||
| @@ -86,14 +81,20 @@ def test_check_basic_change_detection_functionality(client, live_server, measure | ||||
|     client.get(url_for("form_watch_checknow"), follow_redirects=True) | ||||
|  | ||||
|     # Give the thread time to pick it up | ||||
|     time.sleep(sleep_time_for_fetch_thread) | ||||
|     wait_for_all_checks(client) | ||||
|  | ||||
|     # It should report nothing found (no new 'unviewed' class) | ||||
|     res = client.get(url_for("index")) | ||||
|     assert b'unviewed' in res.data | ||||
|     client.get(url_for("mark_all_viewed"), follow_redirects=True) | ||||
|  | ||||
|  | ||||
|  | ||||
|     # A totally zero byte (#2528) response should also not trigger an error | ||||
|     set_zero_byte_response() | ||||
|     client.get(url_for("form_watch_checknow"), follow_redirects=True) | ||||
|     wait_for_all_checks(client) | ||||
|     res = client.get(url_for("index")) | ||||
|     assert b'unviewed' in res.data # A change should have registered because empty_pages_are_a_change is ON | ||||
|     assert b'fetch-error' not in res.data | ||||
|  | ||||
|     # | ||||
|     # Cleanup everything | ||||
|   | ||||
| @@ -1,6 +1,5 @@ | ||||
| from .processors.exceptions import ProcessorException | ||||
| from . import content_fetchers | ||||
|  | ||||
| import changedetectionio.content_fetchers.exceptions as content_fetchers_exceptions | ||||
| from changedetectionio.processors.text_json_diff.processor import FilterNotFoundInResponse | ||||
| from changedetectionio import html_tools | ||||
|  | ||||
| @@ -301,7 +300,7 @@ class update_worker(threading.Thread): | ||||
|                         self.datastore.update_watch(uuid=uuid, update_obj={'last_error': e.message}) | ||||
|                         process_changedetection_results = False | ||||
|  | ||||
|                     except content_fetchers.exceptions.ReplyWithContentButNoText as e: | ||||
|                     except content_fetchers_exceptions.ReplyWithContentButNoText as e: | ||||
|                         # Totally fine, it's by choice - just continue on, nothing more to care about | ||||
|                         # Page had elements/content but no renderable text | ||||
|                         # Backend (not filters) gave zero output | ||||
| @@ -327,7 +326,7 @@ class update_worker(threading.Thread): | ||||
|                              | ||||
|                         process_changedetection_results = False | ||||
|  | ||||
|                     except content_fetchers.exceptions.Non200ErrorCodeReceived as e: | ||||
|                     except content_fetchers_exceptions.Non200ErrorCodeReceived as e: | ||||
|                         if e.status_code == 403: | ||||
|                             err_text = "Error - 403 (Access denied) received" | ||||
|                         elif e.status_code == 404: | ||||
| @@ -380,23 +379,23 @@ class update_worker(threading.Thread): | ||||
|  | ||||
|                         process_changedetection_results = False | ||||
|  | ||||
|                     except content_fetchers.exceptions.checksumFromPreviousCheckWasTheSame as e: | ||||
|                     except content_fetchers_exceptions.checksumFromPreviousCheckWasTheSame as e: | ||||
|                         # Yes fine, so nothing todo, don't continue to process. | ||||
|                         process_changedetection_results = False | ||||
|                         changed_detected = False | ||||
|                     except content_fetchers.exceptions.BrowserConnectError as e: | ||||
|                     except content_fetchers_exceptions.BrowserConnectError as e: | ||||
|                         self.datastore.update_watch(uuid=uuid, | ||||
|                                                     update_obj={'last_error': e.msg | ||||
|                                                                 } | ||||
|                                                     ) | ||||
|                         process_changedetection_results = False | ||||
|                     except content_fetchers.exceptions.BrowserFetchTimedOut as e: | ||||
|                     except content_fetchers_exceptions.BrowserFetchTimedOut as e: | ||||
|                         self.datastore.update_watch(uuid=uuid, | ||||
|                                                     update_obj={'last_error': e.msg | ||||
|                                                                 } | ||||
|                                                     ) | ||||
|                         process_changedetection_results = False | ||||
|                     except content_fetchers.exceptions.BrowserStepsStepException as e: | ||||
|                     except content_fetchers_exceptions.BrowserStepsStepException as e: | ||||
|  | ||||
|                         if not self.datastore.data['watching'].get(uuid): | ||||
|                             continue | ||||
| @@ -438,25 +437,25 @@ class update_worker(threading.Thread): | ||||
|  | ||||
|                         process_changedetection_results = False | ||||
|  | ||||
|                     except content_fetchers.exceptions.EmptyReply as e: | ||||
|                     except content_fetchers_exceptions.EmptyReply as e: | ||||
|                         # Some kind of custom to-str handler in the exception handler that does this? | ||||
|                         err_text = "EmptyReply - try increasing 'Wait seconds before extracting text', Status Code {}".format(e.status_code) | ||||
|                         self.datastore.update_watch(uuid=uuid, update_obj={'last_error': err_text, | ||||
|                                                                            'last_check_status': e.status_code}) | ||||
|                         process_changedetection_results = False | ||||
|                     except content_fetchers.exceptions.ScreenshotUnavailable as e: | ||||
|                     except content_fetchers_exceptions.ScreenshotUnavailable as e: | ||||
|                         err_text = "Screenshot unavailable, page did not render fully in the expected time or page was too long - try increasing 'Wait seconds before extracting text'" | ||||
|                         self.datastore.update_watch(uuid=uuid, update_obj={'last_error': err_text, | ||||
|                                                                            'last_check_status': e.status_code}) | ||||
|                         process_changedetection_results = False | ||||
|                     except content_fetchers.exceptions.JSActionExceptions as e: | ||||
|                     except content_fetchers_exceptions.JSActionExceptions as e: | ||||
|                         err_text = "Error running JS Actions - Page request - "+e.message | ||||
|                         if e.screenshot: | ||||
|                             watch.save_screenshot(screenshot=e.screenshot, as_error=True) | ||||
|                         self.datastore.update_watch(uuid=uuid, update_obj={'last_error': err_text, | ||||
|                                                                            'last_check_status': e.status_code}) | ||||
|                         process_changedetection_results = False | ||||
|                     except content_fetchers.exceptions.PageUnloadable as e: | ||||
|                     except content_fetchers_exceptions.PageUnloadable as e: | ||||
|                         err_text = "Page request from server didnt respond correctly" | ||||
|                         if e.message: | ||||
|                             err_text = "{} - {}".format(err_text, e.message) | ||||
| @@ -468,7 +467,7 @@ class update_worker(threading.Thread): | ||||
|                                                                            'last_check_status': e.status_code, | ||||
|                                                                            'has_ldjson_price_data': None}) | ||||
|                         process_changedetection_results = False | ||||
|                     except content_fetchers.exceptions.BrowserStepsInUnsupportedFetcher as e: | ||||
|                     except content_fetchers_exceptions.BrowserStepsInUnsupportedFetcher as e: | ||||
|                         err_text = "This watch has Browser Steps configured and so it cannot run with the 'Basic fast Plaintext/HTTP Client', either remove the Browser Steps or select a Chrome fetcher." | ||||
|                         self.datastore.update_watch(uuid=uuid, update_obj={'last_error': err_text}) | ||||
|                         process_changedetection_results = False | ||||
|   | ||||
		Reference in New Issue
	
	Block a user