mirror of
				https://github.com/dgtlmoon/changedetection.io.git
				synced 2025-11-04 08:34:57 +00:00 
			
		
		
		
	Compare commits
	
		
			3 Commits
		
	
	
		
			history-li
			...
			2528-empty
		
	
	| Author | SHA1 | Date | |
|---|---|---|---|
| 
						 | 
					4fdabd53fc | ||
| 
						 | 
					b97d34d77c | ||
| 
						 | 
					e12b5a6f71 | 
@@ -81,7 +81,8 @@ class Fetcher():
 | 
			
		||||
            request_method,
 | 
			
		||||
            ignore_status_codes=False,
 | 
			
		||||
            current_include_filters=None,
 | 
			
		||||
            is_binary=False):
 | 
			
		||||
            is_binary=False,
 | 
			
		||||
            empty_pages_are_a_change=False):
 | 
			
		||||
        # Should set self.error, self.status_code and self.content
 | 
			
		||||
        pass
 | 
			
		||||
 | 
			
		||||
 
 | 
			
		||||
@@ -83,7 +83,8 @@ class fetcher(Fetcher):
 | 
			
		||||
            request_method,
 | 
			
		||||
            ignore_status_codes=False,
 | 
			
		||||
            current_include_filters=None,
 | 
			
		||||
            is_binary=False):
 | 
			
		||||
            is_binary=False,
 | 
			
		||||
            empty_pages_are_a_change=False):
 | 
			
		||||
 | 
			
		||||
        from playwright.sync_api import sync_playwright
 | 
			
		||||
        import playwright._impl._errors
 | 
			
		||||
@@ -130,7 +131,7 @@ class fetcher(Fetcher):
 | 
			
		||||
            if response is None:
 | 
			
		||||
                context.close()
 | 
			
		||||
                browser.close()
 | 
			
		||||
                logger.debug("Content Fetcher > Response object was none")
 | 
			
		||||
                logger.debug("Content Fetcher > Response object from the browser communication was none")
 | 
			
		||||
                raise EmptyReply(url=url, status_code=None)
 | 
			
		||||
 | 
			
		||||
            try:
 | 
			
		||||
@@ -166,10 +167,10 @@ class fetcher(Fetcher):
 | 
			
		||||
 | 
			
		||||
                raise Non200ErrorCodeReceived(url=url, status_code=self.status_code, screenshot=screenshot)
 | 
			
		||||
 | 
			
		||||
            if len(self.page.content().strip()) == 0:
 | 
			
		||||
            if not empty_pages_are_a_change and len(self.page.content().strip()) == 0:
 | 
			
		||||
                logger.debug("Content Fetcher > Content was empty, empty_pages_are_a_change = False")
 | 
			
		||||
                context.close()
 | 
			
		||||
                browser.close()
 | 
			
		||||
                logger.debug("Content Fetcher > Content was empty")
 | 
			
		||||
                raise EmptyReply(url=url, status_code=response.status)
 | 
			
		||||
 | 
			
		||||
            # Run Browser Steps here
 | 
			
		||||
 
 | 
			
		||||
@@ -75,7 +75,8 @@ class fetcher(Fetcher):
 | 
			
		||||
                         request_method,
 | 
			
		||||
                         ignore_status_codes,
 | 
			
		||||
                         current_include_filters,
 | 
			
		||||
                         is_binary
 | 
			
		||||
                         is_binary,
 | 
			
		||||
                         empty_pages_are_a_change
 | 
			
		||||
                         ):
 | 
			
		||||
 | 
			
		||||
        from changedetectionio.content_fetchers import visualselector_xpath_selectors
 | 
			
		||||
@@ -153,7 +154,7 @@ class fetcher(Fetcher):
 | 
			
		||||
        if response is None:
 | 
			
		||||
            await self.page.close()
 | 
			
		||||
            await browser.close()
 | 
			
		||||
            logger.warning("Content Fetcher > Response object was none")
 | 
			
		||||
            logger.warning("Content Fetcher > Response object was none (as in, the response from the browser was empty, not just the content)")
 | 
			
		||||
            raise EmptyReply(url=url, status_code=None)
 | 
			
		||||
 | 
			
		||||
        self.headers = response.headers
 | 
			
		||||
@@ -186,10 +187,11 @@ class fetcher(Fetcher):
 | 
			
		||||
 | 
			
		||||
            raise Non200ErrorCodeReceived(url=url, status_code=self.status_code, screenshot=screenshot)
 | 
			
		||||
        content = await self.page.content
 | 
			
		||||
        if len(content.strip()) == 0:
 | 
			
		||||
 | 
			
		||||
        if not empty_pages_are_a_change and len(content.strip()) == 0:
 | 
			
		||||
            logger.error("Content Fetcher > Content was empty (empty_pages_are_a_change is False), closing browsers")
 | 
			
		||||
            await self.page.close()
 | 
			
		||||
            await browser.close()
 | 
			
		||||
            logger.error("Content Fetcher > Content was empty")
 | 
			
		||||
            raise EmptyReply(url=url, status_code=response.status)
 | 
			
		||||
 | 
			
		||||
        # Run Browser Steps here
 | 
			
		||||
@@ -247,7 +249,7 @@ class fetcher(Fetcher):
 | 
			
		||||
        await self.fetch_page(**kwargs)
 | 
			
		||||
 | 
			
		||||
    def run(self, url, timeout, request_headers, request_body, request_method, ignore_status_codes=False,
 | 
			
		||||
            current_include_filters=None, is_binary=False):
 | 
			
		||||
            current_include_filters=None, is_binary=False, empty_pages_are_a_change=False):
 | 
			
		||||
 | 
			
		||||
        #@todo make update_worker async which could run any of these content_fetchers within memory and time constraints
 | 
			
		||||
        max_time = os.getenv('PUPPETEER_MAX_PROCESSING_TIMEOUT_SECONDS', 180)
 | 
			
		||||
@@ -262,7 +264,8 @@ class fetcher(Fetcher):
 | 
			
		||||
                request_method=request_method,
 | 
			
		||||
                ignore_status_codes=ignore_status_codes,
 | 
			
		||||
                current_include_filters=current_include_filters,
 | 
			
		||||
                is_binary=is_binary
 | 
			
		||||
                is_binary=is_binary,
 | 
			
		||||
                empty_pages_are_a_change=empty_pages_are_a_change
 | 
			
		||||
            ), timeout=max_time))
 | 
			
		||||
        except asyncio.TimeoutError:
 | 
			
		||||
            raise(BrowserFetchTimedOut(msg=f"Browser connected but was unable to process the page in {max_time} seconds."))
 | 
			
		||||
 
 | 
			
		||||
@@ -1,9 +1,8 @@
 | 
			
		||||
from loguru import logger
 | 
			
		||||
import chardet
 | 
			
		||||
import hashlib
 | 
			
		||||
import os
 | 
			
		||||
 | 
			
		||||
import chardet
 | 
			
		||||
import requests
 | 
			
		||||
 | 
			
		||||
from changedetectionio import strtobool
 | 
			
		||||
from changedetectionio.content_fetchers.exceptions import BrowserStepsInUnsupportedFetcher, EmptyReply, Non200ErrorCodeReceived
 | 
			
		||||
from changedetectionio.content_fetchers.base import Fetcher
 | 
			
		||||
@@ -26,7 +25,8 @@ class fetcher(Fetcher):
 | 
			
		||||
            request_method,
 | 
			
		||||
            ignore_status_codes=False,
 | 
			
		||||
            current_include_filters=None,
 | 
			
		||||
            is_binary=False):
 | 
			
		||||
            is_binary=False,
 | 
			
		||||
            empty_pages_are_a_change=False):
 | 
			
		||||
 | 
			
		||||
        if self.browser_steps_get_valid_steps():
 | 
			
		||||
            raise BrowserStepsInUnsupportedFetcher(url=url)
 | 
			
		||||
@@ -74,7 +74,10 @@ class fetcher(Fetcher):
 | 
			
		||||
        self.headers = r.headers
 | 
			
		||||
 | 
			
		||||
        if not r.content or not len(r.content):
 | 
			
		||||
            raise EmptyReply(url=url, status_code=r.status_code)
 | 
			
		||||
            if not empty_pages_are_a_change:
 | 
			
		||||
                raise EmptyReply(url=url, status_code=r.status_code)
 | 
			
		||||
            else:
 | 
			
		||||
                logger.debug(f"URL {url} gave zero byte content reply with Status Code {r.status_code}, but empty_pages_are_a_change = True")
 | 
			
		||||
 | 
			
		||||
        # @todo test this
 | 
			
		||||
        # @todo maybe you really want to test zero-byte return pages?
 | 
			
		||||
 
 | 
			
		||||
@@ -56,7 +56,8 @@ class fetcher(Fetcher):
 | 
			
		||||
            request_method,
 | 
			
		||||
            ignore_status_codes=False,
 | 
			
		||||
            current_include_filters=None,
 | 
			
		||||
            is_binary=False):
 | 
			
		||||
            is_binary=False,
 | 
			
		||||
            empty_pages_are_a_change=False):
 | 
			
		||||
 | 
			
		||||
        from selenium import webdriver
 | 
			
		||||
        from selenium.webdriver.chrome.options import Options as ChromeOptions
 | 
			
		||||
 
 | 
			
		||||
@@ -26,6 +26,8 @@ class difference_detection_processor():
 | 
			
		||||
 | 
			
		||||
    def call_browser(self):
 | 
			
		||||
        from requests.structures import CaseInsensitiveDict
 | 
			
		||||
        from changedetectionio.content_fetchers.exceptions import EmptyReply
 | 
			
		||||
 | 
			
		||||
        # Protect against file:// access
 | 
			
		||||
        if re.search(r'^file://', self.watch.get('url', '').strip(), re.IGNORECASE):
 | 
			
		||||
            if not strtobool(os.getenv('ALLOW_FILE_URI', 'false')):
 | 
			
		||||
@@ -133,8 +135,18 @@ class difference_detection_processor():
 | 
			
		||||
        is_binary = self.watch.is_pdf
 | 
			
		||||
 | 
			
		||||
        # And here we go! call the right browser with browser-specific settings
 | 
			
		||||
        self.fetcher.run(url, timeout, request_headers, request_body, request_method, ignore_status_codes, self.watch.get('include_filters'),
 | 
			
		||||
                    is_binary=is_binary)
 | 
			
		||||
        empty_pages_are_a_change = self.datastore.data['settings']['application'].get('empty_pages_are_a_change', False)
 | 
			
		||||
 | 
			
		||||
        self.fetcher.run(url=url,
 | 
			
		||||
                         timeout=timeout,
 | 
			
		||||
                         request_headers=request_headers,
 | 
			
		||||
                         request_body=request_body,
 | 
			
		||||
                         request_method=request_method,
 | 
			
		||||
                         ignore_status_codes=ignore_status_codes,
 | 
			
		||||
                         current_include_filters=self.watch.get('include_filters'),
 | 
			
		||||
                         is_binary=is_binary,
 | 
			
		||||
                         empty_pages_are_a_change=empty_pages_are_a_change
 | 
			
		||||
                         )
 | 
			
		||||
 | 
			
		||||
        #@todo .quit here could go on close object, so we can run JS if change-detected
 | 
			
		||||
        self.fetcher.quit()
 | 
			
		||||
 
 | 
			
		||||
@@ -76,7 +76,7 @@
 | 
			
		||||
                    </div>
 | 
			
		||||
                    <div class="pure-control-group">
 | 
			
		||||
                        {{ render_checkbox_field(form.application.form.empty_pages_are_a_change) }}
 | 
			
		||||
                        <span class="pure-form-message-inline">When a page contains HTML, but no renderable text appears (empty page), is this considered a change?</span>
 | 
			
		||||
                        <span class="pure-form-message-inline">When a request returns no content, or the HTML does not contain any text, is this considered a change?</span>
 | 
			
		||||
                    </div>
 | 
			
		||||
                {% if form.requests.proxy %}
 | 
			
		||||
                    <div class="pure-control-group inline-radio">
 | 
			
		||||
 
 | 
			
		||||
@@ -1,12 +1,7 @@
 | 
			
		||||
#!/usr/bin/env python3
 | 
			
		||||
 | 
			
		||||
import time
 | 
			
		||||
from flask import url_for
 | 
			
		||||
from urllib.request import urlopen
 | 
			
		||||
from .util import set_original_response, set_modified_response, live_server_setup
 | 
			
		||||
 | 
			
		||||
sleep_time_for_fetch_thread = 3
 | 
			
		||||
 | 
			
		||||
from .util import set_original_response, set_modified_response, live_server_setup, wait_for_all_checks
 | 
			
		||||
 | 
			
		||||
def set_nonrenderable_response():
 | 
			
		||||
    test_return_data = """<html>
 | 
			
		||||
@@ -22,6 +17,13 @@ def set_nonrenderable_response():
 | 
			
		||||
 | 
			
		||||
    return None
 | 
			
		||||
 | 
			
		||||
def set_zero_byte_response():
 | 
			
		||||
 | 
			
		||||
    with open("test-datastore/endpoint-content.txt", "w") as f:
 | 
			
		||||
        f.write("")
 | 
			
		||||
 | 
			
		||||
    return None
 | 
			
		||||
 | 
			
		||||
def test_check_basic_change_detection_functionality(client, live_server, measure_memory_usage):
 | 
			
		||||
    set_original_response()
 | 
			
		||||
    live_server_setup(live_server)
 | 
			
		||||
@@ -35,18 +37,11 @@ def test_check_basic_change_detection_functionality(client, live_server, measure
 | 
			
		||||
 | 
			
		||||
    assert b"1 Imported" in res.data
 | 
			
		||||
 | 
			
		||||
    time.sleep(sleep_time_for_fetch_thread)
 | 
			
		||||
    wait_for_all_checks(client)
 | 
			
		||||
 | 
			
		||||
    # Do this a few times.. ensures we dont accidently set the status
 | 
			
		||||
    for n in range(3):
 | 
			
		||||
        client.get(url_for("form_watch_checknow"), follow_redirects=True)
 | 
			
		||||
 | 
			
		||||
        # Give the thread time to pick it up
 | 
			
		||||
        time.sleep(sleep_time_for_fetch_thread)
 | 
			
		||||
 | 
			
		||||
        # It should report nothing found (no new 'unviewed' class)
 | 
			
		||||
        res = client.get(url_for("index"))
 | 
			
		||||
        assert b'unviewed' not in res.data
 | 
			
		||||
    # It should report nothing found (no new 'unviewed' class)
 | 
			
		||||
    res = client.get(url_for("index"))
 | 
			
		||||
    assert b'unviewed' not in res.data
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
    #####################
 | 
			
		||||
@@ -64,7 +59,7 @@ def test_check_basic_change_detection_functionality(client, live_server, measure
 | 
			
		||||
    client.get(url_for("form_watch_checknow"), follow_redirects=True)
 | 
			
		||||
 | 
			
		||||
    # Give the thread time to pick it up
 | 
			
		||||
    time.sleep(sleep_time_for_fetch_thread)
 | 
			
		||||
    wait_for_all_checks(client)
 | 
			
		||||
 | 
			
		||||
    # It should report nothing found (no new 'unviewed' class)
 | 
			
		||||
    res = client.get(url_for("index"))
 | 
			
		||||
@@ -86,14 +81,20 @@ def test_check_basic_change_detection_functionality(client, live_server, measure
 | 
			
		||||
    client.get(url_for("form_watch_checknow"), follow_redirects=True)
 | 
			
		||||
 | 
			
		||||
    # Give the thread time to pick it up
 | 
			
		||||
    time.sleep(sleep_time_for_fetch_thread)
 | 
			
		||||
    wait_for_all_checks(client)
 | 
			
		||||
 | 
			
		||||
    # It should report nothing found (no new 'unviewed' class)
 | 
			
		||||
    res = client.get(url_for("index"))
 | 
			
		||||
    assert b'unviewed' in res.data
 | 
			
		||||
    client.get(url_for("mark_all_viewed"), follow_redirects=True)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
    # A totally zero byte (#2528) response should also not trigger an error
 | 
			
		||||
    set_zero_byte_response()
 | 
			
		||||
    client.get(url_for("form_watch_checknow"), follow_redirects=True)
 | 
			
		||||
    wait_for_all_checks(client)
 | 
			
		||||
    res = client.get(url_for("index"))
 | 
			
		||||
    assert b'unviewed' in res.data # A change should have registered because empty_pages_are_a_change is ON
 | 
			
		||||
    assert b'fetch-error' not in res.data
 | 
			
		||||
 | 
			
		||||
    #
 | 
			
		||||
    # Cleanup everything
 | 
			
		||||
 
 | 
			
		||||
@@ -1,6 +1,5 @@
 | 
			
		||||
from .processors.exceptions import ProcessorException
 | 
			
		||||
from . import content_fetchers
 | 
			
		||||
 | 
			
		||||
import changedetectionio.content_fetchers.exceptions as content_fetchers_exceptions
 | 
			
		||||
from changedetectionio.processors.text_json_diff.processor import FilterNotFoundInResponse
 | 
			
		||||
from changedetectionio import html_tools
 | 
			
		||||
 | 
			
		||||
@@ -301,7 +300,7 @@ class update_worker(threading.Thread):
 | 
			
		||||
                        self.datastore.update_watch(uuid=uuid, update_obj={'last_error': e.message})
 | 
			
		||||
                        process_changedetection_results = False
 | 
			
		||||
 | 
			
		||||
                    except content_fetchers.exceptions.ReplyWithContentButNoText as e:
 | 
			
		||||
                    except content_fetchers_exceptions.ReplyWithContentButNoText as e:
 | 
			
		||||
                        # Totally fine, it's by choice - just continue on, nothing more to care about
 | 
			
		||||
                        # Page had elements/content but no renderable text
 | 
			
		||||
                        # Backend (not filters) gave zero output
 | 
			
		||||
@@ -327,7 +326,7 @@ class update_worker(threading.Thread):
 | 
			
		||||
                            
 | 
			
		||||
                        process_changedetection_results = False
 | 
			
		||||
 | 
			
		||||
                    except content_fetchers.exceptions.Non200ErrorCodeReceived as e:
 | 
			
		||||
                    except content_fetchers_exceptions.Non200ErrorCodeReceived as e:
 | 
			
		||||
                        if e.status_code == 403:
 | 
			
		||||
                            err_text = "Error - 403 (Access denied) received"
 | 
			
		||||
                        elif e.status_code == 404:
 | 
			
		||||
@@ -380,23 +379,23 @@ class update_worker(threading.Thread):
 | 
			
		||||
 | 
			
		||||
                        process_changedetection_results = False
 | 
			
		||||
 | 
			
		||||
                    except content_fetchers.exceptions.checksumFromPreviousCheckWasTheSame as e:
 | 
			
		||||
                    except content_fetchers_exceptions.checksumFromPreviousCheckWasTheSame as e:
 | 
			
		||||
                        # Yes fine, so nothing todo, don't continue to process.
 | 
			
		||||
                        process_changedetection_results = False
 | 
			
		||||
                        changed_detected = False
 | 
			
		||||
                    except content_fetchers.exceptions.BrowserConnectError as e:
 | 
			
		||||
                    except content_fetchers_exceptions.BrowserConnectError as e:
 | 
			
		||||
                        self.datastore.update_watch(uuid=uuid,
 | 
			
		||||
                                                    update_obj={'last_error': e.msg
 | 
			
		||||
                                                                }
 | 
			
		||||
                                                    )
 | 
			
		||||
                        process_changedetection_results = False
 | 
			
		||||
                    except content_fetchers.exceptions.BrowserFetchTimedOut as e:
 | 
			
		||||
                    except content_fetchers_exceptions.BrowserFetchTimedOut as e:
 | 
			
		||||
                        self.datastore.update_watch(uuid=uuid,
 | 
			
		||||
                                                    update_obj={'last_error': e.msg
 | 
			
		||||
                                                                }
 | 
			
		||||
                                                    )
 | 
			
		||||
                        process_changedetection_results = False
 | 
			
		||||
                    except content_fetchers.exceptions.BrowserStepsStepException as e:
 | 
			
		||||
                    except content_fetchers_exceptions.BrowserStepsStepException as e:
 | 
			
		||||
 | 
			
		||||
                        if not self.datastore.data['watching'].get(uuid):
 | 
			
		||||
                            continue
 | 
			
		||||
@@ -438,25 +437,25 @@ class update_worker(threading.Thread):
 | 
			
		||||
 | 
			
		||||
                        process_changedetection_results = False
 | 
			
		||||
 | 
			
		||||
                    except content_fetchers.exceptions.EmptyReply as e:
 | 
			
		||||
                    except content_fetchers_exceptions.EmptyReply as e:
 | 
			
		||||
                        # Some kind of custom to-str handler in the exception handler that does this?
 | 
			
		||||
                        err_text = "EmptyReply - try increasing 'Wait seconds before extracting text', Status Code {}".format(e.status_code)
 | 
			
		||||
                        self.datastore.update_watch(uuid=uuid, update_obj={'last_error': err_text,
 | 
			
		||||
                                                                           'last_check_status': e.status_code})
 | 
			
		||||
                        process_changedetection_results = False
 | 
			
		||||
                    except content_fetchers.exceptions.ScreenshotUnavailable as e:
 | 
			
		||||
                    except content_fetchers_exceptions.ScreenshotUnavailable as e:
 | 
			
		||||
                        err_text = "Screenshot unavailable, page did not render fully in the expected time or page was too long - try increasing 'Wait seconds before extracting text'"
 | 
			
		||||
                        self.datastore.update_watch(uuid=uuid, update_obj={'last_error': err_text,
 | 
			
		||||
                                                                           'last_check_status': e.status_code})
 | 
			
		||||
                        process_changedetection_results = False
 | 
			
		||||
                    except content_fetchers.exceptions.JSActionExceptions as e:
 | 
			
		||||
                    except content_fetchers_exceptions.JSActionExceptions as e:
 | 
			
		||||
                        err_text = "Error running JS Actions - Page request - "+e.message
 | 
			
		||||
                        if e.screenshot:
 | 
			
		||||
                            watch.save_screenshot(screenshot=e.screenshot, as_error=True)
 | 
			
		||||
                        self.datastore.update_watch(uuid=uuid, update_obj={'last_error': err_text,
 | 
			
		||||
                                                                           'last_check_status': e.status_code})
 | 
			
		||||
                        process_changedetection_results = False
 | 
			
		||||
                    except content_fetchers.exceptions.PageUnloadable as e:
 | 
			
		||||
                    except content_fetchers_exceptions.PageUnloadable as e:
 | 
			
		||||
                        err_text = "Page request from server didnt respond correctly"
 | 
			
		||||
                        if e.message:
 | 
			
		||||
                            err_text = "{} - {}".format(err_text, e.message)
 | 
			
		||||
@@ -468,7 +467,7 @@ class update_worker(threading.Thread):
 | 
			
		||||
                                                                           'last_check_status': e.status_code,
 | 
			
		||||
                                                                           'has_ldjson_price_data': None})
 | 
			
		||||
                        process_changedetection_results = False
 | 
			
		||||
                    except content_fetchers.exceptions.BrowserStepsInUnsupportedFetcher as e:
 | 
			
		||||
                    except content_fetchers_exceptions.BrowserStepsInUnsupportedFetcher as e:
 | 
			
		||||
                        err_text = "This watch has Browser Steps configured and so it cannot run with the 'Basic fast Plaintext/HTTP Client', either remove the Browser Steps or select a Chrome fetcher."
 | 
			
		||||
                        self.datastore.update_watch(uuid=uuid, update_obj={'last_error': err_text})
 | 
			
		||||
                        process_changedetection_results = False
 | 
			
		||||
 
 | 
			
		||||
		Reference in New Issue
	
	Block a user