Compare commits

..

3 Commits

Author SHA1 Message Date
dgtlmoon
a40813a4c0 Oops 2024-07-27 19:06:25 +02:00
dgtlmoon
579faba57c Adding more tests 2024-07-27 19:00:19 +02:00
dgtlmoon
98957a0a9e Adding test for #1995 UTF-8 encoding in notification body 2024-07-27 18:44:13 +02:00
15 changed files with 56 additions and 135 deletions

View File

@@ -2,7 +2,7 @@
# Read more https://github.com/dgtlmoon/changedetection.io/wiki
__version__ = '0.46.02'
__version__ = '0.46.01'
from changedetectionio.strtobool import strtobool
from json.decoder import JSONDecodeError

View File

@@ -81,8 +81,7 @@ class Fetcher():
request_method,
ignore_status_codes=False,
current_include_filters=None,
is_binary=False,
empty_pages_are_a_change=False):
is_binary=False):
# Should set self.error, self.status_code and self.content
pass

View File

@@ -83,8 +83,7 @@ class fetcher(Fetcher):
request_method,
ignore_status_codes=False,
current_include_filters=None,
is_binary=False,
empty_pages_are_a_change=False):
is_binary=False):
from playwright.sync_api import sync_playwright
import playwright._impl._errors
@@ -131,7 +130,7 @@ class fetcher(Fetcher):
if response is None:
context.close()
browser.close()
logger.debug("Content Fetcher > Response object from the browser communication was none")
logger.debug("Content Fetcher > Response object was none")
raise EmptyReply(url=url, status_code=None)
try:
@@ -167,10 +166,10 @@ class fetcher(Fetcher):
raise Non200ErrorCodeReceived(url=url, status_code=self.status_code, screenshot=screenshot)
if not empty_pages_are_a_change and len(self.page.content().strip()) == 0:
logger.debug("Content Fetcher > Content was empty, empty_pages_are_a_change = False")
if len(self.page.content().strip()) == 0:
context.close()
browser.close()
logger.debug("Content Fetcher > Content was empty")
raise EmptyReply(url=url, status_code=response.status)
# Run Browser Steps here

View File

@@ -75,8 +75,7 @@ class fetcher(Fetcher):
request_method,
ignore_status_codes,
current_include_filters,
is_binary,
empty_pages_are_a_change
is_binary
):
from changedetectionio.content_fetchers import visualselector_xpath_selectors
@@ -154,7 +153,7 @@ class fetcher(Fetcher):
if response is None:
await self.page.close()
await browser.close()
logger.warning("Content Fetcher > Response object was none (as in, the response from the browser was empty, not just the content)")
logger.warning("Content Fetcher > Response object was none")
raise EmptyReply(url=url, status_code=None)
self.headers = response.headers
@@ -187,11 +186,10 @@ class fetcher(Fetcher):
raise Non200ErrorCodeReceived(url=url, status_code=self.status_code, screenshot=screenshot)
content = await self.page.content
if not empty_pages_are_a_change and len(content.strip()) == 0:
logger.error("Content Fetcher > Content was empty (empty_pages_are_a_change is False), closing browsers")
if len(content.strip()) == 0:
await self.page.close()
await browser.close()
logger.error("Content Fetcher > Content was empty")
raise EmptyReply(url=url, status_code=response.status)
# Run Browser Steps here
@@ -249,7 +247,7 @@ class fetcher(Fetcher):
await self.fetch_page(**kwargs)
def run(self, url, timeout, request_headers, request_body, request_method, ignore_status_codes=False,
current_include_filters=None, is_binary=False, empty_pages_are_a_change=False):
current_include_filters=None, is_binary=False):
#@todo make update_worker async which could run any of these content_fetchers within memory and time constraints
max_time = os.getenv('PUPPETEER_MAX_PROCESSING_TIMEOUT_SECONDS', 180)
@@ -264,8 +262,7 @@ class fetcher(Fetcher):
request_method=request_method,
ignore_status_codes=ignore_status_codes,
current_include_filters=current_include_filters,
is_binary=is_binary,
empty_pages_are_a_change=empty_pages_are_a_change
is_binary=is_binary
), timeout=max_time))
except asyncio.TimeoutError:
raise(BrowserFetchTimedOut(msg=f"Browser connected but was unable to process the page in {max_time} seconds."))

View File

@@ -1,8 +1,9 @@
from loguru import logger
import chardet
import hashlib
import os
import chardet
import requests
from changedetectionio import strtobool
from changedetectionio.content_fetchers.exceptions import BrowserStepsInUnsupportedFetcher, EmptyReply, Non200ErrorCodeReceived
from changedetectionio.content_fetchers.base import Fetcher
@@ -25,8 +26,7 @@ class fetcher(Fetcher):
request_method,
ignore_status_codes=False,
current_include_filters=None,
is_binary=False,
empty_pages_are_a_change=False):
is_binary=False):
if self.browser_steps_get_valid_steps():
raise BrowserStepsInUnsupportedFetcher(url=url)
@@ -74,10 +74,7 @@ class fetcher(Fetcher):
self.headers = r.headers
if not r.content or not len(r.content):
if not empty_pages_are_a_change:
raise EmptyReply(url=url, status_code=r.status_code)
else:
logger.debug(f"URL {url} gave zero byte content reply with Status Code {r.status_code}, but empty_pages_are_a_change = True")
raise EmptyReply(url=url, status_code=r.status_code)
# @todo test this
# @todo maybe you really want to test zero-byte return pages?

View File

@@ -56,8 +56,7 @@ class fetcher(Fetcher):
request_method,
ignore_status_codes=False,
current_include_filters=None,
is_binary=False,
empty_pages_are_a_change=False):
is_binary=False):
from selenium import webdriver
from selenium.webdriver.chrome.options import Options as ChromeOptions

View File

@@ -26,8 +26,6 @@ class difference_detection_processor():
def call_browser(self):
from requests.structures import CaseInsensitiveDict
from changedetectionio.content_fetchers.exceptions import EmptyReply
# Protect against file:// access
if re.search(r'^file://', self.watch.get('url', '').strip(), re.IGNORECASE):
if not strtobool(os.getenv('ALLOW_FILE_URI', 'false')):
@@ -135,18 +133,8 @@ class difference_detection_processor():
is_binary = self.watch.is_pdf
# And here we go! call the right browser with browser-specific settings
empty_pages_are_a_change = self.datastore.data['settings']['application'].get('empty_pages_are_a_change', False)
self.fetcher.run(url=url,
timeout=timeout,
request_headers=request_headers,
request_body=request_body,
request_method=request_method,
ignore_status_codes=ignore_status_codes,
current_include_filters=self.watch.get('include_filters'),
is_binary=is_binary,
empty_pages_are_a_change=empty_pages_are_a_change
)
self.fetcher.run(url, timeout, request_headers, request_body, request_method, ignore_status_codes, self.watch.get('include_filters'),
is_binary=is_binary)
#@todo .quit here could go on close object, so we can run JS if change-detected
self.fetcher.quit()

View File

@@ -1,23 +0,0 @@
import pluggy
from typing import Dict
from changedetectionio.model import Watch as Watch
plugin_namespace = "changedetectionio.restock_price_scraper"
hookspec = pluggy.HookspecMarker(plugin_namespace)
class HookSpec:
@hookspec
def scrape_price_restock(self, watch: Watch.model, html_content: str, screenshot: bytes, update_obj: Dict) -> Dict:
"""
Scrape price and restock data from html_content and/or screenshot and return via update_obj
Args:
watch (Watch.model): The watch object containing watch configuration.
html_content (str): The HTML content to scrape.
screenshot (bytes): The screenshot data.
update_obj (Dict): The dictionary to update with scraped data.
Returns:
Optional[Dict]: The updated dictionary with the scraped price data, or None if no update is made.
"""

View File

@@ -1,17 +0,0 @@
import pluggy
from .hookspecs import HookSpec
import importlib.metadata
# Define the plugin namespace
plugin_namespace = "changedetectionio.restock_price_scraper"
# Create a pluggy.PluginManager instance
pm = pluggy.PluginManager(plugin_namespace)
# Register the hook specifications
pm.add_hookspecs(HookSpec)
# Automatically discover and register plugins using entry points
for entry_point in importlib.metadata.entry_points().get(plugin_namespace, []):
plugin = entry_point.load()
pm.register(plugin())

View File

@@ -119,8 +119,6 @@ class perform_site_check(difference_detection_processor):
xpath_data = None
def run_changedetection(self, watch, skip_when_checksum_same=True):
from .plugin_manager import pm
if not watch:
raise Exception("Watch no longer exists.")
@@ -200,19 +198,6 @@ class perform_site_check(difference_detection_processor):
update_obj['restock']["in_stock"] = True if self.fetcher.instock_data == 'Possibly in stock' else False
logger.debug(f"Watch UUID {watch.get('uuid')} restock check returned '{self.fetcher.instock_data}' from JS scraper.")
# Ask any "changedetectionio.restock_price_scraper" namespace plugins if they can add something
# (Should return an updated 'update_obj')
plugin_price_scraping = pm.hook.scrape_price_restock(watch=watch,
html_content=self.fetcher.content,
screenshot=self.fetcher.screenshot,
update_obj=update_obj)
if plugin_price_scraping:
for plugin_result in plugin_price_scraping:
update_obj.update(plugin_result)
if plugin_result.get('restock'):
update_obj['restock'].update(plugin_result.get('restock'))
# What we store in the snapshot
price = update_obj.get('restock').get('price') if update_obj.get('restock').get('price') else ""
snapshot_content = f"In Stock: {update_obj.get('restock').get('in_stock')} - Price: {price}"

View File

@@ -76,7 +76,7 @@
</div>
<div class="pure-control-group">
{{ render_checkbox_field(form.application.form.empty_pages_are_a_change) }}
<span class="pure-form-message-inline">When a request returns no content, or the HTML does not contain any text, is this considered a change?</span>
<span class="pure-form-message-inline">When a page contains HTML, but no renderable text appears (empty page), is this considered a change?</span>
</div>
{% if form.requests.proxy %}
<div class="pure-control-group inline-radio">

View File

@@ -168,7 +168,7 @@
{% if watch.get('restock') and watch['restock']['price'] != None %}
{% if watch['restock']['price'] != None %}
<span class="restock-label price" title="Price">
{{ watch['restock']['price']|format_number_locale }} {% if watch['restock']['currency'] %} {{ watch['restock']['currency'] }}{% endif %}
{{ watch['restock']['price']|format_number_locale }} {{ watch['restock']['currency'] }}
</span>
{% endif %}
{% elif not watch.has_restock_info %}

View File

@@ -1,7 +1,12 @@
#!/usr/bin/env python3
import time
from flask import url_for
from .util import set_original_response, set_modified_response, live_server_setup, wait_for_all_checks
from urllib.request import urlopen
from .util import set_original_response, set_modified_response, live_server_setup
sleep_time_for_fetch_thread = 3
def set_nonrenderable_response():
test_return_data = """<html>
@@ -17,13 +22,6 @@ def set_nonrenderable_response():
return None
def set_zero_byte_response():
with open("test-datastore/endpoint-content.txt", "w") as f:
f.write("")
return None
def test_check_basic_change_detection_functionality(client, live_server, measure_memory_usage):
set_original_response()
live_server_setup(live_server)
@@ -37,11 +35,18 @@ def test_check_basic_change_detection_functionality(client, live_server, measure
assert b"1 Imported" in res.data
wait_for_all_checks(client)
time.sleep(sleep_time_for_fetch_thread)
# It should report nothing found (no new 'unviewed' class)
res = client.get(url_for("index"))
assert b'unviewed' not in res.data
# Do this a few times to ensure we don't accidentally set the status
for n in range(3):
client.get(url_for("form_watch_checknow"), follow_redirects=True)
# Give the thread time to pick it up
time.sleep(sleep_time_for_fetch_thread)
# It should report nothing found (no new 'unviewed' class)
res = client.get(url_for("index"))
assert b'unviewed' not in res.data
#####################
@@ -59,7 +64,7 @@ def test_check_basic_change_detection_functionality(client, live_server, measure
client.get(url_for("form_watch_checknow"), follow_redirects=True)
# Give the thread time to pick it up
wait_for_all_checks(client)
time.sleep(sleep_time_for_fetch_thread)
# It should report nothing found (no new 'unviewed' class)
res = client.get(url_for("index"))
@@ -81,20 +86,14 @@ def test_check_basic_change_detection_functionality(client, live_server, measure
client.get(url_for("form_watch_checknow"), follow_redirects=True)
# Give the thread time to pick it up
wait_for_all_checks(client)
time.sleep(sleep_time_for_fetch_thread)
# It should now report a change (the 'unviewed' class is present)
res = client.get(url_for("index"))
assert b'unviewed' in res.data
client.get(url_for("mark_all_viewed"), follow_redirects=True)
# A totally zero byte (#2528) response should also not trigger an error
set_zero_byte_response()
client.get(url_for("form_watch_checknow"), follow_redirects=True)
wait_for_all_checks(client)
res = client.get(url_for("index"))
assert b'unviewed' in res.data # A change should have registered because empty_pages_are_a_change is ON
assert b'fetch-error' not in res.data
#
# Cleanup everything

View File

@@ -1,5 +1,6 @@
from .processors.exceptions import ProcessorException
import changedetectionio.content_fetchers.exceptions as content_fetchers_exceptions
from . import content_fetchers
from changedetectionio.processors.text_json_diff.processor import FilterNotFoundInResponse
from changedetectionio import html_tools
@@ -300,7 +301,7 @@ class update_worker(threading.Thread):
self.datastore.update_watch(uuid=uuid, update_obj={'last_error': e.message})
process_changedetection_results = False
except content_fetchers_exceptions.ReplyWithContentButNoText as e:
except content_fetchers.exceptions.ReplyWithContentButNoText as e:
# Totally fine, it's by choice - just continue on, nothing more to care about
# Page had elements/content but no renderable text
# Backend (not filters) gave zero output
@@ -326,7 +327,7 @@ class update_worker(threading.Thread):
process_changedetection_results = False
except content_fetchers_exceptions.Non200ErrorCodeReceived as e:
except content_fetchers.exceptions.Non200ErrorCodeReceived as e:
if e.status_code == 403:
err_text = "Error - 403 (Access denied) received"
elif e.status_code == 404:
@@ -379,23 +380,23 @@ class update_worker(threading.Thread):
process_changedetection_results = False
except content_fetchers_exceptions.checksumFromPreviousCheckWasTheSame as e:
except content_fetchers.exceptions.checksumFromPreviousCheckWasTheSame as e:
# Yes fine, so nothing to do, don't continue to process.
process_changedetection_results = False
changed_detected = False
except content_fetchers_exceptions.BrowserConnectError as e:
except content_fetchers.exceptions.BrowserConnectError as e:
self.datastore.update_watch(uuid=uuid,
update_obj={'last_error': e.msg
}
)
process_changedetection_results = False
except content_fetchers_exceptions.BrowserFetchTimedOut as e:
except content_fetchers.exceptions.BrowserFetchTimedOut as e:
self.datastore.update_watch(uuid=uuid,
update_obj={'last_error': e.msg
}
)
process_changedetection_results = False
except content_fetchers_exceptions.BrowserStepsStepException as e:
except content_fetchers.exceptions.BrowserStepsStepException as e:
if not self.datastore.data['watching'].get(uuid):
continue
@@ -437,25 +438,25 @@ class update_worker(threading.Thread):
process_changedetection_results = False
except content_fetchers_exceptions.EmptyReply as e:
except content_fetchers.exceptions.EmptyReply as e:
# Some kind of custom to-str handler in the exception handler that does this?
err_text = "EmptyReply - try increasing 'Wait seconds before extracting text', Status Code {}".format(e.status_code)
self.datastore.update_watch(uuid=uuid, update_obj={'last_error': err_text,
'last_check_status': e.status_code})
process_changedetection_results = False
except content_fetchers_exceptions.ScreenshotUnavailable as e:
except content_fetchers.exceptions.ScreenshotUnavailable as e:
err_text = "Screenshot unavailable, page did not render fully in the expected time or page was too long - try increasing 'Wait seconds before extracting text'"
self.datastore.update_watch(uuid=uuid, update_obj={'last_error': err_text,
'last_check_status': e.status_code})
process_changedetection_results = False
except content_fetchers_exceptions.JSActionExceptions as e:
except content_fetchers.exceptions.JSActionExceptions as e:
err_text = "Error running JS Actions - Page request - "+e.message
if e.screenshot:
watch.save_screenshot(screenshot=e.screenshot, as_error=True)
self.datastore.update_watch(uuid=uuid, update_obj={'last_error': err_text,
'last_check_status': e.status_code})
process_changedetection_results = False
except content_fetchers_exceptions.PageUnloadable as e:
except content_fetchers.exceptions.PageUnloadable as e:
err_text = "Page request from server didnt respond correctly"
if e.message:
err_text = "{} - {}".format(err_text, e.message)
@@ -467,7 +468,7 @@ class update_worker(threading.Thread):
'last_check_status': e.status_code,
'has_ldjson_price_data': None})
process_changedetection_results = False
except content_fetchers_exceptions.BrowserStepsInUnsupportedFetcher as e:
except content_fetchers.exceptions.BrowserStepsInUnsupportedFetcher as e:
err_text = "This watch has Browser Steps configured and so it cannot run with the 'Basic fast Plaintext/HTTP Client', either remove the Browser Steps or select a Chrome fetcher."
self.datastore.update_watch(uuid=uuid, update_obj={'last_error': err_text})
process_changedetection_results = False

View File

@@ -92,6 +92,3 @@ babel
# Needed for > 3.10, https://github.com/microsoft/playwright-python/issues/2096
greenlet >= 3.0.3
# Our own plugins
changedetection.io-amazon-price-scraper>=0.03