mirror of
https://github.com/dgtlmoon/changedetection.io.git
synced 2025-11-14 05:26:09 +00:00
Compare commits
3 Commits
2528-empty
...
restock-pl
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
29a1651ae1 | ||
|
|
321ab19ffb | ||
|
|
8a35d62e02 |
@@ -81,7 +81,8 @@ class Fetcher():
|
||||
request_method,
|
||||
ignore_status_codes=False,
|
||||
current_include_filters=None,
|
||||
is_binary=False):
|
||||
is_binary=False,
|
||||
empty_pages_are_a_change=False):
|
||||
# Should set self.error, self.status_code and self.content
|
||||
pass
|
||||
|
||||
|
||||
@@ -83,7 +83,8 @@ class fetcher(Fetcher):
|
||||
request_method,
|
||||
ignore_status_codes=False,
|
||||
current_include_filters=None,
|
||||
is_binary=False):
|
||||
is_binary=False,
|
||||
empty_pages_are_a_change=False):
|
||||
|
||||
from playwright.sync_api import sync_playwright
|
||||
import playwright._impl._errors
|
||||
@@ -130,7 +131,7 @@ class fetcher(Fetcher):
|
||||
if response is None:
|
||||
context.close()
|
||||
browser.close()
|
||||
logger.debug("Content Fetcher > Response object was none")
|
||||
logger.debug("Content Fetcher > Response object from the browser communication was none")
|
||||
raise EmptyReply(url=url, status_code=None)
|
||||
|
||||
try:
|
||||
@@ -166,10 +167,10 @@ class fetcher(Fetcher):
|
||||
|
||||
raise Non200ErrorCodeReceived(url=url, status_code=self.status_code, screenshot=screenshot)
|
||||
|
||||
if len(self.page.content().strip()) == 0:
|
||||
if not empty_pages_are_a_change and len(self.page.content().strip()) == 0:
|
||||
logger.debug("Content Fetcher > Content was empty, empty_pages_are_a_change = False")
|
||||
context.close()
|
||||
browser.close()
|
||||
logger.debug("Content Fetcher > Content was empty")
|
||||
raise EmptyReply(url=url, status_code=response.status)
|
||||
|
||||
# Run Browser Steps here
|
||||
|
||||
@@ -75,7 +75,8 @@ class fetcher(Fetcher):
|
||||
request_method,
|
||||
ignore_status_codes,
|
||||
current_include_filters,
|
||||
is_binary
|
||||
is_binary,
|
||||
empty_pages_are_a_change
|
||||
):
|
||||
|
||||
from changedetectionio.content_fetchers import visualselector_xpath_selectors
|
||||
@@ -153,7 +154,7 @@ class fetcher(Fetcher):
|
||||
if response is None:
|
||||
await self.page.close()
|
||||
await browser.close()
|
||||
logger.warning("Content Fetcher > Response object was none")
|
||||
logger.warning("Content Fetcher > Response object was none (as in, the response from the browser was empty, not just the content)")
|
||||
raise EmptyReply(url=url, status_code=None)
|
||||
|
||||
self.headers = response.headers
|
||||
@@ -186,10 +187,11 @@ class fetcher(Fetcher):
|
||||
|
||||
raise Non200ErrorCodeReceived(url=url, status_code=self.status_code, screenshot=screenshot)
|
||||
content = await self.page.content
|
||||
if len(content.strip()) == 0:
|
||||
|
||||
if not empty_pages_are_a_change and len(content.strip()) == 0:
|
||||
logger.error("Content Fetcher > Content was empty (empty_pages_are_a_change is False), closing browsers")
|
||||
await self.page.close()
|
||||
await browser.close()
|
||||
logger.error("Content Fetcher > Content was empty")
|
||||
raise EmptyReply(url=url, status_code=response.status)
|
||||
|
||||
# Run Browser Steps here
|
||||
@@ -247,7 +249,7 @@ class fetcher(Fetcher):
|
||||
await self.fetch_page(**kwargs)
|
||||
|
||||
def run(self, url, timeout, request_headers, request_body, request_method, ignore_status_codes=False,
|
||||
current_include_filters=None, is_binary=False):
|
||||
current_include_filters=None, is_binary=False, empty_pages_are_a_change=False):
|
||||
|
||||
#@todo make update_worker async which could run any of these content_fetchers within memory and time constraints
|
||||
max_time = os.getenv('PUPPETEER_MAX_PROCESSING_TIMEOUT_SECONDS', 180)
|
||||
@@ -262,7 +264,8 @@ class fetcher(Fetcher):
|
||||
request_method=request_method,
|
||||
ignore_status_codes=ignore_status_codes,
|
||||
current_include_filters=current_include_filters,
|
||||
is_binary=is_binary
|
||||
is_binary=is_binary,
|
||||
empty_pages_are_a_change=empty_pages_are_a_change
|
||||
), timeout=max_time))
|
||||
except asyncio.TimeoutError:
|
||||
raise(BrowserFetchTimedOut(msg=f"Browser connected but was unable to process the page in {max_time} seconds."))
|
||||
|
||||
@@ -1,9 +1,8 @@
|
||||
from loguru import logger
|
||||
import chardet
|
||||
import hashlib
|
||||
import os
|
||||
|
||||
import chardet
|
||||
import requests
|
||||
|
||||
from changedetectionio import strtobool
|
||||
from changedetectionio.content_fetchers.exceptions import BrowserStepsInUnsupportedFetcher, EmptyReply, Non200ErrorCodeReceived
|
||||
from changedetectionio.content_fetchers.base import Fetcher
|
||||
@@ -26,7 +25,8 @@ class fetcher(Fetcher):
|
||||
request_method,
|
||||
ignore_status_codes=False,
|
||||
current_include_filters=None,
|
||||
is_binary=False):
|
||||
is_binary=False,
|
||||
empty_pages_are_a_change=False):
|
||||
|
||||
if self.browser_steps_get_valid_steps():
|
||||
raise BrowserStepsInUnsupportedFetcher(url=url)
|
||||
@@ -74,7 +74,10 @@ class fetcher(Fetcher):
|
||||
self.headers = r.headers
|
||||
|
||||
if not r.content or not len(r.content):
|
||||
raise EmptyReply(url=url, status_code=r.status_code)
|
||||
if not empty_pages_are_a_change:
|
||||
raise EmptyReply(url=url, status_code=r.status_code)
|
||||
else:
|
||||
logger.debug(f"URL {url} gave zero byte content reply with Status Code {r.status_code}, but empty_pages_are_a_change = True")
|
||||
|
||||
# @todo test this
|
||||
# @todo maybe you really want to test zero-byte return pages?
|
||||
|
||||
@@ -56,7 +56,8 @@ class fetcher(Fetcher):
|
||||
request_method,
|
||||
ignore_status_codes=False,
|
||||
current_include_filters=None,
|
||||
is_binary=False):
|
||||
is_binary=False,
|
||||
empty_pages_are_a_change=False):
|
||||
|
||||
from selenium import webdriver
|
||||
from selenium.webdriver.chrome.options import Options as ChromeOptions
|
||||
|
||||
@@ -26,6 +26,8 @@ class difference_detection_processor():
|
||||
|
||||
def call_browser(self):
|
||||
from requests.structures import CaseInsensitiveDict
|
||||
from changedetectionio.content_fetchers.exceptions import EmptyReply
|
||||
|
||||
# Protect against file:// access
|
||||
if re.search(r'^file://', self.watch.get('url', '').strip(), re.IGNORECASE):
|
||||
if not strtobool(os.getenv('ALLOW_FILE_URI', 'false')):
|
||||
@@ -133,8 +135,18 @@ class difference_detection_processor():
|
||||
is_binary = self.watch.is_pdf
|
||||
|
||||
# And here we go! call the right browser with browser-specific settings
|
||||
self.fetcher.run(url, timeout, request_headers, request_body, request_method, ignore_status_codes, self.watch.get('include_filters'),
|
||||
is_binary=is_binary)
|
||||
empty_pages_are_a_change = self.datastore.data['settings']['application'].get('empty_pages_are_a_change', False)
|
||||
|
||||
self.fetcher.run(url=url,
|
||||
timeout=timeout,
|
||||
request_headers=request_headers,
|
||||
request_body=request_body,
|
||||
request_method=request_method,
|
||||
ignore_status_codes=ignore_status_codes,
|
||||
current_include_filters=self.watch.get('include_filters'),
|
||||
is_binary=is_binary,
|
||||
empty_pages_are_a_change=empty_pages_are_a_change
|
||||
)
|
||||
|
||||
#@todo .quit here could go on close object, so we can run JS if change-detected
|
||||
self.fetcher.quit()
|
||||
|
||||
23
changedetectionio/processors/restock_diff/hookspecs.py
Normal file
23
changedetectionio/processors/restock_diff/hookspecs.py
Normal file
@@ -0,0 +1,23 @@
|
||||
import pluggy
|
||||
from typing import Dict
|
||||
from changedetectionio.model import Watch as Watch
|
||||
|
||||
plugin_namespace = "changedetectionio.restock_price_scraper"
|
||||
hookspec = pluggy.HookspecMarker(plugin_namespace)
|
||||
|
||||
class HookSpec:
|
||||
@hookspec
|
||||
def scrape_price_restock(self, watch: Watch.model, html_content: str, screenshot: bytes, update_obj: Dict) -> Dict:
|
||||
"""
|
||||
Scrape price and restock data from html_content and/or screenshot and return via update_obj
|
||||
|
||||
Args:
|
||||
watch (Watch.model): The watch object containing watch configuration.
|
||||
html_content (str): The HTML content to scrape.
|
||||
screenshot (bytes): The screenshot data.
|
||||
update_obj (Dict): The dictionary to update with scraped data.
|
||||
|
||||
Returns:
|
||||
Optional[Dict]: The updated dictionary with the scraped price data, or None if no update is made.
|
||||
"""
|
||||
|
||||
17
changedetectionio/processors/restock_diff/plugin_manager.py
Normal file
17
changedetectionio/processors/restock_diff/plugin_manager.py
Normal file
@@ -0,0 +1,17 @@
|
||||
import pluggy
|
||||
from .hookspecs import HookSpec
|
||||
import importlib.metadata
|
||||
|
||||
# Define the plugin namespace
|
||||
plugin_namespace = "changedetectionio.restock_price_scraper"
|
||||
|
||||
# Create a pluggy.PluginManager instance
|
||||
pm = pluggy.PluginManager(plugin_namespace)
|
||||
|
||||
# Register the hook specifications
|
||||
pm.add_hookspecs(HookSpec)
|
||||
|
||||
# Automatically discover and register plugins using entry points
|
||||
for entry_point in importlib.metadata.entry_points().get(plugin_namespace, []):
|
||||
plugin = entry_point.load()
|
||||
pm.register(plugin())
|
||||
@@ -119,6 +119,8 @@ class perform_site_check(difference_detection_processor):
|
||||
xpath_data = None
|
||||
|
||||
def run_changedetection(self, watch, skip_when_checksum_same=True):
|
||||
from .plugin_manager import pm
|
||||
|
||||
if not watch:
|
||||
raise Exception("Watch no longer exists.")
|
||||
|
||||
@@ -198,6 +200,19 @@ class perform_site_check(difference_detection_processor):
|
||||
update_obj['restock']["in_stock"] = True if self.fetcher.instock_data == 'Possibly in stock' else False
|
||||
logger.debug(f"Watch UUID {watch.get('uuid')} restock check returned '{self.fetcher.instock_data}' from JS scraper.")
|
||||
|
||||
# Ask any "changedetectionio.restock_price_scraper" namespace plugins if they can add something
|
||||
# (Should return an updated 'update_obj')
|
||||
plugin_price_scraping = pm.hook.scrape_price_restock(watch=watch,
|
||||
html_content=self.fetcher.content,
|
||||
screenshot=self.fetcher.screenshot,
|
||||
update_obj=update_obj)
|
||||
if plugin_price_scraping:
|
||||
for plugin_result in plugin_price_scraping:
|
||||
update_obj.update(plugin_result)
|
||||
if plugin_result.get('restock'):
|
||||
update_obj['restock'].update(plugin_result.get('restock'))
|
||||
|
||||
|
||||
# What we store in the snapshot
|
||||
price = update_obj.get('restock').get('price') if update_obj.get('restock').get('price') else ""
|
||||
snapshot_content = f"In Stock: {update_obj.get('restock').get('in_stock')} - Price: {price}"
|
||||
|
||||
@@ -76,7 +76,7 @@
|
||||
</div>
|
||||
<div class="pure-control-group">
|
||||
{{ render_checkbox_field(form.application.form.empty_pages_are_a_change) }}
|
||||
<span class="pure-form-message-inline">When a page contains HTML, but no renderable text appears (empty page), is this considered a change?</span>
|
||||
<span class="pure-form-message-inline">When a request returns no content, or the HTML does not contain any text, is this considered a change?</span>
|
||||
</div>
|
||||
{% if form.requests.proxy %}
|
||||
<div class="pure-control-group inline-radio">
|
||||
|
||||
@@ -168,7 +168,7 @@
|
||||
{% if watch.get('restock') and watch['restock']['price'] != None %}
|
||||
{% if watch['restock']['price'] != None %}
|
||||
<span class="restock-label price" title="Price">
|
||||
{{ watch['restock']['price']|format_number_locale }} {{ watch['restock']['currency'] }}
|
||||
{{ watch['restock']['price']|format_number_locale }} {% if watch['restock']['currency'] %} {{ watch['restock']['currency'] }}{% endif %}
|
||||
</span>
|
||||
{% endif %}
|
||||
{% elif not watch.has_restock_info %}
|
||||
|
||||
@@ -1,12 +1,7 @@
|
||||
#!/usr/bin/env python3
|
||||
|
||||
import time
|
||||
from flask import url_for
|
||||
from urllib.request import urlopen
|
||||
from .util import set_original_response, set_modified_response, live_server_setup
|
||||
|
||||
sleep_time_for_fetch_thread = 3
|
||||
|
||||
from .util import set_original_response, set_modified_response, live_server_setup, wait_for_all_checks
|
||||
|
||||
def set_nonrenderable_response():
|
||||
test_return_data = """<html>
|
||||
@@ -22,6 +17,13 @@ def set_nonrenderable_response():
|
||||
|
||||
return None
|
||||
|
||||
def set_zero_byte_response():
|
||||
|
||||
with open("test-datastore/endpoint-content.txt", "w") as f:
|
||||
f.write("")
|
||||
|
||||
return None
|
||||
|
||||
def test_check_basic_change_detection_functionality(client, live_server, measure_memory_usage):
|
||||
set_original_response()
|
||||
live_server_setup(live_server)
|
||||
@@ -35,18 +37,11 @@ def test_check_basic_change_detection_functionality(client, live_server, measure
|
||||
|
||||
assert b"1 Imported" in res.data
|
||||
|
||||
time.sleep(sleep_time_for_fetch_thread)
|
||||
wait_for_all_checks(client)
|
||||
|
||||
# Do this a few times.. ensures we dont accidently set the status
|
||||
for n in range(3):
|
||||
client.get(url_for("form_watch_checknow"), follow_redirects=True)
|
||||
|
||||
# Give the thread time to pick it up
|
||||
time.sleep(sleep_time_for_fetch_thread)
|
||||
|
||||
# It should report nothing found (no new 'unviewed' class)
|
||||
res = client.get(url_for("index"))
|
||||
assert b'unviewed' not in res.data
|
||||
# It should report nothing found (no new 'unviewed' class)
|
||||
res = client.get(url_for("index"))
|
||||
assert b'unviewed' not in res.data
|
||||
|
||||
|
||||
#####################
|
||||
@@ -64,7 +59,7 @@ def test_check_basic_change_detection_functionality(client, live_server, measure
|
||||
client.get(url_for("form_watch_checknow"), follow_redirects=True)
|
||||
|
||||
# Give the thread time to pick it up
|
||||
time.sleep(sleep_time_for_fetch_thread)
|
||||
wait_for_all_checks(client)
|
||||
|
||||
# It should report nothing found (no new 'unviewed' class)
|
||||
res = client.get(url_for("index"))
|
||||
@@ -86,14 +81,20 @@ def test_check_basic_change_detection_functionality(client, live_server, measure
|
||||
client.get(url_for("form_watch_checknow"), follow_redirects=True)
|
||||
|
||||
# Give the thread time to pick it up
|
||||
time.sleep(sleep_time_for_fetch_thread)
|
||||
wait_for_all_checks(client)
|
||||
|
||||
# It should report nothing found (no new 'unviewed' class)
|
||||
res = client.get(url_for("index"))
|
||||
assert b'unviewed' in res.data
|
||||
client.get(url_for("mark_all_viewed"), follow_redirects=True)
|
||||
|
||||
|
||||
|
||||
# A totally zero byte (#2528) response should also not trigger an error
|
||||
set_zero_byte_response()
|
||||
client.get(url_for("form_watch_checknow"), follow_redirects=True)
|
||||
wait_for_all_checks(client)
|
||||
res = client.get(url_for("index"))
|
||||
assert b'unviewed' in res.data # A change should have registered because empty_pages_are_a_change is ON
|
||||
assert b'fetch-error' not in res.data
|
||||
|
||||
#
|
||||
# Cleanup everything
|
||||
|
||||
@@ -1,6 +1,5 @@
|
||||
from .processors.exceptions import ProcessorException
|
||||
from . import content_fetchers
|
||||
|
||||
import changedetectionio.content_fetchers.exceptions as content_fetchers_exceptions
|
||||
from changedetectionio.processors.text_json_diff.processor import FilterNotFoundInResponse
|
||||
from changedetectionio import html_tools
|
||||
|
||||
@@ -301,7 +300,7 @@ class update_worker(threading.Thread):
|
||||
self.datastore.update_watch(uuid=uuid, update_obj={'last_error': e.message})
|
||||
process_changedetection_results = False
|
||||
|
||||
except content_fetchers.exceptions.ReplyWithContentButNoText as e:
|
||||
except content_fetchers_exceptions.ReplyWithContentButNoText as e:
|
||||
# Totally fine, it's by choice - just continue on, nothing more to care about
|
||||
# Page had elements/content but no renderable text
|
||||
# Backend (not filters) gave zero output
|
||||
@@ -327,7 +326,7 @@ class update_worker(threading.Thread):
|
||||
|
||||
process_changedetection_results = False
|
||||
|
||||
except content_fetchers.exceptions.Non200ErrorCodeReceived as e:
|
||||
except content_fetchers_exceptions.Non200ErrorCodeReceived as e:
|
||||
if e.status_code == 403:
|
||||
err_text = "Error - 403 (Access denied) received"
|
||||
elif e.status_code == 404:
|
||||
@@ -380,23 +379,23 @@ class update_worker(threading.Thread):
|
||||
|
||||
process_changedetection_results = False
|
||||
|
||||
except content_fetchers.exceptions.checksumFromPreviousCheckWasTheSame as e:
|
||||
except content_fetchers_exceptions.checksumFromPreviousCheckWasTheSame as e:
|
||||
# Yes fine, so nothing todo, don't continue to process.
|
||||
process_changedetection_results = False
|
||||
changed_detected = False
|
||||
except content_fetchers.exceptions.BrowserConnectError as e:
|
||||
except content_fetchers_exceptions.BrowserConnectError as e:
|
||||
self.datastore.update_watch(uuid=uuid,
|
||||
update_obj={'last_error': e.msg
|
||||
}
|
||||
)
|
||||
process_changedetection_results = False
|
||||
except content_fetchers.exceptions.BrowserFetchTimedOut as e:
|
||||
except content_fetchers_exceptions.BrowserFetchTimedOut as e:
|
||||
self.datastore.update_watch(uuid=uuid,
|
||||
update_obj={'last_error': e.msg
|
||||
}
|
||||
)
|
||||
process_changedetection_results = False
|
||||
except content_fetchers.exceptions.BrowserStepsStepException as e:
|
||||
except content_fetchers_exceptions.BrowserStepsStepException as e:
|
||||
|
||||
if not self.datastore.data['watching'].get(uuid):
|
||||
continue
|
||||
@@ -438,25 +437,25 @@ class update_worker(threading.Thread):
|
||||
|
||||
process_changedetection_results = False
|
||||
|
||||
except content_fetchers.exceptions.EmptyReply as e:
|
||||
except content_fetchers_exceptions.EmptyReply as e:
|
||||
# Some kind of custom to-str handler in the exception handler that does this?
|
||||
err_text = "EmptyReply - try increasing 'Wait seconds before extracting text', Status Code {}".format(e.status_code)
|
||||
self.datastore.update_watch(uuid=uuid, update_obj={'last_error': err_text,
|
||||
'last_check_status': e.status_code})
|
||||
process_changedetection_results = False
|
||||
except content_fetchers.exceptions.ScreenshotUnavailable as e:
|
||||
except content_fetchers_exceptions.ScreenshotUnavailable as e:
|
||||
err_text = "Screenshot unavailable, page did not render fully in the expected time or page was too long - try increasing 'Wait seconds before extracting text'"
|
||||
self.datastore.update_watch(uuid=uuid, update_obj={'last_error': err_text,
|
||||
'last_check_status': e.status_code})
|
||||
process_changedetection_results = False
|
||||
except content_fetchers.exceptions.JSActionExceptions as e:
|
||||
except content_fetchers_exceptions.JSActionExceptions as e:
|
||||
err_text = "Error running JS Actions - Page request - "+e.message
|
||||
if e.screenshot:
|
||||
watch.save_screenshot(screenshot=e.screenshot, as_error=True)
|
||||
self.datastore.update_watch(uuid=uuid, update_obj={'last_error': err_text,
|
||||
'last_check_status': e.status_code})
|
||||
process_changedetection_results = False
|
||||
except content_fetchers.exceptions.PageUnloadable as e:
|
||||
except content_fetchers_exceptions.PageUnloadable as e:
|
||||
err_text = "Page request from server didnt respond correctly"
|
||||
if e.message:
|
||||
err_text = "{} - {}".format(err_text, e.message)
|
||||
@@ -468,7 +467,7 @@ class update_worker(threading.Thread):
|
||||
'last_check_status': e.status_code,
|
||||
'has_ldjson_price_data': None})
|
||||
process_changedetection_results = False
|
||||
except content_fetchers.exceptions.BrowserStepsInUnsupportedFetcher as e:
|
||||
except content_fetchers_exceptions.BrowserStepsInUnsupportedFetcher as e:
|
||||
err_text = "This watch has Browser Steps configured and so it cannot run with the 'Basic fast Plaintext/HTTP Client', either remove the Browser Steps or select a Chrome fetcher."
|
||||
self.datastore.update_watch(uuid=uuid, update_obj={'last_error': err_text})
|
||||
process_changedetection_results = False
|
||||
|
||||
@@ -92,3 +92,6 @@ babel
|
||||
|
||||
# Needed for > 3.10, https://github.com/microsoft/playwright-python/issues/2096
|
||||
greenlet >= 3.0.3
|
||||
|
||||
# Our own plugins
|
||||
changedetection.io-amazon-price-scraper>=0.03
|
||||
Reference in New Issue
Block a user