mirror of
https://github.com/dgtlmoon/changedetection.io.git
synced 2025-11-08 02:26:31 +00:00
Compare commits
2 Commits
total-byte
...
restock-pl
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
29a1651ae1 | ||
|
|
321ab19ffb |
@@ -2,7 +2,7 @@
|
||||
|
||||
# Read more https://github.com/dgtlmoon/changedetection.io/wiki
|
||||
|
||||
__version__ = '0.46.04'
|
||||
__version__ = '0.46.02'
|
||||
|
||||
from changedetectionio.strtobool import strtobool
|
||||
from json.decoder import JSONDecodeError
|
||||
|
||||
@@ -85,8 +85,7 @@ def construct_blueprint(datastore: ChangeDetectionStore):
|
||||
browsersteps_start_session['browserstepper'] = browser_steps.browsersteps_live_ui(
|
||||
playwright_browser=browsersteps_start_session['browser'],
|
||||
proxy=proxy,
|
||||
start_url=datastore.data['watching'][watch_uuid].get('url'),
|
||||
headers=datastore.data['watching'][watch_uuid].get('headers')
|
||||
start_url=datastore.data['watching'][watch_uuid].get('url')
|
||||
)
|
||||
|
||||
# For test
|
||||
|
||||
@@ -51,7 +51,6 @@ class Fetcher():
|
||||
instock_data = None
|
||||
instock_data_js = ""
|
||||
status_code = None
|
||||
total_bytes = None
|
||||
webdriver_js_execute_code = None
|
||||
xpath_data = None
|
||||
xpath_element_js = ""
|
||||
@@ -66,8 +65,8 @@ class Fetcher():
|
||||
|
||||
def __init__(self):
|
||||
import importlib.resources
|
||||
self.xpath_element_js = importlib.resources.files("changedetectionio.content_fetchers.res").joinpath('xpath_element_scraper.js').read_text(encoding='utf-8')
|
||||
self.instock_data_js = importlib.resources.files("changedetectionio.content_fetchers.res").joinpath('stock-not-in-stock.js').read_text(encoding='utf-8')
|
||||
self.xpath_element_js = importlib.resources.files("changedetectionio.content_fetchers.res").joinpath('xpath_element_scraper.js').read_text()
|
||||
self.instock_data_js = importlib.resources.files("changedetectionio.content_fetchers.res").joinpath('stock-not-in-stock.js').read_text()
|
||||
|
||||
@abstractmethod
|
||||
def get_error(self):
|
||||
|
||||
@@ -12,27 +12,6 @@ from changedetectionio.content_fetchers.base import Fetcher
|
||||
class fetcher(Fetcher):
|
||||
fetcher_description = "Basic fast Plaintext/HTTP Client"
|
||||
|
||||
def get_total_bytes_received(self, response):
|
||||
# Calculate the size of the response content
|
||||
content_size = len(response.content)
|
||||
# Calculate the size of the response headers
|
||||
headers_size = sum(len(k) + len(v) for k, v in response.headers.items()) + len(response.headers) * 4 # adding 4 for ': ' and '\r\n'
|
||||
|
||||
# Total bytes received
|
||||
total_received = content_size + headers_size
|
||||
return total_received
|
||||
|
||||
def get_total_bytes_transferred(self, request):
|
||||
# Calculate the size of the request headers
|
||||
headers_size = sum(len(k) + len(v) for k, v in request.headers.items()) + len(request.headers) * 4 # adding 4 for ': ' and '\r\n'
|
||||
|
||||
# Calculate the size of the request body, if any
|
||||
body_size = len(request.body or '')
|
||||
|
||||
# Total bytes transferred (request + response)
|
||||
total_transferred = headers_size + body_size
|
||||
return total_transferred
|
||||
|
||||
def __init__(self, proxy_override=None, custom_browser_connection_url=None):
|
||||
super().__init__()
|
||||
self.proxy_override = proxy_override
|
||||
@@ -81,10 +60,6 @@ class fetcher(Fetcher):
|
||||
proxies=proxies,
|
||||
verify=False)
|
||||
|
||||
total_received = self.get_total_bytes_received(response=r)
|
||||
request_prepared = r.request
|
||||
self.total_bytes = self.get_total_bytes_transferred(request_prepared) + total_received
|
||||
|
||||
# If the response did not tell us what encoding format to expect, Then use chardet to override what `requests` thinks.
|
||||
# For example - some sites don't tell us it's utf-8, but return utf-8 content
|
||||
# This seems to not occur when using webdriver/selenium, it seems to detect the text encoding more reliably.
|
||||
|
||||
@@ -75,7 +75,6 @@ function isItemInStock() {
|
||||
'vergriffen',
|
||||
'vorbestellen',
|
||||
'vorbestellung ist bald möglich',
|
||||
'we don\'t currently have any',
|
||||
'we couldn\'t find any products that match',
|
||||
'we do not currently have an estimate of when this product will be back in stock.',
|
||||
'we don\'t know when or if this item will be back in stock.',
|
||||
@@ -174,8 +173,7 @@ function isItemInStock() {
|
||||
const element = elementsToScan[i];
|
||||
// outside the 'fold' or some weird text in the heading area
|
||||
// .getBoundingClientRect() was causing a crash in chrome 119, can only be run on contentVisibility != hidden
|
||||
// Note: theres also an automated test that places the 'out of stock' text fairly low down
|
||||
if (element.getBoundingClientRect().top + window.scrollY >= vh + 250 || element.getBoundingClientRect().top + window.scrollY <= 100) {
|
||||
if (element.getBoundingClientRect().top + window.scrollY >= vh + 150 || element.getBoundingClientRect().top + window.scrollY <= 100) {
|
||||
continue
|
||||
}
|
||||
elementText = "";
|
||||
@@ -189,7 +187,7 @@ function isItemInStock() {
|
||||
// and these mean its out of stock
|
||||
for (const outOfStockText of outOfStockTexts) {
|
||||
if (elementText.includes(outOfStockText)) {
|
||||
console.log(`Selected 'Out of Stock' - found text "${outOfStockText}" - "${elementText}" - offset top ${element.getBoundingClientRect().top}, page height is ${vh}`)
|
||||
console.log(`Selected 'Out of Stock' - found text "${outOfStockText}" - "${elementText}"`)
|
||||
return outOfStockText; // item is out of stock
|
||||
}
|
||||
}
|
||||
|
||||
@@ -164,15 +164,6 @@ visibleElementsArray.forEach(function (element) {
|
||||
}
|
||||
}
|
||||
|
||||
let label = "not-interesting" // A placeholder, the actual labels for training are done by hand for now
|
||||
|
||||
let text = element.textContent.trim().slice(0, 30).trim();
|
||||
while (/\n{2,}|\t{2,}/.test(text)) {
|
||||
text = text.replace(/\n{2,}/g, '\n').replace(/\t{2,}/g, '\t')
|
||||
}
|
||||
|
||||
// Try to identify any possible currency amounts "Sale: 4000" or "Sale now 3000 Kc", can help with the training.
|
||||
const hasDigitCurrency = (/\d/.test(text.slice(0, 6)) || /\d/.test(text.slice(-6)) ) && /([€£$¥₩₹]|USD|AUD|EUR|Kč|kr|SEK|,–)/.test(text) ;
|
||||
|
||||
size_pos.push({
|
||||
xpath: xpath_result,
|
||||
@@ -180,16 +171,9 @@ visibleElementsArray.forEach(function (element) {
|
||||
height: Math.round(bbox['height']),
|
||||
left: Math.floor(bbox['left']),
|
||||
top: Math.floor(bbox['top']) + scroll_y,
|
||||
// tagName used by Browser Steps
|
||||
tagName: (element.tagName) ? element.tagName.toLowerCase() : '',
|
||||
// tagtype used by Browser Steps
|
||||
tagtype: (element.tagName.toLowerCase() === 'input' && element.type) ? element.type.toLowerCase() : '',
|
||||
isClickable: window.getComputedStyle(element).cursor === "pointer",
|
||||
// Used by the keras trainer
|
||||
fontSize: window.getComputedStyle(element).getPropertyValue('font-size'),
|
||||
fontWeight: window.getComputedStyle(element).getPropertyValue('font-weight'),
|
||||
hasDigitCurrency: hasDigitCurrency,
|
||||
label: label,
|
||||
isClickable: window.getComputedStyle(element).cursor == "pointer"
|
||||
});
|
||||
|
||||
});
|
||||
|
||||
@@ -1377,19 +1377,17 @@ def changedetection_app(config=None, datastore_o=None):
|
||||
import brotli
|
||||
|
||||
watch = datastore.data['watching'].get(uuid)
|
||||
if watch and watch.history.keys() and os.path.isdir(watch.watch_data_dir):
|
||||
latest_filename = list(watch.history.keys())[-1]
|
||||
if watch and os.path.isdir(watch.watch_data_dir):
|
||||
latest_filename = list(watch.history.keys())[0]
|
||||
html_fname = os.path.join(watch.watch_data_dir, f"{latest_filename}.html.br")
|
||||
with open(html_fname, 'rb') as f:
|
||||
if html_fname.endswith('.br'):
|
||||
# Read and decompress the Brotli file
|
||||
if html_fname.endswith('.br'):
|
||||
# Read and decompress the Brotli file
|
||||
with open(html_fname, 'rb') as f:
|
||||
decompressed_data = brotli.decompress(f.read())
|
||||
else:
|
||||
decompressed_data = f.read()
|
||||
|
||||
buffer = BytesIO(decompressed_data)
|
||||
buffer = BytesIO(decompressed_data)
|
||||
|
||||
return send_file(buffer, as_attachment=True, download_name=f"{latest_filename}.html", mimetype='text/html')
|
||||
return send_file(buffer, as_attachment=True, download_name=f"{latest_filename}.html", mimetype='text/html')
|
||||
|
||||
|
||||
# Return a 500 error
|
||||
|
||||
@@ -1,12 +1,11 @@
|
||||
|
||||
from babel.numbers import parse_decimal
|
||||
from changedetectionio.model.Watch import model as BaseWatch
|
||||
from typing import Union
|
||||
import re
|
||||
from babel.numbers import parse_decimal
|
||||
|
||||
class Restock(dict):
|
||||
|
||||
def parse_currency(self, raw_value: str) -> Union[float, None]:
|
||||
def parse_currency(self, raw_value: str) -> float:
|
||||
# Clean and standardize the value (ie 1,400.00 should be 1400.00), even better would be store the whole thing as an integer.
|
||||
standardized_value = raw_value
|
||||
|
||||
@@ -22,11 +21,8 @@ class Restock(dict):
|
||||
# Remove any non-numeric characters except for the decimal point
|
||||
standardized_value = re.sub(r'[^\d.-]', '', standardized_value)
|
||||
|
||||
if standardized_value:
|
||||
# Convert to float
|
||||
return float(parse_decimal(standardized_value, locale='en'))
|
||||
|
||||
return None
|
||||
# Convert to float
|
||||
return float(parse_decimal(standardized_value, locale='en'))
|
||||
|
||||
def __init__(self, *args, **kwargs):
|
||||
# Define default values
|
||||
|
||||
23
changedetectionio/processors/restock_diff/hookspecs.py
Normal file
23
changedetectionio/processors/restock_diff/hookspecs.py
Normal file
@@ -0,0 +1,23 @@
|
||||
import pluggy
|
||||
from typing import Dict
|
||||
from changedetectionio.model import Watch as Watch
|
||||
|
||||
plugin_namespace = "changedetectionio.restock_price_scraper"
|
||||
hookspec = pluggy.HookspecMarker(plugin_namespace)
|
||||
|
||||
class HookSpec:
|
||||
@hookspec
|
||||
def scrape_price_restock(self, watch: Watch.model, html_content: str, screenshot: bytes, update_obj: Dict) -> Dict:
|
||||
"""
|
||||
Scrape price and restock data from html_content and/or screenshot and return via update_obj
|
||||
|
||||
Args:
|
||||
watch (Watch.model): The watch object containing watch configuration.
|
||||
html_content (str): The HTML content to scrape.
|
||||
screenshot (bytes): The screenshot data.
|
||||
update_obj (Dict): The dictionary to update with scraped data.
|
||||
|
||||
Returns:
|
||||
Optional[Dict]: The updated dictionary with the scraped price data, or None if no update is made.
|
||||
"""
|
||||
|
||||
17
changedetectionio/processors/restock_diff/plugin_manager.py
Normal file
17
changedetectionio/processors/restock_diff/plugin_manager.py
Normal file
@@ -0,0 +1,17 @@
|
||||
import pluggy
|
||||
from .hookspecs import HookSpec
|
||||
import importlib.metadata
|
||||
|
||||
# Define the plugin namespace
|
||||
plugin_namespace = "changedetectionio.restock_price_scraper"
|
||||
|
||||
# Create a pluggy.PluginManager instance
|
||||
pm = pluggy.PluginManager(plugin_namespace)
|
||||
|
||||
# Register the hook specifications
|
||||
pm.add_hookspecs(HookSpec)
|
||||
|
||||
# Automatically discover and register plugins using entry points
|
||||
for entry_point in importlib.metadata.entry_points().get(plugin_namespace, []):
|
||||
plugin = entry_point.load()
|
||||
pm.register(plugin())
|
||||
@@ -40,16 +40,13 @@ def get_itemprop_availability(html_content) -> Restock:
|
||||
import extruct
|
||||
logger.trace(f"Imported extruct module in {time.time() - now:.3f}s")
|
||||
|
||||
value = {}
|
||||
now = time.time()
|
||||
|
||||
# Extruct is very slow, I'm wondering if some ML is going to be faster (800ms on my i7), 'rdfa' seems to be the heaviest.
|
||||
syntaxes = ['dublincore', 'json-ld', 'microdata', 'microformat', 'opengraph']
|
||||
try:
|
||||
data = extruct.extract(html_content, syntaxes=syntaxes)
|
||||
except Exception as e:
|
||||
logger.warning(f"Unable to extract data, document parsing with extruct failed with {type(e).__name__} - {str(e)}")
|
||||
return Restock()
|
||||
|
||||
syntaxes = ['dublincore', 'json-ld', 'microdata', 'microformat', 'opengraph']
|
||||
|
||||
data = extruct.extract(html_content, syntaxes=syntaxes)
|
||||
logger.trace(f"Extruct basic extract of all metadata done in {time.time() - now:.3f}s")
|
||||
|
||||
# First phase, dead simple scanning of anything that looks useful
|
||||
@@ -122,6 +119,8 @@ class perform_site_check(difference_detection_processor):
|
||||
xpath_data = None
|
||||
|
||||
def run_changedetection(self, watch, skip_when_checksum_same=True):
|
||||
from .plugin_manager import pm
|
||||
|
||||
if not watch:
|
||||
raise Exception("Watch no longer exists.")
|
||||
|
||||
@@ -201,6 +200,19 @@ class perform_site_check(difference_detection_processor):
|
||||
update_obj['restock']["in_stock"] = True if self.fetcher.instock_data == 'Possibly in stock' else False
|
||||
logger.debug(f"Watch UUID {watch.get('uuid')} restock check returned '{self.fetcher.instock_data}' from JS scraper.")
|
||||
|
||||
# Ask any "changedetectionio.restock_price_scraper" namespace plugins if they can add something
|
||||
# (Should return an updated 'update_obj')
|
||||
plugin_price_scraping = pm.hook.scrape_price_restock(watch=watch,
|
||||
html_content=self.fetcher.content,
|
||||
screenshot=self.fetcher.screenshot,
|
||||
update_obj=update_obj)
|
||||
if plugin_price_scraping:
|
||||
for plugin_result in plugin_price_scraping:
|
||||
update_obj.update(plugin_result)
|
||||
if plugin_result.get('restock'):
|
||||
update_obj['restock'].update(plugin_result.get('restock'))
|
||||
|
||||
|
||||
# What we store in the snapshot
|
||||
price = update_obj.get('restock').get('price') if update_obj.get('restock').get('price') else ""
|
||||
snapshot_content = f"In Stock: {update_obj.get('restock').get('in_stock')} - Price: {price}"
|
||||
|
||||
@@ -168,7 +168,7 @@
|
||||
{% if watch.get('restock') and watch['restock']['price'] != None %}
|
||||
{% if watch['restock']['price'] != None %}
|
||||
<span class="restock-label price" title="Price">
|
||||
{{ watch['restock']['price']|format_number_locale }} {{ watch['restock']['currency'] }}
|
||||
{{ watch['restock']['price']|format_number_locale }} {% if watch['restock']['currency'] %} {{ watch['restock']['currency'] }}{% endif %}
|
||||
</span>
|
||||
{% endif %}
|
||||
{% elif not watch.has_restock_info %}
|
||||
|
||||
@@ -69,12 +69,6 @@ def test_check_basic_change_detection_functionality(client, live_server, measure
|
||||
|
||||
wait_for_all_checks(client)
|
||||
|
||||
uuid = extract_UUID_from_client(client)
|
||||
|
||||
# Check the 'get latest snapshot works'
|
||||
res = client.get(url_for("watch_get_latest_html", uuid=uuid))
|
||||
assert b'which has this one new line' in res.data
|
||||
|
||||
# Now something should be ready, indicated by having a 'unviewed' class
|
||||
res = client.get(url_for("index"))
|
||||
assert b'unviewed' in res.data
|
||||
@@ -92,7 +86,7 @@ def test_check_basic_change_detection_functionality(client, live_server, measure
|
||||
assert expected_url.encode('utf-8') in res.data
|
||||
|
||||
# Following the 'diff' link, it should no longer display as 'unviewed' even after we recheck it a few times
|
||||
res = client.get(url_for("diff_history_page", uuid=uuid))
|
||||
res = client.get(url_for("diff_history_page", uuid="first"))
|
||||
assert b'selected=""' in res.data, "Confirm diff history page loaded"
|
||||
|
||||
# Check the [preview] pulls the right one
|
||||
@@ -149,12 +143,18 @@ def test_check_basic_change_detection_functionality(client, live_server, measure
|
||||
assert b'unviewed' not in res.data
|
||||
|
||||
# #2458 "clear history" should make the Watch object update its status correctly when the first snapshot lands again
|
||||
uuid = extract_UUID_from_client(client)
|
||||
client.get(url_for("clear_watch_history", uuid=uuid))
|
||||
client.get(url_for("form_watch_checknow"), follow_redirects=True)
|
||||
wait_for_all_checks(client)
|
||||
res = client.get(url_for("index"))
|
||||
assert b'preview/' in res.data
|
||||
|
||||
|
||||
# Check the 'get latest snapshot works'
|
||||
res = client.get(url_for("watch_get_latest_html", uuid=uuid))
|
||||
assert b'<head><title>head title</title></head>' in res.data
|
||||
|
||||
#
|
||||
# Cleanup everything
|
||||
res = client.get(url_for("form_delete", uuid="all"), follow_redirects=True)
|
||||
|
||||
@@ -18,7 +18,7 @@ services:
|
||||
#
|
||||
# Log levels are in descending order. (TRACE is the most detailed one)
|
||||
# Log output levels: TRACE, DEBUG(default), INFO, SUCCESS, WARNING, ERROR, CRITICAL
|
||||
# - LOGGER_LEVEL=TRACE
|
||||
# - LOGGER_LEVEL=DEBUG
|
||||
#
|
||||
# Alternative WebDriver/selenium URL, do not use "'s or 's!
|
||||
# - WEBDRIVER_URL=http://browser-chrome:4444/wd/hub
|
||||
@@ -29,9 +29,8 @@ services:
|
||||
#
|
||||
# https://selenium-python.readthedocs.io/api.html#module-selenium.webdriver.common.proxy
|
||||
#
|
||||
# Alternative target "Chrome" Playwright URL, do not use "'s or 's!
|
||||
# "Playwright" is a driver/librarythat allows changedetection to talk to a Chrome or similar browser.
|
||||
# - PLAYWRIGHT_DRIVER_URL=ws://sockpuppetbrowser:3000
|
||||
# Alternative Playwright URL, do not use "'s or 's!
|
||||
# - PLAYWRIGHT_DRIVER_URL=ws://playwright-chrome:3000
|
||||
#
|
||||
# Playwright proxy settings playwright_proxy_server, playwright_proxy_bypass, playwright_proxy_username, playwright_proxy_password
|
||||
#
|
||||
@@ -74,10 +73,10 @@ services:
|
||||
# condition: service_started
|
||||
|
||||
|
||||
# Sockpuppetbrowser is basically chrome wrapped in an API for allowing fast fetching of web-pages.
|
||||
# Used for fetching pages via Playwright+Chrome where you need Javascript support.
|
||||
# RECOMMENDED FOR FETCHING PAGES WITH CHROME
|
||||
# sockpuppetbrowser:
|
||||
# hostname: sockpuppetbrowser
|
||||
# playwright-chrome:
|
||||
# hostname: playwright-chrome
|
||||
# image: dgtlmoon/sockpuppetbrowser:latest
|
||||
# cap_add:
|
||||
# - SYS_ADMIN
|
||||
|
||||
@@ -79,9 +79,8 @@ pyppeteerstealth>=0.0.4
|
||||
pytest ~=7.2
|
||||
pytest-flask ~=1.2
|
||||
|
||||
# Anything 4.0 and up but not 5.0
|
||||
jsonschema ~= 4.0
|
||||
|
||||
# Pin jsonschema version to prevent build errors on armv6 while rpds-py wheels aren't available (1708)
|
||||
jsonschema==4.17.3
|
||||
|
||||
loguru
|
||||
|
||||
@@ -93,3 +92,6 @@ babel
|
||||
|
||||
# Needed for > 3.10, https://github.com/microsoft/playwright-python/issues/2096
|
||||
greenlet >= 3.0.3
|
||||
|
||||
# Our own plugins
|
||||
changedetection.io-amazon-price-scraper>=0.03
|
||||
Reference in New Issue
Block a user