Compare commits

..

1 Commits

Author SHA1 Message Date
dgtlmoon
74f3c12d2e Re #2568 Add encoding for scraper script reader 2024-08-18 17:52:25 +02:00
10 changed files with 35 additions and 65 deletions

View File

@@ -2,7 +2,7 @@
# Read more https://github.com/dgtlmoon/changedetection.io/wiki
__version__ = '0.46.03'
__version__ = '0.46.02'
from changedetectionio.strtobool import strtobool
from json.decoder import JSONDecodeError

View File

@@ -85,8 +85,7 @@ def construct_blueprint(datastore: ChangeDetectionStore):
browsersteps_start_session['browserstepper'] = browser_steps.browsersteps_live_ui(
playwright_browser=browsersteps_start_session['browser'],
proxy=proxy,
start_url=datastore.data['watching'][watch_uuid].get('url'),
headers=datastore.data['watching'][watch_uuid].get('headers')
start_url=datastore.data['watching'][watch_uuid].get('url')
)
# For test

View File

@@ -75,7 +75,6 @@ function isItemInStock() {
'vergriffen',
'vorbestellen',
'vorbestellung ist bald möglich',
'we don\'t currently have any',
'we couldn\'t find any products that match',
'we do not currently have an estimate of when this product will be back in stock.',
'we don\'t know when or if this item will be back in stock.',
@@ -174,8 +173,7 @@ function isItemInStock() {
const element = elementsToScan[i];
// outside the 'fold' or some weird text in the heading area
// .getBoundingClientRect() was causing a crash in chrome 119, can only be run on contentVisibility != hidden
// Note: theres also an automated test that places the 'out of stock' text fairly low down
if (element.getBoundingClientRect().top + window.scrollY >= vh + 250 || element.getBoundingClientRect().top + window.scrollY <= 100) {
if (element.getBoundingClientRect().top + window.scrollY >= vh + 150 || element.getBoundingClientRect().top + window.scrollY <= 100) {
continue
}
elementText = "";
@@ -189,7 +187,7 @@ function isItemInStock() {
// and these mean its out of stock
for (const outOfStockText of outOfStockTexts) {
if (elementText.includes(outOfStockText)) {
console.log(`Selected 'Out of Stock' - found text "${outOfStockText}" - "${elementText}" - offset top ${element.getBoundingClientRect().top}, page height is ${vh}`)
console.log(`Selected 'Out of Stock' - found text "${outOfStockText}" - "${elementText}"`)
return outOfStockText; // item is out of stock
}
}

View File

@@ -164,15 +164,6 @@ visibleElementsArray.forEach(function (element) {
}
}
let label = "not-interesting" // A placeholder, the actual labels for training are done by hand for now
let text = element.textContent.trim().slice(0, 30).trim();
while (/\n{2,}|\t{2,}/.test(text)) {
text = text.replace(/\n{2,}/g, '\n').replace(/\t{2,}/g, '\t')
}
// Try to identify any possible currency amounts "Sale: 4000" or "Sale now 3000 Kc", can help with the training.
const hasDigitCurrency = (/\d/.test(text.slice(0, 6)) || /\d/.test(text.slice(-6)) ) && /([€£$¥₩₹]|USD|AUD|EUR|Kč|kr|SEK|,)/.test(text) ;
size_pos.push({
xpath: xpath_result,
@@ -180,16 +171,9 @@ visibleElementsArray.forEach(function (element) {
height: Math.round(bbox['height']),
left: Math.floor(bbox['left']),
top: Math.floor(bbox['top']) + scroll_y,
// tagName used by Browser Steps
tagName: (element.tagName) ? element.tagName.toLowerCase() : '',
// tagtype used by Browser Steps
tagtype: (element.tagName.toLowerCase() === 'input' && element.type) ? element.type.toLowerCase() : '',
isClickable: window.getComputedStyle(element).cursor === "pointer",
// Used by the keras trainer
fontSize: window.getComputedStyle(element).getPropertyValue('font-size'),
fontWeight: window.getComputedStyle(element).getPropertyValue('font-weight'),
hasDigitCurrency: hasDigitCurrency,
label: label,
isClickable: window.getComputedStyle(element).cursor == "pointer"
});
});

View File

@@ -1377,19 +1377,17 @@ def changedetection_app(config=None, datastore_o=None):
import brotli
watch = datastore.data['watching'].get(uuid)
if watch and watch.history.keys() and os.path.isdir(watch.watch_data_dir):
latest_filename = list(watch.history.keys())[-1]
if watch and os.path.isdir(watch.watch_data_dir):
latest_filename = list(watch.history.keys())[0]
html_fname = os.path.join(watch.watch_data_dir, f"{latest_filename}.html.br")
with open(html_fname, 'rb') as f:
if html_fname.endswith('.br'):
# Read and decompress the Brotli file
if html_fname.endswith('.br'):
# Read and decompress the Brotli file
with open(html_fname, 'rb') as f:
decompressed_data = brotli.decompress(f.read())
else:
decompressed_data = f.read()
buffer = BytesIO(decompressed_data)
buffer = BytesIO(decompressed_data)
return send_file(buffer, as_attachment=True, download_name=f"{latest_filename}.html", mimetype='text/html')
return send_file(buffer, as_attachment=True, download_name=f"{latest_filename}.html", mimetype='text/html')
# Return a 500 error

View File

@@ -1,12 +1,11 @@
from babel.numbers import parse_decimal
from changedetectionio.model.Watch import model as BaseWatch
from typing import Union
import re
from babel.numbers import parse_decimal
class Restock(dict):
def parse_currency(self, raw_value: str) -> Union[float, None]:
def parse_currency(self, raw_value: str) -> float:
# Clean and standardize the value (ie 1,400.00 should be 1400.00), even better would be store the whole thing as an integer.
standardized_value = raw_value
@@ -22,11 +21,8 @@ class Restock(dict):
# Remove any non-numeric characters except for the decimal point
standardized_value = re.sub(r'[^\d.-]', '', standardized_value)
if standardized_value:
# Convert to float
return float(parse_decimal(standardized_value, locale='en'))
return None
# Convert to float
return float(parse_decimal(standardized_value, locale='en'))
def __init__(self, *args, **kwargs):
# Define default values

View File

@@ -40,16 +40,13 @@ def get_itemprop_availability(html_content) -> Restock:
import extruct
logger.trace(f"Imported extruct module in {time.time() - now:.3f}s")
value = {}
now = time.time()
# Extruct is very slow, I'm wondering if some ML is going to be faster (800ms on my i7), 'rdfa' seems to be the heaviest.
syntaxes = ['dublincore', 'json-ld', 'microdata', 'microformat', 'opengraph']
try:
data = extruct.extract(html_content, syntaxes=syntaxes)
except Exception as e:
logger.warning(f"Unable to extract data, document parsing with extruct failed with {type(e).__name__} - {str(e)}")
return Restock()
syntaxes = ['dublincore', 'json-ld', 'microdata', 'microformat', 'opengraph']
data = extruct.extract(html_content, syntaxes=syntaxes)
logger.trace(f"Extruct basic extract of all metadata done in {time.time() - now:.3f}s")
# First phase, dead simple scanning of anything that looks useful

View File

@@ -69,12 +69,6 @@ def test_check_basic_change_detection_functionality(client, live_server, measure
wait_for_all_checks(client)
uuid = extract_UUID_from_client(client)
# Check the 'get latest snapshot works'
res = client.get(url_for("watch_get_latest_html", uuid=uuid))
assert b'which has this one new line' in res.data
# Now something should be ready, indicated by having a 'unviewed' class
res = client.get(url_for("index"))
assert b'unviewed' in res.data
@@ -92,7 +86,7 @@ def test_check_basic_change_detection_functionality(client, live_server, measure
assert expected_url.encode('utf-8') in res.data
# Following the 'diff' link, it should no longer display as 'unviewed' even after we recheck it a few times
res = client.get(url_for("diff_history_page", uuid=uuid))
res = client.get(url_for("diff_history_page", uuid="first"))
assert b'selected=""' in res.data, "Confirm diff history page loaded"
# Check the [preview] pulls the right one
@@ -149,12 +143,18 @@ def test_check_basic_change_detection_functionality(client, live_server, measure
assert b'unviewed' not in res.data
# #2458 "clear history" should make the Watch object update its status correctly when the first snapshot lands again
uuid = extract_UUID_from_client(client)
client.get(url_for("clear_watch_history", uuid=uuid))
client.get(url_for("form_watch_checknow"), follow_redirects=True)
wait_for_all_checks(client)
res = client.get(url_for("index"))
assert b'preview/' in res.data
# Check the 'get latest snapshot works'
res = client.get(url_for("watch_get_latest_html", uuid=uuid))
assert b'<head><title>head title</title></head>' in res.data
#
# Cleanup everything
res = client.get(url_for("form_delete", uuid="all"), follow_redirects=True)

View File

@@ -18,7 +18,7 @@ services:
#
# Log levels are in descending order. (TRACE is the most detailed one)
# Log output levels: TRACE, DEBUG(default), INFO, SUCCESS, WARNING, ERROR, CRITICAL
# - LOGGER_LEVEL=TRACE
# - LOGGER_LEVEL=DEBUG
#
# Alternative WebDriver/selenium URL, do not use "'s or 's!
# - WEBDRIVER_URL=http://browser-chrome:4444/wd/hub
@@ -29,9 +29,8 @@ services:
#
# https://selenium-python.readthedocs.io/api.html#module-selenium.webdriver.common.proxy
#
# Alternative target "Chrome" Playwright URL, do not use "'s or 's!
# "Playwright" is a driver/librarythat allows changedetection to talk to a Chrome or similar browser.
# - PLAYWRIGHT_DRIVER_URL=ws://sockpuppetbrowser:3000
# Alternative Playwright URL, do not use "'s or 's!
# - PLAYWRIGHT_DRIVER_URL=ws://playwright-chrome:3000
#
# Playwright proxy settings playwright_proxy_server, playwright_proxy_bypass, playwright_proxy_username, playwright_proxy_password
#
@@ -74,10 +73,10 @@ services:
# condition: service_started
# Sockpuppetbrowser is basically chrome wrapped in an API for allowing fast fetching of web-pages.
# Used for fetching pages via Playwright+Chrome where you need Javascript support.
# RECOMMENDED FOR FETCHING PAGES WITH CHROME
# sockpuppetbrowser:
# hostname: sockpuppetbrowser
# playwright-chrome:
# hostname: playwright-chrome
# image: dgtlmoon/sockpuppetbrowser:latest
# cap_add:
# - SYS_ADMIN

View File

@@ -79,9 +79,8 @@ pyppeteerstealth>=0.0.4
pytest ~=7.2
pytest-flask ~=1.2
# Anything 4.0 and up but not 5.0
jsonschema ~= 4.0
# Pin jsonschema version to prevent build errors on armv6 while rpds-py wheels aren't available (1708)
jsonschema==4.17.3
loguru