mirror of
https://github.com/dgtlmoon/changedetection.io.git
synced 2025-11-23 09:56:17 +00:00
Compare commits
1 Commits
fixing-res
...
2568-fix-e
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
74f3c12d2e |
@@ -2,7 +2,7 @@
|
|||||||
|
|
||||||
# Read more https://github.com/dgtlmoon/changedetection.io/wiki
|
# Read more https://github.com/dgtlmoon/changedetection.io/wiki
|
||||||
|
|
||||||
__version__ = '0.46.03'
|
__version__ = '0.46.02'
|
||||||
|
|
||||||
from changedetectionio.strtobool import strtobool
|
from changedetectionio.strtobool import strtobool
|
||||||
from json.decoder import JSONDecodeError
|
from json.decoder import JSONDecodeError
|
||||||
|
|||||||
@@ -85,8 +85,7 @@ def construct_blueprint(datastore: ChangeDetectionStore):
|
|||||||
browsersteps_start_session['browserstepper'] = browser_steps.browsersteps_live_ui(
|
browsersteps_start_session['browserstepper'] = browser_steps.browsersteps_live_ui(
|
||||||
playwright_browser=browsersteps_start_session['browser'],
|
playwright_browser=browsersteps_start_session['browser'],
|
||||||
proxy=proxy,
|
proxy=proxy,
|
||||||
start_url=datastore.data['watching'][watch_uuid].get('url'),
|
start_url=datastore.data['watching'][watch_uuid].get('url')
|
||||||
headers=datastore.data['watching'][watch_uuid].get('headers')
|
|
||||||
)
|
)
|
||||||
|
|
||||||
# For test
|
# For test
|
||||||
|
|||||||
@@ -75,7 +75,6 @@ function isItemInStock() {
|
|||||||
'vergriffen',
|
'vergriffen',
|
||||||
'vorbestellen',
|
'vorbestellen',
|
||||||
'vorbestellung ist bald möglich',
|
'vorbestellung ist bald möglich',
|
||||||
'we don\'t currently have any',
|
|
||||||
'we couldn\'t find any products that match',
|
'we couldn\'t find any products that match',
|
||||||
'we do not currently have an estimate of when this product will be back in stock.',
|
'we do not currently have an estimate of when this product will be back in stock.',
|
||||||
'we don\'t know when or if this item will be back in stock.',
|
'we don\'t know when or if this item will be back in stock.',
|
||||||
@@ -174,8 +173,7 @@ function isItemInStock() {
|
|||||||
const element = elementsToScan[i];
|
const element = elementsToScan[i];
|
||||||
// outside the 'fold' or some weird text in the heading area
|
// outside the 'fold' or some weird text in the heading area
|
||||||
// .getBoundingClientRect() was causing a crash in chrome 119, can only be run on contentVisibility != hidden
|
// .getBoundingClientRect() was causing a crash in chrome 119, can only be run on contentVisibility != hidden
|
||||||
// Note: there's also an automated test that places the 'out of stock' text fairly low down
|
if (element.getBoundingClientRect().top + window.scrollY >= vh + 150 || element.getBoundingClientRect().top + window.scrollY <= 100) {
|
||||||
if (element.getBoundingClientRect().top + window.scrollY >= vh + 250 || element.getBoundingClientRect().top + window.scrollY <= 100) {
|
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
elementText = "";
|
elementText = "";
|
||||||
@@ -189,7 +187,7 @@ function isItemInStock() {
|
|||||||
// and these mean its out of stock
|
// and these mean its out of stock
|
||||||
for (const outOfStockText of outOfStockTexts) {
|
for (const outOfStockText of outOfStockTexts) {
|
||||||
if (elementText.includes(outOfStockText)) {
|
if (elementText.includes(outOfStockText)) {
|
||||||
console.log(`Selected 'Out of Stock' - found text "${outOfStockText}" - "${elementText}" - offset top ${element.getBoundingClientRect().top}, page height is ${vh}`)
|
console.log(`Selected 'Out of Stock' - found text "${outOfStockText}" - "${elementText}"`)
|
||||||
return outOfStockText; // item is out of stock
|
return outOfStockText; // item is out of stock
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -164,15 +164,6 @@ visibleElementsArray.forEach(function (element) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
let label = "not-interesting" // A placeholder, the actual labels for training are done by hand for now
|
|
||||||
|
|
||||||
let text = element.textContent.trim().slice(0, 30).trim();
|
|
||||||
while (/\n{2,}|\t{2,}/.test(text)) {
|
|
||||||
text = text.replace(/\n{2,}/g, '\n').replace(/\t{2,}/g, '\t')
|
|
||||||
}
|
|
||||||
|
|
||||||
// Try to identify any possible currency amounts "Sale: 4000" or "Sale now 3000 Kc", can help with the training.
|
|
||||||
const hasDigitCurrency = (/\d/.test(text.slice(0, 6)) || /\d/.test(text.slice(-6)) ) && /([€£$¥₩₹]|USD|AUD|EUR|Kč|kr|SEK|,–)/.test(text) ;
|
|
||||||
|
|
||||||
size_pos.push({
|
size_pos.push({
|
||||||
xpath: xpath_result,
|
xpath: xpath_result,
|
||||||
@@ -180,16 +171,9 @@ visibleElementsArray.forEach(function (element) {
|
|||||||
height: Math.round(bbox['height']),
|
height: Math.round(bbox['height']),
|
||||||
left: Math.floor(bbox['left']),
|
left: Math.floor(bbox['left']),
|
||||||
top: Math.floor(bbox['top']) + scroll_y,
|
top: Math.floor(bbox['top']) + scroll_y,
|
||||||
// tagName used by Browser Steps
|
|
||||||
tagName: (element.tagName) ? element.tagName.toLowerCase() : '',
|
tagName: (element.tagName) ? element.tagName.toLowerCase() : '',
|
||||||
// tagtype used by Browser Steps
|
|
||||||
tagtype: (element.tagName.toLowerCase() === 'input' && element.type) ? element.type.toLowerCase() : '',
|
tagtype: (element.tagName.toLowerCase() === 'input' && element.type) ? element.type.toLowerCase() : '',
|
||||||
isClickable: window.getComputedStyle(element).cursor === "pointer",
|
isClickable: window.getComputedStyle(element).cursor == "pointer"
|
||||||
// Used by the keras trainer
|
|
||||||
fontSize: window.getComputedStyle(element).getPropertyValue('font-size'),
|
|
||||||
fontWeight: window.getComputedStyle(element).getPropertyValue('font-weight'),
|
|
||||||
hasDigitCurrency: hasDigitCurrency,
|
|
||||||
label: label,
|
|
||||||
});
|
});
|
||||||
|
|
||||||
});
|
});
|
||||||
|
|||||||
@@ -1377,19 +1377,17 @@ def changedetection_app(config=None, datastore_o=None):
|
|||||||
import brotli
|
import brotli
|
||||||
|
|
||||||
watch = datastore.data['watching'].get(uuid)
|
watch = datastore.data['watching'].get(uuid)
|
||||||
if watch and watch.history.keys() and os.path.isdir(watch.watch_data_dir):
|
if watch and os.path.isdir(watch.watch_data_dir):
|
||||||
latest_filename = list(watch.history.keys())[-1]
|
latest_filename = list(watch.history.keys())[0]
|
||||||
html_fname = os.path.join(watch.watch_data_dir, f"{latest_filename}.html.br")
|
html_fname = os.path.join(watch.watch_data_dir, f"{latest_filename}.html.br")
|
||||||
with open(html_fname, 'rb') as f:
|
if html_fname.endswith('.br'):
|
||||||
if html_fname.endswith('.br'):
|
# Read and decompress the Brotli file
|
||||||
# Read and decompress the Brotli file
|
with open(html_fname, 'rb') as f:
|
||||||
decompressed_data = brotli.decompress(f.read())
|
decompressed_data = brotli.decompress(f.read())
|
||||||
else:
|
|
||||||
decompressed_data = f.read()
|
|
||||||
|
|
||||||
buffer = BytesIO(decompressed_data)
|
buffer = BytesIO(decompressed_data)
|
||||||
|
|
||||||
return send_file(buffer, as_attachment=True, download_name=f"{latest_filename}.html", mimetype='text/html')
|
return send_file(buffer, as_attachment=True, download_name=f"{latest_filename}.html", mimetype='text/html')
|
||||||
|
|
||||||
|
|
||||||
# Return a 500 error
|
# Return a 500 error
|
||||||
|
|||||||
@@ -1,12 +1,11 @@
|
|||||||
|
|
||||||
from babel.numbers import parse_decimal
|
|
||||||
from changedetectionio.model.Watch import model as BaseWatch
|
from changedetectionio.model.Watch import model as BaseWatch
|
||||||
from typing import Union
|
|
||||||
import re
|
import re
|
||||||
|
from babel.numbers import parse_decimal
|
||||||
|
|
||||||
class Restock(dict):
|
class Restock(dict):
|
||||||
|
|
||||||
def parse_currency(self, raw_value: str) -> Union[float, None]:
|
def parse_currency(self, raw_value: str) -> float:
|
||||||
# Clean and standardize the value (ie 1,400.00 should be 1400.00), even better would be store the whole thing as an integer.
|
# Clean and standardize the value (ie 1,400.00 should be 1400.00), even better would be store the whole thing as an integer.
|
||||||
standardized_value = raw_value
|
standardized_value = raw_value
|
||||||
|
|
||||||
@@ -22,11 +21,8 @@ class Restock(dict):
|
|||||||
# Remove any non-numeric characters except for the decimal point
|
# Remove any non-numeric characters except for the decimal point
|
||||||
standardized_value = re.sub(r'[^\d.-]', '', standardized_value)
|
standardized_value = re.sub(r'[^\d.-]', '', standardized_value)
|
||||||
|
|
||||||
if standardized_value:
|
# Convert to float
|
||||||
# Convert to float
|
return float(parse_decimal(standardized_value, locale='en'))
|
||||||
return float(parse_decimal(standardized_value, locale='en'))
|
|
||||||
|
|
||||||
return None
|
|
||||||
|
|
||||||
def __init__(self, *args, **kwargs):
|
def __init__(self, *args, **kwargs):
|
||||||
# Define default values
|
# Define default values
|
||||||
|
|||||||
@@ -40,16 +40,13 @@ def get_itemprop_availability(html_content) -> Restock:
|
|||||||
import extruct
|
import extruct
|
||||||
logger.trace(f"Imported extruct module in {time.time() - now:.3f}s")
|
logger.trace(f"Imported extruct module in {time.time() - now:.3f}s")
|
||||||
|
|
||||||
|
value = {}
|
||||||
now = time.time()
|
now = time.time()
|
||||||
|
|
||||||
# Extruct is very slow, I'm wondering if some ML is going to be faster (800ms on my i7), 'rdfa' seems to be the heaviest.
|
# Extruct is very slow, I'm wondering if some ML is going to be faster (800ms on my i7), 'rdfa' seems to be the heaviest.
|
||||||
syntaxes = ['dublincore', 'json-ld', 'microdata', 'microformat', 'opengraph']
|
|
||||||
try:
|
|
||||||
data = extruct.extract(html_content, syntaxes=syntaxes)
|
|
||||||
except Exception as e:
|
|
||||||
logger.warning(f"Unable to extract data, document parsing with extruct failed with {type(e).__name__} - {str(e)}")
|
|
||||||
return Restock()
|
|
||||||
|
|
||||||
|
syntaxes = ['dublincore', 'json-ld', 'microdata', 'microformat', 'opengraph']
|
||||||
|
|
||||||
|
data = extruct.extract(html_content, syntaxes=syntaxes)
|
||||||
logger.trace(f"Extruct basic extract of all metadata done in {time.time() - now:.3f}s")
|
logger.trace(f"Extruct basic extract of all metadata done in {time.time() - now:.3f}s")
|
||||||
|
|
||||||
# First phase, dead simple scanning of anything that looks useful
|
# First phase, dead simple scanning of anything that looks useful
|
||||||
|
|||||||
@@ -69,12 +69,6 @@ def test_check_basic_change_detection_functionality(client, live_server, measure
|
|||||||
|
|
||||||
wait_for_all_checks(client)
|
wait_for_all_checks(client)
|
||||||
|
|
||||||
uuid = extract_UUID_from_client(client)
|
|
||||||
|
|
||||||
# Check the 'get latest snapshot works'
|
|
||||||
res = client.get(url_for("watch_get_latest_html", uuid=uuid))
|
|
||||||
assert b'which has this one new line' in res.data
|
|
||||||
|
|
||||||
# Now something should be ready, indicated by having a 'unviewed' class
|
# Now something should be ready, indicated by having a 'unviewed' class
|
||||||
res = client.get(url_for("index"))
|
res = client.get(url_for("index"))
|
||||||
assert b'unviewed' in res.data
|
assert b'unviewed' in res.data
|
||||||
@@ -92,7 +86,7 @@ def test_check_basic_change_detection_functionality(client, live_server, measure
|
|||||||
assert expected_url.encode('utf-8') in res.data
|
assert expected_url.encode('utf-8') in res.data
|
||||||
|
|
||||||
# Following the 'diff' link, it should no longer display as 'unviewed' even after we recheck it a few times
|
# Following the 'diff' link, it should no longer display as 'unviewed' even after we recheck it a few times
|
||||||
res = client.get(url_for("diff_history_page", uuid=uuid))
|
res = client.get(url_for("diff_history_page", uuid="first"))
|
||||||
assert b'selected=""' in res.data, "Confirm diff history page loaded"
|
assert b'selected=""' in res.data, "Confirm diff history page loaded"
|
||||||
|
|
||||||
# Check the [preview] pulls the right one
|
# Check the [preview] pulls the right one
|
||||||
@@ -149,12 +143,18 @@ def test_check_basic_change_detection_functionality(client, live_server, measure
|
|||||||
assert b'unviewed' not in res.data
|
assert b'unviewed' not in res.data
|
||||||
|
|
||||||
# #2458 "clear history" should make the Watch object update its status correctly when the first snapshot lands again
|
# #2458 "clear history" should make the Watch object update its status correctly when the first snapshot lands again
|
||||||
|
uuid = extract_UUID_from_client(client)
|
||||||
client.get(url_for("clear_watch_history", uuid=uuid))
|
client.get(url_for("clear_watch_history", uuid=uuid))
|
||||||
client.get(url_for("form_watch_checknow"), follow_redirects=True)
|
client.get(url_for("form_watch_checknow"), follow_redirects=True)
|
||||||
wait_for_all_checks(client)
|
wait_for_all_checks(client)
|
||||||
res = client.get(url_for("index"))
|
res = client.get(url_for("index"))
|
||||||
assert b'preview/' in res.data
|
assert b'preview/' in res.data
|
||||||
|
|
||||||
|
|
||||||
|
# Check the 'get latest snapshot works'
|
||||||
|
res = client.get(url_for("watch_get_latest_html", uuid=uuid))
|
||||||
|
assert b'<head><title>head title</title></head>' in res.data
|
||||||
|
|
||||||
#
|
#
|
||||||
# Cleanup everything
|
# Cleanup everything
|
||||||
res = client.get(url_for("form_delete", uuid="all"), follow_redirects=True)
|
res = client.get(url_for("form_delete", uuid="all"), follow_redirects=True)
|
||||||
|
|||||||
@@ -18,7 +18,7 @@ services:
|
|||||||
#
|
#
|
||||||
# Log levels are in descending order. (TRACE is the most detailed one)
|
# Log levels are in descending order. (TRACE is the most detailed one)
|
||||||
# Log output levels: TRACE, DEBUG(default), INFO, SUCCESS, WARNING, ERROR, CRITICAL
|
# Log output levels: TRACE, DEBUG(default), INFO, SUCCESS, WARNING, ERROR, CRITICAL
|
||||||
# - LOGGER_LEVEL=TRACE
|
# - LOGGER_LEVEL=DEBUG
|
||||||
#
|
#
|
||||||
# Alternative WebDriver/selenium URL, do not use "'s or 's!
|
# Alternative WebDriver/selenium URL, do not use "'s or 's!
|
||||||
# - WEBDRIVER_URL=http://browser-chrome:4444/wd/hub
|
# - WEBDRIVER_URL=http://browser-chrome:4444/wd/hub
|
||||||
@@ -29,9 +29,8 @@ services:
|
|||||||
#
|
#
|
||||||
# https://selenium-python.readthedocs.io/api.html#module-selenium.webdriver.common.proxy
|
# https://selenium-python.readthedocs.io/api.html#module-selenium.webdriver.common.proxy
|
||||||
#
|
#
|
||||||
# Alternative target "Chrome" Playwright URL, do not use "'s or 's!
|
# Alternative Playwright URL, do not use "'s or 's!
|
||||||
# "Playwright" is a driver/library that allows changedetection to talk to a Chrome or similar browser.
|
# - PLAYWRIGHT_DRIVER_URL=ws://playwright-chrome:3000
|
||||||
# - PLAYWRIGHT_DRIVER_URL=ws://sockpuppetbrowser:3000
|
|
||||||
#
|
#
|
||||||
# Playwright proxy settings playwright_proxy_server, playwright_proxy_bypass, playwright_proxy_username, playwright_proxy_password
|
# Playwright proxy settings playwright_proxy_server, playwright_proxy_bypass, playwright_proxy_username, playwright_proxy_password
|
||||||
#
|
#
|
||||||
@@ -74,10 +73,10 @@ services:
|
|||||||
# condition: service_started
|
# condition: service_started
|
||||||
|
|
||||||
|
|
||||||
# Sockpuppetbrowser is basically chrome wrapped in an API for allowing fast fetching of web-pages.
|
# Used for fetching pages via Playwright+Chrome where you need Javascript support.
|
||||||
# RECOMMENDED FOR FETCHING PAGES WITH CHROME
|
# RECOMMENDED FOR FETCHING PAGES WITH CHROME
|
||||||
# sockpuppetbrowser:
|
# playwright-chrome:
|
||||||
# hostname: sockpuppetbrowser
|
# hostname: playwright-chrome
|
||||||
# image: dgtlmoon/sockpuppetbrowser:latest
|
# image: dgtlmoon/sockpuppetbrowser:latest
|
||||||
# cap_add:
|
# cap_add:
|
||||||
# - SYS_ADMIN
|
# - SYS_ADMIN
|
||||||
|
|||||||
@@ -79,9 +79,8 @@ pyppeteerstealth>=0.0.4
|
|||||||
pytest ~=7.2
|
pytest ~=7.2
|
||||||
pytest-flask ~=1.2
|
pytest-flask ~=1.2
|
||||||
|
|
||||||
# Anything 4.0 and up but not 5.0
|
# Pin jsonschema version to prevent build errors on armv6 while rpds-py wheels aren't available (1708)
|
||||||
jsonschema ~= 4.0
|
jsonschema==4.17.3
|
||||||
|
|
||||||
|
|
||||||
loguru
|
loguru
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user