mirror of
https://github.com/dgtlmoon/changedetection.io.git
synced 2025-11-20 00:16:10 +00:00
Compare commits
1 Commits
price-scra
...
2568-fix-e
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
74f3c12d2e |
@@ -52,10 +52,6 @@ jobs:
|
|||||||
docker run --network changedet-network -d -e "LOG_LEVEL=TRACE" --cap-add=SYS_ADMIN --name sockpuppetbrowser --hostname sockpuppetbrowser --rm -p 3000:3000 dgtlmoon/sockpuppetbrowser:latest
|
docker run --network changedet-network -d -e "LOG_LEVEL=TRACE" --cap-add=SYS_ADMIN --name sockpuppetbrowser --hostname sockpuppetbrowser --rm -p 3000:3000 dgtlmoon/sockpuppetbrowser:latest
|
||||||
docker run --network changedet-network -d -e "LOG_LEVEL=TRACE" --cap-add=SYS_ADMIN --name sockpuppetbrowser-custom-url --hostname sockpuppetbrowser-custom-url -p 3001:3000 --rm dgtlmoon/sockpuppetbrowser:latest
|
docker run --network changedet-network -d -e "LOG_LEVEL=TRACE" --cap-add=SYS_ADMIN --name sockpuppetbrowser-custom-url --hostname sockpuppetbrowser-custom-url -p 3001:3000 --rm dgtlmoon/sockpuppetbrowser:latest
|
||||||
|
|
||||||
# CDIO AI Element scraper for prices
|
|
||||||
# Run CDIO with PRICE_SCRAPER_ML_ENDPOINT=http://cdio-ai-price-element:5005/price-element
|
|
||||||
docker run --network changedet-network -d -e "LOG_LEVEL=TRACE" --name cdio-ai-price-element --hostname cdio-ai-price-element -p 5005:5005 --rm dgtlmoon/changedetection.io-ai:latest
|
|
||||||
|
|
||||||
- name: Spin up ancillary SMTP+Echo message test server
|
- name: Spin up ancillary SMTP+Echo message test server
|
||||||
run: |
|
run: |
|
||||||
# Debug SMTP server/echo message back server
|
# Debug SMTP server/echo message back server
|
||||||
@@ -99,11 +95,6 @@ jobs:
|
|||||||
# Settings headers playwright tests - Call back in from Sockpuppetbrowser, check headers
|
# Settings headers playwright tests - Call back in from Sockpuppetbrowser, check headers
|
||||||
docker run --name "changedet" --hostname changedet --rm -e "FLASK_SERVER_NAME=changedet" -e "PLAYWRIGHT_DRIVER_URL=ws://sockpuppetbrowser:3000?dumpio=true" --network changedet-network test-changedetectionio bash -c 'find .; cd changedetectionio; pytest --live-server-host=0.0.0.0 --live-server-port=5004 tests/test_request.py; pwd;find .'
|
docker run --name "changedet" --hostname changedet --rm -e "FLASK_SERVER_NAME=changedet" -e "PLAYWRIGHT_DRIVER_URL=ws://sockpuppetbrowser:3000?dumpio=true" --network changedet-network test-changedetectionio bash -c 'find .; cd changedetectionio; pytest --live-server-host=0.0.0.0 --live-server-port=5004 tests/test_request.py; pwd;find .'
|
||||||
|
|
||||||
# PLAYWRIGHT/NODE-> CDP
|
|
||||||
- name: ML/AI Price element scraper via Playwright+dgtlmoon/changedetection.io-ai
|
|
||||||
run: |
|
|
||||||
docker run --rm -e "FLASK_SERVER_NAME=cdio" -e "PLAYWRIGHT_DRIVER_URL=ws://sockpuppetbrowser:3000" -e "PRICE_SCRAPER_ML_ENDPOINT=ws://cdio-ai-price-element:5005/price-element" --network changedet-network --hostname=cdio test-changedetectionio bash -c 'cd changedetectionio;pytest --live-server-host=0.0.0.0 --live-server-port=5004 tests/ml_price_scraper/test_scrape_price_element.py'
|
|
||||||
|
|
||||||
- name: Playwright and SocketPuppetBrowser - Restock detection
|
- name: Playwright and SocketPuppetBrowser - Restock detection
|
||||||
run: |
|
run: |
|
||||||
# restock detection via playwright - added name=changedet here so that playwright and sockpuppetbrowser can connect to it
|
# restock detection via playwright - added name=changedet here so that playwright and sockpuppetbrowser can connect to it
|
||||||
|
|||||||
@@ -2,7 +2,7 @@
|
|||||||
|
|
||||||
# Read more https://github.com/dgtlmoon/changedetection.io/wiki
|
# Read more https://github.com/dgtlmoon/changedetection.io/wiki
|
||||||
|
|
||||||
__version__ = '0.46.04'
|
__version__ = '0.46.02'
|
||||||
|
|
||||||
from changedetectionio.strtobool import strtobool
|
from changedetectionio.strtobool import strtobool
|
||||||
from json.decoder import JSONDecodeError
|
from json.decoder import JSONDecodeError
|
||||||
|
|||||||
@@ -85,8 +85,7 @@ def construct_blueprint(datastore: ChangeDetectionStore):
|
|||||||
browsersteps_start_session['browserstepper'] = browser_steps.browsersteps_live_ui(
|
browsersteps_start_session['browserstepper'] = browser_steps.browsersteps_live_ui(
|
||||||
playwright_browser=browsersteps_start_session['browser'],
|
playwright_browser=browsersteps_start_session['browser'],
|
||||||
proxy=proxy,
|
proxy=proxy,
|
||||||
start_url=datastore.data['watching'][watch_uuid].get('url'),
|
start_url=datastore.data['watching'][watch_uuid].get('url')
|
||||||
headers=datastore.data['watching'][watch_uuid].get('headers')
|
|
||||||
)
|
)
|
||||||
|
|
||||||
# For test
|
# For test
|
||||||
|
|||||||
@@ -4,7 +4,7 @@ from loguru import logger
|
|||||||
from changedetectionio.content_fetchers.exceptions import BrowserStepsStepException
|
from changedetectionio.content_fetchers.exceptions import BrowserStepsStepException
|
||||||
import os
|
import os
|
||||||
|
|
||||||
visualselector_xpath_selectors = 'div,span,form,table,tbody,tr,td,a,p,ul,li,h1,h2,h3,h4,header,footer,section,article,aside,details,main,nav,section,summary,bdi,strong'
|
visualselector_xpath_selectors = 'div,span,form,table,tbody,tr,td,a,p,ul,li,h1,h2,h3,h4,header,footer,section,article,aside,details,main,nav,section,summary'
|
||||||
|
|
||||||
# available_fetchers() will scan this implementation looking for anything starting with html_
|
# available_fetchers() will scan this implementation looking for anything starting with html_
|
||||||
# this information is used in the form selections
|
# this information is used in the form selections
|
||||||
|
|||||||
@@ -75,7 +75,6 @@ function isItemInStock() {
|
|||||||
'vergriffen',
|
'vergriffen',
|
||||||
'vorbestellen',
|
'vorbestellen',
|
||||||
'vorbestellung ist bald möglich',
|
'vorbestellung ist bald möglich',
|
||||||
'we don\'t currently have any',
|
|
||||||
'we couldn\'t find any products that match',
|
'we couldn\'t find any products that match',
|
||||||
'we do not currently have an estimate of when this product will be back in stock.',
|
'we do not currently have an estimate of when this product will be back in stock.',
|
||||||
'we don\'t know when or if this item will be back in stock.',
|
'we don\'t know when or if this item will be back in stock.',
|
||||||
@@ -174,8 +173,7 @@ function isItemInStock() {
|
|||||||
const element = elementsToScan[i];
|
const element = elementsToScan[i];
|
||||||
// outside the 'fold' or some weird text in the heading area
|
// outside the 'fold' or some weird text in the heading area
|
||||||
// .getBoundingClientRect() was causing a crash in chrome 119, can only be run on contentVisibility != hidden
|
// .getBoundingClientRect() was causing a crash in chrome 119, can only be run on contentVisibility != hidden
|
||||||
// Note: theres also an automated test that places the 'out of stock' text fairly low down
|
if (element.getBoundingClientRect().top + window.scrollY >= vh + 150 || element.getBoundingClientRect().top + window.scrollY <= 100) {
|
||||||
if (element.getBoundingClientRect().top + window.scrollY >= vh + 250 || element.getBoundingClientRect().top + window.scrollY <= 100) {
|
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
elementText = "";
|
elementText = "";
|
||||||
@@ -189,7 +187,7 @@ function isItemInStock() {
|
|||||||
// and these mean its out of stock
|
// and these mean its out of stock
|
||||||
for (const outOfStockText of outOfStockTexts) {
|
for (const outOfStockText of outOfStockTexts) {
|
||||||
if (elementText.includes(outOfStockText)) {
|
if (elementText.includes(outOfStockText)) {
|
||||||
console.log(`Selected 'Out of Stock' - found text "${outOfStockText}" - "${elementText}" - offset top ${element.getBoundingClientRect().top}, page height is ${vh}`)
|
console.log(`Selected 'Out of Stock' - found text "${outOfStockText}" - "${elementText}"`)
|
||||||
return outOfStockText; // item is out of stock
|
return outOfStockText; // item is out of stock
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -15,7 +15,6 @@ try {
|
|||||||
console.log(e);
|
console.log(e);
|
||||||
}
|
}
|
||||||
|
|
||||||
const percentageNumerical = str => Math.round((str.match(/\d/g) || []).length / str.length * 100);
|
|
||||||
|
|
||||||
// Include the getXpath script directly, easier than fetching
|
// Include the getXpath script directly, easier than fetching
|
||||||
function getxpath(e) {
|
function getxpath(e) {
|
||||||
@@ -78,30 +77,6 @@ const findUpTag = (el) => {
|
|||||||
}
|
}
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
// Text width scraper for ML training/detection
|
|
||||||
// Create a single canvas and get its 2D context
|
|
||||||
const canvas = document.createElement("canvas");
|
|
||||||
const context = canvas.getContext("2d");
|
|
||||||
|
|
||||||
// Function to get the width and height of the text inside an element and round them to the nearest integer
|
|
||||||
function getTextWidthAndHeightinPx(element) {
|
|
||||||
// Set the font to match the style of the text in the element
|
|
||||||
context.font = window.getComputedStyle(element).font;
|
|
||||||
|
|
||||||
// Get the text inside the element
|
|
||||||
const text = element.textContent || element.innerText;
|
|
||||||
|
|
||||||
// Measure the text width
|
|
||||||
const metrics = context.measureText(text);
|
|
||||||
const width = Math.round(metrics.width);
|
|
||||||
|
|
||||||
// Get the font size from the computed style
|
|
||||||
const fontSize = parseFloat(window.getComputedStyle(element).fontSize);
|
|
||||||
const height = Math.round(fontSize); // Using font size as an approximation of height
|
|
||||||
|
|
||||||
// Return both width and height as an object
|
|
||||||
return { textWidth: width, textHeight: height };
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
// @todo - if it's SVG or IMG, go into image diff mode
|
// @todo - if it's SVG or IMG, go into image diff mode
|
||||||
@@ -147,10 +122,8 @@ const visibleElementsArray = [];
|
|||||||
// Call collectVisibleElements with the starting parent element
|
// Call collectVisibleElements with the starting parent element
|
||||||
collectVisibleElements(document.body, visibleElementsArray);
|
collectVisibleElements(document.body, visibleElementsArray);
|
||||||
|
|
||||||
// Append any custom selectors to the visibleElementsArray
|
|
||||||
|
|
||||||
|
visibleElementsArray.forEach(function (element) {
|
||||||
function get_element_metadata(element) {
|
|
||||||
|
|
||||||
bbox = element.getBoundingClientRect();
|
bbox = element.getBoundingClientRect();
|
||||||
|
|
||||||
@@ -191,68 +164,18 @@ function get_element_metadata(element) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
let label = "none" // A placeholder, the actual labels for training are done by hand for now
|
|
||||||
|
|
||||||
// Check if the element was found and get its text , not including any child element
|
size_pos.push({
|
||||||
let text = Array.from(element.childNodes)
|
|
||||||
.filter(node => node.nodeType === Node.TEXT_NODE)
|
|
||||||
.map(node => node.textContent)
|
|
||||||
.join('');
|
|
||||||
|
|
||||||
// Remove any gaps in sequences of newlines and tabs inside the string
|
|
||||||
text = text.trim().replace(/[\s\t\n\r]{2,}/g, ' ').trim();
|
|
||||||
|
|
||||||
// Try to identify any possible currency amounts "Sale: 4000" or "Sale now 3000 Kc", can help with the training.
|
|
||||||
// @todo could be instead of USD/AUD etc [A-Z]{2,3} ?
|
|
||||||
//const hasDigitCurrency = (/\d/.test(text.slice(0, 6)) || /\d/.test(text.slice(-6)) ) && /([€£$¥₩₹]|USD|AUD|EUR|Kč|kr|SEK|RM|,–)/.test(text) ;
|
|
||||||
const hasDigitCurrency = (/\d/.test(text.slice(0, 6)) || /\d/.test(text.slice(-6)) ) && /([€£$¥₩₹]|USD|AUD|EUR|Kč|kr|SEK|RM|,–)/.test(text) ;
|
|
||||||
const hasDigit = /[0-9]/.test(text) ;
|
|
||||||
|
|
||||||
// Sizing of the actual text inside the element can be very different from the elements size
|
|
||||||
const { textWidth, textHeight } = getTextWidthAndHeightinPx(element);
|
|
||||||
|
|
||||||
const computedStyle = window.getComputedStyle(element);
|
|
||||||
let red, green, blue;
|
|
||||||
|
|
||||||
if (text.length) {
|
|
||||||
// Extract the RGB values from the color string (format: rgb(r, g, b))
|
|
||||||
[red, green, blue] = computedStyle.color.match(/\d+/g).map(Number);
|
|
||||||
} else {
|
|
||||||
// Assign default values if text is empty
|
|
||||||
[red, green, blue] = [0, 0, 0];
|
|
||||||
}
|
|
||||||
return {
|
|
||||||
xpath: xpath_result,
|
xpath: xpath_result,
|
||||||
width: Math.round(bbox['width']),
|
width: Math.round(bbox['width']),
|
||||||
height: Math.round(bbox['height']),
|
height: Math.round(bbox['height']),
|
||||||
left: Math.floor(bbox['left']),
|
left: Math.floor(bbox['left']),
|
||||||
top: Math.floor(bbox['top']) + scroll_y,
|
top: Math.floor(bbox['top']) + scroll_y,
|
||||||
// tagName used by Browser Steps
|
|
||||||
tagName: (element.tagName) ? element.tagName.toLowerCase() : '',
|
tagName: (element.tagName) ? element.tagName.toLowerCase() : '',
|
||||||
// tagtype used by Browser Steps
|
|
||||||
tagtype: (element.tagName.toLowerCase() === 'input' && element.type) ? element.type.toLowerCase() : '',
|
tagtype: (element.tagName.toLowerCase() === 'input' && element.type) ? element.type.toLowerCase() : '',
|
||||||
isClickable: window.getComputedStyle(element).cursor === "pointer",
|
isClickable: window.getComputedStyle(element).cursor == "pointer"
|
||||||
// Used by the keras/pytorch trainer
|
});
|
||||||
fontSize: window.getComputedStyle(element).getPropertyValue('font-size'),
|
|
||||||
fontWeight: window.getComputedStyle(element).getPropertyValue('font-weight'),
|
|
||||||
pcNumerical: text.length && percentageNumerical(text),
|
|
||||||
hasDigit: hasDigit,
|
|
||||||
hasDigitCurrency: hasDigitCurrency,
|
|
||||||
textWidth: textWidth,
|
|
||||||
textHeight: textHeight,
|
|
||||||
textLength: text.length,
|
|
||||||
t_r: red,
|
|
||||||
t_g: green,
|
|
||||||
t_b: blue,
|
|
||||||
label: label,
|
|
||||||
};
|
|
||||||
}
|
|
||||||
|
|
||||||
visibleElementsArray.forEach(function (element) {
|
|
||||||
let metadata = get_element_metadata(element);
|
|
||||||
if(metadata) {
|
|
||||||
size_pos.push(metadata);
|
|
||||||
}
|
|
||||||
});
|
});
|
||||||
|
|
||||||
|
|
||||||
@@ -261,19 +184,7 @@ visibleElementsArray.forEach(function (element) {
|
|||||||
if (include_filters.length) {
|
if (include_filters.length) {
|
||||||
let results;
|
let results;
|
||||||
// Foreach filter, go and find it on the page and add it to the results so we can visualise it again
|
// Foreach filter, go and find it on the page and add it to the results so we can visualise it again
|
||||||
outerLoop:
|
|
||||||
for (const f of include_filters) {
|
for (const f of include_filters) {
|
||||||
// Quick check so we dont end up with duplicates in the training data
|
|
||||||
for (let index = 0; index < size_pos.length; index++) {
|
|
||||||
let item = size_pos[index];
|
|
||||||
if (item.xpath === f) {
|
|
||||||
item.highlight_as_custom_filter = true;
|
|
||||||
item.found_as_duplicate = true;
|
|
||||||
item.label = "price";
|
|
||||||
continue outerLoop;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
bbox = false;
|
bbox = false;
|
||||||
q = false;
|
q = false;
|
||||||
|
|
||||||
@@ -294,6 +205,7 @@ if (include_filters.length) {
|
|||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
console.log("[css] Scanning for included filter " + f)
|
console.log("[css] Scanning for included filter " + f)
|
||||||
|
console.log("[css] Scanning for included filter " + f);
|
||||||
results = document.querySelectorAll(f);
|
results = document.querySelectorAll(f);
|
||||||
}
|
}
|
||||||
} catch (e) {
|
} catch (e) {
|
||||||
@@ -330,15 +242,17 @@ if (include_filters.length) {
|
|||||||
console.log("xpath_element_scraper: error looking up q.ownerElement")
|
console.log("xpath_element_scraper: error looking up q.ownerElement")
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
element_info = get_element_metadata(node);
|
|
||||||
if(element_info) {
|
|
||||||
// Be sure we use exactly what was written
|
|
||||||
element_info['xpath'] = f;
|
|
||||||
element_info['highlight_as_custom_filter'] = true;
|
|
||||||
element_info['label'] = "price";
|
|
||||||
size_pos.push(element_info);
|
|
||||||
}
|
|
||||||
|
|
||||||
|
if (bbox && bbox['width'] > 0 && bbox['height'] > 0) {
|
||||||
|
size_pos.push({
|
||||||
|
xpath: f,
|
||||||
|
width: parseInt(bbox['width']),
|
||||||
|
height: parseInt(bbox['height']),
|
||||||
|
left: parseInt(bbox['left']),
|
||||||
|
top: parseInt(bbox['top']) + scroll_y,
|
||||||
|
highlight_as_custom_filter: true
|
||||||
|
});
|
||||||
|
}
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -792,7 +792,7 @@ def changedetection_app(config=None, datastore_o=None):
|
|||||||
# Re #286 - We wait for syncing new data to disk in another thread every 60 seconds
|
# Re #286 - We wait for syncing new data to disk in another thread every 60 seconds
|
||||||
# But in the case something is added we should save straight away
|
# But in the case something is added we should save straight away
|
||||||
datastore.needs_write_urgent = True
|
datastore.needs_write_urgent = True
|
||||||
if not datastore.data['watching'][uuid].get('paused'):
|
|
||||||
# Queue the watch for immediate recheck, with a higher priority
|
# Queue the watch for immediate recheck, with a higher priority
|
||||||
update_q.put(queuedWatchMetaData.PrioritizedItem(priority=1, item={'uuid': uuid, 'skip_when_checksum_same': False}))
|
update_q.put(queuedWatchMetaData.PrioritizedItem(priority=1, item={'uuid': uuid, 'skip_when_checksum_same': False}))
|
||||||
|
|
||||||
@@ -1377,15 +1377,13 @@ def changedetection_app(config=None, datastore_o=None):
|
|||||||
import brotli
|
import brotli
|
||||||
|
|
||||||
watch = datastore.data['watching'].get(uuid)
|
watch = datastore.data['watching'].get(uuid)
|
||||||
if watch and watch.history.keys() and os.path.isdir(watch.watch_data_dir):
|
if watch and os.path.isdir(watch.watch_data_dir):
|
||||||
latest_filename = list(watch.history.keys())[-1]
|
latest_filename = list(watch.history.keys())[0]
|
||||||
html_fname = os.path.join(watch.watch_data_dir, f"{latest_filename}.html.br")
|
html_fname = os.path.join(watch.watch_data_dir, f"{latest_filename}.html.br")
|
||||||
with open(html_fname, 'rb') as f:
|
|
||||||
if html_fname.endswith('.br'):
|
if html_fname.endswith('.br'):
|
||||||
# Read and decompress the Brotli file
|
# Read and decompress the Brotli file
|
||||||
|
with open(html_fname, 'rb') as f:
|
||||||
decompressed_data = brotli.decompress(f.read())
|
decompressed_data = brotli.decompress(f.read())
|
||||||
else:
|
|
||||||
decompressed_data = f.read()
|
|
||||||
|
|
||||||
buffer = BytesIO(decompressed_data)
|
buffer = BytesIO(decompressed_data)
|
||||||
|
|
||||||
@@ -1602,15 +1600,6 @@ def changedetection_app(config=None, datastore_o=None):
|
|||||||
|
|
||||||
flash(f"{len(uuids)} watches were tagged")
|
flash(f"{len(uuids)} watches were tagged")
|
||||||
|
|
||||||
elif op.startswith('mode:'):
|
|
||||||
mode = op.replace('mode:','')
|
|
||||||
for uuid in uuids:
|
|
||||||
uuid = uuid.strip()
|
|
||||||
if datastore.data['watching'].get(uuid):
|
|
||||||
datastore.data['watching'][uuid]['processor'] = mode
|
|
||||||
flash(f"{len(uuids)} watches changed modes")
|
|
||||||
|
|
||||||
|
|
||||||
return redirect(url_for('index'))
|
return redirect(url_for('index'))
|
||||||
|
|
||||||
@app.route("/api/share-url", methods=['GET'])
|
@app.route("/api/share-url", methods=['GET'])
|
||||||
|
|||||||
@@ -518,7 +518,7 @@ class model(watch_base):
|
|||||||
self.ensure_data_dir_exists()
|
self.ensure_data_dir_exists()
|
||||||
|
|
||||||
with open(target_path, 'w') as f:
|
with open(target_path, 'w') as f:
|
||||||
f.write(json.dumps(data, indent=2))
|
f.write(json.dumps(data))
|
||||||
f.close()
|
f.close()
|
||||||
|
|
||||||
# Save as PNG, PNG is larger but better for doing visual diff in the future
|
# Save as PNG, PNG is larger but better for doing visual diff in the future
|
||||||
|
|||||||
@@ -1,12 +1,11 @@
|
|||||||
|
|
||||||
from babel.numbers import parse_decimal
|
|
||||||
from changedetectionio.model.Watch import model as BaseWatch
|
from changedetectionio.model.Watch import model as BaseWatch
|
||||||
from typing import Union
|
|
||||||
import re
|
import re
|
||||||
|
from babel.numbers import parse_decimal
|
||||||
|
|
||||||
class Restock(dict):
|
class Restock(dict):
|
||||||
|
|
||||||
def parse_currency(self, raw_value: str) -> Union[float, None]:
|
def parse_currency(self, raw_value: str) -> float:
|
||||||
# Clean and standardize the value (ie 1,400.00 should be 1400.00), even better would be store the whole thing as an integer.
|
# Clean and standardize the value (ie 1,400.00 should be 1400.00), even better would be store the whole thing as an integer.
|
||||||
standardized_value = raw_value
|
standardized_value = raw_value
|
||||||
|
|
||||||
@@ -22,12 +21,9 @@ class Restock(dict):
|
|||||||
# Remove any non-numeric characters except for the decimal point
|
# Remove any non-numeric characters except for the decimal point
|
||||||
standardized_value = re.sub(r'[^\d.-]', '', standardized_value)
|
standardized_value = re.sub(r'[^\d.-]', '', standardized_value)
|
||||||
|
|
||||||
if standardized_value:
|
|
||||||
# Convert to float
|
# Convert to float
|
||||||
return float(parse_decimal(standardized_value, locale='en'))
|
return float(parse_decimal(standardized_value, locale='en'))
|
||||||
|
|
||||||
return None
|
|
||||||
|
|
||||||
def __init__(self, *args, **kwargs):
|
def __init__(self, *args, **kwargs):
|
||||||
# Define default values
|
# Define default values
|
||||||
default_values = {
|
default_values = {
|
||||||
|
|||||||
@@ -3,13 +3,10 @@ from ..exceptions import ProcessorException
|
|||||||
from . import Restock
|
from . import Restock
|
||||||
from loguru import logger
|
from loguru import logger
|
||||||
import hashlib
|
import hashlib
|
||||||
import os
|
|
||||||
import re
|
import re
|
||||||
import urllib3
|
import urllib3
|
||||||
import time
|
import time
|
||||||
|
|
||||||
from ...html_tools import html_to_text
|
|
||||||
|
|
||||||
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
|
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
|
||||||
name = 'Re-stock & Price detection for single product pages'
|
name = 'Re-stock & Price detection for single product pages'
|
||||||
description = 'Detects if the product goes back to in-stock'
|
description = 'Detects if the product goes back to in-stock'
|
||||||
@@ -37,28 +34,23 @@ def get_itemprop_availability(html_content) -> Restock:
|
|||||||
Kind of funny/cool way to find price/availability in one many different possibilities.
|
Kind of funny/cool way to find price/availability in one many different possibilities.
|
||||||
Use 'extruct' to find any possible RDFa/microdata/json-ld data, make a JSON string from the output then search it.
|
Use 'extruct' to find any possible RDFa/microdata/json-ld data, make a JSON string from the output then search it.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
from jsonpath_ng import parse
|
from jsonpath_ng import parse
|
||||||
|
|
||||||
now = time.time()
|
now = time.time()
|
||||||
import extruct
|
import extruct
|
||||||
logger.trace(f"Imported extruct module in {time.time() - now:.3f}s")
|
logger.trace(f"Imported extruct module in {time.time() - now:.3f}s")
|
||||||
|
|
||||||
|
value = {}
|
||||||
now = time.time()
|
now = time.time()
|
||||||
|
|
||||||
# Extruct is very slow, I'm wondering if some ML is going to be faster (800ms on my i7), 'rdfa' seems to be the heaviest.
|
# Extruct is very slow, I'm wondering if some ML is going to be faster (800ms on my i7), 'rdfa' seems to be the heaviest.
|
||||||
syntaxes = ['dublincore', 'json-ld', 'microdata', 'microformat', 'opengraph']
|
|
||||||
try:
|
|
||||||
data = extruct.extract(html_content, syntaxes=syntaxes)
|
|
||||||
except Exception as e:
|
|
||||||
logger.warning(f"Unable to extract data, document parsing with extruct failed with {type(e).__name__} - {str(e)}")
|
|
||||||
return Restock()
|
|
||||||
|
|
||||||
|
syntaxes = ['dublincore', 'json-ld', 'microdata', 'microformat', 'opengraph']
|
||||||
|
|
||||||
|
data = extruct.extract(html_content, syntaxes=syntaxes)
|
||||||
logger.trace(f"Extruct basic extract of all metadata done in {time.time() - now:.3f}s")
|
logger.trace(f"Extruct basic extract of all metadata done in {time.time() - now:.3f}s")
|
||||||
|
|
||||||
# First phase, dead simple scanning of anything that looks useful
|
# First phase, dead simple scanning of anything that looks useful
|
||||||
value = Restock()
|
value = Restock()
|
||||||
return value
|
|
||||||
if data:
|
if data:
|
||||||
logger.debug(f"Using jsonpath to find price/availability/etc")
|
logger.debug(f"Using jsonpath to find price/availability/etc")
|
||||||
price_parse = parse('$..(price|Price)')
|
price_parse = parse('$..(price|Price)')
|
||||||
@@ -126,52 +118,6 @@ class perform_site_check(difference_detection_processor):
|
|||||||
screenshot = None
|
screenshot = None
|
||||||
xpath_data = None
|
xpath_data = None
|
||||||
|
|
||||||
def ML_scrape_for_price_data(self, ML_price_scraper_url):
|
|
||||||
import requests
|
|
||||||
from changedetectionio import html_tools
|
|
||||||
|
|
||||||
price_info = None
|
|
||||||
|
|
||||||
# Perform the POST request
|
|
||||||
response = requests.post(ML_price_scraper_url, json=self.fetcher.xpath_data)
|
|
||||||
logger.debug(f"ML Price scraper - {ML_price_scraper_url} Response OK? - '{response.ok}'")
|
|
||||||
# Check if the response contains a dict
|
|
||||||
if response.ok: # This checks if the request was successful (status code 200-299)
|
|
||||||
response_json = response.json()
|
|
||||||
logger.debug(f"ML Price scraper: response - {response_json}'")
|
|
||||||
if isinstance(response_json, dict) and 'idx' in response_json.keys():
|
|
||||||
suggested_xpath_idx = response_json.get('idx')
|
|
||||||
if response_json.get('score') <0.80 or response_json.get('score') > 1.0:
|
|
||||||
logger.warning(f"Predict score was outside normal range, aborting ML/AI price check, needs better training data in this case?")
|
|
||||||
return None
|
|
||||||
|
|
||||||
# Use the path provided to extra the price text
|
|
||||||
from price_parser import Price
|
|
||||||
scrape_element = self.fetcher.xpath_data.get('size_pos', {})[suggested_xpath_idx]
|
|
||||||
logger.debug(f"Predicted selector with price information is {scrape_element['xpath']}")
|
|
||||||
|
|
||||||
result_s = None
|
|
||||||
if scrape_element['xpath'][0] == '/' or scrape_element['xpath'].startswith('xpath'):
|
|
||||||
result_s = html_tools.xpath_filter(xpath_filter=scrape_element['xpath'],
|
|
||||||
html_content=self.fetcher.content)
|
|
||||||
else:
|
|
||||||
# CSS Filter, extract the HTML that matches and feed that into the existing inscriptis::get_text
|
|
||||||
result_s = html_tools.include_filters(include_filters=scrape_element['xpath'],
|
|
||||||
html_content=self.fetcher.content)
|
|
||||||
|
|
||||||
if result_s:
|
|
||||||
text = html_to_text(result_s)
|
|
||||||
logger.debug(f"Guessed the text '{text}' as the price information")
|
|
||||||
if text:
|
|
||||||
price_info = Price.fromstring(text)
|
|
||||||
else:
|
|
||||||
logger.error(f"ML Price scraper: missing xpath index (IDX) in response?")
|
|
||||||
else:
|
|
||||||
print(f"ML Price scraper: Request failed with status code: {response.status_code}")
|
|
||||||
|
|
||||||
#@TODO THROW HELPFUL MESSAGE WITH LINK TO TUTORIAL IF IT CANT CONNECT!
|
|
||||||
return price_info
|
|
||||||
|
|
||||||
def run_changedetection(self, watch, skip_when_checksum_same=True):
|
def run_changedetection(self, watch, skip_when_checksum_same=True):
|
||||||
if not watch:
|
if not watch:
|
||||||
raise Exception("Watch no longer exists.")
|
raise Exception("Watch no longer exists.")
|
||||||
@@ -228,21 +174,6 @@ class perform_site_check(difference_detection_processor):
|
|||||||
else:
|
else:
|
||||||
update_obj['restock']['in_stock'] = False
|
update_obj['restock']['in_stock'] = False
|
||||||
|
|
||||||
# Attempt to pass the elements off to the machine-learning endpoint if its enabled
|
|
||||||
# This might return a confident guess as to which element contains the price data
|
|
||||||
if not itemprop_availability.get('price'):
|
|
||||||
ML_price_scraper_url = os.getenv("PRICE_SCRAPER_ML_ENDPOINT")
|
|
||||||
if self.fetcher.xpath_data and ML_price_scraper_url:
|
|
||||||
price_info = self.ML_scrape_for_price_data(ML_price_scraper_url)
|
|
||||||
if price_info and price_info.amount:
|
|
||||||
logger.success(f"ML Price scraper: Got price data {price_info}")
|
|
||||||
itemprop_availability['price'] = f"{price_info.amount}"
|
|
||||||
update_obj['restock']['price'] = f"{price_info.amount}"
|
|
||||||
if price_info and price_info.currency:
|
|
||||||
itemprop_availability['currency'] = price_info.currency
|
|
||||||
update_obj['restock']['currency'] = price_info.currency
|
|
||||||
|
|
||||||
|
|
||||||
# Main detection method
|
# Main detection method
|
||||||
fetched_md5 = None
|
fetched_md5 = None
|
||||||
|
|
||||||
|
|||||||
@@ -37,8 +37,6 @@
|
|||||||
<button class="pure-button button-secondary button-xsmall" name="op" value="assign-tag" id="checkbox-assign-tag">Tag</button>
|
<button class="pure-button button-secondary button-xsmall" name="op" value="assign-tag" id="checkbox-assign-tag">Tag</button>
|
||||||
<button class="pure-button button-secondary button-xsmall" name="op" value="mark-viewed">Mark viewed</button>
|
<button class="pure-button button-secondary button-xsmall" name="op" value="mark-viewed">Mark viewed</button>
|
||||||
<button class="pure-button button-secondary button-xsmall" name="op" value="notification-default">Use default notification</button>
|
<button class="pure-button button-secondary button-xsmall" name="op" value="notification-default">Use default notification</button>
|
||||||
<button class="pure-button button-secondary button-xsmall" name="op" value="mode:text_json_diff">Mode: Page changes</button>
|
|
||||||
<button class="pure-button button-secondary button-xsmall" name="op" value="mode:restock_diff">Mode: Price/Restock</button>
|
|
||||||
<button class="pure-button button-secondary button-xsmall" name="op" value="clear-errors">Clear errors</button>
|
<button class="pure-button button-secondary button-xsmall" name="op" value="clear-errors">Clear errors</button>
|
||||||
<button class="pure-button button-secondary button-xsmall" style="background: #dd4242;" name="op" value="clear-history">Clear/reset history</button>
|
<button class="pure-button button-secondary button-xsmall" style="background: #dd4242;" name="op" value="clear-history">Clear/reset history</button>
|
||||||
<button class="pure-button button-secondary button-xsmall" style="background: #dd4242;" name="op" value="delete">Delete</button>
|
<button class="pure-button button-secondary button-xsmall" style="background: #dd4242;" name="op" value="delete">Delete</button>
|
||||||
|
|||||||
@@ -8,7 +8,7 @@ from changedetectionio import changedetection_app
|
|||||||
from changedetectionio import store
|
from changedetectionio import store
|
||||||
import os
|
import os
|
||||||
import sys
|
import sys
|
||||||
|
from loguru import logger
|
||||||
|
|
||||||
# https://github.com/pallets/flask/blob/1.1.2/examples/tutorial/tests/test_auth.py
|
# https://github.com/pallets/flask/blob/1.1.2/examples/tutorial/tests/test_auth.py
|
||||||
# Much better boilerplate than the docs
|
# Much better boilerplate than the docs
|
||||||
|
|||||||
@@ -1,5 +1,6 @@
|
|||||||
#!/usr/bin/env python3
|
#!/usr/bin/env python3
|
||||||
|
|
||||||
|
import time
|
||||||
from flask import url_for
|
from flask import url_for
|
||||||
from ..util import live_server_setup, wait_for_all_checks
|
from ..util import live_server_setup, wait_for_all_checks
|
||||||
import logging
|
import logging
|
||||||
|
|||||||
@@ -1,127 +0,0 @@
|
|||||||
import os
|
|
||||||
|
|
||||||
from flask import url_for
|
|
||||||
from changedetectionio.tests.util import set_original_response, set_modified_response, set_more_modified_response, live_server_setup, \
|
|
||||||
wait_for_all_checks, \
|
|
||||||
set_longer_modified_response
|
|
||||||
|
|
||||||
import time
|
|
||||||
|
|
||||||
# No semantic data just some text, we should be able to find the product price.
|
|
||||||
def set_response(price="121.95"):
    """Write the product page served by the test endpoint.

    The page carries no semantic/structured price data, only plain markup and
    text: a title, a description, the price prefixed with a dollar sign, a
    "Buy Now" link and the text "IN STOCK".

    :param price: Text placed after the "$" inside the price ``<span>``.
    :return: None
    """
    # The HTML lines are kept flush-left so the bytes written match the payload exactly.
    # CSS braces are doubled because this is an f-string; only {price} is substituted.
    html_content = f"""
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Ajax Widget</title>
<style>
body {{
font-family: Arial, sans-serif;
margin: 0;
padding: 0;
display: flex;
justify-content: center;
align-items: center;
height: 100vh;
background-color: #f4f4f4;
}}
.container {{
display: flex;
flex-direction: row;
background-color: #fff;
border: 1px solid #ddd;
padding: 20px;
border-radius: 5px;
box-shadow: 0 0 10px rgba(0, 0, 0, 0.1);
width: 80%;
max-width: 800px;
}}
.description {{
flex: 2;
margin-right: 20px;
}}
.description h1 {{
margin-top: 0;
}}
.price {{
flex: 1;
text-align: right;
font-size: 24px;
color: #333;
}}
.price span {{
font-size: 32px;
font-weight: bold;
}}
.buy-button {{
display: inline-block;
margin-top: 20px;
padding: 10px 20px;
background-color: #28a745;
color: #fff;
text-decoration: none;
border-radius: 5px;
font-size: 16px;
}}
.buy-button:hover {{
background-color: #218838;
}}
</style>
</head>
<body>
<div class="container">
<div class="description">
<h1>Ajax Widget</h1>
<p>The Ajax Widget is the ultimate solution for all your widget needs. Crafted with precision and using the latest technology, this widget offers unmatched performance and durability. Whether you're using it for personal or professional purposes, the Ajax Widget will not disappoint. It's easy to use, reliable, and comes with a sleek design that complements any setup. Don't settle for less; get the best with the Ajax Widget today!</p>
</div>
<div class="price">
<span>${price}</span>
<br>
<a href="#" class="buy-button">Buy Now</a><br>
IN STOCK
</div>
</div>
</body>
</html>
"""

    # The page declares charset UTF-8, so write it as UTF-8 rather than relying
    # on the platform default encoding.
    with open("test-datastore/endpoint-content.txt", "w", encoding="utf-8") as f:
        f.write(html_content)

    # NOTE(review): presumably gives the written file time to settle before the
    # next fetch - confirm whether this pause is still required.
    time.sleep(1)
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
def test_restock_itemprop_basic(client, live_server):
    """Add a restock_diff watch on a page without semantic price data and check the overview.

    Expects the price written by set_response() to show up on the index page
    together with the in-stock marker, then removes all watches again.
    """
    # PLAYWRIGHT_DRIVER_URL needs to be set and something like 'ws://127.0.0.1:3000'
    assert os.getenv('PLAYWRIGHT_DRIVER_URL'), "Needs PLAYWRIGHT_DRIVER_URL set for this test"
    assert os.getenv('PRICE_SCRAPER_ML_ENDPOINT'), "Needs PRICE_SCRAPER_ML_ENDPOINT set for this test"

    live_server_setup(live_server)
    set_response(price="123.99")

    # The endpoint is fetched from within the sockpuppetbrowser, so the local
    # hostnames are swapped for 'cdio' (longest name first, same order as before).
    endpoint_url = url_for('test_endpoint', _external=True)
    for local_name in ('localhost.localdomain', 'localhost'):
        endpoint_url = endpoint_url.replace(local_name, 'cdio')

    add_watch_url = url_for("form_quick_watch_add")
    watch_form = {"url": endpoint_url, "tags": 'restock tests', 'processor': 'restock_diff'}
    client.post(add_watch_url, data=watch_form, follow_redirects=True)
    wait_for_all_checks(client)

    overview = client.get(url_for("index")).data
    assert b'123.99' in overview
    assert b' in-stock' in overview
    assert b' not-in-stock' not in overview

    # Cleanup everything
    delete_response = client.get(url_for("form_delete", uuid="all"), follow_redirects=True)
    assert b'Deleted' in delete_response.data
|
|
||||||
@@ -69,12 +69,6 @@ def test_check_basic_change_detection_functionality(client, live_server, measure
|
|||||||
|
|
||||||
wait_for_all_checks(client)
|
wait_for_all_checks(client)
|
||||||
|
|
||||||
uuid = extract_UUID_from_client(client)
|
|
||||||
|
|
||||||
# Check the 'get latest snapshot works'
|
|
||||||
res = client.get(url_for("watch_get_latest_html", uuid=uuid))
|
|
||||||
assert b'which has this one new line' in res.data
|
|
||||||
|
|
||||||
# Now something should be ready, indicated by having a 'unviewed' class
|
# Now something should be ready, indicated by having a 'unviewed' class
|
||||||
res = client.get(url_for("index"))
|
res = client.get(url_for("index"))
|
||||||
assert b'unviewed' in res.data
|
assert b'unviewed' in res.data
|
||||||
@@ -92,7 +86,7 @@ def test_check_basic_change_detection_functionality(client, live_server, measure
|
|||||||
assert expected_url.encode('utf-8') in res.data
|
assert expected_url.encode('utf-8') in res.data
|
||||||
|
|
||||||
# Following the 'diff' link, it should no longer display as 'unviewed' even after we recheck it a few times
|
# Following the 'diff' link, it should no longer display as 'unviewed' even after we recheck it a few times
|
||||||
res = client.get(url_for("diff_history_page", uuid=uuid))
|
res = client.get(url_for("diff_history_page", uuid="first"))
|
||||||
assert b'selected=""' in res.data, "Confirm diff history page loaded"
|
assert b'selected=""' in res.data, "Confirm diff history page loaded"
|
||||||
|
|
||||||
# Check the [preview] pulls the right one
|
# Check the [preview] pulls the right one
|
||||||
@@ -149,12 +143,18 @@ def test_check_basic_change_detection_functionality(client, live_server, measure
|
|||||||
assert b'unviewed' not in res.data
|
assert b'unviewed' not in res.data
|
||||||
|
|
||||||
# #2458 "clear history" should make the Watch object update its status correctly when the first snapshot lands again
|
# #2458 "clear history" should make the Watch object update its status correctly when the first snapshot lands again
|
||||||
|
uuid = extract_UUID_from_client(client)
|
||||||
client.get(url_for("clear_watch_history", uuid=uuid))
|
client.get(url_for("clear_watch_history", uuid=uuid))
|
||||||
client.get(url_for("form_watch_checknow"), follow_redirects=True)
|
client.get(url_for("form_watch_checknow"), follow_redirects=True)
|
||||||
wait_for_all_checks(client)
|
wait_for_all_checks(client)
|
||||||
res = client.get(url_for("index"))
|
res = client.get(url_for("index"))
|
||||||
assert b'preview/' in res.data
|
assert b'preview/' in res.data
|
||||||
|
|
||||||
|
|
||||||
|
# Check the 'get latest snapshot works'
|
||||||
|
res = client.get(url_for("watch_get_latest_html", uuid=uuid))
|
||||||
|
assert b'<head><title>head title</title></head>' in res.data
|
||||||
|
|
||||||
#
|
#
|
||||||
# Cleanup everything
|
# Cleanup everything
|
||||||
res = client.get(url_for("form_delete", uuid="all"), follow_redirects=True)
|
res = client.get(url_for("form_delete", uuid="all"), follow_redirects=True)
|
||||||
|
|||||||
@@ -2,7 +2,6 @@
|
|||||||
|
|
||||||
from flask import url_for
|
from flask import url_for
|
||||||
from .util import set_original_response, set_modified_response, live_server_setup, wait_for_all_checks
|
from .util import set_original_response, set_modified_response, live_server_setup, wait_for_all_checks
|
||||||
import time
|
|
||||||
|
|
||||||
def set_nonrenderable_response():
|
def set_nonrenderable_response():
|
||||||
test_return_data = """<html>
|
test_return_data = """<html>
|
||||||
@@ -91,7 +90,6 @@ def test_check_basic_change_detection_functionality(client, live_server, measure
|
|||||||
|
|
||||||
# A totally zero byte (#2528) response should also not trigger an error
|
# A totally zero byte (#2528) response should also not trigger an error
|
||||||
set_zero_byte_response()
|
set_zero_byte_response()
|
||||||
time.sleep(2)
|
|
||||||
client.get(url_for("form_watch_checknow"), follow_redirects=True)
|
client.get(url_for("form_watch_checknow"), follow_redirects=True)
|
||||||
wait_for_all_checks(client)
|
wait_for_all_checks(client)
|
||||||
res = client.get(url_for("index"))
|
res = client.get(url_for("index"))
|
||||||
|
|||||||
@@ -18,7 +18,7 @@ services:
|
|||||||
#
|
#
|
||||||
# Log levels are in descending order. (TRACE is the most detailed one)
|
# Log levels are in descending order. (TRACE is the most detailed one)
|
||||||
# Log output levels: TRACE, DEBUG(default), INFO, SUCCESS, WARNING, ERROR, CRITICAL
|
# Log output levels: TRACE, DEBUG(default), INFO, SUCCESS, WARNING, ERROR, CRITICAL
|
||||||
# - LOGGER_LEVEL=TRACE
|
# - LOGGER_LEVEL=DEBUG
|
||||||
#
|
#
|
||||||
# Alternative WebDriver/selenium URL, do not use "'s or 's!
|
# Alternative WebDriver/selenium URL, do not use "'s or 's!
|
||||||
# - WEBDRIVER_URL=http://browser-chrome:4444/wd/hub
|
# - WEBDRIVER_URL=http://browser-chrome:4444/wd/hub
|
||||||
@@ -29,9 +29,8 @@ services:
|
|||||||
#
|
#
|
||||||
# https://selenium-python.readthedocs.io/api.html#module-selenium.webdriver.common.proxy
|
# https://selenium-python.readthedocs.io/api.html#module-selenium.webdriver.common.proxy
|
||||||
#
|
#
|
||||||
# Alternative target "Chrome" Playwright URL, do not use "'s or 's!
|
# Alternative Playwright URL, do not use "'s or 's!
|
||||||
# "Playwright" is a driver/library that allows changedetection to talk to a Chrome or similar browser.
|
# - PLAYWRIGHT_DRIVER_URL=ws://playwright-chrome:3000
|
||||||
# - PLAYWRIGHT_DRIVER_URL=ws://sockpuppetbrowser:3000
|
|
||||||
#
|
#
|
||||||
# Playwright proxy settings playwright_proxy_server, playwright_proxy_bypass, playwright_proxy_username, playwright_proxy_password
|
# Playwright proxy settings playwright_proxy_server, playwright_proxy_bypass, playwright_proxy_username, playwright_proxy_password
|
||||||
#
|
#
|
||||||
@@ -58,10 +57,6 @@ services:
|
|||||||
#
|
#
|
||||||
# Absolute minimum seconds to recheck, overrides any watch minimum, change to 0 to disable
|
# Absolute minimum seconds to recheck, overrides any watch minimum, change to 0 to disable
|
||||||
# - MINIMUM_SECONDS_RECHECK_TIME=3
|
# - MINIMUM_SECONDS_RECHECK_TIME=3
|
||||||
#
|
|
||||||
# Scrape prices from web pages automatically where the page has no embedded price information (see below also)
|
|
||||||
# - PRICE_SCRAPER_ML_ENDPOINT=http://cdio-price-scraper:5005
|
|
||||||
|
|
||||||
# Comment out ports: when using behind a reverse proxy , enable networks: etc.
|
# Comment out ports: when using behind a reverse proxy , enable networks: etc.
|
||||||
ports:
|
ports:
|
||||||
- 5000:5000
|
- 5000:5000
|
||||||
@@ -78,10 +73,10 @@ services:
|
|||||||
# condition: service_started
|
# condition: service_started
|
||||||
|
|
||||||
|
|
||||||
# Sockpuppetbrowser is basically chrome wrapped in an API for allowing fast fetching of web-pages.
|
# Used for fetching pages via Playwright+Chrome where you need Javascript support.
|
||||||
# RECOMMENDED FOR FETCHING PAGES WITH CHROME
|
# RECOMMENDED FOR FETCHING PAGES WITH CHROME
|
||||||
# sockpuppetbrowser:
|
# playwright-chrome:
|
||||||
# hostname: sockpuppetbrowser
|
# hostname: playwright-chrome
|
||||||
# image: dgtlmoon/sockpuppetbrowser:latest
|
# image: dgtlmoon/sockpuppetbrowser:latest
|
||||||
# cap_add:
|
# cap_add:
|
||||||
# - SYS_ADMIN
|
# - SYS_ADMIN
|
||||||
@@ -108,13 +103,6 @@ services:
|
|||||||
# # Workaround to avoid the browser crashing inside a docker container
|
# # Workaround to avoid the browser crashing inside a docker container
|
||||||
# # See https://github.com/SeleniumHQ/docker-selenium#quick-start
|
# # See https://github.com/SeleniumHQ/docker-selenium#quick-start
|
||||||
# - /dev/shm:/dev/shm
|
# - /dev/shm:/dev/shm
|
||||||
# restart: unless-stopped
|
|
||||||
|
|
||||||
# Machine Learning/AI - Use "Visual Selector" elements data to scrape price data
|
|
||||||
|
|
||||||
# cdio-keras-price-scraper:
|
|
||||||
# hostname: cdio-price-scraper
|
|
||||||
# image: dgtlmoon/changedetection-AI-pricescraper
|
|
||||||
# restart: unless-stopped
|
# restart: unless-stopped
|
||||||
|
|
||||||
volumes:
|
volumes:
|
||||||
|
|||||||
@@ -79,10 +79,9 @@ pyppeteerstealth>=0.0.4
|
|||||||
pytest ~=7.2
|
pytest ~=7.2
|
||||||
pytest-flask ~=1.2
|
pytest-flask ~=1.2
|
||||||
|
|
||||||
# Anything 4.0 and up but not 5.0
|
# Pin jsonschema version to prevent build errors on armv6 while rpds-py wheels aren't available (1708)
|
||||||
jsonschema ~= 4.0
|
jsonschema==4.17.3
|
||||||
|
|
||||||
price_parser
|
|
||||||
loguru
|
loguru
|
||||||
|
|
||||||
# For scraping all possible metadata relating to products so we can do better restock detection
|
# For scraping all possible metadata relating to products so we can do better restock detection
|
||||||
|
|||||||
Reference in New Issue
Block a user