Compare commits

...

30 Commits

Author SHA1 Message Date
dgtlmoon
f967893bd8 Merge branch 'master' into price-scraper-ML-integration 2024-09-04 13:57:47 +02:00
dgtlmoon
5b70625eaa 0.46.04 2024-09-04 13:55:18 +02:00
dgtlmoon
60d292107d Fixing restock monitor tests and tweaking docker default config example 2024-09-02 15:11:31 +02:00
dgtlmoon
1cb38347da Container name should be 'sockpuppetbrowser' because it's not just playwright that uses it 2024-09-02 13:21:38 +02:00
dgtlmoon
55fe2abf42 Restock/Price detection - Better catching of errors when parsing metadata documents for restock/price check (#2602) 2024-09-01 13:07:06 +02:00
dgtlmoon
4225900ec3 Restock - updating texts and text offsets 2024-09-01 12:47:21 +02:00
dgtlmoon
392cc4586f extra selectors 2024-09-01 12:24:39 +02:00
dgtlmoon
b7984f266a Needed new tag 2024-08-30 17:53:16 +02:00
dgtlmoon
446622159c WIP - adding more scrape data and some dev tweaks 2024-08-30 16:06:04 +02:00
dgtlmoon
2e82b17cac fix path 2024-08-23 21:51:33 +02:00
dgtlmoon
7c914cd266 dangit 2024-08-23 20:24:35 +02:00
dgtlmoon
5fa841637e fix imports, paths 2024-08-23 19:53:14 +02:00
dgtlmoon
72579d8ea2 woops 2024-08-23 18:38:51 +02:00
dgtlmoon
bf5b1143e3 Adding test 2024-08-23 18:31:54 +02:00
dgtlmoon
106f258d13 add delay 2024-08-23 14:27:12 +02:00
dgtlmoon
d7160d79bd Use integer value for ML of the r,g,b 2024-08-23 12:25:26 +02:00
dgtlmoon
c0e9846a85 Adding more training data 2024-08-23 11:31:35 +02:00
dgtlmoon
3656951259 Initial support for scraper ML price extraction integration 2024-08-22 16:50:46 +02:00
dgtlmoon
1fb4342488 Build - Unpin jsonschema for faster builds (#2583) 2024-08-22 15:02:00 +02:00
dgtlmoon
7071df061a Price detection/scraping - Adding extra element training data (#2582) 2024-08-22 15:01:36 +02:00
dgtlmoon
6dd1fa2b88 0.46.03 2024-08-19 17:22:13 +02:00
dgtlmoon
371f85d544 Watch 'Download last snapshot' link/button should give last, not first snapshot (#2576) 2024-08-19 17:20:30 +02:00
dgtlmoon
932cf15e1e Price and restock scraping - small price fix scraper (#2575) 2024-08-19 15:47:19 +02:00
Mike Splain
bf0d410d32 Browser Steps UI - Interactive UI wasn't sending headers but was when the check ran (#2551) 2024-08-19 10:21:05 +02:00
dgtlmoon
730f37c7ba Set encoding type for scraper script reader (#2574 #2568) 2024-08-19 09:17:18 +02:00
dgtlmoon
8a35d62e02 Handle zero-byte/empty content responses with "[ ] Empty pages are a change" option, the same as when the HTML doesn't render any useful text (#2530) 2024-07-29 13:27:59 +02:00
dgtlmoon
f527744024 0.46.02 2024-07-27 20:28:04 +02:00
dgtlmoon
71c9b1273c Adding test for #1995 UTF-8 encoding in POST request body and post:// notifications (#2525) 2024-07-27 19:47:03 +02:00
dgtlmoon
ec68450df1 Updating Apprise notification library, Splunk/VictorOps, Africas Talking, Microsoft Power Automate / Workflows, Société Française du Radiotéléphone (SFR) Support (#2524) 2024-07-27 14:28:57 +02:00
dgtlmoon
2fd762a783 Encode POST style requests and notifications as UTF-8 if it has no encoding/basic string (#2523) 2024-07-27 14:27:15 +02:00
29 changed files with 475 additions and 126 deletions

View File

@@ -52,6 +52,10 @@ jobs:
docker run --network changedet-network -d -e "LOG_LEVEL=TRACE" --cap-add=SYS_ADMIN --name sockpuppetbrowser --hostname sockpuppetbrowser --rm -p 3000:3000 dgtlmoon/sockpuppetbrowser:latest
docker run --network changedet-network -d -e "LOG_LEVEL=TRACE" --cap-add=SYS_ADMIN --name sockpuppetbrowser-custom-url --hostname sockpuppetbrowser-custom-url -p 3001:3000 --rm dgtlmoon/sockpuppetbrowser:latest
+ # CDIO AI Element scraper for prices
+ # Run CDIO with PRICE_SCRAPER_ML_ENDPOINT=http://cdio-ai-price-element:5005/price-element
+ docker run --network changedet-network -d -e "LOG_LEVEL=TRACE" --name cdio-ai-price-element --hostname cdio-ai-price-element -p 5005:5005 --rm dgtlmoon/changedetection.io-ai:latest
- name: Spin up ancillary SMTP+Echo message test server
run: |
# Debug SMTP server/echo message back server
@@ -95,6 +99,11 @@ jobs:
# Settings headers playwright tests - Call back in from Sockpuppetbrowser, check headers
docker run --name "changedet" --hostname changedet --rm -e "FLASK_SERVER_NAME=changedet" -e "PLAYWRIGHT_DRIVER_URL=ws://sockpuppetbrowser:3000?dumpio=true" --network changedet-network test-changedetectionio bash -c 'find .; cd changedetectionio; pytest --live-server-host=0.0.0.0 --live-server-port=5004 tests/test_request.py; pwd;find .'
# PLAYWRIGHT/NODE-> CDP
+ - name: ML/AI Price element scraper via Playwright+dgtlmoon/changedetection.io-ai
+ run: |
+ docker run --rm -e "FLASK_SERVER_NAME=cdio" -e "PLAYWRIGHT_DRIVER_URL=ws://sockpuppetbrowser:3000" -e "PRICE_SCRAPER_ML_ENDPOINT=ws://cdio-ai-price-element:5005/price-element" --network changedet-network --hostname=cdio test-changedetectionio bash -c 'cd changedetectionio;pytest --live-server-host=0.0.0.0 --live-server-port=5004 tests/ml_price_scraper/test_scrape_price_element.py'
- name: Playwright and SocketPuppetBrowser - Restock detection
run: |
# restock detection via playwright - added name=changedet here so that playwright and sockpuppetbrowser can connect to it
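A quick way to smoke-test the new price-element container from the workflow above — a sketch only, assuming the service accepts the same xpath_data document the fetcher collects (a 'size_pos' list of element metadata) and answers with an index plus confidence score, as the processor code later in this diff expects:

import requests

# Hypothetical minimal payload; the real xpath_data carries many more fields per element
sample_xpath_data = {
    "size_pos": [
        {"xpath": "//div[@class='price']/span", "width": 120, "height": 32,
         "textLength": 7, "hasDigit": True, "hasDigitCurrency": True},
    ]
}

response = requests.post("http://cdio-ai-price-element:5005/price-element",
                         json=sample_xpath_data, timeout=10)
response.raise_for_status()
result = response.json()
# The processor treats 'idx' as an index into size_pos and aborts when the
# score is < 0.80 or > 1.0
print(result.get("idx"), result.get("score"))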

View File

@@ -2,7 +2,7 @@
# Read more https://github.com/dgtlmoon/changedetection.io/wiki
- __version__ = '0.46.01'
+ __version__ = '0.46.04'
from changedetectionio.strtobool import strtobool
from json.decoder import JSONDecodeError

View File

@@ -85,7 +85,8 @@ def construct_blueprint(datastore: ChangeDetectionStore):
browsersteps_start_session['browserstepper'] = browser_steps.browsersteps_live_ui(
playwright_browser=browsersteps_start_session['browser'],
proxy=proxy,
- start_url=datastore.data['watching'][watch_uuid].get('url')
+ start_url=datastore.data['watching'][watch_uuid].get('url'),
+ headers=datastore.data['watching'][watch_uuid].get('headers')
)
# For test

View File

@@ -4,7 +4,7 @@ from loguru import logger
from changedetectionio.content_fetchers.exceptions import BrowserStepsStepException
import os
- visualselector_xpath_selectors = 'div,span,form,table,tbody,tr,td,a,p,ul,li,h1,h2,h3,h4,header,footer,section,article,aside,details,main,nav,section,summary'
+ visualselector_xpath_selectors = 'div,span,form,table,tbody,tr,td,a,p,ul,li,h1,h2,h3,h4,header,footer,section,article,aside,details,main,nav,section,summary,bdi,strong'
# available_fetchers() will scan this implementation looking for anything starting with html_
# this information is used in the form selections

View File

@@ -65,8 +65,8 @@ class Fetcher():
def __init__(self):
import importlib.resources
- self.xpath_element_js = importlib.resources.files("changedetectionio.content_fetchers.res").joinpath('xpath_element_scraper.js').read_text()
- self.instock_data_js = importlib.resources.files("changedetectionio.content_fetchers.res").joinpath('stock-not-in-stock.js').read_text()
+ self.xpath_element_js = importlib.resources.files("changedetectionio.content_fetchers.res").joinpath('xpath_element_scraper.js').read_text(encoding='utf-8')
+ self.instock_data_js = importlib.resources.files("changedetectionio.content_fetchers.res").joinpath('stock-not-in-stock.js').read_text(encoding='utf-8')
@abstractmethod
def get_error(self):
@@ -81,7 +81,8 @@ class Fetcher():
request_method,
ignore_status_codes=False,
current_include_filters=None,
- is_binary=False):
+ is_binary=False,
+ empty_pages_are_a_change=False):
# Should set self.error, self.status_code and self.content
pass
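For fetcher implementors, a minimal sketch of honouring the new empty_pages_are_a_change flag (the class and exception import paths are those shown in this diff; _fetch_body is a hypothetical helper):

from changedetectionio.content_fetchers.base import Fetcher
from changedetectionio.content_fetchers.exceptions import EmptyReply

class custom_fetcher(Fetcher):
    def run(self, url, timeout, request_headers, request_body, request_method,
            ignore_status_codes=False, current_include_filters=None,
            is_binary=False, empty_pages_are_a_change=False):
        self.content = self._fetch_body(url, timeout)  # hypothetical helper
        self.status_code = 200
        # A zero-byte/whitespace-only body is only an error when the user has
        # not opted in to treating empty pages as a change
        if not self.content.strip() and not empty_pages_are_a_change:
            raise EmptyReply(url=url, status_code=self.status_code)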

View File

@@ -83,7 +83,8 @@ class fetcher(Fetcher):
request_method,
ignore_status_codes=False,
current_include_filters=None,
- is_binary=False):
+ is_binary=False,
+ empty_pages_are_a_change=False):
from playwright.sync_api import sync_playwright
import playwright._impl._errors
@@ -130,7 +131,7 @@ class fetcher(Fetcher):
if response is None:
context.close()
browser.close()
logger.debug("Content Fetcher > Response object was none")
logger.debug("Content Fetcher > Response object from the browser communication was none")
raise EmptyReply(url=url, status_code=None)
try:
@@ -166,10 +167,10 @@ class fetcher(Fetcher):
raise Non200ErrorCodeReceived(url=url, status_code=self.status_code, screenshot=screenshot)
- if len(self.page.content().strip()) == 0:
+ if not empty_pages_are_a_change and len(self.page.content().strip()) == 0:
+ logger.debug("Content Fetcher > Content was empty, empty_pages_are_a_change = False")
context.close()
browser.close()
logger.debug("Content Fetcher > Content was empty")
raise EmptyReply(url=url, status_code=response.status)
# Run Browser Steps here

View File

@@ -75,7 +75,8 @@ class fetcher(Fetcher):
request_method,
ignore_status_codes,
current_include_filters,
- is_binary
+ is_binary,
+ empty_pages_are_a_change
):
from changedetectionio.content_fetchers import visualselector_xpath_selectors
@@ -153,7 +154,7 @@ class fetcher(Fetcher):
if response is None:
await self.page.close()
await browser.close()
logger.warning("Content Fetcher > Response object was none")
logger.warning("Content Fetcher > Response object was none (as in, the response from the browser was empty, not just the content)")
raise EmptyReply(url=url, status_code=None)
self.headers = response.headers
@@ -186,10 +187,11 @@ class fetcher(Fetcher):
raise Non200ErrorCodeReceived(url=url, status_code=self.status_code, screenshot=screenshot)
content = await self.page.content
- if len(content.strip()) == 0:
+ if not empty_pages_are_a_change and len(content.strip()) == 0:
+ logger.error("Content Fetcher > Content was empty (empty_pages_are_a_change is False), closing browsers")
await self.page.close()
await browser.close()
logger.error("Content Fetcher > Content was empty")
raise EmptyReply(url=url, status_code=response.status)
# Run Browser Steps here
@@ -247,7 +249,7 @@ class fetcher(Fetcher):
await self.fetch_page(**kwargs)
def run(self, url, timeout, request_headers, request_body, request_method, ignore_status_codes=False,
- current_include_filters=None, is_binary=False):
+ current_include_filters=None, is_binary=False, empty_pages_are_a_change=False):
#@todo make update_worker async which could run any of these content_fetchers within memory and time constraints
max_time = os.getenv('PUPPETEER_MAX_PROCESSING_TIMEOUT_SECONDS', 180)
@@ -262,7 +264,8 @@ class fetcher(Fetcher):
request_method=request_method,
ignore_status_codes=ignore_status_codes,
current_include_filters=current_include_filters,
- is_binary=is_binary
+ is_binary=is_binary,
+ empty_pages_are_a_change=empty_pages_are_a_change
), timeout=max_time))
except asyncio.TimeoutError:
raise(BrowserFetchTimedOut(msg=f"Browser connected but was unable to process the page in {max_time} seconds."))

View File

@@ -1,9 +1,8 @@
from loguru import logger
+ import chardet
import hashlib
import os
- import chardet
import requests
from changedetectionio import strtobool
from changedetectionio.content_fetchers.exceptions import BrowserStepsInUnsupportedFetcher, EmptyReply, Non200ErrorCodeReceived
from changedetectionio.content_fetchers.base import Fetcher
@@ -26,7 +25,8 @@ class fetcher(Fetcher):
request_method,
ignore_status_codes=False,
current_include_filters=None,
- is_binary=False):
+ is_binary=False,
+ empty_pages_are_a_change=False):
if self.browser_steps_get_valid_steps():
raise BrowserStepsInUnsupportedFetcher(url=url)
@@ -53,7 +53,7 @@ class fetcher(Fetcher):
session.mount('file://', FileAdapter())
r = session.request(method=request_method,
- data=request_body,
+ data=request_body.encode('utf-8') if type(request_body) is str else request_body,
url=url,
headers=request_headers,
timeout=timeout,
@@ -74,7 +74,10 @@ class fetcher(Fetcher):
self.headers = r.headers
if not r.content or not len(r.content):
- raise EmptyReply(url=url, status_code=r.status_code)
+ if not empty_pages_are_a_change:
+ raise EmptyReply(url=url, status_code=r.status_code)
+ else:
+ logger.debug(f"URL {url} gave zero byte content reply with Status Code {r.status_code}, but empty_pages_are_a_change = True")
# @todo test this
# @todo maybe you really want to test zero-byte return pages?
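Why the str-to-bytes guard above matters: when requests is handed a str body, the standard-library HTTP layer beneath it encodes the string as latin-1, so CJK characters (as in the UTF-8 tests added in this PR) raise UnicodeEncodeError. Encoding to UTF-8 first sidesteps that — a sketch, with an illustrative URL:

import requests

request_body = '{"msg": "网站监测 内容更新了"}'
# Same expression as the diff above: only encode when it is still a str
data = request_body.encode('utf-8') if type(request_body) is str else request_body
requests.post('https://example.com/notify', data=data,
              headers={'Content-Type': 'application/json; charset=utf-8'},
              timeout=10)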

View File

@@ -75,6 +75,7 @@ function isItemInStock() {
'vergriffen',
'vorbestellen',
'vorbestellung ist bald möglich',
+ 'we don\'t currently have any',
'we couldn\'t find any products that match',
'we do not currently have an estimate of when this product will be back in stock.',
'we don\'t know when or if this item will be back in stock.',
@@ -173,7 +174,8 @@ function isItemInStock() {
const element = elementsToScan[i];
// outside the 'fold' or some weird text in the heading area
// .getBoundingClientRect() was causing a crash in chrome 119, can only be run on contentVisibility != hidden
- if (element.getBoundingClientRect().top + window.scrollY >= vh + 150 || element.getBoundingClientRect().top + window.scrollY <= 100) {
+ // Note: there's also an automated test that places the 'out of stock' text fairly low down
+ if (element.getBoundingClientRect().top + window.scrollY >= vh + 250 || element.getBoundingClientRect().top + window.scrollY <= 100) {
continue
}
elementText = "";
@@ -187,7 +189,7 @@ function isItemInStock() {
// and these mean its out of stock
for (const outOfStockText of outOfStockTexts) {
if (elementText.includes(outOfStockText)) {
- console.log(`Selected 'Out of Stock' - found text "${outOfStockText}" - "${elementText}"`)
+ console.log(`Selected 'Out of Stock' - found text "${outOfStockText}" - "${elementText}" - offset top ${element.getBoundingClientRect().top}, page height is ${vh}`)
return outOfStockText; // item is out of stock
}
}
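The widened fold window above, restated as a sketch: an element is only scanned for out-of-stock phrases when its top edge sits below the 100 px header zone and above viewport height + 250 px (previously + 150 px):

def in_scan_window(element_top_px: float, viewport_height_px: float) -> bool:
    # Mirrors the JS guard: skip when top >= vh + 250 or top <= 100
    return 100 < element_top_px < viewport_height_px + 250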

View File

@@ -15,6 +15,7 @@ try {
console.log(e);
}
+ const percentageNumerical = str => Math.round((str.match(/\d/g) || []).length / str.length * 100);
// Include the getXpath script directly, easier than fetching
function getxpath(e) {
@@ -77,6 +78,30 @@ const findUpTag = (el) => {
}
return null;
}
+ // Text width scraper for ML training/detection
+ // Create a single canvas and get its 2D context
+ const canvas = document.createElement("canvas");
+ const context = canvas.getContext("2d");
+ // Function to get the width and height of the text inside an element and round them to the nearest integer
+ function getTextWidthAndHeightinPx(element) {
+ // Set the font to match the style of the text in the element
+ context.font = window.getComputedStyle(element).font;
+ // Get the text inside the element
+ const text = element.textContent || element.innerText;
+ // Measure the text width
+ const metrics = context.measureText(text);
+ const width = Math.round(metrics.width);
+ // Get the font size from the computed style
+ const fontSize = parseFloat(window.getComputedStyle(element).fontSize);
+ const height = Math.round(fontSize); // Using font size as an approximation of height
+ // Return both width and height as an object
+ return { textWidth: width, textHeight: height };
+ }
// @todo - if it's SVG or IMG, go into image diff mode
@@ -122,8 +147,10 @@ const visibleElementsArray = [];
// Call collectVisibleElements with the starting parent element
collectVisibleElements(document.body, visibleElementsArray);
// Append any custom selectors to the visibleElementsArray
- visibleElementsArray.forEach(function (element) {
+ function get_element_metadata(element) {
bbox = element.getBoundingClientRect();
@@ -164,18 +191,68 @@ visibleElementsArray.forEach(function (element) {
}
}
let label = "none" // A placeholder, the actual labels for training are done by hand for now
size_pos.push({
+ // Check if the element was found and get its text , not including any child element
+ let text = Array.from(element.childNodes)
+ .filter(node => node.nodeType === Node.TEXT_NODE)
+ .map(node => node.textContent)
+ .join('');
+ // Remove any gaps in sequences of newlines and tabs inside the string
+ text = text.trim().replace(/[\s\t\n\r]{2,}/g, ' ').trim();
+ // Try to identify any possible currency amounts "Sale: 4000" or "Sale now 3000 Kc", can help with the training.
+ // @todo could be instead of USD/AUD etc [A-Z]{2,3} ?
+ //const hasDigitCurrency = (/\d/.test(text.slice(0, 6)) || /\d/.test(text.slice(-6)) ) && /([€£$¥₩₹]|USD|AUD|EUR|Kč|kr|SEK|RM|,)/.test(text) ;
+ const hasDigitCurrency = (/\d/.test(text.slice(0, 6)) || /\d/.test(text.slice(-6)) ) && /([€£$¥₩₹]|USD|AUD|EUR|Kč|kr|SEK|RM|,)/.test(text) ;
+ const hasDigit = /[0-9]/.test(text) ;
+ // Sizing of the actual text inside the element can be very different from the elements size
+ const { textWidth, textHeight } = getTextWidthAndHeightinPx(element);
+ const computedStyle = window.getComputedStyle(element);
+ let red, green, blue;
+ if (text.length) {
+ // Extract the RGB values from the color string (format: rgb(r, g, b))
+ [red, green, blue] = computedStyle.color.match(/\d+/g).map(Number);
+ } else {
+ // Assign default values if text is empty
+ [red, green, blue] = [0, 0, 0];
+ }
+ return {
xpath: xpath_result,
width: Math.round(bbox['width']),
height: Math.round(bbox['height']),
left: Math.floor(bbox['left']),
top: Math.floor(bbox['top']) + scroll_y,
// tagName used by Browser Steps
tagName: (element.tagName) ? element.tagName.toLowerCase() : '',
// tagtype used by Browser Steps
tagtype: (element.tagName.toLowerCase() === 'input' && element.type) ? element.type.toLowerCase() : '',
isClickable: window.getComputedStyle(element).cursor == "pointer"
});
isClickable: window.getComputedStyle(element).cursor === "pointer",
// Used by the keras/pytorch trainer
fontSize: window.getComputedStyle(element).getPropertyValue('font-size'),
fontWeight: window.getComputedStyle(element).getPropertyValue('font-weight'),
pcNumerical: text.length && percentageNumerical(text),
hasDigit: hasDigit,
hasDigitCurrency: hasDigitCurrency,
textWidth: textWidth,
textHeight: textHeight,
textLength: text.length,
t_r: red,
t_g: green,
t_b: blue,
label: label,
};
+ }
+ visibleElementsArray.forEach(function (element) {
+ let metadata = get_element_metadata(element);
+ if(metadata) {
+ size_pos.push(metadata);
+ }
});
@@ -184,7 +261,19 @@ visibleElementsArray.forEach(function (element) {
if (include_filters.length) {
let results;
// Foreach filter, go and find it on the page and add it to the results so we can visualise it again
+ outerLoop:
for (const f of include_filters) {
+ // Quick check so we don't end up with duplicates in the training data
+ for (let index = 0; index < size_pos.length; index++) {
+ let item = size_pos[index];
+ if (item.xpath === f) {
+ item.highlight_as_custom_filter = true;
+ item.found_as_duplicate = true;
+ item.label = "price";
+ continue outerLoop;
+ }
+ }
bbox = false;
q = false;
@@ -205,7 +294,6 @@ if (include_filters.length) {
}
} else {
console.log("[css] Scanning for included filter " + f)
console.log("[css] Scanning for included filter " + f);
results = document.querySelectorAll(f);
}
} catch (e) {
@@ -242,17 +330,15 @@ if (include_filters.length) {
console.log("xpath_element_scraper: error looking up q.ownerElement")
}
}
if (bbox && bbox['width'] > 0 && bbox['height'] > 0) {
- size_pos.push({
- xpath: f,
- width: parseInt(bbox['width']),
- height: parseInt(bbox['height']),
- left: parseInt(bbox['left']),
- top: parseInt(bbox['top']) + scroll_y,
- highlight_as_custom_filter: true
- });
+ element_info = get_element_metadata(node);
+ if(element_info) {
+ // Be sure we use exactly what was written
+ element_info['xpath'] = f;
+ element_info['highlight_as_custom_filter'] = true;
+ element_info['label'] = "price";
+ size_pos.push(element_info);
}
});
}
}
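Put together, the scraper now emits one record per visible element; a sketch of the shape the ML endpoint/trainer receives (field names are those from get_element_metadata above, values purely illustrative):

element_record = {
    "xpath": "//div[@class='price']/span",
    "width": 120, "height": 32,         # rounded bounding box
    "left": 640, "top": 210,            # page coordinates including scroll offset
    "tagName": "span", "tagtype": "",
    "isClickable": False,
    # Fields added in this PR for the keras/pytorch trainer
    "fontSize": "32px", "fontWeight": "700",
    "pcNumerical": 85,                  # percentage of characters that are digits
    "hasDigit": True, "hasDigitCurrency": True,
    "textWidth": 98, "textHeight": 32,  # canvas-measured size of the text itself
    "textLength": 7,
    "t_r": 51, "t_g": 51, "t_b": 51,    # computed text colour as integers
    "label": "none",                    # hand-labelled ("price" etc.) for training
}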

View File

@@ -56,7 +56,8 @@ class fetcher(Fetcher):
request_method,
ignore_status_codes=False,
current_include_filters=None,
- is_binary=False):
+ is_binary=False,
+ empty_pages_are_a_change=False):
from selenium import webdriver
from selenium.webdriver.chrome.options import Options as ChromeOptions

View File

@@ -792,9 +792,9 @@ def changedetection_app(config=None, datastore_o=None):
# Re #286 - We wait for syncing new data to disk in another thread every 60 seconds
# But in the case something is added we should save straight away
datastore.needs_write_urgent = True
- # Queue the watch for immediate recheck, with a higher priority
- update_q.put(queuedWatchMetaData.PrioritizedItem(priority=1, item={'uuid': uuid, 'skip_when_checksum_same': False}))
+ if not datastore.data['watching'][uuid].get('paused'):
+ # Queue the watch for immediate recheck, with a higher priority
+ update_q.put(queuedWatchMetaData.PrioritizedItem(priority=1, item={'uuid': uuid, 'skip_when_checksum_same': False}))
# Diff page [edit] link should go back to diff page
if request.args.get("next") and request.args.get("next") == 'diff':
@@ -1377,17 +1377,19 @@ def changedetection_app(config=None, datastore_o=None):
import brotli
watch = datastore.data['watching'].get(uuid)
- if watch and os.path.isdir(watch.watch_data_dir):
- latest_filename = list(watch.history.keys())[0]
+ if watch and watch.history.keys() and os.path.isdir(watch.watch_data_dir):
+ latest_filename = list(watch.history.keys())[-1]
html_fname = os.path.join(watch.watch_data_dir, f"{latest_filename}.html.br")
- if html_fname.endswith('.br'):
- # Read and decompress the Brotli file
- with open(html_fname, 'rb') as f:
+ with open(html_fname, 'rb') as f:
+ if html_fname.endswith('.br'):
+ # Read and decompress the Brotli file
decompressed_data = brotli.decompress(f.read())
+ else:
+ decompressed_data = f.read()
- buffer = BytesIO(decompressed_data)
+ buffer = BytesIO(decompressed_data)
- return send_file(buffer, as_attachment=True, download_name=f"{latest_filename}.html", mimetype='text/html')
+ return send_file(buffer, as_attachment=True, download_name=f"{latest_filename}.html", mimetype='text/html')
# Return a 500 error
@@ -1600,6 +1602,15 @@ def changedetection_app(config=None, datastore_o=None):
flash(f"{len(uuids)} watches were tagged")
+ elif op.startswith('mode:'):
+ mode = op.replace('mode:','')
+ for uuid in uuids:
+ uuid = uuid.strip()
+ if datastore.data['watching'].get(uuid):
+ datastore.data['watching'][uuid]['processor'] = mode
+ flash(f"{len(uuids)} watches changed modes")
return redirect(url_for('index'))
@app.route("/api/share-url", methods=['GET'])

View File

@@ -518,7 +518,7 @@ class model(watch_base):
self.ensure_data_dir_exists()
with open(target_path, 'w') as f:
- f.write(json.dumps(data))
+ f.write(json.dumps(data, indent=2))
f.close()
# Save as PNG, PNG is larger but better for doing visual diff in the future

View File

@@ -107,7 +107,7 @@ def apprise_custom_api_call_wrapper(body, title, notify_type, *args, **kwargs):
r(results.get('url'),
auth=auth,
- data=body,
+ data=body.encode('utf-8') if type(body) is str else body,
headers=headers,
params=params
)

View File

@@ -26,6 +26,8 @@ class difference_detection_processor():
def call_browser(self):
from requests.structures import CaseInsensitiveDict
+ from changedetectionio.content_fetchers.exceptions import EmptyReply
# Protect against file:// access
if re.search(r'^file://', self.watch.get('url', '').strip(), re.IGNORECASE):
if not strtobool(os.getenv('ALLOW_FILE_URI', 'false')):
@@ -133,8 +135,18 @@ class difference_detection_processor():
is_binary = self.watch.is_pdf
# And here we go! call the right browser with browser-specific settings
- self.fetcher.run(url, timeout, request_headers, request_body, request_method, ignore_status_codes, self.watch.get('include_filters'),
- is_binary=is_binary)
+ empty_pages_are_a_change = self.datastore.data['settings']['application'].get('empty_pages_are_a_change', False)
+ self.fetcher.run(url=url,
+ timeout=timeout,
+ request_headers=request_headers,
+ request_body=request_body,
+ request_method=request_method,
+ ignore_status_codes=ignore_status_codes,
+ current_include_filters=self.watch.get('include_filters'),
+ is_binary=is_binary,
+ empty_pages_are_a_change=empty_pages_are_a_change
+ )
#@todo .quit here could go on close object, so we can run JS if change-detected
self.fetcher.quit()

View File

@@ -1,11 +1,12 @@
- from changedetectionio.model.Watch import model as BaseWatch
- import re
+ from babel.numbers import parse_decimal
+ from changedetectionio.model.Watch import model as BaseWatch
+ from typing import Union
+ import re
class Restock(dict):
- def parse_currency(self, raw_value: str) -> float:
+ def parse_currency(self, raw_value: str) -> Union[float, None]:
# Clean and standardize the value (ie 1,400.00 should be 1400.00), even better would be store the whole thing as an integer.
standardized_value = raw_value
@@ -21,8 +22,11 @@ class Restock(dict):
# Remove any non-numeric characters except for the decimal point
standardized_value = re.sub(r'[^\d.-]', '', standardized_value)
- # Convert to float
- return float(parse_decimal(standardized_value, locale='en'))
+ if standardized_value:
+ # Convert to float
+ return float(parse_decimal(standardized_value, locale='en'))
+ return None
def __init__(self, *args, **kwargs):
# Define default values
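How the hardened parse_currency behaves, as a standalone sketch (re and babel as above; the real method first standardises comma/point decimal conventions, omitted here):

import re
from babel.numbers import parse_decimal

def parse_currency(raw_value):
    # Strip everything except digits, decimal point and sign
    standardized_value = re.sub(r'[^\d.-]', '', raw_value)
    if standardized_value:
        return float(parse_decimal(standardized_value, locale='en'))
    return None  # e.g. raw_value held only a currency symbol, no price at all

assert parse_currency("$1,400.00") == 1400.0
assert parse_currency("€") is None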

View File

@@ -3,10 +3,13 @@ from ..exceptions import ProcessorException
from . import Restock
from loguru import logger
import hashlib
+ import os
import re
import urllib3
+ import time
+ from ...html_tools import html_to_text
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
name = 'Re-stock & Price detection for single product pages'
description = 'Detects if the product goes back to in-stock'
@@ -34,23 +37,28 @@ def get_itemprop_availability(html_content) -> Restock:
Kind of funny/cool way to find price/availability in one many different possibilities.
Use 'extruct' to find any possible RDFa/microdata/json-ld data, make a JSON string from the output then search it.
"""
from jsonpath_ng import parse
+ now = time.time()
import extruct
logger.trace(f"Imported extruct module in {time.time() - now:.3f}s")
value = {}
now = time.time()
# Extruct is very slow, I'm wondering if some ML is going to be faster (800ms on my i7), 'rdfa' seems to be the heaviest.
syntaxes = ['dublincore', 'json-ld', 'microdata', 'microformat', 'opengraph']
+ try:
+ data = extruct.extract(html_content, syntaxes=syntaxes)
+ except Exception as e:
+ logger.warning(f"Unable to extract data, document parsing with extruct failed with {type(e).__name__} - {str(e)}")
+ return Restock()
- data = extruct.extract(html_content, syntaxes=syntaxes)
+ logger.trace(f"Extruct basic extract of all metadata done in {time.time() - now:.3f}s")
# First phase, dead simple scanning of anything that looks useful
+ value = Restock()
return value
if data:
logger.debug(f"Using jsonpath to find price/availability/etc")
price_parse = parse('$..(price|Price)')
@@ -118,6 +126,52 @@ class perform_site_check(difference_detection_processor):
screenshot = None
xpath_data = None
+ def ML_scrape_for_price_data(self, ML_price_scraper_url):
+ import requests
+ from changedetectionio import html_tools
+ price_info = None
+ # Perform the POST request
+ response = requests.post(ML_price_scraper_url, json=self.fetcher.xpath_data)
+ logger.debug(f"ML Price scraper - {ML_price_scraper_url} Response OK? - '{response.ok}'")
+ # Check if the response contains a dict
+ if response.ok: # This checks if the request was successful (status code 200-299)
+ response_json = response.json()
+ logger.debug(f"ML Price scraper: response - {response_json}'")
+ if isinstance(response_json, dict) and 'idx' in response_json.keys():
+ suggested_xpath_idx = response_json.get('idx')
+ if response_json.get('score') < 0.80 or response_json.get('score') > 1.0:
+ logger.warning(f"Predict score was outside normal range, aborting ML/AI price check, needs better training data in this case?")
+ return None
+ # Use the path provided to extract the price text
+ from price_parser import Price
+ scrape_element = self.fetcher.xpath_data.get('size_pos', {})[suggested_xpath_idx]
+ logger.debug(f"Predicted selector with price information is {scrape_element['xpath']}")
+ result_s = None
+ if scrape_element['xpath'][0] == '/' or scrape_element['xpath'].startswith('xpath'):
+ result_s = html_tools.xpath_filter(xpath_filter=scrape_element['xpath'],
+ html_content=self.fetcher.content)
+ else:
+ # CSS Filter, extract the HTML that matches and feed that into the existing inscriptis::get_text
+ result_s = html_tools.include_filters(include_filters=scrape_element['xpath'],
+ html_content=self.fetcher.content)
+ if result_s:
+ text = html_to_text(result_s)
+ logger.debug(f"Guessed the text '{text}' as the price information")
+ if text:
+ price_info = Price.fromstring(text)
+ else:
+ logger.error(f"ML Price scraper: missing xpath index (IDX) in response?")
+ else:
+ print(f"ML Price scraper: Request failed with status code: {response.status_code}")
+ #@TODO THROW HELPFUL MESSAGE WITH LINK TO TUTORIAL IF IT CANT CONNECT!
+ return price_info
def run_changedetection(self, watch, skip_when_checksum_same=True):
if not watch:
raise Exception("Watch no longer exists.")
@@ -174,6 +228,21 @@ class perform_site_check(difference_detection_processor):
else:
update_obj['restock']['in_stock'] = False
+ # Attempt to pass the elements off to the machine-learning endpoint if it's enabled
+ # This might return a confident guess as to which element contains the price data
+ if not itemprop_availability.get('price'):
+ ML_price_scraper_url = os.getenv("PRICE_SCRAPER_ML_ENDPOINT")
+ if self.fetcher.xpath_data and ML_price_scraper_url:
+ price_info = self.ML_scrape_for_price_data(ML_price_scraper_url)
+ if price_info and price_info.amount:
+ logger.success(f"ML Price scraper: Got price data {price_info}")
+ itemprop_availability['price'] = f"{price_info.amount}"
+ update_obj['restock']['price'] = f"{price_info.amount}"
+ if price_info and price_info.currency:
+ itemprop_availability['currency'] = price_info.currency
+ update_obj['restock']['currency'] = price_info.currency
# Main detection method
fetched_md5 = None
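The first-phase metadata scan in get_itemprop_availability, reduced to a standalone sketch (the extruct and jsonpath-ng calls are those used in the diff above; the sample HTML is illustrative):

import extruct
from jsonpath_ng import parse

html_content = '<html><body><script type="application/ld+json">{"@type": "Product", "offers": {"price": "123.99", "priceCurrency": "USD"}}</script></body></html>'
syntaxes = ['dublincore', 'json-ld', 'microdata', 'microformat', 'opengraph']
data = extruct.extract(html_content, syntaxes=syntaxes)

# Find any price/Price key anywhere in the flattened metadata
price_parse = parse('$..(price|Price)')
prices = [match.value for match in price_parse.find(data)]
print(prices)  # ['123.99'] if the JSON-LD block parsed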

View File

@@ -76,7 +76,7 @@
</div>
<div class="pure-control-group">
{{ render_checkbox_field(form.application.form.empty_pages_are_a_change) }}
<span class="pure-form-message-inline">When a page contains HTML, but no renderable text appears (empty page), is this considered a change?</span>
<span class="pure-form-message-inline">When a request returns no content, or the HTML does not contain any text, is this considered a change?</span>
</div>
{% if form.requests.proxy %}
<div class="pure-control-group inline-radio">

View File

@@ -37,6 +37,8 @@
<button class="pure-button button-secondary button-xsmall" name="op" value="assign-tag" id="checkbox-assign-tag">Tag</button>
<button class="pure-button button-secondary button-xsmall" name="op" value="mark-viewed">Mark viewed</button>
<button class="pure-button button-secondary button-xsmall" name="op" value="notification-default">Use default notification</button>
<button class="pure-button button-secondary button-xsmall" name="op" value="mode:text_json_diff">Mode: Page changes</button>
<button class="pure-button button-secondary button-xsmall" name="op" value="mode:restock_diff">Mode: Price/Restock</button>
<button class="pure-button button-secondary button-xsmall" name="op" value="clear-errors">Clear errors</button>
<button class="pure-button button-secondary button-xsmall" style="background: #dd4242;" name="op" value="clear-history">Clear/reset history</button>
<button class="pure-button button-secondary button-xsmall" style="background: #dd4242;" name="op" value="delete">Delete</button>

View File

@@ -8,7 +8,7 @@ from changedetectionio import changedetection_app
from changedetectionio import store
import os
import sys
from loguru import logger
# https://github.com/pallets/flask/blob/1.1.2/examples/tutorial/tests/test_auth.py
# Much better boilerplate than the docs

View File

@@ -1,6 +1,5 @@
#!/usr/bin/env python3
import time
from flask import url_for
from ..util import live_server_setup, wait_for_all_checks
import logging

View File

@@ -0,0 +1,127 @@
import os
from flask import url_for
from changedetectionio.tests.util import set_original_response, set_modified_response, set_more_modified_response, live_server_setup, \
wait_for_all_checks, \
set_longer_modified_response
import time
# No semantic data, just some text - we should be able to find the product price.
def set_response(price="121.95"):
html_content = f"""
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Ajax Widget</title>
<style>
body {{
font-family: Arial, sans-serif;
margin: 0;
padding: 0;
display: flex;
justify-content: center;
align-items: center;
height: 100vh;
background-color: #f4f4f4;
}}
.container {{
display: flex;
flex-direction: row;
background-color: #fff;
border: 1px solid #ddd;
padding: 20px;
border-radius: 5px;
box-shadow: 0 0 10px rgba(0, 0, 0, 0.1);
width: 80%;
max-width: 800px;
}}
.description {{
flex: 2;
margin-right: 20px;
}}
.description h1 {{
margin-top: 0;
}}
.price {{
flex: 1;
text-align: right;
font-size: 24px;
color: #333;
}}
.price span {{
font-size: 32px;
font-weight: bold;
}}
.buy-button {{
display: inline-block;
margin-top: 20px;
padding: 10px 20px;
background-color: #28a745;
color: #fff;
text-decoration: none;
border-radius: 5px;
font-size: 16px;
}}
.buy-button:hover {{
background-color: #218838;
}}
</style>
</head>
<body>
<div class="container">
<div class="description">
<h1>Ajax Widget</h1>
<p>The Ajax Widget is the ultimate solution for all your widget needs. Crafted with precision and using the latest technology, this widget offers unmatched performance and durability. Whether you're using it for personal or professional purposes, the Ajax Widget will not disappoint. It's easy to use, reliable, and comes with a sleek design that complements any setup. Don't settle for less; get the best with the Ajax Widget today!</p>
</div>
<div class="price">
<span>${price}</span>
<br>
<a href="#" class="buy-button">Buy Now</a><br>
IN STOCK
</div>
</div>
</body>
</html>
"""
with open("test-datastore/endpoint-content.txt", "w") as f:
f.write(html_content)
time.sleep(1)
return None
def test_restock_itemprop_basic(client, live_server):
# needs to be set and something like 'ws://127.0.0.1:3000'
assert os.getenv('PLAYWRIGHT_DRIVER_URL'), "Needs PLAYWRIGHT_DRIVER_URL set for this test"
assert os.getenv('PRICE_SCRAPER_ML_ENDPOINT'), "Needs PRICE_SCRAPER_ML_ENDPOINT set for this test"
live_server_setup(live_server)
set_response(price="123.99")
# because it needs to access itself from within the sockpuppetbrowser
test_url = url_for('test_endpoint', _external=True)
test_url = test_url.replace('localhost.localdomain', 'cdio')
test_url = test_url.replace('localhost', 'cdio')
client.post(
url_for("form_quick_watch_add"),
data={"url": test_url, "tags": 'restock tests', 'processor': 'restock_diff'},
follow_redirects=True
)
wait_for_all_checks(client)
res = client.get(url_for("index"))
assert b'123.99' in res.data
assert b' in-stock' in res.data
assert b' not-in-stock' not in res.data
res = client.get(url_for("form_delete", uuid="all"), follow_redirects=True)
assert b'Deleted' in res.data

View File

@@ -112,7 +112,7 @@ def test_check_add_line_contains_trigger(client, live_server, measure_memory_usa
res = client.post(
url_for("settings_page"),
data={"application-notification_title": "New ChangeDetection.io Notification - {{ watch_url }}",
"application-notification_body": 'triggered text was -{{triggered_text}}-',
"application-notification_body": 'triggered text was -{{triggered_text}}- 网站监测 内容更新了',
# https://github.com/caronc/apprise/wiki/Notify_Custom_JSON#get-parameter-manipulation
"application-notification_urls": test_notification_url,
"application-minutes_between_check": 180,
@@ -167,9 +167,10 @@ def test_check_add_line_contains_trigger(client, live_server, measure_memory_usa
# Takes a moment for apprise to fire
time.sleep(3)
assert os.path.isfile("test-datastore/notification.txt"), "Notification fired because I can see the output file"
with open("test-datastore/notification.txt", 'r') as f:
response= f.read()
assert '-Oh yes please-' in response
with open("test-datastore/notification.txt", 'rb') as f:
response = f.read()
assert b'-Oh yes please-' in response
assert '网站监测 内容更新了'.encode('utf-8') in response
res = client.get(url_for("form_delete", uuid="all"), follow_redirects=True)

View File

@@ -69,6 +69,12 @@ def test_check_basic_change_detection_functionality(client, live_server, measure
wait_for_all_checks(client)
uuid = extract_UUID_from_client(client)
+ # Check the 'get latest snapshot works'
+ res = client.get(url_for("watch_get_latest_html", uuid=uuid))
+ assert b'which has this one new line' in res.data
# Now something should be ready, indicated by having a 'unviewed' class
res = client.get(url_for("index"))
assert b'unviewed' in res.data
@@ -86,7 +92,7 @@ def test_check_basic_change_detection_functionality(client, live_server, measure
assert expected_url.encode('utf-8') in res.data
# Following the 'diff' link, it should no longer display as 'unviewed' even after we recheck it a few times
res = client.get(url_for("diff_history_page", uuid="first"))
res = client.get(url_for("diff_history_page", uuid=uuid))
assert b'selected=""' in res.data, "Confirm diff history page loaded"
# Check the [preview] pulls the right one
@@ -143,18 +149,12 @@ def test_check_basic_change_detection_functionality(client, live_server, measure
assert b'unviewed' not in res.data
# #2458 "clear history" should make the Watch object update its status correctly when the first snapshot lands again
- uuid = extract_UUID_from_client(client)
client.get(url_for("clear_watch_history", uuid=uuid))
client.get(url_for("form_watch_checknow"), follow_redirects=True)
wait_for_all_checks(client)
res = client.get(url_for("index"))
assert b'preview/' in res.data
- # Check the 'get latest snapshot works'
- res = client.get(url_for("watch_get_latest_html", uuid=uuid))
- assert b'<head><title>head title</title></head>' in res.data
#
# Cleanup everything
res = client.get(url_for("form_delete", uuid="all"), follow_redirects=True)

View File

@@ -1,12 +1,8 @@
#!/usr/bin/env python3
- import time
from flask import url_for
- from urllib.request import urlopen
- from .util import set_original_response, set_modified_response, live_server_setup
- sleep_time_for_fetch_thread = 3
+ from .util import set_original_response, set_modified_response, live_server_setup, wait_for_all_checks
+ import time
def set_nonrenderable_response():
test_return_data = """<html>
@@ -22,6 +18,13 @@ def set_nonrenderable_response():
return None
+ def set_zero_byte_response():
+ with open("test-datastore/endpoint-content.txt", "w") as f:
+ f.write("")
+ return None
def test_check_basic_change_detection_functionality(client, live_server, measure_memory_usage):
set_original_response()
live_server_setup(live_server)
@@ -35,18 +38,11 @@ def test_check_basic_change_detection_functionality(client, live_server, measure
assert b"1 Imported" in res.data
- time.sleep(sleep_time_for_fetch_thread)
+ wait_for_all_checks(client)
- # Do this a few times.. ensures we dont accidently set the status
- for n in range(3):
- client.get(url_for("form_watch_checknow"), follow_redirects=True)
- # Give the thread time to pick it up
- time.sleep(sleep_time_for_fetch_thread)
- # It should report nothing found (no new 'unviewed' class)
- res = client.get(url_for("index"))
- assert b'unviewed' not in res.data
+ # It should report nothing found (no new 'unviewed' class)
+ res = client.get(url_for("index"))
+ assert b'unviewed' not in res.data
#####################
@@ -64,7 +60,7 @@ def test_check_basic_change_detection_functionality(client, live_server, measure
client.get(url_for("form_watch_checknow"), follow_redirects=True)
# Give the thread time to pick it up
- time.sleep(sleep_time_for_fetch_thread)
+ wait_for_all_checks(client)
# It should report nothing found (no new 'unviewed' class)
res = client.get(url_for("index"))
@@ -86,14 +82,21 @@ def test_check_basic_change_detection_functionality(client, live_server, measure
client.get(url_for("form_watch_checknow"), follow_redirects=True)
# Give the thread time to pick it up
- time.sleep(sleep_time_for_fetch_thread)
+ wait_for_all_checks(client)
# It should report nothing found (no new 'unviewed' class)
res = client.get(url_for("index"))
assert b'unviewed' in res.data
client.get(url_for("mark_all_viewed"), follow_redirects=True)
# A totally zero byte (#2528) response should also not trigger an error
set_zero_byte_response()
time.sleep(2)
client.get(url_for("form_watch_checknow"), follow_redirects=True)
wait_for_all_checks(client)
res = client.get(url_for("index"))
assert b'unviewed' in res.data # A change should have registered because empty_pages_are_a_change is ON
assert b'fetch-error' not in res.data
#
# Cleanup everything

View File

@@ -291,11 +291,11 @@ def test_notification_custom_endpoint_and_jinja2(client, live_server, measure_me
data={
"application-fetch_backend": "html_requests",
"application-minutes_between_check": 180,
"application-notification_body": '{ "url" : "{{ watch_url }}", "secret": 444 }',
"application-notification_body": '{ "url" : "{{ watch_url }}", "secret": 444, "somebug": "网站监测 内容更新了" }',
"application-notification_format": default_notification_format,
"application-notification_urls": test_notification_url,
# https://github.com/caronc/apprise/wiki/Notify_Custom_JSON#get-parameter-manipulation
"application-notification_title": "New ChangeDetection.io Notification - {{ watch_url }}",
"application-notification_title": "New ChangeDetection.io Notification - {{ watch_url }} ",
},
follow_redirects=True
)
@@ -324,6 +324,7 @@ def test_notification_custom_endpoint_and_jinja2(client, live_server, measure_me
j = json.loads(x)
assert j['url'].startswith('http://localhost')
assert j['secret'] == 444
+ assert j['somebug'] == '网站监测 内容更新了'
# URL check, this will always be converted to lowercase
assert os.path.isfile("test-datastore/notification-url.txt")
@@ -354,9 +355,10 @@ def test_notification_custom_endpoint_and_jinja2(client, live_server, measure_me
#2510
def test_global_send_test_notification(client, live_server, measure_memory_usage):
#live_server_setup(live_server)
set_original_response()
if os.path.isfile("test-datastore/notification.txt"):
os.unlink("test-datastore/notification.txt")
# otherwise other settings would have already existed from previous tests in this file
res = client.post(
@@ -364,7 +366,8 @@ def test_global_send_test_notification(client, live_server, measure_memory_usage
data={
"application-fetch_backend": "html_requests",
"application-minutes_between_check": 180,
"application-notification_body": 'change detection is cool',
#1995 UTF-8 content should be encoded
"application-notification_body": 'change detection is cool 网站监测 内容更新了',
"application-notification_format": default_notification_format,
"application-notification_urls": "",
"application-notification_title": "New ChangeDetection.io Notification - {{ watch_url }}",
@@ -399,8 +402,7 @@ def test_global_send_test_notification(client, live_server, measure_memory_usage
with open("test-datastore/notification.txt", 'r') as f:
x = f.read()
- assert 'change detection is coo' in x
+ assert 'change detection is cool 网站监测 内容更新了' in x
os.unlink("test-datastore/notification.txt")
@@ -420,7 +422,7 @@ def test_global_send_test_notification(client, live_server, measure_memory_usage
with open("test-datastore/notification.txt", 'r') as f:
x = f.read()
# Should come from notification.py default handler when there is no notification body to pull from
- assert 'change detection is coo' in x
+ assert 'change detection is cool 网站监测 内容更新了' in x
client.get(
url_for("form_delete", uuid="all"),

View File

@@ -1,6 +1,5 @@
from .processors.exceptions import ProcessorException
- from . import content_fetchers
+ import changedetectionio.content_fetchers.exceptions as content_fetchers_exceptions
from changedetectionio.processors.text_json_diff.processor import FilterNotFoundInResponse
from changedetectionio import html_tools
@@ -301,7 +300,7 @@ class update_worker(threading.Thread):
self.datastore.update_watch(uuid=uuid, update_obj={'last_error': e.message})
process_changedetection_results = False
- except content_fetchers.exceptions.ReplyWithContentButNoText as e:
+ except content_fetchers_exceptions.ReplyWithContentButNoText as e:
# Totally fine, it's by choice - just continue on, nothing more to care about
# Page had elements/content but no renderable text
# Backend (not filters) gave zero output
@@ -327,7 +326,7 @@ class update_worker(threading.Thread):
process_changedetection_results = False
- except content_fetchers.exceptions.Non200ErrorCodeReceived as e:
+ except content_fetchers_exceptions.Non200ErrorCodeReceived as e:
if e.status_code == 403:
err_text = "Error - 403 (Access denied) received"
elif e.status_code == 404:
@@ -380,23 +379,23 @@ class update_worker(threading.Thread):
process_changedetection_results = False
- except content_fetchers.exceptions.checksumFromPreviousCheckWasTheSame as e:
+ except content_fetchers_exceptions.checksumFromPreviousCheckWasTheSame as e:
# Yes fine, so nothing todo, don't continue to process.
process_changedetection_results = False
changed_detected = False
- except content_fetchers.exceptions.BrowserConnectError as e:
+ except content_fetchers_exceptions.BrowserConnectError as e:
self.datastore.update_watch(uuid=uuid,
update_obj={'last_error': e.msg
}
)
process_changedetection_results = False
- except content_fetchers.exceptions.BrowserFetchTimedOut as e:
+ except content_fetchers_exceptions.BrowserFetchTimedOut as e:
self.datastore.update_watch(uuid=uuid,
update_obj={'last_error': e.msg
}
)
process_changedetection_results = False
- except content_fetchers.exceptions.BrowserStepsStepException as e:
+ except content_fetchers_exceptions.BrowserStepsStepException as e:
if not self.datastore.data['watching'].get(uuid):
continue
@@ -438,25 +437,25 @@ class update_worker(threading.Thread):
process_changedetection_results = False
- except content_fetchers.exceptions.EmptyReply as e:
+ except content_fetchers_exceptions.EmptyReply as e:
# Some kind of custom to-str handler in the exception handler that does this?
err_text = "EmptyReply - try increasing 'Wait seconds before extracting text', Status Code {}".format(e.status_code)
self.datastore.update_watch(uuid=uuid, update_obj={'last_error': err_text,
'last_check_status': e.status_code})
process_changedetection_results = False
- except content_fetchers.exceptions.ScreenshotUnavailable as e:
+ except content_fetchers_exceptions.ScreenshotUnavailable as e:
err_text = "Screenshot unavailable, page did not render fully in the expected time or page was too long - try increasing 'Wait seconds before extracting text'"
self.datastore.update_watch(uuid=uuid, update_obj={'last_error': err_text,
'last_check_status': e.status_code})
process_changedetection_results = False
- except content_fetchers.exceptions.JSActionExceptions as e:
+ except content_fetchers_exceptions.JSActionExceptions as e:
err_text = "Error running JS Actions - Page request - "+e.message
if e.screenshot:
watch.save_screenshot(screenshot=e.screenshot, as_error=True)
self.datastore.update_watch(uuid=uuid, update_obj={'last_error': err_text,
'last_check_status': e.status_code})
process_changedetection_results = False
- except content_fetchers.exceptions.PageUnloadable as e:
+ except content_fetchers_exceptions.PageUnloadable as e:
err_text = "Page request from server didnt respond correctly"
if e.message:
err_text = "{} - {}".format(err_text, e.message)
@@ -468,7 +467,7 @@ class update_worker(threading.Thread):
'last_check_status': e.status_code,
'has_ldjson_price_data': None})
process_changedetection_results = False
- except content_fetchers.exceptions.BrowserStepsInUnsupportedFetcher as e:
+ except content_fetchers_exceptions.BrowserStepsInUnsupportedFetcher as e:
err_text = "This watch has Browser Steps configured and so it cannot run with the 'Basic fast Plaintext/HTTP Client', either remove the Browser Steps or select a Chrome fetcher."
self.datastore.update_watch(uuid=uuid, update_obj={'last_error': err_text})
process_changedetection_results = False

View File

@@ -18,7 +18,7 @@ services:
#
# Log levels are in descending order. (TRACE is the most detailed one)
# Log output levels: TRACE, DEBUG(default), INFO, SUCCESS, WARNING, ERROR, CRITICAL
- # - LOGGER_LEVEL=DEBUG
+ # - LOGGER_LEVEL=TRACE
#
# Alternative WebDriver/selenium URL, do not use "'s or 's!
# - WEBDRIVER_URL=http://browser-chrome:4444/wd/hub
@@ -29,8 +29,9 @@ services:
#
# https://selenium-python.readthedocs.io/api.html#module-selenium.webdriver.common.proxy
#
- # Alternative Playwright URL, do not use "'s or 's!
- # - PLAYWRIGHT_DRIVER_URL=ws://playwright-chrome:3000
+ # Alternative target "Chrome" Playwright URL, do not use "'s or 's!
+ # "Playwright" is a driver/library that allows changedetection to talk to a Chrome or similar browser.
+ # - PLAYWRIGHT_DRIVER_URL=ws://sockpuppetbrowser:3000
#
# Playwright proxy settings playwright_proxy_server, playwright_proxy_bypass, playwright_proxy_username, playwright_proxy_password
#
@@ -57,6 +58,10 @@ services:
#
# Absolute minimum seconds to recheck, overrides any watch minimum, change to 0 to disable
# - MINIMUM_SECONDS_RECHECK_TIME=3
+ #
+ # Scrape prices from web pages automatically where the page has no embedded price information (see below also)
+ # - PRICE_SCRAPER_ML_ENDPOINT=http://cdio-price-scraper:5005
# Comment out ports: when using behind a reverse proxy , enable networks: etc.
ports:
- 5000:5000
@@ -73,10 +78,10 @@ services:
# condition: service_started
# Used for fetching pages via Playwright+Chrome where you need Javascript support.
+ # Sockpuppetbrowser is basically chrome wrapped in an API for allowing fast fetching of web-pages.
# RECOMMENDED FOR FETCHING PAGES WITH CHROME
- # playwright-chrome:
- # hostname: playwright-chrome
+ # sockpuppetbrowser:
+ # hostname: sockpuppetbrowser
# image: dgtlmoon/sockpuppetbrowser:latest
# cap_add:
# - SYS_ADMIN
@@ -103,6 +108,13 @@ services:
# # Workaround to avoid the browser crashing inside a docker container
# # See https://github.com/SeleniumHQ/docker-selenium#quick-start
# - /dev/shm:/dev/shm
# restart: unless-stopped
+ # Machine Learning/AI - Use "Visual Selector" elements data to scrape price data
+ # cdio-keras-price-scraper:
+ # hostname: cdio-price-scraper
+ # image: dgtlmoon/changedetection-AI-pricescraper
+ # restart: unless-stopped
volumes:

View File

@@ -35,7 +35,7 @@ dnspython==2.6.1 # related to eventlet fixes
# jq not available on Windows so must be installed manually
# Notification library
- apprise~=1.8.0
+ apprise~=1.8.1
# apprise mqtt https://github.com/dgtlmoon/changedetection.io/issues/315
# and 2.0.0 https://github.com/dgtlmoon/changedetection.io/issues/2241 not yet compatible
@@ -79,9 +79,10 @@ pyppeteerstealth>=0.0.4
pytest ~=7.2
pytest-flask ~=1.2
- # Pin jsonschema version to prevent build errors on armv6 while rpds-py wheels aren't available (1708)
- jsonschema==4.17.3
+ # Anything 4.0 and up but not 5.0
+ jsonschema ~= 4.0
price_parser
loguru
# For scraping all possible metadata relating to products so we can do better restock detection