Compare commits

..

2 Commits

Author SHA1 Message Date
dgtlmoon d3f6b92670 posts:// for notifications also 2024-07-27 13:43:40 +02:00
dgtlmoon 1813977133 Encode requests POST as UTF-8 if it has no encoding/basic string #1315 #2309 2024-07-27 13:40:12 +02:00
34 changed files with 165 additions and 309 deletions
+1 -1
View File
@@ -2,7 +2,7 @@
# Read more https://github.com/dgtlmoon/changedetection.io/wiki # Read more https://github.com/dgtlmoon/changedetection.io/wiki
__version__ = '0.46.04' __version__ = '0.46.01'
from changedetectionio.strtobool import strtobool from changedetectionio.strtobool import strtobool
from json.decoder import JSONDecodeError from json.decoder import JSONDecodeError
@@ -85,8 +85,7 @@ def construct_blueprint(datastore: ChangeDetectionStore):
browsersteps_start_session['browserstepper'] = browser_steps.browsersteps_live_ui( browsersteps_start_session['browserstepper'] = browser_steps.browsersteps_live_ui(
playwright_browser=browsersteps_start_session['browser'], playwright_browser=browsersteps_start_session['browser'],
proxy=proxy, proxy=proxy,
start_url=datastore.data['watching'][watch_uuid].get('url'), start_url=datastore.data['watching'][watch_uuid].get('url')
headers=datastore.data['watching'][watch_uuid].get('headers')
) )
# For test # For test
@@ -58,9 +58,9 @@ xpath://body/div/span[contains(@class, 'example-class')]",
{% if '/text()' in field %} {% if '/text()' in field %}
<span class="pure-form-message-inline"><strong>Note!: //text() function does not work where the &lt;element&gt; contains &lt;![CDATA[]]&gt;</strong></span><br> <span class="pure-form-message-inline"><strong>Note!: //text() function does not work where the &lt;element&gt; contains &lt;![CDATA[]]&gt;</strong></span><br>
{% endif %} {% endif %}
<span class="pure-form-message-inline">One CSS, xPath, JSON Path/JQ selector per line, <i>any</i> rules that matches will be used.<br> <span class="pure-form-message-inline">One rule per line, <i>any</i> rules that matches will be used.<br>
<div data-target="#advanced-help-selectors" class="toggle-show pure-button button-tag button-xsmall">Show advanced help and tips</div>
<ul id="advanced-help-selectors"> <ul>
<li>CSS - Limit text to this CSS rule, only text matching this CSS rule is included.</li> <li>CSS - Limit text to this CSS rule, only text matching this CSS rule is included.</li>
<li>JSON - Limit text to this JSON rule, using either <a href="https://pypi.org/project/jsonpath-ng/" target="new">JSONPath</a> or <a href="https://stedolan.github.io/jq/" target="new">jq</a> (if installed). <li>JSON - Limit text to this JSON rule, using either <a href="https://pypi.org/project/jsonpath-ng/" target="new">JSONPath</a> or <a href="https://stedolan.github.io/jq/" target="new">jq</a> (if installed).
<ul> <ul>
+3 -4
View File
@@ -65,8 +65,8 @@ class Fetcher():
def __init__(self): def __init__(self):
import importlib.resources import importlib.resources
self.xpath_element_js = importlib.resources.files("changedetectionio.content_fetchers.res").joinpath('xpath_element_scraper.js').read_text(encoding='utf-8') self.xpath_element_js = importlib.resources.files("changedetectionio.content_fetchers.res").joinpath('xpath_element_scraper.js').read_text()
self.instock_data_js = importlib.resources.files("changedetectionio.content_fetchers.res").joinpath('stock-not-in-stock.js').read_text(encoding='utf-8') self.instock_data_js = importlib.resources.files("changedetectionio.content_fetchers.res").joinpath('stock-not-in-stock.js').read_text()
@abstractmethod @abstractmethod
def get_error(self): def get_error(self):
@@ -81,8 +81,7 @@ class Fetcher():
request_method, request_method,
ignore_status_codes=False, ignore_status_codes=False,
current_include_filters=None, current_include_filters=None,
is_binary=False, is_binary=False):
empty_pages_are_a_change=False):
# Should set self.error, self.status_code and self.content # Should set self.error, self.status_code and self.content
pass pass
@@ -83,8 +83,7 @@ class fetcher(Fetcher):
request_method, request_method,
ignore_status_codes=False, ignore_status_codes=False,
current_include_filters=None, current_include_filters=None,
is_binary=False, is_binary=False):
empty_pages_are_a_change=False):
from playwright.sync_api import sync_playwright from playwright.sync_api import sync_playwright
import playwright._impl._errors import playwright._impl._errors
@@ -131,7 +130,7 @@ class fetcher(Fetcher):
if response is None: if response is None:
context.close() context.close()
browser.close() browser.close()
logger.debug("Content Fetcher > Response object from the browser communication was none") logger.debug("Content Fetcher > Response object was none")
raise EmptyReply(url=url, status_code=None) raise EmptyReply(url=url, status_code=None)
try: try:
@@ -167,10 +166,10 @@ class fetcher(Fetcher):
raise Non200ErrorCodeReceived(url=url, status_code=self.status_code, screenshot=screenshot) raise Non200ErrorCodeReceived(url=url, status_code=self.status_code, screenshot=screenshot)
if not empty_pages_are_a_change and len(self.page.content().strip()) == 0: if len(self.page.content().strip()) == 0:
logger.debug("Content Fetcher > Content was empty, empty_pages_are_a_change = False")
context.close() context.close()
browser.close() browser.close()
logger.debug("Content Fetcher > Content was empty")
raise EmptyReply(url=url, status_code=response.status) raise EmptyReply(url=url, status_code=response.status)
# Run Browser Steps here # Run Browser Steps here
@@ -75,8 +75,7 @@ class fetcher(Fetcher):
request_method, request_method,
ignore_status_codes, ignore_status_codes,
current_include_filters, current_include_filters,
is_binary, is_binary
empty_pages_are_a_change
): ):
from changedetectionio.content_fetchers import visualselector_xpath_selectors from changedetectionio.content_fetchers import visualselector_xpath_selectors
@@ -154,7 +153,7 @@ class fetcher(Fetcher):
if response is None: if response is None:
await self.page.close() await self.page.close()
await browser.close() await browser.close()
logger.warning("Content Fetcher > Response object was none (as in, the response from the browser was empty, not just the content)") logger.warning("Content Fetcher > Response object was none")
raise EmptyReply(url=url, status_code=None) raise EmptyReply(url=url, status_code=None)
self.headers = response.headers self.headers = response.headers
@@ -187,11 +186,10 @@ class fetcher(Fetcher):
raise Non200ErrorCodeReceived(url=url, status_code=self.status_code, screenshot=screenshot) raise Non200ErrorCodeReceived(url=url, status_code=self.status_code, screenshot=screenshot)
content = await self.page.content content = await self.page.content
if len(content.strip()) == 0:
if not empty_pages_are_a_change and len(content.strip()) == 0:
logger.error("Content Fetcher > Content was empty (empty_pages_are_a_change is False), closing browsers")
await self.page.close() await self.page.close()
await browser.close() await browser.close()
logger.error("Content Fetcher > Content was empty")
raise EmptyReply(url=url, status_code=response.status) raise EmptyReply(url=url, status_code=response.status)
# Run Browser Steps here # Run Browser Steps here
@@ -249,7 +247,7 @@ class fetcher(Fetcher):
await self.fetch_page(**kwargs) await self.fetch_page(**kwargs)
def run(self, url, timeout, request_headers, request_body, request_method, ignore_status_codes=False, def run(self, url, timeout, request_headers, request_body, request_method, ignore_status_codes=False,
current_include_filters=None, is_binary=False, empty_pages_are_a_change=False): current_include_filters=None, is_binary=False):
#@todo make update_worker async which could run any of these content_fetchers within memory and time constraints #@todo make update_worker async which could run any of these content_fetchers within memory and time constraints
max_time = os.getenv('PUPPETEER_MAX_PROCESSING_TIMEOUT_SECONDS', 180) max_time = os.getenv('PUPPETEER_MAX_PROCESSING_TIMEOUT_SECONDS', 180)
@@ -264,8 +262,7 @@ class fetcher(Fetcher):
request_method=request_method, request_method=request_method,
ignore_status_codes=ignore_status_codes, ignore_status_codes=ignore_status_codes,
current_include_filters=current_include_filters, current_include_filters=current_include_filters,
is_binary=is_binary, is_binary=is_binary
empty_pages_are_a_change=empty_pages_are_a_change
), timeout=max_time)) ), timeout=max_time))
except asyncio.TimeoutError: except asyncio.TimeoutError:
raise(BrowserFetchTimedOut(msg=f"Browser connected but was unable to process the page in {max_time} seconds.")) raise(BrowserFetchTimedOut(msg=f"Browser connected but was unable to process the page in {max_time} seconds."))
@@ -1,8 +1,9 @@
from loguru import logger
import chardet
import hashlib import hashlib
import os import os
import chardet
import requests import requests
from changedetectionio import strtobool from changedetectionio import strtobool
from changedetectionio.content_fetchers.exceptions import BrowserStepsInUnsupportedFetcher, EmptyReply, Non200ErrorCodeReceived from changedetectionio.content_fetchers.exceptions import BrowserStepsInUnsupportedFetcher, EmptyReply, Non200ErrorCodeReceived
from changedetectionio.content_fetchers.base import Fetcher from changedetectionio.content_fetchers.base import Fetcher
@@ -25,8 +26,7 @@ class fetcher(Fetcher):
request_method, request_method,
ignore_status_codes=False, ignore_status_codes=False,
current_include_filters=None, current_include_filters=None,
is_binary=False, is_binary=False):
empty_pages_are_a_change=False):
if self.browser_steps_get_valid_steps(): if self.browser_steps_get_valid_steps():
raise BrowserStepsInUnsupportedFetcher(url=url) raise BrowserStepsInUnsupportedFetcher(url=url)
@@ -74,10 +74,7 @@ class fetcher(Fetcher):
self.headers = r.headers self.headers = r.headers
if not r.content or not len(r.content): if not r.content or not len(r.content):
if not empty_pages_are_a_change: raise EmptyReply(url=url, status_code=r.status_code)
raise EmptyReply(url=url, status_code=r.status_code)
else:
logger.debug(f"URL {url} gave zero byte content reply with Status Code {r.status_code}, but empty_pages_are_a_change = True")
# @todo test this # @todo test this
# @todo maybe you really want to test zero-byte return pages? # @todo maybe you really want to test zero-byte return pages?
@@ -75,7 +75,6 @@ function isItemInStock() {
'vergriffen', 'vergriffen',
'vorbestellen', 'vorbestellen',
'vorbestellung ist bald möglich', 'vorbestellung ist bald möglich',
'we don\'t currently have any',
'we couldn\'t find any products that match', 'we couldn\'t find any products that match',
'we do not currently have an estimate of when this product will be back in stock.', 'we do not currently have an estimate of when this product will be back in stock.',
'we don\'t know when or if this item will be back in stock.', 'we don\'t know when or if this item will be back in stock.',
@@ -174,8 +173,7 @@ function isItemInStock() {
const element = elementsToScan[i]; const element = elementsToScan[i];
// outside the 'fold' or some weird text in the heading area // outside the 'fold' or some weird text in the heading area
// .getBoundingClientRect() was causing a crash in chrome 119, can only be run on contentVisibility != hidden // .getBoundingClientRect() was causing a crash in chrome 119, can only be run on contentVisibility != hidden
// Note: theres also an automated test that places the 'out of stock' text fairly low down if (element.getBoundingClientRect().top + window.scrollY >= vh + 150 || element.getBoundingClientRect().top + window.scrollY <= 100) {
if (element.getBoundingClientRect().top + window.scrollY >= vh + 250 || element.getBoundingClientRect().top + window.scrollY <= 100) {
continue continue
} }
elementText = ""; elementText = "";
@@ -189,7 +187,7 @@ function isItemInStock() {
// and these mean its out of stock // and these mean its out of stock
for (const outOfStockText of outOfStockTexts) { for (const outOfStockText of outOfStockTexts) {
if (elementText.includes(outOfStockText)) { if (elementText.includes(outOfStockText)) {
console.log(`Selected 'Out of Stock' - found text "${outOfStockText}" - "${elementText}" - offset top ${element.getBoundingClientRect().top}, page height is ${vh}`) console.log(`Selected 'Out of Stock' - found text "${outOfStockText}" - "${elementText}"`)
return outOfStockText; // item is out of stock return outOfStockText; // item is out of stock
} }
} }
@@ -164,15 +164,6 @@ visibleElementsArray.forEach(function (element) {
} }
} }
let label = "not-interesting" // A placeholder, the actual labels for training are done by hand for now
let text = element.textContent.trim().slice(0, 30).trim();
while (/\n{2,}|\t{2,}/.test(text)) {
text = text.replace(/\n{2,}/g, '\n').replace(/\t{2,}/g, '\t')
}
// Try to identify any possible currency amounts "Sale: 4000" or "Sale now 3000 Kc", can help with the training.
const hasDigitCurrency = (/\d/.test(text.slice(0, 6)) || /\d/.test(text.slice(-6)) ) && /([€£$¥₩₹]|USD|AUD|EUR|Kč|kr|SEK|,)/.test(text) ;
size_pos.push({ size_pos.push({
xpath: xpath_result, xpath: xpath_result,
@@ -180,16 +171,9 @@ visibleElementsArray.forEach(function (element) {
height: Math.round(bbox['height']), height: Math.round(bbox['height']),
left: Math.floor(bbox['left']), left: Math.floor(bbox['left']),
top: Math.floor(bbox['top']) + scroll_y, top: Math.floor(bbox['top']) + scroll_y,
// tagName used by Browser Steps
tagName: (element.tagName) ? element.tagName.toLowerCase() : '', tagName: (element.tagName) ? element.tagName.toLowerCase() : '',
// tagtype used by Browser Steps
tagtype: (element.tagName.toLowerCase() === 'input' && element.type) ? element.type.toLowerCase() : '', tagtype: (element.tagName.toLowerCase() === 'input' && element.type) ? element.type.toLowerCase() : '',
isClickable: window.getComputedStyle(element).cursor === "pointer", isClickable: window.getComputedStyle(element).cursor == "pointer"
// Used by the keras trainer
fontSize: window.getComputedStyle(element).getPropertyValue('font-size'),
fontWeight: window.getComputedStyle(element).getPropertyValue('font-weight'),
hasDigitCurrency: hasDigitCurrency,
label: label,
}); });
}); });
@@ -56,8 +56,7 @@ class fetcher(Fetcher):
request_method, request_method,
ignore_status_codes=False, ignore_status_codes=False,
current_include_filters=None, current_include_filters=None,
is_binary=False, is_binary=False):
empty_pages_are_a_change=False):
from selenium import webdriver from selenium import webdriver
from selenium.webdriver.chrome.options import Options as ChromeOptions from selenium.webdriver.chrome.options import Options as ChromeOptions
+7 -15
View File
@@ -729,12 +729,6 @@ def changedetection_app(config=None, datastore_o=None):
for p in datastore.proxy_list: for p in datastore.proxy_list:
form.proxy.choices.append(tuple((p, datastore.proxy_list[p]['label']))) form.proxy.choices.append(tuple((p, datastore.proxy_list[p]['label'])))
# Add some HTML to be used for form validation
if datastore.data['watching'][uuid].history.keys():
timestamp = list(datastore.data['watching'][uuid].history.keys())[-1]
form.last_html_for_form_validation = datastore.data['watching'][uuid].get_fetched_html(timestamp)
else:
form.last_html_for_form_validation = "<html><body></body></html>"
if request.method == 'POST' and form.validate(): if request.method == 'POST' and form.validate():
@@ -1383,19 +1377,17 @@ def changedetection_app(config=None, datastore_o=None):
import brotli import brotli
watch = datastore.data['watching'].get(uuid) watch = datastore.data['watching'].get(uuid)
if watch and watch.history.keys() and os.path.isdir(watch.watch_data_dir): if watch and os.path.isdir(watch.watch_data_dir):
latest_filename = list(watch.history.keys())[-1] latest_filename = list(watch.history.keys())[0]
html_fname = os.path.join(watch.watch_data_dir, f"{latest_filename}.html.br") html_fname = os.path.join(watch.watch_data_dir, f"{latest_filename}.html.br")
with open(html_fname, 'rb') as f: if html_fname.endswith('.br'):
if html_fname.endswith('.br'): # Read and decompress the Brotli file
# Read and decompress the Brotli file with open(html_fname, 'rb') as f:
decompressed_data = brotli.decompress(f.read()) decompressed_data = brotli.decompress(f.read())
else:
decompressed_data = f.read()
buffer = BytesIO(decompressed_data) buffer = BytesIO(decompressed_data)
return send_file(buffer, as_attachment=True, download_name=f"{latest_filename}.html", mimetype='text/html') return send_file(buffer, as_attachment=True, download_name=f"{latest_filename}.html", mimetype='text/html')
# Return a 500 error # Return a 500 error
+32 -22
View File
@@ -1,9 +1,6 @@
import os import os
import re import re
import elementpath
from changedetectionio.html_tools import xpath_filter, xpath1_filter
from changedetectionio.strtobool import strtobool from changedetectionio.strtobool import strtobool
from wtforms import ( from wtforms import (
@@ -325,39 +322,52 @@ class ValidateCSSJSONXPATHInput(object):
self.allow_json = allow_json self.allow_json = allow_json
def __call__(self, form, field): def __call__(self, form, field):
from lxml.etree import XPathEvalError
if isinstance(field.data, str): if isinstance(field.data, str):
data = [field.data] data = [field.data]
else: else:
data = field.data data = field.data
for line in data: for line in data:
line = line.strip() # Nothing to see here
if not len(line.strip()):
return
if not line: # Does it look like XPath?
continue if line.strip()[0] == '/' or line.strip().startswith('xpath:'):
if line.startswith('xpath') or line.startswith('/'):
if not self.allow_xpath: if not self.allow_xpath:
raise ValidationError("XPath not permitted in this field!") raise ValidationError("XPath not permitted in this field!")
from lxml import etree, html
if line.startswith('xpath1:'): import elementpath
filter_function = xpath1_filter # xpath 2.0-3.1
else: from elementpath.xpath3 import XPath3Parser
line = line.replace('xpath:', '') tree = html.fromstring("<html></html>")
filter_function = xpath_filter line = line.replace('xpath:', '')
try: try:
# Call the determined function elementpath.select(tree, line.strip(), parser=XPath3Parser)
res = filter_function(xpath_filter=line, html_content=form.last_html_for_form_validation) except elementpath.ElementPathError as e:
# It's OK if this is an empty result, we just want to check that it doesn't crash the parser
except (elementpath.ElementPathError,XPathEvalError) as e:
message = field.gettext('\'%s\' is not a valid XPath expression. (%s)') message = field.gettext('\'%s\' is not a valid XPath expression. (%s)')
raise ValidationError(message % (line, str(e))) raise ValidationError(message % (line, str(e)))
except Exception as e: except:
raise ValidationError("A system-error occurred when validating your XPath expression") raise ValidationError("A system-error occurred when validating your XPath expression")
elif 'json:' in line: if line.strip().startswith('xpath1:'):
if not self.allow_xpath:
raise ValidationError("XPath not permitted in this field!")
from lxml import etree, html
tree = html.fromstring("<html></html>")
line = re.sub(r'^xpath1:', '', line)
try:
tree.xpath(line.strip())
except etree.XPathEvalError as e:
message = field.gettext('\'%s\' is not a valid XPath expression. (%s)')
raise ValidationError(message % (line, str(e)))
except:
raise ValidationError("A system-error occurred when validating your XPath expression")
if 'json:' in line:
if not self.allow_json: if not self.allow_json:
raise ValidationError("JSONPath not permitted in this field!") raise ValidationError("JSONPath not permitted in this field!")
@@ -382,7 +392,7 @@ class ValidateCSSJSONXPATHInput(object):
if not self.allow_json: if not self.allow_json:
raise ValidationError("jq not permitted in this field!") raise ValidationError("jq not permitted in this field!")
elif line.startswith('jq:'): if 'jq:' in line:
try: try:
import jq import jq
except ModuleNotFoundError: except ModuleNotFoundError:
+1 -23
View File
@@ -8,7 +8,6 @@ from xml.sax.saxutils import escape as xml_escape
import json import json
import re import re
from loguru import logger
# HTML added to be sure each result matching a filter (.example) gets converted to a new line by Inscriptis # HTML added to be sure each result matching a filter (.example) gets converted to a new line by Inscriptis
TEXT_FILTER_LIST_LINE_SUFFIX = "<br>" TEXT_FILTER_LIST_LINE_SUFFIX = "<br>"
@@ -109,20 +108,6 @@ def elementpath_tostring(obj):
return str(obj) return str(obj)
def extract_namespaces(xml_content):
"""
Extracts all namespaces from the XML content.
"""
from lxml import etree
from io import BytesIO
it = etree.iterparse(BytesIO(xml_content), events=('start-ns',))
namespaces = {}
for _, ns in it:
prefix, uri = ns
namespaces[prefix] = uri
return namespaces
# Return str Utf-8 of matched rules # Return str Utf-8 of matched rules
def xpath_filter(xpath_filter, html_content, append_pretty_line_formatting=False, is_rss=False): def xpath_filter(xpath_filter, html_content, append_pretty_line_formatting=False, is_rss=False):
from lxml import etree, html from lxml import etree, html
@@ -138,14 +123,7 @@ def xpath_filter(xpath_filter, html_content, append_pretty_line_formatting=False
tree = html.fromstring(bytes(html_content, encoding='utf-8'), parser=parser) tree = html.fromstring(bytes(html_content, encoding='utf-8'), parser=parser)
html_block = "" html_block = ""
# Automatically extract all namespaces from the XML content r = elementpath.select(tree, xpath_filter.strip(), namespaces={'re': 'http://exslt.org/regular-expressions'}, parser=XPath3Parser)
namespaces = {'re': 'http://exslt.org/regular-expressions'}
try:
namespaces.update(extract_namespaces(html_content.encode('utf-8')))
except Exception as e:
logger.warning(f"Problem extracting namespaces from HTMl/XML content {str(e)}")
r = elementpath.select(tree, xpath_filter.strip(), namespaces=namespaces, parser=XPath3Parser)
#@note: //title/text() wont work where <title>CDATA.. #@note: //title/text() wont work where <title>CDATA..
if type(r) != list: if type(r) != list:
+2 -14
View File
@@ -26,8 +26,6 @@ class difference_detection_processor():
def call_browser(self): def call_browser(self):
from requests.structures import CaseInsensitiveDict from requests.structures import CaseInsensitiveDict
from changedetectionio.content_fetchers.exceptions import EmptyReply
# Protect against file:// access # Protect against file:// access
if re.search(r'^file://', self.watch.get('url', '').strip(), re.IGNORECASE): if re.search(r'^file://', self.watch.get('url', '').strip(), re.IGNORECASE):
if not strtobool(os.getenv('ALLOW_FILE_URI', 'false')): if not strtobool(os.getenv('ALLOW_FILE_URI', 'false')):
@@ -135,18 +133,8 @@ class difference_detection_processor():
is_binary = self.watch.is_pdf is_binary = self.watch.is_pdf
# And here we go! call the right browser with browser-specific settings # And here we go! call the right browser with browser-specific settings
empty_pages_are_a_change = self.datastore.data['settings']['application'].get('empty_pages_are_a_change', False) self.fetcher.run(url, timeout, request_headers, request_body, request_method, ignore_status_codes, self.watch.get('include_filters'),
is_binary=is_binary)
self.fetcher.run(url=url,
timeout=timeout,
request_headers=request_headers,
request_body=request_body,
request_method=request_method,
ignore_status_codes=ignore_status_codes,
current_include_filters=self.watch.get('include_filters'),
is_binary=is_binary,
empty_pages_are_a_change=empty_pages_are_a_change
)
#@todo .quit here could go on close object, so we can run JS if change-detected #@todo .quit here could go on close object, so we can run JS if change-detected
self.fetcher.quit() self.fetcher.quit()
@@ -1,12 +1,11 @@
from babel.numbers import parse_decimal
from changedetectionio.model.Watch import model as BaseWatch from changedetectionio.model.Watch import model as BaseWatch
from typing import Union
import re import re
from babel.numbers import parse_decimal
class Restock(dict): class Restock(dict):
def parse_currency(self, raw_value: str) -> Union[float, None]: def parse_currency(self, raw_value: str) -> float:
# Clean and standardize the value (ie 1,400.00 should be 1400.00), even better would be store the whole thing as an integer. # Clean and standardize the value (ie 1,400.00 should be 1400.00), even better would be store the whole thing as an integer.
standardized_value = raw_value standardized_value = raw_value
@@ -22,11 +21,8 @@ class Restock(dict):
# Remove any non-numeric characters except for the decimal point # Remove any non-numeric characters except for the decimal point
standardized_value = re.sub(r'[^\d.-]', '', standardized_value) standardized_value = re.sub(r'[^\d.-]', '', standardized_value)
if standardized_value: # Convert to float
# Convert to float return float(parse_decimal(standardized_value, locale='en'))
return float(parse_decimal(standardized_value, locale='en'))
return None
def __init__(self, *args, **kwargs): def __init__(self, *args, **kwargs):
# Define default values # Define default values
@@ -40,16 +40,13 @@ def get_itemprop_availability(html_content) -> Restock:
import extruct import extruct
logger.trace(f"Imported extruct module in {time.time() - now:.3f}s") logger.trace(f"Imported extruct module in {time.time() - now:.3f}s")
value = {}
now = time.time() now = time.time()
# Extruct is very slow, I'm wondering if some ML is going to be faster (800ms on my i7), 'rdfa' seems to be the heaviest. # Extruct is very slow, I'm wondering if some ML is going to be faster (800ms on my i7), 'rdfa' seems to be the heaviest.
syntaxes = ['dublincore', 'json-ld', 'microdata', 'microformat', 'opengraph']
try:
data = extruct.extract(html_content, syntaxes=syntaxes)
except Exception as e:
logger.warning(f"Unable to extract data, document parsing with extruct failed with {type(e).__name__} - {str(e)}")
return Restock()
syntaxes = ['dublincore', 'json-ld', 'microdata', 'microformat', 'opengraph']
data = extruct.extract(html_content, syntaxes=syntaxes)
logger.trace(f"Extruct basic extract of all metadata done in {time.time() - now:.3f}s") logger.trace(f"Extruct basic extract of all metadata done in {time.time() - now:.3f}s")
# First phase, dead simple scanning of anything that looks useful # First phase, dead simple scanning of anything that looks useful
@@ -77,12 +77,11 @@ class perform_site_check(difference_detection_processor):
ctype_header = self.fetcher.get_all_headers().get('content-type', '').lower() ctype_header = self.fetcher.get_all_headers().get('content-type', '').lower()
# Go into RSS preprocess for converting CDATA/comment to usable text # Go into RSS preprocess for converting CDATA/comment to usable text
# Ctype_header could be unset if we are just reprocessing the existin content if any(substring in ctype_header for substring in ['application/xml', 'application/rss', 'text/xml']):
if any(substring in ctype_header for substring in ['application/xml', 'application/rss', 'text/xml']) or not ctype_header: if '<rss' in self.fetcher.content[:100].lower():
top_text = self.fetcher.content[:200].lower().strip()
if '<rss' in top_text or 'search.yahoo.com/mrss/' in top_text:
self.fetcher.content = cdata_in_document_to_text(html_content=self.fetcher.content) self.fetcher.content = cdata_in_document_to_text(html_content=self.fetcher.content)
is_rss = True is_rss = True
# source: support, basically treat it as plaintext # source: support, basically treat it as plaintext
if watch.is_source_type_url: if watch.is_source_type_url:
is_html = False is_html = False
@@ -18,11 +18,9 @@ $(document).ready(function () {
}); });
$(".toggle-show").click(function (e) { $("#notification-token-toggle").click(function (e) {
e.preventDefault(); e.preventDefault();
let target = $(this).data('target'); $('#notification-tokens-info').toggle();
$(target).toggle();
}); });
}); });
@@ -11,11 +11,8 @@
class="notification-urls" ) class="notification-urls" )
}} }}
<div class="pure-form-message-inline"> <div class="pure-form-message-inline">
<p> <ul>
<strong>Tip:</strong> Use <a target=_new href="https://github.com/caronc/apprise">AppRise Notification URLs</a> for notification to just about any service! <i><a target=_new href="https://github.com/dgtlmoon/changedetection.io/wiki/Notification-configuration-notes">Please read the notification services wiki here for important configuration notes</a></i>.<br> <li>Use <a target=_new href="https://github.com/caronc/apprise">AppRise URLs</a> for notification to just about any service! <i><a target=_new href="https://github.com/dgtlmoon/changedetection.io/wiki/Notification-configuration-notes">Please read the notification services wiki here for important configuration notes</a></i>.</li>
</p>
<div data-target="#advanced-help-notifications" class="toggle-show pure-button button-tag button-xsmall">Show advanced help and tips</div>
<ul style="display: none" id="advanced-help-notifications">
<li><code><a target=_new href="https://github.com/caronc/apprise/wiki/Notify_discord">discord://</a></code> (or <code>https://discord.com/api/webhooks...</code>)) only supports a maximum <strong>2,000 characters</strong> of notification text, including the title.</li> <li><code><a target=_new href="https://github.com/caronc/apprise/wiki/Notify_discord">discord://</a></code> (or <code>https://discord.com/api/webhooks...</code>)) only supports a maximum <strong>2,000 characters</strong> of notification text, including the title.</li>
<li><code><a target=_new href="https://github.com/caronc/apprise/wiki/Notify_telegram">tgram://</a></code> bots can't send messages to other bots, so you should specify chat ID of non-bot user.</li> <li><code><a target=_new href="https://github.com/caronc/apprise/wiki/Notify_telegram">tgram://</a></code> bots can't send messages to other bots, so you should specify chat ID of non-bot user.</li>
<li><code><a target=_new href="https://github.com/caronc/apprise/wiki/Notify_telegram">tgram://</a></code> only supports very limited HTML and can fail when extra tags are sent, <a href="https://core.telegram.org/bots/api#html-style">read more here</a> (or use plaintext/markdown format)</li> <li><code><a target=_new href="https://github.com/caronc/apprise/wiki/Notify_telegram">tgram://</a></code> only supports very limited HTML and can fail when extra tags are sent, <a href="https://core.telegram.org/bots/api#html-style">read more here</a> (or use plaintext/markdown format)</li>
@@ -43,7 +40,7 @@
</div> </div>
<div class="pure-controls"> <div class="pure-controls">
<div data-target="#notification-tokens-info" class="toggle-show pure-button button-tag button-xsmall">Show token/placeholders</div> <div id="notification-token-toggle" class="pure-button button-tag button-xsmall">Show token/placeholders</div>
</div> </div>
<div class="pure-controls" style="display: none;" id="notification-tokens-info"> <div class="pure-controls" style="display: none;" id="notification-tokens-info">
<table class="pure-table" id="token-table"> <table class="pure-table" id="token-table">
+5 -9
View File
@@ -4,7 +4,6 @@
{% from '_common_fields.html' import render_common_settings_form %} {% from '_common_fields.html' import render_common_settings_form %}
<script src="{{url_for('static_content', group='js', filename='tabs.js')}}" defer></script> <script src="{{url_for('static_content', group='js', filename='tabs.js')}}" defer></script>
<script src="{{url_for('static_content', group='js', filename='vis.js')}}" defer></script> <script src="{{url_for('static_content', group='js', filename='vis.js')}}" defer></script>
<script src="{{url_for('static_content', group='js', filename='global-settings.js')}}" defer></script>
<script> <script>
const browser_steps_available_screenshots=JSON.parse('{{ watch.get_browsersteps_available_screenshots|tojson }}'); const browser_steps_available_screenshots=JSON.parse('{{ watch.get_browsersteps_available_screenshots|tojson }}');
const browser_steps_config=JSON.parse('{{ browser_steps_config|tojson }}'); const browser_steps_config=JSON.parse('{{ browser_steps_config|tojson }}');
@@ -276,9 +275,9 @@ xpath://body/div/span[contains(@class, 'example-class')]",
{% if '/text()' in field %} {% if '/text()' in field %}
<span class="pure-form-message-inline"><strong>Note!: //text() function does not work where the &lt;element&gt; contains &lt;![CDATA[]]&gt;</strong></span><br> <span class="pure-form-message-inline"><strong>Note!: //text() function does not work where the &lt;element&gt; contains &lt;![CDATA[]]&gt;</strong></span><br>
{% endif %} {% endif %}
<span class="pure-form-message-inline">One CSS, xPath, JSON Path/JQ selector per line, <i>any</i> rules that matches will be used.<br> <span class="pure-form-message-inline">One rule per line, <i>any</i> rules that matches will be used.<br>
<p><div data-target="#advanced-help-selectors" class="toggle-show pure-button button-tag button-xsmall">Show advanced help and tips</div><br></p>
<ul id="advanced-help-selectors" style="display: none;"> <ul>
<li>CSS - Limit text to this CSS rule, only text matching this CSS rule is included.</li> <li>CSS - Limit text to this CSS rule, only text matching this CSS rule is included.</li>
<li>JSON - Limit text to this JSON rule, using either <a href="https://pypi.org/project/jsonpath-ng/" target="new">JSONPath</a> or <a href="https://stedolan.github.io/jq/" target="new">jq</a> (if installed). <li>JSON - Limit text to this JSON rule, using either <a href="https://pypi.org/project/jsonpath-ng/" target="new">JSONPath</a> or <a href="https://stedolan.github.io/jq/" target="new">jq</a> (if installed).
<ul> <ul>
@@ -298,12 +297,9 @@ xpath://body/div/span[contains(@class, 'example-class')]",
<li>To use XPath1.0: Prefix with <code>xpath1:</code></li> <li>To use XPath1.0: Prefix with <code>xpath1:</code></li>
</ul> </ul>
</li> </li>
<li>
Please be sure that you thoroughly understand how to write CSS, JSONPath, XPath{% if jq_support %}, or jq selector{%endif%} rules before filing an issue on GitHub! <a
href="https://github.com/dgtlmoon/changedetection.io/wiki/CSS-Selector-help">here for more CSS selector help</a>.<br>
</li>
</ul> </ul>
Please be sure that you thoroughly understand how to write CSS, JSONPath, XPath{% if jq_support %}, or jq selector{%endif%} rules before filing an issue on GitHub! <a
href="https://github.com/dgtlmoon/changedetection.io/wiki/CSS-Selector-help">here for more CSS selector help</a>.<br>
</span> </span>
</div> </div>
<fieldset class="pure-control-group"> <fieldset class="pure-control-group">
+1 -1
View File
@@ -76,7 +76,7 @@
</div> </div>
<div class="pure-control-group"> <div class="pure-control-group">
{{ render_checkbox_field(form.application.form.empty_pages_are_a_change) }} {{ render_checkbox_field(form.application.form.empty_pages_are_a_change) }}
<span class="pure-form-message-inline">When a request returns no content, or the HTML does not contain any text, is this considered a change?</span> <span class="pure-form-message-inline">When a page contains HTML, but no renderable text appears (empty page), is this considered a change?</span>
</div> </div>
{% if form.requests.proxy %} {% if form.requests.proxy %}
<div class="pure-control-group inline-radio"> <div class="pure-control-group inline-radio">
@@ -2,7 +2,7 @@
import os import os
import time import time
from flask import url_for from flask import url_for
from ..util import live_server_setup, wait_for_all_checks, extract_UUID_from_client, wait_for_notification_endpoint_output from ..util import live_server_setup, wait_for_all_checks, extract_UUID_from_client
from changedetectionio.notification import ( from changedetectionio.notification import (
default_notification_body, default_notification_body,
default_notification_format, default_notification_format,
@@ -94,7 +94,7 @@ def test_restock_detection(client, live_server, measure_memory_usage):
assert b'not-in-stock' not in res.data assert b'not-in-stock' not in res.data
# We should have a notification # We should have a notification
wait_for_notification_endpoint_output() time.sleep(2)
assert os.path.isfile("test-datastore/notification.txt"), "Notification received" assert os.path.isfile("test-datastore/notification.txt"), "Notification received"
os.unlink("test-datastore/notification.txt") os.unlink("test-datastore/notification.txt")
@@ -103,7 +103,6 @@ def test_restock_detection(client, live_server, measure_memory_usage):
set_original_response() set_original_response()
client.get(url_for("form_watch_checknow"), follow_redirects=True) client.get(url_for("form_watch_checknow"), follow_redirects=True)
wait_for_all_checks(client) wait_for_all_checks(client)
time.sleep(5)
assert not os.path.isfile("test-datastore/notification.txt"), "No notification should have fired when it went OUT OF STOCK by default" assert not os.path.isfile("test-datastore/notification.txt"), "No notification should have fired when it went OUT OF STOCK by default"
# BUT we should see that it correctly shows "not in stock" # BUT we should see that it correctly shows "not in stock"
@@ -2,7 +2,7 @@
import os.path import os.path
import time import time
from flask import url_for from flask import url_for
from .util import live_server_setup, wait_for_all_checks, wait_for_notification_endpoint_output from .util import live_server_setup, wait_for_all_checks
from changedetectionio import html_tools from changedetectionio import html_tools
@@ -112,7 +112,7 @@ def test_check_add_line_contains_trigger(client, live_server, measure_memory_usa
res = client.post( res = client.post(
url_for("settings_page"), url_for("settings_page"),
data={"application-notification_title": "New ChangeDetection.io Notification - {{ watch_url }}", data={"application-notification_title": "New ChangeDetection.io Notification - {{ watch_url }}",
"application-notification_body": 'triggered text was -{{triggered_text}}- 网站监测 内容更新了', "application-notification_body": 'triggered text was -{{triggered_text}}-',
# https://github.com/caronc/apprise/wiki/Notify_Custom_JSON#get-parameter-manipulation # https://github.com/caronc/apprise/wiki/Notify_Custom_JSON#get-parameter-manipulation
"application-notification_urls": test_notification_url, "application-notification_urls": test_notification_url,
"application-minutes_between_check": 180, "application-minutes_between_check": 180,
@@ -165,12 +165,11 @@ def test_check_add_line_contains_trigger(client, live_server, measure_memory_usa
assert b'unviewed' in res.data assert b'unviewed' in res.data
# Takes a moment for apprise to fire # Takes a moment for apprise to fire
wait_for_notification_endpoint_output() time.sleep(3)
assert os.path.isfile("test-datastore/notification.txt"), "Notification fired because I can see the output file" assert os.path.isfile("test-datastore/notification.txt"), "Notification fired because I can see the output file"
with open("test-datastore/notification.txt", 'rb') as f: with open("test-datastore/notification.txt", 'r') as f:
response = f.read() response= f.read()
assert b'-Oh yes please-' in response assert '-Oh yes please-' in response
assert '网站监测 内容更新了'.encode('utf-8') in response
res = client.get(url_for("form_delete", uuid="all"), follow_redirects=True) res = client.get(url_for("form_delete", uuid="all"), follow_redirects=True)
+7 -7
View File
@@ -69,12 +69,6 @@ def test_check_basic_change_detection_functionality(client, live_server, measure
wait_for_all_checks(client) wait_for_all_checks(client)
uuid = extract_UUID_from_client(client)
# Check the 'get latest snapshot works'
res = client.get(url_for("watch_get_latest_html", uuid=uuid))
assert b'which has this one new line' in res.data
# Now something should be ready, indicated by having a 'unviewed' class # Now something should be ready, indicated by having a 'unviewed' class
res = client.get(url_for("index")) res = client.get(url_for("index"))
assert b'unviewed' in res.data assert b'unviewed' in res.data
@@ -92,7 +86,7 @@ def test_check_basic_change_detection_functionality(client, live_server, measure
assert expected_url.encode('utf-8') in res.data assert expected_url.encode('utf-8') in res.data
# Following the 'diff' link, it should no longer display as 'unviewed' even after we recheck it a few times # Following the 'diff' link, it should no longer display as 'unviewed' even after we recheck it a few times
res = client.get(url_for("diff_history_page", uuid=uuid)) res = client.get(url_for("diff_history_page", uuid="first"))
assert b'selected=""' in res.data, "Confirm diff history page loaded" assert b'selected=""' in res.data, "Confirm diff history page loaded"
# Check the [preview] pulls the right one # Check the [preview] pulls the right one
@@ -149,12 +143,18 @@ def test_check_basic_change_detection_functionality(client, live_server, measure
assert b'unviewed' not in res.data assert b'unviewed' not in res.data
# #2458 "clear history" should make the Watch object update its status correctly when the first snapshot lands again # #2458 "clear history" should make the Watch object update its status correctly when the first snapshot lands again
uuid = extract_UUID_from_client(client)
client.get(url_for("clear_watch_history", uuid=uuid)) client.get(url_for("clear_watch_history", uuid=uuid))
client.get(url_for("form_watch_checknow"), follow_redirects=True) client.get(url_for("form_watch_checknow"), follow_redirects=True)
wait_for_all_checks(client) wait_for_all_checks(client)
res = client.get(url_for("index")) res = client.get(url_for("index"))
assert b'preview/' in res.data assert b'preview/' in res.data
# Check the 'get latest snapshot works'
res = client.get(url_for("watch_get_latest_html", uuid=uuid))
assert b'<head><title>head title</title></head>' in res.data
# #
# Cleanup everything # Cleanup everything
res = client.get(url_for("form_delete", uuid="all"), follow_redirects=True) res = client.get(url_for("form_delete", uuid="all"), follow_redirects=True)
@@ -4,7 +4,7 @@
import os import os
import time import time
from flask import url_for from flask import url_for
from .util import set_original_response, live_server_setup, wait_for_notification_endpoint_output from .util import set_original_response, live_server_setup
from changedetectionio.model import App from changedetectionio.model import App
@@ -102,15 +102,14 @@ def test_filter_doesnt_exist_then_exists_should_get_notification(client, live_se
follow_redirects=True follow_redirects=True
) )
assert b"Updated watch." in res.data assert b"Updated watch." in res.data
wait_for_notification_endpoint_output() time.sleep(3)
# Shouldn't exist, shouldn't have fired # Shouldn't exist, shouldn't have fired
assert not os.path.isfile("test-datastore/notification.txt") assert not os.path.isfile("test-datastore/notification.txt")
# Now the filter should exist # Now the filter should exist
set_response_with_filter() set_response_with_filter()
client.get(url_for("form_watch_checknow"), follow_redirects=True) client.get(url_for("form_watch_checknow"), follow_redirects=True)
time.sleep(3)
wait_for_notification_endpoint_output()
assert os.path.isfile("test-datastore/notification.txt") assert os.path.isfile("test-datastore/notification.txt")
@@ -1,8 +1,7 @@
import os import os
import time import time
from flask import url_for from flask import url_for
from .util import set_original_response, live_server_setup, extract_UUID_from_client, wait_for_all_checks, \ from .util import set_original_response, live_server_setup, extract_UUID_from_client, wait_for_all_checks
wait_for_notification_endpoint_output
from changedetectionio.model import App from changedetectionio.model import App
@@ -108,8 +107,7 @@ def run_filter_test(client, live_server, content_filter):
# One more check should trigger the _FILTER_FAILURE_THRESHOLD_ATTEMPTS_DEFAULT threshold # One more check should trigger the _FILTER_FAILURE_THRESHOLD_ATTEMPTS_DEFAULT threshold
client.get(url_for("form_watch_checknow"), follow_redirects=True) client.get(url_for("form_watch_checknow"), follow_redirects=True)
wait_for_all_checks(client) wait_for_all_checks(client)
time.sleep(2) # delay for apprise to fire
wait_for_notification_endpoint_output()
# Now it should exist and contain our "filter not found" alert # Now it should exist and contain our "filter not found" alert
assert os.path.isfile("test-datastore/notification.txt") assert os.path.isfile("test-datastore/notification.txt")
@@ -129,7 +127,6 @@ def run_filter_test(client, live_server, content_filter):
client.get(url_for("form_watch_checknow"), follow_redirects=True) client.get(url_for("form_watch_checknow"), follow_redirects=True)
wait_for_all_checks(client) wait_for_all_checks(client)
wait_for_notification_endpoint_output()
# It should have sent a notification, but.. # It should have sent a notification, but..
assert os.path.isfile("test-datastore/notification.txt") assert os.path.isfile("test-datastore/notification.txt")
# but it should not contain the info about a failed filter (because there was none in this case) # but it should not contain the info about a failed filter (because there was none in this case)
@@ -1,8 +1,11 @@
#!/usr/bin/env python3 #!/usr/bin/env python3
from flask import url_for
from .util import set_original_response, set_modified_response, live_server_setup, wait_for_all_checks
import time import time
from flask import url_for
from urllib.request import urlopen
from .util import set_original_response, set_modified_response, live_server_setup
sleep_time_for_fetch_thread = 3
def set_nonrenderable_response(): def set_nonrenderable_response():
@@ -13,18 +16,12 @@ def set_nonrenderable_response():
</body> </body>
</html> </html>
""" """
with open("test-datastore/endpoint-content.txt", "w") as f: with open("test-datastore/endpoint-content.txt", "w") as f:
f.write(test_return_data) f.write(test_return_data)
time.sleep(1)
return None return None
def set_zero_byte_response():
with open("test-datastore/endpoint-content.txt", "w") as f:
f.write("")
time.sleep(1)
return None
def test_check_basic_change_detection_functionality(client, live_server, measure_memory_usage): def test_check_basic_change_detection_functionality(client, live_server, measure_memory_usage):
set_original_response() set_original_response()
live_server_setup(live_server) live_server_setup(live_server)
@@ -38,11 +35,18 @@ def test_check_basic_change_detection_functionality(client, live_server, measure
assert b"1 Imported" in res.data assert b"1 Imported" in res.data
wait_for_all_checks(client) time.sleep(sleep_time_for_fetch_thread)
# It should report nothing found (no new 'unviewed' class) # Do this a few times.. ensures we dont accidently set the status
res = client.get(url_for("index")) for n in range(3):
assert b'unviewed' not in res.data client.get(url_for("form_watch_checknow"), follow_redirects=True)
# Give the thread time to pick it up
time.sleep(sleep_time_for_fetch_thread)
# It should report nothing found (no new 'unviewed' class)
res = client.get(url_for("index"))
assert b'unviewed' not in res.data
##################### #####################
@@ -60,7 +64,7 @@ def test_check_basic_change_detection_functionality(client, live_server, measure
client.get(url_for("form_watch_checknow"), follow_redirects=True) client.get(url_for("form_watch_checknow"), follow_redirects=True)
# Give the thread time to pick it up # Give the thread time to pick it up
wait_for_all_checks(client) time.sleep(sleep_time_for_fetch_thread)
# It should report nothing found (no new 'unviewed' class) # It should report nothing found (no new 'unviewed' class)
res = client.get(url_for("index")) res = client.get(url_for("index"))
@@ -82,20 +86,14 @@ def test_check_basic_change_detection_functionality(client, live_server, measure
client.get(url_for("form_watch_checknow"), follow_redirects=True) client.get(url_for("form_watch_checknow"), follow_redirects=True)
# Give the thread time to pick it up # Give the thread time to pick it up
wait_for_all_checks(client) time.sleep(sleep_time_for_fetch_thread)
# It should report nothing found (no new 'unviewed' class) # It should report nothing found (no new 'unviewed' class)
res = client.get(url_for("index")) res = client.get(url_for("index"))
assert b'unviewed' in res.data assert b'unviewed' in res.data
client.get(url_for("mark_all_viewed"), follow_redirects=True)
# A totally zero byte (#2528) response should also not trigger an error
set_zero_byte_response()
client.get(url_for("form_watch_checknow"), follow_redirects=True)
wait_for_all_checks(client)
res = client.get(url_for("index"))
assert b'unviewed' in res.data # A change should have registered because empty_pages_are_a_change is ON
assert b'fetch-error' not in res.data
# #
# Cleanup everything # Cleanup everything
+7 -9
View File
@@ -291,11 +291,11 @@ def test_notification_custom_endpoint_and_jinja2(client, live_server, measure_me
data={ data={
"application-fetch_backend": "html_requests", "application-fetch_backend": "html_requests",
"application-minutes_between_check": 180, "application-minutes_between_check": 180,
"application-notification_body": '{ "url" : "{{ watch_url }}", "secret": 444, "somebug": "网站监测 内容更新了" }', "application-notification_body": '{ "url" : "{{ watch_url }}", "secret": 444 }',
"application-notification_format": default_notification_format, "application-notification_format": default_notification_format,
"application-notification_urls": test_notification_url, "application-notification_urls": test_notification_url,
# https://github.com/caronc/apprise/wiki/Notify_Custom_JSON#get-parameter-manipulation # https://github.com/caronc/apprise/wiki/Notify_Custom_JSON#get-parameter-manipulation
"application-notification_title": "New ChangeDetection.io Notification - {{ watch_url }} ", "application-notification_title": "New ChangeDetection.io Notification - {{ watch_url }}",
}, },
follow_redirects=True follow_redirects=True
) )
@@ -324,7 +324,6 @@ def test_notification_custom_endpoint_and_jinja2(client, live_server, measure_me
j = json.loads(x) j = json.loads(x)
assert j['url'].startswith('http://localhost') assert j['url'].startswith('http://localhost')
assert j['secret'] == 444 assert j['secret'] == 444
assert j['somebug'] == '网站监测 内容更新了'
# URL check, this will always be converted to lowercase # URL check, this will always be converted to lowercase
assert os.path.isfile("test-datastore/notification-url.txt") assert os.path.isfile("test-datastore/notification-url.txt")
@@ -355,10 +354,9 @@ def test_notification_custom_endpoint_and_jinja2(client, live_server, measure_me
#2510 #2510
def test_global_send_test_notification(client, live_server, measure_memory_usage): def test_global_send_test_notification(client, live_server, measure_memory_usage):
#live_server_setup(live_server) #live_server_setup(live_server)
set_original_response() set_original_response()
if os.path.isfile("test-datastore/notification.txt"):
os.unlink("test-datastore/notification.txt")
# otherwise other settings would have already existed from previous tests in this file # otherwise other settings would have already existed from previous tests in this file
res = client.post( res = client.post(
@@ -366,8 +364,7 @@ def test_global_send_test_notification(client, live_server, measure_memory_usage
data={ data={
"application-fetch_backend": "html_requests", "application-fetch_backend": "html_requests",
"application-minutes_between_check": 180, "application-minutes_between_check": 180,
#1995 UTF-8 content should be encoded "application-notification_body": 'change detection is cool',
"application-notification_body": 'change detection is cool 网站监测 内容更新了',
"application-notification_format": default_notification_format, "application-notification_format": default_notification_format,
"application-notification_urls": "", "application-notification_urls": "",
"application-notification_title": "New ChangeDetection.io Notification - {{ watch_url }}", "application-notification_title": "New ChangeDetection.io Notification - {{ watch_url }}",
@@ -402,7 +399,8 @@ def test_global_send_test_notification(client, live_server, measure_memory_usage
with open("test-datastore/notification.txt", 'r') as f: with open("test-datastore/notification.txt", 'r') as f:
x = f.read() x = f.read()
assert 'change detection is cool 网站监测 内容更新了' in x assert 'change detection is coo' in x
os.unlink("test-datastore/notification.txt") os.unlink("test-datastore/notification.txt")
@@ -422,7 +420,7 @@ def test_global_send_test_notification(client, live_server, measure_memory_usage
with open("test-datastore/notification.txt", 'r') as f: with open("test-datastore/notification.txt", 'r') as f:
x = f.read() x = f.read()
# Should come from notification.py default handler when there is no notification body to pull from # Should come from notification.py default handler when there is no notification body to pull from
assert 'change detection is cool 网站监测 内容更新了' in x assert 'change detection is coo' in x
client.get( client.get(
url_for("form_delete", uuid="all"), url_for("form_delete", uuid="all"),
@@ -3,7 +3,7 @@ import os
import time import time
from flask import url_for from flask import url_for
from .util import live_server_setup, wait_for_all_checks, extract_UUID_from_client, wait_for_notification_endpoint_output from .util import live_server_setup, wait_for_all_checks, extract_UUID_from_client
from ..notification import default_notification_format from ..notification import default_notification_format
instock_props = [ instock_props = [
@@ -182,8 +182,7 @@ def _run_test_minmax_limit(client, extra_watch_edit_form):
# price changed to something LESS than min (900), SHOULD be a change # price changed to something LESS than min (900), SHOULD be a change
set_original_response(props_markup=instock_props[0], price='890.45') set_original_response(props_markup=instock_props[0], price='890.45')
# let previous runs wait # let previous runs wait
time.sleep(2) time.sleep(1)
res = client.get(url_for("form_watch_checknow"), follow_redirects=True) res = client.get(url_for("form_watch_checknow"), follow_redirects=True)
assert b'1 watches queued for rechecking.' in res.data assert b'1 watches queued for rechecking.' in res.data
wait_for_all_checks(client) wait_for_all_checks(client)
@@ -198,8 +197,7 @@ def _run_test_minmax_limit(client, extra_watch_edit_form):
client.get(url_for("form_watch_checknow"), follow_redirects=True) client.get(url_for("form_watch_checknow"), follow_redirects=True)
wait_for_all_checks(client) wait_for_all_checks(client)
res = client.get(url_for("index")) res = client.get(url_for("index"))
# Depending on the LOCALE it may be either of these (generally for US/default/etc) assert b'1,890.45' or b'1890.45' in res.data
assert b'1,890.45' in res.data or b'1890.45' in res.data
assert b'unviewed' in res.data assert b'unviewed' in res.data
res = client.get(url_for("form_delete", uuid="all"), follow_redirects=True) res = client.get(url_for("form_delete", uuid="all"), follow_redirects=True)
@@ -364,7 +362,7 @@ def test_change_with_notification_values(client, live_server):
set_original_response(props_markup=instock_props[0], price='1950.45') set_original_response(props_markup=instock_props[0], price='1950.45')
client.get(url_for("form_watch_checknow")) client.get(url_for("form_watch_checknow"))
wait_for_all_checks(client) wait_for_all_checks(client)
wait_for_notification_endpoint_output() time.sleep(3)
assert os.path.isfile("test-datastore/notification.txt"), "Notification received" assert os.path.isfile("test-datastore/notification.txt"), "Notification received"
with open("test-datastore/notification.txt", 'r') as f: with open("test-datastore/notification.txt", 'r') as f:
notification = f.read() notification = f.read()
-43
View File
@@ -164,46 +164,3 @@ def test_rss_xpath_filtering(client, live_server, measure_memory_usage):
assert b'Some other description' not in res.data # Should NOT be selected by the xpath assert b'Some other description' not in res.data # Should NOT be selected by the xpath
res = client.get(url_for("form_delete", uuid="all"), follow_redirects=True) res = client.get(url_for("form_delete", uuid="all"), follow_redirects=True)
def test_namespace_selectors(live_server, client):
set_original_cdata_xml()
#live_server_setup(live_server)
test_url = url_for('test_endpoint', content_type="application/xml", _external=True)
res = client.post(
url_for("import_page"),
data={"urls": test_url},
follow_redirects=True
)
assert b"1 Imported" in res.data
wait_for_all_checks(client)
uuid = extract_UUID_from_client(client)
# because it will look for the namespaced stuff during form validation, but on the first check it wont exist..
res = client.post(
url_for("edit_page", uuid=uuid),
data={
"include_filters": "//media:thumbnail/@url",
"fetch_backend": "html_requests",
"headers": "",
"proxy": "no-proxy",
"tags": "",
"url": test_url,
},
follow_redirects=True
)
wait_for_all_checks(client)
res = client.get(
url_for("preview_page", uuid="first"),
follow_redirects=True
)
assert b'CDATA' not in res.data
assert b'<![' not in res.data
assert b'https://testsite.com/thumbnail-c224e10d81488e818701c981da04869e.jpg' in res.data
res = client.get(url_for("form_delete", uuid="all"), follow_redirects=True)
-11
View File
@@ -76,17 +76,6 @@ def set_more_modified_response():
return None return None
def wait_for_notification_endpoint_output():
'''Apprise can take a few seconds to fire'''
from os.path import isfile
for i in range(1, 20):
time.sleep(1)
if isfile("test-datastore/notification.txt"):
return True
return False
# kinda funky, but works for now # kinda funky, but works for now
def extract_api_key_from_UI(client): def extract_api_key_from_UI(client):
import re import re
+13 -12
View File
@@ -1,5 +1,6 @@
from .processors.exceptions import ProcessorException from .processors.exceptions import ProcessorException
import changedetectionio.content_fetchers.exceptions as content_fetchers_exceptions from . import content_fetchers
from changedetectionio.processors.text_json_diff.processor import FilterNotFoundInResponse from changedetectionio.processors.text_json_diff.processor import FilterNotFoundInResponse
from changedetectionio import html_tools from changedetectionio import html_tools
@@ -300,7 +301,7 @@ class update_worker(threading.Thread):
self.datastore.update_watch(uuid=uuid, update_obj={'last_error': e.message}) self.datastore.update_watch(uuid=uuid, update_obj={'last_error': e.message})
process_changedetection_results = False process_changedetection_results = False
except content_fetchers_exceptions.ReplyWithContentButNoText as e: except content_fetchers.exceptions.ReplyWithContentButNoText as e:
# Totally fine, it's by choice - just continue on, nothing more to care about # Totally fine, it's by choice - just continue on, nothing more to care about
# Page had elements/content but no renderable text # Page had elements/content but no renderable text
# Backend (not filters) gave zero output # Backend (not filters) gave zero output
@@ -326,7 +327,7 @@ class update_worker(threading.Thread):
process_changedetection_results = False process_changedetection_results = False
except content_fetchers_exceptions.Non200ErrorCodeReceived as e: except content_fetchers.exceptions.Non200ErrorCodeReceived as e:
if e.status_code == 403: if e.status_code == 403:
err_text = "Error - 403 (Access denied) received" err_text = "Error - 403 (Access denied) received"
elif e.status_code == 404: elif e.status_code == 404:
@@ -379,23 +380,23 @@ class update_worker(threading.Thread):
process_changedetection_results = False process_changedetection_results = False
except content_fetchers_exceptions.checksumFromPreviousCheckWasTheSame as e: except content_fetchers.exceptions.checksumFromPreviousCheckWasTheSame as e:
# Yes fine, so nothing todo, don't continue to process. # Yes fine, so nothing todo, don't continue to process.
process_changedetection_results = False process_changedetection_results = False
changed_detected = False changed_detected = False
except content_fetchers_exceptions.BrowserConnectError as e: except content_fetchers.exceptions.BrowserConnectError as e:
self.datastore.update_watch(uuid=uuid, self.datastore.update_watch(uuid=uuid,
update_obj={'last_error': e.msg update_obj={'last_error': e.msg
} }
) )
process_changedetection_results = False process_changedetection_results = False
except content_fetchers_exceptions.BrowserFetchTimedOut as e: except content_fetchers.exceptions.BrowserFetchTimedOut as e:
self.datastore.update_watch(uuid=uuid, self.datastore.update_watch(uuid=uuid,
update_obj={'last_error': e.msg update_obj={'last_error': e.msg
} }
) )
process_changedetection_results = False process_changedetection_results = False
except content_fetchers_exceptions.BrowserStepsStepException as e: except content_fetchers.exceptions.BrowserStepsStepException as e:
if not self.datastore.data['watching'].get(uuid): if not self.datastore.data['watching'].get(uuid):
continue continue
@@ -437,25 +438,25 @@ class update_worker(threading.Thread):
process_changedetection_results = False process_changedetection_results = False
except content_fetchers_exceptions.EmptyReply as e: except content_fetchers.exceptions.EmptyReply as e:
# Some kind of custom to-str handler in the exception handler that does this? # Some kind of custom to-str handler in the exception handler that does this?
err_text = "EmptyReply - try increasing 'Wait seconds before extracting text', Status Code {}".format(e.status_code) err_text = "EmptyReply - try increasing 'Wait seconds before extracting text', Status Code {}".format(e.status_code)
self.datastore.update_watch(uuid=uuid, update_obj={'last_error': err_text, self.datastore.update_watch(uuid=uuid, update_obj={'last_error': err_text,
'last_check_status': e.status_code}) 'last_check_status': e.status_code})
process_changedetection_results = False process_changedetection_results = False
except content_fetchers_exceptions.ScreenshotUnavailable as e: except content_fetchers.exceptions.ScreenshotUnavailable as e:
err_text = "Screenshot unavailable, page did not render fully in the expected time or page was too long - try increasing 'Wait seconds before extracting text'" err_text = "Screenshot unavailable, page did not render fully in the expected time or page was too long - try increasing 'Wait seconds before extracting text'"
self.datastore.update_watch(uuid=uuid, update_obj={'last_error': err_text, self.datastore.update_watch(uuid=uuid, update_obj={'last_error': err_text,
'last_check_status': e.status_code}) 'last_check_status': e.status_code})
process_changedetection_results = False process_changedetection_results = False
except content_fetchers_exceptions.JSActionExceptions as e: except content_fetchers.exceptions.JSActionExceptions as e:
err_text = "Error running JS Actions - Page request - "+e.message err_text = "Error running JS Actions - Page request - "+e.message
if e.screenshot: if e.screenshot:
watch.save_screenshot(screenshot=e.screenshot, as_error=True) watch.save_screenshot(screenshot=e.screenshot, as_error=True)
self.datastore.update_watch(uuid=uuid, update_obj={'last_error': err_text, self.datastore.update_watch(uuid=uuid, update_obj={'last_error': err_text,
'last_check_status': e.status_code}) 'last_check_status': e.status_code})
process_changedetection_results = False process_changedetection_results = False
except content_fetchers_exceptions.PageUnloadable as e: except content_fetchers.exceptions.PageUnloadable as e:
err_text = "Page request from server didnt respond correctly" err_text = "Page request from server didnt respond correctly"
if e.message: if e.message:
err_text = "{} - {}".format(err_text, e.message) err_text = "{} - {}".format(err_text, e.message)
@@ -467,7 +468,7 @@ class update_worker(threading.Thread):
'last_check_status': e.status_code, 'last_check_status': e.status_code,
'has_ldjson_price_data': None}) 'has_ldjson_price_data': None})
process_changedetection_results = False process_changedetection_results = False
except content_fetchers_exceptions.BrowserStepsInUnsupportedFetcher as e: except content_fetchers.exceptions.BrowserStepsInUnsupportedFetcher as e:
err_text = "This watch has Browser Steps configured and so it cannot run with the 'Basic fast Plaintext/HTTP Client', either remove the Browser Steps or select a Chrome fetcher." err_text = "This watch has Browser Steps configured and so it cannot run with the 'Basic fast Plaintext/HTTP Client', either remove the Browser Steps or select a Chrome fetcher."
self.datastore.update_watch(uuid=uuid, update_obj={'last_error': err_text}) self.datastore.update_watch(uuid=uuid, update_obj={'last_error': err_text})
process_changedetection_results = False process_changedetection_results = False
+6 -7
View File
@@ -18,7 +18,7 @@ services:
# #
# Log levels are in descending order. (TRACE is the most detailed one) # Log levels are in descending order. (TRACE is the most detailed one)
# Log output levels: TRACE, DEBUG(default), INFO, SUCCESS, WARNING, ERROR, CRITICAL # Log output levels: TRACE, DEBUG(default), INFO, SUCCESS, WARNING, ERROR, CRITICAL
# - LOGGER_LEVEL=TRACE # - LOGGER_LEVEL=DEBUG
# #
# Alternative WebDriver/selenium URL, do not use "'s or 's! # Alternative WebDriver/selenium URL, do not use "'s or 's!
# - WEBDRIVER_URL=http://browser-chrome:4444/wd/hub # - WEBDRIVER_URL=http://browser-chrome:4444/wd/hub
@@ -29,9 +29,8 @@ services:
# #
# https://selenium-python.readthedocs.io/api.html#module-selenium.webdriver.common.proxy # https://selenium-python.readthedocs.io/api.html#module-selenium.webdriver.common.proxy
# #
# Alternative target "Chrome" Playwright URL, do not use "'s or 's! # Alternative Playwright URL, do not use "'s or 's!
# "Playwright" is a driver/librarythat allows changedetection to talk to a Chrome or similar browser. # - PLAYWRIGHT_DRIVER_URL=ws://playwright-chrome:3000
# - PLAYWRIGHT_DRIVER_URL=ws://sockpuppetbrowser:3000
# #
# Playwright proxy settings playwright_proxy_server, playwright_proxy_bypass, playwright_proxy_username, playwright_proxy_password # Playwright proxy settings playwright_proxy_server, playwright_proxy_bypass, playwright_proxy_username, playwright_proxy_password
# #
@@ -74,10 +73,10 @@ services:
# condition: service_started # condition: service_started
# Sockpuppetbrowser is basically chrome wrapped in an API for allowing fast fetching of web-pages. # Used for fetching pages via Playwright+Chrome where you need Javascript support.
# RECOMMENDED FOR FETCHING PAGES WITH CHROME # RECOMMENDED FOR FETCHING PAGES WITH CHROME
# sockpuppetbrowser: # playwright-chrome:
# hostname: sockpuppetbrowser # hostname: playwright-chrome
# image: dgtlmoon/sockpuppetbrowser:latest # image: dgtlmoon/sockpuppetbrowser:latest
# cap_add: # cap_add:
# - SYS_ADMIN # - SYS_ADMIN
+3 -4
View File
@@ -35,7 +35,7 @@ dnspython==2.6.1 # related to eventlet fixes
# jq not available on Windows so must be installed manually # jq not available on Windows so must be installed manually
# Notification library # Notification library
apprise~=1.8.1 apprise~=1.8.0
# apprise mqtt https://github.com/dgtlmoon/changedetection.io/issues/315 # apprise mqtt https://github.com/dgtlmoon/changedetection.io/issues/315
# and 2.0.0 https://github.com/dgtlmoon/changedetection.io/issues/2241 not yet compatible # and 2.0.0 https://github.com/dgtlmoon/changedetection.io/issues/2241 not yet compatible
@@ -79,9 +79,8 @@ pyppeteerstealth>=0.0.4
pytest ~=7.2 pytest ~=7.2
pytest-flask ~=1.2 pytest-flask ~=1.2
# Anything 4.0 and up but not 5.0 # Pin jsonschema version to prevent build errors on armv6 while rpds-py wheels aren't available (1708)
jsonschema ~= 4.0 jsonschema==4.17.3
loguru loguru