Compare commits

..

13 Commits

Author SHA1 Message Date
dgtlmoon
73d9373879 Fixing page update 2024-10-10 13:18:02 +02:00
dgtlmoon
d32640d892 highlight ignore lines 2024-10-10 13:12:23 +02:00
dgtlmoon
7ee249e2ff Fix labels 2024-10-10 12:53:56 +02:00
dgtlmoon
5d753f59c4 Unique line test wasnt considering whitespace changes! 2024-10-10 12:27:25 +02:00
dgtlmoon
090f5d7725 fix test 2024-10-10 11:47:13 +02:00
dgtlmoon
7869a7745a Fixing whitespace cleanup - didnt work as expected!! 2024-10-10 09:25:52 +02:00
dgtlmoon
de34f0ad83 Fix bad comment 2024-10-09 18:52:27 +02:00
dgtlmoon
fabbb3733a Stop html_tools.strip_ignore_text from chewing newlines 2024-10-09 18:49:18 +02:00
dgtlmoon
deadf881b0 is now str not bytes 2024-10-09 18:05:08 +02:00
dgtlmoon
77ef42c367 oops 2024-10-09 15:11:56 +02:00
dgtlmoon
5d1f317e30 WIP 2024-10-09 15:09:29 +02:00
dgtlmoon
5ed7f43f6e Fix test 2024-10-09 13:21:07 +02:00
dgtlmoon
3b6ae70c9c Misc fixes - juggling utf-8 not needed
(its utf-16 by default python string)
2024-10-09 13:11:04 +02:00
15 changed files with 62 additions and 219 deletions

View File

@@ -1,7 +1,4 @@
import importlib
from concurrent.futures import ThreadPoolExecutor
from changedetectionio.processors.text_json_diff.processor import FilterNotFoundInResponse
from changedetectionio.store import ChangeDetectionStore
from functools import wraps
@@ -33,6 +30,7 @@ def construct_blueprint(datastore: ChangeDetectionStore):
def long_task(uuid, preferred_proxy):
import time
from changedetectionio.content_fetchers import exceptions as content_fetcher_exceptions
from changedetectionio.processors.text_json_diff import text_json_diff
from changedetectionio.safe_jinja import render as jinja_render
status = {'status': '', 'length': 0, 'text': ''}
@@ -40,12 +38,8 @@ def construct_blueprint(datastore: ChangeDetectionStore):
contents = ''
now = time.time()
try:
processor_module = importlib.import_module("changedetectionio.processors.text_json_diff.processor")
update_handler = processor_module.perform_site_check(datastore=datastore,
watch_uuid=uuid
)
update_handler.call_browser(preferred_proxy_id=preferred_proxy)
update_handler = text_json_diff.perform_site_check(datastore=datastore, watch_uuid=uuid)
update_handler.call_browser()
# title, size is len contents not len xfer
except content_fetcher_exceptions.Non200ErrorCodeReceived as e:
if e.status_code == 404:
@@ -54,7 +48,7 @@ def construct_blueprint(datastore: ChangeDetectionStore):
status.update({'status': 'ERROR', 'length': len(contents), 'text': f"{e.status_code} - Access denied"})
else:
status.update({'status': 'ERROR', 'length': len(contents), 'text': f"Status code: {e.status_code}"})
except FilterNotFoundInResponse:
except text_json_diff.FilterNotFoundInResponse:
status.update({'status': 'OK', 'length': len(contents), 'text': f"OK but CSS/xPath filter not found (page changed layout?)"})
except content_fetcher_exceptions.EmptyReply as e:
if e.status_code == 403 or e.status_code == 401:

View File

@@ -75,7 +75,6 @@ class fetcher(Fetcher):
self.headers = r.headers
if not r.content or not len(r.content):
logger.debug(f"Requests returned empty content for '{url}'")
if not empty_pages_are_a_change:
raise EmptyReply(url=url, status_code=r.status_code)
else:

View File

@@ -788,6 +788,7 @@ def changedetection_app(config=None, datastore_o=None):
# Recast it if need be to right data Watch handler
watch_class = get_custom_watch_obj_for_processor(form.data.get('processor'))
datastore.data['watching'][uuid] = watch_class(datastore_path=datastore_o.datastore_path, default=datastore.data['watching'][uuid])
flash("Updated watch - unpaused!" if request.args.get('unpause_on_save') else "Updated watch.")
# Re #286 - We wait for syncing new data to disk in another thread every 60 seconds
@@ -1486,6 +1487,7 @@ def changedetection_app(config=None, datastore_o=None):
continue
update_q.put(queuedWatchMetaData.PrioritizedItem(priority=1, item={'uuid': watch_uuid, 'skip_when_checksum_same': False}))
i += 1
flash(f"{i} watches queued for rechecking.")
return redirect(url_for('index', tag=tag))

View File

@@ -18,7 +18,6 @@ class difference_detection_processor():
screenshot = None
watch = None
xpath_data = None
preferred_proxy = None
def __init__(self, *args, datastore, watch_uuid, **kwargs):
super().__init__(*args, **kwargs)
@@ -27,8 +26,7 @@ class difference_detection_processor():
# Generic fetcher that should be extended (requests, playwright etc)
self.fetcher = Fetcher()
def call_browser(self, preferred_proxy_id=None):
def call_browser(self):
from requests.structures import CaseInsensitiveDict
# Protect against file:// access
@@ -44,7 +42,7 @@ class difference_detection_processor():
prefer_fetch_backend = self.watch.get('fetch_backend', 'system')
# Proxy ID "key"
preferred_proxy_id = preferred_proxy_id if preferred_proxy_id else self.datastore.get_preferred_proxy_for_watch(uuid=self.watch.get('uuid'))
preferred_proxy_id = self.datastore.get_preferred_proxy_for_watch(uuid=self.watch.get('uuid'))
# Pluggable content self.fetcher
if not prefer_fetch_backend or prefer_fetch_backend == 'system':

View File

@@ -27,27 +27,22 @@ def _search_prop_by_value(matches, value):
return prop[1] # Yield the desired value and exit the function
def _deduplicate_prices(data):
import re
seen = set()
unique_data = []
'''
Some price data has multiple entries, OR it has a single entry with ['$159', '159', 159, "$ 159"] or just "159"
Get all the values, clean it and add it to a set then return the unique values
'''
unique_data = set()
# Return the complete 'datum' where its price was not seen before
for datum in data:
# Convert 'value' to float if it can be a numeric string, otherwise leave it as is
try:
normalized_value = float(datum.value) if isinstance(datum.value, str) and datum.value.replace('.', '', 1).isdigit() else datum.value
except ValueError:
normalized_value = datum.value
if isinstance(datum.value, list):
# Process each item in the list
normalized_value = set([float(re.sub(r'[^\d.]', '', str(item))) for item in datum.value])
unique_data.update(normalized_value)
else:
# Process single value
v = float(re.sub(r'[^\d.]', '', str(datum.value)))
unique_data.add(v)
return list(unique_data)
# If the normalized value hasn't been seen yet, add it to unique data
if normalized_value not in seen:
unique_data.append(datum)
seen.add(normalized_value)
return unique_data
# should return Restock()
@@ -88,13 +83,14 @@ def get_itemprop_availability(html_content) -> Restock:
if price_result:
# Right now, we just support single product items, maybe we will store the whole actual metadata separately in the future and
# parse that for the UI?
if len(price_result) > 1 and len(price_result) > 1:
prices_found = set(str(item.value).replace('$', '') for item in price_result)
if len(price_result) > 1 and len(prices_found) > 1:
# See if all prices are different, in the case that one product has many embedded data types with the same price
# One might have $121.95 and another 121.95 etc
logger.warning(f"More than one price found {price_result}, throwing exception, cant use this plugin.")
logger.warning(f"More than one price found {prices_found}, throwing exception, cant use this plugin.")
raise MoreThanOnePriceFound()
value['price'] = price_result[0]
value['price'] = price_result[0].value
pricecurrency_result = pricecurrency_parse.find(data)
if pricecurrency_result:
@@ -224,7 +220,7 @@ class perform_site_check(difference_detection_processor):
itemprop_availability['original_price'] = itemprop_availability.get('price')
update_obj['restock']["original_price"] = itemprop_availability.get('price')
if not self.fetcher.instock_data and not itemprop_availability.get('availability') and not itemprop_availability.get('price'):
if not self.fetcher.instock_data and not itemprop_availability.get('availability'):
raise ProcessorException(
message=f"Unable to extract restock data for this page unfortunately. (Got code {self.fetcher.get_last_status_code()} from server), no embedded stock information was found and nothing interesting in the text, try using this watch with Chrome.",
url=watch.get('url'),

View File

@@ -211,7 +211,6 @@ class perform_site_check(difference_detection_processor):
# @todo whitespace coming from missing rtrim()?
# stripped_text_from_html could be based on their preferences, replace the processed text with only that which they want to know about.
# Rewrite's the processing text based on only what diff result they want to see
if watch.has_special_diff_filter_options_set() and len(watch.history.keys()):
# Now the content comes from the diff-parser and not the returned HTTP traffic, so could be some differences
from changedetectionio import diff

View File

@@ -1,14 +1,14 @@
$(function () {
/* add container before each proxy location to show status */
var isActive = false;
function setup_html_widget() {
var option_li = $('.fetch-backend-proxy li').filter(function () {
return $("input", this)[0].value.length > 0;
});
$(option_li).prepend('<div class="proxy-status"></div>');
$(option_li).append('<div class="proxy-timing"></div><div class="proxy-check-details"></div>');
}
var option_li = $('.fetch-backend-proxy li').filter(function() {
return $("input",this)[0].value.length >0;
});
//var option_li = $('.fetch-backend-proxy li');
var isActive = false;
$(option_li).prepend('<div class="proxy-status"></div>');
$(option_li).append('<div class="proxy-timing"></div><div class="proxy-check-details"></div>');
function set_proxy_check_status(proxy_key, state) {
// select input by value name
@@ -59,14 +59,8 @@ $(function () {
}
$('#check-all-proxies').click(function (e) {
e.preventDefault()
if (!$('body').hasClass('proxy-check-active')) {
setup_html_widget();
$('body').addClass('proxy-check-active');
}
$('body').addClass('proxy-check-active');
$('.proxy-check-details').html('');
$('.proxy-status').html('<span class="spinner"></span>').fadeIn();
$('.proxy-timing').html('');

View File

@@ -25,19 +25,15 @@ ul#requests-extra_proxies {
body.proxy-check-active {
#request {
// Padding set by flex layout
/*
.proxy-status {
width: 2em;
}
*/
.proxy-check-details {
font-size: 80%;
color: #555;
display: block;
padding-left: 2em;
max-width: 500px;
padding-left: 4em;
}
.proxy-timing {

View File

@@ -119,22 +119,19 @@ ul#requests-extra_proxies {
#request label[for=proxy] {
display: inline-block; }
body.proxy-check-active #request {
/*
.proxy-status {
width: 2em;
}
*/ }
body.proxy-check-active #request .proxy-check-details {
font-size: 80%;
color: #555;
display: block;
padding-left: 2em;
max-width: 500px; }
body.proxy-check-active #request .proxy-timing {
font-size: 80%;
padding-left: 1rem;
color: var(--color-link); }
body.proxy-check-active #request .proxy-status {
width: 2em; }
body.proxy-check-active #request .proxy-check-details {
font-size: 80%;
color: #555;
display: block;
padding-left: 4em; }
body.proxy-check-active #request .proxy-timing {
font-size: 80%;
padding-left: 1rem;
color: var(--color-link); }
#recommended-proxy {
display: grid;

View File

@@ -1,6 +0,0 @@
# A list of real world examples!
Always the price should be 666.66 for our tests
see test_restock_itemprop.py::test_special_prop_examples

View File

@@ -1,25 +0,0 @@
<div class="PriceSection PriceSection_PriceSection__Vx1_Q PriceSection_variantHuge__P9qxg PdpPriceSection"
data-testid="price-section"
data-optly-product-tile-price-section="true"><span
class="PriceRange ProductPrice variant-huge" itemprop="offers"
itemscope="" itemtype="http://schema.org/Offer"><div
class="VisuallyHidden_VisuallyHidden__VBD83">$155.55</div><span
aria-hidden="true" class="Price variant-huge" data-testid="price"
itemprop="price"><sup class="sup" data-testid="price-symbol"
itemprop="priceCurrency" content="AUD">$</sup><span
class="dollars" data-testid="price-value" itemprop="price"
content="155.55">155.55</span><span class="extras"><span class="sup"
data-testid="price-sup"></span></span></span></span>
</div>
<script type="application/ld+json">{
"@type": "Product",
"@context": "https://schema.org",
"name": "test",
"description": "test",
"offers": {
"@type": "Offer",
"priceCurrency": "AUD",
"price": 155.55
},
}</script>

View File

@@ -77,8 +77,6 @@ def test_check_removed_line_contains_trigger(client, live_server, measure_memory
# The trigger line is REMOVED, this should trigger
set_original(excluding='The golden line')
# Check in the processor here what's going on, it's triggering empty-reply and no change.
client.get(url_for("form_watch_checknow"), follow_redirects=True)
wait_for_all_checks(client)
res = client.get(url_for("index"))
@@ -153,6 +151,7 @@ def test_check_add_line_contains_trigger(client, live_server, measure_memory_usa
# A line that's not the trigger should not trigger anything
res = client.get(url_for("form_watch_checknow"), follow_redirects=True)
assert b'1 watches queued for rechecking.' in res.data
wait_for_all_checks(client)
@@ -174,5 +173,6 @@ def test_check_add_line_contains_trigger(client, live_server, measure_memory_usa
assert b'-Oh yes please-' in response
assert '网站监测 内容更新了'.encode('utf-8') in response
res = client.get(url_for("form_delete", uuid="all"), follow_redirects=True)
assert b'Deleted' in res.data

View File

@@ -5,7 +5,7 @@ import time
from flask import url_for
from ..html_tools import *
from .util import live_server_setup, wait_for_all_checks
from .util import live_server_setup
def test_setup(live_server):
@@ -119,10 +119,12 @@ across multiple lines
def test_element_removal_full(client, live_server, measure_memory_usage):
#live_server_setup(live_server)
sleep_time_for_fetch_thread = 3
set_original_response()
# Give the endpoint time to spin up
time.sleep(1)
# Add our URL to the import page
test_url = url_for("test_endpoint", _external=True)
@@ -130,8 +132,7 @@ def test_element_removal_full(client, live_server, measure_memory_usage):
url_for("import_page"), data={"urls": test_url}, follow_redirects=True
)
assert b"1 Imported" in res.data
wait_for_all_checks(client)
time.sleep(1)
# Goto the edit page, add the filter data
# Not sure why \r needs to be added - absent of the #changetext this is not necessary
subtractive_selectors_data = "header\r\nfooter\r\nnav\r\n#changetext"
@@ -147,7 +148,6 @@ def test_element_removal_full(client, live_server, measure_memory_usage):
follow_redirects=True,
)
assert b"Updated watch." in res.data
wait_for_all_checks(client)
# Check it saved
res = client.get(
@@ -156,10 +156,10 @@ def test_element_removal_full(client, live_server, measure_memory_usage):
assert bytes(subtractive_selectors_data.encode("utf-8")) in res.data
# Trigger a check
res = client.get(url_for("form_watch_checknow"), follow_redirects=True)
assert b'1 watches queued for rechecking.' in res.data
client.get(url_for("form_watch_checknow"), follow_redirects=True)
wait_for_all_checks(client)
# Give the thread time to pick it up
time.sleep(sleep_time_for_fetch_thread)
# so that we set the state to 'unviewed' after all the edits
client.get(url_for("diff_history_page", uuid="first"))
@@ -168,11 +168,10 @@ def test_element_removal_full(client, live_server, measure_memory_usage):
set_modified_response()
# Trigger a check
res = client.get(url_for("form_watch_checknow"), follow_redirects=True)
assert b'1 watches queued for rechecking.' in res.data
client.get(url_for("form_watch_checknow"), follow_redirects=True)
# Give the thread time to pick it up
wait_for_all_checks(client)
time.sleep(sleep_time_for_fetch_thread)
# There should not be an unviewed change, as changes should be removed
res = client.get(url_for("index"))

View File

@@ -1,72 +0,0 @@
#!/usr/bin/env python3
import time
from flask import url_for
from .util import set_original_response, set_modified_response, live_server_setup, wait_for_all_checks
# `subtractive_selectors` should still work in `source:` type requests
def test_fetch_pdf(client, live_server, measure_memory_usage):
import shutil
shutil.copy("tests/test.pdf", "test-datastore/endpoint-test.pdf")
live_server_setup(live_server)
test_url = url_for('test_pdf_endpoint', _external=True)
# Add our URL to the import page
res = client.post(
url_for("import_page"),
data={"urls": test_url},
follow_redirects=True
)
assert b"1 Imported" in res.data
wait_for_all_checks(client)
res = client.get(
url_for("preview_page", uuid="first"),
follow_redirects=True
)
# PDF header should not be there (it was converted to text)
assert b'PDF' not in res.data[:10]
assert b'hello world' in res.data
# So we know if the file changes in other ways
import hashlib
original_md5 = hashlib.md5(open("test-datastore/endpoint-test.pdf", 'rb').read()).hexdigest().upper()
# We should have one
assert len(original_md5) > 0
# And it's going to be in the document
assert b'Document checksum - ' + bytes(str(original_md5).encode('utf-8')) in res.data
shutil.copy("tests/test2.pdf", "test-datastore/endpoint-test.pdf")
changed_md5 = hashlib.md5(open("test-datastore/endpoint-test.pdf", 'rb').read()).hexdigest().upper()
res = client.get(url_for("form_watch_checknow"), follow_redirects=True)
assert b'1 watches queued for rechecking.' in res.data
wait_for_all_checks(client)
# Now something should be ready, indicated by having a 'unviewed' class
res = client.get(url_for("index"))
assert b'unviewed' in res.data
# The original checksum should be not be here anymore (cdio adds it to the bottom of the text)
res = client.get(
url_for("preview_page", uuid="first"),
follow_redirects=True
)
assert original_md5.encode('utf-8') not in res.data
assert changed_md5.encode('utf-8') in res.data
res = client.get(
url_for("diff_history_page", uuid="first"),
follow_redirects=True
)
assert original_md5.encode('utf-8') in res.data
assert changed_md5.encode('utf-8') in res.data
assert b'here is a change' in res.data

View File

@@ -3,7 +3,7 @@ import os
import time
from flask import url_for
from .util import live_server_setup, wait_for_all_checks, wait_for_notification_endpoint_output
from .util import live_server_setup, wait_for_all_checks, extract_UUID_from_client, wait_for_notification_endpoint_output
from ..notification import default_notification_format
instock_props = [
@@ -413,31 +413,3 @@ def test_data_sanity(client, live_server):
res = client.get(
url_for("edit_page", uuid="first"))
assert test_url2.encode('utf-8') in res.data
res = client.get(url_for("form_delete", uuid="all"), follow_redirects=True)
assert b'Deleted' in res.data
# All examples should give a price of 666.66
def test_special_prop_examples(client, live_server):
import glob
#live_server_setup(live_server)
test_url = url_for('test_endpoint', _external=True)
check_path = os.path.join(os.path.dirname(__file__), "itemprop_test_examples", "*.txt")
files = glob.glob(check_path)
assert files
for test_example_filename in files:
with open(test_example_filename, 'r') as example_f:
with open("test-datastore/endpoint-content.txt", "w") as test_f:
test_f.write(f"<html><body>{example_f.read()}</body></html>")
# Now fetch it and check the price worked
client.post(
url_for("form_quick_watch_add"),
data={"url": test_url, "tags": 'restock tests', 'processor': 'restock_diff'},
follow_redirects=True
)
wait_for_all_checks(client)
res = client.get(url_for("index"))
assert b'ception' not in res.data
assert b'155.55' in res.data