Compare commits

..

13 Commits

Author SHA1 Message Date
dgtlmoon
73d9373879 Fixing page update 2024-10-10 13:18:02 +02:00
dgtlmoon
d32640d892 highlight ignore lines 2024-10-10 13:12:23 +02:00
dgtlmoon
7ee249e2ff Fix labels 2024-10-10 12:53:56 +02:00
dgtlmoon
5d753f59c4 Unique line test wasnt considering whitespace changes! 2024-10-10 12:27:25 +02:00
dgtlmoon
090f5d7725 fix test 2024-10-10 11:47:13 +02:00
dgtlmoon
7869a7745a Fixing whitespace cleanup - didnt work as expected!! 2024-10-10 09:25:52 +02:00
dgtlmoon
de34f0ad83 Fix bad comment 2024-10-09 18:52:27 +02:00
dgtlmoon
fabbb3733a Stop html_tools.strip_ignore_text from chewing newlines 2024-10-09 18:49:18 +02:00
dgtlmoon
deadf881b0 is now str not bytes 2024-10-09 18:05:08 +02:00
dgtlmoon
77ef42c367 oops 2024-10-09 15:11:56 +02:00
dgtlmoon
5d1f317e30 WIP 2024-10-09 15:09:29 +02:00
dgtlmoon
5ed7f43f6e Fix test 2024-10-09 13:21:07 +02:00
dgtlmoon
3b6ae70c9c Misc fixes - juggling utf-8 not needed
(its utf-16 by default python string)
2024-10-09 13:11:04 +02:00
27 changed files with 114 additions and 378 deletions

View File

@@ -1,5 +1,4 @@
recursive-include changedetectionio/api *
recursive-include changedetectionio/apprise_plugin *
recursive-include changedetectionio/blueprint *
recursive-include changedetectionio/content_fetchers *
recursive-include changedetectionio/model *

View File

@@ -2,7 +2,7 @@
# Read more https://github.com/dgtlmoon/changedetection.io/wiki
__version__ = '0.47.03'
__version__ = '0.46.04'
from changedetectionio.strtobool import strtobool
from json.decoder import JSONDecodeError

View File

@@ -58,7 +58,7 @@ class Watch(Resource):
abort(404, message='No watch exists with the UUID of {}'.format(uuid))
if request.args.get('recheck'):
self.update_q.put(queuedWatchMetaData.PrioritizedItem(priority=1, item={'uuid': uuid}))
self.update_q.put(queuedWatchMetaData.PrioritizedItem(priority=1, item={'uuid': uuid, 'skip_when_checksum_same': True}))
return "OK", 200
if request.args.get('paused', '') == 'paused':
self.datastore.data['watching'].get(uuid).pause()
@@ -246,7 +246,7 @@ class CreateWatch(Resource):
new_uuid = self.datastore.add_watch(url=url, extras=extras, tag=tags)
if new_uuid:
self.update_q.put(queuedWatchMetaData.PrioritizedItem(priority=1, item={'uuid': new_uuid}))
self.update_q.put(queuedWatchMetaData.PrioritizedItem(priority=1, item={'uuid': new_uuid, 'skip_when_checksum_same': True}))
return {'uuid': new_uuid}, 201
else:
return "Invalid or unsupported URL", 400
@@ -303,7 +303,7 @@ class CreateWatch(Resource):
if request.args.get('recheck_all'):
for uuid in self.datastore.data['watching'].keys():
self.update_q.put(queuedWatchMetaData.PrioritizedItem(priority=1, item={'uuid': uuid}))
self.update_q.put(queuedWatchMetaData.PrioritizedItem(priority=1, item={'uuid': uuid, 'skip_when_checksum_same': True}))
return {'status': "OK"}, 200
return list, 200

View File

@@ -1,7 +1,4 @@
import importlib
from concurrent.futures import ThreadPoolExecutor
from changedetectionio.processors.text_json_diff.processor import FilterNotFoundInResponse
from changedetectionio.store import ChangeDetectionStore
from functools import wraps
@@ -33,6 +30,7 @@ def construct_blueprint(datastore: ChangeDetectionStore):
def long_task(uuid, preferred_proxy):
import time
from changedetectionio.content_fetchers import exceptions as content_fetcher_exceptions
from changedetectionio.processors.text_json_diff import text_json_diff
from changedetectionio.safe_jinja import render as jinja_render
status = {'status': '', 'length': 0, 'text': ''}
@@ -40,12 +38,8 @@ def construct_blueprint(datastore: ChangeDetectionStore):
contents = ''
now = time.time()
try:
processor_module = importlib.import_module("changedetectionio.processors.text_json_diff.processor")
update_handler = processor_module.perform_site_check(datastore=datastore,
watch_uuid=uuid
)
update_handler.call_browser(preferred_proxy_id=preferred_proxy)
update_handler = text_json_diff.perform_site_check(datastore=datastore, watch_uuid=uuid)
update_handler.call_browser()
# title, size is len contents not len xfer
except content_fetcher_exceptions.Non200ErrorCodeReceived as e:
if e.status_code == 404:
@@ -54,7 +48,7 @@ def construct_blueprint(datastore: ChangeDetectionStore):
status.update({'status': 'ERROR', 'length': len(contents), 'text': f"{e.status_code} - Access denied"})
else:
status.update({'status': 'ERROR', 'length': len(contents), 'text': f"Status code: {e.status_code}"})
except FilterNotFoundInResponse:
except text_json_diff.FilterNotFoundInResponse:
status.update({'status': 'OK', 'length': len(contents), 'text': f"OK but CSS/xPath filter not found (page changed layout?)"})
except content_fetcher_exceptions.EmptyReply as e:
if e.status_code == 403 or e.status_code == 401:

View File

@@ -19,7 +19,7 @@ def construct_blueprint(datastore: ChangeDetectionStore, update_q: PriorityQueue
datastore.data['watching'][uuid]['track_ldjson_price_data'] = PRICE_DATA_TRACK_ACCEPT
datastore.data['watching'][uuid]['processor'] = 'restock_diff'
datastore.data['watching'][uuid].clear_watch()
update_q.put(queuedWatchMetaData.PrioritizedItem(priority=1, item={'uuid': uuid}))
update_q.put(queuedWatchMetaData.PrioritizedItem(priority=1, item={'uuid': uuid, 'skip_when_checksum_same': False}))
return redirect(url_for("index"))
@login_required

View File

@@ -75,7 +75,6 @@ class fetcher(Fetcher):
self.headers = r.headers
if not r.content or not len(r.content):
logger.debug(f"Requests returned empty content for '{url}'")
if not empty_pages_are_a_change:
raise EmptyReply(url=url, status_code=r.status_code)
else:

View File

@@ -788,6 +788,7 @@ def changedetection_app(config=None, datastore_o=None):
# Recast it if need be to right data Watch handler
watch_class = get_custom_watch_obj_for_processor(form.data.get('processor'))
datastore.data['watching'][uuid] = watch_class(datastore_path=datastore_o.datastore_path, default=datastore.data['watching'][uuid])
flash("Updated watch - unpaused!" if request.args.get('unpause_on_save') else "Updated watch.")
# Re #286 - We wait for syncing new data to disk in another thread every 60 seconds
@@ -795,7 +796,7 @@ def changedetection_app(config=None, datastore_o=None):
datastore.needs_write_urgent = True
# Queue the watch for immediate recheck, with a higher priority
update_q.put(queuedWatchMetaData.PrioritizedItem(priority=1, item={'uuid': uuid}))
update_q.put(queuedWatchMetaData.PrioritizedItem(priority=1, item={'uuid': uuid, 'skip_when_checksum_same': False}))
# Diff page [edit] link should go back to diff page
if request.args.get("next") and request.args.get("next") == 'diff':
@@ -976,7 +977,7 @@ def changedetection_app(config=None, datastore_o=None):
importer = import_url_list()
importer.run(data=request.values.get('urls'), flash=flash, datastore=datastore, processor=request.values.get('processor', 'text_json_diff'))
for uuid in importer.new_uuids:
update_q.put(queuedWatchMetaData.PrioritizedItem(priority=1, item={'uuid': uuid}))
update_q.put(queuedWatchMetaData.PrioritizedItem(priority=1, item={'uuid': uuid, 'skip_when_checksum_same': True}))
if len(importer.remaining_data) == 0:
return redirect(url_for('index'))
@@ -989,7 +990,7 @@ def changedetection_app(config=None, datastore_o=None):
d_importer = import_distill_io_json()
d_importer.run(data=request.values.get('distill-io'), flash=flash, datastore=datastore)
for uuid in d_importer.new_uuids:
update_q.put(queuedWatchMetaData.PrioritizedItem(priority=1, item={'uuid': uuid}))
update_q.put(queuedWatchMetaData.PrioritizedItem(priority=1, item={'uuid': uuid, 'skip_when_checksum_same': True}))
# XLSX importer
if request.files and request.files.get('xlsx_file'):
@@ -1013,7 +1014,7 @@ def changedetection_app(config=None, datastore_o=None):
w_importer.run(data=file, flash=flash, datastore=datastore)
for uuid in w_importer.new_uuids:
update_q.put(queuedWatchMetaData.PrioritizedItem(priority=1, item={'uuid': uuid}))
update_q.put(queuedWatchMetaData.PrioritizedItem(priority=1, item={'uuid': uuid, 'skip_when_checksum_same': True}))
# Could be some remaining, or we could be on GET
form = forms.importForm(formdata=request.form if request.method == 'POST' else None)
@@ -1442,7 +1443,7 @@ def changedetection_app(config=None, datastore_o=None):
new_uuid = datastore.clone(uuid)
if new_uuid:
if not datastore.data['watching'].get(uuid).get('paused'):
update_q.put(queuedWatchMetaData.PrioritizedItem(priority=5, item={'uuid': new_uuid}))
update_q.put(queuedWatchMetaData.PrioritizedItem(priority=5, item={'uuid': new_uuid, 'skip_when_checksum_same': True}))
flash('Cloned.')
return redirect(url_for('index'))
@@ -1463,7 +1464,7 @@ def changedetection_app(config=None, datastore_o=None):
if uuid:
if uuid not in running_uuids:
update_q.put(queuedWatchMetaData.PrioritizedItem(priority=1, item={'uuid': uuid}))
update_q.put(queuedWatchMetaData.PrioritizedItem(priority=1, item={'uuid': uuid, 'skip_when_checksum_same': False}))
i = 1
elif tag:
@@ -1474,7 +1475,7 @@ def changedetection_app(config=None, datastore_o=None):
continue
if watch_uuid not in running_uuids and not datastore.data['watching'][watch_uuid]['paused']:
update_q.put(
queuedWatchMetaData.PrioritizedItem(priority=1, item={'uuid': watch_uuid})
queuedWatchMetaData.PrioritizedItem(priority=1, item={'uuid': watch_uuid, 'skip_when_checksum_same': False})
)
i += 1
@@ -1484,8 +1485,9 @@ def changedetection_app(config=None, datastore_o=None):
if watch_uuid not in running_uuids and not datastore.data['watching'][watch_uuid]['paused']:
if with_errors and not watch.get('last_error'):
continue
update_q.put(queuedWatchMetaData.PrioritizedItem(priority=1, item={'uuid': watch_uuid}))
update_q.put(queuedWatchMetaData.PrioritizedItem(priority=1, item={'uuid': watch_uuid, 'skip_when_checksum_same': False}))
i += 1
flash(f"{i} watches queued for rechecking.")
return redirect(url_for('index', tag=tag))
@@ -1542,7 +1544,7 @@ def changedetection_app(config=None, datastore_o=None):
uuid = uuid.strip()
if datastore.data['watching'].get(uuid):
# Recheck and require a full reprocessing
update_q.put(queuedWatchMetaData.PrioritizedItem(priority=1, item={'uuid': uuid}))
update_q.put(queuedWatchMetaData.PrioritizedItem(priority=1, item={'uuid': uuid, 'skip_when_checksum_same': False}))
flash("{} watches queued for rechecking".format(len(uuids)))
elif (op == 'clear-errors'):
@@ -1866,7 +1868,7 @@ def ticker_thread_check_time_launch_checks():
f"{now - watch['last_checked']:0.2f}s since last checked")
# Into the queue with you
update_q.put(queuedWatchMetaData.PrioritizedItem(priority=priority, item={'uuid': uuid}))
update_q.put(queuedWatchMetaData.PrioritizedItem(priority=priority, item={'uuid': uuid, 'skip_when_checksum_same': True}))
# Reset for next time
watch.jitter_seconds = 0

View File

@@ -18,7 +18,6 @@ class difference_detection_processor():
screenshot = None
watch = None
xpath_data = None
preferred_proxy = None
def __init__(self, *args, datastore, watch_uuid, **kwargs):
super().__init__(*args, **kwargs)
@@ -27,8 +26,7 @@ class difference_detection_processor():
# Generic fetcher that should be extended (requests, playwright etc)
self.fetcher = Fetcher()
def call_browser(self, preferred_proxy_id=None):
def call_browser(self):
from requests.structures import CaseInsensitiveDict
# Protect against file:// access
@@ -44,7 +42,7 @@ class difference_detection_processor():
prefer_fetch_backend = self.watch.get('fetch_backend', 'system')
# Proxy ID "key"
preferred_proxy_id = preferred_proxy_id if preferred_proxy_id else self.datastore.get_preferred_proxy_for_watch(uuid=self.watch.get('uuid'))
preferred_proxy_id = self.datastore.get_preferred_proxy_for_watch(uuid=self.watch.get('uuid'))
# Pluggable content self.fetcher
if not prefer_fetch_backend or prefer_fetch_backend == 'system':
@@ -157,7 +155,7 @@ class difference_detection_processor():
# After init, call run_changedetection() which will do the actual change-detection
@abstractmethod
def run_changedetection(self, watch):
def run_changedetection(self, watch, skip_when_checksum_same: bool = True):
update_obj = {'last_notification_error': False, 'last_error': False}
some_data = 'xxxxx'
update_obj["previous_md5"] = hashlib.md5(some_data.encode('utf-8')).hexdigest()

View File

@@ -27,27 +27,22 @@ def _search_prop_by_value(matches, value):
return prop[1] # Yield the desired value and exit the function
def _deduplicate_prices(data):
import re
seen = set()
unique_data = []
'''
Some price data has multiple entries, OR it has a single entry with ['$159', '159', 159, "$ 159"] or just "159"
Get all the values, clean it and add it to a set then return the unique values
'''
unique_data = set()
# Return the complete 'datum' where its price was not seen before
for datum in data:
# Convert 'value' to float if it can be a numeric string, otherwise leave it as is
try:
normalized_value = float(datum.value) if isinstance(datum.value, str) and datum.value.replace('.', '', 1).isdigit() else datum.value
except ValueError:
normalized_value = datum.value
if isinstance(datum.value, list):
# Process each item in the list
normalized_value = set([float(re.sub(r'[^\d.]', '', str(item))) for item in datum.value])
unique_data.update(normalized_value)
else:
# Process single value
v = float(re.sub(r'[^\d.]', '', str(datum.value)))
unique_data.add(v)
return list(unique_data)
# If the normalized value hasn't been seen yet, add it to unique data
if normalized_value not in seen:
unique_data.append(datum)
seen.add(normalized_value)
return unique_data
# should return Restock()
@@ -88,13 +83,14 @@ def get_itemprop_availability(html_content) -> Restock:
if price_result:
# Right now, we just support single product items, maybe we will store the whole actual metadata seperately in teh future and
# parse that for the UI?
if len(price_result) > 1 and len(price_result) > 1:
prices_found = set(str(item.value).replace('$', '') for item in price_result)
if len(price_result) > 1 and len(prices_found) > 1:
# See of all prices are different, in the case that one product has many embedded data types with the same price
# One might have $121.95 and another 121.95 etc
logger.warning(f"More than one price found {price_result}, throwing exception, cant use this plugin.")
logger.warning(f"More than one price found {prices_found}, throwing exception, cant use this plugin.")
raise MoreThanOnePriceFound()
value['price'] = price_result[0]
value['price'] = price_result[0].value
pricecurrency_result = pricecurrency_parse.find(data)
if pricecurrency_result:
@@ -144,7 +140,7 @@ class perform_site_check(difference_detection_processor):
screenshot = None
xpath_data = None
def run_changedetection(self, watch):
def run_changedetection(self, watch, skip_when_checksum_same=True):
import hashlib
if not watch:
@@ -224,7 +220,7 @@ class perform_site_check(difference_detection_processor):
itemprop_availability['original_price'] = itemprop_availability.get('price')
update_obj['restock']["original_price"] = itemprop_availability.get('price')
if not self.fetcher.instock_data and not itemprop_availability.get('availability') and not itemprop_availability.get('price'):
if not self.fetcher.instock_data and not itemprop_availability.get('availability'):
raise ProcessorException(
message=f"Unable to extract restock data for this page unfortunately. (Got code {self.fetcher.get_last_status_code()} from server), no embedded stock information was found and nothing interesting in the text, try using this watch with Chrome.",
url=watch.get('url'),

View File

@@ -11,7 +11,10 @@ def _task(watch, update_handler):
try:
# The slow process (we run 2 of these in parallel)
changed_detected, update_obj, text_after_filter = update_handler.run_changedetection(watch=watch)
changed_detected, update_obj, text_after_filter = update_handler.run_changedetection(
watch=watch,
skip_when_checksum_same=False,
)
except FilterNotFoundInResponse as e:
text_after_filter = f"Filter not found in HTML: {str(e)}"
except ReplyWithContentButNoText as e:

View File

@@ -35,7 +35,7 @@ class PDFToHTMLToolNotFound(ValueError):
# (set_proxy_from_list)
class perform_site_check(difference_detection_processor):
def run_changedetection(self, watch):
def run_changedetection(self, watch, skip_when_checksum_same=True):
changed_detected = False
html_content = ""
screenshot = False # as bytes
@@ -58,6 +58,9 @@ class perform_site_check(difference_detection_processor):
# Watches added automatically in the queue manager will skip if its the same checksum as the previous run
# Saves a lot of CPU
update_obj['previous_md5_before_filters'] = hashlib.md5(self.fetcher.content.encode('utf-8')).hexdigest()
if skip_when_checksum_same:
if update_obj['previous_md5_before_filters'] == watch.get('previous_md5_before_filters'):
raise content_fetchers.exceptions.checksumFromPreviousCheckWasTheSame()
# Fetching complete, now filters
@@ -208,7 +211,6 @@ class perform_site_check(difference_detection_processor):
# @todo whitespace coming from missing rtrim()?
# stripped_text_from_html could be based on their preferences, replace the processed text with only that which they want to know about.
# Rewrite's the processing text based on only what diff result they want to see
if watch.has_special_diff_filter_options_set() and len(watch.history.keys()):
# Now the content comes from the diff-parser and not the returned HTTP traffic, so could be some differences
from changedetectionio import diff
@@ -331,21 +333,13 @@ class perform_site_check(difference_detection_processor):
if result:
blocked = True
# The main thing that all this at the moment comes down to :)
if watch.get('previous_md5') != fetched_md5:
changed_detected = True
# Looks like something changed, but did it match all the rules?
if blocked:
changed_detected = False
else:
# The main thing that all this at the moment comes down to :)
if watch.get('previous_md5') != fetched_md5:
changed_detected = True
# Always record the new checksum
update_obj["previous_md5"] = fetched_md5
# On the first run of a site, watch['previous_md5'] will be None, set it the current one.
if not watch.get('previous_md5'):
watch['previous_md5'] = fetched_md5
logger.debug(f"Watch UUID {watch.get('uuid')} content check - Previous MD5: {watch.get('previous_md5')}, Fetched MD5 {fetched_md5}")
@@ -365,6 +359,12 @@ class perform_site_check(difference_detection_processor):
else:
logger.debug(f"check_unique_lines: UUID {watch.get('uuid')} had unique content")
# Always record the new checksum
update_obj["previous_md5"] = fetched_md5
# On the first run of a site, watch['previous_md5'] will be None, set it the current one.
if not watch.get('previous_md5'):
watch['previous_md5'] = fetched_md5
# stripped_text_from_html - Everything after filters and NO 'ignored' content
return changed_detected, update_obj, stripped_text_from_html

View File

@@ -1,14 +1,14 @@
$(function () {
/* add container before each proxy location to show status */
var isActive = false;
function setup_html_widget() {
var option_li = $('.fetch-backend-proxy li').filter(function () {
return $("input", this)[0].value.length > 0;
});
$(option_li).prepend('<div class="proxy-status"></div>');
$(option_li).append('<div class="proxy-timing"></div><div class="proxy-check-details"></div>');
}
var option_li = $('.fetch-backend-proxy li').filter(function() {
return $("input",this)[0].value.length >0;
});
//var option_li = $('.fetch-backend-proxy li');
var isActive = false;
$(option_li).prepend('<div class="proxy-status"></div>');
$(option_li).append('<div class="proxy-timing"></div><div class="proxy-check-details"></div>');
function set_proxy_check_status(proxy_key, state) {
// select input by value name
@@ -59,14 +59,8 @@ $(function () {
}
$('#check-all-proxies').click(function (e) {
e.preventDefault()
if (!$('body').hasClass('proxy-check-active')) {
setup_html_widget();
$('body').addClass('proxy-check-active');
}
$('body').addClass('proxy-check-active');
$('.proxy-check-details').html('');
$('.proxy-status').html('<span class="spinner"></span>').fadeIn();
$('.proxy-timing').html('');

View File

@@ -26,7 +26,8 @@ function set_active_tab() {
if (tab.length) {
tab[0].parentElement.className = "active";
}
// hash could move the page down
window.scrollTo(0, 0);
}
function focus_error_tab() {

View File

@@ -25,19 +25,15 @@ ul#requests-extra_proxies {
body.proxy-check-active {
#request {
// Padding set by flex layout
/*
.proxy-status {
width: 2em;
}
*/
.proxy-check-details {
font-size: 80%;
color: #555;
display: block;
padding-left: 2em;
max-width: 500px;
padding-left: 4em;
}
.proxy-timing {

View File

@@ -147,14 +147,8 @@ body.spinner-active {
}
}
.tab-pane-inner {
// .tab-pane-inner will have the #id that the tab button jumps/anchors to
scroll-margin-top: 200px;
}
section.content {
padding-top: 100px;
padding-top: 5em;
padding-bottom: 1em;
flex-direction: column;
display: flex;
@@ -937,7 +931,6 @@ $form-edge-padding: 20px;
}
.tab-pane-inner {
&:not(:target) {
display: none;
}

View File

@@ -119,22 +119,19 @@ ul#requests-extra_proxies {
#request label[for=proxy] {
display: inline-block; }
body.proxy-check-active #request {
/*
.proxy-status {
width: 2em;
}
*/ }
body.proxy-check-active #request .proxy-check-details {
font-size: 80%;
color: #555;
display: block;
padding-left: 2em;
max-width: 500px; }
body.proxy-check-active #request .proxy-timing {
font-size: 80%;
padding-left: 1rem;
color: var(--color-link); }
body.proxy-check-active #request .proxy-status {
width: 2em; }
body.proxy-check-active #request .proxy-check-details {
font-size: 80%;
color: #555;
display: block;
padding-left: 4em; }
body.proxy-check-active #request .proxy-timing {
font-size: 80%;
padding-left: 1rem;
color: var(--color-link); }
#recommended-proxy {
display: grid;
@@ -605,11 +602,8 @@ body.spinner-active #pure-menu-horizontal-spinner {
background-color: var(--color-background-menu-link-hover);
color: var(--color-text-menu-link-hover); }
.tab-pane-inner {
scroll-margin-top: 200px; }
section.content {
padding-top: 100px;
padding-top: 5em;
padding-bottom: 1em;
flex-direction: column;
display: flex;

View File

@@ -1,6 +0,0 @@
# A list of real world examples!
Always the price should be 666.66 for our tests
see test_restock_itemprop.py::test_special_prop_examples

View File

@@ -1,25 +0,0 @@
<div class="PriceSection PriceSection_PriceSection__Vx1_Q PriceSection_variantHuge__P9qxg PdpPriceSection"
data-testid="price-section"
data-optly-product-tile-price-section="true"><span
class="PriceRange ProductPrice variant-huge" itemprop="offers"
itemscope="" itemtype="http://schema.org/Offer"><div
class="VisuallyHidden_VisuallyHidden__VBD83">$155.55</div><span
aria-hidden="true" class="Price variant-huge" data-testid="price"
itemprop="price"><sup class="sup" data-testid="price-symbol"
itemprop="priceCurrency" content="AUD">$</sup><span
class="dollars" data-testid="price-value" itemprop="price"
content="155.55">155.55</span><span class="extras"><span class="sup"
data-testid="price-sup"></span></span></span></span>
</div>
<script type="application/ld+json">{
"@type": "Product",
"@context": "https://schema.org",
"name": "test",
"description": "test",
"offers": {
"@type": "Offer",
"priceCurrency": "AUD",
"price": 155.55
},
}</script>

View File

@@ -16,4 +16,4 @@ def test_check_basic_change_detection_functionality(client, live_server, measure
)
assert b"1 Imported" in res.data
wait_for_all_checks(client)
time.sleep(3)

View File

@@ -1,8 +1,7 @@
#!/usr/bin/env python3
import json
import os
from flask import url_for
from changedetectionio.tests.util import live_server_setup, wait_for_all_checks, extract_UUID_from_client
from changedetectionio.tests.util import live_server_setup, wait_for_all_checks
def set_response():
@@ -19,6 +18,7 @@ def set_response():
f.write(data)
time.sleep(1)
def test_socks5(client, live_server, measure_memory_usage):
live_server_setup(live_server)
set_response()
@@ -79,24 +79,3 @@ def test_socks5(client, live_server, measure_memory_usage):
# Should see the proper string
assert "Awesome, you made it".encode('utf-8') in res.data
# PROXY CHECKER WIDGET CHECK - this needs more checking
uuid = extract_UUID_from_client(client)
res = client.get(
url_for("check_proxies.start_check", uuid=uuid),
follow_redirects=True
)
# It's probably already finished super fast :(
#assert b"RUNNING" in res.data
wait_for_all_checks(client)
res = client.get(
url_for("check_proxies.get_recheck_status", uuid=uuid),
follow_redirects=True
)
assert b"OK" in res.data
res = client.get(url_for("form_delete", uuid="all"), follow_redirects=True)
assert b'Deleted' in res.data

View File

@@ -77,8 +77,6 @@ def test_check_removed_line_contains_trigger(client, live_server, measure_memory
# The trigger line is REMOVED, this should trigger
set_original(excluding='The golden line')
# Check in the processor here what's going on, its triggering empty-reply and no change.
client.get(url_for("form_watch_checknow"), follow_redirects=True)
wait_for_all_checks(client)
res = client.get(url_for("index"))
@@ -153,6 +151,7 @@ def test_check_add_line_contains_trigger(client, live_server, measure_memory_usa
# A line thats not the trigger should not trigger anything
res = client.get(url_for("form_watch_checknow"), follow_redirects=True)
assert b'1 watches queued for rechecking.' in res.data
wait_for_all_checks(client)
@@ -174,5 +173,6 @@ def test_check_add_line_contains_trigger(client, live_server, measure_memory_usa
assert b'-Oh yes please-' in response
assert '网站监测 内容更新了'.encode('utf-8') in response
res = client.get(url_for("form_delete", uuid="all"), follow_redirects=True)
assert b'Deleted' in res.data

View File

@@ -65,8 +65,11 @@ def test_check_block_changedetection_text_NOT_present(client, live_server, measu
live_server_setup(live_server)
# Use a mix of case in ZzZ to prove it works case-insensitive.
ignore_text = "out of stoCk\r\nfoobar"
set_original_ignore_response()
# Give the endpoint time to spin up
time.sleep(1)
# Add our URL to the import page
test_url = url_for('test_endpoint', _external=True)
@@ -124,24 +127,13 @@ def test_check_block_changedetection_text_NOT_present(client, live_server, measu
assert b'unviewed' not in res.data
assert b'/test-endpoint' in res.data
# 2548
# Going back to the ORIGINAL should NOT trigger a change
set_original_ignore_response()
client.get(url_for("form_watch_checknow"), follow_redirects=True)
wait_for_all_checks(client)
res = client.get(url_for("index"))
assert b'unviewed' not in res.data
# Now we set a change where the text is gone AND its different content, it should now trigger
# Now we set a change where the text is gone, it should now trigger
set_modified_response_minus_block_text()
client.get(url_for("form_watch_checknow"), follow_redirects=True)
wait_for_all_checks(client)
res = client.get(url_for("index"))
assert b'unviewed' in res.data
res = client.get(url_for("form_delete", uuid="all"), follow_redirects=True)
assert b'Deleted' in res.data

View File

@@ -5,7 +5,7 @@ import time
from flask import url_for
from ..html_tools import *
from .util import live_server_setup, wait_for_all_checks
from .util import live_server_setup
def test_setup(live_server):
@@ -119,10 +119,12 @@ across multiple lines
def test_element_removal_full(client, live_server, measure_memory_usage):
#live_server_setup(live_server)
sleep_time_for_fetch_thread = 3
set_original_response()
# Give the endpoint time to spin up
time.sleep(1)
# Add our URL to the import page
test_url = url_for("test_endpoint", _external=True)
@@ -130,8 +132,7 @@ def test_element_removal_full(client, live_server, measure_memory_usage):
url_for("import_page"), data={"urls": test_url}, follow_redirects=True
)
assert b"1 Imported" in res.data
wait_for_all_checks(client)
time.sleep(1)
# Goto the edit page, add the filter data
# Not sure why \r needs to be added - absent of the #changetext this is not necessary
subtractive_selectors_data = "header\r\nfooter\r\nnav\r\n#changetext"
@@ -147,7 +148,6 @@ def test_element_removal_full(client, live_server, measure_memory_usage):
follow_redirects=True,
)
assert b"Updated watch." in res.data
wait_for_all_checks(client)
# Check it saved
res = client.get(
@@ -156,10 +156,10 @@ def test_element_removal_full(client, live_server, measure_memory_usage):
assert bytes(subtractive_selectors_data.encode("utf-8")) in res.data
# Trigger a check
res = client.get(url_for("form_watch_checknow"), follow_redirects=True)
assert b'1 watches queued for rechecking.' in res.data
client.get(url_for("form_watch_checknow"), follow_redirects=True)
wait_for_all_checks(client)
# Give the thread time to pick it up
time.sleep(sleep_time_for_fetch_thread)
# so that we set the state to 'unviewed' after all the edits
client.get(url_for("diff_history_page", uuid="first"))
@@ -168,11 +168,10 @@ def test_element_removal_full(client, live_server, measure_memory_usage):
set_modified_response()
# Trigger a check
res = client.get(url_for("form_watch_checknow"), follow_redirects=True)
assert b'1 watches queued for rechecking.' in res.data
client.get(url_for("form_watch_checknow"), follow_redirects=True)
# Give the thread time to pick it up
wait_for_all_checks(client)
time.sleep(sleep_time_for_fetch_thread)
# There should not be an unviewed change, as changes should be removed
res = client.get(url_for("index"))

View File

@@ -1,78 +0,0 @@
#!/usr/bin/env python3
from flask import url_for
from changedetectionio.tests.util import live_server_setup, wait_for_all_checks, extract_UUID_from_client
def set_response():
data = f"""<html>
<body>Awesome, you made it<br>
yeah the socks request worked<br>
something to ignore<br>
something to trigger<br>
</body>
</html>
"""
with open("test-datastore/endpoint-content.txt", "w") as f:
f.write(data)
def test_content_filter_live_preview(client, live_server, measure_memory_usage):
live_server_setup(live_server)
set_response()
test_url = url_for('test_endpoint', _external=True)
res = client.post(
url_for("form_quick_watch_add"),
data={"url": test_url, "tags": ''},
follow_redirects=True
)
uuid = extract_UUID_from_client(client)
res = client.post(
url_for("edit_page", uuid=uuid),
data={
"include_filters": "",
"fetch_backend": 'html_requests',
"ignore_text": "something to ignore",
"trigger_text": "something to trigger",
"url": test_url,
},
follow_redirects=True
)
assert b"Updated watch." in res.data
wait_for_all_checks(client)
# The endpoint is a POST and accepts the form values to override the watch preview
import json
# DEFAULT OUTPUT WITHOUT ANYTHING UPDATED/CHANGED - SHOULD SEE THE WATCH DEFAULTS
res = client.post(
url_for("watch_get_preview_rendered", uuid=uuid)
)
default_return = json.loads(res.data.decode('utf-8'))
assert default_return.get('after_filter')
assert default_return.get('before_filter')
assert default_return.get('ignore_line_numbers') == [3] # "something to ignore" line 3
assert default_return.get('trigger_line_numbers') == [4] # "something to trigger" line 4
# SEND AN UPDATE AND WE SHOULD SEE THE OUTPUT CHANGE SO WE KNOW TO HIGHLIGHT NEW STUFF
res = client.post(
url_for("watch_get_preview_rendered", uuid=uuid),
data={
"include_filters": "",
"fetch_backend": 'html_requests',
"ignore_text": "sOckS", # Also be sure case insensitive works
"trigger_text": "AweSOme",
"url": test_url,
},
)
reply = json.loads(res.data.decode('utf-8'))
assert reply.get('after_filter')
assert reply.get('before_filter')
assert reply.get('ignore_line_numbers') == [2] # Ignored - "socks" on line 2
assert reply.get('trigger_line_numbers') == [1] # Triggers "Awesome" in line 1
res = client.get(url_for("form_delete", uuid="all"), follow_redirects=True)
assert b'Deleted' in res.data

View File

@@ -1,72 +0,0 @@
#!/usr/bin/env python3
import time
from flask import url_for
from .util import set_original_response, set_modified_response, live_server_setup, wait_for_all_checks
# `subtractive_selectors` should still work in `source:` type requests
def test_fetch_pdf(client, live_server, measure_memory_usage):
import shutil
shutil.copy("tests/test.pdf", "test-datastore/endpoint-test.pdf")
live_server_setup(live_server)
test_url = url_for('test_pdf_endpoint', _external=True)
# Add our URL to the import page
res = client.post(
url_for("import_page"),
data={"urls": test_url},
follow_redirects=True
)
assert b"1 Imported" in res.data
wait_for_all_checks(client)
res = client.get(
url_for("preview_page", uuid="first"),
follow_redirects=True
)
# PDF header should not be there (it was converted to text)
assert b'PDF' not in res.data[:10]
assert b'hello world' in res.data
# So we know if the file changes in other ways
import hashlib
original_md5 = hashlib.md5(open("test-datastore/endpoint-test.pdf", 'rb').read()).hexdigest().upper()
# We should have one
assert len(original_md5) > 0
# And it's going to be in the document
assert b'Document checksum - ' + bytes(str(original_md5).encode('utf-8')) in res.data
shutil.copy("tests/test2.pdf", "test-datastore/endpoint-test.pdf")
changed_md5 = hashlib.md5(open("test-datastore/endpoint-test.pdf", 'rb').read()).hexdigest().upper()
res = client.get(url_for("form_watch_checknow"), follow_redirects=True)
assert b'1 watches queued for rechecking.' in res.data
wait_for_all_checks(client)
# Now something should be ready, indicated by having a 'unviewed' class
res = client.get(url_for("index"))
assert b'unviewed' in res.data
# The original checksum should be not be here anymore (cdio adds it to the bottom of the text)
res = client.get(
url_for("preview_page", uuid="first"),
follow_redirects=True
)
assert original_md5.encode('utf-8') not in res.data
assert changed_md5.encode('utf-8') in res.data
res = client.get(
url_for("diff_history_page", uuid="first"),
follow_redirects=True
)
assert original_md5.encode('utf-8') in res.data
assert changed_md5.encode('utf-8') in res.data
assert b'here is a change' in res.data

View File

@@ -3,7 +3,7 @@ import os
import time
from flask import url_for
from .util import live_server_setup, wait_for_all_checks, wait_for_notification_endpoint_output
from .util import live_server_setup, wait_for_all_checks, extract_UUID_from_client, wait_for_notification_endpoint_output
from ..notification import default_notification_format
instock_props = [
@@ -413,31 +413,3 @@ def test_data_sanity(client, live_server):
res = client.get(
url_for("edit_page", uuid="first"))
assert test_url2.encode('utf-8') in res.data
res = client.get(url_for("form_delete", uuid="all"), follow_redirects=True)
assert b'Deleted' in res.data
# All examples should give a prive of 666.66
def test_special_prop_examples(client, live_server):
import glob
#live_server_setup(live_server)
test_url = url_for('test_endpoint', _external=True)
check_path = os.path.join(os.path.dirname(__file__), "itemprop_test_examples", "*.txt")
files = glob.glob(check_path)
assert files
for test_example_filename in files:
with open(test_example_filename, 'r') as example_f:
with open("test-datastore/endpoint-content.txt", "w") as test_f:
test_f.write(f"<html><body>{example_f.read()}</body></html>")
# Now fetch it and check the price worked
client.post(
url_for("form_quick_watch_add"),
data={"url": test_url, "tags": 'restock tests', 'processor': 'restock_diff'},
follow_redirects=True
)
wait_for_all_checks(client)
res = client.get(url_for("index"))
assert b'ception' not in res.data
assert b'155.55' in res.data

View File

@@ -260,6 +260,9 @@ class update_worker(threading.Thread):
try:
# Processor is what we are using for detecting the "Change"
processor = watch.get('processor', 'text_json_diff')
# Abort processing when the content was the same as the last fetch
skip_when_same_checksum = queued_item_data.item.get('skip_when_checksum_same')
# Init a new 'difference_detection_processor', first look in processors
processor_module_name = f"changedetectionio.processors.{processor}.processor"
@@ -275,7 +278,10 @@ class update_worker(threading.Thread):
update_handler.call_browser()
changed_detected, update_obj, contents = update_handler.run_changedetection(watch=watch)
changed_detected, update_obj, contents = update_handler.run_changedetection(
watch=watch,
skip_when_checksum_same=skip_when_same_checksum,
)
# Re #342
# In Python 3, all strings are sequences of Unicode characters. There is a bytes type that holds raw bytes.