diff --git a/changedetectionio/forms.py b/changedetectionio/forms.py
index 28fdfeb9..19056b5d 100644
--- a/changedetectionio/forms.py
+++ b/changedetectionio/forms.py
@@ -476,7 +476,7 @@ class processor_text_json_diff_form(commonSettingsForm):
title = StringField('Title', default='')
- ignore_text = StringListField('Remove lines containing', [ValidateListRegex()])
+ ignore_text = StringListField('Ignore lines containing', [ValidateListRegex()])
headers = StringDictKeyValue('Request headers')
body = TextAreaField('Request body', [validators.Optional()])
method = SelectField('Request method', choices=valid_method, default=default_method)
diff --git a/changedetectionio/html_tools.py b/changedetectionio/html_tools.py
index 7c2e1eba..6e4ebca0 100644
--- a/changedetectionio/html_tools.py
+++ b/changedetectionio/html_tools.py
@@ -3,11 +3,11 @@ from lxml import etree
import json
import re
-
# HTML added to be sure each result matching a filter (.example) gets converted to a new line by Inscriptis
 TEXT_FILTER_LIST_LINE_SUFFIX = "<br>"
-
+TRANSLATE_WHITESPACE_TABLE = str.maketrans('', '', '\r\n\t ')
PERL_STYLE_REGEX = r'^/(.*?)/([a-z]*)?$'
+
# 'price' , 'lowPrice', 'highPrice' are usually under here
# All of those may or may not appear on different websites - I didnt find a way todo case-insensitive searching here
LD_JSON_PRODUCT_OFFER_SELECTORS = ["json:$..offers", "json:$..Offers"]
@@ -326,6 +326,7 @@ def extract_json_as_string(content, json_filter, ensure_is_ldjson_info_type=None
# - "line numbers" return a list of line numbers that match (int list)
#
# wordlist - list of regex's (str) or words (str)
+# Preserves all linefeeds and other whitespace; it's not the job of this function to remove them
def strip_ignore_text(content, wordlist, mode="content"):
i = 0
output = []
@@ -341,32 +342,30 @@ def strip_ignore_text(content, wordlist, mode="content"):
else:
ignore_text.append(k.strip())
- for line in content.splitlines():
+ for line in content.splitlines(keepends=True):
i += 1
# Always ignore blank lines in this mode. (when this function gets called)
got_match = False
- if len(line.strip()):
- for l in ignore_text:
- if l.lower() in line.lower():
+ for l in ignore_text:
+ if l.lower() in line.lower():
+ got_match = True
+
+ if not got_match:
+ for r in ignore_regex:
+ if r.search(line):
got_match = True
- if not got_match:
- for r in ignore_regex:
- if r.search(line):
- got_match = True
-
- if not got_match:
- # Not ignored
- output.append(line.encode('utf8'))
- else:
- ignored_line_numbers.append(i)
-
+ if not got_match:
+ # Not ignored, and should preserve "keepends"
+ output.append(line)
+ else:
+ ignored_line_numbers.append(i)
# Used for finding out what to highlight
if mode == "line numbers":
return ignored_line_numbers
- return "\n".encode('utf8').join(output)
+ return ''.join(output)
def cdata_in_document_to_text(html_content: str, render_anchor_tag_content=False) -> str:
from xml.sax.saxutils import escape as xml_escape
diff --git a/changedetectionio/model/Watch.py b/changedetectionio/model/Watch.py
index b52d37fb..a2e38ce1 100644
--- a/changedetectionio/model/Watch.py
+++ b/changedetectionio/model/Watch.py
@@ -6,6 +6,8 @@ import re
from pathlib import Path
from loguru import logger
+from ..html_tools import TRANSLATE_WHITESPACE_TABLE
+
# Allowable protocols, protects against javascript: etc
# file:// is further checked by ALLOW_FILE_URI
SAFE_PROTOCOL_REGEX='^(http|https|ftp|file):'
@@ -312,13 +314,13 @@ class model(watch_base):
dest = os.path.join(self.watch_data_dir, snapshot_fname)
if not os.path.exists(dest):
with open(dest, 'wb') as f:
- f.write(brotli.compress(contents, mode=brotli.MODE_TEXT))
+ f.write(brotli.compress(contents.encode('utf-8'), mode=brotli.MODE_TEXT))
else:
snapshot_fname = f"{snapshot_id}.txt"
dest = os.path.join(self.watch_data_dir, snapshot_fname)
if not os.path.exists(dest):
with open(dest, 'wb') as f:
- f.write(contents)
+ f.write(contents.encode('utf-8'))
# Append to index
# @todo check last char was \n
@@ -350,14 +352,32 @@ class model(watch_base):
return seconds
# Iterate over all history texts and see if something new exists
- def lines_contain_something_unique_compared_to_history(self, lines: list):
- local_lines = set([l.decode('utf-8').strip().lower() for l in lines])
+    # Always applies .strip() to start/end, but optionally replaces any other whitespace too
+ def lines_contain_something_unique_compared_to_history(self, lines: list, ignore_whitespace=False):
+ local_lines = []
+ if lines:
+ if ignore_whitespace:
+ if isinstance(lines[0], str): # Can be either str or bytes depending on what was on the disk
+ local_lines = set([l.translate(TRANSLATE_WHITESPACE_TABLE).lower() for l in lines])
+ else:
+ local_lines = set([l.decode('utf-8').translate(TRANSLATE_WHITESPACE_TABLE).lower() for l in lines])
+ else:
+ if isinstance(lines[0], str): # Can be either str or bytes depending on what was on the disk
+ local_lines = set([l.strip().lower() for l in lines])
+ else:
+ local_lines = set([l.decode('utf-8').strip().lower() for l in lines])
+
# Compare each lines (set) against each history text file (set) looking for something new..
existing_history = set({})
for k, v in self.history.items():
content = self.get_history_snapshot(k)
- alist = set([line.strip().lower() for line in content.splitlines()])
+
+ if ignore_whitespace:
+ alist = set([line.translate(TRANSLATE_WHITESPACE_TABLE).lower() for line in content.splitlines()])
+ else:
+ alist = set([line.strip().lower() for line in content.splitlines()])
+
existing_history = existing_history.union(alist)
# Check that everything in local_lines(new stuff) already exists in existing_history - it should
diff --git a/changedetectionio/processors/restock_diff/processor.py b/changedetectionio/processors/restock_diff/processor.py
index 0f490221..911e1838 100644
--- a/changedetectionio/processors/restock_diff/processor.py
+++ b/changedetectionio/processors/restock_diff/processor.py
@@ -307,4 +307,4 @@ class perform_site_check(difference_detection_processor):
# Always record the new checksum
update_obj["previous_md5"] = fetched_md5
- return changed_detected, update_obj, snapshot_content.encode('utf-8').strip()
+ return changed_detected, update_obj, snapshot_content.strip()
diff --git a/changedetectionio/processors/text_json_diff/__init__.py b/changedetectionio/processors/text_json_diff/__init__.py
index f87aa350..6a5efad9 100644
--- a/changedetectionio/processors/text_json_diff/__init__.py
+++ b/changedetectionio/processors/text_json_diff/__init__.py
@@ -46,6 +46,9 @@ def prepare_filter_prevew(datastore, watch_uuid):
text_after_filter = ''
text_before_filter = ''
+ trigger_line_numbers = []
+ ignore_line_numbers = []
+
tmp_watch = deepcopy(datastore.data['watching'].get(watch_uuid))
if tmp_watch and tmp_watch.history and os.path.isdir(tmp_watch.watch_data_dir):
@@ -72,7 +75,7 @@ def prepare_filter_prevew(datastore, watch_uuid):
)
# Use the last loaded HTML as the input
update_handler.datastore = datastore
- update_handler.fetcher.content = decompressed_data
+ update_handler.fetcher.content = str(decompressed_data) # str() because playwright/puppeteer/requests return string
update_handler.fetcher.headers['content-type'] = tmp_watch.get('content-type')
# Process our watch with filters and the HTML from disk, and also a blank watch with no filters but also with the same HTML from disk
@@ -84,9 +87,7 @@ def prepare_filter_prevew(datastore, watch_uuid):
text_after_filter = future1.result()
text_before_filter = future2.result()
- trigger_line_numbers = []
try:
-
trigger_line_numbers = html_tools.strip_ignore_text(content=text_after_filter,
wordlist=tmp_watch['trigger_text'],
mode='line numbers'
@@ -94,6 +95,15 @@ def prepare_filter_prevew(datastore, watch_uuid):
except Exception as e:
text_before_filter = f"Error: {str(e)}"
+ try:
+ text_to_ignore = tmp_watch.get('ignore_text', []) + datastore.data['settings']['application'].get('global_ignore_text', [])
+ ignore_line_numbers = html_tools.strip_ignore_text(content=text_after_filter,
+ wordlist=text_to_ignore,
+ mode='line numbers'
+ )
+ except Exception as e:
+ text_before_filter = f"Error: {str(e)}"
+
logger.trace(f"Parsed in {time.time() - now:.3f}s")
return jsonify(
@@ -102,6 +112,7 @@ def prepare_filter_prevew(datastore, watch_uuid):
'before_filter': text_before_filter.decode('utf-8') if isinstance(text_before_filter, bytes) else text_before_filter,
'duration': time.time() - now,
'trigger_line_numbers': trigger_line_numbers,
+ 'ignore_line_numbers': ignore_line_numbers,
}
)
diff --git a/changedetectionio/processors/text_json_diff/processor.py b/changedetectionio/processors/text_json_diff/processor.py
index 43feb05f..c3752956 100644
--- a/changedetectionio/processors/text_json_diff/processor.py
+++ b/changedetectionio/processors/text_json_diff/processor.py
@@ -7,7 +7,7 @@ import re
import urllib3
from changedetectionio.processors import difference_detection_processor
-from changedetectionio.html_tools import PERL_STYLE_REGEX, cdata_in_document_to_text
+from changedetectionio.html_tools import PERL_STYLE_REGEX, cdata_in_document_to_text, TRANSLATE_WHITESPACE_TABLE
from changedetectionio import html_tools, content_fetchers
from changedetectionio.blueprint.price_data_follower import PRICE_DATA_TRACK_ACCEPT, PRICE_DATA_TRACK_REJECT
from loguru import logger
@@ -36,7 +36,6 @@ class PDFToHTMLToolNotFound(ValueError):
class perform_site_check(difference_detection_processor):
def run_changedetection(self, watch, skip_when_checksum_same=True):
-
changed_detected = False
html_content = ""
screenshot = False # as bytes
@@ -205,18 +204,9 @@ class perform_site_check(difference_detection_processor):
if watch.get('trim_text_whitespace'):
stripped_text_from_html = '\n'.join(line.strip() for line in stripped_text_from_html.replace("\n\n", "\n").splitlines())
- if watch.get('remove_duplicate_lines'):
- stripped_text_from_html = '\n'.join(dict.fromkeys(line for line in stripped_text_from_html.replace("\n\n", "\n").splitlines()))
-
- if watch.get('sort_text_alphabetically'):
-    # Note: Because a <p>something</p> will add an extra line feed to signify the paragraph gap
- # we end up with 'Some text\n\n', sorting will add all those extra \n at the start, so we remove them here.
- stripped_text_from_html = stripped_text_from_html.replace("\n\n", "\n")
- stripped_text_from_html = '\n'.join(sorted(stripped_text_from_html.splitlines(), key=lambda x: x.lower()))
-
# Re #340 - return the content before the 'ignore text' was applied
# Also used to calculate/show what was removed
- text_content_before_ignored_filter = stripped_text_from_html.encode('utf-8')
+ text_content_before_ignored_filter = stripped_text_from_html
# @todo whitespace coming from missing rtrim()?
# stripped_text_from_html could be based on their preferences, replace the processed text with only that which they want to know about.
@@ -236,12 +226,12 @@ class perform_site_check(difference_detection_processor):
line_feed_sep="\n",
include_change_type_prefix=False)
- watch.save_last_text_fetched_before_filters(text_content_before_ignored_filter)
+ watch.save_last_text_fetched_before_filters(text_content_before_ignored_filter.encode('utf-8'))
if not rendered_diff and stripped_text_from_html:
# We had some content, but no differences were found
# Store our new file as the MD5 so it will trigger in the future
- c = hashlib.md5(stripped_text_from_html.encode('utf-8').translate(None, b'\r\n\t ')).hexdigest()
+ c = hashlib.md5(stripped_text_from_html.translate(TRANSLATE_WHITESPACE_TABLE).encode('utf-8')).hexdigest()
return False, {'previous_md5': c}, stripped_text_from_html.encode('utf-8')
else:
stripped_text_from_html = rendered_diff
@@ -262,14 +252,6 @@ class perform_site_check(difference_detection_processor):
update_obj["last_check_status"] = self.fetcher.get_last_status_code()
- # If there's text to skip
- # @todo we could abstract out the get_text() to handle this cleaner
- text_to_ignore = watch.get('ignore_text', []) + self.datastore.data['settings']['application'].get('global_ignore_text', [])
- if len(text_to_ignore):
- stripped_text_from_html = html_tools.strip_ignore_text(stripped_text_from_html, text_to_ignore)
- else:
- stripped_text_from_html = stripped_text_from_html.encode('utf8')
-
# 615 Extract text by regex
extract_text = watch.get('extract_text', [])
if len(extract_text) > 0:
@@ -278,39 +260,53 @@ class perform_site_check(difference_detection_processor):
# incase they specified something in '/.../x'
if re.search(PERL_STYLE_REGEX, s_re, re.IGNORECASE):
regex = html_tools.perl_style_slash_enclosed_regex_to_options(s_re)
- result = re.findall(regex.encode('utf-8'), stripped_text_from_html)
+ result = re.findall(regex, stripped_text_from_html)
for l in result:
if type(l) is tuple:
# @todo - some formatter option default (between groups)
- regex_matched_output += list(l) + [b'\n']
+ regex_matched_output += list(l) + ['\n']
else:
# @todo - some formatter option default (between each ungrouped result)
- regex_matched_output += [l] + [b'\n']
+ regex_matched_output += [l] + ['\n']
else:
# Doesnt look like regex, just hunt for plaintext and return that which matches
# `stripped_text_from_html` will be bytes, so we must encode s_re also to bytes
- r = re.compile(re.escape(s_re.encode('utf-8')), re.IGNORECASE)
+ r = re.compile(re.escape(s_re), re.IGNORECASE)
res = r.findall(stripped_text_from_html)
if res:
for match in res:
- regex_matched_output += [match] + [b'\n']
+ regex_matched_output += [match] + ['\n']
##########################################################
- stripped_text_from_html = b''
- text_content_before_ignored_filter = b''
+ stripped_text_from_html = ''
+
if regex_matched_output:
# @todo some formatter for presentation?
- stripped_text_from_html = b''.join(regex_matched_output)
- text_content_before_ignored_filter = stripped_text_from_html
+ stripped_text_from_html = ''.join(regex_matched_output)
+
+ if watch.get('remove_duplicate_lines'):
+ stripped_text_from_html = '\n'.join(dict.fromkeys(line for line in stripped_text_from_html.replace("\n\n", "\n").splitlines()))
+ if watch.get('sort_text_alphabetically'):
+    # Note: Because a <p>something</p> will add an extra line feed to signify the paragraph gap
+ # we end up with 'Some text\n\n', sorting will add all those extra \n at the start, so we remove them here.
+ stripped_text_from_html = stripped_text_from_html.replace("\n\n", "\n")
+ stripped_text_from_html = '\n'.join(sorted(stripped_text_from_html.splitlines(), key=lambda x: x.lower()))
+
+### CALCULATE MD5
+ # If there's text to ignore
+ text_to_ignore = watch.get('ignore_text', []) + self.datastore.data['settings']['application'].get('global_ignore_text', [])
+ text_for_checksuming = stripped_text_from_html
+ if text_to_ignore:
+ text_for_checksuming = html_tools.strip_ignore_text(stripped_text_from_html, text_to_ignore)
# Re #133 - if we should strip whitespaces from triggering the change detected comparison
- if self.datastore.data['settings']['application'].get('ignore_whitespace', False):
- fetched_md5 = hashlib.md5(stripped_text_from_html.translate(None, b'\r\n\t ')).hexdigest()
+ if text_for_checksuming and self.datastore.data['settings']['application'].get('ignore_whitespace', False):
+ fetched_md5 = hashlib.md5(text_for_checksuming.translate(TRANSLATE_WHITESPACE_TABLE).encode('utf-8')).hexdigest()
else:
- fetched_md5 = hashlib.md5(stripped_text_from_html).hexdigest()
+ fetched_md5 = hashlib.md5(text_for_checksuming.encode('utf-8')).hexdigest()
############ Blocking rules, after checksum #################
blocked = False
@@ -350,7 +346,13 @@ class perform_site_check(difference_detection_processor):
if changed_detected:
if watch.get('check_unique_lines', False):
- has_unique_lines = watch.lines_contain_something_unique_compared_to_history(lines=stripped_text_from_html.splitlines())
+ ignore_whitespace = self.datastore.data['settings']['application'].get('ignore_whitespace')
+
+ has_unique_lines = watch.lines_contain_something_unique_compared_to_history(
+ lines=stripped_text_from_html.splitlines(),
+ ignore_whitespace=ignore_whitespace
+ )
+
# One or more lines? unsure?
if not has_unique_lines:
logger.debug(f"check_unique_lines: UUID {watch.get('uuid')} didnt have anything new setting change_detected=False")
diff --git a/changedetectionio/static/js/watch-settings.js b/changedetectionio/static/js/watch-settings.js
index f3360dbe..cb9f9c60 100644
--- a/changedetectionio/static/js/watch-settings.js
+++ b/changedetectionio/static/js/watch-settings.js
@@ -42,8 +42,12 @@ function request_textpreview_update() {
{
'color': '#ee0000',
'lines': data['trigger_line_numbers']
+ },
+ {
+ 'color': '#757575',
+ 'lines': data['ignore_line_numbers']
}
- ]);
+ ])
}).fail(function (error) {
if (error.statusText === 'abort') {
console.log('Request was aborted due to a new request being fired.');
@@ -76,8 +80,8 @@ $(document).ready(function () {
$('body').toggleClass('preview-text-enabled')
request_textpreview_update();
const method = $('body').hasClass('preview-text-enabled') ? 'on' : 'off';
- $('textarea:visible')[method]('keyup blur', request_textpreview_update.throttle(1000));
- $('input:visible')[method]('keyup blur change', request_textpreview_update.throttle(1000));
+ $('#filters-and-triggers textarea')[method]('blur', request_textpreview_update.throttle(1000));
+ $('#filters-and-triggers input')[method]('change', request_textpreview_update.throttle(1000));
$("#filters-and-triggers-tab")[method]('click', request_textpreview_update.throttle(1000));
});
$('.minitabs-wrapper').miniTabs({
diff --git a/changedetectionio/store.py b/changedetectionio/store.py
index cc1b335f..697da5bc 100644
--- a/changedetectionio/store.py
+++ b/changedetectionio/store.py
@@ -4,6 +4,7 @@ from flask import (
flash
)
+from .html_tools import TRANSLATE_WHITESPACE_TABLE
from . model import App, Watch
from copy import deepcopy, copy
from os import path, unlink
@@ -750,17 +751,17 @@ class ChangeDetectionStore:
def update_5(self):
# If the watch notification body, title look the same as the global one, unset it, so the watch defaults back to using the main settings
# In other words - the watch notification_title and notification_body are not needed if they are the same as the default one
- current_system_body = self.data['settings']['application']['notification_body'].translate(str.maketrans('', '', "\r\n "))
- current_system_title = self.data['settings']['application']['notification_body'].translate(str.maketrans('', '', "\r\n "))
+ current_system_body = self.data['settings']['application']['notification_body'].translate(TRANSLATE_WHITESPACE_TABLE)
+ current_system_title = self.data['settings']['application']['notification_body'].translate(TRANSLATE_WHITESPACE_TABLE)
for uuid, watch in self.data['watching'].items():
try:
watch_body = watch.get('notification_body', '')
- if watch_body and watch_body.translate(str.maketrans('', '', "\r\n ")) == current_system_body:
+ if watch_body and watch_body.translate(TRANSLATE_WHITESPACE_TABLE) == current_system_body:
# Looks the same as the default one, so unset it
watch['notification_body'] = None
watch_title = watch.get('notification_title', '')
- if watch_title and watch_title.translate(str.maketrans('', '', "\r\n ")) == current_system_title:
+ if watch_title and watch_title.translate(TRANSLATE_WHITESPACE_TABLE) == current_system_title:
# Looks the same as the default one, so unset it
watch['notification_title'] = None
except Exception as e:
diff --git a/changedetectionio/templates/edit.html b/changedetectionio/templates/edit.html
index 9d9f48ff..5847962f 100644
--- a/changedetectionio/templates/edit.html
+++ b/changedetectionio/templates/edit.html
@@ -26,7 +26,6 @@
-
{% if playwright_enabled %}
@@ -330,9 +329,9 @@ nav
{{ render_checkbox_field(form.filter_text_added) }}
{{ render_checkbox_field(form.filter_text_replaced) }}
{{ render_checkbox_field(form.filter_text_removed) }}
- Note: Depending on the length and similarity of the text on each line, the algorithm may consider an addition instead of replacement for example.
- So it's always better to select Added+Replaced when you're interested in new content.
- When content is merely moved in a list, it will also trigger an addition, consider enabling Only trigger when unique lines appear
+ Note: Depending on the length and similarity of the text on each line, the algorithm may consider an addition instead of replacement for example.
+ So it's always better to select Added+Replaced when you're interested in new content.
+ When content is merely moved in a list, it will also trigger an addition, consider enabling Only trigger when unique lines appear