mirror of
https://github.com/dgtlmoon/changedetection.io.git
synced 2025-11-12 20:46:17 +00:00
Text filters - Adding filters "Trim whitespace" and "Remove duplicate lines"
This commit is contained in:
@@ -218,11 +218,19 @@ class perform_site_check(difference_detection_processor):
|
||||
is_rss=is_rss)) #1874 activate the <title workaround hack
|
||||
stripped_text_from_html = future.result()
|
||||
|
||||
if watch.get('sort_text_alphabetically') and stripped_text_from_html:
|
||||
|
||||
if watch.get('trim_text_whitespace'):
|
||||
stripped_text_from_html = '\n'.join(line.strip() for line in stripped_text_from_html.replace("\n\n", "\n").splitlines())
|
||||
|
||||
if watch.get('remove_duplicate_lines'):
|
||||
stripped_text_from_html = '\n'.join(dict.fromkeys(line.strip() for line in stripped_text_from_html.replace("\n\n", "\n").splitlines()))
|
||||
|
||||
if watch.get('sort_text_alphabetically'):
|
||||
# Note: Because a <p>something</p> will add an extra line feed to signify the paragraph gap
|
||||
# we end up with 'Some text\n\n', sorting will add all those extra \n at the start, so we remove them here.
|
||||
stripped_text_from_html = stripped_text_from_html.replace('\n\n', '\n')
|
||||
stripped_text_from_html = '\n'.join( sorted(stripped_text_from_html.splitlines(), key=lambda x: x.lower() ))
|
||||
stripped_text_from_html = stripped_text_from_html.replace("\n\n", "\n")
|
||||
stripped_text_from_html = '\n'.join(sorted(stripped_text_from_html.splitlines(), key=lambda x: x.lower()))
|
||||
|
||||
|
||||
# Re #340 - return the content before the 'ignore text' was applied
|
||||
text_content_before_ignored_filter = stripped_text_from_html.encode('utf-8')
|
||||
@@ -304,7 +312,7 @@ class perform_site_check(difference_detection_processor):
|
||||
for match in res:
|
||||
regex_matched_output += [match] + [b'\n']
|
||||
|
||||
# Now we will only show what the regex matched
|
||||
##########################################################
|
||||
stripped_text_from_html = b''
|
||||
text_content_before_ignored_filter = b''
|
||||
if regex_matched_output:
|
||||
@@ -312,6 +320,8 @@ class perform_site_check(difference_detection_processor):
|
||||
stripped_text_from_html = b''.join(regex_matched_output)
|
||||
text_content_before_ignored_filter = stripped_text_from_html
|
||||
|
||||
|
||||
|
||||
# Re #133 - if we should strip whitespaces from triggering the change detected comparison
|
||||
if self.datastore.data['settings']['application'].get('ignore_whitespace', False):
|
||||
fetched_md5 = hashlib.md5(stripped_text_from_html.translate(None, b'\r\n\t ')).hexdigest()
|
||||
|
||||
Reference in New Issue
Block a user