mirror of
https://github.com/dgtlmoon/changedetection.io.git
synced 2025-10-30 14:17:40 +00:00
Fix - Regular expression text in ignore and trigger was not being processed correctly; also refactored for lower CPU usage (#1747)
This commit is contained in:
@@ -191,42 +191,50 @@ def extract_json_as_string(content, json_filter, ensure_is_ldjson_info_type=None
|
||||
#
|
||||
# wordlist - list of regex's (str) or words (str)
|
||||
def strip_ignore_text(content, wordlist, mode="content"):
|
||||
ignore = []
|
||||
ignore_regex = []
|
||||
|
||||
# @todo check this runs case insensitive
|
||||
for k in wordlist:
|
||||
|
||||
# Is it a regex?
|
||||
if k[0] == '/':
|
||||
ignore_regex.append(k.strip(" /"))
|
||||
else:
|
||||
ignore.append(k)
|
||||
|
||||
i = 0
|
||||
output = []
|
||||
ignore_text = []
|
||||
ignore_regex = []
|
||||
|
||||
ignored_line_numbers = []
|
||||
|
||||
for k in wordlist:
|
||||
# Is it a regex?
|
||||
x = re.search('^\/(.*)\/(.*)', k.strip())
|
||||
if x:
|
||||
# Starts with / but doesn't look like a regex
|
||||
p = x.group(1)
|
||||
try:
|
||||
# @Todo python regex options can go before the regex str, but not really many of the options apply on a per-line basis
|
||||
ignore_regex.append(re.compile(rf"{p}", re.IGNORECASE))
|
||||
except Exception as e:
|
||||
# Badly formed regex, treat as text
|
||||
ignore_text.append(k.strip())
|
||||
else:
|
||||
# Had a / but doesn't work as regex
|
||||
ignore_text.append(k.strip())
|
||||
|
||||
for line in content.splitlines():
|
||||
i += 1
|
||||
# Always ignore blank lines in this mode. (when this function gets called)
|
||||
got_match = False
|
||||
if len(line.strip()):
|
||||
regex_matches = False
|
||||
for l in ignore_text:
|
||||
if l.lower() in line.lower():
|
||||
got_match = True
|
||||
|
||||
# if any of these match, skip
|
||||
for regex in ignore_regex:
|
||||
try:
|
||||
if re.search(regex, line, re.IGNORECASE):
|
||||
regex_matches = True
|
||||
except Exception as e:
|
||||
continue
|
||||
if not got_match:
|
||||
for r in ignore_regex:
|
||||
if r.search(line):
|
||||
got_match = True
|
||||
|
||||
if not regex_matches and not any(skip_text.lower() in line.lower() for skip_text in ignore):
|
||||
if not got_match:
|
||||
# Not ignored
|
||||
output.append(line.encode('utf8'))
|
||||
else:
|
||||
ignored_line_numbers.append(i)
|
||||
|
||||
|
||||
|
||||
# Used for finding out what to highlight
|
||||
if mode == "line numbers":
|
||||
return ignored_line_numbers
|
||||
|
||||
@@ -15,11 +15,24 @@ def test_strip_regex_text_func():
|
||||
but sometimes we want to remove the lines.
|
||||
|
||||
but 1 lines
|
||||
skip 5 lines
|
||||
really? yes man
|
||||
#/not this tries weirdly formed regex or just strings starting with /
|
||||
/not this
|
||||
but including 1234 lines
|
||||
igNORe-cAse text we dont want to keep
|
||||
but not always."""
|
||||
|
||||
ignore_lines = ["sometimes", "/\s\d{2,3}\s/", "/ignore-case text/"]
|
||||
|
||||
ignore_lines = [
|
||||
"sometimes",
|
||||
"/\s\d{2,3}\s/",
|
||||
"/ignore-case text/",
|
||||
"really?",
|
||||
"/skip \d lines/i",
|
||||
"/not"
|
||||
]
|
||||
|
||||
|
||||
fetcher = fetch_site_status.perform_site_check(datastore=False)
|
||||
stripped_content = html_tools.strip_ignore_text(test_content, ignore_lines)
|
||||
@@ -27,4 +40,10 @@ def test_strip_regex_text_func():
|
||||
assert b"but 1 lines" in stripped_content
|
||||
assert b"igNORe-cAse text" not in stripped_content
|
||||
assert b"but 1234 lines" not in stripped_content
|
||||
assert b"really" not in stripped_content
|
||||
assert b"not this" not in stripped_content
|
||||
|
||||
# Check line number reporting
|
||||
stripped_content = html_tools.strip_ignore_text(test_content, ignore_lines, mode="line numbers")
|
||||
assert stripped_content == [2, 5, 6, 7, 8, 10]
|
||||
|
||||
|
||||
Reference in New Issue
Block a user