Mirror of https://github.com/dgtlmoon/changedetection.io.git (synced 2025-11-05 09:04:55 +00:00)

Compare commits: 0.42.2...refactor/r (6 commits)
| Author | SHA1 | Date |
|---|---|---|
|  | 114dab23e9 |  |
|  | 96ff5dbeeb |  |
|  | 8898f1ba01 |  |
|  | b069c2d04a |  |
|  | 2e451e1f8a |  |
|  | ced1c66e4d |  |
@@ -11,6 +11,7 @@ urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
 
 
 # Some common stuff here that can be moved to a base class
+# (set_proxy_from_list)
 class perform_site_check():
 
     def __init__(self, *args, datastore, **kwargs):
@@ -45,6 +46,20 @@ class perform_site_check():
 
         return proxy_args
 
+    # Doesn't look like python supports forward slash auto enclosure in re.findall
+    # So convert it to inline flag "foobar(?i)" type configuration
+    def forward_slash_enclosed_regex_to_options(self, regex):
+        res = re.search(r'^/(.*?)/(\w+)$', regex, re.IGNORECASE)
+
+        if res:
+            regex = res.group(1)
+            regex += '(?{})'.format(res.group(2))
+        else:
+            regex += '(?{})'.format('i')
+
+        return regex
+
+
     def run(self, uuid):
         timestamp = int(time.time())  # used for storage etc too
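The new helper converts a `/pattern/flags` expression into an inline-flag form before it is handed to `re.findall()`. A minimal standalone sketch of the same conversion (written as a plain function rather than a method, otherwise following the diff):

```python
import re

def forward_slash_enclosed_regex_to_options(regex):
    # "/pattern/flags" becomes "pattern(?flags)"; any other string defaults to case-insensitive "(?i)"
    res = re.search(r'^/(.*?)/(\w+)$', regex, re.IGNORECASE)
    if res:
        return res.group(1) + '(?{})'.format(res.group(2))
    return regex + '(?{})'.format('i')

print(forward_slash_enclosed_regex_to_options('/reports.+?2022/i'))  # reports.+?2022(?i)
print(forward_slash_enclosed_regex_to_options('Out of stock'))       # Out of stock(?i)
```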
@@ -215,15 +230,27 @@ class perform_site_check():
         if len(extract_text) > 0:
             regex_matched_output = []
             for s_re in extract_text:
-                result = re.findall(s_re.encode('utf8'), stripped_text_from_html,
-                                    flags=re.MULTILINE | re.DOTALL | re.LOCALE)
-                if result:
-                    regex_matched_output = regex_matched_output + result
+                # incase they specified something in '/.../x'
+                regex = self.forward_slash_enclosed_regex_to_options(s_re)
+                result = re.findall(regex.encode('utf-8'), stripped_text_from_html)
+
+                for l in result:
+                    if type(l) is tuple:
+                        #@todo - some formatter option default (between groups)
+                        regex_matched_output += list(l) + [b'\n']
+                    else:
+                        # @todo - some formatter option default (between each ungrouped result)
+                        regex_matched_output += [l] + [b'\n']
+
+            # Now we will only show what the regex matched
+            stripped_text_from_html = b''
+            text_content_before_ignored_filter = b''
             if regex_matched_output:
-                stripped_text_from_html = b'\n'.join(regex_matched_output)
+                # @todo some formatter for presentation?
+                stripped_text_from_html = b''.join(regex_matched_output)
                 text_content_before_ignored_filter = stripped_text_from_html
 
 
         # Re #133 - if we should strip whitespaces from triggering the change detected comparison
         if self.datastore.data['settings']['application'].get('ignore_whitespace', False):
             fetched_md5 = hashlib.md5(stripped_text_from_html.translate(None, b'\r\n\t ')).hexdigest()
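For context on the flattening loop above: `re.findall()` returns plain byte strings when the pattern has at most one capture group and tuples of groups when it has several, which is what the `type(l) is tuple` branch handles. A standalone sketch (not the watcher code itself) with made-up sample text:

```python
import re

stripped_text_from_html = b'( 1000 online  80 guests  2000 online )'
regex_matched_output = []

for pattern in (rb'\d+ online', rb'(\d+) (guests)'):
    for l in re.findall(pattern, stripped_text_from_html):
        if type(l) is tuple:
            # several groups: appended back-to-back, then a newline (the "@todo formatter" case)
            regex_matched_output += list(l) + [b'\n']
        else:
            # ungrouped match: the whole match, then a newline
            regex_matched_output += [l] + [b'\n']

print(b''.join(regex_matched_output))  # b'1000 online\n2000 online\n80guests\n'
```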
@@ -239,8 +239,15 @@ Unavailable") }}
                         {{ render_field(form.extract_text, rows=5, placeholder="\d+ online") }}
                         <span class="pure-form-message-inline">
                             <ul>
-                                <li>Extracts text in the final output after other filters using regular expressions, for example <code>\d+ online</code></li>
-                                <li>One line per regular-expression.</li>
+                                <li>Extracts text in the final output (line by line) after other filters using regular expressions;
+                                    <ul>
+                                        <li>Regular expression ‐ example <code>/reports.+?2022/i</code></li>
+                                        <li>Use <code>//(?aiLmsux))</code> type flags (more <a href="https://docs.python.org/3/library/re.html#index-15">information here</a>)<br/></li>
+                                        <li>Keyword example ‐ example <code>Out of stock</code></li>
+                                        <li>Use groups to extract just that text ‐ example <code>/reports.+?(\d+)/i</code> returns a list of years only</li>
+                                    </ul>
+                                </li>
+                                <li>One line per regular-expression/ string match</li>
                             </ul>
                         </span>
                     </div>
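An illustration of the "groups" bullet above (the sample text is invented, and the `/.../i` suffix is expressed here as an explicit `re.IGNORECASE` flag): with a capture group present, `re.findall()` returns only the grouped text, so `/reports.+?(\d+)/i` yields just the years.

```python
import re

text = "Annual reports for 2021 ... quarterly reports for 2022"
print(re.findall(r'reports.+?(\d+)', text, flags=re.IGNORECASE))  # ['2021', '2022']
```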
@@ -15,7 +15,7 @@ def set_original_response():
      </br>
      So let's see what happens. </br>
      <div id="sametext">Some text thats the same</div>
-     <div id="changetext">Some text that will change</div>
+     <div class="changetext">Some text that will change</div>
      </body>
      </html>
     """
@@ -33,7 +33,8 @@ def set_modified_response():
      </br>
      So let's see what happens. </br>
      <div id="sametext">Some text thats the same</div>
-     <div id="changetext">Some text that did change ( 1000 online <br/> 80 guests<br/> 2000 online )</div>
+     <div class="changetext">Some text that did change ( 1000 online <br/> 80 guests<br/> 2000 online )</div>
+     <div class="changetext">SomeCase insensitive 3456</div>
      </body>
      </html>
     """
@@ -44,11 +45,78 @@ def set_modified_response():
     return None
 
 
-def test_check_filter_and_regex_extract(client, live_server):
-    sleep_time_for_fetch_thread = 3
+def set_multiline_response():
+    test_return_data = """<html>
+     <body>
+
+     <p>Something <br/>
+     across 6 billion multiple<br/>
+     lines
+     </p>
+
+     <div>aaand something lines</div>
+     </body>
+     </html>
+    """
+
+    with open("test-datastore/endpoint-content.txt", "w") as f:
+        f.write(test_return_data)
+
+    return None
+
+
+def test_setup(client, live_server):
     live_server_setup(live_server)
-    css_filter = "#changetext"
+
+
+def test_check_filter_multiline(client, live_server):
+
+    set_multiline_response()
+
+    # Add our URL to the import page
+    test_url = url_for('test_endpoint', _external=True)
+    res = client.post(
+        url_for("import_page"),
+        data={"urls": test_url},
+        follow_redirects=True
+    )
+    assert b"1 Imported" in res.data
+
+    time.sleep(3)
+
+    # Goto the edit page, add our ignore text
+    # Add our URL to the import page
+    res = client.post(
+        url_for("edit_page", uuid="first"),
+        data={"css_filter": '',
+              'extract_text': '/something.+?6 billion.+?lines/si',
+              "url": test_url,
+              "tag": "",
+              "headers": "",
+              'fetch_backend': "html_requests"
+              },
+        follow_redirects=True
+    )
+
+    assert b"Updated watch." in res.data
+    time.sleep(3)
+
+    res = client.get(
+        url_for("preview_page", uuid="first"),
+        follow_redirects=True
+    )
+
+    assert b'<div class="">Something' in res.data
+    assert b'<div class="">across 6 billion multiple' in res.data
+    assert b'<div class="">lines' in res.data
+
+    # but the last one, which also says 'lines' shouldnt be here (non-greedy match checking)
+    assert b'aaand something lines' not in res.data
+
+
+def test_check_filter_and_regex_extract(client, live_server):
+    sleep_time_for_fetch_thread = 3
+    css_filter = ".changetext"
 
     set_original_response()
 
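The `/something.+?6 billion.+?lines/si` expression used in `test_check_filter_multiline` relies on the `s` (DOTALL) flag so `.` can span line breaks, while the non-greedy `.+?` stops the match at the first `lines`. A rough equivalent with explicit flags (the sample text approximates the multiline test page):

```python
import re

text = "Something \nacross 6 billion multiple\nlines\n\naaand something lines"
matches = re.findall(r'something.+?6 billion.+?lines', text, flags=re.DOTALL | re.IGNORECASE)
print(matches)  # ['Something \nacross 6 billion multiple\nlines'] - the trailing "aaand something lines" is excluded
```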
@@ -64,6 +132,7 @@ def test_check_filter_and_regex_extract(client, live_server):
     )
     assert b"1 Imported" in res.data
 
+    time.sleep(1)
     # Trigger a check
     client.get(url_for("form_watch_checknow"), follow_redirects=True)
 
@@ -75,7 +144,7 @@ def test_check_filter_and_regex_extract(client, live_server):
     res = client.post(
         url_for("edit_page", uuid="first"),
         data={"css_filter": css_filter,
-              'extract_text': '\d+ online\n\d+ guests',
+              'extract_text': '\d+ online\r\n\d+ guests\r\n/somecase insensitive \d+/i\r\n/somecase insensitive (345\d)/i',
               "url": test_url,
               "tag": "",
               "headers": "",
@@ -86,15 +155,6 @@ def test_check_filter_and_regex_extract(client, live_server):
 
     assert b"Updated watch." in res.data
 
-    # Check it saved
-    res = client.get(
-        url_for("edit_page", uuid="first"),
-    )
-    assert b'\d+ online' in res.data
-
-    # Trigger a check
-    # client.get(url_for("form_watch_checknow"), follow_redirects=True)
-
     # Give the thread time to pick it up
     time.sleep(sleep_time_for_fetch_thread)
 
@@ -126,5 +186,13 @@ def test_check_filter_and_regex_extract(client, live_server):
     # Both regexs should be here
     assert b'<div class="">80 guests' in res.data
 
+    # Regex with flag handling should be here
+    assert b'<div class="">SomeCase insensitive 3456' in res.data
+
+    # Singular group from /somecase insensitive (345\d)/i
+    assert b'<div class="">3456' in res.data
+
+    # Regex with multiline flag handling should be here
+
     # Should not be here
     assert b'Some text that did change' not in res.data
@@ -113,7 +113,6 @@ class update_worker(threading.Thread):
                         err_text = "Page request from server didnt respond correctly"
                         self.datastore.update_watch(uuid=uuid, update_obj={'last_error': err_text,
                                                                            'last_check_status': e.status_code})
-
                     except Exception as e:
                         self.app.logger.error("Exception reached processing watch UUID: %s - %s", uuid, str(e))
                         self.datastore.update_watch(uuid=uuid, update_obj={'last_error': str(e)})