mirror of
https://github.com/dgtlmoon/changedetection.io.git
synced 2025-12-14 20:16:13 +00:00
CSS selector filter (#73)
* Re #9 CSS Selector filtering, Adding test for #9
This commit is contained in:
@@ -386,6 +386,17 @@ def changedetection_app(conig=None, datastore_o=None):
|
|||||||
if len(datastore.data['watching'][uuid]['history']):
|
if len(datastore.data['watching'][uuid]['history']):
|
||||||
update_obj['previous_md5'] = get_current_checksum_include_ignore_text(uuid=uuid)
|
update_obj['previous_md5'] = get_current_checksum_include_ignore_text(uuid=uuid)
|
||||||
|
|
||||||
|
|
||||||
|
# CSS Filter
|
||||||
|
css_filter = request.form.get('css_filter')
|
||||||
|
if css_filter:
|
||||||
|
datastore.data['watching'][uuid]['css_filter'] = css_filter.strip()
|
||||||
|
|
||||||
|
# Reset the previous_md5 so we process a new snapshot including stripping ignore text.
|
||||||
|
if len(datastore.data['watching'][uuid]['history']):
|
||||||
|
update_obj['previous_md5'] = get_current_checksum_include_ignore_text(uuid=uuid)
|
||||||
|
|
||||||
|
|
||||||
validators.url(url) # @todo switch to prop/attr/observer
|
validators.url(url) # @todo switch to prop/attr/observer
|
||||||
datastore.data['watching'][uuid].update(update_obj)
|
datastore.data['watching'][uuid].update(update_obj)
|
||||||
datastore.needs_write = True
|
datastore.needs_write = True
|
||||||
@@ -876,7 +887,7 @@ def ticker_thread_check_time_launch_checks():
|
|||||||
if not uuid in running_uuids and uuid not in update_q.queue:
|
if not uuid in running_uuids and uuid not in update_q.queue:
|
||||||
update_q.put(uuid)
|
update_q.put(uuid)
|
||||||
|
|
||||||
time.sleep(1)
|
time.sleep(0.1)
|
||||||
|
|
||||||
# Should be low so we can break this out in testing
|
# Should be low so we can break this out in testing
|
||||||
app.config.exit.wait(1)
|
app.config.exit.wait(1)
|
||||||
|
|||||||
@@ -66,25 +66,36 @@ class perform_site_check():
|
|||||||
timeout=timeout,
|
timeout=timeout,
|
||||||
verify=False)
|
verify=False)
|
||||||
|
|
||||||
|
# CSS Filter
|
||||||
|
css_filter = self.datastore.data['watching'][uuid]['css_filter']
|
||||||
|
if css_filter and len(css_filter.strip()):
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
|
soup = BeautifulSoup(r.content, "html.parser")
|
||||||
|
stripped_text_from_html = ""
|
||||||
|
for item in soup.select(css_filter):
|
||||||
|
text = str(item.get_text())+"\n"
|
||||||
|
stripped_text_from_html += text
|
||||||
|
|
||||||
|
else:
|
||||||
stripped_text_from_html = get_text(r.text)
|
stripped_text_from_html = get_text(r.text)
|
||||||
|
|
||||||
# Usually from networkIO/requests level
|
# Usually from networkIO/requests level
|
||||||
except (requests.exceptions.ConnectionError, requests.exceptions.ReadTimeout) as e:
|
except (requests.exceptions.ConnectionError, requests.exceptions.ReadTimeout) as e:
|
||||||
update_obj["last_error"] = str(e)
|
update_obj["last_error"] = str(e)
|
||||||
|
|
||||||
print(str(e))
|
print(str(e))
|
||||||
|
|
||||||
except requests.exceptions.MissingSchema:
|
except requests.exceptions.MissingSchema:
|
||||||
print("Skipping {} due to missing schema/bad url".format(uuid))
|
print("Skipping {} due to missing schema/bad url".format(uuid))
|
||||||
|
|
||||||
# Usually from html2text level
|
# Usually from html2text level
|
||||||
except UnicodeDecodeError as e:
|
except Exception as e:
|
||||||
|
# except UnicodeDecodeError as e:
|
||||||
update_obj["last_error"] = str(e)
|
update_obj["last_error"] = str(e)
|
||||||
print(str(e))
|
print(str(e))
|
||||||
# figure out how to deal with this cleaner..
|
# figure out how to deal with this cleaner..
|
||||||
# 'utf-8' codec can't decode byte 0xe9 in position 480: invalid continuation byte
|
# 'utf-8' codec can't decode byte 0xe9 in position 480: invalid continuation byte
|
||||||
|
|
||||||
|
|
||||||
else:
|
else:
|
||||||
# We rely on the actual text in the html output.. many sites have random script vars etc,
|
# We rely on the actual text in the html output.. many sites have random script vars etc,
|
||||||
# in the future we'll implement other mechanisms.
|
# in the future we'll implement other mechanisms.
|
||||||
|
|||||||
@@ -61,7 +61,8 @@ class ChangeDetectionStore:
|
|||||||
'headers': {}, # Extra headers to send
|
'headers': {}, # Extra headers to send
|
||||||
'history': {}, # Dict of timestamp and output stripped filename
|
'history': {}, # Dict of timestamp and output stripped filename
|
||||||
'ignore_text': [], # List of text to ignore when calculating the comparison checksum
|
'ignore_text': [], # List of text to ignore when calculating the comparison checksum
|
||||||
'notification_urls': [] # List of URLs to add to the notification Queue (Usually AppRise)
|
'notification_urls': [], # List of URLs to add to the notification Queue (Usually AppRise)
|
||||||
|
'css_filter': "",
|
||||||
}
|
}
|
||||||
|
|
||||||
if path.isfile('backend/source.txt'):
|
if path.isfile('backend/source.txt'):
|
||||||
|
|||||||
@@ -24,7 +24,13 @@
|
|||||||
size="5"/>
|
size="5"/>
|
||||||
<span class="pure-form-message-inline">Minimum 1 minute between recheck</span>
|
<span class="pure-form-message-inline">Minimum 1 minute between recheck</span>
|
||||||
</div>
|
</div>
|
||||||
|
</br>
|
||||||
|
<div class="pure-control-group">
|
||||||
|
<label for="minutes">CSS Filter</label>
|
||||||
|
<input type="text" id="css_filter" name="css_filter" value="{{watch.css_filter}}"
|
||||||
|
size="25"/>
|
||||||
|
<span class="pure-form-message-inline">Limit text to this CSS rule, all matching CSS is included.</span>
|
||||||
|
</div>
|
||||||
<!-- @todo: move to tabs --->
|
<!-- @todo: move to tabs --->
|
||||||
<fieldset class="pure-group">
|
<fieldset class="pure-group">
|
||||||
<label for="ignore-text">Ignore text</label>
|
<label for="ignore-text">Ignore text</label>
|
||||||
|
|||||||
102
backend/tests/test_css_selector.py
Normal file
102
backend/tests/test_css_selector.py
Normal file
@@ -0,0 +1,102 @@
|
|||||||
|
#!/usr/bin/python3
|
||||||
|
|
||||||
|
import time
|
||||||
|
from flask import url_for
|
||||||
|
from . util import live_server_setup
|
||||||
|
|
||||||
|
def test_setup(live_server):
|
||||||
|
live_server_setup(live_server)
|
||||||
|
|
||||||
|
def set_original_response():
|
||||||
|
test_return_data = """<html>
|
||||||
|
<body>
|
||||||
|
Some initial text</br>
|
||||||
|
<p>Which is across multiple lines</p>
|
||||||
|
</br>
|
||||||
|
So let's see what happens. </br>
|
||||||
|
<div id="sametext">Some text thats the same</div>
|
||||||
|
<div id="changetext">Some text that will change</div>
|
||||||
|
</body>
|
||||||
|
</html>
|
||||||
|
"""
|
||||||
|
|
||||||
|
with open("test-datastore/output.txt", "w") as f:
|
||||||
|
f.write(test_return_data)
|
||||||
|
return None
|
||||||
|
|
||||||
|
def set_modified_response():
|
||||||
|
test_return_data = """<html>
|
||||||
|
<body>
|
||||||
|
Some initial text</br>
|
||||||
|
<p>which has this one new line</p>
|
||||||
|
</br>
|
||||||
|
So let's see what happens. </br>
|
||||||
|
<div id="sametext">Some text thats the same</div>
|
||||||
|
<div id="changetext">Some text that changes</div>
|
||||||
|
</body>
|
||||||
|
</html>
|
||||||
|
"""
|
||||||
|
|
||||||
|
with open("test-datastore/output.txt", "w") as f:
|
||||||
|
f.write(test_return_data)
|
||||||
|
|
||||||
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
def test_check_markup_css_filter_restriction(client, live_server):
|
||||||
|
sleep_time_for_fetch_thread = 3
|
||||||
|
|
||||||
|
css_filter = "#sametext"
|
||||||
|
|
||||||
|
set_original_response()
|
||||||
|
|
||||||
|
# Give the endpoint time to spin up
|
||||||
|
time.sleep(1)
|
||||||
|
|
||||||
|
# Add our URL to the import page
|
||||||
|
test_url = url_for('test_endpoint', _external=True)
|
||||||
|
res = client.post(
|
||||||
|
url_for("import_page"),
|
||||||
|
data={"urls": test_url},
|
||||||
|
follow_redirects=True
|
||||||
|
)
|
||||||
|
assert b"1 Imported" in res.data
|
||||||
|
|
||||||
|
# Trigger a check
|
||||||
|
client.get(url_for("api_watch_checknow"), follow_redirects=True)
|
||||||
|
|
||||||
|
# Give the thread time to pick it up
|
||||||
|
time.sleep(sleep_time_for_fetch_thread)
|
||||||
|
|
||||||
|
# Go to the edit page, add our CSS filter
|
||||||
|
# Submit the edit form for the watch with the CSS filter set
|
||||||
|
res = client.post(
|
||||||
|
url_for("edit_page", uuid="first"),
|
||||||
|
data={"css_filter": css_filter, "url": test_url, "tag": "", "headers": ""},
|
||||||
|
follow_redirects=True
|
||||||
|
)
|
||||||
|
assert b"Updated watch." in res.data
|
||||||
|
|
||||||
|
# Check it saved
|
||||||
|
res = client.get(
|
||||||
|
url_for("edit_page", uuid="first"),
|
||||||
|
)
|
||||||
|
assert bytes(css_filter.encode('utf-8')) in res.data
|
||||||
|
|
||||||
|
# Trigger a check
|
||||||
|
client.get(url_for("api_watch_checknow"), follow_redirects=True)
|
||||||
|
|
||||||
|
# Give the thread time to pick it up
|
||||||
|
time.sleep(sleep_time_for_fetch_thread)
|
||||||
|
# Make a change
|
||||||
|
set_modified_response()
|
||||||
|
|
||||||
|
# Trigger a check
|
||||||
|
client.get(url_for("api_watch_checknow"), follow_redirects=True)
|
||||||
|
# Give the thread time to pick it up
|
||||||
|
time.sleep(sleep_time_for_fetch_thread)
|
||||||
|
|
||||||
|
# It should have 'unviewed' still
|
||||||
|
# Because it should be looking at only that 'sametext' id
|
||||||
|
res = client.get(url_for("index"))
|
||||||
|
assert b'unviewed' in res.data
|
||||||
@@ -11,4 +11,10 @@ feedgen ~= 0.9
|
|||||||
flask-login ~= 0.5
|
flask-login ~= 0.5
|
||||||
pytz
|
pytz
|
||||||
urllib3
|
urllib3
|
||||||
|
|
||||||
|
# Notification library
|
||||||
apprise ~= 0.9
|
apprise ~= 0.9
|
||||||
|
|
||||||
|
# Used for CSS filtering
|
||||||
|
bs4
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user