mirror of
https://github.com/dgtlmoon/changedetection.io.git
synced 2025-10-30 14:17:40 +00:00
Fix encoding errors with XPath filters from UTF-8 responses (#619)
This commit is contained in:
@@ -39,7 +39,7 @@ def element_removal(selectors: List[str], html_content):
|
|||||||
def xpath_filter(xpath_filter, html_content):
|
def xpath_filter(xpath_filter, html_content):
|
||||||
from lxml import etree, html
|
from lxml import etree, html
|
||||||
|
|
||||||
tree = html.fromstring(html_content)
|
tree = html.fromstring(bytes(html_content, encoding='utf-8'))
|
||||||
html_block = ""
|
html_block = ""
|
||||||
|
|
||||||
for item in tree.xpath(xpath_filter.strip(), namespaces={'re':'http://exslt.org/regular-expressions'}):
|
for item in tree.xpath(xpath_filter.strip(), namespaces={'re':'http://exslt.org/regular-expressions'}):
|
||||||
|
|||||||
@@ -44,6 +44,61 @@ def set_modified_response():
|
|||||||
|
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
# Handle utf-8 charset replies https://github.com/dgtlmoon/changedetection.io/pull/613
|
||||||
|
def test_check_xpath_filter_utf8(client, live_server):
|
||||||
|
filter='//item/*[self::description]'
|
||||||
|
|
||||||
|
d='''<?xml version="1.0" encoding="UTF-8"?>
|
||||||
|
<rss xmlns:taxo="http://purl.org/rss/1.0/modules/taxonomy/" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:itunes="http://www.itunes.com/dtds/podcast-1.0.dtd" xmlns:dc="http://purl.org/dc/elements/1.1/" version="2.0">
|
||||||
|
<channel>
|
||||||
|
<title>rpilocator.com</title>
|
||||||
|
<link>https://rpilocator.com</link>
|
||||||
|
<description>Find Raspberry Pi Computers in Stock</description>
|
||||||
|
<lastBuildDate>Thu, 19 May 2022 23:27:30 GMT</lastBuildDate>
|
||||||
|
<image>
|
||||||
|
<url>https://rpilocator.com/favicon.png</url>
|
||||||
|
<title>rpilocator.com</title>
|
||||||
|
<link>https://rpilocator.com/</link>
|
||||||
|
<width>32</width>
|
||||||
|
<height>32</height>
|
||||||
|
</image>
|
||||||
|
<item>
|
||||||
|
<title>Stock Alert (UK): RPi CM4 - 1GB RAM, No MMC, No Wifi is In Stock at Pimoroni</title>
|
||||||
|
<description>Stock Alert (UK): RPi CM4 - 1GB RAM, No MMC, No Wifi is In Stock at Pimoroni</description>
|
||||||
|
<link>https://rpilocator.com?vendor=pimoroni&utm_source=feed&utm_medium=rss</link>
|
||||||
|
<category>pimoroni</category>
|
||||||
|
<category>UK</category>
|
||||||
|
<category>CM4</category>
|
||||||
|
<guid isPermaLink="false">F9FAB0D9-DF6F-40C8-8DEE5FC0646BB722</guid>
|
||||||
|
<pubDate>Thu, 19 May 2022 14:32:32 GMT</pubDate>
|
||||||
|
</item>
|
||||||
|
</channel>
|
||||||
|
</rss>'''
|
||||||
|
|
||||||
|
with open("test-datastore/endpoint-content.txt", "w") as f:
|
||||||
|
f.write(d)
|
||||||
|
|
||||||
|
# Add our URL to the import page
|
||||||
|
test_url = url_for('test_endpoint', _external=True, content_type="application/rss+xml;charset=UTF-8")
|
||||||
|
res = client.post(
|
||||||
|
url_for("import_page"),
|
||||||
|
data={"urls": test_url},
|
||||||
|
follow_redirects=True
|
||||||
|
)
|
||||||
|
assert b"1 Imported" in res.data
|
||||||
|
res = client.post(
|
||||||
|
url_for("edit_page", uuid="first"),
|
||||||
|
data={"css_filter": filter, "url": test_url, "tag": "", "headers": "", 'fetch_backend': "html_requests"},
|
||||||
|
follow_redirects=True
|
||||||
|
)
|
||||||
|
assert b"Updated watch." in res.data
|
||||||
|
time.sleep(3)
|
||||||
|
res = client.get(url_for("index"))
|
||||||
|
assert b'Unicode strings with encoding declaration are not supported.' not in res.data
|
||||||
|
res = client.get(url_for("form_delete", uuid="all"), follow_redirects=True)
|
||||||
|
assert b'Deleted' in res.data
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
def test_check_markup_xpath_filter_restriction(client, live_server):
|
def test_check_markup_xpath_filter_restriction(client, live_server):
|
||||||
sleep_time_for_fetch_thread = 3
|
sleep_time_for_fetch_thread = 3
|
||||||
@@ -95,6 +150,8 @@ def test_check_markup_xpath_filter_restriction(client, live_server):
|
|||||||
|
|
||||||
res = client.get(url_for("index"))
|
res = client.get(url_for("index"))
|
||||||
assert b'unviewed' not in res.data
|
assert b'unviewed' not in res.data
|
||||||
|
res = client.get(url_for("form_delete", uuid="all"), follow_redirects=True)
|
||||||
|
assert b'Deleted' in res.data
|
||||||
|
|
||||||
|
|
||||||
def test_xpath_validation(client, live_server):
|
def test_xpath_validation(client, live_server):
|
||||||
@@ -117,6 +174,8 @@ def test_xpath_validation(client, live_server):
|
|||||||
follow_redirects=True
|
follow_redirects=True
|
||||||
)
|
)
|
||||||
assert b"is not a valid XPath expression" in res.data
|
assert b"is not a valid XPath expression" in res.data
|
||||||
|
res = client.get(url_for("form_delete", uuid="all"), follow_redirects=True)
|
||||||
|
assert b'Deleted' in res.data
|
||||||
|
|
||||||
|
|
||||||
# actually only really used by the distll.io importer, but could be handy too
|
# actually only really used by the distll.io importer, but could be handy too
|
||||||
@@ -153,8 +212,6 @@ def test_check_with_prefix_css_filter(client, live_server):
|
|||||||
follow_redirects=True
|
follow_redirects=True
|
||||||
)
|
)
|
||||||
|
|
||||||
with open('/tmp/fuck.html', 'wb') as f:
|
|
||||||
f.write(res.data)
|
|
||||||
assert b"Some text thats the same" in res.data #in selector
|
assert b"Some text thats the same" in res.data #in selector
|
||||||
assert b"Some text that will change" not in res.data #not in selector
|
assert b"Some text that will change" not in res.data #not in selector
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user