mirror of
https://github.com/dgtlmoon/changedetection.io.git
synced 2025-10-30 14:17:40 +00:00
Fixing bad detection of text text/plain in previous release, adding automated test (#3460)
This commit is contained in:
@@ -154,10 +154,10 @@ class perform_site_check(difference_detection_processor):
|
||||
self.fetcher.content = html_tools.workarounds_for_obfuscations(self.fetcher.content)
|
||||
html_content = self.fetcher.content
|
||||
content_type = self.fetcher.get_all_headers().get('content-type', '').lower()
|
||||
is_attachment = 'attachment' in self.fetcher.get_all_headers().get('content-disposition', '').lower()
|
||||
is_attachment = 'attachment' in self.fetcher.get_all_headers().get('content-disposition', '').lower() or 'octet-stream' in content_type
|
||||
|
||||
# Try to detect better mime types if its a download or not announced as HTML
|
||||
if is_attachment or 'octet-stream' in content_type or not 'html' in content_type:
|
||||
if is_attachment:
|
||||
logger.debug(f"Got a reply that may be a download or possibly a text attachment, checking..")
|
||||
try:
|
||||
import magic
|
||||
|
||||
@@ -167,6 +167,18 @@ def test_check_basic_change_detection_functionality(client, live_server, measure
|
||||
assert b'Deleted' in res.data
|
||||
|
||||
def test_non_text_mime_or_downloads(client, live_server, measure_memory_usage):
|
||||
"""
|
||||
|
||||
https://github.com/dgtlmoon/changedetection.io/issues/3434
|
||||
I noticed that a watched website can be monitored fine as long as the server sends content-type: text/plain; charset=utf-8,
|
||||
but once the server sends content-type: application/octet-stream (which is usually done to force the browser to show the Download dialog),
|
||||
changedetection somehow ignores all line breaks and treats the document file as if everything is on one line.
|
||||
|
||||
:param client:
|
||||
:param live_server:
|
||||
:param measure_memory_usage:
|
||||
:return:
|
||||
"""
|
||||
with open("test-datastore/endpoint-content.txt", "w") as f:
|
||||
f.write("""some random text that should be split by line
|
||||
and not parsed with html_to_text
|
||||
@@ -215,3 +227,68 @@ got it\r\n
|
||||
|
||||
res = client.get(url_for("ui.form_delete", uuid="all"), follow_redirects=True)
|
||||
|
||||
|
||||
def test_standard_text_plain(client, live_server, measure_memory_usage):
|
||||
"""
|
||||
|
||||
https://github.com/dgtlmoon/changedetection.io/issues/3434
|
||||
I noticed that a watched website can be monitored fine as long as the server sends content-type: text/plain; charset=utf-8,
|
||||
but once the server sends content-type: application/octet-stream (which is usually done to force the browser to show the Download dialog),
|
||||
changedetection somehow ignores all line breaks and treats the document file as if everything is on one line.
|
||||
|
||||
The real bug here can be that it will try to process plain-text as HTML, losing <etc>
|
||||
|
||||
:param client:
|
||||
:param live_server:
|
||||
:param measure_memory_usage:
|
||||
:return:
|
||||
"""
|
||||
with open("test-datastore/endpoint-content.txt", "w") as f:
|
||||
f.write("""some random text that should be split by line
|
||||
and not parsed with html_to_text
|
||||
<title>Even this title should stay because we are just plain text</title>
|
||||
this way we know that it correctly parsed as plain text
|
||||
\r\n
|
||||
ok\r\n
|
||||
got it\r\n
|
||||
""")
|
||||
|
||||
test_url = url_for('test_endpoint', content_type="text/plain", _external=True)
|
||||
|
||||
# Add our URL to the import page
|
||||
res = client.post(
|
||||
url_for("imports.import_page"),
|
||||
data={"urls": test_url},
|
||||
follow_redirects=True
|
||||
)
|
||||
|
||||
assert b"1 Imported" in res.data
|
||||
|
||||
wait_for_all_checks(client)
|
||||
|
||||
### check the front end
|
||||
res = client.get(
|
||||
url_for("ui.ui_views.preview_page", uuid="first"),
|
||||
follow_redirects=True
|
||||
)
|
||||
assert b"some random text that should be split by line\n" in res.data
|
||||
####
|
||||
|
||||
# Check the snapshot by API that it has linefeeds too
|
||||
watch_uuid = next(iter(live_server.app.config['DATASTORE'].data['watching']))
|
||||
api_key = live_server.app.config['DATASTORE'].data['settings']['application'].get('api_access_token')
|
||||
res = client.get(
|
||||
url_for("watchhistory", uuid=watch_uuid),
|
||||
headers={'x-api-key': api_key},
|
||||
)
|
||||
|
||||
# Fetch a snapshot by timestamp, check the right one was found
|
||||
res = client.get(
|
||||
url_for("watchsinglehistory", uuid=watch_uuid, timestamp=list(res.json.keys())[-1]),
|
||||
headers={'x-api-key': api_key},
|
||||
)
|
||||
assert b"some random text that should be split by line\n" in res.data
|
||||
assert b"<title>Even this title should stay because we are just plain text</title>" in res.data
|
||||
|
||||
res = client.get(url_for("ui.form_delete", uuid="all"), follow_redirects=True)
|
||||
|
||||
|
||||
Reference in New Issue
Block a user