HTML hygiene fix/improvement (Dont allow unescaped HTML to become real HTML in notifications)

This commit is contained in:
dgtlmoon
2026-05-12 00:58:02 +10:00
parent ab19cb3e4f
commit 08d30c6f22
2 changed files with 107 additions and 14 deletions
+11 -8
View File
@@ -382,14 +382,17 @@ def process_notification(n_object: NotificationContextData, datastore):
n_object['llm_summary'] = _llm_change_summary or (n_object.get('_llm_result') or {}).get('summary', '')
n_object['llm_intent'] = n_object.get('_llm_intent', '')
# Re #3529: diff content from text/plain pages may contain raw '<' chars that break HTML emails.
# Escape only the diff variables before Jinja2 renders them into the template, so the user's
# own HTML in the notification body (e.g. <a href="{{watch_url}}">) is never touched.
# Diff placemarkers (e.g. @removed_PLACEMARKER_OPEN) contain no HTML chars so they survive
# html_escape and are still replaced with <span> tags by apply_service_tweaks later.
watch_mime_type = n_object.get('watch_mime_type')
if (watch_mime_type and 'text/' in watch_mime_type.lower() and 'html' not in watch_mime_type.lower()
and 'html' in requested_output_format):
# Escape diff/snapshot variables before Jinja renders them into an HTML notification.
# GHSA-q8xq-qg4x-wphg: inscriptis decodes HTML entities when converting text/html
# pages to snapshot text, so a page that visibly displays "&lt;a href...&gt;" yields
# literal "<a href...>" in the snapshot — which would otherwise render as live
# markup in HTML emails / Telegram (parse_mode=html) / Discord embeds, letting a
# watched page inject phishing links into the operator's notification channel.
# Also covers #3529 — raw '<' chars from text/plain pages breaking HTML email layout.
# The operator's own template HTML (e.g. <a href="{{watch_url}}">) is outside the
# variable values so it stays untouched. Diff placemarkers contain no HTML chars,
# so they survive escape and are still replaced with <span> tags later.
if 'html' in requested_output_format:
from markupsafe import escape as html_escape
_page_content_keys = {'raw_diff', 'current_snapshot', 'prev_snapshot', 'triggered_text'}
for key in [k for k in notification_parameters if k.startswith('diff') or k in _page_content_keys]:
+96 -6
View File
@@ -638,9 +638,11 @@ def test_html_color_notifications(client, live_server, measure_memory_usage, dat
def _test_custom_html_in_notification_body_not_escaped(client, datastore_path, content_type=None):
"""
#4121 - Custom HTML in the notification body (e.g. <a href="{{watch_url}}">) must NOT be
HTML-escaped regardless of the watched page's content-type. Only raw diff content from
text/plain pages needs escaping (to prevent raw '<' chars breaking HTML email rendering).
#4121 - The operator's own HTML in the notification body template (e.g.
<a href="{{watch_url}}">) must survive unescaped regardless of the watched page's
Content-Type. The escape pass in handler.py only touches the variable *values*
(diff/snapshot content from the page see GHSA-q8xq-qg4x-wphg) it leaves the
surrounding template HTML alone.
"""
set_original_response(datastore_path=datastore_path)
@@ -693,10 +695,98 @@ def _test_custom_html_in_notification_body_not_escaped(client, datastore_path, c
def test_plaintext_watch_custom_html_in_notification_body_not_escaped(client, live_server, measure_memory_usage, datastore_path):
# text/plain: diff content may contain raw '<' chars — those must be escaped, but NOT the user's template HTML
# Diff/snapshot values are escaped for HTML notifications (covered by
# test_html_watch_diff_content_escaped_in_html_notification). What this test
# locks in is that the *surrounding* template HTML is left alone in every case.
_test_custom_html_in_notification_body_not_escaped(client, datastore_path, content_type="text/plain")
# text/html: HTML processor strips tags before diffing, no escaping needed, user's template HTML must be preserved
_test_custom_html_in_notification_body_not_escaped(client, datastore_path, content_type="text/html")
# no MIME type (None): same as HTML case, user's template HTML must be preserved
_test_custom_html_in_notification_body_not_escaped(client, datastore_path, content_type=None)
def test_html_watch_diff_content_escaped_in_html_notification(client, live_server, measure_memory_usage, datastore_path):
"""
GHSA-q8xq-qg4x-wphg diff/snapshot content from the watched page must be
HTML-escaped before it is rendered into an HTML-format notification, regardless
of the watched page's Content-Type.
Inscriptis (used to convert text/html pages to snapshot text) decodes HTML
entities so a page that visibly displays "&lt;a href=...&gt;" produces snapshot
text containing literal "<a href=...>". The previous gate at handler.py:391
only escaped when watch_mime_type matched 'text/' and not 'html', which let
that decoded markup through to HTML emails / Telegram (parse_mode=html) /
Discord embeds, where it renders as a real clickable link i.e. an attacker
who controls a watched page can inject phishing links into the operator's
trusted notification channel.
"""
from .util import write_test_file_and_sync
if os.path.isfile(os.path.join(datastore_path, "notification.txt")):
os.unlink(os.path.join(datastore_path, "notification.txt"))
# Baseline: an innocuous text/html page.
baseline_html = "<html><body><p>nothing to see here</p></body></html>"
write_test_file_and_sync(os.path.join(datastore_path, "endpoint-content.txt"), baseline_html)
test_notification_url = url_for('test_notification_endpoint', _external=True).replace('http://', 'post://')
# Pass content_type=text/html so the watch records 'text/html' as its content-type
# — this is the branch the previous gate skipped escaping for.
test_url = url_for('test_endpoint', _external=True, content_type='text/html')
# HTML-format notification body that embeds the snapshot directly. Operators do this
# when they want the full changed content in the alert (e.g. an email digest).
res = client.post(
url_for("settings.settings_page"),
data={
"application-fetch_backend": "html_requests",
"application-minutes_between_check": 180,
"application-notification_body": 'Watch had changes:\n{{current_snapshot}}',
"application-notification_format": "html",
"application-notification_urls": test_notification_url,
"application-notification_title": "Change detected",
},
follow_redirects=True
)
assert b'Settings updated' in res.data
res = client.post(
url_for("ui.ui_views.form_quick_watch_add"),
data={"url": test_url, "tags": ''},
follow_redirects=True
)
assert b"Watch added" in res.data
wait_for_all_checks(client)
# Now flip the page to something whose *visible* text contains entity-encoded
# angle brackets — exactly the pattern a forum / pastebin / code-sample site uses
# to display literal HTML on the page. Inscriptis will decode &lt;/&gt; back to
# literal < / > in the stored snapshot.
attacker_html = (
'<html><body><pre>'
'&lt;a href="https://attacker.example/payment"&gt;ACTION REQUIRED&lt;/a&gt;'
'&lt;img src="https://attacker.example/track" width="1" height="1"&gt;'
'</pre></body></html>'
)
write_test_file_and_sync(os.path.join(datastore_path, "endpoint-content.txt"), attacker_html)
res = client.get(url_for("ui.form_watch_checknow"), follow_redirects=True)
assert b'Queued 1 watch for rechecking.' in res.data
wait_for_all_checks(client)
wait_for_notification_endpoint_output(datastore_path=datastore_path)
with open(os.path.join(datastore_path, "notification.txt"), 'r') as f:
body = f.read()
# Sanity: the snapshot really did contain the decoded markup (otherwise the test
# would pass for the wrong reason). The escaped form must appear somewhere.
assert '&lt;a href=' in body or '&amp;lt;a href=' in body, \
f"Expected escaped attacker markup in notification body, got: {body!r}"
# The bug: a live <a href="https://attacker..."> ends up in the HTML notification.
assert '<a href="https://attacker.example/payment"' not in body, \
f"Diff content from text/html page was NOT escaped — phishing link reached HTML notification: {body!r}"
assert '<img src="https://attacker.example/track"' not in body, \
f"Diff content from text/html page was NOT escaped — tracking pixel reached HTML notification: {body!r}"
client.get(url_for("ui.form_delete", uuid="all"), follow_redirects=True)