API Security - Watch GET history snapshot - Should return text/plain mimetype so it can't be accidentally executed in the browser

2026-05-28 20:41:43 +00:00 · 2026-05-19 11:06:47 +02:00
parent 00d26e3656
commit bdf54ff33f
2 changed files with 82 additions and 2 deletions
@@ -278,8 +278,20 @@ class WatchSingleHistory(Resource):
        if request.args.get('html'):
            content = watch.get_fetched_html(timestamp)
            if content:
+                # XSS mitigation (GHSA-cgj8-g98g-4p9x): this is an API endpoint, not a
+                # browser-rendered view. The bytes ARE HTML (that's what the caller asked
+                # for) but a programmatic client doesn't need text/html — and serving
+                # text/html lets attacker-planted <script> in a monitored site execute
+                # in our origin if someone opens the URL in a browser.
+                #
+                # text/plain + explicit utf-8 + nosniff = browser shows inert text,
+                # sniffing can't re-classify it as HTML, an absent charset can't be
+                # auto-detected as UTF-7 (an alternative XSS vector). API clients
+                # still get the raw bytes — they don't care about Content-Type.
                response = make_response(content, 200)
-                response.mimetype = "text/html"
+                response.headers['Content-Type'] = 'text/plain; charset=utf-8'
+                response.headers['X-Content-Type-Options'] = 'nosniff'
+                response.headers['Content-Disposition'] = 'attachment; filename="snapshot.html"'
            else:
                response = make_response("No content found", 404)
                response.mimetype = "text/plain"
@@ -9,7 +9,7 @@ import json
 import threading
 import uuid as uuid_module
 from flask import url_for
-from .util import live_server_setup, wait_for_all_checks, delete_all_watches
+from .util import live_server_setup, wait_for_all_checks, wait_for_watch_history, delete_all_watches
 import os


@@ -653,6 +653,74 @@ def test_api_history_edge_cases(client, live_server, measure_memory_usage, datas
    delete_all_watches(client)


+def test_api_history_html_does_not_serve_as_text_html(client, live_server, measure_memory_usage, datastore_path):
+    """
+    GHSA-cgj8-g98g-4p9x: GET /api/v1/watch/<uuid>/history/<timestamp>?html=true
+    must not serve the stored snapshot with Content-Type: text/html. The bytes
+    are an external site's HTML — if the response is labelled text/html, a
+    <script> the attacker planted on that site executes in our origin when an
+    operator opens the URL in a browser (stored XSS).
+
+    The fix is text/plain; charset=utf-8 + X-Content-Type-Options: nosniff so
+    browsers render inert text and can't sniff back to HTML/UTF-7. API clients
+    don't care about Content-Type and still receive the same bytes.
+
+    This test injects the snapshot directly via Watch.save_history_blob() and
+    save_last_fetched_html() so we exercise the API endpoint's response
+    shaping without depending on the live-fetch pipeline.
+    """
+    api_key = live_server.app.config['DATASTORE'].data['settings']['application'].get('api_access_token')
+
+    test_url = url_for('test_endpoint', _external=True)
+    res = client.post(
+        url_for("createwatch"),
+        data=json.dumps({"url": test_url}),
+        headers={'content-type': 'application/json', 'x-api-key': api_key},
+    )
+    watch_uuid = res.json.get('uuid')  # uuid field of the create-watch JSON response
+
+    # Plant a payload (inline <script> plus an onerror handler) that would execute if rendered as HTML.
+    malicious_html = (
+        "<html><body>"
+        "<script>window.__CD_XSS_PROBE = 1</script>"
+        "<img src=x onerror=\"window.__CD_XSS_PROBE = 1\">"
+        "</body></html>"
+    )
+    ts = '1700000000'  # fixed timestamp string; also passed as snapshot_id below
+    watch = live_server.app.config['DATASTORE'].data['watching'][watch_uuid]
+    watch.save_history_blob(contents=malicious_html, timestamp=ts, snapshot_id=ts)
+    watch.save_last_fetched_html(timestamp=ts, contents=malicious_html)
+
+    # Fetch the planted snapshot through the API with ?html=true and inspect how it is served.
+    res = client.get(
+        url_for("watchsinglehistory", uuid=watch_uuid, timestamp=ts) + '?html=true',
+        headers={'x-api-key': api_key},
+    )
+    assert res.status_code == 200, f"unexpected status {res.status_code}: {res.data!r}"
+
+    ctype = res.headers.get('Content-Type', '')
+    assert 'text/html' not in ctype, \
+        f"snapshot must not be served as text/html (got {ctype!r}) — see GHSA-cgj8-g98g-4p9x"
+    # Explicit utf-8 closes the UTF-7 sniffing bypass — without a charset, some
+    # browsers will auto-detect UTF-7 from byte patterns and a crafted snapshot
+    # can still execute via `+ADw-script+AD4-...`
+    assert 'charset=utf-8' in ctype.lower(), \
+        f"Content-Type must pin charset=utf-8 to defeat UTF-7 sniffing XSS (got {ctype!r})"
+
+    nosniff = res.headers.get('X-Content-Type-Options', '')
+    assert nosniff.lower() == 'nosniff', \
+        f"X-Content-Type-Options: nosniff required to defeat MIME-sniffing (got {nosniff!r})"
+
+    # API contract: the raw bytes must still be the original HTML — programmatic
+    # consumers depend on getting the stored snapshot back.
+    assert b'<script>' in res.data, \
+        "Response body must still contain the raw stored bytes (the API contract)"
+
+    # Cleanup: delete the watch created by this test via the API, then call delete_all_watches().
+    client.delete(url_for("watch", uuid=watch_uuid), headers={'x-api-key': api_key})
+    delete_all_watches(client)
+
+
 def test_api_notification_edge_cases(client, live_server, measure_memory_usage, datastore_path):
    """
    Test notification configuration edge cases.