mirror of
https://github.com/dgtlmoon/changedetection.io.git
synced 2026-05-30 13:31:04 +00:00
Compare commits
3 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| b1803b442a | |||
| d42bb74918 | |||
| 624dee60d5 |
@@ -2,7 +2,7 @@
|
||||
|
||||
# Read more https://github.com/dgtlmoon/changedetection.io/wiki
|
||||
# Semver means never use .01, or 00. Should be .1.
|
||||
__version__ = '0.55.3'
|
||||
__version__ = '0.55.4'
|
||||
|
||||
from changedetectionio.strtobool import strtobool
|
||||
from json.decoder import JSONDecodeError
|
||||
|
||||
@@ -1,4 +1,5 @@
|
||||
import os
|
||||
import re
|
||||
import threading
|
||||
|
||||
from changedetectionio.validate_url import is_safe_valid_url
|
||||
@@ -278,8 +279,28 @@ class WatchSingleHistory(Resource):
|
||||
if request.args.get('html'):
|
||||
content = watch.get_fetched_html(timestamp)
|
||||
if content:
|
||||
# XSS mitigation (GHSA-cgj8-g98g-4p9x): this is an API endpoint, not a
|
||||
# browser-rendered view. The bytes ARE HTML (that's what the caller asked
|
||||
# for) but a programmatic client doesn't need text/html — and serving
|
||||
# text/html lets attacker-planted <script> in a monitored site execute
|
||||
# in our origin if someone opens the URL in a browser.
|
||||
#
|
||||
# text/plain + explicit utf-8 + nosniff = browser shows inert text,
|
||||
# sniffing can't re-classify it as HTML, an absent charset can't be
|
||||
# auto-detected as UTF-7 (an alternative XSS vector). API clients
|
||||
# still get the raw bytes — they don't care about Content-Type.
|
||||
response = make_response(content, 200)
|
||||
response.mimetype = "text/html"
|
||||
response.headers['Content-Type'] = 'text/plain; charset=utf-8'
|
||||
response.headers['X-Content-Type-Options'] = 'nosniff'
|
||||
# Include the timestamp in the download name so downloading multiple
|
||||
# snapshots doesn't collide. No extension — the stored bytes are
|
||||
# "whatever the fetcher captured" (HTML, JSON, XML, text…), so
|
||||
# claiming .html on the download would be a false content-type label
|
||||
# for non-HTML watches. The user/curl can rename if needed.
|
||||
# Strip to safe filename chars (timestamp is already validated as a
|
||||
# watch.history key — this is defense in depth against header injection).
|
||||
safe_ts = re.sub(r'[^0-9A-Za-z_-]', '', str(timestamp))[:32] or 'snapshot'
|
||||
response.headers['Content-Disposition'] = f'attachment; filename="snapshot-{safe_ts}"'
|
||||
else:
|
||||
response = make_response("No content found", 404)
|
||||
response.mimetype = "text/plain"
|
||||
|
||||
@@ -364,6 +364,10 @@ def process_notification(n_object: NotificationContextData, datastore):
|
||||
# Should always be false for 'text' mode or its too hard to read
|
||||
# But otherwise, this could be some setting
|
||||
word_diff=False if requested_output_format_original == 'text' else True,
|
||||
# HTML-format notifications must escape diff content (GHSA-q8xq-qg4x-wphg).
|
||||
# FormattableDiff/Extract escape internally so {{ diff(...) }} stays callable —
|
||||
# the post-Jinja escape loop below would otherwise convert them to plain str.
|
||||
escape_output='html' in requested_output_format,
|
||||
)
|
||||
)
|
||||
|
||||
@@ -394,10 +398,19 @@ def process_notification(n_object: NotificationContextData, datastore):
|
||||
# so they survive escape and are still replaced with <span> tags later.
|
||||
if 'html' in requested_output_format:
|
||||
from markupsafe import escape as html_escape
|
||||
from changedetectionio.notification_service import FormattableDiff, FormattableExtract
|
||||
_page_content_keys = {'raw_diff', 'current_snapshot', 'prev_snapshot', 'triggered_text'}
|
||||
for key in [k for k in notification_parameters if k.startswith('diff') or k in _page_content_keys]:
|
||||
if notification_parameters.get(key):
|
||||
notification_parameters[key] = str(html_escape(str(notification_parameters[key])))
|
||||
value = notification_parameters.get(key)
|
||||
if not value:
|
||||
continue
|
||||
# FormattableDiff / FormattableExtract are callable str subclasses — {{ diff(lines=5) }}
|
||||
# etc. relies on __call__. Wrapping them with str(html_escape(...)) here would lose
|
||||
# __call__ and break those tokens. They escape internally via escape_output=True
|
||||
# (set by add_rendered_diff_to_notification_vars above) for both __str__ and __call__.
|
||||
if isinstance(value, (FormattableDiff, FormattableExtract)):
|
||||
continue
|
||||
notification_parameters[key] = str(html_escape(str(value)))
|
||||
|
||||
with (apprise.LogCapture(level=apprise.logging.DEBUG) as logs):
|
||||
for url in n_object['notification_urls']:
|
||||
|
||||
@@ -99,7 +99,7 @@ class FormattableExtract(str):
|
||||
Multiple changed fragments are joined with newlines.
|
||||
Being a str subclass means it is natively JSON serializable.
|
||||
"""
|
||||
def __new__(cls, prev_snapshot, current_snapshot, extract_fn):
|
||||
def __new__(cls, prev_snapshot, current_snapshot, extract_fn, escape_output=False):
|
||||
if prev_snapshot or current_snapshot:
|
||||
from changedetectionio import diff as diff_module
|
||||
# word_diff=True is required — placemarker extraction regexes only exist in word-diff output
|
||||
@@ -107,6 +107,12 @@ class FormattableExtract(str):
|
||||
extracted = extract_fn(raw)
|
||||
else:
|
||||
extracted = ''
|
||||
if escape_output and extracted:
|
||||
# Placemarkers (@removed_PLACEMARKER_OPEN etc) contain no HTML chars,
|
||||
# so html_escape leaves them intact — they still get swapped to <span>
|
||||
# tags later by apply_service_tweaks. See GHSA-q8xq-qg4x-wphg.
|
||||
from markupsafe import escape as html_escape
|
||||
extracted = str(html_escape(extracted))
|
||||
instance = super().__new__(cls, extracted)
|
||||
return instance
|
||||
|
||||
@@ -128,16 +134,23 @@ class FormattableDiff(str):
|
||||
|
||||
Being a str subclass means it is natively JSON serializable.
|
||||
"""
|
||||
def __new__(cls, prev_snapshot, current_snapshot, **base_kwargs):
|
||||
def __new__(cls, prev_snapshot, current_snapshot, escape_output=False, **base_kwargs):
|
||||
if prev_snapshot or current_snapshot:
|
||||
from changedetectionio import diff as diff_module
|
||||
rendered = diff_module.render_diff(prev_snapshot, current_snapshot, **base_kwargs)
|
||||
else:
|
||||
rendered = ''
|
||||
if escape_output and rendered:
|
||||
# Placemarkers (@removed_PLACEMARKER_OPEN etc) contain no HTML chars,
|
||||
# so html_escape leaves them intact — they still get swapped to <span>
|
||||
# tags later by apply_service_tweaks. See GHSA-q8xq-qg4x-wphg.
|
||||
from markupsafe import escape as html_escape
|
||||
rendered = str(html_escape(rendered))
|
||||
instance = super().__new__(cls, rendered)
|
||||
instance._prev = prev_snapshot
|
||||
instance._current = current_snapshot
|
||||
instance._base_kwargs = base_kwargs
|
||||
instance._escape_output = escape_output
|
||||
return instance
|
||||
|
||||
def __call__(self, lines=None, added_only=False, removed_only=False, context=0,
|
||||
@@ -163,6 +176,10 @@ class FormattableDiff(str):
|
||||
if lines is not None:
|
||||
result = '\n'.join(result.splitlines()[:int(lines)])
|
||||
|
||||
if self._escape_output and result:
|
||||
from markupsafe import escape as html_escape
|
||||
result = str(html_escape(result))
|
||||
|
||||
return result
|
||||
|
||||
|
||||
@@ -236,7 +253,7 @@ class NotificationContextData(dict):
|
||||
|
||||
super().__setitem__(key, value)
|
||||
|
||||
def add_rendered_diff_to_notification_vars(notification_scan_text:str, prev_snapshot:str, current_snapshot:str, word_diff:bool):
|
||||
def add_rendered_diff_to_notification_vars(notification_scan_text:str, prev_snapshot:str, current_snapshot:str, word_diff:bool, escape_output:bool=False):
|
||||
"""
|
||||
Efficiently renders only the diff placeholders that are actually used in the notification text.
|
||||
|
||||
@@ -249,6 +266,9 @@ def add_rendered_diff_to_notification_vars(notification_scan_text:str, prev_snap
|
||||
prev_snapshot: Previous version of content for diff comparison
|
||||
current_snapshot: Current version of content for diff comparison
|
||||
word_diff: Whether to use word-level (True) or line-level (False) diffing
|
||||
escape_output: If True, the rendered diff output is HTML-escaped. Used for HTML-format
|
||||
notifications so attacker-controlled page content can't inject live markup.
|
||||
Both the cached str representation and the result of {{ diff(...) }} calls are escaped.
|
||||
|
||||
Returns:
|
||||
dict: Only the diff placeholders that were found in notification_scan_text, with rendered content
|
||||
@@ -287,10 +307,10 @@ def add_rendered_diff_to_notification_vars(notification_scan_text:str, prev_snap
|
||||
if not re.search(pattern, notification_scan_text, re.IGNORECASE):
|
||||
continue
|
||||
if key in diff_specs:
|
||||
ret[key] = FormattableDiff(prev_snapshot, current_snapshot, **diff_specs[key])
|
||||
ret[key] = FormattableDiff(prev_snapshot, current_snapshot, escape_output=escape_output, **diff_specs[key])
|
||||
rendered_count += 1
|
||||
elif key in extract_specs:
|
||||
ret[key] = FormattableExtract(prev_snapshot, current_snapshot, extract_fn=extract_specs[key])
|
||||
ret[key] = FormattableExtract(prev_snapshot, current_snapshot, extract_fn=extract_specs[key], escape_output=escape_output)
|
||||
rendered_count += 1
|
||||
|
||||
if rendered_count:
|
||||
|
||||
@@ -9,7 +9,7 @@ import json
|
||||
import threading
|
||||
import uuid as uuid_module
|
||||
from flask import url_for
|
||||
from .util import live_server_setup, wait_for_all_checks, delete_all_watches
|
||||
from .util import live_server_setup, wait_for_all_checks, wait_for_watch_history, delete_all_watches
|
||||
import os
|
||||
|
||||
|
||||
@@ -653,6 +653,80 @@ def test_api_history_edge_cases(client, live_server, measure_memory_usage, datas
|
||||
delete_all_watches(client)
|
||||
|
||||
|
||||
def test_api_history_html_does_not_serve_as_text_html(client, live_server, measure_memory_usage, datastore_path):
|
||||
"""
|
||||
GHSA-cgj8-g98g-4p9x: GET /api/v1/watch/<uuid>/history/<timestamp>?html=true
|
||||
must not serve the stored snapshot with Content-Type: text/html. The bytes
|
||||
are an external site's HTML — if the response is labelled text/html, a
|
||||
<script> the attacker planted on that site executes in our origin when an
|
||||
operator opens the URL in a browser (stored XSS).
|
||||
|
||||
The fix is text/plain; charset=utf-8 + X-Content-Type-Options: nosniff so
|
||||
browsers render inert text and can't sniff back to HTML/UTF-7. API clients
|
||||
don't care about Content-Type and still receive the same bytes.
|
||||
|
||||
This test injects the snapshot directly via Watch.save_history_blob() and
|
||||
save_last_fetched_html() so we exercise the API endpoint's response
|
||||
shaping without depending on the live-fetch pipeline.
|
||||
"""
|
||||
api_key = live_server.app.config['DATASTORE'].data['settings']['application'].get('api_access_token')
|
||||
|
||||
test_url = url_for('test_endpoint', _external=True)
|
||||
res = client.post(
|
||||
url_for("createwatch"),
|
||||
data=json.dumps({"url": test_url}),
|
||||
headers={'content-type': 'application/json', 'x-api-key': api_key},
|
||||
)
|
||||
watch_uuid = res.json.get('uuid')
|
||||
|
||||
# Plant a payload that would execute if the response were rendered as HTML.
|
||||
malicious_html = (
|
||||
"<html><body>"
|
||||
"<script>window.__CD_XSS_PROBE = 1</script>"
|
||||
"<img src=x onerror=\"window.__CD_XSS_PROBE = 1\">"
|
||||
"</body></html>"
|
||||
)
|
||||
ts = '1700000000'
|
||||
watch = live_server.app.config['DATASTORE'].data['watching'][watch_uuid]
|
||||
watch.save_history_blob(contents=malicious_html, timestamp=ts, snapshot_id=ts)
|
||||
watch.save_last_fetched_html(timestamp=ts, contents=malicious_html)
|
||||
|
||||
# The actual XSS-relevant assertion: how is the snapshot served?
|
||||
res = client.get(
|
||||
url_for("watchsinglehistory", uuid=watch_uuid, timestamp=ts) + '?html=true',
|
||||
headers={'x-api-key': api_key},
|
||||
)
|
||||
assert res.status_code == 200, f"unexpected status {res.status_code}: {res.data!r}"
|
||||
|
||||
ctype = res.headers.get('Content-Type', '')
|
||||
assert 'text/html' not in ctype, \
|
||||
f"snapshot must not be served as text/html (got {ctype!r}) — see GHSA-cgj8-g98g-4p9x"
|
||||
# Explicit utf-8 closes the UTF-7 sniffing bypass — without a charset, some
|
||||
# browsers will auto-detect UTF-7 from byte patterns and a crafted snapshot
|
||||
# can still execute via `+ADw-script+AD4-...`
|
||||
assert 'charset=utf-8' in ctype.lower(), \
|
||||
f"Content-Type must pin charset=utf-8 to defeat UTF-7 sniffing XSS (got {ctype!r})"
|
||||
|
||||
nosniff = res.headers.get('X-Content-Type-Options', '')
|
||||
assert nosniff.lower() == 'nosniff', \
|
||||
f"X-Content-Type-Options: nosniff required to defeat MIME-sniffing (got {nosniff!r})"
|
||||
|
||||
# Download filename should include the timestamp so multiple snapshots from
|
||||
# the same watch don't overwrite each other on disk.
|
||||
disp = res.headers.get('Content-Disposition', '')
|
||||
assert 'attachment' in disp and ts in disp, \
|
||||
f"Content-Disposition should be attachment + per-timestamp filename (got {disp!r})"
|
||||
|
||||
# API contract: the raw bytes must still be the original HTML — programmatic
|
||||
# consumers depend on getting the stored snapshot back.
|
||||
assert b'<script>' in res.data, \
|
||||
"Response body must still contain the raw stored bytes (the API contract)"
|
||||
|
||||
# Cleanup
|
||||
client.delete(url_for("watch", uuid=watch_uuid), headers={'x-api-key': api_key})
|
||||
delete_all_watches(client)
|
||||
|
||||
|
||||
def test_api_notification_edge_cases(client, live_server, measure_memory_usage, datastore_path):
|
||||
"""
|
||||
Test notification configuration edge cases.
|
||||
|
||||
@@ -634,6 +634,12 @@ def _test_color_notifications(client, notification_body_token, datastore_path):
|
||||
def test_html_color_notifications(client, live_server, measure_memory_usage, datastore_path):
|
||||
_test_color_notifications(client, '{{diff}}',datastore_path=datastore_path)
|
||||
_test_color_notifications(client, '{{diff_full}}',datastore_path=datastore_path)
|
||||
# Regression: the html-output escape pass in handler.py used to convert
|
||||
# FormattableDiff into a plain str, stripping its __call__ and breaking any
|
||||
# {{ diff(...) }} / {{ diff_added(...) }} token on htmlcolor/html notifications
|
||||
# with 'str' object is not callable (see commit 08d30c6 + #3923).
|
||||
# word_diff=false reproduces the exact form the user-reported failure used.
|
||||
_test_color_notifications(client, '{{diff(word_diff=false)}}', datastore_path=datastore_path)
|
||||
|
||||
|
||||
def _test_custom_html_in_notification_body_not_escaped(client, datastore_path, content_type=None):
|
||||
|
||||
@@ -6,9 +6,9 @@
|
||||
#, fuzzy
|
||||
msgid ""
|
||||
msgstr ""
|
||||
"Project-Id-Version: changedetection.io 0.55.3\n"
|
||||
"Project-Id-Version: changedetection.io 0.55.4\n"
|
||||
"Report-Msgid-Bugs-To: EMAIL@ADDRESS\n"
|
||||
"POT-Creation-Date: 2026-05-19 10:29+0200\n"
|
||||
"POT-Creation-Date: 2026-05-19 11:38+0200\n"
|
||||
"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n"
|
||||
"Last-Translator: FULL NAME <EMAIL@ADDRESS>\n"
|
||||
"Language-Team: LANGUAGE <LL@li.org>\n"
|
||||
|
||||
Reference in New Issue
Block a user