Files
changedetection.io/changedetectionio/blueprint/ui/diff.py
T
dgtlmoon e4bc048280
Build and push containers / metadata (push) Has been cancelled
Build and push containers / build-push-containers (push) Has been cancelled
Publish Python 🐍distribution 📦 to PyPI and TestPyPI / Build distribution 📦 (push) Has been cancelled
ChangeDetection.io Container Build Test / Build linux/amd64 (alpine) (push) Has been cancelled
ChangeDetection.io Container Build Test / Build linux/arm64 (alpine) (push) Has been cancelled
ChangeDetection.io Container Build Test / Build linux/amd64 (main) (push) Has been cancelled
ChangeDetection.io Container Build Test / Build linux/arm/v7 (main) (push) Has been cancelled
ChangeDetection.io Container Build Test / Build linux/arm/v8 (main) (push) Has been cancelled
ChangeDetection.io Container Build Test / Build linux/arm64 (main) (push) Has been cancelled
ChangeDetection.io App Test / lint-code (push) Has been cancelled
ChangeDetection.io App Test / lint-translations (push) Has been cancelled
ChangeDetection.io App Test / lint-template-i18n (push) Has been cancelled
Publish Python 🐍distribution 📦 to PyPI and TestPyPI / Test the built package works basically. (push) Has been cancelled
Publish Python 🐍distribution 📦 to PyPI and TestPyPI / Publish Python 🐍 distribution 📦 to PyPI (push) Has been cancelled
ChangeDetection.io App Test / test-application-3-10 (push) Has been cancelled
ChangeDetection.io App Test / test-application-3-11 (push) Has been cancelled
ChangeDetection.io App Test / test-application-3-12 (push) Has been cancelled
ChangeDetection.io App Test / test-application-3-13 (push) Has been cancelled
ChangeDetection.io App Test / test-application-3-14 (push) Has been cancelled
UI - AI/LLM - "Summary" button should set last viewed (#4095)
2026-04-28 19:47:15 +10:00

541 lines
23 KiB
Python

from flask import Blueprint, request, redirect, url_for, flash, render_template, make_response, send_from_directory
from flask_babel import gettext
import re
import importlib
from loguru import logger
from markupsafe import Markup
from changedetectionio.diff import (
REMOVED_STYLE, ADDED_STYLE, REMOVED_INNER_STYLE, ADDED_INNER_STYLE,
REMOVED_PLACEMARKER_OPEN, REMOVED_PLACEMARKER_CLOSED,
ADDED_PLACEMARKER_OPEN, ADDED_PLACEMARKER_CLOSED,
CHANGED_PLACEMARKER_OPEN, CHANGED_PLACEMARKER_CLOSED,
CHANGED_INTO_PLACEMARKER_OPEN, CHANGED_INTO_PLACEMARKER_CLOSED
)
from changedetectionio.store import ChangeDetectionStore
from changedetectionio.auth_decorator import login_optionally_required
def _clean_litellm_error(exc) -> str:
"""Return a short, human-readable error string from a litellm exception.
litellm embeds the raw provider JSON in str(exc), which can be hundreds of
characters of verbose quota detail. We try to pull just the provider's
'message' field; failing that we return the first non-empty line with the
'litellm.XxxError:' class prefix stripped.
"""
import json, re
raw = str(exc)
# Try to parse the embedded JSON block (starts at first '{')
brace = raw.find('{')
if brace >= 0:
try:
payload = json.loads(raw[brace:])
msg = (payload.get('error') or {}).get('message') or ''
if msg:
# Take only the first sentence / line — provider messages can be long
return msg.split('\n')[0].split('. ')[0].strip() + '.'
except Exception:
pass
# Fallback: strip the "litellm.XxxError: litellm.XxxError: providerException - " prefix
first_line = raw.split('\n')[0]
first_line = re.sub(r'^(litellm\.\w+:\s*)+', '', first_line)
first_line = re.sub(r'\w+Exception\s*-\s*', '', first_line).strip()
return first_line or raw.split('\n')[0]
def construct_blueprint(datastore: ChangeDetectionStore):
    """Build the /diff UI blueprint (history view, LLM summaries, extraction, assets)."""
    diff_blueprint = Blueprint('ui_diff', __name__, template_folder="../ui/templates")

    @diff_blueprint.app_template_filter('diff_unescape_difference_spans')
    def diff_unescape_difference_spans(content):
        """Emulate Jinja2's auto-escape, then selectively unescape our diff spans."""
        from markupsafe import escape

        if not content:
            return Markup('')

        # First escape the entire payload exactly as Jinja2's auto-escape
        # would, so arbitrary HTML in the snapshot stays inert (XSS-safe).
        safe_text = str(escape(str(content)))

        # Then un-escape ONLY the exact spans produced by
        # apply_html_color_to_body():
        #   <span style="{STYLE}" role="{ROLE}" aria-label="{LABEL}" title="{TITLE}">
        # Outer spans carry the full attribute set (role, aria-label, title).
        outer_pattern = (
            rf'&lt;span style=&#34;({re.escape(REMOVED_STYLE)}|{re.escape(ADDED_STYLE)})&#34; '
            rf'role=&#34;(deletion|insertion|note)&#34; '
            rf'aria-label=&#34;([^&]+?)&#34; '
            rf'title=&#34;([^&]+?)&#34;&gt;'
        )
        restored = re.sub(
            outer_pattern,
            r'<span style="\1" role="\2" aria-label="\3" title="\4">',
            safe_text,
            flags=re.IGNORECASE
        )

        # Inner spans (the darker highlight for the changed part of a line)
        # have no extra attributes — just the style.
        inner_pattern = rf'&lt;span style=&#34;({re.escape(REMOVED_INNER_STYLE)}|{re.escape(ADDED_INNER_STYLE)})&#34;&gt;'
        restored = re.sub(
            inner_pattern,
            r'<span style="\1">',
            restored,
            flags=re.IGNORECASE
        )

        # Un-escape only as many closing tags as spans we re-opened, so any
        # literal "</span>" text in the content itself remains escaped.
        opened = restored.count('<span style=')
        closed = safe_text.count('&lt;/span&gt;')
        for _ in range(min(opened, closed)):
            restored = restored.replace('&lt;/span&gt;', '</span>', 1)

        return Markup(restored)
@diff_blueprint.route("/diff/<uuid_str:uuid>", methods=['GET'])
@login_optionally_required
def diff_history_page(uuid):
    """
    Render the history/diff page for a watch.

    Processor-aware: delegates to processors/{type}/difference.py::render()
    so each processor type can supply its own visualization
    (text/JSON diff, restock/price views, image comparison, ...).
    Watches whose processor ships no difference module fall back to
    the text_json_diff renderer.
    """
    if uuid == 'first':
        uuid = list(datastore.data['watching'].keys()).pop()
    try:
        watch = datastore.data['watching'][uuid]
    except KeyError:
        flash(gettext("No history found for the specified link, bad link?"), "error")
        return redirect(url_for('watchlist.index'))

    snapshot_keys = list(watch.history.keys())
    if not snapshot_keys or len(snapshot_keys) < 2:
        flash(gettext("Not enough history (2 snapshots required) to show difference page for this watch."), "error")
        return redirect(url_for('watchlist.index'))

    # One keyword bundle shared by the processor hook and the fallback renderer.
    render_kwargs = dict(
        watch=watch,
        datastore=datastore,
        request=request,
        url_for=url_for,
        render_template=render_template,
        flash=flash,
        redirect=redirect,
    )

    # Resolve the processor's difference module (built-in or plugin).
    from changedetectionio.processors import get_processor_submodule
    difference_module = get_processor_submodule(watch.get('processor', 'text_json_diff'), 'difference')
    if difference_module and hasattr(difference_module, 'render'):
        return difference_module.render(**render_kwargs)

    # Default: plain text/JSON diff rendering.
    from changedetectionio.processors.text_json_diff.difference import render as default_render
    return default_render(**render_kwargs)
@diff_blueprint.route("/diff/<uuid_str:uuid>/llm-summary/prompt", methods=['GET'])
@login_optionally_required
def diff_llm_summary_prompt(uuid):
    """Return the effective LLM summary prompt for a watch immediately (no LLM call)."""
    from flask import jsonify

    watch = datastore.data['watching'].get(uuid)
    if not watch:
        # Unknown watch — empty prompt, 404 for the AJAX caller.
        return jsonify({'prompt': ''}), 404

    prompt = ''
    try:
        from changedetectionio.llm.evaluator import get_effective_summary_prompt
        prompt = get_effective_summary_prompt(watch, datastore)
    except Exception:
        # Best effort only: any resolution failure yields an empty prompt.
        prompt = ''
    return jsonify({'prompt': prompt})
@diff_blueprint.route("/diff/<uuid_str:uuid>/llm-summary", methods=['GET'])
@login_optionally_required
def diff_llm_summary(uuid):
    """
    Generate (or return cached) an AI summary of the diff between two snapshots.
    Called via AJAX from the diff page when no cached summary exists.
    Returns JSON: {"summary": "...", "error": null} or {"summary": null, "error": "..."}
    """
    import difflib
    from flask import jsonify
    # Resolve the watch; unknown UUIDs are a hard 404 for the AJAX caller.
    try:
        watch = datastore.data['watching'][uuid]
    except KeyError:
        return jsonify({'summary': None, 'error': 'Watch not found'}), 404
    # Bail out early when no LLM model is configured in application settings.
    llm_cfg = datastore.data.get('settings', {}).get('application', {}).get('llm', {})
    if not llm_cfg.get('model'):
        return jsonify({'summary': None, 'error': 'LLM not configured'}), 400
    dates = list(watch.history.keys())
    if len(dates) < 2:
        return jsonify({'summary': None, 'error': 'Not enough history'}), 400
    # Default the "from" side to the version the user last viewed; fall back
    # to the second-newest snapshot when there is no last-viewed marker.
    best_from = watch.get_from_version_based_on_last_viewed
    from_version = request.args.get('from_version', best_from if best_from else dates[-2])
    to_version = request.args.get('to_version', dates[-1])
    # UI preference flags are forwarded as query params; '1' means enabled.
    all_changes = request.args.get('all_changes', '0') == '1'
    ignore_whitespace = request.args.get('ignore_whitespace', '0') == '1'
    show_removed = request.args.get('removed', '1') == '1'
    show_added = request.args.get('added', '1') == '1'
    def _prep(text):
        """Optionally normalise whitespace on each line before diffing."""
        if not ignore_whitespace:
            return text.splitlines()
        # Collapse each line's internal runs of whitespace to single spaces.
        return [' '.join(line.split()) for line in text.splitlines()]
    def _make_unified_diff(a_text, b_text):
        # Unified diff with 3 lines of context; the first two lines are the
        # ---/+++ file headers, which are dropped — only the hunks matter here.
        lines = list(difflib.unified_diff(_prep(a_text), _prep(b_text), lineterm='', n=3))
        return '\n'.join(lines[2:]) if len(lines) > 2 else '\n'.join(lines)
    def _apply_filters(diff_text):
        """Strip +/- lines the user has hidden in the UI so the LLM matches what they see."""
        if show_removed and show_added:
            return diff_text
        out = []
        for line in diff_text.splitlines():
            if line.startswith('-') and not show_removed:
                continue
            if line.startswith('+') and not show_added:
                continue
            out.append(line)
        return '\n'.join(out)
    try:
        from_text = watch.get_history_snapshot(timestamp=from_version)
        to_text = watch.get_history_snapshot(timestamp=to_version)
    except Exception as e:
        return jsonify({'summary': None, 'error': f'Could not read snapshots: {e}'}), 500
    if all_changes:
        # Build sequential diffs for every intermediate snapshot between from and to
        # so the LLM sees the full timeline of changes, not just start→end
        sorted_dates = sorted(dates)
        try:
            start_idx = sorted_dates.index(from_version)
            end_idx = sorted_dates.index(to_version)
        except ValueError:
            # Requested versions not found in history — cover everything.
            start_idx, end_idx = 0, len(sorted_dates) - 1
        steps = sorted_dates[start_idx:end_idx + 1]
        segments = []
        for i in range(len(steps) - 1):
            a_ts, b_ts = steps[i], steps[i + 1]
            try:
                a_text = watch.get_history_snapshot(timestamp=a_ts) or ''
                b_text = watch.get_history_snapshot(timestamp=b_ts) or ''
            except Exception:
                # Skip unreadable snapshots rather than failing the whole summary.
                continue
            seg = _apply_filters(_make_unified_diff(a_text, b_text))
            if seg.strip():
                # NOTE(review): the two timestamps in this segment header run
                # together with no separator — looks like a separator character
                # may have been lost; confirm the intended header format.
                segments.append(f'=== {a_ts}{b_ts} ===\n{seg}')
        diff_text = '\n\n'.join(segments) if segments else ''
    else:
        diff_text = _apply_filters(_make_unified_diff(from_text, to_text))
    if not diff_text.strip():
        return jsonify({'summary': None, 'error': 'No differences found'})
    from changedetectionio.llm.evaluator import (
        summarise_change, get_effective_summary_prompt,
        is_global_token_budget_exceeded, get_global_token_budget_month,
        LLMInputTooLargeError,
    )
    effective_prompt = get_effective_summary_prompt(watch, datastore)
    from changedetectionio.llm.prompt_builder import build_change_summary_system_prompt
    # Diff-pref flags + system prompt are part of the cache key so prompt changes bust the cache
    _max_summary_tokens = datastore.data['settings']['application'].get('llm_max_summary_tokens', 3000)
    # \x00 separators keep the components unambiguous within the cache key.
    cache_prompt = (
        effective_prompt
        + f'\x00prefs:all={int(all_changes)},ws={int(ignore_whitespace)}'
        f',rm={int(show_removed)},add={int(show_added)}'
        + f'\x00sys:{build_change_summary_system_prompt()}'
        + f'\x00max_tokens:{_max_summary_tokens}'
    )
    # Check cache — keyed by version pair + prompt hash (invalidates if prompt changes)
    cached = watch.get_llm_diff_summary(from_version, to_version, prompt=cache_prompt)
    if cached:
        import time
        # Serving the summary counts as the user viewing this change.
        datastore.set_last_viewed(uuid, int(time.time()))
        return jsonify({'summary': cached, 'error': None, 'cached': True})
    # Check global monthly token budget before making an LLM call
    if is_global_token_budget_exceeded(datastore):
        budget = get_global_token_budget_month(datastore)
        llm_cfg = datastore.data.get('settings', {}).get('application', {}).get('llm', {})
        used = llm_cfg.get('tokens_this_month', 0)
        return jsonify({
            'summary': None,
            'error': gettext(
                'Monthly AI token budget of %(budget)s tokens reached (%(used)s used). Resets next month.',
                budget=f'{budget:,}',
                used=f'{used:,}',
            ),
            'budget_exceeded': True,
        }), 429
    try:
        summary = summarise_change(watch, datastore, diff=diff_text, current_snapshot=to_text)
    except LLMInputTooLargeError as e:
        # Input over the model's limit is a client-correctable condition → 400.
        return jsonify({'summary': None, 'error': str(e)}), 400
    except Exception as e:
        logger.error(f"LLM summary generation failed for {uuid}: {e}")
        return jsonify({'summary': None, 'error': _clean_litellm_error(e)}), 500
    if not summary:
        return jsonify({'summary': None, 'error': 'LLM returned empty summary'})
    try:
        watch.save_llm_diff_summary(summary, from_version, to_version, prompt=cache_prompt)
    except Exception as e:
        # Cache write failure is non-fatal — still return the fresh summary.
        logger.warning(f"Could not cache llm summary for {uuid}: {e}")
    import time
    datastore.set_last_viewed(uuid, int(time.time()))
    return jsonify({'summary': summary, 'error': None, 'cached': False})
@diff_blueprint.route("/diff/<uuid_str:uuid>/extract", methods=['GET'])
@login_optionally_required
def diff_history_page_extract_GET(uuid):
    """
    Render the data extraction form for a watch.

    Processor-aware: delegates to processors/{type}/extract.py::render_form()
    so each processor type can provide its own extraction interface.
    Watches whose processor ships no extract module fall back to the
    generic base implementation.
    """
    if uuid == 'first':
        uuid = list(datastore.data['watching'].keys()).pop()
    try:
        watch = datastore.data['watching'][uuid]
    except KeyError:
        flash(gettext("No history found for the specified link, bad link?"), "error")
        return redirect(url_for('watchlist.index'))

    # Keyword bundle shared by the processor hook and the fallback.
    form_kwargs = dict(
        watch=watch,
        datastore=datastore,
        request=request,
        url_for=url_for,
        render_template=render_template,
        flash=flash,
        redirect=redirect,
    )

    # Resolve the processor's extract module (built-in or plugin).
    from changedetectionio.processors import get_processor_submodule
    extract_module = get_processor_submodule(watch.get('processor', 'text_json_diff'), 'extract')
    if extract_module and hasattr(extract_module, 'render_form'):
        return extract_module.render_form(**form_kwargs)

    # Default: generic extraction form.
    from changedetectionio.processors.extract import render_form as default_render_form
    return default_render_form(**form_kwargs)
@diff_blueprint.route("/diff/<uuid_str:uuid>/extract", methods=['POST'])
@login_optionally_required
def diff_history_page_extract_POST(uuid):
    """
    Process the data extraction request.

    Processor-aware: delegates to
    processors/{type}/extract.py::process_extraction() so each processor
    type can provide its own extraction logic. Watches whose processor
    ships no extract module fall back to the generic base implementation.
    """
    if uuid == 'first':
        uuid = list(datastore.data['watching'].keys()).pop()
    try:
        watch = datastore.data['watching'][uuid]
    except KeyError:
        flash(gettext("No history found for the specified link, bad link?"), "error")
        return redirect(url_for('watchlist.index'))

    # Keyword bundle shared by the processor hook and the fallback.
    extraction_kwargs = dict(
        watch=watch,
        datastore=datastore,
        request=request,
        url_for=url_for,
        make_response=make_response,
        send_from_directory=send_from_directory,
        flash=flash,
        redirect=redirect,
    )

    # Resolve the processor's extract module (built-in or plugin).
    from changedetectionio.processors import get_processor_submodule
    extract_module = get_processor_submodule(watch.get('processor', 'text_json_diff'), 'extract')
    if extract_module and hasattr(extract_module, 'process_extraction'):
        return extract_module.process_extraction(**extraction_kwargs)

    # Default: generic extraction handler.
    from changedetectionio.processors.extract import process_extraction as default_process_extraction
    return default_process_extraction(**extraction_kwargs)
@diff_blueprint.route("/diff/<uuid_str:uuid>/download-patch", methods=['GET'])
@login_optionally_required
def download_patch(uuid):
    """
    Generate and return a unified diff patch file between two snapshots.
    Query params: from_version, to_version (timestamp strings from watch history).
    Returns the patch as a downloadable .patch file — the same content fed to the LLM.
    """
    import difflib
    try:
        watch = datastore.data['watching'][uuid]
    except KeyError:
        return make_response('Watch not found', 404)
    dates = list(watch.history.keys())
    if len(dates) < 2:
        return make_response('Not enough history', 400)
    # Default to comparing the two newest snapshots.
    from_version = request.args.get('from_version', dates[-2])
    to_version = request.args.get('to_version', dates[-1])
    try:
        from_text = watch.get_history_snapshot(timestamp=from_version)
        to_text = watch.get_history_snapshot(timestamp=to_version)
    except Exception as e:
        return make_response(f'Could not read snapshots: {e}', 500)
    # keepends=True keeps each content line's own newline, and the default
    # lineterm='\n' terminates the ---/+++/@@ header lines too.
    # (Previously lineterm='' left the header lines unterminated, fusing them
    # with the following line once joined — a malformed patch.)
    diff_lines = list(difflib.unified_diff(
        from_text.splitlines(keepends=True),
        to_text.splitlines(keepends=True),
        fromfile=f'snapshot-{from_version}',
        tofile=f'snapshot-{to_version}',
    ))
    patch_text = ''.join(diff_lines) if diff_lines else '(no differences)\n'
    response = make_response(patch_text)
    response.headers['Content-Type'] = 'text/plain; charset=utf-8'
    # Prompt a file download (per the route's stated purpose) instead of inline display.
    response.headers['Content-Disposition'] = f'attachment; filename="changedetection-{uuid}-{from_version}-{to_version}.patch"'
    return response
@diff_blueprint.route("/diff/<uuid_str:uuid>/processor-asset/<string:asset_name>", methods=['GET'])
@login_optionally_required
def processor_asset(uuid, asset_name):
"""
Serve processor-specific binary assets (images, files, etc.).
This route is processor-aware: it delegates to the processor's
difference.py module, allowing different processor types to serve
custom assets without embedding them as base64 in templates.
This solves memory issues with large binary data (e.g., screenshots)
by streaming them as separate HTTP responses instead of embedding
in the HTML template.
Each processor implements processors/{type}/difference.py::get_asset()
which returns (binary_data, content_type, cache_control_header).
Example URLs:
- /diff/{uuid}/processor-asset/before
- /diff/{uuid}/processor-asset/after
- /diff/{uuid}/processor-asset/rendered_diff
"""
if uuid == 'first':
uuid = list(datastore.data['watching'].keys()).pop()
try:
watch = datastore.data['watching'][uuid]
except KeyError:
flash(gettext("No history found for the specified link, bad link?"), "error")
return redirect(url_for('watchlist.index'))
# Get the processor type for this watch
processor_name = watch.get('processor', 'text_json_diff')
# Try to get the processor's difference module (works for both built-in and plugin processors)
from changedetectionio.processors import get_processor_submodule
processor_module = get_processor_submodule(processor_name, 'difference')
# Call the processor's get_asset() function
if processor_module and hasattr(processor_module, 'get_asset'):
result = processor_module.get_asset(
asset_name=asset_name,
watch=watch,
datastore=datastore,
request=request
)
if result is None:
from flask import abort
abort(404, description=f"Asset '{asset_name}' not found")
binary_data, content_type, cache_control = result
response = make_response(binary_data)
response.headers['Content-Type'] = content_type
if cache_control:
response.headers['Cache-Control'] = cache_control
return response
else:
logger.warning(f"Processor {processor_name} does not implement get_asset()")
from flask import abort
abort(404, description=f"Processor '{processor_name}' does not support assets")
return diff_blueprint