Files
changedetection.io/changedetectionio/blueprint/ui/diff.py
T
dgtlmoon e4bc048280
Build and push containers / metadata (push) Has been cancelled
Build and push containers / build-push-containers (push) Has been cancelled
Publish Python 🐍distribution 📦 to PyPI and TestPyPI / Build distribution 📦 (push) Has been cancelled
ChangeDetection.io Container Build Test / Build linux/amd64 (alpine) (push) Has been cancelled
ChangeDetection.io Container Build Test / Build linux/arm64 (alpine) (push) Has been cancelled
ChangeDetection.io Container Build Test / Build linux/amd64 (main) (push) Has been cancelled
ChangeDetection.io Container Build Test / Build linux/arm/v7 (main) (push) Has been cancelled
ChangeDetection.io Container Build Test / Build linux/arm/v8 (main) (push) Has been cancelled
ChangeDetection.io Container Build Test / Build linux/arm64 (main) (push) Has been cancelled
ChangeDetection.io App Test / lint-code (push) Has been cancelled
ChangeDetection.io App Test / lint-translations (push) Has been cancelled
ChangeDetection.io App Test / lint-template-i18n (push) Has been cancelled
Publish Python 🐍distribution 📦 to PyPI and TestPyPI / Test the built package works basically. (push) Has been cancelled
Publish Python 🐍distribution 📦 to PyPI and TestPyPI / Publish Python 🐍 distribution 📦 to PyPI (push) Has been cancelled
ChangeDetection.io App Test / test-application-3-10 (push) Has been cancelled
ChangeDetection.io App Test / test-application-3-11 (push) Has been cancelled
ChangeDetection.io App Test / test-application-3-12 (push) Has been cancelled
ChangeDetection.io App Test / test-application-3-13 (push) Has been cancelled
ChangeDetection.io App Test / test-application-3-14 (push) Has been cancelled
UI - AI/LLM - "Summary" button should set last viewed (#4095)
2026-04-28 19:47:15 +10:00

541 lines
23 KiB
Python

from flask import Blueprint, request, redirect, url_for, flash, render_template, make_response, send_from_directory
from flask_babel import gettext
import re
import importlib
from loguru import logger
from markupsafe import Markup
from changedetectionio.diff import (
REMOVED_STYLE, ADDED_STYLE, REMOVED_INNER_STYLE, ADDED_INNER_STYLE,
REMOVED_PLACEMARKER_OPEN, REMOVED_PLACEMARKER_CLOSED,
ADDED_PLACEMARKER_OPEN, ADDED_PLACEMARKER_CLOSED,
CHANGED_PLACEMARKER_OPEN, CHANGED_PLACEMARKER_CLOSED,
CHANGED_INTO_PLACEMARKER_OPEN, CHANGED_INTO_PLACEMARKER_CLOSED
)
from changedetectionio.store import ChangeDetectionStore
from changedetectionio.auth_decorator import login_optionally_required
def _clean_litellm_error(exc) -> str:
"""Return a short, human-readable error string from a litellm exception.
litellm embeds the raw provider JSON in str(exc), which can be hundreds of
characters of verbose quota detail. We try to pull just the provider's
'message' field; failing that we return the first non-empty line with the
'litellm.XxxError:' class prefix stripped.
"""
import json, re
raw = str(exc)
# Try to parse the embedded JSON block (starts at first '{')
brace = raw.find('{')
if brace >= 0:
try:
payload = json.loads(raw[brace:])
msg = (payload.get('error') or {}).get('message') or ''
if msg:
# Take only the first sentence / line — provider messages can be long
return msg.split('\n')[0].split('. ')[0].strip() + '.'
except Exception:
pass
# Fallback: strip the "litellm.XxxError: litellm.XxxError: providerException - " prefix
first_line = raw.split('\n')[0]
first_line = re.sub(r'^(litellm\.\w+:\s*)+', '', first_line)
first_line = re.sub(r'\w+Exception\s*-\s*', '', first_line).strip()
return first_line or raw.split('\n')[0]
def construct_blueprint(datastore: ChangeDetectionStore):
    """Build the /diff UI blueprint (history view, LLM summaries, extraction, assets)."""
    diff_blueprint = Blueprint('ui_diff', __name__, template_folder="../ui/templates")

    @diff_blueprint.app_template_filter('diff_unescape_difference_spans')
    def diff_unescape_difference_spans(content):
        """Emulate Jinja2's auto-escape, then selectively unescape our diff spans."""
        from markupsafe import escape

        if not content:
            return Markup('')

        # First escape the entire payload exactly as Jinja2's auto-escape
        # would, so arbitrary HTML in the snapshot stays inert (XSS-safe).
        safe_text = str(escape(str(content)))

        # Then un-escape ONLY the exact spans produced by
        # apply_html_color_to_body():
        #   <span style="{STYLE}" role="{ROLE}" aria-label="{LABEL}" title="{TITLE}">
        # Outer spans carry the full attribute set (role, aria-label, title).
        outer_pattern = (
            rf'&lt;span style=&#34;({re.escape(REMOVED_STYLE)}|{re.escape(ADDED_STYLE)})&#34; '
            rf'role=&#34;(deletion|insertion|note)&#34; '
            rf'aria-label=&#34;([^&]+?)&#34; '
            rf'title=&#34;([^&]+?)&#34;&gt;'
        )
        restored = re.sub(
            outer_pattern,
            r'<span style="\1" role="\2" aria-label="\3" title="\4">',
            safe_text,
            flags=re.IGNORECASE
        )

        # Inner spans (the darker highlight for the changed part of a line)
        # have no extra attributes — just the style.
        inner_pattern = rf'&lt;span style=&#34;({re.escape(REMOVED_INNER_STYLE)}|{re.escape(ADDED_INNER_STYLE)})&#34;&gt;'
        restored = re.sub(
            inner_pattern,
            r'<span style="\1">',
            restored,
            flags=re.IGNORECASE
        )

        # Un-escape only as many closing tags as spans we re-opened, so any
        # literal "</span>" text in the content itself remains escaped.
        opened = restored.count('<span style=')
        closed = safe_text.count('&lt;/span&gt;')
        for _ in range(min(opened, closed)):
            restored = restored.replace('&lt;/span&gt;', '</span>', 1)

        return Markup(restored)
@diff_blueprint.route("/diff/<uuid_str:uuid>", methods=['GET'])
@login_optionally_required
def diff_history_page(uuid):
    """
    Render the history/diff page for a watch.

    Processor-aware: delegates to processors/{type}/difference.py::render()
    so each processor type can supply its own visualization
    (text/JSON diff, restock/price views, image comparison, ...).
    Watches whose processor ships no difference module fall back to
    the text_json_diff renderer.
    """
    if uuid == 'first':
        uuid = list(datastore.data['watching'].keys()).pop()
    try:
        watch = datastore.data['watching'][uuid]
    except KeyError:
        flash(gettext("No history found for the specified link, bad link?"), "error")
        return redirect(url_for('watchlist.index'))

    snapshot_keys = list(watch.history.keys())
    if not snapshot_keys or len(snapshot_keys) < 2:
        flash(gettext("Not enough history (2 snapshots required) to show difference page for this watch."), "error")
        return redirect(url_for('watchlist.index'))

    # One keyword bundle shared by the processor hook and the fallback renderer.
    render_kwargs = dict(
        watch=watch,
        datastore=datastore,
        request=request,
        url_for=url_for,
        render_template=render_template,
        flash=flash,
        redirect=redirect,
    )

    # Resolve the processor's difference module (built-in or plugin).
    from changedetectionio.processors import get_processor_submodule
    difference_module = get_processor_submodule(watch.get('processor', 'text_json_diff'), 'difference')
    if difference_module and hasattr(difference_module, 'render'):
        return difference_module.render(**render_kwargs)

    # Default: plain text/JSON diff rendering.
    from changedetectionio.processors.text_json_diff.difference import render as default_render
    return default_render(**render_kwargs)
@diff_blueprint.route("/diff/<uuid_str:uuid>/llm-summary/prompt", methods=['GET'])
@login_optionally_required
def diff_llm_summary_prompt(uuid):
    """Return the effective LLM summary prompt for a watch immediately (no LLM call)."""
    from flask import jsonify

    watch = datastore.data['watching'].get(uuid)
    if not watch:
        # Unknown watch — empty prompt, 404 for the AJAX caller.
        return jsonify({'prompt': ''}), 404

    prompt = ''
    try:
        from changedetectionio.llm.evaluator import get_effective_summary_prompt
        prompt = get_effective_summary_prompt(watch, datastore)
    except Exception:
        # Best effort only: any resolution failure yields an empty prompt.
        prompt = ''
    return jsonify({'prompt': prompt})
@diff_blueprint.route("/diff/<uuid_str:uuid>/llm-summary", methods=['GET'])
@login_optionally_required
def diff_llm_summary(uuid):
    """
    Generate (or return cached) an AI summary of the diff between two snapshots.
    Called via AJAX from the diff page when no cached summary exists.
    Returns JSON: {"summary": "...", "error": null} or {"summary": null, "error": "..."}
    """
    import difflib
    from flask import jsonify
    # Resolve the watch; unknown UUIDs are a hard 404 for the AJAX caller.
    try:
        watch = datastore.data['watching'][uuid]
    except KeyError:
        return jsonify({'summary': None, 'error': 'Watch not found'}), 404
    # Bail out early when no LLM model is configured in application settings.
    llm_cfg = datastore.data.get('settings', {}).get('application', {}).get('llm', {})
    if not llm_cfg.get('model'):
        return jsonify({'summary': None, 'error': 'LLM not configured'}), 400
    dates = list(watch.history.keys())
    if len(dates) < 2:
        return jsonify({'summary': None, 'error': 'Not enough history'}), 400
    # Default the "from" side to the version the user last viewed; fall back
    # to the second-newest snapshot when there is no last-viewed marker.
    best_from = watch.get_from_version_based_on_last_viewed
    from_version = request.args.get('from_version', best_from if best_from else dates[-2])
    to_version = request.args.get('to_version', dates[-1])
    # UI preference flags are forwarded as query params; '1' means enabled.
    all_changes = request.args.get('all_changes', '0') == '1'
    ignore_whitespace = request.args.get('ignore_whitespace', '0') == '1'
    show_removed = request.args.get('removed', '1') == '1'
    show_added = request.args.get('added', '1') == '1'
    def _prep(text):
        """Optionally normalise whitespace on each line before diffing."""
        if not ignore_whitespace:
            return text.splitlines()
        # Collapse each line's internal runs of whitespace to single spaces.
        return [' '.join(line.split()) for line in text.splitlines()]
    def _make_unified_diff(a_text, b_text):
        # Unified diff with 3 lines of context; the first two lines are the
        # ---/+++ file headers, which are dropped — only the hunks matter here.
        lines = list(difflib.unified_diff(_prep(a_text), _prep(b_text), lineterm='', n=3))
        return '\n'.join(lines[2:]) if len(lines) > 2 else '\n'.join(lines)
    def _apply_filters(diff_text):
        """Strip +/- lines the user has hidden in the UI so the LLM matches what they see."""
        if show_removed and show_added:
            return diff_text
        out = []
        for line in diff_text.splitlines():
            if line.startswith('-') and not show_removed:
                continue
            if line.startswith('+') and not show_added:
                continue
            out.append(line)
        return '\n'.join(out)
    try:
        from_text = watch.get_history_snapshot(timestamp=from_version)
        to_text = watch.get_history_snapshot(timestamp=to_version)
    except Exception as e:
        return jsonify({'summary': None, 'error': f'Could not read snapshots: {e}'}), 500
    if all_changes:
        # Build sequential diffs for every intermediate snapshot between from and to
        # so the LLM sees the full timeline of changes, not just start→end
        sorted_dates = sorted(dates)
        try:
            start_idx = sorted_dates.index(from_version)
            end_idx = sorted_dates.index(to_version)
        except ValueError:
            # Requested versions not found in history — cover everything.
            start_idx, end_idx = 0, len(sorted_dates) - 1
        steps = sorted_dates[start_idx:end_idx + 1]
        segments = []
        for i in range(len(steps) - 1):
            a_ts, b_ts = steps[i], steps[i + 1]
            try:
                a_text = watch.get_history_snapshot(timestamp=a_ts) or ''
                b_text = watch.get_history_snapshot(timestamp=b_ts) or ''
            except Exception:
                # Skip unreadable snapshots rather than failing the whole summary.
                continue
            seg = _apply_filters(_make_unified_diff(a_text, b_text))
            if seg.strip():
                # NOTE(review): the two timestamps in this segment header run
                # together with no separator — looks like a separator character
                # may have been lost; confirm the intended header format.
                segments.append(f'=== {a_ts}{b_ts} ===\n{seg}')
        diff_text = '\n\n'.join(segments) if segments else ''
    else:
        diff_text = _apply_filters(_make_unified_diff(from_text, to_text))
    if not diff_text.strip():
        return jsonify({'summary': None, 'error': 'No differences found'})
    from changedetectionio.llm.evaluator import (
        summarise_change, get_effective_summary_prompt,
        is_global_token_budget_exceeded, get_global_token_budget_month,
        LLMInputTooLargeError,
    )
    effective_prompt = get_effective_summary_prompt(watch, datastore)
    from changedetectionio.llm.prompt_builder import build_change_summary_system_prompt
    # Diff-pref flags + system prompt are part of the cache key so prompt changes bust the cache
    _max_summary_tokens = datastore.data['settings']['application'].get('llm_max_summary_tokens', 3000)
    # \x00 separators keep the components unambiguous within the cache key.
    cache_prompt = (
        effective_prompt
        + f'\x00prefs:all={int(all_changes)},ws={int(ignore_whitespace)}'
        f',rm={int(show_removed)},add={int(show_added)}'
        + f'\x00sys:{build_change_summary_system_prompt()}'
        + f'\x00max_tokens:{_max_summary_tokens}'
    )
    # Check cache — keyed by version pair + prompt hash (invalidates if prompt changes)
    cached = watch.get_llm_diff_summary(from_version, to_version, prompt=cache_prompt)
    if cached:
        import time
        # Serving the summary counts as the user viewing this change.
        datastore.set_last_viewed(uuid, int(time.time()))
        return jsonify({'summary': cached, 'error': None, 'cached': True})
    # Check global monthly token budget before making an LLM call
    if is_global_token_budget_exceeded(datastore):
        budget = get_global_token_budget_month(datastore)
        llm_cfg = datastore.data.get('settings', {}).get('application', {}).get('llm', {})
        used = llm_cfg.get('tokens_this_month', 0)
        return jsonify({
            'summary': None,
            'error': gettext(
                'Monthly AI token budget of %(budget)s tokens reached (%(used)s used). Resets next month.',
                budget=f'{budget:,}',
                used=f'{used:,}',
            ),
            'budget_exceeded': True,
        }), 429
    try:
        summary = summarise_change(watch, datastore, diff=diff_text, current_snapshot=to_text)
    except LLMInputTooLargeError as e:
        # Input over the model's limit is a client-correctable condition → 400.
        return jsonify({'summary': None, 'error': str(e)}), 400
    except Exception as e:
        logger.error(f"LLM summary generation failed for {uuid}: {e}")
        return jsonify({'summary': None, 'error': _clean_litellm_error(e)}), 500
    if not summary:
        return jsonify({'summary': None, 'error': 'LLM returned empty summary'})
    try:
        watch.save_llm_diff_summary(summary, from_version, to_version, prompt=cache_prompt)
    except Exception as e:
        # Cache write failure is non-fatal — still return the fresh summary.
        logger.warning(f"Could not cache llm summary for {uuid}: {e}")
    import time
    datastore.set_last_viewed(uuid, int(time.time()))
    return jsonify({'summary': summary, 'error': None, 'cached': False})
@diff_blueprint.route("/diff/<uuid_str:uuid>/extract", methods=['GET'])
@login_optionally_required
def diff_history_page_extract_GET(uuid):
    """
    Render the data extraction form for a watch.

    Processor-aware: delegates to processors/{type}/extract.py::render_form()
    so each processor type can provide its own extraction interface.
    Watches whose processor ships no extract module fall back to the
    generic base implementation.
    """
    if uuid == 'first':
        uuid = list(datastore.data['watching'].keys()).pop()
    try:
        watch = datastore.data['watching'][uuid]
    except KeyError:
        flash(gettext("No history found for the specified link, bad link?"), "error")
        return redirect(url_for('watchlist.index'))

    # Keyword bundle shared by the processor hook and the fallback.
    form_kwargs = dict(
        watch=watch,
        datastore=datastore,
        request=request,
        url_for=url_for,
        render_template=render_template,
        flash=flash,
        redirect=redirect,
    )

    # Resolve the processor's extract module (built-in or plugin).
    from changedetectionio.processors import get_processor_submodule
    extract_module = get_processor_submodule(watch.get('processor', 'text_json_diff'), 'extract')
    if extract_module and hasattr(extract_module, 'render_form'):
        return extract_module.render_form(**form_kwargs)

    # Default: generic extraction form.
    from changedetectionio.processors.extract import render_form as default_render_form
    return default_render_form(**form_kwargs)
@diff_blueprint.route("/diff/<uuid_str:uuid>/extract", methods=['POST'])
@login_optionally_required
def diff_history_page_extract_POST(uuid):
    """
    Process the data extraction request.

    Processor-aware: delegates to
    processors/{type}/extract.py::process_extraction() so each processor
    type can provide its own extraction logic. Watches whose processor
    ships no extract module fall back to the generic base implementation.
    """
    if uuid == 'first':
        uuid = list(datastore.data['watching'].keys()).pop()
    try:
        watch = datastore.data['watching'][uuid]
    except KeyError:
        flash(gettext("No history found for the specified link, bad link?"), "error")
        return redirect(url_for('watchlist.index'))

    # Keyword bundle shared by the processor hook and the fallback.
    extraction_kwargs = dict(
        watch=watch,
        datastore=datastore,
        request=request,
        url_for=url_for,
        make_response=make_response,
        send_from_directory=send_from_directory,
        flash=flash,
        redirect=redirect,
    )

    # Resolve the processor's extract module (built-in or plugin).
    from changedetectionio.processors import get_processor_submodule
    extract_module = get_processor_submodule(watch.get('processor', 'text_json_diff'), 'extract')
    if extract_module and hasattr(extract_module, 'process_extraction'):
        return extract_module.process_extraction(**extraction_kwargs)

    # Default: generic extraction handler.
    from changedetectionio.processors.extract import process_extraction as default_process_extraction
    return default_process_extraction(**extraction_kwargs)
@diff_blueprint.route("/diff/<uuid_str:uuid>/download-patch", methods=['GET'])
@login_optionally_required
def download_patch(uuid):
    """
    Generate and return a unified diff patch file between two snapshots.
    Query params: from_version, to_version (timestamp strings from watch history).
    Returns the patch as a downloadable .patch file — the same content fed to the LLM.
    """
    import difflib
    try:
        watch = datastore.data['watching'][uuid]
    except KeyError:
        return make_response('Watch not found', 404)
    dates = list(watch.history.keys())
    if len(dates) < 2:
        return make_response('Not enough history', 400)
    # Default to comparing the two newest snapshots.
    from_version = request.args.get('from_version', dates[-2])
    to_version = request.args.get('to_version', dates[-1])
    try:
        from_text = watch.get_history_snapshot(timestamp=from_version)
        to_text = watch.get_history_snapshot(timestamp=to_version)
    except Exception as e:
        return make_response(f'Could not read snapshots: {e}', 500)
    # keepends=True keeps each content line's own newline, and the default
    # lineterm='\n' terminates the ---/+++/@@ header lines too.
    # (Previously lineterm='' left the header lines unterminated, fusing them
    # with the following line once joined — a malformed patch.)
    diff_lines = list(difflib.unified_diff(
        from_text.splitlines(keepends=True),
        to_text.splitlines(keepends=True),
        fromfile=f'snapshot-{from_version}',
        tofile=f'snapshot-{to_version}',
    ))
    patch_text = ''.join(diff_lines) if diff_lines else '(no differences)\n'
    response = make_response(patch_text)
    response.headers['Content-Type'] = 'text/plain; charset=utf-8'
    # Prompt a file download (per the route's stated purpose) instead of inline display.
    response.headers['Content-Disposition'] = f'attachment; filename="changedetection-{uuid}-{from_version}-{to_version}.patch"'
    return response
@diff_blueprint.route("/diff/<uuid_str:uuid>/processor-asset/<string:asset_name>", methods=['GET'])
@login_optionally_required
def processor_asset(uuid, asset_name):
"""
Serve processor-specific binary assets (images, files, etc.).
This route is processor-aware: it delegates to the processor's
difference.py module, allowing different processor types to serve
custom assets without embedding them as base64 in templates.
This solves memory issues with large binary data (e.g., screenshots)
by streaming them as separate HTTP responses instead of embedding
in the HTML template.
Each processor implements processors/{type}/difference.py::get_asset()
which returns (binary_data, content_type, cache_control_header).
Example URLs:
- /diff/{uuid}/processor-asset/before
- /diff/{uuid}/processor-asset/after
- /diff/{uuid}/processor-asset/rendered_diff
"""
if uuid == 'first':
uuid = list(datastore.data['watching'].keys()).pop()
try:
watch = datastore.data['watching'][uuid]
except KeyError:
flash(gettext("No history found for the specified link, bad link?"), "error")
return redirect(url_for('watchlist.index'))
# Get the processor type for this watch
processor_name = watch.get('processor', 'text_json_diff')
# Try to get the processor's difference module (works for both built-in and plugin processors)
from changedetectionio.processors import get_processor_submodule
processor_module = get_processor_submodule(processor_name, 'difference')
# Call the processor's get_asset() function
if processor_module and hasattr(processor_module, 'get_asset'):
result = processor_module.get_asset(
asset_name=asset_name,
watch=watch,
datastore=datastore,
request=request
)
if result is None:
from flask import abort
abort(404, description=f"Asset '{asset_name}' not found")
binary_data, content_type, cache_control = result
response = make_response(binary_data)
response.headers['Content-Type'] = content_type
if cache_control:
response.headers['Cache-Control'] = cache_control
return response
else:
logger.warning(f"Processor {processor_name} does not implement get_asset()")
from flask import abort
abort(404, description=f"Processor '{processor_name}' does not support assets")
return diff_blueprint