"""
LLM evaluation orchestration.
Two public entry points:
- run_setup(watch, datastore) — one-time: decide if pre-filter needed
- evaluate_change(watch, datastore, diff, current_snapshot) — per-change evaluation
Intent resolution: watch.llm_intent → first tag with llm_intent → None (no evaluation)
Cache: each (intent, diff) pair is evaluated exactly once, result stored in watch.
Environment variable overrides (take priority over datastore settings):
LLM_MODEL — model string (e.g. "gpt-4o-mini", "ollama/llama3.2")
LLM_API_KEY — API key for cloud providers
LLM_API_BASE — base URL for local/custom endpoints (e.g. http://localhost:11434)
"""

import hashlib
import os

from loguru import logger

from . import client as llm_client
from .prompt_builder import (
    build_change_summary_prompt, build_change_summary_system_prompt,
    build_eval_prompt, build_eval_system_prompt,
    build_preview_prompt, build_preview_system_prompt,
    build_setup_prompt, build_setup_system_prompt,
)
from .response_parser import parse_eval_response, parse_preview_response, parse_setup_response

# AI Change Summary can produce longer output than eval responses
_MAX_SUMMARY_TOKENS = 500

# Default prompt used when the user hasn't configured llm_change_summary
DEFAULT_CHANGE_SUMMARY_PROMPT = "Briefly describe in plain English what changed — what was added, removed, or modified."

# ---------------------------------------------------------------------------
# Intent resolution
# ---------------------------------------------------------------------------
def resolve_llm_field(watch, datastore, field: str) -> tuple[str, str]:
    """
    Generic cascade resolver for any LLM per-watch field.
    Returns (value, source) where source is 'watch' or tag title.
    Returns ('', '') if not set anywhere.
    """
    value = (watch.get(field) or '').strip()
    if value:
        return value, 'watch'
    for tag_uuid in watch.get('tags', []):
        tag = datastore.data['settings']['application'].get('tags', {}).get(tag_uuid)
        if tag:
            tag_value = (tag.get(field) or '').strip()
            if tag_value:
                return tag_value, tag.get('title', 'tag')
    return '', ''
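
# Example cascade (hypothetical data): a watch with no value of its own falls
# back to the first tag that defines the field:
#   watch = {'llm_intent': '', 'tags': ['t1']}
#   tags['t1'] = {'title': 'Pricing', 'llm_intent': 'alert on price drops'}
#   resolve_llm_field(watch, datastore, 'llm_intent')
#   -> ('alert on price drops', 'Pricing')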

def resolve_intent(watch, datastore) -> tuple[str, str]:
    """
    Return (intent, source) where source is 'watch' or tag title.
    Returns ('', '') if no intent is configured anywhere.
    Thin wrapper around resolve_llm_field for the 'llm_intent' field.
    """
    return resolve_llm_field(watch, datastore, 'llm_intent')

# ---------------------------------------------------------------------------
# LLM config helper
# ---------------------------------------------------------------------------
def get_llm_config(datastore) -> dict | None:
    """
    Return LLM config dict or None if not configured.
    Resolution order (first non-empty model wins):
    1. Environment variables: LLM_MODEL, LLM_API_KEY, LLM_API_BASE
    2. Datastore settings (set via UI)
    """
    # 1. Environment variable override
    env_model = os.getenv('LLM_MODEL', '').strip()
    if env_model:
        return {
            'model': env_model,
            'api_key': os.getenv('LLM_API_KEY', '').strip(),
            'api_base': os.getenv('LLM_API_BASE', '').strip(),
        }
    # 2. Datastore settings
    cfg = datastore.data['settings']['application'].get('llm') or {}
    if not cfg.get('model'):
        return None
    return cfg
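
# Precedence example: with LLM_MODEL exported in the environment, any model
# saved via the UI is ignored entirely; with neither set, get_llm_config
# returns None and every caller skips its LLM step.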

def llm_configured_via_env() -> bool:
    """True when LLM config comes from environment variables, not the UI."""
    return bool(os.getenv('LLM_MODEL', '').strip())

# ---------------------------------------------------------------------------
# One-time setup: derive pre-filter
# ---------------------------------------------------------------------------
def _check_token_budget(watch, cfg, tokens_this_call: int = 0) -> bool:
    """
    Check token budget limits. Returns True if within budget, False if exceeded.
    Also accumulates tokens_this_call into watch['llm_tokens_used_cumulative'].
    """
    if tokens_this_call > 0:
        current = watch.get('llm_tokens_used_cumulative') or 0
        watch['llm_tokens_used_cumulative'] = current + tokens_this_call

    max_per_check = int(cfg.get('max_tokens_per_check') or 0)
    max_cumulative = int(cfg.get('max_tokens_cumulative') or 0)

    if max_per_check and tokens_this_call > max_per_check:
        logger.warning(
            f"LLM token budget exceeded for {watch.get('uuid')}: "
            f"{tokens_this_call} tokens > per-check limit {max_per_check}"
        )
        return False

    if max_cumulative:
        total = watch.get('llm_tokens_used_cumulative') or 0
        if total > max_cumulative:
            logger.warning(
                f"LLM cumulative token budget exceeded for {watch.get('uuid')}: "
                f"{total} tokens > limit {max_cumulative}"
            )
            return False

    return True
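
# Note: a call that exceeds the per-check limit is still added to the
# cumulative counter (the tokens were spent either way); evaluate_change
# consults the cumulative budget *before* calling the LLM and fails open
# (important=True) once it is exhausted.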

def run_setup(watch, datastore, snapshot_text: str) -> None:
    """
    Ask the LLM whether a CSS pre-filter would improve precision for this intent.
    Stores result in watch['llm_prefilter'] (str selector or None).
    Called once when intent is first set, and again if pre-filter returns zero matches.
    """
    cfg = get_llm_config(datastore)
    if not cfg:
        return
    intent, _ = resolve_intent(watch, datastore)
    if not intent:
        return

    url = watch.get('url', '')
    system_prompt = build_setup_system_prompt()
    user_prompt = build_setup_prompt(intent, snapshot_text, url=url)

    try:
        raw, tokens = llm_client.completion(
            model=cfg['model'],
            messages=[
                {'role': 'system', 'content': system_prompt},
                {'role': 'user', 'content': user_prompt},
            ],
            api_key=cfg.get('api_key'),
            api_base=cfg.get('api_base'),
        )
        _check_token_budget(watch, cfg, tokens)
        result = parse_setup_response(raw)
        watch['llm_prefilter'] = result['selector']
        logger.debug(f"LLM setup for {watch.get('uuid')}: prefilter={result['selector']} reason={result['reason']}")
    except Exception as e:
        logger.warning(f"LLM setup call failed for {watch.get('uuid')}: {e}")
        watch['llm_prefilter'] = None

# ---------------------------------------------------------------------------
# AI Change Summary — human-readable description of what changed
# ---------------------------------------------------------------------------
def get_effective_summary_prompt(watch, datastore) -> str:
    """Return the prompt that summarise_change will use — custom or the default fallback."""
    prompt, _ = resolve_llm_field(watch, datastore, 'llm_change_summary')
    return prompt or DEFAULT_CHANGE_SUMMARY_PROMPT


def compute_summary_cache_key(diff_text: str, prompt: str) -> str:
    """Stable 16-char hex key for a (diff, prompt) pair. Stored alongside the summary file."""
    h = hashlib.md5()
    h.update(diff_text.encode('utf-8', errors='replace'))
    h.update(b'\x00')
    h.update(prompt.encode('utf-8', errors='replace'))
    return h.hexdigest()[:16]
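
# The key is deterministic for a given (diff, prompt) pair, so re-rendering the
# same notification reuses the stored summary instead of paying for another
# completion; the NUL separator keeps ('ab', 'c') distinct from ('a', 'bc').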

def summarise_change(watch, datastore, diff: str, current_snapshot: str = '') -> str:
    """
    Generate a plain-language summary of the change using the watch's
    llm_change_summary prompt (cascades from tag if not set on watch).
    Returns the summary string, or '' on failure.

    The result replaces {{ diff }} in notifications so the user gets a
    readable description instead of raw +/- diff lines.
    """
    cfg = get_llm_config(datastore)
    if not cfg:
        return ''
    custom_prompt, _ = resolve_llm_field(watch, datastore, 'llm_change_summary')
    if not custom_prompt:
        custom_prompt = DEFAULT_CHANGE_SUMMARY_PROMPT
    if not diff.strip():
        return ''

    url = watch.get('url', '')
    title = watch.get('page_title') or watch.get('title') or ''
    system_prompt = build_change_summary_system_prompt()
    user_prompt = build_change_summary_prompt(
        diff=diff,
        custom_prompt=custom_prompt,
        current_snapshot=current_snapshot,
        url=url,
        title=title,
    )

    try:
        raw, tokens = llm_client.completion(
            model=cfg['model'],
            messages=[
                {'role': 'system', 'content': system_prompt},
                {'role': 'user', 'content': user_prompt},
            ],
            api_key=cfg.get('api_key'),
            api_base=cfg.get('api_base'),
            max_tokens=_MAX_SUMMARY_TOKENS,
        )
        summary = raw.strip()
        _check_token_budget(watch, cfg, tokens)
        watch['llm_last_tokens_used'] = (watch.get('llm_last_tokens_used') or 0) + tokens
        logger.debug(
            f"LLM change summary {watch.get('uuid')}: tokens={tokens} "
            f"summary={summary[:80]}"
        )
        return summary
    except Exception as e:
        logger.warning(f"LLM change summary failed for {watch.get('uuid')}: {e}")
        return ''
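
# Illustrative example (hypothetical diff): raw input such as
#   - Price: $199
#   + Price: $149
# renders in the notification as something like
#   "The listed price dropped from $199 to $149."
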
# ---------------------------------------------------------------------------
# Live-preview extraction (current content, no diff)
# ---------------------------------------------------------------------------
def preview_extract(watch, datastore, content: str) -> dict | None:
    """
    For the live-preview endpoint: extract relevant information from the
    *current* page content according to the watch's intent.

    Unlike evaluate_change (which compares a diff), this asks the LLM to
    directly answer the intent against the current snapshot — giving the user
    immediate feedback like "30 articles listed" or "Price: $149, 25% off".

    Returns {'found': bool, 'answer': str} or None if LLM not configured / no intent.
    """
    cfg = get_llm_config(datastore)
    if not cfg:
        return None
    intent, _ = resolve_intent(watch, datastore)
    if not intent or not content.strip():
        return None

    url = watch.get('url', '')
    title = watch.get('page_title') or watch.get('title') or ''
    system_prompt = build_preview_system_prompt()
    user_prompt = build_preview_prompt(intent, content, url=url, title=title)

    try:
        raw, tokens = llm_client.completion(
            model=cfg['model'],
            messages=[
                {'role': 'system', 'content': system_prompt},
                {'role': 'user', 'content': user_prompt},
            ],
            api_key=cfg.get('api_key'),
            api_base=cfg.get('api_base'),
        )
        result = parse_preview_response(raw)
        logger.debug(
            f"LLM preview {watch.get('uuid')}: found={result['found']} "
            f"tokens={tokens} answer={result['answer'][:80]}"
        )
        return result
    except Exception as e:
        logger.warning(f"LLM preview extraction failed for {watch.get('uuid')}: {e}")
        return None

# ---------------------------------------------------------------------------
# Per-change evaluation
# ---------------------------------------------------------------------------
def evaluate_change(watch, datastore, diff: str, current_snapshot: str = '') -> dict | None:
    """
    Evaluate whether `diff` matches the watch's intent.
    Returns {'important': bool, 'summary': str} or None if LLM not configured / no intent.
    Results are cached by (intent, diff) hash — each unique diff is evaluated exactly once.
    """
    cfg = get_llm_config(datastore)
    if not cfg:
        return None
    intent, source = resolve_intent(watch, datastore)
    if not intent:
        return None
    if not diff or not diff.strip():
        return {'important': False, 'summary': ''}

    # Cache lookup — evaluations are deterministic once cached
    cache_key = hashlib.sha256(f"{intent}||{diff}".encode()).hexdigest()
    cache = watch.get('llm_evaluation_cache') or {}
    if cache_key in cache:
        logger.debug(f"LLM cache hit for {watch.get('uuid')} key={cache_key[:8]}")
        return cache[cache_key]

    # Check cumulative budget before making the call
    if not _check_token_budget(watch, cfg):
        # Already over budget — fail open (don't suppress notification)
        return {'important': True, 'summary': ''}

    url = watch.get('url', '')
    title = watch.get('page_title') or watch.get('title') or ''
    system_prompt = build_eval_system_prompt()
    user_prompt = build_eval_prompt(
        intent=intent,
        diff=diff,
        current_snapshot=current_snapshot,
        url=url,
        title=title,
    )

    try:
        raw, tokens = llm_client.completion(
            model=cfg['model'],
            messages=[
                {'role': 'system', 'content': system_prompt},
                {'role': 'user', 'content': user_prompt},
            ],
            api_key=cfg.get('api_key'),
            api_base=cfg.get('api_base'),
        )
        result = parse_eval_response(raw)
    except Exception as e:
        logger.warning(f"LLM evaluation failed for {watch.get('uuid')}: {e}")
        # On failure: don't suppress the notification — pass through as important
        watch['llm_last_tokens_used'] = 0
        return {'important': True, 'summary': ''}

    # Accumulate token usage and enforce per-check limit
    _check_token_budget(watch, cfg, tokens)
    watch['llm_last_tokens_used'] = tokens

    # Store in cache
    if 'llm_evaluation_cache' not in watch or watch['llm_evaluation_cache'] is None:
        watch['llm_evaluation_cache'] = {}
    watch['llm_evaluation_cache'][cache_key] = result

    logger.debug(
        f"LLM eval {watch.get('uuid')} (intent from {source}): "
        f"important={result['important']} tokens={tokens} summary={result['summary'][:80]}"
    )
    return result