"""
LLM evaluation orchestration.
Two public entry points:
- run_setup(watch, datastore) — one-time: decide if pre-filter needed
- evaluate_change(watch, datastore, diff, current_snapshot) — per-change evaluation
Intent resolution: watch.llm_intent → first tag with llm_intent → None (no evaluation)
Cache: each (intent, diff) pair is evaluated exactly once, result stored in watch.
Environment variable overrides (take priority over datastore settings):
LLM_MODEL — model string (e.g. "gpt-4o-mini", "ollama/llama3.2")
LLM_API_KEY — API key for cloud providers
LLM_API_BASE — base URL for local/custom endpoints (e.g. http://localhost:11434)
"""

import hashlib
import os

from loguru import logger

from . import client as llm_client
from .prompt_builder import (
    build_change_summary_prompt, build_change_summary_system_prompt,
    build_eval_prompt, build_eval_system_prompt,
    build_preview_prompt, build_preview_system_prompt,
    build_setup_prompt, build_setup_system_prompt,
)
from .response_parser import parse_eval_response, parse_preview_response, parse_setup_response

# AI Change Summary can produce longer output than eval responses
_MAX_SUMMARY_TOKENS = 500

# Default prompt used when the user hasn't configured llm_change_summary
DEFAULT_CHANGE_SUMMARY_PROMPT = "Briefly describe in plain English what changed — what was added, removed, or modified."

# ---------------------------------------------------------------------------
# Intent resolution
# ---------------------------------------------------------------------------
def resolve_llm_field(watch, datastore, field: str) -> tuple[str, str]:
    """
    Generic cascade resolver for any LLM per-watch field.
    Returns (value, source) where source is 'watch' or tag title.
    Returns ('', '') if not set anywhere.
    """
    value = (watch.get(field) or '').strip()
    if value:
        return value, 'watch'
    for tag_uuid in watch.get('tags', []):
        tag = datastore.data['settings']['application'].get('tags', {}).get(tag_uuid)
        if tag:
            tag_value = (tag.get(field) or '').strip()
            if tag_value:
                return tag_value, tag.get('title', 'tag')
    return '', ''
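
# Example cascade (hypothetical data): a watch with no value of its own falls
# back to the first tag that defines the field:
#   watch = {'llm_intent': '', 'tags': ['t1']}
#   tags['t1'] = {'title': 'Pricing', 'llm_intent': 'alert on price drops'}
#   resolve_llm_field(watch, datastore, 'llm_intent')
#   -> ('alert on price drops', 'Pricing')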

def resolve_intent(watch, datastore) -> tuple[str, str]:
    """
    Return (intent, source) where source is 'watch' or tag title.
    Returns ('', '') if no intent is configured anywhere.
    Thin wrapper around resolve_llm_field for the 'llm_intent' field.
    """
    return resolve_llm_field(watch, datastore, 'llm_intent')

# ---------------------------------------------------------------------------
# LLM config helper
# ---------------------------------------------------------------------------
def get_llm_config(datastore) -> dict | None:
    """
    Return LLM config dict or None if not configured.
    Resolution order (first non-empty model wins):
    1. Environment variables: LLM_MODEL, LLM_API_KEY, LLM_API_BASE
    2. Datastore settings (set via UI)
    """
    # 1. Environment variable override
    env_model = os.getenv('LLM_MODEL', '').strip()
    if env_model:
        return {
            'model': env_model,
            'api_key': os.getenv('LLM_API_KEY', '').strip(),
            'api_base': os.getenv('LLM_API_BASE', '').strip(),
        }
    # 2. Datastore settings
    cfg = datastore.data['settings']['application'].get('llm') or {}
    if not cfg.get('model'):
        return None
    return cfg
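
# Precedence example: with LLM_MODEL exported in the environment, any model
# saved via the UI is ignored entirely; with neither set, get_llm_config
# returns None and every caller skips its LLM step.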

def llm_configured_via_env() -> bool:
    """True when LLM config comes from environment variables, not the UI."""
    return bool(os.getenv('LLM_MODEL', '').strip())

# ---------------------------------------------------------------------------
# One-time setup: derive pre-filter
# ---------------------------------------------------------------------------
def _check_token_budget(watch, cfg, tokens_this_call: int = 0) -> bool:
    """
    Check token budget limits. Returns True if within budget, False if exceeded.
    Also accumulates tokens_this_call into watch['llm_tokens_used_cumulative'].
    """
    if tokens_this_call > 0:
        current = watch.get('llm_tokens_used_cumulative') or 0
        watch['llm_tokens_used_cumulative'] = current + tokens_this_call

    max_per_check = int(cfg.get('max_tokens_per_check') or 0)
    max_cumulative = int(cfg.get('max_tokens_cumulative') or 0)

    if max_per_check and tokens_this_call > max_per_check:
        logger.warning(
            f"LLM token budget exceeded for {watch.get('uuid')}: "
            f"{tokens_this_call} tokens > per-check limit {max_per_check}"
        )
        return False

    if max_cumulative:
        total = watch.get('llm_tokens_used_cumulative') or 0
        if total > max_cumulative:
            logger.warning(
                f"LLM cumulative token budget exceeded for {watch.get('uuid')}: "
                f"{total} tokens > limit {max_cumulative}"
            )
            return False

    return True
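
# Note: a call that exceeds the per-check limit is still added to the
# cumulative counter (the tokens were spent either way); evaluate_change
# consults the cumulative budget *before* calling the LLM and fails open
# (important=True) once it is exhausted.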

def run_setup(watch, datastore, snapshot_text: str) -> None:
    """
    Ask the LLM whether a CSS pre-filter would improve precision for this intent.
    Stores result in watch['llm_prefilter'] (str selector or None).
    Called once when intent is first set, and again if pre-filter returns zero matches.
    """
    cfg = get_llm_config(datastore)
    if not cfg:
        return
    intent, _ = resolve_intent(watch, datastore)
    if not intent:
        return

    url = watch.get('url', '')
    system_prompt = build_setup_system_prompt()
    user_prompt = build_setup_prompt(intent, snapshot_text, url=url)

    try:
        raw, tokens = llm_client.completion(
            model=cfg['model'],
            messages=[
                {'role': 'system', 'content': system_prompt},
                {'role': 'user', 'content': user_prompt},
            ],
            api_key=cfg.get('api_key'),
            api_base=cfg.get('api_base'),
        )
        _check_token_budget(watch, cfg, tokens)
        result = parse_setup_response(raw)
        watch['llm_prefilter'] = result['selector']
        logger.debug(f"LLM setup for {watch.get('uuid')}: prefilter={result['selector']} reason={result['reason']}")
    except Exception as e:
        logger.warning(f"LLM setup call failed for {watch.get('uuid')}: {e}")
        watch['llm_prefilter'] = None

# ---------------------------------------------------------------------------
# AI Change Summary — human-readable description of what changed
# ---------------------------------------------------------------------------
def get_effective_summary_prompt(watch, datastore) -> str:
    """Return the prompt that summarise_change will use — custom or the default fallback."""
    prompt, _ = resolve_llm_field(watch, datastore, 'llm_change_summary')
    return prompt or DEFAULT_CHANGE_SUMMARY_PROMPT


def compute_summary_cache_key(diff_text: str, prompt: str) -> str:
    """Stable 16-char hex key for a (diff, prompt) pair. Stored alongside the summary file."""
    h = hashlib.md5()
    h.update(diff_text.encode('utf-8', errors='replace'))
    h.update(b'\x00')
    h.update(prompt.encode('utf-8', errors='replace'))
    return h.hexdigest()[:16]
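
# The key is deterministic for a given (diff, prompt) pair, so re-rendering the
# same notification reuses the stored summary instead of paying for another
# completion; the NUL separator keeps ('ab', 'c') distinct from ('a', 'bc').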

def summarise_change(watch, datastore, diff: str, current_snapshot: str = '') -> str:
    """
    Generate a plain-language summary of the change using the watch's
    llm_change_summary prompt (cascades from tag if not set on watch).
    Returns the summary string, or '' on failure.

    The result replaces {{ diff }} in notifications so the user gets a
    readable description instead of raw +/- diff lines.
    """
    cfg = get_llm_config(datastore)
    if not cfg:
        return ''
    custom_prompt, _ = resolve_llm_field(watch, datastore, 'llm_change_summary')
    if not custom_prompt:
        custom_prompt = DEFAULT_CHANGE_SUMMARY_PROMPT
    if not diff.strip():
        return ''

    url = watch.get('url', '')
    title = watch.get('page_title') or watch.get('title') or ''
    system_prompt = build_change_summary_system_prompt()
    user_prompt = build_change_summary_prompt(
        diff=diff,
        custom_prompt=custom_prompt,
        current_snapshot=current_snapshot,
        url=url,
        title=title,
    )

    try:
        raw, tokens = llm_client.completion(
            model=cfg['model'],
            messages=[
                {'role': 'system', 'content': system_prompt},
                {'role': 'user', 'content': user_prompt},
            ],
            api_key=cfg.get('api_key'),
            api_base=cfg.get('api_base'),
            max_tokens=_MAX_SUMMARY_TOKENS,
        )
        summary = raw.strip()
        _check_token_budget(watch, cfg, tokens)
        watch['llm_last_tokens_used'] = (watch.get('llm_last_tokens_used') or 0) + tokens
        logger.debug(
            f"LLM change summary {watch.get('uuid')}: tokens={tokens} "
            f"summary={summary[:80]}"
        )
        return summary
    except Exception as e:
        logger.warning(f"LLM change summary failed for {watch.get('uuid')}: {e}")
        return ''
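
# Illustrative example (hypothetical diff): raw input such as
#   - Price: $199
#   + Price: $149
# renders in the notification as something like
#   "The listed price dropped from $199 to $149."
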
# ---------------------------------------------------------------------------
# Live-preview extraction (current content, no diff)
# ---------------------------------------------------------------------------
def preview_extract(watch, datastore, content: str) -> dict | None:
    """
    For the live-preview endpoint: extract relevant information from the
    *current* page content according to the watch's intent.

    Unlike evaluate_change (which compares a diff), this asks the LLM to
    directly answer the intent against the current snapshot — giving the user
    immediate feedback like "30 articles listed" or "Price: $149, 25% off".

    Returns {'found': bool, 'answer': str} or None if LLM not configured / no intent.
    """
    cfg = get_llm_config(datastore)
    if not cfg:
        return None
    intent, _ = resolve_intent(watch, datastore)
    if not intent or not content.strip():
        return None

    url = watch.get('url', '')
    title = watch.get('page_title') or watch.get('title') or ''
    system_prompt = build_preview_system_prompt()
    user_prompt = build_preview_prompt(intent, content, url=url, title=title)

    try:
        raw, tokens = llm_client.completion(
            model=cfg['model'],
            messages=[
                {'role': 'system', 'content': system_prompt},
                {'role': 'user', 'content': user_prompt},
            ],
            api_key=cfg.get('api_key'),
            api_base=cfg.get('api_base'),
        )
        result = parse_preview_response(raw)
        logger.debug(
            f"LLM preview {watch.get('uuid')}: found={result['found']} "
            f"tokens={tokens} answer={result['answer'][:80]}"
        )
        return result
    except Exception as e:
        logger.warning(f"LLM preview extraction failed for {watch.get('uuid')}: {e}")
        return None

# ---------------------------------------------------------------------------
# Per-change evaluation
# ---------------------------------------------------------------------------
def evaluate_change(watch, datastore, diff: str, current_snapshot: str = '') -> dict | None:
    """
    Evaluate whether `diff` matches the watch's intent.
    Returns {'important': bool, 'summary': str} or None if LLM not configured / no intent.
    Results are cached by (intent, diff) hash — each unique diff is evaluated exactly once.
    """
    cfg = get_llm_config(datastore)
    if not cfg:
        return None
    intent, source = resolve_intent(watch, datastore)
    if not intent:
        return None
    if not diff or not diff.strip():
        return {'important': False, 'summary': ''}

    # Cache lookup — evaluations are deterministic once cached
    cache_key = hashlib.sha256(f"{intent}||{diff}".encode()).hexdigest()
    cache = watch.get('llm_evaluation_cache') or {}
    if cache_key in cache:
        logger.debug(f"LLM cache hit for {watch.get('uuid')} key={cache_key[:8]}")
        return cache[cache_key]

    # Check cumulative budget before making the call
    if not _check_token_budget(watch, cfg):
        # Already over budget — fail open (don't suppress notification)
        return {'important': True, 'summary': ''}

    url = watch.get('url', '')
    title = watch.get('page_title') or watch.get('title') or ''
    system_prompt = build_eval_system_prompt()
    user_prompt = build_eval_prompt(
        intent=intent,
        diff=diff,
        current_snapshot=current_snapshot,
        url=url,
        title=title,
    )

    try:
        raw, tokens = llm_client.completion(
            model=cfg['model'],
            messages=[
                {'role': 'system', 'content': system_prompt},
                {'role': 'user', 'content': user_prompt},
            ],
            api_key=cfg.get('api_key'),
            api_base=cfg.get('api_base'),
        )
        result = parse_eval_response(raw)
    except Exception as e:
        logger.warning(f"LLM evaluation failed for {watch.get('uuid')}: {e}")
        # On failure: don't suppress the notification — pass through as important
        watch['llm_last_tokens_used'] = 0
        return {'important': True, 'summary': ''}

    # Accumulate token usage and enforce per-check limit
    _check_token_budget(watch, cfg, tokens)
    watch['llm_last_tokens_used'] = tokens

    # Store in cache
    if 'llm_evaluation_cache' not in watch or watch['llm_evaluation_cache'] is None:
        watch['llm_evaluation_cache'] = {}
    watch['llm_evaluation_cache'][cache_key] = result

    logger.debug(
        f"LLM eval {watch.get('uuid')} (intent from {source}): "
        f"important={result['important']} tokens={tokens} summary={result['summary'][:80]}"
    )
    return result