changedetection.io/changedetectionio/llm/response_parser.py

"""
Parse and validate LLM JSON responses.
Pure functions — no side effects, fully testable.

LLMs occasionally return JSON wrapped in markdown fences or with trailing
text. This module handles those cases gracefully.
"""

import json
import re

# Positional selectors are fragile — reject them even if the LLM generates them
_POSITIONAL_SELECTOR_RE = re.compile(
    r'nth-child|nth-of-type|:eq\(|\[\d+\]|\/\/\*\[\d',
    re.IGNORECASE
)


def _extract_json(raw: str) -> str:
    """Strip markdown fences and extract the first JSON object."""
    raw = raw.strip()
    # Remove ```json ... ``` or ``` ... ``` fences
    raw = re.sub(r'^```(?:json)?\s*', '', raw, flags=re.MULTILINE)
    raw = re.sub(r'\s*```$', '', raw, flags=re.MULTILINE)
    # Find the first { ... } block
    match = re.search(r'\{.*\}', raw, re.DOTALL)
    return match.group(0) if match else raw


def parse_eval_response(raw: str) -> dict:
    """
    Parse a diff evaluation response.
    Returns {'important': bool, 'summary': str}.
    Falls back to important=False on any parse error.
    """
    try:
        data = json.loads(_extract_json(raw))
        return {
            'important': bool(data.get('important', False)),
            'summary': str(data.get('summary', '')).strip(),
        }
    except (json.JSONDecodeError, AttributeError):
        return {'important': False, 'summary': ''}


def parse_preview_response(raw: str) -> dict:
    """
    Parse a live-preview extraction response.
    Returns {'found': bool, 'answer': str}.
    Falls back to found=False on any parse error.
    """
    try:
        data = json.loads(_extract_json(raw))
        return {
            'found': bool(data.get('found', False)),
            'answer': str(data.get('answer', '')).strip(),
        }
    except (json.JSONDecodeError, AttributeError):
        return {'found': False, 'answer': ''}


def parse_setup_response(raw: str) -> dict:
    """
    Parse a setup/pre-filter decision response.
    Returns {'needs_prefilter': bool, 'selector': str|None, 'reason': str}.
    Rejects positional selectors even if the LLM generates them.
    """
    try:
        data = json.loads(_extract_json(raw))
        needs = bool(data.get('needs_prefilter', False))
        selector = data.get('selector') or None

        # Sanitise: reject positional selectors
        if selector and _POSITIONAL_SELECTOR_RE.search(selector):
            selector = None
            needs = False

        return {
            'needs_prefilter': needs,
            'selector': selector if needs else None,
            'reason': str(data.get('reason', '')).strip(),
        }
    except (json.JSONDecodeError, AttributeError):
        return {'needs_prefilter': False, 'selector': None, 'reason': ''}