Files
changedetection.io/changedetectionio/llm/bm25_trim.py
T
dgtlmoon c2f06f574b WIP
2026-04-16 14:41:30 +02:00

53 lines
1.6 KiB
Python

"""
BM25-based relevance trimming for large snapshot text.
When a snapshot is large and no CSS pre-filter has narrowed it down,
we use BM25 to select the lines most relevant to the user's intent
before sending to the LLM. This keeps the context focused instead of
cutting the text off at an arbitrary character offset.
Pure functions — no side effects, fully testable.
"""
# Default character budget for trim_to_relevant(); text at or below this
# length is returned unchanged.
MAX_CONTEXT_CHARS = 15_000


def trim_to_relevant(text: str, query: str, max_chars: int = MAX_CONTEXT_CHARS) -> str:
    """
    Return the lines from `text` most relevant to `query`, up to `max_chars`.

    Blank lines are dropped, the remaining lines are ranked with BM25 against
    the whitespace-tokenised, lower-cased query, and the best-scoring lines
    that fit in the budget are returned in their original document order,
    joined with newlines.

    Args:
        text: Snapshot text to trim. Falsy input yields ''.
        query: The user's intent. If falsy, `text` is returned unchanged.
        max_chars: Maximum length of the returned string. Negative values
            are treated as 0.

    Returns:
        `text` unchanged if it already fits within `max_chars`; otherwise the
        selected lines. Falls back to head-truncation (`text[:max_chars]`)
        when `rank_bm25` is not installed, when the text has no non-blank
        lines, or when no single line fits in the budget.
    """
    if not text or not query:
        return text or ''

    if len(text) <= max_chars:
        return text

    # A negative budget would make text[:max_chars] slice from the END of the
    # string and return nearly everything, so clamp it.
    budget = max(max_chars, 0)

    lines = [line for line in text.splitlines() if line.strip()]
    if not lines:
        return text[:budget]

    try:
        from rank_bm25 import BM25Okapi
    except ImportError:
        # rank-bm25 not installed — fall back to simple head truncation
        return text[:budget]

    tokenized = [line.lower().split() for line in lines]
    bm25 = BM25Okapi(tokenized)
    scores = bm25.get_scores(query.lower().split())

    # Highest score first. sorted() is stable, so equally scored lines keep
    # their document order.
    by_relevance = sorted(range(len(lines)), key=lambda i: scores[i], reverse=True)

    selected_indices, total = [], 0
    for idx in by_relevance:
        cost = len(lines[idx]) + 1  # +1 accounts for the joining newline
        if total + cost > budget:
            # Skip rather than stop: one over-long line must not prevent
            # shorter relevant lines from using the remaining budget.
            continue
        selected_indices.append(idx)
        total += cost

    if not selected_indices:
        # Every line is larger than the budget (e.g. a single minified line).
        # Returning '' would discard the snapshot, so head-truncate instead.
        return text[:budget]

    # Re-order selected lines to preserve original document order
    selected_indices.sort()
    return '\n'.join(lines[i] for i in selected_indices)