mirror of
https://github.com/dgtlmoon/changedetection.io.git
synced 2026-04-30 06:47:03 +00:00
e9e8c8d218
Build and push containers / metadata (push) Has been cancelled
Build and push containers / build-push-containers (push) Has been cancelled
Publish Python 🐍distribution 📦 to PyPI and TestPyPI / Build distribution 📦 (push) Has been cancelled
ChangeDetection.io Container Build Test / Build linux/amd64 (alpine) (push) Has been cancelled
ChangeDetection.io Container Build Test / Build linux/arm64 (alpine) (push) Has been cancelled
ChangeDetection.io Container Build Test / Build linux/amd64 (main) (push) Has been cancelled
ChangeDetection.io Container Build Test / Build linux/arm/v7 (main) (push) Has been cancelled
ChangeDetection.io Container Build Test / Build linux/arm/v8 (main) (push) Has been cancelled
ChangeDetection.io Container Build Test / Build linux/arm64 (main) (push) Has been cancelled
ChangeDetection.io App Test / lint-code (push) Has been cancelled
ChangeDetection.io App Test / lint-translations (push) Has been cancelled
Publish Python 🐍distribution 📦 to PyPI and TestPyPI / Test the built package works basically. (push) Has been cancelled
Publish Python 🐍distribution 📦 to PyPI and TestPyPI / Publish Python 🐍 distribution 📦 to PyPI (push) Has been cancelled
ChangeDetection.io App Test / test-application-3-10 (push) Has been cancelled
ChangeDetection.io App Test / test-application-3-11 (push) Has been cancelled
ChangeDetection.io App Test / test-application-3-12 (push) Has been cancelled
ChangeDetection.io App Test / test-application-3-13 (push) Has been cancelled
ChangeDetection.io App Test / test-application-3-14 (push) Has been cancelled
213 lines
10 KiB
Python
213 lines
10 KiB
Python
"""
|
|
Prompt construction for LLM evaluation calls.
|
|
Pure functions — no side effects, fully testable.
|
|
"""
|
|
|
|
import re
|
|
|
|
from .bm25_trim import trim_to_relevant
|
|
|
|
# Matches a line that consists solely of a relative timestamp of the form
# "<digits> <word> ago" (e.g. "3 hours ago"), case-insensitively.
_AGO_RE = re.compile(r'^\d+\s+\w+\s+ago$', re.IGNORECASE)

SNAPSHOT_CONTEXT_CHARS = 3_000  # current page state excerpt sent alongside the diff


def _annotate_moved_lines(diff_text: str) -> str:
    """
    Pre-process a unified diff so that trivial +/- lines are re-prefixed with '~'.

    A line is considered trivial when either:
      * its text appears on both the + and - sides of the diff (content that was
        moved/reordered rather than genuinely added or removed), or
      * it is a standalone relative timestamp matching _AGO_RE ("3 hours ago").

    Lines are compared after stripping the leading +/- marker and surrounding
    whitespace, and case-insensitively, so indentation or casing changes don't
    prevent matching. Matching is set-based: every occurrence of a text that
    exists on both sides is marked.

    Timestamp-only lines are annotated even when no moved lines exist; previously
    an early return skipped them whenever nothing had been moved.

    :param diff_text: the diff as a single string.
    :return: diff_text unchanged when nothing was annotated; otherwise the lines
             re-joined with '\\n', with each trivial line's +/- marker replaced by '~'.
    """
    lines = diff_text.splitlines()

    # Collect the normalised, non-blank text seen on each side of the diff.
    added_texts = set()
    removed_texts = set()
    for line in lines:
        bare = line[1:].strip().lower()
        if not bare:
            continue
        if line.startswith('+'):
            added_texts.add(bare)
        elif line.startswith('-'):
            removed_texts.add(bare)
    moved_texts = added_texts & removed_texts

    result = []
    annotated = False
    for line in lines:
        if line.startswith(('+', '-')):
            content = line[1:].strip()
            if content.lower() in moved_texts or _AGO_RE.match(content):
                # ~ prefix = moved/reordered/trivial; keep the original text after the marker
                result.append(f'~{line[1:]}')
                annotated = True
                continue
        result.append(line)

    # Hand back the caller's exact string when there was nothing to mark.
    if not annotated:
        return diff_text
    return '\n'.join(result)
|
|
|
|
|
|
def build_eval_prompt(intent: str, diff: str, current_snapshot: str = '',
                      url: str = '', title: str = '') -> str:
    """
    Assemble the user message for a diff evaluation call.

    Sections appear in a fixed order and are joined with newlines: optional URL,
    optional page title, the intent, an optional excerpt of the current page
    (obtained from trim_to_relevant with max_chars=SNAPSHOT_CONTEXT_CHARS), and
    finally the diff itself. The system prompt lives separately in
    build_eval_system_prompt.
    """
    sections = [
        text
        for value, text in (
            (url, f"URL: {url}"),
            (title, f"Page title: {title}"),
        )
        if value
    ]
    sections.append(f"Intent: {intent}")

    # Only consult the trimmer when there is a snapshot to trim.
    relevant = (
        trim_to_relevant(current_snapshot, intent, max_chars=SNAPSHOT_CONTEXT_CHARS)
        if current_snapshot
        else ''
    )
    if relevant:
        sections.append(f"\nCurrent page state (relevant excerpt):\n{relevant}")

    sections.append(f"\nWhat changed (diff):\n{diff}")
    return '\n'.join(sections)
|
|
|
|
|
|
def build_eval_system_prompt() -> str:
    """
    Return the fixed system prompt for diff evaluation calls.

    The text covers, in order: the evaluator role, the +/-/space diff legend,
    the required JSON reply shape ({"important": ..., "summary": ...}) and the
    decision rules.
    """
    role = (
        "You are a precise, reliable website-change evaluator for a monitoring tool.\n"
        "Your job is to read a unified diff and decide whether it matches a user's stated intent.\n"
        "Accuracy is critical — false positives waste the user's attention; false negatives miss what they care about.\n\n"
    )

    diff_legend = "Diff format:\n" + "\n".join((
        "- Lines starting with '+' are newly ADDED content",
        "- Lines starting with '-' are REMOVED content",
        "- Lines starting with ' ' (space) are unchanged context",
    )) + "\n\n"

    reply_format = (
        "Respond with ONLY a JSON object — no markdown, no explanation outside it:\n"
        '{"important": true/false, "summary": "one sentence describing the relevant change, or why it doesn\'t match"}\n\n'
    )

    # One entry per rule; the final rule carries no trailing newline.
    rules = "Rules:\n" + "\n".join((
        "- important=true ONLY when the diff clearly and specifically matches the intent — be strict",
        "- Pay close attention to direction: an intent about price drops means removed (-) prices and added (+) lower prices",
        "- Empty, trivial, or cosmetic diffs (timestamps, counters, whitespace, navigation) → important=false",
        (
            "- If the same text appears in both removed (-) and added (+) lines the content has likely just "
            "shifted or been reordered. Treat pure reordering as important=false unless the intent "
            "explicitly asks about order or position."
        ),
        "- Use OR logic when the intent lists multiple triggers — any one matching is sufficient",
        "- When uncertain whether a change truly matches, prefer important=false and explain why in the summary",
        "- Summary must be in the same language as the intent",
        "- If important=false, the summary must clearly explain what changed and why it does not match",
    ))

    return role + diff_legend + reply_format + rules
|
|
|
|
|
|
def build_preview_prompt(intent: str, content: str, url: str = '', title: str = '',
                         *, max_content_chars: int = 6_000) -> str:
    """
    Build the user message for a live-preview extraction call.

    Unlike build_eval_prompt (which analyses a diff), this asks the LLM to
    extract relevant information from the *current* page content — giving the
    user a direct answer to their intent so they can verify it makes sense
    before saving.

    :param intent: the user's intent or question.
    :param content: current page content; only its leading characters are sent.
    :param url: optional page URL, emitted first when non-empty.
    :param title: optional page title, emitted after the URL when non-empty.
    :param max_content_chars: keyword-only cap on how many leading characters of
        content are included (plain prefix slice). Defaults to 6_000, the
        previously hard-coded limit.
    :return: the sections joined with newlines.
    """
    parts = []
    if url:
        parts.append(f"URL: {url}")
    if title:
        parts.append(f"Page title: {title}")
    parts.append(f"Intent / question: {intent}")
    # Leading "\n" puts a blank line between the header fields and the content.
    parts.append(f"\nPage content:\n{content[:max_content_chars]}")
    return '\n'.join(parts)
|
|
|
|
|
|
def build_preview_system_prompt() -> str:
    """
    Return the fixed system prompt for live-preview extraction calls.

    The text describes the analyst role, the required JSON reply shape
    ({"found": ..., "answer": ...}) and the rules for filling it in.
    """
    task = (
        "You are a precise, detail-oriented web page content analyst for a website monitoring tool.\n"
        "Given the user's intent or question and the current page content, extract and directly answer "
        "what the intent is looking for. Never guess or paraphrase — report only what the page actually contains.\n\n"
    )

    reply_format = (
        "Respond with ONLY a JSON object — no markdown, no explanation outside it:\n"
        '{"found": true/false, "answer": "concise direct answer or extraction"}\n\n'
    )

    # One entry per rule; the final rule carries no trailing newline.
    rules = "Rules:\n" + "\n".join((
        "- found=true when the page clearly contains something relevant to the intent",
        (
            "- answer must directly address the intent with specific values where possible "
            "(e.g. for 'current price?' → '$149.99', not 'a price is shown')"
        ),
        "- answer must be in the same language as the intent",
        "- Keep answer brief — one or two sentences maximum",
        "- If found=false, briefly state what the page contains instead",
    ))

    return task + reply_format + rules
|
|
|
|
|
|
def build_change_summary_prompt(diff: str, custom_prompt: str,
                                current_snapshot: str = '', url: str = '', title: str = '') -> str:
    """
    Assemble the user message for an AI Change Summary call.

    The caller's own instructions (custom_prompt) are wrapped with optional
    page context: URL, page title, and an excerpt of the current page obtained
    from trim_to_relevant with max_chars=2_000. The diff is appended last,
    after being passed through _annotate_moved_lines.
    """
    sections = [
        text
        for value, text in (
            (url, f"URL: {url}"),
            (title, f"Page title: {title}"),
        )
        if value
    ]
    sections.append(f"Instructions: {custom_prompt}")

    # Only consult the trimmer when there is a snapshot to trim.
    relevant = (
        trim_to_relevant(current_snapshot, custom_prompt, max_chars=2_000)
        if current_snapshot
        else ''
    )
    if relevant:
        sections.append(f"\nCurrent page (excerpt):\n{relevant}")

    annotated_diff = _annotate_moved_lines(diff)
    sections.append(f"\nWhat changed (diff):\n{annotated_diff}")
    return '\n'.join(sections)
|
|
|
|
|
|
def build_change_summary_system_prompt() -> str:
    """
    Return the fixed system prompt for AI Change Summary calls.

    The text covers, in order: the summariser role, how to read +, - and ~
    diff lines, guidance for content-heavy pages and large text blocks, the
    Added / Changed / Removed response structure, and the output constraints.
    """
    role = (
        "You are a meticulous, accurate summariser of website changes for monitoring notifications.\n"
        "Your goal is to describe exactly what changed — never omit significant details, "
        "never add information that isn't in the diff, and never speculate.\n\n"
    )

    reading_rules = "".join([
        "Rules for reading the diff:\n",
        "- Lines starting with + are genuinely new content. List them specifically.\n",
        "- Lines starting with - are genuinely removed content. List them specifically.\n",
        (
            "- Lines starting with ~ have been PRE-IDENTIFIED as moved/reordered or trivial — "
            "the same text exists on both sides of the diff, or the line is a standalone timestamp. "
            "Do NOT report ~ lines as added or removed. "
            "If many ~ lines exist, note briefly that some content was reordered.\n"
        ),
        (
            "- Never list standalone timestamps like '3 hours ago', 'Yesterday', '2 minutes ago' "
            "as added or removed items — they are not meaningful content changes.\n"
        ),
        (
            "For content-heavy pages (news, listings, feeds): quote or paraphrase the specific new "
            "headlines, items, or entries that were added — do not collapse them into vague phrases "
            "like 'new articles were added' or 'section was expanded'.\n"
        ),
        (
            "For large blocks of new text (full articles, documents, long paragraphs): briefly summarise "
            "the substance in 1-2 sentences capturing the key point — do not just repeat the title.\n\n"
        ),
    ])

    layout = "".join([
        (
            "Structure your response using these sections, in this fixed order — "
            "omit a section entirely if there is nothing to report for it:\n"
        ),
        " Added: ...\n",
        " Changed: ...\n",
        " Removed: ...\n",
        "The Removed section MUST always be last. Never place removals before additions or changes.\n\n",
    ])

    output_constraints = (
        "Follow the user's formatting instructions exactly for structure, language, and length.\n"
        "Respond with ONLY the summary text — no JSON, no markdown code fences, no preamble. "
        "Just the description."
    )

    return role + reading_rules + layout + output_constraints
|
|
|
|
|
|
def build_setup_prompt(intent: str, snapshot_text: str, url: str = '') -> str:
    """
    Assemble the user message for the one-time setup call that decides
    whether a CSS pre-filter would improve evaluation precision.

    The message holds the optional URL, the intent, and an excerpt of the
    page obtained from trim_to_relevant with max_chars=4_000. The excerpt
    section is always emitted, whatever the trimmer returns.
    """
    relevant = trim_to_relevant(snapshot_text, intent, max_chars=4_000)

    sections = [f"URL: {url}"] if url else []
    sections.extend((
        f"Intent: {intent}",
        f"\nPage content excerpt:\n{relevant}",
    ))
    return '\n'.join(sections)
|
|
|
|
|
|
def build_setup_system_prompt() -> str:
    """
    Return the fixed system prompt for the one-time setup call.

    The text asks the model to decide whether a CSS pre-filter is worthwhile
    and to reply with a JSON object holding needs_prefilter, selector and
    reason, followed by the rules constraining that decision.
    """
    task = (
        "You help configure a website change monitor.\n"
        "Given a monitoring intent and a sample of the page content, decide if a CSS pre-filter "
        "would improve evaluation precision by scoping the content to a specific structural section.\n\n"
    )

    reply_format = (
        "Respond with ONLY a JSON object:\n"
        '{"needs_prefilter": true/false, "selector": "CSS selector or null", "reason": "one sentence"}\n\n'
    )

    # One entry per rule; the final rule carries no trailing newline.
    rules = "Rules:\n" + "\n".join((
        (
            "- Only recommend a pre-filter when the intent references a specific structural section "
            "(e.g. 'footer', 'sidebar', 'nav', 'header', 'main', 'article') OR the page clearly "
            "has high-noise sections unrelated to the intent"
        ),
        (
            "- Use ONLY semantic element selectors: footer, nav, header, main, article, aside, "
            "or attribute-based like [id*='price'], [class*='sidebar'] — NEVER positional selectors "
            "like div:nth-child(3) or //*[2]"
        ),
        "- Default to needs_prefilter=false — most intents don't need one",
        "- selector must be null when needs_prefilter=false",
    ))

    return task + reply_format + rules
|