mirror of
https://github.com/dgtlmoon/changedetection.io.git
synced 2026-06-06 08:51:20 +00:00
Compare commits
2 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| 39fa7f9692 | |||
| 1e643b2244 |
@@ -267,9 +267,15 @@ def construct_blueprint(datastore: ChangeDetectionStore):
|
||||
from changedetectionio.llm.evaluator import (
|
||||
summarise_change, get_effective_summary_prompt, build_summary_cache_prompt,
|
||||
is_global_token_budget_exceeded, get_global_token_budget_month,
|
||||
LLMInputTooLargeError,
|
||||
LLMInputTooLargeError, compute_llm_enrichment,
|
||||
)
|
||||
|
||||
# Structured-metadata enrichment from the raw HTML of the "to" version (only the
|
||||
# 2 newest fetched-HTML snapshots are retained; older pairs simply get no enrichment).
|
||||
# Must be computed the same way as the worker pre-cache so the cache key matches.
|
||||
_llm_raw_html = watch.get_fetched_html(to_version) or ''
|
||||
_llm_metadata = compute_llm_enrichment(watch, datastore, _llm_raw_html, diff_text)
|
||||
|
||||
# Diff-pref flags + system prompt + active model are part of the cache key
|
||||
# so prompt or model changes bust the cache.
|
||||
from changedetectionio.llm.evaluator import get_llm_settings
|
||||
@@ -281,6 +287,7 @@ def construct_blueprint(datastore: ChangeDetectionStore):
|
||||
max_summary_tokens=_max_summary_tokens,
|
||||
prefs=prefs,
|
||||
model=_llm_model,
|
||||
metadata=_llm_metadata,
|
||||
)
|
||||
|
||||
# Check cache — keyed by version pair + prompt hash (invalidates if prompt changes)
|
||||
@@ -306,7 +313,7 @@ def construct_blueprint(datastore: ChangeDetectionStore):
|
||||
}), 429
|
||||
|
||||
try:
|
||||
summary = summarise_change(watch, datastore, diff=diff_text, current_snapshot=to_text)
|
||||
summary = summarise_change(watch, datastore, diff=diff_text, current_snapshot=to_text, metadata=_llm_metadata)
|
||||
except LLMInputTooLargeError as e:
|
||||
return jsonify({'summary': None, 'error': str(e)}), 400
|
||||
except Exception as e:
|
||||
|
||||
@@ -68,6 +68,43 @@ def _get_max_input_chars(datastore) -> int:
|
||||
return _DEFAULT_MAX_INPUT_CHARS
|
||||
|
||||
|
||||
def compute_llm_enrichment(watch, datastore, raw_html: str, base_text: str) -> str:
    """
    Collect the verbatim structured-metadata enrichment to append to an LLM prompt.

    The enrichment text comes from the ``llm_context_enrich`` plugin hook (via
    ``collect_llm_context_enrichment``). An empty string is returned when there
    is nothing usable: no raw HTML, the collection raised, or no plugin
    contributed anything.

    Sizing is governed by the single configurable budget returned by
    ``_get_max_input_chars(datastore)`` — there is no hardcoded cap here. If
    ``len(base_text) + len(metadata)`` would exceed that budget the enrichment
    is DROPPED (the diff/content alone still goes through), so the enrichment
    can never push a call over the budget on its own.

    Args:
        watch: The watch being processed; only ``watch.get('uuid')`` is read
            here (for log messages), the object is otherwise passed through to
            the plugin hook.
        datastore: The application datastore, used to look up the budget and
            passed through to the plugin hook.
        raw_html: Raw HTML of the page version being described. Falsy values
            short-circuit to ``''``.
        base_text: The text the enrichment would be appended to (callers in
            view pass the diff text); ``None`` is treated as empty.

    Returns:
        The enrichment text, or ``''`` when it is unavailable or does not fit.
    """
    if not raw_html:
        return ''

    try:
        # Imported lazily, inside the guarded region, so a failing import is
        # treated like any other collection failure (best-effort feature).
        from changedetectionio.pluggy_interface import collect_llm_context_enrichment
        meta = collect_llm_context_enrichment(watch, raw_html, datastore)
    except Exception as e:
        logger.debug(f"{watch.get('uuid')} - LLM - enrichment collection failed: {e}")
        return ''

    if not meta:
        return ''

    max_chars = _get_max_input_chars(datastore)
    base_len = len(base_text or '')
    meta_len = len(meta)

    # len() on str counts characters, and the budget is expressed in characters,
    # so the log messages report "chars" rather than "bytes".
    if base_len + meta_len > max_chars:
        logger.debug(
            f"{watch.get('uuid')} - LLM - enrichment of {meta_len} chars of metadata "
            f"DROPPED: would exceed max_input_chars budget ({base_len} + "
            f"{meta_len} > {max_chars})"
        )
        return ''

    logger.debug(f"{watch.get('uuid')} - LLM - enriching query/prompt with {meta_len} chars of metadata")
    return meta
|
||||
|
||||
|
||||
class LLMInputTooLargeError(Exception):
    """Raised when text destined for the LLM is too large to be sent.

    NOTE(review): the raising site is not visible in this view — presumably the
    input-size check against the max_input_chars budget; confirm. Callers catch
    this separately from generic exceptions so they can report the message.
    """
|
||||
|
||||
@@ -541,7 +578,8 @@ class DiffPrefs:
|
||||
|
||||
|
||||
def build_summary_cache_prompt(effective_prompt: str, max_summary_tokens: int,
|
||||
prefs: DiffPrefs = None, model: str = '') -> str:
|
||||
prefs: DiffPrefs = None, model: str = '',
|
||||
metadata: str = '') -> str:
|
||||
"""
|
||||
Compose the full cache-key string passed to save/get_llm_diff_summary.
|
||||
|
||||
@@ -553,6 +591,10 @@ def build_summary_cache_prompt(effective_prompt: str, max_summary_tokens: int,
|
||||
The active model name is folded into the key so switching models
|
||||
(e.g. qwen3 → gpt-4o) invalidates stale summaries that were generated
|
||||
by a different model with potentially different phrasing/quality.
|
||||
|
||||
`metadata` (the appended structured-data block) is folded in too: two checks can
|
||||
produce the same text diff but different current metadata, and a stale cached
|
||||
summary must not be served when the appended facts have changed.
|
||||
"""
|
||||
if prefs is None:
|
||||
prefs = DiffPrefs()
|
||||
@@ -562,10 +604,12 @@ def build_summary_cache_prompt(effective_prompt: str, max_summary_tokens: int,
|
||||
+ f'\x00sys:{build_change_summary_system_prompt()}'
|
||||
+ f'\x00max_tokens:{max_summary_tokens}'
|
||||
+ f'\x00model:{model}'
|
||||
+ f'\x00meta:{metadata}'
|
||||
)
|
||||
|
||||
|
||||
def summarise_change(watch, datastore, diff: str, current_snapshot: str = '') -> str:
|
||||
def summarise_change(watch, datastore, diff: str, current_snapshot: str = '',
|
||||
metadata: str = '') -> str:
|
||||
"""
|
||||
Generate a plain-language summary of the change using the watch's
|
||||
llm_change_summary prompt (cascades from tag if not set on watch).
|
||||
@@ -603,6 +647,7 @@ def summarise_change(watch, datastore, diff: str, current_snapshot: str = '') ->
|
||||
current_snapshot=current_snapshot,
|
||||
url=url,
|
||||
title=title,
|
||||
metadata=metadata,
|
||||
)
|
||||
|
||||
settings = get_llm_settings(datastore)
|
||||
@@ -704,12 +749,14 @@ def preview_extract(watch, datastore, content: str) -> dict | None:
|
||||
# Per-change evaluation
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def evaluate_change(watch, datastore, diff: str, current_snapshot: str = '') -> dict | None:
|
||||
def evaluate_change(watch, datastore, diff: str, current_snapshot: str = '',
|
||||
metadata: str = '') -> dict | None:
|
||||
"""
|
||||
Evaluate whether `diff` matches the watch's intent.
|
||||
Returns {'important': bool, 'summary': str} or None if LLM not configured / no intent.
|
||||
|
||||
Results are cached by (intent, diff) hash — each unique diff is evaluated exactly once.
|
||||
Results are cached by (intent, diff, metadata) hash — each unique diff+metadata is
|
||||
evaluated exactly once. `metadata` is the appended verbatim structured-data block.
|
||||
"""
|
||||
cfg = _runtime_llm_config(datastore)
|
||||
if not cfg:
|
||||
@@ -725,7 +772,7 @@ def evaluate_change(watch, datastore, diff: str, current_snapshot: str = '') ->
|
||||
_check_input_size(diff, _get_max_input_chars(datastore))
|
||||
|
||||
# Cache lookup — evaluations are deterministic once cached
|
||||
cache_key = hashlib.sha256(f"{intent}||{diff}".encode()).hexdigest()
|
||||
cache_key = hashlib.sha256(f"{intent}||{diff}||{metadata}".encode()).hexdigest()
|
||||
cache = watch.get('llm_evaluation_cache') or {}
|
||||
if cache_key in cache:
|
||||
logger.debug(f"LLM cache hit for {watch.get('uuid')} key={cache_key[:8]}")
|
||||
@@ -758,6 +805,7 @@ def evaluate_change(watch, datastore, diff: str, current_snapshot: str = '') ->
|
||||
current_snapshot=current_snapshot,
|
||||
url=url,
|
||||
title=title,
|
||||
metadata=metadata,
|
||||
)
|
||||
|
||||
settings = get_llm_settings(datastore)
|
||||
|
||||
@@ -41,10 +41,13 @@ def _annotate_moved_lines(diff_text: str) -> str:
|
||||
|
||||
|
||||
def build_eval_prompt(intent: str, diff: str, current_snapshot: str = '',
|
||||
url: str = '', title: str = '') -> str:
|
||||
url: str = '', title: str = '', metadata: str = '') -> str:
|
||||
"""
|
||||
Build the user message for a diff evaluation call.
|
||||
The system prompt is kept separate (see build_eval_system_prompt).
|
||||
|
||||
`metadata` is verbatim current-state structured data (JSON-LD/OpenGraph) appended
|
||||
last so the model can compare the diff against canonical current values.
|
||||
"""
|
||||
parts = []
|
||||
|
||||
@@ -62,6 +65,9 @@ def build_eval_prompt(intent: str, diff: str, current_snapshot: str = '',
|
||||
|
||||
parts.append(f"\nWhat changed (diff):\n{diff}")
|
||||
|
||||
if metadata:
|
||||
parts.append(f"\n{metadata}")
|
||||
|
||||
return '\n'.join(parts)
|
||||
|
||||
|
||||
@@ -132,7 +138,8 @@ def build_preview_system_prompt() -> str:
|
||||
|
||||
|
||||
def build_change_summary_prompt(diff: str, custom_prompt: str,
|
||||
current_snapshot: str = '', url: str = '', title: str = '') -> str:
|
||||
current_snapshot: str = '', url: str = '', title: str = '',
|
||||
metadata: str = '') -> str:
|
||||
"""
|
||||
Build the user message for an AI Change Summary call.
|
||||
The user supplies their own instructions (custom_prompt); this wraps them
|
||||
@@ -152,6 +159,8 @@ def build_change_summary_prompt(diff: str, custom_prompt: str,
|
||||
parts.append(f"Page title: {title}")
|
||||
parts.append(f"Instructions: {custom_prompt}")
|
||||
parts.append(f"\nWhat changed (diff):\n{_annotate_moved_lines(diff)}")
|
||||
if metadata:
|
||||
parts.append(f"\n{metadata}")
|
||||
return '\n'.join(parts)
|
||||
|
||||
|
||||
|
||||
@@ -175,6 +175,30 @@ class ChangeDetectionSpec:
|
||||
"""
|
||||
pass
|
||||
|
||||
@hookspec
def llm_context_enrich(watch, html_content, datastore):
    """Supply extra current-state context for the LLM intent/summary prompts.

    Invoked for watches that have an LLM intent or change-summary configured,
    when raw HTML is available. It lets a plugin surface structured facts that
    the html-to-text snapshot has dropped — for example JSON-LD / OpenGraph
    product metadata — so that intents such as "alert when the SKU changes" or
    "list the product IDs" can be answered.

    Whatever text is returned gets appended verbatim to the prompt. Fitting it
    into the configurable max_input_chars budget is the caller's job (the
    enrichment is dropped when it would not fit), so implementations should
    NOT impose size limits of their own.

    Args:
        watch: The watch dict being evaluated.
        html_content: The raw HTML of the current page (may be '' / None).
        datastore: The application datastore.

    Returns:
        str or None: Context text to append, or None if nothing to add.
    """
|
||||
|
||||
@hookspec
|
||||
def get_html_head_extras():
|
||||
"""Return HTML to inject into the <head> of every page via base.html.
|
||||
@@ -323,14 +347,17 @@ def register_builtin_restock_plugins():
|
||||
(restock_diff/__init__.py → model.Watch → content_fetchers → pluggy_interface).
|
||||
"""
|
||||
import importlib
|
||||
module_path = 'changedetectionio.processors.restock_diff.plugins.llm_restock'
|
||||
try:
|
||||
module = importlib.import_module(module_path)
|
||||
if not plugin_manager.is_registered(module):
|
||||
plugin_manager.register(module, 'llm_restock')
|
||||
logger.debug("Registered built-in restock plugin: llm_restock")
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to register llm_restock plugin: {e}")
|
||||
for module_path, plugin_name in (
|
||||
('changedetectionio.processors.restock_diff.plugins.llm_restock', 'llm_restock'),
|
||||
('changedetectionio.processors.restock_diff.plugins.llm_metadata_enrich', 'llm_metadata_enrich'),
|
||||
):
|
||||
try:
|
||||
module = importlib.import_module(module_path)
|
||||
if not plugin_manager.is_registered(module):
|
||||
plugin_manager.register(module, plugin_name)
|
||||
logger.debug(f"Registered built-in restock plugin: {plugin_name}")
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to register {plugin_name} plugin: {e}")
|
||||
|
||||
# Helper function to collect UI stats extras from all plugins
|
||||
def collect_ui_edit_stats_extras(watch):
|
||||
@@ -403,6 +430,27 @@ def get_itemprop_availability_from_plugin(content, fetcher_name, fetcher_instanc
|
||||
return None
|
||||
|
||||
|
||||
def collect_llm_context_enrichment(watch, html_content, datastore):
    """Gather the LLM context enrichment contributed by every plugin.

    Calls the ``llm_context_enrich`` hook and joins the stripped, non-empty
    string results with a blank line between them. Non-string and empty
    results are ignored. Returns '' when the hook call raises or when no
    plugin contributes anything.

    No size limit is applied at this level — the caller enforces the single
    configurable max_input_chars budget.
    """
    try:
        hook_results = plugin_manager.hook.llm_context_enrich(
            watch=watch,
            html_content=html_content,
            datastore=datastore,
        )
    except Exception as e:
        logger.debug(f"llm_context_enrich hook failed: {e}")
        return ''

    chunks = []
    for item in hook_results:
        # Skip falsy results (None, '') and anything that is not text.
        if not item or not isinstance(item, str):
            continue
        text = item.strip()
        if text:
            chunks.append(text)

    # Joining an empty list yields '', which is the "nothing contributed" result.
    return '\n\n'.join(chunks)
|
||||
|
||||
|
||||
def get_active_plugins():
|
||||
"""Get a list of active plugins with their descriptions.
|
||||
|
||||
|
||||
@@ -0,0 +1,32 @@
|
||||
"""
|
||||
LLM context enrichment plugin — structured product/page metadata.
|
||||
|
||||
Surfaces the page's structured metadata (JSON-LD + OpenGraph site/type) verbatim
|
||||
so it can be appended to the LLM intent/summary prompts. This lets user intents
|
||||
and summary prompts reference facts the html-to-text snapshot has stripped out —
|
||||
prices, SKUs/GTINs, availability, ratings, article dates, page kind, etc.
|
||||
|
||||
Extraction reuses the memory-safe pure_python_extractor (stdlib html.parser, no
|
||||
lxml/libxml2), so it is safe to run on every changed watch without the C-level
|
||||
memory leak that extruct/lxml carries. It performs NO LLM call of its own and
|
||||
imposes no size limit — the evaluator enforces the single configurable
|
||||
max_input_chars budget and drops the enrichment if it would not fit.
|
||||
"""
|
||||
from loguru import logger
|
||||
from changedetectionio.pluggy_interface import hookimpl
|
||||
|
||||
|
||||
@hookimpl
def llm_context_enrich(watch, html_content, datastore):
    """Hook implementation: verbatim structured metadata for the page, or None.

    Delegates to ``extract_metadata_for_llm``. Any failure while importing or
    running the extractor is logged at debug level and reported as "nothing to
    add" (None). ``watch`` and ``datastore`` are accepted to satisfy the hook
    signature but are not used here.
    """
    if html_content:
        try:
            from changedetectionio.processors.restock_diff.pure_python_extractor import extract_metadata_for_llm
            extracted = extract_metadata_for_llm(html_content)
        except Exception as e:
            logger.debug(f"llm_metadata_enrich: extraction failed: {e}")
        else:
            if extracted:
                return extracted

    # No HTML, extraction failed, or the extractor found nothing.
    return None
|
||||
@@ -287,3 +287,84 @@ def query_price_availability(extracted_data):
|
||||
# using something like babel you need to know the locale of the website and even then it can be problematic
|
||||
# we dont really do anything with the price data so far.. so just accept it the way it comes.
|
||||
return result
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Structured metadata for the LLM enricher — passed through verbatim
|
||||
# =============================================================================
|
||||
#
|
||||
# This surfaces the page's structured metadata (JSON-LD + OpenGraph site/type)
|
||||
# as-is for the LLM intent/summary prompts. We deliberately do NOT curate, field-
|
||||
# cherry-pick, or impose a size limit here:
|
||||
#
|
||||
# * LLMs are trained on schema.org JSON-LD and read it natively, so handing it
|
||||
# over verbatim lets ANY user intent ("list the SKUs", "did the release date
|
||||
# change?", "is it a recipe or a product?") work without us pre-guessing which
|
||||
# fields matter — and it covers non-product pages (NewsArticle, Event, JobPosting…)
|
||||
# for free.
|
||||
# * There is exactly one configurable budget for how much text reaches the LLM —
|
||||
# max_input_chars (env LLM_MAX_INPUT_CHARS → settings → default), enforced by the
|
||||
# evaluator. A second hardcoded cap here would be a competing, non-configurable
|
||||
# source of truth. The caller decides how much fits.
|
||||
#
|
||||
# Extraction reuses the memory-safe extract_metadata_pure_python() (stdlib
|
||||
# html.parser, no lxml/libxml2) so it is safe to call on every changed watch
|
||||
# without the C-level leak extruct/lxml carries, and it is robust to dangling/
|
||||
# unclosed <script type="application/ld+json"> blocks (HTMLParser only emits a
|
||||
# block on a real closing tag, so an unterminated blob is dropped rather than
|
||||
# swallowing the rest of the document the way a greedy regex would).
|
||||
# =============================================================================
|
||||
|
||||
|
||||
def extract_metadata_for_llm(html_content) -> str:
    """
    Build the structured-metadata text block handed to the LLM, or '' if none.

    Layout of the result (each part is left out when it has no content):

        Page context: site: ExampleShop | og:type: product
        Structured metadata found on the page (JSON-LD):
        {"@type":"Product","name":"Acme Widget","sku":"12345", ...}
        {"@type":"BreadcrumbList", ...}

    Every JSON-LD node is re-dumped compactly on its own line; this only
    normalises whitespace, the structure itself is untouched. Nothing is
    truncated and no fields are selected — how much fits is decided by the
    caller's configurable budget.

    Returns '' for falsy input and when the underlying extraction raises.
    """
    if not html_content:
        return ''

    try:
        extracted = extract_metadata_pure_python(html_content)
    except Exception as e:
        logger.debug(f"Metadata for LLM: extraction failed: {e}")
        return ''

    sections = []

    # OpenGraph site/type gives page-kind context that JSON-LD does not carry,
    # letting the model distinguish e.g. a shop listing from a news feed.
    opengraph = extracted.get('opengraph', {})
    context_bits = [
        f"{label}: {opengraph[key]}"
        for label, key in (('site', 'og:site_name'), ('og:type', 'og:type'))
        if opengraph.get(key)
    ]
    if context_bits:
        sections.append('Page context: ' + ' | '.join(context_bits))

    # JSON-LD passed through as-is (compact re-dump only, no curation).
    jsonld_nodes = extracted.get('json-ld', [])
    if jsonld_nodes:
        try:
            dumped = [
                json.dumps(node, ensure_ascii=False, separators=(',', ':'))
                for node in jsonld_nodes
            ]
        except (TypeError, ValueError) as e:
            logger.debug(f"Metadata for LLM: JSON-LD re-serialise failed: {e}")
            dumped = []
        serialised = '\n'.join(dumped)
        if serialised:
            sections.append('Structured metadata found on the page (JSON-LD):\n' + serialised)

    return '\n'.join(sections)
|
||||
|
||||
@@ -210,7 +210,8 @@ class TestEvaluateChange:
|
||||
|
||||
diff = '- $500\n+ $400'
|
||||
intent = 'flag price drops'
|
||||
cache_key = hashlib.sha256(f"{intent}||{diff}".encode()).hexdigest()
|
||||
metadata = '' # no enrichment in this test; folded into the key as a trailing ||
|
||||
cache_key = hashlib.sha256(f"{intent}||{diff}||{metadata}".encode()).hexdigest()
|
||||
watch['llm_evaluation_cache'] = {
|
||||
cache_key: {'important': True, 'summary': 'cached result'}
|
||||
}
|
||||
|
||||
@@ -7,6 +7,7 @@ import pytest
|
||||
from changedetectionio.llm.prompt_builder import (
|
||||
build_eval_prompt,
|
||||
build_eval_system_prompt,
|
||||
build_change_summary_prompt,
|
||||
build_setup_prompt,
|
||||
build_setup_system_prompt,
|
||||
SNAPSHOT_CONTEXT_CHARS,
|
||||
@@ -71,6 +72,49 @@ class TestBuildEvalPrompt:
|
||||
assert len(prompt_without) < len(prompt_with)
|
||||
|
||||
|
||||
class TestMetadataEnrichmentInPrompts:
    """The verbatim structured-metadata block must land in the eval and summary
    user prompts when provided, and leave them unchanged when absent."""

    # Representative enrichment block: OpenGraph page context followed by one
    # compact JSON-LD node.
    METADATA = (
        "Page context: site: ExampleShop | og:type: product\n"
        "Structured metadata found on the page (JSON-LD):\n"
        '{"@type":"Product","name":"Acme Widget","sku":"12345","color":"blue"}'
    )

    def test_eval_prompt_includes_metadata(self):
        prompt = build_eval_prompt(intent='alert on SKU change', diff='- a\n+ b',
                                   metadata=self.METADATA)
        assert self.METADATA in prompt
        # A field we never whitelisted must survive verbatim
        assert '"sku":"12345"' in prompt
        assert '"color":"blue"' in prompt
        # The block is appended AFTER the diff
        assert prompt.index('What changed (diff):') < prompt.index('Structured metadata found')

    def test_eval_prompt_unchanged_without_metadata(self):
        with_meta = build_eval_prompt(intent='i', diff='d', metadata=self.METADATA)
        without = build_eval_prompt(intent='i', diff='d')
        assert 'Structured metadata found' not in without
        assert len(without) < len(with_meta)

    def test_summary_prompt_includes_metadata(self):
        prompt = build_change_summary_prompt(diff='- a\n+ b', custom_prompt='list the SKUs',
                                             metadata=self.METADATA)
        assert self.METADATA in prompt
        assert '"sku":"12345"' in prompt
        # Same ordering contract as the eval prompt: metadata follows the diff
        assert prompt.index('What changed (diff):') < prompt.index('Structured metadata found')

    def test_summary_prompt_unchanged_without_metadata(self):
        without = build_change_summary_prompt(diff='- a\n+ b', custom_prompt='x')
        assert 'Structured metadata found' not in without

    def test_empty_metadata_appends_nothing(self):
        # Falsy metadata ('') must not add a trailing block/whitespace section
        assert build_eval_prompt(intent='i', diff='d', metadata='') == build_eval_prompt(intent='i', diff='d')
        assert (build_change_summary_prompt(diff='d', custom_prompt='c', metadata='')
                == build_change_summary_prompt(diff='d', custom_prompt='c'))
|
||||
|
||||
|
||||
class TestBuildEvalSystemPrompt:
|
||||
def test_returns_string(self):
|
||||
result = build_eval_system_prompt()
|
||||
|
||||
@@ -0,0 +1,100 @@
|
||||
#!/usr/bin/env python3
|
||||
# coding=utf-8
|
||||
|
||||
"""Unit tests for the memory-safe, verbatim structured-metadata block used by the LLM enricher.
|
||||
|
||||
Run: python -m unittest changedetectionio.tests.unit.test_product_metadata_summary
|
||||
"""
|
||||
|
||||
import json
|
||||
import unittest
|
||||
|
||||
from changedetectionio.processors.restock_diff.pure_python_extractor import (
|
||||
extract_metadata_for_llm,
|
||||
)
|
||||
|
||||
|
||||
def _page(*scripts, head_extra=''):
|
||||
body = '\n'.join(scripts)
|
||||
return f'<html><head>{head_extra}</head><body>{body}</body></html>'
|
||||
|
||||
|
||||
class TestExtractMetadataForLLM(unittest.TestCase):
    """Behaviour of extract_metadata_for_llm: verbatim pass-through of JSON-LD,
    OpenGraph page context, and safe handling of broken or missing metadata."""

    def test_jsonld_passed_through_verbatim(self):
        html = _page('''
        <script type="application/ld+json">
        {"@context":"https://schema.org","@type":"Product","name":"Acme Widget",
        "sku":"12345","color":"blue","releaseDate":"2026-01-02",
        "offers":{"@type":"Offer","price":"249.00","priceCurrency":"USD","availability":"https://schema.org/InStock"}}
        </script>''')
        out = extract_metadata_for_llm(html)
        # Verbatim: fields we never "whitelisted" must still be present
        self.assertIn('JSON-LD', out)
        self.assertIn('"name":"Acme Widget"', out)
        self.assertIn('"sku":"12345"', out)
        self.assertIn('"color":"blue"', out)
        self.assertIn('"releaseDate":"2026-01-02"', out)
        self.assertIn('"availability":"https://schema.org/InStock"', out)

    def test_no_size_or_count_limit_is_imposed(self):
        # 50 products → all 50 must appear; sizing is the caller's budget, not ours.
        prods = [f'{{"@type":"Product","name":"P{i}","sku":"S{i}"}}' for i in range(50)]
        html = _page(f'<script type="application/ld+json">[{",".join(prods)}]</script>')
        out = extract_metadata_for_llm(html)
        # Check every product, not just the first and last, so a cap anywhere
        # in the middle of the list would also be caught.
        for i in range(50):
            with self.subTest(product=i):
                self.assertIn(f'"name":"P{i}"', out)
        self.assertNotIn('more products', out)  # no truncation marker

    def test_non_product_types_included(self):
        # News / events / etc. are passed through too — not product-only.
        html = _page('''<script type="application/ld+json">
        {"@type":"NewsArticle","headline":"Big news","datePublished":"2026-05-30"}
        </script>''')
        out = extract_metadata_for_llm(html)
        self.assertIn('"@type":"NewsArticle"', out)
        self.assertIn('"headline":"Big news"', out)

    def test_compact_reserialisation_is_valid_json(self):
        html = _page('''<script type="application/ld+json">
        {  "@type" : "Product" ,   "name" : "Spaced Out"  }
        </script>''')
        out = extract_metadata_for_llm(html)
        blob_line = out.splitlines()[-1]
        # The re-dumped line must round-trip as valid JSON
        self.assertEqual(json.loads(blob_line)['name'], 'Spaced Out')
        # ...and the source's padded key/value separators must be gone
        self.assertNotIn('" : "', blob_line)

    def test_opengraph_page_context(self):
        html = _page(
            '<script type="application/ld+json">{"@type":"ItemList"}</script>',
            head_extra='''
            <meta property="og:site_name" content="ExampleShop">
            <meta property="og:type" content="product.group">
            ''',
        )
        out = extract_metadata_for_llm(html)
        self.assertIn('Page context: site: ExampleShop', out)
        self.assertIn('og:type: product.group', out)
        self.assertIn('"@type":"ItemList"', out)

    def test_dangling_unclosed_jsonld_is_safe(self):
        # An unterminated ld+json block must NOT swallow the document nor crash.
        html = (
            '<html><body>'
            '<script type="application/ld+json">{"@type":"Product","name":"Broken","sku":"X"'
            '<div>rest of page</div>'
            '</body></html>'
        )
        self.assertEqual(extract_metadata_for_llm(html), '')

    def test_invalid_json_skipped(self):
        html = _page('<script type="application/ld+json">{not valid json,,}</script>')
        self.assertEqual(extract_metadata_for_llm(html), '')

    def test_no_metadata_returns_empty(self):
        self.assertEqual(extract_metadata_for_llm('<html><body><p>hi</p></body></html>'), '')
        self.assertEqual(extract_metadata_for_llm(''), '')
|
||||
self.assertEqual(extract_metadata_for_llm(''), '')
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
unittest.main()
|
||||
@@ -450,7 +450,7 @@ async def async_update_worker(worker_id, q, notification_q, app, datastore, exec
|
||||
try:
|
||||
from changedetectionio.llm.evaluator import (
|
||||
evaluate_change, resolve_intent, resolve_llm_field,
|
||||
summarise_change, _runtime_llm_config,
|
||||
summarise_change, _runtime_llm_config, compute_llm_enrichment,
|
||||
)
|
||||
# _runtime_llm_config returns None (and logs a debug skip
|
||||
# message) when the master 'llm_enabled' toggle is off, so
|
||||
@@ -476,14 +476,20 @@ async def async_update_worker(worker_id, q, notification_q, app, datastore, exec
|
||||
else:
|
||||
_diff_text = contents
|
||||
|
||||
# Structured-metadata enrichment (verbatim JSON-LD/OpenGraph) from the
|
||||
# raw HTML, appended to both the intent and summary prompts. Computed once
|
||||
# and dropped automatically if it won't fit the max_input_chars budget.
|
||||
_llm_raw_html = getattr(getattr(update_handler, 'fetcher', None), 'content', '') or ''
|
||||
_llm_metadata = compute_llm_enrichment(watch, datastore, _llm_raw_html, _diff_text)
|
||||
|
||||
# Step 1: AI Change Intent — may suppress notification
|
||||
_llm_intent, _llm_intent_source = resolve_intent(watch, datastore)
|
||||
if _llm_intent:
|
||||
set_watch_minitext_status(watch, "AI/LLM (rules)..")
|
||||
_llm_result = await loop.run_in_executor(
|
||||
executor,
|
||||
lambda diff=_diff_text, snap=contents: evaluate_change(
|
||||
watch, datastore, diff=diff, current_snapshot=snap
|
||||
lambda diff=_diff_text, snap=contents, meta=_llm_metadata: evaluate_change(
|
||||
watch, datastore, diff=diff, current_snapshot=snap, metadata=meta
|
||||
)
|
||||
)
|
||||
update_obj['_llm_result'] = _llm_result
|
||||
@@ -502,8 +508,8 @@ async def async_update_worker(worker_id, q, notification_q, app, datastore, exec
|
||||
set_watch_minitext_status(watch, "AI/LLM (summary)..")
|
||||
_change_summary = await loop.run_in_executor(
|
||||
executor,
|
||||
lambda diff=_diff_text, snap=contents: summarise_change(
|
||||
watch, datastore, diff=diff, current_snapshot=snap
|
||||
lambda diff=_diff_text, snap=contents, meta=_llm_metadata: summarise_change(
|
||||
watch, datastore, diff=diff, current_snapshot=snap, metadata=meta
|
||||
)
|
||||
)
|
||||
if _change_summary:
|
||||
@@ -557,6 +563,7 @@ async def async_update_worker(worker_id, q, notification_q, app, datastore, exec
|
||||
effective_prompt=get_effective_summary_prompt(watch, datastore),
|
||||
max_summary_tokens=_llm_max_summary_tokens,
|
||||
model=_llm_model,
|
||||
metadata=_llm_metadata,
|
||||
)
|
||||
watch.save_llm_diff_summary(
|
||||
update_obj['_llm_change_summary'],
|
||||
|
||||
Reference in New Issue
Block a user