WIP

LLM - Enrich summary and intent/rules with the actual product metadata in the HTML document, if it exists, for greater precision.
2026-06-06 08:51:20 +00:00 · 2026-05-30 13:35:49 +02:00 · 2026-05-30 13:26:01 +02:00
10 changed files with 400 additions and 23 deletions
@@ -267,9 +267,15 @@ def construct_blueprint(datastore: ChangeDetectionStore):
        from changedetectionio.llm.evaluator import (
            summarise_change, get_effective_summary_prompt, build_summary_cache_prompt,
            is_global_token_budget_exceeded, get_global_token_budget_month,
-            LLMInputTooLargeError,
+            LLMInputTooLargeError, compute_llm_enrichment,
        )

+        # Structured-metadata enrichment from the raw HTML of the "to" version (only the
+        # 2 newest fetched-HTML snapshots are retained; older pairs simply get no enrichment).
+        # Must be computed the same way as the worker pre-cache so the cache key matches.
+        _llm_raw_html = watch.get_fetched_html(to_version) or ''
+        _llm_metadata = compute_llm_enrichment(watch, datastore, _llm_raw_html, diff_text)
+
        # Diff-pref flags + system prompt + active model are part of the cache key
        # so prompt or model changes bust the cache.
        from changedetectionio.llm.evaluator import get_llm_settings
@@ -281,6 +287,7 @@ def construct_blueprint(datastore: ChangeDetectionStore):
            max_summary_tokens=_max_summary_tokens,
            prefs=prefs,
            model=_llm_model,
+            metadata=_llm_metadata,
        )

        # Check cache — keyed by version pair + prompt hash (invalidates if prompt changes)
@@ -306,7 +313,7 @@ def construct_blueprint(datastore: ChangeDetectionStore):
            }), 429

        try:
-            summary = summarise_change(watch, datastore, diff=diff_text, current_snapshot=to_text)
+            summary = summarise_change(watch, datastore, diff=diff_text, current_snapshot=to_text, metadata=_llm_metadata)
        except LLMInputTooLargeError as e:
            return jsonify({'summary': None, 'error': str(e)}), 400
        except Exception as e:
@@ -68,6 +68,43 @@ def _get_max_input_chars(datastore) -> int:
    return _DEFAULT_MAX_INPUT_CHARS


+def compute_llm_enrichment(watch, datastore, raw_html: str, base_text: str) -> str:
+    """
+    Collect verbatim structured-metadata enrichment (via the llm_context_enrich
+    plugin hook) to append to an LLM prompt, or '' when there's nothing usable.
+
+    Sizing is governed by the single configurable budget, max_input_chars — there is
+    no hardcoded cap. If the enrichment would push base_text + metadata over that
+    budget it is DROPPED (the diff/content alone still goes through), so adding the
+    feature can never turn a previously-working call into an over-size failure.
+
+    The result depends only on (raw_html, base_text, budget) and on what the plugins
+    return for that HTML, so callers that also fold it into a cache key (the summary
+    cache) stay consistent as long as they pass the same inputs.
+
+    Args:
+        watch: The watch being processed; only watch.get('uuid') is read here (for
+            log messages) before it is handed to the plugins.
+        datastore: Application datastore, used to resolve max_input_chars and passed
+            through to the plugins.
+        raw_html: Raw HTML of the page version being described; '' / None disables
+            enrichment.
+        base_text: The text the enrichment will be appended to (the diff); its length
+            counts against the budget. None is treated as ''.
+
+    Returns:
+        The combined plugin text, or '' when there is no HTML, no plugin output, the
+        collection failed, or the result would not fit the budget.
+    """
+    if not raw_html:
+        return ''
+    try:
+        # Imported lazily, as in the original change, rather than at module import time.
+        from changedetectionio.pluggy_interface import collect_llm_context_enrichment
+        meta = collect_llm_context_enrichment(watch, raw_html, datastore)
+    except Exception as e:
+        # Enrichment is best-effort: a failing plugin must never block the LLM call.
+        logger.debug(f"{watch.get('uuid')} - LLM - enrichment collection failed: {e}")
+        return ''
+    if not meta:
+        return ''
+
+    # len() of a str is a CHARACTER count — the same unit as max_input_chars — so the
+    # log messages report "chars", not "bytes".
+    base_len = len(base_text or '')
+    meta_len = len(meta)
+    max_chars = _get_max_input_chars(datastore)
+    if base_len + meta_len > max_chars:
+        logger.debug(
+            f"{watch.get('uuid')} - LLM - enrichment of {meta_len} chars of metadata "
+            f"DROPPED: would exceed max_input_chars budget ({base_len} + "
+            f"{meta_len} > {max_chars})"
+        )
+        return ''
+
+    logger.debug(f"{watch.get('uuid')} - LLM - enrichening query/prompt with {meta_len} chars of metadata")
+    return meta
+
+
 class LLMInputTooLargeError(Exception):
    pass

@@ -541,7 +578,8 @@ class DiffPrefs:


 def build_summary_cache_prompt(effective_prompt: str, max_summary_tokens: int,
-                                prefs: DiffPrefs = None, model: str = '') -> str:
+                                prefs: DiffPrefs = None, model: str = '',
+                                metadata: str = '') -> str:
    """
    Compose the full cache-key string passed to save/get_llm_diff_summary.

@@ -553,6 +591,10 @@ def build_summary_cache_prompt(effective_prompt: str, max_summary_tokens: int,
    The active model name is folded into the key so switching models
    (e.g. qwen3 → gpt-4o) invalidates stale summaries that were generated
    by a different model with potentially different phrasing/quality.
+
+    `metadata` (the appended structured-data block) is folded in too: two checks can
+    produce the same text diff but different current metadata, and a stale cached
+    summary must not be served when the appended facts have changed.
    """
    if prefs is None:
        prefs = DiffPrefs()
@@ -562,10 +604,12 @@ def build_summary_cache_prompt(effective_prompt: str, max_summary_tokens: int,
        + f'\x00sys:{build_change_summary_system_prompt()}'
        + f'\x00max_tokens:{max_summary_tokens}'
        + f'\x00model:{model}'
+        + f'\x00meta:{metadata}'
    )


-def summarise_change(watch, datastore, diff: str, current_snapshot: str = '') -> str:
+def summarise_change(watch, datastore, diff: str, current_snapshot: str = '',
+                     metadata: str = '') -> str:
    """
    Generate a plain-language summary of the change using the watch's
    llm_change_summary prompt (cascades from tag if not set on watch).
@@ -603,6 +647,7 @@ def summarise_change(watch, datastore, diff: str, current_snapshot: str = '') ->
        current_snapshot=current_snapshot,
        url=url,
        title=title,
+        metadata=metadata,
    )

    settings = get_llm_settings(datastore)
@@ -704,12 +749,14 @@ def preview_extract(watch, datastore, content: str) -> dict | None:
 # Per-change evaluation
 # ---------------------------------------------------------------------------

-def evaluate_change(watch, datastore, diff: str, current_snapshot: str = '') -> dict | None:
+def evaluate_change(watch, datastore, diff: str, current_snapshot: str = '',
+                    metadata: str = '') -> dict | None:
    """
    Evaluate whether `diff` matches the watch's intent.
    Returns {'important': bool, 'summary': str} or None if LLM not configured / no intent.

-    Results are cached by (intent, diff) hash — each unique diff is evaluated exactly once.
+    Results are cached by (intent, diff, metadata) hash — each unique diff+metadata is
+    evaluated exactly once. `metadata` is the appended verbatim structured-data block.
    """
    cfg = _runtime_llm_config(datastore)
    if not cfg:
@@ -725,7 +772,7 @@ def evaluate_change(watch, datastore, diff: str, current_snapshot: str = '') ->
    _check_input_size(diff, _get_max_input_chars(datastore))

    # Cache lookup — evaluations are deterministic once cached
-    cache_key = hashlib.sha256(f"{intent}||{diff}".encode()).hexdigest()
+    cache_key = hashlib.sha256(f"{intent}||{diff}||{metadata}".encode()).hexdigest()
    cache = watch.get('llm_evaluation_cache') or {}
    if cache_key in cache:
        logger.debug(f"LLM cache hit for {watch.get('uuid')} key={cache_key[:8]}")
@@ -758,6 +805,7 @@ def evaluate_change(watch, datastore, diff: str, current_snapshot: str = '') ->
        current_snapshot=current_snapshot,
        url=url,
        title=title,
+        metadata=metadata,
    )

    settings = get_llm_settings(datastore)
@@ -41,10 +41,13 @@ def _annotate_moved_lines(diff_text: str) -> str:


 def build_eval_prompt(intent: str, diff: str, current_snapshot: str = '',
-                      url: str = '', title: str = '') -> str:
+                      url: str = '', title: str = '', metadata: str = '') -> str:
    """
    Build the user message for a diff evaluation call.
    The system prompt is kept separate (see build_eval_system_prompt).
+
+    `metadata` is verbatim current-state structured data (JSON-LD/OpenGraph) appended
+    last so the model can compare the diff against canonical current values.
    """
    parts = []

@@ -62,6 +65,9 @@ def build_eval_prompt(intent: str, diff: str, current_snapshot: str = '',

    parts.append(f"\nWhat changed (diff):\n{diff}")

+    if metadata:
+        parts.append(f"\n{metadata}")
+
    return '\n'.join(parts)


@@ -132,7 +138,8 @@ def build_preview_system_prompt() -> str:


 def build_change_summary_prompt(diff: str, custom_prompt: str,
-                                current_snapshot: str = '', url: str = '', title: str = '') -> str:
+                                current_snapshot: str = '', url: str = '', title: str = '',
+                                metadata: str = '') -> str:
    """
    Build the user message for an AI Change Summary call.
    The user supplies their own instructions (custom_prompt); this wraps them
@@ -152,6 +159,8 @@ def build_change_summary_prompt(diff: str, custom_prompt: str,
        parts.append(f"Page title: {title}")
    parts.append(f"Instructions: {custom_prompt}")
    parts.append(f"\nWhat changed (diff):\n{_annotate_moved_lines(diff)}")
+    if metadata:
+        parts.append(f"\n{metadata}")
    return '\n'.join(parts)


@@ -175,6 +175,30 @@ class ChangeDetectionSpec:
        """
        pass

+    @hookspec
+    def llm_context_enrich(watch, html_content, datastore):
+        """Return extra current-state context to append to LLM intent/summary prompts.
+
+        Called for any watch with an LLM intent or change-summary when raw HTML is
+        available. Plugins can surface structured facts the html-to-text snapshot has
+        dropped — e.g. JSON-LD / OpenGraph product metadata — so the model can answer
+        intents like "alert when the SKU changes" or "list the product IDs".
+
+        The returned text is appended verbatim to the prompt; the caller is responsible
+        for fitting it within the configurable max_input_chars budget (it drops the
+        enrichment if it would not fit), so implementations should NOT impose their own
+        size limits.
+
+        Results from all implementations are combined by
+        collect_llm_context_enrichment(): each string is stripped of surrounding
+        whitespace, empty / None / non-string results are ignored, and the remainder
+        are joined with a blank line between them.
+
+        Args:
+            watch: The watch dict being evaluated.
+            html_content: The raw HTML of the current page (may be '' / None).
+            datastore: The application datastore.
+
+        Returns:
+            str or None: Context text to append, or None if nothing to add.
+        """
+        pass
+
    @hookspec
    def get_html_head_extras():
        """Return HTML to inject into the <head> of every page via base.html.
@@ -323,14 +347,17 @@ def register_builtin_restock_plugins():
    (restock_diff/__init__.py → model.Watch → content_fetchers → pluggy_interface).
    """
    import importlib
-    module_path = 'changedetectionio.processors.restock_diff.plugins.llm_restock'
-    try:
-        module = importlib.import_module(module_path)
-        if not plugin_manager.is_registered(module):
-            plugin_manager.register(module, 'llm_restock')
-            logger.debug("Registered built-in restock plugin: llm_restock")
-    except Exception as e:
-        logger.error(f"Failed to register llm_restock plugin: {e}")
+    for module_path, plugin_name in (
+        ('changedetectionio.processors.restock_diff.plugins.llm_restock', 'llm_restock'),
+        ('changedetectionio.processors.restock_diff.plugins.llm_metadata_enrich', 'llm_metadata_enrich'),
+    ):
+        try:
+            module = importlib.import_module(module_path)
+            if not plugin_manager.is_registered(module):
+                plugin_manager.register(module, plugin_name)
+                logger.debug(f"Registered built-in restock plugin: {plugin_name}")
+        except Exception as e:
+            logger.error(f"Failed to register {plugin_name} plugin: {e}")

 # Helper function to collect UI stats extras from all plugins
 def collect_ui_edit_stats_extras(watch):
@@ -403,6 +430,27 @@ def get_itemprop_availability_from_plugin(content, fetcher_name, fetcher_instanc
    return None


+def collect_llm_context_enrichment(watch, html_content, datastore):
+    """Run the llm_context_enrich hook on every plugin and merge the answers.
+
+    Each plugin result that is a non-blank string is stripped and kept; everything
+    else (None, non-strings, whitespace-only text) is ignored. The kept pieces are
+    joined with a blank line between them. Returns '' when nothing was contributed
+    or when the hook call itself raised. No size limit is applied here — the caller
+    enforces the single configurable max_input_chars budget.
+    """
+    hook_kwargs = {
+        'watch': watch,
+        'html_content': html_content,
+        'datastore': datastore,
+    }
+    try:
+        plugin_outputs = plugin_manager.hook.llm_context_enrich(**hook_kwargs)
+    except Exception as e:
+        logger.debug(f"llm_context_enrich hook failed: {e}")
+        return ''
+
+    chunks = []
+    for output in plugin_outputs:
+        if not isinstance(output, str):
+            continue
+        trimmed = output.strip()
+        if trimmed:
+            chunks.append(trimmed)
+
+    # Joining an empty list already yields '', so no separate empty case is needed.
+    return '\n\n'.join(chunks)
+
+
 def get_active_plugins():
    """Get a list of active plugins with their descriptions.

@@ -0,0 +1,32 @@
+"""
+LLM context enrichment plugin — structured product/page metadata.
+
+Surfaces the page's structured metadata (JSON-LD + OpenGraph site/type) verbatim
+so it can be appended to the LLM intent/summary prompts. This lets user intents
+and summary prompts reference facts the html-to-text snapshot has stripped out —
+prices, SKUs/GTINs, availability, ratings, article dates, page kind, etc.
+
+Extraction reuses the memory-safe pure_python_extractor (stdlib html.parser, no
+lxml/libxml2), so it is safe to run on every changed watch without the C-level
+memory leak that extruct/lxml carries. It performs NO LLM call of its own and
+imposes no size limit — the evaluator enforces the single configurable
+max_input_chars budget and drops the enrichment if it would not fit.
+"""
+from loguru import logger
+from changedetectionio.pluggy_interface import hookimpl
+
+
+@hookimpl
+def llm_context_enrich(watch, html_content, datastore):
+    """Hook implementation: hand back the page's structured-metadata block.
+
+    Returns the text produced by extract_metadata_for_llm() for `html_content`, or
+    None when there is no HTML, the extractor produced nothing, or it raised.
+    `watch` and `datastore` are part of the hook signature but are not used here.
+    """
+    if html_content:
+        try:
+            from changedetectionio.processors.restock_diff.pure_python_extractor import extract_metadata_for_llm
+            metadata_block = extract_metadata_for_llm(html_content)
+        except Exception as e:
+            logger.debug(f"llm_metadata_enrich: extraction failed: {e}")
+        else:
+            if metadata_block:
+                return metadata_block
+
+    return None
@@ -287,3 +287,84 @@ def query_price_availability(extracted_data):
    # using something like babel you need to know the locale of the website and even then it can be problematic
    # we dont really do anything with the price data so far.. so just accept it the way it comes.
    return result
+
+
+# =============================================================================
+# Structured metadata for the LLM enricher — passed through verbatim
+# =============================================================================
+#
+# This surfaces the page's structured metadata (JSON-LD + OpenGraph site/type)
+# as-is for the LLM intent/summary prompts. We deliberately do NOT curate, field-
+# cherry-pick, or impose a size limit here:
+#
+#   * LLMs are trained on schema.org JSON-LD and read it natively, so handing it
+#     over verbatim lets ANY user intent ("list the SKUs", "did the release date
+#     change?", "is it a recipe or a product?") work without us pre-guessing which
+#     fields matter — and it covers non-product pages (NewsArticle, Event, JobPosting…)
+#     for free.
+#   * There is exactly one configurable budget for how much text reaches the LLM —
+#     max_input_chars (env LLM_MAX_INPUT_CHARS → settings → default), enforced by the
+#     evaluator. A second hardcoded cap here would be a competing, non-configurable
+#     source of truth. The caller decides how much fits.
+#
+# Extraction reuses the memory-safe extract_metadata_pure_python() (stdlib
+# html.parser, no lxml/libxml2) so it is safe to call on every changed watch
+# without the C-level leak extruct/lxml carries, and it is robust to dangling/
+# unclosed <script type="application/ld+json"> blocks (HTMLParser only emits a
+# block on a real closing tag, so an unterminated blob is dropped rather than
+# swallowing the rest of the document the way a greedy regex would).
+# =============================================================================
+
+
+def extract_metadata_for_llm(html_content) -> str:
+    """
+    Return the page's structured metadata for LLM context, or '' if none.
+
+    Output (either part omitted when absent):
+
+        Page context: site: ExampleShop | og:type: product
+        Structured metadata found on the page (JSON-LD):
+        {"@type":"Product","name":"Acme Widget","sku":"12345", ...}
+        {"@type":"BreadcrumbList", ...}
+
+    JSON-LD blocks are re-serialised compactly, one block per line. The parsed
+    schema.org structure (keys, values, nesting) is unchanged, but the text is NOT
+    guaranteed to be byte-identical to the page source: whitespace and string
+    escaping are normalised by json.dumps. No truncation or field selection is
+    applied; sizing is the caller's single configurable budget.
+
+    Args:
+        html_content: Raw HTML of the page; falsy input returns ''.
+
+    Returns:
+        The formatted metadata block, or '' when the HTML is empty, extraction
+        failed, or the page carries neither OpenGraph site/type nor JSON-LD.
+    """
+    if not html_content:
+        return ''
+
+    try:
+        data = extract_metadata_pure_python(html_content)
+    except Exception as e:
+        logger.debug(f"Metadata for LLM: extraction failed: {e}")
+        return ''
+    if not data:
+        # Nothing extracted (None / empty mapping): there is nothing to format.
+        return ''
+
+    parts = []
+
+    # OpenGraph site/type — page-kind context that is NOT carried in JSON-LD,
+    # so the model can tell an e-shop listing from a news feed.
+    # `or {}` also covers a key that is present but mapped to None.
+    og = data.get('opengraph') or {}
+    ctx = [
+        f"{label}: {og[key]}"
+        for label, key in (('site', 'og:site_name'), ('og:type', 'og:type'))
+        if og.get(key)
+    ]
+    if ctx:
+        parts.append('Page context: ' + ' | '.join(ctx))
+
+    # JSON-LD (compact re-dump only — whitespace normalisation, not curation).
+    # Each node is serialised on its own so one bad node cannot discard the rest.
+    lines = []
+    for node in data.get('json-ld') or []:
+        try:
+            lines.append(json.dumps(node, ensure_ascii=False, separators=(',', ':')))
+        except (TypeError, ValueError) as e:
+            logger.debug(f"Metadata for LLM: JSON-LD re-serialise failed: {e}")
+    if lines:
+        parts.append('Structured metadata found on the page (JSON-LD):\n' + '\n'.join(lines))
+
+    return '\n'.join(parts)
@@ -210,7 +210,8 @@ class TestEvaluateChange:

        diff = '- $500\n+ $400'
        intent = 'flag price drops'
-        cache_key = hashlib.sha256(f"{intent}||{diff}".encode()).hexdigest()
+        metadata = ''  # no enrichment in this test; folded into the key as a trailing ||
+        cache_key = hashlib.sha256(f"{intent}||{diff}||{metadata}".encode()).hexdigest()
        watch['llm_evaluation_cache'] = {
            cache_key: {'important': True, 'summary': 'cached result'}
        }
@@ -7,6 +7,7 @@ import pytest
 from changedetectionio.llm.prompt_builder import (
    build_eval_prompt,
    build_eval_system_prompt,
+    build_change_summary_prompt,
    build_setup_prompt,
    build_setup_system_prompt,
    SNAPSHOT_CONTEXT_CHARS,
@@ -71,6 +72,49 @@ class TestBuildEvalPrompt:
        assert len(prompt_without) < len(prompt_with)


+class TestMetadataEnrichmentInPrompts:
+    """The verbatim structured-metadata block must land in the eval and summary
+    user prompts when provided, and leave them unchanged when absent.
+
+    Covers build_eval_prompt() and build_change_summary_prompt() only.
+    """
+
+    # Sample block in the same layout extract_metadata_for_llm() documents:
+    # a "Page context" line, the JSON-LD header line, then one compact JSON object.
+    METADATA = (
+        "Page context: site: ExampleShop | og:type: product\n"
+        "Structured metadata found on the page (JSON-LD):\n"
+        '{"@type":"Product","name":"Acme Widget","sku":"12345","color":"blue"}'
+    )
+
+    def test_eval_prompt_includes_metadata(self):
+        prompt = build_eval_prompt(intent='alert on SKU change', diff='- a\n+ b',
+                                   metadata=self.METADATA)
+        assert self.METADATA in prompt
+        # A field we never whitelisted must survive verbatim
+        assert '"sku":"12345"' in prompt
+        assert '"color":"blue"' in prompt
+        # The block is appended AFTER the diff (diff stays the freshest pre-metadata content)
+        assert prompt.index('What changed (diff):') < prompt.index('Structured metadata found')
+
+    def test_eval_prompt_unchanged_without_metadata(self):
+        with_meta = build_eval_prompt(intent='i', diff='d', metadata=self.METADATA)
+        without = build_eval_prompt(intent='i', diff='d')
+        assert 'Structured metadata found' not in without
+        assert len(without) < len(with_meta)
+
+    def test_summary_prompt_includes_metadata(self):
+        prompt = build_change_summary_prompt(diff='- a\n+ b', custom_prompt='list the SKUs',
+                                             metadata=self.METADATA)
+        assert self.METADATA in prompt
+        assert '"sku":"12345"' in prompt
+
+    def test_summary_prompt_unchanged_without_metadata(self):
+        without = build_change_summary_prompt(diff='- a\n+ b', custom_prompt='x')
+        assert 'Structured metadata found' not in without
+
+    def test_empty_metadata_appends_nothing(self):
+        # Falsy metadata ('') must not add a trailing block/whitespace section
+        assert build_eval_prompt(intent='i', diff='d', metadata='') == build_eval_prompt(intent='i', diff='d')
+        assert (build_change_summary_prompt(diff='d', custom_prompt='c', metadata='')
+                == build_change_summary_prompt(diff='d', custom_prompt='c'))
+
+
 class TestBuildEvalSystemPrompt:
    def test_returns_string(self):
        result = build_eval_system_prompt()
@@ -0,0 +1,100 @@
+#!/usr/bin/env python3
+# coding=utf-8
+
+"""Unit tests for the memory-safe, verbatim structured-metadata block used by the LLM enricher.
+
+Run: python -m unittest changedetectionio.tests.unit.test_product_metadata_summary
+"""
+
+import json
+import unittest
+
+from changedetectionio.processors.restock_diff.pure_python_extractor import (
+    extract_metadata_for_llm,
+)
+
+
+def _page(*scripts, head_extra=''):
+    """Wrap body fragments (newline-joined) and optional <head> markup in a minimal HTML page."""
+    template = '<html><head>{}</head><body>{}</body></html>'
+    return template.format(head_extra, '\n'.join(scripts))
+
+
+class TestExtractMetadataForLLM(unittest.TestCase):
+    # Behavioural tests for extract_metadata_for_llm(): JSON-LD pass-through without
+    # field selection or truncation, the OpenGraph "Page context" line, and an empty
+    # string for malformed, unterminated or absent metadata.
+
+    def test_jsonld_passed_through_verbatim(self):
+        html = _page('''
+        <script type="application/ld+json">
+        {"@context":"https://schema.org","@type":"Product","name":"Acme Widget",
+         "sku":"12345","color":"blue","releaseDate":"2026-01-02",
+         "offers":{"@type":"Offer","price":"249.00","priceCurrency":"USD","availability":"https://schema.org/InStock"}}
+        </script>''')
+        out = extract_metadata_for_llm(html)
+        # Verbatim: fields we never "whitelisted" must still be present
+        self.assertIn('JSON-LD', out)
+        self.assertIn('"name":"Acme Widget"', out)
+        self.assertIn('"sku":"12345"', out)
+        self.assertIn('"color":"blue"', out)
+        self.assertIn('"releaseDate":"2026-01-02"', out)
+        self.assertIn('"availability":"https://schema.org/InStock"', out)
+
+    def test_no_size_or_count_limit_is_imposed(self):
+        # 50 products → all 50 must appear; sizing is the caller's budget, not ours.
+        prods = [f'{{"@type":"Product","name":"P{i}","sku":"S{i}"}}' for i in range(50)]
+        html = _page(f'<script type="application/ld+json">[{",".join(prods)}]</script>')
+        out = extract_metadata_for_llm(html)
+        # Checking the first and last product is enough to show nothing was cut off.
+        self.assertIn('"name":"P0"', out)
+        self.assertIn('"name":"P49"', out)
+        self.assertNotIn('more products', out)  # no truncation marker
+
+    def test_non_product_types_included(self):
+        # News / events / etc. are passed through too — not product-only.
+        html = _page('''<script type="application/ld+json">
+        {"@type":"NewsArticle","headline":"Big news","datePublished":"2026-05-30"}
+        </script>''')
+        out = extract_metadata_for_llm(html)
+        self.assertIn('"@type":"NewsArticle"', out)
+        self.assertIn('"headline":"Big news"', out)
+
+    def test_compact_reserialisation_is_valid_json(self):
+        html = _page('''<script type="application/ld+json">
+        {  "@type" : "Product" ,  "name" :  "Spaced Out"  }
+        </script>''')
+        out = extract_metadata_for_llm(html)
+        # With a single JSON-LD block the compact JSON is the last output line
+        # (it follows the "Structured metadata found ..." header line).
+        blob_line = out.splitlines()[-1]
+        # The re-dumped line must round-trip as valid JSON
+        self.assertEqual(json.loads(blob_line)['name'], 'Spaced Out')
+
+    def test_opengraph_page_context(self):
+        html = _page(
+            '<script type="application/ld+json">{"@type":"ItemList"}</script>',
+            head_extra='''
+                <meta property="og:site_name" content="ExampleShop">
+                <meta property="og:type" content="product.group">
+            ''',
+        )
+        out = extract_metadata_for_llm(html)
+        # Both OpenGraph values and the JSON-LD block must be present together.
+        self.assertIn('Page context: site: ExampleShop', out)
+        self.assertIn('og:type: product.group', out)
+        self.assertIn('"@type":"ItemList"', out)
+
+    def test_dangling_unclosed_jsonld_is_safe(self):
+        # An unterminated ld+json block must NOT swallow the document nor crash.
+        html = (
+            '<html><body>'
+            '<script type="application/ld+json">{"@type":"Product","name":"Broken","sku":"X"'
+            '<div>rest of page</div>'
+            '</body></html>'
+        )
+        self.assertEqual(extract_metadata_for_llm(html), '')
+
+    def test_invalid_json_skipped(self):
+        # Unparseable JSON-LD must produce no output at all (not a partial block).
+        html = _page('<script type="application/ld+json">{not valid json,,}</script>')
+        self.assertEqual(extract_metadata_for_llm(html), '')
+
+    def test_no_metadata_returns_empty(self):
+        # Covers both a page without any structured data and empty input.
+        self.assertEqual(extract_metadata_for_llm('<html><body><p>hi</p></body></html>'), '')
+        self.assertEqual(extract_metadata_for_llm(''), '')
+
+if __name__ == '__main__':
+    unittest.main()
@@ -450,7 +450,7 @@ async def async_update_worker(worker_id, q, notification_q, app, datastore, exec
                            try:
                                from changedetectionio.llm.evaluator import (
                                    evaluate_change, resolve_intent, resolve_llm_field,
-                                    summarise_change, _runtime_llm_config,
+                                    summarise_change, _runtime_llm_config, compute_llm_enrichment,
                                )
                                # _runtime_llm_config returns None (and logs a debug skip
                                # message) when the master 'llm_enabled' toggle is off, so
@@ -476,14 +476,20 @@ async def async_update_worker(worker_id, q, notification_q, app, datastore, exec
                                    else:
                                        _diff_text = contents

+                                    # Structured-metadata enrichment (verbatim JSON-LD/OpenGraph) from the
+                                    # raw HTML, appended to both the intent and summary prompts. Computed once
+                                    # and dropped automatically if it won't fit the max_input_chars budget.
+                                    _llm_raw_html = getattr(getattr(update_handler, 'fetcher', None), 'content', '') or ''
+                                    _llm_metadata = compute_llm_enrichment(watch, datastore, _llm_raw_html, _diff_text)
+
                                    # Step 1: AI Change Intent — may suppress notification
                                    _llm_intent, _llm_intent_source = resolve_intent(watch, datastore)
                                    if _llm_intent:
                                        set_watch_minitext_status(watch, "AI/LLM (rules)..")
                                        _llm_result = await loop.run_in_executor(
                                            executor,
-                                            lambda diff=_diff_text, snap=contents: evaluate_change(
-                                                watch, datastore, diff=diff, current_snapshot=snap
+                                            lambda diff=_diff_text, snap=contents, meta=_llm_metadata: evaluate_change(
+                                                watch, datastore, diff=diff, current_snapshot=snap, metadata=meta
                                            )
                                        )
                                        update_obj['_llm_result'] = _llm_result
@@ -502,8 +508,8 @@ async def async_update_worker(worker_id, q, notification_q, app, datastore, exec
                                        set_watch_minitext_status(watch, "AI/LLM (summary)..")
                                        _change_summary = await loop.run_in_executor(
                                            executor,
-                                            lambda diff=_diff_text, snap=contents: summarise_change(
-                                                watch, datastore, diff=diff, current_snapshot=snap
+                                            lambda diff=_diff_text, snap=contents, meta=_llm_metadata: summarise_change(
+                                                watch, datastore, diff=diff, current_snapshot=snap, metadata=meta
                                            )
                                        )
                                        if _change_summary:
@@ -557,6 +563,7 @@ async def async_update_worker(worker_id, q, notification_q, app, datastore, exec
                                        effective_prompt=get_effective_summary_prompt(watch, datastore),
                                        max_summary_tokens=_llm_max_summary_tokens,
                                        model=_llm_model,
+                                        metadata=_llm_metadata,
                                    )
                                    watch.save_llm_diff_summary(
                                        update_obj['_llm_change_summary'],
Author	SHA1	Message	Date
dgtlmoon	39fa7f9692	WIP	2026-05-30 13:35:49 +02:00
dgtlmoon	1e643b2244	LLM - Enrichen summary and intent/rules with the actual product metadata in the HTML document if it exists for greater precision.	2026-05-30 13:26:01 +02:00