"""
LLM fallback plugin for price and restock info extraction.
When the built-in structured-metadata extraction (JSON-LD, microdata, OpenGraph)
fails to produce both a price and availability, this plugin is called as a last
resort. It sends a trimmed, HTML-stripped version of the page to the configured
LLM and asks it to return a structured JSON answer.
The module-level `datastore` variable is injected at startup by
`inject_datastore_into_plugins()` in pluggy_interface.py.
"""
import json
import re
from loguru import logger
from changedetectionio.pluggy_interface import hookimpl
# Injected at startup by inject_datastore_into_plugins()
datastore = None
SYSTEM_PROMPT = (
'You are an expert price and restock extraction utility. '
'Your task is to analyse a product page and determine the price and stock status of the MAIN product only.\n\n'
'AVAILABILITY — treat as "in stock":\n'
'- Action buttons near the product: "Add to cart", "Add to basket", "Buy now", '
'"Order now", "Purchase", "Import", "Add to bag", "Add to trolley", "In stock", '
'"Available", "Ships in X days/weeks", "In store", "Pick up today".\n'
'- "Pre-order" or "Reserve" — the item is orderable, treat as "in stock".\n'
'- "Only X left", "Almost gone", "Low stock", "Limited availability" — still in stock.\n'
'- "Request a quote" or "Contact us for pricing" — item is available, price is null.\n'
'- IMPORTANT: Ignore cart/basket/bag links in the page HEADER or navigation bar '
'(e.g. a shopping cart icon showing item count). That reflects what is already in '
'the visitor\'s cart — it says nothing about whether THIS product is available.\n\n'
'PRICE — what NOT to use:\n'
'- A "$0.00" or "0" that appears near header/nav links such as "Login", "Wishlist", '
'"Contact Us", "My Account" is an empty shopping-cart indicator, NOT the product price. '
'Ignore it entirely — return null for price rather than 0 in this situation.\n'
'- Only return 0 (free) when the page clearly states the product itself costs nothing '
'(e.g. "Free", "Free download", "Price: $0").\n\n'
'AVAILABILITY — treat as "out of stock":\n'
'- "Out of stock", "Sold out", "Unavailable", "Currently unavailable", '
'"Temporarily out of stock", "Discontinued", "No longer available", '
'"Notify me when available", "Email me when back", "Join waitlist".\n\n'
'AVAILABILITY — return null when uncertain:\n'
'- The page asks the user to select a size, colour, or other variant first '
'("Select an option", "Choose a size") — availability depends on the variant, so return null.\n'
'- You cannot clearly tell from the page content whether the item is available.\n\n'
'PRICE rules:\n'
'- Extract the main selling price as a plain number, no currency symbol.\n'
'- Prices may use any popular locale format — interpret them all correctly and return a plain decimal number. '
'Examples: "10 000 Kč" = 10000, "1.299,95 €" = 1299.95, "1,299.95" = 1299.95, '
'"10 000,50" = 10000.50, "£1.299" = 1299, "¥10000" = 10000.\n'
'- If both an original (crossed-out) price and a sale/current price appear, use the sale price.\n'
'- "From $X" or "Starting at $X" are teaser prices — prefer a definite price or return null.\n'
'- A price of 0 (free) is valid — return 0, not null.\n'
'- If pricing requires a quote or login, return null for price.\n'
'- Ignore prices shown in search/filter UI elements (e.g. "Price from: — to:").\n'
'- IMPORTANT: Ignore ALL prices that appear inside or below recommendation/discovery blocks '
'such as: "Similar items", "You may also like", "Customers also bought", '
'"Based on your browsing", "Based on your shopping", "Frequently bought together", '
'"People also viewed", "Related products", "Sponsored products", "More like this", '
'"Other sellers", "Compare with similar items". '
'These sections contain prices for OTHER products, not the main product.\n'
'- When multiple prices appear on the page, prefer the price that is positioned '
'earliest/highest in the page content — it is almost always the main product price. '
'Prices appearing after large blocks of descriptive text or review sections are '
'likely from recommendation widgets and should be ignored.\n\n'
'CLASSIFIEDS AND LISTING PAGES:\n'
'- On classifieds or marketplace sites (e.g. eBay listings, Craigslist, Bazoš, Gumtree), '
'if a price is shown alongside seller contact details or a "Contact seller" link, '
'treat the item as "instock" — the listing being active means it is available.\n\n'
'Return ONLY a JSON object with exactly these three keys:\n'
' "price" — number or null\n'
' "currency" — ISO-4217 code (USD, EUR, GBP …) or null\n'
' "availability" — exactly one of: "instock", "outofstock", or null\n'
' Use "instock" when the product can be ordered/purchased.\n'
' Use "outofstock" when it cannot.\n'
' Use null when you genuinely cannot tell.\n'
'No markdown, no backticks, no explanation — pure JSON only.'
)
_MAX_CONTENT_CHARS = 8_000
def _extract_jsonld(html_content: str) -> str:
"""Extract JSON-LD blocks — these contain reliable structured product data."""
blocks = re.findall(
r'',
html_content, flags=re.DOTALL | re.IGNORECASE
)
if not blocks:
return ''
combined = ' '.join(b.strip() for b in blocks)
return combined[:2000]
# Semantic tags always treated as chrome (nav/header/footer)
_CHROME_TAGS = {'nav', 'header', 'footer', 'aside'}
# id/class fragments that strongly indicate navigation or site-chrome
_CHROME_PATTERNS = re.compile(
r'\b(nav|navigation|navbar|menu|mega-menu|breadcrumb|breadcrumbs?|'
r'site-header|page-header|top-bar|top-nav|top-header|mobile-nav|header-bar|'
r'site-footer|page-footer|footer-links|related|similar|'
r'you-?may-?also|customers?-?also|frequently-?bought|'
r'people-?also|sponsored|recommendation|widget|sidebar|'
r'cross-?sell|up-?sell)\b',
re.IGNORECASE,
)
def _remove_chrome(html_content: str) -> str:
"""Use BS4 to strip navigation, header, footer and recommendation noise.
Uses html.parser (built-in, no lxml) to avoid memory leak issues.
Falls back to the original HTML string if BS4 fails for any reason.
"""
try:
from bs4 import BeautifulSoup, Tag
soup = BeautifulSoup(html_content, 'html.parser')
# Snapshot the full tag list before any decompositions so we don't
# mutate the tree while iterating it. After a parent is decomposed
# its children become orphans (parent=None) — skip those.
for tag in list(soup.find_all(True)):
if not isinstance(tag, Tag) or tag.parent is None:
continue
name = tag.name or ''
if name in _CHROME_TAGS:
tag.decompose()
continue
try:
cls_list = tag.get('class') or []
cls_str = ' '.join(cls_list) if isinstance(cls_list, list) else str(cls_list)
id_str = tag.get('id') or ''
except Exception:
continue
if _CHROME_PATTERNS.search(cls_str + ' ' + id_str):
tag.decompose()
return str(soup)
except Exception as e:
logger.debug(f"BS4 chrome removal failed ({e}), using raw HTML")
return html_content
def _strip_html(html_content: str) -> str:
"""HTML-to-text for LLM consumption.
1. Extracts JSON-LD (structured product data) to prepend.
2. Strips nav/header/footer/recommendation blocks via BS4.
3. Removes all remaining tags and collapses whitespace.
JSON-LD is prepended so reliable price/availability data is always visible
to the LLM regardless of how deep it sits in the page.
"""
jsonld = _extract_jsonld(html_content)
# Remove site-chrome before generic tag stripping
cleaned = _remove_chrome(html_content)
# Drop HTML comments (can contain large disabled markup blocks)
text = re.sub(r'', ' ', cleaned, flags=re.DOTALL)
# Drop all