mirror of
https://github.com/dgtlmoon/changedetection.io.git
synced 2026-02-15 10:46:01 +00:00
Compare commits
3 Commits
API-fields
...
memfix-lin
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
9729f4c4e4 | ||
|
|
759d4118bf | ||
|
|
bafbdfb5c0 |
@@ -56,6 +56,259 @@ def _deduplicate_prices(data):
|
|||||||
return list(unique_data)
|
return list(unique_data)
|
||||||
|
|
||||||
|
|
||||||
|
# =============================================================================
|
||||||
|
# MEMORY MANAGEMENT: Why We Use Multiprocessing (Linux Only)
|
||||||
|
# =============================================================================
|
||||||
|
#
|
||||||
|
# The get_itemprop_availability() function uses 'extruct' to parse HTML metadata
|
||||||
|
# (JSON-LD, microdata, OpenGraph, etc). Extruct internally uses lxml, which wraps
|
||||||
|
# libxml2 - a C library that allocates memory at the C level.
|
||||||
|
#
|
||||||
|
# Memory Leak Problem:
|
||||||
|
# --------------------
|
||||||
|
# 1. lxml's document_fromstring() creates thousands of Python objects backed by
|
||||||
|
# C-level allocations (nodes, attributes, text content)
|
||||||
|
# 2. Python's garbage collector can mark these objects as collectible, but
|
||||||
|
# cannot force the OS to reclaim the actual C-level memory
|
||||||
|
# 3. malloc/free typically don't return memory to the OS - they just mark it as
|
||||||
|
# "free in the process address space"
|
||||||
|
# 4. With repeated parsing of large HTML (5MB+ pages), memory accumulates even
|
||||||
|
# after Python GC runs
|
||||||
|
#
|
||||||
|
# Why Multiprocessing Fixes This:
|
||||||
|
# --------------------------------
|
||||||
|
# When a subprocess exits, the OS forcibly reclaims ALL memory including C-level
|
||||||
|
# allocations that Python GC couldn't release. This ensures clean memory state
|
||||||
|
# after each extraction.
|
||||||
|
#
|
||||||
|
# Performance Impact:
|
||||||
|
# -------------------
|
||||||
|
# - Memray analysis showed 1.2M document_fromstring allocations per page
|
||||||
|
# - Without subprocess: memory grows by ~50-500MB per parse and lingers
|
||||||
|
# - With subprocess: ~35MB overhead but forces full cleanup after each run
|
||||||
|
# - Trade-off: 35MB resource_tracker vs 500MB+ accumulated leak = much better at scale
|
||||||
|
#
|
||||||
|
# References:
|
||||||
|
# -----------
|
||||||
|
# - lxml memory issues: https://medium.com/devopss-hole/python-lxml-memory-leak-b8d0b1000dc7
|
||||||
|
# - libxml2 caching behavior: https://www.mail-archive.com/lxml@python.org/msg00026.html
|
||||||
|
# - GC limitations with C extensions: https://benbernardblog.com/tracking-down-a-freaky-python-memory-leak-part-2/
|
||||||
|
#
|
||||||
|
# Additional Context:
|
||||||
|
# -------------------
|
||||||
|
# - jsonpath_ng (used to query the parsed data) is pure Python and doesn't leak
|
||||||
|
# - The leak is specifically from lxml's document parsing, not the JSONPath queries
|
||||||
|
# - Linux-only because multiprocessing spawn is well-tested there; other platforms
|
||||||
|
# use direct call as fallback
|
||||||
|
#
|
||||||
|
# Alternative Solution (Future Optimization):
|
||||||
|
# -------------------------------------------
|
||||||
|
# This entire problem could be avoided by using regex to extract just the machine
|
||||||
|
# data blocks (JSON-LD, microdata, OpenGraph tags) BEFORE parsing with lxml:
|
||||||
|
#
|
||||||
|
# 1. Use regex to extract <script type="application/ld+json">...</script> blocks
|
||||||
|
# 2. Use regex to extract <meta property="og:*"> tags
|
||||||
|
# 3. Use regex to find itemprop/itemtype attributes and their containing elements
|
||||||
|
# 4. Parse ONLY those extracted snippets instead of the entire HTML document
|
||||||
|
#
|
||||||
|
# Benefits:
|
||||||
|
# - Avoids parsing 5MB of HTML when we only need a few KB of metadata
|
||||||
|
# - Eliminates the lxml memory leak entirely
|
||||||
|
# - Faster extraction (regex is much faster than DOM parsing)
|
||||||
|
# - No subprocess overhead needed
|
||||||
|
#
|
||||||
|
# Trade-offs:
|
||||||
|
# - Regex for HTML is brittle (comments, CDATA, edge cases)
|
||||||
|
# - Microdata extraction would be complex (need to track element boundaries)
|
||||||
|
# - Would need extensive testing to ensure we don't miss valid data
|
||||||
|
# - extruct is battle-tested; regex solution would need similar maturity
|
||||||
|
#
|
||||||
|
# For now, the subprocess approach is safer and leverages existing extruct code.
|
||||||
|
# =============================================================================
|
||||||
|
|
||||||
|
|
||||||
|
def _extract_itemprop_availability_worker(pipe_conn):
    """
    Child-process entry point for itemprop extraction.

    Reads one UTF-8 encoded HTML message from ``pipe_conn``, runs
    ``get_itemprop_availability()`` on it and writes back exactly one UTF-8
    encoded JSON message:

    - ``{"success": true, "data": {...}}`` when extraction succeeds
    - ``{"success": false, "exception_type": "MoreThanOnePriceFound"}``
    - ``{"success": false, "exception_type": ..., "exception_message": ...}``
      for any other exception

    The connection is always closed before returning.

    Args:
        pipe_conn: Pipe connection to receive HTML and send result
    """
    import json
    import gc

    def reply(payload):
        # Results travel as JSON bytes, not pickled objects.
        pipe_conn.send_bytes(json.dumps(payload).encode('utf-8'))

    html_content = None
    result_data = None

    try:
        raw_html = pipe_conn.recv_bytes()
        html_content = raw_html.decode('utf-8')

        # The encoded copy is no longer needed once decoded.
        del raw_html
        gc.collect()

        # The extruct/lxml work happens here, inside the subprocess.
        result_data = get_itemprop_availability(html_content)

        # A Restock object is flattened to a plain dict so it can be JSON encoded.
        reply({
            'success': True,
            'data': dict(result_data) if result_data else {},
        })

        # Drop the large references before exit.
        html_content = None
        result_data = None
        gc.collect()

    except MoreThanOnePriceFound:
        # Reported by name so the parent can raise the same exception type.
        reply({
            'success': False,
            'exception_type': 'MoreThanOnePriceFound',
        })

    except Exception as e:
        reply({
            'success': False,
            'exception_type': type(e).__name__,
            'exception_message': str(e),
        })

    finally:
        # Release whatever is still referenced, then close our end of the pipe.
        html_content = None
        result_data = None
        gc.collect()
        pipe_conn.close()
|
||||||
|
|
||||||
|
|
||||||
|
def _run_itemprop_extraction_subprocess(html_content):
    """
    Run _extract_itemprop_availability_worker in a spawned process.

    Sends the HTML to the worker as UTF-8 bytes and returns the worker's
    decoded JSON reply. Any failure to start the process, talk to it or
    decode its reply propagates to the caller.
    """
    import multiprocessing
    import json

    ctx = multiprocessing.get_context('spawn')
    parent_conn, child_conn = ctx.Pipe()
    p = None
    try:
        p = ctx.Process(target=_extract_itemprop_availability_worker, args=(child_conn,))
        p.start()

        # Close our copy of the child's end. While the parent keeps it open,
        # a worker that dies early never produces EOF/broken-pipe on
        # parent_conn and the send/recv below would block forever.
        child_conn.close()

        # Send HTML as raw bytes (no pickle)
        parent_conn.send_bytes(html_content.encode('utf-8'))

        # Receive result as JSON
        reply = json.loads(parent_conn.recv_bytes().decode('utf-8'))

        # Wait for subprocess to complete
        p.join()
        return reply
    finally:
        # Runs on success and on every error path; Connection.close() is
        # safe to call on an already closed connection.
        parent_conn.close()
        child_conn.close()
        if p is not None and p.is_alive():
            # Only reachable after a failure above: do not leave a stray worker.
            p.terminate()
            p.join()


def extract_itemprop_availability_safe(html_content) -> Restock:
    """
    Extract itemprop availability with hybrid approach for memory efficiency.

    Strategy:
    1. Try the pure Python extractors (no lxml). If they yield both a price
       and an availability, that result is returned immediately.
    2. Otherwise fall back to get_itemprop_availability() (extruct/lxml).
       On Linux this runs in a spawned subprocess; on other platforms it is
       called directly.

    If the subprocess machinery fails, or the worker reports an error other
    than MoreThanOnePriceFound, a warning is logged and the extraction is
    retried with a direct in-process call, so callers see the same exception
    types as a direct call would raise.

    Args:
        html_content: HTML string to parse

    Returns:
        Restock: Extracted availability data

    Raises:
        MoreThanOnePriceFound: When multiple prices detected
        Other exceptions: From extruct/parsing
    """
    import platform
    import gc

    # Step 1: Try pure Python extraction first (fast, no lxml, no memory leak)
    try:
        from .pure_python_extractor import extract_metadata_pure_python, query_price_availability

        logger.trace("Attempting pure Python metadata extraction (no lxml)")
        extracted_data = extract_metadata_pure_python(html_content)
        price_data = query_price_availability(extracted_data)

        # If we got price AND availability, we're done!
        if price_data.get('price') and price_data.get('availability'):
            result = Restock(price_data)
            logger.debug(f"Pure Python extraction successful: {dict(result)}")
            return result

        # If we got some data but not everything, still try extruct for completeness
        if price_data.get('price') or price_data.get('availability'):
            logger.debug(f"Pure Python extraction partial: {price_data}, will try extruct for completeness")

    except Exception as e:
        logger.debug(f"Pure Python extraction failed: {e}, falling back to extruct")

    # Step 2: Fall back to extruct (uses lxml, needs subprocess on Linux)
    logger.trace("Falling back to extruct (lxml-based) with subprocess isolation")

    # Non-Linux: direct call (no subprocess overhead needed)
    if platform.system() != 'Linux':
        return get_itemprop_availability(html_content)

    try:
        reply = _run_itemprop_extraction_subprocess(html_content)
        if reply['success']:
            # Reconstruct Restock object from dict
            return Restock(reply['data'])
        exception_type = reply['exception_type']
        exception_msg = reply.get('exception_message', '')
    except Exception as e:
        # If multiprocessing itself fails, log and fall back to direct call
        logger.warning(f"Subprocess extraction failed: {e}, falling back to direct call")
        gc.collect()
        return get_itemprop_availability(html_content)

    # The worker ran and reported an exception. This is handled outside the
    # try block above on purpose: raising inside it would be swallowed by the
    # broad handler and trigger a second, in-process lxml parse.
    if exception_type == 'MoreThanOnePriceFound':
        raise MoreThanOnePriceFound()

    # Any other worker error: retry in-process so the caller gets either a
    # result or the original exception type.
    logger.warning(f"Subprocess extraction failed: {exception_type}: {exception_msg}, falling back to direct call")
    gc.collect()
    return get_itemprop_availability(html_content)
|
||||||
|
|
||||||
|
|
||||||
# should return Restock()
|
# should return Restock()
|
||||||
# add casting?
|
# add casting?
|
||||||
def get_itemprop_availability(html_content) -> Restock:
|
def get_itemprop_availability(html_content) -> Restock:
|
||||||
@@ -196,8 +449,9 @@ class perform_site_check(difference_detection_processor):
|
|||||||
multiple_prices_found = False
|
multiple_prices_found = False
|
||||||
|
|
||||||
# Try built-in extraction first, this will scan metadata in the HTML
|
# Try built-in extraction first, this will scan metadata in the HTML
|
||||||
|
# On Linux, this runs in a subprocess to prevent lxml/extruct memory leaks
|
||||||
try:
|
try:
|
||||||
itemprop_availability = get_itemprop_availability(self.fetcher.content)
|
itemprop_availability = extract_itemprop_availability_safe(self.fetcher.content)
|
||||||
except MoreThanOnePriceFound as e:
|
except MoreThanOnePriceFound as e:
|
||||||
# Don't raise immediately - let plugins try to handle this case
|
# Don't raise immediately - let plugins try to handle this case
|
||||||
# Plugins might be able to determine which price is correct
|
# Plugins might be able to determine which price is correct
|
||||||
|
|||||||
@@ -0,0 +1,286 @@
|
|||||||
|
"""
|
||||||
|
Pure Python metadata extractor - no lxml, no memory leaks.
|
||||||
|
|
||||||
|
This module provides a fast, memory-efficient alternative to extruct for common
|
||||||
|
e-commerce metadata extraction. It handles:
|
||||||
|
- JSON-LD (covers 80%+ of modern sites)
|
||||||
|
- OpenGraph meta tags
|
||||||
|
- Basic microdata attributes
|
||||||
|
|
||||||
|
Uses Python's built-in html.parser instead of lxml/libxml2, avoiding C-level
|
||||||
|
memory allocation issues. For edge cases, the main processor can fall back to
|
||||||
|
extruct (with subprocess isolation on Linux).
|
||||||
|
"""
|
||||||
|
|
||||||
|
from html.parser import HTMLParser
|
||||||
|
import json
|
||||||
|
import re
|
||||||
|
from loguru import logger
|
||||||
|
|
||||||
|
|
||||||
|
class JSONLDExtractor(HTMLParser):
    """
    Extract JSON-LD structured data from HTML.

    Finds all <script type="application/ld+json"> tags and parses their content.
    Handles multiple JSON-LD blocks on the same page. The type attribute is
    matched case-insensitively, ignoring surrounding whitespace and any
    ";parameter" suffix.

    After feed(), ``data`` holds every parsed value: a top-level JSON array
    contributes its elements individually, anything else is appended as-is.
    Blocks that are not valid JSON are logged at debug level and skipped.
    """

    def __init__(self):
        super().__init__()
        self.in_jsonld = False
        self.data = []  # List of all parsed JSON-LD objects
        self.current_script = []  # Text chunks of the script being read

    @staticmethod
    def _is_jsonld_type(value):
        """Return True when a script type attribute value denotes JSON-LD."""
        # value is None for a bare attribute such as <script type>
        if not value:
            return False
        return value.split(';', 1)[0].strip().lower() == 'application/ld+json'

    def handle_starttag(self, tag, attrs):
        if tag != 'script':
            return
        for attr, value in attrs:
            if attr == 'type' and self._is_jsonld_type(value):
                self.in_jsonld = True
                self.current_script = []
                break

    def handle_data(self, data):
        # The parser may deliver script text in several chunks.
        if self.in_jsonld:
            self.current_script.append(data)

    def handle_endtag(self, tag):
        if tag == 'script' and self.in_jsonld:
            # Parse the accumulated script content
            script_content = ''.join(self.current_script)
            if script_content.strip():
                try:
                    # Parse JSON (handles both objects and arrays)
                    parsed = json.loads(script_content)
                    if isinstance(parsed, list):
                        self.data.extend(parsed)
                    else:
                        self.data.append(parsed)
                except json.JSONDecodeError as e:
                    logger.debug(f"Failed to parse JSON-LD: {e}")

            self.in_jsonld = False
            self.current_script = []
|
||||||
|
|
||||||
|
|
||||||
|
class OpenGraphExtractor(HTMLParser):
    """
    Extract OpenGraph meta tags from HTML.

    Finds <meta property="og:*"> tags and stores their non-empty ``content``
    in ``og_data``, keyed by the full property name. When a property occurs
    more than once, the last non-empty value wins.
    """

    def __init__(self):
        super().__init__()
        self.og_data = {}

    def handle_starttag(self, tag, attrs):
        if tag != 'meta':
            return

        attrs_dict = dict(attrs)

        # A bare attribute (<meta property>) is reported with value None, so
        # normalise to '' before calling str methods on it; otherwise one
        # malformed tag aborts parsing of the whole document.
        prop = attrs_dict.get('property') or ''

        # Extract OpenGraph properties
        if prop.startswith('og:'):
            content = attrs_dict.get('content') or ''
            if content:
                self.og_data[prop] = content
|
||||||
|
|
||||||
|
|
||||||
|
class MicrodataExtractor(HTMLParser):
    """
    Collect price, currency and availability from itemprop attributes.

    This is a flat scan: nested itemscope/itemtype hierarchies are not
    modelled, and a later occurrence of a property overwrites an earlier one.
    Results are stored in ``microdata`` under the keys 'price', 'currency'
    and 'availability'. For complex cases, use extruct as fallback.
    """

    # itemprop name -> (key written to self.microdata,
    #                   attributes consulted, in priority order)
    _TRACKED_PROPS = {
        'price': ('price', ('content',)),
        'priceCurrency': ('currency', ('content',)),
        'availability': ('availability', ('href', 'content')),
    }

    def __init__(self):
        super().__init__()
        self.microdata = {}
        # Name of the itemprop whose text content we are waiting for, if any.
        self.current_itemprop = None

    def handle_starttag(self, tag, attrs):
        attributes = dict(attrs)
        if 'itemprop' not in attributes:
            return

        prop_name = attributes['itemprop']
        spec = self._TRACKED_PROPS.get(prop_name)
        if spec is None:
            return

        target_key, value_attrs = spec
        for attr_name in value_attrs:
            if attr_name in attributes:
                # Value carried on the tag itself (meta content / link href).
                self.microdata[target_key] = attributes[attr_name]
                return

        # No value attribute: take the value from the element's text instead.
        self.current_itemprop = prop_name

    def handle_data(self, data):
        pending = self.current_itemprop
        text = data.strip()

        if pending == 'price':
            # Keep digits and dots only, then convert to a number.
            digits = re.sub(r'[^\d.]', '', text)
            if digits:
                try:
                    self.microdata['price'] = float(digits)
                except ValueError:
                    pass
        elif pending == 'priceCurrency':
            if text:
                self.microdata['currency'] = text
        elif pending == 'availability':
            if text:
                self.microdata['availability'] = text

    def handle_endtag(self, tag):
        # Any closing tag ends the wait for text content.
        self.current_itemprop = None
|
||||||
|
|
||||||
|
|
||||||
|
def extract_metadata_pure_python(html_content):
    """
    Run the three pure Python extractors over one HTML document.

    Each extractor parses the document independently. If one of them raises,
    the failure is logged at debug level and that section keeps its empty
    default, so the other sections are still returned.

    Args:
        html_content: HTML string to parse

    Returns:
        dict: with the keys
            - 'json-ld': List of parsed JSON-LD objects
            - 'opengraph': Dict of OpenGraph properties
            - 'microdata': Dict of microdata properties
    """
    jsonld_blocks = []
    opengraph_tags = {}
    microdata_props = {}

    # JSON-LD
    try:
        jsonld_parser = JSONLDExtractor()
        jsonld_parser.feed(html_content)
        jsonld_blocks = jsonld_parser.data
        logger.trace(f"Pure Python: Found {len(jsonld_blocks)} JSON-LD blocks")
    except Exception as e:
        logger.debug(f"JSON-LD extraction failed: {e}")

    # OpenGraph
    try:
        og_parser = OpenGraphExtractor()
        og_parser.feed(html_content)
        opengraph_tags = og_parser.og_data
        if opengraph_tags:
            logger.trace(f"Pure Python: Found {len(opengraph_tags)} OpenGraph tags")
    except Exception as e:
        logger.debug(f"OpenGraph extraction failed: {e}")

    # Microdata
    try:
        microdata_parser = MicrodataExtractor()
        microdata_parser.feed(html_content)
        microdata_props = microdata_parser.microdata
        if microdata_props:
            logger.trace(f"Pure Python: Found microdata: {microdata_props}")
    except Exception as e:
        logger.debug(f"Microdata extraction failed: {e}")

    return {
        'json-ld': jsonld_blocks,
        'opengraph': opengraph_tags,
        'microdata': microdata_props,
    }
|
||||||
|
|
||||||
|
|
||||||
|
def query_price_availability(extracted_data):
    """
    Query extracted metadata for price and availability information.

    Sources are consulted in order, each one only filling fields that are
    still missing:

    1. JSON-LD, searched at any depth with jsonpath_ng. Blocks are scanned
       in order and scanning stops at the first block that yields a price.
    2. OpenGraph ('og:price:amount', 'og:price:currency', 'og:availability').
    3. Microdata ('price', 'currency', 'availability').

    Note that "missing" is tested by truthiness, so a price of 0 from one
    source is treated as not found and may be replaced by a later source.

    Args:
        extracted_data: Dict from extract_metadata_pure_python()

    Returns:
        dict: Any subset of the keys 'price', 'currency', 'availability'.
            A price taken from JSON-LD or OpenGraph is a float; a microdata
            price is passed through as the extractor stored it.
    """
    from jsonpath_ng import parse

    result = {}

    # 1. Try JSON-LD first (most reliable and common)
    jsonld_blocks = extracted_data.get('json-ld', [])
    if jsonld_blocks:
        try:
            # Compile the expressions once instead of once per block; skipped
            # entirely when the page has no JSON-LD.
            price_parse = parse('$..(price|Price)')
            availability_parse = parse('$..(availability|Availability)')
            currency_parse = parse('$..(priceCurrency|currency)')
        except Exception as e:
            logger.debug(f"Error querying JSON-LD: {e}")
            jsonld_blocks = []

    for data in jsonld_blocks:
        try:
            # Use jsonpath to find price/availability anywhere in the structure
            price_results = [m.value for m in price_parse.find(data)]
            if price_results and not result.get('price'):
                # Only the first match is considered; it may be a number or
                # a string such as "$12.50".
                price_val = price_results[0]
                if isinstance(price_val, (int, float)):
                    result['price'] = float(price_val)
                elif isinstance(price_val, str):
                    # Extract numeric value from string
                    try:
                        result['price'] = float(re.sub(r'[^\d.]', '', price_val))
                    except ValueError:
                        pass

            avail_results = [m.value for m in availability_parse.find(data)]
            if avail_results and not result.get('availability'):
                result['availability'] = str(avail_results[0])

            curr_results = [m.value for m in currency_parse.find(data)]
            if curr_results and not result.get('currency'):
                result['currency'] = str(curr_results[0])

            # If we found price, this JSON-LD block is good
            if result.get('price'):
                logger.debug(f"Pure Python: Found price data in JSON-LD: {result}")
                break

        except Exception as e:
            logger.debug(f"Error querying JSON-LD: {e}")
            continue

    # 2. Try OpenGraph if JSON-LD didn't provide everything
    og_data = extracted_data.get('opengraph', {})
    if not result.get('price') and 'og:price:amount' in og_data:
        try:
            result['price'] = float(og_data['og:price:amount'])
        except ValueError:
            pass
    if not result.get('currency') and 'og:price:currency' in og_data:
        result['currency'] = og_data['og:price:currency']
    if not result.get('availability') and 'og:availability' in og_data:
        result['availability'] = og_data['og:availability']

    # 3. Use microdata as last resort
    microdata = extracted_data.get('microdata', {})
    if not result.get('price') and 'price' in microdata:
        result['price'] = microdata['price']
    if not result.get('currency') and 'currency' in microdata:
        result['currency'] = microdata['currency']
    if not result.get('availability') and 'availability' in microdata:
        result['availability'] = microdata['availability']

    return result
|
||||||
Reference in New Issue
Block a user