mirror of
https://github.com/dgtlmoon/changedetection.io.git
synced 2026-05-29 21:11:50 +00:00
Use extruct as a last resort
This commit is contained in:
@@ -139,13 +139,21 @@ def _extract_itemprop_availability_worker(pipe_conn):
|
||||
pipe_conn: Pipe connection to receive HTML and send result
|
||||
"""
|
||||
import json
|
||||
import gc
|
||||
|
||||
html_content = None
|
||||
result_data = None
|
||||
|
||||
try:
|
||||
# Receive HTML as raw bytes (no pickle)
|
||||
html_bytes = pipe_conn.recv_bytes()
|
||||
html_content = html_bytes.decode('utf-8')
|
||||
|
||||
# Perform extraction in subprocess
|
||||
# Explicitly delete html_bytes to free memory
|
||||
del html_bytes
|
||||
gc.collect()
|
||||
|
||||
# Perform extraction in subprocess (uses extruct/lxml)
|
||||
result_data = get_itemprop_availability(html_content)
|
||||
|
||||
# Convert Restock object to dict for JSON serialization
|
||||
@@ -155,6 +163,10 @@ def _extract_itemprop_availability_worker(pipe_conn):
|
||||
}
|
||||
pipe_conn.send_bytes(json.dumps(result).encode('utf-8'))
|
||||
|
||||
# Clean up before exit
|
||||
del result_data, html_content, result
|
||||
gc.collect()
|
||||
|
||||
except MoreThanOnePriceFound:
|
||||
# Serialize the specific exception type
|
||||
result = {
|
||||
@@ -173,15 +185,22 @@ def _extract_itemprop_availability_worker(pipe_conn):
|
||||
pipe_conn.send_bytes(json.dumps(result).encode('utf-8'))
|
||||
|
||||
finally:
|
||||
# Final cleanup before subprocess exits
|
||||
if html_content is not None:
|
||||
del html_content
|
||||
if result_data is not None:
|
||||
del result_data
|
||||
gc.collect()
|
||||
pipe_conn.close()
|
||||
|
||||
|
||||
def extract_itemprop_availability_safe(html_content) -> Restock:
|
||||
"""
|
||||
Extract itemprop availability with platform-aware memory management.
|
||||
Extract itemprop availability with hybrid approach for memory efficiency.
|
||||
|
||||
On Linux: Uses subprocess to isolate extruct/lxml memory and force OS cleanup.
|
||||
On other platforms: Direct call (multiprocessing spawn has issues on some platforms).
|
||||
Strategy (fastest to slowest, least to most memory):
|
||||
1. Try pure Python extraction (JSON-LD, OpenGraph, microdata) - covers 80%+ of cases
|
||||
2. Fall back to extruct with subprocess isolation on Linux for complex cases
|
||||
|
||||
Args:
|
||||
html_content: HTML string to parse
|
||||
@@ -194,13 +213,37 @@ def extract_itemprop_availability_safe(html_content) -> Restock:
|
||||
Other exceptions: From extruct/parsing
|
||||
"""
|
||||
import platform
|
||||
import sys
|
||||
|
||||
# Step 1: Try pure Python extraction first (fast, no lxml, no memory leak)
|
||||
try:
|
||||
from .pure_python_extractor import extract_metadata_pure_python, query_price_availability
|
||||
|
||||
logger.trace("Attempting pure Python metadata extraction (no lxml)")
|
||||
extracted_data = extract_metadata_pure_python(html_content)
|
||||
price_data = query_price_availability(extracted_data)
|
||||
|
||||
# If we got price AND availability, we're done!
|
||||
if price_data.get('price') and price_data.get('availability'):
|
||||
result = Restock(price_data)
|
||||
logger.debug(f"Pure Python extraction successful: {dict(result)}")
|
||||
return result
|
||||
|
||||
# If we got some data but not everything, still try extruct for completeness
|
||||
if price_data.get('price') or price_data.get('availability'):
|
||||
logger.debug(f"Pure Python extraction partial: {price_data}, will try extruct for completeness")
|
||||
|
||||
except Exception as e:
|
||||
logger.debug(f"Pure Python extraction failed: {e}, falling back to extruct")
|
||||
|
||||
# Step 2: Fall back to extruct (uses lxml, needs subprocess on Linux)
|
||||
logger.trace("Falling back to extruct (lxml-based) with subprocess isolation")
|
||||
|
||||
# Only use subprocess isolation on Linux
|
||||
# Other platforms may have issues with spawn or don't need the aggressive memory management
|
||||
if platform.system() == 'Linux':
|
||||
import multiprocessing
|
||||
import json
|
||||
import gc
|
||||
|
||||
try:
|
||||
ctx = multiprocessing.get_context('spawn')
|
||||
@@ -212,35 +255,52 @@ def extract_itemprop_availability_safe(html_content) -> Restock:
|
||||
html_bytes = html_content.encode('utf-8')
|
||||
parent_conn.send_bytes(html_bytes)
|
||||
|
||||
# Explicitly delete html_bytes copy immediately after sending
|
||||
del html_bytes
|
||||
gc.collect()
|
||||
|
||||
# Receive result as JSON
|
||||
result_bytes = parent_conn.recv_bytes()
|
||||
result = json.loads(result_bytes.decode('utf-8'))
|
||||
|
||||
# Wait for subprocess to complete
|
||||
p.join()
|
||||
|
||||
# Close pipes
|
||||
parent_conn.close()
|
||||
child_conn.close()
|
||||
|
||||
# Clean up explicitly
|
||||
del p, parent_conn, child_conn, html_bytes
|
||||
# Clean up all subprocess-related objects
|
||||
del p, parent_conn, child_conn, result_bytes
|
||||
gc.collect()
|
||||
|
||||
# Handle result or re-raise exception
|
||||
if result['success']:
|
||||
# Reconstruct Restock object from dict
|
||||
return Restock(result['data'])
|
||||
restock_obj = Restock(result['data'])
|
||||
# Clean up result dict
|
||||
del result
|
||||
gc.collect()
|
||||
return restock_obj
|
||||
else:
|
||||
# Re-raise the exception that occurred in subprocess
|
||||
if result['exception_type'] == 'MoreThanOnePriceFound':
|
||||
exception_type = result['exception_type']
|
||||
exception_msg = result.get('exception_message', '')
|
||||
del result
|
||||
gc.collect()
|
||||
|
||||
if exception_type == 'MoreThanOnePriceFound':
|
||||
raise MoreThanOnePriceFound()
|
||||
else:
|
||||
exception_msg = result.get('exception_message', '')
|
||||
raise Exception(f"{result['exception_type']}: {exception_msg}")
|
||||
raise Exception(f"{exception_type}: {exception_msg}")
|
||||
|
||||
except Exception as e:
|
||||
# If multiprocessing itself fails, log and fall back to direct call
|
||||
logger.warning(f"Subprocess extraction failed: {e}, falling back to direct call")
|
||||
gc.collect()
|
||||
return get_itemprop_availability(html_content)
|
||||
else:
|
||||
# Non-Linux: direct call
|
||||
# Non-Linux: direct call (no subprocess overhead needed)
|
||||
return get_itemprop_availability(html_content)
|
||||
|
||||
|
||||
|
||||
@@ -0,0 +1,286 @@
|
||||
"""
|
||||
Pure Python metadata extractor - no lxml, no memory leaks.
|
||||
|
||||
This module provides a fast, memory-efficient alternative to extruct for common
|
||||
e-commerce metadata extraction. It handles:
|
||||
- JSON-LD (covers 80%+ of modern sites)
|
||||
- OpenGraph meta tags
|
||||
- Basic microdata attributes
|
||||
|
||||
Uses Python's built-in html.parser instead of lxml/libxml2, avoiding C-level
|
||||
memory allocation issues. For edge cases, the main processor can fall back to
|
||||
extruct (with subprocess isolation on Linux).
|
||||
"""
|
||||
|
||||
from html.parser import HTMLParser
|
||||
import json
|
||||
import re
|
||||
from loguru import logger
|
||||
|
||||
|
||||
class JSONLDExtractor(HTMLParser):
|
||||
"""
|
||||
Extract JSON-LD structured data from HTML.
|
||||
|
||||
Finds all <script type="application/ld+json"> tags and parses their content.
|
||||
Handles multiple JSON-LD blocks on the same page.
|
||||
"""
|
||||
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
self.in_jsonld = False
|
||||
self.data = [] # List of all parsed JSON-LD objects
|
||||
self.current_script = []
|
||||
|
||||
def handle_starttag(self, tag, attrs):
|
||||
if tag == 'script':
|
||||
# Check if this is a JSON-LD script tag
|
||||
for attr, value in attrs:
|
||||
if attr == 'type' and value == 'application/ld+json':
|
||||
self.in_jsonld = True
|
||||
self.current_script = []
|
||||
break
|
||||
|
||||
def handle_data(self, data):
|
||||
if self.in_jsonld:
|
||||
self.current_script.append(data)
|
||||
|
||||
def handle_endtag(self, tag):
|
||||
if tag == 'script' and self.in_jsonld:
|
||||
# Parse the accumulated script content
|
||||
script_content = ''.join(self.current_script)
|
||||
if script_content.strip():
|
||||
try:
|
||||
# Parse JSON (handles both objects and arrays)
|
||||
parsed = json.loads(script_content)
|
||||
if isinstance(parsed, list):
|
||||
self.data.extend(parsed)
|
||||
else:
|
||||
self.data.append(parsed)
|
||||
except json.JSONDecodeError as e:
|
||||
logger.debug(f"Failed to parse JSON-LD: {e}")
|
||||
pass
|
||||
|
||||
self.in_jsonld = False
|
||||
self.current_script = []
|
||||
|
||||
|
||||
class OpenGraphExtractor(HTMLParser):
|
||||
"""
|
||||
Extract OpenGraph meta tags from HTML.
|
||||
|
||||
Finds <meta property="og:*"> tags commonly used for social media sharing.
|
||||
"""
|
||||
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
self.og_data = {}
|
||||
|
||||
def handle_starttag(self, tag, attrs):
|
||||
if tag == 'meta':
|
||||
attrs_dict = dict(attrs)
|
||||
prop = attrs_dict.get('property', '')
|
||||
|
||||
# Extract OpenGraph properties
|
||||
if prop.startswith('og:'):
|
||||
content = attrs_dict.get('content', '')
|
||||
if content:
|
||||
self.og_data[prop] = content
|
||||
|
||||
|
||||
class MicrodataExtractor(HTMLParser):
|
||||
"""
|
||||
Extract basic microdata attributes from HTML.
|
||||
|
||||
Finds elements with itemprop attributes. This is a simplified extractor
|
||||
that doesn't handle nested itemscope/itemtype hierarchies - for complex
|
||||
cases, use extruct as fallback.
|
||||
"""
|
||||
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
self.microdata = {}
|
||||
self.current_itemprop = None
|
||||
|
||||
def handle_starttag(self, tag, attrs):
|
||||
attrs_dict = dict(attrs)
|
||||
|
||||
if 'itemprop' in attrs_dict:
|
||||
itemprop = attrs_dict['itemprop']
|
||||
|
||||
# Price/currency/availability can be in content/href attributes
|
||||
if itemprop == 'price':
|
||||
if 'content' in attrs_dict:
|
||||
self.microdata['price'] = attrs_dict['content']
|
||||
else:
|
||||
self.current_itemprop = 'price'
|
||||
|
||||
elif itemprop == 'priceCurrency':
|
||||
if 'content' in attrs_dict:
|
||||
self.microdata['currency'] = attrs_dict['content']
|
||||
else:
|
||||
self.current_itemprop = 'priceCurrency'
|
||||
|
||||
elif itemprop == 'availability':
|
||||
# Can be in href (link) or content (meta)
|
||||
if 'href' in attrs_dict:
|
||||
self.microdata['availability'] = attrs_dict['href']
|
||||
elif 'content' in attrs_dict:
|
||||
self.microdata['availability'] = attrs_dict['content']
|
||||
else:
|
||||
self.current_itemprop = 'availability'
|
||||
|
||||
def handle_data(self, data):
|
||||
# Capture text content for itemprop elements
|
||||
if self.current_itemprop == 'price':
|
||||
# Try to extract numeric price from text
|
||||
try:
|
||||
price_text = re.sub(r'[^\d.]', '', data.strip())
|
||||
if price_text:
|
||||
self.microdata['price'] = float(price_text)
|
||||
except ValueError:
|
||||
pass
|
||||
elif self.current_itemprop == 'priceCurrency':
|
||||
currency = data.strip()
|
||||
if currency:
|
||||
self.microdata['currency'] = currency
|
||||
elif self.current_itemprop == 'availability':
|
||||
availability = data.strip()
|
||||
if availability:
|
||||
self.microdata['availability'] = availability
|
||||
|
||||
def handle_endtag(self, tag):
|
||||
# Reset current itemprop after closing tag
|
||||
self.current_itemprop = None
|
||||
|
||||
|
||||
def extract_metadata_pure_python(html_content):
|
||||
"""
|
||||
Extract structured metadata from HTML using pure Python parsers.
|
||||
|
||||
Returns a dict with three keys:
|
||||
- 'json-ld': List of parsed JSON-LD objects
|
||||
- 'opengraph': Dict of OpenGraph properties
|
||||
- 'microdata': Dict of microdata properties
|
||||
|
||||
Args:
|
||||
html_content: HTML string to parse
|
||||
|
||||
Returns:
|
||||
dict: Extracted metadata in three formats
|
||||
"""
|
||||
result = {
|
||||
'json-ld': [],
|
||||
'opengraph': {},
|
||||
'microdata': {}
|
||||
}
|
||||
|
||||
# Extract JSON-LD
|
||||
try:
|
||||
jsonld_extractor = JSONLDExtractor()
|
||||
jsonld_extractor.feed(html_content)
|
||||
result['json-ld'] = jsonld_extractor.data
|
||||
logger.trace(f"Pure Python: Found {len(jsonld_extractor.data)} JSON-LD blocks")
|
||||
except Exception as e:
|
||||
logger.debug(f"JSON-LD extraction failed: {e}")
|
||||
|
||||
# Extract OpenGraph
|
||||
try:
|
||||
og_extractor = OpenGraphExtractor()
|
||||
og_extractor.feed(html_content)
|
||||
result['opengraph'] = og_extractor.og_data
|
||||
if result['opengraph']:
|
||||
logger.trace(f"Pure Python: Found {len(og_extractor.og_data)} OpenGraph tags")
|
||||
except Exception as e:
|
||||
logger.debug(f"OpenGraph extraction failed: {e}")
|
||||
|
||||
# Extract Microdata
|
||||
try:
|
||||
microdata_extractor = MicrodataExtractor()
|
||||
microdata_extractor.feed(html_content)
|
||||
result['microdata'] = microdata_extractor.microdata
|
||||
if result['microdata']:
|
||||
logger.trace(f"Pure Python: Found microdata: {result['microdata']}")
|
||||
except Exception as e:
|
||||
logger.debug(f"Microdata extraction failed: {e}")
|
||||
|
||||
return result
|
||||
|
||||
|
||||
def query_price_availability(extracted_data):
|
||||
"""
|
||||
Query extracted metadata for price and availability information.
|
||||
|
||||
Uses jsonpath_ng to query JSON-LD data (same approach as extruct).
|
||||
Falls back to OpenGraph and microdata if JSON-LD doesn't have the data.
|
||||
|
||||
Args:
|
||||
extracted_data: Dict from extract_metadata_pure_python()
|
||||
|
||||
Returns:
|
||||
dict: {'price': float, 'currency': str, 'availability': str}
|
||||
"""
|
||||
from jsonpath_ng import parse
|
||||
|
||||
result = {}
|
||||
|
||||
# 1. Try JSON-LD first (most reliable and common)
|
||||
for data in extracted_data.get('json-ld', []):
|
||||
try:
|
||||
# Use jsonpath to find price/availability anywhere in the structure
|
||||
price_parse = parse('$..(price|Price)')
|
||||
availability_parse = parse('$..(availability|Availability)')
|
||||
currency_parse = parse('$..(priceCurrency|currency|priceCurrency)')
|
||||
|
||||
price_results = [m.value for m in price_parse.find(data)]
|
||||
if price_results and not result.get('price'):
|
||||
# Handle various price formats
|
||||
price_val = price_results[0]
|
||||
if isinstance(price_val, (int, float)):
|
||||
result['price'] = float(price_val)
|
||||
elif isinstance(price_val, str):
|
||||
# Extract numeric value from string
|
||||
try:
|
||||
result['price'] = float(re.sub(r'[^\d.]', '', price_val))
|
||||
except ValueError:
|
||||
pass
|
||||
|
||||
avail_results = [m.value for m in availability_parse.find(data)]
|
||||
if avail_results and not result.get('availability'):
|
||||
result['availability'] = str(avail_results[0])
|
||||
|
||||
curr_results = [m.value for m in currency_parse.find(data)]
|
||||
if curr_results and not result.get('currency'):
|
||||
result['currency'] = str(curr_results[0])
|
||||
|
||||
# If we found price, this JSON-LD block is good
|
||||
if result.get('price'):
|
||||
logger.debug(f"Pure Python: Found price data in JSON-LD: {result}")
|
||||
break
|
||||
|
||||
except Exception as e:
|
||||
logger.debug(f"Error querying JSON-LD: {e}")
|
||||
continue
|
||||
|
||||
# 2. Try OpenGraph if JSON-LD didn't provide everything
|
||||
og_data = extracted_data.get('opengraph', {})
|
||||
if not result.get('price') and 'og:price:amount' in og_data:
|
||||
try:
|
||||
result['price'] = float(og_data['og:price:amount'])
|
||||
except ValueError:
|
||||
pass
|
||||
if not result.get('currency') and 'og:price:currency' in og_data:
|
||||
result['currency'] = og_data['og:price:currency']
|
||||
if not result.get('availability') and 'og:availability' in og_data:
|
||||
result['availability'] = og_data['og:availability']
|
||||
|
||||
# 3. Use microdata as last resort
|
||||
microdata = extracted_data.get('microdata', {})
|
||||
if not result.get('price') and 'price' in microdata:
|
||||
result['price'] = microdata['price']
|
||||
if not result.get('currency') and 'currency' in microdata:
|
||||
result['currency'] = microdata['currency']
|
||||
if not result.get('availability') and 'availability' in microdata:
|
||||
result['availability'] = microdata['availability']
|
||||
|
||||
return result
|
||||
Reference in New Issue
Block a user