tweak

Use extruct as a last resort
Price tracker - Use subprocess on linux for cleaner memory management.
2026-02-15 10:46:01 +00:00 · 2026-02-11 17:08:58 +01:00 · 2026-02-11 16:59:44 +01:00 · 2026-02-11 16:40:12 +01:00
2 changed files with 541 additions and 1 deletions
--- a/changedetectionio/processors/restock_diff/processor.py
+++ b/changedetectionio/processors/restock_diff/processor.py
@@ -56,6 +56,259 @@ def _deduplicate_prices(data):
    return list(unique_data)
 # =============================================================================
 # MEMORY MANAGEMENT: Why We Use Multiprocessing (Linux Only)
 # =============================================================================
 #
 # The get_itemprop_availability() function uses 'extruct' to parse HTML metadata
 # (JSON-LD, microdata, OpenGraph, etc). Extruct internally uses lxml, which wraps
 # libxml2 - a C library that allocates memory at the C level.
 #
 # Memory Leak Problem:
 # --------------------
 # 1. lxml's document_fromstring() creates thousands of Python objects backed by
 #    C-level allocations (nodes, attributes, text content)
 # 2. Python's garbage collector can mark these objects as collectible, but
 #    cannot force the OS to reclaim the actual C-level memory
 # 3. malloc/free typically do not return memory to the OS - they just mark it as
 #    "free in the process address space"
 # 4. With repeated parsing of large HTML (5MB+ pages), memory accumulates even
 #    after Python GC runs
 #
 # Why Multiprocessing Fixes This:
 # --------------------------------
 # When a subprocess exits, the OS forcibly reclaims ALL memory including C-level
 # allocations that Python GC couldn't release. This ensures clean memory state
 # after each extraction.
 #
 # Performance Impact:
 # -------------------
 # - Memray analysis showed 1.2M document_fromstring allocations per page
 # - Without subprocess: memory grows by ~50-500MB per parse and lingers
 # - With subprocess: ~35MB overhead but forces full cleanup after each run
 # - Trade-off: 35MB resource_tracker vs 500MB+ accumulated leak = much better at scale
 #
 # References:
 # -----------
 # - lxml memory issues: https://medium.com/devopss-hole/python-lxml-memory-leak-b8d0b1000dc7
 # - libxml2 caching behavior: https://www.mail-archive.com/lxml@python.org/msg00026.html
 # - GC limitations with C extensions: https://benbernardblog.com/tracking-down-a-freaky-python-memory-leak-part-2/
 #
 # Additional Context:
 # -------------------
 # - jsonpath_ng (used to query the parsed data) is pure Python and doesn't leak
 # - The leak is specifically from lxml's document parsing, not the JSONPath queries
 # - Linux-only because multiprocessing spawn is well-tested there; other platforms
 #   use direct call as fallback
 #
 # Alternative Solution (Future Optimization):
 # -------------------------------------------
 # This entire problem could be avoided by using regex to extract just the machine
 # data blocks (JSON-LD, microdata, OpenGraph tags) BEFORE parsing with lxml:
 #
 #   1. Use regex to extract <script type="application/ld+json">...</script> blocks
 #   2. Use regex to extract <meta property="og:*"> tags
 #   3. Use regex to find itemprop/itemtype attributes and their containing elements
 #   4. Parse ONLY those extracted snippets instead of the entire HTML document
 #
 # Benefits:
 #   - Avoids parsing 5MB of HTML when we only need a few KB of metadata
 #   - Eliminates the lxml memory leak entirely
 #   - Faster extraction (regex is much faster than DOM parsing)
 #   - No subprocess overhead needed
 #
 # Trade-offs:
 #   - Regex for HTML is brittle (comments, CDATA, edge cases)
 #   - Microdata extraction would be complex (need to track element boundaries)
 #   - Would need extensive testing to ensure we don't miss valid data
 #   - extruct is battle-tested; regex solution would need similar maturity
 #
 # For now, the subprocess approach is safer and leverages existing extruct code.
 # =============================================================================
 def _extract_itemprop_availability_worker(pipe_conn):
    """
    Child-process entry point for the extruct/lxml based extraction.
    Protocol on the pipe:
      1. Read one bytes message containing the UTF-8 encoded HTML.
      2. Run get_itemprop_availability() on the decoded text.
      3. Write back one bytes message containing UTF-8 encoded JSON:
           {'success': True,  'data': {...}}                       on success
           {'success': False, 'exception_type': 'MoreThanOnePriceFound'}
           {'success': False, 'exception_type': ..., 'exception_message': ...}
    The connection is always closed before the function returns.
    Args:
        pipe_conn: Connection used to receive the HTML and send the reply
    """
    import gc
    import json
    def _reply(payload):
        # Replies go over the wire as JSON bytes, never as pickled objects
        pipe_conn.send_bytes(json.dumps(payload).encode('utf-8'))
    html_content = None
    result_data = None
    try:
        raw_html = pipe_conn.recv_bytes()
        html_content = raw_html.decode('utf-8')
        # The encoded copy is no longer needed once we hold the decoded text
        del raw_html
        gc.collect()
        # The actual extruct/lxml work happens here, inside the child
        result_data = get_itemprop_availability(html_content)
        # The extraction result is flattened to a plain dict so it can be JSON encoded
        _reply({
            'success': True,
            'data': dict(result_data) if result_data else {}
        })
        # Drop the big objects before leaving the try block
        html_content = None
        result_data = None
        gc.collect()
    except MoreThanOnePriceFound:
        # Only the type name is sent; the parent rebuilds the exception
        _reply({
            'success': False,
            'exception_type': 'MoreThanOnePriceFound'
        })
    except Exception as e:
        # Anything else is reported by type name and message
        _reply({
            'success': False,
            'exception_type': type(e).__name__,
            'exception_message': str(e)
        })
    finally:
        # Release whatever is still referenced, then close our end of the pipe
        html_content = None
        result_data = None
        gc.collect()
        pipe_conn.close()
 def _run_itemprop_extraction_in_subprocess(html_content):
    """
    Run _extract_itemprop_availability_worker in a spawned child and return its reply.
    The HTML is sent to the child as UTF-8 bytes and the reply is read back as
    UTF-8 JSON (no pickling of the payload in either direction). The pipe ends
    are closed and a started child is joined on every exit path.
    Args:
        html_content: HTML string to parse
    Returns:
        The decoded JSON reply sent by the worker
    Raises:
        Exception: Any failure of the transport itself (spawn error, broken
            pipe, EOFError when the child exits without replying, invalid JSON)
    """
    import json
    import multiprocessing
    # Encode before spawning so an encoding error never leaves a child waiting for input
    html_bytes = html_content.encode('utf-8')
    ctx = multiprocessing.get_context('spawn')
    parent_conn, child_conn = ctx.Pipe()
    p = ctx.Process(target=_extract_itemprop_availability_worker, args=(child_conn,))
    started = False
    try:
        p.start()
        started = True
        # The child owns its end now. Keeping our copy open would stop recv_bytes()
        # from ever seeing EOF if the child dies before replying.
        child_conn.close()
        parent_conn.send_bytes(html_bytes)
        # Drop the encoded copy while we wait for the child
        del html_bytes
        return json.loads(parent_conn.recv_bytes().decode('utf-8'))
    finally:
        # Connection.close() is safe to call on an already closed connection
        parent_conn.close()
        child_conn.close()
        if started:
            p.join()
 def extract_itemprop_availability_safe(html_content) -> Restock:
    """
    Extract itemprop availability with hybrid approach for memory efficiency.
    Strategy:
    1. Try the pure Python extractors first (no lxml). Their result is used only
       when it contains both a price and an availability value.
    2. Otherwise fall back to get_itemprop_availability() (extruct/lxml). On Linux
       this runs in a spawned subprocess; on other platforms it is called directly.
    If the subprocess transport fails, or the worker reports an error other than
    MoreThanOnePriceFound, a warning is logged and get_itemprop_availability() is
    called directly in this process.
    Args:
        html_content: HTML string to parse
    Returns:
        Restock: Extracted availability data
    Raises:
        MoreThanOnePriceFound: When the worker reports multiple prices. This is
            raised straight away, without re-running the extraction in-process.
        Other exceptions: Whatever the direct get_itemprop_availability() call raises
    """
    import platform
    # Step 1: Try pure Python extraction first (fast, no lxml, no memory leak)
    try:
        from .pure_python_extractor import extract_metadata_pure_python, query_price_availability
        logger.trace("Attempting pure Python metadata extraction (no lxml)")
        extracted_data = extract_metadata_pure_python(html_content)
        price_data = query_price_availability(extracted_data)
        # If we got price AND availability, we're done!
        if price_data.get('price') and price_data.get('availability'):
            result = Restock(price_data)
            logger.debug(f"Pure Python extraction successful: {dict(result)}")
            return result
        # Partial data is only logged; the extruct result below replaces it
        if price_data.get('price') or price_data.get('availability'):
            logger.debug(f"Pure Python extraction partial: {price_data}, will try extruct for completeness")
    except Exception as e:
        # Best effort: any failure here simply means we use extruct instead
        logger.debug(f"Pure Python extraction failed: {e}, falling back to extruct")
    # Step 2: Fall back to extruct (uses lxml, needs subprocess on Linux)
    logger.trace("Falling back to extruct (lxml-based) with subprocess isolation")
    if platform.system() != 'Linux':
        # Non-Linux: direct call (no subprocess overhead needed)
        return get_itemprop_availability(html_content)
    try:
        reply = _run_itemprop_extraction_in_subprocess(html_content)
        if reply['success']:
            # Reconstruct Restock object from dict
            return Restock(reply['data'])
        exception_type = reply['exception_type']
        exception_msg = reply.get('exception_message', '')
    except Exception as e:
        # If multiprocessing itself fails, log and fall back to direct call
        logger.warning(f"Subprocess extraction failed: {e}, falling back to direct call")
        return get_itemprop_availability(html_content)
    # Raised outside the try block on purpose: this is a real extraction result,
    # not a transport failure, so it must not trigger a second in-process parse.
    if exception_type == 'MoreThanOnePriceFound':
        raise MoreThanOnePriceFound()
    # Any other worker-side error may be specific to the child (for example a value
    # that could not be JSON encoded), so retry directly as before.
    logger.warning(f"Subprocess extraction failed: {exception_type}: {exception_msg}, falling back to direct call")
    return get_itemprop_availability(html_content)
 # should return Restock()
 # add casting?
 def get_itemprop_availability(html_content) -> Restock:
@@ -196,8 +449,9 @@ class perform_site_check(difference_detection_processor):
        multiple_prices_found = False
        # Try built-in extraction first, this will scan metadata in the HTML
        # On Linux, this runs in a subprocess to prevent lxml/extruct memory leaks
        try:
-            itemprop_availability = get_itemprop_availability(self.fetcher.content)
+            itemprop_availability = extract_itemprop_availability_safe(self.fetcher.content)
        except MoreThanOnePriceFound as e:
            # Don't raise immediately - let plugins try to handle this case
            # Plugins might be able to determine which price is correct
--- a/changedetectionio/processors/restock_diff/pure_python_extractor.py
+++ b/changedetectionio/processors/restock_diff/pure_python_extractor.py
@@ -0,0 +1,286 @@
 """
 Pure Python metadata extractor - no lxml, no memory leaks.
 This module provides a fast, memory-efficient alternative to extruct for common
 e-commerce metadata extraction. It handles:
 - JSON-LD (covers 80%+ of modern sites)
 - OpenGraph meta tags
 - Basic microdata attributes
 Uses Python's built-in html.parser instead of lxml/libxml2, avoiding C-level
 memory allocation issues. For edge cases, the main processor can fall back to
 extruct (with subprocess isolation on Linux).
 """
 from html.parser import HTMLParser
 import json
 import re
 from loguru import logger
 class JSONLDExtractor(HTMLParser):
    """
    Extract JSON-LD structured data from HTML.
    Finds all <script type="application/ld+json"> tags and parses their content.
    Handles multiple JSON-LD blocks on the same page. The type attribute is
    matched case-insensitively and any parameters after ';' (for example
    "; charset=utf-8") are ignored.
    After feed(), self.data holds every parsed value: a top-level JSON array
    contributes each of its elements, anything else contributes one entry.
    Blocks that are empty or not valid JSON are skipped (logged at debug level).
    """
    def __init__(self):
        super().__init__()
        self.in_jsonld = False  # True while inside a JSON-LD <script> element
        self.data = []  # List of all parsed JSON-LD objects
        self.current_script = []  # Text chunks of the script currently being read
    def handle_starttag(self, tag, attrs):
        if tag != 'script':
            return
        for attr, value in attrs:
            # value is None for an attribute written without a value
            if attr != 'type' or not value:
                continue
            # HTMLParser lower-cases attribute names but leaves values untouched,
            # so normalise the media type ourselves before comparing.
            media_type = value.split(';', 1)[0].strip().lower()
            if media_type == 'application/ld+json':
                self.in_jsonld = True
                self.current_script = []
                break
    def handle_data(self, data):
        # Script text may arrive in several chunks; collect them all
        if self.in_jsonld:
            self.current_script.append(data)
    def handle_endtag(self, tag):
        if tag != 'script' or not self.in_jsonld:
            return
        script_content = ''.join(self.current_script)
        if script_content.strip():
            try:
                # Parse JSON (handles both objects and arrays)
                parsed = json.loads(script_content)
            except json.JSONDecodeError as e:
                logger.debug(f"Failed to parse JSON-LD: {e}")
            else:
                if isinstance(parsed, list):
                    self.data.extend(parsed)
                else:
                    self.data.append(parsed)
        self.in_jsonld = False
        self.current_script = []
 class OpenGraphExtractor(HTMLParser):
    """
    Extract OpenGraph meta tags from HTML.
    Collects every <meta property="og:..." content="..."> tag into self.og_data,
    keyed by the full property name. Tags with a missing or empty content are
    ignored; when a property occurs more than once the last one wins.
    """
    def __init__(self):
        super().__init__()
        self.og_data = {}  # property name -> content string
    def handle_starttag(self, tag, attrs):
        if tag != 'meta':
            return
        attrs_dict = dict(attrs)
        # An attribute written without a value (<meta property>) is reported as
        # None, which .get(key, '') would return as-is; "or ''" covers that case.
        prop = attrs_dict.get('property') or ''
        if not prop.startswith('og:'):
            return
        content = attrs_dict.get('content') or ''
        if content:
            self.og_data[prop] = content
 class MicrodataExtractor(HTMLParser):
    """
    Extract basic microdata attributes from HTML.
    Looks for itemprop="price", itemprop="priceCurrency" and
    itemprop="availability" and stores them in self.microdata under the keys
    'price', 'currency' and 'availability'. The value comes from the content
    attribute (or href for availability) when present, otherwise from the
    element's text. This is a simplified extractor that doesn't handle nested
    itemscope/itemtype hierarchies - for complex cases, use extruct as fallback.
    Note: a price read from an attribute is stored as the raw string, while a
    price read from element text is stored as a float. When a property occurs
    more than once the last value wins.
    """
    # Void elements never produce an end tag, so they can never contain the text
    # we would otherwise wait for.
    _VOID_ELEMENTS = frozenset({
        'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input',
        'link', 'meta', 'source', 'track', 'wbr',
    })
    def __init__(self):
        super().__init__()
        self.microdata = {}
        # Name of the itemprop whose text content we are currently waiting for
        self.current_itemprop = None
    def handle_starttag(self, tag, attrs):
        attrs_dict = dict(attrs)
        itemprop = attrs_dict.get('itemprop')
        if itemprop is None:
            return
        # Only non-void elements may switch on text capture; otherwise a bare
        # <meta itemprop="price"> would claim the next unrelated text node.
        may_capture_text = tag not in self._VOID_ELEMENTS
        content = attrs_dict.get('content')
        if itemprop == 'price':
            if content is not None:
                self.microdata['price'] = content
            elif may_capture_text:
                self.current_itemprop = 'price'
        elif itemprop == 'priceCurrency':
            if content is not None:
                self.microdata['currency'] = content
            elif may_capture_text:
                self.current_itemprop = 'priceCurrency'
        elif itemprop == 'availability':
            # Can be in href (link) or content (meta)
            href = attrs_dict.get('href')
            if href is not None:
                self.microdata['availability'] = href
            elif content is not None:
                self.microdata['availability'] = content
            elif may_capture_text:
                self.current_itemprop = 'availability'
    def handle_data(self, data):
        # Capture text content for itemprop elements
        text = data.strip()
        if self.current_itemprop == 'price':
            # Keep digits and dots only, then try to read the rest as a number
            price_text = re.sub(r'[^\d.]', '', text)
            if price_text:
                try:
                    self.microdata['price'] = float(price_text)
                except ValueError:
                    # e.g. "1.2.3" - leave any earlier value untouched
                    pass
        elif self.current_itemprop == 'priceCurrency':
            if text:
                self.microdata['currency'] = text
        elif self.current_itemprop == 'availability':
            if text:
                self.microdata['availability'] = text
    def handle_endtag(self, tag):
        # Any closing tag ends text capture (nesting is not tracked)
        self.current_itemprop = None
 def extract_metadata_pure_python(html_content):
    """
    Run the JSON-LD, OpenGraph and microdata extractors over one HTML document.
    Each extractor parses the document independently. A failure in one of them
    is logged at debug level and leaves that entry at its empty default; the
    other extractors still run.
    Args:
        html_content: HTML string to parse
    Returns:
        dict: {'json-ld': list, 'opengraph': dict, 'microdata': dict}
    """
    metadata = {'json-ld': [], 'opengraph': {}, 'microdata': {}}
    # JSON-LD: the block count is traced even when it is zero
    try:
        ld_parser = JSONLDExtractor()
        ld_parser.feed(html_content)
        metadata['json-ld'] = ld_parser.data
        logger.trace(f"Pure Python: Found {len(ld_parser.data)} JSON-LD blocks")
    except Exception as e:
        logger.debug(f"JSON-LD extraction failed: {e}")
    # OpenGraph: traced only when at least one tag was found
    try:
        og_parser = OpenGraphExtractor()
        og_parser.feed(html_content)
        og_tags = og_parser.og_data
        metadata['opengraph'] = og_tags
        if og_tags:
            logger.trace(f"Pure Python: Found {len(og_tags)} OpenGraph tags")
    except Exception as e:
        logger.debug(f"OpenGraph extraction failed: {e}")
    # Microdata: traced only when at least one property was found
    try:
        md_parser = MicrodataExtractor()
        md_parser.feed(html_content)
        found_props = md_parser.microdata
        metadata['microdata'] = found_props
        if found_props:
            logger.trace(f"Pure Python: Found microdata: {found_props}")
    except Exception as e:
        logger.debug(f"Microdata extraction failed: {e}")
    return metadata
 def query_price_availability(extracted_data):
    """
    Query extracted metadata for price and availability information.
    Sources are consulted in order and a later source only fills keys that are
    still missing or falsy:
      1. JSON-LD, searched with jsonpath_ng. Blocks are scanned in order and the
         scan stops at the first block that yields a price.
      2. OpenGraph ('og:price:amount', 'og:price:currency', 'og:availability').
      3. Microdata ('price', 'currency', 'availability').
    Args:
        extracted_data: Dict from extract_metadata_pure_python()
    Returns:
        dict: May contain 'price', 'currency' and 'availability'. Keys that were
        not found are absent. JSON-LD and OpenGraph prices are floats; a
        microdata price is passed through unchanged and may be a string.
    """
    result = {}
    # 1. Try JSON-LD first (most reliable and common)
    jsonld_blocks = extracted_data.get('json-ld', [])
    if jsonld_blocks:
        # Imported and compiled once, and only when there is something to query
        from jsonpath_ng import parse
        price_parse = parse('$..(price|Price)')
        availability_parse = parse('$..(availability|Availability)')
        # NOTE(review): 'priceCurrency' is listed twice here; a capitalised
        # variant may have been intended - confirm before changing the pattern.
        currency_parse = parse('$..(priceCurrency|currency|priceCurrency)')
        for data in jsonld_blocks:
            try:
                price_results = [m.value for m in price_parse.find(data)]
                if price_results and not result.get('price'):
                    # Only the first match is considered
                    price_val = price_results[0]
                    if isinstance(price_val, (int, float)):
                        result['price'] = float(price_val)
                    elif isinstance(price_val, str):
                        # Extract numeric value from string
                        try:
                            result['price'] = float(re.sub(r'[^\d.]', '', price_val))
                        except ValueError:
                            pass
                avail_results = [m.value for m in availability_parse.find(data)]
                # A JSON null must not be turned into the string 'None'
                if avail_results and avail_results[0] is not None and not result.get('availability'):
                    result['availability'] = str(avail_results[0])
                curr_results = [m.value for m in currency_parse.find(data)]
                if curr_results and curr_results[0] is not None and not result.get('currency'):
                    result['currency'] = str(curr_results[0])
                # If we found price, this JSON-LD block is good
                if result.get('price'):
                    logger.debug(f"Pure Python: Found price data in JSON-LD: {result}")
                    break
            except Exception as e:
                # Best effort: a block that cannot be queried is skipped
                logger.debug(f"Error querying JSON-LD: {e}")
                continue
    # 2. Try OpenGraph if JSON-LD didn't provide everything
    og_data = extracted_data.get('opengraph', {})
    if not result.get('price') and 'og:price:amount' in og_data:
        try:
            result['price'] = float(og_data['og:price:amount'])
        except ValueError:
            # Not a plain number (e.g. "12,50"); leave price for the next source
            pass
    if not result.get('currency') and 'og:price:currency' in og_data:
        result['currency'] = og_data['og:price:currency']
    if not result.get('availability') and 'og:availability' in og_data:
        result['availability'] = og_data['og:availability']
    # 3. Use microdata as last resort
    microdata = extracted_data.get('microdata', {})
    for key in ('price', 'currency', 'availability'):
        if not result.get(key) and key in microdata:
            result[key] = microdata[key]
    return result
Author	SHA1	Message	Date
dgtlmoon	9729f4c4e4	tweak	2026-02-11 17:08:58 +01:00
dgtlmoon	759d4118bf	Use extruct as a last resort	2026-02-11 16:59:44 +01:00
dgtlmoon	bafbdfb5c0	Price tracker - Use subprocess on linux for cleaner memory management.	2026-02-11 16:40:12 +01:00