mirror of
https://github.com/dgtlmoon/changedetection.io.git
synced 2025-12-12 19:16:40 +00:00
Memory management improvements - LXML and other libraries can leak allocations, so wrap their calls in a sub-process (#2626)
This commit is contained in:
@@ -2,8 +2,7 @@ from .. import difference_detection_processor
|
|||||||
from ..exceptions import ProcessorException
|
from ..exceptions import ProcessorException
|
||||||
from . import Restock
|
from . import Restock
|
||||||
from loguru import logger
|
from loguru import logger
|
||||||
import hashlib
|
|
||||||
import re
|
|
||||||
import urllib3
|
import urllib3
|
||||||
import time
|
import time
|
||||||
|
|
||||||
@@ -36,6 +35,7 @@ def get_itemprop_availability(html_content) -> Restock:
|
|||||||
"""
|
"""
|
||||||
from jsonpath_ng import parse
|
from jsonpath_ng import parse
|
||||||
|
|
||||||
|
import re
|
||||||
now = time.time()
|
now = time.time()
|
||||||
import extruct
|
import extruct
|
||||||
logger.trace(f"Imported extruct module in {time.time() - now:.3f}s")
|
logger.trace(f"Imported extruct module in {time.time() - now:.3f}s")
|
||||||
@@ -122,6 +122,10 @@ class perform_site_check(difference_detection_processor):
|
|||||||
xpath_data = None
|
xpath_data = None
|
||||||
|
|
||||||
def run_changedetection(self, watch, skip_when_checksum_same=True):
|
def run_changedetection(self, watch, skip_when_checksum_same=True):
|
||||||
|
import hashlib
|
||||||
|
|
||||||
|
from concurrent.futures import ProcessPoolExecutor
|
||||||
|
from functools import partial
|
||||||
if not watch:
|
if not watch:
|
||||||
raise Exception("Watch no longer exists.")
|
raise Exception("Watch no longer exists.")
|
||||||
|
|
||||||
@@ -149,7 +153,11 @@ class perform_site_check(difference_detection_processor):
|
|||||||
|
|
||||||
itemprop_availability = {}
|
itemprop_availability = {}
|
||||||
try:
|
try:
|
||||||
itemprop_availability = get_itemprop_availability(html_content=self.fetcher.content)
|
with ProcessPoolExecutor() as executor:
|
||||||
|
# Use functools.partial to create a callable with arguments
|
||||||
|
# anything using bs4/lxml etc is quite "leaky"
|
||||||
|
future = executor.submit(partial(get_itemprop_availability, self.fetcher.content))
|
||||||
|
itemprop_availability = future.result()
|
||||||
except MoreThanOnePriceFound as e:
|
except MoreThanOnePriceFound as e:
|
||||||
# Add the real data
|
# Add the real data
|
||||||
raise ProcessorException(message="Cannot run, more than one price detected, this plugin is only for product pages with ONE product, try the content-change detection mode.",
|
raise ProcessorException(message="Cannot run, more than one price detected, this plugin is only for product pages with ONE product, try the content-change detection mode.",
|
||||||
|
|||||||
@@ -36,6 +36,9 @@ class PDFToHTMLToolNotFound(ValueError):
|
|||||||
class perform_site_check(difference_detection_processor):
|
class perform_site_check(difference_detection_processor):
|
||||||
|
|
||||||
def run_changedetection(self, watch, skip_when_checksum_same=True):
|
def run_changedetection(self, watch, skip_when_checksum_same=True):
|
||||||
|
from concurrent.futures import ProcessPoolExecutor
|
||||||
|
from functools import partial
|
||||||
|
|
||||||
changed_detected = False
|
changed_detected = False
|
||||||
html_content = ""
|
html_content = ""
|
||||||
screenshot = False # as bytes
|
screenshot = False # as bytes
|
||||||
@@ -171,20 +174,30 @@ class perform_site_check(difference_detection_processor):
|
|||||||
for filter_rule in include_filters_rule:
|
for filter_rule in include_filters_rule:
|
||||||
# For HTML/XML we offer xpath as an option, just start a regular xPath "/.."
|
# For HTML/XML we offer xpath as an option, just start a regular xPath "/.."
|
||||||
if filter_rule[0] == '/' or filter_rule.startswith('xpath:'):
|
if filter_rule[0] == '/' or filter_rule.startswith('xpath:'):
|
||||||
html_content += html_tools.xpath_filter(xpath_filter=filter_rule.replace('xpath:', ''),
|
with ProcessPoolExecutor() as executor:
|
||||||
|
# Use functools.partial to create a callable with arguments - anything using bs4/lxml etc is quite "leaky"
|
||||||
|
future = executor.submit(partial(html_tools.xpath_filter, xpath_filter=filter_rule.replace('xpath:', ''),
|
||||||
html_content=self.fetcher.content,
|
html_content=self.fetcher.content,
|
||||||
append_pretty_line_formatting=not watch.is_source_type_url,
|
append_pretty_line_formatting=not watch.is_source_type_url,
|
||||||
is_rss=is_rss)
|
is_rss=is_rss))
|
||||||
|
html_content += future.result()
|
||||||
|
|
||||||
elif filter_rule.startswith('xpath1:'):
|
elif filter_rule.startswith('xpath1:'):
|
||||||
html_content += html_tools.xpath1_filter(xpath_filter=filter_rule.replace('xpath1:', ''),
|
with ProcessPoolExecutor() as executor:
|
||||||
|
# Use functools.partial to create a callable with arguments - anything using bs4/lxml etc is quite "leaky"
|
||||||
|
future = executor.submit(partial(html_tools.xpath1_filter, xpath_filter=filter_rule.replace('xpath1:', ''),
|
||||||
html_content=self.fetcher.content,
|
html_content=self.fetcher.content,
|
||||||
append_pretty_line_formatting=not watch.is_source_type_url,
|
append_pretty_line_formatting=not watch.is_source_type_url,
|
||||||
is_rss=is_rss)
|
is_rss=is_rss))
|
||||||
|
html_content += future.result()
|
||||||
else:
|
else:
|
||||||
# CSS Filter, extract the HTML that matches and feed that into the existing inscriptis::get_text
|
with ProcessPoolExecutor() as executor:
|
||||||
html_content += html_tools.include_filters(include_filters=filter_rule,
|
# Use functools.partial to create a callable with arguments - anything using bs4/lxml etc is quite "leaky"
|
||||||
|
# CSS Filter, extract the HTML that matches and feed that into the existing inscriptis::get_text
|
||||||
|
future = executor.submit(partial(html_tools.include_filters, include_filters=filter_rule,
|
||||||
html_content=self.fetcher.content,
|
html_content=self.fetcher.content,
|
||||||
append_pretty_line_formatting=not watch.is_source_type_url)
|
append_pretty_line_formatting=not watch.is_source_type_url))
|
||||||
|
html_content += future.result()
|
||||||
|
|
||||||
if not html_content.strip():
|
if not html_content.strip():
|
||||||
raise FilterNotFoundInResponse(msg=include_filters_rule, screenshot=self.fetcher.screenshot, xpath_data=self.fetcher.xpath_data)
|
raise FilterNotFoundInResponse(msg=include_filters_rule, screenshot=self.fetcher.screenshot, xpath_data=self.fetcher.xpath_data)
|
||||||
@@ -197,12 +210,13 @@ class perform_site_check(difference_detection_processor):
|
|||||||
else:
|
else:
|
||||||
# extract text
|
# extract text
|
||||||
do_anchor = self.datastore.data["settings"]["application"].get("render_anchor_tag_content", False)
|
do_anchor = self.datastore.data["settings"]["application"].get("render_anchor_tag_content", False)
|
||||||
stripped_text_from_html = \
|
with ProcessPoolExecutor() as executor:
|
||||||
html_tools.html_to_text(
|
# Use functools.partial to create a callable with arguments - anything using bs4/lxml etc is quite "leaky"
|
||||||
html_content=html_content,
|
# Convert the collected HTML to plain text with html_tools.html_to_text, run in the sub-process
|
||||||
|
future = executor.submit(partial(html_tools.html_to_text, html_content=html_content,
|
||||||
render_anchor_tag_content=do_anchor,
|
render_anchor_tag_content=do_anchor,
|
||||||
is_rss=is_rss # #1874 activate the <title workaround hack
|
is_rss=is_rss)) #1874 activate the <title workaround hack
|
||||||
)
|
stripped_text_from_html = future.result()
|
||||||
|
|
||||||
if watch.get('sort_text_alphabetically') and stripped_text_from_html:
|
if watch.get('sort_text_alphabetically') and stripped_text_from_html:
|
||||||
# Note: Because a <p>something</p> will add an extra line feed to signify the paragraph gap
|
# Note: Because a <p>something</p> will add an extra line feed to signify the paragraph gap
|
||||||
|
|||||||
Reference in New Issue
Block a user