mirror of
https://github.com/dgtlmoon/changedetection.io.git
synced 2025-11-05 00:56:06 +00:00
Compare commits
2 Commits
docker-bui
...
reverse-26
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
0f2b2f4258 | ||
|
|
192ae8064c |
@@ -143,8 +143,6 @@ class perform_site_check(difference_detection_processor):
|
|||||||
def run_changedetection(self, watch, skip_when_checksum_same=True):
|
def run_changedetection(self, watch, skip_when_checksum_same=True):
|
||||||
import hashlib
|
import hashlib
|
||||||
|
|
||||||
from concurrent.futures import ProcessPoolExecutor
|
|
||||||
from functools import partial
|
|
||||||
if not watch:
|
if not watch:
|
||||||
raise Exception("Watch no longer exists.")
|
raise Exception("Watch no longer exists.")
|
||||||
|
|
||||||
@@ -186,11 +184,7 @@ class perform_site_check(difference_detection_processor):
|
|||||||
|
|
||||||
itemprop_availability = {}
|
itemprop_availability = {}
|
||||||
try:
|
try:
|
||||||
with ProcessPoolExecutor() as executor:
|
itemprop_availability = get_itemprop_availability(self.fetcher.content)
|
||||||
# Use functools.partial to create a callable with arguments
|
|
||||||
# anything using bs4/lxml etc is quite "leaky"
|
|
||||||
future = executor.submit(partial(get_itemprop_availability, self.fetcher.content))
|
|
||||||
itemprop_availability = future.result()
|
|
||||||
except MoreThanOnePriceFound as e:
|
except MoreThanOnePriceFound as e:
|
||||||
# Add the real data
|
# Add the real data
|
||||||
raise ProcessorException(message="Cannot run, more than one price detected, this plugin is only for product pages with ONE product, try the content-change detection mode.",
|
raise ProcessorException(message="Cannot run, more than one price detected, this plugin is only for product pages with ONE product, try the content-change detection mode.",
|
||||||
|
|||||||
@@ -36,8 +36,6 @@ class PDFToHTMLToolNotFound(ValueError):
|
|||||||
class perform_site_check(difference_detection_processor):
|
class perform_site_check(difference_detection_processor):
|
||||||
|
|
||||||
def run_changedetection(self, watch, skip_when_checksum_same=True):
|
def run_changedetection(self, watch, skip_when_checksum_same=True):
|
||||||
from concurrent.futures import ProcessPoolExecutor
|
|
||||||
from functools import partial
|
|
||||||
|
|
||||||
changed_detected = False
|
changed_detected = False
|
||||||
html_content = ""
|
html_content = ""
|
||||||
@@ -174,30 +172,20 @@ class perform_site_check(difference_detection_processor):
|
|||||||
for filter_rule in include_filters_rule:
|
for filter_rule in include_filters_rule:
|
||||||
# For HTML/XML we offer xpath as an option, just start a regular xPath "/.."
|
# For HTML/XML we offer xpath as an option, just start a regular xPath "/.."
|
||||||
if filter_rule[0] == '/' or filter_rule.startswith('xpath:'):
|
if filter_rule[0] == '/' or filter_rule.startswith('xpath:'):
|
||||||
with ProcessPoolExecutor() as executor:
|
html_content += html_tools.xpath_filter(xpath_filter=filter_rule.replace('xpath:', ''),
|
||||||
# Use functools.partial to create a callable with arguments - anything using bs4/lxml etc is quite "leaky"
|
|
||||||
future = executor.submit(partial(html_tools.xpath_filter, xpath_filter=filter_rule.replace('xpath:', ''),
|
|
||||||
html_content=self.fetcher.content,
|
html_content=self.fetcher.content,
|
||||||
append_pretty_line_formatting=not watch.is_source_type_url,
|
append_pretty_line_formatting=not watch.is_source_type_url,
|
||||||
is_rss=is_rss))
|
is_rss=is_rss)
|
||||||
html_content += future.result()
|
|
||||||
|
|
||||||
elif filter_rule.startswith('xpath1:'):
|
elif filter_rule.startswith('xpath1:'):
|
||||||
with ProcessPoolExecutor() as executor:
|
html_content += html_tools.xpath1_filter(xpath_filter=filter_rule.replace('xpath1:', ''),
|
||||||
# Use functools.partial to create a callable with arguments - anything using bs4/lxml etc is quite "leaky"
|
html_content=self.fetcher.content,
|
||||||
future = executor.submit(partial(html_tools.xpath1_filter, xpath_filter=filter_rule.replace('xpath1:', ''),
|
append_pretty_line_formatting=not watch.is_source_type_url,
|
||||||
html_content=self.fetcher.content,
|
is_rss=is_rss)
|
||||||
append_pretty_line_formatting=not watch.is_source_type_url,
|
|
||||||
is_rss=is_rss))
|
|
||||||
html_content += future.result()
|
|
||||||
else:
|
else:
|
||||||
with ProcessPoolExecutor() as executor:
|
html_content += html_tools.include_filters(include_filters=filter_rule,
|
||||||
# Use functools.partial to create a callable with arguments - anything using bs4/lxml etc is quite "leaky"
|
|
||||||
# CSS Filter, extract the HTML that matches and feed that into the existing inscriptis::get_text
|
|
||||||
future = executor.submit(partial(html_tools.include_filters, include_filters=filter_rule,
|
|
||||||
html_content=self.fetcher.content,
|
html_content=self.fetcher.content,
|
||||||
append_pretty_line_formatting=not watch.is_source_type_url))
|
append_pretty_line_formatting=not watch.is_source_type_url)
|
||||||
html_content += future.result()
|
|
||||||
|
|
||||||
if not html_content.strip():
|
if not html_content.strip():
|
||||||
raise FilterNotFoundInResponse(msg=include_filters_rule, screenshot=self.fetcher.screenshot, xpath_data=self.fetcher.xpath_data)
|
raise FilterNotFoundInResponse(msg=include_filters_rule, screenshot=self.fetcher.screenshot, xpath_data=self.fetcher.xpath_data)
|
||||||
@@ -210,13 +198,9 @@ class perform_site_check(difference_detection_processor):
|
|||||||
else:
|
else:
|
||||||
# extract text
|
# extract text
|
||||||
do_anchor = self.datastore.data["settings"]["application"].get("render_anchor_tag_content", False)
|
do_anchor = self.datastore.data["settings"]["application"].get("render_anchor_tag_content", False)
|
||||||
with ProcessPoolExecutor() as executor:
|
stripped_text_from_html = html_tools.html_to_text(html_content=html_content,
|
||||||
# Use functools.partial to create a callable with arguments - anything using bs4/lxml etc is quite "leaky"
|
render_anchor_tag_content=do_anchor,
|
||||||
# CSS Filter, extract the HTML that matches and feed that into the existing inscriptis::get_text
|
is_rss=is_rss) # 1874 activate the <title workaround hack
|
||||||
future = executor.submit(partial(html_tools.html_to_text, html_content=html_content,
|
|
||||||
render_anchor_tag_content=do_anchor,
|
|
||||||
is_rss=is_rss)) #1874 activate the <title workaround hack
|
|
||||||
stripped_text_from_html = future.result()
|
|
||||||
|
|
||||||
|
|
||||||
if watch.get('trim_text_whitespace'):
|
if watch.get('trim_text_whitespace'):
|
||||||
|
|||||||
Reference in New Issue
Block a user