mirror of
https://github.com/dgtlmoon/changedetection.io.git
synced 2025-10-30 14:17:40 +00:00
@@ -303,70 +303,92 @@ def _get_stripped_text_from_json_match(match):
|
||||
|
||||
return stripped_text_from_html
|
||||
|
||||
def extract_json_blob_from_html(content, ensure_is_ldjson_info_type, json_filter):
|
||||
from bs4 import BeautifulSoup
|
||||
stripped_text_from_html = ''
|
||||
|
||||
# Foreach <script json></script> blob.. just return the first that matches json_filter
|
||||
# As a last resort, try to parse the whole <body>
|
||||
soup = BeautifulSoup(content, 'html.parser')
|
||||
|
||||
if ensure_is_ldjson_info_type:
|
||||
bs_result = soup.find_all('script', {"type": "application/ld+json"})
|
||||
else:
|
||||
bs_result = soup.find_all('script')
|
||||
bs_result += soup.find_all('body')
|
||||
|
||||
bs_jsons = []
|
||||
|
||||
for result in bs_result:
|
||||
# result.text is how bs4 magically strips JSON from the body
|
||||
content_start = result.text.lstrip("\ufeff").strip()[:100] if result.text else ''
|
||||
# Skip empty tags, and things that dont even look like JSON
|
||||
if not result.text or not (content_start[0] == '{' or content_start[0] == '['):
|
||||
continue
|
||||
try:
|
||||
json_data = json.loads(result.text)
|
||||
bs_jsons.append(json_data)
|
||||
except json.JSONDecodeError:
|
||||
# Skip objects which cannot be parsed
|
||||
continue
|
||||
|
||||
if not bs_jsons:
|
||||
raise JSONNotFound("No parsable JSON found in this document")
|
||||
|
||||
for json_data in bs_jsons:
|
||||
stripped_text_from_html = _parse_json(json_data, json_filter)
|
||||
|
||||
if ensure_is_ldjson_info_type:
|
||||
# Could sometimes be list, string or something else random
|
||||
if isinstance(json_data, dict):
|
||||
# If it has LD JSON 'key' @type, and @type is 'product', and something was found for the search
|
||||
# (Some sites have multiple of the same ld+json @type='product', but some have the review part, some have the 'price' part)
|
||||
# @type could also be a list although non-standard ("@type": ["Product", "SubType"],)
|
||||
# LD_JSON auto-extract also requires some content PLUS the ldjson to be present
|
||||
# 1833 - could be either str or dict, should not be anything else
|
||||
|
||||
t = json_data.get('@type')
|
||||
if t and stripped_text_from_html:
|
||||
|
||||
if isinstance(t, str) and t.lower() == ensure_is_ldjson_info_type.lower():
|
||||
break
|
||||
# The non-standard part, some have a list
|
||||
elif isinstance(t, list):
|
||||
if ensure_is_ldjson_info_type.lower() in [x.lower().strip() for x in t]:
|
||||
break
|
||||
|
||||
elif stripped_text_from_html:
|
||||
break
|
||||
|
||||
return stripped_text_from_html
|
||||
|
||||
# content - json
|
||||
# json_filter - ie json:$..price
|
||||
# ensure_is_ldjson_info_type - str "product", optional, "@type == product" (I dont know how to do that as a json selector)
|
||||
def extract_json_as_string(content, json_filter, ensure_is_ldjson_info_type=None):
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
stripped_text_from_html = False
|
||||
# https://github.com/dgtlmoon/changedetection.io/pull/2041#issuecomment-1848397161w
|
||||
# Try to parse/filter out the JSON, if we get some parser error, then maybe it's embedded within HTML tags
|
||||
try:
|
||||
# .lstrip("\ufeff") strings ByteOrderMark from UTF8 and still lets the UTF work
|
||||
stripped_text_from_html = _parse_json(json.loads(content.lstrip("\ufeff") ), json_filter)
|
||||
except json.JSONDecodeError as e:
|
||||
logger.warning(str(e))
|
||||
|
||||
# Foreach <script json></script> blob.. just return the first that matches json_filter
|
||||
# As a last resort, try to parse the whole <body>
|
||||
soup = BeautifulSoup(content, 'html.parser')
|
||||
# Looks like clean JSON, dont bother extracting from HTML
|
||||
|
||||
if ensure_is_ldjson_info_type:
|
||||
bs_result = soup.find_all('script', {"type": "application/ld+json"})
|
||||
else:
|
||||
bs_result = soup.find_all('script')
|
||||
bs_result += soup.find_all('body')
|
||||
content_start = content.lstrip("\ufeff").strip()[:100]
|
||||
|
||||
bs_jsons = []
|
||||
for result in bs_result:
|
||||
# Skip empty tags, and things that dont even look like JSON
|
||||
if not result.text or '{' not in result.text:
|
||||
continue
|
||||
try:
|
||||
json_data = json.loads(result.text)
|
||||
bs_jsons.append(json_data)
|
||||
except json.JSONDecodeError:
|
||||
# Skip objects which cannot be parsed
|
||||
continue
|
||||
|
||||
if not bs_jsons:
|
||||
raise JSONNotFound("No parsable JSON found in this document")
|
||||
|
||||
for json_data in bs_jsons:
|
||||
stripped_text_from_html = _parse_json(json_data, json_filter)
|
||||
|
||||
if ensure_is_ldjson_info_type:
|
||||
# Could sometimes be list, string or something else random
|
||||
if isinstance(json_data, dict):
|
||||
# If it has LD JSON 'key' @type, and @type is 'product', and something was found for the search
|
||||
# (Some sites have multiple of the same ld+json @type='product', but some have the review part, some have the 'price' part)
|
||||
# @type could also be a list although non-standard ("@type": ["Product", "SubType"],)
|
||||
# LD_JSON auto-extract also requires some content PLUS the ldjson to be present
|
||||
# 1833 - could be either str or dict, should not be anything else
|
||||
|
||||
t = json_data.get('@type')
|
||||
if t and stripped_text_from_html:
|
||||
|
||||
if isinstance(t, str) and t.lower() == ensure_is_ldjson_info_type.lower():
|
||||
break
|
||||
# The non-standard part, some have a list
|
||||
elif isinstance(t, list):
|
||||
if ensure_is_ldjson_info_type.lower() in [x.lower().strip() for x in t]:
|
||||
break
|
||||
|
||||
elif stripped_text_from_html:
|
||||
break
|
||||
if content_start[0] == '{' or content_start[0] == '[':
|
||||
try:
|
||||
# .lstrip("\ufeff") strings ByteOrderMark from UTF8 and still lets the UTF work
|
||||
stripped_text_from_html = _parse_json(json.loads(content.lstrip("\ufeff")), json_filter)
|
||||
except json.JSONDecodeError as e:
|
||||
logger.warning(f"Error processing JSON {content[:20]}...{str(e)})")
|
||||
else:
|
||||
# Probably something else, go fish inside for it
|
||||
try:
|
||||
stripped_text_from_html = extract_json_blob_from_html(content=content,
|
||||
ensure_is_ldjson_info_type=ensure_is_ldjson_info_type,
|
||||
json_filter=json_filter )
|
||||
except json.JSONDecodeError as e:
|
||||
logger.warning(f"Error processing JSON while extracting JSON from HTML blob {content[:20]}...{str(e)})")
|
||||
|
||||
if not stripped_text_from_html:
|
||||
# Re 265 - Just return an empty string when filter not found
|
||||
|
||||
Reference in New Issue
Block a user