Fetcher / Parser - Automatically attempt to extract JSON from document when document contains JSON but could be wrapped in HTML (#1593)
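This change targets responses where a JSON document reaches the parser wrapped in HTML, which happens when the content is fetched through a real browser (Browserless/Puppeteer etc., per the test comments below). A minimal sketch of the failure mode being fixed; the wrapper markup is taken from the new test fixture further down, not from actual fetcher output:

```python
import json

# A JSON endpoint rendered through a headless browser typically arrives
# wrapped in markup like this (shape borrowed from the test fixture below):
wrapped = '<html><head></head><body><pre>{"testKey": 42}</pre></body></html>'

try:
    json.loads(wrapped)  # fails: the document is HTML, not bare JSON
except json.JSONDecodeError as e:
    print(f"json.loads() rejects the wrapped document: {e}")
```

Before this commit, such a document fell through to the <script>-tag scan, found no candidates, and raised JSONNotFound even though valid JSON was sitting in the <body>.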
@@ -137,12 +137,13 @@ def _get_stripped_text_from_json_match(match):
 def extract_json_as_string(content, json_filter, ensure_is_ldjson_info_type=None):
     stripped_text_from_html = False
 
-    # Try to parse/filter out the JSON, if we get some parser error, then maybe it's embedded <script type=ldjson>
+    # Try to parse/filter out the JSON, if we get some parser error, then maybe it's embedded within HTML tags
     try:
         stripped_text_from_html = _parse_json(json.loads(content), json_filter)
     except json.JSONDecodeError:
 
         # Foreach <script json></script> blob.. just return the first that matches json_filter
+        # As a last resort, try to parse the whole <body>
         s = []
         soup = BeautifulSoup(content, 'html.parser')
 
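The hunk above only touches comments in html_tools' extract_json_as_string; the surrounding flow (unchanged) tries json.loads() on the raw content first and falls back to a BeautifulSoup scan only on JSONDecodeError. A condensed sketch of that pattern with the commit's new <body> fallback folded in; this is not the real function, which also applies json_filter and the ld+json type check:

```python
import json
from bs4 import BeautifulSoup

def first_json_blob(content):
    """Condensed sketch of the fallback pattern; not the real extract_json_as_string."""
    try:
        return json.loads(content)  # happy path: the document is bare JSON
    except json.JSONDecodeError:
        soup = BeautifulSoup(content, 'html.parser')
        # New in this commit: scan <body> as a last resort alongside <script> tags
        for tag in soup.find_all('script') + soup.find_all('body'):
            if not tag.text or '{' not in tag.text:
                continue  # skip tags that cannot possibly hold a JSON object
            try:
                return json.loads(tag.text)
            except json.JSONDecodeError:
                continue
    return None
```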
@@ -150,32 +151,34 @@ def extract_json_as_string(content, json_filter, ensure_is_ldjson_info_type=None):
             bs_result = soup.findAll('script', {"type": "application/ld+json"})
         else:
             bs_result = soup.findAll('script')
+        bs_result += soup.findAll('body')
 
-        if not bs_result:
-            raise JSONNotFound("No parsable JSON found in this document")
+        bs_jsons = []
 
         for result in bs_result:
             # Skip empty tags, and things that dont even look like JSON
-            if not result.string or not '{' in result.string:
+            if not result.text or '{' not in result.text:
                 continue
 
             try:
-                json_data = json.loads(result.string)
+                json_data = json.loads(result.text)
+                bs_jsons.append(json_data)
             except json.JSONDecodeError:
-                # Just skip it
+                # Skip objects which cannot be parsed
                 continue
-            else:
-                stripped_text_from_html = _parse_json(json_data, json_filter)
-                if ensure_is_ldjson_info_type:
-                    # Could sometimes be list, string or something else random
-                    if isinstance(json_data, dict):
-                        # If it has LD JSON 'key' @type, and @type is 'product', and something was found for the search
-                        # (Some sites have multiple of the same ld+json @type='product', but some have the review part, some have the 'price' part)
-                        if json_data.get('@type', False) and json_data.get('@type','').lower() == ensure_is_ldjson_info_type.lower() and stripped_text_from_html:
-                            break
-                elif stripped_text_from_html:
-                    break
+
+        if not bs_jsons:
+            raise JSONNotFound("No parsable JSON found in this document")
+
+        for json_data in bs_jsons:
+            stripped_text_from_html = _parse_json(json_data, json_filter)
+            if ensure_is_ldjson_info_type:
+                # Could sometimes be list, string or something else random
+                if isinstance(json_data, dict):
+                    # If it has LD JSON 'key' @type, and @type is 'product', and something was found for the search
+                    # (Some sites have multiple of the same ld+json @type='product', but some have the review part, some have the 'price' part)
+                    if json_data.get('@type', False) and json_data.get('@type','').lower() == ensure_is_ldjson_info_type.lower() and stripped_text_from_html:
+                        break
+            elif stripped_text_from_html:
+                break
 
     if not stripped_text_from_html:
         # Re 265 - Just return an empty string when filter not found
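Two things in this hunk are worth spelling out. First, the single parse-and-filter loop becomes two passes: every blob that parses is collected into bs_jsons, and json_filter runs over that collection afterwards, so JSONNotFound now means "nothing parseable anywhere in the document" rather than "no candidate tags existed". Second, result.string becomes result.text, which is what makes the new <body> fallback work at all: in BeautifulSoup, .string is None whenever a tag has more than one child, while .text concatenates every nested string. A small demonstration:

```python
from bs4 import BeautifulSoup

# A <body> almost always holds several children (whitespace, nested tags),
# so .string returns None while .text still recovers the JSON payload.
html = '<body>\n<pre>{"testKey": 42}</pre>\n</body>'
body = BeautifulSoup(html, 'html.parser').find('body')

print(repr(body.string))  # None - newline + <pre> + newline is three children
print(repr(body.text))    # '\n{"testKey": 42}\n' - json.loads() tolerates the padding
```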
@@ -64,6 +64,24 @@ and it can also be repeated
     with pytest.raises(html_tools.JSONNotFound) as e_info:
         html_tools.extract_json_as_string('COMPLETE GIBBERISH, NO JSON!', "jq:.id")
 
+
+def test_unittest_inline_extract_body():
+    content = """
+    <html>
+    <head></head>
+    <body>
+    <pre style="word-wrap: break-word; white-space: pre-wrap;">
+    {"testKey": 42}
+    </pre>
+    </body>
+    </html>
+    """
+    from .. import html_tools
+
+    # See that we can find the second <script> one, which is not broken, and matches our filter
+    text = html_tools.extract_json_as_string(content, "json:$.testKey")
+    assert text == '42'
+
 def set_original_ext_response():
     data = """
     [
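The new unit test exercises exactly the wrapped shape from the sketch at the top. It can be replayed standalone outside pytest; the import path below assumes the changedetectionio package from this repository is importable, e.g. from a checkout:

```python
# Standalone replay of test_unittest_inline_extract_body; assumes the
# changedetectionio package from this checkout is on the import path.
from changedetectionio import html_tools

content = '<html><head></head><body><pre>{"testKey": 42}</pre></body></html>'

text = html_tools.extract_json_as_string(content, "json:$.testKey")
assert text == '42'
print("extracted from <body>-wrapped document:", text)
```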
@@ -437,7 +455,6 @@ def test_ignore_json_order(client, live_server):
     assert b'Deleted' in res.data
 
 def test_correct_header_detect(client, live_server):
-
     # Like in https://github.com/dgtlmoon/changedetection.io/pull/1593
     # Specify extra html that JSON is sometimes wrapped in - when using Browserless/Puppeteer etc
     with open("test-datastore/endpoint-content.txt", "w") as f:
@@ -453,11 +470,17 @@ def test_correct_header_detect(client, live_server):
     )
     assert b"1 Imported" in res.data
     wait_for_all_checks(client)
 
-
     res = client.get(url_for("index"))
-    # This will be fixed in #1593
-    assert b'No parsable JSON found in this document' in res.data
+    # Fixed in #1593
+    assert b'No parsable JSON found in this document' not in res.data
+
+    res = client.get(
+        url_for("preview_page", uuid="first"),
+        follow_redirects=True
+    )
+    assert b'"world":' in res.data
+    assert res.data.count(b'{') >= 2
 
     res = client.get(url_for("form_delete", uuid="all"), follow_redirects=True)
     assert b'Deleted' in res.data
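The actual bytes written to test-datastore/endpoint-content.txt fall outside the hunks shown here, so the fixture itself is not visible in this diff. Judging from the assertions (a literal "world": and at least two { in the preview output), a hypothetical stand-in with the same shape would be:

```python
# Hypothetical fixture with the shape the assertions imply; the real
# content of endpoint-content.txt is not shown in this diff.
fixture = '<html><body>{"hello": {"world": 123}}</body></html>'

with open("test-datastore/endpoint-content.txt", "w") as f:
    f.write(fixture)  # nested object -> two '{', plus a '"world":' key
```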