Compare commits

...

1 Commits

Author SHA1 Message Date
dgtlmoon
cab11ced5f Re #2945 - Handle ByteOrderMark in JSON strings 2025-02-07 21:28:40 +01:00
2 changed files with 18 additions and 3 deletions

View File

@@ -1,5 +1,6 @@
from typing import List
from loguru import logger
from lxml import etree
from typing import List
import json
import re
@@ -298,8 +299,10 @@ def extract_json_as_string(content, json_filter, ensure_is_ldjson_info_type=None
# https://github.com/dgtlmoon/changedetection.io/pull/2041#issuecomment-1848397161w
# Try to parse/filter out the JSON, if we get some parser error, then maybe it's embedded within HTML tags
try:
stripped_text_from_html = _parse_json(json.loads(content), json_filter)
except json.JSONDecodeError:
# .lstrip("\ufeff") strings ByteOrderMark from UTF8 and still lets the UTF work
stripped_text_from_html = _parse_json(json.loads(content.lstrip("\ufeff") ), json_filter)
except json.JSONDecodeError as e:
logger.warning(str(e))
# Foreach <script json></script> blob.. just return the first that matches json_filter
# As a last resort, try to parse the whole <body>

View File

@@ -514,3 +514,15 @@ def test_check_jq_ext_filter(client, live_server, measure_memory_usage):
def test_check_jqraw_ext_filter(client, live_server, measure_memory_usage):
if jq_support:
check_json_ext_filter('jq:.[] | select(.status | contains("Sold"))', client, live_server)
def test_jsonpath_BOM_utf8(client, live_server, measure_memory_usage):
from .. import html_tools
# JSON string with BOM and correct double-quoted keys
json_str = '\ufeff{"name": "José", "emoji": "😊", "language": "中文", "greeting": "Привет"}'
# See that we can find the second <script> one, which is not broken, and matches our filter
text = html_tools.extract_json_as_string(json_str, "json:$.name")
assert text == '"José"'