diff --git a/changedetectionio/html_tools.py b/changedetectionio/html_tools.py index b710077f..a32f8146 100644 --- a/changedetectionio/html_tools.py +++ b/changedetectionio/html_tools.py @@ -1,5 +1,6 @@ -from typing import List +from loguru import logger from lxml import etree +from typing import List import json import re @@ -298,8 +299,10 @@ def extract_json_as_string(content, json_filter, ensure_is_ldjson_info_type=None # https://github.com/dgtlmoon/changedetection.io/pull/2041#issuecomment-1848397161w # Try to parse/filter out the JSON, if we get some parser error, then maybe it's embedded within HTML tags try: - stripped_text_from_html = _parse_json(json.loads(content), json_filter) - except json.JSONDecodeError: + # .lstrip("\ufeff") strips a leading UTF-8 byte order mark (BOM), which json.loads() rejects; the rest of the Unicode text is left untouched + stripped_text_from_html = _parse_json(json.loads(content.lstrip("\ufeff") ), json_filter) + except json.JSONDecodeError as e: + logger.warning(str(e)) # Foreach blob.. just return the first that matches json_filter # As a last resort, try to parse the whole diff --git a/changedetectionio/tests/test_jsonpath_jq_selector.py b/changedetectionio/tests/test_jsonpath_jq_selector.py index 40f2a29b..e6190e1e 100644 --- a/changedetectionio/tests/test_jsonpath_jq_selector.py +++ b/changedetectionio/tests/test_jsonpath_jq_selector.py @@ -514,3 +514,15 @@ def test_check_jq_ext_filter(client, live_server, measure_memory_usage): def test_check_jqraw_ext_filter(client, live_server, measure_memory_usage): if jq_support: check_json_ext_filter('jq:.[] | select(.status | contains("Sold"))', client, live_server) + +def test_jsonpath_BOM_utf8(client, live_server, measure_memory_usage): + from .. import html_tools + + # JSON string with BOM and correct double-quoted keys + json_str = '\ufeff{"name": "José", "emoji": "😊", "language": "中文", "greeting": "Привет"}' + + # See that we can find the second