diff --git a/changedetectionio/html_tools.py b/changedetectionio/html_tools.py index f97add7e..6175282d 100644 --- a/changedetectionio/html_tools.py +++ b/changedetectionio/html_tools.py @@ -487,13 +487,25 @@ def extract_json_as_string(content, json_filter, ensure_is_ldjson_info_type=None except json.JSONDecodeError as e: logger.warning(f"Error processing JSON {content[:20]}...{str(e)})") else: - # Probably something else, go fish inside for it - try: - stripped_text_from_html = extract_json_blob_from_html(content=content, - ensure_is_ldjson_info_type=ensure_is_ldjson_info_type, - json_filter=json_filter ) - except json.JSONDecodeError as e: - logger.warning(f"Error processing JSON while extracting JSON from HTML blob {content[:20]}...{str(e)})") + # Check for JSONP wrapper: someCallback({...}) or some.namespace({...}) + # Server may claim application/json but actually return JSONP + jsonp_match = re.match(r'^\w[\w.]*\s*\((.+)\)\s*;?\s*$', content.lstrip("\ufeff").strip(), re.DOTALL) + if jsonp_match: + try: + inner = jsonp_match.group(1).strip() + logger.warning(f"Content looks like JSONP, attempting to extract inner JSON for filter '{json_filter}'") + stripped_text_from_html = _parse_json(json.loads(inner), json_filter) + except json.JSONDecodeError as e: + logger.warning(f"Error processing JSONP inner content {content[:20]}...{str(e)})") + + if not stripped_text_from_html: + # Probably something else, go fish inside for it + try: + stripped_text_from_html = extract_json_blob_from_html(content=content, + ensure_is_ldjson_info_type=ensure_is_ldjson_info_type, + json_filter=json_filter) + except json.JSONDecodeError as e: + logger.warning(f"Error processing JSON while extracting JSON from HTML blob {content[:20]}...{str(e)})") if not stripped_text_from_html: # Re 265 - Just return an empty string when filter not found diff --git a/changedetectionio/processors/magic.py b/changedetectionio/processors/magic.py index 9d9018d7..2fbaedb7 100644 --- a/changedetectionio/processors/magic.py +++ b/changedetectionio/processors/magic.py @@ -100,7 +100,13 @@ class guess_stream_type(): if any(s in http_content_header for s in RSS_XML_CONTENT_TYPES): self.is_rss = True elif any(s in http_content_header for s in JSON_CONTENT_TYPES): - self.is_json = True + # JSONP detection: server claims application/json but content is actually JSONP (e.g. cb({...})) + # A JSONP response starts with an identifier followed by '(' - not valid JSON + if re.match(r'^\w[\w.]*\s*\(', test_content): + logger.warning(f"Content-Type header claims JSON but content looks like JSONP (starts with identifier+parenthesis) - treating as plaintext") + self.is_plaintext = True + else: + self.is_json = True elif 'pdf' in magic_content_header: self.is_pdf = True # magic will call a rss document 'xml' diff --git a/changedetectionio/tests/test_jsonpath_jq_selector.py b/changedetectionio/tests/test_jsonpath_jq_selector.py index ab2144a1..00e0bcd7 100644 --- a/changedetectionio/tests/test_jsonpath_jq_selector.py +++ b/changedetectionio/tests/test_jsonpath_jq_selector.py @@ -16,6 +16,51 @@ except ModuleNotFoundError: +def test_jsonp_treated_as_plaintext(): + from ..processors.magic import guess_stream_type + + # JSONP content (server wrongly claims application/json) should be detected as plaintext + # Callback names are arbitrary identifiers, not always 'cb' + jsonp_content = 'jQuery123456({ "version": "8.0.41", "url": "https://example.com/app.apk" })' + result = guess_stream_type(http_content_header="application/json", content=jsonp_content) + assert result.is_json is False + assert result.is_plaintext is True + + # Variation with dotted callback name e.g. jQuery.cb(...) + jsonp_dotted = 'some.callback({ "version": "1.0" })' + result = guess_stream_type(http_content_header="application/json", content=jsonp_dotted) + assert result.is_json is False + assert result.is_plaintext is True + + # Real JSON should still be detected as JSON + json_content = '{ "version": "8.0.41", "url": "https://example.com/app.apk" }' + result = guess_stream_type(http_content_header="application/json", content=json_content) + assert result.is_json is True + assert result.is_plaintext is False + + +def test_jsonp_json_filter_extraction(): + from .. import html_tools + + # Tough case: dotted namespace callback, trailing semicolon, deeply nested content with arrays + jsonp_content = 'weixin.update.callback({"platforms": {"android": {"variants": [{"arch": "arm64", "versionName": "8.0.68", "url": "https://example.com/app-arm64.apk"}, {"arch": "arm32", "versionName": "8.0.41", "url": "https://example.com/app-arm32.apk"}]}}});' + + # Deep nested jsonpath filter into array element + text = html_tools.extract_json_as_string(jsonp_content, "json:$.platforms.android.variants[0].versionName") + assert text == '"8.0.68"' + + # Filter that selects the second array element + text = html_tools.extract_json_as_string(jsonp_content, "json:$.platforms.android.variants[1].arch") + assert text == '"arm32"' + + if jq_support: + text = html_tools.extract_json_as_string(jsonp_content, "jq:.platforms.android.variants[0].versionName") + assert text == '"8.0.68"' + + text = html_tools.extract_json_as_string(jsonp_content, "jqraw:.platforms.android.variants[1].url") + assert text == "https://example.com/app-arm32.apk" + + def test_unittest_inline_html_extract(): # So lets pretend that the JSON we want is inside some HTML content="""