diff --git a/changedetectionio/html_tools.py b/changedetectionio/html_tools.py
index f97add7e..6175282d 100644
--- a/changedetectionio/html_tools.py
+++ b/changedetectionio/html_tools.py
@@ -487,13 +487,25 @@ def extract_json_as_string(content, json_filter, ensure_is_ldjson_info_type=None
except json.JSONDecodeError as e:
logger.warning(f"Error processing JSON {content[:20]}...{str(e)})")
else:
- # Probably something else, go fish inside for it
- try:
- stripped_text_from_html = extract_json_blob_from_html(content=content,
- ensure_is_ldjson_info_type=ensure_is_ldjson_info_type,
- json_filter=json_filter )
- except json.JSONDecodeError as e:
- logger.warning(f"Error processing JSON while extracting JSON from HTML blob {content[:20]}...{str(e)})")
+ # Check for JSONP wrapper: someCallback({...}) or some.namespace({...})
+ # Server may claim application/json but actually return JSONP
+ jsonp_match = re.match(r'^\w[\w.]*\s*\((.+)\)\s*;?\s*$', content.lstrip("\ufeff").strip(), re.DOTALL)
+ if jsonp_match:
+ try:
+ inner = jsonp_match.group(1).strip()
+ logger.warning(f"Content looks like JSONP, attempting to extract inner JSON for filter '{json_filter}'")
+ stripped_text_from_html = _parse_json(json.loads(inner), json_filter)
+ except json.JSONDecodeError as e:
+ logger.warning(f"Error processing JSONP inner content {content[:20]}...{str(e)})")
+
+ if not stripped_text_from_html:
+ # Probably something else, go fish inside for it
+ try:
+ stripped_text_from_html = extract_json_blob_from_html(content=content,
+ ensure_is_ldjson_info_type=ensure_is_ldjson_info_type,
+ json_filter=json_filter)
+ except json.JSONDecodeError as e:
+ logger.warning(f"Error processing JSON while extracting JSON from HTML blob {content[:20]}...{str(e)})")
if not stripped_text_from_html:
# Re 265 - Just return an empty string when filter not found
diff --git a/changedetectionio/processors/magic.py b/changedetectionio/processors/magic.py
index 9d9018d7..2fbaedb7 100644
--- a/changedetectionio/processors/magic.py
+++ b/changedetectionio/processors/magic.py
@@ -100,7 +100,13 @@ class guess_stream_type():
if any(s in http_content_header for s in RSS_XML_CONTENT_TYPES):
self.is_rss = True
elif any(s in http_content_header for s in JSON_CONTENT_TYPES):
- self.is_json = True
+ # JSONP detection: server claims application/json but content is actually JSONP (e.g. cb({...}))
+ # A JSONP response starts with an identifier followed by '(' - not valid JSON
+ if re.match(r'^\w[\w.]*\s*\(', test_content):
+ logger.warning(f"Content-Type header claims JSON but content looks like JSONP (starts with identifier+parenthesis) - treating as plaintext")
+ self.is_plaintext = True
+ else:
+ self.is_json = True
elif 'pdf' in magic_content_header:
self.is_pdf = True
# magic will call a rss document 'xml'
diff --git a/changedetectionio/tests/test_jsonpath_jq_selector.py b/changedetectionio/tests/test_jsonpath_jq_selector.py
index ab2144a1..00e0bcd7 100644
--- a/changedetectionio/tests/test_jsonpath_jq_selector.py
+++ b/changedetectionio/tests/test_jsonpath_jq_selector.py
@@ -16,6 +16,51 @@ except ModuleNotFoundError:
+def test_jsonp_treated_as_plaintext():
+ from ..processors.magic import guess_stream_type
+
+ # Case 1: the header says application/json but the body is a JSONP call - expect plaintext, not JSON
+ # The callback name is an arbitrary identifier (here 'jQuery123456'), not always 'cb'
+ jsonp_content = 'jQuery123456({ "version": "8.0.41", "url": "https://example.com/app.apk" })'
+ result = guess_stream_type(http_content_header="application/json", content=jsonp_content)
+ assert result.is_json is False
+ assert result.is_plaintext is True
+
+ # Case 2: dotted (namespaced) callback name, here 'some.callback' - expect the same outcome
+ jsonp_dotted = 'some.callback({ "version": "1.0" })'
+ result = guess_stream_type(http_content_header="application/json", content=jsonp_dotted)
+ assert result.is_json is False
+ assert result.is_plaintext is True
+
+ # Case 3 (control): a bare JSON object with the same header must still be detected as JSON
+ json_content = '{ "version": "8.0.41", "url": "https://example.com/app.apk" }'
+ result = guess_stream_type(http_content_header="application/json", content=json_content)
+ assert result.is_json is True
+ assert result.is_plaintext is False
+
+
+def test_jsonp_json_filter_extraction():
+ from .. import html_tools
+
+ # Fixture: JSONP with a dotted callback name ('weixin.update.callback'), a trailing ';' and nested objects/arrays
+ jsonp_content = 'weixin.update.callback({"platforms": {"android": {"variants": [{"arch": "arm64", "versionName": "8.0.68", "url": "https://example.com/app-arm64.apk"}, {"arch": "arm32", "versionName": "8.0.41", "url": "https://example.com/app-arm32.apk"}]}}});'
+
+ # 'json:' filter reaching into the first array element - the expected text keeps its JSON quotes
+ text = html_tools.extract_json_as_string(jsonp_content, "json:$.platforms.android.variants[0].versionName")
+ assert text == '"8.0.68"'
+
+ # Same kind of filter, this time selecting a field from the second array element
+ text = html_tools.extract_json_as_string(jsonp_content, "json:$.platforms.android.variants[1].arch")
+ assert text == '"arm32"'
+
+ if jq_support:
+ text = html_tools.extract_json_as_string(jsonp_content, "jq:.platforms.android.variants[0].versionName")
+ assert text == '"8.0.68"'
+
+ text = html_tools.extract_json_as_string(jsonp_content, "jqraw:.platforms.android.variants[1].url")
+ assert text == "https://example.com/app-arm32.apk"
+
+
def test_unittest_inline_html_extract():
# So lets pretend that the JSON we want is inside some HTML
content="""