mirror of
https://github.com/dgtlmoon/changedetection.io.git
synced 2026-03-21 11:17:57 +00:00
Compare commits
1 Commits
socketio-c
...
JSONP-supp
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
141aea07b8 |
@@ -487,13 +487,25 @@ def extract_json_as_string(content, json_filter, ensure_is_ldjson_info_type=None
|
|||||||
except json.JSONDecodeError as e:
|
except json.JSONDecodeError as e:
|
||||||
logger.warning(f"Error processing JSON {content[:20]}...{str(e)})")
|
logger.warning(f"Error processing JSON {content[:20]}...{str(e)})")
|
||||||
else:
|
else:
|
||||||
# Probably something else, go fish inside for it
|
# Check for JSONP wrapper: someCallback({...}) or some.namespace({...})
|
||||||
try:
|
# Server may claim application/json but actually return JSONP
|
||||||
stripped_text_from_html = extract_json_blob_from_html(content=content,
|
jsonp_match = re.match(r'^\w[\w.]*\s*\((.+)\)\s*;?\s*$', content.lstrip("\ufeff").strip(), re.DOTALL)
|
||||||
ensure_is_ldjson_info_type=ensure_is_ldjson_info_type,
|
if jsonp_match:
|
||||||
json_filter=json_filter )
|
try:
|
||||||
except json.JSONDecodeError as e:
|
inner = jsonp_match.group(1).strip()
|
||||||
logger.warning(f"Error processing JSON while extracting JSON from HTML blob {content[:20]}...{str(e)})")
|
logger.warning(f"Content looks like JSONP, attempting to extract inner JSON for filter '{json_filter}'")
|
||||||
|
stripped_text_from_html = _parse_json(json.loads(inner), json_filter)
|
||||||
|
except json.JSONDecodeError as e:
|
||||||
|
logger.warning(f"Error processing JSONP inner content {content[:20]}...{str(e)})")
|
||||||
|
|
||||||
|
if not stripped_text_from_html:
|
||||||
|
# Probably something else, go fish inside for it
|
||||||
|
try:
|
||||||
|
stripped_text_from_html = extract_json_blob_from_html(content=content,
|
||||||
|
ensure_is_ldjson_info_type=ensure_is_ldjson_info_type,
|
||||||
|
json_filter=json_filter)
|
||||||
|
except json.JSONDecodeError as e:
|
||||||
|
logger.warning(f"Error processing JSON while extracting JSON from HTML blob {content[:20]}...{str(e)})")
|
||||||
|
|
||||||
if not stripped_text_from_html:
|
if not stripped_text_from_html:
|
||||||
# Re 265 - Just return an empty string when filter not found
|
# Re 265 - Just return an empty string when filter not found
|
||||||
|
|||||||
@@ -100,7 +100,13 @@ class guess_stream_type():
|
|||||||
if any(s in http_content_header for s in RSS_XML_CONTENT_TYPES):
|
if any(s in http_content_header for s in RSS_XML_CONTENT_TYPES):
|
||||||
self.is_rss = True
|
self.is_rss = True
|
||||||
elif any(s in http_content_header for s in JSON_CONTENT_TYPES):
|
elif any(s in http_content_header for s in JSON_CONTENT_TYPES):
|
||||||
self.is_json = True
|
# JSONP detection: server claims application/json but content is actually JSONP (e.g. cb({...}))
|
||||||
|
# A JSONP response starts with an identifier followed by '(' - not valid JSON
|
||||||
|
if re.match(r'^\w[\w.]*\s*\(', test_content):
|
||||||
|
logger.warning(f"Content-Type header claims JSON but content looks like JSONP (starts with identifier+parenthesis) - treating as plaintext")
|
||||||
|
self.is_plaintext = True
|
||||||
|
else:
|
||||||
|
self.is_json = True
|
||||||
elif 'pdf' in magic_content_header:
|
elif 'pdf' in magic_content_header:
|
||||||
self.is_pdf = True
|
self.is_pdf = True
|
||||||
# magic will call a rss document 'xml'
|
# magic will call a rss document 'xml'
|
||||||
|
|||||||
@@ -16,6 +16,51 @@ except ModuleNotFoundError:
|
|||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
def test_jsonp_treated_as_plaintext():
|
||||||
|
from ..processors.magic import guess_stream_type
|
||||||
|
|
||||||
|
# JSONP content (server wrongly claims application/json) should be detected as plaintext
|
||||||
|
# Callback names are arbitrary identifiers, not always 'cb'
|
||||||
|
jsonp_content = 'jQuery123456({ "version": "8.0.41", "url": "https://example.com/app.apk" })'
|
||||||
|
result = guess_stream_type(http_content_header="application/json", content=jsonp_content)
|
||||||
|
assert result.is_json is False
|
||||||
|
assert result.is_plaintext is True
|
||||||
|
|
||||||
|
# Variation with dotted callback name e.g. jQuery.cb(...)
|
||||||
|
jsonp_dotted = 'some.callback({ "version": "1.0" })'
|
||||||
|
result = guess_stream_type(http_content_header="application/json", content=jsonp_dotted)
|
||||||
|
assert result.is_json is False
|
||||||
|
assert result.is_plaintext is True
|
||||||
|
|
||||||
|
# Real JSON should still be detected as JSON
|
||||||
|
json_content = '{ "version": "8.0.41", "url": "https://example.com/app.apk" }'
|
||||||
|
result = guess_stream_type(http_content_header="application/json", content=json_content)
|
||||||
|
assert result.is_json is True
|
||||||
|
assert result.is_plaintext is False
|
||||||
|
|
||||||
|
|
||||||
|
def test_jsonp_json_filter_extraction():
|
||||||
|
from .. import html_tools
|
||||||
|
|
||||||
|
# Tough case: dotted namespace callback, trailing semicolon, deeply nested content with arrays
|
||||||
|
jsonp_content = 'weixin.update.callback({"platforms": {"android": {"variants": [{"arch": "arm64", "versionName": "8.0.68", "url": "https://example.com/app-arm64.apk"}, {"arch": "arm32", "versionName": "8.0.41", "url": "https://example.com/app-arm32.apk"}]}}});'
|
||||||
|
|
||||||
|
# Deep nested jsonpath filter into array element
|
||||||
|
text = html_tools.extract_json_as_string(jsonp_content, "json:$.platforms.android.variants[0].versionName")
|
||||||
|
assert text == '"8.0.68"'
|
||||||
|
|
||||||
|
# Filter that selects the second array element
|
||||||
|
text = html_tools.extract_json_as_string(jsonp_content, "json:$.platforms.android.variants[1].arch")
|
||||||
|
assert text == '"arm32"'
|
||||||
|
|
||||||
|
if jq_support:
|
||||||
|
text = html_tools.extract_json_as_string(jsonp_content, "jq:.platforms.android.variants[0].versionName")
|
||||||
|
assert text == '"8.0.68"'
|
||||||
|
|
||||||
|
text = html_tools.extract_json_as_string(jsonp_content, "jqraw:.platforms.android.variants[1].url")
|
||||||
|
assert text == "https://example.com/app-arm32.apk"
|
||||||
|
|
||||||
|
|
||||||
def test_unittest_inline_html_extract():
|
def test_unittest_inline_html_extract():
|
||||||
# So lets pretend that the JSON we want is inside some HTML
|
# So lets pretend that the JSON we want is inside some HTML
|
||||||
content="""
|
content="""
|
||||||
|
|||||||
Reference in New Issue
Block a user