Fetcher / Parser - Automatically attempt to extract JSON from document when document contains JSON but could be wrapped in HTML (#1593)

2025-12-08 09:05:36 +00:00 · 2023-05-30 06:57:17 +00:00
parent d8b9f0fd78
commit a4e6fd1ec3
2 changed files with 51 additions and 25 deletions
--- a/changedetectionio/tests/test_jsonpath_jq_selector.py
+++ b/changedetectionio/tests/test_jsonpath_jq_selector.py
@@ -64,6 +64,24 @@ and it can also be repeated
        with pytest.raises(html_tools.JSONNotFound) as e_info:
            html_tools.extract_json_as_string('COMPLETE GIBBERISH, NO JSON!', "jq:.id")

+
+def test_unittest_inline_extract_body():
+    content = """
+    <html>
+        <head></head>
+        <body>
+            <pre style="word-wrap: break-word; white-space: pre-wrap;">
+                {"testKey": 42}
+            </pre>
+        </body>
+    </html>
+    """
+    from .. import html_tools
+
+    # The JSON here is wrapped in a <pre> inside the HTML <body>; check it is still extracted and matches our filter
+    text = html_tools.extract_json_as_string(content, "json:$.testKey")
+    assert text == '42'
+
 def set_original_ext_response():
    data = """
        [
@@ -437,7 +455,6 @@ def test_ignore_json_order(client, live_server):
    assert b'Deleted' in res.data

 def test_correct_header_detect(client, live_server):
-    
    # Like in https://github.com/dgtlmoon/changedetection.io/pull/1593
    # Specify extra html that JSON is sometimes wrapped in - when using Browserless/Puppeteer etc
    with open("test-datastore/endpoint-content.txt", "w") as f:
@@ -453,11 +470,17 @@ def test_correct_header_detect(client, live_server):
    )
    assert b"1 Imported" in res.data
    wait_for_all_checks(client)
-
-
    res = client.get(url_for("index"))
-    # This will be fixed in #1593
-    assert b'No parsable JSON found in this document' in res.data
+
+    # Fixed in #1593
+    assert b'No parsable JSON found in this document' not in res.data
+
+    res = client.get(
+        url_for("preview_page", uuid="first"),
+        follow_redirects=True
+    )
+    assert b'&#34;world&#34;:' in res.data
+    assert res.data.count(b'{') >= 2

    res = client.get(url_for("form_delete", uuid="all"), follow_redirects=True)
    assert b'Deleted' in res.data