mirror of
https://github.com/dgtlmoon/changedetection.io.git
synced 2025-10-30 14:17:40 +00:00
Adds support for jq JSON path querying engine (#1001)
This commit is contained in:
@@ -33,7 +33,7 @@ _Need an actual Chrome runner with Javascript support? We support fetching via W
|
||||
#### Key Features
|
||||
|
||||
- Lots of trigger filters, such as "Trigger on text", "Remove text by selector", "Ignore text", "Extract text", also using regular-expressions!
|
||||
- Target elements with xPath and CSS Selectors, Easily monitor complex JSON with JsonPath rules
|
||||
- Target elements with XPath and CSS selectors; easily monitor complex JSON with JSONPath or jq
|
||||
- Switch between fast non-JS and Chrome JS based "fetchers"
|
||||
- Easily specify how often a site should be checked
|
||||
- Execute JS before extracting text (Good for logging in, see examples in the UI!)
|
||||
|
||||
53
README.md
53
README.md
@@ -47,7 +47,7 @@ _Need an actual Chrome runner with Javascript support? We support fetching via W
|
||||
#### Key Features
|
||||
|
||||
- Lots of trigger filters, such as "Trigger on text", "Remove text by selector", "Ignore text", "Extract text", also using regular-expressions!
|
||||
- Target elements with xPath and CSS Selectors, Easily monitor complex JSON with JsonPath rules
|
||||
- Target elements with XPath and CSS selectors; easily monitor complex JSON with JSONPath or jq
|
||||
- Switch between fast non-JS and Chrome JS based "fetchers"
|
||||
- Easily specify how often a site should be checked
|
||||
- Execute JS before extracting text (Good for logging in, see examples in the UI!)
|
||||
@@ -121,7 +121,7 @@ See the wiki for more information https://github.com/dgtlmoon/changedetection.io
|
||||
|
||||
|
||||
## Filters
|
||||
XPath, JSONPath and CSS support comes baked in! You can be as specific as you need, use XPath exported from various XPath element query creation tools.
|
||||
XPath, JSONPath, jq, and CSS support comes baked in! You can be as specific as you need, and you can use XPath expressions exported from various XPath element query creation tools.
|
||||
|
||||
(We support LXML `re:test`, `re:match` and `re:replace`.)
|
||||
|
||||
@@ -151,7 +151,7 @@ Now you can also customise your notification content!
|
||||
|
||||
## JSON API Monitoring
|
||||
|
||||
Detect changes and monitor data in JSON API's by using the built-in JSONPath selectors as a filter / selector.
|
||||
Detect changes and monitor data in JSON APIs by using either JSONPath or jq to filter, parse, and restructure JSON as needed.
|
||||
|
||||

|
||||
|
||||
@@ -159,9 +159,52 @@ This will re-parse the JSON and apply formatting to the text, making it super ea
|
||||
|
||||

|
||||
|
||||
### JSONPath or jq?
|
||||
|
||||
For more complex parsing, filtering, and modifying of JSON data, jq is recommended due to the built-in operators and functions. Refer to the [documentation](https://stedolan.github.io/jq/manual/) for more information on jq.
|
||||
|
||||
The example below adds the price in dollars to each item in the JSON data, and then filters the result to show only the items whose price is greater than 10 dollars.
|
||||
|
||||
#### Sample input data from API
|
||||
```
|
||||
{
|
||||
"items": [
|
||||
{
|
||||
"name": "Product A",
|
||||
"priceInCents": 2500
|
||||
},
|
||||
{
|
||||
"name": "Product B",
|
||||
"priceInCents": 500
|
||||
},
|
||||
{
|
||||
"name": "Product C",
|
||||
"priceInCents": 2000
|
||||
}
|
||||
]
|
||||
}
|
||||
```
|
||||
|
||||
#### Sample jq
|
||||
`jq:.items[] | . + { "priceInDollars": (.priceInCents / 100) } | select(.priceInDollars > 10)`
|
||||
|
||||
#### Sample output data
|
||||
```
|
||||
{
|
||||
"name": "Product A",
|
||||
"priceInCents": 2500,
|
||||
"priceInDollars": 25
|
||||
}
|
||||
{
|
||||
"name": "Product C",
|
||||
"priceInCents": 2000,
|
||||
"priceInDollars": 20
|
||||
}
|
||||
```
|
||||
|
||||
### Parse JSON embedded in HTML!
|
||||
|
||||
When you enable a `json:` filter, you can even automatically extract and parse embedded JSON inside a HTML page! Amazingly handy for sites that build content based on JSON, such as many e-commerce websites.
|
||||
When you enable a `json:` or `jq:` filter, you can even automatically extract and parse embedded JSON inside an HTML page! Amazingly handy for sites that build content based on JSON, such as many e-commerce websites.
|
||||
|
||||
```
|
||||
<html>
|
||||
@@ -171,7 +214,7 @@ When you enable a `json:` filter, you can even automatically extract and parse e
|
||||
</script>
|
||||
```
|
||||
|
||||
`json:$.price` would give `23.50`, or you can extract the whole structure
|
||||
`json:$.price` or `jq:.price` would give `23.50`, or you can extract the whole structure
|
||||
|
||||
## Proxy configuration
|
||||
|
||||
|
||||
@@ -141,8 +141,9 @@ class perform_site_check():
|
||||
has_filter_rule = True
|
||||
|
||||
if has_filter_rule:
|
||||
if 'json:' in css_filter_rule:
|
||||
stripped_text_from_html = html_tools.extract_json_as_string(content=fetcher.content, jsonpath_filter=css_filter_rule)
|
||||
json_filter_prefixes = ['json:', 'jq:']
|
||||
if any(prefix in css_filter_rule for prefix in json_filter_prefixes):
|
||||
stripped_text_from_html = html_tools.extract_json_as_string(content=fetcher.content, json_filter=css_filter_rule)
|
||||
is_html = False
|
||||
|
||||
if is_html or is_source:
|
||||
|
||||
@@ -304,6 +304,21 @@ class ValidateCSSJSONXPATHInput(object):
|
||||
# Re #265 - maybe in the future fetch the page and offer a
|
||||
# warning/notice that its possible the rule doesnt yet match anything?
|
||||
|
||||
if 'jq:' in line:
|
||||
if not self.allow_json:
|
||||
raise ValidationError("jq not permitted in this field!")
|
||||
|
||||
import jq
|
||||
input = line.replace('jq:', '')
|
||||
|
||||
try:
|
||||
jq.compile(input)
|
||||
except (ValueError) as e:
|
||||
message = field.gettext('\'%s\' is not a valid jq expression. (%s)')
|
||||
raise ValidationError(message % (input, str(e)))
|
||||
except:
|
||||
raise ValidationError("A system-error occurred when validating your jq expression")
|
||||
|
||||
|
||||
class quickWatchForm(Form):
|
||||
url = fields.URLField('URL', validators=[validateURL()])
|
||||
|
||||
@@ -3,6 +3,7 @@ from typing import List
|
||||
|
||||
from bs4 import BeautifulSoup
|
||||
from jsonpath_ng.ext import parse
|
||||
import jq
|
||||
import re
|
||||
from inscriptis import get_text
|
||||
from inscriptis.model.config import ParserConfig
|
||||
@@ -79,19 +80,26 @@ def extract_element(find='title', html_content=''):
|
||||
return element_text
|
||||
|
||||
#
|
||||
def _parse_json(json_data, jsonpath_filter):
|
||||
s=[]
|
||||
jsonpath_expression = parse(jsonpath_filter.replace('json:', ''))
|
||||
match = jsonpath_expression.find(json_data)
|
||||
def _parse_json(json_data, json_filter):
    """Apply a prefixed JSON filter rule to already-decoded JSON data.

    The query engine is selected by the rule's leading prefix: ``json:``
    evaluates the remainder as a JSONPath expression, ``jq:`` evaluates the
    remainder as a jq program.

    :param json_data: The decoded JSON document to query.
    :param json_filter: The filter rule, including its ``json:`` or ``jq:``
        prefix.
    :return: The result of ``_get_stripped_text_from_json_match`` for the
        matches found, or ``None`` when the rule carries neither prefix.
    """
    # Ignore leading whitespace so a rule such as " json:$.id" is still
    # recognised, as it was with the previous substring test.
    rule = json_filter.lstrip()

    # Dispatch on the *leading* prefix only. A substring test would send a jq
    # program that merely mentions "json:" (e.g. inside a string literal) to
    # the JSONPath engine, and str.replace() would delete every occurrence of
    # the prefix text from the expression instead of just the leading marker.
    if rule.startswith('json:'):
        jsonpath_expression = parse(rule[len('json:'):])
        match = jsonpath_expression.find(json_data)
        return _get_stripped_text_from_json_match(match)

    if rule.startswith('jq:'):
        jq_expression = jq.compile(rule[len('jq:'):])
        match = jq_expression.input(json_data).all()
        return _get_stripped_text_from_json_match(match)

    # Neither prefix present: no engine applies.
    return None
|
||||
|
||||
def _get_stripped_text_from_json_match(match):
|
||||
s = []
|
||||
# More than one result, we will return it as a JSON list.
|
||||
if len(match) > 1:
|
||||
for i in match:
|
||||
s.append(i.value)
|
||||
s.append(i.value if hasattr(i, 'value') else i)
|
||||
|
||||
# Single value, use just the value, as it could be later used in a token in notifications.
|
||||
if len(match) == 1:
|
||||
s = match[0].value
|
||||
s = match[0].value if hasattr(match[0], 'value') else match[0]
|
||||
|
||||
# Re #257 - Better handling where it does not exist, in the case the original 's' value was False..
|
||||
if not match:
|
||||
@@ -103,16 +111,16 @@ def _parse_json(json_data, jsonpath_filter):
|
||||
|
||||
return stripped_text_from_html
|
||||
|
||||
def extract_json_as_string(content, jsonpath_filter):
|
||||
def extract_json_as_string(content, json_filter):
|
||||
|
||||
stripped_text_from_html = False
|
||||
|
||||
# Try to parse/filter out the JSON, if we get some parser error, then maybe it's embedded <script type=ldjson>
|
||||
try:
|
||||
stripped_text_from_html = _parse_json(json.loads(content), jsonpath_filter)
|
||||
stripped_text_from_html = _parse_json(json.loads(content), json_filter)
|
||||
except json.JSONDecodeError:
|
||||
|
||||
# Foreach <script json></script> blob.. just return the first that matches jsonpath_filter
|
||||
# Foreach <script json></script> blob.. just return the first that matches json_filter
|
||||
s = []
|
||||
soup = BeautifulSoup(content, 'html.parser')
|
||||
bs_result = soup.findAll('script')
|
||||
@@ -131,7 +139,7 @@ def extract_json_as_string(content, jsonpath_filter):
|
||||
# Just skip it
|
||||
continue
|
||||
else:
|
||||
stripped_text_from_html = _parse_json(json_data, jsonpath_filter)
|
||||
stripped_text_from_html = _parse_json(json_data, json_filter)
|
||||
if stripped_text_from_html:
|
||||
break
|
||||
|
||||
|
||||
@@ -184,8 +184,12 @@ User-Agent: wonderbra 1.0") }}
|
||||
<span class="pure-form-message-inline">
|
||||
<ul>
|
||||
<li>CSS - Limit text to this CSS rule, only text matching this CSS rule is included.</li>
|
||||
<li>JSON - Limit text to this JSON rule, using <a href="https://pypi.org/project/jsonpath-ng/">JSONPath</a>, prefix with <code>"json:"</code>, use <code>json:$</code> to force re-formatting if required, <a
|
||||
href="https://jsonpath.com/" target="new">test your JSONPath here</a></li>
|
||||
<li>JSON - Limit text to this JSON rule, using either <a href="https://pypi.org/project/jsonpath-ng/" target="new">JSONPath</a> or <a href="https://stedolan.github.io/jq/" target="new">jq</a>.
|
||||
<ul>
|
||||
<li>JSONPath: Prefix with <code>json:</code>, use <code>json:$</code> to force re-formatting if required, <a href="https://jsonpath.com/" target="new">test your JSONPath here</a>.</li>
|
||||
<li>jq: Prefix with <code>jq:</code> and <a href="https://jqplay.org/" target="new">test your jq here</a>. Using <a href="https://stedolan.github.io/jq/" target="new">jq</a> allows for complex filtering and processing of JSON data with built-in functions, regex, filtering, and more. See examples and documentation <a href="https://stedolan.github.io/jq/manual/" target="new">here</a>.</li>
|
||||
</ul>
|
||||
</li>
|
||||
<li>XPath - Limit text to this XPath rule, simply start with a forward-slash,
|
||||
<ul>
|
||||
<li>Example: <code>//*[contains(@class, 'sametext')]</code> or <code>xpath://*[contains(@class, 'sametext')]</code>, <a
|
||||
@@ -194,7 +198,7 @@ User-Agent: wonderbra 1.0") }}
|
||||
</ul>
|
||||
</li>
|
||||
</ul>
|
||||
Please be sure that you thoroughly understand how to write CSS or JSONPath, XPath selector rules before filing an issue on GitHub! <a
|
||||
Please be sure that you thoroughly understand how to write CSS, JSONPath, XPath, or jq selector rules before filing an issue on GitHub! <a
|
||||
href="https://github.com/dgtlmoon/changedetection.io/wiki/CSS-Selector-help">here for more CSS selector help</a>.<br/>
|
||||
</span>
|
||||
</div>
|
||||
|
||||
@@ -2,7 +2,7 @@
|
||||
# coding=utf-8
|
||||
|
||||
import time
|
||||
from flask import url_for
|
||||
from flask import url_for, escape
|
||||
from . util import live_server_setup
|
||||
import pytest
|
||||
|
||||
@@ -36,16 +36,26 @@ and it can also be repeated
|
||||
from .. import html_tools
|
||||
|
||||
# See that we can find the second <script> one, which is not broken, and matches our filter
|
||||
text = html_tools.extract_json_as_string(content, "$.offers.price")
|
||||
text = html_tools.extract_json_as_string(content, "json:$.offers.price")
|
||||
assert text == "23.5"
|
||||
|
||||
text = html_tools.extract_json_as_string('{"id":5}', "$.id")
|
||||
# also check for jq
|
||||
text = html_tools.extract_json_as_string(content, "jq:.offers.price")
|
||||
assert text == "23.5"
|
||||
|
||||
text = html_tools.extract_json_as_string('{"id":5}', "json:$.id")
|
||||
assert text == "5"
|
||||
|
||||
text = html_tools.extract_json_as_string('{"id":5}', "jq:.id")
|
||||
assert text == "5"
|
||||
|
||||
# When nothing at all is found, it should throw JSONNOTFound
|
||||
# Which is caught and shown to the user in the watch-overview table
|
||||
with pytest.raises(html_tools.JSONNotFound) as e_info:
|
||||
html_tools.extract_json_as_string('COMPLETE GIBBERISH, NO JSON!', "$.id")
|
||||
html_tools.extract_json_as_string('COMPLETE GIBBERISH, NO JSON!', "json:$.id")
|
||||
|
||||
with pytest.raises(html_tools.JSONNotFound) as e_info:
|
||||
html_tools.extract_json_as_string('COMPLETE GIBBERISH, NO JSON!', "jq:.id")
|
||||
|
||||
def set_original_ext_response():
|
||||
data = """
|
||||
@@ -66,6 +76,7 @@ def set_original_ext_response():
|
||||
|
||||
with open("test-datastore/endpoint-content.txt", "w") as f:
|
||||
f.write(data)
|
||||
return None
|
||||
|
||||
def set_modified_ext_response():
|
||||
data = """
|
||||
@@ -86,6 +97,7 @@ def set_modified_ext_response():
|
||||
|
||||
with open("test-datastore/endpoint-content.txt", "w") as f:
|
||||
f.write(data)
|
||||
return None
|
||||
|
||||
def set_original_response():
|
||||
test_return_data = """
|
||||
@@ -184,10 +196,10 @@ def test_check_json_without_filter(client, live_server):
|
||||
assert b'"<b>' in res.data
|
||||
assert res.data.count(b'{\n') >= 2
|
||||
|
||||
res = client.get(url_for("form_delete", uuid="all"), follow_redirects=True)
|
||||
assert b'Deleted' in res.data
|
||||
|
||||
def test_check_json_filter(client, live_server):
|
||||
json_filter = 'json:boss.name'
|
||||
|
||||
def check_json_filter(json_filter, client, live_server):
|
||||
set_original_response()
|
||||
|
||||
# Give the endpoint time to spin up
|
||||
@@ -226,7 +238,7 @@ def test_check_json_filter(client, live_server):
|
||||
res = client.get(
|
||||
url_for("edit_page", uuid="first"),
|
||||
)
|
||||
assert bytes(json_filter.encode('utf-8')) in res.data
|
||||
assert bytes(escape(json_filter).encode('utf-8')) in res.data
|
||||
|
||||
# Trigger a check
|
||||
client.get(url_for("form_watch_checknow"), follow_redirects=True)
|
||||
@@ -252,10 +264,16 @@ def test_check_json_filter(client, live_server):
|
||||
# And #462 - check we see the proper utf-8 string there
|
||||
assert "Örnsköldsvik".encode('utf-8') in res.data
|
||||
|
||||
res = client.get(url_for("form_delete", uuid="all"), follow_redirects=True)
|
||||
assert b'Deleted' in res.data
|
||||
|
||||
def test_check_json_filter_bool_val(client, live_server):
|
||||
json_filter = "json:$['available']"
|
||||
def test_check_jsonpath_filter(client, live_server):
|
||||
check_json_filter('json:boss.name', client, live_server)
|
||||
|
||||
def test_check_jq_filter(client, live_server):
|
||||
check_json_filter('jq:.boss.name', client, live_server)
|
||||
|
||||
def check_json_filter_bool_val(json_filter, client, live_server):
|
||||
set_original_response()
|
||||
|
||||
# Give the endpoint time to spin up
|
||||
@@ -304,14 +322,21 @@ def test_check_json_filter_bool_val(client, live_server):
|
||||
# But the change should be there, tho its hard to test the change was detected because it will show old and new versions
|
||||
assert b'false' in res.data
|
||||
|
||||
res = client.get(url_for("form_delete", uuid="all"), follow_redirects=True)
|
||||
assert b'Deleted' in res.data
|
||||
|
||||
def test_check_jsonpath_filter_bool_val(client, live_server):
|
||||
check_json_filter_bool_val("json:$['available']", client, live_server)
|
||||
|
||||
def test_check_jq_filter_bool_val(client, live_server):
|
||||
check_json_filter_bool_val("jq:.available", client, live_server)
|
||||
|
||||
# Re #265 - Extended JSON selector test
|
||||
# Stuff to consider here
|
||||
# - Selector should be allowed to return empty when it doesnt match (people might wait for some condition)
|
||||
# - The 'diff' tab could show the old and new content
|
||||
# - Form should let us enter a selector that doesnt (yet) match anything
|
||||
def test_check_json_ext_filter(client, live_server):
|
||||
json_filter = 'json:$[?(@.status==Sold)]'
|
||||
|
||||
def check_json_ext_filter(json_filter, client, live_server):
|
||||
set_original_ext_response()
|
||||
|
||||
# Give the endpoint time to spin up
|
||||
@@ -350,7 +375,7 @@ def test_check_json_ext_filter(client, live_server):
|
||||
res = client.get(
|
||||
url_for("edit_page", uuid="first"),
|
||||
)
|
||||
assert bytes(json_filter.encode('utf-8')) in res.data
|
||||
assert bytes(escape(json_filter).encode('utf-8')) in res.data
|
||||
|
||||
# Trigger a check
|
||||
client.get(url_for("form_watch_checknow"), follow_redirects=True)
|
||||
@@ -376,3 +401,11 @@ def test_check_json_ext_filter(client, live_server):
|
||||
assert b'ForSale' not in res.data
|
||||
assert b'Sold' in res.data
|
||||
|
||||
res = client.get(url_for("form_delete", uuid="all"), follow_redirects=True)
|
||||
assert b'Deleted' in res.data
|
||||
|
||||
def test_check_jsonpath_ext_filter(client, live_server):
|
||||
check_json_ext_filter('json:$[?(@.status==Sold)]', client, live_server)
|
||||
|
||||
def test_check_jq_ext_filter(client, live_server):
|
||||
check_json_ext_filter('jq:.[] | select(.status | contains("Sold"))', client, live_server)
|
||||
@@ -16,6 +16,7 @@ chardet > 2.3.0
|
||||
|
||||
wtforms ~= 3.0
|
||||
jsonpath-ng ~= 1.5.3
|
||||
jq ~= 1.3.0
|
||||
|
||||
# Notification library
|
||||
apprise ~= 1.1.0
|
||||
|
||||
Reference in New Issue
Block a user