diff --git a/changedetectionio/html_tools.py b/changedetectionio/html_tools.py
index 1a6893f6..ef51e096 100644
--- a/changedetectionio/html_tools.py
+++ b/changedetectionio/html_tools.py
@@ -303,70 +303,92 @@ def _get_stripped_text_from_json_match(match):
 
     return stripped_text_from_html
 
+def extract_json_blob_from_html(content, ensure_is_ldjson_info_type, json_filter):
+    from bs4 import BeautifulSoup
+    stripped_text_from_html = ''
+
+    # For each blob, just return the first that matches json_filter
+    # As a last resort, try to parse the whole <body>
+    soup = BeautifulSoup(content, 'html.parser')
+
+    if ensure_is_ldjson_info_type:
+        bs_result = soup.find_all('script', {"type": "application/ld+json"})
+    else:
+        bs_result = soup.find_all('script')
+        bs_result += soup.find_all('body')
+
+    bs_jsons = []
+
+    for result in bs_result:
+        # result.text is how bs4 magically strips JSON from the body
+        content_start = result.text.lstrip("\ufeff").strip()[:100] if result.text else ''
+        # Skip empty tags, and things that don't even look like JSON
+        if not content_start or content_start[0] not in ('{', '['):
+            continue
+        try:
+            json_data = json.loads(result.text)
+            bs_jsons.append(json_data)
+        except json.JSONDecodeError:
+            # Skip objects which cannot be parsed
+            continue
+
+    if not bs_jsons:
+        raise JSONNotFound("No parsable JSON found in this document")
+
+    for json_data in bs_jsons:
+        stripped_text_from_html = _parse_json(json_data, json_filter)
+
+        if ensure_is_ldjson_info_type:
+            # Could sometimes be a list, string or something else random
+            if isinstance(json_data, dict):
+                # If it has the LD JSON key @type, and @type is 'product', and something was found for the search
+                # (Some sites have multiple of the same ld+json @type='product'; some have the review part, some have the 'price' part)
+                # @type could also be a list, although non-standard ("@type": ["Product", "SubType"])
+                # LD_JSON auto-extract also requires some content PLUS the ldjson to be present
+                # 1833 - could be either str or dict, should not be anything else
+
+                t = json_data.get('@type')
+                if t and stripped_text_from_html:
+
+                    if isinstance(t, str) and t.lower() == ensure_is_ldjson_info_type.lower():
+                        break
+                    # The non-standard part, some have a list
+                    elif isinstance(t, list):
+                        if ensure_is_ldjson_info_type.lower() in [x.lower().strip() for x in t]:
+                            break
+
+        elif stripped_text_from_html:
+            break
+
+    return stripped_text_from_html
+
 # content - json
 # json_filter - ie json:$..price
 # ensure_is_ldjson_info_type - str "product", optional, "@type == product" (I dont know how to do that as a json selector)
 def extract_json_as_string(content, json_filter, ensure_is_ldjson_info_type=None):
-    from bs4 import BeautifulSoup
 
     stripped_text_from_html = False
     # https://github.com/dgtlmoon/changedetection.io/pull/2041#issuecomment-1848397161w
     # Try to parse/filter out the JSON, if we get some parser error, then maybe it's embedded within HTML tags
-    try:
-        # .lstrip("\ufeff") strings ByteOrderMark from UTF8 and still lets the UTF work
-        stripped_text_from_html = _parse_json(json.loads(content.lstrip("\ufeff") ), json_filter)
-    except json.JSONDecodeError as e:
-        logger.warning(str(e))
-        # Foreach blob.. just return the first that matches json_filter
-        # As a last resort, try to parse the whole <body>
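
A minimal usage sketch of the refactored split (editorial illustration, not part of the patch): clean JSON now takes the fast json.loads() path, while anything else falls through to the new extract_json_blob_from_html(). The sample page and the json:$.price filter are made up for illustration:

    from changedetectionio import html_tools

    # Clean JSON body: parsed directly, BeautifulSoup never gets involved
    print(html_tools.extract_json_as_string('{"price": 23.5}', json_filter="json:$.price"))

    # JSON embedded in HTML: extract_json_blob_from_html() scans
    # <script type="application/ld+json"> tags (or all <script>/<body> tags when
    # ensure_is_ldjson_info_type is not given) and returns the first filter match
    page = '<html><script type="application/ld+json">{"@type": "Product", "price": 23.5}</script></html>'
    print(html_tools.extract_json_as_string(page,
                                            json_filter="json:$.price",
                                            ensure_is_ldjson_info_type="product"))
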
-        soup = BeautifulSoup(content, 'html.parser')
+    # Looks like clean JSON, don't bother extracting from HTML
 
-        if ensure_is_ldjson_info_type:
-            bs_result = soup.find_all('script', {"type": "application/ld+json"})
-        else:
-            bs_result = soup.find_all('script')
-            bs_result += soup.find_all('body')
+    content_start = content.lstrip("\ufeff").strip()[:100]
 
-        bs_jsons = []
-        for result in bs_result:
-            # Skip empty tags, and things that dont even look like JSON
-            if not result.text or '{' not in result.text:
-                continue
-            try:
-                json_data = json.loads(result.text)
-                bs_jsons.append(json_data)
-            except json.JSONDecodeError:
-                # Skip objects which cannot be parsed
-                continue
-
-        if not bs_jsons:
-            raise JSONNotFound("No parsable JSON found in this document")
-
-        for json_data in bs_jsons:
-            stripped_text_from_html = _parse_json(json_data, json_filter)
-
-            if ensure_is_ldjson_info_type:
-                # Could sometimes be list, string or something else random
-                if isinstance(json_data, dict):
-                    # If it has LD JSON 'key' @type, and @type is 'product', and something was found for the search
-                    # (Some sites have multiple of the same ld+json @type='product', but some have the review part, some have the 'price' part)
-                    # @type could also be a list although non-standard ("@type": ["Product", "SubType"],)
-                    # LD_JSON auto-extract also requires some content PLUS the ldjson to be present
-                    # 1833 - could be either str or dict, should not be anything else
-
-                    t = json_data.get('@type')
-                    if t and stripped_text_from_html:
-
-                        if isinstance(t, str) and t.lower() == ensure_is_ldjson_info_type.lower():
-                            break
-                        # The non-standard part, some have a list
-                        elif isinstance(t, list):
-                            if ensure_is_ldjson_info_type.lower() in [x.lower().strip() for x in t]:
-                                break
-
-            elif stripped_text_from_html:
-                break
+    if content_start and content_start[0] in ('{', '['):
+        try:
+            # .lstrip("\ufeff") strips the Byte Order Mark from UTF-8 and still lets the UTF-8 decode work
+            stripped_text_from_html = _parse_json(json.loads(content.lstrip("\ufeff")), json_filter)
+        except json.JSONDecodeError as e:
+            logger.warning(f"Error processing JSON {content[:20]}... {str(e)}")
+    else:
+        # Probably something else, go fish inside it for the JSON
+        try:
+            stripped_text_from_html = extract_json_blob_from_html(content=content,
+                                                                  ensure_is_ldjson_info_type=ensure_is_ldjson_info_type,
+                                                                  json_filter=json_filter)
+        except json.JSONDecodeError as e:
+            logger.warning(f"Error processing JSON while extracting JSON from HTML blob {content[:20]}... {str(e)}")
 
     if not stripped_text_from_html:
         # Re 265 - Just return an empty string when filter not found
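
A sketch of how the new has_include_json_filters gate (in the processor.py diff below) is expected to behave, assuming JSON_FILTER_PREFIXES, which is defined outside this hunk, covers the documented JSON filter prefixes; the tuple here is an assumption, not taken from this diff:

    JSON_FILTER_PREFIXES = ('json:', 'jq:', 'jqraw:')  # assumed definition

    def has_include_json_filters(include_filters):
        return any(f.strip().startswith(prefix) for f in include_filters for prefix in JSON_FILTER_PREFIXES)

    # A JSON filter is present: preprocess_json() is skipped, the filter stage reformats instead
    assert has_include_json_filters(['json:$..price'])

    # Only an XPath filter: no JSON filter, so preprocess_json() reformats the whole document
    assert not has_include_json_filters(['//div[@class="price"]'])

diff --git a/changedetectionio/processors/text_json_diff/processor.py b/changedetectionio/processors/text_json_diff/processor.py
index d7cbfa08..2827c369 100644
--- a/changedetectionio/processors/text_json_diff/processor.py
+++ b/changedetectionio/processors/text_json_diff/processor.py
@@ -99,6 +99,10 @@ class FilterConfig:
     def has_include_filters(self):
         return bool(self.include_filters) and bool(self.include_filters[0].strip())
 
+    @property
+    def has_include_json_filters(self):
+        return any(f.strip().startswith(prefix) for f in self.include_filters for prefix in JSON_FILTER_PREFIXES)
+
     @property
     def has_subtractive_selectors(self):
         return bool(self.subtractive_selectors) and bool(self.subtractive_selectors[0].strip())
@@ -255,12 +259,10 @@ class ContentProcessor:
         )
         return html_content.replace('</body>', metadata + '</body>')
 
-    def preprocess_json(self, content):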
+    def preprocess_json(self, raw_content):
         """Format and sort JSON content."""
-
-        # if it doesnt look like JSON then try to extract it
-        if not '{' in content[:5]:
-            content = html_tools.extract_json_as_string(content=content, json_filter="json:$")
+        # Re-format it here; when the watch has JSON filters, the filter stage (later on) will reformat it anyway
+        content = html_tools.extract_json_as_string(content=raw_content, json_filter="json:$")
 
         # Sort JSON to avoid false alerts from reordering
         try:
@@ -388,9 +390,12 @@ class perform_site_check(difference_detection_processor):
             content = content_processor.preprocess_pdf(raw_content=self.fetcher.raw_content)
             stream_content_type.is_html = True
 
-        # JSON - Extract JSON from some HTML blob, and always reformat it nicely for consistency.
+        # JSON - Always reformat it nicely for consistency.
+
         if stream_content_type.is_json:
-            content = content_processor.preprocess_json(content, filter_config.has_include_filters)
+            if not filter_config.has_include_json_filters:
+                content = content_processor.preprocess_json(raw_content=content)
+            # else: it gets sorted/formatted in the filter stage anyway
 
         # HTML obfuscation workarounds
         if stream_content_type.is_html:
diff --git a/changedetectionio/tests/test_jsonpath_jq_selector.py b/changedetectionio/tests/test_jsonpath_jq_selector.py
index 476be216..bc5cec67 100644
--- a/changedetectionio/tests/test_jsonpath_jq_selector.py
+++ b/changedetectionio/tests/test_jsonpath_jq_selector.py
@@ -113,14 +113,8 @@ def set_original_ext_response():
     return None
 
 def set_modified_ext_response():
-    data = """
-    [
-    {
-        "isPriceLowered": false,
-        "status": "Sold",
-        "statusOrig": "sold"
-    },
-    {
+    # This should get reformatted
+    data = """ [ { "isPriceLowered": false, "status": "Sold", "statusOrig": "sold" }, {
         "_id": "5e7b3e1fb3262d306323ff1e",
         "listingsType": "consumer",
         "isPriceLowered": false,
@@ -230,30 +224,15 @@ def check_json_filter(json_filter, client, live_server):
 
     # Add our URL to the import page
     test_url = url_for('test_endpoint', content_type="application/json", _external=True)
-    uuid = client.application.config.get('DATASTORE').add_watch(url=test_url)
+    uuid = client.application.config.get('DATASTORE').add_watch(url=test_url, extras={"include_filters": json_filter.splitlines()})
 
     client.get(url_for("ui.form_watch_checknow"), follow_redirects=True)
 
     # Give the thread time to pick it up
     wait_for_all_checks(client)
 
-    # Goto the edit page, add our ignore text
-    # Add our URL to the import page
-    res = client.post(
-        url_for("ui.ui_edit.edit_page", uuid="first"),
-        data={"include_filters": json_filter,
-              "url": test_url,
-              "tags": "",
-              "headers": "",
-              "fetch_backend": "html_requests",
-              "time_between_check_use_default": "y"
-              },
-        follow_redirects=True
-    )
-    assert b"Updated watch." in res.data
-
     # Check it saved
     res = client.get(
-        url_for("ui.ui_edit.edit_page", uuid="first"),
+        url_for("ui.ui_edit.edit_page", uuid=uuid),
     )
     assert bytes(escape(json_filter).encode('utf-8')) in res.data
@@ -272,7 +251,7 @@ def check_json_filter(json_filter, client, live_server):
     assert b'has-unread-changes' in res.data
 
     # Should not see this, because its not in the JSONPath we entered
-    res = client.get(url_for("ui.ui_views.diff_history_page", uuid="first"))
+    res = client.get(url_for("ui.ui_views.diff_history_page", uuid=uuid))
 
     # But the change should be there, tho its hard to test the change was detected because it will show old and new versions
     # And #462 - check we see the proper utf-8 string there
@@ -294,32 +273,12 @@ def test_check_jqraw_filter(client, live_server, measure_memory_usage):
 
 def check_json_filter_bool_val(json_filter, client, live_server):
     set_original_response()
-    # Give the endpoint time to spin up
-    time.sleep(1)
-
     test_url = url_for('test_endpoint', content_type="application/json", _external=True)
-    uuid = client.application.config.get('DATASTORE').add_watch(url=test_url)
+    uuid = client.application.config.get('DATASTORE').add_watch(url=test_url, extras={"include_filters": [json_filter]})
 
     client.get(url_for("ui.form_watch_checknow"), follow_redirects=True)
-
     wait_for_all_checks(client)
-    # Goto the edit page, add our ignore text
-    # Add our URL to the import page
-    res = client.post(
-        url_for("ui.ui_edit.edit_page", uuid="first"),
-        data={"include_filters": json_filter,
-              "url": test_url,
-              "tags": "",
-              "headers": "",
-              "fetch_backend": "html_requests",
-              "time_between_check_use_default": "y"
-              },
-        follow_redirects=True
-    )
-    assert b"Updated watch." in res.data
-    # Give the thread time to pick it up
-    wait_for_all_checks(client)
 
     # Make a change
     set_modified_response()
@@ -353,21 +312,16 @@ def test_check_jqraw_filter_bool_val(client, live_server, measure_memory_usage):
 
 def check_json_ext_filter(json_filter, client, live_server):
     set_original_ext_response()
-    # Give the endpoint time to spin up
-    time.sleep(1)
-
     # Add our URL to the import page
     test_url = url_for('test_endpoint', content_type="application/json", _external=True)
     uuid = client.application.config.get('DATASTORE').add_watch(url=test_url)
 
     client.get(url_for("ui.form_watch_checknow"), follow_redirects=True)
-
-    # Give the thread time to pick it up
     wait_for_all_checks(client)
 
     # Goto the edit page, add our ignore text
     # Add our URL to the import page
     res = client.post(
-        url_for("ui.ui_edit.edit_page", uuid="first"),
+        url_for("ui.ui_edit.edit_page", uuid=uuid),
         data={"include_filters": json_filter,
               "url": test_url,
               "tags": "",
@@ -381,7 +335,7 @@ def check_json_ext_filter(json_filter, client, live_server):
 
     # Check it saved
     res = client.get(
-        url_for("ui.ui_edit.edit_page", uuid="first"),
+        url_for("ui.ui_edit.edit_page", uuid=uuid),
     )
     assert bytes(escape(json_filter).encode('utf-8')) in res.data
@@ -395,6 +349,12 @@ def check_json_ext_filter(json_filter, client, live_server):
     # Give the thread time to pick it up
     wait_for_all_checks(client)
 
+    watch = live_server.app.config['DATASTORE'].data['watching'][uuid]
+    dates = list(watch.history.keys())
+    snapshot_contents = watch.get_history_snapshot(dates[0])
+
+    assert snapshot_contents[0] == '['
+
     # It should have 'has-unread-changes'
     res = client.get(url_for("watchlist.index"))
     assert b'has-unread-changes' in res.data
@@ -474,12 +434,13 @@ def test_correct_header_detect(client, live_server, measure_memory_usage):
         follow_redirects=True
     )
 
b'"hello": 123,' in res.data # properly html escaped in the front end watch = live_server.app.config['DATASTORE'].data['watching'][uuid] dates = list(watch.history.keys()) snapshot_contents = watch.get_history_snapshot(dates[0]) + assert b'"hello": 123,' in res.data # properly html escaped in the front end + # Should be correctly formatted and sorted, ("world" goes to end) assert snapshot_contents == """{ "hello": 123,