mirror of
				https://github.com/dgtlmoon/changedetection.io.git
				synced 2025-10-30 22:27:52 +00:00 
			
		
		
		
	Compare commits
	
		
			6 Commits
		
	
	
		
			better-mer
			...
			1833-ldjso
		
	
	| Author | SHA1 | Date | |
|---|---|---|---|
|   | e86e178203 | ||
|   | 9e9f9d30c8 | ||
|   | 0c1d37ba11 | ||
|   | 4ae3519d66 | ||
|   | da92a12d92 | ||
|   | 56385112e7 | 
| @@ -7,13 +7,14 @@ from typing import List | ||||
| import json | ||||
| import re | ||||
|  | ||||
|  | ||||
| # HTML added to be sure each result matching a filter (.example) gets converted to a new line by Inscriptis | ||||
| TEXT_FILTER_LIST_LINE_SUFFIX = "<br>" | ||||
|  | ||||
| PERL_STYLE_REGEX = r'^/(.*?)/([a-z]*)?$' | ||||
| # 'price' , 'lowPrice', 'highPrice' are usually under here | ||||
| # all of those may or may not appear on different websites | ||||
| LD_JSON_PRODUCT_OFFER_SELECTOR = "json:$..offers" | ||||
| # All of those may or may not appear on different websites - I didn't find a way to do case-insensitive searching here | ||||
| LD_JSON_PRODUCT_OFFER_SELECTORS = ["json:$..offers", "json:$..Offers"] | ||||
|  | ||||
| class JSONNotFound(ValueError): | ||||
|     def __init__(self, msg): | ||||
| @@ -161,7 +162,6 @@ def extract_json_as_string(content, json_filter, ensure_is_ldjson_info_type=None | ||||
|  | ||||
|         # Foreach <script json></script> blob.. just return the first that matches json_filter | ||||
|         # As a last resort, try to parse the whole <body> | ||||
|         s = [] | ||||
|         soup = BeautifulSoup(content, 'html.parser') | ||||
|  | ||||
|         if ensure_is_ldjson_info_type: | ||||
| @@ -187,13 +187,24 @@ def extract_json_as_string(content, json_filter, ensure_is_ldjson_info_type=None | ||||
|          | ||||
|         for json_data in bs_jsons: | ||||
|             stripped_text_from_html = _parse_json(json_data, json_filter) | ||||
|  | ||||
|             if ensure_is_ldjson_info_type: | ||||
|                 # Could sometimes be list, string or something else random | ||||
|                 if isinstance(json_data, dict): | ||||
|                     # If it has LD JSON 'key' @type, and @type is 'product', and something was found for the search | ||||
|                     # (Some sites have multiple of the same ld+json @type='product', but some have the review part, some have the 'price' part) | ||||
|                     if json_data.get('@type', False) and json_data.get('@type','').lower() == ensure_is_ldjson_info_type.lower() and stripped_text_from_html: | ||||
|                         break | ||||
|                     # @type could also be a list (Product, SubType) | ||||
|                     # LD_JSON auto-extract also requires some content PLUS the ldjson to be present | ||||
|                     # 1833 - could be either str or dict, should not be anything else | ||||
|                     if json_data.get('@type') and stripped_text_from_html: | ||||
|                         try: | ||||
|                             if json_data.get('@type') == str or json_data.get('@type') == dict: | ||||
|                                 types = [json_data.get('@type')] if isinstance(json_data.get('@type'), str) else json_data.get('@type') | ||||
|                                 if ensure_is_ldjson_info_type.lower() in [x.lower().strip() for x in types]: | ||||
|                                     break | ||||
|                         except: | ||||
|                             continue | ||||
|  | ||||
|             elif stripped_text_from_html: | ||||
|                 break | ||||
|  | ||||
| @@ -283,9 +294,18 @@ def html_to_text(html_content: str, render_anchor_tag_content=False) -> str: | ||||
|  | ||||
| # Does LD+JSON exist with a @type=='product' and a .price set anywhere? | ||||
| def has_ldjson_product_info(content): | ||||
|     pricing_data = '' | ||||
|  | ||||
|     try: | ||||
|         pricing_data = extract_json_as_string(content=content, json_filter=LD_JSON_PRODUCT_OFFER_SELECTOR, ensure_is_ldjson_info_type="product") | ||||
|     except JSONNotFound as e: | ||||
|         if not 'application/ld+json' in content: | ||||
|             return False | ||||
|  | ||||
|         for filter in LD_JSON_PRODUCT_OFFER_SELECTORS: | ||||
|             pricing_data += extract_json_as_string(content=content, | ||||
|                                                   json_filter=filter, | ||||
|                                                   ensure_is_ldjson_info_type="product") | ||||
|  | ||||
|     except Exception as e: | ||||
|         # Totally fine | ||||
|         return False | ||||
|     x=bool(pricing_data) | ||||
|   | ||||
| @@ -17,7 +17,7 @@ urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning) | ||||
|  | ||||
| name = 'Webpage Text/HTML, JSON and PDF changes' | ||||
| description = 'Detects all text changes where possible' | ||||
|  | ||||
| json_filter_prefixes = ['json:', 'jq:'] | ||||
|  | ||||
| class FilterNotFoundInResponse(ValueError): | ||||
|     def __init__(self, msg): | ||||
| @@ -196,7 +196,7 @@ class perform_site_check(difference_detection_processor): | ||||
|  | ||||
|         # Inject a virtual LD+JSON price tracker rule | ||||
|         if watch.get('track_ldjson_price_data', '') == PRICE_DATA_TRACK_ACCEPT: | ||||
|             include_filters_rule.append(html_tools.LD_JSON_PRODUCT_OFFER_SELECTOR) | ||||
|             include_filters_rule += html_tools.LD_JSON_PRODUCT_OFFER_SELECTORS | ||||
|  | ||||
|         has_filter_rule = len(include_filters_rule) and len(include_filters_rule[0].strip()) | ||||
|         has_subtractive_selectors = len(subtractive_selectors) and len(subtractive_selectors[0].strip()) | ||||
| @@ -214,7 +214,6 @@ class perform_site_check(difference_detection_processor): | ||||
|                 pass | ||||
|  | ||||
|         if has_filter_rule: | ||||
|             json_filter_prefixes = ['json:', 'jq:'] | ||||
|             for filter in include_filters_rule: | ||||
|                 if any(prefix in filter for prefix in json_filter_prefixes): | ||||
|                     stripped_text_from_html += html_tools.extract_json_as_string(content=fetcher.content, json_filter=filter) | ||||
|   | ||||
| @@ -2,7 +2,8 @@ | ||||
|  | ||||
| import time | ||||
| from flask import url_for | ||||
| from .util import live_server_setup, extract_UUID_from_client, extract_api_key_from_UI | ||||
| from .util import live_server_setup, extract_UUID_from_client, extract_api_key_from_UI, wait_for_all_checks | ||||
|  | ||||
|  | ||||
| def set_response_with_ldjson(): | ||||
|     test_return_data = """<html> | ||||
| @@ -27,7 +28,7 @@ def set_response_with_ldjson(): | ||||
|            "description":"You dont need it", | ||||
|            "mpn":"111111", | ||||
|            "sku":"22222", | ||||
|            "offers":{ | ||||
|            "Offers":{ | ||||
|               "@type":"AggregateOffer", | ||||
|               "lowPrice":8097000, | ||||
|               "highPrice":8099900, | ||||
| @@ -75,12 +76,11 @@ def set_response_without_ldjson(): | ||||
|         f.write(test_return_data) | ||||
|     return None | ||||
|  | ||||
| # actually only really used by the distill.io importer, but could be handy too | ||||
| def test_check_ldjson_price_autodetect(client, live_server): | ||||
| def test_setup(client, live_server): | ||||
|     live_server_setup(live_server) | ||||
|  | ||||
|     # Give the endpoint time to spin up | ||||
|     time.sleep(1) | ||||
| # actually only really used by the distill.io importer, but could be handy too | ||||
| def test_check_ldjson_price_autodetect(client, live_server): | ||||
|  | ||||
|     set_response_with_ldjson() | ||||
|  | ||||
| @@ -92,7 +92,7 @@ def test_check_ldjson_price_autodetect(client, live_server): | ||||
|         follow_redirects=True | ||||
|     ) | ||||
|     assert b"1 Imported" in res.data | ||||
|     time.sleep(3) | ||||
|     wait_for_all_checks(client) | ||||
|  | ||||
|     # Should get a notice that it's available | ||||
|     res = client.get(url_for("index")) | ||||
| @@ -102,11 +102,11 @@ def test_check_ldjson_price_autodetect(client, live_server): | ||||
|     uuid = extract_UUID_from_client(client) | ||||
|  | ||||
|     client.get(url_for('price_data_follower.accept', uuid=uuid, follow_redirects=True)) | ||||
|     time.sleep(2) | ||||
|     wait_for_all_checks(client) | ||||
|  | ||||
|     # Trigger a check | ||||
|     client.get(url_for("form_watch_checknow"), follow_redirects=True) | ||||
|     time.sleep(2) | ||||
|     wait_for_all_checks(client) | ||||
|     # Offer should be gone | ||||
|     res = client.get(url_for("index")) | ||||
|     assert b'Embedded price data' not in res.data | ||||
| @@ -138,9 +138,97 @@ def test_check_ldjson_price_autodetect(client, live_server): | ||||
|         follow_redirects=True | ||||
|     ) | ||||
|     assert b"1 Imported" in res.data | ||||
|     time.sleep(3) | ||||
|     wait_for_all_checks(client) | ||||
|     res = client.get(url_for("index")) | ||||
|     assert b'ldjson-price-track-offer' not in res.data | ||||
|      | ||||
|     ########################################################################################## | ||||
|     client.get(url_for("form_delete", uuid="all"), follow_redirects=True) | ||||
|  | ||||
|  | ||||
| def _test_runner_check_bad_format_ignored(live_server, client, has_ldjson_price_data): | ||||
|  | ||||
|     test_url = url_for('test_endpoint', _external=True) | ||||
|     res = client.post( | ||||
|         url_for("import_page"), | ||||
|         data={"urls": test_url}, | ||||
|         follow_redirects=True | ||||
|     ) | ||||
|     assert b"1 Imported" in res.data | ||||
|     wait_for_all_checks(client) | ||||
|  | ||||
|     for k,v in client.application.config.get('DATASTORE').data['watching'].items(): | ||||
|         assert v.get('last_error') == False | ||||
|         assert v.get('has_ldjson_price_data') == has_ldjson_price_data | ||||
|  | ||||
|  | ||||
|     ########################################################################################## | ||||
|     client.get(url_for("form_delete", uuid="all"), follow_redirects=True) | ||||
|  | ||||
|  | ||||
| def test_bad_ldjson_is_correctly_ignored(client, live_server): | ||||
|     #live_server_setup(live_server) | ||||
|     test_return_data = """ | ||||
|             <html> | ||||
|             <head> | ||||
|                 <script type="application/ld+json"> | ||||
|                     { | ||||
|                         "@context": "http://schema.org", | ||||
|                         "@type": ["Product", "SubType"], | ||||
|                         "name": "My test product", | ||||
|                         "description": "", | ||||
|                         "offers": { | ||||
|                             "note" : "You can see the case-insensitive OffERS key, it should work", | ||||
|                             "@type": "Offer", | ||||
|                             "offeredBy": { | ||||
|                                 "@type": "Organization", | ||||
|                                 "name":"Person", | ||||
|                                 "telephone":"+1 999 999 999" | ||||
|                             }, | ||||
|                             "price": "1", | ||||
|                             "priceCurrency": "EUR", | ||||
|                             "url": "/some/url" | ||||
|                         } | ||||
|                     } | ||||
|                 </script> | ||||
|             </head> | ||||
|             <body> | ||||
|             <div class="yes">Some extra stuff</div> | ||||
|             </body></html> | ||||
|      """ | ||||
|     with open("test-datastore/endpoint-content.txt", "w") as f: | ||||
|         f.write(test_return_data) | ||||
|  | ||||
|     _test_runner_check_bad_format_ignored(live_server=live_server, client=client, has_ldjson_price_data=True) | ||||
|     test_return_data = """ | ||||
|             <html> | ||||
|             <head> | ||||
|                 <script type="application/ld+json"> | ||||
|                     { | ||||
|                         "@context": "http://schema.org", | ||||
|                         "@type": ["Product", "SubType"], | ||||
|                         "name": "My test product", | ||||
|                         "description": "", | ||||
|                         "BrokenOffers": { | ||||
|                             "@type": "Offer", | ||||
|                             "offeredBy": { | ||||
|                                 "@type": "Organization", | ||||
|                                 "name":"Person", | ||||
|                                 "telephone":"+1 999 999 999" | ||||
|                             }, | ||||
|                             "price": "1", | ||||
|                             "priceCurrency": "EUR", | ||||
|                             "url": "/some/url" | ||||
|                         } | ||||
|                     } | ||||
|                 </script> | ||||
|             </head> | ||||
|             <body> | ||||
|             <div class="yes">Some extra stuff</div> | ||||
|             </body></html> | ||||
|      """ | ||||
|     with open("test-datastore/endpoint-content.txt", "w") as f: | ||||
|         f.write(test_return_data) | ||||
|  | ||||
|     _test_runner_check_bad_format_ignored(live_server=live_server, client=client, has_ldjson_price_data=False) | ||||
|  | ||||
|   | ||||
		Reference in New Issue
	
	Block a user