mirror of
				https://github.com/dgtlmoon/changedetection.io.git
				synced 2025-11-04 00:27:48 +00:00 
			
		
		
		
	Compare commits
	
		
			6 Commits
		
	
	
		
			0.50.20
			...
			1833-ldjso
		
	
	| Author | SHA1 | Date | |
|---|---|---|---|
| 
						 | 
					e86e178203 | ||
| 
						 | 
					9e9f9d30c8 | ||
| 
						 | 
					0c1d37ba11 | ||
| 
						 | 
					4ae3519d66 | ||
| 
						 | 
					da92a12d92 | ||
| 
						 | 
					56385112e7 | 
@@ -7,13 +7,14 @@ from typing import List
 | 
			
		||||
import json
 | 
			
		||||
import re
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# HTML added to be sure each result matching a filter (.example) gets converted to a new line by Inscriptis
 | 
			
		||||
TEXT_FILTER_LIST_LINE_SUFFIX = "<br>"
 | 
			
		||||
 | 
			
		||||
PERL_STYLE_REGEX = r'^/(.*?)/([a-z]*)?$'
 | 
			
		||||
# 'price' , 'lowPrice', 'highPrice' are usually under here
 | 
			
		||||
# all of those may or may not appear on different websites
 | 
			
		||||
LD_JSON_PRODUCT_OFFER_SELECTOR = "json:$..offers"
 | 
			
		||||
# All of those may or may not appear on different websites - I didn't find a way to do case-insensitive searching here
 | 
			
		||||
LD_JSON_PRODUCT_OFFER_SELECTORS = ["json:$..offers", "json:$..Offers"]
 | 
			
		||||
 | 
			
		||||
class JSONNotFound(ValueError):
 | 
			
		||||
    def __init__(self, msg):
 | 
			
		||||
@@ -161,7 +162,6 @@ def extract_json_as_string(content, json_filter, ensure_is_ldjson_info_type=None
 | 
			
		||||
 | 
			
		||||
        # For each <script json></script> blob.. just return the first that matches json_filter
 | 
			
		||||
        # As a last resort, try to parse the whole <body>
 | 
			
		||||
        s = []
 | 
			
		||||
        soup = BeautifulSoup(content, 'html.parser')
 | 
			
		||||
 | 
			
		||||
        if ensure_is_ldjson_info_type:
 | 
			
		||||
@@ -187,13 +187,24 @@ def extract_json_as_string(content, json_filter, ensure_is_ldjson_info_type=None
 | 
			
		||||
        
 | 
			
		||||
        for json_data in bs_jsons:
 | 
			
		||||
            stripped_text_from_html = _parse_json(json_data, json_filter)
 | 
			
		||||
 | 
			
		||||
            if ensure_is_ldjson_info_type:
 | 
			
		||||
                # Could sometimes be list, string or something else random
 | 
			
		||||
                if isinstance(json_data, dict):
 | 
			
		||||
                    # If it has LD JSON 'key' @type, and @type is 'product', and something was found for the search
 | 
			
		||||
                    # (Some sites have multiple of the same ld+json @type='product', but some have the review part, some have the 'price' part)
 | 
			
		||||
                    if json_data.get('@type', False) and json_data.get('@type','').lower() == ensure_is_ldjson_info_type.lower() and stripped_text_from_html:
 | 
			
		||||
                        break
 | 
			
		||||
                    # @type could also be a list (Product, SubType)
 | 
			
		||||
                    # LD_JSON auto-extract also requires some content PLUS the ldjson to be present
 | 
			
		||||
                    # 1833 - could be either str or dict, should not be anything else
 | 
			
		||||
                    if json_data.get('@type') and stripped_text_from_html:
 | 
			
		||||
                        try:
 | 
			
		||||
                            if json_data.get('@type') == str or json_data.get('@type') == dict:
 | 
			
		||||
                                types = [json_data.get('@type')] if isinstance(json_data.get('@type'), str) else json_data.get('@type')
 | 
			
		||||
                                if ensure_is_ldjson_info_type.lower() in [x.lower().strip() for x in types]:
 | 
			
		||||
                                    break
 | 
			
		||||
                        except:
 | 
			
		||||
                            continue
 | 
			
		||||
 | 
			
		||||
            elif stripped_text_from_html:
 | 
			
		||||
                break
 | 
			
		||||
 | 
			
		||||
@@ -283,9 +294,18 @@ def html_to_text(html_content: str, render_anchor_tag_content=False) -> str:
 | 
			
		||||
 | 
			
		||||
# Does LD+JSON exist with a @type=='product' and a .price set anywhere?
 | 
			
		||||
def has_ldjson_product_info(content):
 | 
			
		||||
    pricing_data = ''
 | 
			
		||||
 | 
			
		||||
    try:
 | 
			
		||||
        pricing_data = extract_json_as_string(content=content, json_filter=LD_JSON_PRODUCT_OFFER_SELECTOR, ensure_is_ldjson_info_type="product")
 | 
			
		||||
    except JSONNotFound as e:
 | 
			
		||||
        if not 'application/ld+json' in content:
 | 
			
		||||
            return False
 | 
			
		||||
 | 
			
		||||
        for filter in LD_JSON_PRODUCT_OFFER_SELECTORS:
 | 
			
		||||
            pricing_data += extract_json_as_string(content=content,
 | 
			
		||||
                                                  json_filter=filter,
 | 
			
		||||
                                                  ensure_is_ldjson_info_type="product")
 | 
			
		||||
 | 
			
		||||
    except Exception as e:
 | 
			
		||||
        # Totally fine
 | 
			
		||||
        return False
 | 
			
		||||
    x=bool(pricing_data)
 | 
			
		||||
 
 | 
			
		||||
@@ -17,7 +17,7 @@ urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
 | 
			
		||||
 | 
			
		||||
name = 'Webpage Text/HTML, JSON and PDF changes'
 | 
			
		||||
description = 'Detects all text changes where possible'
 | 
			
		||||
 | 
			
		||||
json_filter_prefixes = ['json:', 'jq:']
 | 
			
		||||
 | 
			
		||||
class FilterNotFoundInResponse(ValueError):
 | 
			
		||||
    def __init__(self, msg):
 | 
			
		||||
@@ -196,7 +196,7 @@ class perform_site_check(difference_detection_processor):
 | 
			
		||||
 | 
			
		||||
        # Inject a virtual LD+JSON price tracker rule
 | 
			
		||||
        if watch.get('track_ldjson_price_data', '') == PRICE_DATA_TRACK_ACCEPT:
 | 
			
		||||
            include_filters_rule.append(html_tools.LD_JSON_PRODUCT_OFFER_SELECTOR)
 | 
			
		||||
            include_filters_rule += html_tools.LD_JSON_PRODUCT_OFFER_SELECTORS
 | 
			
		||||
 | 
			
		||||
        has_filter_rule = len(include_filters_rule) and len(include_filters_rule[0].strip())
 | 
			
		||||
        has_subtractive_selectors = len(subtractive_selectors) and len(subtractive_selectors[0].strip())
 | 
			
		||||
@@ -214,7 +214,6 @@ class perform_site_check(difference_detection_processor):
 | 
			
		||||
                pass
 | 
			
		||||
 | 
			
		||||
        if has_filter_rule:
 | 
			
		||||
            json_filter_prefixes = ['json:', 'jq:']
 | 
			
		||||
            for filter in include_filters_rule:
 | 
			
		||||
                if any(prefix in filter for prefix in json_filter_prefixes):
 | 
			
		||||
                    stripped_text_from_html += html_tools.extract_json_as_string(content=fetcher.content, json_filter=filter)
 | 
			
		||||
 
 | 
			
		||||
@@ -2,7 +2,8 @@
 | 
			
		||||
 | 
			
		||||
import time
 | 
			
		||||
from flask import url_for
 | 
			
		||||
from .util import live_server_setup, extract_UUID_from_client, extract_api_key_from_UI
 | 
			
		||||
from .util import live_server_setup, extract_UUID_from_client, extract_api_key_from_UI, wait_for_all_checks
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def set_response_with_ldjson():
 | 
			
		||||
    test_return_data = """<html>
 | 
			
		||||
@@ -27,7 +28,7 @@ def set_response_with_ldjson():
 | 
			
		||||
           "description":"You dont need it",
 | 
			
		||||
           "mpn":"111111",
 | 
			
		||||
           "sku":"22222",
 | 
			
		||||
           "offers":{
 | 
			
		||||
           "Offers":{
 | 
			
		||||
              "@type":"AggregateOffer",
 | 
			
		||||
              "lowPrice":8097000,
 | 
			
		||||
              "highPrice":8099900,
 | 
			
		||||
@@ -75,12 +76,11 @@ def set_response_without_ldjson():
 | 
			
		||||
        f.write(test_return_data)
 | 
			
		||||
    return None
 | 
			
		||||
 | 
			
		||||
# actually only really used by the distill.io importer, but could be handy too
 | 
			
		||||
def test_check_ldjson_price_autodetect(client, live_server):
 | 
			
		||||
def test_setup(client, live_server):
 | 
			
		||||
    live_server_setup(live_server)
 | 
			
		||||
 | 
			
		||||
    # Give the endpoint time to spin up
 | 
			
		||||
    time.sleep(1)
 | 
			
		||||
# actually only really used by the distill.io importer, but could be handy too
 | 
			
		||||
def test_check_ldjson_price_autodetect(client, live_server):
 | 
			
		||||
 | 
			
		||||
    set_response_with_ldjson()
 | 
			
		||||
 | 
			
		||||
@@ -92,7 +92,7 @@ def test_check_ldjson_price_autodetect(client, live_server):
 | 
			
		||||
        follow_redirects=True
 | 
			
		||||
    )
 | 
			
		||||
    assert b"1 Imported" in res.data
 | 
			
		||||
    time.sleep(3)
 | 
			
		||||
    wait_for_all_checks(client)
 | 
			
		||||
 | 
			
		||||
    # Should get a notice that it's available
 | 
			
		||||
    res = client.get(url_for("index"))
 | 
			
		||||
@@ -102,11 +102,11 @@ def test_check_ldjson_price_autodetect(client, live_server):
 | 
			
		||||
    uuid = extract_UUID_from_client(client)
 | 
			
		||||
 | 
			
		||||
    client.get(url_for('price_data_follower.accept', uuid=uuid, follow_redirects=True))
 | 
			
		||||
    time.sleep(2)
 | 
			
		||||
    wait_for_all_checks(client)
 | 
			
		||||
 | 
			
		||||
    # Trigger a check
 | 
			
		||||
    client.get(url_for("form_watch_checknow"), follow_redirects=True)
 | 
			
		||||
    time.sleep(2)
 | 
			
		||||
    wait_for_all_checks(client)
 | 
			
		||||
    # Offer should be gone
 | 
			
		||||
    res = client.get(url_for("index"))
 | 
			
		||||
    assert b'Embedded price data' not in res.data
 | 
			
		||||
@@ -138,9 +138,97 @@ def test_check_ldjson_price_autodetect(client, live_server):
 | 
			
		||||
        follow_redirects=True
 | 
			
		||||
    )
 | 
			
		||||
    assert b"1 Imported" in res.data
 | 
			
		||||
    time.sleep(3)
 | 
			
		||||
    wait_for_all_checks(client)
 | 
			
		||||
    res = client.get(url_for("index"))
 | 
			
		||||
    assert b'ldjson-price-track-offer' not in res.data
 | 
			
		||||
    
 | 
			
		||||
    ##########################################################################################
 | 
			
		||||
    client.get(url_for("form_delete", uuid="all"), follow_redirects=True)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def _test_runner_check_bad_format_ignored(live_server, client, has_ldjson_price_data):
 | 
			
		||||
 | 
			
		||||
    test_url = url_for('test_endpoint', _external=True)
 | 
			
		||||
    res = client.post(
 | 
			
		||||
        url_for("import_page"),
 | 
			
		||||
        data={"urls": test_url},
 | 
			
		||||
        follow_redirects=True
 | 
			
		||||
    )
 | 
			
		||||
    assert b"1 Imported" in res.data
 | 
			
		||||
    wait_for_all_checks(client)
 | 
			
		||||
 | 
			
		||||
    for k,v in client.application.config.get('DATASTORE').data['watching'].items():
 | 
			
		||||
        assert v.get('last_error') == False
 | 
			
		||||
        assert v.get('has_ldjson_price_data') == has_ldjson_price_data
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
    ##########################################################################################
 | 
			
		||||
    client.get(url_for("form_delete", uuid="all"), follow_redirects=True)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def test_bad_ldjson_is_correctly_ignored(client, live_server):
 | 
			
		||||
    #live_server_setup(live_server)
 | 
			
		||||
    test_return_data = """
 | 
			
		||||
            <html>
 | 
			
		||||
            <head>
 | 
			
		||||
                <script type="application/ld+json">
 | 
			
		||||
                    {
 | 
			
		||||
                        "@context": "http://schema.org",
 | 
			
		||||
                        "@type": ["Product", "SubType"],
 | 
			
		||||
                        "name": "My test product",
 | 
			
		||||
                        "description": "",
 | 
			
		||||
                        "offers": {
 | 
			
		||||
                            "note" : "You can see the case-insensitive OffERS key, it should work",
 | 
			
		||||
                            "@type": "Offer",
 | 
			
		||||
                            "offeredBy": {
 | 
			
		||||
                                "@type": "Organization",
 | 
			
		||||
                                "name":"Person",
 | 
			
		||||
                                "telephone":"+1 999 999 999"
 | 
			
		||||
                            },
 | 
			
		||||
                            "price": "1",
 | 
			
		||||
                            "priceCurrency": "EUR",
 | 
			
		||||
                            "url": "/some/url"
 | 
			
		||||
                        }
 | 
			
		||||
                    }
 | 
			
		||||
                </script>
 | 
			
		||||
            </head>
 | 
			
		||||
            <body>
 | 
			
		||||
            <div class="yes">Some extra stuff</div>
 | 
			
		||||
            </body></html>
 | 
			
		||||
     """
 | 
			
		||||
    with open("test-datastore/endpoint-content.txt", "w") as f:
 | 
			
		||||
        f.write(test_return_data)
 | 
			
		||||
 | 
			
		||||
    _test_runner_check_bad_format_ignored(live_server=live_server, client=client, has_ldjson_price_data=True)
 | 
			
		||||
    test_return_data = """
 | 
			
		||||
            <html>
 | 
			
		||||
            <head>
 | 
			
		||||
                <script type="application/ld+json">
 | 
			
		||||
                    {
 | 
			
		||||
                        "@context": "http://schema.org",
 | 
			
		||||
                        "@type": ["Product", "SubType"],
 | 
			
		||||
                        "name": "My test product",
 | 
			
		||||
                        "description": "",
 | 
			
		||||
                        "BrokenOffers": {
 | 
			
		||||
                            "@type": "Offer",
 | 
			
		||||
                            "offeredBy": {
 | 
			
		||||
                                "@type": "Organization",
 | 
			
		||||
                                "name":"Person",
 | 
			
		||||
                                "telephone":"+1 999 999 999"
 | 
			
		||||
                            },
 | 
			
		||||
                            "price": "1",
 | 
			
		||||
                            "priceCurrency": "EUR",
 | 
			
		||||
                            "url": "/some/url"
 | 
			
		||||
                        }
 | 
			
		||||
                    }
 | 
			
		||||
                </script>
 | 
			
		||||
            </head>
 | 
			
		||||
            <body>
 | 
			
		||||
            <div class="yes">Some extra stuff</div>
 | 
			
		||||
            </body></html>
 | 
			
		||||
     """
 | 
			
		||||
    with open("test-datastore/endpoint-content.txt", "w") as f:
 | 
			
		||||
        f.write(test_return_data)
 | 
			
		||||
 | 
			
		||||
    _test_runner_check_bad_format_ignored(live_server=live_server, client=client, has_ldjson_price_data=False)
 | 
			
		||||
 | 
			
		||||
 
 | 
			
		||||
		Reference in New Issue
	
	Block a user