mirror of
				https://github.com/dgtlmoon/changedetection.io.git
				synced 2025-11-04 08:34:57 +00:00 
			
		
		
		
	Compare commits
	
		
			4 Commits
		
	
	
		
			no-cryptog
			...
			auto-sugge
		
	
	| Author | SHA1 | Date | |
|---|---|---|---|
| 
						 | 
					2d65d49bb7 | ||
| 
						 | 
					300f55b0a2 | ||
| 
						 | 
					b437df7787 | ||
| 
						 | 
					ab1b87893a | 
@@ -1343,6 +1343,10 @@ def changedetection_app(config=None, datastore_o=None):
 | 
			
		||||
    import changedetectionio.blueprint.browser_steps as browser_steps
 | 
			
		||||
    app.register_blueprint(browser_steps.construct_blueprint(datastore), url_prefix='/browser-steps')
 | 
			
		||||
 | 
			
		||||
    import changedetectionio.blueprint.price_data_follower as price_data_follower
 | 
			
		||||
    app.register_blueprint(price_data_follower.construct_blueprint(datastore), url_prefix='/price_data_follower')
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
    # @todo handle ctrl break
 | 
			
		||||
    ticker_thread = threading.Thread(target=ticker_thread_check_time_launch_checks).start()
 | 
			
		||||
    threading.Thread(target=notification_runner).start()
 | 
			
		||||
 
 | 
			
		||||
							
								
								
									
										27
									
								
								changedetectionio/blueprint/price_data_follower/__init__.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										27
									
								
								changedetectionio/blueprint/price_data_follower/__init__.py
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,27 @@
 | 
			
		||||
 | 
			
		||||
from distutils.util import strtobool
 | 
			
		||||
from flask import Blueprint, flash, redirect, url_for
 | 
			
		||||
from flask_login import login_required
 | 
			
		||||
from changedetectionio.store import ChangeDetectionStore
 | 
			
		||||
 | 
			
		||||
def construct_blueprint(datastore: ChangeDetectionStore):
    """Build the blueprint with the accept/reject endpoints for LD+JSON price tracking.

    Args:
        datastore: Application datastore. The chosen state is written to
            ``datastore.data['watching'][uuid]['track_ldjson_price_data']``.

    Returns:
        The configured ``price_data_follower`` Blueprint, ready to be registered
        on the app.
    """
    price_data_follower_blueprint = Blueprint('price_data_follower', __name__)

    def _set_tracking_state(uuid, state):
        """Store `state` on the watch; return False when no watch has this UUID."""
        try:
            datastore.data['watching'][uuid]['track_ldjson_price_data'] = state
        except KeyError:
            # Stale or hand-crafted link: report it instead of answering with a 500
            flash("No watch found with the UUID {}".format(uuid), "error")
            return False
        return True

    # route() must be the OUTERMOST decorator: it registers whatever callable it
    # receives, so with @login_required listed above it the unprotected function
    # was the one registered and the login check never ran.
    # NOTE(review): these endpoints change state on GET; the watch list template
    # links to them directly, so the method is left unchanged here.
    @price_data_follower_blueprint.route("/<string:uuid>/accept", methods=['GET'])
    @login_required
    def accept(uuid):
        """Mark the watch as following its LD+JSON price data, then request a re-check."""
        if not _set_tracking_state(uuid, 'accepted'):
            return redirect(url_for("index"))
        return redirect(url_for("form_watch_checknow", uuid=uuid))

    @price_data_follower_blueprint.route("/<string:uuid>/reject", methods=['GET'])
    @login_required
    def reject(uuid):
        """Record that the user declined price tracking for this watch."""
        _set_tracking_state(uuid, 'rejected')
        return redirect(url_for("index"))

    return price_data_follower_blueprint
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@@ -2,7 +2,6 @@ import hashlib
 | 
			
		||||
import logging
 | 
			
		||||
import os
 | 
			
		||||
import re
 | 
			
		||||
import time
 | 
			
		||||
import urllib3
 | 
			
		||||
 | 
			
		||||
from changedetectionio import content_fetcher, html_tools
 | 
			
		||||
@@ -140,7 +139,7 @@ class perform_site_check():
 | 
			
		||||
            is_html = False
 | 
			
		||||
            is_json = False
 | 
			
		||||
 | 
			
		||||
        include_filters_rule = watch.get('include_filters', [])
 | 
			
		||||
        include_filters_rule = deepcopy(watch.get('include_filters', []))
 | 
			
		||||
        # include_filters_rule = watch['include_filters']
 | 
			
		||||
        subtractive_selectors = watch.get(
 | 
			
		||||
            "subtractive_selectors", []
 | 
			
		||||
@@ -148,6 +147,10 @@ class perform_site_check():
 | 
			
		||||
            "global_subtractive_selectors", []
 | 
			
		||||
        )
 | 
			
		||||
 | 
			
		||||
        # Inject a virtual LD+JSON price tracker rule
 | 
			
		||||
        if watch.get('track_ldjson_price_data'):
 | 
			
		||||
            include_filters_rule.append('json:$..price')
 | 
			
		||||
 | 
			
		||||
        has_filter_rule = include_filters_rule and len("".join(include_filters_rule).strip())
 | 
			
		||||
        has_subtractive_selectors = subtractive_selectors and len(subtractive_selectors[0].strip())
 | 
			
		||||
 | 
			
		||||
@@ -173,9 +176,13 @@ class perform_site_check():
 | 
			
		||||
                # Don't run get_text or xpath/css filters on plaintext
 | 
			
		||||
                stripped_text_from_html = html_content
 | 
			
		||||
            else:
 | 
			
		||||
                # Does it have some ld+json price data? used for easier monitoring
 | 
			
		||||
                update_obj['has_ldjson_price_data'] = html_tools.has_ldjson_product_info(fetcher.content)
 | 
			
		||||
 | 
			
		||||
                # Then we assume HTML
 | 
			
		||||
                if has_filter_rule:
 | 
			
		||||
                    html_content = ""
 | 
			
		||||
 | 
			
		||||
                    for filter_rule in include_filters_rule:
 | 
			
		||||
                        # For HTML/XML we offer xpath as an option, just start a regular xPath "/.."
 | 
			
		||||
                        if filter_rule[0] == '/' or filter_rule.startswith('xpath:'):
 | 
			
		||||
 
 | 
			
		||||
@@ -127,8 +127,10 @@ def _get_stripped_text_from_json_match(match):
 | 
			
		||||
 | 
			
		||||
    return stripped_text_from_html
 | 
			
		||||
 | 
			
		||||
def extract_json_as_string(content, json_filter):
 | 
			
		||||
 | 
			
		||||
# content - json
 | 
			
		||||
# json_filter - ie json:$..price
 | 
			
		||||
# ensure_is_ldjson_info_type - str, optional, e.g. "product": only accept a blob whose "@type" equals this value (I don't know how to express that as a JSON selector)
 | 
			
		||||
def extract_json_as_string(content, json_filter, ensure_is_ldjson_info_type=None):
 | 
			
		||||
    stripped_text_from_html = False
 | 
			
		||||
 | 
			
		||||
    # Try to parse/filter out the JSON, if we get some parser error, then maybe it's embedded <script type=ldjson>
 | 
			
		||||
@@ -139,7 +141,12 @@ def extract_json_as_string(content, json_filter):
 | 
			
		||||
        # Foreach <script json></script> blob.. just return the first that matches json_filter
 | 
			
		||||
        s = []
 | 
			
		||||
        soup = BeautifulSoup(content, 'html.parser')
 | 
			
		||||
        bs_result = soup.findAll('script')
 | 
			
		||||
 | 
			
		||||
        if ensure_is_ldjson_info_type:
 | 
			
		||||
            bs_result = soup.findAll('script', {"type": "application/ld+json"})
 | 
			
		||||
        else:
 | 
			
		||||
            bs_result = soup.findAll('script')
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
        if not bs_result:
 | 
			
		||||
            raise JSONNotFound("No parsable JSON found in this document")
 | 
			
		||||
@@ -156,7 +163,12 @@ def extract_json_as_string(content, json_filter):
 | 
			
		||||
                continue
 | 
			
		||||
            else:
 | 
			
		||||
                stripped_text_from_html = _parse_json(json_data, json_filter)
 | 
			
		||||
                if stripped_text_from_html:
 | 
			
		||||
                if ensure_is_ldjson_info_type:
 | 
			
		||||
                    # Could sometimes be list, string or something else random
 | 
			
		||||
                    if isinstance(json_data, dict):
 | 
			
		||||
                        if json_data.get('@type', False) and json_data.get('@type','').lower() == ensure_is_ldjson_info_type.lower():
 | 
			
		||||
                            break
 | 
			
		||||
                elif stripped_text_from_html:
 | 
			
		||||
                    break
 | 
			
		||||
 | 
			
		||||
    if not stripped_text_from_html:
 | 
			
		||||
@@ -243,6 +255,18 @@ def html_to_text(html_content: str, render_anchor_tag_content=False) -> str:
 | 
			
		||||
 | 
			
		||||
    return text_content
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# Does LD+JSON exist with a @type=='product' and a .price set anywhere?
 | 
			
		||||
def has_ldjson_product_info(content):
    """Report whether `content` yields price data for an LD+JSON "product".

    Runs the 'json:$..price' filter through extract_json_as_string() with the
    info type restricted to "product" and returns the truthiness of the result.
    A JSONNotFound from the extractor is treated as "no price data".
    """
    try:
        extracted = extract_json_as_string(
            content=content,
            json_filter='json:$..price',
            ensure_is_ldjson_info_type="product",
        )
    except JSONNotFound:
        # Totally fine - the page simply has no usable JSON
        return False
    else:
        return bool(extracted)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def workarounds_for_obfuscations(content):
 | 
			
		||||
    """
 | 
			
		||||
    Some sites are using sneaky tactics to make prices and other information un-renderable by Inscriptis
 | 
			
		||||
 
 | 
			
		||||
@@ -26,6 +26,8 @@ class model(dict):
 | 
			
		||||
            'extract_title_as_title': False,
 | 
			
		||||
            'fetch_backend': None,
 | 
			
		||||
            'filter_failure_notification_send': strtobool(os.getenv('FILTER_FAILURE_NOTIFICATION_SEND_DEFAULT', 'True')),
 | 
			
		||||
            'has_ldjson_price_data': None,
 | 
			
		||||
            'track_ldjson_price_data': None,
 | 
			
		||||
            'headers': {},  # Extra headers to send
 | 
			
		||||
            'ignore_text': [],  # List of text to ignore when calculating the comparison checksum
 | 
			
		||||
            'include_filters': [],
 | 
			
		||||
 
 | 
			
		||||
@@ -1009,3 +1009,20 @@ ul {
 | 
			
		||||
  border-radius: 5px;
 | 
			
		||||
  color: var(--color-warning);
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
/* automatic price following helpers */
 | 
			
		||||
.tracking-ldjson-price-data {
 | 
			
		||||
  background-color: var(--color-background-button-green);
 | 
			
		||||
  color: #000;
 | 
			
		||||
  padding: 3px;
 | 
			
		||||
  border-radius: 3px;
 | 
			
		||||
  white-space: nowrap;
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
.ldjson-price-track-offer {
 | 
			
		||||
  a.pure-button {
 | 
			
		||||
    border-radius: 3px;
 | 
			
		||||
    padding: 3px;
 | 
			
		||||
    background-color: var(--color-background-button-green);
 | 
			
		||||
  }
 | 
			
		||||
}
 | 
			
		||||
 
 | 
			
		||||
@@ -945,3 +945,16 @@ ul {
 | 
			
		||||
    display: inline;
 | 
			
		||||
    height: 26px;
 | 
			
		||||
    vertical-align: middle; }
 | 
			
		||||
 | 
			
		||||
/* automatic price following helpers */
 | 
			
		||||
.tracking-ldjson-price-data {
 | 
			
		||||
  background-color: var(--color-background-button-green);
 | 
			
		||||
  color: #000;
 | 
			
		||||
  padding: 3px;
 | 
			
		||||
  border-radius: 3px;
 | 
			
		||||
  white-space: nowrap; }
 | 
			
		||||
 | 
			
		||||
.ldjson-price-track-offer a.pure-button {
 | 
			
		||||
  border-radius: 3px;
 | 
			
		||||
  padding: 3px;
 | 
			
		||||
  background-color: var(--color-background-button-green); }
 | 
			
		||||
 
 | 
			
		||||
@@ -250,12 +250,15 @@ class ChangeDetectionStore:
 | 
			
		||||
    def clear_watch_history(self, uuid):
 | 
			
		||||
        import pathlib
 | 
			
		||||
 | 
			
		||||
        self.__data['watching'][uuid].update(
 | 
			
		||||
            {'last_checked': 0,
 | 
			
		||||
             'last_viewed': 0,
 | 
			
		||||
             'previous_md5': False,
 | 
			
		||||
             'last_notification_error': False,
 | 
			
		||||
             'last_error': False})
 | 
			
		||||
        self.__data['watching'][uuid].update({
 | 
			
		||||
                'last_checked': 0,
 | 
			
		||||
                'has_ldjson_price_data': None,
 | 
			
		||||
                'last_error': False,
 | 
			
		||||
                'last_notification_error': False,
 | 
			
		||||
                'last_viewed': 0,
 | 
			
		||||
                'previous_md5': False,
 | 
			
		||||
                'track_ldjson_price_data': None,
 | 
			
		||||
            })
 | 
			
		||||
 | 
			
		||||
        # JSON Data, Screenshots, Textfiles (history index and snapshots), HTML in the future etc
 | 
			
		||||
        for item in pathlib.Path(os.path.join(self.datastore_path, uuid)).rglob("*.*"):
 | 
			
		||||
 
 | 
			
		||||
@@ -98,6 +98,12 @@
 | 
			
		||||
                    {% if watch.last_notification_error is defined and watch.last_notification_error != False %}
 | 
			
		||||
                    <div class="fetch-error notification-error"><a href="{{url_for('notification_logs')}}">{{ watch.last_notification_error }}</a></div>
 | 
			
		||||
                    {% endif %}
 | 
			
		||||
                    {% if watch['has_ldjson_price_data'] and not watch['track_ldjson_price_data']  %}
 | 
			
		||||
                    <div class="ldjson-price-track-offer">Embedded price data detected, follow only price data? <a href="{{url_for('price_data_follower.accept', uuid=watch.uuid)}}" class="pure-button button-xsmall">Yes</a> <a href="{{url_for('price_data_follower.reject', uuid=watch.uuid)}}" class="">No</a></div>
 | 
			
		||||
                    {% endif %}
 | 
			
		||||
                    {% if watch['track_ldjson_price_data'] %}
 | 
			
		||||
                    <span class="tracking-ldjson-price-data">Price</span>
 | 
			
		||||
                    {% endif %}
 | 
			
		||||
                    {% if not active_tag %}
 | 
			
		||||
                    <span class="watch-tag-list">{{ watch.tag}}</span>
 | 
			
		||||
                    {% endif %}
 | 
			
		||||
 
 | 
			
		||||
							
								
								
									
										112
									
								
								changedetectionio/tests/test_automatic_follow_ldjson_price.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										112
									
								
								changedetectionio/tests/test_automatic_follow_ldjson_price.py
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,112 @@
 | 
			
		||||
#!/usr/bin/python3
 | 
			
		||||
 | 
			
		||||
import time
 | 
			
		||||
from flask import url_for
 | 
			
		||||
from .util import live_server_setup, extract_UUID_from_client, extract_api_key_from_UI
 | 
			
		||||
 | 
			
		||||
from ..html_tools import *
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def set_response_with_ldjson():
    """Write the fixture page served by the test endpoint.

    The page is a small HTML document that embeds one
    <script type="application/ld+json"> block describing a "Product" with a
    nested offer price of 8097000.
    """
    page_with_ldjson = """<html>
       <body>
     Some initial text</br>
     <p>Which is across multiple lines</p>
     </br>
     So let's see what happens.  </br>
     <div class="sametext">Some text thats the same</div>
     <div class="changetext">Some text that will change</div>
     <script type="application/ld+json">
        {
           "@context":"https://schema.org/",
           "@type":"Product",
           "@id":"https://www.some-virtual-phone-shop.com/celular-iphone-14/p",
           "name":"Celular Iphone 14 Pro Max 256Gb E Sim A16 Bionic",
           "brand":{
              "@type":"Brand",
              "name":"APPLE"
           },
           "image":"https://www.some-virtual-phone-shop.com/15509426/image.jpg",
           "description":"You dont need it",
           "mpn":"111111",
           "sku":"22222",
           "offers":{
              "@type":"AggregateOffer",
              "lowPrice":8097000,
              "highPrice":8099900,
              "priceCurrency":"COP",
              "offers":[
                 {
                    "@type":"Offer",
                    "price":8097000,
                    "priceCurrency":"COP",
                    "availability":"http://schema.org/InStock",
                    "sku":"102375961",
                    "itemCondition":"http://schema.org/NewCondition",
                    "seller":{
                       "@type":"Organization",
                       "name":"ajax"
                    }
                 }
              ],
              "offerCount":1
           }
        }
       </script>
     </body>
     </html>
"""

    # The live test server reads this file back as the response body
    with open("test-datastore/endpoint-content.txt", "w") as endpoint_file:
        endpoint_file.write(page_with_ldjson)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# actually only really used by the distill.io importer, but could be handy too
 | 
			
		||||
def test_check_ldjson_price_autodetect(client, live_server):
    """End-to-end check of the automatic LD+JSON price-following offer.

    Imports a watch whose page embeds LD+JSON product data, expects the offer to
    appear in the watch list, accepts it, and then verifies that the offer is
    replaced by the "tracking" marker and that the latest snapshot holds only
    the price.
    """
    live_server_setup(live_server)

    # Give the endpoint time to spin up
    time.sleep(1)

    set_response_with_ldjson()

    # Add our URL to the import page
    test_url = url_for('test_endpoint', _external=True)
    res = client.post(
        url_for("import_page"),
        data={"urls": test_url},
        follow_redirects=True
    )
    assert b"1 Imported" in res.data
    time.sleep(3)

    # Should get a notice that it's available
    res = client.get(url_for("index"))
    assert b'ldjson-price-track-offer' in res.data

    # Accept it
    uuid = extract_UUID_from_client(client)

    # follow_redirects is an argument of client.get(); passed to url_for() it only
    # became a "?follow_redirects=True" query parameter and the redirect issued by
    # the accept endpoint was never followed.
    client.get(url_for('price_data_follower.accept', uuid=uuid), follow_redirects=True)
    time.sleep(2)

    # Trigger a check
    client.get(url_for("form_watch_checknow"), follow_redirects=True)
    time.sleep(2)

    # Offer should be gone
    res = client.get(url_for("index"))
    assert b'Embedded price data' not in res.data
    assert b'tracking-ldjson-price-data' in res.data

    # and last snapshot (via API) should be just the price
    api_key = extract_api_key_from_UI(client)
    res = client.get(
        url_for("watchsinglehistory", uuid=uuid, timestamp='latest'),
        headers={'x-api-key': api_key},
    )

    # Should just see the price in the API reply
    assert res.data == b'8097000'

    client.get(url_for("form_delete", uuid="all"), follow_redirects=True)
 | 
			
		||||
		Reference in New Issue
	
	Block a user