mirror of
				https://github.com/dgtlmoon/changedetection.io.git
				synced 2025-10-31 06:37:41 +00:00 
			
		
		
		
	Compare commits
	
		
			4 Commits
		
	
	
		
			3376-clean
			...
			auto-sugge
		
	
	| Author | SHA1 | Date | |
|---|---|---|---|
|   | 2d65d49bb7 | ||
|   | 300f55b0a2 | ||
|   | b437df7787 | ||
|   | ab1b87893a | 
| @@ -1343,6 +1343,10 @@ def changedetection_app(config=None, datastore_o=None): | ||||
|     import changedetectionio.blueprint.browser_steps as browser_steps | ||||
|     app.register_blueprint(browser_steps.construct_blueprint(datastore), url_prefix='/browser-steps') | ||||
|  | ||||
|     import changedetectionio.blueprint.price_data_follower as price_data_follower | ||||
|     app.register_blueprint(price_data_follower.construct_blueprint(datastore), url_prefix='/price_data_follower') | ||||
|  | ||||
|  | ||||
|     # @todo handle ctrl break | ||||
|     ticker_thread = threading.Thread(target=ticker_thread_check_time_launch_checks).start() | ||||
|     threading.Thread(target=notification_runner).start() | ||||
|   | ||||
							
								
								
									
										27
									
								
								changedetectionio/blueprint/price_data_follower/__init__.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										27
									
								
								changedetectionio/blueprint/price_data_follower/__init__.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,27 @@ | ||||
|  | ||||
| from distutils.util import strtobool | ||||
| from flask import Blueprint, flash, redirect, url_for | ||||
| from flask_login import login_required | ||||
| from changedetectionio.store import ChangeDetectionStore | ||||
|  | ||||
| def construct_blueprint(datastore: ChangeDetectionStore): | ||||
|  | ||||
|     price_data_follower_blueprint = Blueprint('price_data_follower', __name__) | ||||
|  | ||||
|     @login_required | ||||
|     @price_data_follower_blueprint.route("/<string:uuid>/accept", methods=['GET']) | ||||
|     def accept(uuid): | ||||
|         datastore.data['watching'][uuid]['track_ldjson_price_data'] = 'accepted' | ||||
|         return redirect(url_for("form_watch_checknow", uuid=uuid)) | ||||
|  | ||||
|  | ||||
|     @login_required | ||||
|     @price_data_follower_blueprint.route("/<string:uuid>/reject", methods=['GET']) | ||||
|     def reject(uuid): | ||||
|         datastore.data['watching'][uuid]['track_ldjson_price_data'] = 'rejected' | ||||
|         return redirect(url_for("index")) | ||||
|  | ||||
|  | ||||
|     return price_data_follower_blueprint | ||||
|  | ||||
|  | ||||
| @@ -2,7 +2,6 @@ import hashlib | ||||
| import logging | ||||
| import os | ||||
| import re | ||||
| import time | ||||
| import urllib3 | ||||
|  | ||||
| from changedetectionio import content_fetcher, html_tools | ||||
| @@ -140,7 +139,7 @@ class perform_site_check(): | ||||
|             is_html = False | ||||
|             is_json = False | ||||
|  | ||||
|         include_filters_rule = watch.get('include_filters', []) | ||||
|         include_filters_rule = deepcopy(watch.get('include_filters', [])) | ||||
|         # include_filters_rule = watch['include_filters'] | ||||
|         subtractive_selectors = watch.get( | ||||
|             "subtractive_selectors", [] | ||||
| @@ -148,6 +147,10 @@ class perform_site_check(): | ||||
|             "global_subtractive_selectors", [] | ||||
|         ) | ||||
|  | ||||
|         # Inject a virtual LD+JSON price tracker rule | ||||
|         if watch.get('track_ldjson_price_data'): | ||||
|             include_filters_rule.append('json:$..price') | ||||
|  | ||||
|         has_filter_rule = include_filters_rule and len("".join(include_filters_rule).strip()) | ||||
|         has_subtractive_selectors = subtractive_selectors and len(subtractive_selectors[0].strip()) | ||||
|  | ||||
| @@ -173,9 +176,13 @@ class perform_site_check(): | ||||
|                 # Don't run get_text or xpath/css filters on plaintext | ||||
|                 stripped_text_from_html = html_content | ||||
|             else: | ||||
|                 # Does it have some ld+json price data? used for easier monitoring | ||||
|                 update_obj['has_ldjson_price_data'] = html_tools.has_ldjson_product_info(fetcher.content) | ||||
|  | ||||
|                 # Then we assume HTML | ||||
|                 if has_filter_rule: | ||||
|                     html_content = "" | ||||
|  | ||||
|                     for filter_rule in include_filters_rule: | ||||
|                         # For HTML/XML we offer xpath as an option, just start a regular xPath "/.." | ||||
|                         if filter_rule[0] == '/' or filter_rule.startswith('xpath:'): | ||||
|   | ||||
| @@ -127,8 +127,10 @@ def _get_stripped_text_from_json_match(match): | ||||
|  | ||||
|     return stripped_text_from_html | ||||
|  | ||||
| def extract_json_as_string(content, json_filter): | ||||
|  | ||||
| # content - json | ||||
| # json_filter - ie json:$..price | ||||
| # ensure_is_ldjson_info_type - str "product", optional, "@type == product" (I don't know how to do that as a json selector) | ||||
| def extract_json_as_string(content, json_filter, ensure_is_ldjson_info_type=None): | ||||
|     stripped_text_from_html = False | ||||
|  | ||||
|     # Try to parse/filter out the JSON, if we get some parser error, then maybe it's embedded <script type=ldjson> | ||||
| @@ -139,7 +141,12 @@ def extract_json_as_string(content, json_filter): | ||||
|         # Foreach <script json></script> blob.. just return the first that matches json_filter | ||||
|         s = [] | ||||
|         soup = BeautifulSoup(content, 'html.parser') | ||||
|         bs_result = soup.findAll('script') | ||||
|  | ||||
|         if ensure_is_ldjson_info_type: | ||||
|             bs_result = soup.findAll('script', {"type": "application/ld+json"}) | ||||
|         else: | ||||
|             bs_result = soup.findAll('script') | ||||
|  | ||||
|  | ||||
|         if not bs_result: | ||||
|             raise JSONNotFound("No parsable JSON found in this document") | ||||
| @@ -156,7 +163,12 @@ def extract_json_as_string(content, json_filter): | ||||
|                 continue | ||||
|             else: | ||||
|                 stripped_text_from_html = _parse_json(json_data, json_filter) | ||||
|                 if stripped_text_from_html: | ||||
|                 if ensure_is_ldjson_info_type: | ||||
|                     # Could sometimes be list, string or something else random | ||||
|                     if isinstance(json_data, dict): | ||||
|                         if json_data.get('@type', False) and json_data.get('@type','').lower() == ensure_is_ldjson_info_type.lower(): | ||||
|                             break | ||||
|                 elif stripped_text_from_html: | ||||
|                     break | ||||
|  | ||||
|     if not stripped_text_from_html: | ||||
| @@ -243,6 +255,18 @@ def html_to_text(html_content: str, render_anchor_tag_content=False) -> str: | ||||
|  | ||||
|     return text_content | ||||
|  | ||||
|  | ||||
| # Does LD+JSON exist with a @type=='product' and a .price set anywhere? | ||||
| def has_ldjson_product_info(content): | ||||
|     try: | ||||
|         pricing_data = extract_json_as_string(content=content, json_filter='json:$..price', ensure_is_ldjson_info_type="product") | ||||
|     except JSONNotFound as e: | ||||
|         # Totally fine | ||||
|         return False | ||||
|     x=bool(pricing_data) | ||||
|     return x | ||||
|  | ||||
|  | ||||
| def workarounds_for_obfuscations(content): | ||||
|     """ | ||||
|     Some sites are using sneaky tactics to make prices and other information un-renderable by Inscriptis | ||||
|   | ||||
| @@ -26,6 +26,8 @@ class model(dict): | ||||
|             'extract_title_as_title': False, | ||||
|             'fetch_backend': None, | ||||
|             'filter_failure_notification_send': strtobool(os.getenv('FILTER_FAILURE_NOTIFICATION_SEND_DEFAULT', 'True')), | ||||
|             'has_ldjson_price_data': None, | ||||
|             'track_ldjson_price_data': None, | ||||
|             'headers': {},  # Extra headers to send | ||||
|             'ignore_text': [],  # List of text to ignore when calculating the comparison checksum | ||||
|             'include_filters': [], | ||||
|   | ||||
| @@ -1009,3 +1009,20 @@ ul { | ||||
|   border-radius: 5px; | ||||
|   color: var(--color-warning); | ||||
| } | ||||
|  | ||||
| /* automatic price following helpers */ | ||||
| .tracking-ldjson-price-data { | ||||
|   background-color: var(--color-background-button-green); | ||||
|   color: #000; | ||||
|   padding: 3px; | ||||
|   border-radius: 3px; | ||||
|   white-space: nowrap; | ||||
| } | ||||
|  | ||||
| .ldjson-price-track-offer { | ||||
|   a.pure-button { | ||||
|     border-radius: 3px; | ||||
|     padding: 3px; | ||||
|     background-color: var(--color-background-button-green); | ||||
|   } | ||||
| } | ||||
|   | ||||
| @@ -945,3 +945,16 @@ ul { | ||||
|     display: inline; | ||||
|     height: 26px; | ||||
|     vertical-align: middle; } | ||||
|  | ||||
| /* automatic price following helpers */ | ||||
| .tracking-ldjson-price-data { | ||||
|   background-color: var(--color-background-button-green); | ||||
|   color: #000; | ||||
|   padding: 3px; | ||||
|   border-radius: 3px; | ||||
|   white-space: nowrap; } | ||||
|  | ||||
| .ldjson-price-track-offer a.pure-button { | ||||
|   border-radius: 3px; | ||||
|   padding: 3px; | ||||
|   background-color: var(--color-background-button-green); } | ||||
|   | ||||
| @@ -250,12 +250,15 @@ class ChangeDetectionStore: | ||||
|     def clear_watch_history(self, uuid): | ||||
|         import pathlib | ||||
|  | ||||
|         self.__data['watching'][uuid].update( | ||||
|             {'last_checked': 0, | ||||
|              'last_viewed': 0, | ||||
|              'previous_md5': False, | ||||
|              'last_notification_error': False, | ||||
|              'last_error': False}) | ||||
|         self.__data['watching'][uuid].update({ | ||||
|                 'last_checked': 0, | ||||
|                 'has_ldjson_price_data': None, | ||||
|                 'last_error': False, | ||||
|                 'last_notification_error': False, | ||||
|                 'last_viewed': 0, | ||||
|                 'previous_md5': False, | ||||
|                 'track_ldjson_price_data': None, | ||||
|             }) | ||||
|  | ||||
|         # JSON Data, Screenshots, Textfiles (history index and snapshots), HTML in the future etc | ||||
|         for item in pathlib.Path(os.path.join(self.datastore_path, uuid)).rglob("*.*"): | ||||
|   | ||||
| @@ -98,6 +98,12 @@ | ||||
|                     {% if watch.last_notification_error is defined and watch.last_notification_error != False %} | ||||
|                     <div class="fetch-error notification-error"><a href="{{url_for('notification_logs')}}">{{ watch.last_notification_error }}</a></div> | ||||
|                     {% endif %} | ||||
|                     {% if watch['has_ldjson_price_data'] and not watch['track_ldjson_price_data']  %} | ||||
|                     <div class="ldjson-price-track-offer">Embedded price data detected, follow only price data? <a href="{{url_for('price_data_follower.accept', uuid=watch.uuid)}}" class="pure-button button-xsmall">Yes</a> <a href="{{url_for('price_data_follower.reject', uuid=watch.uuid)}}" class="">No</a></div> | ||||
|                     {% endif %} | ||||
|                     {% if watch['track_ldjson_price_data'] %} | ||||
|                     <span class="tracking-ldjson-price-data">Price</span> | ||||
|                     {% endif %} | ||||
|                     {% if not active_tag %} | ||||
|                     <span class="watch-tag-list">{{ watch.tag}}</span> | ||||
|                     {% endif %} | ||||
|   | ||||
							
								
								
									
										112
									
								
								changedetectionio/tests/test_automatic_follow_ldjson_price.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										112
									
								
								changedetectionio/tests/test_automatic_follow_ldjson_price.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,112 @@ | ||||
| #!/usr/bin/python3 | ||||
|  | ||||
| import time | ||||
| from flask import url_for | ||||
| from .util import live_server_setup, extract_UUID_from_client, extract_api_key_from_UI | ||||
|  | ||||
| from ..html_tools import * | ||||
|  | ||||
|  | ||||
| def set_response_with_ldjson(): | ||||
|     # Writes an HTML page containing an application/ld+json block of @type "Product" | ||||
|     # (single offer, price 8097000) to test-datastore/endpoint-content.txt. | ||||
|     # Presumably this is the file the test endpoint serves - confirm in the test utilities. | ||||
|     test_return_data = """<html> | ||||
|        <body> | ||||
|      Some initial text</br> | ||||
|      <p>Which is across multiple lines</p> | ||||
|      </br> | ||||
|      So let's see what happens.  </br> | ||||
|      <div class="sametext">Some text thats the same</div> | ||||
|      <div class="changetext">Some text that will change</div> | ||||
|      <script type="application/ld+json"> | ||||
|         { | ||||
|            "@context":"https://schema.org/", | ||||
|            "@type":"Product", | ||||
|            "@id":"https://www.some-virtual-phone-shop.com/celular-iphone-14/p", | ||||
|            "name":"Celular Iphone 14 Pro Max 256Gb E Sim A16 Bionic", | ||||
|            "brand":{ | ||||
|               "@type":"Brand", | ||||
|               "name":"APPLE" | ||||
|            }, | ||||
|            "image":"https://www.some-virtual-phone-shop.com/15509426/image.jpg", | ||||
|            "description":"You dont need it", | ||||
|            "mpn":"111111", | ||||
|            "sku":"22222", | ||||
|            "offers":{ | ||||
|               "@type":"AggregateOffer", | ||||
|               "lowPrice":8097000, | ||||
|               "highPrice":8099900, | ||||
|               "priceCurrency":"COP", | ||||
|               "offers":[ | ||||
|                  { | ||||
|                     "@type":"Offer", | ||||
|                     "price":8097000, | ||||
|                     "priceCurrency":"COP", | ||||
|                     "availability":"http://schema.org/InStock", | ||||
|                     "sku":"102375961", | ||||
|                     "itemCondition":"http://schema.org/NewCondition", | ||||
|                     "seller":{ | ||||
|                        "@type":"Organization", | ||||
|                        "name":"ajax" | ||||
|                     } | ||||
|                  } | ||||
|               ], | ||||
|               "offerCount":1 | ||||
|            } | ||||
|         } | ||||
|        </script> | ||||
|      </body> | ||||
|      </html> | ||||
| """ | ||||
|  | ||||
|     # Overwrite the file with the fixture content | ||||
|     with open("test-datastore/endpoint-content.txt", "w") as f: | ||||
|         f.write(test_return_data) | ||||
|     return None | ||||
|  | ||||
|  | ||||
| # actually only really used by the distill.io importer, but could be handy too | ||||
| def test_check_ldjson_price_autodetect(client, live_server): | ||||
|     live_server_setup(live_server) | ||||
|  | ||||
|     # Give the endpoint time to spin up | ||||
|     time.sleep(1) | ||||
|  | ||||
|     set_response_with_ldjson() | ||||
|  | ||||
|     # Add our URL to the import page | ||||
|     test_url = url_for('test_endpoint', _external=True) | ||||
|     res = client.post( | ||||
|         url_for("import_page"), | ||||
|         data={"urls": test_url}, | ||||
|         follow_redirects=True | ||||
|     ) | ||||
|     assert b"1 Imported" in res.data | ||||
|     time.sleep(3) | ||||
|  | ||||
|     # Should get a notice that it's available | ||||
|     res = client.get(url_for("index")) | ||||
|     assert b'ldjson-price-track-offer' in res.data | ||||
|  | ||||
|     # Accept it | ||||
|     uuid = extract_UUID_from_client(client) | ||||
|  | ||||
|     client.get(url_for('price_data_follower.accept', uuid=uuid, follow_redirects=True)) | ||||
|     time.sleep(2) | ||||
|  | ||||
|     # Trigger a check | ||||
|     client.get(url_for("form_watch_checknow"), follow_redirects=True) | ||||
|     time.sleep(2) | ||||
|     # Offer should be gone | ||||
|     res = client.get(url_for("index")) | ||||
|     assert b'Embedded price data' not in res.data | ||||
|     assert b'tracking-ldjson-price-data' in res.data | ||||
|  | ||||
|     # and last snapshop (via API) should be just the price | ||||
|     api_key = extract_api_key_from_UI(client) | ||||
|     res = client.get( | ||||
|         url_for("watchsinglehistory", uuid=uuid, timestamp='latest'), | ||||
|         headers={'x-api-key': api_key}, | ||||
|     ) | ||||
|  | ||||
|     # Should just see the price in the API reply | ||||
|     assert res.data == b'8097000' | ||||
|  | ||||
|     client.get(url_for("form_delete", uuid="all"), follow_redirects=True) | ||||
		Reference in New Issue
	
	Block a user