mirror of
https://github.com/dgtlmoon/changedetection.io.git
synced 2025-10-30 14:17:40 +00:00
Compare commits
4 Commits
path-bluep
...
auto-sugge
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
2d65d49bb7 | ||
|
|
300f55b0a2 | ||
|
|
b437df7787 | ||
|
|
ab1b87893a |
@@ -1343,6 +1343,10 @@ def changedetection_app(config=None, datastore_o=None):
|
||||
import changedetectionio.blueprint.browser_steps as browser_steps
|
||||
app.register_blueprint(browser_steps.construct_blueprint(datastore), url_prefix='/browser-steps')
|
||||
|
||||
import changedetectionio.blueprint.price_data_follower as price_data_follower
|
||||
app.register_blueprint(price_data_follower.construct_blueprint(datastore), url_prefix='/price_data_follower')
|
||||
|
||||
|
||||
# @todo handle ctrl break
|
||||
ticker_thread = threading.Thread(target=ticker_thread_check_time_launch_checks).start()
|
||||
threading.Thread(target=notification_runner).start()
|
||||
|
||||
27
changedetectionio/blueprint/price_data_follower/__init__.py
Normal file
27
changedetectionio/blueprint/price_data_follower/__init__.py
Normal file
@@ -0,0 +1,27 @@
|
||||
|
||||
from distutils.util import strtobool
|
||||
from flask import Blueprint, flash, redirect, url_for
|
||||
from flask_login import login_required
|
||||
from changedetectionio.store import ChangeDetectionStore
|
||||
|
||||
def construct_blueprint(datastore: ChangeDetectionStore):
    """Build the blueprint that records the answer to the price-tracking offer.

    Exposes two login-protected GET routes, ``/<uuid>/accept`` and
    ``/<uuid>/reject``, which store ``'accepted'`` or ``'rejected'`` in the
    watch's ``track_ldjson_price_data`` field.

    Args:
        datastore: Store whose ``data['watching']`` mapping holds the watches,
            keyed by UUID.

    Returns:
        The configured ``price_data_follower`` blueprint, ready to be
        registered on the application.
    """
    price_data_follower_blueprint = Blueprint('price_data_follower', __name__)

    def _record_choice(uuid, choice):
        """Store *choice* on the watch; return False when the UUID is unknown."""
        if uuid not in datastore.data['watching']:
            # An unknown UUID used to surface as a KeyError (HTTP 500).
            flash("No watch found with that UUID.", "error")
            return False
        datastore.data['watching'][uuid]['track_ldjson_price_data'] = choice
        return True

    # route() must be the outermost decorator: it registers whatever function
    # it receives, so login_required has to wrap the view before that happens.
    @price_data_follower_blueprint.route("/<string:uuid>/accept", methods=['GET'])
    @login_required
    def accept(uuid):
        if not _record_choice(uuid, 'accepted'):
            return redirect(url_for("index"))
        # Re-check straight away so the next snapshot uses the price filter.
        return redirect(url_for("form_watch_checknow", uuid=uuid))

    @price_data_follower_blueprint.route("/<string:uuid>/reject", methods=['GET'])
    @login_required
    def reject(uuid):
        _record_choice(uuid, 'rejected')
        return redirect(url_for("index"))

    return price_data_follower_blueprint
|
||||
|
||||
|
||||
@@ -2,7 +2,6 @@ import hashlib
|
||||
import logging
|
||||
import os
|
||||
import re
|
||||
import time
|
||||
import urllib3
|
||||
|
||||
from changedetectionio import content_fetcher, html_tools
|
||||
@@ -140,7 +139,7 @@ class perform_site_check():
|
||||
is_html = False
|
||||
is_json = False
|
||||
|
||||
include_filters_rule = watch.get('include_filters', [])
|
||||
include_filters_rule = deepcopy(watch.get('include_filters', []))
|
||||
# include_filters_rule = watch['include_filters']
|
||||
subtractive_selectors = watch.get(
|
||||
"subtractive_selectors", []
|
||||
@@ -148,6 +147,10 @@ class perform_site_check():
|
||||
"global_subtractive_selectors", []
|
||||
)
|
||||
|
||||
# Inject a virtual LD+JSON price tracker rule
|
||||
if watch.get('track_ldjson_price_data'):
|
||||
include_filters_rule.append('json:$..price')
|
||||
|
||||
has_filter_rule = include_filters_rule and len("".join(include_filters_rule).strip())
|
||||
has_subtractive_selectors = subtractive_selectors and len(subtractive_selectors[0].strip())
|
||||
|
||||
@@ -173,9 +176,13 @@ class perform_site_check():
|
||||
# Don't run get_text or xpath/css filters on plaintext
|
||||
stripped_text_from_html = html_content
|
||||
else:
|
||||
# Does it have some ld+json price data? used for easier monitoring
|
||||
update_obj['has_ldjson_price_data'] = html_tools.has_ldjson_product_info(fetcher.content)
|
||||
|
||||
# Then we assume HTML
|
||||
if has_filter_rule:
|
||||
html_content = ""
|
||||
|
||||
for filter_rule in include_filters_rule:
|
||||
# For HTML/XML we offer xpath as an option, just start a regular xPath "/.."
|
||||
if filter_rule[0] == '/' or filter_rule.startswith('xpath:'):
|
||||
|
||||
@@ -127,8 +127,10 @@ def _get_stripped_text_from_json_match(match):
|
||||
|
||||
return stripped_text_from_html
|
||||
|
||||
def extract_json_as_string(content, json_filter):
|
||||
|
||||
# content - json
|
||||
# json_filter - ie json:$..price
|
||||
# ensure_is_ldjson_info_type - str "product", optional, "@type == product" (I don't know how to do that as a json selector)
|
||||
def extract_json_as_string(content, json_filter, ensure_is_ldjson_info_type=None):
|
||||
stripped_text_from_html = False
|
||||
|
||||
# Try to parse/filter out the JSON, if we get some parser error, then maybe it's embedded <script type=ldjson>
|
||||
@@ -139,7 +141,12 @@ def extract_json_as_string(content, json_filter):
|
||||
# Foreach <script json></script> blob.. just return the first that matches json_filter
|
||||
s = []
|
||||
soup = BeautifulSoup(content, 'html.parser')
|
||||
bs_result = soup.findAll('script')
|
||||
|
||||
if ensure_is_ldjson_info_type:
|
||||
bs_result = soup.findAll('script', {"type": "application/ld+json"})
|
||||
else:
|
||||
bs_result = soup.findAll('script')
|
||||
|
||||
|
||||
if not bs_result:
|
||||
raise JSONNotFound("No parsable JSON found in this document")
|
||||
@@ -156,7 +163,12 @@ def extract_json_as_string(content, json_filter):
|
||||
continue
|
||||
else:
|
||||
stripped_text_from_html = _parse_json(json_data, json_filter)
|
||||
if stripped_text_from_html:
|
||||
if ensure_is_ldjson_info_type:
|
||||
# Could sometimes be list, string or something else random
|
||||
if isinstance(json_data, dict):
|
||||
if json_data.get('@type', False) and json_data.get('@type','').lower() == ensure_is_ldjson_info_type.lower():
|
||||
break
|
||||
elif stripped_text_from_html:
|
||||
break
|
||||
|
||||
if not stripped_text_from_html:
|
||||
@@ -243,6 +255,18 @@ def html_to_text(html_content: str, render_anchor_tag_content=False) -> str:
|
||||
|
||||
return text_content
|
||||
|
||||
|
||||
# Does LD+JSON exist with a @type=='product' and a .price set anywhere?
|
||||
def has_ldjson_product_info(content):
    """Report whether *content* carries LD+JSON product data with a price.

    Runs ``extract_json_as_string`` with the ``json:$..price`` filter,
    restricted to the ``product`` LD+JSON type.

    Returns:
        ``False`` when the extraction raises ``JSONNotFound``; otherwise the
        truthiness of whatever the extraction returned.
    """
    try:
        found_price = extract_json_as_string(
            content=content,
            json_filter='json:$..price',
            ensure_is_ldjson_info_type="product",
        )
    except JSONNotFound:
        # Totally fine
        return False
    return bool(found_price)
|
||||
|
||||
|
||||
def workarounds_for_obfuscations(content):
|
||||
"""
|
||||
Some sites are using sneaky tactics to make prices and other information un-renderable by Inscriptis
|
||||
|
||||
@@ -26,6 +26,8 @@ class model(dict):
|
||||
'extract_title_as_title': False,
|
||||
'fetch_backend': None,
|
||||
'filter_failure_notification_send': strtobool(os.getenv('FILTER_FAILURE_NOTIFICATION_SEND_DEFAULT', 'True')),
|
||||
'has_ldjson_price_data': None,
|
||||
'track_ldjson_price_data': None,
|
||||
'headers': {}, # Extra headers to send
|
||||
'ignore_text': [], # List of text to ignore when calculating the comparison checksum
|
||||
'include_filters': [],
|
||||
|
||||
@@ -1009,3 +1009,20 @@ ul {
|
||||
border-radius: 5px;
|
||||
color: var(--color-warning);
|
||||
}
|
||||
|
||||
/* automatic price following helpers */
|
||||
.tracking-ldjson-price-data {
|
||||
background-color: var(--color-background-button-green);
|
||||
color: #000;
|
||||
padding: 3px;
|
||||
border-radius: 3px;
|
||||
white-space: nowrap;
|
||||
}
|
||||
|
||||
.ldjson-price-track-offer {
|
||||
a.pure-button {
|
||||
border-radius: 3px;
|
||||
padding: 3px;
|
||||
background-color: var(--color-background-button-green);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -945,3 +945,16 @@ ul {
|
||||
display: inline;
|
||||
height: 26px;
|
||||
vertical-align: middle; }
|
||||
|
||||
/* automatic price following helpers */
|
||||
.tracking-ldjson-price-data {
|
||||
background-color: var(--color-background-button-green);
|
||||
color: #000;
|
||||
padding: 3px;
|
||||
border-radius: 3px;
|
||||
white-space: nowrap; }
|
||||
|
||||
.ldjson-price-track-offer a.pure-button {
|
||||
border-radius: 3px;
|
||||
padding: 3px;
|
||||
background-color: var(--color-background-button-green); }
|
||||
|
||||
@@ -250,12 +250,15 @@ class ChangeDetectionStore:
|
||||
def clear_watch_history(self, uuid):
|
||||
import pathlib
|
||||
|
||||
self.__data['watching'][uuid].update(
|
||||
{'last_checked': 0,
|
||||
'last_viewed': 0,
|
||||
'previous_md5': False,
|
||||
'last_notification_error': False,
|
||||
'last_error': False})
|
||||
self.__data['watching'][uuid].update({
|
||||
'last_checked': 0,
|
||||
'has_ldjson_price_data': None,
|
||||
'last_error': False,
|
||||
'last_notification_error': False,
|
||||
'last_viewed': 0,
|
||||
'previous_md5': False,
|
||||
'track_ldjson_price_data': None,
|
||||
})
|
||||
|
||||
# JSON Data, Screenshots, Textfiles (history index and snapshots), HTML in the future etc
|
||||
for item in pathlib.Path(os.path.join(self.datastore_path, uuid)).rglob("*.*"):
|
||||
|
||||
@@ -98,6 +98,12 @@
|
||||
{% if watch.last_notification_error is defined and watch.last_notification_error != False %}
|
||||
<div class="fetch-error notification-error"><a href="{{url_for('notification_logs')}}">{{ watch.last_notification_error }}</a></div>
|
||||
{% endif %}
|
||||
{% if watch['has_ldjson_price_data'] and not watch['track_ldjson_price_data'] %}
|
||||
<div class="ldjson-price-track-offer">Embedded price data detected, follow only price data? <a href="{{url_for('price_data_follower.accept', uuid=watch.uuid)}}" class="pure-button button-xsmall">Yes</a> <a href="{{url_for('price_data_follower.reject', uuid=watch.uuid)}}" class="">No</a></div>
|
||||
{% endif %}
|
||||
{% if watch['track_ldjson_price_data'] %}
|
||||
<span class="tracking-ldjson-price-data">Price</span>
|
||||
{% endif %}
|
||||
{% if not active_tag %}
|
||||
<span class="watch-tag-list">{{ watch.tag}}</span>
|
||||
{% endif %}
|
||||
|
||||
112
changedetectionio/tests/test_automatic_follow_ldjson_price.py
Normal file
112
changedetectionio/tests/test_automatic_follow_ldjson_price.py
Normal file
@@ -0,0 +1,112 @@
|
||||
#!/usr/bin/python3
|
||||
|
||||
import time
|
||||
from flask import url_for
|
||||
from .util import live_server_setup, extract_UUID_from_client, extract_api_key_from_UI
|
||||
|
||||
from ..html_tools import *
|
||||
|
||||
|
||||
def set_response_with_ldjson():
    """Write an HTML page with an embedded LD+JSON Product block to the test endpoint file."""
    ldjson_page = """<html>
<body>
Some initial text</br>
<p>Which is across multiple lines</p>
</br>
So let's see what happens. </br>
<div class="sametext">Some text thats the same</div>
<div class="changetext">Some text that will change</div>
<script type="application/ld+json">
{
"@context":"https://schema.org/",
"@type":"Product",
"@id":"https://www.some-virtual-phone-shop.com/celular-iphone-14/p",
"name":"Celular Iphone 14 Pro Max 256Gb E Sim A16 Bionic",
"brand":{
"@type":"Brand",
"name":"APPLE"
},
"image":"https://www.some-virtual-phone-shop.com/15509426/image.jpg",
"description":"You dont need it",
"mpn":"111111",
"sku":"22222",
"offers":{
"@type":"AggregateOffer",
"lowPrice":8097000,
"highPrice":8099900,
"priceCurrency":"COP",
"offers":[
{
"@type":"Offer",
"price":8097000,
"priceCurrency":"COP",
"availability":"http://schema.org/InStock",
"sku":"102375961",
"itemCondition":"http://schema.org/NewCondition",
"seller":{
"@type":"Organization",
"name":"ajax"
}
}
],
"offerCount":1
}
}
</script>
</body>
</html>
"""

    # The live test endpoint serves whatever is stored in this file.
    with open("test-datastore/endpoint-content.txt", "w") as endpoint_file:
        endpoint_file.write(ldjson_page)
|
||||
|
||||
|
||||
# End-to-end check of the LD+JSON price offer: detected, offered, accepted, then followed
def test_check_ldjson_price_autodetect(client, live_server):
    """Accepting the embedded-price offer must reduce the snapshot to just the price.

    Steps: import a page containing LD+JSON product data, confirm the offer is
    shown on the index, accept it, re-check, then confirm the offer is replaced
    by the tracking badge and the latest snapshot holds only the price.
    """
    live_server_setup(live_server)

    # Give the endpoint time to spin up
    time.sleep(1)

    set_response_with_ldjson()

    # Add our URL to the import page
    test_url = url_for('test_endpoint', _external=True)
    res = client.post(
        url_for("import_page"),
        data={"urls": test_url},
        follow_redirects=True
    )
    assert b"1 Imported" in res.data
    time.sleep(3)

    # Should get a notice that it's available
    res = client.get(url_for("index"))
    assert b'ldjson-price-track-offer' in res.data

    # Accept it
    uuid = extract_UUID_from_client(client)

    # follow_redirects is an argument of client.get(); handed to url_for() it
    # is merely appended to the URL as a query-string value and the redirect
    # issued by the accept view is never followed.
    client.get(url_for('price_data_follower.accept', uuid=uuid), follow_redirects=True)
    time.sleep(2)

    # Trigger a check
    client.get(url_for("form_watch_checknow"), follow_redirects=True)
    time.sleep(2)

    # Offer should be gone
    res = client.get(url_for("index"))
    assert b'Embedded price data' not in res.data
    assert b'tracking-ldjson-price-data' in res.data

    # and last snapshot (via API) should be just the price
    api_key = extract_api_key_from_UI(client)
    res = client.get(
        url_for("watchsinglehistory", uuid=uuid, timestamp='latest'),
        headers={'x-api-key': api_key},
    )

    # Should just see the price in the API reply
    assert res.data == b'8097000'

    # Clean up so later tests start from an empty watch list
    client.get(url_for("form_delete", uuid="all"), follow_redirects=True)
|
||||
Reference in New Issue
Block a user