mirror of
https://github.com/dgtlmoon/changedetection.io.git
synced 2025-11-09 11:06:47 +00:00
Compare commits
4 Commits
openai-int
...
auto-sugge
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
2d65d49bb7 | ||
|
|
300f55b0a2 | ||
|
|
b437df7787 | ||
|
|
ab1b87893a |
@@ -1343,6 +1343,10 @@ def changedetection_app(config=None, datastore_o=None):
|
|||||||
import changedetectionio.blueprint.browser_steps as browser_steps
|
import changedetectionio.blueprint.browser_steps as browser_steps
|
||||||
app.register_blueprint(browser_steps.construct_blueprint(datastore), url_prefix='/browser-steps')
|
app.register_blueprint(browser_steps.construct_blueprint(datastore), url_prefix='/browser-steps')
|
||||||
|
|
||||||
|
import changedetectionio.blueprint.price_data_follower as price_data_follower
|
||||||
|
app.register_blueprint(price_data_follower.construct_blueprint(datastore), url_prefix='/price_data_follower')
|
||||||
|
|
||||||
|
|
||||||
# @todo handle ctrl break
|
# @todo handle ctrl break
|
||||||
ticker_thread = threading.Thread(target=ticker_thread_check_time_launch_checks).start()
|
ticker_thread = threading.Thread(target=ticker_thread_check_time_launch_checks).start()
|
||||||
threading.Thread(target=notification_runner).start()
|
threading.Thread(target=notification_runner).start()
|
||||||
|
|||||||
27
changedetectionio/blueprint/price_data_follower/__init__.py
Normal file
27
changedetectionio/blueprint/price_data_follower/__init__.py
Normal file
@@ -0,0 +1,27 @@
|
|||||||
|
|
||||||
|
from distutils.util import strtobool
|
||||||
|
from flask import Blueprint, flash, redirect, url_for
|
||||||
|
from flask_login import login_required
|
||||||
|
from changedetectionio.store import ChangeDetectionStore
|
||||||
|
|
||||||
|
def construct_blueprint(datastore: ChangeDetectionStore):
    """Build the blueprint that records the answer to the "follow only price data?" offer.

    Two GET endpoints are registered, both keyed by the watch UUID:

    * ``/<uuid>/accept`` stores ``'accepted'`` in the watch's
      ``track_ldjson_price_data`` field and redirects to
      ``form_watch_checknow`` for that watch.
    * ``/<uuid>/reject`` stores ``'rejected'`` in the same field and
      redirects to ``index``.

    :param datastore: store whose ``data['watching'][uuid]`` entry is updated.
    :return: the configured ``price_data_follower`` blueprint.
    """
    price_data_follower_blueprint = Blueprint('price_data_follower', __name__)

    # route() registers exactly the function it is handed, so it has to be the
    # outermost decorator; with @login_required stacked above it the login
    # check would wrap a function that is never the one being served.
    @price_data_follower_blueprint.route("/<string:uuid>/accept", methods=['GET'])
    @login_required
    def accept(uuid):
        datastore.data['watching'][uuid]['track_ldjson_price_data'] = 'accepted'
        return redirect(url_for("form_watch_checknow", uuid=uuid))

    @price_data_follower_blueprint.route("/<string:uuid>/reject", methods=['GET'])
    @login_required
    def reject(uuid):
        datastore.data['watching'][uuid]['track_ldjson_price_data'] = 'rejected'
        return redirect(url_for("index"))

    return price_data_follower_blueprint
|
||||||
|
|
||||||
|
|
||||||
@@ -2,7 +2,6 @@ import hashlib
|
|||||||
import logging
|
import logging
|
||||||
import os
|
import os
|
||||||
import re
|
import re
|
||||||
import time
|
|
||||||
import urllib3
|
import urllib3
|
||||||
|
|
||||||
from changedetectionio import content_fetcher, html_tools
|
from changedetectionio import content_fetcher, html_tools
|
||||||
@@ -140,7 +139,7 @@ class perform_site_check():
|
|||||||
is_html = False
|
is_html = False
|
||||||
is_json = False
|
is_json = False
|
||||||
|
|
||||||
include_filters_rule = watch.get('include_filters', [])
|
include_filters_rule = deepcopy(watch.get('include_filters', []))
|
||||||
# include_filters_rule = watch['include_filters']
|
# include_filters_rule = watch['include_filters']
|
||||||
subtractive_selectors = watch.get(
|
subtractive_selectors = watch.get(
|
||||||
"subtractive_selectors", []
|
"subtractive_selectors", []
|
||||||
@@ -148,6 +147,10 @@ class perform_site_check():
|
|||||||
"global_subtractive_selectors", []
|
"global_subtractive_selectors", []
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# Inject a virtual LD+JSON price tracker rule
|
||||||
|
if watch.get('track_ldjson_price_data'):
|
||||||
|
include_filters_rule.append('json:$..price')
|
||||||
|
|
||||||
has_filter_rule = include_filters_rule and len("".join(include_filters_rule).strip())
|
has_filter_rule = include_filters_rule and len("".join(include_filters_rule).strip())
|
||||||
has_subtractive_selectors = subtractive_selectors and len(subtractive_selectors[0].strip())
|
has_subtractive_selectors = subtractive_selectors and len(subtractive_selectors[0].strip())
|
||||||
|
|
||||||
@@ -173,9 +176,13 @@ class perform_site_check():
|
|||||||
# Don't run get_text or xpath/css filters on plaintext
|
# Don't run get_text or xpath/css filters on plaintext
|
||||||
stripped_text_from_html = html_content
|
stripped_text_from_html = html_content
|
||||||
else:
|
else:
|
||||||
|
# Does it have some ld+json price data? used for easier monitoring
|
||||||
|
update_obj['has_ldjson_price_data'] = html_tools.has_ldjson_product_info(fetcher.content)
|
||||||
|
|
||||||
# Then we assume HTML
|
# Then we assume HTML
|
||||||
if has_filter_rule:
|
if has_filter_rule:
|
||||||
html_content = ""
|
html_content = ""
|
||||||
|
|
||||||
for filter_rule in include_filters_rule:
|
for filter_rule in include_filters_rule:
|
||||||
# For HTML/XML we offer xpath as an option, just start a regular xPath "/.."
|
# For HTML/XML we offer xpath as an option, just start a regular xPath "/.."
|
||||||
if filter_rule[0] == '/' or filter_rule.startswith('xpath:'):
|
if filter_rule[0] == '/' or filter_rule.startswith('xpath:'):
|
||||||
|
|||||||
@@ -127,8 +127,10 @@ def _get_stripped_text_from_json_match(match):
|
|||||||
|
|
||||||
return stripped_text_from_html
|
return stripped_text_from_html
|
||||||
|
|
||||||
def extract_json_as_string(content, json_filter):
|
# content - json
|
||||||
|
# json_filter - ie json:$..price
|
||||||
|
# ensure_is_ldjson_info_type - str "product", optional, "@type == product" (I don't know how to do that as a json selector)
|
||||||
|
def extract_json_as_string(content, json_filter, ensure_is_ldjson_info_type=None):
|
||||||
stripped_text_from_html = False
|
stripped_text_from_html = False
|
||||||
|
|
||||||
# Try to parse/filter out the JSON, if we get some parser error, then maybe it's embedded <script type=ldjson>
|
# Try to parse/filter out the JSON, if we get some parser error, then maybe it's embedded <script type=ldjson>
|
||||||
@@ -139,7 +141,12 @@ def extract_json_as_string(content, json_filter):
|
|||||||
# Foreach <script json></script> blob.. just return the first that matches json_filter
|
# Foreach <script json></script> blob.. just return the first that matches json_filter
|
||||||
s = []
|
s = []
|
||||||
soup = BeautifulSoup(content, 'html.parser')
|
soup = BeautifulSoup(content, 'html.parser')
|
||||||
bs_result = soup.findAll('script')
|
|
||||||
|
if ensure_is_ldjson_info_type:
|
||||||
|
bs_result = soup.findAll('script', {"type": "application/ld+json"})
|
||||||
|
else:
|
||||||
|
bs_result = soup.findAll('script')
|
||||||
|
|
||||||
|
|
||||||
if not bs_result:
|
if not bs_result:
|
||||||
raise JSONNotFound("No parsable JSON found in this document")
|
raise JSONNotFound("No parsable JSON found in this document")
|
||||||
@@ -156,7 +163,12 @@ def extract_json_as_string(content, json_filter):
|
|||||||
continue
|
continue
|
||||||
else:
|
else:
|
||||||
stripped_text_from_html = _parse_json(json_data, json_filter)
|
stripped_text_from_html = _parse_json(json_data, json_filter)
|
||||||
if stripped_text_from_html:
|
if ensure_is_ldjson_info_type:
|
||||||
|
# Could sometimes be list, string or something else random
|
||||||
|
if isinstance(json_data, dict):
|
||||||
|
if json_data.get('@type', False) and json_data.get('@type','').lower() == ensure_is_ldjson_info_type.lower():
|
||||||
|
break
|
||||||
|
elif stripped_text_from_html:
|
||||||
break
|
break
|
||||||
|
|
||||||
if not stripped_text_from_html:
|
if not stripped_text_from_html:
|
||||||
@@ -243,6 +255,18 @@ def html_to_text(html_content: str, render_anchor_tag_content=False) -> str:
|
|||||||
|
|
||||||
return text_content
|
return text_content
|
||||||
|
|
||||||
|
|
||||||
|
# Does LD+JSON exist with a @type=='product' and a .price set anywhere?
def has_ldjson_product_info(content):
    """Report whether `content` yields price data from a "product" LD+JSON lookup.

    Runs `extract_json_as_string` with the `json:$..price` filter and the
    "product" type restriction, and returns the truthiness of its result.
    A `JSONNotFound` from that lookup is treated as "no price data".
    """
    try:
        found = extract_json_as_string(
            content=content,
            json_filter='json:$..price',
            ensure_is_ldjson_info_type="product",
        )
    except JSONNotFound:
        # Totally fine
        return False

    return bool(found)
|
||||||
|
|
||||||
|
|
||||||
def workarounds_for_obfuscations(content):
|
def workarounds_for_obfuscations(content):
|
||||||
"""
|
"""
|
||||||
Some sites are using sneaky tactics to make prices and other information un-renderable by Inscriptis
|
Some sites are using sneaky tactics to make prices and other information un-renderable by Inscriptis
|
||||||
|
|||||||
@@ -26,6 +26,8 @@ class model(dict):
|
|||||||
'extract_title_as_title': False,
|
'extract_title_as_title': False,
|
||||||
'fetch_backend': None,
|
'fetch_backend': None,
|
||||||
'filter_failure_notification_send': strtobool(os.getenv('FILTER_FAILURE_NOTIFICATION_SEND_DEFAULT', 'True')),
|
'filter_failure_notification_send': strtobool(os.getenv('FILTER_FAILURE_NOTIFICATION_SEND_DEFAULT', 'True')),
|
||||||
|
'has_ldjson_price_data': None,
|
||||||
|
'track_ldjson_price_data': None,
|
||||||
'headers': {}, # Extra headers to send
|
'headers': {}, # Extra headers to send
|
||||||
'ignore_text': [], # List of text to ignore when calculating the comparison checksum
|
'ignore_text': [], # List of text to ignore when calculating the comparison checksum
|
||||||
'include_filters': [],
|
'include_filters': [],
|
||||||
|
|||||||
@@ -1009,3 +1009,20 @@ ul {
|
|||||||
border-radius: 5px;
|
border-radius: 5px;
|
||||||
color: var(--color-warning);
|
color: var(--color-warning);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* automatic price following helpers */
|
||||||
|
.tracking-ldjson-price-data {
|
||||||
|
background-color: var(--color-background-button-green);
|
||||||
|
color: #000;
|
||||||
|
padding: 3px;
|
||||||
|
border-radius: 3px;
|
||||||
|
white-space: nowrap;
|
||||||
|
}
|
||||||
|
|
||||||
|
.ldjson-price-track-offer {
|
||||||
|
a.pure-button {
|
||||||
|
border-radius: 3px;
|
||||||
|
padding: 3px;
|
||||||
|
background-color: var(--color-background-button-green);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|||||||
@@ -945,3 +945,16 @@ ul {
|
|||||||
display: inline;
|
display: inline;
|
||||||
height: 26px;
|
height: 26px;
|
||||||
vertical-align: middle; }
|
vertical-align: middle; }
|
||||||
|
|
||||||
|
/* automatic price following helpers */
|
||||||
|
.tracking-ldjson-price-data {
|
||||||
|
background-color: var(--color-background-button-green);
|
||||||
|
color: #000;
|
||||||
|
padding: 3px;
|
||||||
|
border-radius: 3px;
|
||||||
|
white-space: nowrap; }
|
||||||
|
|
||||||
|
.ldjson-price-track-offer a.pure-button {
|
||||||
|
border-radius: 3px;
|
||||||
|
padding: 3px;
|
||||||
|
background-color: var(--color-background-button-green); }
|
||||||
|
|||||||
@@ -250,12 +250,15 @@ class ChangeDetectionStore:
|
|||||||
def clear_watch_history(self, uuid):
|
def clear_watch_history(self, uuid):
|
||||||
import pathlib
|
import pathlib
|
||||||
|
|
||||||
self.__data['watching'][uuid].update(
|
self.__data['watching'][uuid].update({
|
||||||
{'last_checked': 0,
|
'last_checked': 0,
|
||||||
'last_viewed': 0,
|
'has_ldjson_price_data': None,
|
||||||
'previous_md5': False,
|
'last_error': False,
|
||||||
'last_notification_error': False,
|
'last_notification_error': False,
|
||||||
'last_error': False})
|
'last_viewed': 0,
|
||||||
|
'previous_md5': False,
|
||||||
|
'track_ldjson_price_data': None,
|
||||||
|
})
|
||||||
|
|
||||||
# JSON Data, Screenshots, Textfiles (history index and snapshots), HTML in the future etc
|
# JSON Data, Screenshots, Textfiles (history index and snapshots), HTML in the future etc
|
||||||
for item in pathlib.Path(os.path.join(self.datastore_path, uuid)).rglob("*.*"):
|
for item in pathlib.Path(os.path.join(self.datastore_path, uuid)).rglob("*.*"):
|
||||||
|
|||||||
@@ -98,6 +98,12 @@
|
|||||||
{% if watch.last_notification_error is defined and watch.last_notification_error != False %}
|
{% if watch.last_notification_error is defined and watch.last_notification_error != False %}
|
||||||
<div class="fetch-error notification-error"><a href="{{url_for('notification_logs')}}">{{ watch.last_notification_error }}</a></div>
|
<div class="fetch-error notification-error"><a href="{{url_for('notification_logs')}}">{{ watch.last_notification_error }}</a></div>
|
||||||
{% endif %}
|
{% endif %}
|
||||||
|
{% if watch['has_ldjson_price_data'] and not watch['track_ldjson_price_data'] %}
|
||||||
|
<div class="ldjson-price-track-offer">Embedded price data detected, follow only price data? <a href="{{url_for('price_data_follower.accept', uuid=watch.uuid)}}" class="pure-button button-xsmall">Yes</a> <a href="{{url_for('price_data_follower.reject', uuid=watch.uuid)}}" class="">No</a></div>
|
||||||
|
{% endif %}
|
||||||
|
{% if watch['track_ldjson_price_data'] %}
|
||||||
|
<span class="tracking-ldjson-price-data">Price</span>
|
||||||
|
{% endif %}
|
||||||
{% if not active_tag %}
|
{% if not active_tag %}
|
||||||
<span class="watch-tag-list">{{ watch.tag}}</span>
|
<span class="watch-tag-list">{{ watch.tag}}</span>
|
||||||
{% endif %}
|
{% endif %}
|
||||||
|
|||||||
112
changedetectionio/tests/test_automatic_follow_ldjson_price.py
Normal file
112
changedetectionio/tests/test_automatic_follow_ldjson_price.py
Normal file
@@ -0,0 +1,112 @@
|
|||||||
|
#!/usr/bin/python3
|
||||||
|
|
||||||
|
import time
|
||||||
|
from flask import url_for
|
||||||
|
from .util import live_server_setup, extract_UUID_from_client, extract_api_key_from_UI
|
||||||
|
|
||||||
|
from ..html_tools import *
|
||||||
|
|
||||||
|
|
||||||
|
def set_response_with_ldjson():
    """Write an HTML page with an embedded LD+JSON Product block to the test endpoint file."""
    page_html = """<html>
<body>
Some initial text</br>
<p>Which is across multiple lines</p>
</br>
So let's see what happens. </br>
<div class="sametext">Some text thats the same</div>
<div class="changetext">Some text that will change</div>
<script type="application/ld+json">
{
"@context":"https://schema.org/",
"@type":"Product",
"@id":"https://www.some-virtual-phone-shop.com/celular-iphone-14/p",
"name":"Celular Iphone 14 Pro Max 256Gb E Sim A16 Bionic",
"brand":{
"@type":"Brand",
"name":"APPLE"
},
"image":"https://www.some-virtual-phone-shop.com/15509426/image.jpg",
"description":"You dont need it",
"mpn":"111111",
"sku":"22222",
"offers":{
"@type":"AggregateOffer",
"lowPrice":8097000,
"highPrice":8099900,
"priceCurrency":"COP",
"offers":[
{
"@type":"Offer",
"price":8097000,
"priceCurrency":"COP",
"availability":"http://schema.org/InStock",
"sku":"102375961",
"itemCondition":"http://schema.org/NewCondition",
"seller":{
"@type":"Organization",
"name":"ajax"
}
}
],
"offerCount":1
}
}
</script>
</body>
</html>
"""

    # The file written here is the one this test module uses as endpoint content.
    with open("test-datastore/endpoint-content.txt", "w") as handle:
        handle.write(page_html)
|
||||||
|
|
||||||
|
|
||||||
|
def test_check_ldjson_price_autodetect(client, live_server):
    """End-to-end check of the embedded LD+JSON price-tracking offer.

    Imports a watch whose page carries a Product LD+JSON block, expects the
    offer to show on the index page, accepts it, and then expects the offer to
    be replaced by the price badge and the latest snapshot to be only the price.
    """
    live_server_setup(live_server)

    # Give the endpoint time to spin up
    time.sleep(1)

    set_response_with_ldjson()

    # Add our URL to the import page
    test_url = url_for('test_endpoint', _external=True)
    res = client.post(
        url_for("import_page"),
        data={"urls": test_url},
        follow_redirects=True
    )
    assert b"1 Imported" in res.data
    time.sleep(3)

    # Should get a notice that it's available
    res = client.get(url_for("index"))
    assert b'ldjson-price-track-offer' in res.data

    # Accept it
    uuid = extract_UUID_from_client(client)

    # follow_redirects is an option of the test client, not of url_for();
    # handed to url_for() it only becomes a query-string argument and the
    # redirect issued by the accept endpoint is never followed.
    client.get(url_for('price_data_follower.accept', uuid=uuid), follow_redirects=True)
    time.sleep(2)

    # Trigger a check
    client.get(url_for("form_watch_checknow"), follow_redirects=True)
    time.sleep(2)

    # Offer should be gone
    res = client.get(url_for("index"))
    assert b'Embedded price data' not in res.data
    assert b'tracking-ldjson-price-data' in res.data

    # and last snapshot (via API) should be just the price
    api_key = extract_api_key_from_UI(client)
    res = client.get(
        url_for("watchsinglehistory", uuid=uuid, timestamp='latest'),
        headers={'x-api-key': api_key},
    )

    # Should just see the price in the API reply
    assert res.data == b'8097000'

    client.get(url_for("form_delete", uuid="all"), follow_redirects=True)
|
||||||
Reference in New Issue
Block a user