mirror of
https://github.com/dgtlmoon/changedetection.io.git
synced 2026-03-05 19:43:25 +00:00
Compare commits
3 Commits
restock-no
...
master
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
9d355b8f05 | ||
|
|
da43a17541 | ||
|
|
904eaaaaf7 |
@@ -305,12 +305,20 @@ html[data-darkmode="true"] .watch-tag-list.tag-{{ class_name }} {
|
||||
{%- endif -%}
|
||||
|
||||
{%- if watch.get('restock') and watch['restock'].get('price') -%}
|
||||
{%- if watch['restock']['price'] is number -%}
|
||||
<span class="restock-label price" title="{{ _('Price') }}">
|
||||
{{ watch['restock']['price']|format_number_locale if watch['restock'].get('price') else '' }} {{ watch['restock'].get('currency','') }}
|
||||
</span>
|
||||
{%- else -%} <!-- watch['restock']['price']' is not a number, cant output it -->
|
||||
{%- set restock = watch['restock'] -%}
|
||||
{%- set price = restock.get('price') -%}
|
||||
{%- set cur = restock.get('currency','') -%}
|
||||
|
||||
{%- if price is not none and (price|string)|regex_search('\d') -%}
|
||||
<span class="restock-label price" title="{{ _('Price') }}">
|
||||
{# @todo: make parse_currency/parse_decimal aware of the locale of the actual web page and use that instead changedetectionio/processors/restock_diff/__init__.py #}
|
||||
{%- if price is number -%}{# It's a number so we can convert it to their locale' #}
|
||||
{{ price|format_number_locale }} {{ cur }}<!-- as number -->
|
||||
{%- else -%}{# It's totally fine if it arrives as something else, the website might be something weird in this field #}
|
||||
{{ price }} {{ cur }}<!-- as string -->
|
||||
{%- endif -%}
|
||||
</span>
|
||||
{%- endif -%}
|
||||
{%- elif not watch.has_restock_info -%}
|
||||
<span class="restock-label error">{{ _('No information') }}</span>
|
||||
{%- endif -%}
|
||||
|
||||
@@ -148,10 +148,32 @@ class fetcher(Fetcher):
|
||||
# Default to UTF-8 for XML if no encoding found
|
||||
r.encoding = 'utf-8'
|
||||
else:
|
||||
# For other content types, use chardet
|
||||
encoding = chardet.detect(r.content)['encoding']
|
||||
if encoding:
|
||||
r.encoding = encoding
|
||||
# No charset in HTTP header - sniff encoding in priority order matching browsers
|
||||
# (WHATWG encoding sniffing algorithm):
|
||||
# 1. BOM - highest confidence, check before anything else
|
||||
# 2. <meta charset> in first 2kb
|
||||
# 3. chardet statistical detection - last resort
|
||||
# See: https://github.com/dgtlmoon/changedetection.io/issues/3952
|
||||
boms = [
|
||||
(b'\xef\xbb\xbf', 'utf-8-sig'),
|
||||
(b'\xff\xfe', 'utf-16-le'),
|
||||
(b'\xfe\xff', 'utf-16-be'),
|
||||
]
|
||||
bom_encoding = next((enc for bom, enc in boms if r.content.startswith(bom)), None)
|
||||
if bom_encoding:
|
||||
logger.info(f"URL: {url} Using encoding '{bom_encoding}' detected from BOM")
|
||||
r.encoding = bom_encoding
|
||||
else:
|
||||
meta_charset_match = re.search(rb'<meta[^>]+charset\s*=\s*["\']?\s*([^"\'\s;>]+)', r.content[:2000], re.IGNORECASE)
|
||||
if meta_charset_match:
|
||||
encoding = meta_charset_match.group(1).decode('ascii', errors='ignore')
|
||||
logger.info(f"URL: {url} No content-type encoding in HTTP headers - Using encoding '{encoding}' from HTML meta charset tag")
|
||||
r.encoding = encoding
|
||||
else:
|
||||
encoding = chardet.detect(r.content)['encoding']
|
||||
logger.warning(f"URL: {url} No charset in headers or meta tag, guessed encoding as '{encoding}' via chardet")
|
||||
if encoding:
|
||||
r.encoding = encoding
|
||||
|
||||
self.headers = r.headers
|
||||
|
||||
|
||||
@@ -217,9 +217,13 @@ def _jinja2_filter_format_number_locale(value: float) -> str:
|
||||
"Formats for example 4000.10 to the local locale default of 4,000.10"
|
||||
# Format the number with two decimal places (locale format string will return 6 decimal)
|
||||
formatted_value = locale.format_string("%.2f", value, grouping=True)
|
||||
|
||||
return formatted_value
|
||||
|
||||
@app.template_filter('regex_search')
|
||||
def _jinja2_filter_regex_search(value, pattern):
|
||||
import re
|
||||
return re.search(pattern, str(value)) is not None
|
||||
|
||||
@app.template_global('is_checking_now')
|
||||
def _watch_is_checking_now(watch_obj, format="%Y-%m-%d %H:%M:%S"):
|
||||
return worker_pool.is_watch_running(watch_obj['uuid'])
|
||||
|
||||
@@ -260,6 +260,16 @@ class difference_detection_processor():
|
||||
# @todo .quit here could go on close object, so we can run JS if change-detected
|
||||
await self.fetcher.quit(watch=self.watch)
|
||||
|
||||
# Sanitize lone surrogates - these can appear when servers return malformed/mixed-encoding
|
||||
# content that gets decoded into surrogate characters (e.g. \udcad). Without this,
|
||||
# encode('utf-8') raises UnicodeEncodeError downstream in checksums, diffs, file writes, etc.
|
||||
# Covers all fetchers (requests, playwright, puppeteer, selenium) in one place.
|
||||
# Also note: By this point we SHOULD know the original encoding so it can safely convert to utf-8 for the rest of the app.
|
||||
# See: https://github.com/dgtlmoon/changedetection.io/issues/3952
|
||||
|
||||
if self.fetcher.content and isinstance(self.fetcher.content, str):
|
||||
self.fetcher.content = self.fetcher.content.encode('utf-8', errors='replace').decode('utf-8')
|
||||
|
||||
# After init, call run_changedetection() which will do the actual change-detection
|
||||
|
||||
def get_extra_watch_config(self, filename):
|
||||
|
||||
@@ -31,6 +31,7 @@ class Restock(dict):
|
||||
|
||||
if standardized_value:
|
||||
# Convert to float
|
||||
# @todo locale needs to be the locale of the webpage
|
||||
return float(parse_decimal(standardized_value, locale='en'))
|
||||
|
||||
return None
|
||||
|
||||
@@ -283,4 +283,7 @@ def query_price_availability(extracted_data):
|
||||
if not result.get('availability') and 'availability' in microdata:
|
||||
result['availability'] = microdata['availability']
|
||||
|
||||
# result['price'] could be float or str here, depending on the website, for example it might contain "1,00" commas, etc.
|
||||
# using something like babel you need to know the locale of the website and even then it can be problematic
|
||||
# we dont really do anything with the price data so far.. so just accept it the way it comes.
|
||||
return result
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
#!/usr/bin/env python3
|
||||
# coding=utf-8
|
||||
|
||||
import hashlib
|
||||
import time
|
||||
from flask import url_for
|
||||
from .util import live_server_setup, wait_for_all_checks, extract_UUID_from_client
|
||||
@@ -11,6 +12,69 @@ import os
|
||||
|
||||
|
||||
|
||||
def test_surrogate_characters_in_content_are_sanitized():
|
||||
"""Lone surrogates can appear in requests' r.text when a server returns malformed/mixed-encoding
|
||||
content. Without sanitization, encoding to UTF-8 raises UnicodeEncodeError.
|
||||
See: https://github.com/dgtlmoon/changedetection.io/issues/3952
|
||||
"""
|
||||
content_with_surrogate = '<html><body>Hello \udcad World</body></html>'
|
||||
|
||||
# Confirm the raw problem exists
|
||||
with pytest.raises(UnicodeEncodeError):
|
||||
content_with_surrogate.encode('utf-8')
|
||||
|
||||
# Our fix: sanitize after fetcher.run() in processors/base.py call_browser()
|
||||
sanitized = content_with_surrogate.encode('utf-8', errors='replace').decode('utf-8')
|
||||
assert 'Hello' in sanitized
|
||||
assert 'World' in sanitized
|
||||
assert '\udcad' not in sanitized
|
||||
|
||||
# Checksum computation (processors/base.py get_raw_document_checksum) must not crash
|
||||
hashlib.md5(sanitized.encode('utf-8')).hexdigest()
|
||||
|
||||
|
||||
def test_utf8_content_without_charset_header(client, live_server, datastore_path):
|
||||
"""Server returns UTF-8 content but no charset in Content-Type header.
|
||||
chardet can misdetect such pages as UTF-7 (Python 3.14 then produces surrogates).
|
||||
Our fix tries UTF-8 first before falling back to chardet.
|
||||
See: https://github.com/dgtlmoon/changedetection.io/issues/3952
|
||||
"""
|
||||
from .util import write_test_file_and_sync
|
||||
# UTF-8 encoded content with non-ASCII chars - no charset will be in the header
|
||||
html = '<html><body><p>Español</p><p>Français</p><p>日本語</p></body></html>'
|
||||
write_test_file_and_sync(os.path.join(datastore_path, "endpoint-content.txt"), html.encode('utf-8'), mode='wb')
|
||||
|
||||
test_url = url_for('test_endpoint', content_type="text/html", _external=True)
|
||||
client.application.config.get('DATASTORE').add_watch(url=test_url)
|
||||
client.get(url_for("ui.form_watch_checknow"), follow_redirects=True)
|
||||
wait_for_all_checks(client)
|
||||
|
||||
res = client.get(url_for("ui.ui_preview.preview_page", uuid="first"), follow_redirects=True)
|
||||
# Should decode correctly as UTF-8, not produce mojibake (Español) or replacement chars
|
||||
assert 'Español'.encode('utf-8') in res.data
|
||||
assert 'Français'.encode('utf-8') in res.data
|
||||
assert '日本語'.encode('utf-8') in res.data
|
||||
|
||||
|
||||
def test_shiftjis_with_meta_charset(client, live_server, datastore_path):
|
||||
"""Server returns Shift-JIS content with no charset in HTTP header, but the HTML
|
||||
declares <meta charset="Shift-JIS">. We should use the meta tag, not chardet.
|
||||
Real-world case: https://github.com/dgtlmoon/changedetection.io/issues/3952
|
||||
"""
|
||||
from .util import write_test_file_and_sync
|
||||
japanese_text = '日本語のページ'
|
||||
html = f'<html><head><meta http-equiv="Content-Type" content="text/html;charset=Shift-JIS"></head><body><p>{japanese_text}</p></body></html>'
|
||||
write_test_file_and_sync(os.path.join(datastore_path, "endpoint-content.txt"), html.encode('shift_jis'), mode='wb')
|
||||
|
||||
test_url = url_for('test_endpoint', content_type="text/html", _external=True)
|
||||
client.application.config.get('DATASTORE').add_watch(url=test_url)
|
||||
client.get(url_for("ui.form_watch_checknow"), follow_redirects=True)
|
||||
wait_for_all_checks(client)
|
||||
|
||||
res = client.get(url_for("ui.ui_preview.preview_page", uuid="first"), follow_redirects=True)
|
||||
assert japanese_text.encode('utf-8') in res.data
|
||||
|
||||
|
||||
def set_html_response(datastore_path):
|
||||
test_return_data = """
|
||||
<html><body><span class="nav_second_img_text">
|
||||
|
||||
@@ -467,3 +467,38 @@ def test_special_prop_examples(client, live_server, measure_memory_usage, datast
|
||||
assert b'155.55' in res.data
|
||||
|
||||
delete_all_watches(client)
|
||||
|
||||
|
||||
def test_itemprop_as_str(client, live_server, measure_memory_usage, datastore_path):
|
||||
|
||||
test_return_data = f"""<html>
|
||||
<body>
|
||||
Some initial text<br>
|
||||
<p>Which is across multiple lines</p>
|
||||
<span itemprop="offers" itemscope itemtype="http://schema.org/Offer">
|
||||
<meta content="767.55" itemprop="price"/>
|
||||
<meta content="EUR" itemprop="priceCurrency"/>
|
||||
<meta content="InStock" itemprop="availability"/>
|
||||
<meta content="https://www.123-test.dk" itemprop="url"/>
|
||||
</span>
|
||||
</body>
|
||||
</html>
|
||||
"""
|
||||
|
||||
with open(os.path.join(datastore_path, "endpoint-content.txt"), "w") as f:
|
||||
f.write(test_return_data)
|
||||
|
||||
|
||||
test_url = url_for('test_endpoint', _external=True)
|
||||
|
||||
client.post(
|
||||
url_for("ui.ui_views.form_quick_watch_add"),
|
||||
data={"url": test_url, "tags": 'restock tests', 'processor': 'restock_diff'},
|
||||
follow_redirects=True
|
||||
)
|
||||
|
||||
client.get(url_for("ui.form_watch_checknow"))
|
||||
wait_for_all_checks(client)
|
||||
|
||||
res = client.get(url_for("watchlist.index"))
|
||||
assert b'767.55' in res.data
|
||||
Reference in New Issue
Block a user