mirror of
https://github.com/dgtlmoon/changedetection.io.git
synced 2025-11-22 01:16:12 +00:00
Compare commits
5 Commits
PDF-diff-i
...
stats-tab
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
55783c97ac | ||
|
|
52225f2ad8 | ||
|
|
6ec73bc879 | ||
|
|
7220afab0a | ||
|
|
1c0fe4c23e |
@@ -98,10 +98,7 @@ def xpath_filter(xpath_filter, html_content, append_pretty_line_formatting=False
|
|||||||
elif type(element) == etree._ElementUnicodeResult:
|
elif type(element) == etree._ElementUnicodeResult:
|
||||||
html_block += str(element)
|
html_block += str(element)
|
||||||
else:
|
else:
|
||||||
if not is_rss:
|
|
||||||
html_block += etree.tostring(element, pretty_print=True).decode('utf-8')
|
html_block += etree.tostring(element, pretty_print=True).decode('utf-8')
|
||||||
else:
|
|
||||||
html_block += f"<div>{element.text}</div>\n"
|
|
||||||
|
|
||||||
return html_block
|
return html_block
|
||||||
|
|
||||||
@@ -274,7 +271,7 @@ def cdata_in_document_to_text(html_content: str, render_anchor_tag_content=False
|
|||||||
pattern = '<!\[CDATA\[(\s*(?:.(?<!\]\]>)\s*)*)\]\]>'
|
pattern = '<!\[CDATA\[(\s*(?:.(?<!\]\]>)\s*)*)\]\]>'
|
||||||
def repl(m):
|
def repl(m):
|
||||||
text = m.group(1)
|
text = m.group(1)
|
||||||
return xml_escape(html_to_text(html_content=text))
|
return xml_escape(html_to_text(html_content=text)).strip()
|
||||||
|
|
||||||
return re.sub(pattern, repl, html_content)
|
return re.sub(pattern, repl, html_content)
|
||||||
|
|
||||||
@@ -295,7 +292,8 @@ def html_to_text(html_content: str, render_anchor_tag_content=False, is_rss=Fals
|
|||||||
# extracting this content
|
# extracting this content
|
||||||
if render_anchor_tag_content:
|
if render_anchor_tag_content:
|
||||||
parser_config = ParserConfig(
|
parser_config = ParserConfig(
|
||||||
annotation_rules={"a": ["hyperlink"]}, display_links=True
|
annotation_rules={"a": ["hyperlink"]},
|
||||||
|
display_links=True
|
||||||
)
|
)
|
||||||
# otherwise set config to None/default
|
# otherwise set config to None/default
|
||||||
else:
|
else:
|
||||||
@@ -303,12 +301,11 @@ def html_to_text(html_content: str, render_anchor_tag_content=False, is_rss=Fals
|
|||||||
|
|
||||||
# RSS Mode - Inscriptis will treat `title` as something else.
|
# RSS Mode - Inscriptis will treat `title` as something else.
|
||||||
# Make it as a regular block display element (//item/title)
|
# Make it as a regular block display element (//item/title)
|
||||||
|
# This is a bit of a hack - the real way is to use XSLT to convert it to HTML #1874
|
||||||
if is_rss:
|
if is_rss:
|
||||||
css = CSS_PROFILES['strict'].copy()
|
html_content = re.sub(r'<title([\s>])', r'<h1\1', html_content)
|
||||||
css['title'] = HtmlElement(display=Display.block)
|
html_content = re.sub(r'</title>', r'</h1>', html_content)
|
||||||
text_content = get_text(html_content, ParserConfig(css=css))
|
|
||||||
else:
|
|
||||||
# get text and annotations via inscriptis
|
|
||||||
text_content = get_text(html_content, config=parser_config)
|
text_content = get_text(html_content, config=parser_config)
|
||||||
|
|
||||||
return text_content
|
return text_content
|
||||||
|
|||||||
@@ -26,6 +26,7 @@ base_config = {
|
|||||||
'extract_title_as_title': False,
|
'extract_title_as_title': False,
|
||||||
'fetch_backend': 'system', # plaintext, playwright etc
|
'fetch_backend': 'system', # plaintext, playwright etc
|
||||||
'processor': 'text_json_diff', # could be restock_diff or others from .processors
|
'processor': 'text_json_diff', # could be restock_diff or others from .processors
|
||||||
|
'fetch_time': 0.0,
|
||||||
'filter_failure_notification_send': strtobool(os.getenv('FILTER_FAILURE_NOTIFICATION_SEND_DEFAULT', 'True')),
|
'filter_failure_notification_send': strtobool(os.getenv('FILTER_FAILURE_NOTIFICATION_SEND_DEFAULT', 'True')),
|
||||||
'filter_text_added': True,
|
'filter_text_added': True,
|
||||||
'filter_text_replaced': True,
|
'filter_text_replaced': True,
|
||||||
|
|||||||
@@ -274,7 +274,7 @@ class perform_site_check(difference_detection_processor):
|
|||||||
html_tools.html_to_text(
|
html_tools.html_to_text(
|
||||||
html_content=html_content,
|
html_content=html_content,
|
||||||
render_anchor_tag_content=do_anchor,
|
render_anchor_tag_content=do_anchor,
|
||||||
is_rss=is_rss
|
is_rss=is_rss # #1874 activate the <title workaround hack
|
||||||
)
|
)
|
||||||
|
|
||||||
# Re #340 - return the content before the 'ignore text' was applied
|
# Re #340 - return the content before the 'ignore text' was applied
|
||||||
|
|||||||
@@ -244,12 +244,15 @@ class ChangeDetectionStore:
|
|||||||
import pathlib
|
import pathlib
|
||||||
|
|
||||||
self.__data['watching'][uuid].update({
|
self.__data['watching'][uuid].update({
|
||||||
'last_checked': 0,
|
'check_count': 0,
|
||||||
|
'fetch_time' : 0.0,
|
||||||
'has_ldjson_price_data': None,
|
'has_ldjson_price_data': None,
|
||||||
|
'last_checked': 0,
|
||||||
'last_error': False,
|
'last_error': False,
|
||||||
'last_notification_error': False,
|
'last_notification_error': False,
|
||||||
'last_viewed': 0,
|
'last_viewed': 0,
|
||||||
'previous_md5': False,
|
'previous_md5': False,
|
||||||
|
'previous_md5_before_filters': False,
|
||||||
'track_ldjson_price_data': None,
|
'track_ldjson_price_data': None,
|
||||||
})
|
})
|
||||||
|
|
||||||
|
|||||||
@@ -49,6 +49,7 @@
|
|||||||
<li class="tab"><a href="#restock">Restock Detection</a></li>
|
<li class="tab"><a href="#restock">Restock Detection</a></li>
|
||||||
{% endif %}
|
{% endif %}
|
||||||
<li class="tab"><a href="#notifications">Notifications</a></li>
|
<li class="tab"><a href="#notifications">Notifications</a></li>
|
||||||
|
<li class="tab"><a href="#stats">Stats</a></li>
|
||||||
</ul>
|
</ul>
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
@@ -441,7 +442,35 @@ Unavailable") }}
|
|||||||
</fieldset>
|
</fieldset>
|
||||||
</div>
|
</div>
|
||||||
{% endif %}
|
{% endif %}
|
||||||
|
<div class="tab-pane-inner" id="stats">
|
||||||
|
<div class="pure-control-group">
|
||||||
|
<style>
|
||||||
|
#stats-table tr > td:first-child {
|
||||||
|
font-weight: bold;
|
||||||
|
}
|
||||||
|
</style>
|
||||||
|
<table class="pure-table" id="stats-table">
|
||||||
|
<tbody>
|
||||||
|
<tr>
|
||||||
|
<td>Check count</td>
|
||||||
|
<td>{{ watch.check_count }}</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td>Consecutive filter failures</td>
|
||||||
|
<td>{{ watch.consecutive_filter_failures }}</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td>History length</td>
|
||||||
|
<td>{{ watch.history|length }}</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td>Last fetch time</td>
|
||||||
|
<td>{{ watch.fetch_time }}s</td>
|
||||||
|
</tr>
|
||||||
|
</tbody>
|
||||||
|
</table>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
<div id="actions">
|
<div id="actions">
|
||||||
<div class="pure-control-group">
|
<div class="pure-control-group">
|
||||||
{{ render_button(form.save_button) }}
|
{{ render_button(form.save_button) }}
|
||||||
|
|||||||
@@ -154,6 +154,9 @@ def test_rss_xpath_filtering(client, live_server):
|
|||||||
)
|
)
|
||||||
assert b'CDATA' not in res.data
|
assert b'CDATA' not in res.data
|
||||||
assert b'<![' not in res.data
|
assert b'<![' not in res.data
|
||||||
|
# #1874 All but the first <title was getting selected
|
||||||
|
# Convert any HTML with just a top level <title> to <h1> to be sure title renders
|
||||||
|
|
||||||
assert b'Hackers can access your computer' in res.data # Should ONLY be selected by the xpath
|
assert b'Hackers can access your computer' in res.data # Should ONLY be selected by the xpath
|
||||||
assert b'Some other title' in res.data # Should ONLY be selected by the xpath
|
assert b'Some other title' in res.data # Should ONLY be selected by the xpath
|
||||||
assert b'The days of Terminator' not in res.data # Should NOT be selected by the xpath
|
assert b'The days of Terminator' not in res.data # Should NOT be selected by the xpath
|
||||||
|
|||||||
Reference in New Issue
Block a user