Compare commits

..

5 Commits

6 changed files with 49 additions and 16 deletions

View File

@@ -98,10 +98,7 @@ def xpath_filter(xpath_filter, html_content, append_pretty_line_formatting=False
elif type(element) == etree._ElementUnicodeResult:
html_block += str(element)
else:
if not is_rss:
html_block += etree.tostring(element, pretty_print=True).decode('utf-8')
else:
html_block += f"<div>{element.text}</div>\n"
html_block += etree.tostring(element, pretty_print=True).decode('utf-8')
return html_block
@@ -274,7 +271,7 @@ def cdata_in_document_to_text(html_content: str, render_anchor_tag_content=False
pattern = '<!\[CDATA\[(\s*(?:.(?<!\]\]>)\s*)*)\]\]>'
def repl(m):
text = m.group(1)
return xml_escape(html_to_text(html_content=text))
return xml_escape(html_to_text(html_content=text)).strip()
return re.sub(pattern, repl, html_content)
@@ -295,7 +292,8 @@ def html_to_text(html_content: str, render_anchor_tag_content=False, is_rss=Fals
# extracting this content
if render_anchor_tag_content:
parser_config = ParserConfig(
annotation_rules={"a": ["hyperlink"]}, display_links=True
annotation_rules={"a": ["hyperlink"]},
display_links=True
)
# otherwise set config to None/default
else:
@@ -303,13 +301,12 @@ def html_to_text(html_content: str, render_anchor_tag_content=False, is_rss=Fals
# RSS Mode - Inscriptis will treat `title` as something else.
# Make it as a regular block display element (//item/title)
# This is a bit of a hack - the real way is to use XSLT to convert it to HTML #1874
if is_rss:
css = CSS_PROFILES['strict'].copy()
css['title'] = HtmlElement(display=Display.block)
text_content = get_text(html_content, ParserConfig(css=css))
else:
# get text and annotations via inscriptis
text_content = get_text(html_content, config=parser_config)
html_content = re.sub(r'<title([\s>])', r'<h1\1', html_content)
html_content = re.sub(r'</title>', r'</h1>', html_content)
text_content = get_text(html_content, config=parser_config)
return text_content

View File

@@ -26,6 +26,7 @@ base_config = {
'extract_title_as_title': False,
'fetch_backend': 'system', # plaintext, playwright etc
'processor': 'text_json_diff', # could be restock_diff or others from .processors
'fetch_time': 0.0,
'filter_failure_notification_send': strtobool(os.getenv('FILTER_FAILURE_NOTIFICATION_SEND_DEFAULT', 'True')),
'filter_text_added': True,
'filter_text_replaced': True,

View File

@@ -274,7 +274,7 @@ class perform_site_check(difference_detection_processor):
html_tools.html_to_text(
html_content=html_content,
render_anchor_tag_content=do_anchor,
is_rss=is_rss
is_rss=is_rss # #1874 activate the <title workaround hack
)
# Re #340 - return the content before the 'ignore text' was applied

View File

@@ -244,12 +244,15 @@ class ChangeDetectionStore:
import pathlib
self.__data['watching'][uuid].update({
'last_checked': 0,
'check_count': 0,
'fetch_time' : 0.0,
'has_ldjson_price_data': None,
'last_checked': 0,
'last_error': False,
'last_notification_error': False,
'last_viewed': 0,
'previous_md5': False,
'previous_md5_before_filters': False,
'track_ldjson_price_data': None,
})

View File

@@ -49,6 +49,7 @@
<li class="tab"><a href="#restock">Restock Detection</a></li>
{% endif %}
<li class="tab"><a href="#notifications">Notifications</a></li>
<li class="tab"><a href="#stats">Stats</a></li>
</ul>
</div>
@@ -441,7 +442,35 @@ Unavailable") }}
</fieldset>
</div>
{% endif %}
<div class="tab-pane-inner" id="stats">
<div class="pure-control-group">
<style>
#stats-table tr > td:first-child {
font-weight: bold;
}
</style>
<table class="pure-table" id="stats-table">
<tbody>
<tr>
<td>Check count</td>
<td>{{ watch.check_count }}</td>
</tr>
<tr>
<td>Consecutive filter failures</td>
<td>{{ watch.consecutive_filter_failures }}</td>
</tr>
<tr>
<td>History length</td>
<td>{{ watch.history|length }}</td>
</tr>
<tr>
<td>Last fetch time</td>
<td>{{ watch.fetch_time }}s</td>
</tr>
</tbody>
</table>
</div>
</div>
<div id="actions">
<div class="pure-control-group">
{{ render_button(form.save_button) }}

View File

@@ -118,7 +118,7 @@ def test_basic_cdata_rss_markup(client, live_server):
res = client.get(url_for("form_delete", uuid="all"), follow_redirects=True)
def test_rss_xpath_filtering(client, live_server):
# live_server_setup(live_server)
#live_server_setup(live_server)
set_original_cdata_xml()
@@ -154,6 +154,9 @@ def test_rss_xpath_filtering(client, live_server):
)
assert b'CDATA' not in res.data
assert b'<![' not in res.data
# #1874 All but the first <title was getting selected
# Convert any HTML with just a top level <title> to <h1> to be sure title renders
assert b'Hackers can access your computer' in res.data # Should ONLY be selected by the xpath
assert b'Some other title' in res.data # Should ONLY be selected by the xpath
assert b'The days of Terminator' not in res.data # Should NOT be selected by the xpath