Merge branch 'master' into stats-tab

Bugfix - [Clear history] button was not clearing all metadata (#1881 )
Adding [stats] tab
2026-06-21 16:18:21 +00:00 · 2023-10-20 11:48:19 +02:00 · 2023-10-20 11:47:49 +02:00 · 2023-10-20 10:52:45 +02:00 · 2023-10-19 16:42:05 +02:00 · 2023-10-19 13:20:01 +02:00
6 changed files with 49 additions and 16 deletions
@@ -98,10 +98,7 @@ def xpath_filter(xpath_filter, html_content, append_pretty_line_formatting=False
        elif type(element) == etree._ElementUnicodeResult:
            html_block += str(element)
        else:
-            if not is_rss:
-                html_block += etree.tostring(element, pretty_print=True).decode('utf-8')
-            else:
-                html_block += f"<div>{element.text}</div>\n"
+            html_block += etree.tostring(element, pretty_print=True).decode('utf-8')

    return html_block

@@ -274,7 +271,7 @@ def cdata_in_document_to_text(html_content: str, render_anchor_tag_content=False
    pattern = '<!\[CDATA\[(\s*(?:.(?<!\]\]>)\s*)*)\]\]>'
    def repl(m):
        text = m.group(1)
-        return xml_escape(html_to_text(html_content=text))
+        return xml_escape(html_to_text(html_content=text)).strip()

    return re.sub(pattern, repl, html_content)

@@ -295,7 +292,8 @@ def html_to_text(html_content: str, render_anchor_tag_content=False, is_rss=Fals
    #  extracting this content
    if render_anchor_tag_content:
        parser_config = ParserConfig(
-            annotation_rules={"a": ["hyperlink"]}, display_links=True
+            annotation_rules={"a": ["hyperlink"]},
+            display_links=True
        )
    # otherwise set config to None/default
    else:
@@ -303,13 +301,12 @@ def html_to_text(html_content: str, render_anchor_tag_content=False, is_rss=Fals

    # RSS Mode - Inscriptis will treat `title` as something else.
    # Make it as a regular block display element (//item/title)
+    # This is a bit of a hack - the real way is to use XSLT to convert it to HTML #1874
    if is_rss:
-        css = CSS_PROFILES['strict'].copy()
-        css['title'] = HtmlElement(display=Display.block)
-        text_content = get_text(html_content, ParserConfig(css=css))
-    else:
-        # get text and annotations via inscriptis
-        text_content = get_text(html_content, config=parser_config)
+        html_content = re.sub(r'<title([\s>])', r'<h1\1', html_content)
+        html_content = re.sub(r'</title>', r'</h1>', html_content)
+
+    text_content = get_text(html_content, config=parser_config)

    return text_content

@@ -26,6 +26,7 @@ base_config = {
    'extract_title_as_title': False,
    'fetch_backend': 'system', # plaintext, playwright etc
    'processor': 'text_json_diff', # could be restock_diff or others from .processors
+    'fetch_time': 0.0,
    'filter_failure_notification_send': strtobool(os.getenv('FILTER_FAILURE_NOTIFICATION_SEND_DEFAULT', 'True')),
    'filter_text_added': True,
    'filter_text_replaced': True,
@@ -274,7 +274,7 @@ class perform_site_check(difference_detection_processor):
                        html_tools.html_to_text(
                            html_content=html_content,
                            render_anchor_tag_content=do_anchor,
-                            is_rss=is_rss
+                            is_rss=is_rss # #1874 activate the <title workaround hack
                        )

        # Re #340 - return the content before the 'ignore text' was applied
@@ -244,12 +244,15 @@ class ChangeDetectionStore:
        import pathlib

        self.__data['watching'][uuid].update({
-                'last_checked': 0,
+                'check_count': 0,
+                'fetch_time' : 0.0,
                'has_ldjson_price_data': None,
+                'last_checked': 0,
                'last_error': False,
                'last_notification_error': False,
                'last_viewed': 0,
                'previous_md5': False,
+                'previous_md5_before_filters': False,
                'track_ldjson_price_data': None,
            })

@@ -49,6 +49,7 @@
            <li class="tab"><a href="#restock">Restock Detection</a></li>
            {% endif %}
            <li class="tab"><a href="#notifications">Notifications</a></li>
+            <li class="tab"><a href="#stats">Stats</a></li>
        </ul>
    </div>

@@ -441,7 +442,35 @@ Unavailable") }}
                </fieldset>
            </div>
            {% endif %}
-
+            <div class="tab-pane-inner" id="stats">
+                <div class="pure-control-group">
+                    <style>
+                    #stats-table tr > td:first-child {
+                        font-weight: bold;
+                    }
+                    </style>
+                    <table class="pure-table" id="stats-table">
+                        <tbody>
+                        <tr>
+                            <td>Check count</td>
+                            <td>{{ watch.check_count }}</td>
+                        </tr>
+                        <tr>
+                            <td>Consecutive filter failures</td>
+                            <td>{{ watch.consecutive_filter_failures }}</td>
+                        </tr>
+                        <tr>
+                            <td>History length</td>
+                            <td>{{ watch.history|length }}</td>
+                        </tr>
+                        <tr>
+                            <td>Last fetch time</td>
+                            <td>{{ watch.fetch_time }}s</td>
+                        </tr>
+                        </tbody>
+                    </table>
+                </div>
+            </div>
            <div id="actions">
                <div class="pure-control-group">
                    {{ render_button(form.save_button) }}
@@ -118,7 +118,7 @@ def test_basic_cdata_rss_markup(client, live_server):
    res = client.get(url_for("form_delete", uuid="all"), follow_redirects=True)

 def test_rss_xpath_filtering(client, live_server):
-#    live_server_setup(live_server)
+    #live_server_setup(live_server)

    set_original_cdata_xml()

@@ -154,6 +154,9 @@ def test_rss_xpath_filtering(client, live_server):
    )
    assert b'CDATA' not in res.data
    assert b'<![' not in res.data
+    # #1874 - All but the first <title> were getting selected
+    # Convert any HTML with just a top level <title> to <h1> to be sure title renders
+
    assert b'Hackers can access your computer' in res.data # Should ONLY be selected by the xpath
    assert b'Some other title' in res.data  # Should ONLY be selected by the xpath
    assert b'The days of Terminator' not in res.data # Should NOT be selected by the xpath
Author	SHA1	Message	Date
dgtlmoon	55783c97ac	Merge branch 'master' into stats-tab	2023-10-20 11:48:19 +02:00
dgtlmoon	52225f2ad8	Bugfix - [Clear history] button was not clearing all metadata (#1881 )	2023-10-20 11:47:49 +02:00
dgtlmoon	6ec73bc879	Adding [stats] tab	2023-10-20 10:52:45 +02:00
dgtlmoon	7220afab0a	RSS fetch - RSS field <title> was not rendering as text correctly, added workaround #1879	2023-10-19 16:42:05 +02:00
dgtlmoon	1c0fe4c23e	PDF Fetching - Handle when the PDF is given as inline content without a proper mime header (#1875 )	2023-10-19 13:20:01 +02:00