Ability to apply filters (first, last etc)

2026-07-08 08:21:14 +00:00 · 2025-10-10 17:26:20 +02:00
parent f02fb7406d
commit 709dadc492
3 changed files with 48 additions and 2 deletions
@@ -397,6 +397,11 @@ class perform_site_check(difference_detection_processor):
        # RSS preprocessing
        if stream_content_type.is_rss:
            content = content_processor.preprocess_rss(content)
+            if self.datastore.data["settings"]["application"].get("rss_reader_mode"):
+                # In reader mode the preprocessed RSS is now regular HTML, so xpath/CSS filters (first of the set etc) can be applied
+                stream_content_type.is_rss = False
+                stream_content_type.is_html = True
+                self.fetcher.content = content

        # PDF preprocessing
        if watch.is_pdf or stream_content_type.is_pdf:
@@ -109,8 +109,22 @@ def format_rss_items(rss_content: str, render_anchor_tag_content=False) -> str:
            if item_parts:
                formatted_items.append('\n'.join(item_parts))

-        # Join all items with <br><br><hr>
-        return '<html><body>'+'<br><hr><br>'.join(formatted_items)
+        # Wrap each item in a div classed 'rss-item' plus positional classes (first, last, 1-based item-N)
+        items_html = []
+        total_items = len(formatted_items)
+        for idx, item in enumerate(formatted_items):
+            classes = ['rss-item']
+            if idx == 0:
+                classes.append('first')
+            if idx == total_items - 1:
+                classes.append('last')
+            classes.append(f'item-{idx + 1}')
+
+            class_str = ' '.join(classes)
+            items_html.append(f'<div class="{class_str}">{item}</div>')
+
+        # Join items with two <br> tags
+        return f'<html><body>{"\n<br><br>".join(items_html)}</body></html>'

    except Exception as e:
        logger.warning(f"Error formatting RSS items: {str(e)}")
@@ -25,6 +25,8 @@ def set_original_cdata_xml():
 <pubDate>Thu, 07 Aug 2025 00:00:00 GMT</pubDate>
 <description><p>Wet noodles escape<br><p>they also found themselves outside</p> </description>
 </item>
+
+
 <item>
 <title>TS-2025-004</title>
 <link>https://wetscale.com/security-bulletins/#ts-2025-004</link>
@@ -69,3 +71,28 @@ def test_rss_reader_mode(client, live_server, measure_memory_usage):
    assert 'PubDate: Thu, 07 Aug 2025 00:00:00 GMT' in snapshot_contents
    delete_all_watches(client)

+def test_rss_reader_mode_with_css_filters(client, live_server, measure_memory_usage):
+    set_original_cdata_xml()
+    # Purpose: with RSS reader mode on, a CSS include filter ('.last') should be applied to the rendered feed items
+    # Endpoints rarely send the right header, usually just text/xml, so we also check for <rss
+    # This also triggers the automatic CDATA text parser so the RSS comes back as a nice content list
+    test_url = url_for('test_endpoint', content_type="text/xml; charset=UTF-8", _external=True)
+    live_server.app.config['DATASTORE'].data['settings']['application']['rss_reader_mode'] = True
+
+
+    # Add the watch directly to the datastore with a CSS include filter targeting the 'last' item class
+    uuid = client.application.config.get('DATASTORE').add_watch(url=test_url, extras={'include_filters': [".last"]})
+    client.get(url_for("ui.form_watch_checknow"), follow_redirects=True)
+
+    wait_for_all_checks(client)
+
+    # The filter should drop the earlier item's text, and no raw or escaped markup should remain in the snapshot
+    watch = live_server.app.config['DATASTORE'].data['watching'][uuid]
+    dates = list(watch.history.keys())
+    snapshot_contents = watch.get_history_snapshot(dates[0])
+    assert 'Wet noodles escape' not in snapshot_contents
+    assert '<br>' not in snapshot_contents
+    assert '&lt;' not in snapshot_contents
+    assert 'The days of Terminator and The Matrix' in snapshot_contents
+    delete_all_watches(client)
+