Be sure that default namespaces are registered

2026-01-08 16:20:21 +00:00 · 2025-10-24 17:26:09 +02:00
2 changed files with 181 additions and 4 deletions
--- a/changedetectionio/html_tools.py
+++ b/changedetectionio/html_tools.py
@@ -185,8 +185,21 @@ def xpath_filter(xpath_filter, html_content, append_pretty_line_formatting=False
    tree = html.fromstring(bytes(html_content, encoding='utf-8'), parser=parser)
    html_block = ""

-    r = elementpath.select(tree, xpath_filter.strip(), namespaces={'re': 'http://exslt.org/regular-expressions'}, parser=XPath3Parser)
-    #@note: //title/text() wont work where <title>CDATA..
+    # Build namespace map for XPath queries
+    namespaces = {'re': 'http://exslt.org/regular-expressions'}
+
+    # Handle a default namespace in the document (common in RSS/Atom feeds, but possible in any XML).
+    # In XPath 1.0 an unprefixed element name never matches an element in the default namespace;
+    # XPath 2.0+ (used here via XPath3Parser) resolves it against the default element namespace instead.
+    # Solution: register the document's default namespace under the empty-string prefix for elementpath.
+    if hasattr(tree, 'nsmap') and tree.nsmap and None in tree.nsmap:
+        # Register the default namespace with empty string prefix for elementpath
+        # This allows //title to match elements in the default namespace
+        namespaces[''] = tree.nsmap[None]
+
+    r = elementpath.select(tree, xpath_filter.strip(), namespaces=namespaces, parser=XPath3Parser)
+    #@note: //title/text() now works with default namespaces (fixed by registering '' prefix)
+    #@note: //title/text() won't work where <title> contains CDATA (use cdata_in_document_to_text first)

    if type(r) != list:
        r = [r]
@@ -221,8 +234,19 @@ def xpath1_filter(xpath_filter, html_content, append_pretty_line_formatting=Fals
    tree = html.fromstring(bytes(html_content, encoding='utf-8'), parser=parser)
    html_block = ""

-    r = tree.xpath(xpath_filter.strip(), namespaces={'re': 'http://exslt.org/regular-expressions'})
-    #@note: //title/text() wont work where <title>CDATA..
+    # Build namespace map for XPath queries
+    namespaces = {'re': 'http://exslt.org/regular-expressions'}
+
+    # NOTE: lxml's native xpath() does NOT support empty string prefix for default namespace
+    # For documents with default namespace (RSS/Atom feeds), users must use:
+    #   - local-name(): //*[local-name()='title']/text()
+    #   - Or use xpath_filter (not xpath1_filter) which supports default namespaces
+    # XPath spec: unprefixed element names have no namespace, not the default namespace
+
+    r = tree.xpath(xpath_filter.strip(), namespaces=namespaces)
+    #@note: xpath1 (lxml) does NOT automatically handle default namespaces
+    #@note: Use //*[local-name()='element'] or switch to xpath_filter for default namespace support
+    #@note: //title/text() won't work where <title> contains CDATA (use cdata_in_document_to_text first)

    for element in r:
        # When there's more than 1 match, then add the suffix to separate each line
--- a/changedetectionio/tests/test_xpath_default_namespace.py
+++ b/changedetectionio/tests/test_xpath_default_namespace.py
@@ -0,0 +1,153 @@
+#!/usr/bin/env python3
+"""
+Unit tests for XPath default namespace handling in RSS/Atom feeds.
+Tests the fix for an issue where //title/text() returned an empty result on feeds with a default namespace.
+
+Real-world test data from https://github.com/microsoft/PowerToys/releases.atom
+"""
+
+import sys
+import os
+import pytest
+
+sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+import html_tools
+
+
+# Real-world Atom feed with default namespace from GitHub PowerToys releases
+# This is the actual format that was failing before the fix
+atom_feed_with_default_ns = """<?xml version="1.0" encoding="UTF-8"?>
+<feed xmlns="http://www.w3.org/2005/Atom" xmlns:media="http://search.yahoo.com/mrss/" xml:lang="en-US">
+  <id>tag:github.com,2008:https://github.com/microsoft/PowerToys/releases</id>
+  <link type="text/html" rel="alternate" href="https://github.com/microsoft/PowerToys/releases"/>
+  <link type="application/atom+xml" rel="self" href="https://github.com/microsoft/PowerToys/releases.atom"/>
+  <title>Release notes from PowerToys</title>
+  <updated>2025-10-23T08:53:12Z</updated>
+  <entry>
+    <id>tag:github.com,2008:Repository/184456251/v0.95.1</id>
+    <updated>2025-10-24T14:20:14Z</updated>
+    <link rel="alternate" type="text/html" href="https://github.com/microsoft/PowerToys/releases/tag/v0.95.1"/>
+    <title>Release 0.95.1</title>
+    <content type="html">&lt;p&gt;This patch release fixes several important stability issues.&lt;/p&gt;</content>
+    <author>
+      <name>Jaylyn-Barbee</name>
+    </author>
+  </entry>
+  <entry>
+    <id>tag:github.com,2008:Repository/184456251/v0.95.0</id>
+    <updated>2025-10-17T12:51:21Z</updated>
+    <link rel="alternate" type="text/html" href="https://github.com/microsoft/PowerToys/releases/tag/v0.95.0"/>
+    <title>Release v0.95.0</title>
+    <content type="html">&lt;p&gt;New features, stability, optimization improvements.&lt;/p&gt;</content>
+    <author>
+      <name>Jaylyn-Barbee</name>
+    </author>
+  </entry>
+</feed>"""
+
+# RSS feed without default namespace
+rss_feed_no_default_ns = """<?xml version="1.0" encoding="UTF-8"?>
+<rss version="2.0">
+  <channel>
+    <title>Channel Title</title>
+    <description>Channel Description</description>
+    <item>
+      <title>Item 1 Title</title>
+      <description>Item 1 Description</description>
+    </item>
+    <item>
+      <title>Item 2 Title</title>
+      <description>Item 2 Description</description>
+    </item>
+  </channel>
+</rss>"""
+
+# RSS 2.0 feed with namespace prefix (not default)
+rss_feed_with_ns_prefix = """<?xml version="1.0" encoding="UTF-8"?>
+<rss xmlns:dc="http://purl.org/dc/elements/1.1/"
+     xmlns:content="http://purl.org/rss/1.0/modules/content/"
+     xmlns:atom="http://www.w3.org/2005/Atom"
+     version="2.0">
+  <channel>
+    <title>Channel Title</title>
+    <atom:link href="http://example.com/feed" rel="self" type="application/rss+xml"/>
+    <item>
+      <title>Item Title</title>
+      <dc:creator>Author Name</dc:creator>
+    </item>
+  </channel>
+</rss>"""
+
+
+class TestXPathDefaultNamespace:
+    """Test XPath queries on feeds with and without default namespaces."""
+
+    def test_atom_feed_simple_xpath_with_xpath_filter(self):
+        """Test that //title/text() works on Atom feed with default namespace using xpath_filter."""
+        result = html_tools.xpath_filter('//title/text()', atom_feed_with_default_ns, is_rss=True)
+        assert 'Release notes from PowerToys' in result
+        assert 'Release 0.95.1' in result
+        assert 'Release v0.95.0' in result
+
+    def test_atom_feed_nested_xpath_with_xpath_filter(self):
+        """Test nested XPath like //entry/title/text() on Atom feed."""
+        result = html_tools.xpath_filter('//entry/title/text()', atom_feed_with_default_ns, is_rss=True)
+        assert 'Release 0.95.1' in result
+        assert 'Release v0.95.0' in result
+        # Should NOT include the feed title
+        assert 'Release notes from PowerToys' not in result
+
+    def test_atom_feed_other_elements_with_xpath_filter(self):
+        """Test that other elements like //updated/text() work on Atom feed."""
+        result = html_tools.xpath_filter('//updated/text()', atom_feed_with_default_ns, is_rss=True)
+        assert '2025-10-23T08:53:12Z' in result
+        assert '2025-10-24T14:20:14Z' in result
+
+    def test_rss_feed_without_namespace(self):
+        """Test that //title/text() works on RSS feed without default namespace."""
+        result = html_tools.xpath_filter('//title/text()', rss_feed_no_default_ns, is_rss=True)
+        assert 'Channel Title' in result
+        assert 'Item 1 Title' in result
+        assert 'Item 2 Title' in result
+
+    def test_rss_feed_nested_xpath(self):
+        """Test nested XPath on RSS feed without default namespace."""
+        result = html_tools.xpath_filter('//item/title/text()', rss_feed_no_default_ns, is_rss=True)
+        assert 'Item 1 Title' in result
+        assert 'Item 2 Title' in result
+        # Should NOT include channel title
+        assert 'Channel Title' not in result
+
+    def test_rss_feed_with_prefixed_namespaces(self):
+        """Test that feeds with namespace prefixes (not default) still work."""
+        result = html_tools.xpath_filter('//title/text()', rss_feed_with_ns_prefix, is_rss=True)
+        assert 'Channel Title' in result
+        assert 'Item Title' in result
+
+    def test_local_name_workaround_still_works(self):
+        """Test that local-name() workaround still works for Atom feeds."""
+        result = html_tools.xpath_filter('//*[local-name()="title"]/text()', atom_feed_with_default_ns, is_rss=True)
+        assert 'Release notes from PowerToys' in result
+        assert 'Release 0.95.1' in result
+
+    def test_xpath1_filter_without_default_namespace(self):
+        """Test xpath1_filter works on RSS without default namespace."""
+        result = html_tools.xpath1_filter('//title/text()', rss_feed_no_default_ns, is_rss=True)
+        assert 'Channel Title' in result
+        assert 'Item 1 Title' in result
+
+    def test_xpath1_filter_with_default_namespace_returns_empty(self):
+        """Test that xpath1_filter returns empty on Atom with default namespace (known limitation)."""
+        result = html_tools.xpath1_filter('//title/text()', atom_feed_with_default_ns, is_rss=True)
+        # xpath1_filter (lxml) doesn't support default namespaces, so this returns empty
+        assert result == ''
+
+    def test_xpath1_filter_local_name_workaround(self):
+        """Test that xpath1_filter works with local-name() workaround on Atom feeds."""
+        result = html_tools.xpath1_filter('//*[local-name()="title"]/text()', atom_feed_with_default_ns, is_rss=True)
+        assert 'Release notes from PowerToys' in result
+        assert 'Release 0.95.1' in result
+
+
+if __name__ == '__main__':
+    pytest.main([__file__, '-v'])