mirror of
https://github.com/dgtlmoon/changedetection.io.git
synced 2025-11-01 23:28:06 +00:00
Compare commits
1 Commits
0.50.30
...
default-na
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
6bccf93313 |
@@ -185,8 +185,21 @@ def xpath_filter(xpath_filter, html_content, append_pretty_line_formatting=False
|
||||
tree = html.fromstring(bytes(html_content, encoding='utf-8'), parser=parser)
|
||||
html_block = ""
|
||||
|
||||
r = elementpath.select(tree, xpath_filter.strip(), namespaces={'re': 'http://exslt.org/regular-expressions'}, parser=XPath3Parser)
|
||||
#@note: //title/text() wont work where <title>CDATA..
|
||||
# Build namespace map for XPath queries
|
||||
namespaces = {'re': 'http://exslt.org/regular-expressions'}
|
||||
|
||||
# Handle default namespace in documents (common in RSS/Atom feeds, but can occur in any XML)
|
||||
# XPath spec: unprefixed element names have no namespace, not the default namespace
|
||||
# Solution: Register the default namespace with empty string prefix in elementpath
|
||||
# This is primarily for RSS/Atom feeds but works for any XML with default namespace
|
||||
if hasattr(tree, 'nsmap') and tree.nsmap and None in tree.nsmap:
|
||||
# Register the default namespace with empty string prefix for elementpath
|
||||
# This allows //title to match elements in the default namespace
|
||||
namespaces[''] = tree.nsmap[None]
|
||||
|
||||
r = elementpath.select(tree, xpath_filter.strip(), namespaces=namespaces, parser=XPath3Parser)
|
||||
#@note: //title/text() now works with default namespaces (fixed by registering '' prefix)
|
||||
#@note: //title/text() won't work where <title> contains CDATA (use cdata_in_document_to_text first)
|
||||
|
||||
if type(r) != list:
|
||||
r = [r]
|
||||
@@ -221,8 +234,19 @@ def xpath1_filter(xpath_filter, html_content, append_pretty_line_formatting=Fals
|
||||
tree = html.fromstring(bytes(html_content, encoding='utf-8'), parser=parser)
|
||||
html_block = ""
|
||||
|
||||
r = tree.xpath(xpath_filter.strip(), namespaces={'re': 'http://exslt.org/regular-expressions'})
|
||||
#@note: //title/text() wont work where <title>CDATA..
|
||||
# Build namespace map for XPath queries
|
||||
namespaces = {'re': 'http://exslt.org/regular-expressions'}
|
||||
|
||||
# NOTE: lxml's native xpath() does NOT support empty string prefix for default namespace
|
||||
# For documents with default namespace (RSS/Atom feeds), users must use:
|
||||
# - local-name(): //*[local-name()='title']/text()
|
||||
# - Or use xpath_filter (not xpath1_filter) which supports default namespaces
|
||||
# XPath spec: unprefixed element names have no namespace, not the default namespace
|
||||
|
||||
r = tree.xpath(xpath_filter.strip(), namespaces=namespaces)
|
||||
#@note: xpath1 (lxml) does NOT automatically handle default namespaces
|
||||
#@note: Use //*[local-name()='element'] or switch to xpath_filter for default namespace support
|
||||
#@note: //title/text() won't work where <title> contains CDATA (use cdata_in_document_to_text first)
|
||||
|
||||
for element in r:
|
||||
# When there's more than 1 match, then add the suffix to separate each line
|
||||
|
||||
153
changedetectionio/tests/test_xpath_default_namespace.py
Normal file
153
changedetectionio/tests/test_xpath_default_namespace.py
Normal file
@@ -0,0 +1,153 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Unit tests for XPath default namespace handling in RSS/Atom feeds.
|
||||
Tests the fix for an issue where //title/text() returned an empty result on feeds with a default namespace.
|
||||
|
||||
Real-world test data from https://github.com/microsoft/PowerToys/releases.atom
|
||||
"""
|
||||
|
||||
import sys
|
||||
import os
|
||||
import pytest
|
||||
|
||||
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
||||
import html_tools
|
||||
|
||||
|
||||
# Real-world Atom feed with default namespace from GitHub PowerToys releases
|
||||
# This is the actual format that was failing before the fix
|
||||
atom_feed_with_default_ns = """<?xml version="1.0" encoding="UTF-8"?>
|
||||
<feed xmlns="http://www.w3.org/2005/Atom" xmlns:media="http://search.yahoo.com/mrss/" xml:lang="en-US">
|
||||
<id>tag:github.com,2008:https://github.com/microsoft/PowerToys/releases</id>
|
||||
<link type="text/html" rel="alternate" href="https://github.com/microsoft/PowerToys/releases"/>
|
||||
<link type="application/atom+xml" rel="self" href="https://github.com/microsoft/PowerToys/releases.atom"/>
|
||||
<title>Release notes from PowerToys</title>
|
||||
<updated>2025-10-23T08:53:12Z</updated>
|
||||
<entry>
|
||||
<id>tag:github.com,2008:Repository/184456251/v0.95.1</id>
|
||||
<updated>2025-10-24T14:20:14Z</updated>
|
||||
<link rel="alternate" type="text/html" href="https://github.com/microsoft/PowerToys/releases/tag/v0.95.1"/>
|
||||
<title>Release 0.95.1</title>
|
||||
<content type="html"><p>This patch release fixes several important stability issues.</p></content>
|
||||
<author>
|
||||
<name>Jaylyn-Barbee</name>
|
||||
</author>
|
||||
</entry>
|
||||
<entry>
|
||||
<id>tag:github.com,2008:Repository/184456251/v0.95.0</id>
|
||||
<updated>2025-10-17T12:51:21Z</updated>
|
||||
<link rel="alternate" type="text/html" href="https://github.com/microsoft/PowerToys/releases/tag/v0.95.0"/>
|
||||
<title>Release v0.95.0</title>
|
||||
<content type="html"><p>New features, stability, optimization improvements.</p></content>
|
||||
<author>
|
||||
<name>Jaylyn-Barbee</name>
|
||||
</author>
|
||||
</entry>
|
||||
</feed>"""
|
||||
|
||||
# RSS feed without default namespace
|
||||
rss_feed_no_default_ns = """<?xml version="1.0" encoding="UTF-8"?>
|
||||
<rss version="2.0">
|
||||
<channel>
|
||||
<title>Channel Title</title>
|
||||
<description>Channel Description</description>
|
||||
<item>
|
||||
<title>Item 1 Title</title>
|
||||
<description>Item 1 Description</description>
|
||||
</item>
|
||||
<item>
|
||||
<title>Item 2 Title</title>
|
||||
<description>Item 2 Description</description>
|
||||
</item>
|
||||
</channel>
|
||||
</rss>"""
|
||||
|
||||
# RSS 2.0 feed with namespace prefix (not default)
|
||||
rss_feed_with_ns_prefix = """<?xml version="1.0" encoding="UTF-8"?>
|
||||
<rss xmlns:dc="http://purl.org/dc/elements/1.1/"
|
||||
xmlns:content="http://purl.org/rss/1.0/modules/content/"
|
||||
xmlns:atom="http://www.w3.org/2005/Atom"
|
||||
version="2.0">
|
||||
<channel>
|
||||
<title>Channel Title</title>
|
||||
<atom:link href="http://example.com/feed" rel="self" type="application/rss+xml"/>
|
||||
<item>
|
||||
<title>Item Title</title>
|
||||
<dc:creator>Author Name</dc:creator>
|
||||
</item>
|
||||
</channel>
|
||||
</rss>"""
|
||||
|
||||
|
||||
class TestXPathDefaultNamespace:
    """Exercise XPath selection against feeds that do and do not declare a default namespace."""

    def test_atom_feed_simple_xpath_with_xpath_filter(self):
        """xpath_filter resolves //title/text() on an Atom feed that declares a default namespace."""
        extracted = html_tools.xpath_filter('//title/text()', atom_feed_with_default_ns, is_rss=True)
        for expected in ('Release notes from PowerToys', 'Release 0.95.1', 'Release v0.95.0'):
            assert expected in extracted

    def test_atom_feed_nested_xpath_with_xpath_filter(self):
        """A nested path such as //entry/title/text() selects only the entry titles."""
        extracted = html_tools.xpath_filter('//entry/title/text()', atom_feed_with_default_ns, is_rss=True)
        for expected in ('Release 0.95.1', 'Release v0.95.0'):
            assert expected in extracted
        # The feed-level title is not a child of <entry>, so it must be absent
        assert 'Release notes from PowerToys' not in extracted

    def test_atom_feed_other_elements_with_xpath_filter(self):
        """Elements other than <title>, e.g. //updated/text(), are matched on the Atom feed too."""
        extracted = html_tools.xpath_filter('//updated/text()', atom_feed_with_default_ns, is_rss=True)
        for expected in ('2025-10-23T08:53:12Z', '2025-10-24T14:20:14Z'):
            assert expected in extracted

    def test_rss_feed_without_namespace(self):
        """//title/text() keeps working on an RSS feed that has no default namespace."""
        extracted = html_tools.xpath_filter('//title/text()', rss_feed_no_default_ns, is_rss=True)
        for expected in ('Channel Title', 'Item 1 Title', 'Item 2 Title'):
            assert expected in extracted

    def test_rss_feed_nested_xpath(self):
        """A nested path on the namespace-free RSS feed selects only the item titles."""
        extracted = html_tools.xpath_filter('//item/title/text()', rss_feed_no_default_ns, is_rss=True)
        for expected in ('Item 1 Title', 'Item 2 Title'):
            assert expected in extracted
        # The channel title is not a child of <item>, so it must be absent
        assert 'Channel Title' not in extracted

    def test_rss_feed_with_prefixed_namespaces(self):
        """Feeds that only declare prefixed (non-default) namespaces are still queryable."""
        extracted = html_tools.xpath_filter('//title/text()', rss_feed_with_ns_prefix, is_rss=True)
        for expected in ('Channel Title', 'Item Title'):
            assert expected in extracted

    def test_local_name_workaround_still_works(self):
        """The local-name() workaround continues to match titles on the Atom feed."""
        extracted = html_tools.xpath_filter('//*[local-name()="title"]/text()', atom_feed_with_default_ns, is_rss=True)
        for expected in ('Release notes from PowerToys', 'Release 0.95.1'):
            assert expected in extracted

    def test_xpath1_filter_without_default_namespace(self):
        """xpath1_filter matches titles on an RSS feed that has no default namespace."""
        extracted = html_tools.xpath1_filter('//title/text()', rss_feed_no_default_ns, is_rss=True)
        for expected in ('Channel Title', 'Item 1 Title'):
            assert expected in extracted

    def test_xpath1_filter_with_default_namespace_returns_empty(self):
        """xpath1_filter yields an empty string on the default-namespaced Atom feed (known limitation)."""
        extracted = html_tools.xpath1_filter('//title/text()', atom_feed_with_default_ns, is_rss=True)
        # The unprefixed //title query is expected to match nothing through xpath1_filter here
        assert extracted == ''

    def test_xpath1_filter_local_name_workaround(self):
        """xpath1_filter matches Atom titles when the query uses the local-name() workaround."""
        extracted = html_tools.xpath1_filter('//*[local-name()="title"]/text()', atom_feed_with_default_ns, is_rss=True)
        for expected in ('Release notes from PowerToys', 'Release 0.95.1'):
            assert expected in extracted
|
||||
|
||||
if __name__ == '__main__':
|
||||
pytest.main([__file__, '-v'])
|
||||
Reference in New Issue
Block a user