diff --git a/changedetectionio/html_tools.py b/changedetectionio/html_tools.py
index 0a3cd108..cda37cc0 100644
--- a/changedetectionio/html_tools.py
+++ b/changedetectionio/html_tools.py
@@ -172,99 +172,131 @@ def elementpath_tostring(obj):
return str(obj)
# Return str Utf-8 of matched rules
-def xpath_filter(xpath_filter, html_content, append_pretty_line_formatting=False, is_rss=False):
+def xpath_filter(xpath_filter, html_content, append_pretty_line_formatting=False, is_xml=False):
+ """
+
+ :param xpath_filter:
+ :param html_content:
+ :param append_pretty_line_formatting:
+ :param is_xml: set to true if is XML or is RSS (RSS is XML)
+ :return:
+ """
from lxml import etree, html
import elementpath
# xpath 2.0-3.1
from elementpath.xpath3 import XPath3Parser
parser = etree.HTMLParser()
- if is_rss:
- # So that we can keep CDATA for cdata_in_document_to_text() to process
- parser = etree.XMLParser(strip_cdata=False)
-
- tree = html.fromstring(bytes(html_content, encoding='utf-8'), parser=parser)
- html_block = ""
-
- # Build namespace map for XPath queries
- namespaces = {'re': 'http://exslt.org/regular-expressions'}
-
- # Handle default namespace in documents (common in RSS/Atom feeds, but can occur in any XML)
- # XPath spec: unprefixed element names have no namespace, not the default namespace
- # Solution: Register the default namespace with empty string prefix in elementpath
- # This is primarily for RSS/Atom feeds but works for any XML with default namespace
- if hasattr(tree, 'nsmap') and tree.nsmap and None in tree.nsmap:
- # Register the default namespace with empty string prefix for elementpath
- # This allows //title to match elements in the default namespace
- namespaces[''] = tree.nsmap[None]
-
- r = elementpath.select(tree, xpath_filter.strip(), namespaces=namespaces, parser=XPath3Parser)
- #@note: //title/text() now works with default namespaces (fixed by registering '' prefix)
-    #@note: //title/text() wont work where CDATA.. (use cdata_in_document_to_text first)
-
- if type(r) != list:
- r = [r]
-
- for element in r:
- # When there's more than 1 match, then add the suffix to separate each line
- # And where the matched result doesn't include something that will cause Inscriptis to add a newline
- # (This way each 'match' reliably has a new-line in the diff)
- # Divs are converted to 4 whitespaces by inscriptis
- if append_pretty_line_formatting and len(html_block) and (not hasattr( element, 'tag' ) or not element.tag in (['br', 'hr', 'div', 'p'])):
- html_block += TEXT_FILTER_LIST_LINE_SUFFIX
-
- if type(element) == str:
- html_block += element
- elif issubclass(type(element), etree._Element) or issubclass(type(element), etree._ElementTree):
- html_block += etree.tostring(element, pretty_print=True).decode('utf-8')
+ tree = None
+ try:
+ if is_xml:
+ # So that we can keep CDATA for cdata_in_document_to_text() to process
+ parser = etree.XMLParser(strip_cdata=False)
+ # For XML/RSS content, use etree.fromstring to properly handle XML declarations
+ tree = etree.fromstring(html_content.encode('utf-8') if isinstance(html_content, str) else html_content, parser=parser)
else:
- html_block += elementpath_tostring(element)
+ tree = html.fromstring(html_content, parser=parser)
+ html_block = ""
- return html_block
+ # Build namespace map for XPath queries
+ namespaces = {'re': 'http://exslt.org/regular-expressions'}
+
+ # Handle default namespace in documents (common in RSS/Atom feeds, but can occur in any XML)
+ # XPath spec: unprefixed element names have no namespace, not the default namespace
+ # Solution: Register the default namespace with empty string prefix in elementpath
+ # This is primarily for RSS/Atom feeds but works for any XML with default namespace
+ if hasattr(tree, 'nsmap') and tree.nsmap and None in tree.nsmap:
+ # Register the default namespace with empty string prefix for elementpath
+ # This allows //title to match elements in the default namespace
+ namespaces[''] = tree.nsmap[None]
+
+ r = elementpath.select(tree, xpath_filter.strip(), namespaces=namespaces, parser=XPath3Parser)
+ #@note: //title/text() now works with default namespaces (fixed by registering '' prefix)
+ #@note: //title/text() wont work where CDATA.. (use cdata_in_document_to_text first)
+
+ if type(r) != list:
+ r = [r]
+
+ for element in r:
+ # When there's more than 1 match, then add the suffix to separate each line
+ # And where the matched result doesn't include something that will cause Inscriptis to add a newline
+ # (This way each 'match' reliably has a new-line in the diff)
+ # Divs are converted to 4 whitespaces by inscriptis
+ if append_pretty_line_formatting and len(html_block) and (not hasattr( element, 'tag' ) or not element.tag in (['br', 'hr', 'div', 'p'])):
+ html_block += TEXT_FILTER_LIST_LINE_SUFFIX
+
+ if type(element) == str:
+ html_block += element
+ elif issubclass(type(element), etree._Element) or issubclass(type(element), etree._ElementTree):
+ # Use 'xml' method for RSS/XML content, 'html' for HTML content
+ # parser will be XMLParser if we detected XML content
+ method = 'xml' if (is_xml or isinstance(parser, etree.XMLParser)) else 'html'
+ html_block += etree.tostring(element, pretty_print=True, method=method, encoding='unicode')
+ else:
+ html_block += elementpath_tostring(element)
+
+ return html_block
+ finally:
+ # Explicitly clear the tree to free memory
+ # lxml trees can hold significant memory, especially with large documents
+ if tree is not None:
+ tree.clear()
# Return str Utf-8 of matched rules
# 'xpath1:'
-def xpath1_filter(xpath_filter, html_content, append_pretty_line_formatting=False, is_rss=False):
+def xpath1_filter(xpath_filter, html_content, append_pretty_line_formatting=False, is_xml=False):
from lxml import etree, html
parser = None
- if is_rss:
- # So that we can keep CDATA for cdata_in_document_to_text() to process
- parser = etree.XMLParser(strip_cdata=False)
-
- tree = html.fromstring(bytes(html_content, encoding='utf-8'), parser=parser)
- html_block = ""
-
- # Build namespace map for XPath queries
- namespaces = {'re': 'http://exslt.org/regular-expressions'}
-
- # NOTE: lxml's native xpath() does NOT support empty string prefix for default namespace
- # For documents with default namespace (RSS/Atom feeds), users must use:
- # - local-name(): //*[local-name()='title']/text()
- # - Or use xpath_filter (not xpath1_filter) which supports default namespaces
- # XPath spec: unprefixed element names have no namespace, not the default namespace
-
- r = tree.xpath(xpath_filter.strip(), namespaces=namespaces)
- #@note: xpath1 (lxml) does NOT automatically handle default namespaces
- #@note: Use //*[local-name()='element'] or switch to xpath_filter for default namespace support
- #@note: //title/text() wont work where CDATA.. (use cdata_in_document_to_text first)
-
- for element in r:
- # When there's more than 1 match, then add the suffix to separate each line
- # And where the matched result doesn't include something that will cause Inscriptis to add a newline
- # (This way each 'match' reliably has a new-line in the diff)
- # Divs are converted to 4 whitespaces by inscriptis
- if append_pretty_line_formatting and len(html_block) and (not hasattr(element, 'tag') or not element.tag in (['br', 'hr', 'div', 'p'])):
- html_block += TEXT_FILTER_LIST_LINE_SUFFIX
-
- # Some kind of text, UTF-8 or other
- if isinstance(element, (str, bytes)):
- html_block += element
+ tree = None
+ try:
+ if is_xml:
+ # So that we can keep CDATA for cdata_in_document_to_text() to process
+ parser = etree.XMLParser(strip_cdata=False)
+ # For XML/RSS content, use etree.fromstring to properly handle XML declarations
+ tree = etree.fromstring(html_content.encode('utf-8') if isinstance(html_content, str) else html_content, parser=parser)
else:
- # Return the HTML which will get parsed as text
- html_block += etree.tostring(element, pretty_print=True).decode('utf-8')
+ tree = html.fromstring(html_content, parser=parser)
+ html_block = ""
- return html_block
+ # Build namespace map for XPath queries
+ namespaces = {'re': 'http://exslt.org/regular-expressions'}
+
+ # NOTE: lxml's native xpath() does NOT support empty string prefix for default namespace
+ # For documents with default namespace (RSS/Atom feeds), users must use:
+ # - local-name(): //*[local-name()='title']/text()
+ # - Or use xpath_filter (not xpath1_filter) which supports default namespaces
+ # XPath spec: unprefixed element names have no namespace, not the default namespace
+
+ r = tree.xpath(xpath_filter.strip(), namespaces=namespaces)
+ #@note: xpath1 (lxml) does NOT automatically handle default namespaces
+ #@note: Use //*[local-name()='element'] or switch to xpath_filter for default namespace support
+ #@note: //title/text() wont work where CDATA.. (use cdata_in_document_to_text first)
+
+ for element in r:
+ # When there's more than 1 match, then add the suffix to separate each line
+ # And where the matched result doesn't include something that will cause Inscriptis to add a newline
+ # (This way each 'match' reliably has a new-line in the diff)
+ # Divs are converted to 4 whitespaces by inscriptis
+ if append_pretty_line_formatting and len(html_block) and (not hasattr(element, 'tag') or not element.tag in (['br', 'hr', 'div', 'p'])):
+ html_block += TEXT_FILTER_LIST_LINE_SUFFIX
+
+ # Some kind of text, UTF-8 or other
+ if isinstance(element, (str, bytes)):
+ html_block += element
+ else:
+ # Return the HTML/XML which will get parsed as text
+ # Use 'xml' method for RSS/XML content, 'html' for HTML content
+ # parser will be XMLParser if we detected XML content
+ method = 'xml' if (is_xml or isinstance(parser, etree.XMLParser)) else 'html'
+ html_block += etree.tostring(element, pretty_print=True, method=method, encoding='unicode')
+
+ return html_block
+ finally:
+ # Explicitly clear the tree to free memory
+ # lxml trees can hold significant memory, especially with large documents
+ if tree is not None:
+ tree.clear()
# Extract/find element
def extract_element(find='title', html_content=''):
diff --git a/changedetectionio/processors/magic.py b/changedetectionio/processors/magic.py
index 2a0ef68f..9d9018d7 100644
--- a/changedetectionio/processors/magic.py
+++ b/changedetectionio/processors/magic.py
@@ -103,15 +103,15 @@ class guess_stream_type():
self.is_json = True
elif 'pdf' in magic_content_header:
self.is_pdf = True
- elif has_html_patterns or http_content_header == 'text/html':
- self.is_html = True
- elif any(s in magic_content_header for s in JSON_CONTENT_TYPES):
- self.is_json = True
# magic will call a rss document 'xml'
# Rarely do endpoints give the right header, usually just text/xml, so we check also for
+polish_html = """<html>
+<body>
+<a class="index--s-headline-link" href="#">Naukowcy potwierdzają: oglądanie krótkich filmików prowadzi do zgnilizny mózgu</a>
+<a class="other-class" href="#">żółć ąę śń</a>
+<p class="unicode-test">Cyrillic: Привет мир</p>
+<p class="unicode-test">Greek: Γειά σου κόσμε</p>
+<p class="unicode-test">Arabic: مرحبا بالعالم</p>
+<p class="unicode-test">Chinese: 你好世界</p>
+<p class="unicode-test">Japanese: こんにちは世界</p>
+<p class="unicode-test">Emoji: 🌍🎉✨</p>
+</body>
+</html>
+"""
+
+
+@pytest.mark.parametrize("html_content", [polish_html])
+@pytest.mark.parametrize("xpath, expected_text", [
+ # Test Polish characters in xpath_filter
+ ('//a[(contains(@class,"index--s-headline-link"))]', 'Naukowcy potwierdzają'),
+ ('//a[(contains(@class,"index--s-headline-link"))]', 'oglądanie krótkich filmików'),
+ ('//a[(contains(@class,"index--s-headline-link"))]', 'zgnilizny mózgu'),
+ ('//a[@class="other-class"]', 'żółć ąę śń'),
+
+ # Test various Unicode scripts
+ ('//p[@class="unicode-test"]', 'Привет мир'),
+ ('//p[@class="unicode-test"]', 'Γειά σου κόσμε'),
+ ('//p[@class="unicode-test"]', 'مرحبا بالعالم'),
+ ('//p[@class="unicode-test"]', '你好世界'),
+ ('//p[@class="unicode-test"]', 'こんにちは世界'),
+ ('//p[@class="unicode-test"]', '🌍🎉✨'),
+
+ # Test with text() extraction
+ ('//a[@class="other-class"]/text()', 'żółć'),
+])
+def test_xpath_utf8_encoding(html_content, xpath, expected_text):
+ """Test that XPath filters preserve UTF-8 characters correctly (issue #3658)"""
+ result = html_tools.xpath_filter(xpath, html_content, append_pretty_line_formatting=False)
+ assert type(result) == str
+ assert expected_text in result
+ # Ensure characters are NOT HTML-entity encoded
+    # For example, 'ą' should NOT become '&#261;'
+    assert '&#261;' not in result or expected_text in result
+
+
+@pytest.mark.parametrize("html_content", [polish_html])
+@pytest.mark.parametrize("xpath, expected_text", [
+ # Test Polish characters in xpath1_filter
+ ('//a[(contains(@class,"index--s-headline-link"))]', 'Naukowcy potwierdzają'),
+ ('//a[(contains(@class,"index--s-headline-link"))]', 'mózgu'),
+ ('//a[@class="other-class"]', 'żółć ąę śń'),
+
+ # Test various Unicode scripts with xpath1
+ ('//p[@class="unicode-test" and contains(text(), "Cyrillic")]', 'Привет мир'),
+ ('//p[@class="unicode-test" and contains(text(), "Greek")]', 'Γειά σου'),
+ ('//p[@class="unicode-test" and contains(text(), "Chinese")]', '你好世界'),
+])
+def test_xpath1_utf8_encoding(html_content, xpath, expected_text):
+ """Test that XPath1 filters preserve UTF-8 characters correctly"""
+ result = html_tools.xpath1_filter(xpath, html_content, append_pretty_line_formatting=False)
+ assert type(result) == str
+ assert expected_text in result
+ # Ensure characters are NOT HTML-entity encoded
+    assert '&#261;' not in result or expected_text in result
+
+
+# Test with real-world example from wyborcza.pl (issue #3658)
+wyborcza_style_html = """<html>
+<body>
+<a class="index--s-headline-link" href="#">Naukowcy potwierdzają: oglądanie krótkich filmików prowadzi do zgnilizny mózgu</a>
+<a class="index--s-headline-link" href="#">Wiadomości z miasta Łódź</a>
+</body>
+</html>
+"""
+
+
+def test_wyborcza_real_world_example():
+ """Test real-world case from wyborcza.pl that was failing (issue #3658)"""
+ xpath = '//a[(contains(@class,"index--s-headline-link"))]'
+ result = html_tools.xpath_filter(xpath, wyborcza_style_html, append_pretty_line_formatting=False)
+
+ # These exact strings should appear in the result
+ assert 'Naukowcy potwierdzają' in result
+ assert 'oglądanie krótkich filmików' in result
+ assert 'zgnilizny mózgu' in result
+ assert 'Łódź' in result
+
+    # Make sure they're NOT corrupted to mojibake like "potwierdzajÄ…"
+    assert 'potwierdzajÄ…' not in result
+    assert 'oglÄ…danie' not in result
+    assert 'mÃ³zgu' not in result