diff --git a/changedetectionio/html_tools.py b/changedetectionio/html_tools.py index 0a3cd108..cda37cc0 100644 --- a/changedetectionio/html_tools.py +++ b/changedetectionio/html_tools.py @@ -172,99 +172,131 @@ def elementpath_tostring(obj): return str(obj) # Return str Utf-8 of matched rules -def xpath_filter(xpath_filter, html_content, append_pretty_line_formatting=False, is_rss=False): +def xpath_filter(xpath_filter, html_content, append_pretty_line_formatting=False, is_xml=False): + """ + + :param xpath_filter: + :param html_content: + :param append_pretty_line_formatting: + :param is_xml: set to true if is XML or is RSS (RSS is XML) + :return: + """ from lxml import etree, html import elementpath # xpath 2.0-3.1 from elementpath.xpath3 import XPath3Parser parser = etree.HTMLParser() - if is_rss: - # So that we can keep CDATA for cdata_in_document_to_text() to process - parser = etree.XMLParser(strip_cdata=False) - - tree = html.fromstring(bytes(html_content, encoding='utf-8'), parser=parser) - html_block = "" - - # Build namespace map for XPath queries - namespaces = {'re': 'http://exslt.org/regular-expressions'} - - # Handle default namespace in documents (common in RSS/Atom feeds, but can occur in any XML) - # XPath spec: unprefixed element names have no namespace, not the default namespace - # Solution: Register the default namespace with empty string prefix in elementpath - # This is primarily for RSS/Atom feeds but works for any XML with default namespace - if hasattr(tree, 'nsmap') and tree.nsmap and None in tree.nsmap: - # Register the default namespace with empty string prefix for elementpath - # This allows //title to match elements in the default namespace - namespaces[''] = tree.nsmap[None] - - r = elementpath.select(tree, xpath_filter.strip(), namespaces=namespaces, parser=XPath3Parser) - #@note: //title/text() now works with default namespaces (fixed by registering '' prefix) - #@note: //title/text() wont work where CDATA.. 
(use cdata_in_document_to_text first) - - if type(r) != list: - r = [r] - - for element in r: - # When there's more than 1 match, then add the suffix to separate each line - # And where the matched result doesn't include something that will cause Inscriptis to add a newline - # (This way each 'match' reliably has a new-line in the diff) - # Divs are converted to 4 whitespaces by inscriptis - if append_pretty_line_formatting and len(html_block) and (not hasattr( element, 'tag' ) or not element.tag in (['br', 'hr', 'div', 'p'])): - html_block += TEXT_FILTER_LIST_LINE_SUFFIX - - if type(element) == str: - html_block += element - elif issubclass(type(element), etree._Element) or issubclass(type(element), etree._ElementTree): - html_block += etree.tostring(element, pretty_print=True).decode('utf-8') + tree = None + try: + if is_xml: + # So that we can keep CDATA for cdata_in_document_to_text() to process + parser = etree.XMLParser(strip_cdata=False) + # For XML/RSS content, use etree.fromstring to properly handle XML declarations + tree = etree.fromstring(html_content.encode('utf-8') if isinstance(html_content, str) else html_content, parser=parser) else: - html_block += elementpath_tostring(element) + tree = html.fromstring(html_content, parser=parser) + html_block = "" - return html_block + # Build namespace map for XPath queries + namespaces = {'re': 'http://exslt.org/regular-expressions'} + + # Handle default namespace in documents (common in RSS/Atom feeds, but can occur in any XML) + # XPath spec: unprefixed element names have no namespace, not the default namespace + # Solution: Register the default namespace with empty string prefix in elementpath + # This is primarily for RSS/Atom feeds but works for any XML with default namespace + if hasattr(tree, 'nsmap') and tree.nsmap and None in tree.nsmap: + # Register the default namespace with empty string prefix for elementpath + # This allows //title to match elements in the default namespace + namespaces[''] = 
tree.nsmap[None] + + r = elementpath.select(tree, xpath_filter.strip(), namespaces=namespaces, parser=XPath3Parser) + #@note: //title/text() now works with default namespaces (fixed by registering '' prefix) + #@note: //title/text() wont work where <title>CDATA.. (use cdata_in_document_to_text first) + + if type(r) != list: + r = [r] + + for element in r: + # When there's more than 1 match, then add the suffix to separate each line + # And where the matched result doesn't include something that will cause Inscriptis to add a newline + # (This way each 'match' reliably has a new-line in the diff) + # Divs are converted to 4 whitespaces by inscriptis + if append_pretty_line_formatting and len(html_block) and (not hasattr( element, 'tag' ) or not element.tag in (['br', 'hr', 'div', 'p'])): + html_block += TEXT_FILTER_LIST_LINE_SUFFIX + + if type(element) == str: + html_block += element + elif issubclass(type(element), etree._Element) or issubclass(type(element), etree._ElementTree): + # Use 'xml' method for RSS/XML content, 'html' for HTML content + # parser will be XMLParser if we detected XML content + method = 'xml' if (is_xml or isinstance(parser, etree.XMLParser)) else 'html' + html_block += etree.tostring(element, pretty_print=True, method=method, encoding='unicode') + else: + html_block += elementpath_tostring(element) + + return html_block + finally: + # Explicitly clear the tree to free memory + # lxml trees can hold significant memory, especially with large documents + if tree is not None: + tree.clear() # Return str Utf-8 of matched rules # 'xpath1:' -def xpath1_filter(xpath_filter, html_content, append_pretty_line_formatting=False, is_rss=False): +def xpath1_filter(xpath_filter, html_content, append_pretty_line_formatting=False, is_xml=False): from lxml import etree, html parser = None - if is_rss: - # So that we can keep CDATA for cdata_in_document_to_text() to process - parser = etree.XMLParser(strip_cdata=False) - - tree = 
html.fromstring(bytes(html_content, encoding='utf-8'), parser=parser) - html_block = "" - - # Build namespace map for XPath queries - namespaces = {'re': 'http://exslt.org/regular-expressions'} - - # NOTE: lxml's native xpath() does NOT support empty string prefix for default namespace - # For documents with default namespace (RSS/Atom feeds), users must use: - # - local-name(): //*[local-name()='title']/text() - # - Or use xpath_filter (not xpath1_filter) which supports default namespaces - # XPath spec: unprefixed element names have no namespace, not the default namespace - - r = tree.xpath(xpath_filter.strip(), namespaces=namespaces) - #@note: xpath1 (lxml) does NOT automatically handle default namespaces - #@note: Use //*[local-name()='element'] or switch to xpath_filter for default namespace support - #@note: //title/text() wont work where <title>CDATA.. (use cdata_in_document_to_text first) - - for element in r: - # When there's more than 1 match, then add the suffix to separate each line - # And where the matched result doesn't include something that will cause Inscriptis to add a newline - # (This way each 'match' reliably has a new-line in the diff) - # Divs are converted to 4 whitespaces by inscriptis - if append_pretty_line_formatting and len(html_block) and (not hasattr(element, 'tag') or not element.tag in (['br', 'hr', 'div', 'p'])): - html_block += TEXT_FILTER_LIST_LINE_SUFFIX - - # Some kind of text, UTF-8 or other - if isinstance(element, (str, bytes)): - html_block += element + tree = None + try: + if is_xml: + # So that we can keep CDATA for cdata_in_document_to_text() to process + parser = etree.XMLParser(strip_cdata=False) + # For XML/RSS content, use etree.fromstring to properly handle XML declarations + tree = etree.fromstring(html_content.encode('utf-8') if isinstance(html_content, str) else html_content, parser=parser) else: - # Return the HTML which will get parsed as text - html_block += etree.tostring(element, 
pretty_print=True).decode('utf-8') + tree = html.fromstring(html_content, parser=parser) + html_block = "" - return html_block + # Build namespace map for XPath queries + namespaces = {'re': 'http://exslt.org/regular-expressions'} + + # NOTE: lxml's native xpath() does NOT support empty string prefix for default namespace + # For documents with default namespace (RSS/Atom feeds), users must use: + # - local-name(): //*[local-name()='title']/text() + # - Or use xpath_filter (not xpath1_filter) which supports default namespaces + # XPath spec: unprefixed element names have no namespace, not the default namespace + + r = tree.xpath(xpath_filter.strip(), namespaces=namespaces) + #@note: xpath1 (lxml) does NOT automatically handle default namespaces + #@note: Use //*[local-name()='element'] or switch to xpath_filter for default namespace support + #@note: //title/text() wont work where <title>CDATA.. (use cdata_in_document_to_text first) + + for element in r: + # When there's more than 1 match, then add the suffix to separate each line + # And where the matched result doesn't include something that will cause Inscriptis to add a newline + # (This way each 'match' reliably has a new-line in the diff) + # Divs are converted to 4 whitespaces by inscriptis + if append_pretty_line_formatting and len(html_block) and (not hasattr(element, 'tag') or not element.tag in (['br', 'hr', 'div', 'p'])): + html_block += TEXT_FILTER_LIST_LINE_SUFFIX + + # Some kind of text, UTF-8 or other + if isinstance(element, (str, bytes)): + html_block += element + else: + # Return the HTML/XML which will get parsed as text + # Use 'xml' method for RSS/XML content, 'html' for HTML content + # parser will be XMLParser if we detected XML content + method = 'xml' if (is_xml or isinstance(parser, etree.XMLParser)) else 'html' + html_block += etree.tostring(element, pretty_print=True, method=method, encoding='unicode') + + return html_block + finally: + # Explicitly clear the tree to free memory + # 
lxml trees can hold significant memory, especially with large documents + if tree is not None: + tree.clear() # Extract/find element def extract_element(find='title', html_content=''): diff --git a/changedetectionio/processors/magic.py b/changedetectionio/processors/magic.py index 2a0ef68f..9d9018d7 100644 --- a/changedetectionio/processors/magic.py +++ b/changedetectionio/processors/magic.py @@ -103,15 +103,15 @@ class guess_stream_type(): self.is_json = True elif 'pdf' in magic_content_header: self.is_pdf = True - elif has_html_patterns or http_content_header == 'text/html': - self.is_html = True - elif any(s in magic_content_header for s in JSON_CONTENT_TYPES): - self.is_json = True # magic will call a rss document 'xml' # Rarely do endpoints give the right header, usually just text/xml, so we check also for <rss # This also triggers the automatic CDATA text parser so the RSS goes back a nice content list elif '<rss' in test_content_normalized or '<feed' in test_content_normalized or any(s in magic_content_header for s in RSS_XML_CONTENT_TYPES) or '<rdf:' in test_content_normalized: self.is_rss = True + elif has_html_patterns or http_content_header == 'text/html': + self.is_html = True + elif any(s in magic_content_header for s in JSON_CONTENT_TYPES): + self.is_json = True elif any(s in http_content_header for s in XML_CONTENT_TYPES): # Only mark as generic XML if not already detected as RSS if not self.is_rss: diff --git a/changedetectionio/processors/text_json_diff/processor.py b/changedetectionio/processors/text_json_diff/processor.py index 62d749f7..2f15a55e 100644 --- a/changedetectionio/processors/text_json_diff/processor.py +++ b/changedetectionio/processors/text_json_diff/processor.py @@ -298,7 +298,7 @@ class ContentProcessor: xpath_filter=filter_rule.replace('xpath:', ''), html_content=content, append_pretty_line_formatting=not self.watch.is_source_type_url, - is_rss=stream_content_type.is_rss + is_xml=stream_content_type.is_rss or 
stream_content_type.is_xml ) # XPath1 filters (first match only) @@ -307,7 +307,7 @@ class ContentProcessor: xpath_filter=filter_rule.replace('xpath1:', ''), html_content=content, append_pretty_line_formatting=not self.watch.is_source_type_url, - is_rss=stream_content_type.is_rss + is_xml=stream_content_type.is_rss or stream_content_type.is_xml ) # JSON filters diff --git a/changedetectionio/tests/test_backend.py b/changedetectionio/tests/test_backend.py index 0fa74094..6023732b 100644 --- a/changedetectionio/tests/test_backend.py +++ b/changedetectionio/tests/test_backend.py @@ -405,7 +405,10 @@ def test_plaintext_even_if_xml_content_and_can_apply_filters(client, live_server follow_redirects=True ) - assert b'<string name="feed_update_receiver_name"' in res.data + # Check that the string element with the correct name attribute is present + # Note: namespace declarations may be included when extracting elements, which is correct XML behavior + assert b'feed_update_receiver_name' in res.data + assert b'Abonnementen bijwerken' in res.data assert b'<foobar' not in res.data res = client.get(url_for("ui.form_delete", uuid="all"), follow_redirects=True) diff --git a/changedetectionio/tests/test_xpath_default_namespace.py b/changedetectionio/tests/test_xpath_default_namespace.py index 10eda4ca..d22e1ac5 100644 --- a/changedetectionio/tests/test_xpath_default_namespace.py +++ b/changedetectionio/tests/test_xpath_default_namespace.py @@ -84,14 +84,14 @@ class TestXPathDefaultNamespace: def test_atom_feed_simple_xpath_with_xpath_filter(self): """Test that //title/text() works on Atom feed with default namespace using xpath_filter.""" - result = html_tools.xpath_filter('//title/text()', atom_feed_with_default_ns, is_rss=True) + result = html_tools.xpath_filter('//title/text()', atom_feed_with_default_ns, is_xml=True) assert 'Release notes from PowerToys' in result assert 'Release 0.95.1' in result assert 'Release v0.95.0' in result def 
test_atom_feed_nested_xpath_with_xpath_filter(self): """Test nested XPath like //entry/title/text() on Atom feed.""" - result = html_tools.xpath_filter('//entry/title/text()', atom_feed_with_default_ns, is_rss=True) + result = html_tools.xpath_filter('//entry/title/text()', atom_feed_with_default_ns, is_xml=True) assert 'Release 0.95.1' in result assert 'Release v0.95.0' in result # Should NOT include the feed title @@ -99,20 +99,20 @@ class TestXPathDefaultNamespace: def test_atom_feed_other_elements_with_xpath_filter(self): """Test that other elements like //updated/text() work on Atom feed.""" - result = html_tools.xpath_filter('//updated/text()', atom_feed_with_default_ns, is_rss=True) + result = html_tools.xpath_filter('//updated/text()', atom_feed_with_default_ns, is_xml=True) assert '2025-10-23T08:53:12Z' in result assert '2025-10-24T14:20:14Z' in result def test_rss_feed_without_namespace(self): """Test that //title/text() works on RSS feed without default namespace.""" - result = html_tools.xpath_filter('//title/text()', rss_feed_no_default_ns, is_rss=True) + result = html_tools.xpath_filter('//title/text()', rss_feed_no_default_ns, is_xml=True) assert 'Channel Title' in result assert 'Item 1 Title' in result assert 'Item 2 Title' in result def test_rss_feed_nested_xpath(self): """Test nested XPath on RSS feed without default namespace.""" - result = html_tools.xpath_filter('//item/title/text()', rss_feed_no_default_ns, is_rss=True) + result = html_tools.xpath_filter('//item/title/text()', rss_feed_no_default_ns, is_xml=True) assert 'Item 1 Title' in result assert 'Item 2 Title' in result # Should NOT include channel title @@ -120,31 +120,31 @@ class TestXPathDefaultNamespace: def test_rss_feed_with_prefixed_namespaces(self): """Test that feeds with namespace prefixes (not default) still work.""" - result = html_tools.xpath_filter('//title/text()', rss_feed_with_ns_prefix, is_rss=True) + result = html_tools.xpath_filter('//title/text()', 
rss_feed_with_ns_prefix, is_xml=True) assert 'Channel Title' in result assert 'Item Title' in result def test_local_name_workaround_still_works(self): """Test that local-name() workaround still works for Atom feeds.""" - result = html_tools.xpath_filter('//*[local-name()="title"]/text()', atom_feed_with_default_ns, is_rss=True) + result = html_tools.xpath_filter('//*[local-name()="title"]/text()', atom_feed_with_default_ns, is_xml=True) assert 'Release notes from PowerToys' in result assert 'Release 0.95.1' in result def test_xpath1_filter_without_default_namespace(self): """Test xpath1_filter works on RSS without default namespace.""" - result = html_tools.xpath1_filter('//title/text()', rss_feed_no_default_ns, is_rss=True) + result = html_tools.xpath1_filter('//title/text()', rss_feed_no_default_ns, is_xml=True) assert 'Channel Title' in result assert 'Item 1 Title' in result def test_xpath1_filter_with_default_namespace_returns_empty(self): """Test that xpath1_filter returns empty on Atom with default namespace (known limitation).""" - result = html_tools.xpath1_filter('//title/text()', atom_feed_with_default_ns, is_rss=True) + result = html_tools.xpath1_filter('//title/text()', atom_feed_with_default_ns, is_xml=True) # xpath1_filter (lxml) doesn't support default namespaces, so this returns empty assert result == '' def test_xpath1_filter_local_name_workaround(self): """Test that xpath1_filter works with local-name() workaround on Atom feeds.""" - result = html_tools.xpath1_filter('//*[local-name()="title"]/text()', atom_feed_with_default_ns, is_rss=True) + result = html_tools.xpath1_filter('//*[local-name()="title"]/text()', atom_feed_with_default_ns, is_xml=True) assert 'Release notes from PowerToys' in result assert 'Release 0.95.1' in result diff --git a/changedetectionio/tests/test_xpath_selector_unit.py b/changedetectionio/tests/test_xpath_selector_unit.py index b4dda080..a09a6539 100644 --- a/changedetectionio/tests/test_xpath_selector_unit.py +++ 
b/changedetectionio/tests/test_xpath_selector_unit.py @@ -201,3 +201,120 @@ def test_trips(html_content, xpath, answer): html_content = html_tools.xpath_filter(xpath, html_content, append_pretty_line_formatting=True) assert type(html_content) == str assert answer in html_content + + +# Test for UTF-8 encoding bug fix (issue #3658) +# Polish and other UTF-8 characters should be preserved correctly +polish_html = """<!DOCTYPE html> +<html> +<head><meta charset="utf-8"></head> +<body> +<div class="index--s-headline-link"> + <a class="index--s-headline-link" href="#"> + Naukowcy potwierdzają: oglądanie krótkich filmików prowadzi do "zgnilizny mózgu" + </a> +</div> +<div> + <a class="other-class" href="#"> + Test with Polish chars: żółć ąę śń + </a> +</div> +<div> + <p class="unicode-test">Cyrillic: Привет мир</p> + <p class="unicode-test">Greek: Γειά σου κόσμε</p> + <p class="unicode-test">Arabic: مرحبا بالعالم</p> + <p class="unicode-test">Chinese: 你好世界</p> + <p class="unicode-test">Japanese: こんにちは世界</p> + <p class="unicode-test">Emoji: 🌍🎉✨</p> +</div> +</body> +</html> +""" + + +@pytest.mark.parametrize("html_content", [polish_html]) +@pytest.mark.parametrize("xpath, expected_text", [ + # Test Polish characters in xpath_filter + ('//a[(contains(@class,"index--s-headline-link"))]', 'Naukowcy potwierdzają'), + ('//a[(contains(@class,"index--s-headline-link"))]', 'oglądanie krótkich filmików'), + ('//a[(contains(@class,"index--s-headline-link"))]', 'zgnilizny mózgu'), + ('//a[@class="other-class"]', 'żółć ąę śń'), + + # Test various Unicode scripts + ('//p[@class="unicode-test"]', 'Привет мир'), + ('//p[@class="unicode-test"]', 'Γειά σου κόσμε'), + ('//p[@class="unicode-test"]', 'مرحبا بالعالم'), + ('//p[@class="unicode-test"]', '你好世界'), + ('//p[@class="unicode-test"]', 'こんにちは世界'), + ('//p[@class="unicode-test"]', '🌍🎉✨'), + + # Test with text() extraction + ('//a[@class="other-class"]/text()', 'żółć'), +]) +def test_xpath_utf8_encoding(html_content, xpath, 
expected_text): + """Test that XPath filters preserve UTF-8 characters correctly (issue #3658)""" + result = html_tools.xpath_filter(xpath, html_content, append_pretty_line_formatting=False) + assert type(result) == str + assert expected_text in result + # Ensure characters are NOT HTML-entity encoded + # For example, 'ą' should NOT become 'ą' + assert '&#' not in result or expected_text in result + + +@pytest.mark.parametrize("html_content", [polish_html]) +@pytest.mark.parametrize("xpath, expected_text", [ + # Test Polish characters in xpath1_filter + ('//a[(contains(@class,"index--s-headline-link"))]', 'Naukowcy potwierdzają'), + ('//a[(contains(@class,"index--s-headline-link"))]', 'mózgu'), + ('//a[@class="other-class"]', 'żółć ąę śń'), + + # Test various Unicode scripts with xpath1 + ('//p[@class="unicode-test" and contains(text(), "Cyrillic")]', 'Привет мир'), + ('//p[@class="unicode-test" and contains(text(), "Greek")]', 'Γειά σου'), + ('//p[@class="unicode-test" and contains(text(), "Chinese")]', '你好世界'), +]) +def test_xpath1_utf8_encoding(html_content, xpath, expected_text): + """Test that XPath1 filters preserve UTF-8 characters correctly""" + result = html_tools.xpath1_filter(xpath, html_content, append_pretty_line_formatting=False) + assert type(result) == str + assert expected_text in result + # Ensure characters are NOT HTML-entity encoded + assert '&#' not in result or expected_text in result + + +# Test with real-world example from wyborcza.pl (issue #3658) +wyborcza_style_html = """<!DOCTYPE html> +<html lang="pl"> +<head><meta charset="utf-8"></head> +<body> +<div class="article-list"> + <a class="index--s-headline-link" href="/article1"> + Naukowcy potwierdzają: oglądanie krótkich filmików prowadzi do "zgnilizny mózgu" + </a> + <a class="index--s-headline-link" href="/article2"> + Zmiany klimatyczne wpływają na życie w miastach + </a> + <a class="index--s-headline-link" href="/article3"> + Łódź: Nowe inwestycje w infrastrukturę miejską + </a> 
+</div>
+</body>
+</html>
+"""
+
+
+def test_wyborcza_real_world_example():
+    """Test real-world case from wyborcza.pl that was failing (issue #3658)"""
+    xpath = '//a[(contains(@class,"index--s-headline-link"))]'
+    result = html_tools.xpath_filter(xpath, wyborcza_style_html, append_pretty_line_formatting=False)
+
+    # These exact strings should appear in the result
+    assert 'Naukowcy potwierdzają' in result
+    assert 'oglądanie krótkich filmików' in result
+    assert 'zgnilizny mózgu' in result
+    assert 'Łódź' in result
+
+    # Make sure they're NOT corrupted to mojibake like "potwierdzajÄ"
+    assert 'potwierdzajÄ' not in result
+    assert 'oglÄ…danie' not in result
+    assert 'mÃ³zgu' not in result