mirror of
https://github.com/dgtlmoon/changedetection.io.git
synced 2025-11-29 12:53:20 +00:00
Compare commits
7 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
39274f121c | ||
|
|
4b1d871078 | ||
|
|
f78c2dcffd | ||
|
|
1c2c22b8df | ||
|
|
3276a9347a | ||
|
|
d763bb4267 | ||
|
|
be3c9892e0 |
2
.github/workflows/codeql-analysis.yml
vendored
2
.github/workflows/codeql-analysis.yml
vendored
@@ -30,7 +30,7 @@ jobs:
|
||||
|
||||
steps:
|
||||
- name: Checkout repository
|
||||
uses: actions/checkout@v5
|
||||
uses: actions/checkout@v6
|
||||
|
||||
# Initializes the CodeQL tools for scanning.
|
||||
- name: Initialize CodeQL
|
||||
|
||||
2
.github/workflows/containers.yml
vendored
2
.github/workflows/containers.yml
vendored
@@ -39,7 +39,7 @@ jobs:
|
||||
# Or if we are in a tagged release scenario.
|
||||
if: ${{ github.event.workflow_run.conclusion == 'success' }} || ${{ github.event.release.tag_name }} != ''
|
||||
steps:
|
||||
- uses: actions/checkout@v5
|
||||
- uses: actions/checkout@v6
|
||||
- name: Set up Python 3.11
|
||||
uses: actions/setup-python@v6
|
||||
with:
|
||||
|
||||
2
.github/workflows/pypi-release.yml
vendored
2
.github/workflows/pypi-release.yml
vendored
@@ -7,7 +7,7 @@ jobs:
|
||||
runs-on: ubuntu-latest
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@v5
|
||||
- uses: actions/checkout@v6
|
||||
- name: Set up Python
|
||||
uses: actions/setup-python@v6
|
||||
with:
|
||||
|
||||
2
.github/workflows/test-container-build.yml
vendored
2
.github/workflows/test-container-build.yml
vendored
@@ -44,7 +44,7 @@ jobs:
|
||||
- platform: linux/arm64
|
||||
dockerfile: ./.github/test/Dockerfile-alpine
|
||||
steps:
|
||||
- uses: actions/checkout@v5
|
||||
- uses: actions/checkout@v6
|
||||
- name: Set up Python 3.11
|
||||
uses: actions/setup-python@v6
|
||||
with:
|
||||
|
||||
2
.github/workflows/test-only.yml
vendored
2
.github/workflows/test-only.yml
vendored
@@ -7,7 +7,7 @@ jobs:
|
||||
lint-code:
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- uses: actions/checkout@v5
|
||||
- uses: actions/checkout@v6
|
||||
- name: Lint with Ruff
|
||||
run: |
|
||||
pip install ruff
|
||||
|
||||
@@ -21,7 +21,7 @@ jobs:
|
||||
env:
|
||||
PYTHON_VERSION: ${{ inputs.python-version }}
|
||||
steps:
|
||||
- uses: actions/checkout@v5
|
||||
- uses: actions/checkout@v6
|
||||
|
||||
- name: Set up Python ${{ env.PYTHON_VERSION }}
|
||||
uses: actions/setup-python@v6
|
||||
@@ -66,7 +66,7 @@ jobs:
|
||||
env:
|
||||
PYTHON_VERSION: ${{ inputs.python-version }}
|
||||
steps:
|
||||
- uses: actions/checkout@v5
|
||||
- uses: actions/checkout@v6
|
||||
|
||||
- name: Download Docker image artifact
|
||||
uses: actions/download-artifact@v6
|
||||
@@ -93,7 +93,7 @@ jobs:
|
||||
env:
|
||||
PYTHON_VERSION: ${{ inputs.python-version }}
|
||||
steps:
|
||||
- uses: actions/checkout@v5
|
||||
- uses: actions/checkout@v6
|
||||
|
||||
- name: Download Docker image artifact
|
||||
uses: actions/download-artifact@v6
|
||||
@@ -132,7 +132,7 @@ jobs:
|
||||
env:
|
||||
PYTHON_VERSION: ${{ inputs.python-version }}
|
||||
steps:
|
||||
- uses: actions/checkout@v5
|
||||
- uses: actions/checkout@v6
|
||||
|
||||
- name: Download Docker image artifact
|
||||
uses: actions/download-artifact@v6
|
||||
@@ -174,7 +174,7 @@ jobs:
|
||||
env:
|
||||
PYTHON_VERSION: ${{ inputs.python-version }}
|
||||
steps:
|
||||
- uses: actions/checkout@v5
|
||||
- uses: actions/checkout@v6
|
||||
|
||||
- name: Download Docker image artifact
|
||||
uses: actions/download-artifact@v6
|
||||
@@ -214,7 +214,7 @@ jobs:
|
||||
env:
|
||||
PYTHON_VERSION: ${{ inputs.python-version }}
|
||||
steps:
|
||||
- uses: actions/checkout@v5
|
||||
- uses: actions/checkout@v6
|
||||
|
||||
- name: Download Docker image artifact
|
||||
uses: actions/download-artifact@v6
|
||||
@@ -250,7 +250,7 @@ jobs:
|
||||
env:
|
||||
PYTHON_VERSION: ${{ inputs.python-version }}
|
||||
steps:
|
||||
- uses: actions/checkout@v5
|
||||
- uses: actions/checkout@v6
|
||||
|
||||
- name: Download Docker image artifact
|
||||
uses: actions/download-artifact@v6
|
||||
@@ -279,7 +279,7 @@ jobs:
|
||||
env:
|
||||
PYTHON_VERSION: ${{ inputs.python-version }}
|
||||
steps:
|
||||
- uses: actions/checkout@v5
|
||||
- uses: actions/checkout@v6
|
||||
|
||||
- name: Download Docker image artifact
|
||||
uses: actions/download-artifact@v6
|
||||
@@ -319,7 +319,7 @@ jobs:
|
||||
env:
|
||||
PYTHON_VERSION: ${{ inputs.python-version }}
|
||||
steps:
|
||||
- uses: actions/checkout@v5
|
||||
- uses: actions/checkout@v6
|
||||
|
||||
- name: Download Docker image artifact
|
||||
uses: actions/download-artifact@v6
|
||||
@@ -350,7 +350,7 @@ jobs:
|
||||
env:
|
||||
PYTHON_VERSION: ${{ inputs.python-version }}
|
||||
steps:
|
||||
- uses: actions/checkout@v5
|
||||
- uses: actions/checkout@v6
|
||||
|
||||
- name: Download Docker image artifact
|
||||
uses: actions/download-artifact@v6
|
||||
@@ -395,7 +395,7 @@ jobs:
|
||||
env:
|
||||
PYTHON_VERSION: ${{ inputs.python-version }}
|
||||
steps:
|
||||
- uses: actions/checkout@v5
|
||||
- uses: actions/checkout@v6
|
||||
|
||||
- name: Download Docker image artifact
|
||||
uses: actions/download-artifact@v6
|
||||
|
||||
@@ -52,7 +52,7 @@ RUN --mount=type=cache,id=pip,sharing=locked,target=/tmp/pip-cache \
|
||||
--prefer-binary \
|
||||
--cache-dir=/tmp/pip-cache \
|
||||
--target=/dependencies \
|
||||
playwright~=1.48.0 \
|
||||
playwright~=1.56.0 \
|
||||
|| echo "WARN: Failed to install Playwright. The application can still run, but the Playwright option will be disabled."
|
||||
|
||||
|
||||
|
||||
@@ -2,7 +2,7 @@
|
||||
|
||||
# Read more https://github.com/dgtlmoon/changedetection.io/wiki
|
||||
# Semver means never use .01, or 00. Should be .1.
|
||||
__version__ = '0.51.2'
|
||||
__version__ = '0.51.4'
|
||||
|
||||
from changedetectionio.strtobool import strtobool
|
||||
from json.decoder import JSONDecodeError
|
||||
|
||||
@@ -439,7 +439,7 @@ class browsersteps_live_ui(steppable_browser_interface):
|
||||
logger.warning("Attempted to get current state after cleanup")
|
||||
return (None, None)
|
||||
|
||||
xpath_element_js = importlib.resources.files("changedetectionio.content_fetchers.res").joinpath('xpath_element_scraper.js').read_text()
|
||||
xpath_element_js = importlib.resources.files("changedetectionio.content_fetchers.res").joinpath('xpath_element_scraper.js').read_text(encoding="utf-8")
|
||||
|
||||
now = time.time()
|
||||
await self.page.wait_for_timeout(1 * 1000)
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
from loguru import logger
|
||||
import hashlib
|
||||
import os
|
||||
import re
|
||||
import asyncio
|
||||
from changedetectionio import strtobool
|
||||
from changedetectionio.content_fetchers.exceptions import BrowserStepsInUnsupportedFetcher, EmptyReply, Non200ErrorCodeReceived
|
||||
@@ -76,9 +77,22 @@ class fetcher(Fetcher):
|
||||
if not is_binary:
|
||||
# Don't run this for PDF (and requests identified as binary) takes a _long_ time
|
||||
if not r.headers.get('content-type') or not 'charset=' in r.headers.get('content-type'):
|
||||
encoding = chardet.detect(r.content)['encoding']
|
||||
if encoding:
|
||||
r.encoding = encoding
|
||||
# For XML/RSS feeds, check the XML declaration for encoding attribute
|
||||
# This is more reliable than chardet which can misdetect UTF-8 as MacRoman
|
||||
content_type = r.headers.get('content-type', '').lower()
|
||||
if 'xml' in content_type or 'rss' in content_type:
|
||||
# Look for <?xml version="1.0" encoding="UTF-8"?>
|
||||
xml_encoding_match = re.search(rb'<\?xml[^>]+encoding=["\']([^"\']+)["\']', r.content[:200])
|
||||
if xml_encoding_match:
|
||||
r.encoding = xml_encoding_match.group(1).decode('ascii')
|
||||
else:
|
||||
# Default to UTF-8 for XML if no encoding found
|
||||
r.encoding = 'utf-8'
|
||||
else:
|
||||
# For other content types, use chardet
|
||||
encoding = chardet.detect(r.content)['encoding']
|
||||
if encoding:
|
||||
r.encoding = encoding
|
||||
|
||||
self.headers = r.headers
|
||||
|
||||
|
||||
@@ -172,99 +172,131 @@ def elementpath_tostring(obj):
|
||||
return str(obj)
|
||||
|
||||
# Return str Utf-8 of matched rules
|
||||
def xpath_filter(xpath_filter, html_content, append_pretty_line_formatting=False, is_rss=False):
|
||||
def xpath_filter(xpath_filter, html_content, append_pretty_line_formatting=False, is_xml=False):
|
||||
"""
|
||||
|
||||
:param xpath_filter:
|
||||
:param html_content:
|
||||
:param append_pretty_line_formatting:
|
||||
:param is_xml: set to true if is XML or is RSS (RSS is XML)
|
||||
:return:
|
||||
"""
|
||||
from lxml import etree, html
|
||||
import elementpath
|
||||
# xpath 2.0-3.1
|
||||
from elementpath.xpath3 import XPath3Parser
|
||||
|
||||
parser = etree.HTMLParser()
|
||||
if is_rss:
|
||||
# So that we can keep CDATA for cdata_in_document_to_text() to process
|
||||
parser = etree.XMLParser(strip_cdata=False)
|
||||
|
||||
tree = html.fromstring(bytes(html_content, encoding='utf-8'), parser=parser)
|
||||
html_block = ""
|
||||
|
||||
# Build namespace map for XPath queries
|
||||
namespaces = {'re': 'http://exslt.org/regular-expressions'}
|
||||
|
||||
# Handle default namespace in documents (common in RSS/Atom feeds, but can occur in any XML)
|
||||
# XPath spec: unprefixed element names have no namespace, not the default namespace
|
||||
# Solution: Register the default namespace with empty string prefix in elementpath
|
||||
# This is primarily for RSS/Atom feeds but works for any XML with default namespace
|
||||
if hasattr(tree, 'nsmap') and tree.nsmap and None in tree.nsmap:
|
||||
# Register the default namespace with empty string prefix for elementpath
|
||||
# This allows //title to match elements in the default namespace
|
||||
namespaces[''] = tree.nsmap[None]
|
||||
|
||||
r = elementpath.select(tree, xpath_filter.strip(), namespaces=namespaces, parser=XPath3Parser)
|
||||
#@note: //title/text() now works with default namespaces (fixed by registering '' prefix)
|
||||
#@note: //title/text() wont work where <title>CDATA.. (use cdata_in_document_to_text first)
|
||||
|
||||
if type(r) != list:
|
||||
r = [r]
|
||||
|
||||
for element in r:
|
||||
# When there's more than 1 match, then add the suffix to separate each line
|
||||
# And where the matched result doesn't include something that will cause Inscriptis to add a newline
|
||||
# (This way each 'match' reliably has a new-line in the diff)
|
||||
# Divs are converted to 4 whitespaces by inscriptis
|
||||
if append_pretty_line_formatting and len(html_block) and (not hasattr( element, 'tag' ) or not element.tag in (['br', 'hr', 'div', 'p'])):
|
||||
html_block += TEXT_FILTER_LIST_LINE_SUFFIX
|
||||
|
||||
if type(element) == str:
|
||||
html_block += element
|
||||
elif issubclass(type(element), etree._Element) or issubclass(type(element), etree._ElementTree):
|
||||
html_block += etree.tostring(element, pretty_print=True).decode('utf-8')
|
||||
tree = None
|
||||
try:
|
||||
if is_xml:
|
||||
# So that we can keep CDATA for cdata_in_document_to_text() to process
|
||||
parser = etree.XMLParser(strip_cdata=False)
|
||||
# For XML/RSS content, use etree.fromstring to properly handle XML declarations
|
||||
tree = etree.fromstring(html_content.encode('utf-8') if isinstance(html_content, str) else html_content, parser=parser)
|
||||
else:
|
||||
html_block += elementpath_tostring(element)
|
||||
tree = html.fromstring(html_content, parser=parser)
|
||||
html_block = ""
|
||||
|
||||
return html_block
|
||||
# Build namespace map for XPath queries
|
||||
namespaces = {'re': 'http://exslt.org/regular-expressions'}
|
||||
|
||||
# Handle default namespace in documents (common in RSS/Atom feeds, but can occur in any XML)
|
||||
# XPath spec: unprefixed element names have no namespace, not the default namespace
|
||||
# Solution: Register the default namespace with empty string prefix in elementpath
|
||||
# This is primarily for RSS/Atom feeds but works for any XML with default namespace
|
||||
if hasattr(tree, 'nsmap') and tree.nsmap and None in tree.nsmap:
|
||||
# Register the default namespace with empty string prefix for elementpath
|
||||
# This allows //title to match elements in the default namespace
|
||||
namespaces[''] = tree.nsmap[None]
|
||||
|
||||
r = elementpath.select(tree, xpath_filter.strip(), namespaces=namespaces, parser=XPath3Parser)
|
||||
#@note: //title/text() now works with default namespaces (fixed by registering '' prefix)
|
||||
#@note: //title/text() wont work where <title>CDATA.. (use cdata_in_document_to_text first)
|
||||
|
||||
if type(r) != list:
|
||||
r = [r]
|
||||
|
||||
for element in r:
|
||||
# When there's more than 1 match, then add the suffix to separate each line
|
||||
# And where the matched result doesn't include something that will cause Inscriptis to add a newline
|
||||
# (This way each 'match' reliably has a new-line in the diff)
|
||||
# Divs are converted to 4 whitespaces by inscriptis
|
||||
if append_pretty_line_formatting and len(html_block) and (not hasattr( element, 'tag' ) or not element.tag in (['br', 'hr', 'div', 'p'])):
|
||||
html_block += TEXT_FILTER_LIST_LINE_SUFFIX
|
||||
|
||||
if type(element) == str:
|
||||
html_block += element
|
||||
elif issubclass(type(element), etree._Element) or issubclass(type(element), etree._ElementTree):
|
||||
# Use 'xml' method for RSS/XML content, 'html' for HTML content
|
||||
# parser will be XMLParser if we detected XML content
|
||||
method = 'xml' if (is_xml or isinstance(parser, etree.XMLParser)) else 'html'
|
||||
html_block += etree.tostring(element, pretty_print=True, method=method, encoding='unicode')
|
||||
else:
|
||||
html_block += elementpath_tostring(element)
|
||||
|
||||
return html_block
|
||||
finally:
|
||||
# Explicitly clear the tree to free memory
|
||||
# lxml trees can hold significant memory, especially with large documents
|
||||
if tree is not None:
|
||||
tree.clear()
|
||||
|
||||
# Return str Utf-8 of matched rules
|
||||
# 'xpath1:'
|
||||
def xpath1_filter(xpath_filter, html_content, append_pretty_line_formatting=False, is_rss=False):
|
||||
def xpath1_filter(xpath_filter, html_content, append_pretty_line_formatting=False, is_xml=False):
|
||||
from lxml import etree, html
|
||||
|
||||
parser = None
|
||||
if is_rss:
|
||||
# So that we can keep CDATA for cdata_in_document_to_text() to process
|
||||
parser = etree.XMLParser(strip_cdata=False)
|
||||
|
||||
tree = html.fromstring(bytes(html_content, encoding='utf-8'), parser=parser)
|
||||
html_block = ""
|
||||
|
||||
# Build namespace map for XPath queries
|
||||
namespaces = {'re': 'http://exslt.org/regular-expressions'}
|
||||
|
||||
# NOTE: lxml's native xpath() does NOT support empty string prefix for default namespace
|
||||
# For documents with default namespace (RSS/Atom feeds), users must use:
|
||||
# - local-name(): //*[local-name()='title']/text()
|
||||
# - Or use xpath_filter (not xpath1_filter) which supports default namespaces
|
||||
# XPath spec: unprefixed element names have no namespace, not the default namespace
|
||||
|
||||
r = tree.xpath(xpath_filter.strip(), namespaces=namespaces)
|
||||
#@note: xpath1 (lxml) does NOT automatically handle default namespaces
|
||||
#@note: Use //*[local-name()='element'] or switch to xpath_filter for default namespace support
|
||||
#@note: //title/text() wont work where <title>CDATA.. (use cdata_in_document_to_text first)
|
||||
|
||||
for element in r:
|
||||
# When there's more than 1 match, then add the suffix to separate each line
|
||||
# And where the matched result doesn't include something that will cause Inscriptis to add a newline
|
||||
# (This way each 'match' reliably has a new-line in the diff)
|
||||
# Divs are converted to 4 whitespaces by inscriptis
|
||||
if append_pretty_line_formatting and len(html_block) and (not hasattr(element, 'tag') or not element.tag in (['br', 'hr', 'div', 'p'])):
|
||||
html_block += TEXT_FILTER_LIST_LINE_SUFFIX
|
||||
|
||||
# Some kind of text, UTF-8 or other
|
||||
if isinstance(element, (str, bytes)):
|
||||
html_block += element
|
||||
tree = None
|
||||
try:
|
||||
if is_xml:
|
||||
# So that we can keep CDATA for cdata_in_document_to_text() to process
|
||||
parser = etree.XMLParser(strip_cdata=False)
|
||||
# For XML/RSS content, use etree.fromstring to properly handle XML declarations
|
||||
tree = etree.fromstring(html_content.encode('utf-8') if isinstance(html_content, str) else html_content, parser=parser)
|
||||
else:
|
||||
# Return the HTML which will get parsed as text
|
||||
html_block += etree.tostring(element, pretty_print=True).decode('utf-8')
|
||||
tree = html.fromstring(html_content, parser=parser)
|
||||
html_block = ""
|
||||
|
||||
return html_block
|
||||
# Build namespace map for XPath queries
|
||||
namespaces = {'re': 'http://exslt.org/regular-expressions'}
|
||||
|
||||
# NOTE: lxml's native xpath() does NOT support empty string prefix for default namespace
|
||||
# For documents with default namespace (RSS/Atom feeds), users must use:
|
||||
# - local-name(): //*[local-name()='title']/text()
|
||||
# - Or use xpath_filter (not xpath1_filter) which supports default namespaces
|
||||
# XPath spec: unprefixed element names have no namespace, not the default namespace
|
||||
|
||||
r = tree.xpath(xpath_filter.strip(), namespaces=namespaces)
|
||||
#@note: xpath1 (lxml) does NOT automatically handle default namespaces
|
||||
#@note: Use //*[local-name()='element'] or switch to xpath_filter for default namespace support
|
||||
#@note: //title/text() wont work where <title>CDATA.. (use cdata_in_document_to_text first)
|
||||
|
||||
for element in r:
|
||||
# When there's more than 1 match, then add the suffix to separate each line
|
||||
# And where the matched result doesn't include something that will cause Inscriptis to add a newline
|
||||
# (This way each 'match' reliably has a new-line in the diff)
|
||||
# Divs are converted to 4 whitespaces by inscriptis
|
||||
if append_pretty_line_formatting and len(html_block) and (not hasattr(element, 'tag') or not element.tag in (['br', 'hr', 'div', 'p'])):
|
||||
html_block += TEXT_FILTER_LIST_LINE_SUFFIX
|
||||
|
||||
# Some kind of text, UTF-8 or other
|
||||
if isinstance(element, (str, bytes)):
|
||||
html_block += element
|
||||
else:
|
||||
# Return the HTML/XML which will get parsed as text
|
||||
# Use 'xml' method for RSS/XML content, 'html' for HTML content
|
||||
# parser will be XMLParser if we detected XML content
|
||||
method = 'xml' if (is_xml or isinstance(parser, etree.XMLParser)) else 'html'
|
||||
html_block += etree.tostring(element, pretty_print=True, method=method, encoding='unicode')
|
||||
|
||||
return html_block
|
||||
finally:
|
||||
# Explicitly clear the tree to free memory
|
||||
# lxml trees can hold significant memory, especially with large documents
|
||||
if tree is not None:
|
||||
tree.clear()
|
||||
|
||||
# Extract/find element
|
||||
def extract_element(find='title', html_content=''):
|
||||
|
||||
@@ -103,15 +103,15 @@ class guess_stream_type():
|
||||
self.is_json = True
|
||||
elif 'pdf' in magic_content_header:
|
||||
self.is_pdf = True
|
||||
elif has_html_patterns or http_content_header == 'text/html':
|
||||
self.is_html = True
|
||||
elif any(s in magic_content_header for s in JSON_CONTENT_TYPES):
|
||||
self.is_json = True
|
||||
# magic will call a rss document 'xml'
|
||||
# Rarely do endpoints give the right header, usually just text/xml, so we check also for <rss
|
||||
# This also triggers the automatic CDATA text parser so the RSS goes back a nice content list
|
||||
elif '<rss' in test_content_normalized or '<feed' in test_content_normalized or any(s in magic_content_header for s in RSS_XML_CONTENT_TYPES) or '<rdf:' in test_content_normalized:
|
||||
self.is_rss = True
|
||||
elif has_html_patterns or http_content_header == 'text/html':
|
||||
self.is_html = True
|
||||
elif any(s in magic_content_header for s in JSON_CONTENT_TYPES):
|
||||
self.is_json = True
|
||||
elif any(s in http_content_header for s in XML_CONTENT_TYPES):
|
||||
# Only mark as generic XML if not already detected as RSS
|
||||
if not self.is_rss:
|
||||
|
||||
@@ -298,7 +298,7 @@ class ContentProcessor:
|
||||
xpath_filter=filter_rule.replace('xpath:', ''),
|
||||
html_content=content,
|
||||
append_pretty_line_formatting=not self.watch.is_source_type_url,
|
||||
is_rss=stream_content_type.is_rss
|
||||
is_xml=stream_content_type.is_rss or stream_content_type.is_xml
|
||||
)
|
||||
|
||||
# XPath1 filters (first match only)
|
||||
@@ -307,7 +307,7 @@ class ContentProcessor:
|
||||
xpath_filter=filter_rule.replace('xpath1:', ''),
|
||||
html_content=content,
|
||||
append_pretty_line_formatting=not self.watch.is_source_type_url,
|
||||
is_rss=stream_content_type.is_rss
|
||||
is_xml=stream_content_type.is_rss or stream_content_type.is_xml
|
||||
)
|
||||
|
||||
# JSON filters
|
||||
|
||||
@@ -29,16 +29,135 @@ def cdata_in_document_to_text(html_content: str, render_anchor_tag_content=False
|
||||
return re.sub(pattern, repl, html_content)
|
||||
|
||||
|
||||
# Jinja2 template for formatting RSS/Atom feed entries
|
||||
# Covers all common feedparser entry fields including namespaced elements
|
||||
# Outputs HTML that will be converted to text via html_to_text
|
||||
# @todo - This could be a UI setting in the future
|
||||
RSS_ENTRY_TEMPLATE = """<article class="rss-item" id="{{ entry.id|replace('"', '')|replace(' ', '-') }}">{%- if entry.title -%}Title: {{ entry.title }}<br>{%- endif -%}
|
||||
{%- if entry.link -%}<strong>Link:</strong> <a href="{{ entry.link }}">{{ entry.link }}</a><br>
|
||||
{%- endif -%}
|
||||
{%- if entry.id -%}
|
||||
<strong>Guid:</strong> {{ entry.id }}<br>
|
||||
{%- endif -%}
|
||||
{%- if entry.published -%}
|
||||
<strong>PubDate:</strong> {{ entry.published }}<br>
|
||||
{%- endif -%}
|
||||
{%- if entry.updated and entry.updated != entry.published -%}
|
||||
<strong>Updated:</strong> {{ entry.updated }}<br>
|
||||
{%- endif -%}
|
||||
{%- if entry.author -%}
|
||||
<strong>Author:</strong> {{ entry.author }}<br>
|
||||
{%- elif entry.author_detail and entry.author_detail.name -%}
|
||||
<strong>Author:</strong> {{ entry.author_detail.name }}
|
||||
{%- if entry.author_detail.email %} ({{ entry.author_detail.email }}){% endif -%}
|
||||
<br>
|
||||
{%- endif -%}
|
||||
{%- if entry.contributors -%}
|
||||
<strong>Contributors:</strong> {% for contributor in entry.contributors -%}
|
||||
{{ contributor.name if contributor.name else contributor }}
|
||||
{%- if not loop.last %}, {% endif -%}
|
||||
{%- endfor %}<br>
|
||||
{%- endif -%}
|
||||
{%- if entry.publisher -%}
|
||||
<strong>Publisher:</strong> {{ entry.publisher }}<br>
|
||||
{%- endif -%}
|
||||
{%- if entry.rights -%}
|
||||
<strong>Rights:</strong> {{ entry.rights }}<br>
|
||||
{%- endif -%}
|
||||
{%- if entry.license -%}
|
||||
<strong>License:</strong> {{ entry.license }}<br>
|
||||
{%- endif -%}
|
||||
{%- if entry.language -%}
|
||||
<strong>Language:</strong> {{ entry.language }}<br>
|
||||
{%- endif -%}
|
||||
{%- if entry.tags -%}
|
||||
<strong>Tags:</strong> {% for tag in entry.tags -%}
|
||||
{{ tag.term if tag.term else tag }}
|
||||
{%- if not loop.last %}, {% endif -%}
|
||||
{%- endfor %}<br>
|
||||
{%- endif -%}
|
||||
{%- if entry.category -%}
|
||||
<strong>Category:</strong> {{ entry.category }}<br>
|
||||
{%- endif -%}
|
||||
{%- if entry.comments -%}
|
||||
<strong>Comments:</strong> <a href="{{ entry.comments }}">{{ entry.comments }}</a><br>
|
||||
{%- endif -%}
|
||||
{%- if entry.slash_comments -%}
|
||||
<strong>Comment Count:</strong> {{ entry.slash_comments }}<br>
|
||||
{%- endif -%}
|
||||
{%- if entry.enclosures -%}
|
||||
<strong>Enclosures:</strong><br>
|
||||
{%- for enclosure in entry.enclosures %}
|
||||
- <a href="{{ enclosure.href }}">{{ enclosure.href }}</a> ({{ enclosure.type if enclosure.type else 'unknown type' }}
|
||||
{%- if enclosure.length %}, {{ enclosure.length }} bytes{% endif -%}
|
||||
)<br>
|
||||
{%- endfor -%}
|
||||
{%- endif -%}
|
||||
{%- if entry.media_content -%}
|
||||
<strong>Media:</strong><br>
|
||||
{%- for media in entry.media_content %}
|
||||
- <a href="{{ media.url }}">{{ media.url }}</a>
|
||||
{%- if media.type %} ({{ media.type }}){% endif -%}
|
||||
{%- if media.width and media.height %} {{ media.width }}x{{ media.height }}{% endif -%}
|
||||
<br>
|
||||
{%- endfor -%}
|
||||
{%- endif -%}
|
||||
{%- if entry.media_thumbnail -%}
|
||||
<strong>Thumbnail:</strong> <a href="{{ entry.media_thumbnail[0].url if entry.media_thumbnail[0].url else entry.media_thumbnail[0] }}">{{ entry.media_thumbnail[0].url if entry.media_thumbnail[0].url else entry.media_thumbnail[0] }}</a><br>
|
||||
{%- endif -%}
|
||||
{%- if entry.media_description -%}
|
||||
<strong>Media Description:</strong> {{ entry.media_description }}<br>
|
||||
{%- endif -%}
|
||||
{%- if entry.itunes_duration -%}
|
||||
<strong>Duration:</strong> {{ entry.itunes_duration }}<br>
|
||||
{%- endif -%}
|
||||
{%- if entry.itunes_author -%}
|
||||
<strong>Podcast Author:</strong> {{ entry.itunes_author }}<br>
|
||||
{%- endif -%}
|
||||
{%- if entry.dc_identifier -%}
|
||||
<strong>Identifier:</strong> {{ entry.dc_identifier }}<br>
|
||||
{%- endif -%}
|
||||
{%- if entry.dc_source -%}
|
||||
<strong>DC Source:</strong> {{ entry.dc_source }}<br>
|
||||
{%- endif -%}
|
||||
{%- if entry.dc_type -%}
|
||||
<strong>Type:</strong> {{ entry.dc_type }}<br>
|
||||
{%- endif -%}
|
||||
{%- if entry.dc_format -%}
|
||||
<strong>Format:</strong> {{ entry.dc_format }}<br>
|
||||
{%- endif -%}
|
||||
{%- if entry.dc_relation -%}
|
||||
<strong>Related:</strong> {{ entry.dc_relation }}<br>
|
||||
{%- endif -%}
|
||||
{%- if entry.dc_coverage -%}
|
||||
<strong>Coverage:</strong> {{ entry.dc_coverage }}<br>
|
||||
{%- endif -%}
|
||||
{%- if entry.source and entry.source.title -%}
|
||||
<strong>Source:</strong> {{ entry.source.title }}
|
||||
{%- if entry.source.link %} (<a href="{{ entry.source.link }}">{{ entry.source.link }}</a>){% endif -%}
|
||||
<br>
|
||||
{%- endif -%}
|
||||
{%- if entry.dc_content -%}
|
||||
<strong>Content:</strong> {{ entry.dc_content | safe }}
|
||||
{%- elif entry.content and entry.content[0].value -%}
|
||||
<strong>Content:</strong> {{ entry.content[0].value | safe }}
|
||||
{%- elif entry.summary -%}
|
||||
<strong>Summary:</strong> {{ entry.summary | safe }}
|
||||
{%- endif -%}</article>
|
||||
"""
|
||||
|
||||
|
||||
def format_rss_items(rss_content: str, render_anchor_tag_content=False) -> str:
|
||||
"""
|
||||
Format RSS/Atom feed items in a readable text format using feedparser.
|
||||
Format RSS/Atom feed items in a readable text format using feedparser and Jinja2.
|
||||
|
||||
Converts RSS <item> or Atom <entry> elements to formatted text with:
|
||||
- <title> → <h1>Title</h1>
|
||||
- <link> → Link: [url]
|
||||
- <guid> → Guid: [id]
|
||||
- <pubDate> → PubDate: [date]
|
||||
- <description> or <content> → Raw HTML content (CDATA and entities automatically handled)
|
||||
Converts RSS <item> or Atom <entry> elements to formatted text with all available fields:
|
||||
- Basic fields: title, link, id/guid, published date, updated date
|
||||
- Author fields: author, author_detail, contributors, publisher
|
||||
- Content fields: content, summary, description
|
||||
- Metadata: tags, category, rights, license
|
||||
- Media: enclosures, media_content, media_thumbnail
|
||||
- Dublin Core elements: dc:creator, dc:date, dc:publisher, etc. (mapped by feedparser)
|
||||
|
||||
Args:
|
||||
rss_content: The RSS/Atom feed content
|
||||
@@ -49,65 +168,19 @@ def format_rss_items(rss_content: str, render_anchor_tag_content=False) -> str:
|
||||
"""
|
||||
try:
|
||||
import feedparser
|
||||
from xml.sax.saxutils import escape as xml_escape
|
||||
from changedetectionio.jinja2_custom import safe_jinja
|
||||
|
||||
# Parse the feed - feedparser handles all RSS/Atom variants, CDATA, entity unescaping, etc.
|
||||
feed = feedparser.parse(rss_content)
|
||||
|
||||
formatted_items = []
|
||||
|
||||
# Determine feed type for appropriate labels when fields are missing
|
||||
# feedparser sets feed.version to things like 'rss20', 'atom10', etc.
|
||||
# Determine feed type for appropriate labels
|
||||
is_atom = feed.version and 'atom' in feed.version
|
||||
|
||||
formatted_items = []
|
||||
for entry in feed.entries:
|
||||
item_parts = []
|
||||
|
||||
# Title - feedparser handles CDATA and entity unescaping automatically
|
||||
if hasattr(entry, 'title') and entry.title:
|
||||
item_parts.append(f'<h1>{xml_escape(entry.title)}</h1>')
|
||||
|
||||
# Link
|
||||
if hasattr(entry, 'link') and entry.link:
|
||||
item_parts.append(f'Link: {xml_escape(entry.link)}<br>')
|
||||
|
||||
# GUID/ID
|
||||
if hasattr(entry, 'id') and entry.id:
|
||||
item_parts.append(f'Guid: {xml_escape(entry.id)}<br>')
|
||||
|
||||
# Date - feedparser normalizes all date field names to 'published'
|
||||
if hasattr(entry, 'published') and entry.published:
|
||||
item_parts.append(f'PubDate: {xml_escape(entry.published)}<br>')
|
||||
|
||||
# Description/Content - feedparser handles CDATA and entity unescaping automatically
|
||||
# Only add "Summary:" label for Atom <summary> tags
|
||||
content = None
|
||||
add_label = False
|
||||
|
||||
if hasattr(entry, 'content') and entry.content:
|
||||
# Atom <content> - no label, just content
|
||||
content = entry.content[0].value if entry.content[0].value else None
|
||||
elif hasattr(entry, 'summary'):
|
||||
# Could be RSS <description> or Atom <summary>
|
||||
# feedparser maps both to entry.summary
|
||||
content = entry.summary if entry.summary else None
|
||||
# Only add "Summary:" label for Atom feeds (which use <summary> tag)
|
||||
if is_atom:
|
||||
add_label = True
|
||||
|
||||
# Add content with or without label
|
||||
if content:
|
||||
if add_label:
|
||||
item_parts.append(f'Summary:<br>{content}')
|
||||
else:
|
||||
item_parts.append(content)
|
||||
else:
|
||||
# No content - just show <none>
|
||||
item_parts.append('<none>')
|
||||
|
||||
# Join all parts of this item
|
||||
if item_parts:
|
||||
formatted_items.append('\n'.join(item_parts))
|
||||
# Render the entry using Jinja2 template
|
||||
rendered = safe_jinja.render(RSS_ENTRY_TEMPLATE, entry=entry, is_atom=is_atom)
|
||||
formatted_items.append(rendered.strip())
|
||||
|
||||
# Wrap each item in a div with classes (first, last, item-N)
|
||||
items_html = []
|
||||
@@ -122,7 +195,8 @@ def format_rss_items(rss_content: str, render_anchor_tag_content=False) -> str:
|
||||
|
||||
class_str = ' '.join(classes)
|
||||
items_html.append(f'<div class="{class_str}">{item}</div>')
|
||||
return '<html><body>\n'+"\n<br><br>".join(items_html)+'\n</body></html>'
|
||||
|
||||
return '<html><body>\n' + "\n<br>".join(items_html) + '\n</body></html>'
|
||||
|
||||
except Exception as e:
|
||||
logger.warning(f"Error formatting RSS items: {str(e)}")
|
||||
|
||||
@@ -405,7 +405,10 @@ def test_plaintext_even_if_xml_content_and_can_apply_filters(client, live_server
|
||||
follow_redirects=True
|
||||
)
|
||||
|
||||
assert b'<string name="feed_update_receiver_name"' in res.data
|
||||
# Check that the string element with the correct name attribute is present
|
||||
# Note: namespace declarations may be included when extracting elements, which is correct XML behavior
|
||||
assert b'feed_update_receiver_name' in res.data
|
||||
assert b'Abonnementen bijwerken' in res.data
|
||||
assert b'<foobar' not in res.data
|
||||
|
||||
res = client.get(url_for("ui.form_delete", uuid="all"), follow_redirects=True)
|
||||
|
||||
@@ -7,6 +7,61 @@ from flask import url_for
|
||||
from .util import set_original_response, set_modified_response, live_server_setup, wait_for_all_checks, extract_rss_token_from_UI, \
|
||||
extract_UUID_from_client, delete_all_watches
|
||||
|
||||
def set_xmlns_purl_content(datastore_path, extra=""):
|
||||
data=f"""<rss xmlns:content="http://purl.org/rss/1.0/modules/content/" xmlns:dc="https://purl.org/dc/elements/1.1/" xmlns:media="http://search.yahoo.com/mrss/" xmlns:atom="http://www.w3.org/2005/Atom" version="2.0">
|
||||
<channel>
|
||||
<atom:link href="https://www.xxxxxxxtechxxxxx.com/feeds.xml" rel="self" type="application/rss+xml"/>
|
||||
<title>
|
||||
<![CDATA[ Latest from xxxxxxxtechxxxxx ]]>
|
||||
</title>
|
||||
<link>https://www.xxxxx.com</link>
|
||||
<description>
|
||||
<![CDATA[ All the latest content from the xxxxxxxtechxxxxx team ]]>
|
||||
</description>
|
||||
<lastBuildDate>Wed, 19 Nov 2025 15:00:00 +0000</lastBuildDate>
|
||||
<language>en</language>
|
||||
<item>
|
||||
<title>
|
||||
<![CDATA[ Sony Xperia 1 VII review: has Sony’s long-standing Xperia family lost what it takes to compete? ]]>
|
||||
</title>
|
||||
<dc:content>
|
||||
<![CDATA[ {{extra}} a little harder, dc-content. blue often quite tough and purple usually very difficult.</p><p>On the plus side, you don't technically need to solve the final one, as you'll be able to answer that one by a process of elimination. What's more, you can make up to four mistakes, which gives you a little bit of breathing room.</p><p>It's a little more involved than something like Wordle, however, and there are plenty of opportunities for the game to trip you up with tricks. For instance, watch out for homophones and other word games that could disguise the answers.</p><p>It's playable for free via the <a href="https://www.nytimes.com/games/strands" target="_blank">NYT Games site</a> on desktop or mobile.</p></article></section> ]]>
|
||||
</dc:content>
|
||||
<link>https://www.xxxxxxx.com/gaming/nyt-connections-today-answers-hints-20-november-2025</link>
|
||||
<description>
|
||||
<![CDATA[ Looking for NYT Connections answers and hints? Here's all you need to know to solve today's game, plus my commentary on the puzzles. ]]>
|
||||
</description>
|
||||
<guid isPermaLink="false">N2C2T6DztpWdxSdKpSUx89</guid>
|
||||
<enclosure url="https://cdn.mos.cms.futurecdn.net/RCGfdf3yhQ9W3MHbTRT6yk-1280-80.jpg" type="image/jpeg" length="0"/>
|
||||
<pubDate>Wed, 19 Nov 2025 15:00:00 +0000</pubDate>
|
||||
<category>
|
||||
<![CDATA[ Gaming ]]>
|
||||
</category>
|
||||
<dc:creator>
|
||||
<![CDATA[ Johnny Dee ]]>
|
||||
</dc:creator>
|
||||
<media:content type="image/jpeg" url="https://cdn.mos.cms.futurecdn.net/RCGfdf3yhQ9W3MHbTRT6yk-1280-80.jpg">
|
||||
<media:credit>
|
||||
<![CDATA[ New York Times ]]>
|
||||
</media:credit>
|
||||
<media:text>
|
||||
<![CDATA[ NYT Connections homescreen on a phone, on a purple background ]]>
|
||||
</media:text>
|
||||
<media:title type="plain">
|
||||
<![CDATA[ NYT Connections homescreen on a phone, on a purple background ]]>
|
||||
</media:title>
|
||||
</media:content>
|
||||
<media:thumbnail url="https://cdn.mos.cms.futurecdn.net/RCGfdf3yhQ9W3MHbTRT6yk-1280-80.jpg"/>
|
||||
</item>
|
||||
</channel>
|
||||
</rss>
|
||||
"""
|
||||
|
||||
with open(os.path.join(datastore_path, "endpoint-content.txt"), "w") as f:
|
||||
f.write(data)
|
||||
|
||||
|
||||
|
||||
|
||||
def set_original_cdata_xml(datastore_path):
|
||||
test_return_data = """<rss xmlns:atom="http://www.w3.org/2005/Atom" version="2.0">
|
||||
@@ -98,3 +153,26 @@ def test_rss_reader_mode_with_css_filters(client, live_server, measure_memory_us
|
||||
assert 'The days of Terminator and The Matrix' in snapshot_contents
|
||||
delete_all_watches(client)
|
||||
|
||||
|
||||
def test_xmlns_purl_content(client, live_server, measure_memory_usage, datastore_path):
|
||||
set_xmlns_purl_content(datastore_path=datastore_path)
|
||||
|
||||
# Rarely do endpoints give the right header, usually just text/xml, so we check also for <rss
|
||||
# This also triggers the automatic CDATA text parser so the RSS goes back a nice content list
|
||||
#test_url = url_for('test_endpoint', content_type="text/xml; charset=UTF-8", _external=True)
|
||||
|
||||
# Because NO utf-8 was specified here, we should be able to recover it in requests or other somehow.
|
||||
test_url = url_for('test_endpoint', content_type="text/xml;", _external=True)
|
||||
live_server.app.config['DATASTORE'].data['settings']['application']['rss_reader_mode'] = True
|
||||
|
||||
# Add our URL to the import page
|
||||
uuid = client.application.config.get('DATASTORE').add_watch(url=test_url, extras={'include_filters': [".last"]})
|
||||
client.get(url_for("ui.form_watch_checknow"), follow_redirects=True)
|
||||
|
||||
wait_for_all_checks(client)
|
||||
|
||||
watch = live_server.app.config['DATASTORE'].data['watching'][uuid]
|
||||
dates = list(watch.history.keys())
|
||||
snapshot_contents = watch.get_history_snapshot(timestamp=dates[0])
|
||||
assert "Title: Sony Xperia 1 VII review: has Sony’s long-standing Xperia family lost what it takes to compete?" in snapshot_contents
|
||||
assert "dc-content" in snapshot_contents
|
||||
|
||||
@@ -84,14 +84,14 @@ class TestXPathDefaultNamespace:
|
||||
|
||||
def test_atom_feed_simple_xpath_with_xpath_filter(self):
|
||||
"""Test that //title/text() works on Atom feed with default namespace using xpath_filter."""
|
||||
result = html_tools.xpath_filter('//title/text()', atom_feed_with_default_ns, is_rss=True)
|
||||
result = html_tools.xpath_filter('//title/text()', atom_feed_with_default_ns, is_xml=True)
|
||||
assert 'Release notes from PowerToys' in result
|
||||
assert 'Release 0.95.1' in result
|
||||
assert 'Release v0.95.0' in result
|
||||
|
||||
def test_atom_feed_nested_xpath_with_xpath_filter(self):
|
||||
"""Test nested XPath like //entry/title/text() on Atom feed."""
|
||||
result = html_tools.xpath_filter('//entry/title/text()', atom_feed_with_default_ns, is_rss=True)
|
||||
result = html_tools.xpath_filter('//entry/title/text()', atom_feed_with_default_ns, is_xml=True)
|
||||
assert 'Release 0.95.1' in result
|
||||
assert 'Release v0.95.0' in result
|
||||
# Should NOT include the feed title
|
||||
@@ -99,20 +99,20 @@ class TestXPathDefaultNamespace:
|
||||
|
||||
def test_atom_feed_other_elements_with_xpath_filter(self):
|
||||
"""Test that other elements like //updated/text() work on Atom feed."""
|
||||
result = html_tools.xpath_filter('//updated/text()', atom_feed_with_default_ns, is_rss=True)
|
||||
result = html_tools.xpath_filter('//updated/text()', atom_feed_with_default_ns, is_xml=True)
|
||||
assert '2025-10-23T08:53:12Z' in result
|
||||
assert '2025-10-24T14:20:14Z' in result
|
||||
|
||||
def test_rss_feed_without_namespace(self):
|
||||
"""Test that //title/text() works on RSS feed without default namespace."""
|
||||
result = html_tools.xpath_filter('//title/text()', rss_feed_no_default_ns, is_rss=True)
|
||||
result = html_tools.xpath_filter('//title/text()', rss_feed_no_default_ns, is_xml=True)
|
||||
assert 'Channel Title' in result
|
||||
assert 'Item 1 Title' in result
|
||||
assert 'Item 2 Title' in result
|
||||
|
||||
def test_rss_feed_nested_xpath(self):
|
||||
"""Test nested XPath on RSS feed without default namespace."""
|
||||
result = html_tools.xpath_filter('//item/title/text()', rss_feed_no_default_ns, is_rss=True)
|
||||
result = html_tools.xpath_filter('//item/title/text()', rss_feed_no_default_ns, is_xml=True)
|
||||
assert 'Item 1 Title' in result
|
||||
assert 'Item 2 Title' in result
|
||||
# Should NOT include channel title
|
||||
@@ -120,31 +120,31 @@ class TestXPathDefaultNamespace:
|
||||
|
||||
def test_rss_feed_with_prefixed_namespaces(self):
|
||||
"""Test that feeds with namespace prefixes (not default) still work."""
|
||||
result = html_tools.xpath_filter('//title/text()', rss_feed_with_ns_prefix, is_rss=True)
|
||||
result = html_tools.xpath_filter('//title/text()', rss_feed_with_ns_prefix, is_xml=True)
|
||||
assert 'Channel Title' in result
|
||||
assert 'Item Title' in result
|
||||
|
||||
def test_local_name_workaround_still_works(self):
|
||||
"""Test that local-name() workaround still works for Atom feeds."""
|
||||
result = html_tools.xpath_filter('//*[local-name()="title"]/text()', atom_feed_with_default_ns, is_rss=True)
|
||||
result = html_tools.xpath_filter('//*[local-name()="title"]/text()', atom_feed_with_default_ns, is_xml=True)
|
||||
assert 'Release notes from PowerToys' in result
|
||||
assert 'Release 0.95.1' in result
|
||||
|
||||
def test_xpath1_filter_without_default_namespace(self):
|
||||
"""Test xpath1_filter works on RSS without default namespace."""
|
||||
result = html_tools.xpath1_filter('//title/text()', rss_feed_no_default_ns, is_rss=True)
|
||||
result = html_tools.xpath1_filter('//title/text()', rss_feed_no_default_ns, is_xml=True)
|
||||
assert 'Channel Title' in result
|
||||
assert 'Item 1 Title' in result
|
||||
|
||||
def test_xpath1_filter_with_default_namespace_returns_empty(self):
|
||||
"""Test that xpath1_filter returns empty on Atom with default namespace (known limitation)."""
|
||||
result = html_tools.xpath1_filter('//title/text()', atom_feed_with_default_ns, is_rss=True)
|
||||
result = html_tools.xpath1_filter('//title/text()', atom_feed_with_default_ns, is_xml=True)
|
||||
# xpath1_filter (lxml) doesn't support default namespaces, so this returns empty
|
||||
assert result == ''
|
||||
|
||||
def test_xpath1_filter_local_name_workaround(self):
|
||||
"""Test that xpath1_filter works with local-name() workaround on Atom feeds."""
|
||||
result = html_tools.xpath1_filter('//*[local-name()="title"]/text()', atom_feed_with_default_ns, is_rss=True)
|
||||
result = html_tools.xpath1_filter('//*[local-name()="title"]/text()', atom_feed_with_default_ns, is_xml=True)
|
||||
assert 'Release notes from PowerToys' in result
|
||||
assert 'Release 0.95.1' in result
|
||||
|
||||
|
||||
@@ -201,3 +201,120 @@ def test_trips(html_content, xpath, answer):
|
||||
html_content = html_tools.xpath_filter(xpath, html_content, append_pretty_line_formatting=True)
|
||||
assert type(html_content) == str
|
||||
assert answer in html_content
|
||||
|
||||
|
||||
# Test for UTF-8 encoding bug fix (issue #3658)
|
||||
# Polish and other UTF-8 characters should be preserved correctly
|
||||
polish_html = """<!DOCTYPE html>
|
||||
<html>
|
||||
<head><meta charset="utf-8"></head>
|
||||
<body>
|
||||
<div class="index--s-headline-link">
|
||||
<a class="index--s-headline-link" href="#">
|
||||
Naukowcy potwierdzają: oglądanie krótkich filmików prowadzi do "zgnilizny mózgu"
|
||||
</a>
|
||||
</div>
|
||||
<div>
|
||||
<a class="other-class" href="#">
|
||||
Test with Polish chars: żółć ąę śń
|
||||
</a>
|
||||
</div>
|
||||
<div>
|
||||
<p class="unicode-test">Cyrillic: Привет мир</p>
|
||||
<p class="unicode-test">Greek: Γειά σου κόσμε</p>
|
||||
<p class="unicode-test">Arabic: مرحبا بالعالم</p>
|
||||
<p class="unicode-test">Chinese: 你好世界</p>
|
||||
<p class="unicode-test">Japanese: こんにちは世界</p>
|
||||
<p class="unicode-test">Emoji: 🌍🎉✨</p>
|
||||
</div>
|
||||
</body>
|
||||
</html>
|
||||
"""
|
||||
|
||||
|
||||
@pytest.mark.parametrize("html_content", [polish_html])
|
||||
@pytest.mark.parametrize("xpath, expected_text", [
|
||||
# Test Polish characters in xpath_filter
|
||||
('//a[(contains(@class,"index--s-headline-link"))]', 'Naukowcy potwierdzają'),
|
||||
('//a[(contains(@class,"index--s-headline-link"))]', 'oglądanie krótkich filmików'),
|
||||
('//a[(contains(@class,"index--s-headline-link"))]', 'zgnilizny mózgu'),
|
||||
('//a[@class="other-class"]', 'żółć ąę śń'),
|
||||
|
||||
# Test various Unicode scripts
|
||||
('//p[@class="unicode-test"]', 'Привет мир'),
|
||||
('//p[@class="unicode-test"]', 'Γειά σου κόσμε'),
|
||||
('//p[@class="unicode-test"]', 'مرحبا بالعالم'),
|
||||
('//p[@class="unicode-test"]', '你好世界'),
|
||||
('//p[@class="unicode-test"]', 'こんにちは世界'),
|
||||
('//p[@class="unicode-test"]', '🌍🎉✨'),
|
||||
|
||||
# Test with text() extraction
|
||||
('//a[@class="other-class"]/text()', 'żółć'),
|
||||
])
|
||||
def test_xpath_utf8_encoding(html_content, xpath, expected_text):
|
||||
"""Test that XPath filters preserve UTF-8 characters correctly (issue #3658)"""
|
||||
result = html_tools.xpath_filter(xpath, html_content, append_pretty_line_formatting=False)
|
||||
assert type(result) == str
|
||||
assert expected_text in result
|
||||
# Ensure characters are NOT HTML-entity encoded
|
||||
# For example, 'ą' should NOT become 'ą'
|
||||
assert '&#' not in result or expected_text in result
|
||||
|
||||
|
||||
@pytest.mark.parametrize("html_content", [polish_html])
|
||||
@pytest.mark.parametrize("xpath, expected_text", [
|
||||
# Test Polish characters in xpath1_filter
|
||||
('//a[(contains(@class,"index--s-headline-link"))]', 'Naukowcy potwierdzają'),
|
||||
('//a[(contains(@class,"index--s-headline-link"))]', 'mózgu'),
|
||||
('//a[@class="other-class"]', 'żółć ąę śń'),
|
||||
|
||||
# Test various Unicode scripts with xpath1
|
||||
('//p[@class="unicode-test" and contains(text(), "Cyrillic")]', 'Привет мир'),
|
||||
('//p[@class="unicode-test" and contains(text(), "Greek")]', 'Γειά σου'),
|
||||
('//p[@class="unicode-test" and contains(text(), "Chinese")]', '你好世界'),
|
||||
])
|
||||
def test_xpath1_utf8_encoding(html_content, xpath, expected_text):
|
||||
"""Test that XPath1 filters preserve UTF-8 characters correctly"""
|
||||
result = html_tools.xpath1_filter(xpath, html_content, append_pretty_line_formatting=False)
|
||||
assert type(result) == str
|
||||
assert expected_text in result
|
||||
# Ensure characters are NOT HTML-entity encoded
|
||||
assert '&#' not in result or expected_text in result
|
||||
|
||||
|
||||
# Test with real-world example from wyborcza.pl (issue #3658)
|
||||
wyborcza_style_html = """<!DOCTYPE html>
|
||||
<html lang="pl">
|
||||
<head><meta charset="utf-8"></head>
|
||||
<body>
|
||||
<div class="article-list">
|
||||
<a class="index--s-headline-link" href="/article1">
|
||||
Naukowcy potwierdzają: oglądanie krótkich filmików prowadzi do "zgnilizny mózgu"
|
||||
</a>
|
||||
<a class="index--s-headline-link" href="/article2">
|
||||
Zmiany klimatyczne wpływają na życie w miastach
|
||||
</a>
|
||||
<a class="index--s-headline-link" href="/article3">
|
||||
Łódź: Nowe inwestycje w infrastrukturę miejską
|
||||
</a>
|
||||
</div>
|
||||
</body>
|
||||
</html>
|
||||
"""
|
||||
|
||||
|
||||
def test_wyborcza_real_world_example():
|
||||
"""Test real-world case from wyborcza.pl that was failing (issue #3658)"""
|
||||
xpath = '//a[(contains(@class,"index--s-headline-link"))]'
|
||||
result = html_tools.xpath_filter(xpath, wyborcza_style_html, append_pretty_line_formatting=False)
|
||||
|
||||
# These exact strings should appear in the result
|
||||
assert 'Naukowcy potwierdzają' in result
|
||||
assert 'oglądanie krótkich filmików' in result
|
||||
assert 'zgnilizny mózgu' in result
|
||||
assert 'Łódź' in result
|
||||
|
||||
# Make sure they're NOT corrupted to mojibake like "potwierdzajÄ"
|
||||
assert 'potwierdzajÄ' not in result
|
||||
assert 'oglądanie' not in result
|
||||
assert 'mózgu' not in result
|
||||
|
||||
Reference in New Issue
Block a user