"""
RSS/Atom feed processing tools for changedetection.io
"""

from loguru import logger
import re

# One CDATA section, capturing its payload. Non-greedy so that several sections
# in one document are matched separately; DOTALL so a payload may span lines.
_CDATA_SECTION_RE = re.compile(r'<!\[CDATA\[(.*?)\]\]>', re.DOTALL)


def cdata_in_document_to_text(html_content: str, render_anchor_tag_content=False) -> str:
    """
    Process CDATA sections in HTML/XML content - inline replacement.

    Each CDATA section is replaced, in place, by the text rendering of its
    payload (via html_to_text), XML-escaped and stripped of surrounding
    whitespace. Content outside CDATA sections is returned unchanged.

    Args:
        html_content: The HTML/XML content to process
        render_anchor_tag_content: Whether to render anchor tag content

    Returns:
        Processed HTML/XML content with CDATA sections replaced inline
    """
    # Imported lazily, as in the rest of this module, to keep import time low
    # and avoid import cycles with html_tools.
    from xml.sax.saxutils import escape as xml_escape
    from .html_tools import html_to_text

    def repl(m):
        text = html_to_text(html_content=m.group(1), render_anchor_tag_content=render_anchor_tag_content)
        # The replacement is spliced back into an XML document, so the plain
        # text must be re-escaped (&, <, >) to keep the document well formed.
        return xml_escape(text).strip()

    return _CDATA_SECTION_RE.sub(repl, html_content)


# Jinja2 template for formatting RSS/Atom feed entries
# Covers all common feedparser entry fields including namespaced elements
# Outputs HTML that will be converted to text via html_to_text
# Every field ends with an explicit <br>: the "-" whitespace control on the tags
# strips all surrounding newlines, so without it the fields would run together.
# @todo - This could be a UI setting in the future
RSS_ENTRY_TEMPLATE = """
{%- if entry.title -%}Title: {{ entry.title }}<br>
{%- endif -%}
{%- if entry.link -%}Link: {{ entry.link }}<br>
{%- endif -%}
{%- if entry.id -%}
Guid: {{ entry.id }}<br>
{%- endif -%}
{%- if entry.published -%}
PubDate: {{ entry.published }}<br>
{%- endif -%}
{%- if entry.updated and entry.updated != entry.published -%}
Updated: {{ entry.updated }}<br>
{%- endif -%}
{%- if entry.author -%}
Author: {{ entry.author }}<br>
{%- elif entry.author_detail and entry.author_detail.name -%}
Author: {{ entry.author_detail.name }}
{%- if entry.author_detail.email %} ({{ entry.author_detail.email }}){% endif -%}
<br>
{%- endif -%}
{%- if entry.contributors -%}
Contributors: {% for contributor in entry.contributors -%}
{{ contributor.name if contributor.name else contributor }}
{%- if not loop.last %}, {% endif -%}
{%- endfor %}<br>
{%- endif -%}
{%- if entry.publisher -%}
Publisher: {{ entry.publisher }}<br>
{%- endif -%}
{%- if entry.rights -%}
Rights: {{ entry.rights }}<br>
{%- endif -%}
{%- if entry.license -%}
License: {{ entry.license }}<br>
{%- endif -%}
{%- if entry.language -%}
Language: {{ entry.language }}<br>
{%- endif -%}
{%- if entry.tags -%}
Tags: {% for tag in entry.tags -%}
{{ tag.term if tag.term else tag }}
{%- if not loop.last %}, {% endif -%}
{%- endfor %}<br>
{%- endif -%}
{%- if entry.category -%}
Category: {{ entry.category }}<br>
{%- endif -%}
{%- if entry.comments -%}
Comments: {{ entry.comments }}<br>
{%- endif -%}
{%- if entry.slash_comments -%}
Comment Count: {{ entry.slash_comments }}<br>
{%- endif -%}
{%- if entry.enclosures -%}
Enclosures:<br>
{%- for enclosure in entry.enclosures %} - {{ enclosure.href }} ({{ enclosure.type if enclosure.type else 'unknown type' }}
{%- if enclosure.length %}, {{ enclosure.length }} bytes{% endif -%}
)<br>
{%- endfor -%}
{%- endif -%}
{%- if entry.media_content -%}
Media:<br>
{%- for media in entry.media_content %} - {{ media.url }}
{%- if media.type %} ({{ media.type }}){% endif -%}
{%- if media.width and media.height %} {{ media.width }}x{{ media.height }}{% endif -%}
<br>
{%- endfor -%}
{%- endif -%}
{%- if entry.media_thumbnail -%}
Thumbnail: {{ entry.media_thumbnail[0].url if entry.media_thumbnail[0].url else entry.media_thumbnail[0] }}<br>
{%- endif -%}
{%- if entry.media_description -%}
Media Description: {{ entry.media_description }}<br>
{%- endif -%}
{%- if entry.itunes_duration -%}
Duration: {{ entry.itunes_duration }}<br>
{%- endif -%}
{%- if entry.itunes_author -%}
Podcast Author: {{ entry.itunes_author }}<br>
{%- endif -%}
{%- if entry.dc_identifier -%}
Identifier: {{ entry.dc_identifier }}<br>
{%- endif -%}
{%- if entry.dc_source -%}
DC Source: {{ entry.dc_source }}<br>
{%- endif -%}
{%- if entry.dc_type -%}
Type: {{ entry.dc_type }}<br>
{%- endif -%}
{%- if entry.dc_format -%}
Format: {{ entry.dc_format }}<br>
{%- endif -%}
{%- if entry.dc_relation -%}
Related: {{ entry.dc_relation }}<br>
{%- endif -%}
{%- if entry.dc_coverage -%}
Coverage: {{ entry.dc_coverage }}<br>
{%- endif -%}
{%- if entry.source and entry.source.title -%}
Source: {{ entry.source.title }}
{%- if entry.source.link %} ({{ entry.source.link }}){% endif -%}
<br>
{%- endif -%}
{%- if entry.dc_content -%}
Content: {{ entry.dc_content | safe }}
{%- elif entry.content and entry.content[0].value -%}
Content: {{ entry.content[0].value | safe }}
{%- elif entry.summary -%}
Summary: {{ entry.summary | safe }}
{%- endif -%}
"""


def format_rss_items(rss_content: str, render_anchor_tag_content=False) -> str:
    """
    Format RSS/Atom feed items in a readable text format using feedparser and Jinja2.

    Converts each RSS item / Atom entry to formatted text with all available fields:
    - Basic fields: title, link, id/guid, published date, updated date
    - Author fields: author, author_detail, contributors, publisher
    - Content fields: content, summary, description
    - Metadata: tags, category, rights, license
    - Media: enclosures, media_content, media_thumbnail
    - Dublin Core elements: dc:creator, dc:date, dc:publisher, etc. (mapped by feedparser)

    Args:
        rss_content: The RSS/Atom feed content
        render_anchor_tag_content: Whether to render anchor tag content in descriptions
            (unused, kept for compatibility)

    Returns:
        Formatted HTML content ready for html_to_text conversion. If anything
        goes wrong (missing dependency, parse or render error) a warning is
        logged and the original rss_content is returned unchanged.
    """
    try:
        import feedparser
        from changedetectionio.jinja2_custom import safe_jinja

        # Parse the feed - feedparser handles all RSS/Atom variants, CDATA, entity unescaping, etc.
        feed = feedparser.parse(rss_content)

        # Determine feed type for appropriate labels
        is_atom = feed.version and 'atom' in feed.version

        # Render every entry using the Jinja2 template
        formatted_items = [
            safe_jinja.render(RSS_ENTRY_TEMPLATE, entry=entry, is_atom=is_atom).strip()
            for entry in feed.entries
        ]

        # Wrap each item in a div with classes (first, last, item-N)
        items_html = []
        total_items = len(formatted_items)
        for idx, item in enumerate(formatted_items):
            classes = ['rss-item']
            if idx == 0:
                classes.append('first')
            if idx == total_items - 1:
                # A single-entry feed gets both 'first' and 'last'
                classes.append('last')
            classes.append(f'item-{idx + 1}')

            class_str = ' '.join(classes)
            items_html.append(f'<div class="{class_str}">\n{item}\n</div>')

        return '\n' + "\n\n".join(items_html) + '\n'

    except Exception as e:
        # Deliberate best-effort: formatting is cosmetic, so never let it break
        # the caller - log and hand back the untouched feed instead.
        logger.warning(f"Error formatting RSS items: {str(e)}")
        # Fall back to original content
        return rss_content