Files
changedetection.io/changedetectionio/blueprint/rss/_util.py

98 lines
3.5 KiB
Python

"""
Utility functions for RSS feed generation.
"""
from changedetectionio.jinja2_custom import render as jinja_render
from changedetectionio.notification.handler import apply_service_tweaks
from loguru import logger
import re
BAD_CHARS_REGEX = r'[\x00-\x08\x0B\x0C\x0E-\x1F]'
def scan_invalid_chars_in_rss(content):
"""
Scan for invalid characters in RSS content.
Returns True if invalid characters are found.
"""
for match in re.finditer(BAD_CHARS_REGEX, content):
i = match.start()
bad_char = content[i]
hex_value = f"0x{ord(bad_char):02x}"
# Grab context
start = max(0, i - 20)
end = min(len(content), i + 21)
context = content[start:end].replace('\n', '\\n').replace('\r', '\\r')
logger.warning(f"Invalid char {hex_value} at pos {i}: ...{context}...")
# First match is enough
return True
return False
def clean_entry_content(content):
"""
Remove invalid characters from RSS content.
"""
cleaned = re.sub(BAD_CHARS_REGEX, '', content)
return cleaned
def generate_watch_guid(watch):
"""
Generate a unique GUID for a watch RSS entry.
"""
return f"{watch['uuid']}/{watch.last_changed}"
def generate_watch_diff_content(watch, dates, rss_content_format, datastore, date_index_from=-2, date_index_to=-1):
"""
Generate HTML diff content for a watch given its history dates.
Returns tuple of (content, watch_label).
Args:
watch: The watch object
dates: List of history snapshot dates
rss_content_format: Format for RSS content (html or text)
datastore: The ChangeDetectionStore instance
date_index_from: Index of the "from" date in the dates list (default: -2)
date_index_to: Index of the "to" date in the dates list (default: -1)
Returns:
Tuple of (content, watch_label) - the rendered HTML content and watch label
"""
from changedetectionio import diff
# Same logic as watch-overview.html
if datastore.data['settings']['application']['ui'].get('use_page_title_in_list') or watch.get('use_page_title_in_list'):
watch_label = watch.label
else:
watch_label = watch.get('url')
try:
html_diff = diff.render_diff(
previous_version_file_contents=watch.get_history_snapshot(timestamp=dates[date_index_from]),
newest_version_file_contents=watch.get_history_snapshot(timestamp=dates[date_index_to]),
include_equal=False
)
requested_output_format = datastore.data['settings']['application'].get('rss_content_format')
url, html_diff, n_title = apply_service_tweaks(url='', n_body=html_diff, n_title=None, requested_output_format=requested_output_format)
except FileNotFoundError as e:
html_diff = f"History snapshot file for watch {watch.get('uuid')}@{watch.last_changed} - '{watch.get('title')} not found."
# @note: We use <pre> because nearly all RSS readers render only HTML (Thunderbird for example cant do just plaintext)
rss_template = "<pre>{{watch_label}} had a change.\n\n{{html_diff}}\n</pre>"
if 'html' in rss_content_format:
rss_template = "<html><body>\n<h4><a href=\"{{watch_url}}\">{{watch_label}}</a></h4>\n<p>{{html_diff}}</p>\n</body></html>\n"
content = jinja_render(template_str=rss_template, watch_label=watch_label, html_diff=html_diff, watch_url=watch.link)
# Out of range chars could also break feedgen
if scan_invalid_chars_in_rss(content):
content = clean_entry_content(content)
return content, watch_label