Compare commits

...

5 Commits

Author SHA1 Message Date
dgtlmoon
5b69d73ea1 also dump memory report when building
Some checks failed
Publish Python 🐍distribution 📦 to PyPI and TestPyPI / Build distribution 📦 (push) Has been cancelled
ChangeDetection.io Container Build Test / Build linux/amd64 (alpine) (push) Has been cancelled
ChangeDetection.io Container Build Test / Build linux/arm64 (alpine) (push) Has been cancelled
ChangeDetection.io Container Build Test / Build linux/amd64 (main) (push) Has been cancelled
ChangeDetection.io Container Build Test / Build linux/arm/v7 (main) (push) Has been cancelled
ChangeDetection.io Container Build Test / Build linux/arm/v8 (main) (push) Has been cancelled
ChangeDetection.io Container Build Test / Build linux/arm64 (main) (push) Has been cancelled
ChangeDetection.io App Test / lint-code (push) Has been cancelled
Publish Python 🐍distribution 📦 to PyPI and TestPyPI / Test the built 📦 package works basically. (push) Has been cancelled
Publish Python 🐍distribution 📦 to PyPI and TestPyPI / Publish Python 🐍 distribution 📦 to PyPI (push) Has been cancelled
ChangeDetection.io App Test / test-application-3-10 (push) Has been cancelled
ChangeDetection.io App Test / test-application-3-11 (push) Has been cancelled
ChangeDetection.io App Test / test-application-3-12 (push) Has been cancelled
ChangeDetection.io App Test / test-application-3-13 (push) Has been cancelled
2025-10-12 16:07:19 +02:00
dgtlmoon
bc614e1651 Helper to give python GC a better chance of cleaning up 2025-10-12 15:51:44 +02:00
dgtlmoon
5d459dbc59 Replace filetype detection library with puremagic, 20Mb less RAM usage 2025-10-12 15:04:43 +02:00
dgtlmoon
3c81efe2f4 0.50.21
Some checks failed
Build and push containers / metadata (push) Has been cancelled
Build and push containers / build-push-containers (push) Has been cancelled
Publish Python 🐍distribution 📦 to PyPI and TestPyPI / Build distribution 📦 (push) Has been cancelled
ChangeDetection.io Container Build Test / Build linux/amd64 (alpine) (push) Has been cancelled
ChangeDetection.io Container Build Test / Build linux/arm64 (alpine) (push) Has been cancelled
ChangeDetection.io Container Build Test / Build linux/amd64 (main) (push) Has been cancelled
ChangeDetection.io Container Build Test / Build linux/arm/v7 (main) (push) Has been cancelled
ChangeDetection.io Container Build Test / Build linux/arm/v8 (main) (push) Has been cancelled
ChangeDetection.io Container Build Test / Build linux/arm64 (main) (push) Has been cancelled
ChangeDetection.io App Test / lint-code (push) Has been cancelled
Publish Python 🐍distribution 📦 to PyPI and TestPyPI / Test the built 📦 package works basically. (push) Has been cancelled
Publish Python 🐍distribution 📦 to PyPI and TestPyPI / Publish Python 🐍 distribution 📦 to PyPI (push) Has been cancelled
ChangeDetection.io App Test / test-application-3-10 (push) Has been cancelled
ChangeDetection.io App Test / test-application-3-11 (push) Has been cancelled
ChangeDetection.io App Test / test-application-3-12 (push) Has been cancelled
ChangeDetection.io App Test / test-application-3-13 (push) Has been cancelled
2025-10-10 18:17:56 +02:00
dgtlmoon
0fcfb94690 Adding 'RSS reader mode' (see main Settings) (#3488) 2025-10-10 18:17:30 +02:00
15 changed files with 364 additions and 31 deletions

View File

@@ -246,13 +246,21 @@ jobs:
# @todo - scan the container log to see the right "graceful shutdown" text exists
docker rm sig-test
- name: Dump container log
- name: Dump container log and memory report
if: always()
run: |
mkdir output-logs
docker logs test-cdio-basic-tests > output-logs/test-cdio-basic-tests-stdout-${{ env.PYTHON_VERSION }}.txt
docker logs test-cdio-basic-tests 2> output-logs/test-cdio-basic-tests-stderr-${{ env.PYTHON_VERSION }}.txt
# Extract test-memory.log from the container
echo "Extracting test-memory.log from container..."
docker cp test-cdio-basic-tests:/app/changedetectionio/test-memory.log output-logs/test-memory-${{ env.PYTHON_VERSION }}.log || echo "test-memory.log not found in container"
# Display the memory log contents for immediate visibility in workflow output
echo "=== Memory Test Report ==="
cat output-logs/test-memory-${{ env.PYTHON_VERSION }}.log 2>/dev/null || echo "No memory log available"
- name: Store everything including test-datastore
if: always()
uses: actions/upload-artifact@v4

View File

@@ -2,7 +2,7 @@
# Read more https://github.com/dgtlmoon/changedetection.io/wiki
__version__ = '0.50.20'
__version__ = '0.50.21'
from changedetectionio.strtobool import strtobool
from json.decoder import JSONDecodeError

View File

@@ -334,6 +334,10 @@ async def async_update_worker(worker_id, q, notification_q, app, datastore):
if update_handler.fetcher.content or (not update_handler.fetcher.content and empty_pages_are_a_change):
watch.save_last_fetched_html(contents=update_handler.fetcher.content, timestamp=int(fetch_start_time))
# Explicitly delete large content variables to free memory IMMEDIATELY after saving
# These are no longer needed after being saved to history
del contents
# Send notifications on second+ check
if watch.history_n >= 2:
logger.info(f"Change detected in UUID {uuid} - {watch['url']}")
@@ -372,6 +376,12 @@ async def async_update_worker(worker_id, q, notification_q, app, datastore):
datastore.update_watch(uuid=uuid, update_obj={'fetch_time': round(time.time() - fetch_start_time, 3),
'check_count': count})
# NOW clear fetcher content - after all processing is complete
# This is the last point where we need the fetcher data
if update_handler and hasattr(update_handler, 'fetcher') and update_handler.fetcher:
update_handler.fetcher.clear_content()
logger.debug(f"Cleared fetcher content for UUID {uuid}")
except Exception as e:
logger.error(f"Worker {worker_id} unexpected error processing {uuid}: {e}")
logger.error(f"Worker {worker_id} traceback:", exc_info=True)
@@ -392,7 +402,28 @@ async def async_update_worker(worker_id, q, notification_q, app, datastore):
#logger.info(f"Worker {worker_id} sending completion signal for UUID {watch['uuid']}")
watch_check_update.send(watch_uuid=watch['uuid'])
update_handler = None
# Explicitly clean up update_handler and all its references
if update_handler:
# Clear fetcher content using the proper method
if hasattr(update_handler, 'fetcher') and update_handler.fetcher:
update_handler.fetcher.clear_content()
# Clear processor references
if hasattr(update_handler, 'content_processor'):
update_handler.content_processor = None
update_handler = None
# Clear local contents variable if it still exists
if 'contents' in locals():
del contents
# Note: We don't set watch = None here because:
# 1. watch is just a local reference to datastore.data['watching'][uuid]
# 2. Setting it to None doesn't affect the datastore
# 3. GC can't collect the object anyway (still referenced by datastore)
# 4. It would just cause confusion
logger.debug(f"Worker {worker_id} completed watch {uuid} in {time.time()-fetch_start_time:.2f}s")
except Exception as cleanup_error:
logger.error(f"Worker {worker_id} error during cleanup: {cleanup_error}")

View File

@@ -72,17 +72,24 @@
<span class="pure-form-message-inline">Allow access to view watch diff page when password is enabled (Good for sharing the diff page)
</span>
</div>
<div class="pure-control-group">
{{ render_checkbox_field(form.application.form.rss_hide_muted_watches) }}
</div>
<div class="pure-control-group">
{{ render_field(form.application.form.rss_content_format) }}
<span class="pure-form-message-inline">Love RSS? Does your reader support HTML? Set it here</span>
</div>
<div class="pure-control-group">
{{ render_checkbox_field(form.application.form.empty_pages_are_a_change) }}
<span class="pure-form-message-inline">When a request returns no content, or the HTML does not contain any text, is this considered a change?</span>
</div>
<div class="grey-form-border">
<div class="pure-control-group">
{{ render_checkbox_field(form.application.form.rss_hide_muted_watches) }}
</div>
<div class="pure-control-group">
{{ render_field(form.application.form.rss_content_format) }}
<span class="pure-form-message-inline">Love RSS? Does your reader support HTML? Set it here</span>
</div>
<div class="pure-control-group">
{{ render_checkbox_field(form.application.form.rss_reader_mode) }}
<span class="pure-form-message-inline">Transforms RSS/RDF feed watches into beautiful text only</span>
</div>
</div>
{% if form.requests.proxy %}
<div class="pure-control-group inline-radio">
{{ render_field(form.requests.form.proxy, class="fetch-backend-proxy") }}

View File

@@ -64,6 +64,19 @@ class Fetcher():
# Time ONTOP of the system defined env minimum time
render_extract_delay = 0
def clear_content(self):
"""
Explicitly clear all content from memory to free up heap space.
Call this after content has been saved to disk.
"""
self.content = None
if hasattr(self, 'raw_content'):
self.raw_content = None
self.screenshot = None
self.xpath_data = None
# Keep headers and status_code as they're small
logger.trace("Fetcher content cleared from memory")
@abstractmethod
def get_error(self):
return self.error

View File

@@ -940,6 +940,10 @@ class globalSettingsApplicationForm(commonSettingsForm):
strip_ignored_lines = BooleanField('Strip ignored lines')
rss_hide_muted_watches = BooleanField('Hide muted watches from RSS feed', default=True,
validators=[validators.Optional()])
rss_reader_mode = BooleanField('RSS reader mode ', default=False,
validators=[validators.Optional()])
filter_failure_notification_threshold_attempts = IntegerField('Number of times the filter can be missing before sending a notification',
render_kw={"style": "width: 5em;"},
validators=[validators.NumberRange(min=0,

View File

@@ -55,6 +55,7 @@ class model(dict):
'rss_access_token': None,
'rss_content_format': RSS_FORMAT_TYPES[0][0],
'rss_hide_muted_watches': True,
'rss_reader_mode': False,
'schema_version' : 0,
'shared_diff_access': False,
'strip_ignored_lines': False,

View File

@@ -64,24 +64,31 @@ class guess_stream_type():
# Remove whitespace between < and tag name for robust detection (handles '< html', '<\nhtml', etc.)
test_content_normalized = re.sub(r'<\s+', '<', test_content)
# Magic will sometimes call text/plain as text/html!
# Use puremagic for lightweight MIME detection (saves ~14MB vs python-magic)
magic_result = None
try:
import magic
import puremagic
mime = magic.from_buffer(content[:200], mime=True) # Send the original content
logger.debug(f"Guessing mime type, original content_type '{http_content_header}', mime type detected '{mime}'")
if mime and "/" in mime:
magic_result = mime
# Ignore generic/fallback mime types from magic
if mime in ['application/octet-stream', 'application/x-empty', 'binary']:
logger.debug(f"Ignoring generic mime type '{mime}' from magic library")
# Trust magic for non-text types immediately
elif mime not in ['text/html', 'text/plain']:
magic_content_header = mime
# puremagic needs bytes, so encode if we have a string
content_bytes = content[:200].encode('utf-8') if isinstance(content, str) else content[:200]
# puremagic returns a list of PureMagic objects with confidence scores
detections = puremagic.magic_string(content_bytes)
if detections:
# Get the highest confidence detection
mime = detections[0].mime_type
logger.debug(f"Guessing mime type, original content_type '{http_content_header}', mime type detected '{mime}'")
if mime and "/" in mime:
magic_result = mime
# Ignore generic/fallback mime types
if mime in ['application/octet-stream', 'application/x-empty', 'binary']:
logger.debug(f"Ignoring generic mime type '{mime}' from puremagic library")
# Trust puremagic for non-text types immediately
elif mime not in ['text/html', 'text/plain']:
magic_content_header = mime
except Exception as e:
logger.error(f"Error getting a more precise mime type from 'magic' library ({str(e)}), using content-based detection")
logger.error(f"Error getting a more precise mime type from 'puremagic' library ({str(e)}), using content-based detection")
# Content-based detection (most reliable for text formats)
# Check for HTML patterns first - if found, override magic's text/plain
@@ -103,7 +110,7 @@ class guess_stream_type():
# magic will call a rss document 'xml'
# Rarely do endpoints give the right header, usually just text/xml, so we check also for <rss
# This also triggers the automatic CDATA text parser so the RSS goes back a nice content list
elif '<rss' in test_content_normalized or '<feed' in test_content_normalized or any(s in magic_content_header for s in RSS_XML_CONTENT_TYPES):
elif '<rss' in test_content_normalized or '<feed' in test_content_normalized or any(s in magic_content_header for s in RSS_XML_CONTENT_TYPES) or '<rdf:' in test_content_normalized:
self.is_rss = True
elif any(s in http_content_header for s in XML_CONTENT_TYPES):
# Only mark as generic XML if not already detected as RSS

View File

@@ -228,8 +228,21 @@ class ContentProcessor:
self.datastore = datastore
def preprocess_rss(self, content):
"""Convert CDATA/comments in RSS to usable text."""
return cdata_in_document_to_text(html_content=content)
"""
Convert CDATA/comments in RSS to usable text.
Supports two RSS processing modes:
- 'default': Inline CDATA replacement (original behavior)
- 'formatted': Format RSS items with title, link, guid, pubDate, and description (CDATA unmarked)
"""
from changedetectionio import rss_tools
rss_mode = self.datastore.data["settings"]["application"].get("rss_reader_mode")
if rss_mode:
# Format RSS items nicely with CDATA content unmarked and converted to text
return rss_tools.format_rss_items(content)
else:
# Default: Original inline CDATA replacement
return cdata_in_document_to_text(html_content=content)
def preprocess_pdf(self, raw_content):
"""Convert PDF to HTML using external tool."""
@@ -384,6 +397,11 @@ class perform_site_check(difference_detection_processor):
# RSS preprocessing
if stream_content_type.is_rss:
content = content_processor.preprocess_rss(content)
if self.datastore.data["settings"]["application"].get("rss_reader_mode"):
# Now just becomes regular HTML that can have xpath/CSS applied (first of the set etc)
stream_content_type.is_rss = False
stream_content_type.is_html = True
self.fetcher.content = content
# PDF preprocessing
if watch.is_pdf or stream_content_type.is_pdf:
@@ -538,6 +556,20 @@ class perform_site_check(difference_detection_processor):
else:
logger.debug(f"check_unique_lines: UUID {watch.get('uuid')} had unique content")
# Note: Explicit cleanup is only needed here because text_json_diff handles
# large strings (100KB-300KB for RSS/HTML). The other processors work with
# small strings and don't need this.
#
# Python would clean these up automatically, but explicit `del` frees memory
# immediately rather than waiting for function return, reducing peak memory usage.
del content
if 'html_content' in locals() and html_content is not stripped_text:
del html_content
if 'text_content_before_ignored_filter' in locals() and text_content_before_ignored_filter is not stripped_text:
del text_content_before_ignored_filter
if 'text_for_checksuming' in locals() and text_for_checksuming is not stripped_text:
del text_for_checksuming
return changed_detected, update_obj, stripped_text
def _apply_diff_filtering(self, watch, stripped_text, text_before_filter):

View File

@@ -0,0 +1,130 @@
"""
RSS/Atom feed processing tools for changedetection.io
"""
from loguru import logger
import re
def cdata_in_document_to_text(html_content: str, render_anchor_tag_content=False) -> str:
"""
Process CDATA sections in HTML/XML content - inline replacement.
Args:
html_content: The HTML/XML content to process
render_anchor_tag_content: Whether to render anchor tag content
Returns:
Processed HTML/XML content with CDATA sections replaced inline
"""
from xml.sax.saxutils import escape as xml_escape
from .html_tools import html_to_text
pattern = '<!\[CDATA\[(\s*(?:.(?<!\]\]>)\s*)*)\]\]>'
def repl(m):
text = m.group(1)
return xml_escape(html_to_text(html_content=text, render_anchor_tag_content=render_anchor_tag_content)).strip()
return re.sub(pattern, repl, html_content)
def format_rss_items(rss_content: str, render_anchor_tag_content=False) -> str:
"""
Format RSS/Atom feed items in a readable text format using feedparser.
Converts RSS <item> or Atom <entry> elements to formatted text with:
- <title> → <h1>Title</h1>
- <link> → Link: [url]
- <guid> → Guid: [id]
- <pubDate> → PubDate: [date]
- <description> or <content> → Raw HTML content (CDATA and entities automatically handled)
Args:
rss_content: The RSS/Atom feed content
render_anchor_tag_content: Whether to render anchor tag content in descriptions (unused, kept for compatibility)
Returns:
Formatted HTML content ready for html_to_text conversion
"""
try:
import feedparser
from xml.sax.saxutils import escape as xml_escape
# Parse the feed - feedparser handles all RSS/Atom variants, CDATA, entity unescaping, etc.
feed = feedparser.parse(rss_content)
formatted_items = []
# Determine feed type for appropriate labels when fields are missing
# feedparser sets feed.version to things like 'rss20', 'atom10', etc.
is_atom = feed.version and 'atom' in feed.version
for entry in feed.entries:
item_parts = []
# Title - feedparser handles CDATA and entity unescaping automatically
if hasattr(entry, 'title') and entry.title:
item_parts.append(f'<h1>{xml_escape(entry.title)}</h1>')
# Link
if hasattr(entry, 'link') and entry.link:
item_parts.append(f'Link: {xml_escape(entry.link)}<br>')
# GUID/ID
if hasattr(entry, 'id') and entry.id:
item_parts.append(f'Guid: {xml_escape(entry.id)}<br>')
# Date - feedparser normalizes all date field names to 'published'
if hasattr(entry, 'published') and entry.published:
item_parts.append(f'PubDate: {xml_escape(entry.published)}<br>')
# Description/Content - feedparser handles CDATA and entity unescaping automatically
# Only add "Summary:" label for Atom <summary> tags
content = None
add_label = False
if hasattr(entry, 'content') and entry.content:
# Atom <content> - no label, just content
content = entry.content[0].value if entry.content[0].value else None
elif hasattr(entry, 'summary'):
# Could be RSS <description> or Atom <summary>
# feedparser maps both to entry.summary
content = entry.summary if entry.summary else None
# Only add "Summary:" label for Atom feeds (which use <summary> tag)
if is_atom:
add_label = True
# Add content with or without label
if content:
if add_label:
item_parts.append(f'Summary:<br>{content}')
else:
item_parts.append(content)
else:
# No content - just show <none>
item_parts.append('&lt;none&gt;')
# Join all parts of this item
if item_parts:
formatted_items.append('\n'.join(item_parts))
# Wrap each item in a div with classes (first, last, item-N)
items_html = []
total_items = len(formatted_items)
for idx, item in enumerate(formatted_items):
classes = ['rss-item']
if idx == 0:
classes.append('first')
if idx == total_items - 1:
classes.append('last')
classes.append(f'item-{idx + 1}')
class_str = ' '.join(classes)
items_html.append(f'<div class="{class_str}">{item}</div>')
return '<html><body>\n'+"\n<br><br>".join(items_html)+'\n</body></html>'
except Exception as e:
logger.warning(f"Error formatting RSS items: {str(e)}")
# Fall back to original content
return rss_content

View File

@@ -344,7 +344,7 @@ label {
}
}
#notification-customisation {
.grey-form-border {
border: 1px solid var(--color-border-notification);
padding: 0.5rem;
border-radius: 5px;

File diff suppressed because one or more lines are too long

View File

@@ -33,7 +33,7 @@
<div id="notification-test-log" style="display: none;"><span class="pure-form-message-inline">Processing..</span></div>
</div>
</div>
<div id="notification-customisation" class="pure-control-group">
<div class="pure-control-group grey-form-border">
<div class="pure-control-group">
{{ render_field(form.notification_title, class="m-d notification-title", placeholder=settings_application['notification_title']) }}
<span class="pure-form-message-inline">Title for all notifications</span>

View File

@@ -0,0 +1,98 @@
#!/usr/bin/env python3
import time
from flask import url_for
from .util import set_original_response, set_modified_response, live_server_setup, wait_for_all_checks, extract_rss_token_from_UI, \
extract_UUID_from_client, delete_all_watches
def set_original_cdata_xml():
test_return_data = """<rss xmlns:atom="http://www.w3.org/2005/Atom" version="2.0">
<channel>
<title>Security Bulletins on wetscale</title>
<link>https://wetscale.com/security-bulletins/</link>
<description>Recent security bulletins from wetscale</description>
<lastBuildDate>Fri, 10 Oct 2025 14:58:11 GMT</lastBuildDate>
<docs>https://validator.w3.org/feed/docs/rss2.html</docs>
<generator>wetscale.com</generator>
<language>en-US</language>
<copyright>© 2025 wetscale Inc. All rights reserved.</copyright>
<atom:link href="https://wetscale.com/security-bulletins/index.xml" rel="self" type="application/rss+xml"/>
<item>
<title>TS-2025-005</title>
<link>https://wetscale.com/security-bulletins/#ts-2025-005</link>
<guid>https://wetscale.com/security-bulletins/#ts-2025-005</guid>
<pubDate>Thu, 07 Aug 2025 00:00:00 GMT</pubDate>
<description><p>Wet noodles escape<br><p>they also found themselves outside</p> </description>
</item>
<item>
<title>TS-2025-004</title>
<link>https://wetscale.com/security-bulletins/#ts-2025-004</link>
<guid>https://wetscale.com/security-bulletins/#ts-2025-004</guid>
<pubDate>Tue, 27 May 2025 00:00:00 GMT</pubDate>
<description>
<![CDATA[ <img class="type:primaryImage" src="https://testsite.com/701c981da04869e.jpg"/><p>The days of Terminator and The Matrix could be closer. But be positive.</p><p><a href="https://testsite.com">Read more link...</a></p> ]]>
</description>
</item>
</channel>
</rss>
"""
with open("test-datastore/endpoint-content.txt", "w") as f:
f.write(test_return_data)
def test_rss_reader_mode(client, live_server, measure_memory_usage):
set_original_cdata_xml()
# Rarely do endpoints give the right header, usually just text/xml, so we check also for <rss
# This also triggers the automatic CDATA text parser so the RSS goes back a nice content list
test_url = url_for('test_endpoint', content_type="text/xml; charset=UTF-8", _external=True)
live_server.app.config['DATASTORE'].data['settings']['application']['rss_reader_mode'] = True
# Add our URL to the import page
uuid = client.application.config.get('DATASTORE').add_watch(url=test_url)
client.get(url_for("ui.form_watch_checknow"), follow_redirects=True)
wait_for_all_checks(client)
watch = live_server.app.config['DATASTORE'].data['watching'][uuid]
dates = list(watch.history.keys())
snapshot_contents = watch.get_history_snapshot(dates[0])
assert 'Wet noodles escape' in snapshot_contents
assert '<br>' not in snapshot_contents
assert '&lt;' not in snapshot_contents
assert 'The days of Terminator and The Matrix' in snapshot_contents
assert 'PubDate: Thu, 07 Aug 2025 00:00:00 GMT' in snapshot_contents
delete_all_watches(client)
def test_rss_reader_mode_with_css_filters(client, live_server, measure_memory_usage):
set_original_cdata_xml()
# Rarely do endpoints give the right header, usually just text/xml, so we check also for <rss
# This also triggers the automatic CDATA text parser so the RSS goes back a nice content list
test_url = url_for('test_endpoint', content_type="text/xml; charset=UTF-8", _external=True)
live_server.app.config['DATASTORE'].data['settings']['application']['rss_reader_mode'] = True
# Add our URL to the import page
uuid = client.application.config.get('DATASTORE').add_watch(url=test_url, extras={'include_filters': [".last"]})
client.get(url_for("ui.form_watch_checknow"), follow_redirects=True)
wait_for_all_checks(client)
watch = live_server.app.config['DATASTORE'].data['watching'][uuid]
dates = list(watch.history.keys())
snapshot_contents = watch.get_history_snapshot(dates[0])
assert 'Wet noodles escape' not in snapshot_contents
assert '<br>' not in snapshot_contents
assert '&lt;' not in snapshot_contents
assert 'The days of Terminator and The Matrix' in snapshot_contents
delete_all_watches(client)

View File

@@ -1,5 +1,6 @@
# eventlet>=0.38.0 # Removed - replaced with threading mode for better Python 3.12+ compatibility
feedgen~=0.9
feedparser~=6.0 # For parsing RSS/Atom feeds
flask-compress
# 0.6.3 included compatibility fix for werkzeug 3.x (2.x had deprecation of url handlers)
flask-login>=0.6.3
@@ -124,8 +125,9 @@ price-parser
# flask_socket_io - incorrect package name, already have flask-socketio above
# So far for detecting correct favicon type, but for other things in the future
python-magic
# Lightweight MIME type detection (saves ~14MB memory vs python-magic/libmagic)
# Used for detecting correct favicon type and content-type detection
puremagic
# Scheduler - Windows seemed to miss a lot of default timezone info (even "UTC" !)
tzdata