Compare commits

..

2 Commits

Author SHA1 Message Date
dgtlmoon
423b201d6a try this 2026-02-18 18:49:40 +01:00
dgtlmoon
839cf7fd9d Revisiting Dont' run docker container as root 2026-02-18 18:39:26 +01:00
10 changed files with 113 additions and 240 deletions

View File

@@ -86,6 +86,7 @@ LABEL org.opencontainers.image.licenses="Apache-2.0"
LABEL org.opencontainers.image.vendor="changedetection.io"
RUN apt-get update && apt-get install -y --no-install-recommends \
gosu \
libxslt1.1 \
# For presenting price amounts correctly in the restock/price detection overview
locales \
@@ -101,18 +102,29 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
libxrender-dev \
&& apt-get clean && rm -rf /var/lib/apt/lists/*
# Create unprivileged user and required directories
RUN groupadd -g 911 changedetection && \
useradd -u 911 -g 911 -M -s /bin/false changedetection && \
mkdir -p /datastore /extra_packages && \
chown changedetection:changedetection /extra_packages
# https://stackoverflow.com/questions/58701233/docker-logs-erroneously-appears-empty-until-container-stops
ENV PYTHONUNBUFFERED=1
RUN [ ! -d "/datastore" ] && mkdir /datastore
# Redirect .pyc cache to a writable location since /app is root-owned.
# To disable bytecode caching entirely, set PYTHONDONTWRITEBYTECODE=1 at runtime.
ENV PYTHONPYCACHEPREFIX=/tmp/pycache
# Disable pytest's .pytest_cache directory (also writes to /app, which is root-owned).
# Only has an effect when running tests inside the container.
ENV PYTEST_ADDOPTS="-p no:cacheprovider"
# Redirect test logs to the datastore (writable) instead of /app/tests/logs (read-only in container).
ENV TEST_LOG_DIR=/datastore/test_logs
# Re #80, sets SECLEVEL=1 in openssl.cnf to allow monitoring sites with weak/old cipher suites
RUN sed -i 's/^CipherString = .*/CipherString = DEFAULT@SECLEVEL=1/' /etc/ssl/openssl.cnf
# Copy modules over to the final image and add their dir to PYTHONPATH
COPY --from=builder /dependencies /usr/local
ENV PYTHONPATH=/usr/local
ENV PYTHONPATH=/usr/local:/extra_packages
EXPOSE 5000

View File

@@ -2,7 +2,7 @@
# Read more https://github.com/dgtlmoon/changedetection.io/wiki
# Semver means never use leading zeros such as .01 or .00. It should be .1.
__version__ = '0.53.5'
__version__ = '0.53.4'
from changedetectionio.strtobool import strtobool
from json.decoder import JSONDecodeError

View File

@@ -27,6 +27,7 @@ from flask import (
session,
url_for,
)
from flask_compress import Compress as FlaskCompress
from flask_restful import abort, Api
from flask_cors import CORS
@@ -73,14 +74,14 @@ CORS(app)
# There's also a bug between flask compress and socketio that causes some kind of slow memory leak
# It's better to use compression on your reverse proxy (nginx etc) instead.
if strtobool(os.getenv("FLASK_ENABLE_COMPRESSION")):
from flask_compress import Compress as FlaskCompress
app.config['COMPRESS_MIN_SIZE'] = 2096
app.config['COMPRESS_MIMETYPES'] = ['text/html', 'text/css', 'text/javascript', 'application/json', 'application/javascript', 'image/svg+xml']
# Use gzip only - smaller memory footprint than zstd/brotli (4-8KB vs 200-500KB contexts)
app.config['COMPRESS_ALGORITHM'] = ['gzip']
compress = FlaskCompress()
compress.init_app(app)
compress = FlaskCompress()
compress.init_app(app)
app.config['TEMPLATES_AUTO_RELOAD'] = False

View File

@@ -561,33 +561,31 @@ def html_to_text(html_content: str, render_anchor_tag_content=False, is_rss=Fals
)
else:
parser_config = None
if is_rss:
html_content = re.sub(r'<title([\s>])', r'<h1\1', html_content)
html_content = re.sub(r'</title>', r'</h1>', html_content)
else:
# Use BS4 html.parser to strip bloat — SPA's often dump 10MB+ of CSS/JS into <head>,
# causing inscriptis to silently give up. Regex-based stripping is unsafe because tags
# can appear inside JSON data attributes with JS-escaped closing tags (e.g. <\/script>),
# causing the regex to scan past the intended close and eat real page content.
from bs4 import BeautifulSoup
soup = BeautifulSoup(html_content, 'html.parser')
# Strip tags that inscriptis cannot render as meaningful text and which can be very large.
# svg/math: produce path-data/MathML garbage; canvas/iframe/template: no inscriptis handlers.
# video/audio/picture are kept — they may contain meaningful fallback text or captions.
for tag in soup.find_all(['head', 'script', 'style', 'noscript', 'svg',
'math', 'canvas', 'iframe', 'template']):
tag.decompose()
# Strip bloat in one pass: SPAs often dump 10MB+ of styles into the <head>, which is not needed here
# and causes inscriptis to silently exit when more than ~10MB is found.
# All we are doing here is converting the HTML to text, no CSS layout etc
# Use backreference (\1) to ensure opening/closing tags match (prevents <style> matching </svg> in CSS data URIs)
html_content = re.sub(r'<(style|script|svg|noscript)[^>]*>.*?</\1>|<(?:link|meta)[^>]*/?>|<!--.*?-->',
'', html_content, flags=re.DOTALL | re.IGNORECASE)
# SPAs often use <body style="display:none"> to hide content until JS loads.
# inscriptis respects CSS display rules, so strip hiding styles from the body tag.
body_tag = soup.find('body')
if body_tag and body_tag.get('style'):
style = body_tag['style']
if re.search(r'\b(?:display\s*:\s*none|visibility\s*:\s*hidden)\b', style, re.IGNORECASE):
logger.debug(f"html_to_text: Removing hiding styles from body tag (found: '{style}')")
del body_tag['style']
# SPAs often use <body style="display:none"> to hide content until JS loads
# inscriptis respects CSS display rules, so we need to remove these hiding styles
# to extract the actual page content
body_style_pattern = r'(<body[^>]*)\s+style\s*=\s*["\']([^"\']*\b(?:display\s*:\s*none|visibility\s*:\s*hidden)\b[^"\']*)["\']'
# Check if body has hiding styles that need to be fixed
body_match = re.search(body_style_pattern, html_content, flags=re.IGNORECASE)
if body_match:
from loguru import logger
logger.debug(f"html_to_text: Removing hiding styles from body tag (found: '{body_match.group(2)}')")
html_content = re.sub(body_style_pattern, r'\1', html_content, flags=re.IGNORECASE)
html_content = str(soup)
text_content = get_text(html_content, config=parser_config)
return text_content

View File

@@ -199,25 +199,11 @@ def handle_watch_update(socketio, **kwargs):
logger.error(f"Socket.IO error in handle_watch_update: {str(e)}")
def _patch_flask_request_context_session():
    """Give ``flask.ctx.RequestContext.session`` a setter when it has none.

    Per the original author's note, Flask 3.1 dropped the setter from the
    ``session`` property while Flask-SocketIO 5.6.0 still assigns to it
    (``ctx.session = ...``). The setter installed here stores the assigned
    value on the private ``_session`` attribute so the assignment works.

    Does nothing when the property already exposes a setter.
    """
    from flask.ctx import RequestContext

    session_prop = RequestContext.session
    if getattr(session_prop, 'fset', None) is not None:
        # A setter is already present (e.g. a later Flask release restored it).
        return

    def _store_session(self, value):
        setattr(self, '_session', value)

    # property.setter() returns a new property that keeps the existing getter.
    RequestContext.session = session_prop.setter(_store_session)
def init_socketio(app, datastore):
"""Initialize SocketIO with the main Flask app"""
import platform
import sys
_patch_flask_request_context_session()
# Platform-specific async_mode selection for better stability
system = platform.system().lower()
python_version = sys.version_info

View File

@@ -39,8 +39,9 @@ def per_test_log_file(request):
"""Create a separate log file for each test function with pytest output."""
import re
# Create logs directory if it doesn't exist
log_dir = os.path.join(os.path.dirname(__file__), "logs")
# Create logs directory if it doesn't exist.
# TEST_LOG_DIR can be overridden e.g. to a writable path when /app is read-only (Docker).
log_dir = os.environ.get('TEST_LOG_DIR', os.path.join(os.path.dirname(__file__), "logs"))
os.makedirs(log_dir, exist_ok=True)
# Generate log filename from test name and worker ID (for parallel runs)

View File

@@ -453,175 +453,6 @@ class TestHtmlToText(unittest.TestCase):
def test_script_with_closing_tag_in_string_does_not_eat_content(self):
    """
    A closing script tag written inside a JS string must not end the block early.

    The fixture places a backslash-escaped closing script tag inside a JS string
    literal within a real <script> element. The paragraphs before and after the
    element must be extracted, and none of the script source may appear in the text.
    """
    markup = '''<html><body>
<p>Before script</p>
<script>
var html = "<div>foo<\\/script><p>bar</p>";
var also = 1;
</script>
<p>AFTER SCRIPT</p>
</body></html>'''
    extracted = html_to_text(markup)

    # Visible paragraphs on both sides of the script block are kept.
    for kept in ('Before script', 'AFTER SCRIPT'):
        assert kept in extracted

    # Script internals must not leak
    for leaked in ('var html', 'var also'):
        assert leaked not in extracted
def test_content_sandwiched_between_multiple_body_scripts(self):
    """Paragraphs interleaved with several script/style blocks in the body must all be kept."""
    markup = '''<html><body>
<script>var a = 1;</script>
<p>CONTENT A</p>
<style>.x { color: red; }</style>
<p>CONTENT B</p>
<script>var b = 2;</script>
<p>CONTENT C</p>
<style>.y { color: blue; }</style>
<p>CONTENT D</p>
</body></html>'''
    extracted = html_to_text(markup)

    # Each labelled paragraph sits between two stripped elements.
    expected_labels = ('CONTENT A', 'CONTENT B', 'CONTENT C', 'CONTENT D')
    for label in expected_labels:
        assert label in extracted, f"'{label}' was eaten by script/style stripping"

    # Neither JS source nor CSS rules may show up in the text.
    for fragment in ('var a', 'var b', 'color: red', 'color: blue'):
        assert fragment not in extracted
def test_unicode_and_international_content_preserved(self):
    """
    Non-ASCII body text must survive the stripping of neighbouring style/script blocks.

    The fixture contains soft hyphens, CJK text and an emoji; the assertions check
    the German marker word and the Chinese, Japanese and Korean strings.
    """
    markup = '''<html><body>
<style>.x{color:red}</style>
<p>German: Aus\xadge\xadbucht! — ANMELDUNG — Fan\xadday 2026</p>
<p>Chinese: \u6ce8\u518c</p>
<p>Japanese: \u767b\u9332</p>
<p>Korean: \ub4f1\ub85d</p>
<p>Emoji: \U0001f4e2</p>
<script>var x = 1;</script>
</body></html>'''
    extracted = html_to_text(markup)

    must_survive = (
        'ANMELDUNG',        # German line
        '\u6ce8\u518c',     # Chinese
        '\u767b\u9332',     # Japanese
        '\ub4f1\ub85d',     # Korean
    )
    for needle in must_survive:
        assert needle in extracted
def test_style_with_type_attribute_is_stripped(self):
    """A <style> element carrying a type attribute must be removed like a bare <style>."""
    markup = '''<html><body>
<style type="text/css">.important { display: none; }</style>
<p>VISIBLE CONTENT</p>
</body></html>'''
    extracted = html_to_text(markup)

    assert 'VISIBLE CONTENT' in extracted
    # Neither the selector nor the declaration from the stylesheet may appear.
    for css_fragment in ('.important', 'display: none'):
        assert css_fragment not in extracted
def test_ldjson_script_is_stripped(self):
    """A <script type="application/ld+json"> block must not surface its raw JSON as text."""
    markup = '''<html><body>
<script type="application/ld+json">
{"@type": "Product", "name": "Widget", "price": "9.99"}
</script>
<p>PRODUCT PAGE</p>
</body></html>'''
    extracted = html_to_text(markup)

    assert 'PRODUCT PAGE' in extracted
    # JSON keys from the structured-data block must be absent.
    for json_fragment in ('@type', '"price"'):
        assert json_fragment not in extracted
def test_inline_svg_is_stripped_entirely(self):
    """
    An inline <svg> in the body must contribute nothing to the extracted text.

    The paragraphs around the SVG must be kept, while path data and SVG attribute
    names must not appear in the output.
    """
    markup = '''<html><body>
<p>Before SVG</p>
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24">
<path d="M14 5L7 12L14 19Z" fill="none"/>
<circle cx="12" cy="12" r="10"/>
</svg>
<p>After SVG</p>
</body></html>'''
    extracted = html_to_text(markup)

    for kept in ('Before SVG', 'After SVG'):
        assert kept in extracted

    path_data_absent = 'M14 5L7' not in extracted
    assert path_data_absent, "SVG path data should not appear in text output"
    attributes_absent = 'viewBox' not in extracted
    assert attributes_absent, "SVG attributes should not appear in text output"
def test_tag_inside_json_data_attribute_does_not_eat_content(self):
    """
    Markup embedded in a JSON data attribute must not cause real content to be dropped.

    The fixture has a data-slider-atts attribute holding entity-encoded JSON that
    embeds an <svg> whose closing tag is backslash-escaped. It is followed by a
    real content block and then a real <svg> element. According to the original
    author's note this mirrors Elementor/JetEngine WordPress widgets.

    The heading and paragraph that sit between the attribute and the real <svg>
    must still be present in the extracted text.
    """
    markup = '''<!DOCTYPE html>
<html>
<head><title>Test</title></head>
<body>
<div class="slider" data-slider-atts="{&quot;prevArrow&quot;:&quot;<i class=\\&quot;icon\\&quot;><svg width=\\&quot;24\\&quot; height=\\&quot;24\\&quot; viewBox=\\&quot;0 0 24 24\\&quot; xmlns=\\&quot;http:\\/\\/www.w3.org\\/2000\\/svg\\&quot;><path d=\\&quot;M14 5L7 12L14 19\\&quot;\\/><\\/svg><\\/i>&quot;}">
</div>
<div class="content">
<h1>IMPORTANT CONTENT</h1>
<p>This text must not be eaten by the tag-stripping logic.</p>
</div>
<svg><circle cx="50" cy="50" r="40"/></svg>
</body>
</html>'''
    extracted = html_to_text(markup)

    failure_hint = (
        "Content after a JS-escaped tag in a data attribute was incorrectly stripped. "
        "The tag-stripping logic is matching <tag> inside attribute values and scanning "
        "forward to the next real closing tag in the DOM."
    )
    assert 'IMPORTANT CONTENT' in extracted, failure_hint
    assert 'This text must not be eaten' in extracted
def test_script_inside_json_data_attribute_does_not_eat_content(self):
    """
    A <script> embedded in a JSON data attribute must not cause real content to be dropped.

    The data-config attribute holds entity-encoded JSON containing a <script> tag
    whose closing tag is backslash-escaped. A real <script> element follows the
    content block. The heading and paragraph in between must be extracted.
    """
    markup = '''<!DOCTYPE html>
<html>
<head><title>Test</title></head>
<body>
<div data-config="{&quot;template&quot;:&quot;<script type=\\&quot;text\\/javascript\\&quot;>var x=1;<\\/script>&quot;}">
</div>
<div>
<h1>MUST SURVIVE</h1>
<p>Real content after the data attribute with embedded script tag.</p>
</div>
<script>var real = 1;</script>
</body>
</html>'''
    extracted = html_to_text(markup)

    failure_hint = (
        "Content after a JS-escaped <script> in a data attribute was incorrectly stripped."
    )
    assert 'MUST SURVIVE' in extracted, failure_hint
    assert 'Real content after the data attribute' in extracted
if __name__ == '__main__':
    # Running this file as a script hands control to unittest's command-line runner,
    # which is handy for a quick check without going through pytest.
    unittest.main()

View File

@@ -9,6 +9,12 @@ services:
# - ./proxies.json:/datastore/proxies.json
# environment:
# Run as a specific user/group (UID:GID). Defaults to 911:911.
# The container will automatically fix datastore ownership on first start if needed.
# Set SKIP_CHOWN=1 to disable the ownership migration (e.g. if you manage permissions yourself).
# - PUID=1000
# - PGID=1000
#
# Default listening port, can also be changed with the -p option (not to be confused with ports: below)
# - PORT=5000
#
@@ -80,8 +86,9 @@ services:
# RAM usage will be higher if you increase this.
# - SCREENSHOT_MAX_HEIGHT=16000
#
# HTTPS SSL Mode for webserver, unset both of these, you may need to volume mount these files also.
# HTTPS SSL Mode for webserver, volume mount the cert files and set these env vars.
# ./cert.pem:/app/cert.pem and ./privkey.pem:/app/privkey.pem
# Permissions are fixed automatically on startup.
# - SSL_CERT_FILE=cert.pem
# - SSL_PRIVKEY_FILE=privkey.pem
#
@@ -95,6 +102,8 @@ services:
ports:
- 127.0.0.1:5000:5000
restart: unless-stopped
security_opt:
- no-new-privileges:true
# Used for fetching pages via WebDriver+Chrome where you need Javascript support.
# Now working on arm64 (needs testing on rPi - tested on Oracle ARM instance)

View File

@@ -1,28 +1,68 @@
#!/bin/bash
set -e
set -eu
# Install additional packages from EXTRA_PACKAGES env var
# Uses a marker file to avoid reinstalling on every container restart
INSTALLED_MARKER="/datastore/.extra_packages_installed"
CURRENT_PACKAGES="$EXTRA_PACKAGES"
DATASTORE_PATH="${DATASTORE_PATH:-/datastore}"
if [ -n "$EXTRA_PACKAGES" ]; then
# Check if we need to install/update packages
if [ ! -f "$INSTALLED_MARKER" ] || [ "$(cat $INSTALLED_MARKER 2>/dev/null)" != "$CURRENT_PACKAGES" ]; then
echo "Installing extra packages: $EXTRA_PACKAGES"
pip3 install --no-cache-dir $EXTRA_PACKAGES
# -----------------------------------------------------------------------
# Phase 1: Running as root — fix up PUID/PGID and datastore ownership,
# then re-exec as the unprivileged changedetection user via gosu.
# -----------------------------------------------------------------------
if [ "$(id -u)" = '0' ]; then
PUID=${PUID:-911}
PGID=${PGID:-911}
if [ $? -eq 0 ]; then
echo "$CURRENT_PACKAGES" > "$INSTALLED_MARKER"
echo "Extra packages installed successfully"
else
echo "ERROR: Failed to install extra packages"
exit 1
groupmod -o -g "$PGID" changedetection
usermod -o -u "$PUID" changedetection
# Keep /extra_packages writable by the (potentially re-mapped) user
chown changedetection:changedetection /extra_packages
# One-time ownership migration: only chown if the datastore isn't already
# owned by the target UID (e.g. existing root-owned installations).
if [ -z "${SKIP_CHOWN:-}" ]; then
datastore_uid=$(stat -c '%u' "$DATASTORE_PATH")
if [ "$datastore_uid" != "$PUID" ]; then
echo "Updating $DATASTORE_PATH ownership to $PUID:$PGID (one-time migration)..."
chown -R changedetection:changedetection "$DATASTORE_PATH"
echo "Done."
fi
fi
# Fix SSL certificate permissions so the unprivileged user can read them.
# SSL_CERT_FILE / SSL_PRIVKEY_FILE may be relative (to /app) or absolute.
# Hand one TLS file over to the changedetection user and set its mode.
#   $1 - file path; may be empty, relative (resolved against /app) or absolute
#   $2 - permission mode passed to chmod
# Empty paths and paths that are not regular files are left untouched.
fix_ssl_perm() {
    local target="$1" perms="$2"

    # Nothing configured for this variable.
    if [ -z "$target" ]; then
        return
    fi

    # Relative paths are resolved against /app.
    case "$target" in
        /*) ;;
        *) target="/app/$target" ;;
    esac

    if [ -f "$target" ]; then
        chown changedetection:changedetection "$target"
        chmod "$perms" "$target"
    fi
}
fix_ssl_perm "${SSL_CERT_FILE:-}" 644
fix_ssl_perm "${SSL_PRIVKEY_FILE:-}" 600
# Re-exec this script as the unprivileged user
exec gosu changedetection:changedetection "$0" "$@"
fi
# -----------------------------------------------------------------------
# Phase 2: Running as unprivileged user — install any EXTRA_PACKAGES into
# /extra_packages (already on PYTHONPATH) then exec the app.
# -----------------------------------------------------------------------
# Install additional packages from EXTRA_PACKAGES env var.
# Uses a marker file in the datastore to avoid reinstalling on every restart.
if [ -n "${EXTRA_PACKAGES:-}" ]; then
INSTALLED_MARKER="${DATASTORE_PATH}/.extra_packages_installed"
if [ ! -f "$INSTALLED_MARKER" ] || [ "$(cat "$INSTALLED_MARKER" 2>/dev/null)" != "$EXTRA_PACKAGES" ]; then
echo "Installing extra packages: $EXTRA_PACKAGES"
pip3 install --target=/extra_packages --no-cache-dir $EXTRA_PACKAGES
echo "$EXTRA_PACKAGES" > "$INSTALLED_MARKER"
echo "Extra packages installed successfully"
else
echo "Extra packages already installed: $EXTRA_PACKAGES"
fi
fi
# Execute the main command
exec "$@"

View File

@@ -9,15 +9,10 @@ flask_restful
flask_cors # For the Chrome extension to operate
# janus # No longer needed - using pure threading.Queue for multi-loop support
flask_wtf~=1.2
# Flask 3.1 removed the session setter on RequestContext; the patch in
# changedetectionio/realtime/socket_server.py restores it so Flask-SocketIO works.
# Require >=3.1 so the patch is always needed; <4 guards against unknown breaking changes.
flask>=3.1,<4
# Flask-SocketIO 5.x still does ctx.session = ... directly; the patch above handles it.
# >=5.5.0 ensures the threading async_mode we rely on is available.
flask-socketio>=5.5.0,<6
python-socketio>=5.11.0,<6
python-engineio>=4.9.0,<5
flask~=3.1
flask-socketio~=5.6.0
python-socketio~=5.16.1
python-engineio~=4.13.1
inscriptis~=2.2
pytz
timeago~=1.0