try this

Revisiting Don't run docker container as root
2026-02-20 13:16:03 +00:00 · 2026-02-18 18:49:40 +01:00 · 2026-02-18 18:39:26 +01:00
10 changed files with 113 additions and 240 deletions
--- a/18
+++ b/18
@@ -86,6 +86,7 @@ LABEL org.opencontainers.image.licenses="Apache-2.0"
 LABEL org.opencontainers.image.vendor="changedetection.io"

 RUN apt-get update && apt-get install -y --no-install-recommends \
+    gosu \
    libxslt1.1 \
    # For presenting price amounts correctly in the restock/price detection overview
    locales \
@@ -101,18 +102,29 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
    libxrender-dev \
    && apt-get clean && rm -rf /var/lib/apt/lists/*

+# Create unprivileged user and required directories
+RUN groupadd -g 911 changedetection && \
+    useradd -u 911 -g 911 -M -s /bin/false changedetection && \
+    mkdir -p /datastore /extra_packages && \
+    chown changedetection:changedetection /extra_packages

 # https://stackoverflow.com/questions/58701233/docker-logs-erroneously-appears-empty-until-container-stops
 ENV PYTHONUNBUFFERED=1
-
-RUN [ ! -d "/datastore" ] && mkdir /datastore
+# Redirect .pyc cache to a writable location since /app is root-owned.
+# To disable bytecode caching entirely, set PYTHONDONTWRITEBYTECODE=1 at runtime.
+ENV PYTHONPYCACHEPREFIX=/tmp/pycache
+# Disable pytest's .pytest_cache directory (also writes to /app, which is root-owned).
+# Only has an effect when running tests inside the container.
+ENV PYTEST_ADDOPTS="-p no:cacheprovider"
+# Redirect test logs to the datastore (writable) instead of /app/tests/logs (read-only in container).
+ENV TEST_LOG_DIR=/datastore/test_logs

 # Re #80, sets SECLEVEL=1 in openssl.conf to allow monitoring sites with weak/old cipher suites
 RUN sed -i 's/^CipherString = .*/CipherString = DEFAULT@SECLEVEL=1/' /etc/ssl/openssl.cnf

 # Copy modules over to the final image and add their dir to PYTHONPATH
 COPY --from=builder /dependencies /usr/local
-ENV PYTHONPATH=/usr/local
+ENV PYTHONPATH=/usr/local:/extra_packages

 EXPOSE 5000

--- a/changedetectionio/init.py
+++ b/changedetectionio/init.py
@@ -2,7 +2,7 @@

 # Read more https://github.com/dgtlmoon/changedetection.io/wiki
 # Semver means never use .01, or 00. Should be .1.
-__version__ = '0.53.5'
+__version__ = '0.53.4'

 from changedetectionio.strtobool import strtobool
 from json.decoder import JSONDecodeError
--- a/changedetectionio/flask_app.py
+++ b/changedetectionio/flask_app.py
@@ -27,6 +27,7 @@ from flask import (
    session,
    url_for,
 )
+from flask_compress import Compress as FlaskCompress
 from flask_restful import abort, Api
 from flask_cors import CORS

@@ -73,14 +74,14 @@ CORS(app)
 # There's also a bug between flask compress and socketio that causes some kind of slow memory leak
 # It's better to use compression on your reverse proxy (nginx etc) instead.
 if strtobool(os.getenv("FLASK_ENABLE_COMPRESSION")):
-    from flask_compress import Compress as FlaskCompress
    app.config['COMPRESS_MIN_SIZE'] = 2096
    app.config['COMPRESS_MIMETYPES'] = ['text/html', 'text/css', 'text/javascript', 'application/json', 'application/javascript', 'image/svg+xml']
    # Use gzip only - smaller memory footprint than zstd/brotli (4-8KB vs 200-500KB contexts)
    app.config['COMPRESS_ALGORITHM'] = ['gzip']
-    compress = FlaskCompress()
-    compress.init_app(app)

+compress = FlaskCompress()
+
+compress.init_app(app)
 app.config['TEMPLATES_AUTO_RELOAD'] = False


--- a/changedetectionio/html_tools.py
+++ b/changedetectionio/html_tools.py
@@ -561,33 +561,31 @@ def html_to_text(html_content: str, render_anchor_tag_content=False, is_rss=Fals
        )
    else:
        parser_config = None
+
    if is_rss:
        html_content = re.sub(r'<title([\s>])', r'<h1\1', html_content)
        html_content = re.sub(r'</title>', r'</h1>', html_content)
    else:
-        # Use BS4 html.parser to strip bloat — SPA's often dump 10MB+ of CSS/JS into <head>,
-        # causing inscriptis to silently give up. Regex-based stripping is unsafe because tags
-        # can appear inside JSON data attributes with JS-escaped closing tags (e.g. <\/script>),
-        # causing the regex to scan past the intended close and eat real page content.
-        from bs4 import BeautifulSoup
-        soup = BeautifulSoup(html_content, 'html.parser')
-        # Strip tags that inscriptis cannot render as meaningful text and which can be very large.
-        # svg/math: produce path-data/MathML garbage; canvas/iframe/template: no inscriptis handlers.
-        # video/audio/picture are kept — they may contain meaningful fallback text or captions.
-        for tag in soup.find_all(['head', 'script', 'style', 'noscript', 'svg',
-                                  'math', 'canvas', 'iframe', 'template']):
-            tag.decompose()
+        # Strip bloat in one pass, SPA's often dump 10Mb+ into the <head> for styles, which is not needed
+        # Causing inscriptis to silently exit when more than ~10MB is found.
+        # All we are doing here is converting the HTML to text, no CSS layout etc
+        # Use backreference (\1) to ensure opening/closing tags match (prevents <style> matching </svg> in CSS data URIs)
+        html_content = re.sub(r'<(style|script|svg|noscript)[^>]*>.*?</\1>|<(?:link|meta)[^>]*/?>|<!--.*?-->',
+                              '', html_content, flags=re.DOTALL | re.IGNORECASE)

-        # SPAs often use <body style="display:none"> to hide content until JS loads.
-        # inscriptis respects CSS display rules, so strip hiding styles from the body tag.
-        body_tag = soup.find('body')
-        if body_tag and body_tag.get('style'):
-            style = body_tag['style']
-            if re.search(r'\b(?:display\s*:\s*none|visibility\s*:\s*hidden)\b', style, re.IGNORECASE):
-                logger.debug(f"html_to_text: Removing hiding styles from body tag (found: '{style}')")
-                del body_tag['style']
+        # SPAs often use <body style="display:none"> to hide content until JS loads
+        # inscriptis respects CSS display rules, so we need to remove these hiding styles
+        # to extract the actual page content
+        body_style_pattern = r'(<body[^>]*)\s+style\s*=\s*["\']([^"\']*\b(?:display\s*:\s*none|visibility\s*:\s*hidden)\b[^"\']*)["\']'
+
+        # Check if body has hiding styles that need to be fixed
+        body_match = re.search(body_style_pattern, html_content, flags=re.IGNORECASE)
+        if body_match:
+            from loguru import logger
+            logger.debug(f"html_to_text: Removing hiding styles from body tag (found: '{body_match.group(2)}')")
+
+        html_content = re.sub(body_style_pattern, r'\1', html_content, flags=re.IGNORECASE)

-        html_content = str(soup)

    text_content = get_text(html_content, config=parser_config)
    return text_content
--- a/changedetectionio/realtime/socket_server.py
+++ b/changedetectionio/realtime/socket_server.py
@@ -199,25 +199,11 @@ def handle_watch_update(socketio, **kwargs):
        logger.error(f"Socket.IO error in handle_watch_update: {str(e)}")


-def _patch_flask_request_context_session():
-    """Flask 3.1 removed the session setter from RequestContext, but Flask-SocketIO 5.6.0
-    still assigns to it directly (ctx.session = ...).  Restore a setter that writes the
-    private _session attribute so the two libraries work together.
-    """
-    from flask.ctx import RequestContext
-    if getattr(RequestContext.session, 'fset', None) is not None:
-        return  # Already has a setter (future Flask version restored it)
-    original_prop = RequestContext.session
-    RequestContext.session = original_prop.setter(lambda self, value: setattr(self, '_session', value))
-
-
 def init_socketio(app, datastore):
    """Initialize SocketIO with the main Flask app"""
    import platform
    import sys

-    _patch_flask_request_context_session()
-
    # Platform-specific async_mode selection for better stability
    system = platform.system().lower()
    python_version = sys.version_info
--- a/changedetectionio/tests/conftest.py
+++ b/changedetectionio/tests/conftest.py
@@ -39,8 +39,9 @@ def per_test_log_file(request):
    """Create a separate log file for each test function with pytest output."""
    import re

-    # Create logs directory if it doesn't exist
-    log_dir = os.path.join(os.path.dirname(__file__), "logs")
+    # Create logs directory if it doesn't exist.
+    # TEST_LOG_DIR can be overridden e.g. to a writable path when /app is read-only (Docker).
+    log_dir = os.environ.get('TEST_LOG_DIR', os.path.join(os.path.dirname(__file__), "logs"))
    os.makedirs(log_dir, exist_ok=True)

    # Generate log filename from test name and worker ID (for parallel runs)
--- a/changedetectionio/tests/unit/test_html_to_text.py
+++ b/changedetectionio/tests/unit/test_html_to_text.py
@@ -453,175 +453,6 @@ class TestHtmlToText(unittest.TestCase):



-    def test_script_with_closing_tag_in_string_does_not_eat_content(self):
-        """
-        Script tag containing </script> inside a JS string must not prematurely end the block.
-
-        This is the classic regex failure mode: the old pattern would find the first </script>
-        inside the JS string literal and stop there, leaving the tail of the script block
-        (plus any following content) exposed as raw text. BS4 parses the HTML correctly.
-        """
-        html = '''<html><body>
-<p>Before script</p>
-<script>
-var html = "<div>foo<\\/script><p>bar</p>";
-var also = 1;
-</script>
-<p>AFTER SCRIPT</p>
-</body></html>'''
-
-        text = html_to_text(html)
-        assert 'Before script' in text
-        assert 'AFTER SCRIPT' in text
-        # Script internals must not leak
-        assert 'var html' not in text
-        assert 'var also' not in text
-
-    def test_content_sandwiched_between_multiple_body_scripts(self):
-        """Content between multiple script/style blocks in the body must all survive."""
-        html = '''<html><body>
-<script>var a = 1;</script>
-<p>CONTENT A</p>
-<style>.x { color: red; }</style>
-<p>CONTENT B</p>
-<script>var b = 2;</script>
-<p>CONTENT C</p>
-<style>.y { color: blue; }</style>
-<p>CONTENT D</p>
-</body></html>'''
-
-        text = html_to_text(html)
-        for label in ['CONTENT A', 'CONTENT B', 'CONTENT C', 'CONTENT D']:
-            assert label in text, f"'{label}' was eaten by script/style stripping"
-        assert 'var a' not in text
-        assert 'var b' not in text
-        assert 'color: red' not in text
-        assert 'color: blue' not in text
-
-    def test_unicode_and_international_content_preserved(self):
-        """Non-ASCII content (umlauts, CJK, soft hyphens) must survive stripping."""
-        html = '''<html><body>
-<style>.x{color:red}</style>
-<p>German: Aus\xadge\xadbucht! — ANMELDUNG — Fan\xadday 2026</p>
-<p>Chinese: \u6ce8\u518c</p>
-<p>Japanese: \u767b\u9332</p>
-<p>Korean: \ub4f1\ub85d</p>
-<p>Emoji: \U0001f4e2</p>
-<script>var x = 1;</script>
-</body></html>'''
-
-        text = html_to_text(html)
-        assert 'ANMELDUNG' in text
-        assert '\u6ce8\u518c' in text   # Chinese
-        assert '\u767b\u9332' in text   # Japanese
-        assert '\ub4f1\ub85d' in text   # Korean
-
-    def test_style_with_type_attribute_is_stripped(self):
-        """<style type="text/css"> (with type attribute) must be stripped just like bare <style>."""
-        html = '''<html><body>
-<style type="text/css">.important { display: none; }</style>
-<p>VISIBLE CONTENT</p>
-</body></html>'''
-
-        text = html_to_text(html)
-        assert 'VISIBLE CONTENT' in text
-        assert '.important' not in text
-        assert 'display: none' not in text
-
-    def test_ldjson_script_is_stripped(self):
-        """<script type="application/ld+json"> must be stripped — raw JSON must not appear as text."""
-        html = '''<html><body>
-<script type="application/ld+json">
-{"@type": "Product", "name": "Widget", "price": "9.99"}
-</script>
-<p>PRODUCT PAGE</p>
-</body></html>'''
-
-        text = html_to_text(html)
-        assert 'PRODUCT PAGE' in text
-        assert '@type' not in text
-        assert '"price"' not in text
-
-    def test_inline_svg_is_stripped_entirely(self):
-        """
-        Inline SVG elements in the body are stripped by BS4 before passing to inscriptis.
-        SVGs can be huge (icon libraries, data visualisations) and produce garbage path-data
-        text. The old regex code explicitly stripped <svg>; the BS4 path must do the same.
-        """
-        html = '''<html><body>
-<p>Before SVG</p>
-<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24">
-    <path d="M14 5L7 12L14 19Z" fill="none"/>
-    <circle cx="12" cy="12" r="10"/>
-</svg>
-<p>After SVG</p>
-</body></html>'''
-
-        text = html_to_text(html)
-        assert 'Before SVG' in text
-        assert 'After SVG' in text
-        assert 'M14 5L7' not in text, "SVG path data should not appear in text output"
-        assert 'viewBox' not in text, "SVG attributes should not appear in text output"
-
-    def test_tag_inside_json_data_attribute_does_not_eat_content(self):
-        """
-        Tags inside JSON data attributes with JS-escaped closing tags must not eat real content.
-
-        Real-world case: Elementor/JetEngine WordPress widgets embed HTML (including SVG icons)
-        inside JSON data attributes like data-slider-atts. The HTML inside is JS-escaped, so
-        closing tags appear as <\\/svg> rather than </svg>.
-
-        The old regex approach would find <svg> inside the attribute value, then fail to find
-        <\/svg> as a matching close tag, and scan forward to the next real </svg> in the DOM —
-        eating tens of kilobytes of actual page content in the process.
-        """
-        html = '''<!DOCTYPE html>
-<html>
-<head><title>Test</title></head>
-<body>
-<div class="slider" data-slider-atts="{&quot;prevArrow&quot;:&quot;<i class=\\&quot;icon\\&quot;><svg width=\\&quot;24\\&quot; height=\\&quot;24\\&quot; viewBox=\\&quot;0 0 24 24\\&quot; xmlns=\\&quot;http:\\/\\/www.w3.org\\/2000\\/svg\\&quot;><path d=\\&quot;M14 5L7 12L14 19\\&quot;\\/><\\/svg><\\/i>&quot;}">
-</div>
-<div class="content">
-    <h1>IMPORTANT CONTENT</h1>
-    <p>This text must not be eaten by the tag-stripping logic.</p>
-</div>
-<svg><circle cx="50" cy="50" r="40"/></svg>
-</body>
-</html>'''
-
-        text = html_to_text(html)
-
-        assert 'IMPORTANT CONTENT' in text, (
-            "Content after a JS-escaped tag in a data attribute was incorrectly stripped. "
-            "The tag-stripping logic is matching <tag> inside attribute values and scanning "
-            "forward to the next real closing tag in the DOM."
-        )
-        assert 'This text must not be eaten' in text
-
-    def test_script_inside_json_data_attribute_does_not_eat_content(self):
-        """Same issue as above but with <script> embedded in a data attribute with JS-escaped closing tag."""
-        html = '''<!DOCTYPE html>
-<html>
-<head><title>Test</title></head>
-<body>
-<div data-config="{&quot;template&quot;:&quot;<script type=\\&quot;text\\/javascript\\&quot;>var x=1;<\\/script>&quot;}">
-</div>
-<div>
-    <h1>MUST SURVIVE</h1>
-    <p>Real content after the data attribute with embedded script tag.</p>
-</div>
-<script>var real = 1;</script>
-</body>
-</html>'''
-
-        text = html_to_text(html)
-
-        assert 'MUST SURVIVE' in text, (
-            "Content after a JS-escaped <script> in a data attribute was incorrectly stripped."
-        )
-        assert 'Real content after the data attribute' in text
-
-
 if __name__ == '__main__':
    # Can run this file directly for quick testing
    unittest.main()
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -9,6 +9,12 @@ services:
 #        - ./proxies.json:/datastore/proxies.json

  #    environment:
+  #        Run as a specific user/group (UID:GID). Defaults to 911:911.
+  #        The container will automatically fix datastore ownership on first start if needed.
+  #        Set SKIP_CHOWN=1 to disable the ownership migration (e.g. if you manage permissions yourself).
+  #      - PUID=1000
+  #      - PGID=1000
+  #
  #        Default listening port, can also be changed with the -p option (not to be confused with ports: below)
  #      - PORT=5000
  #
@@ -80,8 +86,9 @@ services:
  #        RAM usage will be higher if you increase this.
  #      - SCREENSHOT_MAX_HEIGHT=16000
  #
-  #        HTTPS SSL Mode for webserver, unset both of these, you may need to volume mount these files also.
+  #        HTTPS SSL Mode for webserver, volume mount the cert files and set these env vars.
  #        ./cert.pem:/app/cert.pem and ./privkey.pem:/app/privkey.pem
+  #        Permissions are fixed automatically on startup.
  #      - SSL_CERT_FILE=cert.pem
  #      - SSL_PRIVKEY_FILE=privkey.pem
  #
@@ -95,6 +102,8 @@ services:
      ports:
        - 127.0.0.1:5000:5000
      restart: unless-stopped
+      security_opt:
+        - no-new-privileges:true

     # Used for fetching pages via WebDriver+Chrome where you need Javascript support.
     # Now working on arm64 (needs testing on rPi - tested on Oracle ARM instance)
--- a/docker-entrypoint.sh
+++ b/docker-entrypoint.sh
@@ -1,28 +1,68 @@
 #!/bin/bash
-set -e
+set -eu

-# Install additional packages from EXTRA_PACKAGES env var
-# Uses a marker file to avoid reinstalling on every container restart
-INSTALLED_MARKER="/datastore/.extra_packages_installed"
-CURRENT_PACKAGES="$EXTRA_PACKAGES"
+DATASTORE_PATH="${DATASTORE_PATH:-/datastore}"

-if [ -n "$EXTRA_PACKAGES" ]; then
-    # Check if we need to install/update packages
-    if [ ! -f "$INSTALLED_MARKER" ] || [ "$(cat $INSTALLED_MARKER 2>/dev/null)" != "$CURRENT_PACKAGES" ]; then
-        echo "Installing extra packages: $EXTRA_PACKAGES"
-        pip3 install --no-cache-dir $EXTRA_PACKAGES
+# -----------------------------------------------------------------------
+# Phase 1: Running as root — fix up PUID/PGID and datastore ownership,
+#           then re-exec as the unprivileged changedetection user via gosu.
+# -----------------------------------------------------------------------
+if [ "$(id -u)" = '0' ]; then
+    PUID=${PUID:-911}
+    PGID=${PGID:-911}

-        if [ $? -eq 0 ]; then
-            echo "$CURRENT_PACKAGES" > "$INSTALLED_MARKER"
-            echo "Extra packages installed successfully"
-        else
-            echo "ERROR: Failed to install extra packages"
-            exit 1
+    groupmod -o -g "$PGID" changedetection
+    usermod -o -u "$PUID" changedetection
+
+    # Keep /extra_packages writable by the (potentially re-mapped) user
+    chown changedetection:changedetection /extra_packages
+
+    # One-time ownership migration: only chown if the datastore isn't already
+    # owned by the target UID (e.g. existing root-owned installations).
+    if [ -z "${SKIP_CHOWN:-}" ]; then
+        datastore_uid=$(stat -c '%u' "$DATASTORE_PATH")
+        if [ "$datastore_uid" != "$PUID" ]; then
+            echo "Updating $DATASTORE_PATH ownership to $PUID:$PGID (one-time migration)..."
+            chown -R changedetection:changedetection "$DATASTORE_PATH"
+            echo "Done."
        fi
+    fi
+
+    # Fix SSL certificate permissions so the unprivileged user can read them.
+    # SSL_CERT_FILE / SSL_PRIVKEY_FILE may be relative (to /app) or absolute.
+    fix_ssl_perm() {
+        local file="$1" mode="$2"
+        [ -z "$file" ] && return
+        [ "${file:0:1}" != "/" ] && file="/app/$file"
+        if [ -f "$file" ]; then
+            chown changedetection:changedetection "$file"
+            chmod "$mode" "$file"
+        fi
+    }
+    fix_ssl_perm "${SSL_CERT_FILE:-}" 644
+    fix_ssl_perm "${SSL_PRIVKEY_FILE:-}" 600
+
+    # Re-exec this script as the unprivileged user
+    exec gosu changedetection:changedetection "$0" "$@"
+fi
+
+# -----------------------------------------------------------------------
+# Phase 2: Running as unprivileged user — install any EXTRA_PACKAGES into
+#           /extra_packages (already on PYTHONPATH) then exec the app.
+# -----------------------------------------------------------------------
+
+# Install additional packages from EXTRA_PACKAGES env var.
+# Uses a marker file in the datastore to avoid reinstalling on every restart.
+if [ -n "${EXTRA_PACKAGES:-}" ]; then
+    INSTALLED_MARKER="${DATASTORE_PATH}/.extra_packages_installed"
+    if [ ! -f "$INSTALLED_MARKER" ] || [ "$(cat "$INSTALLED_MARKER" 2>/dev/null)" != "$EXTRA_PACKAGES" ]; then
+        echo "Installing extra packages: $EXTRA_PACKAGES"
+        pip3 install --target=/extra_packages --no-cache-dir $EXTRA_PACKAGES
+        echo "$EXTRA_PACKAGES" > "$INSTALLED_MARKER"
+        echo "Extra packages installed successfully"
    else
        echo "Extra packages already installed: $EXTRA_PACKAGES"
    fi
 fi

-# Execute the main command
 exec "$@"
--- a/requirements.txt
+++ b/requirements.txt
@@ -9,15 +9,10 @@ flask_restful
 flask_cors # For the Chrome extension to operate
 # janus # No longer needed - using pure threading.Queue for multi-loop support
 flask_wtf~=1.2
-# Flask 3.1 removed the session setter on RequestContext; the patch in
-# changedetectionio/realtime/socket_server.py restores it so Flask-SocketIO works.
-# Require >=3.1 so the patch is always needed; <4 guards against unknown breaking changes.
-flask>=3.1,<4
-# Flask-SocketIO 5.x still does ctx.session = ... directly; the patch above handles it.
-# >=5.5.0 ensures the threading async_mode we rely on is available.
-flask-socketio>=5.5.0,<6
-python-socketio>=5.11.0,<6
-python-engineio>=4.9.0,<5
+flask~=3.1
+flask-socketio~=5.6.0
+python-socketio~=5.16.1
+python-engineio~=4.13.1
 inscriptis~=2.2
 pytz
 timeago~=1.0
Author	SHA1	Message	Date
dgtlmoon	423b201d6a	try this	2026-02-18 18:49:40 +01:00
dgtlmoon	839cf7fd9d	Revisiting Don't run docker container as root	2026-02-18 18:39:26 +01:00