0.53.5

Fixing bad replacement of metadata causing possible content removal #3906 (#3908)
UI - Backup restore (#3899)
2026-02-20 13:16:03 +00:00 · 2026-02-20 00:57:52 +01:00 · 2026-02-20 00:55:37 +01:00 · 2026-02-18 18:05:32 +01:00 · 2026-02-18 14:07:26 +01:00 · 2026-02-18 14:05:23 +01:00
13 changed files with 646 additions and 80 deletions
--- a/changedetectionio/init.py
+++ b/changedetectionio/init.py
@@ -2,7 +2,7 @@

 # Read more https://github.com/dgtlmoon/changedetection.io/wiki
 # Semver means never use .01, or 00. Should be .1.
-__version__ = '0.53.3'
+__version__ = '0.53.5'

 from changedetectionio.strtobool import strtobool
 from json.decoder import JSONDecodeError
--- a/changedetectionio/blueprint/backups/init.py
+++ b/changedetectionio/blueprint/backups/init.py
@@ -13,7 +13,7 @@ from loguru import logger
 BACKUP_FILENAME_FORMAT = "changedetection-backup-{}.zip"


-def create_backup(datastore_path, watches: dict):
+def create_backup(datastore_path, watches: dict, tags: dict = None):
    logger.debug("Creating backup...")
    import zipfile
    from pathlib import Path
@@ -45,6 +45,15 @@ def create_backup(datastore_path, watches: dict):
        if os.path.isfile(secret_file):
            zipObj.write(secret_file, arcname="secret.txt")

+        # Add tag data directories (each tag has its own {uuid}/tag.json)
+        for uuid, tag in (tags or {}).items():
+            for f in Path(tag.data_dir).glob('*'):
+                zipObj.write(f,
+                             arcname=os.path.join(f.parts[-2], f.parts[-1]),
+                             compress_type=zipfile.ZIP_DEFLATED,
+                             compresslevel=8)
+            logger.debug(f"Added tag '{tag.get('title')}' ({uuid}) to backup")
+
        # Add any data in the watch data directory.
        for uuid, w in watches.items():
            for f in Path(w.data_dir).glob('*'):
@@ -88,7 +97,10 @@ def create_backup(datastore_path, watches: dict):


 def construct_blueprint(datastore: ChangeDetectionStore):
+    from .restore import construct_restore_blueprint
+
    backups_blueprint = Blueprint('backups', __name__, template_folder="templates")
+    backups_blueprint.register_blueprint(construct_restore_blueprint(datastore))
    backup_threads = []

    @login_optionally_required
@@ -96,16 +108,17 @@ def construct_blueprint(datastore: ChangeDetectionStore):
    def request_backup():
        if any(thread.is_alive() for thread in backup_threads):
            flash(gettext("A backup is already running, check back in a few minutes"), "error")
-            return redirect(url_for('backups.index'))
+            return redirect(url_for('backups.create'))

        if len(find_backups()) > int(os.getenv("MAX_NUMBER_BACKUPS", 100)):
            flash(gettext("Maximum number of backups reached, please remove some"), "error")
-            return redirect(url_for('backups.index'))
+            return redirect(url_for('backups.create'))

        # With immediate persistence, all data is already saved
        zip_thread = threading.Thread(
            target=create_backup,
            args=(datastore.datastore_path, datastore.data.get("watching")),
+            kwargs={'tags': datastore.data['settings']['application'].get('tags', {})},
            daemon=True,
            name="BackupCreator"
        )
@@ -113,7 +126,7 @@ def construct_blueprint(datastore: ChangeDetectionStore):
        backup_threads.append(zip_thread)
        flash(gettext("Backup building in background, check back in a few minutes."))

-        return redirect(url_for('backups.index'))
+        return redirect(url_for('backups.create'))

    def find_backups():
        backup_filepath = os.path.join(datastore.datastore_path, BACKUP_FILENAME_FORMAT.format("*"))
@@ -155,14 +168,14 @@ def construct_blueprint(datastore: ChangeDetectionStore):
        return send_from_directory(os.path.abspath(datastore.datastore_path), filename, as_attachment=True)

    @login_optionally_required
-    @backups_blueprint.route("", methods=['GET'])
-    def index():
+    @backups_blueprint.route("/", methods=['GET'])
+    @backups_blueprint.route("/create", methods=['GET'])
+    def create():
        backups = find_backups()
-        output = render_template("overview.html",
+        output = render_template("backup_create.html",
                                 available_backups=backups,
                                 backup_running=any(thread.is_alive() for thread in backup_threads)
                                 )
-
        return output

    @login_optionally_required
@@ -176,6 +189,6 @@ def construct_blueprint(datastore: ChangeDetectionStore):

        flash(gettext("Backups were deleted."))

-        return redirect(url_for('backups.index'))
+        return redirect(url_for('backups.create'))

    return backups_blueprint
--- a/changedetectionio/blueprint/backups/restore.py
+++ b/changedetectionio/blueprint/backups/restore.py
@@ -0,0 +1,208 @@
+import io
+import json
+import os
+import shutil
+import tempfile
+import threading
+import zipfile
+
+from flask import Blueprint, render_template, flash, url_for, redirect, request
+from flask_babel import gettext, lazy_gettext as _l
+from wtforms import Form, BooleanField, SubmitField
+from flask_wtf.file import FileField, FileAllowed
+from loguru import logger
+
+from changedetectionio.flask_app import login_optionally_required
+
+
+class RestoreForm(Form):
+    zip_file = FileField(_l('Backup zip file'), validators=[
+        FileAllowed(['zip'], _l('Must be a .zip backup file!'))
+    ])
+    include_groups = BooleanField(_l('Include groups'), default=True)
+    include_groups_replace_existing = BooleanField(_l('Replace existing groups of the same UUID'), default=True)
+    include_watches = BooleanField(_l('Include watches'), default=True)
+    include_watches_replace_existing = BooleanField(_l('Replace existing watches of the same UUID'), default=True)
+    submit = SubmitField(_l('Restore backup'))
+
+
+def import_from_zip(zip_stream, datastore, include_groups, include_groups_replace, include_watches, include_watches_replace):
+    """
+    Extract and import watches and groups from a backup zip stream.
+
+    Mirrors the store's _load_watches / _load_tags loading pattern:
+      - UUID dirs with tag.json  → Tag.model + tag_obj.commit()
+      - UUID dirs with watch.json → rehydrate_entity + watch_obj.commit()
+
+    Returns a dict with counts: restored_groups, skipped_groups, restored_watches, skipped_watches.
+    Raises zipfile.BadZipFile if the stream is not a valid zip.
+    """
+    from changedetectionio.model import Tag
+
+    restored_groups = 0
+    skipped_groups = 0
+    restored_watches = 0
+    skipped_watches = 0
+    # setdefault (not get): if 'tags' is missing, restored groups must still land in the live settings dict
+    current_tags = datastore.data['settings']['application'].setdefault('tags', {})
+    current_watches = datastore.data['watching']
+
+    with tempfile.TemporaryDirectory() as tmpdir:
+        logger.debug(f"Restore: extracting zip to {tmpdir}")
+        with zipfile.ZipFile(zip_stream, 'r') as zf:
+            zf.extractall(tmpdir)  # NOTE(review): no cap on extracted size or entry count
+        logger.debug("Restore: zip extracted, scanning UUID directories")
+
+        for entry in os.scandir(tmpdir):
+            if not entry.is_dir():
+                continue
+
+            uuid = entry.name  # NOTE(review): taken from the archive as-is, then used as a datastore sub-directory name
+            tag_json_path = os.path.join(entry.path, 'tag.json')
+            watch_json_path = os.path.join(entry.path, 'watch.json')
+
+            # --- Tags (groups) ---
+            if include_groups and os.path.exists(tag_json_path):
+                if uuid in current_tags and not include_groups_replace:
+                    logger.debug(f"Restore: skipping existing group {uuid} (replace not requested)")
+                    skipped_groups += 1
+                    continue
+
+                try:
+                    with open(tag_json_path, 'r', encoding='utf-8') as f:
+                        tag_data = json.load(f)
+                except (json.JSONDecodeError, IOError) as e:
+                    logger.error(f"Restore: failed to read tag.json for {uuid}: {e}")
+                    continue  # unreadable entries are counted as neither restored nor skipped
+
+                title = tag_data.get('title', uuid)
+                logger.debug(f"Restore: importing group '{title}' ({uuid})")
+
+                # Mirror _load_tags: set uuid and force processor
+                tag_data['uuid'] = uuid
+                tag_data['processor'] = 'restock_diff'
+
+                # Copy the UUID directory so data_dir exists for commit()
+                dst_dir = os.path.join(datastore.datastore_path, uuid)
+                if os.path.exists(dst_dir):
+                    shutil.rmtree(dst_dir)
+                shutil.copytree(entry.path, dst_dir)
+
+                tag_obj = Tag.model(
+                    datastore_path=datastore.datastore_path,
+                    __datastore=datastore.data,
+                    default=tag_data
+                )
+                current_tags[uuid] = tag_obj
+                tag_obj.commit()
+                restored_groups += 1
+                logger.success(f"Restore: group '{title}' ({uuid}) restored")
+
+            # --- Watches ---
+            elif include_watches and os.path.exists(watch_json_path):
+                if uuid in current_watches and not include_watches_replace:
+                    logger.debug(f"Restore: skipping existing watch {uuid} (replace not requested)")
+                    skipped_watches += 1
+                    continue
+
+                try:
+                    with open(watch_json_path, 'r', encoding='utf-8') as f:
+                        watch_data = json.load(f)
+                except (json.JSONDecodeError, IOError) as e:
+                    logger.error(f"Restore: failed to read watch.json for {uuid}: {e}")
+                    continue  # unreadable entries are counted as neither restored nor skipped
+
+                url = watch_data.get('url', uuid)
+                logger.debug(f"Restore: importing watch '{url}' ({uuid})")
+
+                # Copy UUID directory first so data_dir and history files exist
+                dst_dir = os.path.join(datastore.datastore_path, uuid)
+                if os.path.exists(dst_dir):
+                    shutil.rmtree(dst_dir)
+                shutil.copytree(entry.path, dst_dir)
+
+                # Mirror _load_watches / rehydrate_entity
+                watch_data['uuid'] = uuid
+                watch_obj = datastore.rehydrate_entity(uuid, watch_data)
+                current_watches[uuid] = watch_obj
+                watch_obj.commit()
+                restored_watches += 1
+                logger.success(f"Restore: watch '{url}' ({uuid}) restored")
+
+        logger.debug(f"Restore: scan complete - groups {restored_groups} restored / {skipped_groups} skipped, "
+                     f"watches {restored_watches} restored / {skipped_watches} skipped")
+
+    # Persist changedetection.json (includes the updated tags dict)
+    logger.debug("Restore: committing datastore settings")
+    datastore.commit()
+
+    return {
+        'restored_groups': restored_groups,
+        'skipped_groups': skipped_groups,
+        'restored_watches': restored_watches,
+        'skipped_watches': skipped_watches,
+    }
+
+
+
+def construct_restore_blueprint(datastore):  # builds the 'restore' blueprint: upload form page + background restore POST
+    restore_blueprint = Blueprint('restore', __name__, template_folder="templates")
+    restore_threads = []  # every restore thread started so far; polled with is_alive()
+    # NOTE: route() is applied last (outermost) so Flask registers the login-wrapped view, not the bare function.
+    @restore_blueprint.route("/restore", methods=['GET'])
+    @login_optionally_required
+    def restore():
+        form = RestoreForm()
+        return render_template("backup_restore.html",
+                               form=form,
+                               restore_running=any(t.is_alive() for t in restore_threads))
+
+    @restore_blueprint.route("/restore/start", methods=['POST'])
+    @login_optionally_required
+    def backups_restore_start():
+        if any(t.is_alive() for t in restore_threads):
+            flash(gettext("A restore is already running, check back in a few minutes"), "error")
+            return redirect(url_for('backups.restore.restore'))
+
+        zip_file = request.files.get('zip_file')
+        if not zip_file or not zip_file.filename:
+            flash(gettext("No file uploaded"), "error")
+            return redirect(url_for('backups.restore.restore'))
+
+        if not zip_file.filename.lower().endswith('.zip'):
+            flash(gettext("File must be a .zip backup file"), "error")
+            return redirect(url_for('backups.restore.restore'))
+
+        # Read into memory now — the request stream is gone once we return
+        try:
+            zip_bytes = io.BytesIO(zip_file.read())
+            zipfile.ZipFile(zip_bytes).close()  # quick validity check before spawning; the BytesIO itself stays open
+            zip_bytes.seek(0)
+        except zipfile.BadZipFile:
+            flash(gettext("Invalid or corrupted zip file"), "error")
+            return redirect(url_for('backups.restore.restore'))
+
+        include_groups = request.form.get('include_groups') == 'y'
+        include_groups_replace = request.form.get('include_groups_replace_existing') == 'y'
+        include_watches = request.form.get('include_watches') == 'y'
+        include_watches_replace = request.form.get('include_watches_replace_existing') == 'y'
+
+        restore_thread = threading.Thread(
+            target=import_from_zip,
+            kwargs={
+                'zip_stream': zip_bytes,
+                'datastore': datastore,
+                'include_groups': include_groups,
+                'include_groups_replace': include_groups_replace,
+                'include_watches': include_watches,
+                'include_watches_replace': include_watches_replace,
+            },
+            daemon=True,
+            name="BackupRestore"
+        )
+        restore_thread.start()
+        restore_threads.append(restore_thread)
+        flash(gettext("Restore started in background, check back in a few minutes."))
+        return redirect(url_for('backups.restore.restore'))
+
+    return restore_blueprint
--- a/changedetectionio/blueprint/backups/templates/backup_create.html
+++ b/changedetectionio/blueprint/backups/templates/backup_create.html
@@ -0,0 +1,49 @@
+{% extends 'base.html' %}
+{% block content %}
+    {% from '_helpers.html' import render_simple_field, render_field %}
+
+    <div class="edit-form">
+        <div class="tabs collapsable">
+            <ul>
+                <li class="tab active" id=""><a href="{{ url_for('backups.create') }}">{{ _('Create') }}</a></li>
+                <li class="tab"><a href="{{ url_for('backups.restore.restore') }}">{{ _('Restore') }}</a></li>
+            </ul>
+        </div>
+        <div class="box-wrap inner">
+            <div id="general">
+                {% if backup_running %}
+                    <p>
+                        <span class="spinner"></span>&nbsp;<strong>{{ _('A backup is running!') }}</strong>
+                    </p>
+                {% endif %}
+
+                <p>
+                    {{ _('Here you can download and request a new backup, when a backup is completed you will see it listed below.') }}
+                </p>
+                <br>
+                {% if available_backups %}
+                    <ul>
+                        {% for backup in available_backups %}
+                            <li>
+                                <a href="{{ url_for('backups.download_backup', filename=backup["filename"]) }}">{{ backup["filename"] }}</a> {{ backup["filesize"] }} {{ _('Mb') }}
+                            </li>
+                        {% endfor %}
+                    </ul>
+                {% else %}
+                    <p>
+                        <strong>{{ _('No backups found.') }}</strong>
+                    </p>
+                {% endif %}
+
+                <a class="pure-button pure-button-primary"
+                   href="{{ url_for('backups.request_backup') }}">{{ _('Create backup') }}</a>
+                {% if available_backups %}
+                    <a class="pure-button button-small button-error "
+                       href="{{ url_for('backups.remove_backups') }}">{{ _('Remove backups') }}</a>
+                {% endif %}
+
+            </div>
+
+        </div>
+    </div>
+{% endblock %}
--- a/changedetectionio/blueprint/backups/templates/backup_restore.html
+++ b/changedetectionio/blueprint/backups/templates/backup_restore.html
@@ -0,0 +1,58 @@
+{% extends 'base.html' %}
+{% block content %}
+    {% from '_helpers.html' import render_field, render_checkbox_field %}
+
+    <div class="edit-form">
+        <div class="tabs collapsable">
+            <ul>
+                <li class="tab"><a href="{{ url_for('backups.create') }}">{{ _('Create') }}</a></li>
+                <li class="tab active"><a href="{{ url_for('backups.restore.restore') }}">{{ _('Restore') }}</a></li>
+            </ul>
+        </div>
+        <div class="box-wrap inner">
+            <div id="general">
+                {% if restore_running %}
+                    <p>
+                        <span class="spinner"></span>&nbsp;<strong>{{ _('A restore is running!') }}</strong>
+                    </p>
+                {% endif %}
+
+                <p>{{ _('Restore a backup. Must be a .zip backup file created on/after v0.53.1 (new database layout).') }}</p>
+                <p>{{ _('Note: This does not override the main application settings, only watches and groups.') }}</p>
+
+                <form class="pure-form pure-form-stacked settings"
+                      action="{{ url_for('backups.restore.backups_restore_start') }}"
+                      method="POST"
+                      enctype="multipart/form-data">
+                    <input type="hidden" name="csrf_token" value="{{ csrf_token() }}">
+
+                    <div class="pure-control-group">
+                        {{ render_checkbox_field(form.include_groups) }}
+                        <span class="pure-form-message-inline">{{ _('Include all groups found in backup?') }}</span>
+                    </div>
+                    <div class="pure-control-group">
+                        {{ render_checkbox_field(form.include_groups_replace_existing) }}
+                        <span class="pure-form-message-inline">{{ _('Replace any existing groups of the same UUID?') }}</span>
+                    </div>
+
+                    <div class="pure-control-group">
+                        {{ render_checkbox_field(form.include_watches) }}
+                        <span class="pure-form-message-inline">{{ _('Include all watches found in backup?') }}</span>
+                    </div>
+                    <div class="pure-control-group">
+                        {{ render_checkbox_field(form.include_watches_replace_existing) }}
+                        <span class="pure-form-message-inline">{{ _('Replace any existing watches of the same UUID?') }}</span>
+                    </div>
+
+                    <div class="pure-control-group">
+                        {{ render_field(form.zip_file) }}
+                    </div>
+
+                    <div class="pure-controls">
+                        <button type="submit" class="pure-button pure-button-primary">{{ _('Restore backup') }}</button>
+                    </div>
+                </form>
+            </div>
+        </div>
+    </div>
+{% endblock %}
--- a/changedetectionio/blueprint/backups/templates/overview.html
+++ b/changedetectionio/blueprint/backups/templates/overview.html
@@ -1,36 +0,0 @@
-{% extends 'base.html' %}
-{% block content %}
-    {% from '_helpers.html' import render_simple_field, render_field %}
-    <div class="edit-form">
-        <div class="box-wrap inner">
-            <h2>{{ _('Backups') }}</h2>
-            {% if backup_running %}
-                <p>
-                    <span class="spinner"></span>&nbsp;<strong>{{ _('A backup is running!') }}</strong>
-                </p>
-            {% endif %}
-            <p>
-                {{ _('Here you can download and request a new backup, when a backup is completed you will see it listed below.') }}
-            </p>
-            <br>
-                {% if available_backups %}
-                    <ul>
-                    {% for backup in available_backups %}
-                        <li><a href="{{ url_for('backups.download_backup', filename=backup["filename"]) }}">{{ backup["filename"] }}</a> {{  backup["filesize"] }} {{ _('Mb') }}</li>
-                    {% endfor %}
-                    </ul>
-                {% else %}
-                    <p>
-                    <strong>{{ _('No backups found.') }}</strong>
-                    </p>
-                {% endif %}
-
-            <a class="pure-button pure-button-primary" href="{{ url_for('backups.request_backup') }}">{{ _('Create backup') }}</a>
-            {% if available_backups %}
-                <a class="pure-button button-small button-error " href="{{ url_for('backups.remove_backups') }}">{{ _('Remove backups') }}</a>
-            {% endif %}
-        </div>
-    </div>
-
-
-{% endblock %}
--- a/changedetectionio/blueprint/imports/templates/import.html
+++ b/changedetectionio/blueprint/imports/templates/import.html
@@ -16,6 +16,11 @@
        <form class="pure-form" action="{{url_for('imports.import_page')}}" method="POST" enctype="multipart/form-data">
            <input type="hidden" name="csrf_token" value="{{ csrf_token() }}">
            <div class="tab-pane-inner" id="url-list">
+
+                <p>
+                {{ _('Restoring changedetection.io backups is in the') }}<a href="{{ url_for('backups.restore.restore') }}"> {{ _('backups section') }}</a>.
+                <br>
+                </p>
                <div class="pure-control-group">
                        {{ _('Enter one URL per line, and optionally add tags for each URL after a space, delineated by comma (,):') }}
                        <br>
@@ -37,9 +42,6 @@
            </div>

            <div class="tab-pane-inner" id="distill-io">
-
-
-
                    <div class="pure-control-group">
                        {{ _('Copy and Paste your Distill.io watch \'export\' file, this should be a JSON file.') }}<br>
                        {{ _('This is') }} <i>{{ _('experimental') }}</i>, {{ _('supported fields are') }} <code>name</code>, <code>uri</code>, <code>tags</code>, <code>config:selections</code>, {{ _('the rest (including') }} <code>schedule</code>) {{ _('are ignored.') }}
@@ -49,8 +51,6 @@
                        {{ _('Be sure to set your default fetcher to Chrome if required.') }}<br>
                        </p>
                    </div>
-
-
                    <textarea name="distill-io" class="pure-input-1-2" style="width: 100%;
                                font-family:monospace;
                                white-space: pre;
@@ -114,6 +114,7 @@
                </div>
            </div>
            <button type="submit" class="pure-button pure-input-1-2 pure-button-primary">{{ _('Import') }}</button>
+
        </form>

    </div>
--- a/changedetectionio/blueprint/settings/templates/settings.html
+++ b/changedetectionio/blueprint/settings/templates/settings.html
@@ -25,7 +25,7 @@
            <li class="tab"><a href="#ui-options">{{ _('UI Options') }}</a></li>
            <li class="tab"><a href="#api">{{ _('API') }}</a></li>
            <li class="tab"><a href="#rss">{{ _('RSS') }}</a></li>
-            <li class="tab"><a href="{{ url_for('backups.index') }}">{{ _('Backups') }}</a></li>
+            <li class="tab"><a href="{{ url_for('backups.create') }}">{{ _('Backups') }}</a></li>
            <li class="tab"><a href="#timedate">{{ _('Time & Date') }}</a></li>
            <li class="tab"><a href="#proxies">{{ _('CAPTCHA & Proxies') }}</a></li>
            {% if plugin_tabs %}
--- a/changedetectionio/html_tools.py
+++ b/changedetectionio/html_tools.py
@@ -561,31 +561,33 @@ def html_to_text(html_content: str, render_anchor_tag_content=False, is_rss=Fals
        )
    else:
        parser_config = None
-
    if is_rss:
        html_content = re.sub(r'<title([\s>])', r'<h1\1', html_content)
        html_content = re.sub(r'</title>', r'</h1>', html_content)
    else:
-        # Strip bloat in one pass, SPA's often dump 10Mb+ into the <head> for styles, which is not needed
-        # Causing inscriptis to silently exit when more than ~10MB is found.
-        # All we are doing here is converting the HTML to text, no CSS layout etc
-        # Use backreference (\1) to ensure opening/closing tags match (prevents <style> matching </svg> in CSS data URIs)
-        html_content = re.sub(r'<(style|script|svg|noscript)[^>]*>.*?</\1>|<(?:link|meta)[^>]*/?>|<!--.*?-->',
-                              '', html_content, flags=re.DOTALL | re.IGNORECASE)
+        # Use BS4 html.parser to strip bloat — SPA's often dump 10MB+ of CSS/JS into <head>,
+        # causing inscriptis to silently give up. Regex-based stripping is unsafe because tags
+        # can appear inside JSON data attributes with JS-escaped closing tags (e.g. <\/script>),
+        # causing the regex to scan past the intended close and eat real page content.
+        from bs4 import BeautifulSoup
+        soup = BeautifulSoup(html_content, 'html.parser')
+        # Strip tags that inscriptis cannot render as meaningful text and which can be very large.
+        # svg/math: produce path-data/MathML garbage; canvas/iframe/template: no inscriptis handlers.
+        # video/audio/picture are kept — they may contain meaningful fallback text or captions.
+        for tag in soup.find_all(['head', 'script', 'style', 'noscript', 'svg',
+                                  'math', 'canvas', 'iframe', 'template']):
+            tag.decompose()

-        # SPAs often use <body style="display:none"> to hide content until JS loads
-        # inscriptis respects CSS display rules, so we need to remove these hiding styles
-        # to extract the actual page content
-        body_style_pattern = r'(<body[^>]*)\s+style\s*=\s*["\']([^"\']*\b(?:display\s*:\s*none|visibility\s*:\s*hidden)\b[^"\']*)["\']'
-
-        # Check if body has hiding styles that need to be fixed
-        body_match = re.search(body_style_pattern, html_content, flags=re.IGNORECASE)
-        if body_match:
-            from loguru import logger
-            logger.debug(f"html_to_text: Removing hiding styles from body tag (found: '{body_match.group(2)}')")
-
-        html_content = re.sub(body_style_pattern, r'\1', html_content, flags=re.IGNORECASE)
+        # SPAs often use <body style="display:none"> to hide content until JS loads.
+        # inscriptis respects CSS display rules, so strip hiding styles from the body tag.
+        body_tag = soup.find('body')
+        if body_tag and body_tag.get('style'):
+            style = body_tag['style']
+            if re.search(r'\b(?:display\s*:\s*none|visibility\s*:\s*hidden)\b', style, re.IGNORECASE):
+                logger.debug(f"html_to_text: Removing hiding styles from body tag (found: '{style}')")
+                del body_tag['style']

+        html_content = str(soup)

    text_content = get_text(html_content, config=parser_config)
    return text_content
--- a/changedetectionio/realtime/socket_server.py
+++ b/changedetectionio/realtime/socket_server.py
@@ -199,11 +199,25 @@ def handle_watch_update(socketio, **kwargs):
        logger.error(f"Socket.IO error in handle_watch_update: {str(e)}")


+def _patch_flask_request_context_session():
+    """Flask 3.1 removed the session setter from RequestContext, but Flask-SocketIO 5.6.0
+    still assigns to it directly (ctx.session = ...).  Restore a setter that writes the
+    private _session attribute so the two libraries work together.
+    """
+    from flask.ctx import RequestContext
+    original_prop = getattr(RequestContext, 'session', None)
+    if not isinstance(original_prop, property) or original_prop.fset is not None:
+        return  # Not a setter-less property (setter already present, or attribute reshaped/absent): nothing to patch
+    RequestContext.session = original_prop.setter(lambda self, value: setattr(self, '_session', value))
+
+
 def init_socketio(app, datastore):
    """Initialize SocketIO with the main Flask app"""
    import platform
    import sys

+    _patch_flask_request_context_session()
+
    # Platform-specific async_mode selection for better stability
    system = platform.system().lower()
    python_version = sys.version_info
--- a/changedetectionio/tests/test_backup.py
+++ b/changedetectionio/tests/test_backup.py
@@ -6,11 +6,10 @@ import io
 from zipfile import ZipFile
 import re
 import time
+from changedetectionio.model import Watch, Tag


 def test_backup(client, live_server, measure_memory_usage, datastore_path):
-   #  live_server_setup(live_server) # Setup on conftest per function
-
    set_original_response(datastore_path=datastore_path)


@@ -32,7 +31,7 @@ def test_backup(client, live_server, measure_memory_usage, datastore_path):
    time.sleep(4)

    res = client.get(
-        url_for("backups.index"),
+        url_for("backups.create"),
        follow_redirects=True
    )
    # Can see the download link to the backup
@@ -80,11 +79,12 @@ def test_backup(client, live_server, measure_memory_usage, datastore_path):

 def test_watch_data_package_download(client, live_server, measure_memory_usage, datastore_path):
    """Test downloading a single watch's data as a zip package"""
-    import os

    set_original_response(datastore_path=datastore_path)

    uuid = client.application.config.get('DATASTORE').add_watch(url=url_for('test_endpoint', _external=True))
+    tag_uuid = client.application.config.get('DATASTORE').add_tag(title="Tasty backup tag")
+    tag_uuid2 = client.application.config.get('DATASTORE').add_tag(title="Tasty backup tag number two")
    client.get(url_for("ui.form_watch_checknow"), follow_redirects=True)

    wait_for_all_checks(client)
@@ -113,4 +113,87 @@ def test_watch_data_package_download(client, live_server, measure_memory_usage,
    # Should contain history/snapshot files
    uuid4hex_txt = re.compile(f'^{re.escape(uuid)}/.*\\.txt', re.I)
    txt_files = list(filter(uuid4hex_txt.match, files))
-    assert len(txt_files) > 0, f"Should have at least one .txt file (history/snapshot), got: {files}"
+    assert len(txt_files) > 0, f"Should have at least one .txt file (history/snapshot), got: {files}"
+
+
+def test_backup_restore(client, live_server, measure_memory_usage, datastore_path):
+    """Test that a full backup zip can be restored — watches and tags survive a round-trip."""
+
+    set_original_response(datastore_path=datastore_path)
+
+    datastore = live_server.app.config['DATASTORE']
+    watch_url = url_for('test_endpoint', _external=True)
+
+    # Set up: one watch and two tags
+    uuid = datastore.add_watch(url=watch_url)
+    tag_uuid = datastore.add_tag(title="Tasty backup tag")
+    tag_uuid2 = datastore.add_tag(title="Tasty backup tag number two")
+
+    client.get(url_for("ui.form_watch_checknow"), follow_redirects=True)
+    wait_for_all_checks(client)
+
+    # Create a full backup
+    client.get(url_for("backups.request_backup"), follow_redirects=True)
+    time.sleep(4)
+
+    # Download the latest backup zip
+    res = client.get(url_for("backups.download_backup", filename="latest"), follow_redirects=True)
+    assert res.content_type == "application/zip"
+    zip_data = res.data
+
+    # Confirm the zip contains both watch.json and tag.json entries
+    backup = ZipFile(io.BytesIO(zip_data))
+    names = backup.namelist()
+    assert f"{uuid}/watch.json" in names, f"watch.json missing from backup: {names}"
+    assert f"{tag_uuid}/tag.json" in names, f"tag.json for tag 1 missing from backup: {names}"
+    assert f"{tag_uuid2}/tag.json" in names, f"tag.json for tag 2 missing from backup: {names}"
+
+    # --- Wipe everything ---
+    datastore.delete('all')
+    client.get(url_for("tags.delete_all"), follow_redirects=True)
+
+    assert uuid not in datastore.data['watching'], "Watch should be gone after delete"
+    assert tag_uuid not in datastore.data['settings']['application']['tags'], "Tag 1 should be gone after delete"
+    assert tag_uuid2 not in datastore.data['settings']['application']['tags'], "Tag 2 should be gone after delete"
+
+    # --- Restore from the backup zip ---
+    res = client.post(
+        url_for("backups.restore.backups_restore_start"),
+        data={
+            'zip_file': (io.BytesIO(zip_data), 'backup.zip'),
+            'include_groups': 'y',
+            'include_groups_replace_existing': 'y',
+            'include_watches': 'y',
+            'include_watches_replace_existing': 'y',
+        },
+        content_type='multipart/form-data',
+        follow_redirects=True
+    )
+    assert res.status_code == 200
+
+    # Wait for the thread to finish
+    time.sleep(2)
+
+    # --- Watch checks ---
+    restored_watch = datastore.data['watching'].get(uuid)
+    assert restored_watch is not None, f"Watch {uuid} not found after restore"
+    assert restored_watch['url'] == watch_url, "Restored watch URL does not match"
+    assert isinstance(restored_watch, Watch.model), \
+        f"Watch not properly rehydrated, got {type(restored_watch)}"
+    assert restored_watch.history_n >= 1, \
+        f"Restored watch should have at least 1 history entry, got {restored_watch.history_n}"
+
+    # --- Tag checks ---
+    restored_tags = datastore.data['settings']['application']['tags']
+
+    restored_tag = restored_tags.get(tag_uuid)
+    assert restored_tag is not None, f"Tag {tag_uuid} not found after restore"
+    assert restored_tag['title'] == "Tasty backup tag", "Restored tag 1 title does not match"
+    assert isinstance(restored_tag, Tag.model), \
+        f"Tag 1 not properly rehydrated, got {type(restored_tag)}"
+
+    restored_tag2 = restored_tags.get(tag_uuid2)
+    assert restored_tag2 is not None, f"Tag {tag_uuid2} not found after restore"
+    assert restored_tag2['title'] == "Tasty backup tag number two", "Restored tag 2 title does not match"
+    assert isinstance(restored_tag2, Tag.model), \
+        f"Tag 2 not properly rehydrated, got {type(restored_tag2)}"
--- a/changedetectionio/tests/unit/test_html_to_text.py
+++ b/changedetectionio/tests/unit/test_html_to_text.py
@@ -453,6 +453,175 @@ class TestHtmlToText(unittest.TestCase):



+    def test_script_with_closing_tag_in_string_does_not_eat_content(self):
+        """
+        A script whose JS string holds an escaped closing tag must be stripped as one block.
+
+        The fixture's string literal contains <\\/script> (JS-escaped) ahead of the real
+        </script>. Per the original note, the old regex pattern could end the block early and
+        expose the script tail (plus following content) as raw text; BS4 parses it correctly.
+        """
+        html = '''<html><body>
+<p>Before script</p>
+<script>
+var html = "<div>foo<\\/script><p>bar</p>";
+var also = 1;
+</script>
+<p>AFTER SCRIPT</p>
+</body></html>'''
+
+        text = html_to_text(html)
+        assert 'Before script' in text  # paragraph ahead of the <script> survives
+        assert 'AFTER SCRIPT' in text  # paragraph after the real </script> survives
+        # Neither JS statement inside the <script> block may leak into the text
+        assert 'var html' not in text
+        assert 'var also' not in text
+
+    def test_content_sandwiched_between_multiple_body_scripts(self):
+        """Each paragraph interleaved with <script>/<style> blocks in the body must survive; JS/CSS text must not."""
+        html = '''<html><body>
+<script>var a = 1;</script>
+<p>CONTENT A</p>
+<style>.x { color: red; }</style>
+<p>CONTENT B</p>
+<script>var b = 2;</script>
+<p>CONTENT C</p>
+<style>.y { color: blue; }</style>
+<p>CONTENT D</p>
+</body></html>'''
+
+        text = html_to_text(html)
+        for label in ['CONTENT A', 'CONTENT B', 'CONTENT C', 'CONTENT D']:  # one label per <p> in the fixture
+            assert label in text, f"'{label}' was eaten by script/style stripping"
+        assert 'var a' not in text  # body of the first <script>
+        assert 'var b' not in text  # body of the second <script>
+        assert 'color: red' not in text  # body of the first <style>
+        assert 'color: blue' not in text  # body of the second <style>
+
+    def test_unicode_and_international_content_preserved(self):
+        """Non-ASCII text between <style> and <script> must survive; only 'ANMELDUNG' and the CJK strings are asserted."""
+        html = '''<html><body>
+<style>.x{color:red}</style>
+<p>German: Aus\xadge\xadbucht! — ANMELDUNG — Fan\xadday 2026</p>
+<p>Chinese: \u6ce8\u518c</p>
+<p>Japanese: \u767b\u9332</p>
+<p>Korean: \ub4f1\ub85d</p>
+<p>Emoji: \U0001f4e2</p>
+<script>var x = 1;</script>
+</body></html>'''
+
+        text = html_to_text(html)
+        assert 'ANMELDUNG' in text  # plain word from the German line; soft-hyphenated words and the emoji are not checked
+        assert '\u6ce8\u518c' in text   # Chinese
+        assert '\u767b\u9332' in text   # Japanese
+        assert '\ub4f1\ub85d' in text   # Korean
+
+    def test_style_with_type_attribute_is_stripped(self):
+        """A <style type="text/css"> block must be removed from the text output while the following paragraph is kept."""
+        html = '''<html><body>
+<style type="text/css">.important { display: none; }</style>
+<p>VISIBLE CONTENT</p>
+</body></html>'''
+
+        text = html_to_text(html)
+        assert 'VISIBLE CONTENT' in text  # paragraph after the <style> block
+        assert '.important' not in text  # CSS selector from the <style> body
+        assert 'display: none' not in text  # CSS declaration from the <style> body
+
+    def test_ldjson_script_is_stripped(self):
+        """A <script type="application/ld+json"> block must be removed — its raw JSON must not appear in the text."""
+        html = '''<html><body>
+<script type="application/ld+json">
+{"@type": "Product", "name": "Widget", "price": "9.99"}
+</script>
+<p>PRODUCT PAGE</p>
+</body></html>'''
+
+        text = html_to_text(html)
+        assert 'PRODUCT PAGE' in text  # paragraph after the JSON-LD block
+        assert '@type' not in text  # JSON-LD key from the script body
+        assert '"price"' not in text  # quoted JSON key from the script body
+
+    def test_inline_svg_is_stripped_entirely(self):
+        """
+        An inline <svg> element in the body must leave no trace in the text output.
+        SVGs can be huge (icon libraries, data visualisations) and produce garbage path-data
+        text. Per the original note, the old regex code stripped <svg> and the BS4 path must too.
+        """
+        html = '''<html><body>
+<p>Before SVG</p>
+<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24">
+    <path d="M14 5L7 12L14 19Z" fill="none"/>
+    <circle cx="12" cy="12" r="10"/>
+</svg>
+<p>After SVG</p>
+</body></html>'''
+
+        text = html_to_text(html)
+        assert 'Before SVG' in text  # paragraph ahead of the <svg>
+        assert 'After SVG' in text  # paragraph following the </svg>
+        assert 'M14 5L7' not in text, "SVG path data should not appear in text output"
+        assert 'viewBox' not in text, "SVG attributes should not appear in text output"
+
+    def test_tag_inside_json_data_attribute_does_not_eat_content(self):
+        """
+        Tags inside JSON data attributes with JS-escaped closing tags must not eat real content.
+
+        Real-world case: Elementor/JetEngine WordPress widgets embed HTML (including SVG icons)
+        inside JSON data attributes like data-slider-atts. The HTML inside is JS-escaped, so
+        closing tags appear as <\\/svg> rather than </svg> (backslash doubled: non-raw docstring).
+
+        The old regex approach would find <svg> inside the attribute value, then fail to find
+        <\\/svg> as a matching close tag, and scan forward to the next real </svg> in the DOM —
+        eating tens of kilobytes of actual page content in the process.
+        """
+        html = '''<!DOCTYPE html>
+<html>
+<head><title>Test</title></head>
+<body>
+<div class="slider" data-slider-atts="{&quot;prevArrow&quot;:&quot;<i class=\\&quot;icon\\&quot;><svg width=\\&quot;24\\&quot; height=\\&quot;24\\&quot; viewBox=\\&quot;0 0 24 24\\&quot; xmlns=\\&quot;http:\\/\\/www.w3.org\\/2000\\/svg\\&quot;><path d=\\&quot;M14 5L7 12L14 19\\&quot;\\/><\\/svg><\\/i>&quot;}">
+</div>
+<div class="content">
+    <h1>IMPORTANT CONTENT</h1>
+    <p>This text must not be eaten by the tag-stripping logic.</p>
+</div>
+<svg><circle cx="50" cy="50" r="40"/></svg>
+</body>
+</html>'''
+
+        text = html_to_text(html)  # the only real <svg> element sits after the content <div>
+
+        assert 'IMPORTANT CONTENT' in text, (
+            "Content after a JS-escaped tag in a data attribute was incorrectly stripped. "
+            "The tag-stripping logic is matching <tag> inside attribute values and scanning "
+            "forward to the next real closing tag in the DOM."
+        )
+        assert 'This text must not be eaten' in text  # paragraph inside the same content <div>
+
+    def test_script_inside_json_data_attribute_does_not_eat_content(self):
+        """A <script> with a JS-escaped closing tag embedded in a data-config attribute must not eat the content after it."""
+        html = '''<!DOCTYPE html>
+<html>
+<head><title>Test</title></head>
+<body>
+<div data-config="{&quot;template&quot;:&quot;<script type=\\&quot;text\\/javascript\\&quot;>var x=1;<\\/script>&quot;}">
+</div>
+<div>
+    <h1>MUST SURVIVE</h1>
+    <p>Real content after the data attribute with embedded script tag.</p>
+</div>
+<script>var real = 1;</script>
+</body>
+</html>'''
+
+        text = html_to_text(html)  # the only real <script> element sits after the content <div>
+
+        assert 'MUST SURVIVE' in text, (
+            "Content after a JS-escaped <script> in a data attribute was incorrectly stripped."
+        )
+        assert 'Real content after the data attribute' in text  # paragraph inside the same <div>
+
+
 if __name__ == '__main__':
    # Can run this file directly for quick testing
    unittest.main()
--- a/requirements.txt
+++ b/requirements.txt
@@ -9,10 +9,15 @@ flask_restful
 flask_cors # For the Chrome extension to operate
 # janus # No longer needed - using pure threading.Queue for multi-loop support
 flask_wtf~=1.2
-flask~=3.1
-flask-socketio~=5.6.0
-python-socketio~=5.16.1
-python-engineio~=4.13.1
+# Flask 3.1 removed the session setter on RequestContext; the patch in
+# changedetectionio/realtime/socket_server.py restores it so Flask-SocketIO works.
+# Require >=3.1 so the patch is always needed; <4 guards against unknown breaking changes.
+flask>=3.1,<4
+# Flask-SocketIO 5.x still does ctx.session = ... directly; the patch above handles it.
+# >=5.5.0 ensures the threading async_mode we rely on is available.
+flask-socketio>=5.5.0,<6
+python-socketio>=5.11.0,<6
+python-engineio>=4.9.0,<5
 inscriptis~=2.2
 pytz
 timeago~=1.0
Author	SHA1	Message	Date
dgtlmoon	4128acf95a	0.53.5 Some checks failed ChangeDetection.io Container Build Test / Build linux/amd64 (alpine) (push) Waiting to run Details ChangeDetection.io Container Build Test / Build linux/arm64 (alpine) (push) Waiting to run Details ChangeDetection.io Container Build Test / Build linux/amd64 (main) (push) Waiting to run Details ChangeDetection.io Container Build Test / Build linux/arm/v7 (main) (push) Waiting to run Details ChangeDetection.io Container Build Test / Build linux/arm/v8 (main) (push) Waiting to run Details ChangeDetection.io Container Build Test / Build linux/arm64 (main) (push) Waiting to run Details Build and push containers / metadata (push) Has been cancelled Details Build and push containers / build-push-containers (push) Has been cancelled Details Publish Python 🐍distribution 📦 to PyPI and TestPyPI / Build distribution 📦 (push) Has been cancelled Details Publish Python 🐍distribution 📦 to PyPI and TestPyPI / Test the built package works basically. (push) Has been cancelled Details Publish Python 🐍distribution 📦 to PyPI and TestPyPI / Publish Python 🐍 distribution 📦 to PyPI (push) Has been cancelled Details ChangeDetection.io App Test / lint-code (push) Has been cancelled Details ChangeDetection.io App Test / test-application-3-10 (push) Has been cancelled Details ChangeDetection.io App Test / test-application-3-11 (push) Has been cancelled Details ChangeDetection.io App Test / test-application-3-12 (push) Has been cancelled Details ChangeDetection.io App Test / test-application-3-13 (push) Has been cancelled Details	2026-02-20 00:57:52 +01:00
dgtlmoon	7c8d59c795	Fixing bad replacement of metadata causing possible content removal #3906 (#3908 )	2026-02-20 00:55:37 +01:00
dgtlmoon	897403f7cc	UI - Backup restore (#3899 ) Some checks failed Build and push containers / metadata (push) Has been cancelled Details Build and push containers / build-push-containers (push) Has been cancelled Details Publish Python 🐍distribution 📦 to PyPI and TestPyPI / Build distribution 📦 (push) Has been cancelled Details ChangeDetection.io App Test / lint-code (push) Has been cancelled Details Publish Python 🐍distribution 📦 to PyPI and TestPyPI / Test the built package works basically. (push) Has been cancelled Details Publish Python 🐍distribution 📦 to PyPI and TestPyPI / Publish Python 🐍 distribution 📦 to PyPI (push) Has been cancelled Details ChangeDetection.io App Test / test-application-3-10 (push) Has been cancelled Details ChangeDetection.io App Test / test-application-3-11 (push) Has been cancelled Details ChangeDetection.io App Test / test-application-3-12 (push) Has been cancelled Details ChangeDetection.io App Test / test-application-3-13 (push) Has been cancelled Details CodeQL / Analyze (javascript) (push) Has been cancelled Details CodeQL / Analyze (python) (push) Has been cancelled Details	2026-02-18 18:05:32 +01:00
dgtlmoon	bca35f680e	0.53.4	2026-02-18 14:07:26 +01:00
dgtlmoon	fafea1b5c6	Updates/migration - Re-run tag update, re-save to cleanup changedetection.json, code refactor (#3898 )	2026-02-18 14:05:23 +01:00