Compare commits

..

5 Commits

Author SHA1 Message Date
dgtlmoon
4128acf95a 0.53.5
Some checks failed
ChangeDetection.io Container Build Test / Build linux/amd64 (alpine) (push) Waiting to run
ChangeDetection.io Container Build Test / Build linux/arm64 (alpine) (push) Waiting to run
ChangeDetection.io Container Build Test / Build linux/amd64 (main) (push) Waiting to run
ChangeDetection.io Container Build Test / Build linux/arm/v7 (main) (push) Waiting to run
ChangeDetection.io Container Build Test / Build linux/arm/v8 (main) (push) Waiting to run
ChangeDetection.io Container Build Test / Build linux/arm64 (main) (push) Waiting to run
Build and push containers / metadata (push) Has been cancelled
Build and push containers / build-push-containers (push) Has been cancelled
Publish Python 🐍distribution 📦 to PyPI and TestPyPI / Build distribution 📦 (push) Has been cancelled
Publish Python 🐍distribution 📦 to PyPI and TestPyPI / Test the built package works basically. (push) Has been cancelled
Publish Python 🐍distribution 📦 to PyPI and TestPyPI / Publish Python 🐍 distribution 📦 to PyPI (push) Has been cancelled
ChangeDetection.io App Test / lint-code (push) Has been cancelled
ChangeDetection.io App Test / test-application-3-10 (push) Has been cancelled
ChangeDetection.io App Test / test-application-3-11 (push) Has been cancelled
ChangeDetection.io App Test / test-application-3-12 (push) Has been cancelled
ChangeDetection.io App Test / test-application-3-13 (push) Has been cancelled
2026-02-20 00:57:52 +01:00
dgtlmoon
7c8d59c795 Fixing bad replacement of metadata causing possible content removal #3906 (#3908) 2026-02-20 00:55:37 +01:00
dgtlmoon
897403f7cc UI - Backup restore (#3899)
Some checks failed
Build and push containers / metadata (push) Has been cancelled
Build and push containers / build-push-containers (push) Has been cancelled
Publish Python 🐍distribution 📦 to PyPI and TestPyPI / Build distribution 📦 (push) Has been cancelled
ChangeDetection.io App Test / lint-code (push) Has been cancelled
Publish Python 🐍distribution 📦 to PyPI and TestPyPI / Test the built package works basically. (push) Has been cancelled
Publish Python 🐍distribution 📦 to PyPI and TestPyPI / Publish Python 🐍 distribution 📦 to PyPI (push) Has been cancelled
ChangeDetection.io App Test / test-application-3-10 (push) Has been cancelled
ChangeDetection.io App Test / test-application-3-11 (push) Has been cancelled
ChangeDetection.io App Test / test-application-3-12 (push) Has been cancelled
ChangeDetection.io App Test / test-application-3-13 (push) Has been cancelled
CodeQL / Analyze (javascript) (push) Has been cancelled
CodeQL / Analyze (python) (push) Has been cancelled
2026-02-18 18:05:32 +01:00
dgtlmoon
bca35f680e 0.53.4 2026-02-18 14:07:26 +01:00
dgtlmoon
fafea1b5c6 Updates/migration - Re-run tag update, re-save to cleanup changedetection.json, code refactor (#3898) 2026-02-18 14:05:23 +01:00
13 changed files with 646 additions and 80 deletions

View File

@@ -2,7 +2,7 @@
# Read more https://github.com/dgtlmoon/changedetection.io/wiki
# Semver means never use .01, or 00. Should be .1.
__version__ = '0.53.3'
__version__ = '0.53.5'
from changedetectionio.strtobool import strtobool
from json.decoder import JSONDecodeError

View File

@@ -13,7 +13,7 @@ from loguru import logger
BACKUP_FILENAME_FORMAT = "changedetection-backup-{}.zip"
def create_backup(datastore_path, watches: dict):
def create_backup(datastore_path, watches: dict, tags: dict = None):
logger.debug("Creating backup...")
import zipfile
from pathlib import Path
@@ -45,6 +45,15 @@ def create_backup(datastore_path, watches: dict):
if os.path.isfile(secret_file):
zipObj.write(secret_file, arcname="secret.txt")
# Add tag data directories (each tag has its own {uuid}/tag.json)
for uuid, tag in (tags or {}).items():
for f in Path(tag.data_dir).glob('*'):
zipObj.write(f,
arcname=os.path.join(f.parts[-2], f.parts[-1]),
compress_type=zipfile.ZIP_DEFLATED,
compresslevel=8)
logger.debug(f"Added tag '{tag.get('title')}' ({uuid}) to backup")
# Add any data in the watch data directory.
for uuid, w in watches.items():
for f in Path(w.data_dir).glob('*'):
@@ -88,7 +97,10 @@ def create_backup(datastore_path, watches: dict):
def construct_blueprint(datastore: ChangeDetectionStore):
from .restore import construct_restore_blueprint
backups_blueprint = Blueprint('backups', __name__, template_folder="templates")
backups_blueprint.register_blueprint(construct_restore_blueprint(datastore))
backup_threads = []
@login_optionally_required
@@ -96,16 +108,17 @@ def construct_blueprint(datastore: ChangeDetectionStore):
def request_backup():
if any(thread.is_alive() for thread in backup_threads):
flash(gettext("A backup is already running, check back in a few minutes"), "error")
return redirect(url_for('backups.index'))
return redirect(url_for('backups.create'))
if len(find_backups()) > int(os.getenv("MAX_NUMBER_BACKUPS", 100)):
flash(gettext("Maximum number of backups reached, please remove some"), "error")
return redirect(url_for('backups.index'))
return redirect(url_for('backups.create'))
# With immediate persistence, all data is already saved
zip_thread = threading.Thread(
target=create_backup,
args=(datastore.datastore_path, datastore.data.get("watching")),
kwargs={'tags': datastore.data['settings']['application'].get('tags', {})},
daemon=True,
name="BackupCreator"
)
@@ -113,7 +126,7 @@ def construct_blueprint(datastore: ChangeDetectionStore):
backup_threads.append(zip_thread)
flash(gettext("Backup building in background, check back in a few minutes."))
return redirect(url_for('backups.index'))
return redirect(url_for('backups.create'))
def find_backups():
backup_filepath = os.path.join(datastore.datastore_path, BACKUP_FILENAME_FORMAT.format("*"))
@@ -155,14 +168,14 @@ def construct_blueprint(datastore: ChangeDetectionStore):
return send_from_directory(os.path.abspath(datastore.datastore_path), filename, as_attachment=True)
@login_optionally_required
@backups_blueprint.route("", methods=['GET'])
def index():
@backups_blueprint.route("/", methods=['GET'])
@backups_blueprint.route("/create", methods=['GET'])
def create():
backups = find_backups()
output = render_template("overview.html",
output = render_template("backup_create.html",
available_backups=backups,
backup_running=any(thread.is_alive() for thread in backup_threads)
)
return output
@login_optionally_required
@@ -176,6 +189,6 @@ def construct_blueprint(datastore: ChangeDetectionStore):
flash(gettext("Backups were deleted."))
return redirect(url_for('backups.index'))
return redirect(url_for('backups.create'))
return backups_blueprint

View File

@@ -0,0 +1,208 @@
import io
import json
import os
import shutil
import tempfile
import threading
import zipfile
from flask import Blueprint, render_template, flash, url_for, redirect, request
from flask_babel import gettext, lazy_gettext as _l
from wtforms import Form, BooleanField, SubmitField
from flask_wtf.file import FileField, FileAllowed
from loguru import logger
from changedetectionio.flask_app import login_optionally_required
class RestoreForm(Form):
zip_file = FileField(_l('Backup zip file'), validators=[
FileAllowed(['zip'], _l('Must be a .zip backup file!'))
])
include_groups = BooleanField(_l('Include groups'), default=True)
include_groups_replace_existing = BooleanField(_l('Replace existing groups of the same UUID'), default=True)
include_watches = BooleanField(_l('Include watches'), default=True)
include_watches_replace_existing = BooleanField(_l('Replace existing watches of the same UUID'), default=True)
submit = SubmitField(_l('Restore backup'))
def import_from_zip(zip_stream, datastore, include_groups, include_groups_replace, include_watches, include_watches_replace):
"""
Extract and import watches and groups from a backup zip stream.
Mirrors the store's _load_watches / _load_tags loading pattern:
- UUID dirs with tag.json → Tag.model + tag_obj.commit()
- UUID dirs with watch.json → rehydrate_entity + watch_obj.commit()
Returns a dict with counts: restored_groups, skipped_groups, restored_watches, skipped_watches.
Raises zipfile.BadZipFile if the stream is not a valid zip.
"""
from changedetectionio.model import Tag
restored_groups = 0
skipped_groups = 0
restored_watches = 0
skipped_watches = 0
current_tags = datastore.data['settings']['application'].get('tags', {})
current_watches = datastore.data['watching']
with tempfile.TemporaryDirectory() as tmpdir:
logger.debug(f"Restore: extracting zip to {tmpdir}")
with zipfile.ZipFile(zip_stream, 'r') as zf:
zf.extractall(tmpdir)
logger.debug("Restore: zip extracted, scanning UUID directories")
for entry in os.scandir(tmpdir):
if not entry.is_dir():
continue
uuid = entry.name
tag_json_path = os.path.join(entry.path, 'tag.json')
watch_json_path = os.path.join(entry.path, 'watch.json')
# --- Tags (groups) ---
if include_groups and os.path.exists(tag_json_path):
if uuid in current_tags and not include_groups_replace:
logger.debug(f"Restore: skipping existing group {uuid} (replace not requested)")
skipped_groups += 1
continue
try:
with open(tag_json_path, 'r', encoding='utf-8') as f:
tag_data = json.load(f)
except (json.JSONDecodeError, IOError) as e:
logger.error(f"Restore: failed to read tag.json for {uuid}: {e}")
continue
title = tag_data.get('title', uuid)
logger.debug(f"Restore: importing group '{title}' ({uuid})")
# Mirror _load_tags: set uuid and force processor
tag_data['uuid'] = uuid
tag_data['processor'] = 'restock_diff'
# Copy the UUID directory so data_dir exists for commit()
dst_dir = os.path.join(datastore.datastore_path, uuid)
if os.path.exists(dst_dir):
shutil.rmtree(dst_dir)
shutil.copytree(entry.path, dst_dir)
tag_obj = Tag.model(
datastore_path=datastore.datastore_path,
__datastore=datastore.data,
default=tag_data
)
current_tags[uuid] = tag_obj
tag_obj.commit()
restored_groups += 1
logger.success(f"Restore: group '{title}' ({uuid}) restored")
# --- Watches ---
elif include_watches and os.path.exists(watch_json_path):
if uuid in current_watches and not include_watches_replace:
logger.debug(f"Restore: skipping existing watch {uuid} (replace not requested)")
skipped_watches += 1
continue
try:
with open(watch_json_path, 'r', encoding='utf-8') as f:
watch_data = json.load(f)
except (json.JSONDecodeError, IOError) as e:
logger.error(f"Restore: failed to read watch.json for {uuid}: {e}")
continue
url = watch_data.get('url', uuid)
logger.debug(f"Restore: importing watch '{url}' ({uuid})")
# Copy UUID directory first so data_dir and history files exist
dst_dir = os.path.join(datastore.datastore_path, uuid)
if os.path.exists(dst_dir):
shutil.rmtree(dst_dir)
shutil.copytree(entry.path, dst_dir)
# Mirror _load_watches / rehydrate_entity
watch_data['uuid'] = uuid
watch_obj = datastore.rehydrate_entity(uuid, watch_data)
current_watches[uuid] = watch_obj
watch_obj.commit()
restored_watches += 1
logger.success(f"Restore: watch '{url}' ({uuid}) restored")
logger.debug(f"Restore: scan complete - groups {restored_groups} restored / {skipped_groups} skipped, "
f"watches {restored_watches} restored / {skipped_watches} skipped")
# Persist changedetection.json (includes the updated tags dict)
logger.debug("Restore: committing datastore settings")
datastore.commit()
return {
'restored_groups': restored_groups,
'skipped_groups': skipped_groups,
'restored_watches': restored_watches,
'skipped_watches': skipped_watches,
}
def construct_restore_blueprint(datastore):
restore_blueprint = Blueprint('restore', __name__, template_folder="templates")
restore_threads = []
@login_optionally_required
@restore_blueprint.route("/restore", methods=['GET'])
def restore():
form = RestoreForm()
return render_template("backup_restore.html",
form=form,
restore_running=any(t.is_alive() for t in restore_threads))
@login_optionally_required
@restore_blueprint.route("/restore/start", methods=['POST'])
def backups_restore_start():
if any(t.is_alive() for t in restore_threads):
flash(gettext("A restore is already running, check back in a few minutes"), "error")
return redirect(url_for('backups.restore.restore'))
zip_file = request.files.get('zip_file')
if not zip_file or not zip_file.filename:
flash(gettext("No file uploaded"), "error")
return redirect(url_for('backups.restore.restore'))
if not zip_file.filename.lower().endswith('.zip'):
flash(gettext("File must be a .zip backup file"), "error")
return redirect(url_for('backups.restore.restore'))
# Read into memory now — the request stream is gone once we return
try:
zip_bytes = io.BytesIO(zip_file.read())
zipfile.ZipFile(zip_bytes) # quick validity check before spawning
zip_bytes.seek(0)
except zipfile.BadZipFile:
flash(gettext("Invalid or corrupted zip file"), "error")
return redirect(url_for('backups.restore.restore'))
include_groups = request.form.get('include_groups') == 'y'
include_groups_replace = request.form.get('include_groups_replace_existing') == 'y'
include_watches = request.form.get('include_watches') == 'y'
include_watches_replace = request.form.get('include_watches_replace_existing') == 'y'
restore_thread = threading.Thread(
target=import_from_zip,
kwargs={
'zip_stream': zip_bytes,
'datastore': datastore,
'include_groups': include_groups,
'include_groups_replace': include_groups_replace,
'include_watches': include_watches,
'include_watches_replace': include_watches_replace,
},
daemon=True,
name="BackupRestore"
)
restore_thread.start()
restore_threads.append(restore_thread)
flash(gettext("Restore started in background, check back in a few minutes."))
return redirect(url_for('backups.restore.restore'))
return restore_blueprint

View File

@@ -0,0 +1,49 @@
{% extends 'base.html' %}
{% block content %}
{% from '_helpers.html' import render_simple_field, render_field %}
<div class="edit-form">
<div class="tabs collapsable">
<ul>
<li class="tab active" id=""><a href="{{ url_for('backups.create') }}">{{ _('Create') }}</a></li>
<li class="tab"><a href="{{ url_for('backups.restore.restore') }}">{{ _('Restore') }}</a></li>
</ul>
</div>
<div class="box-wrap inner">
<div id="general">
{% if backup_running %}
<p>
<span class="spinner"></span>&nbsp;<strong>{{ _('A backup is running!') }}</strong>
</p>
{% endif %}
<p>
{{ _('Here you can download and request a new backup, when a backup is completed you will see it listed below.') }}
</p>
<br>
{% if available_backups %}
<ul>
{% for backup in available_backups %}
<li>
<a href="{{ url_for('backups.download_backup', filename=backup["filename"]) }}">{{ backup["filename"] }}</a> {{ backup["filesize"] }} {{ _('Mb') }}
</li>
{% endfor %}
</ul>
{% else %}
<p>
<strong>{{ _('No backups found.') }}</strong>
</p>
{% endif %}
<a class="pure-button pure-button-primary"
href="{{ url_for('backups.request_backup') }}">{{ _('Create backup') }}</a>
{% if available_backups %}
<a class="pure-button button-small button-error "
href="{{ url_for('backups.remove_backups') }}">{{ _('Remove backups') }}</a>
{% endif %}
</div>
</div>
</div>
{% endblock %}

View File

@@ -0,0 +1,58 @@
{% extends 'base.html' %}
{% block content %}
{% from '_helpers.html' import render_field, render_checkbox_field %}
<div class="edit-form">
<div class="tabs collapsable">
<ul>
<li class="tab"><a href="{{ url_for('backups.create') }}">{{ _('Create') }}</a></li>
<li class="tab active"><a href="{{ url_for('backups.restore.restore') }}">{{ _('Restore') }}</a></li>
</ul>
</div>
<div class="box-wrap inner">
<div id="general">
{% if restore_running %}
<p>
<span class="spinner"></span>&nbsp;<strong>{{ _('A restore is running!') }}</strong>
</p>
{% endif %}
<p>{{ _('Restore a backup. Must be a .zip backup file created on/after v0.53.1 (new database layout).') }}</p>
<p>{{ _('Note: This does not override the main application settings, only watches and groups.') }}</p>
<form class="pure-form pure-form-stacked settings"
action="{{ url_for('backups.restore.backups_restore_start') }}"
method="POST"
enctype="multipart/form-data">
<input type="hidden" name="csrf_token" value="{{ csrf_token() }}">
<div class="pure-control-group">
{{ render_checkbox_field(form.include_groups) }}
<span class="pure-form-message-inline">{{ _('Include all groups found in backup?') }}</span>
</div>
<div class="pure-control-group">
{{ render_checkbox_field(form.include_groups_replace_existing) }}
<span class="pure-form-message-inline">{{ _('Replace any existing groups of the same UUID?') }}</span>
</div>
<div class="pure-control-group">
{{ render_checkbox_field(form.include_watches) }}
<span class="pure-form-message-inline">{{ _('Include all watches found in backup?') }}</span>
</div>
<div class="pure-control-group">
{{ render_checkbox_field(form.include_watches_replace_existing) }}
<span class="pure-form-message-inline">{{ _('Replace any existing watches of the same UUID?') }}</span>
</div>
<div class="pure-control-group">
{{ render_field(form.zip_file) }}
</div>
<div class="pure-controls">
<button type="submit" class="pure-button pure-button-primary">{{ _('Restore backup') }}</button>
</div>
</form>
</div>
</div>
</div>
{% endblock %}

View File

@@ -1,36 +0,0 @@
{% extends 'base.html' %}
{% block content %}
{% from '_helpers.html' import render_simple_field, render_field %}
<div class="edit-form">
<div class="box-wrap inner">
<h2>{{ _('Backups') }}</h2>
{% if backup_running %}
<p>
<span class="spinner"></span>&nbsp;<strong>{{ _('A backup is running!') }}</strong>
</p>
{% endif %}
<p>
{{ _('Here you can download and request a new backup, when a backup is completed you will see it listed below.') }}
</p>
<br>
{% if available_backups %}
<ul>
{% for backup in available_backups %}
<li><a href="{{ url_for('backups.download_backup', filename=backup["filename"]) }}">{{ backup["filename"] }}</a> {{ backup["filesize"] }} {{ _('Mb') }}</li>
{% endfor %}
</ul>
{% else %}
<p>
<strong>{{ _('No backups found.') }}</strong>
</p>
{% endif %}
<a class="pure-button pure-button-primary" href="{{ url_for('backups.request_backup') }}">{{ _('Create backup') }}</a>
{% if available_backups %}
<a class="pure-button button-small button-error " href="{{ url_for('backups.remove_backups') }}">{{ _('Remove backups') }}</a>
{% endif %}
</div>
</div>
{% endblock %}

View File

@@ -16,6 +16,11 @@
<form class="pure-form" action="{{url_for('imports.import_page')}}" method="POST" enctype="multipart/form-data">
<input type="hidden" name="csrf_token" value="{{ csrf_token() }}">
<div class="tab-pane-inner" id="url-list">
<p>
{{ _('Restoring changedetection.io backups is in the') }}<a href="{{ url_for('backups.restore.restore') }}"> {{ _('backups section') }}</a>.
<br>
</p>
<div class="pure-control-group">
{{ _('Enter one URL per line, and optionally add tags for each URL after a space, delineated by comma (,):') }}
<br>
@@ -37,9 +42,6 @@
</div>
<div class="tab-pane-inner" id="distill-io">
<div class="pure-control-group">
{{ _('Copy and Paste your Distill.io watch \'export\' file, this should be a JSON file.') }}<br>
{{ _('This is') }} <i>{{ _('experimental') }}</i>, {{ _('supported fields are') }} <code>name</code>, <code>uri</code>, <code>tags</code>, <code>config:selections</code>, {{ _('the rest (including') }} <code>schedule</code>) {{ _('are ignored.') }}
@@ -49,8 +51,6 @@
{{ _('Be sure to set your default fetcher to Chrome if required.') }}<br>
</p>
</div>
<textarea name="distill-io" class="pure-input-1-2" style="width: 100%;
font-family:monospace;
white-space: pre;
@@ -114,6 +114,7 @@
</div>
</div>
<button type="submit" class="pure-button pure-input-1-2 pure-button-primary">{{ _('Import') }}</button>
</form>
</div>

View File

@@ -25,7 +25,7 @@
<li class="tab"><a href="#ui-options">{{ _('UI Options') }}</a></li>
<li class="tab"><a href="#api">{{ _('API') }}</a></li>
<li class="tab"><a href="#rss">{{ _('RSS') }}</a></li>
<li class="tab"><a href="{{ url_for('backups.index') }}">{{ _('Backups') }}</a></li>
<li class="tab"><a href="{{ url_for('backups.create') }}">{{ _('Backups') }}</a></li>
<li class="tab"><a href="#timedate">{{ _('Time & Date') }}</a></li>
<li class="tab"><a href="#proxies">{{ _('CAPTCHA & Proxies') }}</a></li>
{% if plugin_tabs %}

View File

@@ -561,31 +561,33 @@ def html_to_text(html_content: str, render_anchor_tag_content=False, is_rss=Fals
)
else:
parser_config = None
if is_rss:
html_content = re.sub(r'<title([\s>])', r'<h1\1', html_content)
html_content = re.sub(r'</title>', r'</h1>', html_content)
else:
# Strip bloat in one pass, SPA's often dump 10Mb+ into the <head> for styles, which is not needed
# Causing inscriptis to silently exit when more than ~10MB is found.
# All we are doing here is converting the HTML to text, no CSS layout etc
# Use backreference (\1) to ensure opening/closing tags match (prevents <style> matching </svg> in CSS data URIs)
html_content = re.sub(r'<(style|script|svg|noscript)[^>]*>.*?</\1>|<(?:link|meta)[^>]*/?>|<!--.*?-->',
'', html_content, flags=re.DOTALL | re.IGNORECASE)
# Use BS4 html.parser to strip bloat — SPA's often dump 10MB+ of CSS/JS into <head>,
# causing inscriptis to silently give up. Regex-based stripping is unsafe because tags
# can appear inside JSON data attributes with JS-escaped closing tags (e.g. <\/script>),
# causing the regex to scan past the intended close and eat real page content.
from bs4 import BeautifulSoup
soup = BeautifulSoup(html_content, 'html.parser')
# Strip tags that inscriptis cannot render as meaningful text and which can be very large.
# svg/math: produce path-data/MathML garbage; canvas/iframe/template: no inscriptis handlers.
# video/audio/picture are kept — they may contain meaningful fallback text or captions.
for tag in soup.find_all(['head', 'script', 'style', 'noscript', 'svg',
'math', 'canvas', 'iframe', 'template']):
tag.decompose()
# SPAs often use <body style="display:none"> to hide content until JS loads
# inscriptis respects CSS display rules, so we need to remove these hiding styles
# to extract the actual page content
body_style_pattern = r'(<body[^>]*)\s+style\s*=\s*["\']([^"\']*\b(?:display\s*:\s*none|visibility\s*:\s*hidden)\b[^"\']*)["\']'
# Check if body has hiding styles that need to be fixed
body_match = re.search(body_style_pattern, html_content, flags=re.IGNORECASE)
if body_match:
from loguru import logger
logger.debug(f"html_to_text: Removing hiding styles from body tag (found: '{body_match.group(2)}')")
html_content = re.sub(body_style_pattern, r'\1', html_content, flags=re.IGNORECASE)
# SPAs often use <body style="display:none"> to hide content until JS loads.
# inscriptis respects CSS display rules, so strip hiding styles from the body tag.
body_tag = soup.find('body')
if body_tag and body_tag.get('style'):
style = body_tag['style']
if re.search(r'\b(?:display\s*:\s*none|visibility\s*:\s*hidden)\b', style, re.IGNORECASE):
logger.debug(f"html_to_text: Removing hiding styles from body tag (found: '{style}')")
del body_tag['style']
html_content = str(soup)
text_content = get_text(html_content, config=parser_config)
return text_content

View File

@@ -199,11 +199,25 @@ def handle_watch_update(socketio, **kwargs):
logger.error(f"Socket.IO error in handle_watch_update: {str(e)}")
def _patch_flask_request_context_session():
"""Flask 3.1 removed the session setter from RequestContext, but Flask-SocketIO 5.6.0
still assigns to it directly (ctx.session = ...). Restore a setter that writes the
private _session attribute so the two libraries work together.
"""
from flask.ctx import RequestContext
if getattr(RequestContext.session, 'fset', None) is not None:
return # Already has a setter (future Flask version restored it)
original_prop = RequestContext.session
RequestContext.session = original_prop.setter(lambda self, value: setattr(self, '_session', value))
def init_socketio(app, datastore):
"""Initialize SocketIO with the main Flask app"""
import platform
import sys
_patch_flask_request_context_session()
# Platform-specific async_mode selection for better stability
system = platform.system().lower()
python_version = sys.version_info

View File

@@ -6,11 +6,10 @@ import io
from zipfile import ZipFile
import re
import time
from changedetectionio.model import Watch, Tag
def test_backup(client, live_server, measure_memory_usage, datastore_path):
# live_server_setup(live_server) # Setup on conftest per function
set_original_response(datastore_path=datastore_path)
@@ -32,7 +31,7 @@ def test_backup(client, live_server, measure_memory_usage, datastore_path):
time.sleep(4)
res = client.get(
url_for("backups.index"),
url_for("backups.create"),
follow_redirects=True
)
# Can see the download link to the backup
@@ -80,11 +79,12 @@ def test_backup(client, live_server, measure_memory_usage, datastore_path):
def test_watch_data_package_download(client, live_server, measure_memory_usage, datastore_path):
"""Test downloading a single watch's data as a zip package"""
import os
set_original_response(datastore_path=datastore_path)
uuid = client.application.config.get('DATASTORE').add_watch(url=url_for('test_endpoint', _external=True))
tag_uuid = client.application.config.get('DATASTORE').add_tag(title="Tasty backup tag")
tag_uuid2 = client.application.config.get('DATASTORE').add_tag(title="Tasty backup tag number two")
client.get(url_for("ui.form_watch_checknow"), follow_redirects=True)
wait_for_all_checks(client)
@@ -113,4 +113,87 @@ def test_watch_data_package_download(client, live_server, measure_memory_usage,
# Should contain history/snapshot files
uuid4hex_txt = re.compile(f'^{re.escape(uuid)}/.*\\.txt', re.I)
txt_files = list(filter(uuid4hex_txt.match, files))
assert len(txt_files) > 0, f"Should have at least one .txt file (history/snapshot), got: {files}"
assert len(txt_files) > 0, f"Should have at least one .txt file (history/snapshot), got: {files}"
def test_backup_restore(client, live_server, measure_memory_usage, datastore_path):
"""Test that a full backup zip can be restored — watches and tags survive a round-trip."""
set_original_response(datastore_path=datastore_path)
datastore = live_server.app.config['DATASTORE']
watch_url = url_for('test_endpoint', _external=True)
# Set up: one watch and two tags
uuid = datastore.add_watch(url=watch_url)
tag_uuid = datastore.add_tag(title="Tasty backup tag")
tag_uuid2 = datastore.add_tag(title="Tasty backup tag number two")
client.get(url_for("ui.form_watch_checknow"), follow_redirects=True)
wait_for_all_checks(client)
# Create a full backup
client.get(url_for("backups.request_backup"), follow_redirects=True)
time.sleep(4)
# Download the latest backup zip
res = client.get(url_for("backups.download_backup", filename="latest"), follow_redirects=True)
assert res.content_type == "application/zip"
zip_data = res.data
# Confirm the zip contains both watch.json and tag.json entries
backup = ZipFile(io.BytesIO(zip_data))
names = backup.namelist()
assert f"{uuid}/watch.json" in names, f"watch.json missing from backup: {names}"
assert f"{tag_uuid}/tag.json" in names, f"tag.json for tag 1 missing from backup: {names}"
assert f"{tag_uuid2}/tag.json" in names, f"tag.json for tag 2 missing from backup: {names}"
# --- Wipe everything ---
datastore.delete('all')
client.get(url_for("tags.delete_all"), follow_redirects=True)
assert uuid not in datastore.data['watching'], "Watch should be gone after delete"
assert tag_uuid not in datastore.data['settings']['application']['tags'], "Tag 1 should be gone after delete"
assert tag_uuid2 not in datastore.data['settings']['application']['tags'], "Tag 2 should be gone after delete"
# --- Restore from the backup zip ---
res = client.post(
url_for("backups.restore.backups_restore_start"),
data={
'zip_file': (io.BytesIO(zip_data), 'backup.zip'),
'include_groups': 'y',
'include_groups_replace_existing': 'y',
'include_watches': 'y',
'include_watches_replace_existing': 'y',
},
content_type='multipart/form-data',
follow_redirects=True
)
assert res.status_code == 200
# Wait for the thread to finish
time.sleep(2)
# --- Watch checks ---
restored_watch = datastore.data['watching'].get(uuid)
assert restored_watch is not None, f"Watch {uuid} not found after restore"
assert restored_watch['url'] == watch_url, "Restored watch URL does not match"
assert isinstance(restored_watch, Watch.model), \
f"Watch not properly rehydrated, got {type(restored_watch)}"
assert restored_watch.history_n >= 1, \
f"Restored watch should have at least 1 history entry, got {restored_watch.history_n}"
# --- Tag checks ---
restored_tags = datastore.data['settings']['application']['tags']
restored_tag = restored_tags.get(tag_uuid)
assert restored_tag is not None, f"Tag {tag_uuid} not found after restore"
assert restored_tag['title'] == "Tasty backup tag", "Restored tag 1 title does not match"
assert isinstance(restored_tag, Tag.model), \
f"Tag 1 not properly rehydrated, got {type(restored_tag)}"
restored_tag2 = restored_tags.get(tag_uuid2)
assert restored_tag2 is not None, f"Tag {tag_uuid2} not found after restore"
assert restored_tag2['title'] == "Tasty backup tag number two", "Restored tag 2 title does not match"
assert isinstance(restored_tag2, Tag.model), \
f"Tag 2 not properly rehydrated, got {type(restored_tag2)}"

View File

@@ -453,6 +453,175 @@ class TestHtmlToText(unittest.TestCase):
def test_script_with_closing_tag_in_string_does_not_eat_content(self):
"""
Script tag containing </script> inside a JS string must not prematurely end the block.
This is the classic regex failure mode: the old pattern would find the first </script>
inside the JS string literal and stop there, leaving the tail of the script block
(plus any following content) exposed as raw text. BS4 parses the HTML correctly.
"""
html = '''<html><body>
<p>Before script</p>
<script>
var html = "<div>foo<\\/script><p>bar</p>";
var also = 1;
</script>
<p>AFTER SCRIPT</p>
</body></html>'''
text = html_to_text(html)
assert 'Before script' in text
assert 'AFTER SCRIPT' in text
# Script internals must not leak
assert 'var html' not in text
assert 'var also' not in text
def test_content_sandwiched_between_multiple_body_scripts(self):
"""Content between multiple script/style blocks in the body must all survive."""
html = '''<html><body>
<script>var a = 1;</script>
<p>CONTENT A</p>
<style>.x { color: red; }</style>
<p>CONTENT B</p>
<script>var b = 2;</script>
<p>CONTENT C</p>
<style>.y { color: blue; }</style>
<p>CONTENT D</p>
</body></html>'''
text = html_to_text(html)
for label in ['CONTENT A', 'CONTENT B', 'CONTENT C', 'CONTENT D']:
assert label in text, f"'{label}' was eaten by script/style stripping"
assert 'var a' not in text
assert 'var b' not in text
assert 'color: red' not in text
assert 'color: blue' not in text
def test_unicode_and_international_content_preserved(self):
"""Non-ASCII content (umlauts, CJK, soft hyphens) must survive stripping."""
html = '''<html><body>
<style>.x{color:red}</style>
<p>German: Aus\xadge\xadbucht! — ANMELDUNG — Fan\xadday 2026</p>
<p>Chinese: \u6ce8\u518c</p>
<p>Japanese: \u767b\u9332</p>
<p>Korean: \ub4f1\ub85d</p>
<p>Emoji: \U0001f4e2</p>
<script>var x = 1;</script>
</body></html>'''
text = html_to_text(html)
assert 'ANMELDUNG' in text
assert '\u6ce8\u518c' in text # Chinese
assert '\u767b\u9332' in text # Japanese
assert '\ub4f1\ub85d' in text # Korean
def test_style_with_type_attribute_is_stripped(self):
"""<style type="text/css"> (with type attribute) must be stripped just like bare <style>."""
html = '''<html><body>
<style type="text/css">.important { display: none; }</style>
<p>VISIBLE CONTENT</p>
</body></html>'''
text = html_to_text(html)
assert 'VISIBLE CONTENT' in text
assert '.important' not in text
assert 'display: none' not in text
def test_ldjson_script_is_stripped(self):
"""<script type="application/ld+json"> must be stripped — raw JSON must not appear as text."""
html = '''<html><body>
<script type="application/ld+json">
{"@type": "Product", "name": "Widget", "price": "9.99"}
</script>
<p>PRODUCT PAGE</p>
</body></html>'''
text = html_to_text(html)
assert 'PRODUCT PAGE' in text
assert '@type' not in text
assert '"price"' not in text
def test_inline_svg_is_stripped_entirely(self):
"""
Inline SVG elements in the body are stripped by BS4 before passing to inscriptis.
SVGs can be huge (icon libraries, data visualisations) and produce garbage path-data
text. The old regex code explicitly stripped <svg>; the BS4 path must do the same.
"""
html = '''<html><body>
<p>Before SVG</p>
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24">
<path d="M14 5L7 12L14 19Z" fill="none"/>
<circle cx="12" cy="12" r="10"/>
</svg>
<p>After SVG</p>
</body></html>'''
text = html_to_text(html)
assert 'Before SVG' in text
assert 'After SVG' in text
assert 'M14 5L7' not in text, "SVG path data should not appear in text output"
assert 'viewBox' not in text, "SVG attributes should not appear in text output"
def test_tag_inside_json_data_attribute_does_not_eat_content(self):
"""
Tags inside JSON data attributes with JS-escaped closing tags must not eat real content.
Real-world case: Elementor/JetEngine WordPress widgets embed HTML (including SVG icons)
inside JSON data attributes like data-slider-atts. The HTML inside is JS-escaped, so
closing tags appear as <\\/svg> rather than </svg>.
The old regex approach would find <svg> inside the attribute value, then fail to find
<\/svg> as a matching close tag, and scan forward to the next real </svg> in the DOM —
eating tens of kilobytes of actual page content in the process.
"""
html = '''<!DOCTYPE html>
<html>
<head><title>Test</title></head>
<body>
<div class="slider" data-slider-atts="{&quot;prevArrow&quot;:&quot;<i class=\\&quot;icon\\&quot;><svg width=\\&quot;24\\&quot; height=\\&quot;24\\&quot; viewBox=\\&quot;0 0 24 24\\&quot; xmlns=\\&quot;http:\\/\\/www.w3.org\\/2000\\/svg\\&quot;><path d=\\&quot;M14 5L7 12L14 19\\&quot;\\/><\\/svg><\\/i>&quot;}">
</div>
<div class="content">
<h1>IMPORTANT CONTENT</h1>
<p>This text must not be eaten by the tag-stripping logic.</p>
</div>
<svg><circle cx="50" cy="50" r="40"/></svg>
</body>
</html>'''
text = html_to_text(html)
assert 'IMPORTANT CONTENT' in text, (
"Content after a JS-escaped tag in a data attribute was incorrectly stripped. "
"The tag-stripping logic is matching <tag> inside attribute values and scanning "
"forward to the next real closing tag in the DOM."
)
assert 'This text must not be eaten' in text
def test_script_inside_json_data_attribute_does_not_eat_content(self):
"""Same issue as above but with <script> embedded in a data attribute with JS-escaped closing tag."""
html = '''<!DOCTYPE html>
<html>
<head><title>Test</title></head>
<body>
<div data-config="{&quot;template&quot;:&quot;<script type=\\&quot;text\\/javascript\\&quot;>var x=1;<\\/script>&quot;}">
</div>
<div>
<h1>MUST SURVIVE</h1>
<p>Real content after the data attribute with embedded script tag.</p>
</div>
<script>var real = 1;</script>
</body>
</html>'''
text = html_to_text(html)
assert 'MUST SURVIVE' in text, (
"Content after a JS-escaped <script> in a data attribute was incorrectly stripped."
)
assert 'Real content after the data attribute' in text
if __name__ == '__main__':
# Can run this file directly for quick testing
unittest.main()

View File

@@ -9,10 +9,15 @@ flask_restful
flask_cors # For the Chrome extension to operate
# janus # No longer needed - using pure threading.Queue for multi-loop support
flask_wtf~=1.2
flask~=3.1
flask-socketio~=5.6.0
python-socketio~=5.16.1
python-engineio~=4.13.1
# Flask 3.1 removed the session setter on RequestContext; the patch in
# changedetectionio/realtime/socket_server.py restores it so Flask-SocketIO works.
# Require >=3.1 so the patch is always needed; <4 guards against unknown breaking changes.
flask>=3.1,<4
# Flask-SocketIO 5.x still does ctx.session = ... directly; the patch above handles it.
# >=5.5.0 ensures the threading async_mode we rely on is available.
flask-socketio>=5.5.0,<6
python-socketio>=5.11.0,<6
python-engineio>=4.9.0,<5
inscriptis~=2.2
pytz
timeago~=1.0