Compare commits


2 Commits

Author      SHA1         Message                          Date
dgtlmoon    0c87e62e10   Test tweak                       2026-01-13 16:21:01 +01:00
dgtlmoon    5b151c68e2   Possible RSS watch UUID bugfix   2026-01-13 16:11:19 +01:00
17 changed files with 251 additions and 338 deletions

View File

@@ -2,7 +2,7 @@
# Read more https://github.com/dgtlmoon/changedetection.io/wiki
# Semver means never use .01, or 00. Should be .1.
__version__ = '0.52.2'
__version__ = '0.52.1'
from changedetectionio.strtobool import strtobool
from json.decoder import JSONDecodeError

View File

@@ -17,48 +17,36 @@ from loguru import logger
# Async version of update_worker
# Processes jobs from AsyncSignalPriorityQueue instead of threaded queue
async def async_update_worker(worker_id, q, notification_q, app, datastore, executor=None):
async def async_update_worker(worker_id, q, notification_q, app, datastore):
"""
Async worker function that processes watch check jobs from the queue.
Args:
worker_id: Unique identifier for this worker
q: AsyncSignalPriorityQueue containing jobs to process
notification_q: Standard queue for notifications
app: Flask application instance
datastore: Application datastore
executor: ThreadPoolExecutor for queue operations (optional)
"""
# Set a descriptive name for this task
task = asyncio.current_task()
if task:
task.set_name(f"async-worker-{worker_id}")
logger.info(f"Starting async worker {worker_id}")
while not app.config.exit.is_set():
update_handler = None
watch = None
try:
# Use sync interface via run_in_executor since each worker has its own event loop
loop = asyncio.get_event_loop()
queued_item_data = await asyncio.wait_for(
loop.run_in_executor(executor, q.get, True, 1.0), # block=True, timeout=1.0
timeout=1.5
)
# Use native janus async interface - no threads needed!
queued_item_data = await asyncio.wait_for(q.async_get(), timeout=1.0)
except asyncio.TimeoutError:
# No jobs available, continue loop
continue
except Exception as e:
# Handle expected Empty exception from queue timeout
import queue
if isinstance(e, queue.Empty):
# Queue is empty, normal behavior - just continue
continue
# Unexpected exception - log as critical
logger.critical(f"CRITICAL: Worker {worker_id} failed to get queue item: {type(e).__name__}: {e}")
# Log queue health for debugging
@@ -426,13 +414,14 @@ async def async_update_worker(worker_id, q, notification_q, app, datastore, exec
datastore.update_watch(uuid=uuid, update_obj={'last_error': f"Worker error: {str(e)}"})
finally:
try:
await update_handler.fetcher.quit(watch=watch)
except Exception as e:
logger.error(f"Exception while cleaning/quit after calling browser: {e}")
# Always cleanup - this runs whether there was an exception or not
if uuid:
try:
if update_handler and hasattr(update_handler, 'fetcher') and update_handler.fetcher:
await update_handler.fetcher.quit(watch=watch)
except Exception as e:
logger.error(f"Exception while cleaning/quit after calling browser: {e}")
try:
# Mark UUID as no longer being processed by this worker
worker_handler.set_uuid_processing(uuid, worker_id=worker_id, processing=False)
@@ -471,9 +460,7 @@ async def async_update_worker(worker_id, q, notification_q, app, datastore, exec
logger.debug(f"Worker {worker_id} completed watch {uuid} in {time.time()-fetch_start_time:.2f}s")
except Exception as cleanup_error:
logger.error(f"Worker {worker_id} error during cleanup: {cleanup_error}")
del(uuid)
# Brief pause before continuing to avoid tight error loops (only on error)
if 'e' in locals():
await asyncio.sleep(1.0)
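
The change in this hunk is how a worker pulls the next job: one side wraps the blocking q.get() in loop.run_in_executor() with a shared executor, the other awaits the queue's native async interface from janus. A minimal sketch of the janus-style consumer loop, assuming the janus package and a placeholder process() coroutine (neither is the project's RecheckPriorityQueue nor its real handler):

import asyncio
import janus

async def process(item):
    # Stand-in for the real watch-check handling.
    await asyncio.sleep(0)

async def consumer(q: janus.Queue, stop: asyncio.Event):
    # Await the queue's native async side directly; no run_in_executor thread is needed.
    while not stop.is_set():
        try:
            item = await asyncio.wait_for(q.async_q.get(), timeout=1.0)
        except asyncio.TimeoutError:
            continue  # nothing queued, re-check the stop flag
        try:
            await process(item)
        finally:
            q.async_q.task_done()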

View File

@@ -92,12 +92,7 @@ def construct_blueprint(datastore: ChangeDetectionStore):
# Be sure we're written fresh
datastore.sync_to_json()
zip_thread = threading.Thread(
target=create_backup,
args=(datastore.datastore_path, datastore.data.get("watching")),
daemon=True,
name="BackupCreator"
)
zip_thread = threading.Thread(target=create_backup, args=(datastore.datastore_path, datastore.data.get("watching")))
zip_thread.start()
backup_threads.append(zip_thread)
flash(gettext("Backup building in background, check back in a few minutes."))
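
The only difference between the two threading.Thread() calls above is the daemon flag and the thread name. A small sketch of the named-daemon variant, with create_backup_stub and its arguments invented for illustration: naming the thread makes it identifiable in thread dumps, and daemon=True keeps it from blocking interpreter shutdown.

import threading

def create_backup_stub(datastore_path, watches):
    # Placeholder for the real zip-building work.
    pass

zip_thread = threading.Thread(
    target=create_backup_stub,
    args=("/tmp/datastore", {}),
    daemon=True,                # don't block interpreter shutdown
    name="BackupCreator",       # shows up in thread dumps and debuggers
)
zip_thread.start()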

View File

@@ -57,15 +57,14 @@ class SignalPriorityQueue(queue.PriorityQueue):
def put(self, item, block=True, timeout=None):
# Call the parent's put method first
super().put(item, block, timeout)
# After putting the item in the queue, check if it has a UUID and emit signal
if hasattr(item, 'item') and isinstance(item.item, dict) and 'uuid' in item.item:
uuid = item.item['uuid']
# Get the signal and send it if it exists
watch_check_update = signal('watch_check_update')
if watch_check_update:
# NOTE: This would block other workers from .put/.get while this signal sends
# Signal handlers may iterate the queue/datastore while holding locks
# Send the watch_uuid parameter
watch_check_update.send(watch_uuid=uuid)
# Send queue_length signal with current queue size
@@ -313,15 +312,14 @@ class AsyncSignalPriorityQueue(asyncio.PriorityQueue):
async def put(self, item):
# Call the parent's put method first
await super().put(item)
# After putting the item in the queue, check if it has a UUID and emit signal
if hasattr(item, 'item') and isinstance(item.item, dict) and 'uuid' in item.item:
uuid = item.item['uuid']
# Get the signal and send it if it exists
watch_check_update = signal('watch_check_update')
if watch_check_update:
# NOTE: This would block other workers from .put/.get while this signal sends
# Signal handlers may iterate the queue/datastore while holding locks
# Send the watch_uuid parameter
watch_check_update.send(watch_uuid=uuid)
# Send queue_length signal with current queue size
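
Both hunks in this file add the same caveat: the blinker signal is sent inside put(), so every receiver runs while the queue operation is still in flight. A toy sketch of that pattern, assuming the blinker package; SignalingQueue, on_queued and 'example-uuid' are illustrative names only:

import queue
from blinker import signal

class SignalingQueue(queue.PriorityQueue):
    """Toy version of the pattern above: emit a blinker signal after every put()."""

    def put(self, item, block=True, timeout=None):
        super().put(item, block, timeout)
        # Receivers run inline here, so slow handlers delay this put() and any
        # caller waiting on the queue.
        sig = signal('watch_check_update')
        if sig.receivers:
            sig.send(self, item=item)

def on_queued(sender, item):
    print("queued:", item)

q = SignalingQueue()
signal('watch_check_update').connect(on_queued)
q.put((1, 'example-uuid'))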

View File

@@ -863,13 +863,13 @@ def changedetection_app(config=None, datastore_o=None):
worker_handler.start_workers(n_workers, update_q, notification_q, app, datastore)
# @todo handle ctrl break
ticker_thread = threading.Thread(target=ticker_thread_check_time_launch_checks, daemon=True, name="TickerThread-ScheduleChecker").start()
threading.Thread(target=notification_runner, daemon=True, name="NotificationRunner").start()
ticker_thread = threading.Thread(target=ticker_thread_check_time_launch_checks).start()
threading.Thread(target=notification_runner).start()
in_pytest = "pytest" in sys.modules or "PYTEST_CURRENT_TEST" in os.environ
# Check for new release version, but not when running in test/build or pytest
if not os.getenv("GITHUB_REF", False) and not strtobool(os.getenv('DISABLE_VERSION_CHECK', 'no')) and not in_pytest:
threading.Thread(target=check_for_new_version, daemon=True, name="VersionChecker").start()
threading.Thread(target=check_for_new_version).start()
# Return the Flask app - the Socket.IO will be attached to it but initialized separately
# This avoids circular dependencies
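
The version-check thread above is gated on CI, an environment flag and pytest detection. A small helper expressing the same guard; should_check_for_new_version is a name invented here, and a plain truthiness check stands in for the project's strtobool:

import os
import sys

def should_check_for_new_version() -> bool:
    # Skip the update check in CI builds, when explicitly disabled, or under pytest.
    in_pytest = "pytest" in sys.modules or "PYTEST_CURRENT_TEST" in os.environ
    disabled = os.getenv('DISABLE_VERSION_CHECK', 'no').lower() in ('1', 'true', 'yes', 'y', 'on')
    return not os.getenv("GITHUB_REF") and not disabled and not in_pytest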

View File

@@ -10,13 +10,9 @@ from pathlib import Path
from loguru import logger
from .. import jinja2_custom as safe_jinja
from ..diff import ADDED_PLACEMARKER_OPEN
from ..html_tools import TRANSLATE_WHITESPACE_TABLE
FAVICON_RESAVE_THRESHOLD_SECONDS=86400
BROTLI_COMPRESS_SIZE_THRESHOLD = int(os.getenv('SNAPSHOT_BROTLI_COMPRESSION_THRESHOLD', 1024))
minimum_seconds_recheck_time = int(os.getenv('MINIMUM_SECONDS_RECHECK_TIME', 3))
mtable = {'seconds': 1, 'minutes': 60, 'hours': 3600, 'days': 86400, 'weeks': 86400 * 7}
def _brotli_compress_worker(conn, filepath, mode=None):
"""
@@ -33,7 +29,6 @@ def _brotli_compress_worker(conn, filepath, mode=None):
try:
# Receive data from parent process via pipe (avoids pickle overhead)
contents = conn.recv()
logger.debug(f"Starting brotli compression of {len(contents)} bytes.")
if mode is not None:
compressed_data = brotli.compress(contents, mode=mode)
@@ -45,10 +40,9 @@ def _brotli_compress_worker(conn, filepath, mode=None):
# Send success status back
conn.send(True)
logger.debug(f"Finished brotli compression - From {len(contents)} to {len(compressed_data)} bytes.")
# No need for explicit cleanup - process exit frees all memory
except Exception as e:
logger.critical(f"Brotli compression worker failed: {e}")
logger.error(f"Brotli compression worker failed: {e}")
conn.send(False)
finally:
conn.close()
@@ -72,6 +66,7 @@ def _brotli_subprocess_save(contents, filepath, mode=None, timeout=30, fallback_
Raises:
Exception: if compression fails and fallback_uncompressed is False
"""
import brotli
import multiprocessing
import sys
@@ -149,6 +144,11 @@ def _brotli_subprocess_save(contents, filepath, mode=None, timeout=30, fallback_
else:
raise Exception(f"Brotli compression subprocess failed for {filepath}")
FAVICON_RESAVE_THRESHOLD_SECONDS=86400
minimum_seconds_recheck_time = int(os.getenv('MINIMUM_SECONDS_RECHECK_TIME', 3))
mtable = {'seconds': 1, 'minutes': 60, 'hours': 3600, 'days': 86400, 'weeks': 86400 * 7}
class model(watch_base):
__newest_history_key = None
@@ -492,6 +492,7 @@ class model(watch_base):
self.ensure_data_dir_exists()
threshold = int(os.getenv('SNAPSHOT_BROTLI_COMPRESSION_THRESHOLD', 1024))
skip_brotli = strtobool(os.getenv('DISABLE_BROTLI_TEXT_SNAPSHOT', 'False'))
# Binary data - detect file type and save without compression
@@ -515,7 +516,7 @@ class model(watch_base):
# Text data - use brotli compression if enabled and above threshold
else:
if not skip_brotli and len(contents) > BROTLI_COMPRESS_SIZE_THRESHOLD:
if not skip_brotli and len(contents) > threshold:
# Compressed text
import brotli
snapshot_fname = f"{snapshot_id}.txt.br"
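
Both sides of this file read the brotli threshold from SNAPSHOT_BROTLI_COMPRESSION_THRESHOLD; they differ only in whether it is a module constant or re-read on each save. A simplified, in-process sketch of the compress-over-threshold decision (the hunk itself hands compression to a subprocess via _brotli_subprocess_save, which is not reproduced here; save_snapshot is an invented name):

import os
import brotli

BROTLI_COMPRESS_SIZE_THRESHOLD = int(os.getenv('SNAPSHOT_BROTLI_COMPRESSION_THRESHOLD', 1024))

def save_snapshot(dest_dir, snapshot_id, contents):
    # Write <id>.txt.br when the text exceeds the threshold, otherwise plain <id>.txt.
    data = contents.encode('utf-8')
    if len(data) > BROTLI_COMPRESS_SIZE_THRESHOLD:
        fname = os.path.join(dest_dir, f"{snapshot_id}.txt.br")
        with open(fname, 'wb') as f:
            f.write(brotli.compress(data, mode=brotli.MODE_TEXT))
    else:
        fname = os.path.join(dest_dir, f"{snapshot_id}.txt")
        with open(fname, 'wb') as f:
            f.write(data)
    return fname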

View File

@@ -86,7 +86,6 @@ class RecheckPriorityQueue:
def get(self, block: bool = True, timeout: Optional[float] = None):
"""Thread-safe sync get with priority ordering"""
import queue
try:
# Wait for notification
self.sync_q.get(block=block, timeout=timeout)
@@ -104,11 +103,8 @@ class RecheckPriorityQueue:
logger.debug(f"Successfully retrieved item: {self._get_item_uuid(item)}")
return item
except queue.Empty:
# Queue is empty with timeout - expected behavior, re-raise without logging
raise
except Exception as e:
# Re-raise without logging - caller (worker) will handle and log appropriately
logger.critical(f"CRITICAL: Failed to get item from queue: {str(e)}")
raise
# ASYNC INTERFACE (for workers)
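
The disagreement in this hunk is whether a queue.Empty raised by a timed-out get() deserves a log line. A compact sketch of treating Empty as normal control flow and only logging genuinely unexpected failures; get_next is an illustrative name, not the project's API:

import queue
from loguru import logger

def get_next(q, timeout=1.0):
    # Return the next item, or None when the queue is simply empty.
    try:
        return q.get(block=True, timeout=timeout)
    except queue.Empty:
        return None  # expected whenever nothing is queued; not worth logging
    except Exception as e:
        logger.critical(f"Failed to get item from queue: {type(e).__name__}: {e}")
        raise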

View File

@@ -98,12 +98,11 @@ pytest -vv -s --maxfail=1 tests/test_rss.py
pytest -vv -s --maxfail=1 tests/test_unique_lines.py
# Try high concurrency
FETCH_WORKERS=50 pytest tests/test_history_consistency.py -vv -l -s
FETCH_WORKERS=130 pytest tests/test_history_consistency.py -v -l
# Check file:// will pickup a file when enabled
echo "Hello world" > /tmp/test-file.txt
ALLOW_FILE_URI=yes pytest -vv -s tests/test_security.py
# Run it again so that brotli kicks in
TEST_WITH_BROTLI=1 SNAPSHOT_BROTLI_COMPRESSION_THRESHOLD=100 FETCH_WORKERS=20 pytest tests/test_history_consistency.py -vv -l -s

View File

@@ -186,7 +186,7 @@ class ChangeDetectionStore:
# Finally start the thread that will manage periodic data saves to JSON
# Only start if thread is not already running (reload_state might be called multiple times)
if not self.save_data_thread or not self.save_data_thread.is_alive():
self.save_data_thread = threading.Thread(target=self.save_datastore, daemon=True, name="DatastoreSaver")
self.save_data_thread = threading.Thread(target=self.save_datastore)
self.save_data_thread.start()
def rehydrate_entity(self, uuid, entity, processor_override=None):
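
The guard above only starts the saver thread when none is alive, since reload_state() can run more than once. A minimal sketch of that idempotent-start pattern; DatastoreStub and ensure_saver_running are invented for illustration:

import threading

class DatastoreStub:
    save_data_thread = None

    def save_datastore(self):
        pass  # placeholder for the periodic JSON writer

    def ensure_saver_running(self):
        # reload_state() may run more than once; never stack up a second saver thread.
        if not self.save_data_thread or not self.save_data_thread.is_alive():
            self.save_data_thread = threading.Thread(
                target=self.save_datastore, daemon=True, name="DatastoreSaver")
            self.save_data_thread.start()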

View File

@@ -17,8 +17,6 @@ _MAP = {
def strtobool(value):
if not value:
return False
try:
return _MAP[str(value).lower()]
except KeyError:

View File

@@ -270,6 +270,3 @@ def app(request, datastore_path):
request.addfinalizer(teardown)
yield app

View File

@@ -206,10 +206,11 @@ def test_regex_error_handling(client, live_server, measure_memory_usage, datasto
# Add our URL to the import page
test_url = url_for('test_endpoint', _external=True)
uuid = client.application.config.get('DATASTORE').add_watch(url=test_url)
time.sleep(0.2)
client.get(url_for("ui.form_watch_checknow"), follow_redirects=True)
### test regex error handling
res = client.post(
url_for("ui.ui_edit.edit_page", uuid=uuid),
url_for("ui.ui_edit.edit_page", uuid="first"),
data={"extract_text": '/something bad\d{3/XYZ',
"url": test_url,
"fetch_backend": "html_requests",

View File

@@ -4,47 +4,25 @@ import time
import os
import json
from flask import url_for
from loguru import logger
from .. import strtobool
from .util import wait_for_all_checks, delete_all_watches
import brotli
from urllib.parse import urlparse, parse_qs
def test_consistent_history(client, live_server, measure_memory_usage, datastore_path):
# live_server_setup(live_server) # Setup on conftest per function
workers = int(os.getenv("FETCH_WORKERS", 10))
r = range(1, 10+workers)
uuids = set()
sys_fetch_workers = int(os.getenv("FETCH_WORKERS", 10))
workers = range(1, sys_fetch_workers)
now = time.time()
for one in r:
test_url = url_for('test_endpoint', content_type="text/html", content=str(one), _external=True)
res = client.post(
url_for("imports.import_page"),
data={"urls": test_url},
follow_redirects=True
)
for one in workers:
if strtobool(os.getenv("TEST_WITH_BROTLI")):
# A very long string that WILL trigger Brotli compression of the snapshot
# BROTLI_COMPRESS_SIZE_THRESHOLD should be set to say 200
from ..model.Watch import BROTLI_COMPRESS_SIZE_THRESHOLD
content = str(one) + "x" + str(one) * (BROTLI_COMPRESS_SIZE_THRESHOLD + 10)
else:
# Just enough to test datastore
content = str(one)+'x'
test_url = url_for('test_endpoint', content_type="text/html", content=content, _external=True)
uuids.add(client.application.config.get('DATASTORE').add_watch(url=test_url, extras={'title': str(one)}))
client.get(url_for("ui.form_watch_checknow"), follow_redirects=True)
assert b"1 Imported" in res.data
wait_for_all_checks(client)
duration = time.time() - now
per_worker = duration/sys_fetch_workers
if sys_fetch_workers < 20:
per_worker_threshold=0.6
elif sys_fetch_workers < 50:
per_worker_threshold = 0.8
else:
per_worker_threshold = 1.5
logger.debug(f"All fetched in {duration:.2f}s, {per_worker}s per worker")
# Problematic on github
#assert per_worker < per_worker_threshold, f"If concurrency is working good, no blocking async problems, each worker ({sys_fetch_workers} workers) should have done his job in under {per_worker_threshold}s, got {per_worker:.2f}s per worker, total duration was {duration:.2f}s"
# Essentially just triggers the DB write/update
res = client.post(
@@ -56,7 +34,7 @@ def test_consistent_history(client, live_server, measure_memory_usage, datastore
)
assert b"Settings updated." in res.data
# Wait for the sync DB save to happen
time.sleep(2)
json_db_file = os.path.join(live_server.app.config['DATASTORE'].datastore_path, 'url-watches.json')
@@ -66,18 +44,14 @@ def test_consistent_history(client, live_server, measure_memory_usage, datastore
json_obj = json.load(f)
# assert the right amount of watches was found in the JSON
assert len(json_obj['watching']) == len(workers), "Correct number of watches was found in the JSON"
i = 0
assert len(json_obj['watching']) == len(r), "Correct number of watches was found in the JSON"
i=0
# each one should have a history.txt containing just one line
for w in json_obj['watching'].keys():
i += 1
i+=1
history_txt_index_file = os.path.join(live_server.app.config['DATASTORE'].datastore_path, w, 'history.txt')
assert os.path.isfile(history_txt_index_file), f"History.txt should exist where I expect it at {history_txt_index_file}"
# Should be no errors (could be from brotli etc)
assert not live_server.app.config['DATASTORE'].data['watching'][w].get('last_error')
# Same like in model.Watch
with open(history_txt_index_file, "r") as f:
tmp_history = dict(i.strip().split(',', 2) for i in f.readlines())
@@ -89,21 +63,15 @@ def test_consistent_history(client, live_server, measure_memory_usage, datastore
# Find the snapshot one
for fname in files_in_watch_dir:
if fname != 'history.txt' and 'html' not in fname:
if strtobool(os.getenv("TEST_WITH_BROTLI")):
assert fname.endswith('.br'), "Forced TEST_WITH_BROTLI then it should be a .br filename"
full_snapshot_history_path = os.path.join(live_server.app.config['DATASTORE'].datastore_path, w, fname)
# contents should match what we requested as content returned from the test url
if fname.endswith('.br'):
with open(full_snapshot_history_path, 'rb') as f:
contents = brotli.decompress(f.read()).decode('utf-8')
else:
with open(full_snapshot_history_path, 'r') as snapshot_f:
contents = snapshot_f.read()
with open(os.path.join(live_server.app.config['DATASTORE'].datastore_path, w, fname), 'r') as snapshot_f:
contents = snapshot_f.read()
watch_url = json_obj['watching'][w]['url']
u = urlparse(watch_url)
q = parse_qs(u[4])
assert q['content'][0] == contents.strip(), f"Snapshot file {fname} should contain {q['content'][0]}"
watch_title = json_obj['watching'][w]['title']
assert json_obj['watching'][w]['title'], "Watch should have a title set"
assert contents.startswith(watch_title + "x"), f"Snapshot contents in file {fname} should start with '{watch_title}x', got '{contents}'"
assert len(files_in_watch_dir) == 3, "Should be just three files in the dir, html.br snapshot, history.txt and the extracted text snapshot"
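
The test reads snapshots back either as plain text or as brotli-compressed .br files. A small helper capturing that branch, assuming the brotli package; read_snapshot is a name invented for illustration:

import brotli

def read_snapshot(path):
    # Mirror of the branch in the test: .br snapshots need decompressing first.
    if path.endswith('.br'):
        with open(path, 'rb') as f:
            return brotli.decompress(f.read()).decode('utf-8')
    with open(path, 'r') as f:
        return f.read()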

View File

@@ -2,7 +2,7 @@
import time
from flask import url_for
from .util import live_server_setup, wait_for_all_checks, wait_for_watch_history, extract_rss_token_from_UI, get_UUID_for_tag_name, delete_all_watches
from .util import live_server_setup, wait_for_all_checks, extract_rss_token_from_UI, get_UUID_for_tag_name, delete_all_watches
import os
@@ -87,9 +87,6 @@ def test_rss_group(client, live_server, measure_memory_usage, datastore_path):
# Wait for initial checks to complete
wait_for_all_checks(client)
# Ensure initial snapshots are saved
assert wait_for_watch_history(client, min_history_count=1, timeout=10), "Watches did not save initial snapshots"
# Trigger a change
set_modified_response(datastore_path=datastore_path)
@@ -97,9 +94,6 @@ def test_rss_group(client, live_server, measure_memory_usage, datastore_path):
res = client.get(url_for("ui.form_watch_checknow"), follow_redirects=True)
wait_for_all_checks(client)
# Ensure all watches have sufficient history for RSS generation
assert wait_for_watch_history(client, min_history_count=2, timeout=10), "Watches did not accumulate sufficient history"
# Get RSS token
rss_token = extract_rss_token_from_UI(client)
assert rss_token is not None
@@ -222,13 +216,11 @@ def test_rss_group_only_unviewed(client, live_server, measure_memory_usage, data
assert b"Watch added" in res.data
wait_for_all_checks(client)
assert wait_for_watch_history(client, min_history_count=1, timeout=10), "Initial snapshots not saved"
# Trigger changes
set_modified_response(datastore_path=datastore_path)
res = client.get(url_for("ui.form_watch_checknow"), follow_redirects=True)
wait_for_all_checks(client)
assert wait_for_watch_history(client, min_history_count=2, timeout=10), "History not accumulated"
# Get RSS token
rss_token = extract_rss_token_from_UI(client)

View File

@@ -1,10 +1,8 @@
import sys
import os
import pytest
from changedetectionio import html_tools
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
import html_tools
# test generation guide.
# 1. Do not include encoding in the xml declaration if the test object is a str type.

View File

@@ -164,45 +164,14 @@ def wait_for_all_checks(client=None):
if q_length == 0 and not any_workers_busy:
if empty_since is None:
empty_since = time.time()
# Brief stabilization period for async workers
elif time.time() - empty_since >= 0.3:
elif time.time() - empty_since >= 0.15: # Shorter wait
break
else:
empty_since = None
attempt += 1
time.sleep(0.3)
def wait_for_watch_history(client, min_history_count=2, timeout=10):
"""
Wait for watches to have sufficient history entries.
Useful after wait_for_all_checks() when you need to ensure history is populated.
Args:
client: Test client with access to datastore
min_history_count: Minimum number of history entries required
timeout: Maximum time to wait in seconds
"""
datastore = client.application.config.get('DATASTORE')
start_time = time.time()
while time.time() - start_time < timeout:
all_have_history = True
for uuid, watch in datastore.data['watching'].items():
history_count = len(watch.history.keys())
if history_count < min_history_count:
all_have_history = False
break
if all_have_history:
return True
time.sleep(0.2)
# Timeout - return False
return False
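
wait_for_all_checks() and the wait_for_watch_history() helper above are both instances of the same poll-until-condition-or-timeout idiom. A generic version, with wait_until and the example condition invented for illustration:

import time

def wait_until(condition, timeout=10.0, interval=0.2):
    # Poll a zero-argument callable until it returns truthy or the timeout elapses.
    deadline = time.time() + timeout
    while time.time() < deadline:
        if condition():
            return True
        time.sleep(interval)
    return False

# e.g. wait_until(lambda: all(len(w.history) >= 2 for w in datastore.data['watching'].values()))
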
# Replaced by new_live_server_setup and calling per function scope in conftest.py
def live_server_setup(live_server):
return True
@@ -220,8 +189,6 @@ def new_live_server_setup(live_server):
@live_server.app.route('/test-endpoint')
def test_endpoint():
from loguru import logger
logger.debug(f"/test-endpoint hit {request}")
ctype = request.args.get('content_type')
status_code = request.args.get('status_code')
content = request.args.get('content') or None

View File

@@ -2,18 +2,19 @@
Worker management module for changedetection.io
Handles asynchronous workers for dynamic worker scaling.
Each worker runs in its own thread with its own event loop for isolation.
Sync worker support has been removed in favor of async-only architecture.
"""
import asyncio
import os
import threading
import time
from concurrent.futures import ThreadPoolExecutor
from loguru import logger
# Global worker state - each worker has its own thread and event loop
worker_threads = [] # List of WorkerThread objects
# Global worker state
running_async_tasks = []
async_loop = None
async_loop_thread = None
# Track currently processing UUIDs for async workers - maps {uuid: worker_id}
currently_processing_uuids = {}
@@ -21,118 +22,89 @@ currently_processing_uuids = {}
# Configuration - async workers only
USE_ASYNC_WORKERS = True
# Custom ThreadPoolExecutor for queue operations with named threads
# Scale executor threads with FETCH_WORKERS to avoid bottleneck at high concurrency
_max_executor_workers = max(50, int(os.getenv("FETCH_WORKERS", "10")))
queue_executor = ThreadPoolExecutor(
max_workers=_max_executor_workers,
thread_name_prefix="QueueGetter-"
)
class WorkerThread:
"""Container for a worker thread with its own event loop"""
def __init__(self, worker_id, update_q, notification_q, app, datastore):
self.worker_id = worker_id
self.update_q = update_q
self.notification_q = notification_q
self.app = app
self.datastore = datastore
self.thread = None
self.loop = None
self.running = False
def run(self):
"""Run the worker in its own event loop"""
try:
# Create a new event loop for this thread
self.loop = asyncio.new_event_loop()
asyncio.set_event_loop(self.loop)
self.running = True
# Run the worker coroutine
self.loop.run_until_complete(
start_single_async_worker(
self.worker_id,
self.update_q,
self.notification_q,
self.app,
self.datastore,
queue_executor
)
)
except asyncio.CancelledError:
# Normal shutdown - worker was cancelled
import os
in_pytest = "pytest" in os.sys.modules or "PYTEST_CURRENT_TEST" in os.environ
if not in_pytest:
logger.info(f"Worker {self.worker_id} shutting down gracefully")
except RuntimeError as e:
# Ignore expected shutdown errors
if "Event loop stopped" not in str(e) and "Event loop is closed" not in str(e):
logger.error(f"Worker {self.worker_id} runtime error: {e}")
except Exception as e:
logger.error(f"Worker {self.worker_id} thread error: {e}")
finally:
# Clean up
if self.loop and not self.loop.is_closed():
self.loop.close()
self.running = False
self.loop = None
def start(self):
"""Start the worker thread"""
self.thread = threading.Thread(
target=self.run,
daemon=True,
name=f"PageFetchAsyncUpdateWorker-{self.worker_id}"
)
self.thread.start()
def stop(self):
"""Stop the worker thread"""
if self.loop and self.running:
try:
# Signal the loop to stop
self.loop.call_soon_threadsafe(self.loop.stop)
except RuntimeError:
pass
if self.thread and self.thread.is_alive():
self.thread.join(timeout=2.0)
def start_async_event_loop():
"""Start a dedicated event loop for async workers in a separate thread"""
global async_loop
logger.info("Starting async event loop for workers")
try:
# Create a new event loop for this thread
async_loop = asyncio.new_event_loop()
# Set it as the event loop for this thread
asyncio.set_event_loop(async_loop)
logger.debug(f"Event loop created and set: {async_loop}")
# Run the event loop forever
async_loop.run_forever()
except Exception as e:
logger.error(f"Async event loop error: {e}")
finally:
# Clean up
if async_loop and not async_loop.is_closed():
async_loop.close()
async_loop = None
logger.info("Async event loop stopped")
def start_async_workers(n_workers, update_q, notification_q, app, datastore):
"""Start async workers, each with its own thread and event loop for isolation"""
global worker_threads, currently_processing_uuids
# Clear any stale state
"""Start the async worker management system"""
global async_loop_thread, async_loop, running_async_tasks, currently_processing_uuids
# Clear any stale UUID tracking state
currently_processing_uuids.clear()
# Start each worker in its own thread with its own event loop
logger.info(f"Starting {n_workers} async workers (isolated threads)")
# Start the event loop in a separate thread
async_loop_thread = threading.Thread(target=start_async_event_loop, daemon=True)
async_loop_thread.start()
# Wait for the loop to be available (with timeout for safety)
max_wait_time = 5.0
wait_start = time.time()
while async_loop is None and (time.time() - wait_start) < max_wait_time:
time.sleep(0.1)
if async_loop is None:
logger.error("Failed to start async event loop within timeout")
return
# Additional brief wait to ensure loop is running
time.sleep(0.2)
# Start async workers
logger.info(f"Starting {n_workers} async workers")
for i in range(n_workers):
try:
worker = WorkerThread(i, update_q, notification_q, app, datastore)
worker.start()
worker_threads.append(worker)
# No sleep needed - threads start independently and asynchronously
except Exception as e:
# Use a factory function to create named worker coroutines
def create_named_worker(worker_id):
async def named_worker():
task = asyncio.current_task()
if task:
task.set_name(f"async-worker-{worker_id}")
return await start_single_async_worker(worker_id, update_q, notification_q, app, datastore)
return named_worker()
task_future = asyncio.run_coroutine_threadsafe(create_named_worker(i), async_loop)
running_async_tasks.append(task_future)
except RuntimeError as e:
logger.error(f"Failed to start async worker {i}: {e}")
continue
async def start_single_async_worker(worker_id, update_q, notification_q, app, datastore, executor=None):
async def start_single_async_worker(worker_id, update_q, notification_q, app, datastore):
"""Start a single async worker with auto-restart capability"""
from changedetectionio.async_update_worker import async_update_worker
# Check if we're in pytest environment - if so, be more gentle with logging
import os
in_pytest = "pytest" in os.sys.modules or "PYTEST_CURRENT_TEST" in os.environ
while not app.config.exit.is_set():
try:
await async_update_worker(worker_id, update_q, notification_q, app, datastore, executor)
if not in_pytest:
logger.info(f"Starting async worker {worker_id}")
await async_update_worker(worker_id, update_q, notification_q, app, datastore)
# If we reach here, worker exited cleanly
if not in_pytest:
logger.info(f"Async worker {worker_id} exited cleanly")
@@ -159,38 +131,39 @@ def start_workers(n_workers, update_q, notification_q, app, datastore):
def add_worker(update_q, notification_q, app, datastore):
"""Add a new async worker (for dynamic scaling)"""
global worker_threads
worker_id = len(worker_threads)
logger.info(f"Adding async worker {worker_id}")
try:
worker = WorkerThread(worker_id, update_q, notification_q, app, datastore)
worker.start()
worker_threads.append(worker)
return True
except Exception as e:
logger.error(f"Failed to add worker {worker_id}: {e}")
global running_async_tasks
if not async_loop:
logger.error("Async loop not running, cannot add worker")
return False
worker_id = len(running_async_tasks)
logger.info(f"Adding async worker {worker_id}")
task_future = asyncio.run_coroutine_threadsafe(
start_single_async_worker(worker_id, update_q, notification_q, app, datastore), async_loop
)
running_async_tasks.append(task_future)
return True
def remove_worker():
"""Remove an async worker (for dynamic scaling)"""
global worker_threads
if not worker_threads:
global running_async_tasks
if not running_async_tasks:
return False
# Stop the last worker
worker = worker_threads.pop()
worker.stop()
logger.info(f"Removed async worker, {len(worker_threads)} workers remaining")
# Cancel the last worker
task_future = running_async_tasks.pop()
task_future.cancel()
logger.info(f"Removed async worker, {len(running_async_tasks)} workers remaining")
return True
def get_worker_count():
"""Get current number of async workers"""
return len(worker_threads)
return len(running_async_tasks)
def get_running_uuids():
@@ -276,21 +249,38 @@ def queue_item_async_safe(update_q, item, silent=False):
def shutdown_workers():
"""Shutdown all async workers fast and aggressively"""
global worker_threads
global async_loop, async_loop_thread, running_async_tasks
# Check if we're in pytest environment - if so, be more gentle with logging
import os
in_pytest = "pytest" in os.sys.modules or "PYTEST_CURRENT_TEST" in os.environ
if not in_pytest:
logger.info("Fast shutdown of async workers initiated...")
# Stop all worker threads
for worker in worker_threads:
worker.stop()
worker_threads.clear()
# Cancel all async tasks immediately
for task_future in running_async_tasks:
if not task_future.done():
task_future.cancel()
# Stop the async event loop immediately
if async_loop and not async_loop.is_closed():
try:
async_loop.call_soon_threadsafe(async_loop.stop)
except RuntimeError:
# Loop might already be stopped
pass
running_async_tasks.clear()
async_loop = None
# Give async thread minimal time to finish, then continue
if async_loop_thread and async_loop_thread.is_alive():
async_loop_thread.join(timeout=1.0) # Only 1 second timeout
if async_loop_thread.is_alive() and not in_pytest:
logger.info("Async thread still running after timeout - continuing with shutdown")
async_loop_thread = None
if not in_pytest:
logger.info("Async workers fast shutdown complete")
@@ -300,57 +290,69 @@ def shutdown_workers():
def adjust_async_worker_count(new_count, update_q=None, notification_q=None, app=None, datastore=None):
"""
Dynamically adjust the number of async workers.
Args:
new_count: Target number of workers
update_q, notification_q, app, datastore: Required for adding new workers
Returns:
dict: Status of the adjustment operation
"""
global worker_threads
global running_async_tasks
current_count = get_worker_count()
if new_count == current_count:
return {
'status': 'no_change',
'message': f'Worker count already at {current_count}',
'current_count': current_count
}
if new_count > current_count:
# Add workers
workers_to_add = new_count - current_count
logger.info(f"Adding {workers_to_add} async workers (from {current_count} to {new_count})")
if not all([update_q, notification_q, app, datastore]):
return {
'status': 'error',
'message': 'Missing required parameters to add workers',
'current_count': current_count
}
for i in range(workers_to_add):
add_worker(update_q, notification_q, app, datastore)
worker_id = len(running_async_tasks)
task_future = asyncio.run_coroutine_threadsafe(
start_single_async_worker(worker_id, update_q, notification_q, app, datastore),
async_loop
)
running_async_tasks.append(task_future)
return {
'status': 'success',
'message': f'Added {workers_to_add} workers',
'previous_count': current_count,
'current_count': len(worker_threads)
'current_count': new_count
}
else:
# Remove workers
workers_to_remove = current_count - new_count
logger.info(f"Removing {workers_to_remove} async workers (from {current_count} to {new_count})")
removed_count = 0
for _ in range(workers_to_remove):
if remove_worker():
if running_async_tasks:
task_future = running_async_tasks.pop()
task_future.cancel()
# Wait for the task to actually stop
try:
task_future.result(timeout=5) # 5 second timeout
except Exception:
pass # Task was cancelled, which is expected
removed_count += 1
return {
'status': 'success',
'message': f'Removed {removed_count} workers',
@@ -365,58 +367,72 @@ def get_worker_status():
'worker_type': 'async',
'worker_count': get_worker_count(),
'running_uuids': get_running_uuids(),
'active_threads': sum(1 for w in worker_threads if w.thread and w.thread.is_alive()),
'async_loop_running': async_loop is not None,
}
def check_worker_health(expected_count, update_q=None, notification_q=None, app=None, datastore=None):
"""
Check if the expected number of async workers are running and restart any missing ones.
Args:
expected_count: Expected number of workers
update_q, notification_q, app, datastore: Required for restarting workers
Returns:
dict: Health check results
"""
global worker_threads
global running_async_tasks
current_count = get_worker_count()
# Check which workers are actually alive
alive_count = sum(1 for w in worker_threads if w.thread and w.thread.is_alive())
if alive_count == expected_count:
if current_count == expected_count:
return {
'status': 'healthy',
'expected_count': expected_count,
'actual_count': alive_count,
'actual_count': current_count,
'message': f'All {expected_count} async workers running'
}
# Find dead workers
# Check for crashed async workers
dead_workers = []
for i, worker in enumerate(worker_threads[:]):
if not worker.thread or not worker.thread.is_alive():
dead_workers.append(i)
logger.warning(f"Async worker {worker.worker_id} thread is dead")
alive_count = 0
for i, task_future in enumerate(running_async_tasks[:]):
if task_future.done():
try:
result = task_future.result()
dead_workers.append(i)
logger.warning(f"Async worker {i} completed unexpectedly")
except Exception as e:
dead_workers.append(i)
logger.error(f"Async worker {i} crashed: {e}")
else:
alive_count += 1
# Remove dead workers from tracking
for i in reversed(dead_workers):
if i < len(worker_threads):
worker_threads.pop(i)
if i < len(running_async_tasks):
running_async_tasks.pop(i)
missing_workers = expected_count - alive_count
restarted_count = 0
if missing_workers > 0 and all([update_q, notification_q, app, datastore]):
logger.info(f"Restarting {missing_workers} crashed async workers")
for i in range(missing_workers):
if add_worker(update_q, notification_q, app, datastore):
worker_id = alive_count + i
try:
task_future = asyncio.run_coroutine_threadsafe(
start_single_async_worker(worker_id, update_q, notification_q, app, datastore),
async_loop
)
running_async_tasks.append(task_future)
restarted_count += 1
except Exception as e:
logger.error(f"Failed to restart worker {worker_id}: {e}")
return {
'status': 'repaired' if restarted_count > 0 else 'degraded',
'expected_count': expected_count,