changedetection.io/changedetectionio/worker_pool.py
Refactor of queue system and improve tests, improves multiple workers (#3826)

"""
Worker management module for changedetection.io
Manages asynchronous update workers and supports dynamic worker scaling.
Each worker runs in its own thread with its own event loop for isolation.
"""
import asyncio
import os
import threading
import time
from concurrent.futures import ThreadPoolExecutor
from loguru import logger
# Global worker state - each worker has its own thread and event loop
worker_threads = [] # List of WorkerThread objects
# Track currently processing UUIDs for async workers - maps {uuid: worker_id}
currently_processing_uuids = {}
_uuid_processing_lock = threading.Lock() # Protects currently_processing_uuids
# Configuration - async workers only
USE_ASYNC_WORKERS = True
# Custom ThreadPoolExecutor for queue operations with named threads
# Scale executor threads one-to-one with FETCH_WORKERS (no separate floor or cap applied)
# Thread naming: "QueueGetter-N" for easy debugging in thread dumps/traces
# With FETCH_WORKERS=10: 10 workers + 10 executor threads = 20 threads total
# With FETCH_WORKERS=500: 500 workers + 500 executor threads = 1000 threads total (acceptable on modern systems)
_max_executor_workers = int(os.getenv("FETCH_WORKERS", "10"))
queue_executor = ThreadPoolExecutor(
max_workers=_max_executor_workers,
thread_name_prefix="QueueGetter-" # Shows in thread dumps as "QueueGetter-0", "QueueGetter-1", etc.
)
class WorkerThread:
"""Container for a worker thread with its own event loop"""
def __init__(self, worker_id, update_q, notification_q, app, datastore):
self.worker_id = worker_id
self.update_q = update_q
self.notification_q = notification_q
self.app = app
self.datastore = datastore
self.thread = None
self.loop = None
self.running = False
def run(self):
"""Run the worker in its own event loop"""
try:
# Create a new event loop for this thread
self.loop = asyncio.new_event_loop()
asyncio.set_event_loop(self.loop)
self.running = True
# Run the worker coroutine
self.loop.run_until_complete(
start_single_async_worker(
self.worker_id,
self.update_q,
self.notification_q,
self.app,
self.datastore,
queue_executor
)
)
except asyncio.CancelledError:
# Normal shutdown - worker was cancelled
            import sys
            in_pytest = "pytest" in sys.modules or "PYTEST_CURRENT_TEST" in os.environ
if not in_pytest:
logger.info(f"Worker {self.worker_id} shutting down gracefully")
except RuntimeError as e:
# Ignore expected shutdown errors
if "Event loop stopped" not in str(e) and "Event loop is closed" not in str(e):
logger.error(f"Worker {self.worker_id} runtime error: {e}")
except Exception as e:
logger.error(f"Worker {self.worker_id} thread error: {e}")
finally:
# Clean up
if self.loop and not self.loop.is_closed():
self.loop.close()
self.running = False
self.loop = None
def start(self):
"""Start the worker thread with descriptive name for debugging"""
self.thread = threading.Thread(
target=self.run,
daemon=True,
name=f"PageFetchAsyncUpdateWorker-{self.worker_id}" # Shows in thread dumps with worker ID
)
self.thread.start()
def stop(self):
"""Stop the worker thread brutally - no waiting"""
# Try to stop the event loop if it exists
if self.loop and self.running:
try:
# Signal the loop to stop
self.loop.call_soon_threadsafe(self.loop.stop)
except RuntimeError:
pass
# Don't wait - thread is daemon and will die when needed
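# Example (illustrative sketch only): the lifecycle the helpers below drive
# for each WorkerThread, shown here for orientation.
#
#     worker = WorkerThread(0, update_q, notification_q, app, datastore)
#     worker.start()  # spawns a daemon thread running its own event loop
#     ...
#     worker.stop()   # signals the loop to stop; the daemon thread then dies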
def start_async_workers(n_workers, update_q, notification_q, app, datastore):
"""Start async workers, each with its own thread and event loop for isolation"""
global worker_threads, currently_processing_uuids
# Clear any stale state
currently_processing_uuids.clear()
# Start each worker in its own thread with its own event loop
logger.info(f"Starting {n_workers} async workers (isolated threads)")
for i in range(n_workers):
try:
worker = WorkerThread(i, update_q, notification_q, app, datastore)
worker.start()
worker_threads.append(worker)
# No sleep needed - threads start independently and asynchronously
except Exception as e:
logger.error(f"Failed to start async worker {i}: {e}")
continue
async def start_single_async_worker(worker_id, update_q, notification_q, app, datastore, executor=None):
"""Start a single async worker with auto-restart capability"""
from changedetectionio.worker import async_update_worker
# Check if we're in pytest environment - if so, be more gentle with logging
    import sys
    in_pytest = "pytest" in sys.modules or "PYTEST_CURRENT_TEST" in os.environ
while not app.config.exit.is_set():
try:
result = await async_update_worker(worker_id, update_q, notification_q, app, datastore, executor)
if result == "restart":
# Worker requested restart - immediately loop back and restart
if not in_pytest:
logger.debug(f"Async worker {worker_id} restarting")
continue
else:
# Worker exited cleanly (shutdown)
if not in_pytest:
logger.info(f"Async worker {worker_id} exited cleanly")
break
except asyncio.CancelledError:
# Task was cancelled (normal shutdown)
if not in_pytest:
logger.info(f"Async worker {worker_id} cancelled")
break
except Exception as e:
logger.error(f"Async worker {worker_id} crashed: {e}")
if not in_pytest:
logger.info(f"Restarting async worker {worker_id} in 5 seconds...")
await asyncio.sleep(5)
if not in_pytest:
logger.info(f"Async worker {worker_id} shutdown complete")
def start_workers(n_workers, update_q, notification_q, app, datastore):
"""Start async workers - sync workers are deprecated"""
start_async_workers(n_workers, update_q, notification_q, app, datastore)
def add_worker(update_q, notification_q, app, datastore):
"""Add a new async worker (for dynamic scaling)"""
global worker_threads
# Reuse lowest available ID to prevent unbounded growth over time
used_ids = {w.worker_id for w in worker_threads}
worker_id = 0
while worker_id in used_ids:
worker_id += 1
logger.info(f"Adding async worker {worker_id}")
try:
worker = WorkerThread(worker_id, update_q, notification_q, app, datastore)
worker.start()
worker_threads.append(worker)
return True
except Exception as e:
logger.error(f"Failed to add worker {worker_id}: {e}")
return False
def remove_worker():
"""Remove an async worker (for dynamic scaling)"""
global worker_threads
if not worker_threads:
return False
# Stop the last worker
worker = worker_threads.pop()
worker.stop()
logger.info(f"Removed async worker, {len(worker_threads)} workers remaining")
return True
def get_worker_count():
"""Get current number of async workers"""
return len(worker_threads)
def get_running_uuids():
"""Get list of UUIDs currently being processed by async workers"""
with _uuid_processing_lock:
return list(currently_processing_uuids.keys())
def claim_uuid_for_processing(uuid, worker_id):
"""
Atomically check if UUID is available and claim it for processing.
This is thread-safe and prevents race conditions where multiple workers
try to process the same UUID simultaneously.
Args:
uuid: The watch UUID to claim
worker_id: The ID of the worker claiming this UUID
Returns:
True if successfully claimed (UUID was not being processed)
False if already being processed by another worker
"""
with _uuid_processing_lock:
if uuid in currently_processing_uuids:
# Already being processed by another worker
return False
# Claim it atomically
currently_processing_uuids[uuid] = worker_id
logger.debug(f"Worker {worker_id} claimed UUID: {uuid}")
return True
def release_uuid_from_processing(uuid, worker_id):
"""
Release a UUID from processing (thread-safe).
Args:
uuid: The watch UUID to release
worker_id: The ID of the worker releasing this UUID
"""
with _uuid_processing_lock:
# Only remove if this worker owns it (defensive)
if currently_processing_uuids.get(uuid) == worker_id:
currently_processing_uuids.pop(uuid, None)
logger.debug(f"Worker {worker_id} released UUID: {uuid}")
else:
logger.warning(f"Worker {worker_id} tried to release UUID {uuid} but doesn't own it (owned by {currently_processing_uuids.get(uuid, 'nobody')})")
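# Example (illustrative sketch, not called anywhere in this module): the
# claim/release pattern a worker is expected to follow around a check.
# `process_watch` is a hypothetical stand-in for the real processing call.
#
#     uuid = queued_item.item.get('uuid')
#     if claim_uuid_for_processing(uuid, worker_id):
#         try:
#             process_watch(uuid)  # hypothetical
#         finally:
#             # Release in a finally block so a crash cannot leave the UUID
#             # permanently marked as in-progress
#             release_uuid_from_processing(uuid, worker_id)
#     else:
#         ...  # another worker already owns this watch; skip or re-queue it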
def set_uuid_processing(uuid, worker_id=None, processing=True):
"""
Mark a UUID as being processed or completed by a specific worker.
DEPRECATED: Use claim_uuid_for_processing() and release_uuid_from_processing() instead.
This function is kept for backward compatibility but doesn't provide atomic check-and-set.
"""
if processing:
with _uuid_processing_lock:
currently_processing_uuids[uuid] = worker_id
logger.debug(f"Worker {worker_id} started processing UUID: {uuid}")
else:
release_uuid_from_processing(uuid, worker_id)
def is_watch_running(watch_uuid):
"""Check if a specific watch is currently being processed by any worker"""
with _uuid_processing_lock:
return watch_uuid in currently_processing_uuids
def is_watch_running_by_another_worker(watch_uuid, current_worker_id):
"""Check if a specific watch is currently being processed by a different worker"""
with _uuid_processing_lock:
if watch_uuid not in currently_processing_uuids:
return False
processing_worker_id = currently_processing_uuids[watch_uuid]
return processing_worker_id != current_worker_id
def queue_item_async_safe(update_q, item, silent=False):
"""Bulletproof queue operation with comprehensive error handling"""
item_uuid = 'unknown'
try:
# Safely extract UUID for logging
if hasattr(item, 'item') and isinstance(item.item, dict):
item_uuid = item.item.get('uuid', 'unknown')
except Exception as uuid_e:
logger.critical(f"CRITICAL: Failed to extract UUID from queue item: {uuid_e}")
# Validate inputs
if not update_q:
logger.critical(f"CRITICAL: Queue is None/invalid for item {item_uuid}")
return False
if not item:
        logger.critical("CRITICAL: Item is None/invalid")
return False
# Attempt queue operation with multiple fallbacks
try:
# Primary: Use sync interface (thread-safe)
success = update_q.put(item, block=True, timeout=5.0)
if success is False: # Explicit False return means failure
logger.critical(f"CRITICAL: Queue.put() returned False for item {item_uuid}")
return False
if not silent:
logger.trace(f"Successfully queued item: {item_uuid}")
return True
except Exception as e:
logger.critical(f"CRITICAL: Exception during queue operation for item {item_uuid}: {type(e).__name__}: {e}")
# Secondary: Attempt queue health check
try:
queue_size = update_q.qsize()
is_empty = update_q.empty()
logger.critical(f"CRITICAL: Queue health - size: {queue_size}, empty: {is_empty}")
except Exception as health_e:
logger.critical(f"CRITICAL: Queue health check failed: {health_e}")
# Log queue type for debugging
try:
logger.critical(f"CRITICAL: Queue type: {type(update_q)}, has sync_q: {hasattr(update_q, 'sync_q')}")
except Exception:
            logger.critical("CRITICAL: Cannot determine queue type")
return False
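# Example (illustrative sketch): queuing a watch for a recheck. `PrioritizedItem`
# is a hypothetical stand-in for the real queue item type - the only contract
# this function relies on is an `.item` attribute holding a dict with a 'uuid' key.
#
#     item = PrioritizedItem(priority=1, item={'uuid': watch_uuid})
#     if not queue_item_async_safe(update_q, item):
#         logger.error(f"Could not queue recheck for {watch_uuid}")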
def shutdown_workers():
"""Shutdown all async workers brutally - no delays, no waiting"""
global worker_threads, queue_executor
# Check if we're in pytest environment - if so, be more gentle with logging
    import sys
    in_pytest = "pytest" in sys.modules or "PYTEST_CURRENT_TEST" in os.environ
if not in_pytest:
logger.info("Brutal shutdown of async workers initiated...")
# Stop all worker event loops
for worker in worker_threads:
worker.stop()
# Clear immediately - threads are daemon and will die
worker_threads.clear()
# Shutdown the queue executor to prevent "cannot schedule new futures after shutdown" errors
# This must happen AFTER workers are stopped to avoid race conditions
if queue_executor:
try:
queue_executor.shutdown(wait=False)
if not in_pytest:
logger.debug("Queue executor shut down")
except Exception as e:
if not in_pytest:
logger.warning(f"Error shutting down queue executor: {e}")
if not in_pytest:
logger.info("Async workers brutal shutdown complete")
def adjust_async_worker_count(new_count, update_q=None, notification_q=None, app=None, datastore=None):
"""
Dynamically adjust the number of async workers.
Args:
new_count: Target number of workers
update_q, notification_q, app, datastore: Required for adding new workers
Returns:
dict: Status of the adjustment operation
"""
global worker_threads
current_count = get_worker_count()
if new_count == current_count:
return {
'status': 'no_change',
'message': f'Worker count already at {current_count}',
'current_count': current_count
}
if new_count > current_count:
# Add workers
workers_to_add = new_count - current_count
logger.info(f"Adding {workers_to_add} async workers (from {current_count} to {new_count})")
if not all([update_q, notification_q, app, datastore]):
return {
'status': 'error',
'message': 'Missing required parameters to add workers',
'current_count': current_count
}
for i in range(workers_to_add):
add_worker(update_q, notification_q, app, datastore)
return {
'status': 'success',
'message': f'Added {workers_to_add} workers',
'previous_count': current_count,
'current_count': len(worker_threads)
}
else:
# Remove workers
workers_to_remove = current_count - new_count
logger.info(f"Removing {workers_to_remove} async workers (from {current_count} to {new_count})")
removed_count = 0
for _ in range(workers_to_remove):
if remove_worker():
removed_count += 1
return {
'status': 'success',
'message': f'Removed {removed_count} workers',
'previous_count': current_count,
'current_count': current_count - removed_count
}
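# Example (illustrative sketch): scaling the pool after a settings change.
# Scaling down needs no extra references; scaling up requires the queues,
# app and datastore so new workers can be constructed.
#
#     result = adjust_async_worker_count(new_count=20, update_q=update_q,
#                                        notification_q=notification_q,
#                                        app=app, datastore=datastore)
#     if result['status'] == 'error':
#         logger.error(result['message'])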
def get_worker_status():
"""Get status information about async workers"""
return {
'worker_type': 'async',
'worker_count': get_worker_count(),
'running_uuids': get_running_uuids(),
'active_threads': sum(1 for w in worker_threads if w.thread and w.thread.is_alive()),
}
def wait_for_all_checks(update_q, timeout=150):
"""
Wait for queue to be empty and all workers to be idle.
Args:
update_q: The update queue to monitor
        timeout: Maximum number of polling attempts (default 150; each attempt sleeps 0.2-0.8s, so the default wall-clock limit is roughly 106 seconds)
Returns:
bool: True if all checks completed, False if timeout
"""
empty_since = None
attempt = 0
max_attempts = timeout
while attempt < max_attempts:
# Adaptive sleep - start fast, slow down if needed
if attempt < 10:
sleep_time = 0.2 # Very fast initial checks
elif attempt < 30:
sleep_time = 0.4 # Medium speed
else:
sleep_time = 0.8 # Slower for persistent issues
time.sleep(sleep_time)
q_length = update_q.qsize()
running_uuids = get_running_uuids()
any_workers_busy = len(running_uuids) > 0
if q_length == 0 and not any_workers_busy:
if empty_since is None:
empty_since = time.time()
# Brief stabilization period for async workers
elif time.time() - empty_since >= 0.3:
# Add small buffer for filesystem operations to complete
time.sleep(0.2)
logger.trace("wait_for_all_checks: All checks complete (queue empty, workers idle)")
return True
else:
empty_since = None
attempt += 1
logger.warning(f"wait_for_all_checks: Timeout after {timeout} attempts")
return False # Timeout
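# Example (illustrative sketch): typical test usage - queue work, then block
# until the pool has drained before asserting. `item_for()` is hypothetical.
#
#     queue_item_async_safe(update_q, item_for(watch_uuid))
#     assert wait_for_all_checks(update_q, timeout=60), "checks did not finish in time"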
def check_worker_health(expected_count, update_q=None, notification_q=None, app=None, datastore=None):
"""
Check if the expected number of async workers are running and restart any missing ones.
Args:
expected_count: Expected number of workers
update_q, notification_q, app, datastore: Required for restarting workers
Returns:
dict: Health check results
"""
global worker_threads
current_count = get_worker_count()
# Check which workers are actually alive
alive_count = sum(1 for w in worker_threads if w.thread and w.thread.is_alive())
if alive_count == expected_count:
return {
'status': 'healthy',
'expected_count': expected_count,
'actual_count': alive_count,
'message': f'All {expected_count} async workers running'
}
# Find dead workers
dead_workers = []
for i, worker in enumerate(worker_threads[:]):
if not worker.thread or not worker.thread.is_alive():
dead_workers.append(i)
logger.warning(f"Async worker {worker.worker_id} thread is dead")
# Remove dead workers from tracking
for i in reversed(dead_workers):
if i < len(worker_threads):
worker_threads.pop(i)
missing_workers = expected_count - alive_count
restarted_count = 0
if missing_workers > 0 and all([update_q, notification_q, app, datastore]):
logger.info(f"Restarting {missing_workers} crashed async workers")
for i in range(missing_workers):
if add_worker(update_q, notification_q, app, datastore):
restarted_count += 1
return {
'status': 'repaired' if restarted_count > 0 else 'degraded',
'expected_count': expected_count,
'actual_count': alive_count,
'dead_workers': len(dead_workers),
'restarted_workers': restarted_count,
'message': f'Found {len(dead_workers)} dead workers, restarted {restarted_count}'
}
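# Example (illustrative sketch): a periodic supervisor loop built on
# check_worker_health(). The 30-second interval is an arbitrary choice.
#
#     while not app.config.exit.is_set():
#         health = check_worker_health(expected_count=n_workers,
#                                      update_q=update_q,
#                                      notification_q=notification_q,
#                                      app=app, datastore=datastore)
#         if health['status'] != 'healthy':
#             logger.warning(health['message'])
#         time.sleep(30)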