package orchestrator

import (
	"context"
	"errors"
	"fmt"
	"io"
	"slices"
	"sync"
	"time"

	v1 "github.com/garethgeorge/backrest/gen/go/v1"
	"github.com/garethgeorge/backrest/internal/config"
	"github.com/garethgeorge/backrest/internal/logstore"
	"github.com/garethgeorge/backrest/internal/metric"
	"github.com/garethgeorge/backrest/internal/oplog"
	"github.com/garethgeorge/backrest/internal/orchestrator/logging"
	"github.com/garethgeorge/backrest/internal/orchestrator/repo"
	"github.com/garethgeorge/backrest/internal/orchestrator/tasks"
	"github.com/garethgeorge/backrest/internal/queue"
	"github.com/google/uuid"
	"go.uber.org/zap"
	"google.golang.org/protobuf/proto"
)

var ErrRepoNotFound = errors.New("repo not found")
var ErrRepoInitializationFailed = errors.New("repo initialization failed")
var ErrPlanNotFound = errors.New("plan not found")

const defaultTaskLogDuration = 14 * 24 * time.Hour

// Orchestrator is responsible for managing repos and backups.
type Orchestrator struct {
	mu        sync.Mutex
	config    *v1.Config
	OpLog     *oplog.OpLog
	repoPool  *resticRepoPool
	taskQueue *queue.TimePriorityQueue[stContainer]
	logStore  *logstore.LogStore

	taskCancelMu sync.Mutex
	taskCancel   map[int64]context.CancelFunc

	// now for the purpose of testing; used by Run() to get the current time.
	now func() time.Time
}

var _ tasks.TaskExecutor = &Orchestrator{}

type stContainer struct {
	tasks.ScheduledTask
	retryCount  int // number of times this task has been retried.
	configModno int32
	callbacks   []func(error)
}

func (st stContainer) Eq(other stContainer) bool {
	return st.ScheduledTask.Eq(other.ScheduledTask)
}

func (st stContainer) Less(other stContainer) bool {
	return st.ScheduledTask.Less(other.ScheduledTask)
}

func NewOrchestrator(resticBin string, cfg *v1.Config, log *oplog.OpLog, logStore *logstore.LogStore) (*Orchestrator, error) {
	cfg = proto.Clone(cfg).(*v1.Config)

	// create the orchestrator.
	o := &Orchestrator{
		OpLog:  log,
		config: cfg,
		// repoPool created with a memory store to ensure the config is updated in an atomic operation with the repo pool's config value.
		repoPool:   newResticRepoPool(resticBin, cfg),
		taskQueue:  queue.NewTimePriorityQueue[stContainer](),
		logStore:   logStore,
		taskCancel: make(map[int64]context.CancelFunc),
	}

	// verify the operation log and mark any incomplete operations as failed.
	if log != nil { // oplog may be nil for testing.
		incompleteRepos := []string{}
		incompleteOps := []*v1.Operation{}
		toDelete := []int64{}
		startTime := time.Now()

		zap.S().Info("scrubbing operation log for incomplete operations")
		if err := log.Query(oplog.SelectAll, func(op *v1.Operation) error {
			if op.Status == v1.OperationStatus_STATUS_PENDING ||
				op.Status == v1.OperationStatus_STATUS_SYSTEM_CANCELLED ||
				op.Status == v1.OperationStatus_STATUS_USER_CANCELLED ||
				op.Status == v1.OperationStatus_STATUS_UNKNOWN {
				toDelete = append(toDelete, op.Id)
			} else if op.Status == v1.OperationStatus_STATUS_INPROGRESS {
				incompleteOps = append(incompleteOps, op)
				if !slices.Contains(incompleteRepos, op.RepoId) {
					incompleteRepos = append(incompleteRepos, op.RepoId)
				}
			}
			return nil
		}); err != nil {
			return nil, fmt.Errorf("scan oplog: %w", err)
		}

		for _, op := range incompleteOps {
			op.Status = v1.OperationStatus_STATUS_ERROR
			op.DisplayMessage = "Operation was incomplete when orchestrator was restarted."
			op.UnixTimeEndMs = op.UnixTimeStartMs
			if err := log.Update(op); err != nil {
				return nil, fmt.Errorf("update incomplete operation: %w", err)
			}
		}

		if err := log.Delete(toDelete...); err != nil {
			return nil, fmt.Errorf("delete incomplete operations: %w", err)
		}

		for _, repoId := range incompleteRepos {
			repo, err := o.GetRepoOrchestrator(repoId)
			if err != nil {
				if errors.Is(err, ErrRepoNotFound) {
					zap.L().Warn("repo not found for incomplete operation; possibly just deleted.", zap.String("repo", repoId))
				}
				return nil, fmt.Errorf("get repo %q: %w", repoId, err)
			}
			if err := repo.Unlock(context.Background()); err != nil {
				zap.L().Error("failed to unlock repo", zap.String("repo", repoId), zap.Error(err))
			}
		}

		zap.L().Info("scrubbed operation log for incomplete operations",
			zap.Duration("duration", time.Since(startTime)),
			zap.Int("incomplete_ops", len(incompleteOps)),
			zap.Int("incomplete_repos", len(incompleteRepos)),
			zap.Int("deleted_ops", len(toDelete)))
	}

	// apply starting configuration which also queues initial tasks.
	if err := o.ApplyConfig(cfg); err != nil {
		return nil, fmt.Errorf("apply initial config: %w", err)
	}

	zap.L().Info("orchestrator created")
	return o, nil
}

func (o *Orchestrator) curTime() time.Time {
	if o.now != nil {
		return o.now()
	}
	return time.Now()
}

// ApplyConfig replaces the orchestrator's configuration and reschedules the default task set.
func (o *Orchestrator) ApplyConfig(cfg *v1.Config) error {
	o.mu.Lock()
	o.config = proto.Clone(cfg).(*v1.Config)
	o.repoPool = newResticRepoPool(o.repoPool.resticPath, o.config)
	o.mu.Unlock()

	return o.ScheduleDefaultTasks(cfg)
}

// ScheduleDefaultTasks resets the task queue and schedules the default task set
// (garbage collection, scheduled backups, prunes, and checks) for the given config.
func (o *Orchestrator) ScheduleDefaultTasks(config *v1.Config) error {
	zap.L().Info("scheduling default tasks, waiting for task queue reset.")
	removedTasks := o.taskQueue.Reset()
	for _, t := range removedTasks {
		if t.Op == nil {
			continue
		}
		if err := o.OpLog.Delete(t.Op.Id); err != nil {
			zap.S().Warnf("failed to cancel pending task %d: %v", t.Op.Id, err)
		}
	}

	zap.L().Info("reset task queue, scheduling new task set", zap.String("timezone", time.Now().Location().String()))

	// Requeue tasks that are affected by the config change.
	if err := o.ScheduleTask(tasks.NewCollectGarbageTask(o.logStore), tasks.TaskPriorityDefault); err != nil {
		return fmt.Errorf("schedule collect garbage task: %w", err)
	}

	for _, plan := range config.Plans {
		// Schedule a backup task for the plan
		t := tasks.NewScheduledBackupTask(plan)
		if err := o.ScheduleTask(t, tasks.TaskPriorityDefault); err != nil {
			return fmt.Errorf("schedule backup task for plan %q: %w", plan.Id, err)
		}
	}

	for _, repo := range config.Repos {
		// Schedule a prune task for the repo
		t := tasks.NewPruneTask(repo.GetId(), tasks.PlanForSystemTasks, false)
		if err := o.ScheduleTask(t, tasks.TaskPriorityPrune); err != nil {
			return fmt.Errorf("schedule prune task for repo %q: %w", repo.GetId(), err)
		}

		// Schedule a check task for the repo
		t = tasks.NewCheckTask(repo.GetId(), tasks.PlanForSystemTasks, false)
		if err := o.ScheduleTask(t, tasks.TaskPriorityCheck); err != nil {
			return fmt.Errorf("schedule check task for repo %q: %w", repo.GetId(), err)
		}
	}

	return nil
}

func (o *Orchestrator) GetRepoOrchestrator(repoId string) (repo *repo.RepoOrchestrator, err error) {
	o.mu.Lock()
	defer o.mu.Unlock()

	r, err := o.repoPool.GetRepo(repoId)
	if err != nil {
		return nil, fmt.Errorf("get repo %q: %w", repoId, err)
	}
	return r, nil
}

func (o *Orchestrator) GetRepo(repoID string) (*v1.Repo, error) {
	o.mu.Lock()
	defer o.mu.Unlock()

	repo := config.FindRepo(o.config, repoID)
	if repo == nil {
		return nil, fmt.Errorf("get repo %q: %w", repoID, ErrRepoNotFound)
	}
	return repo, nil
}

func (o *Orchestrator) GetPlan(planID string) (*v1.Plan, error) {
	o.mu.Lock()
	defer o.mu.Unlock()

	plan := config.FindPlan(o.config, planID)
	if plan == nil {
		return nil, fmt.Errorf("get plan %q: %w", planID, ErrPlanNotFound)
	}
	return plan, nil
}

// CancelOperation cancels a queued or in-progress operation. If the operation is still
// queued it is marked with the given status and its task is rescheduled; if it is
// already running, its context is cancelled.
func (o *Orchestrator) CancelOperation(operationId int64, status v1.OperationStatus) error {
	allTasks := o.taskQueue.GetAll()
	idx := slices.IndexFunc(allTasks, func(t stContainer) bool {
		return t.Op != nil && t.Op.GetId() == operationId
	})
	if idx == -1 {
		o.taskCancelMu.Lock()
		if cancel, ok := o.taskCancel[operationId]; ok {
			cancel()
		}
		o.taskCancelMu.Unlock()
		return nil
	}

	t := allTasks[idx]
	if err := o.cancelHelper(t.Op, status); err != nil {
		return fmt.Errorf("cancel operation: %w", err)
	}
	o.taskQueue.Remove(t)

	if st, err := o.CreateUnscheduledTask(t.Task, tasks.TaskPriorityDefault, t.RunAt); err != nil {
		return fmt.Errorf("reschedule cancelled task: %w", err)
	} else if !st.Eq(tasks.NeverScheduledTask) {
		o.taskQueue.Enqueue(st.RunAt, tasks.TaskPriorityDefault, stContainer{
			ScheduledTask: st,
			configModno:   o.config.Modno,
		})
	}
	return nil
}

func (o *Orchestrator) cancelHelper(op *v1.Operation, status v1.OperationStatus) error {
	op.Status = status
	op.UnixTimeEndMs = time.Now().UnixMilli()
	if err := o.OpLog.Update(op); err != nil {
		return fmt.Errorf("update cancelled operation: %w", err)
	}
	return nil
}

// Run is the main orchestration loop. Cancel the context to stop the loop.
func (o *Orchestrator) Run(ctx context.Context) {
	zap.L().Info("starting orchestrator loop")

	for {
		if ctx.Err() != nil {
			zap.L().Info("shutting down orchestrator loop, context cancelled.")
			break
		}

		t := o.taskQueue.Dequeue(ctx)
		if t.Task == nil {
			continue
		}

		// Clone the operation in case we need to reset changes and reschedule the task for a retry.
		originalOp := proto.Clone(t.Op).(*v1.Operation)
		if t.Op != nil && t.retryCount != 0 {
			t.Op.DisplayMessage = fmt.Sprintf("running after %d retries", t.retryCount)

			// Delete any previous hook executions for this operation in case this is a retry.
			prevHookExecutionIDs := []int64{}
			if err := o.OpLog.Query(oplog.Query{FlowID: t.Op.FlowId}, func(op *v1.Operation) error {
				if hookOp, ok := op.Op.(*v1.Operation_OperationRunHook); ok && hookOp.OperationRunHook.GetParentOp() == t.Op.Id {
					prevHookExecutionIDs = append(prevHookExecutionIDs, op.Id)
				}
				return nil
			}); err != nil {
				zap.L().Error("failed to collect previous hook execution IDs", zap.Error(err))
			}
			zap.S().Debugf("deleting previous hook execution IDs: %v", prevHookExecutionIDs)
			if err := o.OpLog.Delete(prevHookExecutionIDs...); err != nil {
				zap.L().Error("failed to delete previous hook execution IDs", zap.Error(err))
			}
		}

		err := o.RunTask(ctx, t.ScheduledTask)

		o.mu.Lock()
		curCfgModno := o.config.Modno
		o.mu.Unlock()

		if t.configModno == curCfgModno {
			// Only reschedule tasks if the config hasn't changed since the task was scheduled.
			var retryErr *tasks.TaskRetryError
			if errors.As(err, &retryErr) {
				// If the task returned a retry error, schedule for a retry reusing the same task and operation data.
				t.retryCount += 1
				delay := retryErr.Backoff(t.retryCount)
				if t.Op != nil {
					t.Op = originalOp
					t.Op.DisplayMessage = fmt.Sprintf("waiting for retry, current backoff delay: %v", delay)
					t.Op.UnixTimeStartMs = t.RunAt.UnixMilli()
					if err := o.OpLog.Update(t.Op); err != nil {
						zap.S().Errorf("failed to update operation in oplog: %v", err)
					}
				}
				t.RunAt = time.Now().Add(delay)
				o.taskQueue.Enqueue(t.RunAt, tasks.TaskPriorityDefault, t)
				zap.L().Info("retrying task",
					zap.String("task", t.Task.Name()),
					zap.String("runAt", t.RunAt.Format(time.RFC3339)),
					zap.Duration("delay", delay))
				continue // skip executing the task's callbacks.
			} else if e := o.ScheduleTask(t.Task, tasks.TaskPriorityDefault); e != nil {
				// Schedule the next execution of the task
				zap.L().Error("reschedule task", zap.String("task", t.Task.Name()), zap.Error(e))
			}
		}

		for _, cb := range t.callbacks {
			go cb(err)
		}
	}
}

// RunTask executes a scheduled task immediately, streaming its log output to the log
// store and recording its final status in the oplog.
func (o *Orchestrator) RunTask(ctx context.Context, st tasks.ScheduledTask) error {
	ctx, cancel := context.WithCancel(ctx)
	defer cancel()

	zap.L().Info("running task", zap.String("task", st.Task.Name()), zap.String("runAt", st.RunAt.Format(time.RFC3339)))
	var logWriter io.WriteCloser
	op := st.Op
	if op != nil {
		var err error

		o.taskCancelMu.Lock()
		o.taskCancel[op.Id] = cancel
		o.taskCancelMu.Unlock()
		defer func() {
			o.taskCancelMu.Lock()
			delete(o.taskCancel, op.Id)
			o.taskCancelMu.Unlock()
		}()

		logID := uuid.New().String()
		logWriter, err = o.logStore.Create(logID, op.Id, defaultTaskLogDuration)
		if err != nil {
			zap.S().Errorf("failed to create live log writer: %v", err)
		}
		ctx = logging.ContextWithWriter(ctx, logWriter)

		op.Logref = logID
		op.UnixTimeStartMs = time.Now().UnixMilli()
		if op.Status == v1.OperationStatus_STATUS_PENDING || op.Status == v1.OperationStatus_STATUS_UNKNOWN {
			op.Status = v1.OperationStatus_STATUS_INPROGRESS
		}
		if op.Id != 0 {
			if err := o.OpLog.Update(op); err != nil {
				zap.S().Errorf("failed to update operation in oplog: %v", err)
			}
		} else {
			if err := o.OpLog.Add(op); err != nil {
				zap.S().Errorf("failed to add operation to oplog: %v", err)
			}
		}
	}

	start := time.Now()
	runner := newTaskRunnerImpl(o, st.Task, st.Op)
	err := st.Task.Run(ctx, st, runner)
	if err != nil {
		runner.Logger(ctx).Error("task failed", zap.Error(err), zap.Duration("duration", time.Since(start)))
	} else {
		runner.Logger(ctx).Info("task finished", zap.Duration("duration", time.Since(start)))
		metric.GetRegistry().RecordTaskRun(st.Task.RepoID(), st.Task.PlanID(), st.Task.Type(), time.Since(start).Seconds(), "success")
	}

	// write logs to log storage for this task.
	if logWriter != nil {
		if err := logWriter.Close(); err != nil {
			zap.S().Warnf("failed to close log writer for %q, logs may be partial: %v", st.Task.Name(), err)
		}
	}

	if op != nil {
		if err != nil {
			var taskCancelledError *tasks.TaskCancelledError
			var taskRetryError *tasks.TaskRetryError
			if errors.As(err, &taskCancelledError) {
				op.Status = v1.OperationStatus_STATUS_USER_CANCELLED
			} else if errors.As(err, &taskRetryError) {
				op.Status = v1.OperationStatus_STATUS_PENDING
			} else {
				op.Status = v1.OperationStatus_STATUS_ERROR
			}

			// prepend the error to the display message
			if op.DisplayMessage != "" {
				op.DisplayMessage = err.Error() + "\n\n" + op.DisplayMessage
			} else {
				op.DisplayMessage = err.Error()
			}
		}
		op.UnixTimeEndMs = time.Now().UnixMilli()
		if op.Status == v1.OperationStatus_STATUS_INPROGRESS {
			op.Status = v1.OperationStatus_STATUS_SUCCESS
		}
		if e := o.OpLog.Update(op); e != nil {
			zap.S().Errorf("failed to update operation in oplog: %v", e)
		}
	}

	return err
}

// ScheduleTask schedules a task to run at the next available time.
// note that o.mu must not be held when calling this function.
func (o *Orchestrator) ScheduleTask(t tasks.Task, priority int, callbacks ...func(error)) error {
	nextRun, err := o.CreateUnscheduledTask(t, priority, o.curTime())
	if err != nil {
		return err
	}
	if nextRun.Eq(tasks.NeverScheduledTask) {
		return nil
	}
	stc := stContainer{
		ScheduledTask: nextRun,
		configModno:   o.config.Modno,
		callbacks:     callbacks,
	}
	o.taskQueue.Enqueue(nextRun.RunAt, priority, stc)
	zap.L().Info("scheduled task", zap.String("task", t.Name()), zap.String("runAt", nextRun.RunAt.Format(time.RFC3339)))
	return nil
}

// CreateUnscheduledTask computes the next run of a task and, if it will ever run,
// records its pending operation in the oplog without enqueuing it.
func (o *Orchestrator) CreateUnscheduledTask(t tasks.Task, priority int, curTime time.Time) (tasks.ScheduledTask, error) {
	nextRun, err := t.Next(curTime, newTaskRunnerImpl(o, t, nil))
	if err != nil {
		return tasks.NeverScheduledTask, fmt.Errorf("finding run time for task %q: %w", t.Name(), err)
	}
	if nextRun.Eq(tasks.NeverScheduledTask) {
		return tasks.NeverScheduledTask, nil
	}
	nextRun.Task = t

	if nextRun.Op != nil {
		nextRun.Op.InstanceId = o.config.Instance
		nextRun.Op.PlanId = t.PlanID()
		nextRun.Op.RepoId = t.RepoID()
		nextRun.Op.Status = v1.OperationStatus_STATUS_PENDING
		nextRun.Op.UnixTimeStartMs = nextRun.RunAt.UnixMilli()

		if err := o.OpLog.Add(nextRun.Op); err != nil {
			return tasks.NeverScheduledTask, fmt.Errorf("add operation to oplog: %w", err)
		}
	}

	return nextRun, nil
}

func (o *Orchestrator) Config() *v1.Config {
	o.mu.Lock()
	defer o.mu.Unlock()
	return proto.Clone(o.config).(*v1.Config)
}

// resticRepoPool caches restic repos.
type resticRepoPool struct {
	mu         sync.Mutex
	resticPath string
	repos      map[string]*repo.RepoOrchestrator
	config     *v1.Config
}

func newResticRepoPool(resticPath string, config *v1.Config) *resticRepoPool {
	return &resticRepoPool{
		resticPath: resticPath,
		repos:      make(map[string]*repo.RepoOrchestrator),
		config:     config,
	}
}

func (rp *resticRepoPool) GetRepo(repoId string) (*repo.RepoOrchestrator, error) {
	rp.mu.Lock()
	defer rp.mu.Unlock()

	if rp.config.Repos == nil {
		return nil, ErrRepoNotFound
	}

	// Check if we already have a repo for this id, if we do return it.
	r, ok := rp.repos[repoId]
	if ok {
		return r, nil
	}

	repoProto := config.FindRepo(rp.config, repoId)
	if repoProto == nil {
		return nil, ErrRepoNotFound
	}

	// Otherwise create a new repo.
	r, err := repo.NewRepoOrchestrator(rp.config, repoProto, rp.resticPath)
	if err != nil {
		return nil, err
	}
	rp.repos[repoId] = r
	return r, nil
}
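// Usage sketch (illustrative only, not part of the package API): wiring an orchestrator
// into a daemon. The cfg, opLog, and logStore values are assumed to be constructed
// elsewhere by the caller; their constructors are not shown here.
//
//	orch, err := NewOrchestrator("/usr/bin/restic", cfg, opLog, logStore)
//	if err != nil {
//		zap.S().Fatalf("create orchestrator: %v", err)
//	}
//	ctx, cancel := context.WithCancel(context.Background())
//	defer cancel()
//	orch.Run(ctx) // blocks until ctx is cancelled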