mirror of https://github.com/garethgeorge/backrest.git, synced 2025-12-12 08:45:38 +00:00
chore: refactor task scheduler
@@ -21,23 +21,33 @@ var ErrPlanNotFound = errors.New("plan not found")
 
 // Orchestrator is responsible for managing repos and backups.
 type Orchestrator struct {
-	mu       sync.Mutex
-	config   *v1.Config
-	OpLog    *oplog.OpLog
-	repoPool *resticRepoPool
+	mu        sync.Mutex
+	config    *v1.Config
+	OpLog     *oplog.OpLog
+	repoPool  *resticRepoPool
+	taskQueue taskQueue
 
-	configUpdates chan *v1.Config // configUpdates chan makes config changes available to Run()
-	externTasks   chan Task       // externTasks is a channel that externally added tasks can be added to, they will be consumed by Run()
 	// now for the purpose of testing; used by Run() to get the current time.
 	now func() time.Time
 }
 
-func NewOrchestrator(resticBin string, cfg *v1.Config, oplog *oplog.OpLog) (*Orchestrator, error) {
-	return &Orchestrator{
+func NewOrchestrator(resticBin string, cfg *v1.Config, oplog *oplog.OpLog) *Orchestrator {
+	var o *Orchestrator
+	o = &Orchestrator{
 		config: cfg,
 		OpLog:  oplog,
 		// repoPool created with a memory store to ensure the config is updated in an atomic operation with the repo pool's config value.
-		repoPool:    newResticRepoPool(resticBin, &config.MemoryStore{Config: cfg}),
-		externTasks: make(chan Task, 2),
-	}, nil
+		repoPool: newResticRepoPool(resticBin, &config.MemoryStore{Config: cfg}),
+		taskQueue: taskQueue{
+			Now: func() time.Time {
+				if o.now != nil {
+					return o.now()
+				}
+				return time.Now()
+			},
+		},
+	}
+	return o
 }
 
 func (o *Orchestrator) ApplyConfig(cfg *v1.Config) error {
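The Task type itself is not touched by this hunk; judging from the call sites elsewhere in the diff (Name(), Next(), Run()), it needs roughly the shape sketched below. This is an inferred sketch, not the definition from the repository, and the package name is an assumption.

// Inferred sketch only: the real Task interface is defined elsewhere in backrest and
// may carry more methods; these three are the ones this diff relies on.
package orchestrator

import (
	"context"
	"time"
)

type Task interface {
	Name() string                  // identifier used in the zap log fields in this diff
	Next(now time.Time) *time.Time // next desired run time, or nil if the task is done
	Run(ctx context.Context) error // execute the task once
}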
@@ -52,9 +62,15 @@ func (o *Orchestrator) ApplyConfig(cfg *v1.Config) error {
 		return fmt.Errorf("failed to update repo pool config: %w", err)
 	}
 
-	if o.configUpdates != nil {
-		// orchestrator loop is running, notify it of the config change.
-		o.configUpdates <- cfg
+	o.taskQueue.Reset() // reset queued tasks, this may loose any ephemeral operations scheduled by RPC. Tasks in progress are not cancelled.
+
+	// Requeue tasks that are affected by the config change.
+	for _, plan := range cfg.Plans {
+		t, err := NewScheduledBackupTask(o, plan)
+		if err != nil {
+			return fmt.Errorf("schedule backup task for plan %q: %w", plan.Id, err)
+		}
+		o.ScheduleTask(t)
 	}
 
 	return nil
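The comment on taskQueue.Reset() notes that ephemeral, RPC-scheduled operations can be lost on a config change: only tasks derivable from cfg.Plans are rebuilt and re-pushed. A hypothetical one-off task, continuing the Task sketch above (same assumed package and imports), illustrates why: its schedule lives only in the queue entry, so once the queue is cleared nothing re-derives it.

// Hypothetical example, not from the repository. A one-off task reports a single run
// time and then nil, so after taskQueue.Reset() it is gone unless something re-schedules
// it; plan-backed backup tasks, by contrast, are rebuilt from cfg.Plans above.
type oneOffTask struct {
	name  string
	runAt time.Time
	done  bool
	do    func(ctx context.Context) error
}

func (t *oneOffTask) Name() string { return t.name }

// Next reports the single run time until the task has executed, then nil so the
// orchestrator will not reschedule it.
func (t *oneOffTask) Next(now time.Time) *time.Time {
	if t.done {
		return nil
	}
	return &t.runAt
}

func (t *oneOffTask) Run(ctx context.Context) error {
	t.done = true
	return t.do(ctx)
}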
@@ -66,7 +82,7 @@ func (o *Orchestrator) GetRepo(repoId string) (repo *RepoOrchestrator, err error
 
 	r, err := o.repoPool.GetRepo(repoId)
 	if err != nil {
-		return nil, fmt.Errorf("failed to get repo %q: %w", repoId, err)
+		return nil, fmt.Errorf("get repo %q: %w", repoId, err)
 	}
 	return r, nil
 }
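Only the message prefix changes in this hunk; the %w verb still wraps the underlying error, so callers that match on sentinel errors (in the style of ErrPlanNotFound from the first hunk) are unaffected. A standalone illustration with a made-up sentinel (errRepoMissing is hypothetical, not a backrest error):

// Standalone illustration of the wrapping convention used above.
package main

import (
	"errors"
	"fmt"
)

var errRepoMissing = errors.New("repo not configured")

func getRepo(id string) error {
	return fmt.Errorf("get repo %q: %w", id, errRepoMissing)
}

func main() {
	err := getRepo("local")
	fmt.Println(errors.Is(err, errRepoMissing)) // true: wrapping survives the shorter prefix
}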
@@ -89,110 +105,55 @@ func (o *Orchestrator) GetPlan(planId string) (*v1.Plan, error) {
 }
 
 // Run is the main orchestration loop. Cancel the context to stop the loop.
-func (o *Orchestrator) Run(mainCtx context.Context) error {
+func (o *Orchestrator) Run(mainCtx context.Context) {
 	zap.L().Info("starting orchestrator loop")
 
-	o.mu.Lock()
-	o.configUpdates = make(chan *v1.Config)
-	o.mu.Unlock()
-
 	for {
-		o.mu.Lock()
-		config := o.config
-		o.mu.Unlock()
-		if o.runVersion(mainCtx, config) {
-			zap.L().Info("restarting orchestrator loop")
-		} else {
-			zap.L().Info("exiting orchestrator loop, context cancelled.")
+		if mainCtx.Err() != nil {
+			zap.L().Info("shutting down orchestrator loop, context cancelled.")
 			break
 		}
-	}
-	return nil
-}
-
-// runImmutable is a helper function for Run() that runs the orchestration loop with a single version of the config.
-func (o *Orchestrator) runVersion(mainCtx context.Context, config *v1.Config) bool {
-	var lock sync.Mutex
-	ctx, cancel := context.WithCancel(mainCtx)
+
+		t := o.taskQueue.Dequeue(mainCtx)
+		if t == nil {
+			continue
+		}
 
-	var wg sync.WaitGroup
+		zap.L().Info("running task", zap.String("task", t.task.Name()))
+		if err := t.task.Run(mainCtx); err != nil {
+			zap.L().Error("task failed", zap.String("task", t.task.Name()), zap.Error(err))
+		} else {
+			zap.L().Debug("task finished", zap.String("task", t.task.Name()))
+		}
 
-	var execTask func(t Task)
-	execTask = func(t Task) {
 		curTime := time.Now()
-
-		runAt := t.Next(curTime)
-		if runAt == nil {
-			zap.L().Debug("task has no next run, not scheduling.", zap.String("task", t.Name()))
-			return
+		if o.now != nil {
+			curTime = o.now()
 		}
 
-		timer := time.NewTimer(runAt.Sub(curTime))
-		zap.L().Info("scheduling task", zap.String("task", t.Name()), zap.String("runAt", runAt.Format(time.RFC3339)))
-
-		wg.Add(1)
-		go func() {
-			defer wg.Done()
-			select {
-			case <-ctx.Done():
-				if !timer.Stop() {
-					<-timer.C
-				}
-				zap.L().Debug("cancelled scheduled (but not running) task, orchestrator context is cancelled.", zap.String("task", t.Name()))
-				return
-			case <-timer.C:
-				lock.Lock()
-				defer lock.Unlock()
-				zap.L().Info("running task", zap.String("task", t.Name()))
-
-				// Task execution runs with mainCtx meaning config changes do not interrupt it, but cancelling the orchestration loop will.
-				if err := t.Run(mainCtx); err != nil {
-					zap.L().Error("task failed", zap.String("task", t.Name()), zap.Error(err))
-				} else {
-					zap.L().Debug("task finished", zap.String("task", t.Name()))
-				}
-
-				if ctx.Err() != nil {
-					zap.L().Debug("not attempting to reschedule task, orchestrator context is cancelled.", zap.String("task", t.Name()))
-					return
-				}
-
-				execTask(t)
-			}
-		}()
-	}
-
-	// Schedule all backup tasks.
-	for _, plan := range config.Plans {
-		t, err := NewScheduledBackupTask(o, plan)
-		if err != nil {
-			zap.L().Error("failed to create backup task for plan", zap.String("plan", plan.Id), zap.Error(err))
-		}
-
-		execTask(t)
-	}
-
-	// wait for either an error or the context to be cancelled, then wait for all tasks.
-	for {
-		select {
-		case t := <-o.externTasks:
-			execTask(t)
-		case <-mainCtx.Done():
-			zap.L().Info("orchestrator context cancelled, shutting down orchestrator")
-			cancel()
-			wg.Wait()
-			return false
-		case <-o.configUpdates:
-			zap.L().Info("orchestrator received config change, waiting for in-progress operations then restarting")
-			cancel()
-			wg.Wait()
-			return true
+		if nextTime := t.task.Next(curTime); nextTime != nil {
+			o.taskQueue.Push(scheduledTask{
+				task:  t.task,
+				runAt: *nextTime,
+			})
 		}
 	}
 }
 
-func (o *Orchestrator) EnqueueTask(t Task) {
-	o.externTasks <- t
+func (o *Orchestrator) ScheduleTask(t Task) {
+	curTime := time.Now()
+	if o.now != nil {
+		curTime = o.now()
+	}
+	nextRun := t.Next(curTime)
+	if nextRun == nil {
+		return
+	}
+	zap.L().Info("scheduling task", zap.String("task", t.Name()), zap.String("runAt", nextRun.Format(time.RFC3339)))
+	o.taskQueue.Push(scheduledTask{
+		task:  t,
+		runAt: *nextRun,
+	})
 }
 
 // resticRepoPool caches restic repos.
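The taskQueue implementation is not part of this diff. From the call sites above (Push, a blocking Dequeue that honours a context, Reset, and an injectable Now wired up in NewOrchestrator for tests), it needs roughly the shape below. This is a simplified, single-consumer sketch under those assumptions; the names scheduledTask, tasks, and notify are inferred or invented here, and the real implementation likely uses a heap rather than re-sorting a slice on every Push.

// Simplified sketch of a time-ordered queue satisfying the call sites in this diff.
// Assumptions: single consumer (Run above), the Task interface sketched earlier, and
// that backrest's actual scheduledTask/taskQueue may differ in detail.
package orchestrator

import (
	"context"
	"sort"
	"sync"
	"time"
)

type scheduledTask struct {
	task  Task
	runAt time.Time
}

type taskQueue struct {
	Now func() time.Time // injectable clock for tests; nil means time.Now

	mu     sync.Mutex
	tasks  []scheduledTask
	notify chan struct{} // signalled whenever the queue contents change
}

func (q *taskQueue) now() time.Time {
	if q.Now != nil {
		return q.Now()
	}
	return time.Now()
}

// Push inserts a task and keeps the slice ordered by runAt.
func (q *taskQueue) Push(t scheduledTask) {
	q.mu.Lock()
	defer q.mu.Unlock()
	q.tasks = append(q.tasks, t)
	sort.Slice(q.tasks, func(i, j int) bool { return q.tasks[i].runAt.Before(q.tasks[j].runAt) })
	q.wakeLocked()
}

// Reset drops every queued (but not yet running) task.
func (q *taskQueue) Reset() {
	q.mu.Lock()
	defer q.mu.Unlock()
	q.tasks = nil
	q.wakeLocked()
}

// wakeLocked nudges a blocked Dequeue; the caller must hold q.mu.
func (q *taskQueue) wakeLocked() {
	if q.notify != nil {
		select {
		case q.notify <- struct{}{}:
		default: // a wakeup is already pending
		}
	}
}

// Dequeue blocks until the earliest task is due and returns it, or returns nil once
// ctx is cancelled.
func (q *taskQueue) Dequeue(ctx context.Context) *scheduledTask {
	q.mu.Lock()
	if q.notify == nil {
		q.notify = make(chan struct{}, 1)
	}
	q.mu.Unlock()

	for {
		q.mu.Lock()
		wait := time.Hour // arbitrary re-check interval while the queue is empty
		if len(q.tasks) > 0 {
			wait = q.tasks[0].runAt.Sub(q.now())
			if wait <= 0 {
				t := q.tasks[0]
				q.tasks = q.tasks[1:]
				q.mu.Unlock()
				return &t
			}
		}
		q.mu.Unlock()

		timer := time.NewTimer(wait)
		select {
		case <-ctx.Done():
			timer.Stop()
			return nil
		case <-q.notify: // queue changed; recompute the wait
			timer.Stop()
		case <-timer.C:
		}
	}
}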