chore: refactor task scheduler

This commit is contained in:
garethgeorge
2023-11-28 19:57:50 -08:00
parent 4957496787
commit 1b67e2b200
6 changed files with 479 additions and 117 deletions

View File

@@ -21,23 +21,33 @@ var ErrPlanNotFound = errors.New("plan not found")
// Orchestrator is responsible for managing repos and backups.
type Orchestrator struct {
mu sync.Mutex
config *v1.Config
OpLog *oplog.OpLog
repoPool *resticRepoPool
mu sync.Mutex
config *v1.Config
OpLog *oplog.OpLog
repoPool *resticRepoPool
taskQueue taskQueue
configUpdates chan *v1.Config // configUpdates chan makes config changes available to Run()
externTasks chan Task // externTasks is a channel that externally added tasks can be added to, they will be consumed by Run()
// now overrides the clock for testing; when set, Run() uses it to get the current time.
now func() time.Time
}
func NewOrchestrator(resticBin string, cfg *v1.Config, oplog *oplog.OpLog) (*Orchestrator, error) {
return &Orchestrator{
func NewOrchestrator(resticBin string, cfg *v1.Config, oplog *oplog.OpLog) *Orchestrator {
var o *Orchestrator
o = &Orchestrator{
config: cfg,
OpLog: oplog,
// repoPool created with a memory store to ensure the config is updated in an atomic operation with the repo pool's config value.
repoPool: newResticRepoPool(resticBin, &config.MemoryStore{Config: cfg}),
externTasks: make(chan Task, 2),
}, nil
repoPool: newResticRepoPool(resticBin, &config.MemoryStore{Config: cfg}),
taskQueue: taskQueue{
Now: func() time.Time {
if o.now != nil {
return o.now()
}
return time.Now()
},
},
}
return o
}
func (o *Orchestrator) ApplyConfig(cfg *v1.Config) error {
@@ -52,9 +62,15 @@ func (o *Orchestrator) ApplyConfig(cfg *v1.Config) error {
return fmt.Errorf("failed to update repo pool config: %w", err)
}
if o.configUpdates != nil {
// orchestrator loop is running, notify it of the config change.
o.configUpdates <- cfg
o.taskQueue.Reset() // reset queued tasks; this may lose any ephemeral operations scheduled by RPC. Tasks in progress are not cancelled.
// Requeue tasks that are affected by the config change.
for _, plan := range cfg.Plans {
t, err := NewScheduledBackupTask(o, plan)
if err != nil {
return fmt.Errorf("schedule backup task for plan %q: %w", plan.Id, err)
}
o.ScheduleTask(t)
}
return nil
@@ -66,7 +82,7 @@ func (o *Orchestrator) GetRepo(repoId string) (repo *RepoOrchestrator, err error
r, err := o.repoPool.GetRepo(repoId)
if err != nil {
return nil, fmt.Errorf("failed to get repo %q: %w", repoId, err)
return nil, fmt.Errorf("get repo %q: %w", repoId, err)
}
return r, nil
}
@@ -89,110 +105,55 @@ func (o *Orchestrator) GetPlan(planId string) (*v1.Plan, error) {
}
// Run is the main orchestration loop. Cancel the context to stop the loop.
func (o *Orchestrator) Run(mainCtx context.Context) error {
func (o *Orchestrator) Run(mainCtx context.Context) {
zap.L().Info("starting orchestrator loop")
o.mu.Lock()
o.configUpdates = make(chan *v1.Config)
o.mu.Unlock()
for {
o.mu.Lock()
config := o.config
o.mu.Unlock()
if o.runVersion(mainCtx, config) {
zap.L().Info("restarting orchestrator loop")
} else {
zap.L().Info("exiting orchestrator loop, context cancelled.")
if mainCtx.Err() != nil {
zap.L().Info("shutting down orchestrator loop, context cancelled.")
break
}
}
return nil
}
// runVersion is a helper function for Run() that runs the orchestration loop with a single version of the config.
func (o *Orchestrator) runVersion(mainCtx context.Context, config *v1.Config) bool {
var lock sync.Mutex
ctx, cancel := context.WithCancel(mainCtx)
t := o.taskQueue.Dequeue(mainCtx)
if t == nil {
continue
}
var wg sync.WaitGroup
zap.L().Info("running task", zap.String("task", t.task.Name()))
if err := t.task.Run(mainCtx); err != nil {
zap.L().Error("task failed", zap.String("task", t.task.Name()), zap.Error(err))
} else {
zap.L().Debug("task finished", zap.String("task", t.task.Name()))
}
var execTask func(t Task)
execTask = func(t Task) {
curTime := time.Now()
runAt := t.Next(curTime)
if runAt == nil {
zap.L().Debug("task has no next run, not scheduling.", zap.String("task", t.Name()))
return
if o.now != nil {
curTime = o.now()
}
timer := time.NewTimer(runAt.Sub(curTime))
zap.L().Info("scheduling task", zap.String("task", t.Name()), zap.String("runAt", runAt.Format(time.RFC3339)))
wg.Add(1)
go func() {
defer wg.Done()
select {
case <-ctx.Done():
if !timer.Stop() {
<-timer.C
}
zap.L().Debug("cancelled scheduled (but not running) task, orchestrator context is cancelled.", zap.String("task", t.Name()))
return
case <-timer.C:
lock.Lock()
defer lock.Unlock()
zap.L().Info("running task", zap.String("task", t.Name()))
// Task execution runs with mainCtx meaning config changes do not interrupt it, but cancelling the orchestration loop will.
if err := t.Run(mainCtx); err != nil {
zap.L().Error("task failed", zap.String("task", t.Name()), zap.Error(err))
} else {
zap.L().Debug("task finished", zap.String("task", t.Name()))
}
if ctx.Err() != nil {
zap.L().Debug("not attempting to reschedule task, orchestrator context is cancelled.", zap.String("task", t.Name()))
return
}
execTask(t)
}
}()
}
// Schedule all backup tasks.
for _, plan := range config.Plans {
t, err := NewScheduledBackupTask(o, plan)
if err != nil {
zap.L().Error("failed to create backup task for plan", zap.String("plan", plan.Id), zap.Error(err))
}
execTask(t)
}
// wait for either an error or the context to be cancelled, then wait for all tasks.
for {
select {
case t := <-o.externTasks:
execTask(t)
case <-mainCtx.Done():
zap.L().Info("orchestrator context cancelled, shutting down orchestrator")
cancel()
wg.Wait()
return false
case <-o.configUpdates:
zap.L().Info("orchestrator received config change, waiting for in-progress operations then restarting")
cancel()
wg.Wait()
return true
if nextTime := t.task.Next(curTime); nextTime != nil {
o.taskQueue.Push(scheduledTask{
task: t.task,
runAt: *nextTime,
})
}
}
}
func (o *Orchestrator) EnqueueTask(t Task) {
o.externTasks <- t
// ScheduleTask asks t for its next run time and, if there is one, pushes the
// task onto the orchestrator's task queue to run at that time.
//
// The current time comes from o.now when it is set and from time.Now otherwise.
// A task whose Next returns nil is dropped without being queued.
func (o *Orchestrator) ScheduleTask(t Task) {
	clock := time.Now
	if o.now != nil {
		clock = o.now
	}

	runAt := t.Next(clock())
	if runAt == nil {
		// Nothing to schedule: the task reports no upcoming run.
		return
	}

	zap.L().Info(
		"scheduling task",
		zap.String("task", t.Name()),
		zap.String("runAt", runAt.Format(time.RFC3339)),
	)
	o.taskQueue.Push(scheduledTask{task: t, runAt: *runAt})
}
// resticRepoPool caches restic repos.