feat: support restic check operation (#303)

Author: Gareth
Committed by: GitHub
Date: 2024-05-27 10:26:18 -07:00
Parent: 5a51ae7c20
Commit: ce42f68d0d
43 changed files with 1807 additions and 989 deletions

View File

@@ -166,9 +166,15 @@ func (o *Orchestrator) ScheduleDefaultTasks(config *v1.Config) error {
for _, repo := range config.Repos {
// Schedule a prune task for the repo
t := tasks.NewPruneTask(repo.GetId(), tasks.PlanForSystemTasks, false)
if err := o.ScheduleTask(t, tasks.TaskPriorityDefault); err != nil {
if err := o.ScheduleTask(t, tasks.TaskPriorityPrune); err != nil {
return fmt.Errorf("schedule prune task for repo %q: %w", repo.GetId(), err)
}
// Schedule a check task for the repo
t = tasks.NewCheckTask(repo.GetId(), tasks.PlanForSystemTasks, false)
if err := o.ScheduleTask(t, tasks.TaskPriorityCheck); err != nil {
return fmt.Errorf("schedule check task for repo %q: %w", repo.GetId(), err)
}
}
return nil

View File

@@ -263,6 +263,32 @@ func (r *RepoOrchestrator) Prune(ctx context.Context, output io.Writer) error {
return nil
}
func (r *RepoOrchestrator) Check(ctx context.Context, output io.Writer) error {
r.mu.Lock()
defer r.mu.Unlock()
ctx, flush := forwardResticLogs(ctx)
defer flush()
var opts []restic.GenericOption
if r.repoConfig.CheckPolicy != nil {
switch m := r.repoConfig.CheckPolicy.Mode.(type) {
case *v1.CheckPolicy_ReadDataSubsetPercent:
if m.ReadDataSubsetPercent > 0 {
opts = append(opts, restic.WithFlags(fmt.Sprintf("--read-data-subset=%v%%", m.ReadDataSubsetPercent)))
}
case *v1.CheckPolicy_StructureOnly:
default:
}
}
r.l.Debug("checking repo")
err := r.repo.Check(ctx, output, opts...)
if err != nil {
return fmt.Errorf("check repo %v: %w", r.repoConfig.Id, err)
}
return nil
}
func (r *RepoOrchestrator) Restore(ctx context.Context, snapshotId string, path string, target string, progressCallback func(event *v1.RestoreProgressEntry)) (*v1.RestoreProgressEntry, error) {
r.mu.Lock()
defer r.mu.Unlock()
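The new Check method maps the repo's CheckPolicy onto restic flags before delegating to the underlying restic wrapper: structure-only mode adds no flags, while a read-data-subset policy adds --read-data-subset=N%. For readers who want to reproduce the effect outside backrest, a rough CLI equivalent is sketched below; the repository path and password are placeholders, and backrest's own wrapper and environment handling are not shown:

package main

import (
	"context"
	"os"
	"os/exec"
)

func main() {
	// Rough equivalent of CheckPolicy{ReadDataSubsetPercent: 50}: verify the
	// repository structure and additionally re-read 50% of the pack data.
	cmd := exec.CommandContext(context.Background(),
		"restic", "-r", "/path/to/repo", "check", "--read-data-subset=50%")
	cmd.Env = append(os.Environ(), "RESTIC_PASSWORD=placeholder")
	cmd.Stdout = os.Stdout
	cmd.Stderr = os.Stderr
	if err := cmd.Run(); err != nil {
		panic(err)
	}
}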

View File

@@ -1,6 +1,7 @@
package repo
import (
"bytes"
"context"
"os"
"slices"
@@ -189,3 +190,59 @@ func TestEnvVarPropagation(t *testing.T) {
t.Fatal("expected snapshot id")
}
}
func TestCheck(t *testing.T) {
t.Parallel()
tcs := []struct {
name string
repo *v1.Repo
}{
{
name: "check structure",
repo: &v1.Repo{
Id: "test",
Uri: t.TempDir(),
Password: "test",
CheckPolicy: &v1.CheckPolicy{
Mode: nil,
},
},
},
{
name: "read data percent",
repo: &v1.Repo{
Id: "test",
Uri: t.TempDir(),
Password: "test",
CheckPolicy: &v1.CheckPolicy{
Mode: &v1.CheckPolicy_ReadDataSubsetPercent{
ReadDataSubsetPercent: 50,
},
},
},
},
}
for _, tc := range tcs {
t.Run(tc.name, func(t *testing.T) {
orchestrator, err := NewRepoOrchestrator(configForTest, tc.repo, helpers.ResticBinary(t))
if err != nil {
t.Fatalf("failed to create repo orchestrator: %v", err)
}
buf := bytes.NewBuffer(nil)
err = orchestrator.Init(context.Background())
if err != nil {
t.Fatalf("init error: %v", err)
}
err = orchestrator.Check(context.Background(), buf)
if err != nil {
t.Errorf("check error: %v", err)
}
t.Logf("check output: %s", buf.String())
})
}
}

View File

@@ -84,11 +84,7 @@ func (t *taskRunnerImpl) ExecuteHooks(events []v1.Hook_Condition, vars hook.Hook
}
}
if planID != "" {
var err error
plan, err = t.FindPlan()
if err != nil {
return err
}
plan, _ = t.FindPlan()
}
var flowID int64
if t.op != nil {

View File

@@ -17,12 +17,13 @@ const (
PlanForUnassociatedOperations = "_unassociated_"
PlanForSystemTasks = "_system_" // plan for system tasks e.g. garbage collection, prune, stats, etc.
TaskPriorityStats = -1
TaskPriorityDefault = 0
TaskPriorityInteractive = 1 << 1
TaskPriorityStats = 0
TaskPriorityDefault = 1 << 1 // default priority
TaskPriorityForget = 1 << 2
TaskPriorityIndexSnapshots = 1 << 3
TaskPriorityPrune = 1 << 4
TaskPriorityCheck = 1 << 4 // check should always run after prune.
TaskPriorityPrune = 1 << 5
TaskPriorityInteractive = 1 << 6 // highest priority
)
// TaskRunner is an interface for running tasks. It is used by tasks to create operations and write logs.
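The constant shuffle is deliberate: when several system tasks become runnable at the same time, the scheduler is expected to dequeue the higher value first, so prune (1 << 5) runs before check (1 << 4), stats (0) runs last, and interactive work (1 << 6) preempts everything. A standalone sketch of that ordering rule follows; the queue type here is an illustration, not backrest's actual task queue:

package main

import (
	"fmt"
	"sort"
	"time"
)

type queuedTask struct {
	name     string
	runAt    time.Time
	priority int
}

// orderForRun sorts tasks so the earliest runAt goes first; among tasks due at
// the same instant, the higher priority value wins.
func orderForRun(q []queuedTask) {
	sort.Slice(q, func(i, j int) bool {
		if q[i].runAt.Equal(q[j].runAt) {
			return q[i].priority > q[j].priority
		}
		return q[i].runAt.Before(q[j].runAt)
	})
}

func main() {
	now := time.Now()
	q := []queuedTask{
		{"check", now, 1 << 4},
		{"prune", now, 1 << 5},
		{"stats", now, 0},
	}
	orderForRun(q)
	fmt.Println(q[0].name, q[1].name, q[2].name) // prune check stats
}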

View File

@@ -0,0 +1,185 @@
package tasks
import (
"context"
"errors"
"fmt"
"sync"
"time"
v1 "github.com/garethgeorge/backrest/gen/go/v1"
"github.com/garethgeorge/backrest/internal/hook"
"github.com/garethgeorge/backrest/internal/ioutil"
"github.com/garethgeorge/backrest/internal/oplog"
"github.com/garethgeorge/backrest/internal/oplog/indexutil"
"github.com/garethgeorge/backrest/internal/protoutil"
"go.uber.org/zap"
)
type CheckTask struct {
BaseTask
force bool
didRun bool
}
func NewCheckTask(repoID, planID string, force bool) Task {
return &CheckTask{
BaseTask: BaseTask{
TaskName: fmt.Sprintf("check repo %q", repoID),
TaskRepoID: repoID,
TaskPlanID: planID,
},
force: force,
}
}
func (t *CheckTask) Next(now time.Time, runner TaskRunner) (ScheduledTask, error) {
if t.force {
if t.didRun {
return NeverScheduledTask, nil
}
t.didRun = true
return ScheduledTask{
Task: t,
RunAt: now,
Op: &v1.Operation{
Op: &v1.Operation_OperationCheck{},
},
}, nil
}
repo, err := runner.GetRepo(t.RepoID())
if err != nil {
return ScheduledTask{}, fmt.Errorf("get repo %v: %w", t.RepoID(), err)
}
if repo.CheckPolicy.GetSchedule() == nil {
return NeverScheduledTask, nil
}
var lastRan time.Time
var foundBackup bool
if err := runner.OpLog().ForEach(oplog.Query{RepoId: t.RepoID()}, indexutil.Reversed(indexutil.CollectAll()), func(op *v1.Operation) error {
if _, ok := op.Op.(*v1.Operation_OperationCheck); ok {
lastRan = time.Unix(0, op.UnixTimeEndMs*int64(time.Millisecond))
return oplog.ErrStopIteration
}
if _, ok := op.Op.(*v1.Operation_OperationBackup); ok {
foundBackup = true
}
return nil
}); err != nil {
return NeverScheduledTask, fmt.Errorf("finding last check run time: %w", err)
} else if !foundBackup {
return NeverScheduledTask, nil
}
zap.L().Debug("last check time", zap.Time("time", lastRan), zap.String("repo", t.RepoID()))
runAt, err := protoutil.ResolveSchedule(repo.CheckPolicy.GetSchedule(), lastRan)
if errors.Is(err, protoutil.ErrScheduleDisabled) {
return NeverScheduledTask, nil
} else if err != nil {
return NeverScheduledTask, fmt.Errorf("resolve schedule: %w", err)
}
return ScheduledTask{
Task: t,
RunAt: runAt,
Op: &v1.Operation{
Op: &v1.Operation_OperationCheck{},
},
}, nil
}
func (t *CheckTask) Run(ctx context.Context, st ScheduledTask, runner TaskRunner) error {
op := st.Op
repo, err := runner.GetRepoOrchestrator(t.RepoID())
if err != nil {
return fmt.Errorf("couldn't get repo %q: %w", t.RepoID(), err)
}
if err := runner.ExecuteHooks([]v1.Hook_Condition{
v1.Hook_CONDITION_CHECK_START,
}, hook.HookVars{}); err != nil {
// TODO: generalize this logic
op.DisplayMessage = err.Error()
var cancelErr *hook.HookErrorRequestCancel
if errors.As(err, &cancelErr) {
op.Status = v1.OperationStatus_STATUS_USER_CANCELLED // user visible cancelled status
return nil
}
op.Status = v1.OperationStatus_STATUS_ERROR
return fmt.Errorf("execute check start hooks: %w", err)
}
err = repo.UnlockIfAutoEnabled(ctx)
if err != nil {
return fmt.Errorf("auto unlock repo %q: %w", t.RepoID(), err)
}
opCheck := &v1.Operation_OperationCheck{
OperationCheck: &v1.OperationCheck{},
}
op.Op = opCheck
ctx, cancel := context.WithCancel(ctx)
interval := time.NewTicker(1 * time.Second)
defer interval.Stop()
buf := ioutil.HeadWriter{Limit: 16 * 1024}
bufWriter := ioutil.SynchronizedWriter{W: &buf}
var wg sync.WaitGroup
wg.Add(1)
go func() {
defer wg.Done()
for {
select {
case <-interval.C:
bufWriter.Mu.Lock()
output := string(buf.Bytes())
bufWriter.Mu.Unlock()
if opCheck.OperationCheck.Output != string(output) {
opCheck.OperationCheck.Output = string(output)
if err := runner.OpLog().Update(op); err != nil {
zap.L().Error("update check operation with status output", zap.Error(err))
}
}
case <-ctx.Done():
return
}
}
}()
if err := repo.Check(ctx, &bufWriter); err != nil {
cancel()
runner.ExecuteHooks([]v1.Hook_Condition{
v1.Hook_CONDITION_CHECK_ERROR,
v1.Hook_CONDITION_ANY_ERROR,
}, hook.HookVars{
Error: err.Error(),
})
return fmt.Errorf("prune: %w", err)
}
cancel()
wg.Wait()
opCheck.OperationCheck.Output = string(buf.Bytes())
// Run a stats task after a successful check
if err := runner.ScheduleTask(NewStatsTask(t.RepoID(), PlanForSystemTasks, false), TaskPriorityStats); err != nil {
zap.L().Error("schedule stats task", zap.Error(err))
}
if err := runner.ExecuteHooks([]v1.Hook_Condition{
v1.Hook_CONDITION_CHECK_SUCCESS,
}, hook.HookVars{}); err != nil {
return fmt.Errorf("execute prune success hooks: %w", err)
}
return nil
}
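Run streams restic's check output into ioutil.HeadWriter (which keeps only the first 16 KiB) wrapped in ioutil.SynchronizedWriter (which lets the one-second status ticker read a consistent snapshot while restic is still writing). Neither type appears in this diff; the sketch below shows the pattern they are assumed to follow, with simplified lower-case names:

package ioutilsketch

import (
	"io"
	"sync"
)

// headWriter keeps at most limit bytes and silently drops the rest, so an
// unbounded restic log cannot bloat the stored operation.
type headWriter struct {
	limit int
	buf   []byte
}

func (w *headWriter) Write(p []byte) (int, error) {
	n := len(p)
	if remaining := w.limit - len(w.buf); remaining > 0 {
		if len(p) > remaining {
			p = p[:remaining]
		}
		w.buf = append(w.buf, p...)
	}
	return n, nil // report the full length so the producer keeps running
}

func (w *headWriter) Bytes() []byte { return w.buf }

// syncWriter serializes writes; a reader that holds the same mutex (as the
// ticker goroutine does with bufWriter.Mu above) sees a consistent snapshot.
type syncWriter struct {
	mu sync.Mutex
	w  io.Writer
}

func (w *syncWriter) Write(p []byte) (int, error) {
	w.mu.Lock()
	defer w.mu.Unlock()
	return w.w.Write(p)
}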

View File

@@ -11,7 +11,7 @@ import (
)
const (
gcStartupDelay = 5 * time.Second
gcStartupDelay = 60 * time.Second
gcInterval = 24 * time.Hour
// keep operations that are eligible for gc for 30 days OR up to a limit of 100 for any one plan.
// an operation is eligible for gc if:

View File

@@ -1,7 +1,6 @@
package tasks
import (
"bytes"
"context"
"errors"
"fmt"
@@ -59,14 +58,20 @@ func (t *PruneTask) Next(now time.Time, runner TaskRunner) (ScheduledTask, error
}
var lastRan time.Time
var foundBackup bool
if err := runner.OpLog().ForEach(oplog.Query{RepoId: t.RepoID()}, indexutil.Reversed(indexutil.CollectAll()), func(op *v1.Operation) error {
if _, ok := op.Op.(*v1.Operation_OperationPrune); ok {
lastRan = time.Unix(0, op.UnixTimeEndMs*int64(time.Millisecond))
return oplog.ErrStopIteration
}
if _, ok := op.Op.(*v1.Operation_OperationBackup); ok {
foundBackup = true
}
return nil
}); err != nil {
return NeverScheduledTask, fmt.Errorf("finding last backup run time: %w", err)
return NeverScheduledTask, fmt.Errorf("finding last prune run time: %w", err)
} else if !foundBackup {
return NeverScheduledTask, nil
}
zap.L().Debug("last prune time", zap.Time("time", lastRan), zap.String("repo", t.RepoID()))
@@ -95,6 +100,20 @@ func (t *PruneTask) Run(ctx context.Context, st ScheduledTask, runner TaskRunner
return fmt.Errorf("couldn't get repo %q: %w", t.RepoID(), err)
}
if err := runner.ExecuteHooks([]v1.Hook_Condition{
v1.Hook_CONDITION_PRUNE_START,
}, hook.HookVars{}); err != nil {
op.DisplayMessage = err.Error()
// TODO: generalize this logic
var cancelErr *hook.HookErrorRequestCancel
if errors.As(err, &cancelErr) {
op.Status = v1.OperationStatus_STATUS_USER_CANCELLED // user visible cancelled status
return nil
}
op.Status = v1.OperationStatus_STATUS_ERROR
return fmt.Errorf("execute prune start hooks: %w", err)
}
err = repo.UnlockIfAutoEnabled(ctx)
if err != nil {
return fmt.Errorf("auto unlock repo %q: %w", t.RepoID(), err)
@@ -108,7 +127,7 @@ func (t *PruneTask) Run(ctx context.Context, st ScheduledTask, runner TaskRunner
ctx, cancel := context.WithCancel(ctx)
interval := time.NewTicker(1 * time.Second)
defer interval.Stop()
var buf bytes.Buffer
buf := ioutil.HeadWriter{Limit: 16 * 1024}
bufWriter := ioutil.SynchronizedWriter{W: &buf}
var wg sync.WaitGroup
wg.Add(1)
@@ -118,14 +137,11 @@ func (t *PruneTask) Run(ctx context.Context, st ScheduledTask, runner TaskRunner
select {
case <-interval.C:
bufWriter.Mu.Lock()
output := buf.String()
output := string(buf.Bytes())
bufWriter.Mu.Unlock()
if len(output) > 8*1024 { // only provide live status upto the first 8K of output.
output = output[:len(output)-8*1024]
}
if opPrune.OperationPrune.Output != output {
opPrune.OperationPrune.Output = buf.String()
if opPrune.OperationPrune.Output != string(output) {
opPrune.OperationPrune.Output = string(output)
if err := runner.OpLog().Update(op); err != nil {
zap.L().Error("update prune operation with status output", zap.Error(err))
@@ -151,17 +167,18 @@ func (t *PruneTask) Run(ctx context.Context, st ScheduledTask, runner TaskRunner
cancel()
wg.Wait()
output := buf.String()
if len(output) > 8*1024 { // only save the first 4K of output.
output = output[:len(output)-8*1024]
}
opPrune.OperationPrune.Output = output
opPrune.OperationPrune.Output = string(buf.Bytes())
// Run a stats task after a successful prune
if err := runner.ScheduleTask(NewStatsTask(t.RepoID(), PlanForSystemTasks, false), TaskPriorityStats); err != nil {
zap.L().Error("schedule stats task", zap.Error(err))
}
if err := runner.ExecuteHooks([]v1.Hook_Condition{
v1.Hook_CONDITION_PRUNE_SUCCESS,
}, hook.HookVars{}); err != nil {
return fmt.Errorf("execute prune end hooks: %w", err)
}
return nil
}
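Both the new CONDITION_CHECK_START hook and the CONDITION_PRUNE_START hook added above share one cancellation convention: a start hook can abort the operation by returning an error that unwraps to hook.HookErrorRequestCancel, which the task records as STATUS_USER_CANCELLED rather than STATUS_ERROR. A minimal illustration of that errors.As unwrapping, using a stand-in error type rather than the project's:

package main

import (
	"errors"
	"fmt"
)

// requestCancel stands in for hook.HookErrorRequestCancel.
type requestCancel struct{ reason string }

func (e *requestCancel) Error() string { return "hook requested cancel: " + e.reason }

// classify mirrors the status decision made in CheckTask.Run and PruneTask.Run.
func classify(err error) string {
	var cancelErr *requestCancel
	if errors.As(err, &cancelErr) {
		return "STATUS_USER_CANCELLED"
	}
	return "STATUS_ERROR"
}

func main() {
	wrapped := fmt.Errorf("execute check start hooks: %w", &requestCancel{reason: "maintenance window"})
	fmt.Println(classify(wrapped)) // STATUS_USER_CANCELLED
}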