feat: support restic check operation (#303)

Author: Gareth
Committed by: GitHub
Date: 2024-05-27 10:26:18 -07:00
Parent: 5a51ae7c20
Commit: ce42f68d0d
43 changed files with 1807 additions and 989 deletions

View File

@@ -166,9 +166,15 @@ func (o *Orchestrator) ScheduleDefaultTasks(config *v1.Config) error {
for _, repo := range config.Repos {
// Schedule a prune task for the repo
t := tasks.NewPruneTask(repo.GetId(), tasks.PlanForSystemTasks, false)
if err := o.ScheduleTask(t, tasks.TaskPriorityDefault); err != nil {
if err := o.ScheduleTask(t, tasks.TaskPriorityPrune); err != nil {
return fmt.Errorf("schedule prune task for repo %q: %w", repo.GetId(), err)
}
// Schedule a check task for the repo
t = tasks.NewCheckTask(repo.GetId(), tasks.PlanForSystemTasks, false)
if err := o.ScheduleTask(t, tasks.TaskPriorityCheck); err != nil {
return fmt.Errorf("schedule check task for repo %q: %w", repo.GetId(), err)
}
}
return nil

View File

@@ -263,6 +263,32 @@ func (r *RepoOrchestrator) Prune(ctx context.Context, output io.Writer) error {
return nil
}
func (r *RepoOrchestrator) Check(ctx context.Context, output io.Writer) error {
r.mu.Lock()
defer r.mu.Unlock()
ctx, flush := forwardResticLogs(ctx)
defer flush()
var opts []restic.GenericOption
if r.repoConfig.CheckPolicy != nil {
switch m := r.repoConfig.CheckPolicy.Mode.(type) {
case *v1.CheckPolicy_ReadDataSubsetPercent:
if m.ReadDataSubsetPercent > 0 {
opts = append(opts, restic.WithFlags(fmt.Sprintf("--read-data-subset=%v%%", m.ReadDataSubsetPercent)))
}
case *v1.CheckPolicy_StructureOnly:
default:
}
}
r.l.Debug("checking repo")
err := r.repo.Check(ctx, output, opts...)
if err != nil {
return fmt.Errorf("check repo %v: %w", r.repoConfig.Id, err)
}
return nil
}
func (r *RepoOrchestrator) Restore(ctx context.Context, snapshotId string, path string, target string, progressCallback func(event *v1.RestoreProgressEntry)) (*v1.RestoreProgressEntry, error) {
r.mu.Lock()
defer r.mu.Unlock()
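The new Check method maps the repo's CheckPolicy onto restic flags before delegating to the underlying restic wrapper: structure-only mode adds no flags, while a read-data-subset policy adds --read-data-subset=N%. For readers who want to reproduce the effect outside backrest, a rough CLI equivalent is sketched below; the repository path and password are placeholders, and backrest's own wrapper and environment handling are not shown:

package main

import (
	"context"
	"os"
	"os/exec"
)

func main() {
	// Rough equivalent of CheckPolicy{ReadDataSubsetPercent: 50}: verify the
	// repository structure and additionally re-read 50% of the pack data.
	cmd := exec.CommandContext(context.Background(),
		"restic", "-r", "/path/to/repo", "check", "--read-data-subset=50%")
	cmd.Env = append(os.Environ(), "RESTIC_PASSWORD=placeholder")
	cmd.Stdout = os.Stdout
	cmd.Stderr = os.Stderr
	if err := cmd.Run(); err != nil {
		panic(err)
	}
}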

View File

@@ -1,6 +1,7 @@
package repo
import (
"bytes"
"context"
"os"
"slices"
@@ -189,3 +190,59 @@ func TestEnvVarPropagation(t *testing.T) {
t.Fatal("expected snapshot id")
}
}
func TestCheck(t *testing.T) {
t.Parallel()
tcs := []struct {
name string
repo *v1.Repo
}{
{
name: "check structure",
repo: &v1.Repo{
Id: "test",
Uri: t.TempDir(),
Password: "test",
CheckPolicy: &v1.CheckPolicy{
Mode: nil,
},
},
},
{
name: "read data percent",
repo: &v1.Repo{
Id: "test",
Uri: t.TempDir(),
Password: "test",
CheckPolicy: &v1.CheckPolicy{
Mode: &v1.CheckPolicy_ReadDataSubsetPercent{
ReadDataSubsetPercent: 50,
},
},
},
},
}
for _, tc := range tcs {
t.Run(tc.name, func(t *testing.T) {
orchestrator, err := NewRepoOrchestrator(configForTest, tc.repo, helpers.ResticBinary(t))
if err != nil {
t.Fatalf("failed to create repo orchestrator: %v", err)
}
buf := bytes.NewBuffer(nil)
err = orchestrator.Init(context.Background())
if err != nil {
t.Fatalf("init error: %v", err)
}
err = orchestrator.Check(context.Background(), buf)
if err != nil {
t.Errorf("check error: %v", err)
}
t.Logf("check output: %s", buf.String())
})
}
}

View File

@@ -84,11 +84,7 @@ func (t *taskRunnerImpl) ExecuteHooks(events []v1.Hook_Condition, vars hook.Hook
}
}
if planID != "" {
var err error
plan, err = t.FindPlan()
if err != nil {
return err
}
plan, _ = t.FindPlan()
}
var flowID int64
if t.op != nil {

View File

@@ -17,12 +17,13 @@ const (
PlanForUnassociatedOperations = "_unassociated_"
PlanForSystemTasks = "_system_" // plan for system tasks e.g. garbage collection, prune, stats, etc.
TaskPriorityStats = -1
TaskPriorityDefault = 0
TaskPriorityInteractive = 1 << 1
TaskPriorityStats = 0
TaskPriorityDefault = 1 << 1 // default priority
TaskPriorityForget = 1 << 2
TaskPriorityIndexSnapshots = 1 << 3
TaskPriorityPrune = 1 << 4
TaskPriorityCheck = 1 << 4 // check should always run after prune.
TaskPriorityPrune = 1 << 5
TaskPriorityInteractive = 1 << 6 // highest priority
)
// TaskRunner is an interface for running tasks. It is used by tasks to create operations and write logs.
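The constant shuffle is deliberate: when several system tasks become runnable at the same time, the scheduler is expected to dequeue the higher value first, so prune (1 << 5) runs before check (1 << 4), stats (0) runs last, and interactive work (1 << 6) preempts everything. A standalone sketch of that ordering rule follows; the queue type here is an illustration, not backrest's actual task queue:

package main

import (
	"fmt"
	"sort"
	"time"
)

type queuedTask struct {
	name     string
	runAt    time.Time
	priority int
}

// orderForRun sorts tasks so the earliest runAt goes first; among tasks due at
// the same instant, the higher priority value wins.
func orderForRun(q []queuedTask) {
	sort.Slice(q, func(i, j int) bool {
		if q[i].runAt.Equal(q[j].runAt) {
			return q[i].priority > q[j].priority
		}
		return q[i].runAt.Before(q[j].runAt)
	})
}

func main() {
	now := time.Now()
	q := []queuedTask{
		{"check", now, 1 << 4},
		{"prune", now, 1 << 5},
		{"stats", now, 0},
	}
	orderForRun(q)
	fmt.Println(q[0].name, q[1].name, q[2].name) // prune check stats
}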

View File

@@ -0,0 +1,185 @@
package tasks
import (
"context"
"errors"
"fmt"
"sync"
"time"
v1 "github.com/garethgeorge/backrest/gen/go/v1"
"github.com/garethgeorge/backrest/internal/hook"
"github.com/garethgeorge/backrest/internal/ioutil"
"github.com/garethgeorge/backrest/internal/oplog"
"github.com/garethgeorge/backrest/internal/oplog/indexutil"
"github.com/garethgeorge/backrest/internal/protoutil"
"go.uber.org/zap"
)
type CheckTask struct {
BaseTask
force bool
didRun bool
}
func NewCheckTask(repoID, planID string, force bool) Task {
return &CheckTask{
BaseTask: BaseTask{
TaskName: fmt.Sprintf("check repo %q", repoID),
TaskRepoID: repoID,
TaskPlanID: planID,
},
force: force,
}
}
func (t *CheckTask) Next(now time.Time, runner TaskRunner) (ScheduledTask, error) {
if t.force {
if t.didRun {
return NeverScheduledTask, nil
}
t.didRun = true
return ScheduledTask{
Task: t,
RunAt: now,
Op: &v1.Operation{
Op: &v1.Operation_OperationCheck{},
},
}, nil
}
repo, err := runner.GetRepo(t.RepoID())
if err != nil {
return ScheduledTask{}, fmt.Errorf("get repo %v: %w", t.RepoID(), err)
}
if repo.CheckPolicy.GetSchedule() == nil {
return NeverScheduledTask, nil
}
var lastRan time.Time
var foundBackup bool
if err := runner.OpLog().ForEach(oplog.Query{RepoId: t.RepoID()}, indexutil.Reversed(indexutil.CollectAll()), func(op *v1.Operation) error {
if _, ok := op.Op.(*v1.Operation_OperationCheck); ok {
lastRan = time.Unix(0, op.UnixTimeEndMs*int64(time.Millisecond))
return oplog.ErrStopIteration
}
if _, ok := op.Op.(*v1.Operation_OperationBackup); ok {
foundBackup = true
}
return nil
}); err != nil {
return NeverScheduledTask, fmt.Errorf("finding last check run time: %w", err)
} else if !foundBackup {
return NeverScheduledTask, nil
}
zap.L().Debug("last check time", zap.Time("time", lastRan), zap.String("repo", t.RepoID()))
runAt, err := protoutil.ResolveSchedule(repo.CheckPolicy.GetSchedule(), lastRan)
if errors.Is(err, protoutil.ErrScheduleDisabled) {
return NeverScheduledTask, nil
} else if err != nil {
return NeverScheduledTask, fmt.Errorf("resolve schedule: %w", err)
}
return ScheduledTask{
Task: t,
RunAt: runAt,
Op: &v1.Operation{
Op: &v1.Operation_OperationCheck{},
},
}, nil
}
func (t *CheckTask) Run(ctx context.Context, st ScheduledTask, runner TaskRunner) error {
op := st.Op
repo, err := runner.GetRepoOrchestrator(t.RepoID())
if err != nil {
return fmt.Errorf("couldn't get repo %q: %w", t.RepoID(), err)
}
if err := runner.ExecuteHooks([]v1.Hook_Condition{
v1.Hook_CONDITION_CHECK_START,
}, hook.HookVars{}); err != nil {
// TODO: generalize this logic
op.DisplayMessage = err.Error()
var cancelErr *hook.HookErrorRequestCancel
if errors.As(err, &cancelErr) {
op.Status = v1.OperationStatus_STATUS_USER_CANCELLED // user visible cancelled status
return nil
}
op.Status = v1.OperationStatus_STATUS_ERROR
return fmt.Errorf("execute check start hooks: %w", err)
}
err = repo.UnlockIfAutoEnabled(ctx)
if err != nil {
return fmt.Errorf("auto unlock repo %q: %w", t.RepoID(), err)
}
opCheck := &v1.Operation_OperationCheck{
OperationCheck: &v1.OperationCheck{},
}
op.Op = opCheck
ctx, cancel := context.WithCancel(ctx)
interval := time.NewTicker(1 * time.Second)
defer interval.Stop()
buf := ioutil.HeadWriter{Limit: 16 * 1024}
bufWriter := ioutil.SynchronizedWriter{W: &buf}
var wg sync.WaitGroup
wg.Add(1)
go func() {
defer wg.Done()
for {
select {
case <-interval.C:
bufWriter.Mu.Lock()
output := string(buf.Bytes())
bufWriter.Mu.Unlock()
if opCheck.OperationCheck.Output != string(output) {
opCheck.OperationCheck.Output = string(output)
if err := runner.OpLog().Update(op); err != nil {
zap.L().Error("update check operation with status output", zap.Error(err))
}
}
case <-ctx.Done():
return
}
}
}()
if err := repo.Check(ctx, &bufWriter); err != nil {
cancel()
runner.ExecuteHooks([]v1.Hook_Condition{
v1.Hook_CONDITION_CHECK_ERROR,
v1.Hook_CONDITION_ANY_ERROR,
}, hook.HookVars{
Error: err.Error(),
})
return fmt.Errorf("prune: %w", err)
}
cancel()
wg.Wait()
opCheck.OperationCheck.Output = string(buf.Bytes())
// Run a stats task after a successful check
if err := runner.ScheduleTask(NewStatsTask(t.RepoID(), PlanForSystemTasks, false), TaskPriorityStats); err != nil {
zap.L().Error("schedule stats task", zap.Error(err))
}
if err := runner.ExecuteHooks([]v1.Hook_Condition{
v1.Hook_CONDITION_CHECK_SUCCESS,
}, hook.HookVars{}); err != nil {
return fmt.Errorf("execute prune success hooks: %w", err)
}
return nil
}
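Run streams restic's check output into ioutil.HeadWriter (which keeps only the first 16 KiB) wrapped in ioutil.SynchronizedWriter (which lets the one-second status ticker read a consistent snapshot while restic is still writing). Neither type appears in this diff; the sketch below shows the pattern they are assumed to follow, with simplified lower-case names:

package ioutilsketch

import (
	"io"
	"sync"
)

// headWriter keeps at most limit bytes and silently drops the rest, so an
// unbounded restic log cannot bloat the stored operation.
type headWriter struct {
	limit int
	buf   []byte
}

func (w *headWriter) Write(p []byte) (int, error) {
	n := len(p)
	if remaining := w.limit - len(w.buf); remaining > 0 {
		if len(p) > remaining {
			p = p[:remaining]
		}
		w.buf = append(w.buf, p...)
	}
	return n, nil // report the full length so the producer keeps running
}

func (w *headWriter) Bytes() []byte { return w.buf }

// syncWriter serializes writes; a reader that holds the same mutex (as the
// ticker goroutine does with bufWriter.Mu above) sees a consistent snapshot.
type syncWriter struct {
	mu sync.Mutex
	w  io.Writer
}

func (w *syncWriter) Write(p []byte) (int, error) {
	w.mu.Lock()
	defer w.mu.Unlock()
	return w.w.Write(p)
}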

View File

@@ -11,7 +11,7 @@ import (
)
const (
gcStartupDelay = 5 * time.Second
gcStartupDelay = 60 * time.Second
gcInterval = 24 * time.Hour
// keep operations that are eligible for gc for 30 days OR up to a limit of 100 for any one plan.
// an operation is eligible for gc if:

View File

@@ -1,7 +1,6 @@
package tasks
import (
"bytes"
"context"
"errors"
"fmt"
@@ -59,14 +58,20 @@ func (t *PruneTask) Next(now time.Time, runner TaskRunner) (ScheduledTask, error
}
var lastRan time.Time
var foundBackup bool
if err := runner.OpLog().ForEach(oplog.Query{RepoId: t.RepoID()}, indexutil.Reversed(indexutil.CollectAll()), func(op *v1.Operation) error {
if _, ok := op.Op.(*v1.Operation_OperationPrune); ok {
lastRan = time.Unix(0, op.UnixTimeEndMs*int64(time.Millisecond))
return oplog.ErrStopIteration
}
if _, ok := op.Op.(*v1.Operation_OperationBackup); ok {
foundBackup = true
}
return nil
}); err != nil {
return NeverScheduledTask, fmt.Errorf("finding last backup run time: %w", err)
return NeverScheduledTask, fmt.Errorf("finding last prune run time: %w", err)
} else if !foundBackup {
return NeverScheduledTask, nil
}
zap.L().Debug("last prune time", zap.Time("time", lastRan), zap.String("repo", t.RepoID()))
@@ -95,6 +100,20 @@ func (t *PruneTask) Run(ctx context.Context, st ScheduledTask, runner TaskRunner
return fmt.Errorf("couldn't get repo %q: %w", t.RepoID(), err)
}
if err := runner.ExecuteHooks([]v1.Hook_Condition{
v1.Hook_CONDITION_PRUNE_START,
}, hook.HookVars{}); err != nil {
op.DisplayMessage = err.Error()
// TODO: generalize this logic
var cancelErr *hook.HookErrorRequestCancel
if errors.As(err, &cancelErr) {
op.Status = v1.OperationStatus_STATUS_USER_CANCELLED // user visible cancelled status
return nil
}
op.Status = v1.OperationStatus_STATUS_ERROR
return fmt.Errorf("execute prune start hooks: %w", err)
}
err = repo.UnlockIfAutoEnabled(ctx)
if err != nil {
return fmt.Errorf("auto unlock repo %q: %w", t.RepoID(), err)
@@ -108,7 +127,7 @@ func (t *PruneTask) Run(ctx context.Context, st ScheduledTask, runner TaskRunner
ctx, cancel := context.WithCancel(ctx)
interval := time.NewTicker(1 * time.Second)
defer interval.Stop()
var buf bytes.Buffer
buf := ioutil.HeadWriter{Limit: 16 * 1024}
bufWriter := ioutil.SynchronizedWriter{W: &buf}
var wg sync.WaitGroup
wg.Add(1)
@@ -118,14 +137,11 @@ func (t *PruneTask) Run(ctx context.Context, st ScheduledTask, runner TaskRunner
select {
case <-interval.C:
bufWriter.Mu.Lock()
output := buf.String()
output := string(buf.Bytes())
bufWriter.Mu.Unlock()
if len(output) > 8*1024 { // only provide live status upto the first 8K of output.
output = output[:len(output)-8*1024]
}
if opPrune.OperationPrune.Output != output {
opPrune.OperationPrune.Output = buf.String()
if opPrune.OperationPrune.Output != string(output) {
opPrune.OperationPrune.Output = string(output)
if err := runner.OpLog().Update(op); err != nil {
zap.L().Error("update prune operation with status output", zap.Error(err))
@@ -151,17 +167,18 @@ func (t *PruneTask) Run(ctx context.Context, st ScheduledTask, runner TaskRunner
cancel()
wg.Wait()
output := buf.String()
if len(output) > 8*1024 { // only save the first 4K of output.
output = output[:len(output)-8*1024]
}
opPrune.OperationPrune.Output = output
opPrune.OperationPrune.Output = string(buf.Bytes())
// Run a stats task after a successful prune
if err := runner.ScheduleTask(NewStatsTask(t.RepoID(), PlanForSystemTasks, false), TaskPriorityStats); err != nil {
zap.L().Error("schedule stats task", zap.Error(err))
}
if err := runner.ExecuteHooks([]v1.Hook_Condition{
v1.Hook_CONDITION_PRUNE_SUCCESS,
}, hook.HookVars{}); err != nil {
return fmt.Errorf("execute prune end hooks: %w", err)
}
return nil
}
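Both the new CONDITION_CHECK_START hook and the CONDITION_PRUNE_START hook added above share one cancellation convention: a start hook can abort the operation by returning an error that unwraps to hook.HookErrorRequestCancel, which the task records as STATUS_USER_CANCELLED rather than STATUS_ERROR. A minimal illustration of that errors.As unwrapping, using a stand-in error type rather than the project's:

package main

import (
	"errors"
	"fmt"
)

// requestCancel stands in for hook.HookErrorRequestCancel.
type requestCancel struct{ reason string }

func (e *requestCancel) Error() string { return "hook requested cancel: " + e.reason }

// classify mirrors the status decision made in CheckTask.Run and PruneTask.Run.
func classify(err error) string {
	var cancelErr *requestCancel
	if errors.As(err, &cancelErr) {
		return "STATUS_USER_CANCELLED"
	}
	return "STATUS_ERROR"
}

func main() {
	wrapped := fmt.Errorf("execute check start hooks: %w", &requestCancel{reason: "maintenance window"})
	fmt.Println(classify(wrapped)) // STATUS_USER_CANCELLED
}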