From ff505a7715e2de62fc167deb6a2e611cde21ed9e Mon Sep 17 00:00:00 2001 From: Steve Kriss Date: Thu, 24 Jan 2019 14:47:31 -0700 Subject: [PATCH 1/4] use restic stats instead of check to check repo existence Signed-off-by: Steve Kriss --- pkg/controller/restic_repository_controller.go | 6 +++--- pkg/restic/command_factory.go | 7 +++++++ pkg/restic/command_factory_test.go | 7 +++++++ pkg/restic/repository_manager.go | 14 ++++++++++++++ 4 files changed, 31 insertions(+), 3 deletions(-) diff --git a/pkg/controller/restic_repository_controller.go b/pkg/controller/restic_repository_controller.go index 9f550c8fa..f72ce76e0 100644 --- a/pkg/controller/restic_repository_controller.go +++ b/pkg/controller/restic_repository_controller.go @@ -162,10 +162,10 @@ func (c *resticRepositoryController) initializeRepo(req *v1.ResticRepository, lo }) } -// ensureRepo first checks the repo, and returns if check passes. If it fails, -// attempts to init the repo, and returns the result. +// ensureRepo first tries to connect to the repo, and returns if it succeeds. If it fails, +// it attempts to init the repo, and returns the result. func ensureRepo(repo *v1.ResticRepository, repoManager restic.RepositoryManager) error { - if repoManager.CheckRepo(repo) == nil { + if repoManager.ConnectToRepo(repo) == nil { return nil } diff --git a/pkg/restic/command_factory.go b/pkg/restic/command_factory.go index 591d6f9d3..4d1a993d1 100644 --- a/pkg/restic/command_factory.go +++ b/pkg/restic/command_factory.go @@ -84,6 +84,13 @@ func InitCommand(repoIdentifier string) *Command { } } +func StatsCommand(repoIdentifier string) *Command { + return &Command{ + Command: "stats", + RepoIdentifier: repoIdentifier, + } +} + func CheckCommand(repoIdentifier string) *Command { return &Command{ Command: "check", diff --git a/pkg/restic/command_factory_test.go b/pkg/restic/command_factory_test.go index 5e2714038..636ddf849 100644 --- a/pkg/restic/command_factory_test.go +++ b/pkg/restic/command_factory_test.go @@ -96,6 +96,13 @@ func TestInitCommand(t *testing.T) { assert.Equal(t, "repo-id", c.RepoIdentifier) } +func TestStatsCommand(t *testing.T) { + c := StatsCommand("repo-id") + + assert.Equal(t, "stats", c.Command) + assert.Equal(t, "repo-id", c.RepoIdentifier) +} + func TestCheckCommand(t *testing.T) { c := CheckCommand("repo-id") diff --git a/pkg/restic/repository_manager.go b/pkg/restic/repository_manager.go index 65e307d68..d1dc6a785 100644 --- a/pkg/restic/repository_manager.go +++ b/pkg/restic/repository_manager.go @@ -42,6 +42,12 @@ type RepositoryManager interface { // InitRepo initializes a repo with the specified name and identifier. InitRepo(repo *arkv1api.ResticRepository) error + // ConnectToRepo runs the 'restic stats' command against the + // specified repo, and returns an error if it fails. This is + // intended to be used to ensure that the repo exists/can be + // authenticated to. + ConnectToRepo(repo *arkv1api.ResticRepository) error + // CheckRepo checks the specified repo for errors. CheckRepo(repo *arkv1api.ResticRepository) error @@ -170,6 +176,14 @@ func (rm *repositoryManager) InitRepo(repo *arkv1api.ResticRepository) error { return rm.exec(InitCommand(repo.Spec.ResticIdentifier), repo.Spec.BackupStorageLocation) } +func (rm *repositoryManager) ConnectToRepo(repo *arkv1api.ResticRepository) error { + // restic stats requires a non-exclusive lock + rm.repoLocker.Lock(repo.Name) + defer rm.repoLocker.Unlock(repo.Name) + + return rm.exec(StatsCommand(repo.Spec.ResticIdentifier), repo.Spec.BackupStorageLocation) +} + func (rm *repositoryManager) CheckRepo(repo *arkv1api.ResticRepository) error { // restic check requires an exclusive lock rm.repoLocker.LockExclusive(repo.Name) From 76c0b77cb53f29a4a6a9ba48c1c075c445b00170 Mon Sep 17 00:00:00 2001 From: Steve Kriss Date: Thu, 10 Jan 2019 12:48:37 -0700 Subject: [PATCH 2/4] upgrade to restic v0.9.4 and replace --hostname with --host Signed-off-by: Steve Kriss --- Dockerfile-ark.alpine | 6 +++--- pkg/restic/command_factory.go | 4 ++-- pkg/restic/command_factory_test.go | 2 +- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/Dockerfile-ark.alpine b/Dockerfile-ark.alpine index 2fb1a3492..7fff5c39f 100644 --- a/Dockerfile-ark.alpine +++ b/Dockerfile-ark.alpine @@ -19,9 +19,9 @@ MAINTAINER Andy Goldstein RUN apk add --no-cache ca-certificates RUN apk add --update --no-cache bzip2 && \ - wget --quiet https://github.com/restic/restic/releases/download/v0.9.3/restic_0.9.3_linux_amd64.bz2 && \ - bunzip2 restic_0.9.3_linux_amd64.bz2 && \ - mv restic_0.9.3_linux_amd64 /usr/bin/restic && \ + wget --quiet https://github.com/restic/restic/releases/download/v0.9.4/restic_0.9.4_linux_amd64.bz2 && \ + bunzip2 restic_0.9.4_linux_amd64.bz2 && \ + mv restic_0.9.4_linux_amd64 /usr/bin/restic && \ chmod +x /usr/bin/restic ADD /bin/linux/amd64/ark /ark diff --git a/pkg/restic/command_factory.go b/pkg/restic/command_factory.go index 4d1a993d1..c216bd636 100644 --- a/pkg/restic/command_factory.go +++ b/pkg/restic/command_factory.go @@ -23,7 +23,7 @@ import ( // BackupCommand returns a Command for running a restic backup. func BackupCommand(repoIdentifier, passwordFile, path string, tags map[string]string) *Command { - // --hostname flag is provided with a generic value because restic uses the hostname + // --host flag is provided with a generic value because restic uses the host // to find a parent snapshot, and by default it will be the name of the daemonset pod // where the `restic backup` command is run. If this pod is recreated, we want to continue // taking incremental backups rather than triggering a full one due to a new pod name. @@ -34,7 +34,7 @@ func BackupCommand(repoIdentifier, passwordFile, path string, tags map[string]st PasswordFile: passwordFile, Dir: path, Args: []string{"."}, - ExtraFlags: append(backupTagFlags(tags), "--hostname=ark"), + ExtraFlags: append(backupTagFlags(tags), "--host=ark"), } } diff --git a/pkg/restic/command_factory_test.go b/pkg/restic/command_factory_test.go index 636ddf849..610831875 100644 --- a/pkg/restic/command_factory_test.go +++ b/pkg/restic/command_factory_test.go @@ -32,7 +32,7 @@ func TestBackupCommand(t *testing.T) { assert.Equal(t, "path", c.Dir) assert.Equal(t, []string{"."}, c.Args) - expected := []string{"--tag=foo=bar", "--tag=c=d", "--hostname=ark"} + expected := []string{"--tag=foo=bar", "--tag=c=d", "--host=ark"} sort.Strings(expected) sort.Strings(c.ExtraFlags) assert.Equal(t, expected, c.ExtraFlags) From b3157190866b3c6bdc6e125c8ab03c069a5928f3 Mon Sep 17 00:00:00 2001 From: Steve Kriss Date: Mon, 25 Feb 2019 13:06:22 -0800 Subject: [PATCH 3/4] pkg/restic: fix concurrency bugs causing dupe repos, panics Signed-off-by: Steve Kriss --- changelogs/unreleased/1235-skriss | 1 + pkg/restic/repository_ensurer.go | 60 +++++++++++++++++++++++++++++-- 2 files changed, 59 insertions(+), 2 deletions(-) create mode 100644 changelogs/unreleased/1235-skriss diff --git a/changelogs/unreleased/1235-skriss b/changelogs/unreleased/1235-skriss new file mode 100644 index 000000000..a2c55f75c --- /dev/null +++ b/changelogs/unreleased/1235-skriss @@ -0,0 +1 @@ +Fix concurrency bug in code ensuring restic repository exists diff --git a/pkg/restic/repository_ensurer.go b/pkg/restic/repository_ensurer.go index 3810c7ec2..df5117e94 100644 --- a/pkg/restic/repository_ensurer.go +++ b/pkg/restic/repository_ensurer.go @@ -35,18 +35,31 @@ import ( // repositoryEnsurer ensures that Ark restic repositories are created and ready. type repositoryEnsurer struct { + log logrus.FieldLogger repoLister arkv1listers.ResticRepositoryLister repoClient arkv1client.ResticRepositoriesGetter readyChansLock sync.Mutex readyChans map[string]chan *arkv1api.ResticRepository + + // repoLocksMu synchronizes reads/writes to the repoLocks map itself + // since maps are not threadsafe. + repoLocksMu sync.Mutex + repoLocks map[repoKey]*sync.Mutex +} + +type repoKey struct { + volumeNamespace string + backupLocation string } func newRepositoryEnsurer(repoInformer arkv1informers.ResticRepositoryInformer, repoClient arkv1client.ResticRepositoriesGetter, log logrus.FieldLogger) *repositoryEnsurer { r := &repositoryEnsurer{ + log: log, repoLister: repoInformer.Lister(), repoClient: repoClient, readyChans: make(map[string]chan *arkv1api.ResticRepository), + repoLocks: make(map[repoKey]*sync.Mutex), } repoInformer.Informer().AddEventHandler( @@ -67,7 +80,7 @@ func newRepositoryEnsurer(repoInformer arkv1informers.ResticRepositoryInformer, } readyChan <- newObj - delete(r.readyChans, newObj.Name) + delete(r.readyChans, key) } }, }, @@ -84,6 +97,30 @@ func repoLabels(volumeNamespace, backupLocation string) labels.Set { } func (r *repositoryEnsurer) EnsureRepo(ctx context.Context, namespace, volumeNamespace, backupLocation string) (*arkv1api.ResticRepository, error) { + log := r.log.WithField("volumeNamespace", volumeNamespace).WithField("backupLocation", backupLocation) + + // It's only safe to have one instance of this method executing concurrently for a + // given volumeNamespace + backupLocation, so synchronize based on that. It's fine + // to run concurrently for *different* namespaces/locations. If you had 2 goroutines + // running this for the same inputs, both might find no ResticRepository exists, then + // both would create new ones for the same namespace/location. + // + // This issue could probably be avoided if we had a deterministic name for + // each restic repository, and we just tried to create it, checked for an + // AlreadyExists err, and then waited for it to be ready. However, there are + // already repositories in the wild with non-deterministic names (i.e. using + // GenerateName) which poses a backwards compatibility problem. + log.Debug("Acquiring lock") + + repoMu := r.repoLock(volumeNamespace, backupLocation) + repoMu.Lock() + defer func() { + repoMu.Unlock() + log.Debug("Released lock") + }() + + log.Debug("Acquired lock") + selector := labels.SelectorFromSet(repoLabels(volumeNamespace, backupLocation)) repos, err := r.repoLister.ResticRepositories(namespace).List(selector) @@ -97,11 +134,14 @@ func (r *repositoryEnsurer) EnsureRepo(ctx context.Context, namespace, volumeNam if repos[0].Status.Phase != arkv1api.ResticRepositoryPhaseReady { return nil, errors.New("restic repository is not ready") } + + log.Debug("Ready repository found") return repos[0], nil } - // no repo found: create one and wait for it to be ready + log.Debug("No repository found, creating one") + // no repo found: create one and wait for it to be ready repo := &arkv1api.ResticRepository{ ObjectMeta: metav1.ObjectMeta{ Namespace: namespace, @@ -137,3 +177,19 @@ func (r *repositoryEnsurer) getReadyChan(name string) chan *arkv1api.ResticRepos r.readyChans[name] = make(chan *arkv1api.ResticRepository) return r.readyChans[name] } + +func (r *repositoryEnsurer) repoLock(volumeNamespace, backupLocation string) *sync.Mutex { + r.repoLocksMu.Lock() + defer r.repoLocksMu.Unlock() + + key := repoKey{ + volumeNamespace: volumeNamespace, + backupLocation: backupLocation, + } + + if r.repoLocks[key] == nil { + r.repoLocks[key] = new(sync.Mutex) + } + + return r.repoLocks[key] +} From 4666579459b8eb57a4f308a954e65965bf78cb72 Mon Sep 17 00:00:00 2001 From: Steve Kriss Date: Tue, 26 Feb 2019 14:29:23 -0800 Subject: [PATCH 4/4] update CHANGELOG-0.10 for v0.10.2 Signed-off-by: Steve Kriss --- changelogs/CHANGELOG-0.10.md | 12 ++++++++++++ changelogs/unreleased/1235-skriss | 1 - 2 files changed, 12 insertions(+), 1 deletion(-) delete mode 100644 changelogs/unreleased/1235-skriss diff --git a/changelogs/CHANGELOG-0.10.md b/changelogs/CHANGELOG-0.10.md index 059e3049b..187336e51 100644 --- a/changelogs/CHANGELOG-0.10.md +++ b/changelogs/CHANGELOG-0.10.md @@ -1,6 +1,18 @@ + - [v0.10.2](#v0102) - [v0.10.1](#v0101) - [v0.10.0](#v0100) +## v0.10.2 +#### 2019-02-28 + +### Download +- https://github.com/heptio/ark/releases/tag/v0.10.2 + +### Changes + * upgrade restic to v0.9.4 & replace --hostname flag with --host (#1156, @skriss) + * use 'restic stats' instead of 'restic check' to determine if repo exists (#1171, @skriss) + * Fix concurrency bug in code ensuring restic repository exists (#1235, @skriss) + ## v0.10.1 #### 2019-01-10 diff --git a/changelogs/unreleased/1235-skriss b/changelogs/unreleased/1235-skriss deleted file mode 100644 index a2c55f75c..000000000 --- a/changelogs/unreleased/1235-skriss +++ /dev/null @@ -1 +0,0 @@ -Fix concurrency bug in code ensuring restic repository exists