Mark in-progress CRs as failed when starting the server

Fixes #4953

Signed-off-by: Wenkai Yin(尹文开) <yinw@vmware.com>
Wenkai Yin(尹文开) committed 2022-06-06 22:56:29 +08:00
parent 9af031b84a
commit 6fa4d7d606
10 changed files with 204 additions and 186 deletions
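
Note: only eight of the ten changed files are shown below; the server-side half of the change — the one-time pass that marks leftover in-progress CRs as failed when the server starts — is not among the visible hunks. As a rough sketch of that idea for Backups (the helper name, client wiring, and message text here are assumptions for illustration, not the commit's actual code):

// Hypothetical sketch of the startup pass, assuming a controller-runtime
// client; the real implementation may differ in names and wiring.
package server

import (
	"context"
	"fmt"
	"time"

	"github.com/sirupsen/logrus"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	ctrlclient "sigs.k8s.io/controller-runtime/pkg/client"

	velerov1api "github.com/vmware-tanzu/velero/pkg/apis/velero/v1"
)

// markInProgressBackupsFailed lists the Backups in the server's namespace and
// flips any still marked InProgress (left behind by a previous server process)
// to Failed, so they cannot stay in-progress forever.
func markInProgressBackupsFailed(ctx context.Context, c ctrlclient.Client, namespace string, log logrus.FieldLogger) {
	backups := &velerov1api.BackupList{}
	if err := c.List(ctx, backups, ctrlclient.InNamespace(namespace)); err != nil {
		log.WithError(err).Error("failed to list backups")
		return
	}
	for i := range backups.Items {
		backup := &backups.Items[i]
		if backup.Status.Phase != velerov1api.BackupPhaseInProgress {
			continue
		}
		updated := backup.DeepCopy()
		updated.Status.Phase = velerov1api.BackupPhaseFailed
		updated.Status.FailureReason = fmt.Sprintf("found a backup with status %q during server start, marking it as %q",
			velerov1api.BackupPhaseInProgress, velerov1api.BackupPhaseFailed)
		updated.Status.CompletionTimestamp = &metav1.Time{Time: time.Now()}
		// Patch against the original so only the status fields are touched.
		if err := c.Patch(ctx, updated, ctrlclient.MergeFrom(backup)); err != nil {
			log.WithError(err).Errorf("failed to patch backup %q", backup.Name)
			continue
		}
		log.Warnf("backup %q was left in-progress, marked as %q", backup.Name, velerov1api.BackupPhaseFailed)
	}
}

The controller hunks below then become symmetric: every controller refuses to touch anything that is not New, and the stuck-CR cleanup happens exactly once, at startup.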

View File

@@ -157,12 +157,13 @@ func NewBackupController(
 				backup := obj.(*velerov1api.Backup)
 				switch backup.Status.Phase {
-				case "", velerov1api.BackupPhaseNew, velerov1api.BackupPhaseInProgress:
+				case "", velerov1api.BackupPhaseNew:
+					// only process new backups
 				default:
 					c.logger.WithFields(logrus.Fields{
 						"backup": kubeutil.NamespaceAndName(backup),
 						"phase":  backup.Status.Phase,
-					}).Debug("Backup is not new or in-progress, skipping")
+					}).Debug("Backup is not new, skipping")
 					return
 				}
@@ -250,22 +251,7 @@ func (c *backupController) processBackup(key string) error {
 	// this key (even though it was a no-op).
 	switch original.Status.Phase {
 	case "", velerov1api.BackupPhaseNew:
-	case velerov1api.BackupPhaseInProgress:
-		// A backup may stay in-progress forever because of
-		// 1) the controller restarts during the processing of a backup
-		// 2) the backup with in-progress status isn't updated to completed or failed status successfully
-		// So we try to mark such Backups as failed to avoid it
-		updated := original.DeepCopy()
-		updated.Status.Phase = velerov1api.BackupPhaseFailed
-		updated.Status.FailureReason = fmt.Sprintf("got a Backup with unexpected status %q, this may be due to a restart of the controller during the backing up, mark it as %q",
-			velerov1api.BackupPhaseInProgress, updated.Status.Phase)
-		updated.Status.CompletionTimestamp = &metav1.Time{Time: c.clock.Now()}
-		_, err = patchBackup(original, updated, c.client)
-		if err != nil {
-			return errors.Wrapf(err, "error updating Backup status to %s", updated.Status.Phase)
-		}
-		log.Warn(updated.Status.FailureReason)
-		return nil
+		// only process new backups
 	default:
 		return nil
 	}

View File

@@ -94,6 +94,11 @@ func TestProcessBackupNonProcessedItems(t *testing.T) {
 			key:    "velero/backup-1",
 			backup: defaultBackup().Phase(velerov1api.BackupPhaseFailedValidation).Result(),
 		},
+		{
+			name:   "InProgress backup is not processed",
+			key:    "velero/backup-1",
+			backup: defaultBackup().Phase(velerov1api.BackupPhaseInProgress).Result(),
+		},
 		{
 			name: "Completed backup is not processed",
 			key:  "velero/backup-1",
@@ -135,28 +140,6 @@ func TestProcessBackupNonProcessedItems(t *testing.T) {
 	}
 }
 
-func TestMarkInProgressBackupAsFailed(t *testing.T) {
-	backup := defaultBackup().Phase(velerov1api.BackupPhaseInProgress).Result()
-	clientset := fake.NewSimpleClientset(backup)
-	sharedInformers := informers.NewSharedInformerFactory(clientset, 0)
-	logger := logging.DefaultLogger(logrus.DebugLevel, logging.FormatText)
-
-	c := &backupController{
-		genericController: newGenericController("backup-test", logger),
-		client:            clientset.VeleroV1(),
-		lister:            sharedInformers.Velero().V1().Backups().Lister(),
-		clock:             &clock.RealClock{},
-	}
-
-	require.NoError(t, sharedInformers.Velero().V1().Backups().Informer().GetStore().Add(backup))
-
-	err := c.processBackup(fmt.Sprintf("%s/%s", backup.Namespace, backup.Name))
-	require.Nil(t, err)
-
-	res, err := clientset.VeleroV1().Backups(backup.Namespace).Get(context.TODO(), backup.Name, metav1.GetOptions{})
-	require.NoError(t, err)
-	assert.Equal(t, velerov1api.BackupPhaseFailed, res.Status.Phase)
-}
-
 func TestProcessBackupValidationFailures(t *testing.T) {
 	defaultBackupLocation := builder.ForBackupStorageLocation("velero", "loc-1").Result()

View File

@@ -93,20 +93,9 @@ func (r *PodVolumeBackupReconciler) Reconcile(ctx context.Context, req ctrl.Requ
 	switch pvb.Status.Phase {
 	case "", velerov1api.PodVolumeBackupPhaseNew:
-	case velerov1api.PodVolumeBackupPhaseInProgress:
-		original := pvb.DeepCopy()
-		pvb.Status.Phase = velerov1api.PodVolumeBackupPhaseFailed
-		pvb.Status.Message = fmt.Sprintf("got a PodVolumeBackup with unexpected status %q, this may be due to a restart of the controller during the backing up, mark it as %q",
-			velerov1api.PodVolumeBackupPhaseInProgress, pvb.Status.Phase)
-		pvb.Status.CompletionTimestamp = &metav1.Time{Time: r.Clock.Now()}
-		if err := kube.Patch(ctx, original, &pvb, r.Client); err != nil {
-			log.WithError(err).Error("error updating PodVolumeBackup status")
-			return ctrl.Result{}, err
-		}
-		log.Warn(pvb.Status.Message)
-		return ctrl.Result{}, nil
+		// Only process new items.
 	default:
-		log.Debug("PodVolumeBackup is not new or in-progress, not processing")
+		log.Debug("PodVolumeBackup is not new, not processing")
 		return ctrl.Result{}, nil
 	}
@@ -298,7 +287,7 @@ func (r *PodVolumeBackupReconciler) updateBackupProgressFunc(pvb *velerov1api.Po
 func (r *PodVolumeBackupReconciler) updateStatusToFailed(ctx context.Context, pvb *velerov1api.PodVolumeBackup, err error, msg string, log logrus.FieldLogger) (ctrl.Result, error) {
 	original := pvb.DeepCopy()
 	pvb.Status.Phase = velerov1api.PodVolumeBackupPhaseFailed
-	pvb.Status.Message = msg
+	pvb.Status.Message = errors.WithMessage(err, msg).Error()
 	pvb.Status.CompletionTimestamp = &metav1.Time{Time: r.Clock.Now()}
 	if err = kube.Patch(ctx, original, pvb, r.Client); err != nil {
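
A related tidy-up in the second hunk above: updateStatusToFailed now records the underlying error as well as the caller's message. errors.WithMessage, from github.com/pkg/errors (already used throughout Velero), prefixes the message onto the error, so the stored status message reads "msg: cause". A quick standalone illustration — the message text here is made up for the example:

package main

import (
	"fmt"

	"github.com/pkg/errors"
)

func main() {
	cause := errors.New("connection refused")
	// WithMessage wraps cause with a prefix; Error() renders "msg: cause".
	msg := errors.WithMessage(cause, "error running restic backup").Error()
	fmt.Println(msg) // error running restic backup: connection refused
}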

View File

@@ -179,16 +179,16 @@ var _ = Describe("PodVolumeBackup Reconciler", func() {
 				Result(),
 			expectedRequeue: ctrl.Result{},
 		}),
-		Entry("in progress phase pvb on same node should be marked as failed", request{
+		Entry("in progress phase pvb on same node should not be processed", request{
 			pvb: pvbBuilder().
 				Phase(velerov1api.PodVolumeBackupPhaseInProgress).
 				Node("test_node").
 				Result(),
 			pod: podBuilder().Result(),
 			bsl: bslBuilder().Result(),
-			expectedProcessed: true,
+			expectedProcessed: false,
 			expected: builder.ForPodVolumeBackup(velerov1api.DefaultNamespace, "pvb-1").
-				Phase(velerov1api.PodVolumeBackupPhaseFailed).
+				Phase(velerov1api.PodVolumeBackupPhaseInProgress).
 				Result(),
 			expectedRequeue: ctrl.Result{},
 		}),

View File

@@ -134,6 +134,11 @@ func (c *PodVolumeRestoreReconciler) Reconcile(ctx context.Context, req ctrl.Req
 }
 
 func (c *PodVolumeRestoreReconciler) shouldProcess(ctx context.Context, log logrus.FieldLogger, pvr *velerov1api.PodVolumeRestore) (bool, *corev1api.Pod, error) {
+	if !isPVRNew(pvr) {
+		log.Debug("PodVolumeRestore is not new, skip")
+		return false, nil, nil
+	}
+
 	// we filter the pods during the initialization of cache, if we can get a pod here, the pod must be in the same node with the controller
 	// so we don't need to compare the node anymore
 	pod := &corev1api.Pod{}
@@ -146,28 +151,6 @@ func (c *PodVolumeRestoreReconciler) shouldProcess(ctx context.Context, log logr
 		return false, nil, err
 	}
 
-	// the status checking logic must be put after getting the PVR's pod because that the getting pod logic
-	// makes sure the PVR's pod is on the same node with the controller. The controller should only process
-	// the PVRs on the same node
-	switch pvr.Status.Phase {
-	case "", velerov1api.PodVolumeRestorePhaseNew:
-	case velerov1api.PodVolumeRestorePhaseInProgress:
-		original := pvr.DeepCopy()
-		pvr.Status.Phase = velerov1api.PodVolumeRestorePhaseFailed
-		pvr.Status.Message = fmt.Sprintf("got a PodVolumeRestore with unexpected status %q, this may be due to a restart of the controller during the restoring, mark it as %q",
-			velerov1api.PodVolumeRestorePhaseInProgress, pvr.Status.Phase)
-		pvr.Status.CompletionTimestamp = &metav1.Time{Time: c.clock.Now()}
-		if err := kube.Patch(ctx, original, pvr, c.Client); err != nil {
-			log.WithError(err).Error("Unable to update status to failed")
-			return false, nil, err
-		}
-		log.Warn(pvr.Status.Message)
-		return false, nil, nil
-	default:
-		log.Debug("PodVolumeRestore is not new or in-progress, skip")
-		return false, nil, nil
-	}
-
 	if !isResticInitContainerRunning(pod) {
 		log.Debug("Pod is not running restic-wait init container, skip")
 		return false, nil, nil
@@ -209,6 +192,10 @@ func (c *PodVolumeRestoreReconciler) findVolumeRestoresForPod(pod client.Object)
 	return requests
 }
 
+func isPVRNew(pvr *velerov1api.PodVolumeRestore) bool {
+	return pvr.Status.Phase == "" || pvr.Status.Phase == velerov1api.PodVolumeRestorePhaseNew
+}
+
 func isResticInitContainerRunning(pod *corev1api.Pod) bool {
 	// Restic wait container can be anywhere in the list of init containers, but must be running.
 	i := getResticInitContainerIndex(pod)

View File

@@ -27,7 +27,6 @@ import (
 	corev1api "k8s.io/api/core/v1"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 	"k8s.io/apimachinery/pkg/runtime"
-	"k8s.io/apimachinery/pkg/types"
 	"k8s.io/apimachinery/pkg/util/clock"
 	"sigs.k8s.io/controller-runtime/pkg/client/fake"
@@ -44,48 +43,15 @@ func TestShouldProcess(t *testing.T) {
 		obj             *velerov1api.PodVolumeRestore
 		pod             *corev1api.Pod
 		shouldProcessed bool
-		expectedPhase   velerov1api.PodVolumeRestorePhase
 	}{
 		{
-			name: "Unable to get pvr's pod should not be processed",
+			name: "InProgress phase pvr should not be processed",
 			obj: &velerov1api.PodVolumeRestore{
-				Spec: velerov1api.PodVolumeRestoreSpec{
-					Pod: corev1api.ObjectReference{
-						Namespace: "ns-1",
-						Name:      "pod-1",
-					},
-				},
-				Status: velerov1api.PodVolumeRestoreStatus{
-					Phase: "",
-				},
-			},
-			shouldProcessed: false,
-		},
-		{
-			name: "InProgress phase pvr should be marked as failed",
-			obj: &velerov1api.PodVolumeRestore{
-				ObjectMeta: metav1.ObjectMeta{
-					Namespace: "velero",
-					Name:      "pvr-1",
-				},
-				Spec: velerov1api.PodVolumeRestoreSpec{
-					Pod: corev1api.ObjectReference{
-						Namespace: "ns-1",
-						Name:      "pod-1",
-					},
-				},
 				Status: velerov1api.PodVolumeRestoreStatus{
 					Phase: velerov1api.PodVolumeRestorePhaseInProgress,
 				},
 			},
-			pod: &corev1api.Pod{
-				ObjectMeta: metav1.ObjectMeta{
-					Namespace: "ns-1",
-					Name:      "pod-1",
-				},
-			},
 			shouldProcessed: false,
-			expectedPhase:   velerov1api.PodVolumeRestorePhaseFailed,
 		},
 		{
 			name: "Completed phase pvr should not be processed",
@@ -94,22 +60,10 @@ func TestShouldProcess(t *testing.T) {
 					Namespace: "velero",
 					Name:      "pvr-1",
 				},
-				Spec: velerov1api.PodVolumeRestoreSpec{
-					Pod: corev1api.ObjectReference{
-						Namespace: "ns-1",
-						Name:      "pod-1",
-					},
-				},
 				Status: velerov1api.PodVolumeRestoreStatus{
 					Phase: velerov1api.PodVolumeRestorePhaseCompleted,
 				},
 			},
-			pod: &corev1api.Pod{
-				ObjectMeta: metav1.ObjectMeta{
-					Namespace: "ns-1",
-					Name:      "pod-1",
-				},
-			},
 			shouldProcessed: false,
 		},
 		{
@@ -119,6 +73,15 @@ func TestShouldProcess(t *testing.T) {
 					Namespace: "velero",
 					Name:      "pvr-1",
 				},
+				Status: velerov1api.PodVolumeRestoreStatus{
+					Phase: velerov1api.PodVolumeRestorePhaseFailed,
+				},
+			},
+			shouldProcessed: false,
+		},
+		{
+			name: "Unable to get pvr's pod should not be processed",
+			obj: &velerov1api.PodVolumeRestore{
 				Spec: velerov1api.PodVolumeRestoreSpec{
 					Pod: corev1api.ObjectReference{
 						Namespace: "ns-1",
@@ -126,13 +89,7 @@ func TestShouldProcess(t *testing.T) {
 					},
 				},
 				Status: velerov1api.PodVolumeRestoreStatus{
-					Phase: velerov1api.PodVolumeRestorePhaseFailed,
-				},
-			},
-			pod: &corev1api.Pod{
-				ObjectMeta: metav1.ObjectMeta{
-					Namespace: "ns-1",
-					Name:      "pod-1",
+					Phase: "",
 				},
 			},
 			shouldProcessed: false,
@@ -244,12 +201,6 @@ func TestShouldProcess(t *testing.T) {
 			shouldProcess, _, _ := c.shouldProcess(ctx, c.logger, ts.obj)
 			require.Equal(t, ts.shouldProcessed, shouldProcess)
 
-			if len(ts.expectedPhase) > 0 {
-				pvr := &velerov1api.PodVolumeRestore{}
-				err := c.Client.Get(ctx, types.NamespacedName{Namespace: ts.obj.Namespace, Name: ts.obj.Name}, pvr)
-				require.Nil(t, err)
-				assert.Equal(t, ts.expectedPhase, pvr.Status.Phase)
-			}
 		})
 	}
 }

View File

@@ -144,12 +144,13 @@ func NewRestoreController(
 				restore := obj.(*api.Restore)
 				switch restore.Status.Phase {
-				case "", api.RestorePhaseNew, api.RestorePhaseInProgress:
+				case "", api.RestorePhaseNew:
+					// only process new restores
 				default:
 					c.logger.WithFields(logrus.Fields{
 						"restore": kubeutil.NamespaceAndName(restore),
 						"phase":   restore.Status.Phase,
-					}).Debug("Restore is not new or in-progress, skipping")
+					}).Debug("Restore is not new, skipping")
 					return
 				}
@@ -201,21 +202,7 @@ func (c *restoreController) processQueueItem(key string) error {
 	// is ("" | New)
 	switch restore.Status.Phase {
 	case "", api.RestorePhaseNew:
-	case api.RestorePhaseInProgress:
-		// A restore may stay in-progress forever because of
-		// 1) the controller restarts during the processing of a restore
-		// 2) the restore with in-progress status isn't updated to completed or failed status successfully
-		// So we try to mark such restores as failed to avoid it
-		updated := restore.DeepCopy()
-		updated.Status.Phase = api.RestorePhaseFailed
-		updated.Status.FailureReason = fmt.Sprintf("got a Restore with unexpected status %q, this may be due to a restart of the controller during the restore, mark it as %q",
-			api.RestorePhaseInProgress, updated.Status.Phase)
-		_, err = patchRestore(restore, updated, c.restoreClient)
-		if err != nil {
-			return errors.Wrapf(err, "error updating Restore status to %s", updated.Status.Phase)
-		}
-		log.Warn(updated.Status.FailureReason)
-		return nil
+		// only process new restores
 	default:
 		return nil
 	}

View File

@@ -20,7 +20,6 @@ import (
 	"bytes"
 	"context"
 	"encoding/json"
-	"fmt"
 	"io/ioutil"
 	"testing"
 	"time"
@@ -171,6 +170,11 @@ func TestProcessQueueItemSkips(t *testing.T) {
 			restoreKey:  "foo/bar",
 			expectError: true,
 		},
+		{
+			name:       "restore with phase InProgress does not get processed",
+			restoreKey: "foo/bar",
+			restore:    builder.ForRestore("foo", "bar").Phase(velerov1api.RestorePhaseInProgress).Result(),
+		},
 		{
 			name:       "restore with phase Completed does not get processed",
 			restoreKey: "foo/bar",
@@ -222,31 +226,6 @@ func TestProcessQueueItemSkips(t *testing.T) {
 	}
 }
 
-func TestMarkInProgressRestoreAsFailed(t *testing.T) {
-	var (
-		restore         = builder.ForRestore("velero", "bar").Phase(velerov1api.RestorePhaseInProgress).Result()
-		client          = fake.NewSimpleClientset(restore)
-		sharedInformers = informers.NewSharedInformerFactory(client, 0)
-		logger          = velerotest.NewLogger()
-	)
-
-	c := restoreController{
-		genericController: newGenericController("restore-test", logger),
-		restoreClient:     client.VeleroV1(),
-		restoreLister:     sharedInformers.Velero().V1().Restores().Lister(),
-	}
-
-	err := sharedInformers.Velero().V1().Restores().Informer().GetStore().Add(restore)
-	require.Nil(t, err)
-
-	err = c.processQueueItem(fmt.Sprintf("%s/%s", restore.Namespace, restore.Name))
-	require.Nil(t, err)
-
-	res, err := c.restoreClient.Restores(restore.Namespace).Get(context.Background(), restore.Name, metav1.GetOptions{})
-	require.Nil(t, err)
-	assert.Equal(t, velerov1api.RestorePhaseFailed, res.Status.Phase)
-}
-
 func TestProcessQueueItem(t *testing.T) {
 	defaultStorageLocation := builder.ForBackupStorageLocation("velero", "default").Provider("myCloud").Bucket("bucket").Result()