Skip to content

Commit

Permalink
K8OP-5 Do not create MedusaBackup if MadusaBakupJob did not fully suc…
Browse files Browse the repository at this point in the history
…ceed
  • Loading branch information
rzvoncek committed Oct 17, 2024
1 parent f3111e7 commit 1323971
Show file tree
Hide file tree
Showing 4 changed files with 95 additions and 61 deletions.
2 changes: 1 addition & 1 deletion CHANGELOG/CHANGELOG-1.21.md
Original file line number Diff line number Diff line change
Expand Up @@ -15,4 +15,4 @@ When cutting a new release, update the `unreleased` heading to the tag being gen

## unreleased


* [BUGFIX] [#1383](https://github.com/k8ssandra/k8ssandra-operator/issues/1383) Do not create MedusaBackup if MadusaBakupJob did not fully succeed
85 changes: 46 additions & 39 deletions controllers/medusa/medusabackupjob_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -72,9 +72,9 @@ func (r *MedusaBackupJobReconciler) Reconcile(ctx context.Context, req ctrl.Requ
return ctrl.Result{}, err
}

backup := instance.DeepCopy()
backupJob := instance.DeepCopy()

cassdcKey := types.NamespacedName{Namespace: backup.Namespace, Name: backup.Spec.CassandraDatacenter}
cassdcKey := types.NamespacedName{Namespace: backupJob.Namespace, Name: backupJob.Spec.CassandraDatacenter}
cassdc := &cassdcapi.CassandraDatacenter{}
err = r.Get(ctx, cassdcKey, cassdc)
if err != nil {
Expand All @@ -83,12 +83,12 @@ func (r *MedusaBackupJobReconciler) Reconcile(ctx context.Context, req ctrl.Requ
}

// Set an owner reference on the backup job so that it can be cleaned up when the cassandra datacenter is deleted
if backup.OwnerReferences == nil {
if err = controllerutil.SetControllerReference(cassdc, backup, r.Scheme); err != nil {
if backupJob.OwnerReferences == nil {
if err = controllerutil.SetControllerReference(cassdc, backupJob, r.Scheme); err != nil {
logger.Error(err, "failed to set controller reference", "CassandraDatacenter", cassdcKey)
return ctrl.Result{}, err
}
if err = r.Update(ctx, backup); err != nil {
if err = r.Update(ctx, backupJob); err != nil {
logger.Error(err, "failed to update MedusaBackupJob with owner reference", "CassandraDatacenter", cassdcKey)
return ctrl.Result{}, err
} else {
Expand All @@ -104,36 +104,36 @@ func (r *MedusaBackupJobReconciler) Reconcile(ctx context.Context, req ctrl.Requ
}

// If there is anything in progress, simply requeue the request until each pod has finished or errored
if len(backup.Status.InProgress) > 0 {
if len(backupJob.Status.InProgress) > 0 {
logger.Info("There are backups in progress, checking them..")
progress := make([]string, 0, len(backup.Status.InProgress))
patch := client.MergeFrom(backup.DeepCopy())
progress := make([]string, 0, len(backupJob.Status.InProgress))
patch := client.MergeFrom(backupJob.DeepCopy())

StatusCheck:
for _, podName := range backup.Status.InProgress {
for _, podName := range backupJob.Status.InProgress {
for _, pod := range pods {
if podName == pod.Name {
status, err := backupStatus(ctx, backup.ObjectMeta.Name, &pod, r.ClientFactory, logger)
status, err := backupStatus(ctx, backupJob.ObjectMeta.Name, &pod, r.ClientFactory, logger)
if err != nil {
return ctrl.Result{}, err
}

if status == medusa.StatusType_IN_PROGRESS {
progress = append(progress, podName)
} else if status == medusa.StatusType_SUCCESS {
backup.Status.Finished = append(backup.Status.Finished, podName)
backupJob.Status.Finished = append(backupJob.Status.Finished, podName)
} else if status == medusa.StatusType_FAILED || status == medusa.StatusType_UNKNOWN {
backup.Status.Failed = append(backup.Status.Failed, podName)
backupJob.Status.Failed = append(backupJob.Status.Failed, podName)
}

continue StatusCheck
}
}
}

if len(backup.Status.InProgress) != len(progress) {
backup.Status.InProgress = progress
if err := r.Status().Patch(ctx, backup, patch); err != nil {
if len(backupJob.Status.InProgress) != len(progress) {
backupJob.Status.InProgress = progress
if err := r.Status().Patch(ctx, backupJob, patch); err != nil {
logger.Error(err, "failed to patch status")
return ctrl.Result{}, err
}
Expand All @@ -147,42 +147,49 @@ func (r *MedusaBackupJobReconciler) Reconcile(ctx context.Context, req ctrl.Requ
}

// If the backup is already finished, there is nothing to do.
if medusaBackupFinished(backup) {
if medusaBackupFinished(backupJob) {
logger.Info("Backup operation is already finished")
return ctrl.Result{Requeue: false}, nil
}

// First check to see if the backup is already in progress
if !backup.Status.StartTime.IsZero() {
if !backupJob.Status.StartTime.IsZero() {
// If there is anything in progress, simply requeue the request
if len(backup.Status.InProgress) > 0 {
if len(backupJob.Status.InProgress) > 0 {
logger.Info("Backup is still in progress")
return ctrl.Result{RequeueAfter: r.DefaultDelay}, nil
}

// there is nothing in progress, so the job is finished (not yet sure if successfully)
// Regardless of the success, we set the job finish time
// Note that the time here is not accurate, but that is ok. For now we are just
// using it as a completion marker.
patch := client.MergeFrom(backupJob.DeepCopy())
backupJob.Status.FinishTime = metav1.Now()
if err := r.Status().Patch(ctx, backupJob, patch); err != nil {
logger.Error(err, "failed to patch status with finish time")
return ctrl.Result{}, err
}

// if there are failures, we will end here and not proceed with creating a backup object
if len(backupJob.Status.Failed) > 0 {
logger.Info("Backup failed on some nodes", "BackupName", backupJob.Name, "Failed", backupJob.Status.Failed)
return ctrl.Result{Requeue: false}, nil
}

logger.Info("backup complete")

// The MedusaBackupJob is finished and we now need to create the MedusaBackup object.
backupSummary, err := r.getBackupSummary(ctx, backup, pods, logger)
// The MedusaBackupJob is finished successfully and we now need to create the MedusaBackup object.
backupSummary, err := r.getBackupSummary(ctx, backupJob, pods, logger)
if err != nil {
logger.Error(err, "Failed to get backup summary")
return ctrl.Result{}, err
}
if err := r.createMedusaBackup(ctx, backup, backupSummary, logger); err != nil {
if err := r.createMedusaBackup(ctx, backupJob, backupSummary, logger); err != nil {
logger.Error(err, "Failed to create MedusaBackup")
return ctrl.Result{}, err
}

// Set the finish time
// Note that the time here is not accurate, but that is ok. For now we are just
// using it as a completion marker.
patch := client.MergeFrom(backup.DeepCopy())
backup.Status.FinishTime = metav1.Now()
if err := r.Status().Patch(ctx, backup, patch); err != nil {
logger.Error(err, "failed to patch status with finish time")
return ctrl.Result{}, err
}

return ctrl.Result{Requeue: false}, nil
}

Expand All @@ -195,31 +202,31 @@ func (r *MedusaBackupJobReconciler) Reconcile(ctx context.Context, req ctrl.Requ
return ctrl.Result{}, medusa.BackupSidecarNotFound
}

patch := client.MergeFromWithOptions(backup.DeepCopy(), client.MergeFromWithOptimisticLock{})
patch := client.MergeFromWithOptions(backupJob.DeepCopy(), client.MergeFromWithOptimisticLock{})

backup.Status.StartTime = metav1.Now()
backupJob.Status.StartTime = metav1.Now()

if err := r.Status().Patch(ctx, backup, patch); err != nil {
if err := r.Status().Patch(ctx, backupJob, patch); err != nil {
logger.Error(err, "Failed to patch status")
// We received a stale object, requeue for next processing
return ctrl.Result{RequeueAfter: r.DefaultDelay}, nil
}

logger.Info("Starting backups")
patch = client.MergeFrom(backup.DeepCopy())
patch = client.MergeFrom(backupJob.DeepCopy())

for _, p := range pods {
logger.Info("starting backup", "CassandraPod", p.Name)
_, err := doMedusaBackup(ctx, backup.ObjectMeta.Name, backup.Spec.Type, &p, r.ClientFactory, logger)
_, err := doMedusaBackup(ctx, backupJob.ObjectMeta.Name, backupJob.Spec.Type, &p, r.ClientFactory, logger)
if err != nil {
logger.Error(err, "backup failed", "CassandraPod", p.Name)
}

backup.Status.InProgress = append(backup.Status.InProgress, p.Name)
backupJob.Status.InProgress = append(backupJob.Status.InProgress, p.Name)
}
// logger.Info("finished backup operations")
if err := r.Status().Patch(context.Background(), backup, patch); err != nil {
logger.Error(err, "failed to patch status", "Backup", fmt.Sprintf("%s/%s", backup.Name, backup.Namespace))
if err := r.Status().Patch(context.Background(), backupJob, patch); err != nil {
logger.Error(err, "failed to patch status", "Backup", fmt.Sprintf("%s/%s", backupJob.Name, backupJob.Namespace))
}

return ctrl.Result{RequeueAfter: r.DefaultDelay}, nil
Expand Down
61 changes: 44 additions & 17 deletions controllers/medusa/medusabackupjob_controller_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -26,15 +26,20 @@ import (
)

const (
medusaImageRepo = "test/medusa"
cassandraUserSecret = "medusa-secret"
defaultBackupName = "backup1"
dc1PodPrefix = "192.168.1."
dc2PodPrefix = "192.168.2."
fakeBackupFileCount = int64(13)
fakeBackupByteSize = int64(42)
fakeBackupHumanSize = "42.00 B"
fakeMaxBackupCount = 1
medusaImageRepo = "test/medusa"
cassandraUserSecret = "medusa-secret"
successfulBackupName = "good-backup"
failingBackupName = "bad-backup"
dc1PodPrefix = "192.168.1."
dc2PodPrefix = "192.168.2."
fakeBackupFileCount = int64(13)
fakeBackupByteSize = int64(42)
fakeBackupHumanSize = "42.00 B"
fakeMaxBackupCount = 1
)

var (
alreadyReportedFailingBackup = false
)

func testMedusaBackupDatacenter(t *testing.T, ctx context.Context, f *framework.Framework, namespace string) {
Expand Down Expand Up @@ -142,16 +147,19 @@ func testMedusaBackupDatacenter(t *testing.T, ctx context.Context, f *framework.
})
require.NoError(err, "failed to update dc1 status to ready")

backupCreated := createAndVerifyMedusaBackup(dc1Key, dc1, f, ctx, require, t, namespace, defaultBackupName)
backupCreated := createAndVerifyMedusaBackup(dc1Key, dc1, f, ctx, require, t, namespace, successfulBackupName)
require.True(backupCreated, "failed to create backup")

t.Log("verify that medusa gRPC clients are invoked")
require.Equal(map[string][]string{
fmt.Sprintf("%s:%d", getPodIpAddress(0, dc1.DatacenterName()), shared.BackupSidecarPort): {defaultBackupName},
fmt.Sprintf("%s:%d", getPodIpAddress(1, dc1.DatacenterName()), shared.BackupSidecarPort): {defaultBackupName},
fmt.Sprintf("%s:%d", getPodIpAddress(2, dc1.DatacenterName()), shared.BackupSidecarPort): {defaultBackupName},
fmt.Sprintf("%s:%d", getPodIpAddress(0, dc1.DatacenterName()), shared.BackupSidecarPort): {successfulBackupName},
fmt.Sprintf("%s:%d", getPodIpAddress(1, dc1.DatacenterName()), shared.BackupSidecarPort): {successfulBackupName},
fmt.Sprintf("%s:%d", getPodIpAddress(2, dc1.DatacenterName()), shared.BackupSidecarPort): {successfulBackupName},
}, medusaClientFactory.GetRequestedBackups(dc1.DatacenterName()))

backupCreated = createAndVerifyMedusaBackup(dc1Key, dc1, f, ctx, require, t, namespace, failingBackupName)
require.False(backupCreated, "the backup object shouldn't have been created")

err = f.DeleteK8ssandraCluster(ctx, client.ObjectKey{Namespace: kc.Namespace, Name: kc.Name}, timeout, interval)
require.NoError(err, "failed to delete K8ssandraCluster")
verifyObjectDoesNotExist(ctx, t, f, dc1Key, &cassdcapi.CassandraDatacenter{})
Expand Down Expand Up @@ -231,16 +239,22 @@ func createAndVerifyMedusaBackup(dcKey framework.ClusterKey, dc *cassdcapi.Cassa
return false
}
t.Logf("backup finish time: %v", updated.Status.FinishTime)
t.Logf("backup failed: %v", updated.Status.Failed)
t.Logf("backup finished: %v", updated.Status.Finished)
t.Logf("backup in progress: %v", updated.Status.InProgress)
return !updated.Status.FinishTime.IsZero() && len(updated.Status.Finished) == 3 && len(updated.Status.InProgress) == 0
return !updated.Status.FinishTime.IsZero() // && len(updated.Status.Finished) == 3 && len(updated.Status.InProgress) == 0
}, timeout, interval)

t.Log("verify that the MedusaBackup is created")
t.Log("check for the MedusaBackup being created")
medusaBackupKey := framework.NewClusterKey(dcKey.K8sContext, dcKey.Namespace, backupName)
medusaBackup := &api.MedusaBackup{}
err = f.Get(ctx, medusaBackupKey, medusaBackup)
require.NoError(err, "failed to get MedusaBackup")
if err != nil {
if errors.IsNotFound(err) {
return false
}
}
t.Log("verify the MedusaBackup is correct")
require.Equal(medusaBackup.Status.TotalNodes, dc.Spec.Size, "backup total nodes doesn't match dc nodes")
require.Equal(medusaBackup.Status.FinishedNodes, dc.Spec.Size, "backup finished nodes doesn't match dc nodes")
require.Equal(len(medusaBackup.Status.Nodes), int(dc.Spec.Size), "backup topology doesn't match dc topology")
Expand Down Expand Up @@ -387,8 +401,21 @@ func (c *fakeMedusaClient) GetBackups(ctx context.Context) ([]*medusa.BackupSumm
}

func (c *fakeMedusaClient) BackupStatus(ctx context.Context, name string) (*medusa.BackupStatusResponse, error) {
var status medusa.StatusType
if name == successfulBackupName || strings.HasPrefix(successfulBackupName, name) {
status = medusa.StatusType_SUCCESS
} else if name == failingBackupName {
if !alreadyReportedFailingBackup {
status = medusa.StatusType_FAILED
alreadyReportedFailingBackup = true
} else {
status = medusa.StatusType_SUCCESS
}
} else {
status = medusa.StatusType_IN_PROGRESS
}
return &medusa.BackupStatusResponse{
Status: medusa.StatusType_SUCCESS,
Status: status,
}, nil
}

Expand Down
8 changes: 4 additions & 4 deletions controllers/medusa/medusatask_controller_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,10 +16,10 @@ import (
)

const (
backup1 = "backup1"
backup2 = "backup2"
backup3 = "backup3"
backup4 = "backup4"
backup1 = "good-backup1"
backup2 = "good-backup2"
backup3 = "good-backup3"
backup4 = "good-backup4"
)

func testMedusaTasks(t *testing.T, ctx context.Context, f *framework.Framework, namespace string) {
Expand Down

0 comments on commit 1323971

Please sign in to comment.