From ed2a66098943419dd60092dd9bc1b4e64f93fde1 Mon Sep 17 00:00:00 2001
From: Radovan Zvoncek
Date: Wed, 27 Nov 2024 19:20:27 +0200
Subject: [PATCH] K8OP-294 Do not try to work out backup status if there are no pods

---
 CHANGELOG/CHANGELOG-1.21.md                    |   1 +
 .../medusa/medusabackupjob_controller.go       |   9 ++
 .../medusa/medusabackupjob_controller_test.go  | 112 +++++++++++-----
 3 files changed, 94 insertions(+), 28 deletions(-)

diff --git a/CHANGELOG/CHANGELOG-1.21.md b/CHANGELOG/CHANGELOG-1.21.md
index dcd092818..c7927cc05 100644
--- a/CHANGELOG/CHANGELOG-1.21.md
+++ b/CHANGELOG/CHANGELOG-1.21.md
@@ -18,3 +18,4 @@ When cutting a new release, update the `unreleased` heading to the tag being gen
 * [CHANGE] [#1441](https://github.com/k8ssandra/k8ssandra-operator/issues/1441) Use k8ssandra-client instead of k8ssandra-tools for CRD upgrades
 * [BUGFIX] [#1383](https://github.com/k8ssandra/k8ssandra-operator/issues/1383) Do not create MedusaBackup if MedusaBackupJob did not fully succeed
 * [ENHANCEMENT] [#1667](https://github.com/k8ssandra/k8ssandra/issues/1667) Add `skipSchemaMigration` option to `K8ssandraCluster.spec.reaper`
+* [BUGFIX] [#1454](https://github.com/k8ssandra/k8ssandra-operator/issues/1454) Do not try to work out backup status if there are no pods
diff --git a/controllers/medusa/medusabackupjob_controller.go b/controllers/medusa/medusabackupjob_controller.go
index 36607440f..0ec105c3e 100644
--- a/controllers/medusa/medusabackupjob_controller.go
+++ b/controllers/medusa/medusabackupjob_controller.go
@@ -103,6 +103,10 @@ func (r *MedusaBackupJobReconciler) Reconcile(ctx context.Context, req ctrl.Requ
 		logger.Error(err, "Failed to get datacenter pods")
 		return ctrl.Result{}, err
 	}
+	if len(pods) == 0 {
+		logger.Info("No pods found for datacenter", "CassandraDatacenter", cassdcKey)
+		return ctrl.Result{Requeue: true}, nil
+	}
 
 	// If there is anything in progress, simply requeue the request until each pod has finished or errored
 	if len(backupJob.Status.InProgress) > 0 {
@@ -186,6 +190,11 @@ func (r *MedusaBackupJobReconciler) Reconcile(ctx context.Context, req ctrl.Requ
 			logger.Error(err, "Failed to get backup summary")
 			return ctrl.Result{}, err
 		}
+		if backupSummary == nil {
+			// if a backup is complete, but we fail to find the summary, we're in a non-recoverable situation
+			logger.Info("Backup summary not found", "backupJob", backupJob)
+			return ctrl.Result{Requeue: false}, nil
+		}
 		if err := r.createMedusaBackup(ctx, backupJob, backupSummary, logger); err != nil {
 			logger.Error(err, "Failed to create MedusaBackup")
 			return ctrl.Result{}, err
 		}
diff --git a/controllers/medusa/medusabackupjob_controller_test.go b/controllers/medusa/medusabackupjob_controller_test.go
index e20ce82f8..e6dca4095 100644
--- a/controllers/medusa/medusabackupjob_controller_test.go
+++ b/controllers/medusa/medusabackupjob_controller_test.go
@@ -7,6 +7,7 @@ import (
 	"strings"
 	"sync"
 	"testing"
+	"time"
 
 	cassdcapi "github.com/k8ssandra/cass-operator/apis/cassandra/v1beta1"
 	k8ss "github.com/k8ssandra/k8ssandra-operator/apis/k8ssandra/v1alpha1"
@@ -31,6 +32,7 @@ const (
 	successfulBackupName = "good-backup"
 	failingBackupName    = "bad-backup"
 	missingBackupName    = "missing-backup"
+	backupWithNoPods     = "backup-with-no-pods"
 	dc1PodPrefix         = "192.168.1."
 	dc2PodPrefix         = "192.168.2."
 	fakeBackupFileCount  = int64(13)
@@ -167,6 +169,10 @@ func testMedusaBackupDatacenter(t *testing.T, ctx context.Context, f *framework.
 	backupCreated = createAndVerifyMedusaBackup(dc1Key, dc1, f, ctx, require, t, namespace, missingBackupName)
 	require.False(backupCreated, "the backup object shouldn't have been created")
 
+	// K8OP-294: backups could be attempted on StatefulSets that have no pods
+	backupCreated = createAndVerifyMedusaBackup(dc1Key, dc1, f, ctx, require, t, namespace, backupWithNoPods)
+	require.False(backupCreated, "the backup object shouldn't have been created")
+
 	err = f.DeleteK8ssandraCluster(ctx, client.ObjectKey{Namespace: kc.Namespace, Name: kc.Name}, timeout, interval)
 	require.NoError(err, "failed to delete K8ssandraCluster")
 	verifyObjectDoesNotExist(ctx, t, f, dc1Key, &cassdcapi.CassandraDatacenter{})
@@ -202,13 +208,23 @@ func createAndVerifyMedusaBackup(dcKey framework.ClusterKey, dc *cassdcapi.Cassa
 		}
 	}
 
-	createDatacenterPods(t, f, ctx, dcKey, dc)
+	if backupName != backupWithNoPods {
+		createDatacenterPods(t, f, ctx, dcKey, dc)
+	}
 
 	dcCopy := dc.DeepCopy()
 	dcKeyCopy := framework.NewClusterKey(f.DataPlaneContexts[0], dcKey.Namespace+"-copy", dcKey.Name)
 	dcCopy.ObjectMeta.Namespace = dc.Namespace + "-copy"
-	createDatacenterPods(t, f, ctx, dcKeyCopy, dcCopy)
+	if backupName != backupWithNoPods {
+		createDatacenterPods(t, f, ctx, dcKeyCopy, dcCopy)
+	}
+
+	// clean up the pods to keep the environment reusable
+	if backupName != backupWithNoPods {
+		defer deleteDatacenterPods(t, f, ctx, dcKey, dc)
+		defer deleteDatacenterPods(t, f, ctx, dcKeyCopy, dc)
+	}
 
 	t.Log("creating MedusaBackupJob")
 	backupKey := framework.NewClusterKey(dcKey.K8sContext, dcKey.Namespace, backupName)
@@ -225,32 +241,14 @@
 	err := f.Create(ctx, backupKey, backup)
 	require.NoError(err, "failed to create MedusaBackupJob")
 
-	t.Log("verify that the backups are started")
-	require.Eventually(func() bool {
-		t.Logf("Requested backups: %v", medusaClientFactory.GetRequestedBackups(dc.DatacenterName()))
-		updated := &api.MedusaBackupJob{}
-		err := f.Get(ctx, backupKey, updated)
-		if err != nil {
-			t.Logf("failed to get MedusaBackupJob: %v", err)
-			return false
-		}
-		return !updated.Status.StartTime.IsZero()
-	}, timeout, interval)
-
-	t.Log("verify the backup finished")
-	require.Eventually(func() bool {
-		t.Logf("Requested backups: %v", medusaClientFactory.GetRequestedBackups(dc.DatacenterName()))
-		updated := &api.MedusaBackupJob{}
-		err := f.Get(ctx, backupKey, updated)
-		if err != nil {
-			return false
-		}
-		t.Logf("backup %s finish time: %v", backupName, updated.Status.FinishTime)
-		t.Logf("backup %s failed: %v", backupName, updated.Status.Failed)
-		t.Logf("backup %s finished: %v", backupName, updated.Status.Finished)
-		t.Logf("backup %s in progress: %v", backupName, updated.Status.InProgress)
-		return !updated.Status.FinishTime.IsZero()
-	}, timeout, interval)
+	if backupName != backupWithNoPods {
+		t.Log("verify that the backups start eventually")
+		verifyBackupJobStarted(require.Eventually, t, dc, f, ctx, backupKey)
+		verifyBackupJobFinished(t, require, dc, f, ctx, backupKey, backupName)
+	} else {
+		t.Log("verify that the backups never start")
+		verifyBackupJobStarted(require.Never, t, dc, f, ctx, backupKey)
+	}
 
 	t.Log("check for the MedusaBackup being created")
 	medusaBackupKey := framework.NewClusterKey(dcKey.K8sContext, dcKey.Namespace, backupName)
@@ -258,6 +256,7 @@
 	err = f.Get(ctx, medusaBackupKey, medusaBackup)
 	if err != nil {
 		if errors.IsNotFound(err) {
+			// exit the test if the backup was not created; this happens on purpose for some backup names
 			return false
 		}
 	}
@@ -274,6 +273,43 @@
 	return true
 }
 
+func verifyBackupJobFinished(t *testing.T, require *require.Assertions, dc *cassdcapi.CassandraDatacenter, f *framework.Framework, ctx context.Context, backupKey framework.ClusterKey, backupName string) {
+	t.Log("verify the backup finished")
+	require.Eventually(func() bool {
+		t.Logf("Requested backups: %v", medusaClientFactory.GetRequestedBackups(dc.DatacenterName()))
+		updated := &api.MedusaBackupJob{}
+		err := f.Get(ctx, backupKey, updated)
+		if err != nil {
+			return false
+		}
+		t.Logf("backup %s finish time: %v", backupName, updated.Status.FinishTime)
+		t.Logf("backup %s failed: %v", backupName, updated.Status.Failed)
+		t.Logf("backup %s finished: %v", backupName, updated.Status.Finished)
+		t.Logf("backup %s in progress: %v", backupName, updated.Status.InProgress)
+		return !updated.Status.FinishTime.IsZero()
+	}, timeout, interval)
+}
+
+func verifyBackupJobStarted(
+	verifyFunction func(condition func() bool, waitFor time.Duration, tick time.Duration, msgAndArgs ...interface{}),
+	t *testing.T,
+	dc *cassdcapi.CassandraDatacenter,
+	f *framework.Framework,
+	ctx context.Context,
+	backupKey framework.ClusterKey,
+) {
+	verifyFunction(func() bool {
+		t.Logf("Requested backups: %v", medusaClientFactory.GetRequestedBackups(dc.DatacenterName()))
+		updated := &api.MedusaBackupJob{}
+		err := f.Get(ctx, backupKey, updated)
+		if err != nil {
+			t.Logf("failed to get MedusaBackupJob: %v", err)
+			return false
+		}
+		return !updated.Status.StartTime.IsZero()
+	}, timeout, interval)
+}
+
 func reconcileReplicatedSecret(ctx context.Context, t *testing.T, f *framework.Framework, kc *k8ss.K8ssandraCluster) {
 
 	t.Log("check ReplicatedSecret reconciled")
@@ -488,6 +524,26 @@ func findDatacenterCondition(status *cassdcapi.CassandraDatacenterStatus, condTy
 	return nil
 }
 
+func deleteDatacenterPods(t *testing.T, f *framework.Framework, ctx context.Context, dcKey framework.ClusterKey, dc *cassdcapi.CassandraDatacenter) {
+	for i := int32(0); i < dc.Spec.Size; i++ {
+		pod := &corev1.Pod{}
+		podName := fmt.Sprintf("%s-%s-%d", dc.Spec.ClusterName, dc.DatacenterName(), i)
+		podKey := framework.NewClusterKey(dcKey.K8sContext, dcKey.Namespace, podName)
+		err := f.Get(ctx, podKey, pod)
+		if err != nil {
+			if !errors.IsNotFound(err) {
+				t.Logf("failed to get pod %s: %v", podKey, err)
+			}
+		} else {
+			err = f.Delete(ctx, podKey, pod)
+			if err != nil {
+				t.Logf("failed to delete pod %s: %v", podKey, err)
+			}
+		}
+
+	}
+}
+
 func createDatacenterPods(t *testing.T, f *framework.Framework, ctx context.Context, dcKey framework.ClusterKey, dc *cassdcapi.CassandraDatacenter) {
 	_ = f.CreateNamespace(dcKey.Namespace)
 	for i := int32(0); i < dc.Spec.Size; i++ {
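
As a quick orientation, here is a minimal, self-contained sketch (not part of the patch) of the two early exits the controller change introduces: requeue while the datacenter has no pods, and stop without requeueing when a finished backup has no summary. The names fetchDatacenterPods, lookUpBackupSummary, backupSummary and reconcileBackupJob are hypothetical stand-ins for the operator's real code.

// Illustrative sketch only, not part of the patch: the reconcile guard pattern
// introduced by the fix. fetchDatacenterPods, lookUpBackupSummary and the
// backupSummary type are hypothetical stand-ins for the operator's real code.
package sketch

import (
	"context"

	corev1 "k8s.io/api/core/v1"
	ctrl "sigs.k8s.io/controller-runtime"
)

// backupSummary is a placeholder for the summary Medusa returns for a finished backup.
type backupSummary struct {
	Name string
}

// fetchDatacenterPods stands in for listing the CassandraDatacenter's pods.
func fetchDatacenterPods(ctx context.Context) ([]corev1.Pod, error) {
	return nil, nil
}

// lookUpBackupSummary stands in for asking Medusa for the finished backup's summary.
func lookUpBackupSummary(ctx context.Context) (*backupSummary, error) {
	return nil, nil
}

// reconcileBackupJob shows the two early exits: requeue while there are no pods
// to back up, and stop for good when a completed backup has no summary.
func reconcileBackupJob(ctx context.Context) (ctrl.Result, error) {
	pods, err := fetchDatacenterPods(ctx)
	if err != nil {
		return ctrl.Result{}, err
	}
	if len(pods) == 0 {
		// The StatefulSet has no pods yet; retry on a later reconcile instead of failing.
		return ctrl.Result{Requeue: true}, nil
	}

	summary, err := lookUpBackupSummary(ctx)
	if err != nil {
		return ctrl.Result{}, err
	}
	if summary == nil {
		// A finished backup without a summary is non-recoverable; do not requeue.
		return ctrl.Result{Requeue: false}, nil
	}

	// ... create the MedusaBackup from the summary ...
	return ctrl.Result{}, nil
}

Requeueing rather than erroring keeps the job from surfacing spurious failures while the StatefulSet is still scaling up, and returning without requeueing on a missing summary avoids a hot loop on a state the patch's own comment calls non-recoverable.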