Skip to content

Commit

Permalink
K8OP-294 Do not try to work out backup status if there are no pods
Browse files Browse the repository at this point in the history
  • Loading branch information
rzvoncek committed Nov 28, 2024
1 parent 5940bcc commit ed2a660
Show file tree
Hide file tree
Showing 3 changed files with 94 additions and 28 deletions.
1 change: 1 addition & 0 deletions CHANGELOG/CHANGELOG-1.21.md
Original file line number Diff line number Diff line change
Expand Up @@ -18,3 +18,4 @@ When cutting a new release, update the `unreleased` heading to the tag being gen
* [CHANGE] [#1441](https://github.com/k8ssandra/k8ssandra-operator/issues/1441) Use k8ssandra-client instead of k8ssandra-tools for CRD upgrades
* [BUGFIX] [#1383](https://github.com/k8ssandra/k8ssandra-operator/issues/1383) Do not create MedusaBackup if MedusaBakupJob did not fully succeed
* [ENHANCEMENT] [#1667](https://github.com/k8ssahttps://github.com/k8ssandra/k8ssandra/issues/1667) Add `skipSchemaMigration` option to `K8ssandraCluster.spec.reaper`
* [BUGFIX] [#1454](https://github.com/k8ssandra/k8ssandra-operator/issues/1454) Do not try to work out backup status if there are no pods
9 changes: 9 additions & 0 deletions controllers/medusa/medusabackupjob_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -103,6 +103,10 @@ func (r *MedusaBackupJobReconciler) Reconcile(ctx context.Context, req ctrl.Requ
logger.Error(err, "Failed to get datacenter pods")
return ctrl.Result{}, err
}
if len(pods) == 0 {
logger.Info("No pods found for datacenter", "CassandraDatacenter", cassdcKey)
return ctrl.Result{Requeue: true}, nil
}

// If there is anything in progress, simply requeue the request until each pod has finished or errored
if len(backupJob.Status.InProgress) > 0 {
Expand Down Expand Up @@ -186,6 +190,11 @@ func (r *MedusaBackupJobReconciler) Reconcile(ctx context.Context, req ctrl.Requ
logger.Error(err, "Failed to get backup summary")
return ctrl.Result{}, err
}
if backupSummary == nil {
// if a backup is complete, but we fail to find the summary, we're in a non-recoverable situation
logger.Info("Backup summary not found", "backupJob", backupJob)
return ctrl.Result{Requeue: false}, nil
}
if err := r.createMedusaBackup(ctx, backupJob, backupSummary, logger); err != nil {
logger.Error(err, "Failed to create MedusaBackup")
return ctrl.Result{}, err
Expand Down
112 changes: 84 additions & 28 deletions controllers/medusa/medusabackupjob_controller_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ import (
"strings"
"sync"
"testing"
"time"

cassdcapi "github.com/k8ssandra/cass-operator/apis/cassandra/v1beta1"
k8ss "github.com/k8ssandra/k8ssandra-operator/apis/k8ssandra/v1alpha1"
Expand All @@ -31,6 +32,7 @@ const (
successfulBackupName = "good-backup"
failingBackupName = "bad-backup"
missingBackupName = "missing-backup"
backupWithNoPods = "backup-with-no-pods"
dc1PodPrefix = "192.168.1."
dc2PodPrefix = "192.168.2."
fakeBackupFileCount = int64(13)
Expand Down Expand Up @@ -167,6 +169,10 @@ func testMedusaBackupDatacenter(t *testing.T, ctx context.Context, f *framework.
backupCreated = createAndVerifyMedusaBackup(dc1Key, dc1, f, ctx, require, t, namespace, missingBackupName)
require.False(backupCreated, "the backup object shouldn't have been created")

// in K8OP-294 we found out we can try to make backups on StatefulSets with no pods
backupCreated = createAndVerifyMedusaBackup(dc1Key, dc1, f, ctx, require, t, namespace, backupWithNoPods)
require.False(backupCreated, "the backup object shouldn't have been created")

err = f.DeleteK8ssandraCluster(ctx, client.ObjectKey{Namespace: kc.Namespace, Name: kc.Name}, timeout, interval)
require.NoError(err, "failed to delete K8ssandraCluster")
verifyObjectDoesNotExist(ctx, t, f, dc1Key, &cassdcapi.CassandraDatacenter{})
Expand Down Expand Up @@ -202,13 +208,23 @@ func createAndVerifyMedusaBackup(dcKey framework.ClusterKey, dc *cassdcapi.Cassa
}
}

createDatacenterPods(t, f, ctx, dcKey, dc)
if backupName != backupWithNoPods {
createDatacenterPods(t, f, ctx, dcKey, dc)
}

dcCopy := dc.DeepCopy()
dcKeyCopy := framework.NewClusterKey(f.DataPlaneContexts[0], dcKey.Namespace+"-copy", dcKey.Name)
dcCopy.ObjectMeta.Namespace = dc.Namespace + "-copy"

createDatacenterPods(t, f, ctx, dcKeyCopy, dcCopy)
if backupName != backupWithNoPods {
createDatacenterPods(t, f, ctx, dcKeyCopy, dcCopy)
}

// clean up the pods to keep the environment reusable
if backupName != backupWithNoPods {
defer deleteDatacenterPods(t, f, ctx, dcKey, dc)
defer deleteDatacenterPods(t, f, ctx, dcKeyCopy, dc)
}

t.Log("creating MedusaBackupJob")
backupKey := framework.NewClusterKey(dcKey.K8sContext, dcKey.Namespace, backupName)
Expand All @@ -225,39 +241,22 @@ func createAndVerifyMedusaBackup(dcKey framework.ClusterKey, dc *cassdcapi.Cassa
err := f.Create(ctx, backupKey, backup)
require.NoError(err, "failed to create MedusaBackupJob")

t.Log("verify that the backups are started")
require.Eventually(func() bool {
t.Logf("Requested backups: %v", medusaClientFactory.GetRequestedBackups(dc.DatacenterName()))
updated := &api.MedusaBackupJob{}
err := f.Get(ctx, backupKey, updated)
if err != nil {
t.Logf("failed to get MedusaBackupJob: %v", err)
return false
}
return !updated.Status.StartTime.IsZero()
}, timeout, interval)

t.Log("verify the backup finished")
require.Eventually(func() bool {
t.Logf("Requested backups: %v", medusaClientFactory.GetRequestedBackups(dc.DatacenterName()))
updated := &api.MedusaBackupJob{}
err := f.Get(ctx, backupKey, updated)
if err != nil {
return false
}
t.Logf("backup %s finish time: %v", backupName, updated.Status.FinishTime)
t.Logf("backup %s failed: %v", backupName, updated.Status.Failed)
t.Logf("backup %s finished: %v", backupName, updated.Status.Finished)
t.Logf("backup %s in progress: %v", backupName, updated.Status.InProgress)
return !updated.Status.FinishTime.IsZero()
}, timeout, interval)
if backupName != backupWithNoPods {
t.Log("verify that the backups start eventually")
verifyBackupJobStarted(require.Eventually, t, dc, f, ctx, backupKey)
verifyBackupJobFinished(t, require, dc, f, ctx, backupKey, backupName)
} else {
t.Log("verify that the backups never start")
verifyBackupJobStarted(require.Never, t, dc, f, ctx, backupKey)
}

t.Log("check for the MedusaBackup being created")
medusaBackupKey := framework.NewClusterKey(dcKey.K8sContext, dcKey.Namespace, backupName)
medusaBackup := &api.MedusaBackup{}
err = f.Get(ctx, medusaBackupKey, medusaBackup)
if err != nil {
if errors.IsNotFound(err) {
// exit the test if the backup was not created. this happens for some backup names on purpose
return false
}
}
Expand All @@ -274,6 +273,43 @@ func createAndVerifyMedusaBackup(dcKey framework.ClusterKey, dc *cassdcapi.Cassa
return true
}

func verifyBackupJobFinished(t *testing.T, require *require.Assertions, dc *cassdcapi.CassandraDatacenter, f *framework.Framework, ctx context.Context, backupKey framework.ClusterKey, backupName string) {
t.Log("verify the backup finished")
require.Eventually(func() bool {
t.Logf("Requested backups: %v", medusaClientFactory.GetRequestedBackups(dc.DatacenterName()))
updated := &api.MedusaBackupJob{}
err := f.Get(ctx, backupKey, updated)
if err != nil {
return false
}
t.Logf("backup %s finish time: %v", backupName, updated.Status.FinishTime)
t.Logf("backup %s failed: %v", backupName, updated.Status.Failed)
t.Logf("backup %s finished: %v", backupName, updated.Status.Finished)
t.Logf("backup %s in progress: %v", backupName, updated.Status.InProgress)
return !updated.Status.FinishTime.IsZero()
}, timeout, interval)
}

func verifyBackupJobStarted(
verifyFunction func(condition func() bool, waitFor time.Duration, tick time.Duration, msgAndArgs ...interface{}),
t *testing.T,
dc *cassdcapi.CassandraDatacenter,
f *framework.Framework,
ctx context.Context,
backupKey framework.ClusterKey,
) {
verifyFunction(func() bool {
t.Logf("Requested backups: %v", medusaClientFactory.GetRequestedBackups(dc.DatacenterName()))
updated := &api.MedusaBackupJob{}
err := f.Get(ctx, backupKey, updated)
if err != nil {
t.Logf("failed to get MedusaBackupJob: %v", err)
return false
}
return !updated.Status.StartTime.IsZero()
}, timeout, interval)
}

func reconcileReplicatedSecret(ctx context.Context, t *testing.T, f *framework.Framework, kc *k8ss.K8ssandraCluster) {
t.Log("check ReplicatedSecret reconciled")

Expand Down Expand Up @@ -488,6 +524,26 @@ func findDatacenterCondition(status *cassdcapi.CassandraDatacenterStatus, condTy
return nil
}

func deleteDatacenterPods(t *testing.T, f *framework.Framework, ctx context.Context, dcKey framework.ClusterKey, dc *cassdcapi.CassandraDatacenter) {
for i := int32(0); i < dc.Spec.Size; i++ {
pod := &corev1.Pod{}
podName := fmt.Sprintf("%s-%s-%d", dc.Spec.ClusterName, dc.DatacenterName(), i)
podKey := framework.NewClusterKey(dcKey.K8sContext, dcKey.Namespace, podName)
err := f.Get(ctx, podKey, pod)
if err != nil {
if !errors.IsNotFound(err) {
t.Logf("failed to get pod %s: %v", podKey, err)
}
} else {
err = f.Delete(ctx, podKey, pod)
if err != nil {
t.Logf("failed to delete pod %s: %v", podKey, err)
}
}

}
}

func createDatacenterPods(t *testing.T, f *framework.Framework, ctx context.Context, dcKey framework.ClusterKey, dc *cassdcapi.CassandraDatacenter) {
_ = f.CreateNamespace(dcKey.Namespace)
for i := int32(0); i < dc.Spec.Size; i++ {
Expand Down

0 comments on commit ed2a660

Please sign in to comment.