diff --git a/.github/workflows/kindIntegTest.yml b/.github/workflows/kindIntegTest.yml
index 4baae2c47..0548447ec 100644
--- a/.github/workflows/kindIntegTest.yml
+++ b/.github/workflows/kindIntegTest.yml
@@ -168,7 +168,7 @@ jobs:
         - webhook_validation
         # Three worker tests:
         - canary_upgrade
-        # - config_change_condition # config_change takes care of testing the same
+        - config_change_condition
         #- cdc_successful # OSS only
         # - delete_node_lost_readiness # DSE specific behavior
         - host_network
diff --git a/.github/workflows/workflow-integration-tests.yaml b/.github/workflows/workflow-integration-tests.yaml
index bb36d1e7e..cdb5b537b 100644
--- a/.github/workflows/workflow-integration-tests.yaml
+++ b/.github/workflows/workflow-integration-tests.yaml
@@ -183,7 +183,7 @@ jobs:
         - webhook_validation
         # Three worker tests:
         # - canary_upgrade # See kind_40_tests job
-        # - config_change_condition # config_change takes care of the same testing
+        - config_change_condition
         # - cdc_successful # CDC is OSS only , see kind_311_tests and kind_40_tests jobs
         # - delete_node_lost_readiness # DSE specific behavior see kind_dse_tests job
         - host_network
diff --git a/pkg/reconciliation/reconcile_racks.go b/pkg/reconciliation/reconcile_racks.go
index bdf48fe94..5592d7a74 100644
--- a/pkg/reconciliation/reconcile_racks.go
+++ b/pkg/reconciliation/reconcile_racks.go
@@ -173,7 +173,7 @@ func (rc *ReconciliationContext) failureModeDetection() bool {
 			continue
 		}
 		if pod.Status.Phase == corev1.PodPending {
-			if hasBeenXMinutes(5, pod.Status.StartTime.Time) {
+			if pod.Status.StartTime == nil || hasBeenXMinutes(5, pod.Status.StartTime.Time) {
 				// Pod has been over 5 minutes in Pending state. This can be normal, but lets see
 				// if we have some detected failures events like FailedScheduling
 				events := &corev1.EventList{}
diff --git a/tests/config_change_condition/config_change_condition_suite_test.go b/tests/config_change_condition/config_change_condition_suite_test.go
index 3a2a26d11..23c3dd869 100644
--- a/tests/config_change_condition/config_change_condition_suite_test.go
+++ b/tests/config_change_condition/config_change_condition_suite_test.go
@@ -18,12 +18,13 @@ import (
 )
 
 var (
-	testName   = "Config change condition"
-	namespace  = "test-config-change-condition"
-	dcName     = "dc2"
-	dcYaml     = "../testdata/default-single-rack-2-node-dc.yaml"
-	dcResource = fmt.Sprintf("CassandraDatacenter/%s", dcName)
-	ns         = ginkgo_util.NewWrapper(testName, namespace)
+	testName    = "Config change condition with failure"
+	namespace   = "test-config-change-condition"
+	dcName      = "dc1"
+	clusterName = "cluster1"
+	dcYaml      = "../testdata/default-three-rack-three-node-dc-zones.yaml"
+	dcResource  = fmt.Sprintf("CassandraDatacenter/%s", dcName)
+	ns          = ginkgo_util.NewWrapper(testName, namespace)
 )
 
 func TestLifecycle(t *testing.T) {
@@ -55,22 +56,33 @@ var _ = Describe(testName, func() {
 
 			ns.WaitForOperatorReady()
 
-			step := "creating a datacenter resource with 1 racks/2 nodes"
+			step := "creating a datacenter resource with 3 racks/3 nodes using unavailable zones"
 			testFile, err := ginkgo_util.CreateTestFile(dcYaml)
 			Expect(err).ToNot(HaveOccurred())
 
 			k := kubectl.ApplyFiles(testFile)
 			ns.ExecAndLog(step, k)
 
-			ns.WaitForDatacenterReady(dcName)
+			// Wait for status to be Unschedulable
+			step = "waiting the nodes to be unschedulable"
+			json := `jsonpath={.status.conditions[?(@.type=="PodScheduled")].status}`
+			k = kubectl.Get(fmt.Sprintf("pod/%s-%s-r1-sts-0", clusterName, dcName)).
+				FormatOutput(json)
+			ns.WaitForOutputContains(k, "False", 30)
+
+			json = `jsonpath={.status.conditions[?(@.type=="PodScheduled")].reason}`
+			k = kubectl.Get(fmt.Sprintf("pod/%s-%s-r1-sts-0", clusterName, dcName)).
+				FormatOutput(json)
+			ns.WaitForOutputContainsAndLog(step, k, "Unschedulable", 30)
 
-			step = "change the config"
-			json := ginkgo_util.CreateTestJson("{\"spec\": {\"config\": {\"cassandra-yaml\": {\"roles_validity\": \"256000ms\"}, \"jvm-server-options\": {\"garbage_collector\": \"CMS\"}}}}")
+			step = "change the config by removing zones"
+			json = `{"spec": { "racks": [{"name": "r1"}, {"name": "r2"}, {"name": "r3"}]}}`
 			k = kubectl.PatchMerge(dcResource, json)
 			ns.ExecAndLog(step, k)
 
 			ns.WaitForDatacenterCondition(dcName, "Updating", string(corev1.ConditionTrue))
 			ns.WaitForDatacenterCondition(dcName, "Updating", string(corev1.ConditionFalse))
+			ns.WaitForDatacenterReady(dcName)
 			ns.WaitForDatacenterOperatorProgress(dcName, "Ready", 1800)
 		})
 	})
diff --git a/tests/testdata/default-three-rack-three-node-dc-zones.yaml b/tests/testdata/default-three-rack-three-node-dc-zones.yaml
new file mode 100644
index 000000000..4f258b3b5
--- /dev/null
+++ b/tests/testdata/default-three-rack-three-node-dc-zones.yaml
@@ -0,0 +1,54 @@
+apiVersion: cassandra.datastax.com/v1beta1
+kind: CassandraDatacenter
+metadata:
+  name: dc1
+spec:
+  clusterName: cluster1
+  serverType: cassandra
+  serverVersion: "5.0.2"
+  managementApiAuth:
+    insecure: {}
+  size: 3
+  storageConfig:
+    cassandraDataVolumeClaimSpec:
+      storageClassName: standard
+      accessModes:
+        - ReadWriteOnce
+      resources:
+        requests:
+          storage: 1Gi
+  racks:
+    - name: r1
+      affinity:
+        nodeAffinity:
+          requiredDuringSchedulingIgnoredDuringExecution:
+            nodeSelectorTerms:
+              - matchExpressions:
+                  - key: topology.kubernetes.io/zone
+                    operator: In
+                    values:
+                      - europe-north1-a
+    - name: r2
+      affinity:
+        nodeAffinity:
+          requiredDuringSchedulingIgnoredDuringExecution:
+            nodeSelectorTerms:
+              - matchExpressions:
+                  - key: topology.kubernetes.io/zone
+                    operator: In
+                    values:
+                      - europe-north1-b
+    - name: r3
+      affinity:
+        nodeAffinity:
+          requiredDuringSchedulingIgnoredDuringExecution:
+            nodeSelectorTerms:
+              - matchExpressions:
+                  - key: topology.kubernetes.io/zone
+                    operator: In
+                    values:
+                      - europe-north1-c
+  config:
+    jvm-options:
+      initial_heap_size: "512m"
+      max_heap_size: "512m"
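
The nil guard added in reconcile_racks.go matters because a Pending pod that was never scheduled (for example, one pinned to a non-existent zone, as in the new test fixture) has no status.startTime, so dereferencing pod.Status.StartTime.Time would panic the reconciler. Below is a minimal standalone sketch of that guard, not part of the patch: shouldInspectEvents and the local hasBeenXMinutes stand-in are illustrative names, re-implemented here only to match the call shape seen in the diff.

package main

import (
	"fmt"
	"time"

	corev1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

// hasBeenXMinutes reports whether t is at least x minutes in the past
// (local stand-in matching the call shape used in the diff).
func hasBeenXMinutes(x int, t time.Time) bool {
	return t.Add(time.Duration(x) * time.Minute).Before(time.Now())
}

// shouldInspectEvents mirrors the guarded check from the diff: a Pending pod
// with a nil StartTime (never scheduled, e.g. Unschedulable) is inspected
// immediately instead of dereferencing a nil pointer.
func shouldInspectEvents(pod *corev1.Pod) bool {
	if pod.Status.Phase != corev1.PodPending {
		return false
	}
	return pod.Status.StartTime == nil || hasBeenXMinutes(5, pod.Status.StartTime.Time)
}

func main() {
	// Pending pod that was never scheduled: StartTime is nil.
	unscheduled := &corev1.Pod{Status: corev1.PodStatus{Phase: corev1.PodPending}}
	// Pending pod that started ten minutes ago.
	started := &corev1.Pod{Status: corev1.PodStatus{
		Phase:     corev1.PodPending,
		StartTime: &metav1.Time{Time: time.Now().Add(-10 * time.Minute)},
	}}
	fmt.Println(shouldInspectEvents(unscheduled), shouldInspectEvents(started)) // true true
}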