Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix HA cluster upgrades #798

Merged
merged 26 commits into from
Jul 24, 2024
Merged
Show file tree
Hide file tree
Changes from 18 commits
Commits
Show all changes
26 commits
Select commit Hold shift + click to select a range
b89504e
test upgrading an online HA cluster
laverya Jul 19, 2024
4edde54
ha airgap upgrade too
laverya Jul 19, 2024
73b229a
run multinode airgap HA upgrade on large runner
laverya Jul 19, 2024
d47d573
Merge remote-tracking branch 'origin/main' into laverya/sc-109059/mul…
laverya Jul 20, 2024
a735140
create seaweedfs service in join command
laverya Jul 22, 2024
ae9ce6d
Merge remote-tracking branch 'origin/main' into laverya/sc-109059/mul…
laverya Jul 22, 2024
3d11ad3
refactor tests
laverya Jul 22, 2024
77d20bd
test fixes
laverya Jul 22, 2024
ba2cf1e
test more OSes to track down CA error
laverya Jul 22, 2024
e47dff3
operator ca certs?
laverya Jul 23, 2024
9c6e5a3
is this just a timeout issue for HA upgrades
laverya Jul 23, 2024
e971095
move secret creation to join command
laverya Jul 24, 2024
1dea4e2
fix unit tests
laverya Jul 24, 2024
ce48344
test if failures are due to resources
laverya Jul 24, 2024
938da95
remove nodes _after_ completing ha upgrade
laverya Jul 24, 2024
8200447
bump operator, fix airgap node reset
laverya Jul 24, 2024
3a05821
reset node 0
laverya Jul 24, 2024
6301541
upgrade HA app after recovery
laverya Jul 24, 2024
7ed3171
return to working node resets
laverya Jul 24, 2024
6fe8d6d
Merge remote-tracking branch 'origin/main' into laverya/sc-109059/mul…
laverya Jul 24, 2024
7467ef9
feat: update embeddedclusteroperator version (#825)
replicated-ci Jul 24, 2024
0b4dcf8
reenable all CI tests
laverya Jul 24, 2024
f7b0807
keep key the same
laverya Jul 24, 2024
c666404
operator update version 3
laverya Jul 24, 2024
43bdedb
revert operator busybox image name
laverya Jul 24, 2024
5ba23fa
remove duplicate test runs
laverya Jul 24, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
68 changes: 35 additions & 33 deletions .github/workflows/ci.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -243,47 +243,49 @@ jobs:
- TestSingleNodeInstallationDebian11
- TestSingleNodeInstallationDebian12
- TestSingleNodeInstallationCentos9Stream
- TestVersion
- TestHostPreflight
- TestUnsupportedOverrides
- TestMultiNodeInstallation
- TestMultiNodeReset
- TestCommandsRequireSudo
- TestInstallWithoutEmbed
# - TestVersion
# - TestHostPreflight
# - TestUnsupportedOverrides
# - TestMultiNodeInstallation
# - TestMultiNodeReset
# - TestCommandsRequireSudo
# - TestInstallWithoutEmbed
- TestInstallFromReplicatedApp
- TestResetAndReinstall
- TestResetAndReinstallAirgap
- TestCollectSupportBundle
- TestOldVersionUpgrade
- TestMaterialize
- TestLocalArtifactMirror
- TestSingleNodeAirgapUpgrade
- TestSingleNodeAirgapUpgradeCustomCIDR
- TestInstallSnapshotFromReplicatedApp
- TestMultiNodeAirgapUpgrade
- TestSingleNodeDisasterRecovery
- TestSingleNodeDisasterRecoveryWithProxy
- TestSingleNodeResumeDisasterRecovery
- TestSingleNodeAirgapDisasterRecovery
# - TestResetAndReinstall
# - TestResetAndReinstallAirgap
# - TestCollectSupportBundle
# - TestOldVersionUpgrade
# - TestMaterialize
# - TestLocalArtifactMirror
# - TestSingleNodeAirgapUpgrade
# - TestSingleNodeAirgapUpgradeCustomCIDR
# - TestInstallSnapshotFromReplicatedApp
# - TestMultiNodeAirgapUpgrade
# - TestSingleNodeDisasterRecovery
# - TestSingleNodeDisasterRecoveryWithProxy
# - TestSingleNodeResumeDisasterRecovery
# - TestSingleNodeAirgapDisasterRecovery
# - TestProxiedEnvironment
- TestMultiNodeHAInstallation
- TestMultiNodeAirgapHAInstallation
- TestProxiedEnvironment
- TestMultiNodeHADisasterRecovery
- TestMultiNodeAirgapHADisasterRecovery
- TestCustomCIDR
- TestProxiedCustomCIDR
- TestSingleNodeInstallationNoopUpgrade
# - TestCustomCIDR
# - TestProxiedCustomCIDR
# - TestSingleNodeInstallationNoopUpgrade
include:
- test: TestMultiNodeAirgapUpgrade
runner: embedded-cluster
- test: TestMultiNodeAirgapUpgradeSameK0s
runner: embedded-cluster
# - test: TestMultiNodeAirgapUpgrade
# runner: embedded-cluster
# - test: TestMultiNodeAirgapUpgradeSameK0s
# runner: embedded-cluster
# - test: TestMultiNodeAirgapHAInstallation
# runner: embedded-cluster
# - test: TestMultiNodeAirgapHADisasterRecovery
# runner: embedded-cluster
# - test: TestSingleNodeAirgapDisasterRecovery
# runner: embedded-cluster
- test: TestMultiNodeAirgapHAInstallation
runner: embedded-cluster
- test: TestMultiNodeAirgapHADisasterRecovery
runner: embedded-cluster
- test: TestSingleNodeAirgapDisasterRecovery
runner: embedded-cluster
steps:
- name: Checkout
uses: actions/checkout@v4
Expand Down
3 changes: 1 addition & 2 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -16,12 +16,11 @@ ADMIN_CONSOLE_CHART_REPO_OVERRIDE =
ADMIN_CONSOLE_IMAGE_OVERRIDE =
ADMIN_CONSOLE_MIGRATIONS_IMAGE_OVERRIDE =
ADMIN_CONSOLE_KURL_PROXY_IMAGE_OVERRIDE =
EMBEDDED_OPERATOR_IMAGE_OVERRIDE =
EMBEDDED_OPERATOR_IMAGE_OVERRIDE = ttl.sh/embedded-cluster-operator-image:dev-caf6235
EMBEDDED_OPERATOR_BINARY_URL_OVERRIDE =
EMBEDDED_OPERATOR_UTILS_IMAGE ?= replicated/embedded-cluster-utils
EMBEDDED_OPERATOR_UTILS_IMAGE_VERSION ?= $(subst +,-,$(VERSION))
EMBEDDED_OPERATOR_UTILS_IMAGE_LOCATION = proxy.replicated.com/anonymous/$(EMBEDDED_OPERATOR_UTILS_IMAGE):$(EMBEDDED_OPERATOR_UTILS_IMAGE_VERSION)
EMBEDDED_CLUSTER_OPERATOR_IMAGE_OVERRIDE =
KUBECTL_VERSION = v1.30.1
K0S_VERSION = v1.29.6+k0s.0
K0S_GO_VERSION = v1.29.6+k0s.0
Expand Down
50 changes: 3 additions & 47 deletions cmd/embedded-cluster/join.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,17 +17,15 @@ import (
"github.com/sirupsen/logrus"
"github.com/urfave/cli/v2"
"gopkg.in/yaml.v2"
corev1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/errors"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/types"
"sigs.k8s.io/controller-runtime/pkg/client"
k8syaml "sigs.k8s.io/yaml"

"github.com/replicatedhq/embedded-cluster/pkg/airgap"
"github.com/replicatedhq/embedded-cluster/pkg/config"
"github.com/replicatedhq/embedded-cluster/pkg/defaults"
"github.com/replicatedhq/embedded-cluster/pkg/helpers"
"github.com/replicatedhq/embedded-cluster/pkg/highavailability"
"github.com/replicatedhq/embedded-cluster/pkg/kubeutils"
"github.com/replicatedhq/embedded-cluster/pkg/metrics"
"github.com/replicatedhq/embedded-cluster/pkg/prompts"
Expand Down Expand Up @@ -458,7 +456,7 @@ func waitForNode(ctx context.Context, kcli client.Client, hostname string) error
}

func maybeEnableHA(ctx context.Context, kcli client.Client) error {
canEnableHA, err := canEnableHA(ctx, kcli)
canEnableHA, err := highavailability.CanEnableHA(ctx, kcli)
if err != nil {
return fmt.Errorf("unable to check if HA can be enabled: %w", err)
}
Expand All @@ -473,47 +471,5 @@ func maybeEnableHA(ctx context.Context, kcli client.Client) error {
return nil
}
logrus.Info("")
return enableHA(ctx, kcli)
}

// canEnableHA reports whether high availability can be turned on for the
// cluster. It answers false when the latest installation already has
// HighAvailability set, when the restore state configmap exists in the
// "embedded-cluster" namespace, or when fewer than three control plane nodes
// are counted. Any lookup failure other than a missing configmap is returned
// as a wrapped error.
func canEnableHA(ctx context.Context, kcli client.Client) (bool, error) {
	latest, err := kubeutils.GetLatestInstallation(ctx, kcli)
	if err != nil {
		return false, fmt.Errorf("unable to get latest installation: %w", err)
	}
	if latest.Spec.HighAvailability {
		// Already enabled, so there is nothing left to turn on.
		return false, nil
	}

	restoreKey := types.NamespacedName{Name: ecRestoreStateCMName, Namespace: "embedded-cluster"}
	switch err := kcli.Get(ctx, restoreKey, &corev1.ConfigMap{}); {
	case err == nil:
		return false, nil // cannot enable HA during a restore
	case !errors.IsNotFound(err):
		return false, fmt.Errorf("unable to get restore state configmap: %w", err)
	}

	controllers, err := kubeutils.NumOfControlPlaneNodes(ctx, kcli)
	if err != nil {
		return false, fmt.Errorf("unable to check control plane nodes: %w", err)
	}
	if controllers < 3 {
		return false, nil
	}
	return true, nil
}

// enableHA enables high availability in the installation object
// and waits for the migration to be complete.
func enableHA(ctx context.Context, kcli client.Client) error {
loading := spinner.Start()
defer loading.Close()
loading.Infof("Enabling high availability")
in, err := kubeutils.GetLatestInstallation(ctx, kcli)
if err != nil {
return fmt.Errorf("unable to get latest installation: %w", err)
}
in.Spec.HighAvailability = true
if err := kcli.Update(ctx, in); err != nil {
return fmt.Errorf("unable to update installation: %w", err)
}
if err := kubeutils.WaitForHAInstallation(ctx, kcli); err != nil {
return fmt.Errorf("unable to wait for ha installation: %w", err)
}
loading.Infof("High availability enabled!")
return nil
return highavailability.EnableHA(ctx, kcli)
}
175 changes: 0 additions & 175 deletions cmd/embedded-cluster/join_test.go
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
package main

import (
"context"
"embed"
"fmt"
"os"
Expand All @@ -15,11 +14,6 @@ import (
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
"gopkg.in/yaml.v3"
corev1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/client-go/kubernetes/scheme"
"sigs.k8s.io/controller-runtime/pkg/client"
"sigs.k8s.io/controller-runtime/pkg/client/fake"
k8syaml "sigs.k8s.io/yaml"
)

Expand Down Expand Up @@ -131,172 +125,3 @@ func TestJoinCommandResponseOverrides(t *testing.T) {
})
}
}

// Test_canEnableHA runs canEnableHA against fake clients seeded with an
// Installation, a set of nodes and, in one case, the restore state configmap,
// and compares the returned boolean with the expected value for each case.
func Test_canEnableHA(t *testing.T) {
	// Use a distinct local name so the imported scheme package is not shadowed.
	sch := scheme.Scheme
	// A failed registration would make every fake client below misbehave in
	// confusing ways, so fail the test immediately instead of ignoring it.
	require.NoError(t, embeddedclusterv1beta1.AddToScheme(sch))
	controllerLabels := map[string]string{"node-role.kubernetes.io/control-plane": "true"}
	type args struct {
		kcli client.Client
	}
	tests := []struct {
		name    string
		args    args
		want    bool
		wantErr bool
	}{
		{
			name: "high availability is not enabled and there is three or more controller nodes",
			args: args{
				kcli: fake.NewClientBuilder().WithScheme(sch).WithObjects(
					&embeddedclusterv1beta1.Installation{
						ObjectMeta: metav1.ObjectMeta{Name: "test-installation"},
						Spec:       embeddedclusterv1beta1.InstallationSpec{HighAvailability: false},
					},
					&corev1.Node{ObjectMeta: metav1.ObjectMeta{Name: "node1", Labels: controllerLabels}},
					&corev1.Node{ObjectMeta: metav1.ObjectMeta{Name: "node2", Labels: controllerLabels}},
					&corev1.Node{ObjectMeta: metav1.ObjectMeta{Name: "node3", Labels: controllerLabels}},
				).Build(),
			},
			want: true,
		},
		{
			name: "high availability is not enabled and there is not three or more controller nodes",
			args: args{
				kcli: fake.NewClientBuilder().WithScheme(sch).WithObjects(
					&embeddedclusterv1beta1.Installation{
						ObjectMeta: metav1.ObjectMeta{Name: "test-installation"},
						Spec:       embeddedclusterv1beta1.InstallationSpec{HighAvailability: false},
					},
					&corev1.Node{ObjectMeta: metav1.ObjectMeta{Name: "node1", Labels: controllerLabels}},
					&corev1.Node{ObjectMeta: metav1.ObjectMeta{Name: "node2", Labels: controllerLabels}},
					// node3 carries no control plane label, leaving only two controllers.
					&corev1.Node{ObjectMeta: metav1.ObjectMeta{Name: "node3"}},
				).Build(),
			},
			want: false,
		},
		{
			name: "high availability is already enabled",
			args: args{
				kcli: fake.NewClientBuilder().WithScheme(sch).WithObjects(
					&embeddedclusterv1beta1.Installation{
						ObjectMeta: metav1.ObjectMeta{Name: "test-installation"},
						Spec:       embeddedclusterv1beta1.InstallationSpec{HighAvailability: true},
					},
					&corev1.Node{ObjectMeta: metav1.ObjectMeta{Name: "node1", Labels: controllerLabels}},
					&corev1.Node{ObjectMeta: metav1.ObjectMeta{Name: "node2", Labels: controllerLabels}},
					&corev1.Node{ObjectMeta: metav1.ObjectMeta{Name: "node3", Labels: controllerLabels}},
				).Build(),
			},
			want: false,
		},
		{
			name: "high availability is not enabled and there is three or more controller nodes but a restore is in progress",
			args: args{
				kcli: fake.NewClientBuilder().WithScheme(sch).WithObjects(
					&embeddedclusterv1beta1.Installation{
						ObjectMeta: metav1.ObjectMeta{Name: "test-installation"},
						Spec:       embeddedclusterv1beta1.InstallationSpec{HighAvailability: false},
					},
					// The presence of this configmap marks a restore in progress.
					&corev1.ConfigMap{
						ObjectMeta: metav1.ObjectMeta{Name: ecRestoreStateCMName, Namespace: "embedded-cluster"},
					},
					&corev1.Node{ObjectMeta: metav1.ObjectMeta{Name: "node1", Labels: controllerLabels}},
					&corev1.Node{ObjectMeta: metav1.ObjectMeta{Name: "node2", Labels: controllerLabels}},
					&corev1.Node{ObjectMeta: metav1.ObjectMeta{Name: "node3", Labels: controllerLabels}},
				).Build(),
			},
			want: false,
		},
	}
	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			req := require.New(t)
			ctx := context.Background()
			got, err := canEnableHA(ctx, tt.args.kcli)
			if tt.wantErr {
				req.Error(err)
				return
			}
			req.NoError(err)
			req.Equal(tt.want, got)
		})
	}
}

// Test_enableHA runs enableHA against fake clients seeded with an Installation
// whose status already carries a true "HighAvailability" condition, once for an
// airgap installation and once for an online one, and then confirms that
// Spec.HighAvailability was set on the stored object.
func Test_enableHA(t *testing.T) {
	// Use a distinct local name so the imported scheme package is not shadowed.
	sch := scheme.Scheme
	// A failed registration would make the fake clients below unusable, so
	// fail the test immediately instead of ignoring the returned error.
	require.NoError(t, embeddedclusterv1beta1.AddToScheme(sch))
	type args struct {
		kcli client.Client
	}
	tests := []struct {
		name    string
		args    args
		wantErr bool
	}{
		{
			name: "happy path airgap",
			args: args{
				kcli: fake.NewClientBuilder().WithScheme(sch).WithObjects(
					&embeddedclusterv1beta1.Installation{
						ObjectMeta: metav1.ObjectMeta{Name: "test-installation"},
						Spec: embeddedclusterv1beta1.InstallationSpec{
							HighAvailability: false,
							AirGap:           true,
						},
						Status: embeddedclusterv1beta1.InstallationStatus{
							Conditions: []metav1.Condition{
								{
									Type:   "HighAvailability",
									Status: metav1.ConditionTrue,
								},
							},
							State: embeddedclusterv1beta1.InstallationStateInstalled,
						},
					},
				).Build(),
			},
		},
		{
			name: "happy path online",
			args: args{
				kcli: fake.NewClientBuilder().WithScheme(sch).WithObjects(
					&embeddedclusterv1beta1.Installation{
						ObjectMeta: metav1.ObjectMeta{Name: "test-installation"},
						Spec: embeddedclusterv1beta1.InstallationSpec{
							HighAvailability: false,
							AirGap:           false,
						},
						Status: embeddedclusterv1beta1.InstallationStatus{
							Conditions: []metav1.Condition{
								{
									Type:   "HighAvailability",
									Status: metav1.ConditionTrue,
								},
							},
							State: embeddedclusterv1beta1.InstallationStateInstalled,
						},
					},
				).Build(),
			},
		},
	}
	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			req := require.New(t)
			ctx := context.Background()
			err := enableHA(ctx, tt.args.kcli)
			if tt.wantErr {
				req.Error(err)
				return
			}
			req.NoError(err)
			// validate that high availability is enabled
			var installation embeddedclusterv1beta1.Installation
			err = tt.args.kcli.Get(ctx, client.ObjectKey{Name: "test-installation"}, &installation)
			req.NoError(err)
			req.True(installation.Spec.HighAvailability)
		})
	}
}
Loading
Loading