diff --git a/doc/rollback-cluster.md b/doc/rollback-cluster.md
new file mode 100644
index 000000000..d499643c8
--- /dev/null
+++ b/doc/rollback-cluster.md
@@ -0,0 +1,184 @@
+# Pravega Cluster Rollback
+
+This document details how a manual rollback can be triggered after a Pravega cluster upgrade fails.
+Note that a rollback can be triggered only on an Upgrade Failure.
+
+## Upgrade Failure
+
+An upgrade can fail for any of the following reasons:
+
+1. Incorrect configuration (wrong quota, permissions, limit ranges)
+2. Network issues (ImagePullError)
+3. K8s cluster issues
+4. Application issues (application runtime misconfiguration or code bugs)
+
+An upgrade failure can manifest as a Pod staying in `Pending` state indefinitely, or continuously restarting or crashing (`CrashLoopBackOff`).
+A component deployment failure needs to be detected and mapped to an "Upgrade Failure" for the Pravega cluster.
+Here we try to fail fast by explicitly checking for some common causes of deployment failure, such as image pull errors or a `CrashLoopBackOff` state, and failing the upgrade if any pod runs into such a state during the upgrade.
+
+The following Pravega cluster status condition indicates a failed upgrade:
+
+```
+ClusterConditionType: Error
+Status: True
+Reason: UpgradeFailed
+Message:
+```
+After an Upgrade Failure, the output of `kubectl describe pravegacluster pravega` would look like this:
+
+```
+$> kubectl describe pravegacluster pravega
+. . .
+Spec:
+. . .
+  Version: 0.6.0-2252.b6f6512
+. . .
+Status:
+. . .
+  Conditions:
+    Last Transition Time:  2019-09-06T09:00:13Z
+    Last Update Time:      2019-09-06T09:00:13Z
+    Status:                False
+    Type:                  Upgrading
+    Last Transition Time:  2019-09-06T08:58:40Z
+    Last Update Time:      2019-09-06T08:58:40Z
+    Status:                False
+    Type:                  PodsReady
+    Last Transition Time:  2019-09-06T09:00:13Z
+    Last Update Time:      2019-09-06T09:00:13Z
+    Message:               failed to sync segmentstore version. pod pravega-pravega-segmentstore-0 update failed because of ImagePullBackOff
+    Reason:                UpgradeFailed
+    Status:                True
+    Type:                  Error
+  . . .
+  Current Version:  0.6.0-2239.6e24df7
+. . .
+Version History:
+  0.6.0-2239.6e24df7
+```
+where `0.6.0-2252.b6f6512` is the version we tried to upgrade to and `0.6.0-2239.6e24df7` is the cluster version prior to triggering the upgrade.
+
+## Manual Rollback Trigger
+
+A rollback is triggered when a Pravega cluster is in the `UpgradeFailed` error state and a user manually updates the `version` field in the PravegaCluster spec to point to the last stable cluster version.
+
+A rollback involves moving all components in the cluster back to the last stable cluster version. As with upgrades, the operator rolls back one component at a time and one pod at a time to preserve high availability.
+
+Note:
+1. Only a rollback to the last stable cluster version is supported at this point.
+2. Changing the cluster spec version to the previous cluster version when the cluster is not in the `UpgradeFailed` state will not trigger a rollback.
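+
+For example, assuming the cluster resource is named `pravega` (as in the examples above) and the last stable version is `0.6.0-2239.6e24df7` (illustrative values), the failed state can be confirmed and the rollback triggered with `kubectl`; editing the resource with `kubectl edit` works just as well:
+
+```
+# Confirm that the cluster is in the UpgradeFailed error state
+$> kubectl get pravegacluster pravega -o jsonpath='{.status.conditions[?(@.type=="Error")].reason}'
+UpgradeFailed
+
+# Point spec.version back to the last stable version to trigger the rollback
+$> kubectl patch pravegacluster pravega --type merge -p '{"spec":{"version":"0.6.0-2239.6e24df7"}}'
+```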
+
+## Rollback Implementation
+
+When a rollback is triggered, the cluster moves into the cluster condition `RollbackInProgress`.
+Once the rollback completes, this condition is set to `false`.
+
+During a rollback, the cluster status should look something like:
+```
+$> kubectl describe pravegacluster pravega
+. . .
+Status:
+  Conditions:
+    Last Transition Time:  2019-09-20T10:41:10Z
+    Last Update Time:      2019-09-20T10:41:10Z
+    Status:                False
+    Type:                  Upgrading
+    Last Transition Time:  2019-09-20T10:45:12Z
+    Last Update Time:      2019-09-20T10:45:12Z
+    Status:                True
+    Type:                  PodsReady
+    Last Transition Time:  2019-09-20T10:41:10Z
+    Last Update Time:      2019-09-20T10:41:10Z
+    Message:               failed to sync segmentstore version. pod pravega-pravega-segmentstore-0 update failed because of ImagePullBackOff
+    Reason:                UpgradeFailed
+    Status:                True
+    Type:                  Error
+    Last Update Time:      2019-09-20T10:45:12Z
+    Message:               1
+    Reason:                Updating Bookkeeper
+    Status:                True
+    Type:                  RollbackInProgress
+. . .
+```
+Here, the `RollbackInProgress` condition being `true` indicates that a rollback is in progress.
+The `Reason` and `Message` fields of this condition indicate the component currently being rolled back and the number of updated replicas, respectively.
+
+The operator rolls back components in the reverse of the upgrade order:
+
+1. Pravega Controller
+2. Pravega Segment Store
+3. BookKeeper
+
+A `versionHistory` field in the PravegaCluster status maintains the history of upgrades.
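+
+For example, the recorded versions can be inspected with a `jsonpath` query (assuming the cluster resource is named `pravega`; the output shown is illustrative):
+
+```
+$> kubectl get pravegacluster pravega -o jsonpath='{.status.versionHistory}'
+["0.6.0-2239.6e24df7"]
+```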
+
+## Rollback Outcome
+
+### Success
+If the rollback completes successfully, the cluster state goes back to the `PodsReady` condition, which means the cluster is in a stable state again. All other conditions should be `false`:
+```
+Last Transition Time: 2019-09-20T09:49:26Z
+Last Update Time: 2019-09-20T09:49:26Z
+Status: True
+Type: PodsReady
+```
+
+Example:
+```
+Status:
+  Conditions:
+    Last Transition Time:  2019-09-20T10:12:04Z
+    Last Update Time:      2019-09-20T10:12:04Z
+    Status:                False
+    Type:                  Upgrading
+    Last Transition Time:  2019-09-20T10:11:34Z
+    Last Update Time:      2019-09-20T10:11:34Z
+    Status:                True
+    Type:                  PodsReady
+    Last Transition Time:  2019-09-20T10:07:19Z
+    Last Update Time:      2019-09-20T10:07:19Z
+    Status:                False
+    Type:                  Error
+    Last Transition Time:  2019-09-20T09:50:57Z
+    Last Update Time:      2019-09-20T09:50:57Z
+    Status:                False
+    Type:                  RollbackInProgress
+```
+
+### Failure
+If the rollback fails, the cluster moves to the `Error` state, indicated by this cluster condition:
+```
+ClusterConditionType: Error
+Status: True
+Reason: RollbackFailed
+Message:
+```
+
+Example:
+```
+Status:
+  Conditions:
+    Last Transition Time:  2019-09-20T09:46:24Z
+    Last Update Time:      2019-09-20T09:46:24Z
+    Status:                False
+    Type:                  Upgrading
+    Last Transition Time:  2019-09-20T09:49:26Z
+    Last Update Time:      2019-09-20T09:49:26Z
+    Status:                False
+    Type:                  PodsReady
+    Last Transition Time:  2019-09-20T09:46:24Z
+    Last Update Time:      2019-09-20T09:50:57Z
+    Message:               failed to sync bookkeeper version. pod pravega-bookie-0 update failed because of ImagePullBackOff
+    Reason:                RollbackFailed
+    Status:                True
+    Type:                  Error
+    Last Transition Time:  2019-09-20T09:50:57Z
+    Last Update Time:      2019-09-20T09:50:57Z
+    Status:                False
+    Type:                  RollbackInProgress
+```
+
+When a rollback failure happens, manual intervention is required to resolve it.
+After finding and fixing the root cause of the failure, a user can bring the cluster back to a stable state by upgrading to:
+1. The version to which the user initially intended to upgrade (when the upgrade failure was noticed), or
+2. Any other supported target version, given the versions currently running on all pods in the cluster.
diff --git a/doc/upgrade-cluster.md b/doc/upgrade-cluster.md
index 8e98f2692..1d9c1fcfb 100644
--- a/doc/upgrade-cluster.md
+++ b/doc/upgrade-cluster.md
@@ -20,8 +20,6 @@ Check out [Pravega documentation](http://pravega.io/docs/latest/) for more infor
 
 ## Pending tasks
 
-- The rollback mechanism is on the roadmap but not implemented yet. Check out [this issue](https://github.com/pravega/pravega-operator/issues/153).
-- Manual recovery from an upgrade is possible but it has not been defined yet. Check out [this issue](https://github.com/pravega/pravega-operator/issues/157).
 - There is no validation of the configured desired version. Check out [this issue](https://github.com/pravega/pravega-operator/issues/156)
 
@@ -35,6 +33,19 @@ NAME      VERSION   DESIRED MEMBERS   READY MEMBERS   AGE
 example   0.4.0     7                 7                 11m
 ```
 
+## Upgrade Path Matrix
+
+| BASE VERSION | TARGET VERSION      |
+| ------------ | ------------------- |
+| 0.1.0        | 0.1.0               |
+| 0.2.0        | 0.2.0               |
+| 0.3.0        | 0.3.0, 0.3.1, 0.3.2 |
+| 0.3.1        | 0.3.1, 0.3.2        |
+| 0.3.2        | 0.3.2               |
+| 0.4.0        | 0.4.0               |
+| 0.5.0        | 0.5.0, 0.6.0        |
+| 0.6.0        | 0.6.0               |
+
 ## Trigger an upgrade
 
 To initiate an upgrade process, a user has to update the `spec.version` field on the `PravegaCluster` custom resource. This can be done in three different ways using the `kubectl` command.
@@ -103,8 +114,7 @@ Segment Store instances need access to a persistent volume to store the cache.
 
 Also, Segment Store pods need to be individually accessed by clients, so having a stable network identifier provided by the Statefulset and a headless service is very convenient.
 
-Same as Bookkeeper, we use `OnDelete` strategy for Segment Store. The reason that we don't use `RollingUpdate` strategy here is that we found it convenient to manage the upgrade
-and rollback in the same fashion. Using `RollingUpdate` will introduce Kubernetes rollback mechanism which will cause trouble to our implementation.
+Same as Bookkeeper, we use `OnDelete` strategy for Segment Store. The reason that we don't use `RollingUpdate` strategy here is that we found it convenient to manage the upgrade and rollback in the same fashion. Using `RollingUpdate` will introduce Kubernetes rollback mechanism which will cause trouble to our implementation.
 
 ### Pravega Controller upgrade
 
@@ -131,7 +141,30 @@ NAME      VERSION   DESIRED MEMBERS   READY MEMBERS   AGE
 example   0.5.0     8                 8                 1h
 ```
 
-If your upgrade has failed, you can describe the status section of your Pravega cluster to discover why.
+The command `kubectl describe` can be used to track the progress of the upgrade.
+```
+$ kubectl describe PravegaCluster example
+...
+Status:
+  Conditions:
+    Status:                True
+    Type:                  Upgrading
+    Reason:                Updating BookKeeper
+    Message:               1
+    Last Transition Time:  2019-04-01T19:42:37+02:00
+    Last Update Time:      2019-04-01T19:42:37+02:00
+    Status:                False
+    Type:                  PodsReady
+    Last Transition Time:  2019-04-01T19:43:08+02:00
+    Last Update Time:      2019-04-01T19:43:08+02:00
+    Status:                False
+    Type:                  Error
+...
+
+```
+The `Reason` field in the `Upgrading` condition shows the component currently being upgraded, and the `Message` field reflects the number of successfully upgraded replicas in that component.
+
+If the upgrade has failed, check the `Status` section to understand the reason for the failure.
 
 ```
 $ kubectl describe PravegaCluster example
@@ -181,10 +214,10 @@ INFO[5899] Reconciling PravegaCluster default/example
 INFO[5900] statefulset (example-bookie) status: 1 updated, 2 ready, 3 target
 INFO[5929] Reconciling PravegaCluster default/example
 INFO[5930] statefulset (example-bookie) status: 1 updated, 2 ready, 3 target
-INFO[5930] error syncing cluster version, need manual intervention. failed to sync bookkeeper version. pod example-bookie-0 is restarting
+INFO[5930] error syncing cluster version, upgrade failed. failed to sync bookkeeper version. pod example-bookie-0 is restarting
 ...
 ```
 
 ### Recovering from a failed upgrade
 
-Not defined yet. Check [this issue](https://github.com/pravega/pravega-operator/issues/157) for tracking.
+See [Rollback](rollback-cluster.md)
diff --git a/pkg/apis/pravega/v1alpha1/status.go b/pkg/apis/pravega/v1alpha1/status.go
index 0e26b4997..572145cd3 100644
--- a/pkg/apis/pravega/v1alpha1/status.go
+++ b/pkg/apis/pravega/v1alpha1/status.go
@@ -11,6 +11,7 @@ package v1alpha1
 
 import (
+	"log"
 	"time"
 
 	corev1 "k8s.io/api/core/v1"
@@ -21,12 +22,13 @@ type ClusterConditionType string
 
 const (
 	ClusterConditionPodsReady ClusterConditionType = "PodsReady"
 	ClusterConditionUpgrading                      = "Upgrading"
+	ClusterConditionRollback                       = "RollbackInProgress"
 	ClusterConditionError                          = "Error"
 
 	// Reasons for cluster upgrading condition
-	UpgradingControllerReason   = "UpgradingController"
-	UpgradingSegmentstoreReason = "UpgradingSegmentstore"
-	UpgradingBookkeeperReason   = "UpgradingBookkeeper"
+	UpdatingControllerReason   = "Updating Controller"
+	UpdatingSegmentstoreReason = "Updating Segmentstore"
+	UpdatingBookkeeperReason   = "Updating Bookkeeper"
 )
 
 // ClusterStatus defines the observed state of PravegaCluster
@@ -41,6 +43,8 @@ type ClusterStatus struct {
 	// If the cluster is not upgrading, TargetVersion is empty.
TargetVersion string `json:"targetVersion,omitempty"` + VersionHistory []string `json:"versionHistory,omitempty"` + // Replicas is the number of desired replicas in the cluster Replicas int32 `json:"replicas"` @@ -83,7 +87,8 @@ type ClusterCondition struct { LastTransitionTime string `json:"lastTransitionTime,omitempty"` } -func (ps *ClusterStatus) InitConditions() { +func (ps *ClusterStatus) Init() { + // Initialise conditions conditionTypes := []ClusterConditionType{ ClusterConditionPodsReady, ClusterConditionUpgrading, @@ -95,6 +100,12 @@ func (ps *ClusterStatus) InitConditions() { ps.setClusterCondition(*c) } } + + // Set current cluster version in version history, + // so if the first upgrade fails we can rollback to this version + if ps.VersionHistory == nil && ps.CurrentVersion != "" { + ps.VersionHistory = []string{ps.CurrentVersion} + } } func (ps *ClusterStatus) SetPodsReadyConditionTrue() { @@ -127,6 +138,15 @@ func (ps *ClusterStatus) SetErrorConditionFalse() { ps.setClusterCondition(*c) } +func (ps *ClusterStatus) SetRollbackConditionTrue(reason, message string) { + c := newClusterCondition(ClusterConditionRollback, corev1.ConditionTrue, reason, message) + ps.setClusterCondition(*c) +} +func (ps *ClusterStatus) SetRollbackConditionFalse() { + c := newClusterCondition(ClusterConditionRollback, corev1.ConditionFalse, "", "") + ps.setClusterCondition(*c) +} + func newClusterCondition(condType ClusterConditionType, status corev1.ConditionStatus, reason, message string) *ClusterCondition { return &ClusterCondition{ Type: condType, @@ -170,3 +190,88 @@ func (ps *ClusterStatus) setClusterCondition(newCondition ClusterCondition) { ps.Conditions[position] = *existingCondition } + +func (ps *ClusterStatus) AddToVersionHistory(version string) { + lastIndex := len(ps.VersionHistory) - 1 + if version != "" && ps.VersionHistory[lastIndex] != version { + ps.VersionHistory = append(ps.VersionHistory, version) + log.Printf("Updating version history adding version %v", version) + } +} + +func (ps *ClusterStatus) GetLastVersion() (previousVersion string) { + len := len(ps.VersionHistory) + return ps.VersionHistory[len-1] +} + +func (ps *ClusterStatus) IsClusterInUpgradeFailedState() bool { + _, errorCondition := ps.GetClusterCondition(ClusterConditionError) + if errorCondition == nil { + return false + } + if errorCondition.Status == corev1.ConditionTrue && errorCondition.Reason == "UpgradeFailed" { + return true + } + return false +} + +func (ps *ClusterStatus) IsClusterInUpgradeFailedOrRollbackState() bool { + if ps.IsClusterInUpgradeFailedState() || ps.IsClusterInRollbackState() { + return true + } + return false +} + +func (ps *ClusterStatus) IsClusterInRollbackState() bool { + _, rollbackCondition := ps.GetClusterCondition(ClusterConditionRollback) + if rollbackCondition == nil { + return false + } + if rollbackCondition.Status == corev1.ConditionTrue { + return true + } + return false +} + +func (ps *ClusterStatus) IsClusterInUpgradingState() bool { + _, upgradeCondition := ps.GetClusterCondition(ClusterConditionUpgrading) + if upgradeCondition == nil { + return false + } + if upgradeCondition.Status == corev1.ConditionTrue { + return true + } + return false +} + +func (ps *ClusterStatus) IsClusterInRollbackFailedState() bool { + _, errorCondition := ps.GetClusterCondition(ClusterConditionError) + if errorCondition == nil { + return false + } + if errorCondition.Status == corev1.ConditionTrue && errorCondition.Reason == "RollbackFailed" { + return true + } + return false +} + +func (ps 
*ClusterStatus) UpdateProgress(reason, updatedReplicas string) { + if ps.IsClusterInUpgradingState() { + // Set the upgrade condition reason to be UpgradingBookkeeperReason, message to be 0 + ps.SetUpgradingConditionTrue(reason, updatedReplicas) + } else { + ps.SetRollbackConditionTrue(reason, updatedReplicas) + } +} + +func (ps *ClusterStatus) GetLastCondition() (lastCondition *ClusterCondition) { + if ps.IsClusterInUpgradingState() { + _, lastCondition := ps.GetClusterCondition(ClusterConditionUpgrading) + return lastCondition + } else if ps.IsClusterInRollbackState() { + _, lastCondition := ps.GetClusterCondition(ClusterConditionRollback) + return lastCondition + } + // nothing to do if we are neither upgrading nor rolling back, + return nil +} diff --git a/pkg/controller/pravegacluster/pravegacluster_controller.go b/pkg/controller/pravegacluster/pravegacluster_controller.go index ce2fbcfbf..68143149b 100644 --- a/pkg/controller/pravegacluster/pravegacluster_controller.go +++ b/pkg/controller/pravegacluster/pravegacluster_controller.go @@ -138,11 +138,18 @@ func (r *ReconcilePravegaCluster) run(p *pravegav1alpha1.PravegaCluster) (err er return fmt.Errorf("failed to sync cluster size: %v", err) } + // Upgrade err = r.syncClusterVersion(p) if err != nil { return fmt.Errorf("failed to sync cluster version: %v", err) } + // Rollback + err = r.rollbackFailedUpgrade(p) + if err != nil { + return fmt.Errorf("Rollback attempt failed: %v", err) + } + err = r.reconcileClusterStatus(p) if err != nil { return fmt.Errorf("failed to reconcile cluster status: %v", err) @@ -168,10 +175,12 @@ func (r *ReconcilePravegaCluster) deployCluster(p *pravegav1alpha1.PravegaCluste log.Printf("failed to deploy segment store: %v", err) return err } + return nil } func (r *ReconcilePravegaCluster) deployController(p *pravegav1alpha1.PravegaCluster) (err error) { + pdb := pravega.MakeControllerPodDisruptionBudget(p) controllerutil.SetControllerReference(p, pdb, r.scheme) err = r.client.Create(context.TODO(), pdb) @@ -251,6 +260,7 @@ func (r *ReconcilePravegaCluster) deploySegmentStore(p *pravegav1alpha1.PravegaC } func (r *ReconcilePravegaCluster) deployBookie(p *pravegav1alpha1.PravegaCluster) (err error) { + headlessService := pravega.MakeBookieHeadlessService(p) controllerutil.SetControllerReference(p, headlessService, r.scheme) err = r.client.Create(context.TODO(), headlessService) @@ -439,7 +449,7 @@ func (r *ReconcilePravegaCluster) syncStatefulSetPvc(sts *appsv1.StatefulSet) er func (r *ReconcilePravegaCluster) reconcileClusterStatus(p *pravegav1alpha1.PravegaCluster) error { - p.Status.InitConditions() + p.Status.Init() expectedSize := util.GetClusterExpectedSize(p) listOps := &client.ListOptions{ @@ -483,3 +493,21 @@ func (r *ReconcilePravegaCluster) reconcileClusterStatus(p *pravegav1alpha1.Prav } return nil } + +func (r *ReconcilePravegaCluster) rollbackFailedUpgrade(p *pravegav1alpha1.PravegaCluster) error { + if r.isRollbackTriggered(p) { + // start rollback to previous version + previousVersion := p.Status.GetLastVersion() + log.Printf("Rolling back to last cluster version %v", previousVersion) + //Rollback cluster to previous version + return r.rollbackClusterVersion(p, previousVersion) + } + return nil +} + +func (r *ReconcilePravegaCluster) isRollbackTriggered(p *pravegav1alpha1.PravegaCluster) bool { + if p.Status.IsClusterInUpgradeFailedState() && p.Spec.Version == p.Status.GetLastVersion() { + return true + } + return false +} diff --git a/pkg/controller/pravegacluster/upgrade.go 
b/pkg/controller/pravegacluster/upgrade.go index 894daf928..533dedf02 100644 --- a/pkg/controller/pravegacluster/upgrade.go +++ b/pkg/controller/pravegacluster/upgrade.go @@ -33,11 +33,17 @@ type componentSyncVersionFun struct { fun func(p *pravegav1alpha1.PravegaCluster) (synced bool, err error) } +// upgrade func (r *ReconcilePravegaCluster) syncClusterVersion(p *pravegav1alpha1.PravegaCluster) (err error) { defer func() { r.client.Status().Update(context.TODO(), p) }() + // we cannot upgrade if cluster is in UpgradeFailed or Rollback state + if p.Status.IsClusterInUpgradeFailedOrRollbackState() { + return nil + } + _, upgradeCondition := p.Status.GetClusterCondition(pravegav1alpha1.ClusterConditionUpgrading) _, readyCondition := p.Status.GetClusterCondition(pravegav1alpha1.ClusterConditionPodsReady) @@ -51,7 +57,6 @@ func (r *ReconcilePravegaCluster) syncClusterVersion(p *pravegav1alpha1.PravegaC if upgradeCondition.Status == corev1.ConditionTrue { // Upgrade process already in progress - if p.Status.TargetVersion == "" { log.Println("syncing to an unknown version: cancelling upgrade process") return r.clearUpgradeStatus(p) @@ -62,31 +67,43 @@ func (r *ReconcilePravegaCluster) syncClusterVersion(p *pravegav1alpha1.PravegaC return r.clearUpgradeStatus(p) } - if err := r.syncComponentsVersion(p); err != nil { - log.Printf("error syncing cluster version, need manual intervention. %v", err) - // TODO: Trigger roll back to previous version + syncCompleted, err := r.syncComponentsVersion(p) + if err != nil { + log.Printf("error syncing cluster version, upgrade failed. %v", err) p.Status.SetErrorConditionTrue("UpgradeFailed", err.Error()) r.clearUpgradeStatus(p) + return err + } + + if syncCompleted { + // All component versions have been synced + p.Status.AddToVersionHistory(p.Status.TargetVersion) + p.Status.CurrentVersion = p.Status.TargetVersion + log.Printf("Upgrade completed for all pravega components.") } return nil } // No upgrade in progress - if p.Spec.Version == p.Status.CurrentVersion { // No intention to upgrade return nil } - if readyCondition == nil || readyCondition.Status != corev1.ConditionTrue { - r.clearUpgradeStatus(p) - log.Print("cannot trigger upgrade if there are unready pods") - return nil + if !p.Status.IsClusterInRollbackFailedState() { + // skip this check when cluster is in RollbackFailed state + if readyCondition == nil || readyCondition.Status != corev1.ConditionTrue { + r.clearUpgradeStatus(p) + log.Print("cannot trigger upgrade if there are unready pods") + return nil + } + } else { + // We are upgrading after a rollback failure, reset Error Status + p.Status.SetErrorConditionFalse() } // Need to sync cluster versions log.Printf("syncing cluster version from %s to %s", p.Status.CurrentVersion, p.Spec.Version) - // Setting target version and condition. 
// The upgrade process will start on the next reconciliation p.Status.TargetVersion = p.Spec.Version @@ -98,11 +115,10 @@ func (r *ReconcilePravegaCluster) syncClusterVersion(p *pravegav1alpha1.PravegaC func (r *ReconcilePravegaCluster) clearUpgradeStatus(p *pravegav1alpha1.PravegaCluster) (err error) { p.Status.SetUpgradingConditionFalse() p.Status.TargetVersion = "" - // need to deep copy the status struct, otherwise it will be overridden + // need to deep copy the status struct, otherwise it will be overwritten // when updating the CR below status := p.Status.DeepCopy() - p.Spec.Version = p.Status.CurrentVersion if err := r.client.Update(context.TODO(), p); err != nil { return err } @@ -111,10 +127,66 @@ func (r *ReconcilePravegaCluster) clearUpgradeStatus(p *pravegav1alpha1.PravegaC return nil } -func (r *ReconcilePravegaCluster) syncComponentsVersion(p *pravegav1alpha1.PravegaCluster) (err error) { - var synced bool +func (r *ReconcilePravegaCluster) rollbackClusterVersion(p *pravegav1alpha1.PravegaCluster, version string) (err error) { + defer func() { + r.client.Status().Update(context.TODO(), p) + }() + _, rollbackCondition := p.Status.GetClusterCondition(pravegav1alpha1.ClusterConditionRollback) + if rollbackCondition == nil || rollbackCondition.Status != corev1.ConditionTrue { + // We're in the first iteration for Rollback + // Add Rollback Condition to Cluster Status + log.Printf("Updating Target Version to %v", version) + p.Status.TargetVersion = version + p.Status.SetRollbackConditionTrue("", "") + updateErr := r.client.Status().Update(context.TODO(), p) + if updateErr != nil { + p.Status.SetRollbackConditionFalse() + log.Printf("Error updating cluster: %v", updateErr.Error()) + return fmt.Errorf("Error updating cluster status. %v", updateErr) + } + return nil + } + + syncCompleted, err := r.syncComponentsVersion(p) + if err != nil { + // Error rolling back, set appropriate status and ask for manual intervention + p.Status.SetErrorConditionTrue("RollbackFailed", err.Error()) + r.clearRollbackStatus(p) + log.Printf("Error rolling back to cluster version %v. 
Reason: %v", version, err) + //r.client.Status().Update(context.TODO(), p) + return err + } + + if syncCompleted { + // All component versions have been synced + p.Status.CurrentVersion = p.Status.TargetVersion + // Set Error/UpgradeFailed Condition to 'false', so rollback is not triggered again + p.Status.SetErrorConditionFalse() + r.clearRollbackStatus(p) + log.Printf("Rollback to version %v completed for all pravega components.", version) + } + //r.client.Status().Update(context.TODO(), p) + return nil +} + +func (r *ReconcilePravegaCluster) clearRollbackStatus(p *pravegav1alpha1.PravegaCluster) (err error) { + log.Printf("clearRollbackStatus") + p.Status.SetRollbackConditionFalse() + p.Status.TargetVersion = "" + // need to deep copy the status struct, otherwise it will be overwritten + // when updating the CR below + status := p.Status.DeepCopy() - for _, component := range []componentSyncVersionFun{ + if err := r.client.Update(context.TODO(), p); err != nil { + return err + } + + p.Status = *status + return nil +} + +func (r *ReconcilePravegaCluster) syncComponentsVersion(p *pravegav1alpha1.PravegaCluster) (synced bool, err error) { + componentSyncFuncs := []componentSyncVersionFun{ componentSyncVersionFun{ name: "bookkeeper", fun: r.syncBookkeeperVersion, @@ -127,24 +199,44 @@ func (r *ReconcilePravegaCluster) syncComponentsVersion(p *pravegav1alpha1.Prave name: "controller", fun: r.syncControllerVersion, }, - } { - synced, err = component.fun(p) - if err != nil { - return fmt.Errorf("failed to sync %s version. %s", component.name, err) - } + } - if synced { - log.Printf("%s version sync has been completed", component.name) - } else { - // component version sync is still in progress - // Do not continue with the next component until this one is done - return nil + if p.Status.IsClusterInRollbackState() { + startIndex := len(componentSyncFuncs) - 1 + // update components in reverse order + for i := startIndex; i >= 0; i-- { + log.Printf("Rollback: syncing component %v", i) + component := componentSyncFuncs[i] + synced, err := r.syncComponent(component, p) + if !synced { + return synced, err + } + } + } else { + for _, component := range componentSyncFuncs { + synced, err := r.syncComponent(component, p) + if !synced { + return synced, err + } } } + log.Printf("Version sync completed for all components.") + return true, nil +} - // All component versions have been synced - p.Status.CurrentVersion = p.Status.TargetVersion - return nil +func (r *ReconcilePravegaCluster) syncComponent(component componentSyncVersionFun, p *pravegav1alpha1.PravegaCluster) (synced bool, err error) { + isSyncComplete, err := component.fun(p) + if err != nil { + return false, fmt.Errorf("failed to sync %s version. 
%s", component.name, err) + } + + if !isSyncComplete { + // component version sync is still in progress + // Do not continue with the next component until this one is done + return false, nil + } + log.Printf("%s version sync has been completed", component.name) + return true, nil } func (r *ReconcilePravegaCluster) syncControllerVersion(p *pravegav1alpha1.PravegaCluster) (synced bool, err error) { @@ -161,6 +253,8 @@ func (r *ReconcilePravegaCluster) syncControllerVersion(p *pravegav1alpha1.Prave } if deploy.Spec.Template.Spec.Containers[0].Image != targetImage { + p.Status.UpdateProgress(pravegav1alpha1.UpdatingControllerReason, "0") + // Need to update pod template // This will trigger the rolling upgrade process log.Printf("updating deployment (%s) pod template image to '%s'", deploy.Name, targetImage) @@ -177,10 +271,6 @@ func (r *ReconcilePravegaCluster) syncControllerVersion(p *pravegav1alpha1.Prave if err != nil { return false, err } - - // Set the upgrade condition reason to be UpgradingControllerReason, message to be 0 - p.Status.SetUpgradingConditionTrue(pravegav1alpha1.UpgradingControllerReason, "0") - // Updated pod template. Upgrade process has been triggered return false, nil } @@ -233,6 +323,7 @@ func (r *ReconcilePravegaCluster) syncSegmentStoreVersion(p *pravegav1alpha1.Pra } if sts.Spec.Template.Spec.Containers[0].Image != targetImage { + p.Status.UpdateProgress(pravegav1alpha1.UpdatingSegmentstoreReason, "0") // Need to update pod template // This will trigger the rolling upgrade process log.Printf("updating statefulset (%s) template image to '%s'", sts.Name, targetImage) @@ -250,15 +341,11 @@ func (r *ReconcilePravegaCluster) syncSegmentStoreVersion(p *pravegav1alpha1.Pra return false, err } - // Set the upgrade condition reason to be UpgradingSegmentstoreReason, message to be 0 - p.Status.SetUpgradingConditionTrue(pravegav1alpha1.UpgradingSegmentstoreReason, "0") - // Updated pod template. 
Upgrade process has been triggered return false, nil } // Pod template already updated - log.Printf("statefulset (%s) status: %d updated, %d ready, %d target", sts.Name, sts.Status.UpdatedReplicas, sts.Status.ReadyReplicas, sts.Status.Replicas) // Check whether the upgrade is in progress or has completed @@ -268,9 +355,8 @@ func (r *ReconcilePravegaCluster) syncSegmentStoreVersion(p *pravegav1alpha1.Pra return true, nil } // Upgrade still in progress - // Check if segmentstore fail to have progress within a timeout - err = checkUpgradeCondition(p, pravegav1alpha1.UpgradingSegmentstoreReason, sts.Status.UpdatedReplicas) + err = checkSyncTimeout(p, pravegav1alpha1.UpdatingSegmentstoreReason, sts.Status.UpdatedReplicas) if err != nil { return false, fmt.Errorf("updating statefulset (%s) failed due to %v", sts.Name, err) } @@ -322,6 +408,7 @@ func (r *ReconcilePravegaCluster) syncBookkeeperVersion(p *pravegav1alpha1.Prave } if sts.Spec.Template.Spec.Containers[0].Image != targetImage { + p.Status.UpdateProgress(pravegav1alpha1.UpdatingBookkeeperReason, "0") // Need to update pod template // This will trigger the rolling upgrade process log.Printf("updating statefulset (%s) template image to '%s'", sts.Name, targetImage) @@ -339,15 +426,11 @@ func (r *ReconcilePravegaCluster) syncBookkeeperVersion(p *pravegav1alpha1.Prave return false, err } - // Set the upgrade condition reason to be UpgradingBookkeeperReason, message to be 0 - p.Status.SetUpgradingConditionTrue(pravegav1alpha1.UpgradingBookkeeperReason, "0") - // Updated pod template return false, nil } // Pod template already updated - log.Printf("statefulset (%s) status: %d updated, %d ready, %d target", sts.Name, sts.Status.UpdatedReplicas, sts.Status.ReadyReplicas, sts.Status.Replicas) @@ -355,15 +438,13 @@ func (r *ReconcilePravegaCluster) syncBookkeeperVersion(p *pravegav1alpha1.Prave if sts.Status.UpdatedReplicas == sts.Status.Replicas && sts.Status.UpdatedReplicas == sts.Status.ReadyReplicas { // StatefulSet upgrade completed - // TODO: wait until there is no under replicated ledger - // https://bookkeeper.apache.org/docs/4.7.2/reference/cli/#listunderreplicated return true, nil } // Upgrade still in progress // Check if bookkeeper fail to have progress - err = checkUpgradeCondition(p, pravegav1alpha1.UpgradingBookkeeperReason, sts.Status.UpdatedReplicas) + err = checkSyncTimeout(p, pravegav1alpha1.UpdatingBookkeeperReason, sts.Status.UpdatedReplicas) if err != nil { return false, fmt.Errorf("updating statefulset (%s) failed due to %v", sts.Name, err) } @@ -389,14 +470,13 @@ func (r *ReconcilePravegaCluster) syncBookkeeperVersion(p *pravegav1alpha1.Prave return false, fmt.Errorf("could not obtain outdated pod") } - log.Infof("upgrading pod: %s", pod.Name) + log.Infof("updating pod: %s", pod.Name) err = r.client.Delete(context.TODO(), pod) if err != nil { return false, err } } - // wait until the next reconcile iteration return false, nil } @@ -484,8 +564,11 @@ func (r *ReconcilePravegaCluster) getPodsWithVersion(selector labels.Selector, n return pods, nil } -func checkUpgradeCondition(p *pravegav1alpha1.PravegaCluster, reason string, updatedReplicas int32) error { - _, lastCondition := p.Status.GetClusterCondition(pravegav1alpha1.ClusterConditionUpgrading) +func checkSyncTimeout(p *pravegav1alpha1.PravegaCluster, reason string, updatedReplicas int32) error { + lastCondition := p.Status.GetLastCondition() + if lastCondition == nil { + return nil + } if lastCondition.Reason == reason && lastCondition.Message == 
fmt.Sprint(updatedReplicas) { // if reason and message are the same as before, which means there is no progress since the last reconciling, // then check if it reaches the timeout. @@ -497,7 +580,6 @@ func checkUpgradeCondition(p *pravegav1alpha1.PravegaCluster, reason string, upd // it hasn't reached timeout return nil } - // progress has been made, update the status to the latest. This will also set the transition timestamp to now - p.Status.SetUpgradingConditionTrue(reason, fmt.Sprint(updatedReplicas)) + p.Status.UpdateProgress(reason, fmt.Sprint(updatedReplicas)) return nil } diff --git a/pkg/controller/pravegacluster/upgrade_test.go b/pkg/controller/pravegacluster/upgrade_test.go index 72e0da5af..c461d062d 100644 --- a/pkg/controller/pravegacluster/upgrade_test.go +++ b/pkg/controller/pravegacluster/upgrade_test.go @@ -36,7 +36,7 @@ func TestUpgrade(t *testing.T) { RunSpecs(t, "Pravega cluster") } -var _ = Describe("Pravega Cluster", func() { +var _ = Describe("Pravega Cluster Version Sync", func() { const ( Name = "example" Namespace = "default" @@ -47,7 +47,7 @@ var _ = Describe("Pravega Cluster", func() { r *ReconcilePravegaCluster ) - Context("Upgrade", func() { + var _ = Describe("Upgrade Test", func() { var ( req reconcile.Request p *v1alpha1.PravegaCluster @@ -70,7 +70,7 @@ var _ = Describe("Pravega Cluster", func() { s.AddKnownTypes(v1alpha1.SchemeGroupVersion, p) }) - Context("Pravega condition", func() { + Context("Cluster condition prior to Upgrade", func() { var ( client client.Client err error @@ -131,7 +131,7 @@ var _ = Describe("Pravega Cluster", func() { _, _ = r.Reconcile(req) }) - Context("Condition", func() { + Context("Upgrading Condition", func() { var ( foundPravega *v1alpha1.PravegaCluster ) @@ -170,7 +170,7 @@ var _ = Describe("Pravega Cluster", func() { It("should set upgrade condition reason to UpgradingBookkeeperReason and message to 0", func() { _, upgradeCondition := foundPravega.Status.GetClusterCondition(pravegav1alpha1.ClusterConditionUpgrading) - Ω(upgradeCondition.Reason).Should(Equal(pravegav1alpha1.UpgradingBookkeeperReason)) + Ω(upgradeCondition.Reason).Should(Equal(pravegav1alpha1.UpdatingBookkeeperReason)) Ω(upgradeCondition.Message).Should(Equal("0")) }) }) @@ -200,7 +200,7 @@ var _ = Describe("Pravega Cluster", func() { It("should set upgrade condition reason to UpgradingSegmentstoreReason and message to 0", func() { _, upgradeCondition := foundPravega.Status.GetClusterCondition(pravegav1alpha1.ClusterConditionUpgrading) - Ω(upgradeCondition.Reason).Should(Equal(pravegav1alpha1.UpgradingSegmentstoreReason)) + Ω(upgradeCondition.Reason).Should(Equal(pravegav1alpha1.UpdatingSegmentstoreReason)) Ω(upgradeCondition.Message).Should(Equal("0")) }) }) @@ -237,10 +237,205 @@ var _ = Describe("Pravega Cluster", func() { It("should set upgrade condition reason to UpgradingControllerReason and message to 0", func() { _, upgradeCondition := foundPravega.Status.GetClusterCondition(pravegav1alpha1.ClusterConditionUpgrading) - Ω(upgradeCondition.Reason).Should(Equal(pravegav1alpha1.UpgradingControllerReason)) + Ω(upgradeCondition.Reason).Should(Equal(pravegav1alpha1.UpdatingControllerReason)) Ω(upgradeCondition.Message).Should(Equal("0")) }) }) }) }) + + var _ = Describe("Rollback Test", func() { + var ( + req reconcile.Request + p *v1alpha1.PravegaCluster + ) + + BeforeEach(func() { + req = reconcile.Request{ + NamespacedName: types.NamespacedName{ + Name: Name, + Namespace: Namespace, + }, + } + p = &v1alpha1.PravegaCluster{ + ObjectMeta: metav1.ObjectMeta{ 
+ Name: Name, + Namespace: Namespace, + }, + } + p.Spec.Version = "0.5.0" + s.AddKnownTypes(v1alpha1.SchemeGroupVersion, p) + }) + + Context("Cluster Condition before Rollback", func() { + var ( + client client.Client + err error + ) + + BeforeEach(func() { + client = fake.NewFakeClient(p) + r = &ReconcilePravegaCluster{client: client, scheme: s} + _, err = r.Reconcile(req) + }) + + Context("First reconcile", func() { + It("shouldn't error", func() { + Ω(err).Should(BeNil()) + }) + }) + + Context("Initial status", func() { + var ( + foundPravega *v1alpha1.PravegaCluster + ) + BeforeEach(func() { + _, err = r.Reconcile(req) + foundPravega = &v1alpha1.PravegaCluster{} + _ = client.Get(context.TODO(), req.NamespacedName, foundPravega) + }) + + It("should have current version set to spec version", func() { + Ω(foundPravega.Status.CurrentVersion).Should(Equal(foundPravega.Spec.Version)) + }) + + It("should not have rollback condition set", func() { + _, rollbackCondition := foundPravega.Status.GetClusterCondition(v1alpha1.ClusterConditionRollback) + Ω(rollbackCondition).Should(BeNil()) + }) + + It("should have version history set", func() { + history := foundPravega.Status.VersionHistory + Ω(history[0]).Should(Equal("0.5.0")) + }) + + }) + }) + + Context("Rollback to previous version", func() { + var ( + client client.Client + ) + + BeforeEach(func() { + p.Spec = v1alpha1.ClusterSpec{ + Version: "0.6.0", + } + p.WithDefaults() + client = fake.NewFakeClient(p) + r = &ReconcilePravegaCluster{client: client, scheme: s} + _, _ = r.Reconcile(req) + foundPravega := &v1alpha1.PravegaCluster{} + _ = client.Get(context.TODO(), req.NamespacedName, foundPravega) + foundPravega.Spec.Version = "0.5.0" + foundPravega.Status.VersionHistory = []string{"0.5.0"} + // bypass the pods ready check in the upgrade logic + foundPravega.Status.SetPodsReadyConditionFalse() + foundPravega.Status.SetErrorConditionTrue("UpgradeFailed", "some error") + client.Update(context.TODO(), foundPravega) + _, _ = r.Reconcile(req) + }) + + Context("Rollback Triggered", func() { + var ( + foundPravega *v1alpha1.PravegaCluster + ) + BeforeEach(func() { + foundPravega = &v1alpha1.PravegaCluster{} + _ = client.Get(context.TODO(), req.NamespacedName, foundPravega) + }) + + It("should set Rollback condition status to be true", func() { + _, rollbackCondition := foundPravega.Status.GetClusterCondition(v1alpha1.ClusterConditionRollback) + Ω(rollbackCondition.Status).To(Equal(corev1.ConditionTrue)) + }) + + It("should set target version to previous version", func() { + Ω(foundPravega.Status.TargetVersion).To(Equal(foundPravega.Spec.Version)) + }) + }) + + Context("Rollback Controller", func() { + var ( + foundPravega *v1alpha1.PravegaCluster + ) + BeforeEach(func() { + _, _ = r.Reconcile(req) + foundPravega = &v1alpha1.PravegaCluster{} + _ = client.Get(context.TODO(), req.NamespacedName, foundPravega) + }) + + It("should set rollback condition reason to UpdatingController and message to 0", func() { + _, rollbackCondition := foundPravega.Status.GetClusterCondition(pravegav1alpha1.ClusterConditionRollback) + Ω(rollbackCondition.Reason).Should(Equal(pravegav1alpha1.UpdatingControllerReason)) + Ω(rollbackCondition.Message).Should(Equal("0")) + }) + }) + + Context("Rollback SegmentStore", func() { + var ( + foundPravega *v1alpha1.PravegaCluster + ) + BeforeEach(func() { + _, _ = r.Reconcile(req) + _, _ = r.Reconcile(req) + foundPravega = &v1alpha1.PravegaCluster{} + _ = client.Get(context.TODO(), req.NamespacedName, foundPravega) + }) + + 
It("should set rollback condition reason to UpdatingSegmentStore and message to 0", func() { + _, rollbackCondition := foundPravega.Status.GetClusterCondition(pravegav1alpha1.ClusterConditionRollback) + Ω(rollbackCondition.Reason).Should(Equal(pravegav1alpha1.UpdatingSegmentstoreReason)) + Ω(rollbackCondition.Message).Should(Equal("0")) + }) + }) + + Context("Rollback Bookkeeper", func() { + var ( + foundPravega *v1alpha1.PravegaCluster + ) + BeforeEach(func() { + _, _ = r.Reconcile(req) + _, _ = r.Reconcile(req) + _, _ = r.Reconcile(req) + foundPravega = &v1alpha1.PravegaCluster{} + _ = client.Get(context.TODO(), req.NamespacedName, foundPravega) + }) + + It("should set rollback condition reason to UpdatingBookkeeper and message to 0", func() { + _, rollbackCondition := foundPravega.Status.GetClusterCondition(pravegav1alpha1.ClusterConditionRollback) + Ω(rollbackCondition.Reason).Should(Equal(pravegav1alpha1.UpdatingBookkeeperReason)) + Ω(rollbackCondition.Message).Should(Equal("0")) + }) + }) + Context("Rollback Completed", func() { + var ( + foundPravega *v1alpha1.PravegaCluster + ) + BeforeEach(func() { + _, _ = r.Reconcile(req) + _, _ = r.Reconcile(req) + _, _ = r.Reconcile(req) + _, _ = r.Reconcile(req) + foundPravega = &v1alpha1.PravegaCluster{} + _ = client.Get(context.TODO(), req.NamespacedName, foundPravega) + }) + + It("should set currentversion equal to target version", func() { + Ω(foundPravega.Status.CurrentVersion).Should(Equal("0.5.0")) + }) + It("should set TargetVersoin to empty", func() { + Ω(foundPravega.Status.TargetVersion).Should(Equal("")) + }) + It("should set rollback condition to false", func() { + _, rollbackCondition := foundPravega.Status.GetClusterCondition(pravegav1alpha1.ClusterConditionRollback) + Ω(rollbackCondition.Status).To(Equal(corev1.ConditionFalse)) + }) + It("should set error condition to false", func() { + _, errorCondition := foundPravega.Status.GetClusterCondition(pravegav1alpha1.ClusterConditionError) + Ω(errorCondition.Status).To(Equal(corev1.ConditionFalse)) + }) + }) + }) + }) })