From 9b5f5016d57c48456f03411c198fd97954af4e47 Mon Sep 17 00:00:00 2001 From: Langleu Date: Fri, 1 Mar 2024 14:28:45 +0100 Subject: [PATCH 01/13] docs(dual-region): add operational procedure --- .../multi-region/dual-region-ops.md | 295 ++++++++++++++++++ .../multi-region/img/10.svg | 1 + .../multi-region/img/11.svg | 1 + .../multi-region/img/12.svg | 1 + .../multi-region/img/13.svg | 1 + .../multi-region/img/14.svg | 1 + .../multi-region/img/15.svg | 1 + .../operational-guides/multi-region/img/3.svg | 1 + .../operational-guides/multi-region/img/4.svg | 1 + .../operational-guides/multi-region/img/5.svg | 1 + .../operational-guides/multi-region/img/6.svg | 1 + .../operational-guides/multi-region/img/7.svg | 1 + .../operational-guides/multi-region/img/9.svg | 1 + .../multi-region/swipItem.css | 29 ++ .../multi-region/swipItem.jsx | 38 +++ optimize_sidebars.js | 9 + sidebars.js | 5 + 17 files changed, 388 insertions(+) create mode 100644 docs/self-managed/operational-guides/multi-region/dual-region-ops.md create mode 100644 docs/self-managed/operational-guides/multi-region/img/10.svg create mode 100644 docs/self-managed/operational-guides/multi-region/img/11.svg create mode 100644 docs/self-managed/operational-guides/multi-region/img/12.svg create mode 100644 docs/self-managed/operational-guides/multi-region/img/13.svg create mode 100644 docs/self-managed/operational-guides/multi-region/img/14.svg create mode 100644 docs/self-managed/operational-guides/multi-region/img/15.svg create mode 100644 docs/self-managed/operational-guides/multi-region/img/3.svg create mode 100644 docs/self-managed/operational-guides/multi-region/img/4.svg create mode 100644 docs/self-managed/operational-guides/multi-region/img/5.svg create mode 100644 docs/self-managed/operational-guides/multi-region/img/6.svg create mode 100644 docs/self-managed/operational-guides/multi-region/img/7.svg create mode 100644 docs/self-managed/operational-guides/multi-region/img/9.svg create mode 100644 docs/self-managed/operational-guides/multi-region/swipItem.css create mode 100644 docs/self-managed/operational-guides/multi-region/swipItem.jsx diff --git a/docs/self-managed/operational-guides/multi-region/dual-region-ops.md b/docs/self-managed/operational-guides/multi-region/dual-region-ops.md new file mode 100644 index 00000000000..886daca24b5 --- /dev/null +++ b/docs/self-managed/operational-guides/multi-region/dual-region-ops.md @@ -0,0 +1,295 @@ +--- +id: dual-region-operational-procedure +title: "Dual-Region Operational Procedure" +sidebar_label: "Dual-Region Operational Procedure" +description: "The operational procedure concerning dual-region setups to recover from a region loss." +--- + +import Swip from './swip.jsx'; + +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; + +import SwipItem from './swipItem'; + + + +import Three from './img/3.svg'; +import Four from './img/4.svg'; +import Five from './img/5.svg'; +import Six from './img/6.svg'; +import Seven from './img/7.svg'; + + + +import Nine from './img/9.svg'; +import Ten from './img/10.svg'; +import Eleven from './img/11.svg'; +import Twelve from './img/12.svg'; +import Thirteen from './img/13.svg'; +import Fourteen from './img/14.svg'; +import Fifteen from './img/15.svg'; + +## Introduction + +The operational procedure is a step-by-step guide on how to proceed in the case of a total region failure. Allowing you to temporarily restore functionality and ultimately do a full recovery to restore the dual-region. 
+ +## Disclaimer + +:::danger + +- Customers must develop and test operational procedures in non-production environments based on the framework steps outlined by Camunda. +- Before advancing to production go-live, it is essential for customers to validate these procedures with Camunda. + +::: + +## Procedure + +We don't differ between primary and secondary regions as the procedure is the same for either loss. We will focus on losing the secondary region (passive) while still having the primary region (active). + +You'll have to take care of DNS considerations by rerouting traffic to the functioning region, which are disregarded in the following. + +After identifying or considering a region as lost, you should ensure that it doesn't reconnect, as this will hinder a successful recovery during failover and failback execution. + +We will first look at the failover procedure, which is responsible for temporarily recovering the operations to unblock those. Zeebe cannot export and process any new data as long as it can't export those to ElasticSearch. + +Afterwards, the failback procedure is responsible for recovering a region. + +### Failover + + + + } + desired={} + /> +
+ +#### Current + +The current state is that one of the regions is lost. This will result in Zeebe not being able to advance any new processes anymore since it +can't export data anymore as one of the ElasticSearch instances is unreachable. Neither would it export to the local region since exporters are +invoked sequentially. + +#### Desired + +For the failover procedure, we need to ensure that the lost region does not accidentally reconnect. You should be sure that the it really is lost and if so look into measures that it doesn't reconnect. + +#### How to get there + +Potential approaches are the following: + +- [Kubernetes Network Policies](https://kubernetes.io/docs/concepts/services-networking/network-policies/) +- Firewall rules to block the traffic from the lost region + +
+
+ + } + desired={} + /> +
+ +#### Current + +You have made sure by previous measures, e.g. firewall rules that the lost region does not reconnect during the failover procedure. + +Due to the partitioning of Zeebe, no data has been lost so far. + +#### Desired + +You are creating a temporary Camunda Platform deployment within the same region, but different namespace, to recover functionality. + +The newly deployed Zeebe brokers will be running in failover mode to restore the quorum and allow processing again. Additionally, they will be pointed at the existing ElasticSearch instance and the newly deployed ElasticSearch instance. + +#### How to get there + +
+ +
+ + } + desired={} + /> +
+ +#### Current + +Zeebe won't be able to continue processing yet since the existing Zeebe brokers are still pointing at the ElasticSearch of the lost region. + +Simply disabling the exporter would not be enough since the sequence numbers are not persistent when an exporter removed and those are required by the WebApps importers. + +#### Desired + +You are reconfiguring the existing Camunda Platform setup to point Zeebe to the temporary ElasticSearch instance. This will result in Zeebe being operational again. + +#### How to get there + +```bash reference title="Example" +https://github.com/camunda/zeebe/blob/main/NOTICE.txt +``` + +
+
+
+ +### Fallback + + + + } + desired={} + /> +
+ +#### Current + +You have temporary Zeebe brokers deployed in failover mode together with a temporary ElasticSearch within the same surviving region. + +#### Desired + +You want to restore the dual-region functionality again and deploy Zeebe in fallback mode to the newly restored region. + +Fallback mode means that two brokers will be added to the cluster to allow processing and restore data. While two brokers are sleeping since you still have the temporary setup that you have to transfer. + +An ElasticSearch will also be deployed but not used yet since you have to restore a backup from the temporary setup. + +#### How to get there + +
+
+ + } + desired={} + /> +
+ +#### Current + +You currently have the following setups: + +- Healthy Camunda Platform +- Camunda Platform in failover mode within the same region as the healthy setup +- Camunda Platform in fallback mode within a newly created region + +#### Desired + +You are preparing everything for the newly created region to take over again to restore the benefits of a dual-region setup. + +For this, you need to stop the Zeebe exporters to not export any new data to ElasticSearch, so you can create a backup. + +Additionally, you need to scale down the WebApps. This will result in users not being able to interact with the Camunda Platform anymore and is required to guarantee no new data is imported to ElasticSearch. + +#### How to get there + +
+
+ + } + desired={} + /> +
+ +#### Current + +The Camunda Platform is currently not reachable by end-users and does not process any new processes to allow creating a backup of ElasticSearch without losing any new data. + +#### Desired + +You are creating a backup within the temporary ElasticSearch instance and restore it in the new region. + +#### How to get there + +
+ +
+ + } + desired={} + /> +
+ +#### Current + +The backup of ElasticSearch has been created and restored to the new region. + +The Camunda Platform remains unreachable by end-users as you proceed to restore functionality. + +#### Desired + +You are pointing all Camunda Platforms from the temporary ElasticSearch to the ElasticSearch in the new region. + +The exporters will remain paused but ultimately data will be exported to both regions again. + +#### How to get there + +
+
+ + } + desired={} + /> +
+ +#### Current + +The Camunda Platforms are pointing at the ElasticSearch instances in both regions again and not the temporary instance. It still remains unreachable to the end-users and no processes are advanced. + +#### Desired + +You are reactivating the exporters and enabling the WebApps again within the two regions. This will allow users to interact with the Camunda Platform again. + +#### How to get there + +
+
+ + } + desired={} + /> +
+ +#### Current + +The Camunda Platform is healthy and running in two regions again. + +#### Desired + +You can remove the temporary failover solution since it is not required anymore. + +#### How to get there + +
+
+ + } + desired={} + /> +
+ +#### Current + +Only the two Camunda Platform regions remain, without any temporary solution. + +The fallback mode in the new region is still active. + +#### Desired + +You restore the new region to its normal functionality by removing the fallback mode and forcefully removing the sleeping Zeebe pods. + +They would otherwise hinder the rollout since they will never be ready. + +#### How to get there + +
+
+
diff --git a/docs/self-managed/operational-guides/multi-region/img/10.svg b/docs/self-managed/operational-guides/multi-region/img/10.svg new file mode 100644 index 00000000000..5e9862924af --- /dev/null +++ b/docs/self-managed/operational-guides/multi-region/img/10.svg @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/docs/self-managed/operational-guides/multi-region/img/11.svg b/docs/self-managed/operational-guides/multi-region/img/11.svg new file mode 100644 index 00000000000..bb2fa807cfe --- /dev/null +++ b/docs/self-managed/operational-guides/multi-region/img/11.svg @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/docs/self-managed/operational-guides/multi-region/img/12.svg b/docs/self-managed/operational-guides/multi-region/img/12.svg new file mode 100644 index 00000000000..d9434c66c68 --- /dev/null +++ b/docs/self-managed/operational-guides/multi-region/img/12.svg @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/docs/self-managed/operational-guides/multi-region/img/13.svg b/docs/self-managed/operational-guides/multi-region/img/13.svg new file mode 100644 index 00000000000..95ba38fa228 --- /dev/null +++ b/docs/self-managed/operational-guides/multi-region/img/13.svg @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/docs/self-managed/operational-guides/multi-region/img/14.svg b/docs/self-managed/operational-guides/multi-region/img/14.svg new file mode 100644 index 00000000000..fdd750f2330 --- /dev/null +++ b/docs/self-managed/operational-guides/multi-region/img/14.svg @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/docs/self-managed/operational-guides/multi-region/img/15.svg b/docs/self-managed/operational-guides/multi-region/img/15.svg new file mode 100644 index 00000000000..67178e059cf --- /dev/null +++ b/docs/self-managed/operational-guides/multi-region/img/15.svg @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/docs/self-managed/operational-guides/multi-region/img/3.svg b/docs/self-managed/operational-guides/multi-region/img/3.svg new file mode 100644 index 00000000000..5f26611946a --- /dev/null +++ b/docs/self-managed/operational-guides/multi-region/img/3.svg @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/docs/self-managed/operational-guides/multi-region/img/4.svg b/docs/self-managed/operational-guides/multi-region/img/4.svg new file mode 100644 index 00000000000..f9c21249a45 --- /dev/null +++ b/docs/self-managed/operational-guides/multi-region/img/4.svg @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/docs/self-managed/operational-guides/multi-region/img/5.svg b/docs/self-managed/operational-guides/multi-region/img/5.svg new file mode 100644 index 00000000000..43aef34dd97 --- /dev/null +++ b/docs/self-managed/operational-guides/multi-region/img/5.svg @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/docs/self-managed/operational-guides/multi-region/img/6.svg b/docs/self-managed/operational-guides/multi-region/img/6.svg new file mode 100644 index 00000000000..7a2a721a387 --- /dev/null +++ b/docs/self-managed/operational-guides/multi-region/img/6.svg @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/docs/self-managed/operational-guides/multi-region/img/7.svg b/docs/self-managed/operational-guides/multi-region/img/7.svg new file mode 100644 index 00000000000..4000dbf1059 --- /dev/null +++ b/docs/self-managed/operational-guides/multi-region/img/7.svg @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/docs/self-managed/operational-guides/multi-region/img/9.svg 
b/docs/self-managed/operational-guides/multi-region/img/9.svg new file mode 100644 index 00000000000..c2d013fdeae --- /dev/null +++ b/docs/self-managed/operational-guides/multi-region/img/9.svg @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/docs/self-managed/operational-guides/multi-region/swipItem.css b/docs/self-managed/operational-guides/multi-region/swipItem.css new file mode 100644 index 00000000000..41776e5aa29 --- /dev/null +++ b/docs/self-managed/operational-guides/multi-region/swipItem.css @@ -0,0 +1,29 @@ +.wrapper { + display: flex; + flex-flow: row wrap; + text-align: center; +} + +.wrapper > * { + flex: 1 100%; +} + +@media all and (min-width: 600px) { + .aside { + flex: 1 0 0; + } +} + +@media all and (min-width: 800px) { + .left-1 { + order: 1; + } + + .right-2 { + order: 2; + } + + .below { + order: 3; + } +} diff --git a/docs/self-managed/operational-guides/multi-region/swipItem.jsx b/docs/self-managed/operational-guides/multi-region/swipItem.jsx new file mode 100644 index 00000000000..c4b6e4b94af --- /dev/null +++ b/docs/self-managed/operational-guides/multi-region/swipItem.jsx @@ -0,0 +1,38 @@ +import React from 'react'; + +import './swipItem.css'; + +export default function SwipItem({ current, desired, description, code}) { + return ( +
+
+
+

Current State

+
+
+ {current} +
+
+
+
+

Desired State

+
+
+ {desired} +
+
+
+
+

Description / Code

+
+
+ {description} + {code} +
+
+
+

+
+
+ ); +}; diff --git a/optimize_sidebars.js b/optimize_sidebars.js index d48d327361d..1f92d15173a 100644 --- a/optimize_sidebars.js +++ b/optimize_sidebars.js @@ -1928,6 +1928,15 @@ module.exports = { ], }, + { + "Multi-Region": [ + docsLink( + "Dual-Region Operational Procedure", + "self-managed/operational-guides/multi-region/dual-region-operational-procedure/" + ), + ], + }, + { Troubleshooting: [ docsLink( diff --git a/sidebars.js b/sidebars.js index 87460a649d7..16dc7709d0d 100644 --- a/sidebars.js +++ b/sidebars.js @@ -899,6 +899,11 @@ module.exports = { "self-managed/operational-guides/backup-restore/modeler-backup-and-restore", ], }, + { + "Multi-Region": [ + "self-managed/operational-guides/multi-region/dual-region-operational-procedure", + ], + }, { Troubleshooting: [ "self-managed/operational-guides/troubleshooting/troubleshooting", From c96f3ada7a2753c4a3ad07bb3e698ee9abdd723d Mon Sep 17 00:00:00 2001 From: Langleu Date: Wed, 27 Mar 2024 16:11:46 +0100 Subject: [PATCH 02/13] docs(dual-region): first rework of the operational procedure --- .../stateContainer.css} | 0 .../components/stateContainer.jsx | 38 +++ .../multi-region/dual-region-ops.md | 234 ++++++++++++------ .../multi-region/img/10.svg | 2 +- .../multi-region/img/11.svg | 2 +- .../multi-region/img/12.svg | 2 +- .../multi-region/img/13.svg | 2 +- .../multi-region/img/14.svg | 2 +- .../multi-region/img/15.svg | 2 +- .../operational-guides/multi-region/img/3.svg | 2 +- .../operational-guides/multi-region/img/4.svg | 2 +- .../operational-guides/multi-region/img/5.svg | 2 +- .../operational-guides/multi-region/img/6.svg | 2 +- .../operational-guides/multi-region/img/7.svg | 2 +- .../operational-guides/multi-region/img/9.svg | 2 +- .../multi-region/swipItem.jsx | 38 --- 16 files changed, 209 insertions(+), 125 deletions(-) rename docs/self-managed/operational-guides/multi-region/{swipItem.css => components/stateContainer.css} (100%) create mode 100644 docs/self-managed/operational-guides/multi-region/components/stateContainer.jsx delete mode 100644 docs/self-managed/operational-guides/multi-region/swipItem.jsx diff --git a/docs/self-managed/operational-guides/multi-region/swipItem.css b/docs/self-managed/operational-guides/multi-region/components/stateContainer.css similarity index 100% rename from docs/self-managed/operational-guides/multi-region/swipItem.css rename to docs/self-managed/operational-guides/multi-region/components/stateContainer.css diff --git a/docs/self-managed/operational-guides/multi-region/components/stateContainer.jsx b/docs/self-managed/operational-guides/multi-region/components/stateContainer.jsx new file mode 100644 index 00000000000..70b298caada --- /dev/null +++ b/docs/self-managed/operational-guides/multi-region/components/stateContainer.jsx @@ -0,0 +1,38 @@ +import React from "react"; + +import "./stateContainer.css"; + +export default function StateContainer({ current, desired, description, code }) { +return ( +
+
+
+
Current State
+
+
+ {current} +
+
+
+
+
Desired State
+
+
+ {desired} +
+
+
+
+

Description / Code

+
+
+ {description} + {code} +
+
+
+

+
+
+); +} diff --git a/docs/self-managed/operational-guides/multi-region/dual-region-ops.md b/docs/self-managed/operational-guides/multi-region/dual-region-ops.md index 886daca24b5..f6a1cb149de 100644 --- a/docs/self-managed/operational-guides/multi-region/dual-region-ops.md +++ b/docs/self-managed/operational-guides/multi-region/dual-region-ops.md @@ -5,12 +5,10 @@ sidebar_label: "Dual-Region Operational Procedure" description: "The operational procedure concerning dual-region setups to recover from a region loss." --- -import Swip from './swip.jsx'; - import Tabs from '@theme/Tabs'; import TabItem from '@theme/TabItem'; -import SwipItem from './swipItem'; +import StateContainer from './components/stateContainer.jsx'; @@ -20,7 +18,7 @@ import Five from './img/5.svg'; import Six from './img/6.svg'; import Seven from './img/7.svg'; - + import Nine from './img/9.svg'; import Ten from './img/10.svg'; @@ -32,48 +30,56 @@ import Fifteen from './img/15.svg'; ## Introduction -The operational procedure is a step-by-step guide on how to proceed in the case of a total region failure. Allowing you to temporarily restore functionality and ultimately do a full recovery to restore the dual-region. +The operational procedure is a step-by-step guide on how to proceed in the case of a total region failure. Allowing you to temporarily restore functionality and ultimately do a full recovery to restore the dual-region setup. The operational procedure builds on top of the [dual-region AWS setup guide](./../../platform-deployment/helm-kubernetes/platforms/amazon-eks/dual-region.md) but is generally applicable for any dual-region setup. ## Disclaimer :::danger -- Customers must develop and test operational procedures in non-production environments based on the framework steps outlined by Camunda. -- Before advancing to production go-live, it is essential for customers to validate these procedures with Camunda. +- Customers must develop and test the below-described operational procedure in non-production environments based on the framework steps outlined by Camunda, **before applying them in production setups**. +- Before advancing to production go-live, customers need to validate these procedures with Camunda. +- Customers are solely responsible for detecting any regional failures and implementing the necessary described operational procedure. ::: ## Procedure -We don't differ between primary and secondary regions as the procedure is the same for either loss. We will focus on losing the secondary region (passive) while still having the primary region (active). +We don't differ between active and passive regions as the procedure is the same for either loss. We will focus on losing the passive region while still having the active region. You'll have to take care of DNS considerations by rerouting traffic to the functioning region, which are disregarded in the following. -After identifying or considering a region as lost, you should ensure that it doesn't reconnect, as this will hinder a successful recovery during failover and failback execution. +After identifying or considering a region as lost, you should ensure that it doesn't reconnect, as this will hinder a successful recovery during failover and failback execution. In case this is just temporary, Zeebe can survive a region loss but will stop processing due the loss in quorum and ultimately fill up the persistent disk before running out of volume resulting in the loss of data. 
+ +The **failover** procedure aims to temporarily restore operations by redeploying Camunda 8 within the same region to resume workflow engine functionality. During this period, Zeebe is unable to export or process new data until it achieves quorum and the configured Elasticsearch endpoints for the exporters become accessible, which is the outcome of the failover procedure. -We will first look at the failover procedure, which is responsible for temporarily recovering the operations to unblock those. Zeebe cannot export and process any new data as long as it can't export those to ElasticSearch. +The **failback** procedure involves completely restoring the failed region, thereby restoring your dual-region setup to its full functionality. -Afterwards, the failback procedure is responsible for recovering a region. +The following procedures are building on top of the work done in the [AWS setup guide](./../../platform-deployment/helm-kubernetes/platforms/amazon-eks/dual-region.md#deploy-camunda-8-to-the-clusters) about deploying Camunda 8 to a dual-region cluster. We assume you have your own copy of the [c8-multi-region](https://github.com/camunda/c8-multi-region) repository and previously done changes in the `camunda-values.yml`. + +Please ensure to have followed the points [environment prerequisites](./../../platform-deployment/helm-kubernetes/platforms/amazon-eks/dual-region.md#environment-prerequisites) and [deploy Camunda 8 to the clusters](./../../platform-deployment/helm-kubernetes/platforms/amazon-eks/dual-region.md#deploy-camunda-8-to-the-clusters) to have the required base to build upon. ### Failover - + - } - desired={} - /> + +#### Ensure Network Disconnection + +} +desired={} +/> +
#### Current -The current state is that one of the regions is lost. This will result in Zeebe not being able to advance any new processes anymore since it -can't export data anymore as one of the ElasticSearch instances is unreachable. Neither would it export to the local region since exporters are +The current state is that one of the regions is lost. This will result in Zeebe being unable to process anything new due to the loss in quorum, nor can it export data to Elasticsearch since one of the instances is unreachable. Neither would it export to the local region since exporters are invoked sequentially. #### Desired -For the failover procedure, we need to ensure that the lost region does not accidentally reconnect. You should be sure that the it really is lost and if so look into measures that it doesn't reconnect. +For the failover procedure, we need to ensure that the lost region does not accidentally reconnect. You should be sure it is lost, and if so, look into measures to prevent it from reconnecting by for example utilizing the suggested solution below to isolate your active environment. #### How to get there @@ -85,15 +91,19 @@ Potential approaches are the following:
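As an illustration of the first approach, the following is a minimal sketch of a Kubernetes NetworkPolicy for the Camunda namespace of the surviving region. It only admits ingress from pods in the same namespace and from the temporary failover namespace used in the next step, and it assumes your cluster's CNI enforces NetworkPolicies; you will likely need additional rules (for example for ingress controllers or monitoring) before it fits your environment.

```bash
# Sketch only: restrict ingress to the surviving Camunda namespace so that pods
# from the lost region can no longer reconnect. Adapt before applying.
kubectl --context $CLUSTER_0 --namespace $CAMUNDA_NAMESPACE_0 apply -f - <<EOF
apiVersion: networking.k8s.io/v1
kind: NetworkPolicy
metadata:
  name: isolate-surviving-region
spec:
  podSelector: {}
  policyTypes:
    - Ingress
  ingress:
    - from:
        # Pods within the same namespace
        - podSelector: {}
        # Pods from the temporary failover namespace created in the next step
        - namespaceSelector:
            matchLabels:
              kubernetes.io/metadata.name: ${CAMUNDA_NAMESPACE_0_FAILOVER}
EOF
```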
- } - desired={} - /> + +#### Deploy Temporary Camunda 8 Installation in Failover Mode in Existing Region + +} +desired={} +/> +
#### Current -You have made sure by previous measures, e.g. firewall rules that the lost region does not reconnect during the failover procedure. +You have made sure by previous measures, for example, firewall rules that the lost region does not reconnect during the failover procedure. Due to the partitioning of Zeebe, no data has been lost so far. @@ -101,29 +111,59 @@ Due to the partitioning of Zeebe, no data has been lost so far. You are creating a temporary Camunda Platform deployment within the same region, but different namespace, to recover functionality. -The newly deployed Zeebe brokers will be running in failover mode to restore the quorum and allow processing again. Additionally, they will be pointed at the existing ElasticSearch instance and the newly deployed ElasticSearch instance. +The newly deployed Zeebe brokers will be running in failover mode to restore the quorum and allow processing again. Additionally, they will be pointed at the existing Elasticsearch instance and the newly deployed Elasticsearch instance. #### How to get there +In the previously cloned repository [c8-multi-region](https://github.com/camunda/c8-multi-region) navigate to the folder [aws/dual-region/kubernetes/region0](https://github.com/camunda/c8-multi-region/blob/main/aws/dual-region/kubernetes/region0/) it contains the example Helm values yaml `camunda-values-failover.yml` containing the required overlay for the **failover** mode. + +In the case your **Region 0** was lost, please consider the folder [aws/dual-region/kubernetes/region1](https://github.com/camunda/c8-multi-region/blob/main/aws/dual-region/kubernetes/region1/). We will refrain from mentioning both possibilities always but as you can see it's simply the other way around in case of the loss of the **Region 0**. + +The chosen `camunda-values-failover.yml` requires adjustments before installing the Helm chart. + +- `ZEEBE_BROKER_CLUSTER_INITIALCONTACTPOINTS` +- `ZEEBE_BROKER_EXPORTERS_ELASTICSEARCHREGION0_ARGS_URL` +- `ZEEBE_BROKER_EXPORTERS_ELASTICSEARCHREGION1_ARGS_URL` + +1. The bash script [generate_zeebe_helm_values.sh](https://github.com/camunda/c8-multi-region/blob/main/aws/dual-region/scripts/generate_zeebe_helm_values.sh) in the repository folder `aws/dual-region/scripts/` helps generate those values. You only have to copy and replace them within the previously mentioned yaml. It will use the exported environment variables of the environment prerequisites for namespaces and regions. Additionally, you have to pass in whether your region 0 or 1 was lost. + +```bash +./generate_zeebe_helm_values.sh failover + +# It will ask you to provide the following values +# Enter the region that was lost, values can either be 0 or 1: +## In our case we lost region 1, therefore input 1 +# Enter Helm release name used for installing Camunda 8 in both Kubernetes clusters: +## the way you'll call the Helm release, for example camunda +# Enter Zeebe cluster size (total number of Zeebe brokers in both Kubernetes clusters): +## for a dual-region setup we recommend 8. Resulting in 4 brokers per region. +``` + +#### Verification +
- } - desired={} - /> + +#### Adjust Elasticsearch Exporters Endpoints to Temporary Deployment + +} +desired={} +/> +
#### Current -Zeebe won't be able to continue processing yet since the existing Zeebe brokers are still pointing at the ElasticSearch of the lost region. +Zeebe won't be able to continue processing yet since the existing Zeebe brokers are still pointing at the Elasticsearch of the lost region. Simply disabling the exporter would not be enough since the sequence numbers are not persistent when an exporter removed and those are required by the WebApps importers. #### Desired -You are reconfiguring the existing Camunda Platform setup to point Zeebe to the temporary ElasticSearch instance. This will result in Zeebe being operational again. +You are reconfiguring the existing Camunda Platform setup to point Zeebe to the temporary Elasticsearch instance. This will result in Zeebe being operational again. #### How to get there @@ -131,41 +171,53 @@ You are reconfiguring the existing Camunda Platform setup to point Zeebe to the https://github.com/camunda/zeebe/blob/main/NOTICE.txt ``` +#### Verification +
-### Fallback +### Failback - + - } - desired={} - /> + +#### Deploy Camunda 8 in Failback Mode in Newly Created Region + +} +desired={} +/> +
#### Current -You have temporary Zeebe brokers deployed in failover mode together with a temporary ElasticSearch within the same surviving region. +You have temporary Zeebe brokers deployed in failover mode together with a temporary Elasticsearch within the same surviving region. #### Desired -You want to restore the dual-region functionality again and deploy Zeebe in fallback mode to the newly restored region. +You want to restore the dual-region functionality again and deploy Zeebe in failback mode to the newly restored region. -Fallback mode means that two brokers will be added to the cluster to allow processing and restore data. While two brokers are sleeping since you still have the temporary setup that you have to transfer. +Failback mode means that two brokers will be added to the cluster to allow processing and restore data. While two brokers are sleeping since you still have the temporary setup that you have to transfer. -An ElasticSearch will also be deployed but not used yet since you have to restore a backup from the temporary setup. +An Elasticsearch will also be deployed but not used yet since you have to restore a backup from the temporary setup. #### How to get there +#### Verification +
- } - desired={} - /> + +#### Pause Elasticsearch Exporters and WebApps + +} +desired={} +/> +
#### Current @@ -174,73 +226,91 @@ You currently have the following setups: - Healthy Camunda Platform - Camunda Platform in failover mode within the same region as the healthy setup -- Camunda Platform in fallback mode within a newly created region +- Camunda Platform in failback mode within a newly created region #### Desired You are preparing everything for the newly created region to take over again to restore the benefits of a dual-region setup. -For this, you need to stop the Zeebe exporters to not export any new data to ElasticSearch, so you can create a backup. +For this, you need to stop the Zeebe exporters to not export any new data to Elasticsearch, so you can create a backup. -Additionally, you need to scale down the WebApps. This will result in users not being able to interact with the Camunda Platform anymore and is required to guarantee no new data is imported to ElasticSearch. +Additionally, you need to scale down the WebApps. This will result in users not being able to interact with the Camunda Platform anymore and is required to guarantee no new data is imported to Elasticsearch. #### How to get there +#### Verification +
- } - desired={} - /> + +#### Create and Restore Elasticsearch Backup + +} +desired={} +/> +
#### Current -The Camunda Platform is currently not reachable by end-users and does not process any new processes to allow creating a backup of ElasticSearch without losing any new data. +The Camunda Platform is currently not reachable by end-users and does not process any new processes to allow creating a backup of Elasticsearch without losing any new data. #### Desired -You are creating a backup within the temporary ElasticSearch instance and restore it in the new region. +You are creating a backup within the temporary Elasticsearch instance and restore it in the new region. #### How to get there +#### Verification +
- } - desired={} - /> + +#### Adjust Elasticsearch Exporters Endpoints to Newly Created Region + +} +desired={} +/> +
#### Current -The backup of ElasticSearch has been created and restored to the new region. +The backup of Elasticsearch has been created and restored to the new region. The Camunda Platform remains unreachable by end-users as you proceed to restore functionality. #### Desired -You are pointing all Camunda Platforms from the temporary ElasticSearch to the ElasticSearch in the new region. +You are pointing all Camunda Platforms from the temporary Elasticsearch to the Elasticsearch in the new region. The exporters will remain paused but ultimately data will be exported to both regions again. #### How to get there +#### Verification +
- } - desired={} - /> + +#### Reactivate Exporters and WebApps + +} +desired={} +/> +
#### Current -The Camunda Platforms are pointing at the ElasticSearch instances in both regions again and not the temporary instance. It still remains unreachable to the end-users and no processes are advanced. +The Camunda Platforms are pointing at the Elasticsearch instances in both regions again and not the temporary instance. It still remains unreachable to the end-users and no processes are advanced. #### Desired @@ -248,13 +318,19 @@ You are reactivating the exporters and enabling the WebApps again within the two #### How to get there +#### Verification +
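One possible sketch for the reactivation itself (the "How to get there" part of this step), assuming the management API exposes a `resume` counterpart to the `pause` endpoint used earlier and that Operate and Tasklist in the surviving region only need to be scaled back up:

```bash
# Resume the Elasticsearch exporters (counterpart to the earlier pause call)
ZEEBE_GATEWAY_SERVICE=$(kubectl --context $CLUSTER_0 get service --selector=app\.kubernetes\.io/component=zeebe-gateway -o jsonpath='{.items[0].metadata.name}' -n $CAMUNDA_NAMESPACE_0)
kubectl --context $CLUSTER_0 port-forward services/$ZEEBE_GATEWAY_SERVICE 9600:9600 -n $CAMUNDA_NAMESPACE_0
curl -i localhost:9600/actuator/exporting/resume -XPOST

# Scale Operate and Tasklist back up in the surviving region
OPERATE_DEPLOYMENT=$(kubectl --context $CLUSTER_0 get deployment --selector=app\.kubernetes\.io/component=operate -o jsonpath='{.items[0].metadata.name}' -n $CAMUNDA_NAMESPACE_0)
TASKLIST_DEPLOYMENT=$(kubectl --context $CLUSTER_0 get deployment --selector=app\.kubernetes\.io/component=tasklist -o jsonpath='{.items[0].metadata.name}' -n $CAMUNDA_NAMESPACE_0)
kubectl --context $CLUSTER_0 scale deployments/$OPERATE_DEPLOYMENT --replicas 1 -n $CAMUNDA_NAMESPACE_0
kubectl --context $CLUSTER_0 scale deployments/$TASKLIST_DEPLOYMENT --replicas 1 -n $CAMUNDA_NAMESPACE_0
```

In the new region, Operate and Tasklist were installed with `operate.enabled=false` and `tasklist.enabled=false`, so they would be enabled there by repeating the Helm upgrade for that region without those overrides.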
- } - desired={} - /> + +#### Remove Temporary Failover Installation + +} +desired={} +/> +
#### Current @@ -267,29 +343,37 @@ You can remove the temporary failover solution since it is not required anymore. #### How to get there +#### Verification +
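A sketch of the removal itself (the "How to get there" part of this step), assuming the temporary installation was created as the Helm release `camunda` in the failover namespace:

```bash
# Uninstall the temporary failover release in the surviving region
helm uninstall camunda --kube-context $CLUSTER_0 --namespace $CAMUNDA_NAMESPACE_0_FAILOVER

# Remove the now-empty failover namespace
kubectl --context $CLUSTER_0 delete namespace $CAMUNDA_NAMESPACE_0_FAILOVER
```

Afterwards, you can re-check the cluster topology with `zbctl status` as in the earlier steps.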
- } - desired={} - /> + +#### Switch to Normal Mode in Zeebe for Newly Created Region + +} +desired={} +/> +
#### Current Only the two Camunda Platform regions remain, without any temporary solution. -The fallback mode in the new region is still active. +The failback mode in the new region is still active. #### Desired -You restore the new region to its normal functionality by removing the fallback mode and forcefully removing the sleeping Zeebe pods. +You restore the new region to its normal functionality by removing the failback mode and forcefully removing the sleeping Zeebe pods. They would otherwise hinder the rollout since they will never be ready. #### How to get there +#### Verification +
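A possible sketch of this step, assuming that omitting the `global.multiregion.installationType` override switches the chart back to its default mode and that the sleeping brokers are the pods reported as not ready (the pod names below are purely illustrative):

```bash
# Re-deploy the new region without the failBack override
helm upgrade camunda camunda/camunda-platform \
  --version 9.3.1 \
  --kube-context $CLUSTER_1 \
  --namespace $CAMUNDA_NAMESPACE_1 \
  -f camunda-values.yml \
  -f region1/camunda-values.yml

# Identify the sleeping (never ready) Zeebe brokers ...
kubectl --context $CLUSTER_1 get pods -n $CAMUNDA_NAMESPACE_1

# ... and force-remove them so the StatefulSet rollout is not blocked.
# Replace the pod names with the ones reported as not ready above.
kubectl --context $CLUSTER_1 delete pod camunda-zeebe-1 camunda-zeebe-3 --grace-period=0 --force -n $CAMUNDA_NAMESPACE_1
```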
diff --git a/docs/self-managed/operational-guides/multi-region/img/10.svg b/docs/self-managed/operational-guides/multi-region/img/10.svg index 5e9862924af..45afbdccfeb 100644 --- a/docs/self-managed/operational-guides/multi-region/img/10.svg +++ b/docs/self-managed/operational-guides/multi-region/img/10.svg @@ -1 +1 @@ - \ No newline at end of file + \ No newline at end of file diff --git a/docs/self-managed/operational-guides/multi-region/img/11.svg b/docs/self-managed/operational-guides/multi-region/img/11.svg index bb2fa807cfe..ed2de493dc8 100644 --- a/docs/self-managed/operational-guides/multi-region/img/11.svg +++ b/docs/self-managed/operational-guides/multi-region/img/11.svg @@ -1 +1 @@ - \ No newline at end of file + \ No newline at end of file diff --git a/docs/self-managed/operational-guides/multi-region/img/12.svg b/docs/self-managed/operational-guides/multi-region/img/12.svg index d9434c66c68..c2918534765 100644 --- a/docs/self-managed/operational-guides/multi-region/img/12.svg +++ b/docs/self-managed/operational-guides/multi-region/img/12.svg @@ -1 +1 @@ - \ No newline at end of file + \ No newline at end of file diff --git a/docs/self-managed/operational-guides/multi-region/img/13.svg b/docs/self-managed/operational-guides/multi-region/img/13.svg index 95ba38fa228..e5ab3a79b3c 100644 --- a/docs/self-managed/operational-guides/multi-region/img/13.svg +++ b/docs/self-managed/operational-guides/multi-region/img/13.svg @@ -1 +1 @@ - \ No newline at end of file + \ No newline at end of file diff --git a/docs/self-managed/operational-guides/multi-region/img/14.svg b/docs/self-managed/operational-guides/multi-region/img/14.svg index fdd750f2330..492473fe1a5 100644 --- a/docs/self-managed/operational-guides/multi-region/img/14.svg +++ b/docs/self-managed/operational-guides/multi-region/img/14.svg @@ -1 +1 @@ - \ No newline at end of file + \ No newline at end of file diff --git a/docs/self-managed/operational-guides/multi-region/img/15.svg b/docs/self-managed/operational-guides/multi-region/img/15.svg index 67178e059cf..4fd23ce94e0 100644 --- a/docs/self-managed/operational-guides/multi-region/img/15.svg +++ b/docs/self-managed/operational-guides/multi-region/img/15.svg @@ -1 +1 @@ - \ No newline at end of file + \ No newline at end of file diff --git a/docs/self-managed/operational-guides/multi-region/img/3.svg b/docs/self-managed/operational-guides/multi-region/img/3.svg index 5f26611946a..6703d8c9488 100644 --- a/docs/self-managed/operational-guides/multi-region/img/3.svg +++ b/docs/self-managed/operational-guides/multi-region/img/3.svg @@ -1 +1 @@ - \ No newline at end of file + \ No newline at end of file diff --git a/docs/self-managed/operational-guides/multi-region/img/4.svg b/docs/self-managed/operational-guides/multi-region/img/4.svg index f9c21249a45..41f2701e8c3 100644 --- a/docs/self-managed/operational-guides/multi-region/img/4.svg +++ b/docs/self-managed/operational-guides/multi-region/img/4.svg @@ -1 +1 @@ - \ No newline at end of file + \ No newline at end of file diff --git a/docs/self-managed/operational-guides/multi-region/img/5.svg b/docs/self-managed/operational-guides/multi-region/img/5.svg index 43aef34dd97..b38aa23ed66 100644 --- a/docs/self-managed/operational-guides/multi-region/img/5.svg +++ b/docs/self-managed/operational-guides/multi-region/img/5.svg @@ -1 +1 @@ - \ No newline at end of file + \ No newline at end of file diff --git a/docs/self-managed/operational-guides/multi-region/img/6.svg b/docs/self-managed/operational-guides/multi-region/img/6.svg 
index 7a2a721a387..edcde812348 100644 --- a/docs/self-managed/operational-guides/multi-region/img/6.svg +++ b/docs/self-managed/operational-guides/multi-region/img/6.svg @@ -1 +1 @@ - \ No newline at end of file + \ No newline at end of file diff --git a/docs/self-managed/operational-guides/multi-region/img/7.svg b/docs/self-managed/operational-guides/multi-region/img/7.svg index 4000dbf1059..8ce6cae3502 100644 --- a/docs/self-managed/operational-guides/multi-region/img/7.svg +++ b/docs/self-managed/operational-guides/multi-region/img/7.svg @@ -1 +1 @@ - \ No newline at end of file + \ No newline at end of file diff --git a/docs/self-managed/operational-guides/multi-region/img/9.svg b/docs/self-managed/operational-guides/multi-region/img/9.svg index c2d013fdeae..79b0c50e6ee 100644 --- a/docs/self-managed/operational-guides/multi-region/img/9.svg +++ b/docs/self-managed/operational-guides/multi-region/img/9.svg @@ -1 +1 @@ - \ No newline at end of file + \ No newline at end of file diff --git a/docs/self-managed/operational-guides/multi-region/swipItem.jsx b/docs/self-managed/operational-guides/multi-region/swipItem.jsx deleted file mode 100644 index c4b6e4b94af..00000000000 --- a/docs/self-managed/operational-guides/multi-region/swipItem.jsx +++ /dev/null @@ -1,38 +0,0 @@ -import React from 'react'; - -import './swipItem.css'; - -export default function SwipItem({ current, desired, description, code}) { - return ( -
-
-
-

Current State

-
-
- {current} -
-
-
-
-

Desired State

-
-
- {desired} -
-
-
-
-

Description / Code

-
-
- {description} - {code} -
-
-
-

-
-
- ); -}; From 555552a8c727e540db874ba31e107e92447773d6 Mon Sep 17 00:00:00 2001 From: Langleu Date: Thu, 28 Mar 2024 15:23:40 +0100 Subject: [PATCH 03/13] docs(multi-region): describe all steps for operational guide --- .../multi-region/dual-region-ops.md | 470 +++++++++++++++++- 1 file changed, 455 insertions(+), 15 deletions(-) diff --git a/docs/self-managed/operational-guides/multi-region/dual-region-ops.md b/docs/self-managed/operational-guides/multi-region/dual-region-ops.md index f6a1cb149de..7672b845f86 100644 --- a/docs/self-managed/operational-guides/multi-region/dual-region-ops.md +++ b/docs/self-managed/operational-guides/multi-region/dual-region-ops.md @@ -109,9 +109,9 @@ Due to the partitioning of Zeebe, no data has been lost so far. #### Desired -You are creating a temporary Camunda Platform deployment within the same region, but different namespace, to recover functionality. +You are creating a temporary Camunda Platform deployment within the same region, but different namespace, to recover functionality. The extra namespace allows for easier distinguishing between the normal Zeebe deployment and Zeebe failover deployment. -The newly deployed Zeebe brokers will be running in failover mode to restore the quorum and allow processing again. Additionally, they will be pointed at the existing Elasticsearch instance and the newly deployed Elasticsearch instance. +The newly deployed Zeebe brokers will be running in failover mode to restore the quorum and allow processing again. Additionally, they will be pointed at the existing Elasticsearch instance and the newly deployed Elasticsearch instance to allow exporting the data again. #### How to get there @@ -119,13 +119,13 @@ In the previously cloned repository [c8-multi-region](https://github.com/camunda In the case your **Region 0** was lost, please consider the folder [aws/dual-region/kubernetes/region1](https://github.com/camunda/c8-multi-region/blob/main/aws/dual-region/kubernetes/region1/). We will refrain from mentioning both possibilities always but as you can see it's simply the other way around in case of the loss of the **Region 0**. -The chosen `camunda-values-failover.yml` requires adjustments before installing the Helm chart. +The chosen `camunda-values-failover.yml` requires adjustments before installing the Helm chart and the same has to be done for the base `camunda-values.yml` in `aws/dual-region/kubernetes`. - `ZEEBE_BROKER_CLUSTER_INITIALCONTACTPOINTS` - `ZEEBE_BROKER_EXPORTERS_ELASTICSEARCHREGION0_ARGS_URL` - `ZEEBE_BROKER_EXPORTERS_ELASTICSEARCHREGION1_ARGS_URL` -1. The bash script [generate_zeebe_helm_values.sh](https://github.com/camunda/c8-multi-region/blob/main/aws/dual-region/scripts/generate_zeebe_helm_values.sh) in the repository folder `aws/dual-region/scripts/` helps generate those values. You only have to copy and replace them within the previously mentioned yaml. It will use the exported environment variables of the environment prerequisites for namespaces and regions. Additionally, you have to pass in whether your region 0 or 1 was lost. +1. The bash script [generate_zeebe_helm_values.sh](https://github.com/camunda/c8-multi-region/blob/main/aws/dual-region/scripts/generate_zeebe_helm_values.sh) in the repository folder `aws/dual-region/scripts/` helps generate those values. You only have to copy and replace them within the previously mentioned Helm values files. It will use the exported environment variables of the environment prerequisites for namespaces and regions. 
Additionally, you have to pass in whether your region 0 or 1 was lost. ```bash ./generate_zeebe_helm_values.sh failover @@ -139,8 +139,63 @@ The chosen `camunda-values-failover.yml` requires adjustments before installing ## for a dual-region setup we recommend 8. Resulting in 4 brokers per region. ``` +
+ Example output + + +```bash +Please use the following to change the existing environment variable ZEEBE_BROKER_CLUSTER_INITIALCONTACTPOINTS in the failover Camunda Helm chart values file 'camunda-values-failover.yml'. It's part of the 'zeebe.env' path. + +- name: ZEEBE_BROKER_CLUSTER_INITIALCONTACTPOINTS + value: camunda-zeebe-0.camunda-zeebe.camunda-london.svc.cluster.local:26502,camunda-zeebe-0.camunda-zeebe.camunda-paris.svc.cluster.local:26502,camunda-zeebe-1.camunda-zeebe.camunda-london.svc.cluster.local:26502,camunda-zeebe-1.camunda-zeebe.camunda-paris.svc.cluster.local:26502,camunda-zeebe-2.camunda-zeebe.camunda-london.svc.cluster.local:26502,camunda-zeebe-2.camunda-zeebe.camunda-paris.svc.cluster.local:26502,camunda-zeebe-3.camunda-zeebe.camunda-london.svc.cluster.local:26502,camunda-zeebe-3.camunda-zeebe.camunda-paris.svc.cluster.local:26502 + +Please use the following to change the existing environment variable ZEEBE_BROKER_EXPORTERS_ELASTICSEARCHREGION0_ARGS_URL in the failover Camunda Helm chart values file 'camunda-values-failover.yml'. It's part of the 'zeebe.env' path. + +- name: ZEEBE_BROKER_EXPORTERS_ELASTICSEARCHREGION0_ARGS_URL + value: http://camunda-elasticsearch-master-hl.camunda-london.svc.cluster.local:9200 + +Please use the following to change the existing environment variable ZEEBE_BROKER_EXPORTERS_ELASTICSEARCHREGION1_ARGS_URL in the failover Camunda Helm chart values file 'camunda-values-failover.yml'. It's part of the 'zeebe.env' path. + +- name: ZEEBE_BROKER_EXPORTERS_ELASTICSEARCHREGION1_ARGS_URL + value: http://camunda-elasticsearch-master-hl.camunda-london-failover.svc.cluster.local:9200 +``` + + +
+ +2. As the script suggests, replace the environment variables within the `camunda-values-failover.yml`. +3. Repeat the adjustments for the base Helm values file `camunda-values.yml` in `aws/dual-region/kubernetes` with the same output for the mentioned environment variables. +4. From the terminal context of `aws/dual-region/kubernetes` execute: + +```bash +helm install camunda camunda/camunda-platform \ + --version 9.3.1 \ + --kube-context $CLUSTER_0 \ + --namespace $CAMUNDA_NAMESPACE_0_FAILOVER \ + -f camunda-values.yml \ + -f region0/camunda-values-failover.yml +``` + #### Verification +The following command will show the deployed pods of the failover namespace. + +Depending on your chosen `clusterSize` you should see that the failover deployment contains only a subset of Zeebe instances. + +For example 2 in the case of `clusterSize: 8`. This allows to recover the quorum. + +```bash +kubectl --context $CLUSTER_0 get pods -n $CAMUNDA_NAMESPACE_0_FAILOVER +``` + +Port-forwarding the Zeebe Gateway via `kubectl` and printing the topology should reveal that the **failover** brokers have joined the cluster. + +```bash +ZEEBE_GATEWAY_SERVICE=$(kubectl --context $CLUSTER_0 get service --selector=app\.kubernetes\.io/component=zeebe-gateway -o jsonpath='{.items[0].metadata.name}' -n $CAMUNDA_NAMESPACE_0) +kubectl --context $CLUSTER_0 port-forward services/$ZEEBE_GATEWAY_SERVICE 26500:26500 -n $CAMUNDA_NAMESPACE_0 +zbctl status --insecure --address localhost:26500 +``` + @@ -159,20 +214,35 @@ desired={} Zeebe won't be able to continue processing yet since the existing Zeebe brokers are still pointing at the Elasticsearch of the lost region. -Simply disabling the exporter would not be enough since the sequence numbers are not persistent when an exporter removed and those are required by the WebApps importers. +Simply disabling the exporter would not be enough since the sequence numbers are not persistent when an exporter is removed and those are required by the Operate and Tasklist importers. #### Desired -You are reconfiguring the existing Camunda Platform setup to point Zeebe to the temporary Elasticsearch instance. This will result in Zeebe being operational again. +You are reconfiguring the existing Camunda deployment of `CAMUNDA_NAMESPACE_0` to point Zeebe to the temporary Elasticsearch instance that was previously created in **Step 2**. The outcome will be that Zeebe is unblocked and can export data to Elasticsearch again. This allows users to interact with the Camunda Platform again. #### How to get there -```bash reference title="Example" -https://github.com/camunda/zeebe/blob/main/NOTICE.txt +In **Step 2** you have already adjusted the base Helm values file `camunda-values.yml` in `aws/dual-region/kubernetes` with the same changes as for the failover deployment for the environment variables. + +- `ZEEBE_BROKER_CLUSTER_INITIALCONTACTPOINTS` +- `ZEEBE_BROKER_EXPORTERS_ELASTICSEARCHREGION0_ARGS_URL` +- `ZEEBE_BROKER_EXPORTERS_ELASTICSEARCHREGION1_ARGS_URL` + +1. From the terminal context of `aws/dual-region/kubernetes`, you will do a Helm upgrade to update the existing Zeebe deployment in `CAMUNDA_NAMESPACE_0` to point to the failover Elasticsearch instance: + +```bash +helm upgrade camunda camunda/camunda-platform \ + --version 9.3.1 \ + --kube-context $CLUSTER_0 \ + --namespace $CAMUNDA_NAMESPACE_0 \ + -f camunda-values.yml \ + -f region0/camunda-values.yml ``` #### Verification +TODO: We can check that the yaml was updated and Zeebe is restarting. 
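For example, as a sketch (assuming the Helm release is named `camunda`, so the broker StatefulSet is called `camunda-zeebe`), you can confirm that the updated exporter URL reached the StatefulSet and that the brokers roll out with the new configuration:

```bash
# Check that the exporter now points at the failover Elasticsearch instance.
# In this example region 1 was lost, so the REGION1 exporter URL is the one that changed.
kubectl --context $CLUSTER_0 get statefulset camunda-zeebe -n $CAMUNDA_NAMESPACE_0 -o yaml \
  | grep -A1 ZEEBE_BROKER_EXPORTERS_ELASTICSEARCHREGION1_ARGS_URL

# Wait for the brokers to restart with the new configuration
kubectl --context $CLUSTER_0 rollout status statefulset/camunda-zeebe -n $CAMUNDA_NAMESPACE_0 --timeout=15m
```

Once the rollout has finished, port-forwarding the Zeebe Gateway and running `zbctl status` as in the previous step should show a healthy topology again, and Zeebe can export and process new data.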
There is currently no dedicated endpoint that reports the configured exporter URLs, so checking the StatefulSet configuration and the broker rollout, as sketched above, is the most practical confirmation.

</div>
@@ -205,13 +275,53 @@ An Elasticsearch will also be deployed but not used yet since you have to restor #### How to get there +The changes previously done in the base Helm values file `camunda-values.yml` in `aws/dual-region/kubernetes` should still be present from **Failover - Step 2**. + +In particular, the values `ZEEBE_BROKER_EXPORTERS_ELASTICSEARCHREGION0_ARGS_URL` and `ZEEBE_BROKER_EXPORTERS_ELASTICSEARCHREGION1_ARGS_URL` should solely point at the surviving region. + +In addition, the following Helm command will disable Operate and Tasklist since those will only be enabled at the end of the full region restore. It's required to keep them disabled in the newly created region due to their Elasticsearch importers. +Lastly, the `installationType` is set to `failBack` to switch the behaviour of Zeebe and prepare for this procedure. + +1. From the terminal context of `aws/dual-region/kubernetes` execute: + +```bash +helm install camunda camunda/camunda-platform \ + --version 9.3.1 \ + --kube-context $CLUSTER_1 \ + --namespace $CAMUNDA_NAMESPACE_1 \ + -f camunda-values.yml \ + -f region1/camunda-values.yml \ + --set global.multiregion.installationType=failBack \ + --set operate.enabled=false \ + --set tasklist.enabled=false +``` + #### Verification +The following command will show the deployed pods of the newly created region. + +Depending on your chosen `clusterSize` you should see that the **failback** deployment contains some Zeebe instances being ready and others unready. Those unready instances are sleeping indefinitely and is the expected behaviour. +This behaviour stems from the **failback** mode since we still have the temporary **failover**, which acts as replacement for the lost region. + +For example in the case of `clusterSize: 8`, you find 2 active Zeebe brokers and 2 unready brokers in the newly created region. + +```bash +kubectl --context $CLUSTER_1 get pods -n $CAMUNDA_NAMESPACE_1 +``` + +Port-forwarding the Zeebe Gateway via `kubectl` and printing the topology should reveal that the **failback** brokers have joined the cluster. + +```bash +ZEEBE_GATEWAY_SERVICE=$(kubectl --context $CLUSTER_0 get service --selector=app\.kubernetes\.io/component=zeebe-gateway -o jsonpath='{.items[0].metadata.name}' -n $CAMUNDA_NAMESPACE_0) +kubectl --context $CLUSTER_0 port-forward services/$ZEEBE_GATEWAY_SERVICE 26500:26500 -n $CAMUNDA_NAMESPACE_0 +zbctl status --insecure --address localhost:26500 +``` + -#### Pause Elasticsearch Exporters and WebApps +#### Pause Elasticsearch Exporters and Operate / Tasklist } @@ -234,12 +344,44 @@ You are preparing everything for the newly created region to take over again to For this, you need to stop the Zeebe exporters to not export any new data to Elasticsearch, so you can create a backup. -Additionally, you need to scale down the WebApps. This will result in users not being able to interact with the Camunda Platform anymore and is required to guarantee no new data is imported to Elasticsearch. +Additionally, you need to scale down Operate and Tasklist. This will result in users not being able to interact with the Camunda Platform anymore and is required to guarantee no new data is imported to Elasticsearch. #### How to get there +1. 
Disable the Zeebe Elasticsearch exporters in Zeebe via kubectl + +```bash +ZEEBE_GATEWAY_SERVICE=$(kubectl --context $CLUSTER_0 get service --selector=app\.kubernetes\.io/component=zeebe-gateway -o jsonpath='{.items[0].metadata.name}' -n $CAMUNDA_NAMESPACE_0) +kubectl --context $CLUSTER_0 port-forward services/$ZEEBE_GATEWAY_SERVICE 9600:9600 -n $CAMUNDA_NAMESPACE_0 +curl -i localhost:9600/actuator/exporting/pause -XPOST +# The successful response should be: +# HTTP/1.1 204 No Content +``` + +2. Disable Operate and Tasklist by scaling to 0 + +```bash +OPERATE_DEPLOYMENT=$(kubectl --context $CLUSTER_0 get deployment --selector=app\.kubernetes\.io/component=operate -o jsonpath='{.items[0].metadata.name}' -n $CAMUNDA_NAMESPACE_0) +TASKLIST_DEPLOYMENT=$(kubectl --context $CLUSTER_0 get deployment --selector=app\.kubernetes\.io/component=tasklist -o jsonpath='{.items[0].metadata.name}' -n $CAMUNDA_NAMESPACE_0) + +kubectl --context $CLUSTER_0 scale deployments/$OPERATE_DEPLOYMENT --replicas 0 +kubectl --context $CLUSTER_0 scale deployments/$TASKLIST_DEPLOYMENT --replicas 0 + +``` + #### Verification +For the Zeebe Elasticsearch exporters, there's currently no API available to confirm this. Only the response code of `204` indicates a successful disabling. + +For Operate and Tasklist, you can confirm that the deployments have successfully scaled down by listing those and indicating `0/0` ready. + +```bash +kubectl --context $CLUSTER_0 get deployments $OPERATE_DEPLOYMENT $TASKLIST_DEPLOYMENT -n $CAMUNDA_NAMESPACE_0 +# NAME READY UP-TO-DATE AVAILABLE AGE +# camunda-operate 0/0 0 0 23m +# camunda-tasklist 0/0 0 0 23m +``` + @@ -259,11 +401,166 @@ The Camunda Platform is currently not reachable by end-users and does not proces #### Desired -You are creating a backup within the temporary Elasticsearch instance and restore it in the new region. +You are creating a backup of the healthy Elasticsearch instance in `CAMUNDA_NAMESPACE_0` and restore it in the new region. This Elasticsearch backup contains all the data and may take some time to backup. The failover Elasticsearch instance only contains a subset of the data from after the region loss and is not sufficient to restore this in the new region. #### How to get there -#### Verification +This builds on top of the [AWS Setup](./../../platform-deployment/helm-kubernetes/platforms/amazon-eks/dual-region.md) and assumes that the S3 bucket was automatically created as part of the Terraform execution. + +1. Determine the S3 bucket name by retrieving it via Terraform. Go to `aws/dual-region/terraform` within the repository and retrieve the bucket name from the Terraform state. + +```bash +export S3_BUCKET_NAME=$(terraform output -raw s3_bucket_name) +``` + +2. Configure Elasticsearch backup endpoint in the healthy namespace `CAMUNDA_NAMESPACE_0` + +```bash +ELASTIC_POD=$(kubectl --context $CLUSTER_0 get pod --selector=app\.kubernetes\.io/name=elasticsearch -o jsonpath='{.items[0].metadata.name}' -n $CAMUNDA_NAMESPACE_0) +kubectl --context $CLUSTER_0 exec -n $CAMUNDA_NAMESPACE_0 -it $ELASTIC_POD -- curl -XPUT "http://localhost:9200/_snapshot/camunda_backup" -H "Content-Type: application/json" -d' +{ + "type": "s3", + "settings": { + "bucket": "'$S3_BUCKET_NAME'", + "client": "camunda", + "base_path": "backups" + } +} +' +``` + +3. Create an Elasticsearch backup in the healthy namespace `CAMUNDA_NAMESPACE_0`. Depending on the amount of data, this operation will take a while to complete. 
+ +```bash +# The backup will be called failback +kubectl --context $CLUSTER_0 exec -n $CAMUNDA_NAMESPACE_0 -it $ELASTIC_POD -- curl -XPUT "http://localhost:9200/_snapshot/camunda_backup/failback?wait_for_completion=true" +``` + +4. Verify that the backup has been completed successfully by checking all backups and ensuring the `state` is `SUCCESS` + +```bash +kubectl --context $CLUSTER_0 exec -n $CAMUNDA_NAMESPACE_0 -it $ELASTIC_POD -- curl -XGET "http://localhost:9200/_snapshot/camunda_backup/_all" +``` + +
+ Example Output + + +```bash +{ + "snapshots": [ + { + "snapshot": "failback", + "uuid": "uTHGdUAYSk-91aAS0sMKFQ", + "repository": "camunda_backup", + "version_id": 8090299, + "version": "8.9.2", + "indices": [ + "operate-web-session-1.1.0_", + "tasklist-form-8.4.0_", + "operate-process-8.3.0_", + "zeebe-record_process-instance-creation_8.4.5_2024-03-28", + "operate-batch-operation-1.0.0_", + "operate-user-1.2.0_", + "operate-incident-8.3.1_", + "zeebe-record_job_8.4.5_2024-03-28", + "operate-variable-8.3.0_", + "tasklist-web-session-1.1.0_", + "tasklist-draft-task-variable-8.3.0_", + "operate-operation-8.4.0_", + "zeebe-record_process_8.4.5_2024-03-28", + ".ds-.logs-deprecation.elasticsearch-default-2024.03.28-000001", + "tasklist-process-8.4.0_", + "operate-metric-8.3.0_", + "operate-flownode-instance-8.3.1_", + "tasklist-flownode-instance-8.3.0_", + "tasklist-variable-8.3.0_", + "tasklist-metric-8.3.0_", + "operate-post-importer-queue-8.3.0_", + "tasklist-task-variable-8.3.0_", + "operate-event-8.3.0_", + "tasklist-process-instance-8.3.0_", + "operate-import-position-8.3.0_", + "operate-decision-requirements-8.3.0_", + "zeebe-record_command-distribution_8.4.5_2024-03-28", + "operate-list-view-8.3.0_", + "zeebe-record_process-instance_8.4.5_2024-03-28", + "tasklist-import-position-8.2.0_", + "tasklist-user-1.4.0_", + "operate-decision-instance-8.3.0_", + "zeebe-record_deployment_8.4.5_2024-03-28", + "operate-migration-steps-repository-1.1.0_", + "tasklist-migration-steps-repository-1.1.0_", + ".ds-ilm-history-5-2024.03.28-000001", + "operate-decision-8.3.0_", + "operate-sequence-flow-8.3.0_", + "tasklist-task-8.4.0_" + ], + "data_streams": [ + "ilm-history-5", + ".logs-deprecation.elasticsearch-default" + ], + "include_global_state": true, + "state": "SUCCESS", + "start_time": "2024-03-28T03:17:38.340Z", + "start_time_in_millis": 1711595858340, + "end_time": "2024-03-28T03:17:39.340Z", + "end_time_in_millis": 1711595859340, + "duration_in_millis": 1000, + "failures": [], + "shards": { + "total": 43, + "failed": 0, + "successful": 43 + }, + "feature_states": [] + } + ], + "total": 1, + "remaining": 0 +} +``` + + +
+ +5. Configure Elasticsearch backup endpoint in the new region namespace `CAMUNDA_NAMESPACE_1`. It's essential to only do this step now as otherwise it won't see the backup. + +```bash +ELASTIC_POD=$(kubectl --context $CLUSTER_1 get pod --selector=app\.kubernetes\.io/name=elasticsearch -o jsonpath='{.items[0].metadata.name}' -n $CAMUNDA_NAMESPACE_1) +kubectl --context $CLUSTER_1 exec -n $CAMUNDA_NAMESPACE_1 -it $ELASTIC_POD -- curl -XPUT "http://localhost:9200/_snapshot/camunda_backup" -H "Content-Type: application/json" -d' +{ + "type": "s3", + "settings": { + "bucket": "'$S3_BUCKET_NAME'", + "client": "camunda", + "base_path": "backups" + } +} +' +``` + +6. Verify that the backup can be found in the shared S3 bucket + +```bash +kubectl --context $CLUSTER_1 exec -n $CAMUNDA_NAMESPACE_1 -it $ELASTIC_POD -- curl -XGET "http://localhost:9200/_snapshot/camunda_backup/_all" +``` + +The example output above should be the same since it's the same backup. + +7. Restore Elasticsearch backup in the new region namespace `CAMUNDA_NAMESPACE_1`. Depending on the amount of data, this operation will take a while to complete. + +```bash +kubectl --context $CLUSTER_1 exec -n $CAMUNDA_NAMESPACE_1 -it $ELASTIC_POD -- curl -XPOST "http://localhost:9200/_snapshot/camunda_backup/failback/_restore?wait_for_completion=true" +``` + +8. Verify that the restore has been completed successfully in the new region. + +```bash +kubectl --context $CLUSTER_1 exec -n $CAMUNDA_NAMESPACE_1 -it $ELASTIC_POD -- curl -XPOST "http://localhost:9200/_snapshot/camunda_backup/failback/_status" +``` + +TODO: provide example output. @@ -289,17 +586,104 @@ The Camunda Platform remains unreachable by end-users as you proceed to restore You are pointing all Camunda Platforms from the temporary Elasticsearch to the Elasticsearch in the new region. -The exporters will remain paused but ultimately data will be exported to both regions again. +The Elasticsearch exporters will remain paused during this step. #### How to get there +Your `camunda-values-failover.yml` and base `camunda-values.yml` require adjustments again to reconfigure all installations to the Elasticsearch instance in the new region. + +- `ZEEBE_BROKER_EXPORTERS_ELASTICSEARCHREGION0_ARGS_URL` +- `ZEEBE_BROKER_EXPORTERS_ELASTICSEARCHREGION1_ARGS_URL` + +1. The bash script [generate_zeebe_helm_values.sh](https://github.com/camunda/c8-multi-region/blob/main/aws/dual-region/scripts/generate_zeebe_helm_values.sh) in the repository folder `aws/dual-region/scripts/` helps generate those values again. You only have to copy and replace them within the previously mentioned Helm values files. It will use the exported environment variables of the environment prerequisites for namespaces and regions. + +```bash +./generate_zeebe_helm_values.sh + +# It will ask you to provide the following values +# Enter Helm release name used for installing Camunda 8 in both Kubernetes clusters: +## the way you'll call the Helm release, for example camunda +# Enter Zeebe cluster size (total number of Zeebe brokers in both Kubernetes clusters): +## for a dual-region setup we recommend 8. Resulting in 4 brokers per region. +``` + +
+ Example output + + +```bash +Please use the following to change the existing environment variable ZEEBE_BROKER_CLUSTER_INITIALCONTACTPOINTS in the failover Camunda Helm chart values file 'camunda-values-failover.yml'. It's part of the 'zeebe.env' path. + +- name: ZEEBE_BROKER_CLUSTER_INITIALCONTACTPOINTS + value: camunda-zeebe-0.camunda-zeebe.camunda-london.svc.cluster.local:26502,camunda-zeebe-0.camunda-zeebe.camunda-paris.svc.cluster.local:26502,camunda-zeebe-1.camunda-zeebe.camunda-london.svc.cluster.local:26502,camunda-zeebe-1.camunda-zeebe.camunda-paris.svc.cluster.local:26502,camunda-zeebe-2.camunda-zeebe.camunda-london.svc.cluster.local:26502,camunda-zeebe-2.camunda-zeebe.camunda-paris.svc.cluster.local:26502,camunda-zeebe-3.camunda-zeebe.camunda-london.svc.cluster.local:26502,camunda-zeebe-3.camunda-zeebe.camunda-paris.svc.cluster.local:26502 + +Please use the following to change the existing environment variable ZEEBE_BROKER_EXPORTERS_ELASTICSEARCHREGION0_ARGS_URL in the failover Camunda Helm chart values file 'camunda-values-failover.yml'. It's part of the 'zeebe.env' path. + +- name: ZEEBE_BROKER_EXPORTERS_ELASTICSEARCHREGION0_ARGS_URL + value: http://camunda-elasticsearch-master-hl.camunda-london.svc.cluster.local:9200 + +Please use the following to change the existing environment variable ZEEBE_BROKER_EXPORTERS_ELASTICSEARCHREGION1_ARGS_URL in the failover Camunda Helm chart values file 'camunda-values-failover.yml'. It's part of the 'zeebe.env' path. + +- name: ZEEBE_BROKER_EXPORTERS_ELASTICSEARCHREGION1_ARGS_URL + value: http://camunda-elasticsearch-master-hl.camunda-paris.svc.cluster.local:9200 +``` + + +
+ +2. As the script suggests, replace the environment variables within the `camunda-values-failover.yml`. +3. Repeat the adjustments for the base Helm values file `camunda-values.yml` in `aws/dual-region/kubernetes` with the same output for the mentioned environment variables. +4. Upgrade the normal Camunda environment in `CAMUNDA_NAMESPACE_0` and `REGION 0` to point to the new Elasticsearch + +```bash +helm upgrade camunda camunda/camunda-platform \ + --version 9.3.1 \ + --kube-context $CLUSTER_0 \ + --namespace $CAMUNDA_NAMESPACE_0 \ + -f camunda-values.yml \ + -f region0/camunda-values.yml \ + --set operate.enabled=false \ + --set tasklist.enabled=false +``` + +5. Upgrade the failover Camunda environment in `CAMUNDA_NAMESPACE_0_FAILOVER` and `REGION 0` to point to the new Elasticsearch + +```bash +helm upgrade camunda camunda/camunda-platform \ + --version 9.3.1 \ + --kube-context $CLUSTER_0 \ + --namespace $CAMUNDA_NAMESPACE_0_FAILOVER \ + -f camunda-values.yml \ + -f region0/camunda-values-failover.yml +``` + +6. Upgrade the new region environment in `CAMUNDA_NAMESPACE_1` and `REGION 1` to point to the new Elasticsearch + +```bash +helm install camunda camunda/camunda-platform \ + --version 9.3.1 \ + --kube-context $CLUSTER_1 \ + --namespace $CAMUNDA_NAMESPACE_1 \ + -f camunda-values.yml \ + -f region1/camunda-values.yml \ + --set global.multiregion.installationType=failBack \ + --set operate.enabled=false \ + --set tasklist.enabled=false +``` + +7. Delete the sleeping pods in the new region, as those are blocking a successful rollout due to the failback mode. + +```bash +kubectl --context $CLUSTER_1 --namespace $CAMUNDA_NAMESPACE_1 delete pods --selector=app\.kubernetes\.io/component=zeebe-broker +``` + #### Verification
-#### Reactivate Exporters and WebApps +#### Reactivate Exporters and Operate / Tasklist } @@ -314,10 +698,43 @@ The Camunda Platforms are pointing at the Elasticsearch instances in both region #### Desired -You are reactivating the exporters and enabling the WebApps again within the two regions. This will allow users to interact with the Camunda Platform again. +You are reactivating the exporters and enabling Operate and Tasklist again within the two regions. This will allow users to interact with the Camunda Platform again. #### How to get there +1. Upgrade the normal Camunda environment in `CAMUNDA_NAMESPACE_0` and `REGION 0` to deploy Operate and Tasklist. + +```bash +helm upgrade camunda camunda/camunda-platform \ + --version 9.3.1 \ + --kube-context $CLUSTER_0 \ + --namespace $CAMUNDA_NAMESPACE_0 \ + -f camunda-values.yml \ + -f region0/camunda-values.yml +``` + +2. Upgrade the new region environment in `CAMUNDA_NAMESPACE_1` and `REGION 1` to deploy Operate and Tasklist. + +```bash +helm install camunda camunda/camunda-platform \ + --version 9.3.1 \ + --kube-context $CLUSTER_1 \ + --namespace $CAMUNDA_NAMESPACE_1 \ + -f camunda-values.yml \ + -f region1/camunda-values.yml \ + --set global.multiregion.installationType=failBack +``` + +3. Reactivate the exporters by sending the API activation request via the Zeebe Gateway + +```bash +ZEEBE_GATEWAY_SERVICE=$(kubectl --context $CLUSTER_0 get service --selector=app\.kubernetes\.io/component=zeebe-gateway -o jsonpath='{.items[0].metadata.name}' -n $CAMUNDA_NAMESPACE_0) +kubectl --context $CLUSTER_0 port-forward services/$ZEEBE_GATEWAY_SERVICE 9600:9600 -n $CAMUNDA_NAMESPACE_0 +curl -i localhost:9600/actuator/exporting/resume -XPOST +# The successful response should be: +# HTTP/1.1 204 No Content +``` + #### Verification @@ -343,6 +760,12 @@ You can remove the temporary failover solution since it is not required anymore. #### How to get there +1. You can uninstall the failover installation via Helm. + +```bash +helm uninstall camunda --kube-context $CLUSTER_0 --namespace $CAMUNDA_NAMESPACE_0_FAILOVER +``` + #### Verification @@ -372,6 +795,23 @@ They would otherwise hinder the rollout since they will never be ready. #### How to get there +1. Upgrade the new region environment in `CAMUNDA_NAMESPACE_1` and `REGION 1` by removing the failback mode + +```bash +helm install camunda camunda/camunda-platform \ + --version 9.3.1 \ + --kube-context $CLUSTER_1 \ + --namespace $CAMUNDA_NAMESPACE_1 \ + -f camunda-values.yml \ + -f region1/camunda-values.yml +``` + +2. Delete the sleeping pods in the new region, as those are blocking a successful rollout due to the failback mode. 
+ +```bash +kubectl --context $CLUSTER_1 --namespace $CAMUNDA_NAMESPACE_1 delete pods --selector=app\.kubernetes\.io/component=zeebe-broker +``` + #### Verification From 48308bc5928c53021d0c7601789786a608d97542 Mon Sep 17 00:00:00 2001 From: Langleu Date: Thu, 28 Mar 2024 15:58:57 +0100 Subject: [PATCH 04/13] docs(dual-region): adjust viewbox and add more verification steps --- .../multi-region/dual-region-ops.md | 124 ++++++++++++------ 1 file changed, 83 insertions(+), 41 deletions(-) diff --git a/docs/self-managed/operational-guides/multi-region/dual-region-ops.md b/docs/self-managed/operational-guides/multi-region/dual-region-ops.md index 7672b845f86..7e83bbc93be 100644 --- a/docs/self-managed/operational-guides/multi-region/dual-region-ops.md +++ b/docs/self-managed/operational-guides/multi-region/dual-region-ops.md @@ -66,8 +66,8 @@ Please ensure to have followed the points [environment prerequisites](./../../pl #### Ensure Network Disconnection } -desired={} +current={} +desired={} />
@@ -95,8 +95,8 @@ Potential approaches are the following: #### Deploy Temporary Camunda 8 Installation in Failover Mode in Existing Region } -desired={} +current={} +desired={} />
@@ -204,8 +204,8 @@ zbctl status --insecure --address localhost:26500 #### Adjust Elasticsearch Exporters Endpoints to Temporary Deployment } -desired={} +current={} +desired={} />
@@ -255,8 +255,8 @@ TODO: We can check that the yaml was updated and Zeebe is restarting. Not sure t #### Deploy Camunda 8 in Failback Mode in Newly Created Region } -desired={} +current={} +desired={} />
@@ -324,8 +324,8 @@ zbctl status --insecure --address localhost:26500 #### Pause Elasticsearch Exporters and Operate / Tasklist } -desired={} +current={} +desired={} />

@@ -340,25 +340,29 @@ You currently have the following setups:

#### Desired

:::warning

This step is very important to minimize the risk of losing any data when restoring the backup in the new region.

There remains a small chance of losing some data in Elasticsearch (and in turn in Operate and Tasklist). This is because Zeebe might have exported some records to the failover Elasticsearch in `REGION_0`, but not yet to the main Elasticsearch in `REGION_0`, before the exporters were paused. Those records are therefore not included in the `REGION_0` Elasticsearch backup. When the new `REGION_1` Elasticsearch is restored from the `REGION_0` backup, the new region is missing those records and Zeebe does not re-export them.

:::

You are preparing everything for the newly created region to take over again to restore the benefits of a dual-region setup.

For this, you need to stop the Zeebe exporters to not export any new data to Elasticsearch, so you can create a backup.

Additionally, you need to scale down Operate and Tasklist. This will result in users not being able to interact with the Camunda Platform anymore and is required to guarantee no new data is imported to Elasticsearch.

:::note

This **does not** affect the processing of process instances in any way. The impact is that some information about the affected instances might not be visible in Operate.

:::

#### How to get there

1. Disable Operate and Tasklist by scaling to 0

```bash
OPERATE_DEPLOYMENT=$(kubectl --context $CLUSTER_0 get deployment --selector=app\.kubernetes\.io/component=operate -o jsonpath='{.items[0].metadata.name}' -n $CAMUNDA_NAMESPACE_0)
TASKLIST_DEPLOYMENT=$(kubectl --context $CLUSTER_0 get deployment --selector=app\.kubernetes\.io/component=tasklist -o jsonpath='{.items[0].metadata.name}' -n $CAMUNDA_NAMESPACE_0)

kubectl --context $CLUSTER_0 scale deployments/$OPERATE_DEPLOYMENT --replicas 0
kubectl --context $CLUSTER_0 scale deployments/$TASKLIST_DEPLOYMENT --replicas 0

```

2. Disable the Zeebe Elasticsearch exporters in Zeebe via kubectl

```bash
ZEEBE_GATEWAY_SERVICE=$(kubectl --context $CLUSTER_0 get service --selector=app\.kubernetes\.io/component=zeebe-gateway -o jsonpath='{.items[0].metadata.name}' -n $CAMUNDA_NAMESPACE_0)
kubectl --context $CLUSTER_0 port-forward services/$ZEEBE_GATEWAY_SERVICE 9600:9600 -n $CAMUNDA_NAMESPACE_0
curl -i localhost:9600/actuator/exporting/pause -XPOST
# The successful response should be:
# HTTP/1.1 204 No Content
```

#### Verification

For Operate and Tasklist, you can confirm that the deployments have successfully scaled down by listing those and indicating `0/0` ready.

```bash
kubectl --context $CLUSTER_0 get deployments $OPERATE_DEPLOYMENT $TASKLIST_DEPLOYMENT -n $CAMUNDA_NAMESPACE_0
# NAME               READY   UP-TO-DATE   AVAILABLE   AGE
# camunda-operate    0/0     0            0           23m
# camunda-tasklist   0/0     0            0           23m
```

For the Zeebe Elasticsearch exporters, there's currently no API available to confirm this. Only the response code of `204` indicates a successful disabling.

</div>
@@ -389,8 +403,8 @@ kubectl --context $CLUSTER_0 get deployments $OPERATE_DEPLOYMENT $TASKLIST_DEPLO #### Create and Restore Elasticsearch Backup } -desired={} +current={} +desired={} />
@@ -570,8 +584,8 @@ TODO: provide example output. #### Adjust Elasticsearch Exporters Endpoints to Newly Created Region } -desired={} +current={} +desired={} />
@@ -660,7 +674,7 @@ helm upgrade camunda camunda/camunda-platform \ 6. Upgrade the new region environment in `CAMUNDA_NAMESPACE_1` and `REGION 1` to point to the new Elasticsearch ```bash -helm install camunda camunda/camunda-platform \ +helm upgrade camunda camunda/camunda-platform \ --version 9.3.1 \ --kube-context $CLUSTER_1 \ --namespace $CAMUNDA_NAMESPACE_1 \ @@ -686,8 +700,8 @@ kubectl --context $CLUSTER_1 --namespace $CAMUNDA_NAMESPACE_1 delete pods --sele #### Reactivate Exporters and Operate / Tasklist } -desired={} +current={} +desired={} />
@@ -716,7 +730,7 @@ helm upgrade camunda camunda/camunda-platform \ 2. Upgrade the new region environment in `CAMUNDA_NAMESPACE_1` and `REGION 1` to deploy Operate and Tasklist. ```bash -helm install camunda camunda/camunda-platform \ +helm upgrade camunda camunda/camunda-platform \ --version 9.3.1 \ --kube-context $CLUSTER_1 \ --namespace $CAMUNDA_NAMESPACE_1 \ @@ -744,19 +758,19 @@ curl -i localhost:9600/actuator/exporting/resume -XPOST #### Remove Temporary Failover Installation } -desired={} +current={} +desired={} />
#### Current -The Camunda Platform is healthy and running in two regions again. +The Camunda Platform is healthy and running in two regions again. You have redeployed Operate and Tasklist and enabled the Elasticsearch exporters again. This will allow users to interact with Camunda 8 again. #### Desired -You can remove the temporary failover solution since it is not required anymore. +You can remove the temporary failover solution since it is not required anymore and would hinder disablement of the failback mode within the new region. #### How to get there @@ -766,8 +780,28 @@ You can remove the temporary failover solution since it is not required anymore. helm uninstall camunda --kube-context $CLUSTER_0 --namespace $CAMUNDA_NAMESPACE_0_FAILOVER ``` +2. Delete the leftover persistent volume claims of the Camunda 8 components + +```bash +kubectl --context $CLUSTER_0 delete pvc --all -n $CAMUNDA_NAMESPACE_0_FAILOVER +``` + #### Verification +The following will show the pods within the namespace. You deleted the Helm installation in the failover namespace, which should result in no pods or in deletion state. + +```bash +kubectl --context $CLUSTER_0 get pods -n $CAMUNDA_NAMESPACE_0_FAILOVER +``` + +Port-forwarding the Zeebe Gateway via `kubectl` and printing the topology should reveal that the failover brokers are missing. + +```bash +ZEEBE_GATEWAY_SERVICE=$(kubectl --context $CLUSTER_0 get service --selector=app\.kubernetes\.io/component=zeebe-gateway -o jsonpath='{.items[0].metadata.name}' -n $CAMUNDA_NAMESPACE_0) +kubectl --context $CLUSTER_0 port-forward services/$ZEEBE_GATEWAY_SERVICE 26500:26500 -n $CAMUNDA_NAMESPACE_0 +zbctl status --insecure --address localhost:26500 +``` +
@@ -775,30 +809,30 @@ helm uninstall camunda --kube-context $CLUSTER_0 --namespace $CAMUNDA_NAMESPACE_ #### Switch to Normal Mode in Zeebe for Newly Created Region } -desired={} +current={} +desired={} />
#### Current -Only the two Camunda Platform regions remain, without any temporary solution. +You have almost fully restored the dual-region setup. Two Camunda deployments exist in two different regions. -The failback mode in the new region is still active. +The failback mode is still enabled in the restored region. #### Desired -You restore the new region to its normal functionality by removing the failback mode and forcefully removing the sleeping Zeebe pods. +You restore the new region to its normal functionality by removing the failback mode and forcefully removing the sleeping Zeebe pods. They would otherwise hinder the rollout since they will never be ready. -They would otherwise hinder the rollout since they will never be ready. +With this done Zeebe is fully functional again and you are prepared in case of another region loss. #### How to get there 1. Upgrade the new region environment in `CAMUNDA_NAMESPACE_1` and `REGION 1` by removing the failback mode ```bash -helm install camunda camunda/camunda-platform \ +helm upgrade camunda camunda/camunda-platform \ --version 9.3.1 \ --kube-context $CLUSTER_1 \ --namespace $CAMUNDA_NAMESPACE_1 \ @@ -814,6 +848,14 @@ kubectl --context $CLUSTER_1 --namespace $CAMUNDA_NAMESPACE_1 delete pods --sele #### Verification +Port-forwarding the Zeebe Gateway via `kubectl` and printing the topology should reveal that all brokers have joined the Zeebe cluster again. + +```bash +ZEEBE_GATEWAY_SERVICE=$(kubectl --context $CLUSTER_0 get service --selector=app\.kubernetes\.io/component=zeebe-gateway -o jsonpath='{.items[0].metadata.name}' -n $CAMUNDA_NAMESPACE_0) +kubectl --context $CLUSTER_0 port-forward services/$ZEEBE_GATEWAY_SERVICE 26500:26500 -n $CAMUNDA_NAMESPACE_0 +zbctl status --insecure --address localhost:26500 +``` +
From 78219d083a8a66db486834958e4ce4f76af21b57 Mon Sep 17 00:00:00 2001 From: Langleu Date: Thu, 28 Mar 2024 16:15:11 +0100 Subject: [PATCH 05/13] docs(dual-region): apply prettier suggestion --- .../components/stateContainer.jsx | 65 ++++++++++--------- 1 file changed, 33 insertions(+), 32 deletions(-) diff --git a/docs/self-managed/operational-guides/multi-region/components/stateContainer.jsx b/docs/self-managed/operational-guides/multi-region/components/stateContainer.jsx index 70b298caada..57b457d3500 100644 --- a/docs/self-managed/operational-guides/multi-region/components/stateContainer.jsx +++ b/docs/self-managed/operational-guides/multi-region/components/stateContainer.jsx @@ -2,37 +2,38 @@ import React from "react"; import "./stateContainer.css"; -export default function StateContainer({ current, desired, description, code }) { -return ( -
-
-
-
Current State
+export default function StateContainer({ + current, + desired, + description, + code, +}) { + return ( +
+
+
+
Current State
+
+
{current}
+
+
+
+
Desired State
+
+
{desired}
+
+
+
+

Description / Code

+
+
+ {description} + {code} +
+
+
+

+
-
- {current} -
-
-
-
-
Desired State
-
-
- {desired} -
-
-
-
-

Description / Code

-
-
- {description} - {code} -
-
-
-

-
-
-); + ); } From 865655e3aa61a31cd059ccb746406d90ecc599cc Mon Sep 17 00:00:00 2001 From: Langleu Date: Tue, 2 Apr 2024 10:16:45 +0200 Subject: [PATCH 06/13] docs(dual-region): move helm chart release and version to env var --- .../multi-region/dual-region-ops.md | 42 +++++++++---------- .../platforms/amazon-eks/dual-region.md | 11 ++--- 2 files changed, 23 insertions(+), 30 deletions(-) diff --git a/docs/self-managed/operational-guides/multi-region/dual-region-ops.md b/docs/self-managed/operational-guides/multi-region/dual-region-ops.md index 7e83bbc93be..4e7f44c8fe3 100644 --- a/docs/self-managed/operational-guides/multi-region/dual-region-ops.md +++ b/docs/self-managed/operational-guides/multi-region/dual-region-ops.md @@ -133,8 +133,6 @@ The chosen `camunda-values-failover.yml` requires adjustments before installing # It will ask you to provide the following values # Enter the region that was lost, values can either be 0 or 1: ## In our case we lost region 1, therefore input 1 -# Enter Helm release name used for installing Camunda 8 in both Kubernetes clusters: -## the way you'll call the Helm release, for example camunda # Enter Zeebe cluster size (total number of Zeebe brokers in both Kubernetes clusters): ## for a dual-region setup we recommend 8. Resulting in 4 brokers per region. ``` @@ -168,8 +166,8 @@ Please use the following to change the existing environment variable ZEEBE_BROKE 4. From the terminal context of `aws/dual-region/kubernetes` execute: ```bash -helm install camunda camunda/camunda-platform \ - --version 9.3.1 \ +helm install $HELM_RELEASE_NAME camunda/camunda-platform \ + --version $HELM_CHART_VERSION \ --kube-context $CLUSTER_0 \ --namespace $CAMUNDA_NAMESPACE_0_FAILOVER \ -f camunda-values.yml \ @@ -231,8 +229,8 @@ In **Step 2** you have already adjusted the base Helm values file `camunda-value 1. From the terminal context of `aws/dual-region/kubernetes`, you will do a Helm upgrade to update the existing Zeebe deployment in `CAMUNDA_NAMESPACE_0` to point to the failover Elasticsearch instance: ```bash -helm upgrade camunda camunda/camunda-platform \ - --version 9.3.1 \ +helm upgrade $HELM_RELEASE_NAME camunda/camunda-platform \ + --version $HELM_CHART_VERSION \ --kube-context $CLUSTER_0 \ --namespace $CAMUNDA_NAMESPACE_0 \ -f camunda-values.yml \ @@ -285,8 +283,8 @@ Lastly, the `installationType` is set to `failBack` to switch the behaviour of Z 1. From the terminal context of `aws/dual-region/kubernetes` execute: ```bash -helm install camunda camunda/camunda-platform \ - --version 9.3.1 \ +helm install $HELM_RELEASE_NAME camunda/camunda-platform \ + --version $HELM_CHART_VERSION \ --kube-context $CLUSTER_1 \ --namespace $CAMUNDA_NAMESPACE_1 \ -f camunda-values.yml \ @@ -615,8 +613,6 @@ Your `camunda-values-failover.yml` and base `camunda-values.yml` require adjustm ./generate_zeebe_helm_values.sh # It will ask you to provide the following values -# Enter Helm release name used for installing Camunda 8 in both Kubernetes clusters: -## the way you'll call the Helm release, for example camunda # Enter Zeebe cluster size (total number of Zeebe brokers in both Kubernetes clusters): ## for a dual-region setup we recommend 8. Resulting in 4 brokers per region. ``` @@ -650,8 +646,8 @@ Please use the following to change the existing environment variable ZEEBE_BROKE 4. 
Upgrade the normal Camunda environment in `CAMUNDA_NAMESPACE_0` and `REGION 0` to point to the new Elasticsearch ```bash -helm upgrade camunda camunda/camunda-platform \ - --version 9.3.1 \ +helm upgrade $HELM_RELEASE_NAME camunda/camunda-platform \ + --version $HELM_CHART_VERSION \ --kube-context $CLUSTER_0 \ --namespace $CAMUNDA_NAMESPACE_0 \ -f camunda-values.yml \ @@ -663,8 +659,8 @@ helm upgrade camunda camunda/camunda-platform \ 5. Upgrade the failover Camunda environment in `CAMUNDA_NAMESPACE_0_FAILOVER` and `REGION 0` to point to the new Elasticsearch ```bash -helm upgrade camunda camunda/camunda-platform \ - --version 9.3.1 \ +helm upgrade $HELM_RELEASE_NAME camunda/camunda-platform \ + --version $HELM_CHART_VERSION \ --kube-context $CLUSTER_0 \ --namespace $CAMUNDA_NAMESPACE_0_FAILOVER \ -f camunda-values.yml \ @@ -674,8 +670,8 @@ helm upgrade camunda camunda/camunda-platform \ 6. Upgrade the new region environment in `CAMUNDA_NAMESPACE_1` and `REGION 1` to point to the new Elasticsearch ```bash -helm upgrade camunda camunda/camunda-platform \ - --version 9.3.1 \ +helm upgrade $HELM_RELEASE_NAME camunda/camunda-platform \ + --version $HELM_CHART_VERSION \ --kube-context $CLUSTER_1 \ --namespace $CAMUNDA_NAMESPACE_1 \ -f camunda-values.yml \ @@ -719,8 +715,8 @@ You are reactivating the exporters and enabling Operate and Tasklist again withi 1. Upgrade the normal Camunda environment in `CAMUNDA_NAMESPACE_0` and `REGION 0` to deploy Operate and Tasklist. ```bash -helm upgrade camunda camunda/camunda-platform \ - --version 9.3.1 \ +helm upgrade $HELM_RELEASE_NAME camunda/camunda-platform \ + --version $HELM_CHART_VERSION \ --kube-context $CLUSTER_0 \ --namespace $CAMUNDA_NAMESPACE_0 \ -f camunda-values.yml \ @@ -730,8 +726,8 @@ helm upgrade camunda camunda/camunda-platform \ 2. Upgrade the new region environment in `CAMUNDA_NAMESPACE_1` and `REGION 1` to deploy Operate and Tasklist. ```bash -helm upgrade camunda camunda/camunda-platform \ - --version 9.3.1 \ +helm upgrade $HELM_RELEASE_NAME camunda/camunda-platform \ + --version $HELM_CHART_VERSION \ --kube-context $CLUSTER_1 \ --namespace $CAMUNDA_NAMESPACE_1 \ -f camunda-values.yml \ @@ -777,7 +773,7 @@ You can remove the temporary failover solution since it is not required anymore 1. You can uninstall the failover installation via Helm. ```bash -helm uninstall camunda --kube-context $CLUSTER_0 --namespace $CAMUNDA_NAMESPACE_0_FAILOVER +helm uninstall $HELM_RELEASE_NAME --kube-context $CLUSTER_0 --namespace $CAMUNDA_NAMESPACE_0_FAILOVER ``` 2. Delete the leftover persistent volume claims of the Camunda 8 components @@ -832,8 +828,8 @@ With this done Zeebe is fully functional again and you are prepared in case of a 1. 
Upgrade the new region environment in `CAMUNDA_NAMESPACE_1` and `REGION 1` by removing the failback mode ```bash -helm upgrade camunda camunda/camunda-platform \ - --version 9.3.1 \ +helm upgrade $HELM_RELEASE_NAME camunda/camunda-platform \ + --version $HELM_CHART_VERSION \ --kube-context $CLUSTER_1 \ --namespace $CAMUNDA_NAMESPACE_1 \ -f camunda-values.yml \ diff --git a/docs/self-managed/platform-deployment/helm-kubernetes/platforms/amazon-eks/dual-region.md b/docs/self-managed/platform-deployment/helm-kubernetes/platforms/amazon-eks/dual-region.md index 64685c706e2..4dc52eabe00 100644 --- a/docs/self-managed/platform-deployment/helm-kubernetes/platforms/amazon-eks/dual-region.md +++ b/docs/self-managed/platform-deployment/helm-kubernetes/platforms/amazon-eks/dual-region.md @@ -465,8 +465,6 @@ The base `camunda-values.yml`, in `aws/dual-region/kubernetes` requires adjustme ./generate_zeebe_helm_values.sh # It will ask you to provide the following values -# Enter Helm release name used for installing Camunda 8 in both Kubernetes clusters: -## the way you'll call the Helm release, for example camunda # Enter Zeebe cluster size (total number of Zeebe brokers in both Kubernetes clusters): ## for a dual-region setup we recommend 8. Resulting in 4 brokers per region. ``` @@ -481,7 +479,6 @@ For illustration purposes. These values will not work in your environment! ```bash ./generate_zeebe_helm_values.sh -Enter Helm release name used for installing Camunda 8 in both Kubernetes clusters: camunda Enter Zeebe cluster size (total number of Zeebe brokers in both Kubernetes clusters): 8 Please use the following to set the environment variable ZEEBE_BROKER_CLUSTER_INITIALCONTACTPOINTS in the base Camunda Helm chart values file for Zeebe. @@ -510,15 +507,15 @@ Please use the following to set the environment variable ZEEBE_BROKER_EXPORTERS_ 1. From the terminal context of `aws/dual-region/kubernetes` execute: ```bash -helm install camunda camunda/camunda-platform \ - --version 9.3.1 \ +helm install $HELM_RELEASE_NAME camunda/camunda-platform \ + --version $HELM_CHART_VERSION \ --kube-context $CLUSTER_0 \ --namespace $CAMUNDA_NAMESPACE_0 \ -f camunda-values.yml \ -f region0/camunda-values.yml -helm install camunda camunda/camunda-platform \ - --version 9.3.1 \ +helm install $HELM_RELEASE_NAME camunda/camunda-platform \ + --version $HELM_CHART_VERSION \ --kube-context $CLUSTER_1 \ --namespace $CAMUNDA_NAMESPACE_1 \ -f camunda-values.yml \ From bcf56308b5b40462ea9f90f35dfdfc0ff7dbcd09 Mon Sep 17 00:00:00 2001 From: Langleu Date: Tue, 2 Apr 2024 10:26:09 +0200 Subject: [PATCH 07/13] docs(dual-region): adjust missing links by resolving todos --- .../concepts/multi-region/dual-region.md | 24 +++++++++---------- .../platforms/amazon-eks/dual-region.md | 4 ++-- 2 files changed, 14 insertions(+), 14 deletions(-) diff --git a/docs/self-managed/concepts/multi-region/dual-region.md b/docs/self-managed/concepts/multi-region/dual-region.md index 01329d9fbc5..4adf982f0f3 100644 --- a/docs/self-managed/concepts/multi-region/dual-region.md +++ b/docs/self-managed/concepts/multi-region/dual-region.md @@ -27,9 +27,9 @@ By contrast, an **active-passive** setup designates one region as the main or ac :::danger -- Customers must develop and test [operational procedures]() in non-production environments based on the framework steps outlined by Camunda **before applying them in production setups**. 
+- Customers must develop and test [operational procedures](./../../operational-guides/multi-region/dual-region-ops.md) in non-production environments based on the framework steps outlined by Camunda **before applying them in production setups**. - Before advancing to production go-live, validating these procedures with Camunda is strongly recommended. -- Customers are solely responsible for detecting any regional failures and implementing the necessary [operational procedures](). +- Customers are solely responsible for detecting any regional failures and implementing the necessary [operational procedures](./../../operational-guides/multi-region/dual-region-ops.md). ::: @@ -102,7 +102,7 @@ In the event of a total active region loss, the following data will be lost: - Two Kubernetes clusters - OpenShift is not supported - The Kubernetes clusters need to be able to connect to each other (for example, via VPC peering) - - See an [example implementation]() of two VPC peered Kubernetes clusters based on AWS EKS. + - See an [example implementation](./../../platform-deployment/helm-kubernetes/platforms/amazon-eks/dual-region.md) of two VPC peered Kubernetes clusters based on AWS EKS. - Maximum round trip time (RTT) of 100ms between the two Kubernetes clusters - Open ports between the two Kubernetes clusters - **9200** for Elasticsearch for Zeebe to push data cross-region @@ -114,7 +114,7 @@ In the event of a total active region loss, the following data will be lost: - `replicationFactor` must be **4** to ensure that the partitions are evenly distributed across the two regions. - `partitionCount` is not restricted and depends on your workload requirements, consider having a look at [understanding sizing and scalability behavior](../../../components/best-practices/architecture/sizing-your-environment.md#understanding-sizing-and-scalability-behavior). - For further information and visualization of the partition distribution, consider consulting the documentation on [partitions](../../../components/zeebe/technical-concepts/partitions.md). -- The customers operating their Camunda 8 setup are responsible for detecting a regional failure and executing the [operational procedure](<-- TODO: link -->). +- The customers operating their Camunda 8 setup are responsible for detecting a regional failure and executing the [operational procedure](./../../operational-guides/multi-region/dual-region-ops.md). ## Limitations @@ -132,7 +132,7 @@ In the event of a total active region loss, the following data will be lost: - This is due to Connectors depending on Operate to work for inbound Connectors and potentially resulting in race condition. - During the failback procedure, there’s a small chance that some data will be lost in Elasticsearch affecting Operate and Tasklist. - This **does not** affect the processing of process instances in any way. The impact is that some information about the affected instances might not be visible in Operate and Tasklist. - - This is further explained in the [operational procedure]() during the relevant step. + - This is further explained in the [operational procedure](./../../operational-guides/multi-region/dual-region-ops.md?failback=step2#failback) during the relevant step. - Zeebe cluster scaling is not supported. - Web-Modeler is a standalone component and is not covered in this guide. - Modeling applications can operate independently outside of the automation clusters. 
@@ -156,7 +156,7 @@ In a dual-region setup, a loss of a region will invariably affect Camunda 8, reg This means the Zeebe stretch cluster will not have a quorum when half of its brokers are not reachable anymore and will stop processing any new data. This will also affect the components, as they cannot update or push new workflows. Essentially, this means the workflow engine will halt until the region failover procedure is complete. -The [operational procedure]() looks in detail at short-term recovery from a region loss and how to long-term fully re-establish the lost region. The procedure works the same way for active or passive region loss since we don't consider traffic routing (DNS) in the scenario. +The [operational procedure](./../../operational-guides/multi-region/dual-region-ops.md) looks in detail at short-term recovery from a region loss and how to long-term fully re-establish the lost region. The procedure works the same way for active or passive region loss since we don't consider traffic routing (DNS) in the scenario. ### Active region loss @@ -168,12 +168,12 @@ The loss of the active region means: The following high-level steps need to be taken in case of the active region loss: -1. Follow the [operational procedure]() to temporarily recover from the region loss and unblock the workflow engine. +1. Follow the [operational procedure](./../../operational-guides/multi-region/dual-region-ops.md#failover) to temporarily recover from the region loss and unblock the workflow engine. 2. Reroute traffic to the passive region that will now become the new active region. 3. Due to the loss of data in Operate and Tasklist, you'll have to: 1. Reassign uncompleted tasks in Tasklist. 2. Recreate batch operations in Operate. -4. Follow the [operational procedure]() to recreate a new permanent region that will become your new passive region. +4. Follow the [operational procedure](./../../operational-guides/multi-region/dual-region-ops.md#failback) to recreate a new permanent region that will become your new passive region. ### Passive region loss @@ -181,8 +181,8 @@ The loss of the passive region means the workflow engine will stop processing du The following high-level steps need to be taken in case of passive region loss: -- Follow the [operational procedure]() to temporarily recover from the region loss and unblock the workflow engine. -- Follow the [operational procedure]() to recreate a new permanent region that will become your new passive region. +- Follow the [operational procedure](./../../operational-guides/multi-region/dual-region-ops.md#failover) to temporarily recover from the region loss and unblock the workflow engine. +- Follow the [operational procedure](./../../operational-guides/multi-region/dual-region-ops.md#failback) to recreate a new permanent region that will become your new passive region. Unlike the active region loss, no data will be lost, nor will any traffic require rerouting. @@ -211,6 +211,6 @@ The described minutes for the **Recovery Time Objective** are estimated and may ## Guides -- Familiarize yourself with our [AWS setup guide]() that showcases an example setup in AWS by utilizing the managed Elastic Kubernetes Service (EKS) and VPC peering for a dual-region setup with Terraform. 
+- Familiarize yourself with our [AWS setup guide](./../../platform-deployment/helm-kubernetes/platforms/amazon-eks/dual-region.md) that showcases an example setup in AWS by utilizing the managed Elastic Kubernetes Service (EKS) and VPC peering for a dual-region setup with Terraform. - The concepts in the guide are mainly cloud-agnostic and the guide can be adopted to other cloud providers. -- Familiarize yourself with the [operational procedure]() to understand how to proceed in the case of a total region loss and how to prepare yourself to ensure smooth operations. +- Familiarize yourself with the [operational procedure](./../../operational-guides/multi-region/dual-region-ops.md) to understand how to proceed in the case of a total region loss and how to prepare yourself to ensure smooth operations. diff --git a/docs/self-managed/platform-deployment/helm-kubernetes/platforms/amazon-eks/dual-region.md b/docs/self-managed/platform-deployment/helm-kubernetes/platforms/amazon-eks/dual-region.md index 4dc52eabe00..47aecc2d281 100644 --- a/docs/self-managed/platform-deployment/helm-kubernetes/platforms/amazon-eks/dual-region.md +++ b/docs/self-managed/platform-deployment/helm-kubernetes/platforms/amazon-eks/dual-region.md @@ -66,7 +66,7 @@ You have to choose unique namespaces for Camunda 8 installations. The namespace For example, you can install Camunda 8 into the `CAMUNDA_NAMESPACE_0` namespace in the `CLUSTER_0` cluster, and `CAMUNDA_NAMESPACE_1` namespace on the `CLUSTER_1` cluster, where `CAMUNDA_NAMESPACE_0` != `CAMUNDA_NAMESPACE_1`. Using the same namespace names on both clusters won't work as CoreDNS won't be able to distinguish between traffic targeted at the local and remote cluster. -In addition to namespaces for Camunda installations, you need to create the namespaces for failover (`CAMUNDA_NAMESPACE_0_FAILOVER` in `CLUSTER_0` and `CAMUNDA_NAMESPACE_1_FAILOVER` in `CLUSTER_1`), for the case of a total region loss. This is for completeness, so you don't forget to add the mapping on region recovery. The operational procedure is handled in a different [document](#). +In addition to namespaces for Camunda installations, you need to create the namespaces for failover (`CAMUNDA_NAMESPACE_0_FAILOVER` in `CLUSTER_0` and `CAMUNDA_NAMESPACE_1_FAILOVER` in `CLUSTER_1`), for the case of a total region loss. This is for completeness, so you don't forget to add the mapping on region recovery. The operational procedure is handled in a different [document](./../../../../operational-guides/multi-region/dual-region-ops.md). ::: @@ -418,7 +418,7 @@ Key changes of the dual-region setup: - `global.multiregion.regions: 2` - indicates the use for two regions - `global.identity.auth.enabled: false` - - Identity is currently not supported. Please see the [limitations section](#) on the dual-region concept page. . + - Identity is currently not supported. Please see the [limitations section](../../../../concepts/multi-region/dual-region.md#limitations) on the dual-region concept page. - `global.elasticsearch.disableExporter: true` - disables the automatic Elasticsearch configuration of the helm chart. We will manually supply the values via environment variables. 
- `identity.enabled: false` From 2579c9ab7b2d7ffa767a9d37ef5b4332302015fc Mon Sep 17 00:00:00 2001 From: Langleu Date: Tue, 2 Apr 2024 14:15:05 +0200 Subject: [PATCH 08/13] docs(dual-region): add remaining verifcation steps --- .../multi-region/dual-region-ops.md | 116 +++++++++++++++++- 1 file changed, 112 insertions(+), 4 deletions(-) diff --git a/docs/self-managed/operational-guides/multi-region/dual-region-ops.md b/docs/self-managed/operational-guides/multi-region/dual-region-ops.md index 4e7f44c8fe3..b53c05c2e92 100644 --- a/docs/self-managed/operational-guides/multi-region/dual-region-ops.md +++ b/docs/self-managed/operational-guides/multi-region/dual-region-ops.md @@ -239,7 +239,32 @@ helm upgrade $HELM_RELEASE_NAME camunda/camunda-platform \ #### Verification -TODO: We can check that the yaml was updated and Zeebe is restarting. Not sure there's an endpoint that reports on that kind of stuff. +The following command will show the deployed pods of the healthy namespace. You should see that the Zeebe brokers have just restarted or are still restarting due to the configuration upgrade. + +```bash +kubectl --context $CLUSTER_0 get pods -n $CAMUNDA_NAMESPACE_0 +``` + +Alternatively, you can check that the Elasticsearch value was updated in the [StatefulSets](https://kubernetes.io/docs/concepts/workloads/controllers/statefulset/) configuration of the Zeebe brokers and are reflecting the previous output of the script `generate_zeebe_helm_values.sh` in **Step 2**. + +```bash +kubectl --context $CLUSTER_0 get statefulsets $HELM_RELEASE_NAME-zeebe -oyaml -n $CAMUNDA_NAMESPACE_0 | grep -A1 'ZEEBE_BROKER_EXPORTERS_ELASTICSEARCHREGION[0-1]_ARGS_URL' +``` + +
+ Example Output + + +```bash + - name: ZEEBE_BROKER_EXPORTERS_ELASTICSEARCHREGION0_ARGS_URL + value: http://camunda-elasticsearch-master-hl.camunda-primary.svc.cluster.local:9200 +-- + - name: ZEEBE_BROKER_EXPORTERS_ELASTICSEARCHREGION1_ARGS_URL + value: http://camunda-elasticsearch-master-hl.camunda-primary-failover.svc.cluster.local:9200 +``` + + +
@@ -458,7 +483,7 @@ kubectl --context $CLUSTER_0 exec -n $CAMUNDA_NAMESPACE_0 -it $ELASTIC_POD -- cu Example Output -```bash +```json { "snapshots": [ { @@ -569,10 +594,54 @@ kubectl --context $CLUSTER_1 exec -n $CAMUNDA_NAMESPACE_1 -it $ELASTIC_POD -- cu 8. Verify that the restore has been completed successfully in the new region. ```bash -kubectl --context $CLUSTER_1 exec -n $CAMUNDA_NAMESPACE_1 -it $ELASTIC_POD -- curl -XPOST "http://localhost:9200/_snapshot/camunda_backup/failback/_status" +kubectl --context $CLUSTER_1 exec -n $CAMUNDA_NAMESPACE_1 -it $ELASTIC_POD -- curl -XGET "http://localhost:9200/_snapshot/camunda_backup/failback/_status" +``` + +
+ Example Output + + +The important part being the `state: "SUCCESS"` and that `done` and `total` are equal. This is just an example and the values will differ for you! + +```json +{ + "snapshots": [ + { + "snapshot": "failback", + "repository": "camunda_backup", + "uuid": "8AmblqA2Q9WAhuDk-NO5Cg", + "state": "SUCCESS", + "include_global_state": true, + "shards_stats": { + "initializing": 0, + "started": 0, + "finalizing": 0, + "done": 43, + "failed": 0, + "total": 43 + }, + "stats": { + "incremental": { + "file_count": 145, + "size_in_bytes": 353953 + }, + "total": { + "file_count": 145, + "size_in_bytes": 353953 + }, + "start_time_in_millis": 1712058365525, + "time_in_millis": 1005 + }, + "indices": { + ... + } + } + ] +} ``` -TODO: provide example output. + +
@@ -689,6 +758,33 @@ kubectl --context $CLUSTER_1 --namespace $CAMUNDA_NAMESPACE_1 delete pods --sele #### Verification +The following command will show the deployed pods of the namespaces. You should see that the Zeebe brokers are restarting. Adjusting the command for the other cluster and namespaces should reveal the same. + +```bash +kubectl --context $CLUSTER_0 get pods -n $CAMUNDA_NAMESPACE_0 +``` + +Alternatively, you can check that the Elasticsearch value was updated in the [StatefulSets](https://kubernetes.io/docs/concepts/workloads/controllers/statefulset/) configuration of the Zeebe brokers and are reflecting the previous output of the script `generate_zeebe_helm_values.sh` in **Step 1**. + +```bash +kubectl --context $CLUSTER_0 get statefulsets $HELM_RELEASE_NAME-zeebe -oyaml -n $CAMUNDA_NAMESPACE_0 | grep -A1 'ZEEBE_BROKER_EXPORTERS_ELASTICSEARCHREGION[0-1]_ARGS_URL' +``` + +
+ Example Output + + +```bash + - name: ZEEBE_BROKER_EXPORTERS_ELASTICSEARCHREGION0_ARGS_URL + value: http://camunda-elasticsearch-master-hl.camunda-primary.svc.cluster.local:9200 +-- + - name: ZEEBE_BROKER_EXPORTERS_ELASTICSEARCHREGION1_ARGS_URL + value: http://camunda-elasticsearch-master-hl.camunda-primary-failover.svc.cluster.local:9200 +``` + + +
+
@@ -747,6 +843,18 @@ curl -i localhost:9600/actuator/exporting/resume -XPOST #### Verification +For Operate and Tasklist, you can confirm that the deployments have successfully been deployed by listing those and indicating `1/1` ready. The same command can be applied for the `CLUSTER_1` and `CAMUNDA_NAMESPACE_1`. + +```bash +kubectl --context $CLUSTER_0 get deployments -n $CAMUNDA_NAMESPACE_0 +# NAME READY UP-TO-DATE AVAILABLE AGE +# camunda-operate 1/1 1 1 3h24m +# camunda-tasklist 1/1 1 1 3h24m +# camunda-zeebe-gateway 1/1 1 1 3h24m +``` + +For the Zeebe Elasticsearch exporters, there's currently no API available to confirm this. Only the response code of `204` indicates a successful resumption. +
From 762b71629668b28826fc7b107e96fc6df6b1aa67 Mon Sep 17 00:00:00 2001 From: Langleu Date: Tue, 2 Apr 2024 14:22:23 +0200 Subject: [PATCH 09/13] docs(dual-region): add info admonition on elasticsearch backup source --- .../operational-guides/multi-region/dual-region-ops.md | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/docs/self-managed/operational-guides/multi-region/dual-region-ops.md b/docs/self-managed/operational-guides/multi-region/dual-region-ops.md index b53c05c2e92..691443e3ba0 100644 --- a/docs/self-managed/operational-guides/multi-region/dual-region-ops.md +++ b/docs/self-managed/operational-guides/multi-region/dual-region-ops.md @@ -444,6 +444,12 @@ You are creating a backup of the healthy Elasticsearch instance in `CAMUNDA_NAME This builds on top of the [AWS Setup](./../../platform-deployment/helm-kubernetes/platforms/amazon-eks/dual-region.md) and assumes that the S3 bucket was automatically created as part of the Terraform execution. +:::info + +The procedure works for other cloud providers and bare metal the same. You have to adjust the AWS S3 specific part depending on your chosen backup source for Elasticsearch. Make sure to conduct the [Elasticsearch documentation](https://www.elastic.co/guide/en/elasticsearch/reference/current/snapshot-restore.html) on snapshot and restore to learn more about it and specifically the [different supported types](https://www.elastic.co/guide/en/elasticsearch/reference/current/snapshots-register-repository.html#ess-repo-types) by Elasticsearch. + +::: + 1. Determine the S3 bucket name by retrieving it via Terraform. Go to `aws/dual-region/terraform` within the repository and retrieve the bucket name from the Terraform state. ```bash From dbbed6e2cd7448342e690b0afe1feceb0fb06dce Mon Sep 17 00:00:00 2001 From: Langleu Date: Wed, 3 Apr 2024 10:41:43 +0200 Subject: [PATCH 10/13] docs(dual-region): address review feedback first batch including changes in wording and capitalization --- .../components/stateContainer.jsx | 4 +- .../multi-region/dual-region-ops.md | 248 ++++++++++++------ .../multi-region/img/11.svg | 2 +- .../platforms/amazon-eks/dual-region.md | 3 +- optimize_sidebars.js | 4 +- sidebars.js | 2 +- 6 files changed, 171 insertions(+), 92 deletions(-) diff --git a/docs/self-managed/operational-guides/multi-region/components/stateContainer.jsx b/docs/self-managed/operational-guides/multi-region/components/stateContainer.jsx index 57b457d3500..5b33c419072 100644 --- a/docs/self-managed/operational-guides/multi-region/components/stateContainer.jsx +++ b/docs/self-managed/operational-guides/multi-region/components/stateContainer.jsx @@ -12,13 +12,13 @@ export default function StateContainer({
-
Current State
+
Current state
{current}
-
Desired State
+
Desired state
{desired}
diff --git a/docs/self-managed/operational-guides/multi-region/dual-region-ops.md b/docs/self-managed/operational-guides/multi-region/dual-region-ops.md index 691443e3ba0..222fc66ba7f 100644 --- a/docs/self-managed/operational-guides/multi-region/dual-region-ops.md +++ b/docs/self-managed/operational-guides/multi-region/dual-region-ops.md @@ -1,7 +1,7 @@ --- id: dual-region-operational-procedure -title: "Dual-Region Operational Procedure" -sidebar_label: "Dual-Region Operational Procedure" +title: "Dual-region operational procedure" +sidebar_label: "Dual-region operational procedure" description: "The operational procedure concerning dual-region setups to recover from a region loss." --- @@ -30,7 +30,9 @@ import Fifteen from './img/15.svg'; ## Introduction -The operational procedure is a step-by-step guide on how to proceed in the case of a total region failure. Allowing you to temporarily restore functionality and ultimately do a full recovery to restore the dual-region setup. The operational procedure builds on top of the [dual-region AWS setup guide](./../../platform-deployment/helm-kubernetes/platforms/amazon-eks/dual-region.md) but is generally applicable for any dual-region setup. +This operational procedure is a step-by-step guide on how to restore operations in the case of a total region failure. It explains how to temporarily restore functionality in the surviving region, and how to ultimately do a full recovery to restore the dual-region setup. The operational procedure builds on top of the [dual-region AWS setup guide](./../../platform-deployment/helm-kubernetes/platforms/amazon-eks/dual-region.md), but is generally applicable for any dual-region setup. + +Before proceeding with the operational procedure, you must thoroughly read and comprehend the contents of the [dual-region concept page](./../../concepts/multi-region/dual-region.md). This page outlines various limitations and requirements pertinent to the procedure, which are crucial for successful execution. ## Disclaimer @@ -42,28 +44,37 @@ The operational procedure is a step-by-step guide on how to proceed in the case ::: +## Prerequisites + +- A dual-region Camunda 8 setup installed in 2 different regions, preferably derived from our [AWS dual-region guide](./../../platform-deployment/helm-kubernetes/platforms/amazon-eks/dual-region.md). +- [Helm (3.x)](https://helm.sh/docs/intro/install/) for installing and upgrading the [Camunda Helm chart](https://github.com/camunda/camunda-platform-helm). +- [Kubectl (1.28.x)](https://kubernetes.io/docs/tasks/tools/#kubectl) to interact with the Kubernetes cluster. +- [zbctl](./../../../apis-tools/cli-client/index.md) to interact with the Zeebe cluster. + ## Procedure We don't differ between active and passive regions as the procedure is the same for either loss. We will focus on losing the passive region while still having the active region. -You'll have to take care of DNS considerations by rerouting traffic to the functioning region, which are disregarded in the following. +You'll need to reroute the traffic to the surviving region with the help of DNS (details on how to do that depend on your DNS setup and are not covered in this guide) + +After you've identified a region loss, before beginning the region restoration procedure, you need to ensure that the lost region cannot reconnect back (as this will hinder a successful recovery during failover and failback execution). 
-After identifying or considering a region as lost, you should ensure that it doesn't reconnect, as this will hinder a successful recovery during failover and failback execution. In case this is just temporary, Zeebe can survive a region loss but will stop processing due the loss in quorum and ultimately fill up the persistent disk before running out of volume resulting in the loss of data. +In case the region is only lost temporarily (e.g. due to the network hiccups), Zeebe can survive a region loss but will stop processing due to the loss in quorum and ultimately fill up the persistent disk before running out of volume, resulting in the loss of data. -The **failover** procedure aims to temporarily restore operations by redeploying Camunda 8 within the same region to resume workflow engine functionality. During this period, Zeebe is unable to export or process new data until it achieves quorum and the configured Elasticsearch endpoints for the exporters become accessible, which is the outcome of the failover procedure. +The **failover** phase of the procedure results in the temporary restoration of Camunda 8 functionality by redeploying it within the surviving region to resume Zeebe engine functionality. Before the completion of this phase, Zeebe is unable to export or process new data until it achieves quorum and the configured Elasticsearch endpoints for the exporters become accessible, which is the outcome of the failover procedure. -The **failback** procedure involves completely restoring the failed region, thereby restoring your dual-region setup to its full functionality. +The **failback** phase of the procedure results in completely restoring the failed region to its full functionality. It requires you to have the second region ready again for the redeployment of Camunda. -The following procedures are building on top of the work done in the [AWS setup guide](./../../platform-deployment/helm-kubernetes/platforms/amazon-eks/dual-region.md#deploy-camunda-8-to-the-clusters) about deploying Camunda 8 to a dual-region cluster. We assume you have your own copy of the [c8-multi-region](https://github.com/camunda/c8-multi-region) repository and previously done changes in the `camunda-values.yml`. +The following procedures are building on top of the work done in the [AWS setup guide](./../../platform-deployment/helm-kubernetes/platforms/amazon-eks/dual-region.md#deploy-camunda-8-to-the-clusters) about deploying Camunda 8 to two Kubernetes clusters in different regions. We assume you have your own copy of the [c8-multi-region](https://github.com/camunda/c8-multi-region) repository and previously done changes in the `camunda-values.yml` to adjust them to your setup. -Please ensure to have followed the points [environment prerequisites](./../../platform-deployment/helm-kubernetes/platforms/amazon-eks/dual-region.md#environment-prerequisites) and [deploy Camunda 8 to the clusters](./../../platform-deployment/helm-kubernetes/platforms/amazon-eks/dual-region.md#deploy-camunda-8-to-the-clusters) to have the required base to build upon. +Please ensure to have followed the points in [environment prerequisites](./../../platform-deployment/helm-kubernetes/platforms/amazon-eks/dual-region.md#environment-prerequisites) and [deploy Camunda 8 to the clusters](./../../platform-deployment/helm-kubernetes/platforms/amazon-eks/dual-region.md#deploy-camunda-8-to-the-clusters) to have Camunda 8 installed and configured for a dual-region setup. 
### Failover -#### Ensure Network Disconnection +#### Ensure network isolation between Kubernetes clusters } @@ -72,27 +83,30 @@ desired={}
-#### Current +#### Current state -The current state is that one of the regions is lost. This will result in Zeebe being unable to process anything new due to the loss in quorum, nor can it export data to Elasticsearch since one of the instances is unreachable. Neither would it export to the local region since exporters are -invoked sequentially. +One of the regions is lost, meaning Zeebe: -#### Desired +- Is unable to process new requests due to losing the quorum +- Stops exporting new data to Elasticsearch in the lost region +- Stops exporting new data to Elasticsearch in the survived region -For the failover procedure, we need to ensure that the lost region does not accidentally reconnect. You should be sure it is lost, and if so, look into measures to prevent it from reconnecting by for example utilizing the suggested solution below to isolate your active environment. +#### Desired state + +For the failover procedure, you need to ensure that the lost region does not accidentally reconnect. You should be sure it is lost, and if so, look into measures to prevent it from reconnecting by for example utilizing the suggested solution below to isolate your active environment. #### How to get there -Potential approaches are the following: +Depending on your architecture, possible approaches are: -- [Kubernetes Network Policies](https://kubernetes.io/docs/concepts/services-networking/network-policies/) -- Firewall rules to block the traffic from the lost region +- Configuring [Kubernetes Network Policies](https://kubernetes.io/docs/concepts/services-networking/network-policies/) to disable traffic flow between the clusters +- Configure firewall rules to disable traffic flow between the clusters
-#### Deploy Temporary Camunda 8 Installation in Failover Mode in Existing Region +#### Create temporary Camunda 8 installation in the failover mode in the surviving region } @@ -101,23 +115,25 @@ desired={}
-#### Current
+#### Current state

-You have made sure by previous measures, for example, firewall rules that the lost region does not reconnect during the failover procedure.
+You have previously ensured that the lost region cannot reconnect during the failover procedure.

-Due to the partitioning of Zeebe, no data has been lost so far.
+Due to the Zeebe data partitioning, no data has been lost.

-#### Desired
+#### Desired state

-You are creating a temporary Camunda Platform deployment within the same region, but different namespace, to recover functionality. The extra namespace allows for easier distinguishing between the normal Zeebe deployment and Zeebe failover deployment.
+You are creating a temporary Camunda 8 deployment within the same region, but different namespace, to recover the Zeebe cluster functionality. Using a different namespace makes it easier to distinguish between the normal Zeebe deployment and the Zeebe failover deployment.

-The newly deployed Zeebe brokers will be running in failover mode to restore the quorum and allow processing again. Additionally, they will be pointed at the existing Elasticsearch instance and the newly deployed Elasticsearch instance to allow exporting the data again.
+The newly deployed Zeebe brokers will be running in the failover mode. This will restore the quorum and the Zeebe data processing. Additionally, the new failover brokers are configured to export the data to the surviving Elasticsearch instance and to the newly deployed failover Elasticsearch instance.

#### How to get there

-In the previously cloned repository [c8-multi-region](https://github.com/camunda/c8-multi-region) navigate to the folder [aws/dual-region/kubernetes/region0](https://github.com/camunda/c8-multi-region/blob/main/aws/dual-region/kubernetes/region0/) it contains the example Helm values yaml `camunda-values-failover.yml` containing the required overlay for the **failover** mode.
+In the case **Region 1** was lost: in the previously cloned repository [c8-multi-region](https://github.com/camunda/c8-multi-region), navigate to the folder [aws/dual-region/kubernetes/region0](https://github.com/camunda/c8-multi-region/blob/main/aws/dual-region/kubernetes/region0/). It contains the example Helm values yaml `camunda-values-failover.yml` containing the required overlay for the **failover** mode.
+
+In the case when your **Region 0** was lost, please instead go to the folder [aws/dual-region/kubernetes/region1](https://github.com/camunda/c8-multi-region/blob/main/aws/dual-region/kubernetes/region1/) for the `camunda-values-failover.yml` file.

-In the case your **Region 0** was lost, please consider the folder [aws/dual-region/kubernetes/region1](https://github.com/camunda/c8-multi-region/blob/main/aws/dual-region/kubernetes/region1/). We will refrain from mentioning both possibilities always but as you can see it's simply the other way around in case of the loss of the **Region 0**.
+Later in the guide, we will refrain from always mentioning both possible scenarios (losing either Region 0 or Region 1), but as you can see, the commands for when losing **Region 1** are the other way around compared to the loss of the **Region 0**.

The chosen `camunda-values-failover.yml` requires adjustments before installing the Helm chart and the same has to be done for the base `camunda-values.yml` in `aws/dual-region/kubernetes`.

- `ZEEBE_BROKER_CLUSTER_INITIALCONTACTPOINTS`
- `ZEEBE_BROKER_EXPORTERS_ELASTICSEARCHREGION0_ARGS_URL`
- `ZEEBE_BROKER_EXPORTERS_ELASTICSEARCHREGION1_ARGS_URL`

@@ -178,9 +194,7 @@ helm install $HELM_RELEASE_NAME camunda/camunda-platform \

The following command will show the deployed pods of the failover namespace.
-Depending on your chosen `clusterSize` you should see that the failover deployment contains only a subset of Zeebe instances. - -For example 2 in the case of `clusterSize: 8`. This allows to recover the quorum. +Only the minimal amount of brokers required to restore the quorum will be deployed in the failover installation. For example, if `clusterSize` is 8, 2 Zeebe brokers will be deployed in the failover installation instead of the normal 4. This is expected. ```bash kubectl --context $CLUSTER_0 get pods -n $CAMUNDA_NAMESPACE_0_FAILOVER @@ -194,12 +208,63 @@ kubectl --context $CLUSTER_0 port-forward services/$ZEEBE_GATEWAY_SERVICE 26500: zbctl status --insecure --address localhost:26500 ``` +
+ Example output + + +```bash +Cluster size: 8 +Partitions count: 8 +Replication factor: 4 +Gateway version: 8.5.0 +Brokers: + Broker 0 - camunda-zeebe-0.camunda-zeebe.camunda-london.svc:26501 + Version: 8.5.0 + Partition 1 : Leader, Healthy + Partition 6 : Follower, Healthy + Partition 7 : Follower, Healthy + Partition 8 : Follower, Healthy + Broker 1 - camunda-zeebe-0.camunda-zeebe.camunda-london-failover.svc:26501 + Version: 8.5.0 + Partition 1 : Follower, Healthy + Partition 2 : Leader, Healthy + Partition 7 : Follower, Healthy + Partition 8 : Follower, Healthy + Broker 2 - camunda-zeebe-1.camunda-zeebe.camunda-london.svc:26501 + Version: 8.5.0 + Partition 1 : Follower, Healthy + Partition 2 : Follower, Healthy + Partition 3 : Follower, Healthy + Partition 8 : Leader, Healthy + Broker 4 - camunda-zeebe-2.camunda-zeebe.camunda-london.svc:26501 + Version: 8.5.0 + Partition 2 : Follower, Healthy + Partition 3 : Follower, Healthy + Partition 4 : Follower, Healthy + Partition 5 : Follower, Healthy + Broker 5 - camunda-zeebe-1.camunda-zeebe.camunda-london-failover.svc:26501 + Version: 8.5.0 + Partition 3 : Leader, Healthy + Partition 4 : Follower, Healthy + Partition 5 : Follower, Healthy + Partition 6 : Leader, Healthy + Broker 6 - camunda-zeebe-3.camunda-zeebe.camunda-london.svc:26501 + Version: 8.5.0 + Partition 4 : Leader, Healthy + Partition 5 : Leader, Healthy + Partition 6 : Follower, Healthy + Partition 7 : Leader, Healthy +``` + + +
+
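As an optional, scripted variant of this check, you can count the partitions that report a healthy leader in the `zbctl` output shown above. This is only a convenience sketch on top of that output format: with the `clusterSize: 8` and eight partitions from the example, you would expect a count of 8 once the failover brokers have joined and the quorum is restored.

```bash
# Assumes the port-forward to the Zeebe Gateway from the previous command is still active.
# Each partition has exactly one leader, so with 8 partitions the expected output is 8.
zbctl status --insecure --address localhost:26500 | grep -c "Leader, Healthy"
```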
-#### Adjust Elasticsearch Exporters Endpoints to Temporary Deployment +#### Configure Zeebe to export data to temporary Elasticsearch deployment } @@ -208,15 +273,23 @@ desired={}
-#### Current
+#### Current state
+
+Zeebe is not yet able to continue processing data since the Zeebe brokers in the surviving region are configured to point to the Elasticsearch instance of the lost region.
+
+:::info
+
+Simply disabling the exporter would not be helpful here, since the sequence numbers in the exported data are not persistent when an exporter configuration is removed from Zeebe settings and added back later. The correct sequence numbers are required by Operate and Tasklist to import Elasticsearch data correctly.
+
+:::

-Zeebe won't be able to continue processing yet since the existing Zeebe brokers are still pointing at the Elasticsearch of the lost region.
+#### Desired state

-Simply disabling the exporter would not be enough since the sequence numbers are not persistent when an exporter is removed and those are required by the Operate and Tasklist importers.
+You have reconfigured the existing Camunda deployment in `CAMUNDA_NAMESPACE_0` so that Zeebe exports its data to the temporary Elasticsearch instance that was previously created in **Step 2**.

-#### Desired
+The outcome will be that the Zeebe cluster is unblocked and can export data to Elasticsearch again.

-You are reconfiguring the existing Camunda deployment of `CAMUNDA_NAMESPACE_0` to point Zeebe to the temporary Elasticsearch instance that was previously created in **Step 2**. The outcome will be that Zeebe is unblocked and can export data to Elasticsearch again. This allows users to interact with the Camunda Platform again.
+Completing this step will restore regular interaction with Camunda 8 for your users, marking the conclusion of the temporary recovery.

#### How to get there

@@ -226,7 +299,7 @@ In **Step 2** you have already adjusted the base Helm values file `camunda-value

- `ZEEBE_BROKER_EXPORTERS_ELASTICSEARCHREGION0_ARGS_URL`
- `ZEEBE_BROKER_EXPORTERS_ELASTICSEARCHREGION1_ARGS_URL`

-1. From the terminal context of `aws/dual-region/kubernetes`, you will do a Helm upgrade to update the existing Zeebe deployment in `CAMUNDA_NAMESPACE_0` to point to the failover Elasticsearch instance:
+From the `aws/dual-region/kubernetes` directory, do a Helm upgrade to update the configure Zeebe deployment in `CAMUNDA_NAMESPACE_0` to point to the failover Elasticsearch instance:

```bash
helm upgrade $HELM_RELEASE_NAME camunda/camunda-platform \
  --version $HELM_CHART_VERSION \
  --kube-context $CLUSTER_0 \
  --namespace $CAMUNDA_NAMESPACE_0 \
  -f camunda-values.yml \
  -f region0/camunda-values.yml
```

#### Verification

The following command will show the deployed pods of the healthy namespace. You should see that the Zeebe brokers have just restarted or are still restarting due to the configuration upgrade.

```bash
kubectl --context $CLUSTER_0 get pods -n $CAMUNDA_NAMESPACE_0
```

Alternatively, you can check that the Elasticsearch value was updated in the [StatefulSets](https://kubernetes.io/docs/concepts/workloads/controllers/statefulset/) configuration of the Zeebe brokers and are reflecting the previous output of the script `generate_zeebe_helm_values.sh` in **Step 2**.

```bash
kubectl --context $CLUSTER_0 get statefulsets $HELM_RELEASE_NAME-zeebe -oyaml -n $CAMUNDA_NAMESPACE_0 | grep -A1 'ZEEBE_BROKER_EXPORTERS_ELASTICSEARCHREGION[0-1]_ARGS_URL'
```
- Example Output + Example output ```bash - name: ZEEBE_BROKER_EXPORTERS_ELASTICSEARCHREGION0_ARGS_URL - value: http://camunda-elasticsearch-master-hl.camunda-primary.svc.cluster.local:9200 + value: http://camunda-elasticsearch-master-hl.camunda-london.svc.cluster.local:9200 -- - name: ZEEBE_BROKER_EXPORTERS_ELASTICSEARCHREGION1_ARGS_URL - value: http://camunda-elasticsearch-master-hl.camunda-primary-failover.svc.cluster.local:9200 + value: http://camunda-elasticsearch-master-hl.camunda-london-failover.svc.cluster.local:9200 ``` @@ -275,7 +348,7 @@ kubectl --context $CLUSTER_0 get statefulsets $HELM_RELEASE_NAME-zeebe -oyaml -n -#### Deploy Camunda 8 in Failback Mode in Newly Created Region +#### Deploy Camunda 8 in the failback mode in the newly created region } @@ -284,17 +357,20 @@ desired={}
-#### Current
+#### Current state

You have temporary Zeebe brokers deployed in failover mode together with a temporary Elasticsearch within the same surviving region.

-#### Desired
+#### Desired state

You want to restore the dual-region functionality again and deploy Zeebe in failback mode to the newly restored region.

-Failback mode means that two brokers will be added to the cluster to allow processing and restore data. While two brokers are sleeping since you still have the temporary setup that you have to transfer.
+Failback mode means that `clusterSize/2` new brokers will be installed in the restored region:

-An Elasticsearch will also be deployed but not used yet since you have to restore a backup from the temporary setup.
+- `clusterSize/4` brokers are running in the normal mode, participating in processing and restoring the data
+- `clusterSize/4` brokers are temporarily running in the sleeping mode. They will switch to the normal mode later, once the failover setup is removed.
+
+An Elasticsearch instance will also be deployed in the restored region, but it is not used until the data from the surviving Elasticsearch cluster's backup has been restored into it.

#### How to get there

@@ -344,7 +420,7 @@ zbctl status --insecure --address localhost:26500

-#### Pause Elasticsearch Exporters and Operate / Tasklist
+#### Pause Zeebe exporters to Elasticsearch, pause Operate and Tasklist

}

@@ -353,33 +429,35 @@ desired={}
-#### Current +#### Current state You currently have the following setups: -- Healthy Camunda Platform -- Camunda Platform in failover mode within the same region as the healthy setup -- Camunda Platform in failback mode within a newly created region +- Functioning Zeebe cluster (in multi-region mode): + - Camunda 8 installation in the failover mode in the surviving region + - Camunda 8 installation in the failback mode in the recreated region -#### Desired +#### Desired state :::warning -This step is very important to minimise the risk of loosing any data when restoring the backup in the new region. +This step is very important to minimize the risk of losing any data when restoring the backup in the recreated region. + +There remains a small chance of losing some data in Elasticsearch (and in turn, in Operate and Tasklist too). This is because Zeebe might have exported some records to the failover Elasticsearch in the surviving region, but not to the main Elasticsearch in the surviving region, before the exporters have been paused. -There remains a small chance of losing some data in Elasticsearch (in turn in Operate and Tasklist). This is because Zeebe might have exported some records to the failover Elasticsearch in `REGION_0`, but not to the main Elasticsearch in `REGION_0` before pausing the exporters. So those records are not included in the `REGION_0` Elasticsearch backup when the new `REGION_1` Elasticsearch is restored from the `REGION_0` backup, the new region is missing those records and Zeebe does not re-export them. +This means those records will not be included in the surviving region's Elasticsearch backup when the recreated region's Elasticsearch is restored from the backup, leading to the new region missing those records (as Zeebe does not re-export them). ::: -You are preparing everything for the newly created region to take over again to restore the benefits of a dual-region setup. +You are preparing everything for the newly created region to take over again to restore the functioning dual-region setup. -For this, you need to stop the Zeebe exporters to not export any new data to Elasticsearch, so you can create a backup. +For this, you need to stop the Zeebe exporters to not export any new data to Elasticsearch, so you can create an Elasticsearch backup. -Additionally, you need to scale down Operate and Tasklist. This will result in users not being able to interact with the Camunda Platform anymore and is required to guarantee no new data is imported to Elasticsearch. +Additionally, you need to temporarily scale down Operate and Tasklist to zero replicas. This will result in users not being able to interact with Camunda 8 anymore and is required to guarantee no new data is imported to Elasticsearch. :::note -That this **does not** affect processing of process instances in any way. The impact is that some information about the affected instances might not be visible in Operate. +This **does not** affect the processing of process instances in any way. The impact is that process information about the affected instances might not be visible in Operate and Tasklist. ::: @@ -423,7 +501,7 @@ For the Zeebe Elasticsearch exporters, there's currently no API available to con -#### Create and Restore Elasticsearch Backup +#### Create and restore Elasticsearch backup } @@ -432,13 +510,13 @@ desired={}
-#### Current
+#### Current state

-The Camunda Platform is currently not reachable by end-users and does not process any new processes to allow creating a backup of Elasticsearch without losing any new data.
+The Camunda web applications are currently not reachable by end-users, and no new data is being exported to Elasticsearch. This allows creating a backup of Elasticsearch without losing any data.

-#### Desired
+#### Desired state

-You are creating a backup of the healthy Elasticsearch instance in `CAMUNDA_NAMESPACE_0` and restore it in the new region. This Elasticsearch backup contains all the data and may take some time to backup. The failover Elasticsearch instance only contains a subset of the data from after the region loss and is not sufficient to restore this in the new region.
+You are creating a backup of the main Elasticsearch instance in the surviving region and restoring it in the recreated region. This Elasticsearch backup contains all the data and may take some time to complete. The failover Elasticsearch instance only contains a subset of the data from after the region loss and is therefore not sufficient to restore the new region on its own.

#### How to get there

@@ -486,7 +564,7 @@ kubectl --context $CLUSTER_0 exec -n $CAMUNDA_NAMESPACE_0 -it $ELASTIC_POD -- cu

```
- Example Output + Example output ```json @@ -604,7 +682,7 @@ kubectl --context $CLUSTER_1 exec -n $CAMUNDA_NAMESPACE_1 -it $ELASTIC_POD -- cu ```
- Example Output + Example output The important part being the `state: "SUCCESS"` and that `done` and `total` are equal. This is just an example and the values will differ for you! @@ -654,7 +732,7 @@ The important part being the `state: "SUCCESS"` and that `done` and `total` are -#### Adjust Elasticsearch Exporters Endpoints to Newly Created Region +#### Configure Zeebe exporters to use Elasticsearch in the recreated region } @@ -663,15 +741,15 @@ desired={}
-#### Current +#### Current state -The backup of Elasticsearch has been created and restored to the new region. +The backup of Elasticsearch has been created and restored to the recreated region. -The Camunda Platform remains unreachable by end-users as you proceed to restore functionality. +The Camunda web applications remain unreachable by end-users as you proceed to restore functionality. -#### Desired +#### Desired state -You are pointing all Camunda Platforms from the temporary Elasticsearch to the Elasticsearch in the new region. +You are repointing all Zeebe brokers from the temporary Elasticsearch instance to the Elasticsearch in the recreated region. The Elasticsearch exporters will remain paused during this step. @@ -777,15 +855,15 @@ kubectl --context $CLUSTER_0 get statefulsets $HELM_RELEASE_NAME-zeebe -oyaml -n ```
- Example Output + Example output ```bash - name: ZEEBE_BROKER_EXPORTERS_ELASTICSEARCHREGION0_ARGS_URL - value: http://camunda-elasticsearch-master-hl.camunda-primary.svc.cluster.local:9200 + value: http://camunda-elasticsearch-master-hl.camunda-london.svc.cluster.local:9200 -- - name: ZEEBE_BROKER_EXPORTERS_ELASTICSEARCHREGION1_ARGS_URL - value: http://camunda-elasticsearch-master-hl.camunda-primary-failover.svc.cluster.local:9200 + value: http://camunda-elasticsearch-master-hl.camunda-london-failover.svc.cluster.local:9200 ``` @@ -795,7 +873,7 @@ kubectl --context $CLUSTER_0 get statefulsets $HELM_RELEASE_NAME-zeebe -oyaml -n -#### Reactivate Exporters and Operate / Tasklist +#### Reactivate Zeebe exporters, Operate and Tasklist } @@ -804,13 +882,13 @@ desired={}
-#### Current
+#### Current state

-The Camunda Platforms are pointing at the Elasticsearch instances in both regions again and not the temporary instance. It still remains unreachable to the end-users and no processes are advanced.
+Camunda 8 is pointing at the Elasticsearch instances in both regions again, instead of the temporary instance. The web applications still remain unreachable to end-users, and no new data is being exported yet.

-#### Desired
+#### Desired state

-You are reactivating the exporters and enabling Operate and Tasklist again within the two regions. This will allow users to interact with the Camunda Platform again.
+You are reactivating the exporters and enabling Operate and Tasklist again within the two regions. This will allow users to interact with Camunda 8 again.

#### How to get there

@@ -865,7 +943,7 @@ For the Zeebe Elasticsearch exporters, there's currently no API available to con

-#### Remove Temporary Failover Installation
+#### Remove temporary failover installation

}

@@ -874,11 +952,11 @@ desired={}
-#### Current
+#### Current state

-The Camunda Platform is healthy and running in two regions again. You have redeployed Operate and Tasklist and enabled the Elasticsearch exporters again. This will allow users to interact with Camunda 8 again.
+Camunda 8 is healthy and running in two regions again. You have redeployed Operate and Tasklist and enabled the Elasticsearch exporters again. This will allow users to interact with Camunda 8 again.

-#### Desired
+#### Desired state

You can remove the temporary failover solution since it is not required anymore and would hinder disablement of the failback mode within the new region.

@@ -916,7 +994,7 @@

-#### Switch to Normal Mode in Zeebe for Newly Created Region
+#### Switch Zeebe brokers in the recreated region to normal mode

}

@@ -925,13 +1003,13 @@ desired={}
-#### Current +#### Current state You have almost fully restored the dual-region setup. Two Camunda deployments exist in two different regions. The failback mode is still enabled in the restored region. -#### Desired +#### Desired state You restore the new region to its normal functionality by removing the failback mode and forcefully removing the sleeping Zeebe pods. They would otherwise hinder the rollout since they will never be ready. diff --git a/docs/self-managed/operational-guides/multi-region/img/11.svg b/docs/self-managed/operational-guides/multi-region/img/11.svg index ed2de493dc8..460316eee9e 100644 --- a/docs/self-managed/operational-guides/multi-region/img/11.svg +++ b/docs/self-managed/operational-guides/multi-region/img/11.svg @@ -1 +1 @@ - \ No newline at end of file + \ No newline at end of file diff --git a/docs/self-managed/platform-deployment/helm-kubernetes/platforms/amazon-eks/dual-region.md b/docs/self-managed/platform-deployment/helm-kubernetes/platforms/amazon-eks/dual-region.md index 47aecc2d281..2909ca178b0 100644 --- a/docs/self-managed/platform-deployment/helm-kubernetes/platforms/amazon-eks/dual-region.md +++ b/docs/self-managed/platform-deployment/helm-kubernetes/platforms/amazon-eks/dual-region.md @@ -19,8 +19,9 @@ This guide requires you to have previously completed or reviewed the steps taken ## Prerequisites - An [AWS account](https://docs.aws.amazon.com/accounts/latest/reference/accounts-welcome.html) to create resources within AWS. -- [Terraform (1.7.x)](https://developer.hashicorp.com/terraform/downloads) +- [Helm (3.x)](https://helm.sh/docs/intro/install/) for installing and upgrading the [Camunda Helm chart](https://github.com/camunda/camunda-platform-helm). - [Kubectl (1.28.x)](https://kubernetes.io/docs/tasks/tools/#kubectl) to interact with the cluster. +- [Terraform (1.7.x)](https://developer.hashicorp.com/terraform/downloads) ## Considerations diff --git a/optimize_sidebars.js b/optimize_sidebars.js index 1f92d15173a..21d6d93b4fb 100644 --- a/optimize_sidebars.js +++ b/optimize_sidebars.js @@ -1929,9 +1929,9 @@ module.exports = { }, { - "Multi-Region": [ + "Multi-region": [ docsLink( - "Dual-Region Operational Procedure", + "Dual-region operational procedure", "self-managed/operational-guides/multi-region/dual-region-operational-procedure/" ), ], diff --git a/sidebars.js b/sidebars.js index 16dc7709d0d..d633ca925ad 100644 --- a/sidebars.js +++ b/sidebars.js @@ -900,7 +900,7 @@ module.exports = { ], }, { - "Multi-Region": [ + "Multi-region": [ "self-managed/operational-guides/multi-region/dual-region-operational-procedure", ], }, From 315b5cc5b45ea717b88837955cae848790d41f84 Mon Sep 17 00:00:00 2001 From: Langleu Date: Wed, 3 Apr 2024 11:32:14 +0200 Subject: [PATCH 11/13] docs(dual-region): change use of environment variables --- .../multi-region/dual-region-ops.md | 188 ++++++++++-------- 1 file changed, 105 insertions(+), 83 deletions(-) diff --git a/docs/self-managed/operational-guides/multi-region/dual-region-ops.md b/docs/self-managed/operational-guides/multi-region/dual-region-ops.md index 222fc66ba7f..d14caea3adf 100644 --- a/docs/self-managed/operational-guides/multi-region/dual-region-ops.md +++ b/docs/self-managed/operational-guides/multi-region/dual-region-ops.md @@ -63,11 +63,35 @@ In case the region is only lost temporarily (e.g. 
due to the network hiccups), Z

The **failover** phase of the procedure results in the temporary restoration of Camunda 8 functionality by redeploying it within the surviving region to resume Zeebe engine functionality. Before the completion of this phase, Zeebe is unable to export or process new data until it achieves quorum and the configured Elasticsearch endpoints for the exporters become accessible, which is the outcome of the failover procedure.

-The **failback** phase of the procedure results in completely restoring the failed region to its full functionality. It requires you to have the second region ready again for the redeployment of Camunda.
+The **failback** phase of the procedure results in completely restoring the failed region to its full functionality. It requires you to have the lost region ready again for the redeployment of Camunda 8.
+
+:::warning
+
+For the **failback** procedure, your recreated region must not contain any active Camunda 8 deployments or leftover persistent volumes related to Camunda 8 or its Elasticsearch instance. You must start from a clean slate and not bring old data from the lost region, as states may have diverged.
+
+:::

The following procedures are building on top of the work done in the [AWS setup guide](./../../platform-deployment/helm-kubernetes/platforms/amazon-eks/dual-region.md#deploy-camunda-8-to-the-clusters) about deploying Camunda 8 to two Kubernetes clusters in different regions. We assume you have your own copy of the [c8-multi-region](https://github.com/camunda/c8-multi-region) repository and previously done changes in the `camunda-values.yml` to adjust them to your setup.

-Please ensure to have followed the points in [environment prerequisites](./../../platform-deployment/helm-kubernetes/platforms/amazon-eks/dual-region.md#environment-prerequisites) and [deploy Camunda 8 to the clusters](./../../platform-deployment/helm-kubernetes/platforms/amazon-eks/dual-region.md#deploy-camunda-8-to-the-clusters) to have Camunda 8 installed and configured for a dual-region setup.
+Please ensure you have followed the steps in [deploy Camunda 8 to the clusters](./../../platform-deployment/helm-kubernetes/platforms/amazon-eks/dual-region.md#deploy-camunda-8-to-the-clusters) to have Camunda 8 installed and configured for a dual-region setup.
+
+### Environment prerequisites
+
+Please ensure you have followed the steps in [environment prerequisites](./../../platform-deployment/helm-kubernetes/platforms/amazon-eks/dual-region.md#environment-prerequisites) so that the general environment variables are already set up.
+
+We will try to refrain from always mentioning both possible scenarios (losing either Region 0 or Region 1). Instead, we generalized the commands and require you to do a one-time setup to configure environment variables that help execute the procedure based on the surviving region and the region to be recreated.
+
+The following is just an example based on our assumption that **Region 1** was lost and **Region 0** survived. Adjust those depending on your setup based on the previously exported environment variables from the [AWS setup guide](../../platform-deployment/helm-kubernetes/platforms/amazon-eks/dual-region.md#environment-prerequisites). After adjusting, make sure to have exported them in your terminal.
+ +```bash +export CLUSTER_SURVIVING=$CLUSTER_0 +export CLUSTER_RECREATED=$CLUSTER_1 +export CAMUNDA_NAMESPACE_SURVIVING=$CAMUNDA_NAMESPACE_0 +export CAMUNDA_NAMESPACE_FAILOVER=$CAMUNDA_NAMESPACE_0_FAILOVER +export CAMUNDA_NAMESPACE_RECREATED=$CAMUNDA_NAMESPACE_1 +export REGION_SURVIVING=region0 +export REGION_RECREATED=region1 +``` ### Failover @@ -133,8 +157,6 @@ In the case **Region 1** was lost: in the previously cloned repository [c8-multi In the case when your **Region 0** was lost, please instead go to the folder [aws/dual-region/kubernetes/region1](https://github.com/camunda/c8-multi-region/blob/main/aws/dual-region/kubernetes/region1/) for the `camunda-values-failover.yml` file. -Later in the guide, we will refrain from always mentioning both possible scenarios (losing either Region 0 or Region 1), but as you can see, the commands for when losing **Region 1** are the other way around compared to the loss of the **Region 0**. - The chosen `camunda-values-failover.yml` requires adjustments before installing the Helm chart and the same has to be done for the base `camunda-values.yml` in `aws/dual-region/kubernetes`. - `ZEEBE_BROKER_CLUSTER_INITIALCONTACTPOINTS` @@ -184,10 +206,10 @@ Please use the following to change the existing environment variable ZEEBE_BROKE ```bash helm install $HELM_RELEASE_NAME camunda/camunda-platform \ --version $HELM_CHART_VERSION \ - --kube-context $CLUSTER_0 \ - --namespace $CAMUNDA_NAMESPACE_0_FAILOVER \ + --kube-context $CLUSTER_SURVIVING \ + --namespace $CAMUNDA_NAMESPACE_FAILOVER \ -f camunda-values.yml \ - -f region0/camunda-values-failover.yml + -f $REGION_SURVIVING/camunda-values-failover.yml ``` #### Verification @@ -197,14 +219,14 @@ The following command will show the deployed pods of the failover namespace. Only the minimal amount of brokers required to restore the quorum will be deployed in the failover installation. For example, if `clusterSize` is 8, 2 Zeebe brokers will be deployed in the failover installation instead of the normal 4. This is expected. ```bash -kubectl --context $CLUSTER_0 get pods -n $CAMUNDA_NAMESPACE_0_FAILOVER +kubectl --context $CLUSTER_SURVIVING get pods -n $CAMUNDA_NAMESPACE_FAILOVER ``` Port-forwarding the Zeebe Gateway via `kubectl` and printing the topology should reveal that the **failover** brokers have joined the cluster. ```bash -ZEEBE_GATEWAY_SERVICE=$(kubectl --context $CLUSTER_0 get service --selector=app\.kubernetes\.io/component=zeebe-gateway -o jsonpath='{.items[0].metadata.name}' -n $CAMUNDA_NAMESPACE_0) -kubectl --context $CLUSTER_0 port-forward services/$ZEEBE_GATEWAY_SERVICE 26500:26500 -n $CAMUNDA_NAMESPACE_0 +ZEEBE_GATEWAY_SERVICE=$(kubectl --context $CLUSTER_SURVIVING get service --selector=app\.kubernetes\.io/component=zeebe-gateway -o jsonpath='{.items[0].metadata.name}' -n $CAMUNDA_NAMESPACE_SURVIVING) +kubectl --context $CLUSTER_SURVIVING port-forward services/$ZEEBE_GATEWAY_SERVICE 26500:26500 -n $CAMUNDA_NAMESPACE_SURVIVING zbctl status --insecure --address localhost:26500 ``` @@ -285,7 +307,7 @@ Simply disabling the exporter would not be helpful here, since the sequence numb #### Desired state -You have reconfigured the existing Camunda deployment in `CAMUNDA_NAMESPACE_0` to point Zeebe to the export data to the temporary Elasticsearch instance that was previously created in **Step 2**. +You have reconfigured the existing Camunda deployment in `CAMUNDA_NAMESPACE_SURVIVING` to point Zeebe to the export data to the temporary Elasticsearch instance that was previously created in **Step 2**. 
The outcome will be that Zeebe cluster is unblocked and can export data to Elasticsearch again. @@ -299,29 +321,29 @@ In **Step 2** you have already adjusted the base Helm values file `camunda-value - `ZEEBE_BROKER_EXPORTERS_ELASTICSEARCHREGION0_ARGS_URL` - `ZEEBE_BROKER_EXPORTERS_ELASTICSEARCHREGION1_ARGS_URL` -From the `aws/dual-region/kubernetes` directory, do a Helm upgrade to update the configure Zeebe deployment in `CAMUNDA_NAMESPACE_0` to point to the failover Elasticsearch instance: +From the `aws/dual-region/kubernetes` directory, do a Helm upgrade to update the configuration of the Zeebe deployment in `CAMUNDA_NAMESPACE_SURVIVING` to point to the failover Elasticsearch instance: ```bash helm upgrade $HELM_RELEASE_NAME camunda/camunda-platform \ --version $HELM_CHART_VERSION \ - --kube-context $CLUSTER_0 \ - --namespace $CAMUNDA_NAMESPACE_0 \ + --kube-context $CLUSTER_SURVIVING \ + --namespace $CAMUNDA_NAMESPACE_SURVIVING \ -f camunda-values.yml \ - -f region0/camunda-values.yml + -f $REGION_SURVIVING/camunda-values.yml ``` #### Verification -The following command will show the deployed pods of the healthy namespace. You should see that the Zeebe brokers have just restarted or are still restarting due to the configuration upgrade. +The following command will show the deployed pods of the surviving namespace. You should see that the Zeebe brokers have just restarted or are still restarting due to the configuration upgrade. ```bash -kubectl --context $CLUSTER_0 get pods -n $CAMUNDA_NAMESPACE_0 +kubectl --context $CLUSTER_SURVIVING get pods -n $CAMUNDA_NAMESPACE_SURVIVING ``` Alternatively, you can check that the Elasticsearch value was updated in the [StatefulSets](https://kubernetes.io/docs/concepts/workloads/controllers/statefulset/) configuration of the Zeebe brokers and are reflecting the previous output of the script `generate_zeebe_helm_values.sh` in **Step 2**. ```bash -kubectl --context $CLUSTER_0 get statefulsets $HELM_RELEASE_NAME-zeebe -oyaml -n $CAMUNDA_NAMESPACE_0 | grep -A1 'ZEEBE_BROKER_EXPORTERS_ELASTICSEARCHREGION[0-1]_ARGS_URL' +kubectl --context $CLUSTER_SURVIVING get statefulsets $HELM_RELEASE_NAME-zeebe -oyaml -n $CAMUNDA_NAMESPACE_SURVIVING | grep -A1 'ZEEBE_BROKER_EXPORTERS_ELASTICSEARCHREGION[0-1]_ARGS_URL' ```
@@ -386,10 +408,10 @@ Lastly, the `installationType` is set to `failBack` to switch the behaviour of Z ```bash helm install $HELM_RELEASE_NAME camunda/camunda-platform \ --version $HELM_CHART_VERSION \ - --kube-context $CLUSTER_1 \ - --namespace $CAMUNDA_NAMESPACE_1 \ + --kube-context $CLUSTER_RECREATED \ + --namespace $CAMUNDA_NAMESPACE_RECREATED \ -f camunda-values.yml \ - -f region1/camunda-values.yml \ + -f $REGION_RECREATED/camunda-values.yml \ --set global.multiregion.installationType=failBack \ --set operate.enabled=false \ --set tasklist.enabled=false @@ -405,14 +427,14 @@ This behaviour stems from the **failback** mode since we still have the temporar For example in the case of `clusterSize: 8`, you find 2 active Zeebe brokers and 2 unready brokers in the newly created region. ```bash -kubectl --context $CLUSTER_1 get pods -n $CAMUNDA_NAMESPACE_1 +kubectl --context $CLUSTER_RECREATED get pods -n $CAMUNDA_NAMESPACE_RECREATED ``` Port-forwarding the Zeebe Gateway via `kubectl` and printing the topology should reveal that the **failback** brokers have joined the cluster. ```bash -ZEEBE_GATEWAY_SERVICE=$(kubectl --context $CLUSTER_0 get service --selector=app\.kubernetes\.io/component=zeebe-gateway -o jsonpath='{.items[0].metadata.name}' -n $CAMUNDA_NAMESPACE_0) -kubectl --context $CLUSTER_0 port-forward services/$ZEEBE_GATEWAY_SERVICE 26500:26500 -n $CAMUNDA_NAMESPACE_0 +ZEEBE_GATEWAY_SERVICE=$(kubectl --context $CLUSTER_SURVIVING get service --selector=app\.kubernetes\.io/component=zeebe-gateway -o jsonpath='{.items[0].metadata.name}' -n $CAMUNDA_NAMESPACE_SURVIVING) +kubectl --context $CLUSTER_SURVIVING port-forward services/$ZEEBE_GATEWAY_SERVICE 26500:26500 -n $CAMUNDA_NAMESPACE_SURVIVING zbctl status --insecure --address localhost:26500 ``` @@ -466,19 +488,19 @@ This **does not** affect the processing of process instances in any way. The imp 1. Disable Operate and Tasklist by scaling to 0 ```bash -OPERATE_DEPLOYMENT=$(kubectl --context $CLUSTER_0 get deployment --selector=app\.kubernetes\.io/component=operate -o jsonpath='{.items[0].metadata.name}' -n $CAMUNDA_NAMESPACE_0) -TASKLIST_DEPLOYMENT=$(kubectl --context $CLUSTER_0 get deployment --selector=app\.kubernetes\.io/component=tasklist -o jsonpath='{.items[0].metadata.name}' -n $CAMUNDA_NAMESPACE_0) +OPERATE_DEPLOYMENT=$(kubectl --context $CLUSTER_SURVIVING get deployment --selector=app\.kubernetes\.io/component=operate -o jsonpath='{.items[0].metadata.name}' -n $CAMUNDA_NAMESPACE_SURVIVING) +TASKLIST_DEPLOYMENT=$(kubectl --context $CLUSTER_SURVIVING get deployment --selector=app\.kubernetes\.io/component=tasklist -o jsonpath='{.items[0].metadata.name}' -n $CAMUNDA_NAMESPACE_SURVIVING) -kubectl --context $CLUSTER_0 scale deployments/$OPERATE_DEPLOYMENT --replicas 0 -kubectl --context $CLUSTER_0 scale deployments/$TASKLIST_DEPLOYMENT --replicas 0 +kubectl --context $CLUSTER_SURVIVING scale deployments/$OPERATE_DEPLOYMENT --replicas 0 +kubectl --context $CLUSTER_SURVIVING scale deployments/$TASKLIST_DEPLOYMENT --replicas 0 ``` 2. 
Disable the Zeebe Elasticsearch exporters in Zeebe via kubectl ```bash -ZEEBE_GATEWAY_SERVICE=$(kubectl --context $CLUSTER_0 get service --selector=app\.kubernetes\.io/component=zeebe-gateway -o jsonpath='{.items[0].metadata.name}' -n $CAMUNDA_NAMESPACE_0) -kubectl --context $CLUSTER_0 port-forward services/$ZEEBE_GATEWAY_SERVICE 9600:9600 -n $CAMUNDA_NAMESPACE_0 +ZEEBE_GATEWAY_SERVICE=$(kubectl --context $CLUSTER_SURVIVING get service --selector=app\.kubernetes\.io/component=zeebe-gateway -o jsonpath='{.items[0].metadata.name}' -n $CAMUNDA_NAMESPACE_SURVIVING) +kubectl --context $CLUSTER_SURVIVING port-forward services/$ZEEBE_GATEWAY_SERVICE 9600:9600 -n $CAMUNDA_NAMESPACE_SURVIVING curl -i localhost:9600/actuator/exporting/pause -XPOST # The successful response should be: # HTTP/1.1 204 No Content @@ -489,7 +511,7 @@ curl -i localhost:9600/actuator/exporting/pause -XPOST For Operate and Tasklist, you can confirm that the deployments have successfully scaled down by listing those and indicating `0/0` ready. ```bash -kubectl --context $CLUSTER_0 get deployments $OPERATE_DEPLOYMENT $TASKLIST_DEPLOYMENT -n $CAMUNDA_NAMESPACE_0 +kubectl --context $CLUSTER_SURVIVING get deployments $OPERATE_DEPLOYMENT $TASKLIST_DEPLOYMENT -n $CAMUNDA_NAMESPACE_SURVIVING # NAME READY UP-TO-DATE AVAILABLE AGE # camunda-operate 0/0 0 0 23m # camunda-tasklist 0/0 0 0 23m @@ -534,11 +556,11 @@ The procedure works for other cloud providers and bare metal the same. You have export S3_BUCKET_NAME=$(terraform output -raw s3_bucket_name) ``` -2. Configure Elasticsearch backup endpoint in the healthy namespace `CAMUNDA_NAMESPACE_0` +2. Configure Elasticsearch backup endpoint in the surviving namespace `CAMUNDA_NAMESPACE_SURVIVING` ```bash -ELASTIC_POD=$(kubectl --context $CLUSTER_0 get pod --selector=app\.kubernetes\.io/name=elasticsearch -o jsonpath='{.items[0].metadata.name}' -n $CAMUNDA_NAMESPACE_0) -kubectl --context $CLUSTER_0 exec -n $CAMUNDA_NAMESPACE_0 -it $ELASTIC_POD -- curl -XPUT "http://localhost:9200/_snapshot/camunda_backup" -H "Content-Type: application/json" -d' +ELASTIC_POD=$(kubectl --context $CLUSTER_SURVIVING get pod --selector=app\.kubernetes\.io/name=elasticsearch -o jsonpath='{.items[0].metadata.name}' -n $CAMUNDA_NAMESPACE_SURVIVING) +kubectl --context $CLUSTER_SURVIVING exec -n $CAMUNDA_NAMESPACE_SURVIVING -it $ELASTIC_POD -- curl -XPUT "http://localhost:9200/_snapshot/camunda_backup" -H "Content-Type: application/json" -d' { "type": "s3", "settings": { @@ -550,17 +572,17 @@ kubectl --context $CLUSTER_0 exec -n $CAMUNDA_NAMESPACE_0 -it $ELASTIC_POD -- cu ' ``` -3. Create an Elasticsearch backup in the healthy namespace `CAMUNDA_NAMESPACE_0`. Depending on the amount of data, this operation will take a while to complete. +3. Create an Elasticsearch backup in the surviving namespace `CAMUNDA_NAMESPACE_SURVIVING`. Depending on the amount of data, this operation will take a while to complete. ```bash # The backup will be called failback -kubectl --context $CLUSTER_0 exec -n $CAMUNDA_NAMESPACE_0 -it $ELASTIC_POD -- curl -XPUT "http://localhost:9200/_snapshot/camunda_backup/failback?wait_for_completion=true" +kubectl --context $CLUSTER_SURVIVING exec -n $CAMUNDA_NAMESPACE_SURVIVING -it $ELASTIC_POD -- curl -XPUT "http://localhost:9200/_snapshot/camunda_backup/failback?wait_for_completion=true" ``` 4. 
Verify that the backup has been completed successfully by checking all backups and ensuring the `state` is `SUCCESS` ```bash -kubectl --context $CLUSTER_0 exec -n $CAMUNDA_NAMESPACE_0 -it $ELASTIC_POD -- curl -XGET "http://localhost:9200/_snapshot/camunda_backup/_all" +kubectl --context $CLUSTER_SURVIVING exec -n $CAMUNDA_NAMESPACE_SURVIVING -it $ELASTIC_POD -- curl -XGET "http://localhost:9200/_snapshot/camunda_backup/_all" ```
@@ -645,11 +667,11 @@ kubectl --context $CLUSTER_0 exec -n $CAMUNDA_NAMESPACE_0 -it $ELASTIC_POD -- cu
-5. Configure Elasticsearch backup endpoint in the new region namespace `CAMUNDA_NAMESPACE_1`. It's essential to only do this step now as otherwise it won't see the backup. +5. Configure Elasticsearch backup endpoint in the new region namespace `CAMUNDA_NAMESPACE_RECREATED`. It's essential to only do this step now as otherwise it won't see the backup. ```bash -ELASTIC_POD=$(kubectl --context $CLUSTER_1 get pod --selector=app\.kubernetes\.io/name=elasticsearch -o jsonpath='{.items[0].metadata.name}' -n $CAMUNDA_NAMESPACE_1) -kubectl --context $CLUSTER_1 exec -n $CAMUNDA_NAMESPACE_1 -it $ELASTIC_POD -- curl -XPUT "http://localhost:9200/_snapshot/camunda_backup" -H "Content-Type: application/json" -d' +ELASTIC_POD=$(kubectl --context $CLUSTER_RECREATED get pod --selector=app\.kubernetes\.io/name=elasticsearch -o jsonpath='{.items[0].metadata.name}' -n $CAMUNDA_NAMESPACE_RECREATED) +kubectl --context $CLUSTER_RECREATED exec -n $CAMUNDA_NAMESPACE_RECREATED -it $ELASTIC_POD -- curl -XPUT "http://localhost:9200/_snapshot/camunda_backup" -H "Content-Type: application/json" -d' { "type": "s3", "settings": { @@ -664,21 +686,21 @@ kubectl --context $CLUSTER_1 exec -n $CAMUNDA_NAMESPACE_1 -it $ELASTIC_POD -- cu 6. Verify that the backup can be found in the shared S3 bucket ```bash -kubectl --context $CLUSTER_1 exec -n $CAMUNDA_NAMESPACE_1 -it $ELASTIC_POD -- curl -XGET "http://localhost:9200/_snapshot/camunda_backup/_all" +kubectl --context $CLUSTER_RECREATED exec -n $CAMUNDA_NAMESPACE_RECREATED -it $ELASTIC_POD -- curl -XGET "http://localhost:9200/_snapshot/camunda_backup/_all" ``` The example output above should be the same since it's the same backup. -7. Restore Elasticsearch backup in the new region namespace `CAMUNDA_NAMESPACE_1`. Depending on the amount of data, this operation will take a while to complete. +7. Restore Elasticsearch backup in the new region namespace `CAMUNDA_NAMESPACE_RECREATED`. Depending on the amount of data, this operation will take a while to complete. ```bash -kubectl --context $CLUSTER_1 exec -n $CAMUNDA_NAMESPACE_1 -it $ELASTIC_POD -- curl -XPOST "http://localhost:9200/_snapshot/camunda_backup/failback/_restore?wait_for_completion=true" +kubectl --context $CLUSTER_RECREATED exec -n $CAMUNDA_NAMESPACE_RECREATED -it $ELASTIC_POD -- curl -XPOST "http://localhost:9200/_snapshot/camunda_backup/failback/_restore?wait_for_completion=true" ``` 8. Verify that the restore has been completed successfully in the new region. ```bash -kubectl --context $CLUSTER_1 exec -n $CAMUNDA_NAMESPACE_1 -it $ELASTIC_POD -- curl -XGET "http://localhost:9200/_snapshot/camunda_backup/failback/_status" +kubectl --context $CLUSTER_RECREATED exec -n $CAMUNDA_NAMESPACE_RECREATED -it $ELASTIC_POD -- curl -XGET "http://localhost:9200/_snapshot/camunda_backup/failback/_status" ```
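As an additional, optional sanity check of the restore, you can compare the total document counts of both Elasticsearch instances. This is only a sketch; the `ELASTIC_POD_SURVIVING` and `ELASTIC_POD_RECREATED` helper variables are introduced solely for this snippet, and `_cat/count` is a standard Elasticsearch API. While the Zeebe exporters are paused and Operate and Tasklist are scaled down, the two totals should match once the restore has finished.

```bash
# Resolve the Elasticsearch pods in both regions (helper variables for this check only).
ELASTIC_POD_SURVIVING=$(kubectl --context $CLUSTER_SURVIVING get pod --selector=app\.kubernetes\.io/name=elasticsearch -o jsonpath='{.items[0].metadata.name}' -n $CAMUNDA_NAMESPACE_SURVIVING)
ELASTIC_POD_RECREATED=$(kubectl --context $CLUSTER_RECREATED get pod --selector=app\.kubernetes\.io/name=elasticsearch -o jsonpath='{.items[0].metadata.name}' -n $CAMUNDA_NAMESPACE_RECREATED)

# Compare the total document count of the surviving and the restored Elasticsearch.
kubectl --context $CLUSTER_SURVIVING exec -n $CAMUNDA_NAMESPACE_SURVIVING -it $ELASTIC_POD_SURVIVING -- curl -XGET "http://localhost:9200/_cat/count?v"
kubectl --context $CLUSTER_RECREATED exec -n $CAMUNDA_NAMESPACE_RECREATED -it $ELASTIC_POD_RECREATED -- curl -XGET "http://localhost:9200/_cat/count?v"
```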
@@ -796,39 +818,39 @@ Please use the following to change the existing environment variable ZEEBE_BROKE 2. As the script suggests, replace the environment variables within the `camunda-values-failover.yml`. 3. Repeat the adjustments for the base Helm values file `camunda-values.yml` in `aws/dual-region/kubernetes` with the same output for the mentioned environment variables. -4. Upgrade the normal Camunda environment in `CAMUNDA_NAMESPACE_0` and `REGION 0` to point to the new Elasticsearch +4. Upgrade the normal Camunda environment in `CAMUNDA_NAMESPACE_SURVIVING` and `REGION_SURVIVING` to point to the new Elasticsearch ```bash helm upgrade $HELM_RELEASE_NAME camunda/camunda-platform \ --version $HELM_CHART_VERSION \ - --kube-context $CLUSTER_0 \ - --namespace $CAMUNDA_NAMESPACE_0 \ + --kube-context $CLUSTER_SURVIVING \ + --namespace $CAMUNDA_NAMESPACE_SURVIVING \ -f camunda-values.yml \ - -f region0/camunda-values.yml \ + -f $REGION_SURVIVING/camunda-values.yml \ --set operate.enabled=false \ --set tasklist.enabled=false ``` -5. Upgrade the failover Camunda environment in `CAMUNDA_NAMESPACE_0_FAILOVER` and `REGION 0` to point to the new Elasticsearch +5. Upgrade the failover Camunda environment in `CAMUNDA_NAMESPACE_FAILOVER` and `REGION_SURVIVING` to point to the new Elasticsearch ```bash helm upgrade $HELM_RELEASE_NAME camunda/camunda-platform \ --version $HELM_CHART_VERSION \ - --kube-context $CLUSTER_0 \ - --namespace $CAMUNDA_NAMESPACE_0_FAILOVER \ + --kube-context $CLUSTER_SURVIVING \ + --namespace $CAMUNDA_NAMESPACE_FAILOVER \ -f camunda-values.yml \ - -f region0/camunda-values-failover.yml + -f $REGION_SURVIVING/camunda-values-failover.yml ``` -6. Upgrade the new region environment in `CAMUNDA_NAMESPACE_1` and `REGION 1` to point to the new Elasticsearch +6. Upgrade the new region environment in `CAMUNDA_NAMESPACE_RECREATED` and `REGION_RECREATED` to point to the new Elasticsearch ```bash helm upgrade $HELM_RELEASE_NAME camunda/camunda-platform \ --version $HELM_CHART_VERSION \ - --kube-context $CLUSTER_1 \ - --namespace $CAMUNDA_NAMESPACE_1 \ + --kube-context $CLUSTER_RECREATED \ + --namespace $CAMUNDA_NAMESPACE_RECREATED \ -f camunda-values.yml \ - -f region1/camunda-values.yml \ + -f $REGION_RECREATED/camunda-values.yml \ --set global.multiregion.installationType=failBack \ --set operate.enabled=false \ --set tasklist.enabled=false @@ -837,7 +859,7 @@ helm upgrade $HELM_RELEASE_NAME camunda/camunda-platform \ 7. Delete the sleeping pods in the new region, as those are blocking a successful rollout due to the failback mode. ```bash -kubectl --context $CLUSTER_1 --namespace $CAMUNDA_NAMESPACE_1 delete pods --selector=app\.kubernetes\.io/component=zeebe-broker +kubectl --context $CLUSTER_RECREATED --namespace $CAMUNDA_NAMESPACE_RECREATED delete pods --selector=app\.kubernetes\.io/component=zeebe-broker ``` #### Verification @@ -845,13 +867,13 @@ kubectl --context $CLUSTER_1 --namespace $CAMUNDA_NAMESPACE_1 delete pods --sele The following command will show the deployed pods of the namespaces. You should see that the Zeebe brokers are restarting. Adjusting the command for the other cluster and namespaces should reveal the same. 
```bash -kubectl --context $CLUSTER_0 get pods -n $CAMUNDA_NAMESPACE_0 +kubectl --context $CLUSTER_SURVIVING get pods -n $CAMUNDA_NAMESPACE_SURVIVING ``` Alternatively, you can check that the Elasticsearch value was updated in the [StatefulSets](https://kubernetes.io/docs/concepts/workloads/controllers/statefulset/) configuration of the Zeebe brokers and are reflecting the previous output of the script `generate_zeebe_helm_values.sh` in **Step 1**. ```bash -kubectl --context $CLUSTER_0 get statefulsets $HELM_RELEASE_NAME-zeebe -oyaml -n $CAMUNDA_NAMESPACE_0 | grep -A1 'ZEEBE_BROKER_EXPORTERS_ELASTICSEARCHREGION[0-1]_ARGS_URL' +kubectl --context $CLUSTER_SURVIVING get statefulsets $HELM_RELEASE_NAME-zeebe -oyaml -n $CAMUNDA_NAMESPACE_SURVIVING | grep -A1 'ZEEBE_BROKER_EXPORTERS_ELASTICSEARCHREGION[0-1]_ARGS_URL' ```
@@ -892,34 +914,34 @@ You are reactivating the exporters and enabling Operate and Tasklist again withi #### How to get there -1. Upgrade the normal Camunda environment in `CAMUNDA_NAMESPACE_0` and `REGION 0` to deploy Operate and Tasklist. +1. Upgrade the normal Camunda environment in `CAMUNDA_NAMESPACE_SURVIVING` and `REGION_SURVIVING` to deploy Operate and Tasklist. ```bash helm upgrade $HELM_RELEASE_NAME camunda/camunda-platform \ --version $HELM_CHART_VERSION \ - --kube-context $CLUSTER_0 \ - --namespace $CAMUNDA_NAMESPACE_0 \ + --kube-context $CLUSTER_SURVIVING \ + --namespace $CAMUNDA_NAMESPACE_SURVIVING \ -f camunda-values.yml \ - -f region0/camunda-values.yml + -f $REGION_SURVIVING/camunda-values.yml ``` -2. Upgrade the new region environment in `CAMUNDA_NAMESPACE_1` and `REGION 1` to deploy Operate and Tasklist. +2. Upgrade the new region environment in `CAMUNDA_NAMESPACE_RECREATED` and `REGION_RECREATED` to deploy Operate and Tasklist. ```bash helm upgrade $HELM_RELEASE_NAME camunda/camunda-platform \ --version $HELM_CHART_VERSION \ - --kube-context $CLUSTER_1 \ - --namespace $CAMUNDA_NAMESPACE_1 \ + --kube-context $CLUSTER_RECREATED \ + --namespace $CAMUNDA_NAMESPACE_RECREATED \ -f camunda-values.yml \ - -f region1/camunda-values.yml \ + -f $REGION_RECREATED/camunda-values.yml \ --set global.multiregion.installationType=failBack ``` 3. Reactivate the exporters by sending the API activation request via the Zeebe Gateway ```bash -ZEEBE_GATEWAY_SERVICE=$(kubectl --context $CLUSTER_0 get service --selector=app\.kubernetes\.io/component=zeebe-gateway -o jsonpath='{.items[0].metadata.name}' -n $CAMUNDA_NAMESPACE_0) -kubectl --context $CLUSTER_0 port-forward services/$ZEEBE_GATEWAY_SERVICE 9600:9600 -n $CAMUNDA_NAMESPACE_0 +ZEEBE_GATEWAY_SERVICE=$(kubectl --context $CLUSTER_SURVIVING get service --selector=app\.kubernetes\.io/component=zeebe-gateway -o jsonpath='{.items[0].metadata.name}' -n $CAMUNDA_NAMESPACE_SURVIVING) +kubectl --context $CLUSTER_SURVIVING port-forward services/$ZEEBE_GATEWAY_SERVICE 9600:9600 -n $CAMUNDA_NAMESPACE_SURVIVING curl -i localhost:9600/actuator/exporting/resume -XPOST # The successful response should be: # HTTP/1.1 204 No Content @@ -927,10 +949,10 @@ curl -i localhost:9600/actuator/exporting/resume -XPOST #### Verification -For Operate and Tasklist, you can confirm that the deployments have successfully been deployed by listing those and indicating `1/1` ready. The same command can be applied for the `CLUSTER_1` and `CAMUNDA_NAMESPACE_1`. +For Operate and Tasklist, you can confirm that the deployments have successfully been deployed by listing those and indicating `1/1` ready. The same command can be applied for the `CLUSTER_RECREATED` and `CAMUNDA_NAMESPACE_RECREATED`. ```bash -kubectl --context $CLUSTER_0 get deployments -n $CAMUNDA_NAMESPACE_0 +kubectl --context $CLUSTER_SURVIVING get deployments -n $CAMUNDA_NAMESPACE_SURVIVING # NAME READY UP-TO-DATE AVAILABLE AGE # camunda-operate 1/1 1 1 3h24m # camunda-tasklist 1/1 1 1 3h24m @@ -965,13 +987,13 @@ You can remove the temporary failover solution since it is not required anymore 1. You can uninstall the failover installation via Helm. ```bash -helm uninstall $HELM_RELEASE_NAME --kube-context $CLUSTER_0 --namespace $CAMUNDA_NAMESPACE_0_FAILOVER +helm uninstall $HELM_RELEASE_NAME --kube-context $CLUSTER_SURVIVING --namespace $CAMUNDA_NAMESPACE_FAILOVER ``` 2. 
Delete the leftover persistent volume claims of the Camunda 8 components ```bash -kubectl --context $CLUSTER_0 delete pvc --all -n $CAMUNDA_NAMESPACE_0_FAILOVER +kubectl --context $CLUSTER_SURVIVING delete pvc --all -n $CAMUNDA_NAMESPACE_FAILOVER ``` #### Verification @@ -979,14 +1001,14 @@ kubectl --context $CLUSTER_0 delete pvc --all -n $CAMUNDA_NAMESPACE_0_FAILOVER The following will show the pods within the namespace. You deleted the Helm installation in the failover namespace, which should result in no pods or in deletion state. ```bash -kubectl --context $CLUSTER_0 get pods -n $CAMUNDA_NAMESPACE_0_FAILOVER +kubectl --context $CLUSTER_SURVIVING get pods -n $CAMUNDA_NAMESPACE_FAILOVER ``` Port-forwarding the Zeebe Gateway via `kubectl` and printing the topology should reveal that the failover brokers are missing. ```bash -ZEEBE_GATEWAY_SERVICE=$(kubectl --context $CLUSTER_0 get service --selector=app\.kubernetes\.io/component=zeebe-gateway -o jsonpath='{.items[0].metadata.name}' -n $CAMUNDA_NAMESPACE_0) -kubectl --context $CLUSTER_0 port-forward services/$ZEEBE_GATEWAY_SERVICE 26500:26500 -n $CAMUNDA_NAMESPACE_0 +ZEEBE_GATEWAY_SERVICE=$(kubectl --context $CLUSTER_SURVIVING get service --selector=app\.kubernetes\.io/component=zeebe-gateway -o jsonpath='{.items[0].metadata.name}' -n $CAMUNDA_NAMESPACE_SURVIVING) +kubectl --context $CLUSTER_SURVIVING port-forward services/$ZEEBE_GATEWAY_SERVICE 26500:26500 -n $CAMUNDA_NAMESPACE_SURVIVING zbctl status --insecure --address localhost:26500 ``` @@ -1017,21 +1039,21 @@ With this done Zeebe is fully functional again and you are prepared in case of a #### How to get there -1. Upgrade the new region environment in `CAMUNDA_NAMESPACE_1` and `REGION 1` by removing the failback mode +1. Upgrade the new region environment in `CAMUNDA_NAMESPACE_RECREATED` and `REGION_RECREATED` by removing the failback mode ```bash helm upgrade $HELM_RELEASE_NAME camunda/camunda-platform \ --version $HELM_CHART_VERSION \ - --kube-context $CLUSTER_1 \ - --namespace $CAMUNDA_NAMESPACE_1 \ + --kube-context $CLUSTER_RECREATED \ + --namespace $CAMUNDA_NAMESPACE_RECREATED \ -f camunda-values.yml \ - -f region1/camunda-values.yml + -f $REGION_RECREATED/camunda-values.yml ``` 2. Delete the sleeping pods in the new region, as those are blocking a successful rollout due to the failback mode. ```bash -kubectl --context $CLUSTER_1 --namespace $CAMUNDA_NAMESPACE_1 delete pods --selector=app\.kubernetes\.io/component=zeebe-broker +kubectl --context $CLUSTER_RECREATED --namespace $CAMUNDA_NAMESPACE_RECREATED delete pods --selector=app\.kubernetes\.io/component=zeebe-broker ``` #### Verification @@ -1039,8 +1061,8 @@ kubectl --context $CLUSTER_1 --namespace $CAMUNDA_NAMESPACE_1 delete pods --sele Port-forwarding the Zeebe Gateway via `kubectl` and printing the topology should reveal that all brokers have joined the Zeebe cluster again. 
```bash -ZEEBE_GATEWAY_SERVICE=$(kubectl --context $CLUSTER_0 get service --selector=app\.kubernetes\.io/component=zeebe-gateway -o jsonpath='{.items[0].metadata.name}' -n $CAMUNDA_NAMESPACE_0) -kubectl --context $CLUSTER_0 port-forward services/$ZEEBE_GATEWAY_SERVICE 26500:26500 -n $CAMUNDA_NAMESPACE_0 +ZEEBE_GATEWAY_SERVICE=$(kubectl --context $CLUSTER_SURVIVING get service --selector=app\.kubernetes\.io/component=zeebe-gateway -o jsonpath='{.items[0].metadata.name}' -n $CAMUNDA_NAMESPACE_SURVIVING) +kubectl --context $CLUSTER_SURVIVING port-forward services/$ZEEBE_GATEWAY_SERVICE 26500:26500 -n $CAMUNDA_NAMESPACE_SURVIVING zbctl status --insecure --address localhost:26500 ``` From fafed3324c338f652204931008a721a647152146 Mon Sep 17 00:00:00 2001 From: Langleu Date: Wed, 3 Apr 2024 11:48:31 +0200 Subject: [PATCH 12/13] docs(dual-region): align disclaimer with concept page --- .../operational-guides/multi-region/dual-region-ops.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/self-managed/operational-guides/multi-region/dual-region-ops.md b/docs/self-managed/operational-guides/multi-region/dual-region-ops.md index d14caea3adf..73a9e0eb944 100644 --- a/docs/self-managed/operational-guides/multi-region/dual-region-ops.md +++ b/docs/self-managed/operational-guides/multi-region/dual-region-ops.md @@ -39,7 +39,7 @@ Before proceeding with the operational procedure, you must thoroughly read and c :::danger - Customers must develop and test the below-described operational procedure in non-production environments based on the framework steps outlined by Camunda, **before applying them in production setups**. -- Before advancing to production go-live, customers need to validate these procedures with Camunda. +- Before advancing to production go-live, validating these procedures with Camunda is strongly recommended. - Customers are solely responsible for detecting any regional failures and implementing the necessary described operational procedure. ::: From 362553391411cdaa42508c0115640f83d0404927 Mon Sep 17 00:00:00 2001 From: Langleu Date: Wed, 3 Apr 2024 14:18:49 +0200 Subject: [PATCH 13/13] docs(dual-region): add termonology and tabs for env preqs --- .../multi-region/dual-region-ops.md | 35 +++++++++++++++++-- 1 file changed, 32 insertions(+), 3 deletions(-) diff --git a/docs/self-managed/operational-guides/multi-region/dual-region-ops.md b/docs/self-managed/operational-guides/multi-region/dual-region-ops.md index 73a9e0eb944..968eaa4a12b 100644 --- a/docs/self-managed/operational-guides/multi-region/dual-region-ops.md +++ b/docs/self-managed/operational-guides/multi-region/dual-region-ops.md @@ -51,6 +51,16 @@ Before proceeding with the operational procedure, you must thoroughly read and c - [Kubectl (1.28.x)](https://kubernetes.io/docs/tasks/tools/#kubectl) to interact with the Kubernetes cluster. - [zbctl](./../../../apis-tools/cli-client/index.md) to interact with the Zeebe cluster. +## Terminology + +- **Surviving region** + - A surviving region refers to a region within a dual-region setup that remains operational and unaffected by a failure or disaster that affects other regions. +- **Lost region** + - A lost region refers to a region within a dual-region setup that becomes unavailable or unusable due to a failure or disaster. +- **Recreated region** + - A recreated region refers to a region within a dual-region setup that was previously lost but has been restored or recreated to resume its operational state. 
- We assume this region contains no Camunda 8 deployments or related persistent volumes. Please ensure this is the case before executing the **failback** procedure.

## Procedure

We don't differentiate between active and passive regions as the procedure is the same for either loss. We will focus on losing the passive region while still having the active region.

@@ -81,7 +91,23 @@ Please ensure to have followed the point in [environment prerequisites](./../../

We will try to refrain from always mentioning both possible scenarios (losing either Region 0 or Region 1). Instead, we generalized the commands and require you to do a one-time setup to configure environment variables that help execute the procedure based on the surviving region and the region to be recreated.

-The following is just an example based on our assumption that **Region 1** was lost and **Region 0** survived. Adjust those depending on your setup based on the previously exported environment variables from the [AWS setup guide](../../platform-deployment/helm-kubernetes/platforms/amazon-eks/dual-region.md#environment-prerequisites). After adjusting, make sure to have exported them in your terminal.
+Depending on which region you lost, select the correct tab below and export those environment variables to your terminal to make executing the procedure easier.
+
+
+
+```bash
+export CLUSTER_SURVIVING=$CLUSTER_1
+export CLUSTER_RECREATED=$CLUSTER_0
+export CAMUNDA_NAMESPACE_SURVIVING=$CAMUNDA_NAMESPACE_1
+export CAMUNDA_NAMESPACE_FAILOVER=$CAMUNDA_NAMESPACE_1_FAILOVER
+export CAMUNDA_NAMESPACE_RECREATED=$CAMUNDA_NAMESPACE_0
+export REGION_SURVIVING=region1
+export REGION_RECREATED=region0
+```
+
+
+

```bash
export CLUSTER_SURVIVING=$CLUSTER_0
export CLUSTER_RECREATED=$CLUSTER_1
export CAMUNDA_NAMESPACE_SURVIVING=$CAMUNDA_NAMESPACE_0
export CAMUNDA_NAMESPACE_FAILOVER=$CAMUNDA_NAMESPACE_0_FAILOVER
export CAMUNDA_NAMESPACE_RECREATED=$CAMUNDA_NAMESPACE_1
export REGION_SURVIVING=region0
export REGION_RECREATED=region1
```

+
+
+

### Failover

@@ -143,7 +172,7 @@ desired={}

#### Current state

You have previously ensured that the lost region cannot reconnect during the failover procedure.

-Due to the Zeebe data partitioning, no data has been lost.
+Due to the Zeebe data replication, no data has been lost.

#### Desired state

@@ -297,7 +326,7 @@ desired={}

#### Current state

-Zeebe is not yet be able to continue processing data since the Zeebe brokers in the surviving region are configured to point to the Elasticsearch instance of the lost region.
+Zeebe is not yet able to continue exporting data since the Zeebe brokers in the surviving region are configured to point to the Elasticsearch instance of the lost region.

:::info