strangelove-ventures · DavidNix · Feb 16, 2023 · Feb 16, 2023 · Feb 16, 2023 · Feb 17, 2023
diff --git a/api/v1/cosmosfullnode_types.go b/api/v1/cosmosfullnode_types.go
@@ -84,6 +84,12 @@ type FullNodeSpec struct {
 	// Used for debugging.
 	// +optional
 	InstanceOverrides map[string]InstanceOverridesSpec `json:"instanceOverrides"`
+
+	// Strategies for automatic recovery of faults and errors.
+	// SelfHealing is managed by a separate controller, SelfHealingController, in an effort to reduce
+	// complexity of the CosmosFullNodeController.
+	// +optional
+	SelfHealing *SelfHealingSpec `json:"selfHealing"`
 }
 
 type FullNodeType string

diff --git a/api/v1/self_healing_types.go b/api/v1/self_healing_types.go
@@ -0,0 +1,38 @@
+package v1
+
+import "k8s.io/apimachinery/pkg/util/intstr"
+
+// SelfHealingSpec holds strategies for automatic recovery of faults and errors. It is part of a CosmosFullNode
+// but is managed by a separate controller, SelfHealingController, to reduce complexity in the CosmosFullNodeController.
+type SelfHealingSpec struct {
+	// Determines when to destroy and recreate a replica (aka pod/pvc combo) that is crashlooping.
+	// Occasionally, data may become corrupt and the chain exits and cannot restart.
+	// This strategy only watches the pods' "node" containers running the `start` command.
+	//
+	// This pairs well with volumeClaimTemplate.autoDataSource and a ScheduledVolumeSnapshot resource.
+	// With this pairing, a new PVC is created with a recent VolumeSnapshot.
+	// Otherwise, ensure your snapshot, genesis, etc. creation are idempotent.
+	// (e.g. chain.snapshotURL and chain.genesisURL have stable urls)
+	//
+	// +optional
+	CrashLoopRecovery *CrashLoopRecovery `json:"crashLoopRecovery"`
+}
+
+// CrashLoopRecovery configures when a crashlooping replica (pod and pvc) is destroyed and recreated.
+// HealthyThreshold must be set; RestartThreshold is optional.
+type CrashLoopRecovery struct {
+	// How many healthy pods are required to trigger destroying a crashlooping pod and pvc.
+	// Set an integer or a percentage string such as 50%.
+	// Example: If you set to 80% and there are 10 total pods, at least 8 must be healthy to trigger the recovery.
+	// Fractional values are rounded down, but the minimum is 1.
+	// It's not recommended to use this feature with only 1 replica.
+	//
+	// This setting attempts to minimize false positives in order to detect data corruption vs.
+	// endless other reasons for unhealthy pods.
+	// If the majority of pods are unhealthy, then there's probably something else wrong, and recreating
+	// the pod and pvc will have no effect.
+	HealthyThreshold intstr.IntOrString `json:"healthyThreshold"`
+
+	// How many restarts to wait before destroying and recreating the unhealthy replica.
+	// Defaults to 5.
+	// +optional
+	RestartThreshold int32 `json:"restartThreshold"`
+}
diff --git a/api/v1/zz_generated.deepcopy.go b/api/v1/zz_generated.deepcopy.go
diff --git a/config/crd/bases/cosmos.strange.love_cosmosfullnodes.yaml b/config/crd/bases/cosmos.strange.love_cosmosfullnodes.yaml
@@ -1466,6 +1466,47 @@ spec:
                 format: int32
                 minimum: 0
                 type: integer
+              selfHealing:
+                description: Strategies for automatic recovery of faults and errors.
+                  SelfHealing is managed by a separate controller, SelfHealingController,
+                  in an effort to reduce complexity of the CosmosFullNodeController.
+                properties:
+                  crashLoopRecovery:
+                    description: "Determines when to destroy and recreate a replica
+                      (aka pod/pvc combo) that is crashlooping. Occasionally, data
+                      may become corrupt and the chain exits and cannot restart. This
+                      strategy only watches the pods' \"node\" containers running
+                      the `start` command. \n This pairs well with volumeClaimTemplate.autoDataSource
+                      and a ScheduledVolumeSnapshot resource. With this pairing, a
+                      new PVC is created with a recent VolumeSnapshot. Otherwise,
+                      ensure your snapshot, genesis, etc. creation are idempotent.
+                      (e.g. chain.snapshotURL and chain.genesisURL have stable urls)"
+                    properties:
+                      healthyThreshold:
+                        anyOf:
+                        - type: integer
+                        - type: string
+                        description: "How many healthy pods are required to trigger
+                          destroying a crashlooping pod and pvc. Set an integer or
+                          a percentage string such as 50%. Example: If you set to
+                          80% and there are 10 total pods, at least 8 must be healthy
+                          to trigger the recovery. Fractional values are rounded down,
+                          but the minimum is 1. It's not recommended to use this feature
+                          with only 1 replica. \n This setting attempts to minimize
+                          false positives in order to detect data corruption vs. endless
+                          other reasons for unhealthy pods. If the majority of pods
+                          are unhealthy, then there's probably something else wrong,
+                          and recreating the pod and pvc will have no effect."
+                        x-kubernetes-int-or-string: true
+                      restartThreshold:
+                        description: How many restarts to wait before destroying and
+                          recreating the unhealthy replica. Defaults to 5.
+                        format: int32
+                        type: integer
+                    required:
+                    - healthyThreshold
+                    type: object
+                type: object
               service:
                 description: Configure Operator created services. A single rpc service
                   is created for load balancing api, grpc, rpc, etc. requests. This