From 67ec3f535c2ab615359d55352d025dbd2d6cfd63 Mon Sep 17 00:00:00 2001
From: David Nix <hello@davidnix.io>
Date: Thu, 16 Feb 2023 11:05:49 -0700
Subject: [PATCH 1/4] First pass of self healing types

---
 api/v1/cosmosfullnode_types.go                |  6 +++
 api/v1/self_healing_types.go                  | 34 +++++++++++++++
 api/v1/zz_generated.deepcopy.go               | 41 +++++++++++++++++++
 .../cosmos.strange.love_cosmosfullnodes.yaml  | 38 +++++++++++++++++
 4 files changed, 119 insertions(+)
 create mode 100644 api/v1/self_healing_types.go

diff --git a/api/v1/cosmosfullnode_types.go b/api/v1/cosmosfullnode_types.go
index c3f8c9eb..0e2c9a24 100644
--- a/api/v1/cosmosfullnode_types.go
+++ b/api/v1/cosmosfullnode_types.go
@@ -84,6 +84,12 @@ type FullNodeSpec struct {
 	// Used for debugging.
 	// +optional
 	InstanceOverrides map[string]InstanceOverridesSpec `json:"instanceOverrides"`
+
+	// Strategies for automatic recovery of faults and errors.
+	// SelfHealing is managed by a separate controller, SelfHealingController, in an effort to reduce
+	// complexity of the CosmosFullNodeController.
+	// +optional
+	SelfHealing *SelfHealingSpec `json:"selfHealing"`
 }
 
 type FullNodeType string
diff --git a/api/v1/self_healing_types.go b/api/v1/self_healing_types.go
new file mode 100644
index 00000000..87f6674c
--- /dev/null
+++ b/api/v1/self_healing_types.go
@@ -0,0 +1,34 @@
+package v1
+
+import "k8s.io/apimachinery/pkg/util/intstr"
+
+// SelfHealingSpec is part of a CosmosFullNode but is managed by a separate controller, SelfHealingController.
+// This is an effort to reduce complexity in the CosmosFullNodeController.
+type SelfHealingSpec struct {
+	// Determines when to destroy and recreate a replica (aka instance, pod/pvc combo) that is failing.
+	// Occasionally, data may become corrupt and the chain exits and cannot restart.
+	// This strategy only watches the pods "node" container running the `start` command, and only for
+	// pods that are crashlooping.
+	//
+	// This pairs well with volumeClaimTemplate.autoDataSource and a ScheduledVolumeSnapshot resource.
+	// With this pairing, a new PVC is created with a recent VolumeSnapshot.
+	// Otherwise, ensure your snapshot, genesis, etc. creation are idempotent.
+	// (e.g. chain.snapshotURL and chain.genesisURL have stable urls)
+	// +optional
+	ReplicaFaultRecovery *ReplicaFaultRecoverySpec `json:"replicaFaultRecovery"`
+}
+
+type ReplicaFaultRecoverySpec struct {
+	// How many healthy pods are required to trigger destroying a crashlooping pod and pvc.
+	// The controller periodically inspects the status of all pods.
+	// If the majority of pods are crashlooping, then there's probably something else wrong, and recreating
+	// the pod and pvc will have no effect.
+	// Set an integer or a percentage string such as 50%.
+	// If the threshold is too high, defaults to watching for 1 unhealthy pod.
+	HealthyThreshold intstr.IntOrString `json:"healthyThreshold"`
+
+	// How many restarts to wait before destroying and recreating an unhealthy replica.
+	// Defaults to 5.
+	// +optional
+	RestartThreshold int32 `json:"restartThreshold"`
+}
diff --git a/api/v1/zz_generated.deepcopy.go b/api/v1/zz_generated.deepcopy.go
index bb95beaa..5295138b 100644
--- a/api/v1/zz_generated.deepcopy.go
+++ b/api/v1/zz_generated.deepcopy.go
@@ -210,6 +210,11 @@ func (in *FullNodeSpec) DeepCopyInto(out *FullNodeSpec) {
 			(*out)[key] = *val.DeepCopy()
 		}
 	}
+	if in.SelfHealing != nil {
+		in, out := &in.SelfHealing, &out.SelfHealing
+		*out = new(SelfHealingSpec)
+		(*in).DeepCopyInto(*out)
+	}
 }
 
 // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new FullNodeSpec.
@@ -453,6 +458,22 @@ func (in *RPCServiceSpec) DeepCopy() *RPCServiceSpec {
 	return out
 }
 
+// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
+func (in *ReplicaFaultRecoverySpec) DeepCopyInto(out *ReplicaFaultRecoverySpec) {
+	*out = *in
+	out.HealthyThreshold = in.HealthyThreshold
+}
+
+// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ReplicaFaultRecoverySpec.
+func (in *ReplicaFaultRecoverySpec) DeepCopy() *ReplicaFaultRecoverySpec {
+	if in == nil {
+		return nil
+	}
+	out := new(ReplicaFaultRecoverySpec)
+	in.DeepCopyInto(out)
+	return out
+}
+
 // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
 func (in *RolloutStrategy) DeepCopyInto(out *RolloutStrategy) {
 	*out = *in
@@ -503,6 +524,26 @@ func (in *SDKAppConfig) DeepCopy() *SDKAppConfig {
 	return out
 }
 
+// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
+func (in *SelfHealingSpec) DeepCopyInto(out *SelfHealingSpec) {
+	*out = *in
+	if in.ReplicaFaultRecovery != nil {
+		in, out := &in.ReplicaFaultRecovery, &out.ReplicaFaultRecovery
+		*out = new(ReplicaFaultRecoverySpec)
+		**out = **in
+	}
+}
+
+// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new SelfHealingSpec.
+func (in *SelfHealingSpec) DeepCopy() *SelfHealingSpec {
+	if in == nil {
+		return nil
+	}
+	out := new(SelfHealingSpec)
+	in.DeepCopyInto(out)
+	return out
+}
+
 // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
 func (in *ServiceSpec) DeepCopyInto(out *ServiceSpec) {
 	*out = *in
diff --git a/config/crd/bases/cosmos.strange.love_cosmosfullnodes.yaml b/config/crd/bases/cosmos.strange.love_cosmosfullnodes.yaml
index fcba963d..782032eb 100644
--- a/config/crd/bases/cosmos.strange.love_cosmosfullnodes.yaml
+++ b/config/crd/bases/cosmos.strange.love_cosmosfullnodes.yaml
@@ -1466,6 +1466,44 @@ spec:
                 format: int32
                 minimum: 0
                 type: integer
+              selfHealing:
+                description: Strategies for automatic recovery of faults and errors.
+                  SelfHealing is managed by a separate controller, SelfHealingController,
+                  in an effort to reduce complexity of the CosmosFullNodeController.
+                properties:
+                  replicaFaultRecovery:
+                    description: "Determines when to destroy and recreate a replica
+                      (aka instance, pod/pvc combo) that is failing. Occasionally,
+                      data may become corrupt and the chain exits and cannot restart.
+                      This strategy only watches the pods \"node\" container running
+                      the `start` command, and only for pods that are crashlooping.
+                      \n This pairs well with volumeClaimTemplate.autoDataSource and
+                      a ScheduledVolumeSnapshot resource. With this pairing, a new
+                      PVC is created with a recent VolumeSnapshot. Otherwise, ensure
+                      your snapshot, genesis, etc. creation are idempotent. (e.g.
+                      chain.snapshotURL and chain.genesisURL have stable urls)"
+                    properties:
+                      healthyThreshold:
+                        anyOf:
+                        - type: integer
+                        - type: string
+                        description: How many healthy pods are required to trigger
+                          destroying a crashlooping pod and pvc. The controller periodically
+                          inspects the status of all pods. If the majority of pods
+                          are crashlooping, then there's probably something else wrong,
+                          and recreating the pod and pvc will have no effect. Set
+                          an integer or a percentage string such as 50%. If the threshold
+                          is too high, defaults to watching for 1 unhealthy pod.
+                        x-kubernetes-int-or-string: true
+                      restartThreshold:
+                        description: How many restarts to wait before destroying and
+                          recreating an unhealthy replica. Defaults to 5.
+                        format: int32
+                        type: integer
+                    required:
+                    - healthyThreshold
+                    type: object
+                type: object
               service:
                 description: Configure Operator created services. A singe rpc service
                   is created for load balancing api, grpc, rpc, etc. requests. This

From 294c516ae0b073a0e8fd423267fefff26681a87d Mon Sep 17 00:00:00 2001
From: David Nix <hello@davidnix.io>
Date: Thu, 16 Feb 2023 16:35:56 -0700
Subject: [PATCH 2/4] More api design tweaking

---
 api/v1/self_healing_types.go                  | 20 ++++++----
 api/v1/zz_generated.deepcopy.go               | 38 +++++++++---------
 .../cosmos.strange.love_cosmosfullnodes.yaml  | 39 +++++++++++--------
 3 files changed, 54 insertions(+), 43 deletions(-)

diff --git a/api/v1/self_healing_types.go b/api/v1/self_healing_types.go
index 87f6674c..1eeb1734 100644
--- a/api/v1/self_healing_types.go
+++ b/api/v1/self_healing_types.go
@@ -5,26 +5,32 @@ import "k8s.io/apimachinery/pkg/util/intstr"
 // SelfHealingSpec is part of a CosmosFullNode but is managed by a separate controller, SelfHealingController.
 // This is an effort to reduce complexity in the CosmosFullNodeController.
 type SelfHealingSpec struct {
-	// Determines when to destroy and recreate a replica (aka instance, pod/pvc combo) that is failing.
+	// Determines when to destroy and recreate a replica (aka pod/pvc combo) that is crashlooping.
 	// Occasionally, data may become corrupt and the chain exits and cannot restart.
-	// This strategy only watches the pods "node" container running the `start` command, and only for
-	// pods that are crashlooping.
+	// This strategy only watches the pods' "node" containers running the `start` command.
 	//
 	// This pairs well with volumeClaimTemplate.autoDataSource and a ScheduledVolumeSnapshot resource.
 	// With this pairing, a new PVC is created with a recent VolumeSnapshot.
 	// Otherwise, ensure your snapshot, genesis, etc. creation are idempotent.
 	// (e.g. chain.snapshotURL and chain.genesisURL have stable urls)
+	//
+	// This feature may be extended to detect other failed pod states instead of just crashloops.
 	// +optional
-	ReplicaFaultRecovery *ReplicaFaultRecoverySpec `json:"replicaFaultRecovery"`
+	PodFaultRecovery *PodFaultRecovery `json:"podFaultRecovery"`
 }
 
-type ReplicaFaultRecoverySpec struct {
+type PodFaultRecovery struct {
 	// How many healthy pods are required to trigger destroying a crashlooping pod and pvc.
+	// Set an integer or a percentage string such as 50%.
+	//
+	// This setting attempts to minimize false positives in order to detect data corruption instead of
+	// a variety of other reasons for crashloops.
 	// The controller periodically inspects the status of all pods.
 	// If the majority of pods are crashlooping, then there's probably something else wrong, and recreating
 	// the pod and pvc will have no effect.
-	// Set an integer or a percentage string such as 50%.
-	// If the threshold is too high, defaults to watching for 1 unhealthy pod.
+	//
+	// If the threshold is too high, defaults to recovering 1 unhealthy pod, the rest must be healthy.
+	// It's not recommended to use this feature with only 1 replica.
 	HealthyThreshold intstr.IntOrString `json:"healthyThreshold"`
 
 	// How many restarts to wait before destroying and recreating an unhealthy replica.
diff --git a/api/v1/zz_generated.deepcopy.go b/api/v1/zz_generated.deepcopy.go
index 5295138b..9cfc3cd0 100644
--- a/api/v1/zz_generated.deepcopy.go
+++ b/api/v1/zz_generated.deepcopy.go
@@ -345,6 +345,22 @@ func (in *PersistentVolumeClaimSpec) DeepCopy() *PersistentVolumeClaimSpec {
 	return out
 }
 
+// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
+func (in *PodFaultRecovery) DeepCopyInto(out *PodFaultRecovery) {
+	*out = *in
+	out.HealthyThreshold = in.HealthyThreshold
+}
+
+// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new PodFaultRecovery.
+func (in *PodFaultRecovery) DeepCopy() *PodFaultRecovery {
+	if in == nil {
+		return nil
+	}
+	out := new(PodFaultRecovery)
+	in.DeepCopyInto(out)
+	return out
+}
+
 // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
 func (in *PodSpec) DeepCopyInto(out *PodSpec) {
 	*out = *in
@@ -458,22 +474,6 @@ func (in *RPCServiceSpec) DeepCopy() *RPCServiceSpec {
 	return out
 }
 
-// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
-func (in *ReplicaFaultRecoverySpec) DeepCopyInto(out *ReplicaFaultRecoverySpec) {
-	*out = *in
-	out.HealthyThreshold = in.HealthyThreshold
-}
-
-// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ReplicaFaultRecoverySpec.
-func (in *ReplicaFaultRecoverySpec) DeepCopy() *ReplicaFaultRecoverySpec {
-	if in == nil {
-		return nil
-	}
-	out := new(ReplicaFaultRecoverySpec)
-	in.DeepCopyInto(out)
-	return out
-}
-
 // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
 func (in *RolloutStrategy) DeepCopyInto(out *RolloutStrategy) {
 	*out = *in
@@ -527,9 +527,9 @@ func (in *SDKAppConfig) DeepCopy() *SDKAppConfig {
 // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
 func (in *SelfHealingSpec) DeepCopyInto(out *SelfHealingSpec) {
 	*out = *in
-	if in.ReplicaFaultRecovery != nil {
-		in, out := &in.ReplicaFaultRecovery, &out.ReplicaFaultRecovery
-		*out = new(ReplicaFaultRecoverySpec)
+	if in.PodFaultRecovery != nil {
+		in, out := &in.PodFaultRecovery, &out.PodFaultRecovery
+		*out = new(PodFaultRecovery)
 		**out = **in
 	}
 }
diff --git a/config/crd/bases/cosmos.strange.love_cosmosfullnodes.yaml b/config/crd/bases/cosmos.strange.love_cosmosfullnodes.yaml
index 782032eb..6caefa37 100644
--- a/config/crd/bases/cosmos.strange.love_cosmosfullnodes.yaml
+++ b/config/crd/bases/cosmos.strange.love_cosmosfullnodes.yaml
@@ -1471,29 +1471,34 @@ spec:
                   SelfHealing is managed by a separate controller, SelfHealingController,
                   in an effort to reduce complexity of the CosmosFullNodeController.
                 properties:
-                  replicaFaultRecovery:
+                  podFaultRecovery:
                     description: "Determines when to destroy and recreate a replica
-                      (aka instance, pod/pvc combo) that is failing. Occasionally,
-                      data may become corrupt and the chain exits and cannot restart.
-                      This strategy only watches the pods \"node\" container running
-                      the `start` command, and only for pods that are crashlooping.
-                      \n This pairs well with volumeClaimTemplate.autoDataSource and
-                      a ScheduledVolumeSnapshot resource. With this pairing, a new
-                      PVC is created with a recent VolumeSnapshot. Otherwise, ensure
-                      your snapshot, genesis, etc. creation are idempotent. (e.g.
-                      chain.snapshotURL and chain.genesisURL have stable urls)"
+                      (aka pod/pvc combo) that is crashlooping. Occasionally, data
+                      may become corrupt and the chain exits and cannot restart. This
+                      strategy only watches the pods' \"node\" containers running
+                      the `start` command. \n This pairs well with volumeClaimTemplate.autoDataSource
+                      and a ScheduledVolumeSnapshot resource. With this pairing, a
+                      new PVC is created with a recent VolumeSnapshot. Otherwise,
+                      ensure your snapshot, genesis, etc. creation are idempotent.
+                      (e.g. chain.snapshotURL and chain.genesisURL have stable urls)
+                      \n This feature may be extended to detect other failed pod states
+                      instead of just crashloops."
                     properties:
                       healthyThreshold:
                         anyOf:
                         - type: integer
                         - type: string
-                        description: How many healthy pods are required to trigger
-                          destroying a crashlooping pod and pvc. The controller periodically
-                          inspects the status of all pods. If the majority of pods
-                          are crashlooping, then there's probably something else wrong,
-                          and recreating the pod and pvc will have no effect. Set
-                          an integer or a percentage string such as 50%. If the threshold
-                          is too high, defaults to watching for 1 unhealthy pod.
+                        description: "How many healthy pods are required to trigger
+                          destroying a crashlooping pod and pvc. Set an integer or
+                          a percentage string such as 50%. \n This setting attempts
+                          to minimize false positives in order to detect data corruption
+                          instead of a variety of other reasons for crashloops. The
+                          controller periodically inspects the status of all pods.
+                          If the majority of pods are crashlooping, then there's probably
+                          something else wrong, and recreating the pod and pvc will
+                          have no effect. \n If the threshold is too high, defaults
+                          to recovering 1 unhealthy pod, the rest must be healthy.
+                          It's not recommended to use this feature with only 1 replica."
                         x-kubernetes-int-or-string: true
                       restartThreshold:
                         description: How many restarts to wait before destroying and

From fd8affbd2c2838704e04f18c2145b1c5b8d58aac Mon Sep 17 00:00:00 2001
From: David Nix <hello@davidnix.io>
Date: Thu, 16 Feb 2023 16:46:33 -0700
Subject: [PATCH 3/4] More tweaking

---
 api/v1/self_healing_types.go                  |  9 +++++----
 .../cosmos.strange.love_cosmosfullnodes.yaml  | 20 ++++++++++---------
 2 files changed, 16 insertions(+), 13 deletions(-)

diff --git a/api/v1/self_healing_types.go b/api/v1/self_healing_types.go
index 1eeb1734..7d332b7e 100644
--- a/api/v1/self_healing_types.go
+++ b/api/v1/self_healing_types.go
@@ -22,11 +22,12 @@ type SelfHealingSpec struct {
 type PodFaultRecovery struct {
 	// How many healthy pods are required to trigger destroying a crashlooping pod and pvc.
 	// Set an integer or a percentage string such as 50%.
+	// Example: If you set to 80% and there are 10 total pods, at least 8 must be healthy to trigger the recovery.
+	// Fractional values are rounded down.
 	//
-	// This setting attempts to minimize false positives in order to detect data corruption instead of
-	// a variety of other reasons for crashloops.
-	// The controller periodically inspects the status of all pods.
-	// If the majority of pods are crashlooping, then there's probably something else wrong, and recreating
+	// This setting attempts to minimize false positives in order to detect data corruption vs.
+	// endless other reasons for unhealthy pods.
+	// If the majority of pods are unhealthy, then there's probably something else wrong, and recreating
 	// the pod and pvc will have no effect.
 	//
 	// If the threshold is too high, defaults to recovering 1 unhealthy pod, the rest must be healthy.
diff --git a/config/crd/bases/cosmos.strange.love_cosmosfullnodes.yaml b/config/crd/bases/cosmos.strange.love_cosmosfullnodes.yaml
index 6caefa37..faf4c9ed 100644
--- a/config/crd/bases/cosmos.strange.love_cosmosfullnodes.yaml
+++ b/config/crd/bases/cosmos.strange.love_cosmosfullnodes.yaml
@@ -1490,15 +1490,17 @@ spec:
                         - type: string
                         description: "How many healthy pods are required to trigger
                           destroying a crashlooping pod and pvc. Set an integer or
-                          a percentage string such as 50%. \n This setting attempts
-                          to minimize false positives in order to detect data corruption
-                          instead of a variety of other reasons for crashloops. The
-                          controller periodically inspects the status of all pods.
-                          If the majority of pods are crashlooping, then there's probably
-                          something else wrong, and recreating the pod and pvc will
-                          have no effect. \n If the threshold is too high, defaults
-                          to recovering 1 unhealthy pod, the rest must be healthy.
-                          It's not recommended to use this feature with only 1 replica."
+                          a percentage string such as 50%. Example: If you set to
+                          80% and there are 10 total pods, at least 8 must be healthy
+                          to trigger the recovery. Fractional values are rounded down.
+                          \n This setting attempts to minimize false positives in
+                          order to detect data corruption vs. endless other reasons
+                          for unhealthy pods. If the majority of pods are unhealthy,
+                          then there's probably something else wrong, and recreating
+                          the pod and pvc will have no effect. \n If the threshold
+                          is too high, defaults to recovering 1 unhealthy pod, the
+                          rest must be healthy. It's not recommended to use this feature
+                          with only 1 replica."
                         x-kubernetes-int-or-string: true
                       restartThreshold:
                         description: How many restarts to wait before destroying and

From eebe0ad8bc27142e29ec7ddb2ba5d01995388865 Mon Sep 17 00:00:00 2001
From: David Nix <hello@davidnix.io>
Date: Fri, 17 Feb 2023 13:35:08 -0700
Subject: [PATCH 4/4] Maybe settle on a final name

---
 api/v1/self_healing_types.go                  | 13 +++----
 api/v1/zz_generated.deepcopy.go               | 38 +++++++++----------
 .../cosmos.strange.love_cosmosfullnodes.yaml  | 24 +++++-------
 3 files changed, 34 insertions(+), 41 deletions(-)

diff --git a/api/v1/self_healing_types.go b/api/v1/self_healing_types.go
index 7d332b7e..89beb34c 100644
--- a/api/v1/self_healing_types.go
+++ b/api/v1/self_healing_types.go
@@ -14,27 +14,24 @@ type SelfHealingSpec struct {
 	// Otherwise, ensure your snapshot, genesis, etc. creation are idempotent.
 	// (e.g. chain.snapshotURL and chain.genesisURL have stable urls)
 	//
-	// This feature may be extended to detect other failed pod states instead of just crashloops.
 	// +optional
-	PodFaultRecovery *PodFaultRecovery `json:"podFaultRecovery"`
+	CrashLoopRecovery *CrashLoopRecovery `json:"crashLoopRecovery"`
 }
 
-type PodFaultRecovery struct {
+type CrashLoopRecovery struct {
 	// How many healthy pods are required to trigger destroying a crashlooping pod and pvc.
 	// Set an integer or a percentage string such as 50%.
 	// Example: If you set to 80% and there are 10 total pods, at least 8 must be healthy to trigger the recovery.
-	// Fractional values are rounded down.
+	// Fractional values are rounded down, but the minimum is 1.
+	// It's not recommended to use this feature with only 1 replica.
 	//
 	// This setting attempts to minimize false positives in order to detect data corruption vs.
 	// endless other reasons for unhealthy pods.
 	// If the majority of pods are unhealthy, then there's probably something else wrong, and recreating
 	// the pod and pvc will have no effect.
-	//
-	// If the threshold is too high, defaults to recovering 1 unhealthy pod, the rest must be healthy.
-	// It's not recommended to use this feature with only 1 replica.
 	HealthyThreshold intstr.IntOrString `json:"healthyThreshold"`
 
-	// How many restarts to wait before destroying and recreating an unhealthy replica.
+	// How many restarts to wait before destroying and recreating the unhealthy replica.
 	// Defaults to 5.
 	// +optional
 	RestartThreshold int32 `json:"restartThreshold"`
diff --git a/api/v1/zz_generated.deepcopy.go b/api/v1/zz_generated.deepcopy.go
index 9cfc3cd0..d226a0f9 100644
--- a/api/v1/zz_generated.deepcopy.go
+++ b/api/v1/zz_generated.deepcopy.go
@@ -160,6 +160,22 @@ func (in *CosmosFullNodeList) DeepCopyObject() runtime.Object {
 	return nil
 }
 
+// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
+func (in *CrashLoopRecovery) DeepCopyInto(out *CrashLoopRecovery) {
+	*out = *in
+	out.HealthyThreshold = in.HealthyThreshold
+}
+
+// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new CrashLoopRecovery.
+func (in *CrashLoopRecovery) DeepCopy() *CrashLoopRecovery {
+	if in == nil {
+		return nil
+	}
+	out := new(CrashLoopRecovery)
+	in.DeepCopyInto(out)
+	return out
+}
+
 // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
 func (in *FullNodeProbesSpec) DeepCopyInto(out *FullNodeProbesSpec) {
 	*out = *in
@@ -345,22 +361,6 @@ func (in *PersistentVolumeClaimSpec) DeepCopy() *PersistentVolumeClaimSpec {
 	return out
 }
 
-// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
-func (in *PodFaultRecovery) DeepCopyInto(out *PodFaultRecovery) {
-	*out = *in
-	out.HealthyThreshold = in.HealthyThreshold
-}
-
-// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new PodFaultRecovery.
-func (in *PodFaultRecovery) DeepCopy() *PodFaultRecovery {
-	if in == nil {
-		return nil
-	}
-	out := new(PodFaultRecovery)
-	in.DeepCopyInto(out)
-	return out
-}
-
 // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
 func (in *PodSpec) DeepCopyInto(out *PodSpec) {
 	*out = *in
@@ -527,9 +527,9 @@ func (in *SDKAppConfig) DeepCopy() *SDKAppConfig {
 // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
 func (in *SelfHealingSpec) DeepCopyInto(out *SelfHealingSpec) {
 	*out = *in
-	if in.PodFaultRecovery != nil {
-		in, out := &in.PodFaultRecovery, &out.PodFaultRecovery
-		*out = new(PodFaultRecovery)
+	if in.CrashLoopRecovery != nil {
+		in, out := &in.CrashLoopRecovery, &out.CrashLoopRecovery
+		*out = new(CrashLoopRecovery)
 		**out = **in
 	}
 }
diff --git a/config/crd/bases/cosmos.strange.love_cosmosfullnodes.yaml b/config/crd/bases/cosmos.strange.love_cosmosfullnodes.yaml
index faf4c9ed..13877864 100644
--- a/config/crd/bases/cosmos.strange.love_cosmosfullnodes.yaml
+++ b/config/crd/bases/cosmos.strange.love_cosmosfullnodes.yaml
@@ -1471,7 +1471,7 @@ spec:
                   SelfHealing is managed by a separate controller, SelfHealingController,
                   in an effort to reduce complexity of the CosmosFullNodeController.
                 properties:
-                  podFaultRecovery:
+                  crashLoopRecovery:
                     description: "Determines when to destroy and recreate a replica
                       (aka pod/pvc combo) that is crashlooping. Occasionally, data
                       may become corrupt and the chain exits and cannot restart. This
@@ -1480,9 +1480,7 @@ spec:
                       and a ScheduledVolumeSnapshot resource. With this pairing, a
                       new PVC is created with a recent VolumeSnapshot. Otherwise,
                       ensure your snapshot, genesis, etc. creation are idempotent.
-                      (e.g. chain.snapshotURL and chain.genesisURL have stable urls)
-                      \n This feature may be extended to detect other failed pod states
-                      instead of just crashloops."
+                      (e.g. chain.snapshotURL and chain.genesisURL have stable urls)"
                     properties:
                       healthyThreshold:
                         anyOf:
@@ -1492,19 +1490,17 @@ spec:
                           destroying a crashlooping pod and pvc. Set an integer or
                           a percentage string such as 50%. Example: If you set to
                           80% and there are 10 total pods, at least 8 must be healthy
-                          to trigger the recovery. Fractional values are rounded down.
-                          \n This setting attempts to minimize false positives in
-                          order to detect data corruption vs. endless other reasons
-                          for unhealthy pods. If the majority of pods are unhealthy,
-                          then there's probably something else wrong, and recreating
-                          the pod and pvc will have no effect. \n If the threshold
-                          is too high, defaults to recovering 1 unhealthy pod, the
-                          rest must be healthy. It's not recommended to use this feature
-                          with only 1 replica."
+                          to trigger the recovery. Fractional values are rounded down,
+                          but the minimum is 1. It's not recommended to use this feature
+                          with only 1 replica. \n This setting attempts to minimize
+                          false positives in order to detect data corruption vs. endless
+                          other reasons for unhealthy pods. If the majority of pods
+                          are unhealthy, then there's probably something else wrong,
+                          and recreating the pod and pvc will have no effect."
                         x-kubernetes-int-or-string: true
                       restartThreshold:
                         description: How many restarts to wait before destroying and
-                          recreating an unhealthy replica. Defaults to 5.
+                          recreating the unhealthy replica. Defaults to 5.
                         format: int32
                         type: integer
                     required: