From 67ec3f535c2ab615359d55352d025dbd2d6cfd63 Mon Sep 17 00:00:00 2001 From: David Nix Date: Thu, 16 Feb 2023 11:05:49 -0700 Subject: [PATCH 1/4] First pass of self healing types --- api/v1/cosmosfullnode_types.go | 6 +++ api/v1/self_healing_types.go | 34 +++++++++++++++ api/v1/zz_generated.deepcopy.go | 41 +++++++++++++++++++ .../cosmos.strange.love_cosmosfullnodes.yaml | 38 +++++++++++++++++ 4 files changed, 119 insertions(+) create mode 100644 api/v1/self_healing_types.go diff --git a/api/v1/cosmosfullnode_types.go b/api/v1/cosmosfullnode_types.go index c3f8c9eb..0e2c9a24 100644 --- a/api/v1/cosmosfullnode_types.go +++ b/api/v1/cosmosfullnode_types.go @@ -84,6 +84,12 @@ type FullNodeSpec struct { // Used for debugging. // +optional InstanceOverrides map[string]InstanceOverridesSpec `json:"instanceOverrides"` + + // Strategies for automatic recovery of faults and errors. + // SelfHealing is managed by a separate controller, SelfHealingController, in an effort to reduce + // complexity of the CosmosFullNodeController. + // +optional + SelfHealing *SelfHealingSpec `json:"selfHealing"` } type FullNodeType string diff --git a/api/v1/self_healing_types.go b/api/v1/self_healing_types.go new file mode 100644 index 00000000..87f6674c --- /dev/null +++ b/api/v1/self_healing_types.go @@ -0,0 +1,34 @@ +package v1 + +import "k8s.io/apimachinery/pkg/util/intstr" + +// SelfHealingSpec is part of a CosmosFullNode but is managed by a separate controller, SelfHealingController. +// This is an effort to reduce complexity in the CosmosFullNodeController. +type SelfHealingSpec struct { + // Determines when to destroy and recreate a replica (aka instance, pod/pvc combo) that is failing. + // Occasionally, data may become corrupt and the chain exits and cannot restart. + // This strategy only watches the pods "node" container running the `start` command, and only for + // pods that are crashlooping. + // + // This pairs well with volumeClaimTemplate.autoDataSource and a ScheduledVolumeSnapshot resource. + // With this pairing, a new PVC is created with a recent VolumeSnapshot. + // Otherwise, ensure your snapshot, genesis, etc. creation are idempotent. + // (e.g. chain.snapshotURL and chain.genesisURL have stable urls) + // +optional + ReplicaFaultRecovery *ReplicaFaultRecoverySpec `json:"replicaFaultRecovery"` +} + +type ReplicaFaultRecoverySpec struct { + // How many healthy pods are required to trigger destroying a crashlooping pod and pvc. + // The controller periodically inspects the status of all pods. + // If the majority of pods are crashlooping, then there's probably something else wrong, and recreating + // the pod and pvc will have no effect. + // Set an integer or a percentage string such as 50%. + // If the threshold is too high, defaults to watching for 1 unhealthy pod. + HealthyThreshold intstr.IntOrString `json:"healthyThreshold"` + + // How many restarts to wait before destroying and recreating an unhealthy replica. + // Defaults to 5. + // +optional + RestartThreshold int32 `json:"restartThreshold"` +} diff --git a/api/v1/zz_generated.deepcopy.go b/api/v1/zz_generated.deepcopy.go index bb95beaa..5295138b 100644 --- a/api/v1/zz_generated.deepcopy.go +++ b/api/v1/zz_generated.deepcopy.go @@ -210,6 +210,11 @@ func (in *FullNodeSpec) DeepCopyInto(out *FullNodeSpec) { (*out)[key] = *val.DeepCopy() } } + if in.SelfHealing != nil { + in, out := &in.SelfHealing, &out.SelfHealing + *out = new(SelfHealingSpec) + (*in).DeepCopyInto(*out) + } } // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new FullNodeSpec. @@ -453,6 +458,22 @@ func (in *RPCServiceSpec) DeepCopy() *RPCServiceSpec { return out } +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *ReplicaFaultRecoverySpec) DeepCopyInto(out *ReplicaFaultRecoverySpec) { + *out = *in + out.HealthyThreshold = in.HealthyThreshold +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ReplicaFaultRecoverySpec. +func (in *ReplicaFaultRecoverySpec) DeepCopy() *ReplicaFaultRecoverySpec { + if in == nil { + return nil + } + out := new(ReplicaFaultRecoverySpec) + in.DeepCopyInto(out) + return out +} + // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *RolloutStrategy) DeepCopyInto(out *RolloutStrategy) { *out = *in @@ -503,6 +524,26 @@ func (in *SDKAppConfig) DeepCopy() *SDKAppConfig { return out } +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *SelfHealingSpec) DeepCopyInto(out *SelfHealingSpec) { + *out = *in + if in.ReplicaFaultRecovery != nil { + in, out := &in.ReplicaFaultRecovery, &out.ReplicaFaultRecovery + *out = new(ReplicaFaultRecoverySpec) + **out = **in + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new SelfHealingSpec. +func (in *SelfHealingSpec) DeepCopy() *SelfHealingSpec { + if in == nil { + return nil + } + out := new(SelfHealingSpec) + in.DeepCopyInto(out) + return out +} + // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *ServiceSpec) DeepCopyInto(out *ServiceSpec) { *out = *in diff --git a/config/crd/bases/cosmos.strange.love_cosmosfullnodes.yaml b/config/crd/bases/cosmos.strange.love_cosmosfullnodes.yaml index fcba963d..782032eb 100644 --- a/config/crd/bases/cosmos.strange.love_cosmosfullnodes.yaml +++ b/config/crd/bases/cosmos.strange.love_cosmosfullnodes.yaml @@ -1466,6 +1466,44 @@ spec: format: int32 minimum: 0 type: integer + selfHealing: + description: Strategies for automatic recovery of faults and errors. + SelfHealing is managed by a separate controller, SelfHealingController, + in an effort to reduce complexity of the CosmosFullNodeController. + properties: + replicaFaultRecovery: + description: "Determines when to destroy and recreate a replica + (aka instance, pod/pvc combo) that is failing. Occasionally, + data may become corrupt and the chain exits and cannot restart. + This strategy only watches the pods \"node\" container running + the `start` command, and only for pods that are crashlooping. + \n This pairs well with volumeClaimTemplate.autoDataSource and + a ScheduledVolumeSnapshot resource. With this pairing, a new + PVC is created with a recent VolumeSnapshot. Otherwise, ensure + your snapshot, genesis, etc. creation are idempotent. (e.g. + chain.snapshotURL and chain.genesisURL have stable urls)" + properties: + healthyThreshold: + anyOf: + - type: integer + - type: string + description: How many healthy pods are required to trigger + destroying a crashlooping pod and pvc. The controller periodically + inspects the status of all pods. If the majority of pods + are crashlooping, then there's probably something else wrong, + and recreating the pod and pvc will have no effect. Set + an integer or a percentage string such as 50%. If the threshold + is too high, defaults to watching for 1 unhealthy pod. + x-kubernetes-int-or-string: true + restartThreshold: + description: How many restarts to wait before destroying and + recreating an unhealthy replica. Defaults to 5. + format: int32 + type: integer + required: + - healthyThreshold + type: object + type: object service: description: Configure Operator created services. A singe rpc service is created for load balancing api, grpc, rpc, etc. requests. This From 294c516ae0b073a0e8fd423267fefff26681a87d Mon Sep 17 00:00:00 2001 From: David Nix Date: Thu, 16 Feb 2023 16:35:56 -0700 Subject: [PATCH 2/4] More api design tweaking --- api/v1/self_healing_types.go | 20 ++++++---- api/v1/zz_generated.deepcopy.go | 38 +++++++++--------- .../cosmos.strange.love_cosmosfullnodes.yaml | 39 +++++++++++-------- 3 files changed, 54 insertions(+), 43 deletions(-) diff --git a/api/v1/self_healing_types.go b/api/v1/self_healing_types.go index 87f6674c..1eeb1734 100644 --- a/api/v1/self_healing_types.go +++ b/api/v1/self_healing_types.go @@ -5,26 +5,32 @@ import "k8s.io/apimachinery/pkg/util/intstr" // SelfHealingSpec is part of a CosmosFullNode but is managed by a separate controller, SelfHealingController. // This is an effort to reduce complexity in the CosmosFullNodeController. type SelfHealingSpec struct { - // Determines when to destroy and recreate a replica (aka instance, pod/pvc combo) that is failing. + // Determines when to destroy and recreate a replica (aka pod/pvc combo) that is crashlooping. // Occasionally, data may become corrupt and the chain exits and cannot restart. - // This strategy only watches the pods "node" container running the `start` command, and only for - // pods that are crashlooping. + // This strategy only watches the pods' "node" containers running the `start` command. // // This pairs well with volumeClaimTemplate.autoDataSource and a ScheduledVolumeSnapshot resource. // With this pairing, a new PVC is created with a recent VolumeSnapshot. // Otherwise, ensure your snapshot, genesis, etc. creation are idempotent. // (e.g. chain.snapshotURL and chain.genesisURL have stable urls) + // + // This feature may be extended to detect other failed pod states instead of just crashloops. // +optional - ReplicaFaultRecovery *ReplicaFaultRecoverySpec `json:"replicaFaultRecovery"` + PodFaultRecovery *PodFaultRecovery `json:"podFaultRecovery"` } -type ReplicaFaultRecoverySpec struct { +type PodFaultRecovery struct { // How many healthy pods are required to trigger destroying a crashlooping pod and pvc. + // Set an integer or a percentage string such as 50%. + // + // This setting attempts to minimize false positives in order to detect data corruption instead of + // a variety of other reasons for crashloops. // The controller periodically inspects the status of all pods. // If the majority of pods are crashlooping, then there's probably something else wrong, and recreating // the pod and pvc will have no effect. - // Set an integer or a percentage string such as 50%. - // If the threshold is too high, defaults to watching for 1 unhealthy pod. + // + // If the threshold is too high, defaults to recovering 1 unhealthy pod, the rest must be healthy. + // It's not recommended to use this feature with only 1 replica. HealthyThreshold intstr.IntOrString `json:"healthyThreshold"` // How many restarts to wait before destroying and recreating an unhealthy replica. diff --git a/api/v1/zz_generated.deepcopy.go b/api/v1/zz_generated.deepcopy.go index 5295138b..9cfc3cd0 100644 --- a/api/v1/zz_generated.deepcopy.go +++ b/api/v1/zz_generated.deepcopy.go @@ -345,6 +345,22 @@ func (in *PersistentVolumeClaimSpec) DeepCopy() *PersistentVolumeClaimSpec { return out } +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *PodFaultRecovery) DeepCopyInto(out *PodFaultRecovery) { + *out = *in + out.HealthyThreshold = in.HealthyThreshold +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new PodFaultRecovery. +func (in *PodFaultRecovery) DeepCopy() *PodFaultRecovery { + if in == nil { + return nil + } + out := new(PodFaultRecovery) + in.DeepCopyInto(out) + return out +} + // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *PodSpec) DeepCopyInto(out *PodSpec) { *out = *in @@ -458,22 +474,6 @@ func (in *RPCServiceSpec) DeepCopy() *RPCServiceSpec { return out } -// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. -func (in *ReplicaFaultRecoverySpec) DeepCopyInto(out *ReplicaFaultRecoverySpec) { - *out = *in - out.HealthyThreshold = in.HealthyThreshold -} - -// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ReplicaFaultRecoverySpec. -func (in *ReplicaFaultRecoverySpec) DeepCopy() *ReplicaFaultRecoverySpec { - if in == nil { - return nil - } - out := new(ReplicaFaultRecoverySpec) - in.DeepCopyInto(out) - return out -} - // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *RolloutStrategy) DeepCopyInto(out *RolloutStrategy) { *out = *in @@ -527,9 +527,9 @@ func (in *SDKAppConfig) DeepCopy() *SDKAppConfig { // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *SelfHealingSpec) DeepCopyInto(out *SelfHealingSpec) { *out = *in - if in.ReplicaFaultRecovery != nil { - in, out := &in.ReplicaFaultRecovery, &out.ReplicaFaultRecovery - *out = new(ReplicaFaultRecoverySpec) + if in.PodFaultRecovery != nil { + in, out := &in.PodFaultRecovery, &out.PodFaultRecovery + *out = new(PodFaultRecovery) **out = **in } } diff --git a/config/crd/bases/cosmos.strange.love_cosmosfullnodes.yaml b/config/crd/bases/cosmos.strange.love_cosmosfullnodes.yaml index 782032eb..6caefa37 100644 --- a/config/crd/bases/cosmos.strange.love_cosmosfullnodes.yaml +++ b/config/crd/bases/cosmos.strange.love_cosmosfullnodes.yaml @@ -1471,29 +1471,34 @@ spec: SelfHealing is managed by a separate controller, SelfHealingController, in an effort to reduce complexity of the CosmosFullNodeController. properties: - replicaFaultRecovery: + podFaultRecovery: description: "Determines when to destroy and recreate a replica - (aka instance, pod/pvc combo) that is failing. Occasionally, - data may become corrupt and the chain exits and cannot restart. - This strategy only watches the pods \"node\" container running - the `start` command, and only for pods that are crashlooping. - \n This pairs well with volumeClaimTemplate.autoDataSource and - a ScheduledVolumeSnapshot resource. With this pairing, a new - PVC is created with a recent VolumeSnapshot. Otherwise, ensure - your snapshot, genesis, etc. creation are idempotent. (e.g. - chain.snapshotURL and chain.genesisURL have stable urls)" + (aka pod/pvc combo) that is crashlooping. Occasionally, data + may become corrupt and the chain exits and cannot restart. This + strategy only watches the pods' \"node\" containers running + the `start` command. \n This pairs well with volumeClaimTemplate.autoDataSource + and a ScheduledVolumeSnapshot resource. With this pairing, a + new PVC is created with a recent VolumeSnapshot. Otherwise, + ensure your snapshot, genesis, etc. creation are idempotent. + (e.g. chain.snapshotURL and chain.genesisURL have stable urls) + \n This feature may be extended to detect other failed pod states + instead of just crashloops." properties: healthyThreshold: anyOf: - type: integer - type: string - description: How many healthy pods are required to trigger - destroying a crashlooping pod and pvc. The controller periodically - inspects the status of all pods. If the majority of pods - are crashlooping, then there's probably something else wrong, - and recreating the pod and pvc will have no effect. Set - an integer or a percentage string such as 50%. If the threshold - is too high, defaults to watching for 1 unhealthy pod. + description: "How many healthy pods are required to trigger + destroying a crashlooping pod and pvc. Set an integer or + a percentage string such as 50%. \n This setting attempts + to minimize false positives in order to detect data corruption + instead of a variety of other reasons for crashloops. The + controller periodically inspects the status of all pods. + If the majority of pods are crashlooping, then there's probably + something else wrong, and recreating the pod and pvc will + have no effect. \n If the threshold is too high, defaults + to recovering 1 unhealthy pod, the rest must be healthy. + It's not recommended to use this feature with only 1 replica." x-kubernetes-int-or-string: true restartThreshold: description: How many restarts to wait before destroying and From fd8affbd2c2838704e04f18c2145b1c5b8d58aac Mon Sep 17 00:00:00 2001 From: David Nix Date: Thu, 16 Feb 2023 16:46:33 -0700 Subject: [PATCH 3/4] More tweaking --- api/v1/self_healing_types.go | 9 +++++---- .../cosmos.strange.love_cosmosfullnodes.yaml | 20 ++++++++++--------- 2 files changed, 16 insertions(+), 13 deletions(-) diff --git a/api/v1/self_healing_types.go b/api/v1/self_healing_types.go index 1eeb1734..7d332b7e 100644 --- a/api/v1/self_healing_types.go +++ b/api/v1/self_healing_types.go @@ -22,11 +22,12 @@ type SelfHealingSpec struct { type PodFaultRecovery struct { // How many healthy pods are required to trigger destroying a crashlooping pod and pvc. // Set an integer or a percentage string such as 50%. + // Example: If you set to 80% and there are 10 total pods, at least 8 must be healthy to trigger the recovery. + // Fractional values are rounded down. // - // This setting attempts to minimize false positives in order to detect data corruption instead of - // a variety of other reasons for crashloops. - // The controller periodically inspects the status of all pods. - // If the majority of pods are crashlooping, then there's probably something else wrong, and recreating + // This setting attempts to minimize false positives in order to detect data corruption vs. + // endless other reasons for unhealthy pods. + // If the majority of pods are unhealthy, then there's probably something else wrong, and recreating // the pod and pvc will have no effect. // // If the threshold is too high, defaults to recovering 1 unhealthy pod, the rest must be healthy. diff --git a/config/crd/bases/cosmos.strange.love_cosmosfullnodes.yaml b/config/crd/bases/cosmos.strange.love_cosmosfullnodes.yaml index 6caefa37..faf4c9ed 100644 --- a/config/crd/bases/cosmos.strange.love_cosmosfullnodes.yaml +++ b/config/crd/bases/cosmos.strange.love_cosmosfullnodes.yaml @@ -1490,15 +1490,17 @@ spec: - type: string description: "How many healthy pods are required to trigger destroying a crashlooping pod and pvc. Set an integer or - a percentage string such as 50%. \n This setting attempts - to minimize false positives in order to detect data corruption - instead of a variety of other reasons for crashloops. The - controller periodically inspects the status of all pods. - If the majority of pods are crashlooping, then there's probably - something else wrong, and recreating the pod and pvc will - have no effect. \n If the threshold is too high, defaults - to recovering 1 unhealthy pod, the rest must be healthy. - It's not recommended to use this feature with only 1 replica." + a percentage string such as 50%. Example: If you set to + 80% and there are 10 total pods, at least 8 must be healthy + to trigger the recovery. Fractional values are rounded down. + \n This setting attempts to minimize false positives in + order to detect data corruption vs. endless other reasons + for unhealthy pods. If the majority of pods are unhealthy, + then there's probably something else wrong, and recreating + the pod and pvc will have no effect. \n If the threshold + is too high, defaults to recovering 1 unhealthy pod, the + rest must be healthy. It's not recommended to use this feature + with only 1 replica." x-kubernetes-int-or-string: true restartThreshold: description: How many restarts to wait before destroying and From eebe0ad8bc27142e29ec7ddb2ba5d01995388865 Mon Sep 17 00:00:00 2001 From: David Nix Date: Fri, 17 Feb 2023 13:35:08 -0700 Subject: [PATCH 4/4] Maybe settle on a final name --- api/v1/self_healing_types.go | 13 +++---- api/v1/zz_generated.deepcopy.go | 38 +++++++++---------- .../cosmos.strange.love_cosmosfullnodes.yaml | 24 +++++------- 3 files changed, 34 insertions(+), 41 deletions(-) diff --git a/api/v1/self_healing_types.go b/api/v1/self_healing_types.go index 7d332b7e..89beb34c 100644 --- a/api/v1/self_healing_types.go +++ b/api/v1/self_healing_types.go @@ -14,27 +14,24 @@ type SelfHealingSpec struct { // Otherwise, ensure your snapshot, genesis, etc. creation are idempotent. // (e.g. chain.snapshotURL and chain.genesisURL have stable urls) // - // This feature may be extended to detect other failed pod states instead of just crashloops. // +optional - PodFaultRecovery *PodFaultRecovery `json:"podFaultRecovery"` + CrashLoopRecovery *CrashLoopRecovery `json:"crashLoopRecovery"` } -type PodFaultRecovery struct { +type CrashLoopRecovery struct { // How many healthy pods are required to trigger destroying a crashlooping pod and pvc. // Set an integer or a percentage string such as 50%. // Example: If you set to 80% and there are 10 total pods, at least 8 must be healthy to trigger the recovery. - // Fractional values are rounded down. + // Fractional values are rounded down, but the minimum is 1. + // It's not recommended to use this feature with only 1 replica. // // This setting attempts to minimize false positives in order to detect data corruption vs. // endless other reasons for unhealthy pods. // If the majority of pods are unhealthy, then there's probably something else wrong, and recreating // the pod and pvc will have no effect. - // - // If the threshold is too high, defaults to recovering 1 unhealthy pod, the rest must be healthy. - // It's not recommended to use this feature with only 1 replica. HealthyThreshold intstr.IntOrString `json:"healthyThreshold"` - // How many restarts to wait before destroying and recreating an unhealthy replica. + // How many restarts to wait before destroying and recreating the unhealthy replica. // Defaults to 5. // +optional RestartThreshold int32 `json:"restartThreshold"` diff --git a/api/v1/zz_generated.deepcopy.go b/api/v1/zz_generated.deepcopy.go index 9cfc3cd0..d226a0f9 100644 --- a/api/v1/zz_generated.deepcopy.go +++ b/api/v1/zz_generated.deepcopy.go @@ -160,6 +160,22 @@ func (in *CosmosFullNodeList) DeepCopyObject() runtime.Object { return nil } +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *CrashLoopRecovery) DeepCopyInto(out *CrashLoopRecovery) { + *out = *in + out.HealthyThreshold = in.HealthyThreshold +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new CrashLoopRecovery. +func (in *CrashLoopRecovery) DeepCopy() *CrashLoopRecovery { + if in == nil { + return nil + } + out := new(CrashLoopRecovery) + in.DeepCopyInto(out) + return out +} + // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *FullNodeProbesSpec) DeepCopyInto(out *FullNodeProbesSpec) { *out = *in @@ -345,22 +361,6 @@ func (in *PersistentVolumeClaimSpec) DeepCopy() *PersistentVolumeClaimSpec { return out } -// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. -func (in *PodFaultRecovery) DeepCopyInto(out *PodFaultRecovery) { - *out = *in - out.HealthyThreshold = in.HealthyThreshold -} - -// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new PodFaultRecovery. -func (in *PodFaultRecovery) DeepCopy() *PodFaultRecovery { - if in == nil { - return nil - } - out := new(PodFaultRecovery) - in.DeepCopyInto(out) - return out -} - // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *PodSpec) DeepCopyInto(out *PodSpec) { *out = *in @@ -527,9 +527,9 @@ func (in *SDKAppConfig) DeepCopy() *SDKAppConfig { // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *SelfHealingSpec) DeepCopyInto(out *SelfHealingSpec) { *out = *in - if in.PodFaultRecovery != nil { - in, out := &in.PodFaultRecovery, &out.PodFaultRecovery - *out = new(PodFaultRecovery) + if in.CrashLoopRecovery != nil { + in, out := &in.CrashLoopRecovery, &out.CrashLoopRecovery + *out = new(CrashLoopRecovery) **out = **in } } diff --git a/config/crd/bases/cosmos.strange.love_cosmosfullnodes.yaml b/config/crd/bases/cosmos.strange.love_cosmosfullnodes.yaml index faf4c9ed..13877864 100644 --- a/config/crd/bases/cosmos.strange.love_cosmosfullnodes.yaml +++ b/config/crd/bases/cosmos.strange.love_cosmosfullnodes.yaml @@ -1471,7 +1471,7 @@ spec: SelfHealing is managed by a separate controller, SelfHealingController, in an effort to reduce complexity of the CosmosFullNodeController. properties: - podFaultRecovery: + crashLoopRecovery: description: "Determines when to destroy and recreate a replica (aka pod/pvc combo) that is crashlooping. Occasionally, data may become corrupt and the chain exits and cannot restart. This @@ -1480,9 +1480,7 @@ spec: and a ScheduledVolumeSnapshot resource. With this pairing, a new PVC is created with a recent VolumeSnapshot. Otherwise, ensure your snapshot, genesis, etc. creation are idempotent. - (e.g. chain.snapshotURL and chain.genesisURL have stable urls) - \n This feature may be extended to detect other failed pod states - instead of just crashloops." + (e.g. chain.snapshotURL and chain.genesisURL have stable urls)" properties: healthyThreshold: anyOf: @@ -1492,19 +1490,17 @@ spec: destroying a crashlooping pod and pvc. Set an integer or a percentage string such as 50%. Example: If you set to 80% and there are 10 total pods, at least 8 must be healthy - to trigger the recovery. Fractional values are rounded down. - \n This setting attempts to minimize false positives in - order to detect data corruption vs. endless other reasons - for unhealthy pods. If the majority of pods are unhealthy, - then there's probably something else wrong, and recreating - the pod and pvc will have no effect. \n If the threshold - is too high, defaults to recovering 1 unhealthy pod, the - rest must be healthy. It's not recommended to use this feature - with only 1 replica." + to trigger the recovery. Fractional values are rounded down, + but the minimum is 1. It's not recommended to use this feature + with only 1 replica. \n This setting attempts to minimize + false positives in order to detect data corruption vs. endless + other reasons for unhealthy pods. If the majority of pods + are unhealthy, then there's probably something else wrong, + and recreating the pod and pvc will have no effect." x-kubernetes-int-or-string: true restartThreshold: description: How many restarts to wait before destroying and - recreating an unhealthy replica. Defaults to 5. + recreating the unhealthy replica. Defaults to 5. format: int32 type: integer required: