diff --git a/api/v1/cosmosfullnode_types.go b/api/v1/cosmosfullnode_types.go
index c3f8c9eb..0e2c9a24 100644
--- a/api/v1/cosmosfullnode_types.go
+++ b/api/v1/cosmosfullnode_types.go
@@ -84,6 +84,12 @@ type FullNodeSpec struct {
 	// Used for debugging.
 	// +optional
 	InstanceOverrides map[string]InstanceOverridesSpec `json:"instanceOverrides"`
+
+	// Strategies for automatic recovery of faults and errors.
+	// SelfHealing is managed by a separate controller, SelfHealingController, in an effort to reduce
+	// complexity of the CosmosFullNodeController.
+	// +optional
+	SelfHealing *SelfHealingSpec `json:"selfHealing"`
 }
 
 type FullNodeType string
diff --git a/api/v1/self_healing_types.go b/api/v1/self_healing_types.go
new file mode 100644
index 00000000..89beb34c
--- /dev/null
+++ b/api/v1/self_healing_types.go
@@ -0,0 +1,38 @@
+package v1
+
+import "k8s.io/apimachinery/pkg/util/intstr"
+
+// SelfHealingSpec is part of a CosmosFullNode but is managed by a separate controller, SelfHealingController.
+// This is an effort to reduce complexity in the CosmosFullNodeController.
+type SelfHealingSpec struct {
+	// Determines when to destroy and recreate a replica (aka pod/pvc combo) that is crashlooping.
+	// Occasionally, data may become corrupt and the chain exits and cannot restart.
+	// This strategy only watches the pods' "node" containers running the `start` command.
+	//
+	// This pairs well with volumeClaimTemplate.autoDataSource and a ScheduledVolumeSnapshot resource.
+	// With this pairing, a new PVC is created with a recent VolumeSnapshot.
+	// Otherwise, ensure your snapshot, genesis, etc. creation are idempotent.
+	// (e.g. chain.snapshotURL and chain.genesisURL have stable urls)
+	//
+	// +optional
+	CrashLoopRecovery *CrashLoopRecovery `json:"crashLoopRecovery"`
+}
+
+type CrashLoopRecovery struct {
+	// How many healthy pods are required to trigger destroying a crashlooping pod and pvc.
+	// Set an integer or a percentage string such as 50%.
+	// Example: If you set to 80% and there are 10 total pods, at least 8 must be healthy to trigger the recovery.
+	// Fractional values are rounded down, but the minimum is 1.
+	// It's not recommended to use this feature with only 1 replica.
+	//
+	// This setting attempts to minimize false positives in order to detect data corruption vs.
+	// endless other reasons for unhealthy pods.
+	// If the majority of pods are unhealthy, then there's probably something else wrong, and recreating
+	// the pod and pvc will have no effect.
+	HealthyThreshold intstr.IntOrString `json:"healthyThreshold"`
+
+	// How many restarts to wait before destroying and recreating the unhealthy replica.
+	// Defaults to 5.
+	// +optional
+	RestartThreshold int32 `json:"restartThreshold"`
+}
diff --git a/api/v1/zz_generated.deepcopy.go b/api/v1/zz_generated.deepcopy.go
index bb95beaa..d226a0f9 100644
--- a/api/v1/zz_generated.deepcopy.go
+++ b/api/v1/zz_generated.deepcopy.go
@@ -160,6 +160,22 @@ func (in *CosmosFullNodeList) DeepCopyObject() runtime.Object {
 	return nil
 }
 
+// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
+func (in *CrashLoopRecovery) DeepCopyInto(out *CrashLoopRecovery) {
+	*out = *in
+	out.HealthyThreshold = in.HealthyThreshold
+}
+
+// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new CrashLoopRecovery.
+func (in *CrashLoopRecovery) DeepCopy() *CrashLoopRecovery {
+	if in == nil {
+		return nil
+	}
+	out := new(CrashLoopRecovery)
+	in.DeepCopyInto(out)
+	return out
+}
+
 // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
 func (in *FullNodeProbesSpec) DeepCopyInto(out *FullNodeProbesSpec) {
 	*out = *in
@@ -210,6 +226,11 @@ func (in *FullNodeSpec) DeepCopyInto(out *FullNodeSpec) {
 			(*out)[key] = *val.DeepCopy()
 		}
 	}
+	if in.SelfHealing != nil {
+		in, out := &in.SelfHealing, &out.SelfHealing
+		*out = new(SelfHealingSpec)
+		(*in).DeepCopyInto(*out)
+	}
 }
 
 // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new FullNodeSpec.
@@ -503,6 +524,26 @@ func (in *SDKAppConfig) DeepCopy() *SDKAppConfig {
 	return out
 }
 
+// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
+func (in *SelfHealingSpec) DeepCopyInto(out *SelfHealingSpec) {
+	*out = *in
+	if in.CrashLoopRecovery != nil {
+		in, out := &in.CrashLoopRecovery, &out.CrashLoopRecovery
+		*out = new(CrashLoopRecovery)
+		**out = **in
+	}
+}
+
+// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new SelfHealingSpec.
+func (in *SelfHealingSpec) DeepCopy() *SelfHealingSpec {
+	if in == nil {
+		return nil
+	}
+	out := new(SelfHealingSpec)
+	in.DeepCopyInto(out)
+	return out
+}
+
 // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
 func (in *ServiceSpec) DeepCopyInto(out *ServiceSpec) {
 	*out = *in
diff --git a/config/crd/bases/cosmos.strange.love_cosmosfullnodes.yaml b/config/crd/bases/cosmos.strange.love_cosmosfullnodes.yaml
index fcba963d..13877864 100644
--- a/config/crd/bases/cosmos.strange.love_cosmosfullnodes.yaml
+++ b/config/crd/bases/cosmos.strange.love_cosmosfullnodes.yaml
@@ -1466,6 +1466,47 @@ spec:
                 format: int32
                 minimum: 0
                 type: integer
+              selfHealing:
+                description: Strategies for automatic recovery of faults and errors.
+                  SelfHealing is managed by a separate controller, SelfHealingController,
+                  in an effort to reduce complexity of the CosmosFullNodeController.
+                properties:
+                  crashLoopRecovery:
+                    description: "Determines when to destroy and recreate a replica
+                      (aka pod/pvc combo) that is crashlooping. Occasionally, data
+                      may become corrupt and the chain exits and cannot restart. This
+                      strategy only watches the pods' \"node\" containers running
+                      the `start` command. \n This pairs well with volumeClaimTemplate.autoDataSource
+                      and a ScheduledVolumeSnapshot resource. With this pairing, a
+                      new PVC is created with a recent VolumeSnapshot. Otherwise,
+                      ensure your snapshot, genesis, etc. creation are idempotent.
+                      (e.g. chain.snapshotURL and chain.genesisURL have stable urls)"
+                    properties:
+                      healthyThreshold:
+                        anyOf:
+                        - type: integer
+                        - type: string
+                        description: "How many healthy pods are required to trigger
+                          destroying a crashlooping pod and pvc. Set an integer or
+                          a percentage string such as 50%. Example: If you set to
+                          80% and there are 10 total pods, at least 8 must be healthy
+                          to trigger the recovery. Fractional values are rounded down,
+                          but the minimum is 1. It's not recommended to use this feature
+                          with only 1 replica. \n This setting attempts to minimize
+                          false positives in order to detect data corruption vs. endless
+                          other reasons for unhealthy pods. If the majority of pods
+                          are unhealthy, then there's probably something else wrong,
+                          and recreating the pod and pvc will have no effect."
+                        x-kubernetes-int-or-string: true
+                      restartThreshold:
+                        description: How many restarts to wait before destroying and
+                          recreating the unhealthy replica. Defaults to 5.
+                        format: int32
+                        type: integer
+                    required:
+                    - healthyThreshold
+                    type: object
+                type: object
               service:
                 description: Configure Operator created services. A singe rpc service
                   is created for load balancing api, grpc, rpc, etc. requests. This