diff --git a/config/crds/troubleshoot.sh_analyzers.yaml b/config/crds/troubleshoot.sh_analyzers.yaml index 4a3b81f76..b68bcd21e 100644 --- a/config/crds/troubleshoot.sh_analyzers.yaml +++ b/config/crds/troubleshoot.sh_analyzers.yaml @@ -141,6 +141,61 @@ spec: required: - outcomes type: object + clusterContainerStatuses: + properties: + annotations: + additionalProperties: + type: string + type: object + checkName: + type: string + exclude: + type: BoolString + namespaces: + items: + type: string + type: array + outcomes: + items: + properties: + fail: + properties: + message: + type: string + uri: + type: string + when: + type: string + type: object + pass: + properties: + message: + type: string + uri: + type: string + when: + type: string + type: object + warn: + properties: + message: + type: string + uri: + type: string + when: + type: string + type: object + type: object + type: array + restartCount: + format: int32 + type: integer + strict: + type: BoolString + required: + - outcomes + - restartCount + type: object clusterPodStatuses: properties: annotations: diff --git a/config/crds/troubleshoot.sh_preflights.yaml b/config/crds/troubleshoot.sh_preflights.yaml index 7b9f23bf9..f954487d0 100644 --- a/config/crds/troubleshoot.sh_preflights.yaml +++ b/config/crds/troubleshoot.sh_preflights.yaml @@ -141,6 +141,61 @@ spec: required: - outcomes type: object + clusterContainerStatuses: + properties: + annotations: + additionalProperties: + type: string + type: object + checkName: + type: string + exclude: + type: BoolString + namespaces: + items: + type: string + type: array + outcomes: + items: + properties: + fail: + properties: + message: + type: string + uri: + type: string + when: + type: string + type: object + pass: + properties: + message: + type: string + uri: + type: string + when: + type: string + type: object + warn: + properties: + message: + type: string + uri: + type: string + when: + type: string + type: object + type: object + type: array + restartCount: + format: int32 + type: integer + strict: + type: BoolString + required: + - outcomes + - restartCount + type: object clusterPodStatuses: properties: annotations: diff --git a/config/crds/troubleshoot.sh_supportbundles.yaml b/config/crds/troubleshoot.sh_supportbundles.yaml index bca8003a0..8f45b0fdf 100644 --- a/config/crds/troubleshoot.sh_supportbundles.yaml +++ b/config/crds/troubleshoot.sh_supportbundles.yaml @@ -172,6 +172,61 @@ spec: required: - outcomes type: object + clusterContainerStatuses: + properties: + annotations: + additionalProperties: + type: string + type: object + checkName: + type: string + exclude: + type: BoolString + namespaces: + items: + type: string + type: array + outcomes: + items: + properties: + fail: + properties: + message: + type: string + uri: + type: string + when: + type: string + type: object + pass: + properties: + message: + type: string + uri: + type: string + when: + type: string + type: object + warn: + properties: + message: + type: string + uri: + type: string + when: + type: string + type: object + type: object + type: array + restartCount: + format: int32 + type: integer + strict: + type: BoolString + required: + - outcomes + - restartCount + type: object clusterPodStatuses: properties: annotations: diff --git a/pkg/analyze/analyzer.go b/pkg/analyze/analyzer.go index 000e4c957..948b0f73f 100644 --- a/pkg/analyze/analyzer.go +++ b/pkg/analyze/analyzer.go @@ -214,6 +214,8 @@ func GetAnalyzer(analyzer *troubleshootv1beta2.Analyze) Analyzer { return &AnalyzeReplicaSetStatus{analyzer: analyzer.ReplicaSetStatus} case analyzer.ClusterPodStatuses != nil: return &AnalyzeClusterPodStatuses{analyzer: analyzer.ClusterPodStatuses} + case analyzer.ClusterContainerStatuses != nil: + return &AnalyzeClusterContainerStatuses{analyzer: analyzer.ClusterContainerStatuses} case analyzer.ContainerRuntime != nil: return &AnalyzeContainerRuntime{analyzer: analyzer.ContainerRuntime} case analyzer.Distribution != nil: diff --git a/pkg/analyze/cluster_container_statuses.go b/pkg/analyze/cluster_container_statuses.go new file mode 100644 index 000000000..8dd96978c --- /dev/null +++ b/pkg/analyze/cluster_container_statuses.go @@ -0,0 +1,260 @@ +package analyzer + +import ( + "bytes" + "encoding/json" + "fmt" + "path/filepath" + "slices" + "strings" + "text/template" + + "github.com/pkg/errors" + troubleshootv1beta2 "github.com/replicatedhq/troubleshoot/pkg/apis/troubleshoot/v1beta2" + "github.com/replicatedhq/troubleshoot/pkg/constants" + corev1 "k8s.io/api/core/v1" + "k8s.io/klog/v2" +) + +type AnalyzeClusterContainerStatuses struct { + analyzer *troubleshootv1beta2.ClusterContainerStatuses +} + +type podsWithContainers map[string]struct { + name string + namespace string + containerStatuses []corev1.ContainerStatus +} + +type matchedContainerInfo struct { + Namespace string + PodName string + ContainerName string + Ready bool + RestartCount int32 + Message string +} + +func (a *AnalyzeClusterContainerStatuses) Title() string { + if a.analyzer.CheckName != "" { + return a.analyzer.CheckName + } + return "Cluster Container Status" +} + +func (a *AnalyzeClusterContainerStatuses) IsExcluded() (bool, error) { + return isExcluded(a.analyzer.Exclude) +} + +func (a *AnalyzeClusterContainerStatuses) Analyze(getFile getCollectedFileContents, findFiles getChildCollectedFileContents) ([]*AnalyzeResult, error) { + // get all pod list files from clusterResources collector directory + excludeFiles := []string{} + podListFiles, err := findFiles(filepath.Join(constants.CLUSTER_RESOURCES_DIR, constants.CLUSTER_RESOURCES_PODS, "*.json"), excludeFiles) + if err != nil { + return nil, errors.Wrap(err, "failed to read collected pods") + } + + // get pods matched analyzer filters + pods, err := a.getPodsMatchingFilters(podListFiles) + if err != nil { + return nil, errors.Wrap(err, "failed to get pods matching filters") + } + + results, err := a.analyzeContainerStatuses(pods) + if err != nil { + return nil, errors.Wrap(err, "failed to analyze container statuses") + } + + return results, nil +} + +func (a *AnalyzeClusterContainerStatuses) getPodsMatchingFilters(podListFiles map[string][]byte) (podsWithContainers, error) { + var podsMatchedNamespace []corev1.Pod + matchedPods := podsWithContainers{} + + // filter pods matched namespace selector + for fileName, fileContent := range podListFiles { + // pod list fileName is the namespace name, e.g. default.json + currentNamespace := strings.TrimSuffix(filepath.Base(fileName), ".json") + selectedNamespaces := a.analyzer.Namespaces + if len(selectedNamespaces) > 0 { + if !slices.Contains(selectedNamespaces, currentNamespace) { + continue + } + } + + // filter pods by namespace + var podList corev1.PodList + if err := json.Unmarshal(fileContent, &podList); err != nil { + var pods []corev1.Pod + // fallback to old format + if err := json.Unmarshal(fileContent, &pods); err != nil { + return nil, errors.Wrapf(err, "failed to unmarshal pods list for namespace %s", currentNamespace) + } + podsMatchedNamespace = append(podsMatchedNamespace, pods...) + } else { + podsMatchedNamespace = append(podsMatchedNamespace, podList.Items...) + } + } + + // filter pods by container criteria + for _, pod := range podsMatchedNamespace { + for _, containerStatus := range pod.Status.ContainerStatuses { + if containerStatus.RestartCount < a.analyzer.RestartCount { + continue + } + // check if the pod has already been matched + key := string(pod.UID) + if _, ok := matchedPods[key]; !ok { + matchedPods[key] = struct { + name string + namespace string + containerStatuses []corev1.ContainerStatus + }{ + name: pod.Name, + namespace: pod.Namespace, + containerStatuses: []corev1.ContainerStatus{containerStatus}, + } + continue + } + entry := matchedPods[key] + entry.containerStatuses = append(entry.containerStatuses, containerStatus) + } + } + + return matchedPods, nil +} + +func (a *AnalyzeClusterContainerStatuses) analyzeContainerStatuses(podContainers podsWithContainers) ([]*AnalyzeResult, error) { + results := []*AnalyzeResult{} + + // for each outcome, iterate over the pods and match the outcome against the container statues + for _, outcome := range a.analyzer.Outcomes { + r := AnalyzeResult{ + Title: a.Title(), + IconKey: "kubernetes_container_statuses", + IconURI: "https://troubleshoot.sh/images/analyzer-icons/kubernetes.svg?w=16&h=16", + } + when := "" + + switch { + case outcome.Fail != nil: + r.IsFail = true + r.Message = outcome.Fail.Message + r.URI = outcome.Fail.URI + when = outcome.Fail.When + case outcome.Warn != nil: + r.IsWarn = true + r.Message = outcome.Warn.Message + r.URI = outcome.Warn.URI + when = outcome.Warn.When + case outcome.Pass != nil: + r.IsPass = true + r.Message = outcome.Pass.Message + r.URI = outcome.Pass.URI + when = outcome.Pass.When + default: + klog.Warning("unexpected outcome in clusterContainerStatuses analyzer") + continue + } + + // empty when indicates final case, let's return the result + if when == "" { + // return collected results if any + if len(results) > 0 { + return results, nil + } + return []*AnalyzeResult{&r}, nil + } + + // continue matching with when condition + reason, isEqualityOp, err := parseWhen(when) + if err != nil { + return nil, errors.Wrap(err, "failed to parse when") + } + + for _, pod := range podContainers { + matched, matchedContainerInfo := matchContainerReason(reason, pod.name, pod.namespace, pod.containerStatuses) + if matched != isEqualityOp { + continue + } + r.Message = renderContainerMessage(r.Message, &matchedContainerInfo) + results = append(results, &r) + } + } + return results, nil +} + +// matchContainerReason iterates over the containerStatuses and returns true on the first reason that matches +func matchContainerReason(reason string, podName string, namespace string, containerStatuses []corev1.ContainerStatus) (bool, matchedContainerInfo) { + var matched bool + info := matchedContainerInfo{} + info.Namespace = namespace + info.PodName = podName + + for _, containerStatus := range containerStatuses { + state := containerStatus.State + info.ContainerName = containerStatus.Name + info.Ready = containerStatus.Ready + info.RestartCount = containerStatus.RestartCount + + switch { + case containerStatus.LastTerminationState.Terminated != nil && strings.EqualFold(containerStatus.LastTerminationState.Terminated.Reason, reason): + matched = true + info.Message = containerStatus.LastTerminationState.Terminated.Message + case state.Terminated != nil && strings.EqualFold(state.Terminated.Reason, reason): + info.Message = state.Terminated.Message + matched = true + case state.Waiting != nil && strings.EqualFold(state.Waiting.Reason, reason): + matched = true + info.Message = state.Waiting.Message + } + } + return matched, info +} + +// parseWhen parses the when string into operator and reason +// return error if reason is not in the expected format +func parseWhen(when string) (string, bool, error) { + parts := strings.Split(strings.TrimSpace(when), " ") + if len(parts) != 2 { + return "", false, errors.Errorf("expected 2 parts in when %q", when) + } + operator := parts[0] + reason := parts[1] + var isEqualityOp bool + + switch operator { + case "=", "==", "===": + isEqualityOp = true + case "!=", "!==": + isEqualityOp = false + default: + return "", false, errors.Errorf("unexpected operator %q in containerStatuses reason", operator) + } + + return reason, isEqualityOp, nil +} + +func renderContainerMessage(message string, info *matchedContainerInfo) string { + if info == nil { + return message + } + out := fmt.Sprintf("Container matched. Container: %s, Namespace: %s, Pod: %s", info.ContainerName, info.Namespace, info.PodName) + + tmpl := template.New("container") + msgTmpl, err := tmpl.Parse(message) + if err != nil { + klog.V(2).Infof("failed to parse message template: %v", err) + return out + } + + var m bytes.Buffer + err = msgTmpl.Execute(&m, info) + if err != nil { + klog.V(2).Infof("failed to render message template: %v", err) + return out + } + + return strings.TrimSpace(m.String()) +} diff --git a/pkg/analyze/cluster_container_statuses_test.go b/pkg/analyze/cluster_container_statuses_test.go new file mode 100644 index 000000000..b880f201b --- /dev/null +++ b/pkg/analyze/cluster_container_statuses_test.go @@ -0,0 +1,103 @@ +package analyzer + +import ( + "testing" + + troubleshootv1beta2 "github.com/replicatedhq/troubleshoot/pkg/apis/troubleshoot/v1beta2" + "github.com/stretchr/testify/require" +) + +func Test_analyzeContainerStatuses(t *testing.T) { + tests := []struct { + name string + analyzer troubleshootv1beta2.ClusterContainerStatuses + expectResult []*AnalyzeResult + files map[string][]byte + }{ + { + name: "fail when there is OOMKilled container", + analyzer: troubleshootv1beta2.ClusterContainerStatuses{ + AnalyzeMeta: troubleshootv1beta2.AnalyzeMeta{ + CheckName: "oomkilled-container", + }, + Outcomes: []*troubleshootv1beta2.Outcome{ + { + Fail: &troubleshootv1beta2.SingleOutcome{ + When: "== OOMKilled", + Message: "Container {{ .ContainerName }} from pod {{ .Namespace }}/{{ .PodName }} has OOMKilled", + }, + }, + }, + Namespaces: []string{"message-oomkill-pod"}, + }, + expectResult: []*AnalyzeResult{ + { + IsFail: true, + IsWarn: false, + IsPass: false, + Title: "oomkilled-container", + Message: "Container memory-eater from pod message-oomkill-pod/oom-kill-job3-gbb89 has OOMKilled", + IconKey: "kubernetes_container_statuses", + IconURI: "https://troubleshoot.sh/images/analyzer-icons/kubernetes.svg?w=16&h=16", + }, + }, + files: map[string][]byte{ + "cluster-resources/pods/message-oomkill-pod.json": []byte(messageOOMKillPod), + }, + }, + { + name: "pass when there is no status detected", + analyzer: troubleshootv1beta2.ClusterContainerStatuses{ + AnalyzeMeta: troubleshootv1beta2.AnalyzeMeta{ + CheckName: "oomkilled-container", + }, + Outcomes: []*troubleshootv1beta2.Outcome{ + { + Fail: &troubleshootv1beta2.SingleOutcome{ + When: "== OOMKilled", + Message: "Container {{ .ContainerName }} from pod {{ .Namespace }}/{{ .PodName }} has OOMKilled", + }, + }, + { + Pass: &troubleshootv1beta2.SingleOutcome{ + Message: "No OOMKilled container found", + }, + }, + }, + Namespaces: []string{"default"}, + }, + expectResult: []*AnalyzeResult{ + { + IsFail: false, + IsWarn: false, + IsPass: true, + Title: "oomkilled-container", + Message: "No OOMKilled container found", + IconKey: "kubernetes_container_statuses", + IconURI: "https://troubleshoot.sh/images/analyzer-icons/kubernetes.svg?w=16&h=16", + }, + }, + files: map[string][]byte{ + "cluster-resources/pods/default.json": []byte(defaultPods), + }, + }, + } + + for _, test := range tests { + t.Run(test.name, func(t *testing.T) { + req := require.New(t) + + getFiles := func(n string, _ []string) (map[string][]byte, error) { + return test.files, nil + } + + a := AnalyzeClusterContainerStatuses{ + analyzer: &test.analyzer, + } + + actual, err := a.Analyze(nil, getFiles) + req.NoError(err) + req.Equal(test.expectResult, actual) + }) + } +} diff --git a/pkg/apis/troubleshoot/v1beta2/analyzer_shared.go b/pkg/apis/troubleshoot/v1beta2/analyzer_shared.go index 0ce3e8f24..56a31e536 100644 --- a/pkg/apis/troubleshoot/v1beta2/analyzer_shared.go +++ b/pkg/apis/troubleshoot/v1beta2/analyzer_shared.go @@ -105,6 +105,13 @@ type ClusterPodStatuses struct { Namespaces []string `json:"namespaces,omitempty" yaml:"namespaces,omitempty"` } +type ClusterContainerStatuses struct { + AnalyzeMeta `json:",inline" yaml:",inline"` + Outcomes []*Outcome `json:"outcomes" yaml:"outcomes"` + Namespaces []string `json:"namespaces,omitempty" yaml:"namespaces,omitempty"` + RestartCount int32 `json:"restartCount" yaml:"restartCount"` +} + type ContainerRuntime struct { AnalyzeMeta `json:",inline" yaml:",inline"` Outcomes []*Outcome `json:"outcomes" yaml:"outcomes"` @@ -274,6 +281,7 @@ type Analyze struct { JobStatus *JobStatus `json:"jobStatus,omitempty" yaml:"jobStatus,omitempty"` ReplicaSetStatus *ReplicaSetStatus `json:"replicasetStatus,omitempty" yaml:"replicasetStatus,omitempty"` ClusterPodStatuses *ClusterPodStatuses `json:"clusterPodStatuses,omitempty" yaml:"clusterPodStatuses,omitempty"` + ClusterContainerStatuses *ClusterContainerStatuses `json:"clusterContainerStatuses,omitempty" yaml:"clusterContainerStatuses,omitempty"` ContainerRuntime *ContainerRuntime `json:"containerRuntime,omitempty" yaml:"containerRuntime,omitempty"` Distribution *Distribution `json:"distribution,omitempty" yaml:"distribution,omitempty"` NodeResources *NodeResources `json:"nodeResources,omitempty" yaml:"nodeResources,omitempty"` diff --git a/pkg/apis/troubleshoot/v1beta2/zz_generated.deepcopy.go b/pkg/apis/troubleshoot/v1beta2/zz_generated.deepcopy.go index e19fb0b3b..1f454bf95 100644 --- a/pkg/apis/troubleshoot/v1beta2/zz_generated.deepcopy.go +++ b/pkg/apis/troubleshoot/v1beta2/zz_generated.deepcopy.go @@ -114,6 +114,11 @@ func (in *Analyze) DeepCopyInto(out *Analyze) { *out = new(ClusterPodStatuses) (*in).DeepCopyInto(*out) } + if in.ClusterContainerStatuses != nil { + in, out := &in.ClusterContainerStatuses, &out.ClusterContainerStatuses + *out = new(ClusterContainerStatuses) + (*in).DeepCopyInto(*out) + } if in.ContainerRuntime != nil { in, out := &in.ContainerRuntime, &out.ContainerRuntime *out = new(ContainerRuntime) @@ -666,6 +671,38 @@ func (in *CertificatesAnalyze) DeepCopy() *CertificatesAnalyze { return out } +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *ClusterContainerStatuses) DeepCopyInto(out *ClusterContainerStatuses) { + *out = *in + in.AnalyzeMeta.DeepCopyInto(&out.AnalyzeMeta) + if in.Outcomes != nil { + in, out := &in.Outcomes, &out.Outcomes + *out = make([]*Outcome, len(*in)) + for i := range *in { + if (*in)[i] != nil { + in, out := &(*in)[i], &(*out)[i] + *out = new(Outcome) + (*in).DeepCopyInto(*out) + } + } + } + if in.Namespaces != nil { + in, out := &in.Namespaces, &out.Namespaces + *out = make([]string, len(*in)) + copy(*out, *in) + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ClusterContainerStatuses. +func (in *ClusterContainerStatuses) DeepCopy() *ClusterContainerStatuses { + if in == nil { + return nil + } + out := new(ClusterContainerStatuses) + in.DeepCopyInto(out) + return out +} + // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *ClusterInfo) DeepCopyInto(out *ClusterInfo) { *out = *in diff --git a/schemas/analyzer-troubleshoot-v1beta2.json b/schemas/analyzer-troubleshoot-v1beta2.json index 29d46b5ea..a5f219bb2 100644 --- a/schemas/analyzer-troubleshoot-v1beta2.json +++ b/schemas/analyzer-troubleshoot-v1beta2.json @@ -175,6 +175,90 @@ } } }, + "clusterContainerStatuses": { + "type": "object", + "required": [ + "outcomes", + "restartCount" + ], + "properties": { + "annotations": { + "type": "object", + "additionalProperties": { + "type": "string" + } + }, + "checkName": { + "type": "string" + }, + "exclude": { + "oneOf": [{"type": "string"},{"type": "boolean"}] + }, + "namespaces": { + "type": "array", + "items": { + "type": "string" + } + }, + "outcomes": { + "type": "array", + "items": { + "type": "object", + "properties": { + "fail": { + "type": "object", + "properties": { + "message": { + "type": "string" + }, + "uri": { + "type": "string" + }, + "when": { + "type": "string" + } + } + }, + "pass": { + "type": "object", + "properties": { + "message": { + "type": "string" + }, + "uri": { + "type": "string" + }, + "when": { + "type": "string" + } + } + }, + "warn": { + "type": "object", + "properties": { + "message": { + "type": "string" + }, + "uri": { + "type": "string" + }, + "when": { + "type": "string" + } + } + } + } + } + }, + "restartCount": { + "type": "integer", + "format": "int32" + }, + "strict": { + "oneOf": [{"type": "string"},{"type": "boolean"}] + } + } + }, "clusterPodStatuses": { "type": "object", "required": [ diff --git a/schemas/preflight-troubleshoot-v1beta2.json b/schemas/preflight-troubleshoot-v1beta2.json index 8e1e2af55..ce590d051 100644 --- a/schemas/preflight-troubleshoot-v1beta2.json +++ b/schemas/preflight-troubleshoot-v1beta2.json @@ -175,6 +175,90 @@ } } }, + "clusterContainerStatuses": { + "type": "object", + "required": [ + "outcomes", + "restartCount" + ], + "properties": { + "annotations": { + "type": "object", + "additionalProperties": { + "type": "string" + } + }, + "checkName": { + "type": "string" + }, + "exclude": { + "oneOf": [{"type": "string"},{"type": "boolean"}] + }, + "namespaces": { + "type": "array", + "items": { + "type": "string" + } + }, + "outcomes": { + "type": "array", + "items": { + "type": "object", + "properties": { + "fail": { + "type": "object", + "properties": { + "message": { + "type": "string" + }, + "uri": { + "type": "string" + }, + "when": { + "type": "string" + } + } + }, + "pass": { + "type": "object", + "properties": { + "message": { + "type": "string" + }, + "uri": { + "type": "string" + }, + "when": { + "type": "string" + } + } + }, + "warn": { + "type": "object", + "properties": { + "message": { + "type": "string" + }, + "uri": { + "type": "string" + }, + "when": { + "type": "string" + } + } + } + } + } + }, + "restartCount": { + "type": "integer", + "format": "int32" + }, + "strict": { + "oneOf": [{"type": "string"},{"type": "boolean"}] + } + } + }, "clusterPodStatuses": { "type": "object", "required": [ diff --git a/schemas/supportbundle-troubleshoot-v1beta2.json b/schemas/supportbundle-troubleshoot-v1beta2.json index d7c25e705..a1f416306 100644 --- a/schemas/supportbundle-troubleshoot-v1beta2.json +++ b/schemas/supportbundle-troubleshoot-v1beta2.json @@ -221,6 +221,90 @@ } } }, + "clusterContainerStatuses": { + "type": "object", + "required": [ + "outcomes", + "restartCount" + ], + "properties": { + "annotations": { + "type": "object", + "additionalProperties": { + "type": "string" + } + }, + "checkName": { + "type": "string" + }, + "exclude": { + "oneOf": [{"type": "string"},{"type": "boolean"}] + }, + "namespaces": { + "type": "array", + "items": { + "type": "string" + } + }, + "outcomes": { + "type": "array", + "items": { + "type": "object", + "properties": { + "fail": { + "type": "object", + "properties": { + "message": { + "type": "string" + }, + "uri": { + "type": "string" + }, + "when": { + "type": "string" + } + } + }, + "pass": { + "type": "object", + "properties": { + "message": { + "type": "string" + }, + "uri": { + "type": "string" + }, + "when": { + "type": "string" + } + } + }, + "warn": { + "type": "object", + "properties": { + "message": { + "type": "string" + }, + "uri": { + "type": "string" + }, + "when": { + "type": "string" + } + } + } + } + } + }, + "restartCount": { + "type": "integer", + "format": "int32" + }, + "strict": { + "oneOf": [{"type": "string"},{"type": "boolean"}] + } + } + }, "clusterPodStatuses": { "type": "object", "required": [