From db871e6889f69856be3084230569d1e558e36613 Mon Sep 17 00:00:00 2001 From: Evans Mungai Date: Tue, 9 Apr 2024 12:14:10 +0100 Subject: [PATCH] feat: node metrics analyser (#1520) * feat: node metrics analyser The analyser only checks PVC usage at the moment. More analysers can be added on a need to have basis * Add tests * Fix flaky test by waiting for goldpinger pods to start * Fix how outcomes get checked * Fix catch all outcome condition * Fix test * feat: node metrics analyser The analyser only checks PVC usage at the moment. More analysers can be added on a need to have basis * Add tests * Fix flaky test by waiting for goldpinger pods to start * Fix how outcomes get checked * Fix catch all outcome condition * Fix test * Regenerate schemas * Fix failing test --------- Co-authored-by: Dexter Yan --- config/crds/troubleshoot.sh_analyzers.yaml | 60 ++++ config/crds/troubleshoot.sh_preflights.yaml | 60 ++++ .../crds/troubleshoot.sh_supportbundles.yaml | 60 ++++ go.mod | 1 + go.sum | 2 + pkg/analyze/analyzer.go | 2 + pkg/analyze/comparison.go | 34 ++ pkg/analyze/comparison_test.go | 79 +++++ pkg/analyze/k8s_node_metrics.go | 316 ++++++++++++++++++ pkg/analyze/k8s_node_metrics_test.go | 288 ++++++++++++++++ .../troubleshoot/v1beta2/analyzer_shared.go | 17 + .../v1beta2/zz_generated.deepcopy.go | 68 ++++ pkg/preflight/flags.go | 22 +- schemas/analyzer-troubleshoot-v1beta2.json | 93 ++++++ schemas/preflight-troubleshoot-v1beta2.json | 93 ++++++ .../supportbundle-troubleshoot-v1beta2.json | 93 ++++++ .../goldpinger_collector_e2e_test.go | 27 +- 17 files changed, 1303 insertions(+), 12 deletions(-) create mode 100644 pkg/analyze/comparison.go create mode 100644 pkg/analyze/comparison_test.go create mode 100644 pkg/analyze/k8s_node_metrics.go create mode 100644 pkg/analyze/k8s_node_metrics_test.go diff --git a/config/crds/troubleshoot.sh_analyzers.yaml b/config/crds/troubleshoot.sh_analyzers.yaml index 76ecaca77..4a35936fe 100644 --- 
a/config/crds/troubleshoot.sh_analyzers.yaml +++ b/config/crds/troubleshoot.sh_analyzers.yaml @@ -1045,6 +1045,66 @@ spec: - collectorName - outcomes type: object + nodeMetrics: + properties: + annotations: + additionalProperties: + type: string + type: object + checkName: + type: string + collectorName: + type: string + exclude: + type: BoolString + filters: + properties: + pvc: + properties: + nameRegex: + type: string + namespace: + type: string + type: object + type: object + outcomes: + items: + properties: + fail: + properties: + message: + type: string + uri: + type: string + when: + type: string + type: object + pass: + properties: + message: + type: string + uri: + type: string + when: + type: string + type: object + warn: + properties: + message: + type: string + uri: + type: string + when: + type: string + type: object + type: object + type: array + strict: + type: BoolString + required: + - collectorName + - outcomes + type: object nodeResources: properties: annotations: diff --git a/config/crds/troubleshoot.sh_preflights.yaml b/config/crds/troubleshoot.sh_preflights.yaml index d547449e0..6dcc9532f 100644 --- a/config/crds/troubleshoot.sh_preflights.yaml +++ b/config/crds/troubleshoot.sh_preflights.yaml @@ -1045,6 +1045,66 @@ spec: - collectorName - outcomes type: object + nodeMetrics: + properties: + annotations: + additionalProperties: + type: string + type: object + checkName: + type: string + collectorName: + type: string + exclude: + type: BoolString + filters: + properties: + pvc: + properties: + nameRegex: + type: string + namespace: + type: string + type: object + type: object + outcomes: + items: + properties: + fail: + properties: + message: + type: string + uri: + type: string + when: + type: string + type: object + pass: + properties: + message: + type: string + uri: + type: string + when: + type: string + type: object + warn: + properties: + message: + type: string + uri: + type: string + when: + type: string + type: object + type: object + 
type: array + strict: + type: BoolString + required: + - collectorName + - outcomes + type: object nodeResources: properties: annotations: diff --git a/config/crds/troubleshoot.sh_supportbundles.yaml b/config/crds/troubleshoot.sh_supportbundles.yaml index cb87baaee..22a4f1607 100644 --- a/config/crds/troubleshoot.sh_supportbundles.yaml +++ b/config/crds/troubleshoot.sh_supportbundles.yaml @@ -1076,6 +1076,66 @@ spec: - collectorName - outcomes type: object + nodeMetrics: + properties: + annotations: + additionalProperties: + type: string + type: object + checkName: + type: string + collectorName: + type: string + exclude: + type: BoolString + filters: + properties: + pvc: + properties: + nameRegex: + type: string + namespace: + type: string + type: object + type: object + outcomes: + items: + properties: + fail: + properties: + message: + type: string + uri: + type: string + when: + type: string + type: object + pass: + properties: + message: + type: string + uri: + type: string + when: + type: string + type: object + warn: + properties: + message: + type: string + uri: + type: string + when: + type: string + type: object + type: object + type: array + strict: + type: BoolString + required: + - collectorName + - outcomes + type: object nodeResources: properties: annotations: diff --git a/go.mod b/go.mod index 0ec01f805..2109e4a64 100644 --- a/go.mod +++ b/go.mod @@ -252,6 +252,7 @@ require ( gopkg.in/yaml.v3 v3.0.1 // indirect helm.sh/helm/v3 v3.14.3 k8s.io/kube-openapi v0.0.0-20231010175941-2dd684a91f00 // indirect + k8s.io/kubelet v0.29.3 k8s.io/metrics v0.29.3 k8s.io/utils v0.0.0-20230726121419-3b25d923346b periph.io/x/host/v3 v3.8.2 diff --git a/go.sum b/go.sum index 7f77000b7..925e6d255 100644 --- a/go.sum +++ b/go.sum @@ -1573,6 +1573,8 @@ k8s.io/kube-openapi v0.0.0-20231010175941-2dd684a91f00 h1:aVUu9fTY98ivBPKR9Y5w/A k8s.io/kube-openapi v0.0.0-20231010175941-2dd684a91f00/go.mod h1:AsvuZPBlUDVuCdzJ87iajxtXuR9oktsTctW/R9wwouA= k8s.io/kubectl v0.29.0 
// ComparisonOperator identifies a binary comparison that can be used in an
// analyzer conditional such as "pvcUsedPercentage >= 80".
type ComparisonOperator int

// Supported comparison operators. Unknown is the zero value and is what
// ParseComparisonOperator returns alongside an error.
const (
	Unknown ComparisonOperator = iota
	Equal
	NotEqual
	GreaterThan
	GreaterThanOrEqual
	LessThan
	LessThanOrEqual
)

// ParseComparisonOperator converts the textual form of an operator into a
// ComparisonOperator.
//
// Accepted spellings are "=", "==", "===" (Equal), "!=", "!==" (NotEqual),
// "<", ">", "<=" and ">=". Any other input yields Unknown and a non-nil error.
func ParseComparisonOperator(s string) (ComparisonOperator, error) {
	switch s {
	case "=", "==", "===":
		return Equal, nil
	case "!=", "!==":
		return NotEqual, nil
	case "<":
		return LessThan, nil
	case ">":
		return GreaterThan, nil
	case "<=":
		return LessThanOrEqual, nil
	case ">=":
		return GreaterThanOrEqual, nil
	}

	// %q keeps empty or whitespace-only input visible in the error message.
	return Unknown, fmt.Errorf("unknown operator: %q", s)
}
analyzer + +import ( + "testing" + + "github.com/stretchr/testify/assert" +) + +func TestParseComparisonOperator(t *testing.T) { + tests := []struct { + name string + input string + want ComparisonOperator + wantErr bool + }{ + { + name: "equal", + input: "=", + want: Equal, + }, + { + name: "equal", + input: "==", + want: Equal, + }, + { + name: "equal", + input: "===", + want: Equal, + }, + { + name: "not equal", + input: "!=", + want: NotEqual, + }, + { + name: "not equal", + input: "!==", + want: NotEqual, + }, + { + name: "less than", + input: "<", + want: LessThan, + }, + { + name: "greater than", + input: ">", + want: GreaterThan, + }, + { + name: "less than or equal", + input: "<=", + want: LessThanOrEqual, + }, + { + name: "greater than or equal", + input: ">=", + want: GreaterThanOrEqual, + }, + { + name: "invalid operator 1", + input: "", + wantErr: true, + }, + { + name: "invalid operator 2", + input: "gibberish", + wantErr: true, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + got, err := ParseComparisonOperator(tt.input) + assert.Equal(t, tt.want, got, "ParseOperator() = %v, want %v", got, tt.want) + assert.Equalf(t, tt.wantErr, err != nil, "ParseOperator() error = %v, wantErr %v", err, tt.wantErr) + }) + } +} diff --git a/pkg/analyze/k8s_node_metrics.go b/pkg/analyze/k8s_node_metrics.go new file mode 100644 index 000000000..5aa2aafad --- /dev/null +++ b/pkg/analyze/k8s_node_metrics.go @@ -0,0 +1,316 @@ +package analyzer + +import ( + "bytes" + "encoding/json" + "fmt" + "path/filepath" + "regexp" + "strconv" + "strings" + "text/template" + + "github.com/pkg/errors" + troubleshootv1beta2 "github.com/replicatedhq/troubleshoot/pkg/apis/troubleshoot/v1beta2" + "k8s.io/klog/v2" + kubeletv1alpha1 "k8s.io/kubelet/pkg/apis/stats/v1alpha1" +) + +type AnalyzeNodeMetrics struct { + analyzer *troubleshootv1beta2.NodeMetricsAnalyze +} + +type nodeMetricsComparisonResults struct { + PVC pvcTemplateData +} + +type pvcTemplateData struct 
{ + UsedPercentage float64 + ConcatenatedNames string + Names []string +} + +type pvcUsageStats struct { + PvcName string + Used float64 +} + +func (a *AnalyzeNodeMetrics) Title() string { + title := a.analyzer.CheckName + if title == "" { + title = a.analyzer.CollectorName + } + if title == "" { + title = "Node Metrics" + } + + return title +} + +func (a *AnalyzeNodeMetrics) IsExcluded() (bool, error) { + return isExcluded(a.analyzer.Exclude) +} + +func (a *AnalyzeNodeMetrics) Analyze(getFile getCollectedFileContents, findFiles getChildCollectedFileContents) ([]*AnalyzeResult, error) { + // Gather all collected node-metrics files + collected, err := findFiles(filepath.Join("node-metrics", "*.json"), nil) + if err != nil { + return nil, errors.Wrap(err, "failed to read collected pods") + } + + // Unmarshal all collected node-metrics files + summaries := []kubeletv1alpha1.Summary{} + for _, fileContent := range collected { + summary := kubeletv1alpha1.Summary{} + if err := json.Unmarshal(fileContent, &summary); err != nil { + return nil, errors.Wrap(err, "failed to unmarshal node metrics") + } + + summaries = append(summaries, summary) + } + + // Run through all outcomes to generate results + result, err := a.compareCollectedMetricsWithOutcomes(summaries) + if err != nil { + return nil, errors.Wrap(err, "failed to compare node metrics with outcomes") + } + if result == nil { + return []*AnalyzeResult{}, nil + } + result.Strict = a.analyzer.Strict.BoolOrDefaultFalse() + + return []*AnalyzeResult{result}, nil +} + +func (a *AnalyzeNodeMetrics) compareCollectedMetricsWithOutcomes(summaries []kubeletv1alpha1.Summary) (*AnalyzeResult, error) { + for _, outcome := range a.analyzer.Outcomes { + result := &AnalyzeResult{ + Title: a.Title(), + } + + if outcome.Fail != nil { + if outcome.Fail.When == "" { + result.IsFail = true + result.Message = outcome.Fail.Message + result.URI = outcome.Fail.URI + + return result, nil + } else { + isMatch, out, err := 
a.compareNodeMetricConditionalsToStats(outcome.Fail.When, summaries) + if err != nil { + return nil, errors.Wrap(err, "failed to compare node metrics conditional with summary stats") + } + + if isMatch { + result.IsFail = true + result.Message = renderTemplate(outcome.Fail.Message, out) + result.URI = outcome.Fail.URI + + return result, nil + } + } + + } else if outcome.Warn != nil { + if outcome.Warn.When == "" { + result.IsWarn = true + result.Message = outcome.Warn.Message + result.URI = outcome.Warn.URI + + return result, nil + } else { + isMatch, out, err := a.compareNodeMetricConditionalsToStats(outcome.Warn.When, summaries) + if err != nil { + return nil, errors.Wrap(err, "failed to compare node metrics conditional with summary stats") + } + + if isMatch { + result.IsWarn = true + result.Message = renderTemplate(outcome.Warn.Message, out) + result.URI = outcome.Warn.URI + + return result, nil + } + } + } else if outcome.Pass != nil { + if outcome.Pass.When == "" { + result.IsPass = true + result.Message = outcome.Pass.Message + result.URI = outcome.Pass.URI + + return result, nil + } else { + isMatch, out, err := a.compareNodeMetricConditionalsToStats(outcome.Pass.When, summaries) + if err != nil { + return nil, errors.Wrap(err, "failed to compare node metrics conditional with summary stats") + } + + if isMatch { + result.IsPass = true + result.Message = renderTemplate(outcome.Pass.Message, out) + result.URI = outcome.Pass.URI + + return result, nil + } + } + } + } + + return nil, nil +} + +func (a *AnalyzeNodeMetrics) findPVCUsageStats(summaries []kubeletv1alpha1.Summary) ([]pvcUsageStats, error) { + // We just collect usage percentages for now. If other stats are needed, we can add them. 
+ stats := []pvcUsageStats{} + var nameRegex *regexp.Regexp + var ns string + var err error + + pvcFilter := a.analyzer.Filters.PVC + if pvcFilter != nil { + if pvcFilter.NameRegex != "" { + nameRegex, err = regexp.Compile(pvcFilter.NameRegex) + if err != nil { + return nil, errors.Wrap(err, "failed to compile PVC name regex") + } + } + + ns = pvcFilter.Namespace + } + + // Analyze PVCs + for _, summary := range summaries { + for i := range summary.Pods { + pod := summary.Pods[i] + if ns != "" && ns != pod.PodRef.Namespace { + klog.V(2).Infof("Skipping pvcs in %s/%s pod due to namespace filter", pod.PodRef.Namespace, pod.PodRef.Name) + continue + } + + for j := range pod.VolumeStats { + volume := pod.VolumeStats[j] + + // This is a persistent volume + if volume.PVCRef != nil { + if nameRegex != nil && !nameRegex.MatchString(volume.PVCRef.Name) { + klog.V(2).Infof("Skipping pvc %s/%s due to name regex filter", volume.PVCRef.Namespace, volume.PVCRef.Name) + continue + } + + // Calculate the usage + pvcName := fmt.Sprintf("%s/%s", volume.PVCRef.Namespace, volume.PVCRef.Name) + + used := volume.UsedBytes + capacity := volume.CapacityBytes + if used != nil && capacity != nil { + pvcUsedPercentage := float64(*used) / float64(*capacity) * 100 + stats = append(stats, pvcUsageStats{ + PvcName: pvcName, + Used: pvcUsedPercentage, + }) + klog.V(2).Infof("PVC usage for %s: %0.2f%%", pvcName, pvcUsedPercentage) + } else { + klog.V(2).Infof("Missing capacity or used bytes for PVC %s", pvcName) + } + } + } + } + } + + return stats, nil +} + +// compareNodeMetricConditionalsToStats compares the conditional with the collected node metrics +// and returns true if the conditional is met. 
At the moment we only support comparing PVC usage +func (a *AnalyzeNodeMetrics) compareNodeMetricConditionalsToStats(conditional string, summaries []kubeletv1alpha1.Summary) (bool, nodeMetricsComparisonResults, error) { + klog.V(2).Infof("Comparing node metrics with conditional: %s", conditional) + parts := strings.Split(strings.TrimSpace(conditional), " ") + out := nodeMetricsComparisonResults{} + + if len(parts) != 3 { + return false, out, errors.New("unable to parse conditional") + } + + switch parts[0] { + case "pvcUsedPercentage": + // e.g pvcUsedPercentage >= 50.4 + + klog.V(2).Infof("Analyzing volume usage stats for PVCs") + + op, err := ParseComparisonOperator(parts[1]) + if err != nil { + return false, out, errors.Wrap(err, "failed to parse comparison operator") + } + + expected, err := strconv.ParseFloat(parts[2], 64) + if err != nil { + return false, out, errors.Wrap(err, "failed to parse bool") + } + + // Pick all PVCs from all summaries. Filters will be applied here + pvcUsageStats, err := a.findPVCUsageStats(summaries) + if err != nil { + return false, out, errors.Wrap(err, "failed to find PVC usage stats") + } + matchedPVCs := []string{} + + for _, pvcUsage := range pvcUsageStats { + value := pvcUsage.Used + switch op { + case Equal: + if value == expected { + matchedPVCs = append(matchedPVCs, pvcUsage.PvcName) + } + case NotEqual: + if value != expected { + matchedPVCs = append(matchedPVCs, pvcUsage.PvcName) + } + case LessThan: + if value < expected { + matchedPVCs = append(matchedPVCs, pvcUsage.PvcName) + } + case GreaterThan: + if value > expected { + matchedPVCs = append(matchedPVCs, pvcUsage.PvcName) + } + case LessThanOrEqual: + if value <= expected { + matchedPVCs = append(matchedPVCs, pvcUsage.PvcName) + } + case GreaterThanOrEqual: + if value >= expected { + matchedPVCs = append(matchedPVCs, pvcUsage.PvcName) + } + } + } + + // Concatenate all matched PVC names + out.PVC = pvcTemplateData{ + Names: matchedPVCs, + ConcatenatedNames: 
strings.Join(matchedPVCs, ", "), + } + return len(matchedPVCs) > 0, out, nil + } + + return false, out, errors.New("unknown node metric conditional") +} + +func renderTemplate(tmpMsg string, data any) string { + if data == nil { + return tmpMsg + } + + t, err := template.New("msg").Parse(tmpMsg) + if err != nil { + klog.V(2).Infof("Failed to parse template: %s", err) + return tmpMsg + } + + var m bytes.Buffer + err = t.Execute(&m, data) + if err != nil { + klog.V(2).Infof("Failed to execute template: %s", err) + return tmpMsg + } + + return m.String() +} diff --git a/pkg/analyze/k8s_node_metrics_test.go b/pkg/analyze/k8s_node_metrics_test.go new file mode 100644 index 000000000..6692d9b35 --- /dev/null +++ b/pkg/analyze/k8s_node_metrics_test.go @@ -0,0 +1,288 @@ +package analyzer + +import ( + "testing" + + troubleshootv1beta2 "github.com/replicatedhq/troubleshoot/pkg/apis/troubleshoot/v1beta2" + "github.com/stretchr/testify/assert" + kubeletv1alpha1 "k8s.io/kubelet/pkg/apis/stats/v1alpha1" + utilptr "k8s.io/utils/ptr" +) + +func TestAnalyzeNodeMetrics_findPVCUsageStats(t *testing.T) { + tests := []struct { + name string + analyzer troubleshootv1beta2.NodeMetricsAnalyze + summaries []kubeletv1alpha1.Summary + want []pvcUsageStats + wantErr bool + }{ + { + name: "no summaries", + summaries: []kubeletv1alpha1.Summary{}, + want: []pvcUsageStats{}, + }, + { + name: "one summary", + summaries: []kubeletv1alpha1.Summary{ + { + Pods: []kubeletv1alpha1.PodStats{ + { + PodRef: kubeletv1alpha1.PodReference{ + Namespace: "default", + Name: "my-pod", + }, + VolumeStats: []kubeletv1alpha1.VolumeStats{ + { + Name: "volume-1", + PVCRef: &kubeletv1alpha1.PVCReference{ + Namespace: "default", + Name: "my-pvc", + }, + FsStats: kubeletv1alpha1.FsStats{ + AvailableBytes: utilptr.To(uint64(20)), + UsedBytes: utilptr.To(uint64(80)), + CapacityBytes: utilptr.To(uint64(100)), + }, + }, + }, + }, + }, + }, + }, + want: []pvcUsageStats{ + { + Used: 80, + PvcName: "default/my-pvc", + }, + }, 
+ }, + { + name: "one summary with namespace filter", + analyzer: troubleshootv1beta2.NodeMetricsAnalyze{ + Filters: troubleshootv1beta2.NodeMetricsAnalyzeFilters{ + PVC: &troubleshootv1beta2.PVCRef{ + Namespace: "another-namespace", + }, + }, + }, + summaries: []kubeletv1alpha1.Summary{ + { + Pods: []kubeletv1alpha1.PodStats{ + { + PodRef: kubeletv1alpha1.PodReference{ + Namespace: "default", + Name: "my-pod", + }, + VolumeStats: []kubeletv1alpha1.VolumeStats{ + { + Name: "volume-1", + PVCRef: &kubeletv1alpha1.PVCReference{ + Namespace: "default", + Name: "my-pvc", + }, + FsStats: kubeletv1alpha1.FsStats{ + AvailableBytes: utilptr.To(uint64(20)), + UsedBytes: utilptr.To(uint64(80)), + CapacityBytes: utilptr.To(uint64(100)), + }, + }, + }, + }, + }, + }, + }, + want: []pvcUsageStats{}, + }, + { + name: "one summary with name regex filter", + analyzer: troubleshootv1beta2.NodeMetricsAnalyze{ + Filters: troubleshootv1beta2.NodeMetricsAnalyzeFilters{ + PVC: &troubleshootv1beta2.PVCRef{ + NameRegex: ".*other.*", + }, + }, + }, + summaries: []kubeletv1alpha1.Summary{ + { + Pods: []kubeletv1alpha1.PodStats{ + { + PodRef: kubeletv1alpha1.PodReference{ + Namespace: "default", + Name: "my-pod", + }, + VolumeStats: []kubeletv1alpha1.VolumeStats{ + { + Name: "volume-1", + PVCRef: &kubeletv1alpha1.PVCReference{ + Namespace: "default", + Name: "my-pvc", + }, + FsStats: kubeletv1alpha1.FsStats{ + AvailableBytes: utilptr.To(uint64(20)), + UsedBytes: utilptr.To(uint64(80)), + CapacityBytes: utilptr.To(uint64(100)), + }, + }, + { + Name: "volume-1", + PVCRef: &kubeletv1alpha1.PVCReference{ + Namespace: "default", + Name: "my-other-pvc", + }, + FsStats: kubeletv1alpha1.FsStats{ + AvailableBytes: utilptr.To(uint64(25)), + UsedBytes: utilptr.To(uint64(75)), + CapacityBytes: utilptr.To(uint64(100)), + }, + }, + }, + }, + }, + }, + }, + want: []pvcUsageStats{ + { + Used: 75, + PvcName: "default/my-other-pvc", + }, + }, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t 
*testing.T) { + a := &AnalyzeNodeMetrics{ + analyzer: &tt.analyzer, + } + got, err := a.findPVCUsageStats(tt.summaries) + assert.Equalf(t, tt.wantErr, err != nil, "AnalyzeNodeMetrics.findPVCUsageStats() error = %v, wantErr %v", err, tt.wantErr) + assert.Equal(t, tt.want, got) + }) + } +} + +func TestAnalyzeNodeMetrics_Analyze(t *testing.T) { + tests := []struct { + name string + analyzer troubleshootv1beta2.NodeMetricsAnalyze + nodeMetrics string + want []*AnalyzeResult + wantErr bool + }{ + { + name: "no node metrics", + analyzer: troubleshootv1beta2.NodeMetricsAnalyze{ + Filters: troubleshootv1beta2.NodeMetricsAnalyzeFilters{}, + }, + nodeMetrics: "", + wantErr: true, + }, + { + name: "invalid node metrics", + analyzer: troubleshootv1beta2.NodeMetricsAnalyze{ + Filters: troubleshootv1beta2.NodeMetricsAnalyzeFilters{}, + }, + nodeMetrics: "invalid", + wantErr: true, + }, + { + name: "no summaries", + analyzer: troubleshootv1beta2.NodeMetricsAnalyze{ + Filters: troubleshootv1beta2.NodeMetricsAnalyzeFilters{}, + }, + nodeMetrics: "{}", + want: []*AnalyzeResult{}, + }, + { + name: "one summary with name regex filter", + analyzer: troubleshootv1beta2.NodeMetricsAnalyze{ + Outcomes: []*troubleshootv1beta2.Outcome{ + { + Fail: &troubleshootv1beta2.SingleOutcome{ + When: "pvcUsedPercentage >= 75", + Message: "PVC space usage is too high for pvcs [{{ .PVC.ConcatenatedNames }}]", + }, + }, + { + Pass: &troubleshootv1beta2.SingleOutcome{ + Message: "No PVCs are using more than 80% of storage", + }, + }, + }, + Filters: troubleshootv1beta2.NodeMetricsAnalyzeFilters{ + PVC: &troubleshootv1beta2.PVCRef{ + NameRegex: ".*other.*", + }, + }, + }, + nodeMetrics: `{ + "pods": [ + { + "podRef": { + "name": "my-pod", + "namespace": "my-namespace" + }, + "volume": [ + { + "capacityBytes": 100, + "usedBytes": 80, + "pvcRef": { + "name": "backup-pvc", + "namespace": "my-namespace" + } + }, + { + "capacityBytes": 100, + "usedBytes": 75, + "pvcRef": { + "name": "another-pvc", + 
"namespace": "my-namespace" + } + }, + { + "capacityBytes": 100, + "usedBytes": 80, + "pvcRef": { + "name": "the-other-pvc", + "namespace": "my-namespace" + } + }, + { + "capacityBytes": 100, + "usedBytes": 65, + "pvcRef": { + "name": "to-other-pvc", + "namespace": "my-namespace" + } + } + ] + } + ] + }`, + want: []*AnalyzeResult{ + { + Title: "Node Metrics", + IsFail: true, + Message: "PVC space usage is too high for pvcs [my-namespace/another-pvc, my-namespace/the-other-pvc]", + }, + }, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + a := &AnalyzeNodeMetrics{ + analyzer: &tt.analyzer, + } + filesFn := func(string, []string) (map[string][]byte, error) { + return map[string][]byte{ + "node-metrics.json": []byte(tt.nodeMetrics), + }, nil + } + + got, err := a.Analyze(nil, filesFn) + assert.Equalf(t, tt.wantErr, err != nil, "AnalyzeNodeMetrics.Analyze() error = %v, wantErr %v", err, tt.wantErr) + assert.Equal(t, tt.want, got) + }) + } +} diff --git a/pkg/apis/troubleshoot/v1beta2/analyzer_shared.go b/pkg/apis/troubleshoot/v1beta2/analyzer_shared.go index 0ecfd7a2e..76aba6367 100644 --- a/pkg/apis/troubleshoot/v1beta2/analyzer_shared.go +++ b/pkg/apis/troubleshoot/v1beta2/analyzer_shared.go @@ -242,6 +242,22 @@ type EventAnalyze struct { Outcomes []*Outcome `json:"outcomes" yaml:"outcomes"` } +type NodeMetricsAnalyze struct { + AnalyzeMeta `json:",inline" yaml:",inline"` + CollectorName string `json:"collectorName" yaml:"collectorName"` + Filters NodeMetricsAnalyzeFilters `json:"filters,omitempty" yaml:"filters,omitempty"` + Outcomes []*Outcome `json:"outcomes" yaml:"outcomes"` +} + +type NodeMetricsAnalyzeFilters struct { + PVC *PVCRef `json:"pvc,omitempty" yaml:"pvc,omitempty"` +} + +type PVCRef struct { + NameRegex string `json:"nameRegex,omitempty" yaml:"nameRegex,omitempty"` + Namespace string `json:"namespace,omitempty" yaml:"namespace,omitempty"` +} + type Analyze struct { ClusterVersion *ClusterVersion 
`json:"clusterVersion,omitempty" yaml:"clusterVersion,omitempty"` StorageClass *StorageClass `json:"storageClass,omitempty" yaml:"storageClass,omitempty"` @@ -275,4 +291,5 @@ type Analyze struct { Certificates *CertificatesAnalyze `json:"certificates,omitempty" yaml:"certificates,omitempty"` Goldpinger *GoldpingerAnalyze `json:"goldpinger,omitempty" yaml:"goldpinger,omitempty"` Event *EventAnalyze `json:"event,omitempty" yaml:"event,omitempty"` + NodeMetrics *NodeMetricsAnalyze `json:"nodeMetrics,omitempty" yaml:"nodeMetrics,omitempty"` } diff --git a/pkg/apis/troubleshoot/v1beta2/zz_generated.deepcopy.go b/pkg/apis/troubleshoot/v1beta2/zz_generated.deepcopy.go index c7701ffcf..38c98fc35 100644 --- a/pkg/apis/troubleshoot/v1beta2/zz_generated.deepcopy.go +++ b/pkg/apis/troubleshoot/v1beta2/zz_generated.deepcopy.go @@ -213,6 +213,11 @@ func (in *Analyze) DeepCopyInto(out *Analyze) { *out = new(EventAnalyze) (*in).DeepCopyInto(*out) } + if in.NodeMetrics != nil { + in, out := &in.NodeMetrics, &out.NodeMetrics + *out = new(NodeMetricsAnalyze) + (*in).DeepCopyInto(*out) + } } // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new Analyze. @@ -3015,6 +3020,54 @@ func (in *NodeMetrics) DeepCopy() *NodeMetrics { return out } +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *NodeMetricsAnalyze) DeepCopyInto(out *NodeMetricsAnalyze) { + *out = *in + in.AnalyzeMeta.DeepCopyInto(&out.AnalyzeMeta) + in.Filters.DeepCopyInto(&out.Filters) + if in.Outcomes != nil { + in, out := &in.Outcomes, &out.Outcomes + *out = make([]*Outcome, len(*in)) + for i := range *in { + if (*in)[i] != nil { + in, out := &(*in)[i], &(*out)[i] + *out = new(Outcome) + (*in).DeepCopyInto(*out) + } + } + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new NodeMetricsAnalyze. 
+func (in *NodeMetricsAnalyze) DeepCopy() *NodeMetricsAnalyze { + if in == nil { + return nil + } + out := new(NodeMetricsAnalyze) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *NodeMetricsAnalyzeFilters) DeepCopyInto(out *NodeMetricsAnalyzeFilters) { + *out = *in + if in.PVC != nil { + in, out := &in.PVC, &out.PVC + *out = new(PVCRef) + **out = **in + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new NodeMetricsAnalyzeFilters. +func (in *NodeMetricsAnalyzeFilters) DeepCopy() *NodeMetricsAnalyzeFilters { + if in == nil { + return nil + } + out := new(NodeMetricsAnalyzeFilters) + in.DeepCopyInto(out) + return out +} + // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *NodeResourceFilters) DeepCopyInto(out *NodeResourceFilters) { *out = *in @@ -3119,6 +3172,21 @@ func (in *Outcome) DeepCopy() *Outcome { return out } +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *PVCRef) DeepCopyInto(out *PVCRef) { + *out = *in +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new PVCRef. +func (in *PVCRef) DeepCopy() *PVCRef { + if in == nil { + return nil + } + out := new(PVCRef) + in.DeepCopyInto(out) + return out +} + // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. 
func (in *PodLaunchOptions) DeepCopyInto(out *PodLaunchOptions) { *out = *in diff --git a/pkg/preflight/flags.go b/pkg/preflight/flags.go index f5c85187a..a5e64991c 100644 --- a/pkg/preflight/flags.go +++ b/pkg/preflight/flags.go @@ -2,7 +2,7 @@ package preflight import ( flag "github.com/spf13/pflag" - utilpointer "k8s.io/utils/pointer" + utilpointer "k8s.io/utils/ptr" ) const ( @@ -35,16 +35,16 @@ var preflightFlags *PreflightFlags func NewPreflightFlags() *PreflightFlags { return &PreflightFlags{ - Interactive: utilpointer.Bool(true), - Format: utilpointer.String("human"), - CollectorImage: utilpointer.String(""), - CollectorPullPolicy: utilpointer.String(""), - CollectWithoutPermissions: utilpointer.Bool(true), - Selector: utilpointer.String(""), - SinceTime: utilpointer.String(""), - Since: utilpointer.String(""), - Output: utilpointer.String("o"), - Debug: utilpointer.Bool(false), + Interactive: utilpointer.To(true), + Format: utilpointer.To("human"), + CollectorImage: utilpointer.To(""), + CollectorPullPolicy: utilpointer.To(""), + CollectWithoutPermissions: utilpointer.To(true), + Selector: utilpointer.To(""), + SinceTime: utilpointer.To(""), + Since: utilpointer.To(""), + Output: utilpointer.To("o"), + Debug: utilpointer.To(false), } } diff --git a/schemas/analyzer-troubleshoot-v1beta2.json b/schemas/analyzer-troubleshoot-v1beta2.json index 6c1f7b8c3..b1ba8ad58 100644 --- a/schemas/analyzer-troubleshoot-v1beta2.json +++ b/schemas/analyzer-troubleshoot-v1beta2.json @@ -1565,6 +1565,99 @@ } } }, + "nodeMetrics": { + "type": "object", + "required": [ + "collectorName", + "outcomes" + ], + "properties": { + "annotations": { + "type": "object", + "additionalProperties": { + "type": "string" + } + }, + "checkName": { + "type": "string" + }, + "collectorName": { + "type": "string" + }, + "exclude": { + "oneOf": [{"type": "string"},{"type": "boolean"}] + }, + "filters": { + "type": "object", + "properties": { + "pvc": { + "type": "object", + "properties": { + 
"nameRegex": { + "type": "string" + }, + "namespace": { + "type": "string" + } + } + } + } + }, + "outcomes": { + "type": "array", + "items": { + "type": "object", + "properties": { + "fail": { + "type": "object", + "properties": { + "message": { + "type": "string" + }, + "uri": { + "type": "string" + }, + "when": { + "type": "string" + } + } + }, + "pass": { + "type": "object", + "properties": { + "message": { + "type": "string" + }, + "uri": { + "type": "string" + }, + "when": { + "type": "string" + } + } + }, + "warn": { + "type": "object", + "properties": { + "message": { + "type": "string" + }, + "uri": { + "type": "string" + }, + "when": { + "type": "string" + } + } + } + } + } + }, + "strict": { + "oneOf": [{"type": "string"},{"type": "boolean"}] + } + } + }, "nodeResources": { "type": "object", "required": [ diff --git a/schemas/preflight-troubleshoot-v1beta2.json b/schemas/preflight-troubleshoot-v1beta2.json index e6d85367e..20eebcbbc 100644 --- a/schemas/preflight-troubleshoot-v1beta2.json +++ b/schemas/preflight-troubleshoot-v1beta2.json @@ -1565,6 +1565,99 @@ } } }, + "nodeMetrics": { + "type": "object", + "required": [ + "collectorName", + "outcomes" + ], + "properties": { + "annotations": { + "type": "object", + "additionalProperties": { + "type": "string" + } + }, + "checkName": { + "type": "string" + }, + "collectorName": { + "type": "string" + }, + "exclude": { + "oneOf": [{"type": "string"},{"type": "boolean"}] + }, + "filters": { + "type": "object", + "properties": { + "pvc": { + "type": "object", + "properties": { + "nameRegex": { + "type": "string" + }, + "namespace": { + "type": "string" + } + } + } + } + }, + "outcomes": { + "type": "array", + "items": { + "type": "object", + "properties": { + "fail": { + "type": "object", + "properties": { + "message": { + "type": "string" + }, + "uri": { + "type": "string" + }, + "when": { + "type": "string" + } + } + }, + "pass": { + "type": "object", + "properties": { + "message": { + "type": "string" + 
}, + "uri": { + "type": "string" + }, + "when": { + "type": "string" + } + } + }, + "warn": { + "type": "object", + "properties": { + "message": { + "type": "string" + }, + "uri": { + "type": "string" + }, + "when": { + "type": "string" + } + } + } + } + } + }, + "strict": { + "oneOf": [{"type": "string"},{"type": "boolean"}] + } + } + }, "nodeResources": { "type": "object", "required": [ diff --git a/schemas/supportbundle-troubleshoot-v1beta2.json b/schemas/supportbundle-troubleshoot-v1beta2.json index 67b55411a..56b05bcd3 100644 --- a/schemas/supportbundle-troubleshoot-v1beta2.json +++ b/schemas/supportbundle-troubleshoot-v1beta2.json @@ -1611,6 +1611,99 @@ } } }, + "nodeMetrics": { + "type": "object", + "required": [ + "collectorName", + "outcomes" + ], + "properties": { + "annotations": { + "type": "object", + "additionalProperties": { + "type": "string" + } + }, + "checkName": { + "type": "string" + }, + "collectorName": { + "type": "string" + }, + "exclude": { + "oneOf": [{"type": "string"},{"type": "boolean"}] + }, + "filters": { + "type": "object", + "properties": { + "pvc": { + "type": "object", + "properties": { + "nameRegex": { + "type": "string" + }, + "namespace": { + "type": "string" + } + } + } + } + }, + "outcomes": { + "type": "array", + "items": { + "type": "object", + "properties": { + "fail": { + "type": "object", + "properties": { + "message": { + "type": "string" + }, + "uri": { + "type": "string" + }, + "when": { + "type": "string" + } + } + }, + "pass": { + "type": "object", + "properties": { + "message": { + "type": "string" + }, + "uri": { + "type": "string" + }, + "when": { + "type": "string" + } + } + }, + "warn": { + "type": "object", + "properties": { + "message": { + "type": "string" + }, + "uri": { + "type": "string" + }, + "when": { + "type": "string" + } + } + } + } + } + }, + "strict": { + "oneOf": [{"type": "string"},{"type": "boolean"}] + } + } + }, "nodeResources": { "type": "object", "required": [ diff --git 
a/test/e2e/support-bundle/goldpinger_collector_e2e_test.go b/test/e2e/support-bundle/goldpinger_collector_e2e_test.go index 72d9f808f..15d300be4 100644 --- a/test/e2e/support-bundle/goldpinger_collector_e2e_test.go +++ b/test/e2e/support-bundle/goldpinger_collector_e2e_test.go @@ -10,11 +10,16 @@ import ( "path/filepath" "strings" "testing" + "time" "github.com/replicatedhq/troubleshoot/internal/testutils" "github.com/replicatedhq/troubleshoot/pkg/convert" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" + v1 "k8s.io/api/core/v1" + "sigs.k8s.io/e2e-framework/klient/k8s/resources" + "sigs.k8s.io/e2e-framework/klient/wait" + "sigs.k8s.io/e2e-framework/klient/wait/conditions" "sigs.k8s.io/e2e-framework/pkg/envconf" "sigs.k8s.io/e2e-framework/pkg/features" "sigs.k8s.io/e2e-framework/third_party/helm" @@ -27,6 +32,10 @@ metadata: name: goldpinger spec: collectors: + - clusterResources: + exclude: true + - clusterInfo: + exclude: true - goldpinger: namespace: $NAMESPACE analyzers: @@ -48,6 +57,22 @@ func Test_GoldpingerCollector(t *testing.T) { helm.WithTimeout("2m"), ) require.NoError(t, err) + client, err := c.NewClient() + require.NoError(t, err) + pods := &v1.PodList{} + + // Lets wait for the goldpinger pods to be running + err = client.Resources().WithNamespace(c.Namespace()).List(ctx, pods, + resources.WithLabelSelector("app.kubernetes.io/name=goldpinger"), + ) + require.NoError(t, err) + require.Len(t, pods.Items, 1) + + err = wait.For( + conditions.New(client.Resources()).PodRunning(&pods.Items[0]), + wait.WithTimeout(time.Second*30), + ) + require.NoError(t, err) return ctx }). Assess("collect and analyse goldpinger pings", func(ctx context.Context, t *testing.T, c *envconf.Config) context.Context { @@ -83,13 +108,13 @@ func Test_GoldpingerCollector(t *testing.T) { // Check that we analysed collected goldpinger results. // We should expect a single analysis result for goldpinger. 
assert.Equal(t, 1, len(analysisResults)) + assert.True(t, strings.HasPrefix(analysisResults[0].Name, "missing.ping.results.for.goldpinger.")) if t.Failed() { t.Logf("Analysis results: %s\n", analysisJSON) t.Logf("Stdout: %s\n", out.String()) t.Logf("Stderr: %s\n", stdErr.String()) t.FailNow() } - assert.True(t, strings.HasPrefix(analysisResults[0].Name, "missing.ping.results.for.goldpinger.")) return ctx }).