diff --git a/go.mod b/go.mod
index 6ceb22adb..2666ce075 100644
--- a/go.mod
+++ b/go.mod
@@ -86,6 +86,7 @@ require (
 	github.com/mistifyio/go-zfs/v3 v3.0.0 // indirect
 	github.com/mitchellh/copystructure v1.2.0 // indirect
 	github.com/mitchellh/reflectwalk v1.0.2 // indirect
+	github.com/onsi/ginkgo v1.14.0 // indirect
 	github.com/power-devops/perfstat v0.0.0-20210106213030-5aafc221ea8c // indirect
 	github.com/rubenv/sql-migrate v1.3.1 // indirect
 	github.com/russross/blackfriday/v2 v2.1.0 // indirect
@@ -118,7 +119,7 @@ require (
 	github.com/Microsoft/go-winio v0.6.0 // indirect
 	github.com/Microsoft/hcsshim v0.10.0-rc.7 // indirect
 	github.com/andybalholm/brotli v1.0.1 // indirect
-	github.com/aws/aws-sdk-go v1.44.122 // indirect
+	github.com/aws/aws-sdk-go v1.44.198 // indirect
 	github.com/beorn7/perks v1.0.1 // indirect
 	github.com/bgentry/go-netrc v0.0.0-20140422174119-9fd32a8b3d3d // indirect
 	github.com/c9s/goprocinfo v0.0.0-20170724085704-0010a05ce49f // indirect
@@ -221,7 +222,7 @@ require (
 	go.opencensus.io v0.24.0 // indirect
 	go.starlark.net v0.0.0-20230525235612-a134d8f9ddca // indirect
 	golang.org/x/crypto v0.12.0 // indirect
-	golang.org/x/net v0.14.0 // indirect
+	golang.org/x/net v0.14.0
 	golang.org/x/oauth2 v0.8.0 // indirect
 	golang.org/x/sys v0.12.0 // indirect
 	golang.org/x/term v0.11.0 // indirect
diff --git a/go.sum b/go.sum
index 03abc7bad..385ab102f 100644
--- a/go.sum
+++ b/go.sum
@@ -244,8 +244,9 @@ github.com/armon/go-radix v1.0.0/go.mod h1:ufUuZ+zHj4x4TnLV4JWEpy2hxWSpsRywHrMgI
 github.com/armon/go-socks5 v0.0.0-20160902184237-e75332964ef5 h1:0CwZNZbxp69SHPdPJAN/hZIm0C4OItdklCFmMRWYpio=
 github.com/asaskevich/govalidator v0.0.0-20230301143203-a9d515a09cc2 h1:DklsrG3dyBCFEj5IhUbnKptjxatkF07cF2ak3yi77so=
 github.com/asaskevich/govalidator v0.0.0-20230301143203-a9d515a09cc2/go.mod h1:WaHUgvxTVq04UNunO+XhnAqY/wQc+bxr74GqbsZ/Jqw=
-github.com/aws/aws-sdk-go v1.44.122 h1:p6mw01WBaNpbdP2xrisz5tIkcNwzj/HysobNoaAHjgo=
 github.com/aws/aws-sdk-go v1.44.122/go.mod h1:y4AeaBuwd2Lk+GepC1E9v0qOiTws0MIWAX4oIKwKHZo=
+github.com/aws/aws-sdk-go v1.44.198 h1:kgnvxQv4/kP5M0nbxBx0Ac0so9ndr9f8Ti0g+NmPQF8=
+github.com/aws/aws-sdk-go v1.44.198/go.mod h1:aVsgQcEevwlmQ7qHE9I3h+dtQgpqhFB+i8Phjh7fkwI=
 github.com/beorn7/perks v0.0.0-20180321164747-3a771d992973/go.mod h1:Dwedo/Wpr24TaqPxmxbtue+5NUziq4I4S80YR8gNf3Q=
 github.com/beorn7/perks v1.0.0/go.mod h1:KWe93zE9D1o94FZ5RNwFwVgaQK1VOXiVxmqh+CedLV8=
 github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM=
@@ -609,7 +610,6 @@ github.com/hashicorp/logutils v1.0.0/go.mod h1:QIAnNjmIWmVIIkWDTG1z5v++HQmx9WQRO
 github.com/hashicorp/mdns v1.0.0/go.mod h1:tL+uN++7HEJ6SQLQ2/p+z2pH24WQKWjBPkE0mNTz8vQ=
 github.com/hashicorp/memberlist v0.1.3/go.mod h1:ajVTdAv/9Im8oMAAj5G31PhhMCZJV2pPBoIllUwCN7I=
 github.com/hashicorp/serf v0.8.2/go.mod h1:6hOLApaqBFA1NXqRQAsxw9QxuDEvNxSQRwA/JwenrHc=
-github.com/hpcloud/tail v1.0.0 h1:nfCOvKYfkgYP8hkirhJocXT2+zOD8yUNjXaWfTlyFKI=
 github.com/hpcloud/tail v1.0.0/go.mod h1:ab1qPbhIpdTxEkNHXyeSf5vhxWSCs/tWer42PpOxQnU=
 github.com/huandu/xstrings v1.3.1/go.mod h1:y5/lhBue+AyNmUVz9RLU9xbLR0o4KIIExikq4ovT0aE=
 github.com/huandu/xstrings v1.3.2/go.mod h1:y5/lhBue+AyNmUVz9RLU9xbLR0o4KIIExikq4ovT0aE=
@@ -800,13 +800,19 @@ github.com/nsf/termbox-go v0.0.0-20190121233118-02980233997d/go.mod h1:IuKpRQcYE
 github.com/nwaples/rardecode v1.1.0/go.mod h1:5DzqNKiOdpKKBH87u8VlvAnPZMXcGRhxWkRpHbbfGS0=
 github.com/nwaples/rardecode v1.1.2 h1:Cj0yZY6T1Zx1R7AhTbyGSALm44/Mmq+BAPc4B/p/d3M=
 github.com/nwaples/rardecode v1.1.2/go.mod h1:5DzqNKiOdpKKBH87u8VlvAnPZMXcGRhxWkRpHbbfGS0=
+github.com/nxadm/tail v1.4.4 h1:DQuhQpB1tVlglWS2hLQ5OV6B5r8aGxSrPc5Qo6uTN78=
+github.com/nxadm/tail v1.4.4/go.mod h1:kenIhsEOeOJmVchQTgglprH7qJGnHDVpk1VPCcaMI8A=
 github.com/oklog/ulid v1.3.1/go.mod h1:CirwcVhetQ6Lv90oh/F+FBtV6XMibvdAFo93nm5qn4U=
 github.com/olekukonko/tablewriter v0.0.5/go.mod h1:hPp6KlRPjbx+hW8ykQs1w3UBbZlj6HuIJcUGPhkA7kY=
 github.com/onsi/ginkgo v1.6.0/go.mod h1:lLunBs/Ym6LB5Z9jYTR76FiuTmxDTDusOGeTQH+WWjE=
-github.com/onsi/ginkgo v1.10.1 h1:q/mM8GF/n0shIN8SaAZ0V+jnLPzen6WIVZdiwrRlMlo=
 github.com/onsi/ginkgo v1.10.1/go.mod h1:lLunBs/Ym6LB5Z9jYTR76FiuTmxDTDusOGeTQH+WWjE=
+github.com/onsi/ginkgo v1.12.1/go.mod h1:zj2OWP4+oCPe1qIXoGWkgMRwljMUYCdkwsT2108oapk=
+github.com/onsi/ginkgo v1.14.0 h1:2mOpI4JVVPBN+WQRa0WKH2eXR+Ey+uK4n7Zj0aYpIQA=
+github.com/onsi/ginkgo v1.14.0/go.mod h1:iSB4RoI2tjJc9BBv4NKIKWKya62Rps+oPG/Lv9klQyY=
 github.com/onsi/ginkgo/v2 v2.11.0 h1:WgqUCUt/lT6yXoQ8Wef0fsNn5cAuMK7+KT9UFRz2tcU=
 github.com/onsi/gomega v1.7.0/go.mod h1:ex+gbHU/CVuBBDIJjb2X0qEXbFg53c61hWP/1CpauHY=
+github.com/onsi/gomega v1.7.1/go.mod h1:XdKZgCCFLUoM/7CFJVPcG8C1xQ1AJ0vpAezJrB7JYyY=
+github.com/onsi/gomega v1.10.1/go.mod h1:iN09h71vgCQne3DLsj+A5owkum+a2tYe+TOCB1ybHNo=
 github.com/onsi/gomega v1.27.10 h1:naR28SdDFlqrG6kScpT8VWpu1xWY5nJRCF3XaYyBjhI=
 github.com/opencontainers/go-digest v1.0.0 h1:apOUWs51W5PlhuyGyz9FCeeBIOUDA/6nW8Oi/yOhh5U=
 github.com/opencontainers/go-digest v1.0.0/go.mod h1:0JzlMkj0TRzQZfJkVvzbP0HBR3IKzErnv2BNG4W4MAM=
@@ -1138,6 +1144,7 @@ golang.org/x/net v0.0.0-20200324143707-d3edc9973b7e/go.mod h1:qpuaurCH72eLCgpAm/
 golang.org/x/net v0.0.0-20200501053045-e0ff5e5a1de5/go.mod h1:qpuaurCH72eLCgpAm/N6yyVIVM9cpaDIP3A8BGJEC5A=
 golang.org/x/net v0.0.0-20200506145744-7e3656a0809f/go.mod h1:qpuaurCH72eLCgpAm/N6yyVIVM9cpaDIP3A8BGJEC5A=
 golang.org/x/net v0.0.0-20200513185701-a91f0712d120/go.mod h1:qpuaurCH72eLCgpAm/N6yyVIVM9cpaDIP3A8BGJEC5A=
+golang.org/x/net v0.0.0-20200520004742-59133d7f0dd7/go.mod h1:qpuaurCH72eLCgpAm/N6yyVIVM9cpaDIP3A8BGJEC5A=
 golang.org/x/net v0.0.0-20200520182314-0ba52f642ac2/go.mod h1:qpuaurCH72eLCgpAm/N6yyVIVM9cpaDIP3A8BGJEC5A=
 golang.org/x/net v0.0.0-20200625001655-4c5254603344/go.mod h1:/O7V0waA8r7cgGh81Ro3o1hOxt32SMVPicZroKQ2sZA=
 golang.org/x/net v0.0.0-20200707034311-ab3426394381/go.mod h1:/O7V0waA8r7cgGh81Ro3o1hOxt32SMVPicZroKQ2sZA=
@@ -1234,12 +1241,14 @@ golang.org/x/sys v0.0.0-20190606203320-7fc4e5ec1444/go.mod h1:h1NjWce9XRLGQEsW7w
 golang.org/x/sys v0.0.0-20190624142023-c5567b49c5d0/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
 golang.org/x/sys v0.0.0-20190726091711-fc99dfbffb4e/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
 golang.org/x/sys v0.0.0-20190801041406-cbf593c0f2f3/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
+golang.org/x/sys v0.0.0-20190904154756-749cb33beabd/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
 golang.org/x/sys v0.0.0-20190916202348-b4ddaad3f8a3/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
 golang.org/x/sys v0.0.0-20191001151750-bb3f8db39f24/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
 golang.org/x/sys v0.0.0-20191005200804-aed5e4c7ecf9/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
 golang.org/x/sys v0.0.0-20191010194322-b09406accb47/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
 golang.org/x/sys v0.0.0-20191026070338-33540a1f6037/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
 golang.org/x/sys v0.0.0-20191115151921-52ab43148777/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
+golang.org/x/sys v0.0.0-20191120155948-bd437916bb0e/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
 golang.org/x/sys v0.0.0-20191204072324-ce4227a45e2e/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
 golang.org/x/sys v0.0.0-20191228213918-04cbcbbfeed8/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
 golang.org/x/sys v0.0.0-20200113162924-86b910548bc1/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
@@ -1254,6 +1263,7 @@ golang.org/x/sys v0.0.0-20200331124033-c3d80250170d/go.mod h1:h1NjWce9XRLGQEsW7w
 golang.org/x/sys v0.0.0-20200501052902-10377860bb8e/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
 golang.org/x/sys v0.0.0-20200511232937-7e40ca221e25/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
 golang.org/x/sys v0.0.0-20200515095857-1151b9dac4a9/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
+golang.org/x/sys v0.0.0-20200519105757-fe76b779f299/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
 golang.org/x/sys v0.0.0-20200523222454-059865788121/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
 golang.org/x/sys v0.0.0-20200803210538-64077c9b5642/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
 golang.org/x/sys v0.0.0-20200905004654-be1d3432aa8f/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
@@ -1648,7 +1658,6 @@ gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c h1:Hei/4ADfdWqJk1ZMxUNpqntN
 gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c/go.mod h1:JHkPIbrfpd72SG/EVd6muEfDQjcINNoR0C8j2r3qZ4Q=
 gopkg.in/cheggaaa/pb.v1 v1.0.27/go.mod h1:V/YB90LKu/1FcN3WVnfiiE5oMCibMjukxqG/qStrOgw=
 gopkg.in/errgo.v2 v2.1.0/go.mod h1:hNsd1EY+bozCKY1Ytp96fpM3vjJbqLJn88ws8XvfDNI=
-gopkg.in/fsnotify.v1 v1.4.7 h1:xOHLXZwVvI9hhs+cLKq5+I5onOuwQLhQwiu63xxlHs4=
 gopkg.in/fsnotify.v1 v1.4.7/go.mod h1:Tz8NjZHkW78fSQdbUxIjBTcgA1z1m8ZHf0WmKUhAMys=
 gopkg.in/inf.v0 v0.9.1 h1:73M5CoZyi3ZLMOyDlQh031Cx6N9NDJ2Vvfl76EDAgDc=
 gopkg.in/inf.v0 v0.9.1/go.mod h1:cWUDdTG/fYaXco+Dcufb5Vnc6Gp2YChqWtbxRZE0mXw=
diff --git a/pkg/analyze/host_filesystem_performance.go b/pkg/analyze/host_filesystem_performance.go
index dbccd2636..cb38b8f82 100644
--- a/pkg/analyze/host_filesystem_performance.go
+++ b/pkg/analyze/host_filesystem_performance.go
@@ -42,7 +42,17 @@ func (a *AnalyzeHostFilesystemPerformance) Analyze(
 		return nil, errors.Wrapf(err, "failed to get collected file %s", name)
 	}
 
-	fsPerf := collect.FSPerfResults{}
-	if err := json.Unmarshal(contents, &fsPerf); err != nil {
-		return nil, errors.Wrapf(err, "failed to unmarshal filesystem performance results from %s", name)
-	}
+	fioResult := collect.FioResult{}
+	if err := json.Unmarshal(contents, &fioResult); err != nil {
+		return nil, errors.Wrapf(err, "failed to unmarshal fio results from %s", name)
+	}
+
+	if len(fioResult.Jobs) == 0 {
+		return nil, errors.Errorf("no jobs found in fio results from %s", name)
+	}
+
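+	// The collector runs fio with a single job; its fdatasync latency
+	// percentiles are reported under jobs[0].sync (in nanoseconds).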
+	fioWriteLatency := fioResult.Jobs[0].Sync
+
+	fsPerf := fioWriteLatency.FSPerfResults()
@@ -179,7 +189,7 @@
 		return doCompareHostFilesystemPerformance(comparator, fsPerf.P9999, desiredDuration)
 	}
 
-	return false, fmt.Errorf("Unknown filesystem performance keyword %q", keyword)
+	return false, fmt.Errorf("unknown filesystem performance keyword %q", keyword)
 }
 
 func doCompareHostFilesystemPerformance(operator string, actual time.Duration, desired time.Duration) (bool, error) {
@@ -196,7 +206,7 @@
 		return actual == desired, nil
 	}
 
-	return false, fmt.Errorf("Unknown filesystem performance operator %q", operator)
+	return false, fmt.Errorf("unknown filesystem performance operator %q", operator)
 }
 
 func renderFSPerfOutcome(outcome string, fsPerf collect.FSPerfResults) string {
diff --git a/pkg/analyze/host_filesystem_performance_test.go b/pkg/analyze/host_filesystem_performance_test.go
index 577287597..52ab54920 100644
--- a/pkg/analyze/host_filesystem_performance_test.go
+++ b/pkg/analyze/host_filesystem_performance_test.go
@@ -1,12 +1,9 @@
 package analyzer
 
 import (
-	"encoding/json"
 	"testing"
-	"time"
 
 	troubleshootv1beta2 "github.com/replicatedhq/troubleshoot/pkg/apis/troubleshoot/v1beta2"
-	"github.com/replicatedhq/troubleshoot/pkg/collect"
 	"github.com/stretchr/testify/assert"
 	"github.com/stretchr/testify/require"
 )
@@ -14,35 +11,305 @@
 func TestAnalyzeHostFilesystemPerformance(t *testing.T) {
 	tests := []struct {
 		name         string
-		fsPerf       *collect.FSPerfResults
+		fioResult    string
 		hostAnalyzer *troubleshootv1beta2.FilesystemPerformanceAnalyze
 		result       []*AnalyzeResult
 		expectErr    bool
 	}{
 		{
 			name: "Cover",
-			fsPerf: &collect.FSPerfResults{
-				Min:     200 * time.Nanosecond,
-				Max:     time.Second,
-				Average: 55 * time.Microsecond,
-				P1:      1 * time.Microsecond,
-				P5:      5 * time.Microsecond,
-				P10:     10 * time.Microsecond,
-				P20:     20 * time.Microsecond,
-				P30:     30 * time.Microsecond,
-				P40:     40 * time.Microsecond,
-				P50:     50 * time.Microsecond,
-				P60:     60 * time.Microsecond,
-				P70:     70 * time.Microsecond,
-				P80:     80 * time.Microsecond,
-				P90:     90 * time.Microsecond,
-				P95:     95 * time.Microsecond,
-				P99:     99 * time.Microsecond,
-				P995:    995 * time.Microsecond,
-				P999:    999 * time.Microsecond,
-				P9995:   5 * time.Millisecond,
-				P9999:   9 * time.Millisecond,
-			},
+			fioResult: `{
+  "fio version" : "fio-3.28",
+  "timestamp" : 1691679955,
+  "timestamp_ms" : 1691679955590,
+  "time" : "Thu Aug 10 15:05:55 2023",
+  "global options" : {
+    "rw" : "write",
+    "ioengine" : "sync",
+    "fdatasync" : "1",
+    "directory" : "/var/lib/etcd",
+    "size" : "23068672",
+    "bs" : "1024"
+  },
+  "jobs" : [
+    {
+      "jobname" : "fsperf",
+      "groupid" : 0,
+      "error" : 0,
+      "eta" : 0,
+      "elapsed" : 15,
+      "job options" : {
+        "name" : "fsperf",
+        "runtime" : "120"
+      },
+      "read" : {
+        "io_bytes" : 0,
+        "io_kbytes" : 0,
+        "bw_bytes" : 0,
+        "bw" : 0,
+        "iops" : 0.000000,
+        "runtime" : 0,
+        "total_ios" : 0,
+        "short_ios" : 22527,
+        "drop_ios" : 0,
+        "slat_ns" : {
+          "min" : 0,
+          "max" : 0,
+          "mean" : 0.000000,
+          "stddev" : 0.000000,
+          "N" : 0
+        },
+        "clat_ns" : {
+          "min" : 0,
+          "max" : 0,
+          "mean" : 0.000000,
+          "stddev" : 0.000000,
+          "N" : 0
+        },
+        "lat_ns" : {
+          "min" : 0,
+          "max" : 0,
+          "mean" : 0.000000,
+          "stddev" : 0.000000,
+          "N" : 0
+        },
+        "bw_min" : 0,
+        "bw_max" : 0,
+        "bw_agg" : 0.000000,
+        "bw_mean" : 0.000000,
+        "bw_dev" : 0.000000,
+        "bw_samples" : 0,
+        "iops_min" : 0,
+        "iops_max" : 0,
+        "iops_mean" : 0.000000,
+        "iops_stddev" : 0.000000,
+        "iops_samples" : 0
+      },
+      "write" : {
+        "io_bytes" : 23068672,
+        "io_kbytes" : 22528,
+        "bw_bytes" : 1651182,
+        "bw" : 1612,
+        "iops" : 1612.483001,
+        "runtime" : 13971,
+        "total_ios" : 22528,
+        "short_ios" : 0,
+        "drop_ios" : 0,
+        "slat_ns" : {
+          "min" : 0,
+          "max" : 0,
+          "mean" : 0.000000,
+          "stddev" : 0.000000,
+          "N" : 0
+        },
+        "clat_ns" : {
+          "min" : 200,
+          "max" : 1000000000,
+          "mean" : 55000,
+          "stddev" : 12345.6789,
+          "N" : 32400,
+          "percentile" : {
+            "1.000000" : 1000,
+            "5.000000" : 5000,
+            "10.000000" : 10000,
+            "20.000000" : 20000,
+            "30.000000" : 30000,
+            "40.000000" : 40000,
+            "50.000000" : 50000,
+            "60.000000" : 60000,
+            "70.000000" : 70000,
+            "80.000000" : 80000,
"90.000000" : 90000, + "95.000000" : 95000, + "99.000000" : 99000, + "99.500000" : 995000, + "99.900000" : 999000, + "99.950000" : 5000000, + "99.990000" : 9000000 + } + }, + "lat_ns" : { + "min" : 2684, + "max" : 8710446, + "mean" : 95169.335405, + "stddev" : 172145.383902, + "N" : 22528 + }, + "bw_min" : 1516, + "bw_max" : 1706, + "bw_agg" : 100.000000, + "bw_mean" : 1613.629630, + "bw_dev" : 35.708379, + "bw_samples" : 27, + "iops_min" : 1516, + "iops_max" : 1706, + "iops_mean" : 1613.629630, + "iops_stddev" : 35.708379, + "iops_samples" : 27 + }, + "trim" : { + "io_bytes" : 0, + "io_kbytes" : 0, + "bw_bytes" : 0, + "bw" : 0, + "iops" : 0.000000, + "runtime" : 0, + "total_ios" : 0, + "short_ios" : 0, + "drop_ios" : 0, + "slat_ns" : { + "min" : 0, + "max" : 0, + "mean" : 0.000000, + "stddev" : 0.000000, + "N" : 0 + }, + "clat_ns" : { + "min" : 0, + "max" : 0, + "mean" : 0.000000, + "stddev" : 0.000000, + "N" : 0 + }, + "lat_ns" : { + "min" : 0, + "max" : 0, + "mean" : 0.000000, + "stddev" : 0.000000, + "N" : 0 + }, + "bw_min" : 0, + "bw_max" : 0, + "bw_agg" : 0.000000, + "bw_mean" : 0.000000, + "bw_dev" : 0.000000, + "bw_samples" : 0, + "iops_min" : 0, + "iops_max" : 0, + "iops_mean" : 0.000000, + "iops_stddev" : 0.000000, + "iops_samples" : 0 + }, + "sync" : { + "total_ios" : 0, + "lat_ns" : { + "min" : 200, + "max" : 1000000000, + "mean" : 55000, + "stddev" : 12345.6789, + "N" : 32400, + "percentile" : { + "1.000000" : 1000, + "5.000000" : 5000, + "10.000000" : 10000, + "20.000000" : 20000, + "30.000000" : 30000, + "40.000000" : 40000, + "50.000000" : 50000, + "60.000000" : 60000, + "70.000000" : 70000, + "80.000000" : 80000, + "90.000000" : 90000, + "95.000000" : 95000, + "99.000000" : 99000, + "99.500000" : 995000, + "99.900000" : 999000, + "99.950000" : 5000000, + "99.990000" : 9000000 + } + } + }, + "job_runtime" : 13970, + "usr_cpu" : 1.410165, + "sys_cpu" : 5.454545, + "ctx" : 72137, + "majf" : 0, + "minf" : 16, + "iodepth_level" : { + "1" : 199.995561, + "2" : 0.000000, + "4" : 0.000000, + "8" : 0.000000, + "16" : 0.000000, + "32" : 0.000000, + ">=64" : 0.000000 + }, + "iodepth_submit" : { + "0" : 0.000000, + "4" : 100.000000, + "8" : 0.000000, + "16" : 0.000000, + "32" : 0.000000, + "64" : 0.000000, + ">=64" : 0.000000 + }, + "iodepth_complete" : { + "0" : 0.000000, + "4" : 100.000000, + "8" : 0.000000, + "16" : 0.000000, + "32" : 0.000000, + "64" : 0.000000, + ">=64" : 0.000000 + }, + "latency_ns" : { + "2" : 0.000000, + "4" : 0.000000, + "10" : 0.000000, + "20" : 0.000000, + "50" : 0.000000, + "100" : 0.000000, + "250" : 0.000000, + "500" : 0.000000, + "750" : 0.000000, + "1000" : 0.000000 + }, + "latency_us" : { + "2" : 0.000000, + "4" : 27.077415, + "10" : 42.032138, + "20" : 5.450994, + "50" : 0.306286, + "100" : 0.026634, + "250" : 0.461648, + "500" : 23.291016, + "750" : 1.269531, + "1000" : 0.035511 + }, + "latency_ms" : { + "2" : 0.026634, + "4" : 0.017756, + "10" : 0.010000, + "20" : 0.000000, + "50" : 0.000000, + "100" : 0.000000, + "250" : 0.000000, + "500" : 0.000000, + "750" : 0.000000, + "1000" : 0.000000, + "2000" : 0.000000, + ">=2000" : 0.000000 + }, + "latency_depth" : 1, + "latency_target" : 0, + "latency_percentile" : 100.000000, + "latency_window" : 0 + } + ], + "disk_util" : [ + { + "name" : "sda", + "read_ios" : 5610, + "write_ios" : 45550, + "read_merges" : 0, + "write_merges" : 568, + "read_ticks" : 1863, + "write_ticks" : 11605, + "in_queue" : 14353, + "util" : 99.435028 + } + ] + }`, hostAnalyzer: &troubleshootv1beta2.FilesystemPerformanceAnalyze{ 
 				CollectorName: "etcd",
 				Outcomes: []*troubleshootv1beta2.Outcome{
@@ -298,9 +565,298 @@
 		},
 		{
 			name: "skip warn if pass first",
-			fsPerf: &collect.FSPerfResults{
-				P99: 9 * time.Millisecond,
-			},
+			fioResult: `{
+  "fio version" : "fio-3.28",
+  "timestamp" : 1691679955,
+  "timestamp_ms" : 1691679955590,
+  "time" : "Thu Aug 10 15:05:55 2023",
+  "global options" : {
+    "rw" : "write",
+    "ioengine" : "sync",
+    "fdatasync" : "1",
+    "directory" : "/var/lib/etcd",
+    "size" : "23068672",
+    "bs" : "1024"
+  },
+  "jobs" : [
+    {
+      "jobname" : "fsperf",
+      "groupid" : 0,
+      "error" : 0,
+      "eta" : 0,
+      "elapsed" : 15,
+      "job options" : {
+        "name" : "fsperf",
+        "runtime" : "120"
+      },
+      "read" : {
+        "io_bytes" : 0,
+        "io_kbytes" : 0,
+        "bw_bytes" : 0,
+        "bw" : 0,
+        "iops" : 0.000000,
+        "runtime" : 0,
+        "total_ios" : 0,
+        "short_ios" : 22527,
+        "drop_ios" : 0,
+        "slat_ns" : {
+          "min" : 0,
+          "max" : 0,
+          "mean" : 0.000000,
+          "stddev" : 0.000000,
+          "N" : 0
+        },
+        "clat_ns" : {
+          "min" : 0,
+          "max" : 0,
+          "mean" : 0.000000,
+          "stddev" : 0.000000,
+          "N" : 0
+        },
+        "lat_ns" : {
+          "min" : 0,
+          "max" : 0,
+          "mean" : 0.000000,
+          "stddev" : 0.000000,
+          "N" : 0
+        },
+        "bw_min" : 0,
+        "bw_max" : 0,
+        "bw_agg" : 0.000000,
+        "bw_mean" : 0.000000,
+        "bw_dev" : 0.000000,
+        "bw_samples" : 0,
+        "iops_min" : 0,
+        "iops_max" : 0,
+        "iops_mean" : 0.000000,
+        "iops_stddev" : 0.000000,
+        "iops_samples" : 0
+      },
+      "write" : {
+        "io_bytes" : 23068672,
+        "io_kbytes" : 22528,
+        "bw_bytes" : 1651182,
+        "bw" : 1612,
+        "iops" : 1612.483001,
+        "runtime" : 13971,
+        "total_ios" : 22528,
+        "short_ios" : 0,
+        "drop_ios" : 0,
+        "slat_ns" : {
+          "min" : 0,
+          "max" : 0,
+          "mean" : 0.000000,
+          "stddev" : 0.000000,
+          "N" : 0
+        },
+        "clat_ns" : {
+          "min" : 200,
+          "max" : 1000000000,
+          "mean" : 55000,
+          "stddev" : 12345.6789,
+          "N" : 32400,
+          "percentile" : {
+            "1.000000" : 1000,
+            "5.000000" : 5000,
+            "10.000000" : 10000,
+            "20.000000" : 20000,
+            "30.000000" : 30000,
+            "40.000000" : 40000,
+            "50.000000" : 50000,
+            "60.000000" : 60000,
+            "70.000000" : 70000,
+            "80.000000" : 80000,
+            "90.000000" : 90000,
+            "95.000000" : 95000,
+            "99.000000" : 99000,
+            "99.500000" : 995000,
+            "99.900000" : 999000,
+            "99.950000" : 5000000,
+            "99.990000" : 9000000
+          }
+        },
+        "lat_ns" : {
+          "min" : 2684,
+          "max" : 8710446,
+          "mean" : 95169.335405,
+          "stddev" : 172145.383902,
+          "N" : 22528
+        },
+        "bw_min" : 1516,
+        "bw_max" : 1706,
+        "bw_agg" : 100.000000,
+        "bw_mean" : 1613.629630,
+        "bw_dev" : 35.708379,
+        "bw_samples" : 27,
+        "iops_min" : 1516,
+        "iops_max" : 1706,
+        "iops_mean" : 1613.629630,
+        "iops_stddev" : 35.708379,
+        "iops_samples" : 27
+      },
+      "trim" : {
+        "io_bytes" : 0,
+        "io_kbytes" : 0,
+        "bw_bytes" : 0,
+        "bw" : 0,
+        "iops" : 0.000000,
+        "runtime" : 0,
+        "total_ios" : 0,
+        "short_ios" : 0,
+        "drop_ios" : 0,
+        "slat_ns" : {
+          "min" : 0,
+          "max" : 0,
+          "mean" : 0.000000,
+          "stddev" : 0.000000,
+          "N" : 0
+        },
+        "clat_ns" : {
+          "min" : 0,
+          "max" : 0,
+          "mean" : 0.000000,
+          "stddev" : 0.000000,
+          "N" : 0
+        },
+        "lat_ns" : {
+          "min" : 0,
+          "max" : 0,
+          "mean" : 0.000000,
+          "stddev" : 0.000000,
+          "N" : 0
+        },
+        "bw_min" : 0,
+        "bw_max" : 0,
+        "bw_agg" : 0.000000,
+        "bw_mean" : 0.000000,
+        "bw_dev" : 0.000000,
+        "bw_samples" : 0,
+        "iops_min" : 0,
+        "iops_max" : 0,
+        "iops_mean" : 0.000000,
+        "iops_stddev" : 0.000000,
+        "iops_samples" : 0
+      },
+      "sync" : {
+        "total_ios" : 0,
+        "lat_ns" : {
+          "min" : 200,
+          "max" : 1000000000,
+          "mean" : 55000,
+          "stddev" : 12345.6789,
+          "N" : 32400,
+          "percentile" : {
"1.000000" : 1000, + "5.000000" : 5000, + "10.000000" : 10000, + "20.000000" : 20000, + "30.000000" : 30000, + "40.000000" : 40000, + "50.000000" : 50000, + "60.000000" : 60000, + "70.000000" : 70000, + "80.000000" : 80000, + "90.000000" : 90000, + "95.000000" : 95000, + "99.000000" : 9000000, + "99.500000" : 995000, + "99.900000" : 999000, + "99.950000" : 5000000, + "99.990000" : 9000000 + } + } + }, + "job_runtime" : 13970, + "usr_cpu" : 1.410165, + "sys_cpu" : 5.454545, + "ctx" : 72137, + "majf" : 0, + "minf" : 16, + "iodepth_level" : { + "1" : 199.995561, + "2" : 0.000000, + "4" : 0.000000, + "8" : 0.000000, + "16" : 0.000000, + "32" : 0.000000, + ">=64" : 0.000000 + }, + "iodepth_submit" : { + "0" : 0.000000, + "4" : 100.000000, + "8" : 0.000000, + "16" : 0.000000, + "32" : 0.000000, + "64" : 0.000000, + ">=64" : 0.000000 + }, + "iodepth_complete" : { + "0" : 0.000000, + "4" : 100.000000, + "8" : 0.000000, + "16" : 0.000000, + "32" : 0.000000, + "64" : 0.000000, + ">=64" : 0.000000 + }, + "latency_ns" : { + "2" : 0.000000, + "4" : 0.000000, + "10" : 0.000000, + "20" : 0.000000, + "50" : 0.000000, + "100" : 0.000000, + "250" : 0.000000, + "500" : 0.000000, + "750" : 0.000000, + "1000" : 0.000000 + }, + "latency_us" : { + "2" : 0.000000, + "4" : 27.077415, + "10" : 42.032138, + "20" : 5.450994, + "50" : 0.306286, + "100" : 0.026634, + "250" : 0.461648, + "500" : 23.291016, + "750" : 1.269531, + "1000" : 0.035511 + }, + "latency_ms" : { + "2" : 0.026634, + "4" : 0.017756, + "10" : 0.010000, + "20" : 0.000000, + "50" : 0.000000, + "100" : 0.000000, + "250" : 0.000000, + "500" : 0.000000, + "750" : 0.000000, + "1000" : 0.000000, + "2000" : 0.000000, + ">=2000" : 0.000000 + }, + "latency_depth" : 1, + "latency_target" : 0, + "latency_percentile" : 100.000000, + "latency_window" : 0 + } + ], + "disk_util" : [ + { + "name" : "sda", + "read_ios" : 5610, + "write_ios" : 45550, + "read_merges" : 0, + "write_merges" : 568, + "read_ticks" : 1863, + "write_ticks" : 11605, + "in_queue" : 14353, + "util" : 99.435028 + } + ] + }`, hostAnalyzer: &troubleshootv1beta2.FilesystemPerformanceAnalyze{ CollectorName: "file system performance", Outcomes: []*troubleshootv1beta2.Outcome{ @@ -332,20 +888,66 @@ func TestAnalyzeHostFilesystemPerformance(t *testing.T) { }, }, }, + { + name: "bail if malformed JSON", + fioResult: `{ + bad JSON + }`, + hostAnalyzer: &troubleshootv1beta2.FilesystemPerformanceAnalyze{ + CollectorName: "file system performance", + Outcomes: []*troubleshootv1beta2.Outcome{ + { + Fail: &troubleshootv1beta2.SingleOutcome{ + Message: "bad JSON should not be analyzed", + }, + }, + }, + }, + expectErr: true, + }, + { + name: "bail if fio ran no jobs", + fioResult: `{ + "fio version" : "fio-3.28", + "timestamp" : 1691679955, + "timestamp_ms" : 1691679955590, + "time" : "Thu Aug 10 15:05:55 2023", + "global options" : { + "rw" : "write", + "ioengine" : "sync", + "fdatasync" : "1", + "directory" : "/var/lib/etcd", + "size" : "23068672", + "bs" : "1024" + }, + "jobs" : [ + ] + }`, + hostAnalyzer: &troubleshootv1beta2.FilesystemPerformanceAnalyze{ + CollectorName: "file system performance", + Outcomes: []*troubleshootv1beta2.Outcome{ + { + Fail: &troubleshootv1beta2.SingleOutcome{ + Message: "an empty Jobs array should not be analyzed", + }, + }, + }, + }, + expectErr: true, + }, } for _, test := range tests { t.Run(test.name, func(t *testing.T) { req := require.New(t) - b, err := json.Marshal(test.fsPerf) - if err != nil { - t.Fatal(err) - } + + b := []byte(test.fioResult) getCollectedFileContents 
 			getCollectedFileContents := func(filename string) ([]byte, error) {
 				return b, nil
 			}
 
-			result, err := (&AnalyzeHostFilesystemPerformance{test.hostAnalyzer}).Analyze(getCollectedFileContents, nil)
+			a := AnalyzeHostFilesystemPerformance{test.hostAnalyzer}
+			result, err := a.Analyze(getCollectedFileContents, nil)
 			if test.expectErr {
 				req.Error(err)
 			} else {
diff --git a/pkg/collect/host_filesystem_performance.go b/pkg/collect/host_filesystem_performance.go
index 6c7edf6fd..1c54379cd 100644
--- a/pkg/collect/host_filesystem_performance.go
+++ b/pkg/collect/host_filesystem_performance.go
@@ -2,16 +2,37 @@ package collect
 
 import (
 	"bytes"
+	"encoding/json"
+	"fmt"
 	"math"
-	"math/rand"
+	"os/exec"
+	"reflect"
+	"strconv"
+	"strings"
 	"text/template"
 	"time"
 
+	"github.com/pkg/errors"
 	troubleshootv1beta2 "github.com/replicatedhq/troubleshoot/pkg/apis/troubleshoot/v1beta2"
+	"golang.org/x/net/context"
+	"k8s.io/apimachinery/pkg/api/resource"
+	"k8s.io/klog/v2"
 )
 
-func init() {
-	rand.Seed(time.Now().UnixNano())
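+// Durations implements sort.Interface; it moved here from the Linux
+// collector when the in-process write benchmark was replaced by fio.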
+type Durations []time.Duration
+
+func (d Durations) Len() int {
+	return len(d)
+}
+
+func (d Durations) Less(i, j int) bool {
+	return d[i] < d[j]
+}
+
+func (d Durations) Swap(i, j int) {
+	d[i], d[j] = d[j], d[i]
 }
 
 type CollectHostFilesystemPerformance struct {
@@ -90,3 +111,341 @@ func (f FSPerfResults) String() string {
 
 	return buf.String()
 }
+
+type FioResult struct {
+	FioVersion    string           `json:"fio version,omitempty"`
+	Timestamp     int64            `json:"timestamp,omitempty"`
+	TimestampMS   int64            `json:"timestamp_ms,omitempty"`
+	Time          string           `json:"time,omitempty"`
+	GlobalOptions FioGlobalOptions `json:"global options,omitempty"`
+	Jobs          []FioJobs        `json:"jobs,omitempty"`
+	DiskUtil      []FioDiskUtil    `json:"disk_util,omitempty"`
+}
+
+func (f FioResult) String() string {
+	var res string
+	res += fmt.Sprintf("FIO version - %s\n", f.FioVersion)
+	res += fmt.Sprintf("Global options - %s\n\n", f.GlobalOptions)
+	for _, job := range f.Jobs {
+		res += fmt.Sprintf("%s\n", job)
+	}
+	res += "Disk stats (read/write):\n"
+	for _, du := range f.DiskUtil {
+		res += fmt.Sprintf("%s\n", du)
+	}
+
+	return res
+}
+
+type FioGlobalOptions struct {
+	Directory  string `json:"directory,omitempty"`
+	RandRepeat string `json:"randrepeat,omitempty"`
+	Verify     string `json:"verify,omitempty"`
+	IOEngine   string `json:"ioengine,omitempty"`
+	Direct     string `json:"direct,omitempty"`
+	GtodReduce string `json:"gtod_reduce,omitempty"`
+}
+
+func (g FioGlobalOptions) String() string {
+	return fmt.Sprintf("ioengine=%s verify=%s direct=%s gtod_reduce=%s", g.IOEngine, g.Verify, g.Direct, g.GtodReduce)
+}
+
+type FioJobs struct {
+	JobName           string        `json:"jobname,omitempty"`
+	GroupID           int           `json:"groupid,omitempty"`
+	Error             int           `json:"error,omitempty"`
+	Eta               int           `json:"eta,omitempty"`
+	Elapsed           int           `json:"elapsed,omitempty"`
+	JobOptions        FioJobOptions `json:"job options,omitempty"`
+	Read              FioStats      `json:"read,omitempty"`
+	Write             FioStats      `json:"write,omitempty"`
+	Trim              FioStats      `json:"trim,omitempty"`
+	Sync              FioStats      `json:"sync,omitempty"`
+	JobRuntime        int32         `json:"job_runtime,omitempty"`
+	UsrCpu            float32       `json:"usr_cpu,omitempty"`
+	SysCpu            float32       `json:"sys_cpu,omitempty"`
+	Ctx               int32         `json:"ctx,omitempty"`
+	MajF              int32         `json:"majf,omitempty"`
+	MinF              int32         `json:"minf,omitempty"`
+	IoDepthLevel      FioDepth      `json:"iodepth_level,omitempty"`
+	IoDepthSubmit     FioDepth      `json:"iodepth_submit,omitempty"`
+	IoDepthComplete   FioDepth      `json:"iodepth_complete,omitempty"`
+	LatencyNs         FioLatency    `json:"latency_ns,omitempty"`
+	LatencyUs         FioLatency    `json:"latency_us,omitempty"`
+	LatencyMs         FioLatency    `json:"latency_ms,omitempty"`
+	LatencyDepth      int32         `json:"latency_depth,omitempty"`
+	LatencyTarget     int32         `json:"latency_target,omitempty"`
+	LatencyPercentile float32       `json:"latency_percentile,omitempty"`
+	LatencyWindow     int32         `json:"latency_window,omitempty"`
+}
+
+func (j FioJobs) String() string {
+	var job string
+	job += fmt.Sprintf("%s\n", j.JobOptions)
+	if j.Read.Iops != 0 || j.Read.BW != 0 {
+		job += fmt.Sprintf("read:\n%s\n", j.Read)
+	}
+	if j.Write.Iops != 0 || j.Write.BW != 0 {
+		job += fmt.Sprintf("write:\n%s\n", j.Write)
+	}
+	return job
+}
+
+type FioJobOptions struct {
+	Name      string `json:"name,omitempty"`
+	BS        string `json:"bs,omitempty"`
+	Directory string `json:"directory,omitempty"`
+	RW        string `json:"rw,omitempty"`
+	IOEngine  string `json:"ioengine,omitempty"`
+	FDataSync string `json:"fdatasync,omitempty"`
+	Size      string `json:"size,omitempty"`
+	RunTime   string `json:"runtime,omitempty"`
+}
+
+func (o FioJobOptions) String() string {
+	return fmt.Sprintf("JobName: %s\n blocksize=%s filesize=%s rw=%s", o.Name, o.BS, o.Size, o.RW)
+}
+
+type FioStats struct {
+	IOBytes     int64         `json:"io_bytes,omitempty"`
+	IOKBytes    int64         `json:"io_kbytes,omitempty"`
+	BWBytes     int64         `json:"bw_bytes,omitempty"`
+	BW          int64         `json:"bw,omitempty"`
+	Iops        float32       `json:"iops,omitempty"`
+	Runtime     int64         `json:"runtime,omitempty"`
+	TotalIos    int64         `json:"total_ios,omitempty"`
+	ShortIos    int64         `json:"short_ios,omitempty"`
+	DropIos     int64         `json:"drop_ios,omitempty"`
+	SlatNs      FioNS         `json:"slat_ns,omitempty"`
+	ClatNs      FioNS         `json:"clat_ns,omitempty"`
+	LatNs       FioNS         `json:"lat_ns,omitempty"`
+	Percentile  FioPercentile `json:"percentile,omitempty"`
+	BwMin       int64         `json:"bw_min,omitempty"`
+	BwMax       int64         `json:"bw_max,omitempty"`
+	BwAgg       float32       `json:"bw_agg,omitempty"`
+	BwMean      float32       `json:"bw_mean,omitempty"`
+	BwDev       float32       `json:"bw_dev,omitempty"`
+	BwSamples   int32         `json:"bw_samples,omitempty"`
+	IopsMin     int32         `json:"iops_min,omitempty"`
+	IopsMax     int32         `json:"iops_max,omitempty"`
+	IopsMean    float32       `json:"iops_mean,omitempty"`
+	IopsStdDev  float32       `json:"iops_stddev,omitempty"`
+	IopsSamples int32         `json:"iops_samples,omitempty"`
+}
+
+func (s FioStats) String() string {
+	var stats string
+	stats += fmt.Sprintf("  IOPS=%f BW(KiB/s)=%d\n", s.Iops, s.BW)
+	stats += fmt.Sprintf("  iops: min=%d max=%d avg=%f\n", s.IopsMin, s.IopsMax, s.IopsMean)
+	stats += fmt.Sprintf("  bw(KiB/s): min=%d max=%d avg=%f", s.BwMin, s.BwMax, s.BwMean)
+	return stats
+}
+
+func (s FioStats) FSPerfResults() FSPerfResults {
+	return FSPerfResults{
+		Min:     time.Duration(s.LatNs.Min),
+		Max:     time.Duration(s.LatNs.Max),
+		Average: time.Duration(s.LatNs.Mean),
+		P1:      time.Duration(s.LatNs.Percentile.P1),
+		P5:      time.Duration(s.LatNs.Percentile.P5),
+		P10:     time.Duration(s.LatNs.Percentile.P10),
+		P20:     time.Duration(s.LatNs.Percentile.P20),
+		P30:     time.Duration(s.LatNs.Percentile.P30),
+		P40:     time.Duration(s.LatNs.Percentile.P40),
+		P50:     time.Duration(s.LatNs.Percentile.P50),
+		P60:     time.Duration(s.LatNs.Percentile.P60),
+		P70:     time.Duration(s.LatNs.Percentile.P70),
+		P80:     time.Duration(s.LatNs.Percentile.P80),
+		P90:     time.Duration(s.LatNs.Percentile.P90),
+		P95:     time.Duration(s.LatNs.Percentile.P95),
+		P99:     time.Duration(s.LatNs.Percentile.P99),
+		P995:    time.Duration(s.LatNs.Percentile.P995),
+		P999:    time.Duration(s.LatNs.Percentile.P999),
+		P9995:   time.Duration(s.LatNs.Percentile.P9995),
+		P9999:   time.Duration(s.LatNs.Percentile.P9999),
+	}
+}
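+
+// Note: fio reports latency in nanoseconds, and time.Duration is an int64
+// nanosecond count, so the percentile values above convert directly
+// (e.g. a "99.000000" percentile of 99000 is 99µs).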
+
+type FioNS struct {
+	Min        int64         `json:"min,omitempty"`
+	Max        int64         `json:"max,omitempty"`
+	Mean       float32       `json:"mean,omitempty"`
+	StdDev     float32       `json:"stddev,omitempty"`
+	N          int64         `json:"N,omitempty"`
+	Percentile FioPercentile `json:"percentile,omitempty"`
+}
+
+type FioDepth struct {
+	FioDepth0    float32 `json:"0,omitempty"`
+	FioDepth1    float32 `json:"1,omitempty"`
+	FioDepth2    float32 `json:"2,omitempty"`
+	FioDepth4    float32 `json:"4,omitempty"`
+	FioDepth8    float32 `json:"8,omitempty"`
+	FioDepth16   float32 `json:"16,omitempty"`
+	FioDepth32   float32 `json:"32,omitempty"`
+	FioDepth64   float32 `json:"64,omitempty"`
+	FioDepthGE64 float32 `json:">=64,omitempty"`
+}
+
+type FioLatency struct {
+	FioLat2      float32 `json:"2,omitempty"`
+	FioLat4      float32 `json:"4,omitempty"`
+	FioLat10     float32 `json:"10,omitempty"`
+	FioLat20     float32 `json:"20,omitempty"`
+	FioLat50     float32 `json:"50,omitempty"`
+	FioLat100    float32 `json:"100,omitempty"`
+	FioLat250    float32 `json:"250,omitempty"`
+	FioLat500    float32 `json:"500,omitempty"`
+	FioLat750    float32 `json:"750,omitempty"`
+	FioLat1000   float32 `json:"1000,omitempty"`
+	FioLat2000   float32 `json:"2000,omitempty"`
+	FioLatGE2000 float32 `json:">=2000,omitempty"`
+}
+
+type FioDiskUtil struct {
+	Name        string  `json:"name,omitempty"`
+	ReadIos     int64   `json:"read_ios,omitempty"`
+	WriteIos    int64   `json:"write_ios,omitempty"`
+	ReadMerges  int64   `json:"read_merges,omitempty"`
+	WriteMerges int64   `json:"write_merges,omitempty"`
+	ReadTicks   int64   `json:"read_ticks,omitempty"`
+	WriteTicks  int64   `json:"write_ticks,omitempty"`
+	InQueue     int64   `json:"in_queue,omitempty"`
+	Util        float32 `json:"util,omitempty"`
+}
+
+type FioPercentile struct {
+	P1    int `json:"1.000000,omitempty"`
+	P5    int `json:"5.000000,omitempty"`
+	P10   int `json:"10.000000,omitempty"`
+	P20   int `json:"20.000000,omitempty"`
+	P30   int `json:"30.000000,omitempty"`
+	P40   int `json:"40.000000,omitempty"`
+	P50   int `json:"50.000000,omitempty"`
+	P60   int `json:"60.000000,omitempty"`
+	P70   int `json:"70.000000,omitempty"`
+	P80   int `json:"80.000000,omitempty"`
+	P90   int `json:"90.000000,omitempty"`
+	P95   int `json:"95.000000,omitempty"`
+	P99   int `json:"99.000000,omitempty"`
+	P995  int `json:"99.500000,omitempty"`
+	P999  int `json:"99.900000,omitempty"`
+	P9995 int `json:"99.950000,omitempty"`
+	P9999 int `json:"99.990000,omitempty"`
+}
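+
+// The FioPercentile field tags mirror fio's fixed percentile labels
+// (e.g. "99.990000"), so the standard encoding/json decoder maps them
+// without custom unmarshalling.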
+
+func (d FioDiskUtil) String() string {
+	//Disk stats (read/write):
+	//rbd4: ios=30022/11982, merge=0/313, ticks=1028675/1022768, in_queue=2063740, util=99.67%
+	var du string
+	du += fmt.Sprintf("  %s: ios=%d/%d merge=%d/%d ticks=%d/%d in_queue=%d, util=%f%%", d.Name, d.ReadIos,
+		d.WriteIos, d.ReadMerges, d.WriteMerges, d.ReadTicks, d.WriteTicks, d.InQueue, d.Util)
+	return du
+}
+
+func parseCollectorOptions(hostCollector *troubleshootv1beta2.FilesystemPerformance) ([]string, *FioJobOptions, error) {
+	var operationSize uint64 = 1024
+	if hostCollector.OperationSizeBytes > 0 {
+		operationSize = hostCollector.OperationSizeBytes
+	}
+
+	var fileSize uint64 = 10 * 1024 * 1024
+	if hostCollector.FileSize != "" {
+		quantity, err := resource.ParseQuantity(hostCollector.FileSize)
+		if err != nil {
+			return nil, nil, errors.Wrapf(err, "failed to parse fileSize %q", hostCollector.FileSize)
+		}
+		fileSizeInt64, ok := quantity.AsInt64()
+		if !ok {
+			return nil, nil, errors.Errorf("failed to parse fileSize %q", hostCollector.FileSize)
+		}
+		if fileSizeInt64 <= 0 {
+			return nil, nil, errors.Errorf("fileSize %q must be greater than 0", hostCollector.FileSize)
+		}
+		fileSize = uint64(fileSizeInt64)
+	}
+
+	if hostCollector.Directory == "" {
+		return nil, nil, errors.New("Directory is required to collect filesystem performance info")
+	}
+
+	latencyBenchmarkOptions := FioJobOptions{
+		RW:        "write",
+		IOEngine:  "sync",
+		FDataSync: "1",
+		Directory: hostCollector.Directory,
+		Size:      strconv.FormatUint(fileSize, 10),
+		BS:        strconv.FormatUint(operationSize, 10),
+		Name:      "fsperf",
+		RunTime:   "120",
+	}
+
+	command := buildFioCommand(latencyBenchmarkOptions)
+
+	return command, &latencyBenchmarkOptions, nil
+}
+
+func buildFioCommand(opts FioJobOptions) []string {
+	command := []string{"fio"}
+	v := reflect.ValueOf(opts)
+	t := reflect.TypeOf(opts)
+	for i := 0; i < v.NumField(); i++ {
+		field := t.Field(i)
+		value := v.Field(i)
+		if !value.IsZero() {
+			command = append(command, fmt.Sprintf("--%s=%v", strings.ToLower(field.Name), value.Interface()))
+		}
+	}
+	command = append(command, "--output-format=json")
+	return command
+}
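+
+// For illustration only: a spec with operationSizeBytes=1024, fileSize=22Mi,
+// and directory /var/lib/etcd yields
+//
+//	fio --name=fsperf --bs=1024 --directory=/var/lib/etcd --rw=write \
+//		--ioengine=sync --fdatasync=1 --size=23068672 --runtime=120 \
+//		--output-format=json
+//
+// (flag order follows FioJobOptions field order via reflection).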
+
+func collectFioResults(ctx context.Context, hostCollector *troubleshootv1beta2.FilesystemPerformance) (*FioResult, error) {
+	command, opts, err := parseCollectorOptions(hostCollector)
+	if err != nil {
+		return nil, errors.Wrap(err, "failed to parse collector options")
+	}
+
+	klog.V(2).Infof("collecting fio results: %s", strings.Join(command, " "))
+	output, err := exec.CommandContext(ctx, command[0], command[1:]...).Output()
+	if err != nil {
+		if exitErr, ok := err.(*exec.ExitError); ok {
+			if exitErr.ExitCode() == 1 {
+				return nil, errors.Wrapf(err, "fio failed; permission denied opening %s. ensure this collector runs as root", opts.Directory)
+			} else {
+				return nil, errors.Wrapf(err, "fio failed with exit status %d", exitErr.ExitCode())
+			}
+		} else if e, ok := err.(*exec.Error); ok && e.Err == exec.ErrNotFound {
+			return nil, errors.Wrapf(err, "command not found: %v. ensure fio is installed", command)
+		} else {
+			return nil, errors.Wrapf(err, "failed to run command: %v", command)
+		}
+	}
+
+	var result FioResult
+	err = json.Unmarshal(output, &result)
+	if err != nil {
+		return nil, errors.Wrap(err, "failed to unmarshal fio result")
+	}
+
+	return &result, nil
+}
diff --git a/pkg/collect/host_filesystem_performance_linux.go b/pkg/collect/host_filesystem_performance_linux.go
index d2cc66f05..abcb98603 100644
--- a/pkg/collect/host_filesystem_performance_linux.go
+++ b/pkg/collect/host_filesystem_performance_linux.go
@@ -10,36 +10,22 @@ import (
 	"math/rand"
 	"os"
 	"path/filepath"
-	"sort"
 	"sync"
 	"syscall"
 	"time"
 
 	"github.com/pkg/errors"
 	troubleshootv1beta2 "github.com/replicatedhq/troubleshoot/pkg/apis/troubleshoot/v1beta2"
-	"k8s.io/apimachinery/pkg/api/resource"
 )
 
-func init() {
-	rand.Seed(time.Now().UnixNano())
-}
-
-type Durations []time.Duration
-
-func (d Durations) Len() int {
-	return len(d)
-}
-
-func (d Durations) Less(i, j int) bool {
-	return d[i] < d[j]
-}
-
-func (d Durations) Swap(i, j int) {
-	d[i], d[j] = d[j], d[i]
-}
+// Today we only care about checking for write latency, so the options struct
+// only has what we need for that. We collect all the results from a single
+// run of fio and filter out the fsync results for analysis.
+// TODO: update the analyzer so any/all results from fio can be analyzed.
 func collectHostFilesystemPerformance(hostCollector *troubleshootv1beta2.FilesystemPerformance, bundlePath string) (map[string][]byte, error) {
 	timeout := time.Minute
+
 	if hostCollector.Timeout != "" {
 		d, err := time.ParseDuration(hostCollector.Timeout)
 		if err != nil {
@@ -50,46 +36,15 @@
 	ctx, cancel := context.WithTimeout(context.Background(), timeout)
 	defer cancel()
 
-	var operationSize uint64 = 1024
-	if hostCollector.OperationSizeBytes != 0 {
-		operationSize = hostCollector.OperationSizeBytes
-	}
-
-	var fileSize uint64 = 10 * 1024 * 1024
-	if hostCollector.FileSize != "" {
-		quantity, err := resource.ParseQuantity(hostCollector.FileSize)
-		if err != nil {
-			return nil, errors.Wrapf(err, "failed to parse fileSize %q", hostCollector.FileSize)
-		}
-		fileSizeInt64, ok := quantity.AsInt64()
-		if !ok {
-			return nil, errors.Wrapf(err, "failed to parse fileSize %q", hostCollector.FileSize)
-		}
-		fileSize = uint64(fileSizeInt64)
+	collectorName := hostCollector.CollectorName
+	if collectorName == "" {
+		collectorName = "filesystemPerformance"
 	}
+	name := filepath.Join("host-collectors/filesystemPerformance", collectorName+".json")
 
-	if hostCollector.Directory == "" {
-		return nil, errors.New("Directory is required to collect filesystem performance info")
-	}
-
+	// TODO: clean up this directory if it's created
 	if err := os.MkdirAll(hostCollector.Directory, 0700); err != nil {
 		return nil, errors.Wrapf(err, "failed to mkdir %q", hostCollector.Directory)
 	}
-	filename := filepath.Join(hostCollector.Directory, "fsperf")
-
-	f, err := os.OpenFile(filename, os.O_CREATE|os.O_TRUNC|os.O_WRONLY, 0600)
-	if err != nil {
-		log.Panic(err)
-		return nil, errors.Wrapf(err, "open %s", filename)
-	}
-	defer func() {
-		if err := f.Close(); err != nil {
-			log.Println(err.Error())
-		}
-		if err := os.Remove(filename); err != nil {
-			log.Println(err.Error())
-		}
-	}()
 
 	// Start the background IOPS task and wait for warmup
 	if hostCollector.EnableBackgroundIOPS {
@@ -123,86 +78,17 @@
 		time.Sleep(time.Second * time.Duration(hostCollector.BackgroundIOPSWarmupSeconds))
 	}
 
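+	// The fio invocation below replaces the previous in-process sequential
+	// write benchmark; the background IOPS workload above is unchanged.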
-	// Sequential writes benchmark
-	var written uint64 = 0
-	var results Durations
-
-	for {
-		if written >= fileSize {
-			break
-		}
-
-		data := make([]byte, int(operationSize))
-		rand.Read(data)
-
-		start := time.Now()
-
-		n, err := f.Write(data)
-		if err != nil {
-			return nil, errors.Wrapf(err, "write to %s", filename)
-		}
-		if hostCollector.Sync {
-			if err := f.Sync(); err != nil {
-				return nil, errors.Wrapf(err, "sync %s", filename)
-			}
-		} else if hostCollector.Datasync {
-			if err := syscall.Fdatasync(int(f.Fd())); err != nil {
-				return nil, errors.Wrapf(err, "datasync %s", filename)
-			}
-		}
-
-		d := time.Now().Sub(start)
-		results = append(results, d)
-
-		written += uint64(n)
-
-		if ctx.Err() != nil {
-			break
-		}
-	}
-
-	if len(results) == 0 {
-		return nil, errors.New("No filesystem performance results collected")
-	}
-
-	sort.Sort(results)
-
-	var sum time.Duration
-	for _, d := range results {
-		sum += d
-	}
-
-	fsPerf := &FSPerfResults{
-		Min:     results[0],
-		Max:     results[len(results)-1],
-		Average: sum / time.Duration(len(results)),
-		P1:      results[getPercentileIndex(.01, len(results))],
-		P5:      results[getPercentileIndex(.05, len(results))],
-		P10:     results[getPercentileIndex(.1, len(results))],
-		P20:     results[getPercentileIndex(.2, len(results))],
-		P30:     results[getPercentileIndex(.3, len(results))],
-		P40:     results[getPercentileIndex(.4, len(results))],
-		P50:     results[getPercentileIndex(.5, len(results))],
-		P60:     results[getPercentileIndex(.6, len(results))],
-		P70:     results[getPercentileIndex(.7, len(results))],
-		P80:     results[getPercentileIndex(.8, len(results))],
-		P90:     results[getPercentileIndex(.9, len(results))],
-		P95:     results[getPercentileIndex(.95, len(results))],
-		P99:     results[getPercentileIndex(.99, len(results))],
-		P995:    results[getPercentileIndex(.995, len(results))],
-		P999:    results[getPercentileIndex(.999, len(results))],
-		P9995:   results[getPercentileIndex(.9995, len(results))],
-		P9999:   results[getPercentileIndex(.9999, len(results))],
+	fioResult, err := collectFioResults(ctx, hostCollector)
+	if err != nil {
+		return nil, errors.Wrap(err, "failed to collect fio results")
 	}
 
-	collectorName := hostCollector.CollectorName
-	if collectorName == "" {
-		collectorName = "filesystemPerformance"
-	}
-	name := filepath.Join("host-collectors/filesystemPerformance", collectorName+".json")
-	b, err := json.Marshal(fsPerf)
+	b, err := json.Marshal(fioResult)
 	if err != nil {
-		return nil, errors.Wrap(err, "failed to marshal fs perf results")
+		return nil, errors.Wrap(err, "failed to marshal fio results")
 	}
 
 	output := NewResult()
diff --git a/pkg/collect/host_filesystem_performance_test.go b/pkg/collect/host_filesystem_performance_test.go
index a78c71b6d..4c293dece 100644
--- a/pkg/collect/host_filesystem_performance_test.go
+++ b/pkg/collect/host_filesystem_performance_test.go
@@ -2,7 +2,10 @@ package collect
 
 import (
 	"fmt"
+	"reflect"
 	"testing"
+
+	troubleshootv1beta2 "github.com/replicatedhq/troubleshoot/pkg/apis/troubleshoot/v1beta2"
 )
 
 func TestGetPercentileIndex(t *testing.T) {
@@ -57,3 +60,128 @@ func TestGetPercentileIndex(t *testing.T) {
 		})
 	}
 }
+
+func Test_parseCollectorOptions(t *testing.T) {
+	type args struct {
+		hostCollector *troubleshootv1beta2.FilesystemPerformance
+	}
+	tests := []struct {
+		name        string
+		args        args
+		wantCommand []string
+		wantOptions *FioJobOptions
+		wantErr     bool
+	}{
+		{
+			name: "Happy spec",
+			args: args{
+				hostCollector: &troubleshootv1beta2.FilesystemPerformance{
+					HostCollectorMeta: troubleshootv1beta2.HostCollectorMeta{
+						CollectorName: "fsperf",
+					},
+					OperationSizeBytes:          1024,
+					Directory:                   "/var/lib/etcd",
+					FileSize:                    "22Mi",
+					Sync:                        true,
+					Datasync:                    true,
+					Timeout:                     "120",
+					EnableBackgroundIOPS:        true,
+					BackgroundIOPSWarmupSeconds: 10,
+					BackgroundWriteIOPS:         100,
+					BackgroundReadIOPS:          100,
+					BackgroundWriteIOPSJobs:     1,
+					BackgroundReadIOPSJobs:      1,
+				},
+			},
+			wantCommand: []string{
+				"fio",
+				"--name=fsperf",
+				"--bs=1024",
+				"--directory=/var/lib/etcd",
+				"--rw=write",
+				"--ioengine=sync",
+				"--fdatasync=1",
+				"--size=23068672",
+				"--runtime=120",
+				"--output-format=json",
+			},
+			wantOptions: &FioJobOptions{
+				RW:        "write",
+				IOEngine:  "sync",
+				FDataSync: "1",
+				Directory: "/var/lib/etcd",
+				Size:      "23068672",
+				BS:        "1024",
+				Name:      "fsperf",
+				RunTime:   "120",
+			},
+			wantErr: false,
+		},
+		{
+			name: "Empty spec fails",
+			args: args{
+				hostCollector: &troubleshootv1beta2.FilesystemPerformance{
+					HostCollectorMeta: troubleshootv1beta2.HostCollectorMeta{
+						CollectorName: "fsperf",
+					},
+				},
+			},
+			wantCommand: nil,
+			wantOptions: nil,
+			wantErr:     true,
+		},
+		{
+			name: "Invalid filesize",
+			args: args{
+				hostCollector: &troubleshootv1beta2.FilesystemPerformance{
+					HostCollectorMeta: troubleshootv1beta2.HostCollectorMeta{
+						CollectorName: "fsperf",
+					},
+					OperationSizeBytes: 1024,
+					Directory:          "/var/lib/etcd",
+					FileSize:           "abcd",
+					Sync:               true,
+					Datasync:           true,
+					Timeout:            "120",
+				},
+			},
+			wantCommand: nil,
+			wantOptions: nil,
+			wantErr:     true,
+		},
+		{
+			name: "invalid path parameter",
+			args: args{
+				hostCollector: &troubleshootv1beta2.FilesystemPerformance{
+					HostCollectorMeta: troubleshootv1beta2.HostCollectorMeta{
+						CollectorName: "fsperf",
+					},
+					OperationSizeBytes: 1024,
+					Directory:          "",
+					FileSize:           "22Mi",
+					Sync:               true,
+					Datasync:           true,
+					Timeout:            "120",
+				},
+			},
+			wantCommand: nil,
+			wantOptions: nil,
+			wantErr:     true,
+		},
+	}
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			gotCommand, gotOptions, err := parseCollectorOptions(tt.args.hostCollector)
+			if (err != nil) != tt.wantErr {
+				t.Errorf("parseCollectorOptions() error = %v, wantErr %v", err, tt.wantErr)
+			} else {
+				if !reflect.DeepEqual(gotCommand, tt.wantCommand) {
+					t.Errorf("parseCollectorOptions() got command = %v, want %v", gotCommand, tt.wantCommand)
+				}
+				if !reflect.DeepEqual(gotOptions, tt.wantOptions) {
+					t.Errorf("parseCollectorOptions() got options = %v, want %v", gotOptions, tt.wantOptions)
+				}
+			}
+		})
+	}
+}
"/var/lib/etcd", + FileSize: "abcd", + Sync: true, + Datasync: true, + Timeout: "120", + }, + }, + wantCommand: nil, + wantOptions: nil, + wantErr: true, + }, + { + name: "invalid path parameter", + args: args{ + hostCollector: &troubleshootv1beta2.FilesystemPerformance{ + HostCollectorMeta: troubleshootv1beta2.HostCollectorMeta{ + CollectorName: "fsperf", + }, + OperationSizeBytes: 1024, + Directory: "", + FileSize: "22Mi", + Sync: true, + Datasync: true, + Timeout: "120", + }, + }, + wantCommand: nil, + wantOptions: nil, + wantErr: true, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + gotCommand, gotOptions, err := parseCollectorOptions(tt.args.hostCollector) + if (err != nil) != tt.wantErr { + t.Errorf("parseCollectorOptions() error = %v, wantErr %v", err, tt.wantErr) + } else { + if !reflect.DeepEqual(gotCommand, tt.wantCommand) { + t.Errorf("parseCollectorOptions() got command = %v, want %v", gotCommand, tt.wantCommand) + } + if !reflect.DeepEqual(gotOptions, tt.wantOptions) { + t.Errorf("parseCollectorOptions() got options = %v, want %v", gotOptions, tt.wantOptions) + } + } + }) + } +} diff --git a/testdata/filesystem_performance_preflight.yaml b/testdata/filesystem_performance_preflight.yaml new file mode 100644 index 000000000..3cc8f24f8 --- /dev/null +++ b/testdata/filesystem_performance_preflight.yaml @@ -0,0 +1,19 @@ +apiVersion: troubleshoot.sh/v1beta2 +kind: HostPreflight +metadata: + name: sample +spec: + collectors: + - filesystemPerformance: + collectorName: Filesystem Latency Two Minute Benchmark + timeout: 2m + directory: /var/lib/etcd + fileSize: 22Mi + operationSizeBytes: 2300 + datasync: true + enableBackgroundIOPS: true + backgroundIOPSWarmupSeconds: 10 + backgroundWriteIOPS: 300 + backgroundWriteIOPSJobs: 6 + backgroundReadIOPS: 50 + backgroundReadIOPSJobs: 1 diff --git a/testdata/kurl_preflights.yaml b/testdata/kurl_preflights.yaml new file mode 100644 index 000000000..a714c0b43 --- /dev/null +++ b/testdata/kurl_preflights.yaml @@ -0,0 +1,552 @@ +# https://kurl.sh/docs/install-with-kurl/system-requirements +apiVersion: troubleshoot.sh/v1beta2 +kind: HostPreflight +metadata: + name: kurl-builtin +spec: + collectors: + - time: {} + - cpu: {} + - memory: {} + - hostServices: {} + - hostOS: {} + - diskUsage: + collectorName: "Ephemeral Disk Usage /var/lib/kubelet" + path: /var/lib/kubelet + - diskUsage: + collectorName: "Ephemeral Disk Usage /var/lib/docker" + path: /var/lib/docker + exclude: '{{kurl not .Installer.Spec.Docker.Version}}' + - diskUsage: + collectorName: "Ephemeral Disk Usage /var/lib/containerd" + path: /var/lib/containerd + exclude: '{{kurl not .Installer.Spec.Containerd.Version}}' + - diskUsage: + collectorName: "Ephemeral Disk Usage /var/lib/rook" + path: /var/lib/rook + exclude: '{{kurl not .Installer.Spec.Rook.Version}}' + - diskUsage: + collectorName: "Ephemeral Disk Usage /var/openebs" + path: /var/openebs + exclude: '{{kurl not .Installer.Spec.OpenEBS.Version}}' + - tcpLoadBalancer: + collectorName: "Kubernetes API Server Load Balancer" + port: 6443 + address: {{kurl .Installer.Spec.Kubernetes.LoadBalancerAddress }} + timeout: 3m + # ha and is first master (primary and not join) and not is upgrade + exclude: '{{kurl and .Installer.Spec.Kubernetes.Version .Installer.Spec.Kubernetes.LoadBalancerAddress .IsPrimary (not .IsJoin) (not .IsUpgrade) | not }}' + - http: + collectorName: "Kubernetes API Server Load Balancer Upgrade" + get: + url: https://{{kurl .Installer.Spec.Kubernetes.LoadBalancerAddress | trimSuffix "/" 
+        operationSizeBytes: 2300
+        datasync: true
+        enableBackgroundIOPS: true
+        backgroundIOPSWarmupSeconds: 10
+        backgroundWriteIOPS: 300
+        backgroundWriteIOPSJobs: 6
+        backgroundReadIOPS: 50
+        backgroundReadIOPSJobs: 1
diff --git a/testdata/kurl_preflights.yaml b/testdata/kurl_preflights.yaml
new file mode 100644
index 000000000..a714c0b43
--- /dev/null
+++ b/testdata/kurl_preflights.yaml
@@ -0,0 +1,552 @@
+# https://kurl.sh/docs/install-with-kurl/system-requirements
+apiVersion: troubleshoot.sh/v1beta2
+kind: HostPreflight
+metadata:
+  name: kurl-builtin
+spec:
+  collectors:
+    - time: {}
+    - cpu: {}
+    - memory: {}
+    - hostServices: {}
+    - hostOS: {}
+    - diskUsage:
+        collectorName: "Ephemeral Disk Usage /var/lib/kubelet"
+        path: /var/lib/kubelet
+    - diskUsage:
+        collectorName: "Ephemeral Disk Usage /var/lib/docker"
+        path: /var/lib/docker
+        exclude: '{{kurl not .Installer.Spec.Docker.Version}}'
+    - diskUsage:
+        collectorName: "Ephemeral Disk Usage /var/lib/containerd"
+        path: /var/lib/containerd
+        exclude: '{{kurl not .Installer.Spec.Containerd.Version}}'
+    - diskUsage:
+        collectorName: "Ephemeral Disk Usage /var/lib/rook"
+        path: /var/lib/rook
+        exclude: '{{kurl not .Installer.Spec.Rook.Version}}'
+    - diskUsage:
+        collectorName: "Ephemeral Disk Usage /var/openebs"
+        path: /var/openebs
+        exclude: '{{kurl not .Installer.Spec.OpenEBS.Version}}'
+    - tcpLoadBalancer:
+        collectorName: "Kubernetes API Server Load Balancer"
+        port: 6443
+        address: '{{kurl .Installer.Spec.Kubernetes.LoadBalancerAddress }}'
+        timeout: 3m
+        # ha and is first master (primary and not join) and not is upgrade
+        exclude: '{{kurl and .Installer.Spec.Kubernetes.Version .Installer.Spec.Kubernetes.LoadBalancerAddress .IsPrimary (not .IsJoin) (not .IsUpgrade) | not }}'
+    - http:
+        collectorName: "Kubernetes API Server Load Balancer Upgrade"
+        get:
+          url: https://{{kurl .Installer.Spec.Kubernetes.LoadBalancerAddress | trimSuffix "/" }}/healthz
+          insecureSkipVerify: true
+        # ha and is first master (primary and not join) and is upgrade (the load balancer backend should already be available)
+        exclude: '{{kurl and .Installer.Spec.Kubernetes.Version .Installer.Spec.Kubernetes.LoadBalancerAddress .IsPrimary .IsUpgrade (not .IsJoin) | not }}'
+    - tcpPortStatus:
+        collectorName: "Kubernetes API TCP Port Status"
+        port: 6443
+        exclude: '{{kurl and .IsPrimary (not .IsUpgrade) | not }}'
+    - tcpPortStatus:
+        collectorName: "ETCD Client API TCP Port Status"
+        port: 2379
+        exclude: '{{kurl and .IsPrimary (not .IsUpgrade) | not }}'
+    - tcpPortStatus:
+        collectorName: "ETCD Server API TCP Port Status"
+        port: 2380
+        exclude: '{{kurl and .IsPrimary (not .IsUpgrade) | not }}'
+    - tcpPortStatus:
+        collectorName: "ETCD Health Server TCP Port Status"
+        port: 2381
+        exclude: '{{kurl and .IsPrimary (not .IsUpgrade) | not }}'
+        interface: lo
+    - tcpPortStatus:
+        collectorName: "Kubelet Health Server TCP Port Status"
+        port: 10248
+        exclude: '{{kurl and (not .IsUpgrade) | not }}'
+        interface: lo
+    - tcpPortStatus:
+        collectorName: "Kubelet API TCP Port Status"
+        port: 10250
+        exclude: '{{kurl and (not .IsUpgrade) | not }}'
+    - tcpPortStatus:
+        collectorName: "Kube Controller Manager Health Server TCP Port Status"
+        port: 10257
+        exclude: '{{kurl and .IsPrimary (not .IsUpgrade) | not }}'
+        interface: lo
+    - tcpPortStatus:
+        collectorName: "Kube Scheduler Health Server TCP Port Status"
+        port: 10259
+        exclude: '{{kurl and .IsPrimary (not .IsUpgrade) | not }}'
+        interface: lo
+    - tcpConnect:
+        collectorName: "Kubernetes API TCP Connection Status"
+        address: '{{kurl .Installer.Spec.Kubernetes.MasterAddress }}'
+        # run the collector if 1. there is a master address set AND this is a node joining the cluster AND this is not an EKCO internalLB install
+        exclude: '{{kurl and .Installer.Spec.Kubernetes.Version .Installer.Spec.Kubernetes.MasterAddress .IsJoin (and .Installer.Spec.Ekco.Version .Installer.Spec.Ekco.EnableInternalLoadBalancer | not) | not }}'
+    - filesystemPerformance:
+        collectorName: Filesystem Latency Two Minute Benchmark
+        exclude: '{{kurl and .IsPrimary (not .IsUpgrade) | not }}'
+        timeout: 2m
+        directory: /var/lib/etcd
+        fileSize: 22Mi
+        operationSizeBytes: 2300
+        datasync: true
+        enableBackgroundIOPS: true
+        backgroundIOPSWarmupSeconds: 10
+        backgroundWriteIOPS: 300
+        backgroundWriteIOPSJobs: 6
+        backgroundReadIOPS: 50
+        backgroundReadIOPSJobs: 1
+    - certificate:
+        collectorName: "Kubernetes API key pair certificate"
+        exclude: '{{kurl or (not .IsPrimary) (not .IsUpgrade) }}'
+        certificatePath: /etc/kubernetes/pki/apiserver.crt
+        keyPath: /etc/kubernetes/pki/apiserver.key
+    - certificate:
+        collectorName: "Kubernetes ETCD key pair certificate"
+        exclude: '{{kurl or (not .IsPrimary) (not .IsUpgrade) }}'
+        certificatePath: /etc/kubernetes/pki/etcd/server.crt
+        keyPath: /etc/kubernetes/pki/etcd/server.key
+    - http:
+        collectorName: "Kubernetes API Health"
+        exclude: '{{kurl or (not .IsPrimary) (not .IsUpgrade) }}'
+        get:
+          url: https://localhost:6443/healthz
+          insecureSkipVerify: true
+  analyzers:
+    - certificate:
+        collectorName: "Kubernetes API key pair certificate"
+        exclude: '{{kurl or (not .IsPrimary) (not .IsUpgrade) }}'
+        outcomes:
+          - fail:
+              when: "key-pair-missing"
+              message: Kubernetes API key pair certificate not found in /etc/kubernetes/pki/apiserver.*
+          - fail:
+              when: "key-pair-switched"
+              message: Kubernetes API key pair certificate and key pair are switched
+          - fail:
+              when: "key-pair-encrypted"
+              message: Kubernetes API key pair certificate private key is encrypted
+          - fail:
+              when: "key-pair-mismatch"
+              message: Kubernetes API key pair certificate and key do not match
+          - fail:
+              when: "key-pair-invalid"
+              message: Kubernetes API key pair certificate is invalid
+          - pass:
+              when: "key-pair-valid"
+              message: Kubernetes API key pair certificate is valid
+    - certificate:
+        collectorName: "Kubernetes ETCD key pair certificate"
+        exclude: '{{kurl or (not .IsPrimary) (not .IsUpgrade) }}'
+        outcomes:
+          - fail:
+              when: "key-pair-missing"
+              message: Kubernetes ETCD key pair certificate not found in /etc/kubernetes/pki/etcd/server.*
+          - fail:
+              when: "key-pair-switched"
+              message: Kubernetes ETCD certificate and key pair are switched
+          - fail:
+              when: "key-pair-encrypted"
+              message: Kubernetes ETCD certificate private key is encrypted
+          - fail:
+              when: "key-pair-mismatch"
+              message: Kubernetes ETCD certificate and key do not match
+          - fail:
+              when: "key-pair-invalid"
+              message: Kubernetes ETCD key pair certificate is invalid
+          - pass:
+              when: "key-pair-valid"
+              message: Kubernetes ETCD key pair certificate is valid
+    - http:
+        checkName: "Kubernetes API Health"
+        exclude: '{{kurl or (not .IsPrimary) (not .IsUpgrade) }}'
+        collectorName: "Kubernetes API Health"
+        outcomes:
+          - warn:
+              when: "error"
+              message: Error connecting to Kubernetes API at https://localhost:6443/healthz
+          - pass:
+              when: "statusCode == 200"
+              message: OK HTTP status response from Kubernetes API at https://localhost:6443/healthz
+          - warn:
+              message: Unexpected status code response from Kubernetes API at https://localhost:6443/healthz
+    - cpu:
+        checkName: "Number of CPUs"
+        outcomes:
+          - fail:
+              when: "count < 2"
+              message: At least 2 CPU cores are required, and 4 CPU cores are recommended
+          - warn:
+              when: "count < 4"
+              message: At least 4 CPU cores are recommended
+          - pass:
+              message: This server has at least 4 CPU cores
+    - memory:
+        checkName: "Amount of Memory"
+        outcomes:
+          - fail:
+              when: "< 4G"
+              message: At least 4G of memory is required, and 8G of memory is recommended
+          - warn:
+              when: "< 8G"
+              message: At least 8G of memory is recommended
+          - pass:
+              message: The system has at least 8G of memory
+    - diskUsage:
+        checkName: "Ephemeral Disk Usage /var/lib/kubelet"
+        collectorName: "Ephemeral Disk Usage /var/lib/kubelet"
+        outcomes:
+          - fail:
+              when: "total < 30Gi"
+              message: The disk containing directory /var/lib/kubelet has less than 30Gi of total space
+          - fail:
+              when: "used/total > 80%"
+              message: The disk containing directory /var/lib/kubelet is more than 80% full
+          - warn:
+              when: "used/total > 60%"
+              message: The disk containing directory /var/lib/kubelet is more than 60% full
+          - warn:
+              when: "available < 10Gi"
+              message: The disk containing directory /var/lib/kubelet has less than 10Gi of disk space available
+          - pass:
+              message: The disk containing directory /var/lib/kubelet has at least 30Gi of total space, has at least 10Gi of disk space available, and is less than 60% full
+    - diskUsage:
+        checkName: "Ephemeral Disk Usage /var/lib/docker"
+        collectorName: "Ephemeral Disk Usage /var/lib/docker"
+        exclude: '{{kurl not .Installer.Spec.Docker.Version}}'
+        outcomes:
+          - fail:
+              when: "total < 30Gi"
+              message: The disk containing directory /var/lib/docker has less than 30Gi of total space
+          - fail:
+              when: "used/total > 80%"
+              message: The disk containing directory /var/lib/docker is more than 80% full
+          - warn:
+              when: "used/total > 60%"
+              message: The disk containing directory /var/lib/docker is more than 60% full
+          - warn:
+              when: "available < 10Gi"
+              message: The disk containing directory /var/lib/docker has less than 10Gi of disk space available
+          - pass:
+              message: The disk containing directory /var/lib/docker has at least 30Gi of total space, has at least 10Gi of disk space available, and is less than 60% full.
+    - diskUsage:
+        checkName: "Ephemeral Disk Usage /var/lib/containerd"
+        collectorName: "Ephemeral Disk Usage /var/lib/containerd"
+        exclude: '{{kurl not .Installer.Spec.Containerd.Version}}'
+        outcomes:
+          - fail:
+              when: "total < 30Gi"
+              message: The disk containing directory /var/lib/containerd has less than 30Gi of total space
+          - fail:
+              when: "used/total > 80%"
+              message: The disk containing directory /var/lib/containerd is more than 80% full
+          - warn:
+              when: "used/total > 60%"
+              message: The disk containing directory /var/lib/containerd is more than 60% full
+          - warn:
+              when: "available < 10Gi"
+              message: The disk containing directory /var/lib/containerd has less than 10Gi of disk space available
+          - pass:
+              message: The disk containing directory /var/lib/containerd has at least 30Gi of total space, has at least 10Gi of disk space available, and is less than 60% full.
+    - diskUsage:
+        checkName: "Ephemeral Disk Usage /var/lib/rook"
+        collectorName: "Ephemeral Disk Usage /var/lib/rook"
+        exclude: '{{kurl not .Installer.Spec.Rook.Version}}'
+        outcomes:
+          - fail:
+              when: "used/total > 80%"
+              message: The disk containing directory /var/lib/rook is more than 80% full
+          - fail:
+              when: "available < 10Gi"
+              message: The disk containing directory /var/lib/rook has less than 10Gi of disk space available
+          - pass:
+              message: The disk containing directory /var/lib/rook has sufficient space
+    - diskUsage:
+        checkName: "Ephemeral Disk Usage /var/openebs"
+        collectorName: "Ephemeral Disk Usage /var/openebs"
+        exclude: '{{kurl not .Installer.Spec.OpenEBS.Version}}'
+        outcomes:
+          - warn:
+              when: "used/total > 80%"
+              message: The disk containing directory /var/openebs is more than 80% full
+          - warn:
+              when: "available < 10Gi"
+              message: The disk containing directory /var/openebs has less than 10Gi of disk space available
+          - pass:
+              message: The disk containing directory /var/openebs has sufficient space
+    - tcpLoadBalancer:
+        checkName: "Kubernetes API Server Load Balancer"
+        collectorName: "Kubernetes API Server Load Balancer"
+        # ha and is first master (primary and not join) and not is upgrade
+        exclude: '{{kurl and .Installer.Spec.Kubernetes.Version .Installer.Spec.Kubernetes.LoadBalancerAddress .IsPrimary (not .IsJoin) (not .IsUpgrade) | not }}'
+        outcomes:
+          - fail:
+              when: "invalid-address"
+              message: The load balancer address {{kurl .Installer.Spec.Kubernetes.LoadBalancerAddress }} is not valid.
+          - warn:
+              when: "connection-refused"
+              message: Connection to {{kurl .Installer.Spec.Kubernetes.LoadBalancerAddress }} via load balancer was refused.
+          - warn:
+              when: "connection-timeout"
+              message: Timed out connecting to {{kurl .Installer.Spec.Kubernetes.LoadBalancerAddress }} via load balancer. Check your firewall.
+ - warn: + when: "error" + message: Unexpected port status + - warn: + when: "address-in-use" + message: Port 6443 is unavailable + - pass: + when: "connected" + message: Successfully connected to {{kurl .Installer.Spec.Kubernetes.LoadBalancerAddress }} via load balancer + - warn: + message: Unexpected port status + - http: + checkName: "Kubernetes API Server Load Balancer Upgrade" + collectorName: "Kubernetes API Server Load Balancer Upgrade" + exclude: '{{kurl and .Installer.Spec.Kubernetes.Version .Installer.Spec.Kubernetes.LoadBalancerAddress .IsPrimary .IsUpgrade (not .IsJoin) | not }}' + outcomes: + - fail: + when: "error" + message: Error connecting to load balancer at https://{{kurl .Installer.Spec.Kubernetes.LoadBalancerAddress }}/healthz + - pass: + when: "statusCode == 200" + message: OK HTTP status response from load balancer at https://{{kurl .Installer.Spec.Kubernetes.LoadBalancerAddress }}/healthz + - fail: + message: Unexpected status code response from load balancer at https://{{kurl .Installer.Spec.Kubernetes.LoadBalancerAddress }}/healthz + - tcpPortStatus: + checkName: "Kubernetes API TCP Port Status" + collectorName: "Kubernetes API TCP Port Status" + exclude: '{{kurl and .IsPrimary (not .IsUpgrade) | not }}' + outcomes: + - fail: + when: "connection-refused" + message: Connection to port 6443 was refused. This is likely to be a routing problem since this preflight configures a test server to listen on this port. + - warn: + when: "address-in-use" + message: Another process was already listening on port 6443. + - fail: + when: "connection-timeout" + message: Timed out connecting to port 6443. Check your firewall. + - fail: + when: "error" + message: Unexpected port status + - pass: + when: "connected" + message: Port 6443 is open + - warn: + message: Unexpected port status + - tcpPortStatus: + checkName: "ETCD Client API TCP Port Status" + collectorName: "ETCD Client API TCP Port Status" + exclude: '{{kurl and .IsPrimary (not .IsUpgrade) | not }}' + outcomes: + - fail: + when: "connection-refused" + message: Connection to port 2379 was refused. This is likely to be a routing problem since this preflight configures a test server to listen on this port. + - warn: + when: "address-in-use" + message: Another process was already listening on port 2379. + - fail: + when: "connection-timeout" + message: Timed out connecting to port 2379. Check your firewall. + - fail: + when: "error" + message: Unexpected port status + - pass: + when: "connected" + message: Port 2379 is open + - warn: + message: Unexpected port status + - tcpPortStatus: + checkName: "ETCD Server API TCP Port Status" + collectorName: "ETCD Server API TCP Port Status" + exclude: '{{kurl and .IsPrimary (not .IsUpgrade) | not }}' + outcomes: + - fail: + when: "connection-refused" + message: Connection to port 2380 was refused. This is likely to be a routing problem since this preflight configures a test server to listen on this port. + - warn: + when: "address-in-use" + message: Another process was already listening on port 2380. + - fail: + when: "connection-timeout" + message: Timed out connecting to port 2380. Check your firewall. 
+ - fail: + when: "error" + message: Unexpected port status + - pass: + when: "connected" + message: Port 2380 is open + - warn: + message: Unexpected port status + - tcpPortStatus: + checkName: "ETCD Health Server TCP Port Status" + collectorName: "ETCD Health Server TCP Port Status" + exclude: '{{kurl and .IsPrimary (not .IsUpgrade) | not }}' + outcomes: + - fail: + when: "connection-refused" + message: Connection to port 2381 was refused. This is likely to be a routing problem since this preflight configures a test server to listen on this port. + - warn: + when: "address-in-use" + message: Another process was already listening on port 2381. + - fail: + when: "connection-timeout" + message: Timed out connecting to port 2381. Check your firewall. + - fail: + when: "error" + message: Unexpected port status + - pass: + when: "connected" + message: Port 2381 is available + - warn: + message: Unexpected port status + - tcpPortStatus: + checkName: "Kubelet Health Server TCP Port Status" + collectorName: "Kubelet Health Server TCP Port Status" + exclude: '{{kurl and (not .IsUpgrade) | not }}' + outcomes: + - fail: + when: "connection-refused" + message: Connection to port 10248 was refused. This is likely to be a routing problem since this preflight configures a test server to listen on this port. + - warn: + when: "address-in-use" + message: Another process was already listening on port 10248. + - fail: + when: "connection-timeout" + message: Timed out connecting to port 10248. Check your firewall. + - fail: + when: "error" + message: Unexpected port status + - pass: + when: "connected" + message: Port 10248 is available + - warn: + message: Unexpected port status + - tcpPortStatus: + checkName: "Kubelet API TCP Port Status" + collectorName: "Kubelet API TCP Port Status" + exclude: '{{kurl and (not .IsUpgrade) | not }}' + outcomes: + - fail: + when: "connection-refused" + message: Connection to port 10250 was refused. This is likely to be a routing problem since this preflight configures a test server to listen on this port. + - warn: + when: "address-in-use" + message: Another process was already listening on port 10250. + - fail: + when: "connection-timeout" + message: Timed out connecting to port 10250. Check your firewall. + - fail: + when: "error" + message: Unexpected port status + - pass: + when: "connected" + message: Port 10250 is open + - warn: + message: Unexpected port status + - tcpPortStatus: + checkName: "Kube Controller Manager Health Server TCP Port Status" + collectorName: "Kube Controller Manager Health Server TCP Port Status" + exclude: '{{kurl and .IsPrimary (not .IsUpgrade) | not }}' + outcomes: + - fail: + when: "connection-refused" + message: Connection to port 10257 was refused. This is likely to be a routing problem since this preflight configures a test server to listen on this port. + - warn: + when: "address-in-use" + message: Another process was already listening on port 10257. + - fail: + when: "connection-timeout" + message: Timed out connecting to port 10257. Check your firewall. + - fail: + when: "error" + message: Unexpected port status + - pass: + when: "connected" + message: Port 10257 is available + - warn: + message: Unexpected port status + - tcpPortStatus: + checkName: "Kube Scheduler Health Server TCP Port Status" + collectorName: "Kube Scheduler Health Server TCP Port Status" + exclude: '{{kurl and .IsPrimary (not .IsUpgrade) | not }}' + outcomes: + - fail: + when: "connection-refused" + message: Connection to port 10259 was refused. 
This is likely to be a routing problem since this preflight configures a test server to listen on this port. + - warn: + when: "address-in-use" + message: Another process was already listening on port 10259. + - fail: + when: "connection-timeout" + message: Timed out connecting to port 10259. Check your firewall. + - fail: + when: "error" + message: Unexpected port status + - pass: + when: "connected" + message: Port 10259 is available + - warn: + message: Unexpected port status + - tcpConnect: + checkName: "Kubernetes API TCP Connection Status" + collectorName: "Kubernetes API TCP Connection Status" + # run the analyzer if 1. there is a master address set AND this is a node joining the cluster AND this is not an EKCO internalLB install + exclude: '{{kurl and .Installer.Spec.Kubernetes.Version .Installer.Spec.Kubernetes.MasterAddress .IsJoin (and .Installer.Spec.Ekco.Version .Installer.Spec.Ekco.EnableInternalLoadBalancer | not) | not }}' + outcomes: + - fail: + when: "connection-refused" + message: Connection to the Kubernetes API at address {{kurl .Installer.Spec.Kubernetes.MasterAddress }} was refused + - fail: + when: "connection-timeout" + message: Timed out connecting to the Kubernetes API at address {{kurl .Installer.Spec.Kubernetes.MasterAddress }} + - fail: + when: "error" + message: Unexpected error connecting to the Kubernetes API at address {{kurl .Installer.Spec.Kubernetes.MasterAddress }} + - pass: + when: "connected" + message: Successfully connected to the Kubernetes API at address {{kurl .Installer.Spec.Kubernetes.MasterAddress }} + - filesystemPerformance: + collectorName: Filesystem Latency Two Minute Benchmark + exclude: '{{kurl and .IsPrimary (not .IsUpgrade) | not }}' + outcomes: + - pass: + when: "p99 < 10ms" + message: "Write latency is ok (p99 target < 10ms, actual: {{ .P99 }})" + - warn: + message: "Write latency is high. p99 target < 10ms, actual:{{ .String }}" + - time: + checkName: "NTP Status" + outcomes: + - fail: + when: "ntp == unsynchronized+inactive" + message: "System clock is not synchronized" + - warn: + when: "ntp == unsynchronized+active" + message: System clock not yet synchronized + - pass: + when: "ntp == synchronized+active" + message: "System clock is synchronized" + - warn: + when: "timezone != UTC" + message: "Non UTC timezone can interfere with system function" + - pass: + when: "timezone == UTC" + message: "Timezone is set to UTC" + - hostOS: + checkName: "Docker Support" + exclude: '{{kurl or (not .Installer.Spec.Docker.Version) (semverCompare ">= 20.10.17" .Installer.Spec.Docker.Version) }}' + outcomes: + - fail: + when: "ubuntu = 22.04" + message: "Docker versions < 20.10.17 not supported on ubuntu 22.04" + # hijack hostOS analyzer in order to analyze the kURL Installer spec + - hostOS: + checkName: "Containerd and Weave Compatibility" + exclude: '{{kurl or (not .Installer.Spec.Weave.Version) (not .Installer.Spec.Containerd.Version) (semverCompare "1.6.0 - 1.6.4" .Installer.Spec.Containerd.Version | not) }}' + outcomes: + - fail: + message: "Weave is not compatible with containerd versions 1.6.0 - 1.6.4"