From b4f7984895461c3ed8d84b1ba82e4fd1ff22702a Mon Sep 17 00:00:00 2001 From: Stephen Soltesz Date: Fri, 9 Aug 2019 13:47:22 -0400 Subject: [PATCH] Improve error & panic metrics (#169) * Add ndt5 metrics for connections, results, errors, and panics. * Remove global ndt_test_total and ndt_test_errors_total * Improve meta error handling and metrics * Add error counts for login and status check * Add protocol and direction labels throughout * Update README with ndt5 monitoring metrics --- metrics/metrics.go | 18 ++---------- ndt5/README.md | 56 ++++++++++++++++++++++++++++++++++-- ndt5/c2s/c2s.go | 19 ++++++------ ndt5/meta/meta.go | 27 ++++++++++++++--- ndt5/meta/meta_test.go | 18 +++++++++++- ndt5/metrics/metrics.go | 41 ++++++++++++++++++++++++-- ndt5/ndt/server.go | 4 +++ ndt5/ndt5.go | 52 ++++++++++++++++++++++++--------- ndt5/plain/plain.go | 2 -- ndt5/s2c/s2c.go | 26 +++++++++-------- ndt5/singleserving/server.go | 21 +++++++------- 11 files changed, 210 insertions(+), 74 deletions(-) diff --git a/metrics/metrics.go b/metrics/metrics.go index 7a0903d0..bd630029 100644 --- a/metrics/metrics.go +++ b/metrics/metrics.go @@ -12,7 +12,7 @@ var ( Name: "ndt_active_tests", Help: "A gauge of requests currently being served by the NDT server.", }, - []string{"type"}) + []string{"protocol"}) TestRate = promauto.NewHistogramVec( prometheus.HistogramOpts{ Name: "ndt_test_rate_mbps", @@ -24,20 +24,6 @@ var ( 100, 150, 250, 400, 600, 1000}, }, - []string{"direction"}, - ) - TestCount = promauto.NewCounterVec( - prometheus.CounterOpts{ - Name: "ndt_test_total", - Help: "Number of NDT tests run by this server.", - }, - []string{"direction", "code"}, - ) - ErrorCount = promauto.NewCounterVec( - prometheus.CounterOpts{ - Name: "ndt_test_errors_total", - Help: "Number of test errors of each type for each test.", - }, - []string{"test", "error"}, + []string{"protocol", "direction"}, ) ) diff --git a/ndt5/README.md b/ndt5/README.md index 794ed44b..9277e98c 100644 --- a/ndt5/README.md +++ b/ndt5/README.md @@ -1,6 +1,6 @@ -# legacy ndt-server code +# NDT5 ndt-server code -All code in this directory tree is related to the support of the legacy NDT +All code in this directory tree is related to the support of the legacy NDT5 protocol. We have many extant clients that use this protocol, and we don't want to leave them high and dry, but new clients are encouraged to use the services provided by ndt7. The test is streamlined, the client is easier to @@ -8,6 +8,56 @@ write, and basically everything about it is better. In this subtree, we support existing clients, but we will be adding no new functionality. If you are reading this and trying to decide how to implement -a speed test, use ndt7 and not the legacy protocol. The legacy protocol is +a speed test, use ndt7 and not the legacy, ndt5 protocol. The legacy protocol is deprecated. It will be supported until usage drops to very low levels, but it is also not recommended for new integrations or code. + +## NDT5 Metrics + +Summary of metrics useful for monitoring client request, success, and error rates. + +* `ndt5_control_total{protocol, result}` counts every client connection + that reaches `HandleControlChannel`. + + * The "protocol=" label matches the client protocol, e.g., "WS", "WSS", or + "PLAIN". + * The "result=" label is either "okay" or "panic". + * All result="panic" results also count specific causes in + `ndt5_control_panic_total`. + * All result="okay" results come from "protocol complete" clients. + +* `ndt5_client_test_requested_total{protocol, direction}` counts + client-requested tests. + + * The "protocol=" label is the same as above. + * The "direction=" label will have values like "c2s" and "s2c". + * If the client continues the test, then the result will be counted in + `ndt5_client_test_results_total`. + +* `ndt5_client_test_results_total{protocol, direction, result}` counts the + results of client-requested tests. + + * The "protocol=" and "direction=" labels are as above. + * The "result=" label is either "okay-with-rate", "error-with-rate" or + "error-without-rate". + * All result="okay-with-rate" count all "protocol complete" clients up to that + point. + * All result=~"error-.*" results also count specific causes in + `ndt5_client_test_errors_total`. + +* `ndt5_client_test_errors_total{protocol, direction, error}` + + * The "protocol=" and "direction=" labels are as above. + * The "error=" label contains unique values mapping to specific error paths in + the ndt-server. + +Expected invariants: + +* `sum(ndt5_control_channel_duration_count) == sum(ndt5_control_total)` +* `sum(ndt5_control_total{result="panic"}) == sum(ndt5_control_panic_total)` +* `sum(ndt5_client_test_results_total{result=~"error.*"}) == sum(ndt5_client_test_errors_total)` + +NOTE: + +* `ndt5_client_test_results_total` may be less than `ndt5_client_test_requested_total` + if the client hangs up before the test can run. diff --git a/ndt5/c2s/c2s.go b/ndt5/c2s/c2s.go index beca4169..cf5f2e04 100644 --- a/ndt5/c2s/c2s.go +++ b/ndt5/c2s/c2s.go @@ -6,7 +6,7 @@ import ( "strconv" "time" - "github.com/m-lab/ndt-server/metrics" + "github.com/m-lab/ndt-server/ndt5/metrics" "github.com/m-lab/go/warnonerror" "github.com/m-lab/ndt-server/ndt5/ndt" @@ -48,25 +48,26 @@ func ManageTest(ctx context.Context, controlConn protocol.Connection, s ndt.Serv record = &ArchivalData{} m := controlConn.Messager() + connType := s.ConnectionType().String() srv, err := s.SingleServingServer("c2s") if err != nil { log.Println("Could not start SingleServingServer", err) - metrics.ErrorCount.WithLabelValues("c2s", "StartSingleServingServer").Inc() + metrics.ClientTestErrors.WithLabelValues(connType, "c2s", "StartSingleServingServer").Inc() return record, err } err = m.SendMessage(protocol.TestPrepare, []byte(strconv.Itoa(srv.Port()))) if err != nil { log.Println("Could not send TestPrepare", err) - metrics.ErrorCount.WithLabelValues("c2s", "TestPrepare").Inc() + metrics.ClientTestErrors.WithLabelValues(connType, "c2s", "TestPrepare").Inc() return record, err } testConn, err := srv.ServeOnce(localContext) if err != nil { log.Println("Could not successfully ServeOnce", err) - metrics.ErrorCount.WithLabelValues("c2s", "ServeOnce").Inc() + metrics.ClientTestErrors.WithLabelValues(connType, "c2s", "ServeOnce").Inc() return record, err } @@ -88,7 +89,7 @@ func ManageTest(ctx context.Context, controlConn protocol.Connection, s ndt.Serv err = m.SendMessage(protocol.TestStart, []byte{}) if err != nil { log.Println("Could not send TestStart", err) - metrics.ErrorCount.WithLabelValues("c2s", "TestStart").Inc() + metrics.ClientTestErrors.WithLabelValues(connType, "c2s", "TestStart").Inc() return record, err } @@ -100,13 +101,13 @@ func ManageTest(ctx context.Context, controlConn protocol.Connection, s ndt.Serv if err != nil { if byteCount == 0 { log.Println("Could not drain the test connection", byteCount, err) - metrics.ErrorCount.WithLabelValues("c2s", "Drain").Inc() + metrics.ClientTestErrors.WithLabelValues(connType, "c2s", "Drain").Inc() return record, err } // It is possible for the client to reach 10 seconds slightly before the server does. if seconds < 9 { log.Printf("C2S test client only uploaded for %f seconds\n", seconds) - metrics.ErrorCount.WithLabelValues("c2s", "EarlyExit").Inc() + metrics.ClientTestErrors.WithLabelValues(connType, "c2s", "EarlyExit").Inc() return record, err } // More than 9 seconds is fine. @@ -120,14 +121,14 @@ func ManageTest(ctx context.Context, controlConn protocol.Connection, s ndt.Serv err = m.SendMessage(protocol.TestMsg, []byte(strconv.FormatInt(int64(throughputValue), 10))) if err != nil { log.Println("Could not send TestMsg with C2S results", err) - metrics.ErrorCount.WithLabelValues("c2s", "TestMsg").Inc() + metrics.ClientTestErrors.WithLabelValues(connType, "c2s", "TestMsg").Inc() return record, err } err = m.SendMessage(protocol.TestFinalize, []byte{}) if err != nil { log.Println("Could not send TestFinalize", err) - metrics.ErrorCount.WithLabelValues("c2s", "TestFinalize").Inc() + metrics.ClientTestErrors.WithLabelValues(connType, "c2s", "TestFinalize").Inc() return record, err } diff --git a/ndt5/meta/meta.go b/ndt5/meta/meta.go index 7c2d884f..79342a17 100644 --- a/ndt5/meta/meta.go +++ b/ndt5/meta/meta.go @@ -7,6 +7,7 @@ import ( "time" "github.com/m-lab/ndt-server/ndt5/metrics" + "github.com/m-lab/ndt-server/ndt5/ndt" "github.com/m-lab/ndt-server/ndt5/protocol" ) @@ -20,16 +21,27 @@ type ArchivalData map[string]string // takes longer than 15sec, then ManageTest will return after the next ReceiveMessage. // The given protocolMessager should have its own connection timeout to prevent // "slow drip" clients holding the connection open indefinitely. -func ManageTest(ctx context.Context, m protocol.Messager) (ArchivalData, error) { +func ManageTest(ctx context.Context, m protocol.Messager, s ndt.Server) (ArchivalData, error) { localCtx, localCancel := context.WithTimeout(ctx, 15*time.Second) defer localCancel() var err error var message []byte results := map[string]string{} + connType := s.ConnectionType().String() - m.SendMessage(protocol.TestPrepare, []byte{}) - m.SendMessage(protocol.TestStart, []byte{}) + err = m.SendMessage(protocol.TestPrepare, []byte{}) + if err != nil { + log.Println("META TestPrepare:", err) + metrics.ClientTestErrors.WithLabelValues(connType, "meta", "TestPrepare").Inc() + return nil, err + } + err = m.SendMessage(protocol.TestStart, []byte{}) + if err != nil { + log.Println("META TestStart:", err) + metrics.ClientTestErrors.WithLabelValues(connType, "meta", "TestStart").Inc() + return nil, err + } count := 0 for count < maxClientMessages && localCtx.Err() == nil { message, err = m.ReceiveMessage(protocol.TestMsg) @@ -54,14 +66,21 @@ func ManageTest(ctx context.Context, m protocol.Messager) (ArchivalData, error) } if localCtx.Err() != nil { log.Println("META context error:", localCtx.Err()) + metrics.ClientTestErrors.WithLabelValues(connType, "meta", "context").Inc() return nil, localCtx.Err() } if err != nil { log.Println("Error reading JSON message:", err) + metrics.ClientTestErrors.WithLabelValues(connType, "meta", "ReceiveMessage").Inc() return nil, err } // Count the number meta values sent by the client (when there are no errors). metrics.SubmittedMetaValues.Observe(float64(count)) - m.SendMessage(protocol.TestFinalize, []byte{}) + err = m.SendMessage(protocol.TestFinalize, []byte{}) + if err != nil { + log.Println("META TestFinalize:", err) + metrics.ClientTestErrors.WithLabelValues(connType, "meta", "TestFinalize").Inc() + return nil, err + } return results, nil } diff --git a/ndt5/meta/meta_test.go b/ndt5/meta/meta_test.go index 61bf1c23..9e0d1a25 100644 --- a/ndt5/meta/meta_test.go +++ b/ndt5/meta/meta_test.go @@ -6,6 +6,7 @@ import ( "reflect" "testing" + "github.com/m-lab/ndt-server/ndt5/ndt" "github.com/m-lab/ndt-server/ndt5/protocol" ) @@ -22,6 +23,20 @@ type fakeMessager struct { recv []recvMessage c int } +type fakeServer struct{} + +func (s *fakeServer) SingleServingServer(direction string) (ndt.SingleMeasurementServer, error) { + return nil, nil +} +func (s *fakeServer) ConnectionType() ndt.ConnectionType { + return ndt.Plain +} +func (s *fakeServer) DataDir() string { + return "" +} +func (s *fakeServer) LoginCeremony(protocol.Connection) (int, error) { + return 0, nil +} func (m *fakeMessager) SendMessage(t protocol.MessageType, msg []byte) error { m.sent = append(m.sent, sendMessage{t: t, msg: msg}) @@ -112,8 +127,9 @@ func TestManageTest(t *testing.T) { }, } for _, tt := range tests { + s := &fakeServer{} t.Run(tt.name, func(t *testing.T) { - got, err := ManageTest(tt.ctx, tt.m) + got, err := ManageTest(tt.ctx, tt.m, s) if (err != nil) != tt.wantErr { t.Errorf("ManageTest() error = %v, wantErr %v", err, tt.wantErr) return diff --git a/ndt5/metrics/metrics.go b/ndt5/metrics/metrics.go index 53fbd0b2..00549bb9 100644 --- a/ndt5/metrics/metrics.go +++ b/ndt5/metrics/metrics.go @@ -21,6 +21,20 @@ var ( }, []string{"protocol"}, ) + ControlPanicCount = promauto.NewCounterVec( + prometheus.CounterOpts{ + Name: "ndt5_control_panic_total", + Help: "Number of recovered panics in the control channel.", + }, + []string{"protocol", "error"}, + ) + ControlCount = promauto.NewCounterVec( + prometheus.CounterOpts{ + Name: "ndt5_control_total", + Help: "Number of control channel requests that results for each protocol and test type.", + }, + []string{"protocol", "result"}, + ) MeasurementServerStart = promauto.NewCounterVec( prometheus.CounterOpts{ Name: "ndt5_measurementserver_start_total", @@ -28,6 +42,13 @@ var ( }, []string{"protocol"}, ) + MeasurementServerAccept = promauto.NewCounterVec( + prometheus.CounterOpts{ + Name: "ndt5_measurementserver_accept_total", + Help: "The number of times a single-serving server received a successful client connections.", + }, + []string{"protocol", "direction"}, + ) MeasurementServerStop = promauto.NewCounterVec( prometheus.CounterOpts{ Name: "ndt5_measurementserver_stop_total", @@ -46,14 +67,28 @@ var ( Name: "ndt5_client_requested_suites_total", Help: "The number of client request test suites (the combination of all test types as an integer 0-255).", }, - []string{"suite"}, + []string{"protocol", "suite"}, ) ClientRequestedTests = promauto.NewCounterVec( prometheus.CounterOpts{ - Name: "ndt5_client_requested_tests_total", + Name: "ndt5_client_test_requested_total", Help: "The number of client requests for each ndt5 test type.", }, - []string{"type"}, + []string{"protocol", "direction"}, + ) + ClientTestResults = promauto.NewCounterVec( + prometheus.CounterOpts{ + Name: "ndt5_client_test_results_total", + Help: "Number of client-connections for NDT tests run by this server.", + }, + []string{"protocol", "direction", "result"}, + ) + ClientTestErrors = promauto.NewCounterVec( + prometheus.CounterOpts{ + Name: "ndt5_client_test_errors_total", + Help: "Number of test errors of each type for each test.", + }, + []string{"protocol", "direction", "error"}, ) SubmittedMetaValues = promauto.NewHistogram( prometheus.HistogramOpts{ diff --git a/ndt5/ndt/server.go b/ndt5/ndt/server.go index a3da395f..de2b89bb 100644 --- a/ndt5/ndt/server.go +++ b/ndt5/ndt/server.go @@ -10,6 +10,10 @@ import ( // websockets, or secure websockets. type ConnectionType string +func (c ConnectionType) String() string { + return string(c) +} + // The types of connections we support. var ( WS = ConnectionType("WS") diff --git a/ndt5/ndt5.go b/ndt5/ndt5.go index 56585cad..8f04fd21 100644 --- a/ndt5/ndt5.go +++ b/ndt5/ndt5.go @@ -87,26 +87,42 @@ func panicMsgToErrType(msg string) string { return "panic" } +func getResultLabel(err error, rate float64) string { + withErr := "okay" + if err != nil { + withErr = "error" + } + withResult := "-with-rate" + if rate == 0 { + withResult = "-without-rate" + } + return withErr + withResult +} + // HandleControlChannel is the "business logic" of an NDT test. It is designed // to run every test, and to never need to know whether the underlying // connection is just a TCP socket, a WS connection, or a WSS connection. It // only needs a connection, and a factory for making single-use servers for // connections of that same type. func HandleControlChannel(conn protocol.Connection, s ndt.Server) { - metrics.ActiveTests.WithLabelValues(string(s.ConnectionType())).Inc() - defer metrics.ActiveTests.WithLabelValues(string(s.ConnectionType())).Dec() + connType := s.ConnectionType().String() + metrics.ActiveTests.WithLabelValues(connType).Inc() + defer metrics.ActiveTests.WithLabelValues(connType).Dec() defer func(start time.Time) { - ndt5metrics.ControlChannelDuration.WithLabelValues(string(s.ConnectionType())).Observe( + ndt5metrics.ControlChannelDuration.WithLabelValues(connType).Observe( time.Since(start).Seconds()) }(time.Now()) defer func() { + completed := "okay" r := recover() if r != nil { log.Println("Test failed, but we recovered:", r) // All of our panic messages begin with an informative first word. Use that as a label. errType := panicMsgToErrType(fmt.Sprint(r)) - metrics.ErrorCount.WithLabelValues(string(s.ConnectionType()), errType).Inc() + ndt5metrics.ControlPanicCount.WithLabelValues(connType, errType).Inc() + completed = "panic" } + ndt5metrics.ControlCount.WithLabelValues(connType, completed).Inc() }() handleControlChannel(conn, s) } @@ -118,7 +134,7 @@ func handleControlChannel(conn protocol.Connection, s ndt.Server) { log.Println("Handling connection", conn) defer warnonerror.Close(conn, "Could not close "+conn.String()) - + connType := s.ConnectionType().String() sIP, sPort := conn.ServerIPAndPort() cIP, cPort := conn.ClientIPAndPort() record := &data.NDTResult{ @@ -140,10 +156,14 @@ func handleControlChannel(conn protocol.Connection, s ndt.Server) { }() tests, err := s.LoginCeremony(conn) + if err != nil { + ndt5metrics.ClientTestErrors.WithLabelValues(connType, "control", "LoginCeremony").Inc() + } rtx.PanicOnError(err, "Login - error reading JSON message") if (tests & cTestStatus) == 0 { log.Println("We don't support clients that don't support TestStatus") + ndt5metrics.ClientTestErrors.WithLabelValues(connType, "control", "TestStatus").Inc() return } testsToRun := []string{} @@ -155,30 +175,30 @@ func handleControlChannel(conn protocol.Connection, s ndt.Server) { suites := []string{"status"} if runMID { - ndt5metrics.ClientRequestedTests.WithLabelValues("mid").Inc() + ndt5metrics.ClientRequestedTests.WithLabelValues(connType, "mid").Inc() suites = append(suites, "mid") } if runC2s { testsToRun = append(testsToRun, strconv.Itoa(cTestC2S)) - ndt5metrics.ClientRequestedTests.WithLabelValues("c2s").Inc() + ndt5metrics.ClientRequestedTests.WithLabelValues(connType, "c2s").Inc() suites = append(suites, "c2s") } if runS2c { testsToRun = append(testsToRun, strconv.Itoa(cTestS2C)) - ndt5metrics.ClientRequestedTests.WithLabelValues("s2c").Inc() + ndt5metrics.ClientRequestedTests.WithLabelValues(connType, "s2c").Inc() suites = append(suites, "s2c") } if runSFW { - ndt5metrics.ClientRequestedTests.WithLabelValues("sfw").Inc() + ndt5metrics.ClientRequestedTests.WithLabelValues(connType, "sfw").Inc() suites = append(suites, "sfw") } if runMeta { testsToRun = append(testsToRun, strconv.Itoa(cTestMETA)) - ndt5metrics.ClientRequestedTests.WithLabelValues("meta").Inc() + ndt5metrics.ClientRequestedTests.WithLabelValues(connType, "meta").Inc() suites = append(suites, "meta") } // Count the combined test suites by name. i.e. "status-s2c-meta" - ndt5metrics.ClientRequestedTestSuites.WithLabelValues(strings.Join(suites, "-")).Inc() + ndt5metrics.ClientRequestedTestSuites.WithLabelValues(connType, strings.Join(suites, "-")).Inc() m := conn.Messager() record.Control.MessageProtocol = m.Encoding().String() @@ -197,20 +217,24 @@ func handleControlChannel(conn protocol.Connection, s ndt.Server) { record.C2S, err = c2s.ManageTest(ctx, conn, s) if record.C2S != nil && record.C2S.MeanThroughputMbps != 0 { c2sRate = record.C2S.MeanThroughputMbps - metrics.TestRate.WithLabelValues("c2s").Observe(c2sRate) + metrics.TestRate.WithLabelValues(connType, "c2s").Observe(c2sRate) } + r := getResultLabel(err, record.C2S.MeanThroughputMbps) + ndt5metrics.ClientTestResults.WithLabelValues(connType, "c2s", r).Inc() rtx.PanicOnError(err, "C2S - Could not run c2s test") } if runS2c { record.S2C, err = s2c.ManageTest(ctx, conn, s) if record.S2C != nil && record.S2C.MeanThroughputMbps != 0 { s2cRate = record.S2C.MeanThroughputMbps - metrics.TestRate.WithLabelValues("s2c").Observe(s2cRate) + metrics.TestRate.WithLabelValues(connType, "s2c").Observe(s2cRate) } + r := getResultLabel(err, record.S2C.MeanThroughputMbps) + ndt5metrics.ClientTestResults.WithLabelValues(connType, "s2c", r).Inc() rtx.PanicOnError(err, "S2C - Could not run s2c test") } if runMeta { - record.Control.ClientMetadata, err = meta.ManageTest(ctx, m) + record.Control.ClientMetadata, err = meta.ManageTest(ctx, m, s) rtx.PanicOnError(err, "META - Could not run meta test") } speedMsg := fmt.Sprintf("You uploaded at %.4f and downloaded at %.4f", c2sRate*1000, s2cRate*1000) diff --git a/ndt5/plain/plain.go b/ndt5/plain/plain.go index 137d8347..35aee442 100644 --- a/ndt5/plain/plain.go +++ b/ndt5/plain/plain.go @@ -13,7 +13,6 @@ import ( "time" "github.com/m-lab/go/warnonerror" - "github.com/m-lab/ndt-server/metrics" "github.com/m-lab/ndt-server/ndt5" ndt5metrics "github.com/m-lab/ndt-server/ndt5/metrics" "github.com/m-lab/ndt-server/ndt5/ndt" @@ -62,7 +61,6 @@ func (ps *plainServer) sniffThenHandle(conn net.Conn) { fwd, err := ps.dialer.Dial("tcp", ps.wsAddr) if err != nil { log.Println("Could not forward connection", err) - metrics.ErrorCount.WithLabelValues(string(ndt.WS), "forwarding").Inc() return } wg := sync.WaitGroup{} diff --git a/ndt5/s2c/s2c.go b/ndt5/s2c/s2c.go index 783d98e6..1f735322 100644 --- a/ndt5/s2c/s2c.go +++ b/ndt5/s2c/s2c.go @@ -8,7 +8,7 @@ import ( "time" "github.com/m-lab/go/warnonerror" - "github.com/m-lab/ndt-server/metrics" + "github.com/m-lab/ndt-server/ndt5/metrics" "github.com/m-lab/ndt-server/ndt5/ndt" "github.com/m-lab/ndt-server/ndt5/protocol" ) @@ -51,24 +51,26 @@ func ManageTest(ctx context.Context, controlConn protocol.Connection, s ndt.Serv } }() + connType := s.ConnectionType().String() + srv, err := s.SingleServingServer("s2c") if err != nil { log.Println("Could not start single serving server", err) - metrics.ErrorCount.WithLabelValues("s2c", "StartSingleServingServer").Inc() + metrics.ClientTestErrors.WithLabelValues(connType, "s2c", "StartSingleServingServer").Inc() return record, err } m := controlConn.Messager() err = m.SendMessage(protocol.TestPrepare, []byte(strconv.Itoa(srv.Port()))) if err != nil { log.Println("Could not send TestPrepare", err) - metrics.ErrorCount.WithLabelValues("s2c", "TestPrepare").Inc() + metrics.ClientTestErrors.WithLabelValues(connType, "s2c", "TestPrepare").Inc() return record, err } testConn, err := srv.ServeOnce(localCtx) if err != nil || testConn == nil { log.Println("Could not successfully ServeOnce", err) - metrics.ErrorCount.WithLabelValues("s2c", "ServeOnce").Inc() + metrics.ClientTestErrors.WithLabelValues(connType, "s2c", "ServeOnce").Inc() if err == nil { err = errors.New("nil testConn, but also a nil error") } @@ -87,7 +89,7 @@ func ManageTest(ctx context.Context, controlConn protocol.Connection, s ndt.Serv if err != nil { warnonerror.Close(testConn, "Could not close test connection") log.Println("Could not write TestStart", err) - metrics.ErrorCount.WithLabelValues("s2c", "TestStart").Inc() + metrics.ClientTestErrors.WithLabelValues(connType, "s2c", "TestStart").Inc() return record, err } @@ -98,7 +100,7 @@ func ManageTest(ctx context.Context, controlConn protocol.Connection, s ndt.Serv if err != nil { warnonerror.Close(testConn, "Could not close test connection") log.Println("Could not FillUntil", err) - metrics.ErrorCount.WithLabelValues("s2c", "FillUntil").Inc() + metrics.ClientTestErrors.WithLabelValues(connType, "s2c", "FillUntil").Inc() return record, err } @@ -106,12 +108,12 @@ func ManageTest(ctx context.Context, controlConn protocol.Connection, s ndt.Serv if err != nil { warnonerror.Close(testConn, "Could not close test connection") log.Println("Could not read metrics", err) - metrics.ErrorCount.WithLabelValues("s2c", "web100Metrics").Inc() + metrics.ClientTestErrors.WithLabelValues(connType, "s2c", "web100Metrics").Inc() return record, err } // Close the test connection to signal to single-threaded clients that the - // download has completed. Note: a possible optimisation is to wait for + // download has completed. Note: a possible optimization is to wait for // one-two seconds for the client to close the connection and then close // it anyway. This gives us the advantage that the client will retain // the state assciated with initiating the close. @@ -128,13 +130,13 @@ func ManageTest(ctx context.Context, controlConn protocol.Connection, s ndt.Serv err = m.SendS2CResults(int64(kbps), 0, byteCount) if err != nil { log.Println("Could not write a TestMsg", err) - metrics.ErrorCount.WithLabelValues("s2c", "TestMsgSend").Inc() + metrics.ClientTestErrors.WithLabelValues(connType, "s2c", "TestMsgSend").Inc() return record, err } clientRateMsg, err := m.ReceiveMessage(protocol.TestMsg) if err != nil { - metrics.ErrorCount.WithLabelValues("s2c", "TestMsgRcv").Inc() + metrics.ClientTestErrors.WithLabelValues(connType, "s2c", "TestMsgRcv").Inc() log.Println("Could not receive a TestMsg", err) return record, err } @@ -150,14 +152,14 @@ func ManageTest(ctx context.Context, controlConn protocol.Connection, s ndt.Serv err = protocol.SendMetrics(web100metrics, m) if err != nil { log.Println("Could not SendMetrics", err) - metrics.ErrorCount.WithLabelValues("s2c", "SendMetrics").Inc() + metrics.ClientTestErrors.WithLabelValues(connType, "s2c", "SendMetrics").Inc() return record, err } err = m.SendMessage(protocol.TestFinalize, []byte{}) if err != nil { log.Println("Could not send TestFinalize", err) - metrics.ErrorCount.WithLabelValues("s2c", "TestFinalize").Inc() + metrics.ClientTestErrors.WithLabelValues(connType, "s2c", "TestFinalize").Inc() return record, err } diff --git a/ndt5/singleserving/server.go b/ndt5/singleserving/server.go index 6230a4ef..7e8f8024 100644 --- a/ndt5/singleserving/server.go +++ b/ndt5/singleserving/server.go @@ -13,12 +13,9 @@ import ( "github.com/m-lab/ndt-server/ndt5/ws" "github.com/m-lab/ndt-server/ndt7/listener" - "github.com/m-lab/ndt-server/metrics" ndt5metrics "github.com/m-lab/ndt-server/ndt5/metrics" "github.com/m-lab/ndt-server/ndt5/protocol" "github.com/m-lab/ndt-server/ndt5/tcplistener" - "github.com/prometheus/client_golang/prometheus" - "github.com/prometheus/client_golang/prometheus/promhttp" ) // wsServer is a single-serving server for unencrypted websockets. @@ -80,6 +77,8 @@ func (s *wsServer) ServeOnce(ctx context.Context) (protocol.MeasuredConnection, if s.newConn == nil && s.newConnErr == nil { return nil, errors.New("No connection created") } + // Because the client has contacted the test server successfully, count the test. + ndt5metrics.MeasurementServerAccept.WithLabelValues(s.kind.String(), s.direction) return s.newConn, s.newConnErr } @@ -112,8 +111,7 @@ func listenWS(direction string) (*wsServer, error) { kind: ndt.WS, } s.serve = s.srv.Serve - mux.HandleFunc("/ndt_protocol", - promhttp.InstrumentHandlerCounter(metrics.TestCount.MustCurryWith(prometheus.Labels{"direction": direction}), s)) + mux.Handle("/ndt_protocol", s) // Start listening right away to ensure that subsequent connections succeed. l, err := net.Listen("tcp", ":0") @@ -159,8 +157,9 @@ func ListenWSS(direction, certFile, keyFile string) (ndt.SingleMeasurementServer // plainServer is a single-serving server for plain TCP sockets. type plainServer struct { - listener net.Listener - port int + listener net.Listener + port int + direction string } func (ps *plainServer) Close() { @@ -190,6 +189,8 @@ func (ps *plainServer) ServeOnce(ctx context.Context) (protocol.MeasuredConnecti if conn == nil { return nil, errors.New("nil conn, nil err: " + derivedCtx.Err().Error()) } + // Because the client has contacted the test server successfully, count the test. + ndt5metrics.MeasurementServerAccept.WithLabelValues(ndt.Plain.String(), ps.direction) return protocol.AdaptNetConn(conn, conn), nil } @@ -201,10 +202,10 @@ func (ps *plainServer) ServeOnce(ctx context.Context) (protocol.MeasuredConnecti // timeouts) after this returns. func ListenPlain(direction string) (ndt.SingleMeasurementServer, error) { ndt5metrics.MeasurementServerStart.WithLabelValues(string(ndt.Plain)).Inc() - // The "code" label is required for the TestCount metric. The code value is hardcoded. - metrics.TestCount.MustCurryWith(prometheus.Labels{"direction": direction, "code": "200"}) // Start listening right away to ensure that subsequent connections succeed. - s := &plainServer{} + s := &plainServer{ + direction: direction, + } l, err := net.Listen("tcp", ":0") if err != nil { return nil, err