Skip to content

Commit

Permalink
Improve error & panic metrics (#169)
Browse files Browse the repository at this point in the history
* Add ndt5 metrics for connections, results, errors, and panics.
* Remove global ndt_test_total and ndt_test_errors_total
* Improve meta error handling and metrics
* Add error counts for login and status check
* Add protocol and direction labels throughout
* Update README with ndt5 monitoring metrics
  • Loading branch information
stephen-soltesz authored Aug 9, 2019
1 parent 76e0c34 commit b4f7984
Show file tree
Hide file tree
Showing 11 changed files with 210 additions and 74 deletions.
18 changes: 2 additions & 16 deletions metrics/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ var (
Name: "ndt_active_tests",
Help: "A gauge of requests currently being served by the NDT server.",
},
[]string{"type"})
[]string{"protocol"})
TestRate = promauto.NewHistogramVec(
prometheus.HistogramOpts{
Name: "ndt_test_rate_mbps",
Expand All @@ -24,20 +24,6 @@ var (
100, 150, 250, 400, 600,
1000},
},
[]string{"direction"},
)
TestCount = promauto.NewCounterVec(
prometheus.CounterOpts{
Name: "ndt_test_total",
Help: "Number of NDT tests run by this server.",
},
[]string{"direction", "code"},
)
ErrorCount = promauto.NewCounterVec(
prometheus.CounterOpts{
Name: "ndt_test_errors_total",
Help: "Number of test errors of each type for each test.",
},
[]string{"test", "error"},
[]string{"protocol", "direction"},
)
)
56 changes: 53 additions & 3 deletions ndt5/README.md
Original file line number Diff line number Diff line change
@@ -1,13 +1,63 @@
# legacy ndt-server code
# NDT5 ndt-server code

All code in this directory tree is related to the support of the legacy NDT
All code in this directory tree is related to the support of the legacy NDT5
protocol. We have many extant clients that use this protocol, and we don't
want to leave them high and dry, but new clients are encouraged to use the
services provided by ndt7. The test is streamlined, the client is easier to
write, and basically everything about it is better.

In this subtree, we support existing clients, but we will be adding no new
functionality. If you are reading this and trying to decide how to implement
a speed test, use ndt7 and not the legacy protocol. The legacy protocol is
a speed test, use ndt7 and not the legacy, ndt5 protocol. The legacy protocol is
deprecated. It will be supported until usage drops to very low levels, but it
is also not recommended for new integrations or code.

## NDT5 Metrics

Summary of metrics useful for monitoring client request, success, and error rates.

* `ndt5_control_total{protocol, result}` counts every client connection
that reaches `HandleControlChannel`.

* The "protocol=" label matches the client protocol, e.g., "WS", "WSS", or
"PLAIN".
* The "result=" label is either "okay" or "panic".
* All result="panic" results also count specific causes in
`ndt5_control_panic_total`.
* All result="okay" results come from "protocol complete" clients.

* `ndt5_client_test_requested_total{protocol, direction}` counts
client-requested tests.

* The "protocol=" label is the same as above.
* The "direction=" label will have values like "c2s" and "s2c".
* If the client continues the test, then the result will be counted in
`ndt5_client_test_results_total`.

* `ndt5_client_test_results_total{protocol, direction, result}` counts the
results of client-requested tests.

* The "protocol=" and "direction=" labels are as above.
* The "result=" label is either "okay-with-rate", "error-with-rate" or
"error-without-rate".
* All result="okay-with-rate" count all "protocol complete" clients up to that
point.
* All result=~"error-.*" results also count specific causes in
`ndt5_client_test_errors_total`.

* `ndt5_client_test_errors_total{protocol, direction, error}`

* The "protocol=" and "direction=" labels are as above.
* The "error=" label contains unique values mapping to specific error paths in
the ndt-server.

Expected invariants:

* `sum(ndt5_control_channel_duration_count) == sum(ndt5_control_total)`
* `sum(ndt5_control_total{result="panic"}) == sum(ndt5_control_panic_total)`
* `sum(ndt5_client_test_results_total{result=~"error.*"}) == sum(ndt5_client_test_errors_total)`

NOTE:

* `ndt5_client_test_results_total` may be less than `ndt5_client_test_requested_total`
if the client hangs up before the test can run.
19 changes: 10 additions & 9 deletions ndt5/c2s/c2s.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ import (
"strconv"
"time"

"github.com/m-lab/ndt-server/metrics"
"github.com/m-lab/ndt-server/ndt5/metrics"

"github.com/m-lab/go/warnonerror"
"github.com/m-lab/ndt-server/ndt5/ndt"
Expand Down Expand Up @@ -48,25 +48,26 @@ func ManageTest(ctx context.Context, controlConn protocol.Connection, s ndt.Serv
record = &ArchivalData{}

m := controlConn.Messager()
connType := s.ConnectionType().String()

srv, err := s.SingleServingServer("c2s")
if err != nil {
log.Println("Could not start SingleServingServer", err)
metrics.ErrorCount.WithLabelValues("c2s", "StartSingleServingServer").Inc()
metrics.ClientTestErrors.WithLabelValues(connType, "c2s", "StartSingleServingServer").Inc()
return record, err
}

err = m.SendMessage(protocol.TestPrepare, []byte(strconv.Itoa(srv.Port())))
if err != nil {
log.Println("Could not send TestPrepare", err)
metrics.ErrorCount.WithLabelValues("c2s", "TestPrepare").Inc()
metrics.ClientTestErrors.WithLabelValues(connType, "c2s", "TestPrepare").Inc()
return record, err
}

testConn, err := srv.ServeOnce(localContext)
if err != nil {
log.Println("Could not successfully ServeOnce", err)
metrics.ErrorCount.WithLabelValues("c2s", "ServeOnce").Inc()
metrics.ClientTestErrors.WithLabelValues(connType, "c2s", "ServeOnce").Inc()
return record, err
}

Expand All @@ -88,7 +89,7 @@ func ManageTest(ctx context.Context, controlConn protocol.Connection, s ndt.Serv
err = m.SendMessage(protocol.TestStart, []byte{})
if err != nil {
log.Println("Could not send TestStart", err)
metrics.ErrorCount.WithLabelValues("c2s", "TestStart").Inc()
metrics.ClientTestErrors.WithLabelValues(connType, "c2s", "TestStart").Inc()
return record, err
}

Expand All @@ -100,13 +101,13 @@ func ManageTest(ctx context.Context, controlConn protocol.Connection, s ndt.Serv
if err != nil {
if byteCount == 0 {
log.Println("Could not drain the test connection", byteCount, err)
metrics.ErrorCount.WithLabelValues("c2s", "Drain").Inc()
metrics.ClientTestErrors.WithLabelValues(connType, "c2s", "Drain").Inc()
return record, err
}
// It is possible for the client to reach 10 seconds slightly before the server does.
if seconds < 9 {
log.Printf("C2S test client only uploaded for %f seconds\n", seconds)
metrics.ErrorCount.WithLabelValues("c2s", "EarlyExit").Inc()
metrics.ClientTestErrors.WithLabelValues(connType, "c2s", "EarlyExit").Inc()
return record, err
}
// More than 9 seconds is fine.
Expand All @@ -120,14 +121,14 @@ func ManageTest(ctx context.Context, controlConn protocol.Connection, s ndt.Serv
err = m.SendMessage(protocol.TestMsg, []byte(strconv.FormatInt(int64(throughputValue), 10)))
if err != nil {
log.Println("Could not send TestMsg with C2S results", err)
metrics.ErrorCount.WithLabelValues("c2s", "TestMsg").Inc()
metrics.ClientTestErrors.WithLabelValues(connType, "c2s", "TestMsg").Inc()
return record, err
}

err = m.SendMessage(protocol.TestFinalize, []byte{})
if err != nil {
log.Println("Could not send TestFinalize", err)
metrics.ErrorCount.WithLabelValues("c2s", "TestFinalize").Inc()
metrics.ClientTestErrors.WithLabelValues(connType, "c2s", "TestFinalize").Inc()
return record, err
}

Expand Down
27 changes: 23 additions & 4 deletions ndt5/meta/meta.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ import (
"time"

"github.com/m-lab/ndt-server/ndt5/metrics"
"github.com/m-lab/ndt-server/ndt5/ndt"
"github.com/m-lab/ndt-server/ndt5/protocol"
)

Expand All @@ -20,16 +21,27 @@ type ArchivalData map[string]string
// takes longer than 15sec, then ManageTest will return after the next ReceiveMessage.
// The given protocolMessager should have its own connection timeout to prevent
// "slow drip" clients holding the connection open indefinitely.
func ManageTest(ctx context.Context, m protocol.Messager) (ArchivalData, error) {
func ManageTest(ctx context.Context, m protocol.Messager, s ndt.Server) (ArchivalData, error) {
localCtx, localCancel := context.WithTimeout(ctx, 15*time.Second)
defer localCancel()

var err error
var message []byte
results := map[string]string{}
connType := s.ConnectionType().String()

m.SendMessage(protocol.TestPrepare, []byte{})
m.SendMessage(protocol.TestStart, []byte{})
err = m.SendMessage(protocol.TestPrepare, []byte{})
if err != nil {
log.Println("META TestPrepare:", err)
metrics.ClientTestErrors.WithLabelValues(connType, "meta", "TestPrepare").Inc()
return nil, err
}
err = m.SendMessage(protocol.TestStart, []byte{})
if err != nil {
log.Println("META TestStart:", err)
metrics.ClientTestErrors.WithLabelValues(connType, "meta", "TestStart").Inc()
return nil, err
}
count := 0
for count < maxClientMessages && localCtx.Err() == nil {
message, err = m.ReceiveMessage(protocol.TestMsg)
Expand All @@ -54,14 +66,21 @@ func ManageTest(ctx context.Context, m protocol.Messager) (ArchivalData, error)
}
if localCtx.Err() != nil {
log.Println("META context error:", localCtx.Err())
metrics.ClientTestErrors.WithLabelValues(connType, "meta", "context").Inc()
return nil, localCtx.Err()
}
if err != nil {
log.Println("Error reading JSON message:", err)
metrics.ClientTestErrors.WithLabelValues(connType, "meta", "ReceiveMessage").Inc()
return nil, err
}
// Count the number meta values sent by the client (when there are no errors).
metrics.SubmittedMetaValues.Observe(float64(count))
m.SendMessage(protocol.TestFinalize, []byte{})
err = m.SendMessage(protocol.TestFinalize, []byte{})
if err != nil {
log.Println("META TestFinalize:", err)
metrics.ClientTestErrors.WithLabelValues(connType, "meta", "TestFinalize").Inc()
return nil, err
}
return results, nil
}
18 changes: 17 additions & 1 deletion ndt5/meta/meta_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ import (
"reflect"
"testing"

"github.com/m-lab/ndt-server/ndt5/ndt"
"github.com/m-lab/ndt-server/ndt5/protocol"
)

Expand All @@ -22,6 +23,20 @@ type fakeMessager struct {
recv []recvMessage
c int
}
type fakeServer struct{}

func (s *fakeServer) SingleServingServer(direction string) (ndt.SingleMeasurementServer, error) {
return nil, nil
}
func (s *fakeServer) ConnectionType() ndt.ConnectionType {
return ndt.Plain
}
func (s *fakeServer) DataDir() string {
return ""
}
func (s *fakeServer) LoginCeremony(protocol.Connection) (int, error) {
return 0, nil
}

func (m *fakeMessager) SendMessage(t protocol.MessageType, msg []byte) error {
m.sent = append(m.sent, sendMessage{t: t, msg: msg})
Expand Down Expand Up @@ -112,8 +127,9 @@ func TestManageTest(t *testing.T) {
},
}
for _, tt := range tests {
s := &fakeServer{}
t.Run(tt.name, func(t *testing.T) {
got, err := ManageTest(tt.ctx, tt.m)
got, err := ManageTest(tt.ctx, tt.m, s)
if (err != nil) != tt.wantErr {
t.Errorf("ManageTest() error = %v, wantErr %v", err, tt.wantErr)
return
Expand Down
41 changes: 38 additions & 3 deletions ndt5/metrics/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,13 +21,34 @@ var (
},
[]string{"protocol"},
)
ControlPanicCount = promauto.NewCounterVec(
prometheus.CounterOpts{
Name: "ndt5_control_panic_total",
Help: "Number of recovered panics in the control channel.",
},
[]string{"protocol", "error"},
)
ControlCount = promauto.NewCounterVec(
prometheus.CounterOpts{
Name: "ndt5_control_total",
Help: "Number of control channel requests that results for each protocol and test type.",
},
[]string{"protocol", "result"},
)
MeasurementServerStart = promauto.NewCounterVec(
prometheus.CounterOpts{
Name: "ndt5_measurementserver_start_total",
Help: "The number of times a single-serving server was started.",
},
[]string{"protocol"},
)
MeasurementServerAccept = promauto.NewCounterVec(
prometheus.CounterOpts{
Name: "ndt5_measurementserver_accept_total",
Help: "The number of times a single-serving server received a successful client connections.",
},
[]string{"protocol", "direction"},
)
MeasurementServerStop = promauto.NewCounterVec(
prometheus.CounterOpts{
Name: "ndt5_measurementserver_stop_total",
Expand All @@ -46,14 +67,28 @@ var (
Name: "ndt5_client_requested_suites_total",
Help: "The number of client request test suites (the combination of all test types as an integer 0-255).",
},
[]string{"suite"},
[]string{"protocol", "suite"},
)
ClientRequestedTests = promauto.NewCounterVec(
prometheus.CounterOpts{
Name: "ndt5_client_requested_tests_total",
Name: "ndt5_client_test_requested_total",
Help: "The number of client requests for each ndt5 test type.",
},
[]string{"type"},
[]string{"protocol", "direction"},
)
ClientTestResults = promauto.NewCounterVec(
prometheus.CounterOpts{
Name: "ndt5_client_test_results_total",
Help: "Number of client-connections for NDT tests run by this server.",
},
[]string{"protocol", "direction", "result"},
)
ClientTestErrors = promauto.NewCounterVec(
prometheus.CounterOpts{
Name: "ndt5_client_test_errors_total",
Help: "Number of test errors of each type for each test.",
},
[]string{"protocol", "direction", "error"},
)
SubmittedMetaValues = promauto.NewHistogram(
prometheus.HistogramOpts{
Expand Down
4 changes: 4 additions & 0 deletions ndt5/ndt/server.go
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,10 @@ import (
// websockets, or secure websockets.
type ConnectionType string

func (c ConnectionType) String() string {
return string(c)
}

// The types of connections we support.
var (
WS = ConnectionType("WS")
Expand Down
Loading

0 comments on commit b4f7984

Please sign in to comment.