Skip to content

Commit

Permalink
metrics, pprof: support reloading services with SIGHUP
Browse files Browse the repository at this point in the history
Reload Prometheus and pprof services if the config is updated.
Make metrics safe to use from multiple threads.

Closes #1868.

Signed-off-by: Andrey Butusov <[email protected]>
  • Loading branch information
End-rey committed Nov 19, 2024
1 parent 3592189 commit 0d24df4
Show file tree
Hide file tree
Showing 23 changed files with 346 additions and 52 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ attribute, which is used for container domain name in NNS contracts (#2954)
- `logger.encoding` config option (#2999)
- Reloading morph endpoints with SIGHUP (#2998)
- New `peapod-to-fstree` tool providing peapod-to-fstree data migration (#3013)
- Reloading pprof/metrics services with SIGHUP (#3016)

### Fixed
- Do not search for tombstones when handling their expiration, use local indexes instead (#2929)
Expand Down
68 changes: 66 additions & 2 deletions cmd/neofs-node/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@ import (
"github.com/nspcc-dev/neofs-node/pkg/network/cache"
"github.com/nspcc-dev/neofs-node/pkg/services/control"
controlSvc "github.com/nspcc-dev/neofs-node/pkg/services/control/server"
objectService "github.com/nspcc-dev/neofs-node/pkg/services/object"
getsvc "github.com/nspcc-dev/neofs-node/pkg/services/object/get"
"github.com/nspcc-dev/neofs-node/pkg/services/policer"
"github.com/nspcc-dev/neofs-node/pkg/services/replicator"
Expand Down Expand Up @@ -299,6 +300,8 @@ type internals struct {
healthStatus atomic.Int32
// is node under maintenance
isMaintenance atomic.Bool

nodeMetrics *metrics.NodeMetrics
}

// starts node's maintenance.
Expand Down Expand Up @@ -378,7 +381,8 @@ type shared struct {

treeService *tree.Service

metricsCollector *metrics.NodeMetrics
metricsCollectorLock sync.RWMutex
metricsCollector *metrics.NodeMetrics

control *controlSvc.Server
}
Expand Down Expand Up @@ -488,6 +492,8 @@ type cfgObject struct {
tombstoneLifetime uint64

containerNodes *containerNodes

metricsCollectorService *objectService.MetricCollector
}

type cfgLocalStorage struct {
Expand Down Expand Up @@ -637,9 +643,15 @@ func initCfg(appCfg *config.Config) *cfg {

c.ownerIDFromKey = user.NewFromECDSAPublicKey(key.PrivateKey.PublicKey)

c.nodeMetrics = metrics.NewNodeMetrics(misc.Version)

Check warning on line 646 in cmd/neofs-node/config.go

View check run for this annotation

Codecov / codecov/patch

cmd/neofs-node/config.go#L646

Added line #L646 was not covered by tests
if metricsconfig.Enabled(c.cfgReader) {
c.metricsCollector = metrics.NewNodeMetrics(misc.Version)
c.metricsCollectorLock.Lock()
c.metricsCollector = c.nodeMetrics
c.metricsCollectorLock.Unlock()

c.basics.networkState.metricsLock.Lock()

Check warning on line 652 in cmd/neofs-node/config.go

View check run for this annotation

Codecov / codecov/patch

cmd/neofs-node/config.go#L648-L652

Added lines #L648 - L652 were not covered by tests
c.basics.networkState.metrics = c.metricsCollector
c.basics.networkState.metricsLock.Unlock()

Check warning on line 654 in cmd/neofs-node/config.go

View check run for this annotation

Codecov / codecov/patch

cmd/neofs-node/config.go#L654

Added line #L654 was not covered by tests
}

c.veryLastClosers = make(map[string]func())
Expand Down Expand Up @@ -848,12 +860,20 @@ func (c *cfg) configWatcher(ctx context.Context) {
case <-ch:
c.log.Info("SIGHUP has been received, rereading configuration...")

oldMetrics := writeMetricConfig(c.cfgReader)
oldProfiler := writeProfilerConfig(c.cfgReader)

Check warning on line 865 in cmd/neofs-node/config.go

View check run for this annotation

Codecov / codecov/patch

cmd/neofs-node/config.go#L863-L865

Added lines #L863 - L865 were not covered by tests
err := c.readConfig(c.cfgReader)
if err != nil {
c.log.Error("configuration reading", zap.Error(err))
continue
}

// Prometheus and pprof

// nolint:contextcheck
c.reloadMetricsAndPprof(oldMetrics, oldProfiler)

Check warning on line 876 in cmd/neofs-node/config.go

View check run for this annotation

Codecov / codecov/patch

cmd/neofs-node/config.go#L875-L876

Added lines #L875 - L876 were not covered by tests
// Logger

err = c.internals.logLevel.UnmarshalText([]byte(c.logger.level))
Expand Down Expand Up @@ -929,3 +949,47 @@ func writeSystemAttributes(c *cfg) error {

return nil
}

func (c *cfg) reloadMetricsAndPprof(oldMetrics metricConfig, oldProfiler profilerConfig) {
// Metrics

if oldMetrics.isUpdated(c.cfgReader) {
if closer, ok := c.veryLastClosers[metricName]; ok {
closer()
}

Check warning on line 959 in cmd/neofs-node/config.go

View check run for this annotation

Codecov / codecov/patch

cmd/neofs-node/config.go#L953-L959

Added lines #L953 - L959 were not covered by tests

if metricsconfig.Enabled(c.cfgReader) != oldMetrics.enabled {
var nodeMetrics *metrics.NodeMetrics
if metricsconfig.Enabled(c.cfgReader) {
nodeMetrics = c.nodeMetrics
delete(c.veryLastClosers, metricName)
c.cfgObject.metricsCollectorService.ReloadMetrics(nodeMetrics)
c.cfgObject.cfgLocalStorage.localStorage.ReloadMetrics(nodeMetrics)
} else {
c.cfgObject.metricsCollectorService.ReloadMetrics(nil)
c.cfgObject.cfgLocalStorage.localStorage.ReloadMetrics(nil)
}

Check warning on line 971 in cmd/neofs-node/config.go

View check run for this annotation

Codecov / codecov/patch

cmd/neofs-node/config.go#L961-L971

Added lines #L961 - L971 were not covered by tests

c.metricsCollectorLock.Lock()
c.metricsCollector = nodeMetrics
c.metricsCollectorLock.Unlock()

c.basics.networkState.metricsLock.Lock()
c.basics.networkState.metrics = c.metricsCollector
c.basics.networkState.metricsLock.Unlock()

Check warning on line 979 in cmd/neofs-node/config.go

View check run for this annotation

Codecov / codecov/patch

cmd/neofs-node/config.go#L973-L979

Added lines #L973 - L979 were not covered by tests
}

preRunAndLog(c, metricName, initMetrics(c))

Check warning on line 982 in cmd/neofs-node/config.go

View check run for this annotation

Codecov / codecov/patch

cmd/neofs-node/config.go#L982

Added line #L982 was not covered by tests
}

//Profiler

if oldProfiler.isUpdated(c.cfgReader) {
if closer, ok := c.veryLastClosers[profilerName]; ok {
closer()
}
delete(c.veryLastClosers, profilerName)

preRunAndLog(c, profilerName, initProfiler(c))

Check warning on line 993 in cmd/neofs-node/config.go

View check run for this annotation

Codecov / codecov/patch

cmd/neofs-node/config.go#L987-L993

Added lines #L987 - L993 were not covered by tests
}
}
2 changes: 2 additions & 0 deletions cmd/neofs-node/control.go
Original file line number Diff line number Diff line change
Expand Up @@ -67,9 +67,11 @@ func (c *cfg) NetmapStatus() control.NetmapStatus {
func (c *cfg) setHealthStatus(st control.HealthStatus) {
c.healthStatus.Store(int32(st))

c.metricsCollectorLock.Lock()

Check warning on line 70 in cmd/neofs-node/control.go

View check run for this annotation

Codecov / codecov/patch

cmd/neofs-node/control.go#L70

Added line #L70 was not covered by tests
if c.metricsCollector != nil {
c.metricsCollector.SetHealth(int32(st))
}
c.metricsCollectorLock.Unlock()

Check warning on line 74 in cmd/neofs-node/control.go

View check run for this annotation

Codecov / codecov/patch

cmd/neofs-node/control.go#L74

Added line #L74 was not covered by tests
}

func (c *cfg) HealthStatus() control.HealthStatus {
Expand Down
23 changes: 23 additions & 0 deletions cmd/neofs-node/metrics.go
Original file line number Diff line number Diff line change
@@ -1,6 +1,9 @@
package main

import (
"time"

"github.com/nspcc-dev/neofs-node/cmd/neofs-node/config"
metricsconfig "github.com/nspcc-dev/neofs-node/cmd/neofs-node/config/metrics"
httputil "github.com/nspcc-dev/neofs-node/pkg/util/http"
"github.com/prometheus/client_golang/prometheus/promhttp"
Expand All @@ -25,3 +28,23 @@ func initMetrics(c *cfg) *httputil.Server {

return srv
}

// metricConfig is a snapshot of the metrics service settings. It is filled by
// writeMetricConfig and compared against a newer configuration by isUpdated.
type metricConfig struct {
// enabled holds the result of metricsconfig.Enabled at snapshot time.
enabled bool
// shutdownTimeout holds the result of metricsconfig.ShutdownTimeout at
// snapshot time.
shutdownTimeout time.Duration
// address holds the result of metricsconfig.Address at snapshot time.
address string
}

func writeMetricConfig(c *config.Config) metricConfig {
return metricConfig{
enabled: metricsconfig.Enabled(c),
shutdownTimeout: metricsconfig.ShutdownTimeout(c),
address: metricsconfig.Address(c),
}

Check warning on line 43 in cmd/neofs-node/metrics.go

View check run for this annotation

Codecov / codecov/patch

cmd/neofs-node/metrics.go#L38-L43

Added lines #L38 - L43 were not covered by tests
}

func (m1 metricConfig) isUpdated(c *config.Config) bool {
return m1.enabled != metricsconfig.Enabled(c) ||
m1.shutdownTimeout != metricsconfig.ShutdownTimeout(c) ||
m1.address != metricsconfig.Address(c)

Check warning on line 49 in cmd/neofs-node/metrics.go

View check run for this annotation

Codecov / codecov/patch

cmd/neofs-node/metrics.go#L46-L49

Added lines #L46 - L49 were not covered by tests
}
7 changes: 6 additions & 1 deletion cmd/neofs-node/netmap.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ import (
"bytes"
"errors"
"fmt"
"sync"
"sync/atomic"

netmapGRPC "github.com/nspcc-dev/neofs-api-go/v2/netmap/grpc"
Expand All @@ -29,7 +30,8 @@ type networkState struct {

nodeInfo atomic.Value // *netmapSDK.NodeInfo

metrics *metrics.NodeMetrics
metricsLock sync.RWMutex
metrics *metrics.NodeMetrics
}

func newNetworkState() *networkState {
Expand All @@ -47,6 +49,9 @@ func (s *networkState) CurrentEpoch() uint64 {

func (s *networkState) setCurrentEpoch(v uint64) {
s.epoch.Store(v)

s.metricsLock.Lock()
defer s.metricsLock.Unlock()

Check warning on line 54 in cmd/neofs-node/netmap.go

View check run for this annotation

Codecov / codecov/patch

cmd/neofs-node/netmap.go#L52-L54

Added lines #L52 - L54 were not covered by tests
if s.metrics != nil {
s.metrics.SetEpoch(v)
}
Expand Down
11 changes: 7 additions & 4 deletions cmd/neofs-node/object.go
Original file line number Diff line number Diff line change
Expand Up @@ -347,12 +347,15 @@ func initObjectService(c *cfg) {
respSvc,
)

var firstSvc objectService.ServiceServer = signSvc
if c.metricsCollector != nil {
firstSvc = objectService.NewMetricCollector(signSvc, c.metricsCollector)
c.metricsCollectorLock.Lock()
if c.metricsCollector == nil {
c.cfgObject.metricsCollectorService = objectService.NewMetricCollector(signSvc, nil)
} else {
c.cfgObject.metricsCollectorService = objectService.NewMetricCollector(signSvc, c.metricsCollector)

Check warning on line 354 in cmd/neofs-node/object.go

View check run for this annotation

Codecov / codecov/patch

cmd/neofs-node/object.go#L350-L354

Added lines #L350 - L354 were not covered by tests
}
c.metricsCollectorLock.Unlock()

Check warning on line 356 in cmd/neofs-node/object.go

View check run for this annotation

Codecov / codecov/patch

cmd/neofs-node/object.go#L356

Added line #L356 was not covered by tests

server := objectTransportGRPC.New(firstSvc, mNumber, objNode, neofsecdsa.SignerRFC6979(c.shared.basics.key.PrivateKey))
server := objectTransportGRPC.New(c.cfgObject.metricsCollectorService, mNumber, objNode, neofsecdsa.SignerRFC6979(c.shared.basics.key.PrivateKey))

Check warning on line 358 in cmd/neofs-node/object.go

View check run for this annotation

Codecov / codecov/patch

cmd/neofs-node/object.go#L358

Added line #L358 was not covered by tests

for _, srv := range c.cfgGRPC.servers {
objectGRPC.RegisterObjectServiceServer(srv, server)
Expand Down
23 changes: 23 additions & 0 deletions cmd/neofs-node/pprof.go
Original file line number Diff line number Diff line change
@@ -1,6 +1,9 @@
package main

import (
"time"

"github.com/nspcc-dev/neofs-node/cmd/neofs-node/config"
profilerconfig "github.com/nspcc-dev/neofs-node/cmd/neofs-node/config/profiler"
httputil "github.com/nspcc-dev/neofs-node/pkg/util/http"
)
Expand All @@ -24,3 +27,23 @@ func initProfiler(c *cfg) *httputil.Server {

return srv
}

// profilerConfig is a snapshot of the profiler service settings. It is filled
// by writeProfilerConfig and compared against a newer configuration by
// isUpdated.
type profilerConfig struct {
// enabled holds the result of profilerconfig.Enabled at snapshot time.
enabled bool
// shutdownTimeout holds the result of profilerconfig.ShutdownTimeout at
// snapshot time.
shutdownTimeout time.Duration
// address holds the result of profilerconfig.Address at snapshot time.
address string
}

func writeProfilerConfig(c *config.Config) profilerConfig {
return profilerConfig{
enabled: profilerconfig.Enabled(c),
shutdownTimeout: profilerconfig.ShutdownTimeout(c),
address: profilerconfig.Address(c),
}

Check warning on line 42 in cmd/neofs-node/pprof.go

View check run for this annotation

Codecov / codecov/patch

cmd/neofs-node/pprof.go#L37-L42

Added lines #L37 - L42 were not covered by tests
}

func (m1 profilerConfig) isUpdated(c *config.Config) bool {
return m1.enabled != profilerconfig.Enabled(c) ||
m1.shutdownTimeout != profilerconfig.ShutdownTimeout(c) ||
m1.address != profilerconfig.Address(c)

Check warning on line 48 in cmd/neofs-node/pprof.go

View check run for this annotation

Codecov / codecov/patch

cmd/neofs-node/pprof.go#L45-L48

Added lines #L45 - L48 were not covered by tests
}
2 changes: 2 additions & 0 deletions cmd/neofs-node/storage.go
Original file line number Diff line number Diff line change
Expand Up @@ -92,9 +92,11 @@ func (c *cfg) engineOpts() []engine.Option {
opts = append(opts, engine.WithContainersSource(cntClient.AsContainerSource(c.shared.basics.cCli)))
}

c.metricsCollectorLock.Lock()

Check warning on line 95 in cmd/neofs-node/storage.go

View check run for this annotation

Codecov / codecov/patch

cmd/neofs-node/storage.go#L95

Added line #L95 was not covered by tests
if c.metricsCollector != nil {
opts = append(opts, engine.WithMetrics(c.metricsCollector))
}
c.metricsCollectorLock.Unlock()

Check warning on line 99 in cmd/neofs-node/storage.go

View check run for this annotation

Codecov / codecov/patch

cmd/neofs-node/storage.go#L99

Added line #L99 was not covered by tests

return opts
}
Expand Down
20 changes: 18 additions & 2 deletions pkg/local_object_storage/engine/container.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,16 @@ import (
//
// Returns an error if executions are blocked (see BlockExecution).
func (e *StorageEngine) ContainerSize(cnr cid.ID) (uint64, error) {
var elapsedFn func()

e.mtx.Lock()

Check warning on line 21 in pkg/local_object_storage/engine/container.go

View check run for this annotation

Codecov / codecov/patch

pkg/local_object_storage/engine/container.go#L19-L21

Added lines #L19 - L21 were not covered by tests
if e.metrics != nil {
defer elapsed(e.metrics.AddEstimateContainerSizeDuration)()
elapsedFn = elapsed(e.metrics.AddEstimateContainerSizeDuration)
}
e.mtx.Unlock()

if elapsedFn != nil {
defer elapsedFn()

Check warning on line 28 in pkg/local_object_storage/engine/container.go

View check run for this annotation

Codecov / codecov/patch

pkg/local_object_storage/engine/container.go#L23-L28

Added lines #L23 - L28 were not covered by tests
}

e.blockMtx.RLock()
Expand Down Expand Up @@ -50,8 +58,16 @@ func (e *StorageEngine) ContainerSize(cnr cid.ID) (uint64, error) {
//
// Returns an error if executions are blocked (see BlockExecution).
func (e *StorageEngine) ListContainers() ([]cid.ID, error) {
var elapsedFn func()

e.mtx.Lock()

Check warning on line 63 in pkg/local_object_storage/engine/container.go

View check run for this annotation

Codecov / codecov/patch

pkg/local_object_storage/engine/container.go#L61-L63

Added lines #L61 - L63 were not covered by tests
if e.metrics != nil {
defer elapsed(e.metrics.AddListContainersDuration)()
elapsedFn = elapsed(e.metrics.AddListContainersDuration)
}
e.mtx.Unlock()

if elapsedFn != nil {
defer elapsedFn()

Check warning on line 70 in pkg/local_object_storage/engine/container.go

View check run for this annotation

Codecov / codecov/patch

pkg/local_object_storage/engine/container.go#L65-L70

Added lines #L65 - L70 were not covered by tests
}

e.blockMtx.RLock()
Expand Down
7 changes: 7 additions & 0 deletions pkg/local_object_storage/engine/control.go
Original file line number Diff line number Diff line change
Expand Up @@ -269,6 +269,13 @@ loop:
return nil
}

func (e *StorageEngine) ReloadMetrics(newMetric MetricRegister) {
e.mtx.Lock()
defer e.mtx.Unlock()

e.metrics = newMetric

Check warning on line 276 in pkg/local_object_storage/engine/control.go

View check run for this annotation

Codecov / codecov/patch

pkg/local_object_storage/engine/control.go#L272-L276

Added lines #L272 - L276 were not covered by tests
}

func calculateShardID(info shard.Info) string {
// This calculation should be kept in sync with node
// configuration parsing during SIGHUP.
Expand Down
10 changes: 9 additions & 1 deletion pkg/local_object_storage/engine/delete.go
Original file line number Diff line number Diff line change
Expand Up @@ -11,8 +11,16 @@ import (
// NOTE: This is a forced removal, marks any object to be deleted (despite
// any prohibitions on operations with that object).
func (e *StorageEngine) Delete(addr oid.Address) error {
var elapsedFn func()

e.mtx.Lock()
if e.metrics != nil {
defer elapsed(e.metrics.AddDeleteDuration)()
elapsedFn = elapsed(e.metrics.AddDeleteDuration)
}

Check warning on line 19 in pkg/local_object_storage/engine/delete.go

View check run for this annotation

Codecov / codecov/patch

pkg/local_object_storage/engine/delete.go#L18-L19

Added lines #L18 - L19 were not covered by tests
e.mtx.Unlock()

if elapsedFn != nil {
defer elapsedFn()

Check warning on line 23 in pkg/local_object_storage/engine/delete.go

View check run for this annotation

Codecov / codecov/patch

pkg/local_object_storage/engine/delete.go#L23

Added line #L23 was not covered by tests
}

e.blockMtx.RLock()
Expand Down
10 changes: 9 additions & 1 deletion pkg/local_object_storage/engine/get.go
Original file line number Diff line number Diff line change
Expand Up @@ -22,8 +22,16 @@ import (
//
// Returns an error if executions are blocked (see BlockExecution).
func (e *StorageEngine) Get(addr oid.Address) (*objectSDK.Object, error) {
var elapsedFn func()

e.mtx.Lock()
if e.metrics != nil {
defer elapsed(e.metrics.AddGetDuration)()
elapsedFn = elapsed(e.metrics.AddGetDuration)
}

Check warning on line 30 in pkg/local_object_storage/engine/get.go

View check run for this annotation

Codecov / codecov/patch

pkg/local_object_storage/engine/get.go#L29-L30

Added lines #L29 - L30 were not covered by tests
e.mtx.Unlock()

if elapsedFn != nil {
defer elapsedFn()

Check warning on line 34 in pkg/local_object_storage/engine/get.go

View check run for this annotation

Codecov / codecov/patch

pkg/local_object_storage/engine/get.go#L34

Added line #L34 was not covered by tests
}

e.blockMtx.RLock()
Expand Down
10 changes: 9 additions & 1 deletion pkg/local_object_storage/engine/head.go
Original file line number Diff line number Diff line change
Expand Up @@ -22,8 +22,16 @@ import (
//
// Returns an error if executions are blocked (see BlockExecution).
func (e *StorageEngine) Head(addr oid.Address, raw bool) (*objectSDK.Object, error) {
var elapsedFn func()

e.mtx.Lock()
if e.metrics != nil {
defer elapsed(e.metrics.AddHeadDuration)()
elapsedFn = elapsed(e.metrics.AddHeadDuration)
}

Check warning on line 30 in pkg/local_object_storage/engine/head.go

View check run for this annotation

Codecov / codecov/patch

pkg/local_object_storage/engine/head.go#L29-L30

Added lines #L29 - L30 were not covered by tests
e.mtx.Unlock()

if elapsedFn != nil {
defer elapsedFn()

Check warning on line 34 in pkg/local_object_storage/engine/head.go

View check run for this annotation

Codecov / codecov/patch

pkg/local_object_storage/engine/head.go#L34

Added line #L34 was not covered by tests
}

e.blockMtx.RLock()
Expand Down
10 changes: 9 additions & 1 deletion pkg/local_object_storage/engine/inhume.go
Original file line number Diff line number Diff line change
Expand Up @@ -24,8 +24,16 @@ var errInhumeFailure = errors.New("inhume operation failed")
//
// Returns an error if executions are blocked (see BlockExecution).
func (e *StorageEngine) Inhume(tombstone oid.Address, tombExpiration uint64, addrs ...oid.Address) error {
var elapsedFn func()

e.mtx.Lock()
if e.metrics != nil {
defer elapsed(e.metrics.AddInhumeDuration)()
elapsedFn = elapsed(e.metrics.AddInhumeDuration)
}

Check warning on line 32 in pkg/local_object_storage/engine/inhume.go

View check run for this annotation

Codecov / codecov/patch

pkg/local_object_storage/engine/inhume.go#L31-L32

Added lines #L31 - L32 were not covered by tests
e.mtx.Unlock()

if elapsedFn != nil {
defer elapsedFn()

Check warning on line 36 in pkg/local_object_storage/engine/inhume.go

View check run for this annotation

Codecov / codecov/patch

pkg/local_object_storage/engine/inhume.go#L36

Added line #L36 was not covered by tests
}

e.blockMtx.RLock()
Expand Down
Loading

0 comments on commit 0d24df4

Please sign in to comment.