Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Reboot on network errors #101

Closed
wants to merge 13 commits into from
33 changes: 24 additions & 9 deletions cmd/event/event.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,11 @@ package event
import (
"context"
"fmt"
"sync/atomic"
"time"

"github.com/metal-stack/metal-hammer/pkg/kernel"

v1 "github.com/metal-stack/metal-api/pkg/api/v1"
"go.uber.org/zap"
"google.golang.org/protobuf/types/known/timestamppb"
Expand All @@ -30,16 +33,20 @@ const (
)

type EventEmitter struct {
log *zap.SugaredLogger
eventClient v1.EventServiceClient
machineID string
log *zap.SugaredLogger
eventClient v1.EventServiceClient
machineID string
consecutiveErrors atomic.Uint32
maxConsecutiveErrors uint32
}

func NewEventEmitter(log *zap.SugaredLogger, eventClient v1.EventServiceClient, machineID string) *EventEmitter {
func NewEventEmitter(log *zap.SugaredLogger, eventClient v1.EventServiceClient, machineID string, maxErrors uint32) *EventEmitter {
emitter := &EventEmitter{
eventClient: eventClient,
machineID: machineID,
log: log,
eventClient: eventClient,
machineID: machineID,
log: log,
consecutiveErrors: atomic.Uint32{},
maxConsecutiveErrors: maxErrors,
}

ticker := time.NewTicker(1 * time.Minute)
Expand All @@ -53,7 +60,7 @@ func NewEventEmitter(log *zap.SugaredLogger, eventClient v1.EventServiceClient,

func (e *EventEmitter) Emit(eventType ProvisioningEventType, message string) {
eventString := string(eventType)
e.log.Infow("event", "event", eventString, "message", message)
e.log.Infow("event", "event", eventString, "message", message, "errorCount", e.consecutiveErrors.Load())
ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
defer cancel()
s, err := e.eventClient.Send(ctx, &v1.EventServiceSendRequest{
Expand All @@ -66,9 +73,17 @@ func (e *EventEmitter) Emit(eventType ProvisioningEventType, message string) {
},
})
if err != nil {
e.log.Errorw("event", "cannot send event", eventType, "error", err)
e.log.Errorw("event", "cannot send event", eventType, "errorCount", e.consecutiveErrors.Load(), "error", err)
e.consecutiveErrors.Add(1)
if e.consecutiveErrors.Load() > e.maxConsecutiveErrors {
err = kernel.Reboot()
if err != nil {
e.log.Errorw("event, unable to reboot because of too many consecutive errors", "error", err)
}
}
}
if s != nil {
e.log.Infow("event", "send", s.Events, "failed", s.Failed)
e.consecutiveErrors.Store(0)
}
}
18 changes: 11 additions & 7 deletions cmd/network/interfaces.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,12 @@ package network

import (
"fmt"
v1 "github.com/metal-stack/metal-api/pkg/api/v1"
"net"
"strings"
"time"

"github.com/metal-stack/go-lldpd/pkg/lldp"
"github.com/metal-stack/metal-go/api/models"
"github.com/metal-stack/v"
"go.uber.org/zap"

Expand Down Expand Up @@ -95,9 +95,7 @@ func linkSetUp(name string) error {
}

// Neighbors of an interface, detected via ip neighbor detection
func (n *Network) Neighbors(name string) ([]*models.V1MachineNic, error) {
neighbors := make([]*models.V1MachineNic, 0)

func (n *Network) Neighbors(name string) (neighbors []*v1.MachineNic, err error) {
host := n.LLDPClient.Host

for !host.done {
Expand All @@ -115,13 +113,19 @@ func (n *Network) Neighbors(name string) ([]*models.V1MachineNic, error) {

neighs := host.neighbors[name]
for _, neigh := range neighs {
macAddress := neigh.Port.Value
neighbors = append(neighbors, &models.V1MachineNic{Mac: &macAddress, Name: &name})
identifier := neigh.Port.Value
n.Log.Infow("register add neigbor", "nic", name, "identifier", identifier)
neighbors = append(neighbors, &v1.MachineNic{
Mac: identifier,
Identifier: identifier,
Name: name,
Hostname: neigh.Name,
})
}
return neighbors, nil
}

// InternalIP returns the first ipv4 ip of a eth* interface.
// InternalIP returns the first ipv4 ip of an eth* interface.
func InternalIP() string {
for _, name := range Interfaces() {
if !strings.HasPrefix(name, "eth") {
Expand Down
6 changes: 3 additions & 3 deletions cmd/network/lldpclient.go
Original file line number Diff line number Diff line change
Expand Up @@ -112,12 +112,12 @@ func (l *LLDPClient) requirementsMet() bool {
if len(l.Host.neighbors) < l.Host.minimumInterfaces {
return false
}
// Then check if 2 distinct Chassis neighbors where found
// and every port type of a interface on the switch is set to mac

// Then check if 2 distinct Chassis neighbors were found
neighMap := make(map[string]string)
for iface, neighs := range l.Host.neighbors {
for _, neigh := range neighs {
if neigh.Chassis.Type == lldp.Mac && neigh.Port.Type == lldp.Mac {
if neigh.Chassis.Type == lldp.Mac && (neigh.Port.Type == lldp.Mac || neigh.Port.Type == lldp.Local) {
neighMap[neigh.Chassis.Value] = iface
}
}
Expand Down
14 changes: 1 addition & 13 deletions cmd/register/register.go
Original file line number Diff line number Diff line change
Expand Up @@ -136,19 +136,7 @@ func (r *Register) readHardwareDetails() (*v1.BootServiceRegisterRequest, error)
return nil, fmt.Errorf("unable to determine neighbors of interface:%s %w", nic.Name, err)
}
r.log.Infow("register found neigbor for", "nic", nic.Name, "neighbors", neighbors)
ns := []*v1.MachineNic{}
for i := range neighbors {
n := neighbors[i]
if n.Mac == nil || n.Name == nil {
continue
}
r.log.Infow("register add neigbor", "nic", n.Name, "mac", n.Mac)
ns = append(ns, &v1.MachineNic{
Mac: *n.Mac,
Name: *n.Name,
})
}
nic.Neighbors = ns
nic.Neighbors = neighbors
}

// Disks
Expand Down
2 changes: 1 addition & 1 deletion cmd/root.go
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ func Run(log *zap.SugaredLogger, spec *Specification, hal hal.InBand) (*event.Ev

bootService := metalAPIClient.BootService()

eventEmitter := event.NewEventEmitter(log, metalAPIClient.Event(), spec.MachineUUID)
eventEmitter := event.NewEventEmitter(log, metalAPIClient.Event(), spec.MachineUUID, 20)

eventEmitter.Emit(event.ProvisioningEventPreparing, fmt.Sprintf("starting metal-hammer version:%q", v.V))

Expand Down
16 changes: 12 additions & 4 deletions cmd/wait.go
Original file line number Diff line number Diff line change
Expand Up @@ -13,19 +13,26 @@ import (
"google.golang.org/grpc/status"
)

const defaultWaitTimeOut = 2 * time.Second
const (
defaultWaitTimeOut = 2 * time.Second
// 30 retries per minute, 20 minutes total
maxConsecutiveErrors = 30 * 20
)

func (c *MetalAPIClient) WaitForAllocation(e *event.EventEmitter, machineID string) error {
e.Emit(event.ProvisioningEventWaiting, "waiting for allocation")

req := &v1.BootServiceWaitRequest{
MachineId: machineID,
}
consecutiveErrors := 0
for {
stream, err := c.BootService().Wait(context.Background(), req)
if err != nil {
c.log.Errorw("failed waiting for allocation", "retry after", defaultWaitTimeOut, "error", err)

c.log.Errorw("failed waiting for allocation", "retry after", defaultWaitTimeOut, "consecutive errors", consecutiveErrors, "error", err)
consecutiveErrors++
if consecutiveErrors > maxConsecutiveErrors {
return fmt.Errorf("maximum number of consecutive communication errors reached, rebooting: %w", err)
}
time.Sleep(defaultWaitTimeOut)
continue
}
Expand All @@ -38,6 +45,7 @@ func (c *MetalAPIClient) WaitForAllocation(e *event.EventEmitter, machineID stri
}

if err != nil {
consecutiveErrors = 0
if e, ok := status.FromError(err); ok {
c.log.Errorw("got error from wait call", "code", e.Code(), "message", e.Message(), "details", e.Details())
switch e.Code() { // nolint:exhaustive
Expand Down
67 changes: 36 additions & 31 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -8,21 +8,21 @@ require (
github.com/google/gopacket v1.1.19
github.com/google/uuid v1.3.0
github.com/jaypipes/ghw v0.9.0
github.com/metal-stack/go-hal v0.4.2
github.com/metal-stack/go-hal v0.4.3
github.com/metal-stack/go-lldpd v0.4.2
github.com/metal-stack/metal-api v0.21.4
github.com/metal-stack/metal-go v0.21.4
github.com/metal-stack/metal-api v0.21.5-0.20221108201044-00cea8081c41
github.com/metal-stack/metal-go v0.22.0
github.com/metal-stack/pixie v0.2.2
github.com/metal-stack/v v1.0.3
// archiver must stay in version v2.1.0, see replace below
github.com/mholt/archiver v3.1.1+incompatible
github.com/pierrec/lz4/v4 v4.1.17
github.com/u-root/u-root v0.10.0
github.com/vishvananda/netlink v1.2.1-beta.2
go.uber.org/zap v1.23.0
go.uber.org/zap v1.24.0
golang.org/x/sync v0.1.0
golang.org/x/sys v0.1.0
google.golang.org/grpc v1.50.1
golang.org/x/sys v0.4.0
google.golang.org/grpc v1.52.3
google.golang.org/protobuf v1.28.1
gopkg.in/yaml.v3 v3.0.1
)
Expand All @@ -32,29 +32,32 @@ require (
github.com/VividCortex/ewma v1.2.0 // indirect
github.com/anmitsu/go-shlex v0.0.0-20200514113438-38f4b401e2be // indirect
github.com/asaskevich/govalidator v0.0.0-20210307081110-f21760c49a8d // indirect
github.com/avast/retry-go/v4 v4.3.0 // indirect
github.com/avast/retry-go/v4 v4.3.2 // indirect
github.com/benbjohnson/clock v1.3.0 // indirect
github.com/coreos/go-oidc/v3 v3.4.0 // indirect
github.com/coreos/go-oidc/v3 v3.5.0 // indirect
github.com/creack/pty v1.1.18 // indirect
github.com/decred/dcrd/dcrec/secp256k1/v4 v4.1.0 // indirect
github.com/dsnet/compress v0.0.1 // indirect
github.com/fatih/color v1.13.0 // indirect
github.com/frankban/quicktest v1.14.3 // indirect
github.com/fatih/color v1.14.1 // indirect
github.com/frankban/quicktest v1.14.4 // indirect
github.com/ghodss/yaml v1.0.0 // indirect
github.com/gliderlabs/ssh v0.3.5 // indirect
github.com/go-jose/go-jose/v3 v3.0.0 // indirect
github.com/go-logr/logr v1.2.3 // indirect
github.com/go-logr/stdr v1.2.2 // indirect
github.com/go-ole/go-ole v1.2.6 // indirect
github.com/go-openapi/analysis v0.21.4 // indirect
github.com/go-openapi/errors v0.20.3 // indirect
github.com/go-openapi/jsonpointer v0.19.5 // indirect
github.com/go-openapi/jsonreference v0.20.0 // indirect
github.com/go-openapi/jsonpointer v0.19.6 // indirect
github.com/go-openapi/jsonreference v0.20.2 // indirect
github.com/go-openapi/loads v0.21.2 // indirect
github.com/go-openapi/runtime v0.24.2 // indirect
github.com/go-openapi/spec v0.20.7 // indirect
github.com/go-openapi/runtime v0.25.0 // indirect
github.com/go-openapi/spec v0.20.8 // indirect
github.com/go-openapi/strfmt v0.21.3 // indirect
github.com/go-openapi/swag v0.22.3 // indirect
github.com/go-openapi/validate v0.22.0 // indirect
github.com/goccy/go-json v0.9.11 // indirect
github.com/golang-jwt/jwt/v4 v4.4.2 // indirect
github.com/go-openapi/validate v0.22.1 // indirect
github.com/goccy/go-json v0.10.0 // indirect
github.com/golang-jwt/jwt/v4 v4.4.3 // indirect
github.com/golang/protobuf v1.5.2 // indirect
github.com/golang/snappy v0.0.4 // indirect
github.com/gorilla/mux v1.8.0 // indirect
Expand All @@ -65,37 +68,39 @@ require (
github.com/lestrrat-go/httpcc v1.0.1 // indirect
github.com/lestrrat-go/iter v1.0.2 // indirect
github.com/lestrrat-go/jwx v1.2.25 // indirect
github.com/lestrrat-go/option v1.0.0 // indirect
github.com/lestrrat-go/option v1.0.1 // indirect
github.com/mailru/easyjson v0.7.7 // indirect
github.com/mattn/go-colorable v0.1.13 // indirect
github.com/mattn/go-isatty v0.0.16 // indirect
github.com/mattn/go-isatty v0.0.17 // indirect
github.com/mattn/go-runewidth v0.0.14 // indirect
github.com/mdlayher/ethernet v0.0.0-20220221185849-529eae5b6118 // indirect
github.com/mdlayher/lldp v0.0.0-20150915211757-afd9f83164c5 // indirect
github.com/metal-stack/metal-lib v0.11.2 // indirect
github.com/metal-stack/security v0.6.5 // indirect
github.com/metal-stack/metal-lib v0.11.3 // indirect
github.com/metal-stack/security v0.6.6 // indirect
github.com/mitchellh/go-homedir v1.1.0 // indirect
github.com/mitchellh/mapstructure v1.5.0 // indirect
github.com/nwaples/rardecode v1.1.3 // indirect
github.com/oklog/ulid v1.3.1 // indirect
github.com/opentracing/opentracing-go v1.2.0 // indirect
github.com/pierrec/lz4 v2.6.1+incompatible // indirect
github.com/pkg/errors v0.9.1 // indirect
github.com/rivo/uniseg v0.4.2 // indirect
github.com/rivo/uniseg v0.4.3 // indirect
github.com/sethvargo/go-password v0.2.0 // indirect
github.com/stmcginnis/gofish v0.13.0 // indirect
github.com/ulikunitz/xz v0.5.10 // indirect
github.com/vishvananda/netns v0.0.0-20220913150850-18c4f4234207 // indirect
github.com/ulikunitz/xz v0.5.11 // indirect
github.com/vishvananda/netns v0.0.4 // indirect
github.com/vmware/goipmi v0.0.0-20181114221114-2333cd82d702 // indirect
go.mongodb.org/mongo-driver v1.10.3 // indirect
go.mongodb.org/mongo-driver v1.11.1 // indirect
go.opentelemetry.io/otel v1.11.1 // indirect
go.opentelemetry.io/otel/trace v1.11.1 // indirect
go.uber.org/atomic v1.10.0 // indirect
go.uber.org/multierr v1.8.0 // indirect
golang.org/x/crypto v0.1.0 // indirect
golang.org/x/net v0.1.0 // indirect
golang.org/x/oauth2 v0.1.0 // indirect
golang.org/x/text v0.4.0 // indirect
go.uber.org/multierr v1.9.0 // indirect
golang.org/x/crypto v0.5.0 // indirect
golang.org/x/net v0.5.0 // indirect
golang.org/x/oauth2 v0.4.0 // indirect
golang.org/x/text v0.6.0 // indirect
google.golang.org/appengine v1.6.7 // indirect
google.golang.org/genproto v0.0.0-20221025140454-527a21cfbd71 // indirect
google.golang.org/genproto v0.0.0-20230127162408-596548ed4efa // indirect
gopkg.in/square/go-jose.v2 v2.6.0 // indirect
gopkg.in/yaml.v2 v2.4.0 // indirect
howett.net/plist v1.0.0 // indirect
Expand Down
Loading