Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add endpoint for machine issues. #471

Merged
merged 17 commits into from
Oct 11, 2023
Merged
122 changes: 122 additions & 0 deletions cmd/metal-api/internal/issues/asn-uniqueness.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,122 @@
package issues

import (
"fmt"
"sort"
"strings"

"github.com/metal-stack/metal-api/cmd/metal-api/internal/metal"
)

const (
// TypeASNUniqueness identifies the issue raised when a firewall's ASN is
// also used by another firewall.
TypeASNUniqueness Type = "asn-not-unique"
)

type (
// issueASNUniqueness implements the ASN uniqueness check.
issueASNUniqueness struct {
// details holds the overlap report written by Evaluate and returned
// by Details.
details string
}
)

// Spec returns the static description of the ASN uniqueness issue: its type,
// severity, human-readable description and troubleshooting link.
func (*issueASNUniqueness) Spec() *spec {
	s := spec{
		Type:        TypeASNUniqueness,
		Severity:    SeverityMinor,
		Description: "The ASN is not unique (only impact on firewalls)",
		RefURL:      "https://docs.metal-stack.io/stable/installation/troubleshoot/#asn-not-unique",
	}

	return &s
}

// Evaluate reports whether the firewall m uses an ASN that is also used by
// another firewall in c.Machines.
//
// Machines that are not allocated with the firewall role are never flagged,
// and networks with an ASN of 0 are ignored. When at least one overlap
// exists, the details are set to one line per affected ASN in the form
// "- ASN (<asn>) not unique, shared with [<id> ...]", sorted lexicographically
// and joined by newlines, and true is returned. Each other machine is listed
// at most once per ASN, even if several of its networks carry that ASN.
// ec is not used by this check.
func (i *issueASNUniqueness) Evaluate(m metal.Machine, ec metal.ProvisioningEventContainer, c *Config) bool {
	isFirewall := func(m metal.Machine) bool {
		return m.Allocation != nil && m.Allocation.Role == metal.RoleFirewall
	}

	if !isFirewall(m) {
		return false
	}

	// sharedWith is keyed by every non-zero ASN of m; the value collects the
	// IDs of the other firewalls that use the same ASN.
	sharedWith := make(map[uint32][]string, len(m.Allocation.MachineNetworks))
	for _, n := range m.Allocation.MachineNetworks {
		if n.ASN != 0 {
			sharedWith[n.ASN] = nil
		}
	}

	for _, other := range c.Machines {
		if other.ID == m.ID || !isFirewall(other) {
			continue
		}

		// A machine may carry the same ASN on several of its networks; it must
		// still be reported only once per ASN.
		counted := map[uint32]struct{}{}
		for _, n := range other.Allocation.MachineNetworks {
			// ASN 0 is never a key of sharedWith, so unset ASNs are skipped
			// by this lookup as well.
			if _, relevant := sharedWith[n.ASN]; !relevant {
				continue
			}
			if _, done := counted[n.ASN]; done {
				continue
			}

			counted[n.ASN] = struct{}{}
			sharedWith[n.ASN] = append(sharedWith[n.ASN], other.ID)
		}
	}

	var overlaps []string
	for asn, ids := range sharedWith {
		if len(ids) == 0 {
			continue
		}

		overlaps = append(overlaps, fmt.Sprintf("- ASN (%d) not unique, shared with %s", asn, ids))
	}

	if len(overlaps) == 0 {
		return false
	}

	// Map iteration order is random; every line contains a distinct ASN, so
	// sorting the rendered lines yields a deterministic report.
	sort.Strings(overlaps)

	i.details = strings.Join(overlaps, "\n")

	return true
}

// Details returns the overlap report stored by the last Evaluate call that
// found a non-unique ASN; it is empty if no such call happened.
func (i *issueASNUniqueness) Details() string {
	return i.details
}
47 changes: 47 additions & 0 deletions cmd/metal-api/internal/issues/bmc-info-outdated.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
package issues

import (
"fmt"
"time"

"github.com/metal-stack/metal-api/cmd/metal-api/internal/metal"
)

const (
// TypeBMCInfoOutdated identifies the issue raised when a machine's IPMI
// information has never been set or was last updated too long ago.
TypeBMCInfoOutdated Type = "bmc-info-outdated"
)

type (
// issueBMCInfoOutdated implements the outdated BMC information check.
issueBMCInfoOutdated struct {
// details holds the explanation written by Evaluate and returned by
// Details.
details string
}
)

// Details returns the explanation stored by the last Evaluate call that
// flagged the machine; it is empty if no such call happened.
func (i *issueBMCInfoOutdated) Details() string {
	return i.details
}

// Evaluate reports whether the IPMI information of m is outdated.
//
// The machine is flagged when its IPMI LastUpdated timestamp is the zero
// time, or when more than 20 minutes have passed since that timestamp. In
// both cases the reason is stored as the issue details. ec and c are not
// used by this check.
func (i *issueBMCInfoOutdated) Evaluate(m metal.Machine, ec metal.ProvisioningEventContainer, c *Config) bool {
	updatedAt := m.IPMI.LastUpdated

	if updatedAt.IsZero() {
		i.details = "machine ipmi has never been set"
		return true
	}

	if age := time.Since(updatedAt); age > 20*time.Minute {
		i.details = fmt.Sprintf("last updated %s ago", age.String())
		return true
	}

	return false
}

// Spec returns the static description of the outdated BMC information issue:
// its type, severity, human-readable description and troubleshooting link.
func (*issueBMCInfoOutdated) Spec() *spec {
	s := spec{
		Type:        TypeBMCInfoOutdated,
		Severity:    SeverityMajor,
		Description: "BMC has not been updated from either metal-hammer or metal-bmc",
		RefURL:      "https://docs.metal-stack.io/stable/installation/troubleshoot/#bmc-info-outdated",
	}

	return &s
}
28 changes: 28 additions & 0 deletions cmd/metal-api/internal/issues/bmc-without-ip.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
package issues

import "github.com/metal-stack/metal-api/cmd/metal-api/internal/metal"

const (
// TypeBMCWithoutIP identifies the issue raised when a machine's IPMI
// address is empty.
TypeBMCWithoutIP Type = "bmc-without-ip"
)

type (
// issueBMCWithoutIP implements the missing BMC IP address check; it keeps
// no state.
issueBMCWithoutIP struct{}
)

// Spec returns the static description of the missing BMC IP issue: its type,
// severity, human-readable description and troubleshooting link.
func (*issueBMCWithoutIP) Spec() *spec {
	s := spec{
		Type:        TypeBMCWithoutIP,
		Severity:    SeverityMajor,
		Description: "BMC has no ip address",
		RefURL:      "https://docs.metal-stack.io/stable/installation/troubleshoot/#bmc-without-ip",
	}

	return &s
}

// Evaluate reports whether the IPMI address of m is empty. ec and c are not
// used by this check.
func (*issueBMCWithoutIP) Evaluate(m metal.Machine, ec metal.ProvisioningEventContainer, c *Config) bool {
	address := m.IPMI.Address

	return address == ""
}

// Details always returns the empty string; this issue carries no additional
// information.
func (*issueBMCWithoutIP) Details() string {
	return ""
}
28 changes: 28 additions & 0 deletions cmd/metal-api/internal/issues/bmc-without-mac.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
package issues

import "github.com/metal-stack/metal-api/cmd/metal-api/internal/metal"

const (
// TypeBMCWithoutMAC identifies the issue raised when a machine's IPMI MAC
// address is empty.
TypeBMCWithoutMAC Type = "bmc-without-mac"
)

type (
// issueBMCWithoutMAC implements the missing BMC MAC address check; it
// keeps no state.
issueBMCWithoutMAC struct{}
)

// Spec returns the static description of the missing BMC MAC issue: its type,
// severity, human-readable description and troubleshooting link.
func (*issueBMCWithoutMAC) Spec() *spec {
	s := spec{
		Type:        TypeBMCWithoutMAC,
		Severity:    SeverityMajor,
		Description: "BMC has no mac address",
		RefURL:      "https://docs.metal-stack.io/stable/installation/troubleshoot/#bmc-without-mac",
	}

	return &s
}

// Evaluate reports whether the IPMI MAC address of m is empty. ec and c are
// not used by this check.
func (*issueBMCWithoutMAC) Evaluate(m metal.Machine, ec metal.ProvisioningEventContainer, c *Config) bool {
	mac := m.IPMI.MacAddress

	return mac == ""
}

// Details always returns the empty string; this issue carries no additional
// information.
func (*issueBMCWithoutMAC) Details() string {
	return ""
}
38 changes: 38 additions & 0 deletions cmd/metal-api/internal/issues/crash-loop.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
package issues

import (
"github.com/metal-stack/metal-api/cmd/metal-api/internal/metal"
"github.com/metal-stack/metal-lib/pkg/pointer"
)

const (
// TypeCrashLoop identifies the issue raised when a machine is in a
// provisioning crash loop.
TypeCrashLoop Type = "crashloop"
)

type (
// issueCrashLoop implements the crash loop check; it keeps no state.
issueCrashLoop struct{}
)

// Spec returns the static description of the crash loop issue: its type,
// severity, human-readable description and troubleshooting link.
func (*issueCrashLoop) Spec() *spec {
	s := spec{
		Type:        TypeCrashLoop,
		Severity:    SeverityMajor,
		Description: "machine is in a provisioning crash loop (⭕)",
		RefURL:      "https://docs.metal-stack.io/stable/installation/troubleshoot/#crashloop",
	}

	return &s
}

// Evaluate reports whether the machine is in a provisioning crash loop.
//
// The machine is flagged when ec.CrashLoop is set, unless the first event in
// ec.Events is a waiting event: machines which are waiting are not considered
// to have issues. m and c are not used by this check.
func (i *issueCrashLoop) Evaluate(m metal.Machine, ec metal.ProvisioningEventContainer, c *Config) bool {
	// The event lookup is only performed when the crash loop flag is set.
	return ec.CrashLoop && pointer.FirstOrZero(ec.Events).Event != metal.ProvisioningEventWaiting
}

// Details always returns the empty string; this issue carries no additional
// information.
func (*issueCrashLoop) Details() string {
	return ""
}
41 changes: 41 additions & 0 deletions cmd/metal-api/internal/issues/failed-machine-reclaim.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
package issues

import (
"github.com/metal-stack/metal-api/cmd/metal-api/internal/metal"
"github.com/metal-stack/metal-lib/pkg/pointer"
)

const (
// TypeFailedMachineReclaim identifies the issue raised when a machine
// phones home although it is not allocated.
TypeFailedMachineReclaim Type = "failed-machine-reclaim"
)

type (
// issueFailedMachineReclaim implements the failed machine reclaim check;
// it keeps no state.
issueFailedMachineReclaim struct{}
)

// Spec returns the static description of the failed machine reclaim issue:
// its type, severity, human-readable description and troubleshooting link.
func (*issueFailedMachineReclaim) Spec() *spec {
	s := spec{
		Type:        TypeFailedMachineReclaim,
		Severity:    SeverityCritical,
		Description: "machine phones home but not allocated",
		RefURL:      "https://docs.metal-stack.io/stable/installation/troubleshoot/#failed-machine-reclaim",
	}

	return &s
}

// Evaluate reports whether reclaiming the machine has failed.
//
// The machine is flagged when ec.FailedMachineReclaim is set, or when it has
// no allocation while the first event in ec.Events is a phoned-home event.
// c is not used by this check.
func (*issueFailedMachineReclaim) Evaluate(m metal.Machine, ec metal.ProvisioningEventContainer, c *Config) bool {
	switch {
	case ec.FailedMachineReclaim:
		return true
	case m.Allocation == nil && pointer.FirstOrZero(ec.Events).Event == metal.ProvisioningEventPhonedHome:
		// Compatibility: before the provisioning FSM was renewed, this state
		// could only be detected this way, so the condition is kept.
		return true
	default:
		return false
	}
}

// Details always returns the empty string; this issue carries no additional
// information.
func (*issueFailedMachineReclaim) Details() string {
	return ""
}
Loading