Skip to content

Commit

Permalink
More elaborate system status handling (#4571)
Browse files Browse the repository at this point in the history
  • Loading branch information
yosefmih authored Apr 24, 2024
1 parent 6152485 commit 445ff1e
Show file tree
Hide file tree
Showing 8 changed files with 293 additions and 174 deletions.
17 changes: 17 additions & 0 deletions api/server/handlers/webhook/prometheus_incoming.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,10 @@ import (
"context"
"fmt"
"net/http"
"time"

"connectrpc.com/connect"
"google.golang.org/protobuf/types/known/timestamppb"

porterv1 "github.com/porter-dev/api-contracts/generated/go/porter/v1"
"github.com/porter-dev/porter/api/server/authz"
Expand Down Expand Up @@ -83,10 +85,25 @@ func (p *PrometheusAlertWebhookHandler) handlePrometheusAlert(ctx context.Contex
if alert.Labels["alertname"] == "NoopAlert" {
continue
}
startTime, err := time.Parse(time.RFC3339, alert.StartsAt)
if err != nil {
return telemetry.Error(ctx, span, err, "error parsing alert start time")
}
endTime, err := time.Parse(time.RFC3339, alert.EndsAt)
if err != nil {
return telemetry.Error(ctx, span, err, "error parsing alert end time")
}
var endTimestamp *timestamppb.Timestamp
if endTime.After(startTime) {
endTimestamp = timestamppb.New(endTime)
}
recordPrometheusAlertRequest.Msg.Alerts = append(recordPrometheusAlertRequest.Msg.Alerts, &porterv1.Alert{
Name: alert.Labels["name"],
Namespace: alert.Labels["namespace"],
Type: p.getType(alert),
Severity: alert.Labels["severity"],
StartTime: timestamppb.New(startTime),
EndTime: endTimestamp,
})
}
telemetry.WithAttributes(span, telemetry.AttributeKV{Key: "porter-app-alert-labels", Value: labelKeyValues})
Expand Down
245 changes: 142 additions & 103 deletions api/types/system_service_status.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,39 +21,71 @@ const (
ServiceDaemonSet InvolvedObjectType = "daemonset"
)

// Status is the status of a service
// it has to be one of healthy, partial_failure or failure
type Status string
// toInternalInvolvedObjectType maps a porterv1 involved object type onto the
// package's InvolvedObjectType. Deployment, statefulset and daemonset are the
// only recognised values; anything else yields the empty InvolvedObjectType.
func toInternalInvolvedObjectType(apiType porterv1.InvolvedObjectType) InvolvedObjectType {
	// Zero value ("") doubles as the result for unrecognised inputs.
	var objectType InvolvedObjectType
	switch apiType {
	case porterv1.InvolvedObjectType_INVOLVED_OBJECT_TYPE_DEPLOYMENT:
		objectType = ServiceDeployment
	case porterv1.InvolvedObjectType_INVOLVED_OBJECT_TYPE_STATEFULSET:
		objectType = ServiceStatefulSet
	case porterv1.InvolvedObjectType_INVOLVED_OBJECT_TYPE_DAEMONSET:
		objectType = ServiceDaemonSet
	}
	return objectType
}

// ServiceStatus is the status of a system service
type ServiceStatus string

const (
// StatusHealthy is when a service is fully healthy
StatusHealthy Status = "healthy"
// StatusPartialFailure is when a service is partially failing
StatusPartialFailure Status = "partial_failure"
// StatusFailure is when a service is critically in failure mode
StatusFailure Status = "failure"
// ServiceStatus_Healthy is the status of a system service when it is fully healthy
ServiceStatus_Healthy ServiceStatus = "healthy"
// ServiceStatus_PartialFailure is the status of a system service when it is partially failing
ServiceStatus_PartialFailure ServiceStatus = "partial_failure"
// ServiceStatus_Failure is the status of a system service when it is critically in failure mode
ServiceStatus_Failure ServiceStatus = "failure"
// ServiceStatus_Undefined is the status of a system service when it is in an undefined state
ServiceStatus_Undefined ServiceStatus = "undefined"
)

// SystemStatusHistory contains the system infrastructure status for a cluster
type SystemStatusHistory struct {
// ClusterStatusHistory is a time series of the cluster's health
ClusterStatusHistory []ClusterHealthStatus `json:"cluster_status_history"`
// SystemServiceStatusHistories is a list of SystemServiceStatusHistory for each service
// there should be only one entry for a service
SystemServiceStatusHistories []SystemServiceStatusHistory `json:"system_service_status_histories"`
// toServiceStatus maps a porterv1 status onto the package's ServiceStatus.
// Healthy, partial failure and failure are translated one-to-one; every other
// value is reported as ServiceStatus_Undefined.
func toServiceStatus(apiStatus porterv1.Status) ServiceStatus {
	// Start from the fallback and override it only for recognised statuses.
	status := ServiceStatus_Undefined
	switch apiStatus {
	case porterv1.Status_STATUS_HEALTHY:
		status = ServiceStatus_Healthy
	case porterv1.Status_STATUS_PARTIAL_FAILURE:
		status = ServiceStatus_PartialFailure
	case porterv1.Status_STATUS_FAILURE:
		status = ServiceStatus_Failure
	}
	return status
}

// ClusterHealthStatus is the status of a cluster at a certain timestamp
type ClusterHealthStatus struct {
Timestamp time.Time `json:"timestamp"`
// Responsive is set to true if the cluster sent all heartbeats in the time period represented by the Timestamp
Responsive bool `json:"responsive"`
}
// ClusterHealthType is the type of health check on the cluster that a history is generated from
type ClusterHealthType string

// SystemServiceStatusHistory contains the status of a system service
type SystemServiceStatusHistory struct {
SystemService SystemService `json:"system_service"`
StatusHistory []ServiceStatus `json:"status_history"`
const (
// ClusterHealthType_Connected identifies the health history generated by checking whether the cluster is connected
ClusterHealthType_Connected ClusterHealthType = "connected"

// ClusterHealthType_Pingable identifies the health history generated by checking whether the cluster is pingable
ClusterHealthType_Pingable ClusterHealthType = "pingable"

// ClusterHealthType_MetricsHealthy identifies the health history generated by checking whether the cluster metrics are healthy
ClusterHealthType_MetricsHealthy ClusterHealthType = "metrics_healthy"
)

// toClusterHealthType maps a porterv1 cluster health type onto the package's
// ClusterHealthType. It returns an error for any value other than connected,
// pingable or metrics-healthy.
func toClusterHealthType(clusterHealthType porterv1.ClusterHealthType) (ClusterHealthType, error) {
	if clusterHealthType == porterv1.ClusterHealthType_CLUSTER_HEALTH_TYPE_CONNECTED {
		return ClusterHealthType_Connected, nil
	}
	if clusterHealthType == porterv1.ClusterHealthType_CLUSTER_HEALTH_TYPE_PINGABLE {
		return ClusterHealthType_Pingable, nil
	}
	if clusterHealthType == porterv1.ClusterHealthType_CLUSTER_HEALTH_TYPE_METRICS_HEALTHY {
		return ClusterHealthType_MetricsHealthy, nil
	}
	// No recognised type matched.
	return "", errors.New("unknown cluster health type")
}

// SystemService identifies a system service
Expand All @@ -63,106 +95,113 @@ type SystemService struct {
InvolvedObjectType InvolvedObjectType `json:"involved_object_type"`
}

// ServiceStatus is the status of a system service at a certain timestamp
type ServiceStatus struct {
Timestamp time.Time `json:"timestamp"`
Status Status `json:"status"`
// toSystemService copies the name, namespace and involved object type from the
// porterv1 SystemService into the package's SystemService. A nil input is
// rejected with an error and a zero-valued SystemService.
func toSystemService(apiSystemService *porterv1.SystemService) (SystemService, error) {
	var systemService SystemService
	if apiSystemService == nil {
		return systemService, errors.New("unexpected nil: SystemService")
	}

	systemService.Name = apiSystemService.Name
	systemService.Namespace = apiSystemService.Namespace
	systemService.InvolvedObjectType = toInternalInvolvedObjectType(apiSystemService.InvolvedObjectType)
	return systemService, nil
}

// ToSystemStatusHistory converts the CCP response to the internal SystemStatusHistory
func ToSystemStatusHistory(apiResp *porterv1.SystemStatusHistoryResponse) (SystemStatusHistory, error) {
if apiResp == nil {
return SystemStatusHistory{}, fmt.Errorf("nil system service status response")
}
resp := SystemStatusHistory{
ClusterStatusHistory: []ClusterHealthStatus{},
SystemServiceStatusHistories: []SystemServiceStatusHistory{},
// HealthStatus is the status over a certain period of time
type HealthStatus struct {
// StartTime is when the period covered by this status begins
StartTime time.Time `json:"start_time"`
// EndTime is when the period ends; it is optional and a nil value is omitted from the JSON output
EndTime *time.Time `json:"end_time,omitempty"`
// Status is the health status that applies for the period
Status ServiceStatus `json:"status"`
// Description is free-form text accompanying the status; omitted from the JSON output when empty
Description string `json:"description,omitempty"`
}

// DailyHealthStatus contains the health status of a system service or cluster over one day
type DailyHealthStatus struct {
// StatusPercentages maps each status to a percentage value for the day
// NOTE(review): the scale (0-1 vs 0-100) is not visible here; confirm against the producer
StatusPercentages map[ServiceStatus]float32 `json:"status_percentages,omitempty"`
// HealthStatuses lists the individual status periods recorded for the day
HealthStatuses []*HealthStatus `json:"health_statuses,omitempty"`
}

// toDailyHealthStatus converts from the proto DailyHealthStatus to the local DailyHealthStatus
func toDailyHealthStatus(protoDailyHealthStatus *porterv1.DailyHealthStatus) DailyHealthStatus {
dailyHealthStatus := DailyHealthStatus{
StatusPercentages: map[ServiceStatus]float32{},
HealthStatuses: make([]*HealthStatus, 0),
}
for _, apiClusterStatus := range apiResp.ClusterStatusHistory {
clusterHealthStatus, err := toClusterHealthStatus(apiClusterStatus)
if err != nil {
return resp, err
}
resp.ClusterStatusHistory = append(resp.ClusterStatusHistory, clusterHealthStatus)
for _, statusPercentage := range protoDailyHealthStatus.StatusPercentages {
dailyHealthStatus.StatusPercentages[toServiceStatus(statusPercentage.Status)] = statusPercentage.Percentage
}
for _, apiServiceStatusHistory := range apiResp.SystemServiceStatusHistories {
statusHistory, err := toSystemServiceStatusHistory(apiServiceStatusHistory)
if err != nil {
return resp, err
for _, healthStatus := range protoDailyHealthStatus.HealthStatuses {
var endTime *time.Time = nil
if healthStatus.EndTime != nil {
endTimeTemp := healthStatus.EndTime.AsTime()
endTime = &endTimeTemp
}
resp.SystemServiceStatusHistories = append(resp.SystemServiceStatusHistories, statusHistory)
dailyHealthStatus.HealthStatuses = append(dailyHealthStatus.HealthStatuses, &HealthStatus{
StartTime: healthStatus.StartTime.AsTime(),
EndTime: endTime,
Status: toServiceStatus(healthStatus.Status),
Description: healthStatus.Description,
})
}
return resp, nil
return dailyHealthStatus
}

func toClusterHealthStatus(apiClusterStatus *porterv1.ClusterStatus) (ClusterHealthStatus, error) {
if apiClusterStatus == nil {
return ClusterHealthStatus{}, errors.New("unexpected nil: ClusterStatus")
}
return ClusterHealthStatus{
Timestamp: apiClusterStatus.TimestampField.AsTime(),
Responsive: apiClusterStatus.Responsive,
}, nil
// SystemServiceStatusHistory contains the daily status history of a system service
type SystemServiceStatusHistory struct {
// SystemService identifies the service this history belongs to
SystemService SystemService `json:"system_service"`
// DailyHealthHistory holds one DailyHealthStatus per day, keyed by an int32 day value
// NOTE(review): what the key counts from (offset vs. calendar day) is not visible here; confirm
DailyHealthHistory map[int32]DailyHealthStatus `json:"daily_health_history,omitempty"`
}

func toSystemServiceStatusHistory(apiServiceStatusHistory *porterv1.SystemServiceStatusHistory) (SystemServiceStatusHistory, error) {
if apiServiceStatusHistory == nil {
func toSystemServiceStatusHistory(protoSystemServiceStatusHistory *porterv1.SystemServiceStatusHistory) (SystemServiceStatusHistory, error) {
if protoSystemServiceStatusHistory == nil {
return SystemServiceStatusHistory{}, errors.New("unexpected nil: SystemServiceStatusHistory")
}
systemService, err := toSystemService(apiServiceStatusHistory.SystemService)
systemService, err := toSystemService(protoSystemServiceStatusHistory.SystemService)
if err != nil {
return SystemServiceStatusHistory{}, err
}
resp := SystemServiceStatusHistory{
SystemService: systemService,
StatusHistory: []ServiceStatus{},
SystemService: systemService,
DailyHealthHistory: map[int32]DailyHealthStatus{},
}
for _, apiStatus := range apiServiceStatusHistory.StatusHistory {
status, err := toStatus(apiStatus.Status)
if err != nil {
return resp, err
}
resp.StatusHistory = append(resp.StatusHistory, ServiceStatus{
Timestamp: apiStatus.TimestampField.AsTime(),
Status: status,
})
for day, protoDailyHealthStatus := range protoSystemServiceStatusHistory.DailyStatusHistory {
resp.DailyHealthHistory[day] = toDailyHealthStatus(protoDailyHealthStatus)
}
return resp, nil
}

func toSystemService(apiSystemService *porterv1.SystemService) (SystemService, error) {
if apiSystemService == nil {
return SystemService{}, errors.New("unexpected nil: SystemService")
}
return SystemService{
Name: apiSystemService.Name,
Namespace: apiSystemService.Namespace,
InvolvedObjectType: toInternalInvolvedObjectType(apiSystemService.InvolvedObjectType),
}, nil
// SystemStatusHistory contains the system infrastructure status for a cluster
type SystemStatusHistory struct {
// ClusterStatusHistories holds, for each cluster health type, the daily health statuses keyed by an int32 day value
ClusterStatusHistories map[ClusterHealthType]map[int32]DailyHealthStatus `json:"cluster_status_histories,omitempty"`
// SystemServiceStatusHistories lists the status history of each system service
SystemServiceStatusHistories []SystemServiceStatusHistory `json:"system_service_status_histories,omitempty"`
}

func toInternalInvolvedObjectType(apiType porterv1.InvolvedObjectType) InvolvedObjectType {
switch apiType {
case porterv1.InvolvedObjectType_INVOLVED_OBJECT_TYPE_DEPLOYMENT:
return ServiceDeployment
case porterv1.InvolvedObjectType_INVOLVED_OBJECT_TYPE_STATEFULSET:
return ServiceStatefulSet
case porterv1.InvolvedObjectType_INVOLVED_OBJECT_TYPE_DAEMONSET:
return ServiceDaemonSet
default:
return ""
// ToSystemStatusHistory converts the CCP response to the internal SystemStatusHistory
func ToSystemStatusHistory(apiResp *porterv1.SystemStatusHistoryResponse) (SystemStatusHistory, error) {
if apiResp == nil {
return SystemStatusHistory{}, fmt.Errorf("nil system service status response")
}
}

func toStatus(apiStatus porterv1.Status) (Status, error) {
switch apiStatus {
case porterv1.Status_STATUS_HEALTHY:
return StatusHealthy, nil
case porterv1.Status_STATUS_PARTIAL_FAILURE:
return StatusPartialFailure, nil
case porterv1.Status_STATUS_FAILURE:
return StatusFailure, nil
default:
return "", errors.New("unknown service status")
resp := SystemStatusHistory{
ClusterStatusHistories: map[ClusterHealthType]map[int32]DailyHealthStatus{},
SystemServiceStatusHistories: []SystemServiceStatusHistory{},
}
for _, clusterHealthHistory := range apiResp.ClusterStatusHistories {
clusterHealthType, err := toClusterHealthType(clusterHealthHistory.ClusterHealthType)
if err != nil {
return resp, fmt.Errorf("unknown cluster health type: %s", clusterHealthHistory.ClusterHealthType)
}
// We don't expect duplicate cluster health types in the output, thus this should be safe
resp.ClusterStatusHistories[clusterHealthType] = map[int32]DailyHealthStatus{}
for day, protoDailyHealthStatus := range clusterHealthHistory.DailyStatusHistory {
dailyHealthStatus := toDailyHealthStatus(protoDailyHealthStatus)
resp.ClusterStatusHistories[clusterHealthType][day] = dailyHealthStatus
}
}
for _, apiServiceStatusHistory := range apiResp.SystemServiceStatusHistories {
statusHistory, err := toSystemServiceStatusHistory(apiServiceStatusHistory)
if err != nil {
return resp, err
}
resp.SystemServiceStatusHistories = append(resp.SystemServiceStatusHistories, statusHistory)
}
return resp, nil
}
Loading

0 comments on commit 445ff1e

Please sign in to comment.