Fix the naming as per convention for Metrics
Some metrics are not named as per convention. This fixes them in a backward-compatible way.
khrm committed Sep 7, 2023
1 parent 55e0450 commit b82d209
Showing 3 changed files with 96 additions and 7 deletions.
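The backward-compatible pattern applied throughout this commit is the same in every file: the old measure keeps its deprecated name, a second measure with the conventional name is added next to it, one view is registered per measure, and both are recorded together. A minimal sketch of that pattern using the same OpenCensus APIs as the controller (only the measure names come from the commit; the `main` wiring is illustrative):

```go
// Minimal sketch of the backward-compatible renaming pattern used in this
// commit: the deprecated measure keeps its old name while a new measure with
// the conventional name is registered and recorded alongside it. Only the
// measure names come from the commit; the main() wiring is illustrative.
package main

import (
	"context"
	"log"

	"go.opencensus.io/stats"
	"go.opencensus.io/stats/view"
)

var (
	// Old gauge name, kept so existing dashboards keep working (deprecated).
	runningPRsCount = stats.Float64("running_pipelineruns_count",
		"Number of pipelineruns executing currently", stats.UnitDimensionless)
	// New name: gauges drop the "_count" suffix per the naming convention.
	runningPRs = stats.Float64("running_pipelineruns",
		"Number of pipelineruns executing currently", stats.UnitDimensionless)
)

func main() {
	// One LastValue view per measure, so both names are exported.
	if err := view.Register(
		&view.View{Description: runningPRsCount.Description(), Measure: runningPRsCount, Aggregation: view.LastValue()},
		&view.View{Description: runningPRs.Description(), Measure: runningPRs, Aggregation: view.LastValue()},
	); err != nil {
		log.Fatal(err)
	}

	// Record the same value against both measures, as the controller does.
	running := 3.0
	stats.Record(context.Background(), runningPRsCount.M(running), runningPRs.M(running))
}
```

The taskrun metrics below follow the same shape; only the measure names and aggregations differ.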
14 changes: 12 additions & 2 deletions docs/metrics.md
@@ -15,14 +15,24 @@ We expose several kinds of exporters, including Prometheus, Google Stackdriver,
| ---------- | ----------- | ----------- | ----------- |
| `tekton_pipelines_controller_pipelinerun_duration_seconds_[bucket, sum, count]` | Histogram/LastValue(Gauge) | `*pipeline`=&lt;pipeline_name&gt; <br> `*pipelinerun`=&lt;pipelinerun_name&gt; <br> `status`=&lt;status&gt; <br> `namespace`=&lt;pipelinerun-namespace&gt; | experimental |
| `tekton_pipelines_controller_pipelinerun_taskrun_duration_seconds_[bucket, sum, count]` | Histogram/LastValue(Gauge) | `*pipeline`=&lt;pipeline_name&gt; <br> `*pipelinerun`=&lt;pipelinerun_name&gt; <br> `status`=&lt;status&gt; <br> `*task`=&lt;task_name&gt; <br> `*taskrun`=&lt;taskrun_name&gt;<br> `namespace`=&lt;pipelineruns-taskruns-namespace&gt;| experimental |
| `tekton_pipelines_controller_pipelinerun_count` | Counter | `status`=&lt;status&gt; | experimental |
| `tekton_pipelines_controller_running_pipelineruns_count` | Gauge | | experimental |
| `tekton_pipelines_controller_pipelinerun_count` | Counter | `status`=&lt;status&gt; | deprecated |
| `tekton_pipelines_controller_pipelinerun_total` | Counter | `status`=&lt;status&gt; | experimental |
| `tekton_pipelines_controller_running_pipelineruns_count` | Gauge | | deprecated |
| `tekton_pipelines_controller_running_pipelineruns` | Gauge | | experimental |
| `tekton_pipelines_controller_taskrun_duration_seconds_[bucket, sum, count]` | Histogram/LastValue(Gauge) | `status`=&lt;status&gt; <br> `*task`=&lt;task_name&gt; <br> `*taskrun`=&lt;taskrun_name&gt;<br> `namespace`=&lt;pipelineruns-taskruns-namespace&gt; | experimental |
| `tekton_pipelines_controller_taskrun_count` | Counter | `status`=&lt;status&gt; | experimental |
| `tekton_pipelines_controller_running_taskruns_count` | Gauge | | experimental |
| `tekton_pipelines_controller_running_taskruns_throttled_by_quota_count` | Gauge | | experimental |
| `tekton_pipelines_controller_running_taskruns_throttled_by_node_count` | Gauge | | experimental |
| `tekton_pipelines_controller_taskruns_pod_latency_milliseconds` | Gauge | `namespace`=&lt;taskruns-namespace&gt; <br> `pod`=&lt;taskrun_pod_name&gt; <br> `*task`=&lt;task_name&gt; <br> `*taskrun`=&lt;taskrun_name&gt;<br> | experimental |
| `tekton_pipelines_controller_taskrun_count` | Counter | `status`=&lt;status&gt; | deprecated |
| `tekton_pipelines_controller_taskrun_total` | Counter | `status`=&lt;status&gt; | experimental |
| `tekton_pipelines_controller_running_taskruns_count` | Gauge | | deprecated |
| `tekton_pipelines_controller_running_taskruns` | Gauge | | experimental |
| `tekton_pipelines_controller_running_taskruns_throttled_by_quota_count` | Gauge | | deprecated |
| `tekton_pipelines_controller_running_taskruns_throttled_by_node_count` | Gauge | | deprecated |
| `tekton_pipelines_controller_running_taskruns_throttled_by_quota` | Gauge | | experimental |
| `tekton_pipelines_controller_running_taskruns_throttled_by_node` | Gauge | | experimental |
| `tekton_pipelines_controller_client_latency_[bucket, sum, count]` | Histogram | | experimental |

The Labels/Tags marked with "*" are optional. There is also a choice between Histogram and LastValue(Gauge) for the pipelinerun and taskrun duration metrics.
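The Histogram/LastValue choice mentioned above corresponds to the aggregation attached to the duration view. A rough sketch of how that selection can look, assuming the choice arrives as a plain string setting (the bucket bounds and the `durationType` switch are illustrative assumptions, not the controller's actual configuration handling):

```go
// Sketch of how the Histogram vs LastValue(Gauge) choice for the duration
// metrics maps onto OpenCensus view aggregations. The bucket bounds and the
// durationType switch are illustrative, not the controller's configuration.
package metricsexample

import (
	"go.opencensus.io/stats"
	"go.opencensus.io/stats/view"
)

var prDuration = stats.Float64("pipelinerun_duration_seconds",
	"The pipelinerun's execution time in seconds", stats.UnitDimensionless)

// durationView builds the view for prDuration with the requested aggregation.
func durationView(durationType string) *view.View {
	// Histogram: bucketed distribution of observed durations (bounds illustrative).
	agg := view.Distribution(10, 30, 60, 300, 900, 1800, 3600)
	if durationType == "lastvalue" {
		// Gauge: only the most recently observed duration is exported.
		agg = view.LastValue()
	}
	return &view.View{
		Description: prDuration.Description(),
		Measure:     prDuration,
		Aggregation: agg,
	}
}
```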
34 changes: 30 additions & 4 deletions pkg/pipelinerunmetrics/metrics.go
@@ -55,10 +55,20 @@
stats.UnitDimensionless)
prCountView *view.View

prTotal = stats.Float64("pipelinerun_total",
"Number of pipelineruns",
stats.UnitDimensionless)
prTotalView *view.View

runningPRsCount = stats.Float64("running_pipelineruns_count",
"Number of pipelineruns executing currently",
stats.UnitDimensionless)
runningPRsCountView *view.View

runningPRs = stats.Float64("running_pipelineruns",
"Number of pipelineruns executing currently",
stats.UnitDimensionless)
runningPRsView *view.View
)

const (
@@ -161,15 +171,29 @@ func viewRegister(cfg *config.Metrics) error {
Aggregation: view.LastValue(),
}

prTotalView = &view.View{
Description: prTotal.Description(),
Measure: prTotal,
Aggregation: view.Count(),
TagKeys: []tag.Key{statusTag},
}
runningPRsView = &view.View{
Description: runningPRs.Description(),
Measure: runningPRs,
Aggregation: view.LastValue(),
}

return view.Register(
prDurationView,
prCountView,
runningPRsCountView,
prTotalView,
runningPRsView,
)
}

func viewUnregister() {
view.Unregister(prDurationView, prCountView, runningPRsCountView)
view.Unregister(prDurationView, prCountView, runningPRsCountView, prTotalView, runningPRsView)
}

// MetricsOnStore returns a function that checks if metrics are configured for a config.Store, and registers it if so
@@ -252,6 +276,7 @@ func (r *Recorder) DurationAndCount(pr *v1.PipelineRun, beforeCondition *apis.Co

metrics.Record(ctx, prDuration.M(duration.Seconds()))
metrics.Record(ctx, prCount.M(1))
metrics.Record(ctx, prTotal.M(1))

return nil
}
@@ -271,18 +296,19 @@ func (r *Recorder) RunningPipelineRuns(lister listers.PipelineRunLister) error {
return fmt.Errorf("failed to list pipelineruns while generating metrics : %w", err)
}

var runningPRs int
var runningPipelineRuns int
for _, pr := range prs {
if !pr.IsDone() {
runningPRs++
runningPipelineRuns++
}
}

ctx, err := tag.New(context.Background())
if err != nil {
return err
}
metrics.Record(ctx, runningPRsCount.M(float64(runningPRs)))
metrics.Record(ctx, runningPRsCount.M(float64(runningPipelineRuns)))
metrics.Record(ctx, runningPRs.M(float64(runningPipelineRuns)))

return nil
}
55 changes: 54 additions & 1 deletion pkg/taskrunmetrics/metrics.go
@@ -55,9 +55,13 @@ var (
trDurationView *view.View
prTRDurationView *view.View
trCountView *view.View
trTotalView *view.View
runningTRsCountView *view.View
runningTRsView *view.View
runningTRsThrottledByQuotaCountView *view.View
runningTRsThrottledByNodeCountView *view.View
runningTRsThrottledByQuotaView *view.View
runningTRsThrottledByNodeView *view.View
podLatencyView *view.View

trDuration = stats.Float64(
@@ -74,10 +78,18 @@ var (
"number of taskruns",
stats.UnitDimensionless)

trTotal = stats.Float64("taskrun_total",
"Number of taskruns",
stats.UnitDimensionless)

runningTRsCount = stats.Float64("running_taskruns_count",
"Number of taskruns executing currently",
stats.UnitDimensionless)

runningTRs = stats.Float64("running_taskruns",
"Number of taskruns executing currently",
stats.UnitDimensionless)

runningTRsThrottledByQuotaCount = stats.Float64("running_taskruns_throttled_by_quota_count",
"Number of taskruns executing currently, but whose underlying Pods or Containers are suspended by k8s because of defined ResourceQuotas. Such suspensions can occur as part of initial scheduling of the Pod, or scheduling of any of the subsequent Container(s) in the Pod after the first Container is started",
stats.UnitDimensionless)
@@ -86,7 +98,15 @@ var (
"Number of taskruns executing currently, but whose underlying Pods or Containers are suspended by k8s because of Node level constraints. Such suspensions can occur as part of initial scheduling of the Pod, or scheduling of any of the subsequent Container(s) in the Pod after the first Container is started",
stats.UnitDimensionless)

podLatency = stats.Float64("taskruns_pod_latency_milliseconds",
runningTRsThrottledByQuota = stats.Float64("running_taskruns_throttled_by_quota",
"Number of taskruns executing currently, but whose underlying Pods or Containers are suspended by k8s because of defined ResourceQuotas. Such suspensions can occur as part of initial scheduling of the Pod, or scheduling of any of the subsequent Container(s) in the Pod after the first Container is started",
stats.UnitDimensionless)

runningTRsThrottledByNode = stats.Float64("running_taskruns_throttled_by_node",
"Number of taskruns executing currently, but whose underlying Pods or Containers are suspended by k8s because of Node level constraints. Such suspensions can occur as part of initial scheduling of the Pod, or scheduling of any of the subsequent Container(s) in the Pod after the first Container is started",
stats.UnitDimensionless)

podLatency = stats.Float64("taskruns_pod_latency",
"scheduling latency for the taskruns pods",
stats.UnitMilliseconds)
)
@@ -204,11 +224,22 @@ func viewRegister(cfg *config.Metrics) error {
Aggregation: view.Count(),
TagKeys: []tag.Key{statusTag},
}
trTotalView = &view.View{
Description: trTotal.Description(),
Measure: trTotal,
Aggregation: view.Count(),
TagKeys: []tag.Key{statusTag},
}
runningTRsCountView = &view.View{
Description: runningTRsCount.Description(),
Measure: runningTRsCount,
Aggregation: view.LastValue(),
}
runningTRsView = &view.View{
Description: runningTRs.Description(),
Measure: runningTRs,
Aggregation: view.LastValue(),
}
runningTRsThrottledByQuotaCountView = &view.View{
Description: runningTRsThrottledByQuotaCount.Description(),
Measure: runningTRsThrottledByQuotaCount,
@@ -219,6 +250,16 @@ func viewRegister(cfg *config.Metrics) error {
Measure: runningTRsThrottledByNodeCount,
Aggregation: view.LastValue(),
}
runningTRsThrottledByQuotaView = &view.View{
Description: runningTRsThrottledByQuota.Description(),
Measure: runningTRsThrottledByQuota,
Aggregation: view.LastValue(),
}
runningTRsThrottledByNodeView = &view.View{
Description: runningTRsThrottledByNode.Description(),
Measure: runningTRsThrottledByNode,
Aggregation: view.LastValue(),
}
podLatencyView = &view.View{
Description: podLatency.Description(),
Measure: podLatency,
@@ -229,9 +270,13 @@ func viewRegister(cfg *config.Metrics) error {
trDurationView,
prTRDurationView,
trCountView,
trTotalView,
runningTRsCountView,
runningTRsView,
runningTRsThrottledByQuotaCountView,
runningTRsThrottledByNodeCountView,
runningTRsThrottledByQuotaView,
runningTRsThrottledByNodeView,
podLatencyView,
)
}
@@ -241,9 +286,13 @@ func viewUnregister() {
trDurationView,
prTRDurationView,
trCountView,
trTotalView,
runningTRsCountView,
runningTRsView,
runningTRsThrottledByQuotaCountView,
runningTRsThrottledByNodeCountView,
runningTRsThrottledByQuotaView,
runningTRsThrottledByNodeView,
podLatencyView,
)
}
@@ -336,6 +385,7 @@ func (r *Recorder) DurationAndCount(ctx context.Context, tr *v1.TaskRun, beforeC

metrics.Record(ctx, durationStat.M(duration.Seconds()))
metrics.Record(ctx, trCount.M(1))
metrics.Record(ctx, trTotal.M(1))

return nil
}
@@ -379,8 +429,11 @@ func (r *Recorder) RunningTaskRuns(ctx context.Context, lister listers.TaskRunLi
return err
}
metrics.Record(ctx, runningTRsCount.M(float64(runningTrs)))
metrics.Record(ctx, runningTRs.M(float64(runningTrs)))
metrics.Record(ctx, runningTRsThrottledByNodeCount.M(float64(trsThrottledByNode)))
metrics.Record(ctx, runningTRsThrottledByQuotaCount.M(float64(trsThrottledByQuota)))
metrics.Record(ctx, runningTRsThrottledByNode.M(float64(trsThrottledByNode)))
metrics.Record(ctx, runningTRsThrottledByQuota.M(float64(trsThrottledByQuota)))

return nil
}
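The counter half of the rename can be checked end to end by recording against both measures and reading each view back by name. A rough, self-contained sketch using `view.RetrieveData` (the measure names mirror the commit; the wiring is illustrative, not repository test code):

```go
// Rough verification sketch for the counter half of the rename: the deprecated
// "taskrun_count" and the new "taskrun_total" are recorded together, so both
// report the same count. Measure names mirror the commit; the wiring here is
// illustrative, not repository test code.
package main

import (
	"context"
	"fmt"
	"log"

	"go.opencensus.io/stats"
	"go.opencensus.io/stats/view"
)

func main() {
	trCount := stats.Float64("taskrun_count", "number of taskruns", stats.UnitDimensionless)
	trTotal := stats.Float64("taskrun_total", "Number of taskruns", stats.UnitDimensionless)

	if err := view.Register(
		&view.View{Description: trCount.Description(), Measure: trCount, Aggregation: view.Count()},
		&view.View{Description: trTotal.Description(), Measure: trTotal, Aggregation: view.Count()},
	); err != nil {
		log.Fatal(err)
	}

	// Simulate three finished TaskRuns: each one bumps both counters.
	for i := 0; i < 3; i++ {
		stats.Record(context.Background(), trCount.M(1), trTotal.M(1))
	}

	// Read both views back by name; each should report a count of 3.
	for _, name := range []string{"taskrun_count", "taskrun_total"} {
		rows, err := view.RetrieveData(name)
		if err != nil {
			log.Fatal(err)
		}
		for _, row := range rows {
			fmt.Printf("%s = %d\n", name, row.Data.(*view.CountData).Value)
		}
	}
}
```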
