diff --git a/docs/metrics.md b/docs/metrics.md
index b0b8ab75e2f..9f9ef52e057 100644
--- a/docs/metrics.md
+++ b/docs/metrics.md
@@ -15,14 +15,24 @@ We expose several kinds of exporters, including Prometheus, Google Stackdriver,
 | ---------- | ----------- | ----------- | ----------- |
 | `tekton_pipelines_controller_pipelinerun_duration_seconds_[bucket, sum, count]` | Histogram/LastValue(Gauge) | `*pipeline`=<pipeline_name> <br> `*pipelinerun`=<pipelinerun_name> <br> `status`=<status> <br> `namespace`=<pipelinerun-namespace> | experimental |
 | `tekton_pipelines_controller_pipelinerun_taskrun_duration_seconds_[bucket, sum, count]` | Histogram/LastValue(Gauge) | `*pipeline`=<pipeline_name> <br> `*pipelinerun`=<pipelinerun_name> <br> `status`=<status> <br> `*task`=<task_name> <br> `*taskrun`=<taskrun_name> <br> `namespace`=<pipelineruns-taskruns-namespace> | experimental |
-| `tekton_pipelines_controller_pipelinerun_count` | Counter | `status`=<status> | experimental |
-| `tekton_pipelines_controller_running_pipelineruns_count` | Gauge | | experimental |
+| `tekton_pipelines_controller_pipelinerun_count` | Counter | `status`=<status> | deprecated |
+| `tekton_pipelines_controller_pipelinerun_total` | Counter | `status`=<status> | experimental |
+| `tekton_pipelines_controller_running_pipelineruns_count` | Gauge | | deprecated |
+| `tekton_pipelines_controller_running_pipelineruns` | Gauge | | experimental |
 | `tekton_pipelines_controller_taskrun_duration_seconds_[bucket, sum, count]` | Histogram/LastValue(Gauge) | `status`=<status> <br> `*task`=<task_name> <br> `*taskrun`=<taskrun_name> <br> `namespace`=<pipelineruns-taskruns-namespace> | experimental |
 | `tekton_pipelines_controller_taskrun_count` | Counter | `status`=<status> | experimental |
 | `tekton_pipelines_controller_running_taskruns_count` | Gauge | | experimental |
 | `tekton_pipelines_controller_running_taskruns_throttled_by_quota_count` | Gauge | | experimental |
 | `tekton_pipelines_controller_running_taskruns_throttled_by_node_count` | Gauge | | experimental |
 | `tekton_pipelines_controller_taskruns_pod_latency_milliseconds` | Gauge | `namespace`=<taskruns-namespace> <br> `pod`=<taskrun_pod_name> <br> `*task`=<task_name> <br> `*taskrun`=<taskrun_name> | experimental |
+| `tekton_pipelines_controller_taskrun_count` | Counter | `status`=<status> | deprecated |
+| `tekton_pipelines_controller_taskrun_total` | Counter | `status`=<status> | experimental |
+| `tekton_pipelines_controller_running_taskruns_count` | Gauge | | deprecated |
+| `tekton_pipelines_controller_running_taskruns` | Gauge | | experimental |
+| `tekton_pipelines_controller_running_taskruns_throttled_by_quota_count` | Gauge | | deprecated |
+| `tekton_pipelines_controller_running_taskruns_throttled_by_node_count` | Gauge | | deprecated |
+| `tekton_pipelines_controller_running_taskruns_throttled_by_quota` | Gauge | | experimental |
+| `tekton_pipelines_controller_running_taskruns_throttled_by_node` | Gauge | | experimental |
 | `tekton_pipelines_controller_client_latency_[bucket, sum, count]` | Histogram | | experimental |
 
 The Labels/Tag marked as "*" are optional. And there's a choice between Histogram and LastValue(Gauge) for pipelinerun and taskrun duration metrics.
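The table above distinguishes Counter metrics (e.g. `pipelinerun_total`), which only ever grow, from Gauge metrics (e.g. `running_pipelineruns`), which report the current level. In OpenCensus terms these map to `view.Count()` and `view.LastValue()` aggregations over `Float64` measures, which is exactly how the Go changes below wire them up. The following is a minimal, standalone sketch of that distinction; the metric names (`example_run_total`, `example_running_runs`) are illustrative assumptions, and the real controller registers its views through `knative.dev/pkg/metrics` rather than reading them back directly.

```go
package main

import (
	"context"
	"fmt"
	"log"

	"go.opencensus.io/stats"
	"go.opencensus.io/stats/view"
)

var (
	// Counter-style measure: every Record call adds one observation,
	// and view.Count() reports how many observations were made.
	runTotal = stats.Float64("example_run_total", "Number of runs", stats.UnitDimensionless)

	// Gauge-style measure: view.LastValue() keeps only the most recent
	// recorded value, e.g. "how many runs are executing right now".
	runningRuns = stats.Float64("example_running_runs", "Number of runs executing currently", stats.UnitDimensionless)
)

func main() {
	err := view.Register(
		&view.View{Measure: runTotal, Description: runTotal.Description(), Aggregation: view.Count()},
		&view.View{Measure: runningRuns, Description: runningRuns.Description(), Aggregation: view.LastValue()},
	)
	if err != nil {
		log.Fatal(err)
	}

	ctx := context.Background()

	// Three completed runs bump the counter three times.
	for i := 0; i < 3; i++ {
		stats.Record(ctx, runTotal.M(1))
	}
	// The gauge is overwritten on each record; only the last value (2) survives.
	stats.Record(ctx, runningRuns.M(5))
	stats.Record(ctx, runningRuns.M(2))

	for _, name := range []string{"example_run_total", "example_running_runs"} {
		rows, err := view.RetrieveData(name)
		if err != nil {
			log.Fatal(err)
		}
		for _, row := range rows {
			fmt.Printf("%s: %v\n", name, row.Data)
		}
	}
}
```

Running this should print a count of 3 for the counter view and a last value of 2 for the gauge view, mirroring how `pipelinerun_total` and `running_pipelineruns` behave.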
diff --git a/pkg/pipelinerunmetrics/metrics.go b/pkg/pipelinerunmetrics/metrics.go
index f89a5b46e06..b9b68afaf59 100644
--- a/pkg/pipelinerunmetrics/metrics.go
+++ b/pkg/pipelinerunmetrics/metrics.go
@@ -55,10 +55,20 @@ var (
 		stats.UnitDimensionless)
 	prCountView *view.View
 
+	prTotal = stats.Float64("pipelinerun_total",
+		"Number of pipelineruns",
+		stats.UnitDimensionless)
+	prTotalView *view.View
+
 	runningPRsCount = stats.Float64("running_pipelineruns_count",
 		"Number of pipelineruns executing currently",
 		stats.UnitDimensionless)
 	runningPRsCountView *view.View
+
+	runningPRs = stats.Float64("running_pipelineruns",
+		"Number of pipelineruns executing currently",
+		stats.UnitDimensionless)
+	runningPRsView *view.View
 )
 
 const (
@@ -161,15 +171,29 @@ func viewRegister(cfg *config.Metrics) error {
 		Aggregation: view.LastValue(),
 	}
 
+	prTotalView = &view.View{
+		Description: prTotal.Description(),
+		Measure:     prTotal,
+		Aggregation: view.Count(),
+		TagKeys:     []tag.Key{statusTag},
+	}
+	runningPRsView = &view.View{
+		Description: runningPRs.Description(),
+		Measure:     runningPRs,
+		Aggregation: view.LastValue(),
+	}
+
 	return view.Register(
 		prDurationView,
 		prCountView,
 		runningPRsCountView,
+		prTotalView,
+		runningPRsView,
 	)
 }
 
 func viewUnregister() {
-	view.Unregister(prDurationView, prCountView, runningPRsCountView)
+	view.Unregister(prDurationView, prCountView, runningPRsCountView, prTotalView, runningPRsView)
 }
 
 // MetricsOnStore returns a function that checks if metrics are configured for a config.Store, and registers it if so
@@ -252,6 +276,7 @@ func (r *Recorder) DurationAndCount(pr *v1.PipelineRun, beforeCondition *apis.Co
 
 	metrics.Record(ctx, prDuration.M(duration.Seconds()))
 	metrics.Record(ctx, prCount.M(1))
+	metrics.Record(ctx, prTotal.M(1))
 
 	return nil
 }
@@ -271,10 +296,10 @@ func (r *Recorder) RunningPipelineRuns(lister listers.PipelineRunLister) error {
 		return fmt.Errorf("failed to list pipelineruns while generating metrics : %w", err)
 	}
 
-	var runningPRs int
+	var runningPipelineRuns int
 	for _, pr := range prs {
 		if !pr.IsDone() {
-			runningPRs++
+			runningPipelineRuns++
 		}
 	}
 
@@ -282,7 +307,8 @@ func (r *Recorder) RunningPipelineRuns(lister listers.PipelineRunLister) error {
 	if err != nil {
 		return err
 	}
-	metrics.Record(ctx, runningPRsCount.M(float64(runningPRs)))
+	metrics.Record(ctx, runningPRsCount.M(float64(runningPipelineRuns)))
+	metrics.Record(ctx, runningPRs.M(float64(runningPipelineRuns)))
 
 	return nil
 }
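Note that the `runningPRs` measure is registered as `running_pipelineruns` (matching the docs table), not as a second copy of `running_pipelineruns_count`; reusing the old name would collide with `runningPRsCount` at view registration and the new metric would never be exported. The recorder keeps the deprecated measure and its replacement side by side, recording the same value to both, so existing dashboards keep working while they migrate to the new names. Below is a hedged, self-contained sketch of that dual-recording pattern using plain OpenCensus calls instead of the controller's `metrics.Record` wrapper; `recordRunning` and the unprefixed metric names are illustrative only.

```go
package main

import (
	"context"
	"log"

	"go.opencensus.io/stats"
	"go.opencensus.io/stats/view"
)

var (
	// Deprecated name, kept so existing dashboards and alerts keep working.
	runningCountDeprecated = stats.Float64("running_pipelineruns_count",
		"Number of pipelineruns executing currently", stats.UnitDimensionless)
	// Replacement name without the redundant _count suffix.
	running = stats.Float64("running_pipelineruns",
		"Number of pipelineruns executing currently", stats.UnitDimensionless)
)

// recordRunning writes the same gauge value under both names during the
// deprecation window, mirroring how RunningPipelineRuns records
// runningPRsCount and runningPRs side by side.
func recordRunning(ctx context.Context, n int) {
	stats.Record(ctx,
		runningCountDeprecated.M(float64(n)),
		running.M(float64(n)))
}

func main() {
	if err := view.Register(
		&view.View{Measure: runningCountDeprecated, Description: runningCountDeprecated.Description(), Aggregation: view.LastValue()},
		&view.View{Measure: running, Description: running.Description(), Aggregation: view.LastValue()},
	); err != nil {
		log.Fatal(err)
	}
	recordRunning(context.Background(), 7)
}
```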
diff --git a/pkg/taskrunmetrics/metrics.go b/pkg/taskrunmetrics/metrics.go
index 473b8f4c604..94ad722d87d 100644
--- a/pkg/taskrunmetrics/metrics.go
+++ b/pkg/taskrunmetrics/metrics.go
@@ -55,9 +55,13 @@ var (
 	trDurationView                      *view.View
 	prTRDurationView                    *view.View
 	trCountView                         *view.View
+	trTotalView                         *view.View
 	runningTRsCountView                 *view.View
+	runningTRsView                      *view.View
 	runningTRsThrottledByQuotaCountView *view.View
 	runningTRsThrottledByNodeCountView  *view.View
+	runningTRsThrottledByQuotaView      *view.View
+	runningTRsThrottledByNodeView       *view.View
 	podLatencyView                      *view.View
 
 	trDuration = stats.Float64(
@@ -74,10 +78,18 @@ var (
 		"number of taskruns",
 		stats.UnitDimensionless)
 
+	trTotal = stats.Float64("taskrun_total",
+		"Number of taskruns",
+		stats.UnitDimensionless)
+
 	runningTRsCount = stats.Float64("running_taskruns_count",
 		"Number of taskruns executing currently",
 		stats.UnitDimensionless)
 
+	runningTRs = stats.Float64("running_taskruns",
+		"Number of taskruns executing currently",
+		stats.UnitDimensionless)
+
 	runningTRsThrottledByQuotaCount = stats.Float64("running_taskruns_throttled_by_quota_count",
 		"Number of taskruns executing currently, but whose underlying Pods or Containers are suspended by k8s because of defined ResourceQuotas. Such suspensions can occur as part of initial scheduling of the Pod, or scheduling of any of the subsequent Container(s) in the Pod after the first Container is started",
 		stats.UnitDimensionless)
@@ -86,7 +98,15 @@ var (
 		"Number of taskruns executing currently, but whose underlying Pods or Containers are suspended by k8s because of Node level constraints. Such suspensions can occur as part of initial scheduling of the Pod, or scheduling of any of the subsequent Container(s) in the Pod after the first Container is started",
 		stats.UnitDimensionless)
 
-	podLatency = stats.Float64("taskruns_pod_latency_milliseconds",
+	runningTRsThrottledByQuota = stats.Float64("running_taskruns_throttled_by_quota",
+		"Number of taskruns executing currently, but whose underlying Pods or Containers are suspended by k8s because of defined ResourceQuotas. Such suspensions can occur as part of initial scheduling of the Pod, or scheduling of any of the subsequent Container(s) in the Pod after the first Container is started",
+		stats.UnitDimensionless)
+
+	runningTRsThrottledByNode = stats.Float64("running_taskruns_throttled_by_node",
+		"Number of taskruns executing currently, but whose underlying Pods or Containers are suspended by k8s because of Node level constraints. Such suspensions can occur as part of initial scheduling of the Pod, or scheduling of any of the subsequent Container(s) in the Pod after the first Container is started",
+		stats.UnitDimensionless)
+
+	podLatency = stats.Float64("taskruns_pod_latency",
 		"scheduling latency for the taskruns pods",
 		stats.UnitMilliseconds)
 )
@@ -204,11 +224,22 @@ func viewRegister(cfg *config.Metrics) error {
 		Aggregation: view.Count(),
 		TagKeys:     []tag.Key{statusTag},
 	}
+	trTotalView = &view.View{
+		Description: trTotal.Description(),
+		Measure:     trTotal,
+		Aggregation: view.Count(),
+		TagKeys:     []tag.Key{statusTag},
+	}
 	runningTRsCountView = &view.View{
 		Description: runningTRsCount.Description(),
 		Measure:     runningTRsCount,
 		Aggregation: view.LastValue(),
 	}
+	runningTRsView = &view.View{
+		Description: runningTRs.Description(),
+		Measure:     runningTRs,
+		Aggregation: view.LastValue(),
+	}
 	runningTRsThrottledByQuotaCountView = &view.View{
 		Description: runningTRsThrottledByQuotaCount.Description(),
 		Measure:     runningTRsThrottledByQuotaCount,
@@ -219,6 +250,16 @@ func viewRegister(cfg *config.Metrics) error {
 		Measure:     runningTRsThrottledByNodeCount,
 		Aggregation: view.LastValue(),
 	}
+	runningTRsThrottledByQuotaView = &view.View{
+		Description: runningTRsThrottledByQuota.Description(),
+		Measure:     runningTRsThrottledByQuota,
+		Aggregation: view.LastValue(),
+	}
+	runningTRsThrottledByNodeView = &view.View{
+		Description: runningTRsThrottledByNode.Description(),
+		Measure:     runningTRsThrottledByNode,
+		Aggregation: view.LastValue(),
+	}
 	podLatencyView = &view.View{
 		Description: podLatency.Description(),
 		Measure:     podLatency,
@@ -229,9 +270,13 @@ func viewRegister(cfg *config.Metrics) error {
 		trDurationView,
 		prTRDurationView,
 		trCountView,
+		trTotalView,
 		runningTRsCountView,
+		runningTRsView,
 		runningTRsThrottledByQuotaCountView,
 		runningTRsThrottledByNodeCountView,
+		runningTRsThrottledByQuotaView,
+		runningTRsThrottledByNodeView,
 		podLatencyView,
 	)
 }
@@ -241,9 +286,13 @@ func viewUnregister() {
 		trDurationView,
 		prTRDurationView,
 		trCountView,
+		trTotalView,
 		runningTRsCountView,
+		runningTRsView,
 		runningTRsThrottledByQuotaCountView,
 		runningTRsThrottledByNodeCountView,
+		runningTRsThrottledByQuotaView,
+		runningTRsThrottledByNodeView,
 		podLatencyView,
 	)
 }
@@ -336,6 +385,7 @@ func (r *Recorder) DurationAndCount(ctx context.Context, tr *v1.TaskRun, beforeC
 
 	metrics.Record(ctx, durationStat.M(duration.Seconds()))
 	metrics.Record(ctx, trCount.M(1))
+	metrics.Record(ctx, trTotal.M(1))
 
 	return nil
 }
@@ -379,8 +429,11 @@ func (r *Recorder) RunningTaskRuns(ctx context.Context, lister listers.TaskRunLi
 		return err
 	}
 	metrics.Record(ctx, runningTRsCount.M(float64(runningTrs)))
+	metrics.Record(ctx, runningTRs.M(float64(runningTrs)))
 	metrics.Record(ctx, runningTRsThrottledByNodeCount.M(float64(trsThrottledByNode)))
 	metrics.Record(ctx, runningTRsThrottledByQuotaCount.M(float64(trsThrottledByQuota)))
+	metrics.Record(ctx, runningTRsThrottledByNode.M(float64(trsThrottledByNode)))
+	metrics.Record(ctx, runningTRsThrottledByQuota.M(float64(trsThrottledByQuota)))
 
 	return nil
 }
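Both of the new counters (`pipelinerun_total` and `taskrun_total`) keep the `status` tag key on their views, so exporters emit one time series per status value. The sketch below shows how a status tag attached to the recording context splits a counter into per-status rows; `example_taskrun_total` and the inline status values are assumptions for illustration, and the controller itself derives the tag from the run's condition and records through `knative.dev/pkg/metrics`.

```go
package main

import (
	"context"
	"fmt"
	"log"

	"go.opencensus.io/stats"
	"go.opencensus.io/stats/view"
	"go.opencensus.io/tag"
)

var (
	statusKey = tag.MustNewKey("status")

	// Counter with a status tag, analogous to taskrun_total / pipelinerun_total.
	total = stats.Float64("example_taskrun_total", "Number of taskruns", stats.UnitDimensionless)
)

func main() {
	if err := view.Register(&view.View{
		Measure:     total,
		Description: total.Description(),
		Aggregation: view.Count(),
		TagKeys:     []tag.Key{statusKey},
	}); err != nil {
		log.Fatal(err)
	}

	// Each Record call carries the status as a tag, so the exporter sees
	// one series per status value (success, failed, ...).
	for _, status := range []string{"success", "success", "failed"} {
		ctx, err := tag.New(context.Background(), tag.Upsert(statusKey, status))
		if err != nil {
			log.Fatal(err)
		}
		stats.Record(ctx, total.M(1))
	}

	rows, err := view.RetrieveData("example_taskrun_total")
	if err != nil {
		log.Fatal(err)
	}
	for _, row := range rows {
		fmt.Printf("tags=%v data=%v\n", row.Tags, row.Data)
	}
}
```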