Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Record the time between a task being triggered and it being successful run #217

Merged
merged 10 commits into from
Dec 18, 2024
Merged
Show file tree
Hide file tree
Changes from 8 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,12 @@ All notable changes to this project will be documented in this file.
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres
to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

## 1.48.0 - 2024/12/13

### Added

- Added timer `twTasks.tasks.taskGrabbingTime` tracking the time between a task being triggered and a task being grabbed for processing

## 1.47.0 - 2024/12/13

### Added
Expand Down
2 changes: 1 addition & 1 deletion build.gradle
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import org.eclipse.jgit.api.errors.RefAlreadyExistsException
import com.github.spotbugs.snom.Confidence
import com.github.spotbugs.snom.Effort
import org.eclipse.jgit.api.errors.RefAlreadyExistsException

buildscript {
if (!project.hasProperty("springBootVersion")) {
Expand Down
2 changes: 1 addition & 1 deletion gradle.properties
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
version=1.47.0
version=1.48.0
org.gradle.internal.http.socketTimeout=120000
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,6 @@
import org.awaitility.Awaitility;
import org.junit.jupiter.api.AfterEach;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.TestInfo;
import org.junit.jupiter.api.extension.ExtendWith;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.boot.test.context.SpringBootContextLoader;
Expand Down
Original file line number Diff line number Diff line change
@@ -1,8 +1,6 @@
package com.transferwise.tasks;

import com.transferwise.tasks.ITasksService.AddTaskResponse.Result;
import com.transferwise.tasks.domain.TaskContext;
import com.transferwise.tasks.domain.TaskStatus;
import edu.umd.cs.findbugs.annotations.SuppressFBWarnings;
import java.time.Duration;
import java.time.ZonedDateTime;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@

import edu.umd.cs.findbugs.annotations.SuppressFBWarnings;
import java.time.ZonedDateTime;
import java.util.Objects;
import java.util.UUID;
import lombok.Data;
import lombok.experimental.Accessors;
Expand All @@ -24,7 +23,7 @@ public class FullTaskRecord implements ITask {
private ZonedDateTime nextEventTime;
private String processingClientId;
private TaskContext taskContext;

@Override
public ITaskVersionId getVersionId() {
return new TaskVersionId(id, version);
Expand Down
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
package com.transferwise.tasks.domain;

import edu.umd.cs.findbugs.annotations.SuppressFBWarnings;
import java.util.Objects;
import java.util.UUID;
import lombok.Data;
import lombok.experimental.Accessors;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
import io.micrometer.core.instrument.Meter;
import io.micrometer.core.instrument.Tags;
import io.micrometer.core.instrument.binder.kafka.KafkaClientMetrics;
import java.time.Instant;
import java.time.ZonedDateTime;
import java.util.List;
import java.util.Map;
Expand Down Expand Up @@ -39,6 +40,7 @@ public class CoreMetricsTemplate implements ICoreMetricsTemplate {
public static final String METRIC_TASKS_FAILED_STATUS_CHANGE_COUNT = METRIC_PREFIX + "tasks.failedStatusChangeCount";
public static final String METRIC_TASKS_DEBUG_PRIORITY_QUEUE_CHECK = METRIC_PREFIX + "tasks.debug.priorityQueueCheck";
public static final String METRIC_TASKS_TASK_GRABBING = METRIC_PREFIX + "tasks.taskGrabbing";
public static final String METRIC_TASKS_TASK_GRABBING_TIME = METRIC_PREFIX + "tasks.taskGrabbingTime";
public static final String METRIC_TASKS_DEBUG_ROOM_MAP_ALREADY_HAS_TYPE = METRIC_PREFIX + "tasks.debug.roomMapAlreadyHasType";
public static final String METRIC_TASKS_DEBUG_TASK_TRIGGERING_QUEUE_EMPTY = METRIC_PREFIX + "tasks.debug.taskTriggeringQueueEmpty";
public static final String METRIC_TASKS_DUPLICATES_COUNT = METRIC_PREFIX + "tasks.duplicatesCount";
Expand Down Expand Up @@ -179,11 +181,21 @@ public void registerFailedNextEventTimeChange(String taskType, ZonedDateTime fro
}

@Override
public void registerTaskGrabbingResponse(String bucketId, String taskType, int priority, ProcessTaskResponse processTaskResponse) {
meterCache.counter(METRIC_TASKS_TASK_GRABBING, TagsSet.of(TAG_TASK_TYPE, taskType,
TAG_BUCKET_ID, bucketId, TAG_PRIORITY, String.valueOf(priority), TAG_GRABBING_RESPONSE, processTaskResponse.getResult().name(),
TAG_GRABBING_CODE, processTaskResponse.getCode() == null ? "UNKNOWN" : processTaskResponse.getCode().name()))
.increment();
public void registerTaskGrabbingResponse(String bucketId, String taskType, int priority, ProcessTaskResponse processTaskResponse,
Instant taskTriggeredAt) {

TagsSet tags = TagsSet.of(
TAG_TASK_TYPE, taskType,
TAG_BUCKET_ID, bucketId,
TAG_PRIORITY, String.valueOf(priority),
TAG_GRABBING_RESPONSE, processTaskResponse.getResult().name(),
TAG_GRABBING_CODE, processTaskResponse.getCode() == null ? "UNKNOWN" : processTaskResponse.getCode().name()
);

meterCache.counter(METRIC_TASKS_TASK_GRABBING, tags).increment();

long millisSinceTaskTriggered = System.currentTimeMillis() - taskTriggeredAt.toEpochMilli();
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I just think if you add a sane default here, even affecting quality of metric that way, it would be safer.

We don't know, when someone changes that logic and null pops here by accident.

meterCache.timer(METRIC_TASKS_TASK_GRABBING_TIME, tags).record(millisSinceTaskTriggered, TimeUnit.MILLISECONDS);
Copy link
Contributor Author

@aaronrosser aaronrosser Dec 18, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There is one other timer currently exposed by tw tasks

meterCache.timer(METRIC_TASKS_PROCESSING_TIME, TagsSet.of(TAG_BUCKET_ID, resolvedBucketId, TAG_TASK_TYPE, taskType,
TAG_PROCESSING_RESULT, processingResult))
.record(TwContextClockHolder.getClock().millis() - processingStartTimeMs, TimeUnit.MILLISECONDS);

Cannot see it setting histogram buckets via a meter filter or anything so assume leave it to service owners to override

}

@Override
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import com.transferwise.tasks.domain.TaskStatus;
import com.transferwise.tasks.handler.interfaces.StuckDetectionSource;
import com.transferwise.tasks.processing.TasksProcessingService.ProcessTaskResponse;
import java.time.Instant;
import java.time.ZonedDateTime;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.concurrent.atomic.AtomicLong;
Expand Down Expand Up @@ -49,7 +50,7 @@ public interface ICoreMetricsTemplate {

void registerFailedNextEventTimeChange(String taskType, ZonedDateTime fromNextEventTime, ZonedDateTime toNextEventTime);

void registerTaskGrabbingResponse(String bucketId, String type, int priority, ProcessTaskResponse processTaskResponse);
void registerTaskGrabbingResponse(String bucketId, String type, int priority, ProcessTaskResponse processTaskResponse, Instant taskTriggeredAt);

void debugPriorityQueueCheck(String bucketId, int priority);

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -213,8 +213,7 @@ protected ProcessTasksResponse processTasks(GlobalProcessingState.Bucket bucket)
mdcService.put(task);
try {
ProcessTaskResponse processTaskResponse = grabTaskForProcessing(bucketId, task);

coreMetricsTemplate.registerTaskGrabbingResponse(bucketId, type, priority, processTaskResponse);
coreMetricsTemplate.registerTaskGrabbingResponse(bucketId, type, priority, processTaskResponse, taskTriggering.getTriggerAt());
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Will this cause a NPE with old version of taskTriggering DTO? One that has no triggeredAt saved yet.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Looks like TaskTriggering is initialised in two places and we set the triggeredAt with both of them.

Kafka one comes from a long from the event timestamp and other one comes from Instant.now() so think should be fine.

Would prefer to make triggeredAt final and force its initiation during constructor but seems the library exclusively uses init then set pattern so followed this

TaskTriggering taskTriggering = new TaskTriggering()
.setTask(task)
.setBucketId(processingBucketId)
.setTriggerAt(Instant.now());

TaskTriggering taskTriggering = new TaskTriggering()
.setTask(task)
.setBucketId(bucketId)
.setOffset(offset)
.setTriggerAt(Instant.ofEpochMilli(consumerRecord.timestamp()))
.setTopicPartition(topicPartition);


if (processTaskResponse.getResult() == ProcessTaskResponse.Result.NO_SPACE) {
noRoomMap.put(type, Boolean.TRUE);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@
import com.transferwise.tasks.utils.WaitUtils;
import com.vdurmont.semver4j.Semver;
import java.time.Duration;
import java.time.Instant;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
Expand Down Expand Up @@ -180,7 +181,10 @@ public void trigger(BaseTask task) {
}

if (BooleanUtils.isTrue(bucketsManager.getBucketProperties(processingBucketId).getTriggerInSameProcess())) {
TaskTriggering taskTriggering = new TaskTriggering().setTask(task).setBucketId(processingBucketId);
TaskTriggering taskTriggering = new TaskTriggering()
.setTask(task)
.setBucketId(processingBucketId)
.setTriggerAt(Instant.now());
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If task triggered in same process - ran on the pod that triggered it without kafka just use time now

ITasksProcessingService.AddTaskForProcessingResponse addTaskForProcessingResponse = tasksProcessingService.addTaskForProcessing(taskTriggering);

if (addTaskForProcessingResponse.getResult() == ITasksProcessingService.AddTaskForProcessingResponse.ResultCode.OK) {
Expand Down Expand Up @@ -284,7 +288,11 @@ public void poll(String bucketId) {
mdcService.with(() -> {
mdcService.put(task);

TaskTriggering taskTriggering = new TaskTriggering().setTask(task).setBucketId(bucketId).setOffset(offset)
TaskTriggering taskTriggering = new TaskTriggering()
.setTask(task)
.setBucketId(bucketId)
.setOffset(offset)
.setTriggerAt(Instant.ofEpochMilli(consumerRecord.timestamp()))
Copy link
Contributor Author

@aaronrosser aaronrosser Dec 18, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

.setTopicPartition(topicPartition);

coreMetricsTemplate.registerKafkaTasksExecutionTriggererTriggersReceive(bucketId);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,6 @@
import lombok.extern.slf4j.Slf4j;
import org.apache.kafka.clients.consumer.Consumer;
import org.apache.kafka.clients.consumer.ConsumerRebalanceListener;
import org.apache.kafka.clients.consumer.KafkaConsumer;
import org.apache.kafka.clients.consumer.OffsetAndTimestamp;
import org.apache.kafka.common.TopicPartition;

Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
package com.transferwise.tasks.triggering;

import com.transferwise.tasks.domain.BaseTask;
import java.time.Instant;
import lombok.Data;
import lombok.experimental.Accessors;
import org.apache.kafka.common.TopicPartition;
Expand All @@ -11,6 +12,7 @@ public class TaskTriggering {

private BaseTask task;
private long offset;
private Instant triggerAt;
private long sequence;
private TopicPartition topicPartition;
private String bucketId;
Expand Down
Loading