From 7855c1fe07961a4c3999edf9776d60326ac35337 Mon Sep 17 00:00:00 2001 From: Ayush Saxena Date: Fri, 1 Sep 2023 14:46:23 +0530 Subject: [PATCH] TEZ-4506: Report the node of a task attempt failure better. (#307) (Ayush Saxena reviewed by Laszlo Bodor) --- /* NOTE(review): Summary: this patch prefixes the diagnostics string built in TaskReporter.taskTerminated with "Node: " + InetAddress.getLocalHost() + " : " on both the null and non-null diagnostics branches, and adds the test helper getDiagnosticsWithoutNodeIp, which strips that prefix before verifyTaskFailedEvent / verifyTaskKilledEvent run their startsWith / contains checks. (1) InetAddress.getLocalHost() is documented to throw UnknownHostException, and in the hunk it is evaluated after finalEventQueued.getAndSet(true) has already run; presumably a failed lookup would then leave the terminal event unqueued while the flag stays set -- TODO confirm against the full method, which this patch does not show. (2) getDiagnosticsWithoutNodeIp takes substring(5, indexOf(" : ")), which keeps the space that follows "Node:", so Assert.assertFalse(nodeIp.isEmpty()) cannot fail for a prefix of the form "Node: " + x + " : " even when x is empty; and when the diagnostics start with "Node:" but contain no " : ", indexOf returns -1, so the first substring silently drops only two characters and substring(5, -1) throws StringIndexOutOfBoundsException instead of producing a readable test failure. (3) InetAddress.toString() is documented as "hostname/literal-IP", so the stripped value is a host/address pair rather than only an IP, despite the helper name. (4) This copy of the patch has its line breaks collapsed, and "List tezEvents = new ArrayList()" looks like it lost its generic type arguments in extraction -- TODO regenerate the patch from the commit before applying it. */ .../apache/tez/runtime/task/TaskReporter.java | 6 ++-- .../task/TaskExecutionTestHelpers.java | 31 ++++++++++++++----- 2 files changed, 27 insertions(+), 10 deletions(-) diff --git a/tez-runtime-internals/src/main/java/org/apache/tez/runtime/task/TaskReporter.java b/tez-runtime-internals/src/main/java/org/apache/tez/runtime/task/TaskReporter.java index 81047a9f56..99d8bbca47 100644 --- a/tez-runtime-internals/src/main/java/org/apache/tez/runtime/task/TaskReporter.java +++ b/tez-runtime-internals/src/main/java/org/apache/tez/runtime/task/TaskReporter.java @@ -21,6 +21,7 @@ import java.io.IOException; import java.lang.management.ManagementFactory; import java.lang.management.MemoryMXBean; +import java.net.InetAddress; import java.util.ArrayList; import java.util.Collection; import java.util.List; @@ -401,9 +402,10 @@ private boolean taskTerminated(TezTaskAttemptID taskAttemptID, boolean isKilled, if (!finalEventQueued.getAndSet(true)) { List tezEvents = new ArrayList(); if (diagnostics == null) { - diagnostics = ExceptionUtils.getStackTrace(t); + diagnostics = "Node: " + InetAddress.getLocalHost() + " : " + ExceptionUtils.getStackTrace(t); } else { - diagnostics = diagnostics + ":" + ExceptionUtils.getStackTrace(t); + diagnostics = + "Node: " + InetAddress.getLocalHost() + " : " + diagnostics + ":" + ExceptionUtils.getStackTrace(t); } if (isKilled) { tezEvents.add(new TezEvent(new TaskAttemptKilledEvent(diagnostics), diff --git a/tez-runtime-internals/src/test/java/org/apache/tez/runtime/task/TaskExecutionTestHelpers.java b/tez-runtime-internals/src/test/java/org/apache/tez/runtime/task/TaskExecutionTestHelpers.java index b6000ccae9..3e6790c6cf 100644 --- 
a/tez-runtime-internals/src/test/java/org/apache/tez/runtime/task/TaskExecutionTestHelpers.java +++ b/tez-runtime-internals/src/test/java/org/apache/tez/runtime/task/TaskExecutionTestHelpers.java @@ -47,6 +47,8 @@ import org.apache.tez.runtime.api.impl.TezEvent; import org.apache.tez.runtime.api.impl.TezHeartbeatRequest; import org.apache.tez.runtime.api.impl.TezHeartbeatResponse; + +import org.junit.Assert; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -396,19 +398,20 @@ public void verifyTaskFailedEvent(String diagStart, String diagContains, TaskFai for (TezEvent event : requestEvents) { if (event.getEvent() instanceof TaskAttemptFailedEvent) { TaskAttemptFailedEvent failedEvent = (TaskAttemptFailedEvent) event.getEvent(); - if (failedEvent.getDiagnostics().startsWith(diagStart)) { + String diagnostics = getDiagnosticsWithoutNodeIp(failedEvent.getDiagnostics()); + if (diagnostics.startsWith(diagStart)) { if (diagContains != null) { - if (failedEvent.getDiagnostics().contains(diagContains)) { + if (diagnostics.contains(diagContains)) { assertEquals(taskFailureType, failedEvent.getTaskFailureType()); return; } else { fail("Diagnostic message does not contain expected message. Found [" + - failedEvent.getDiagnostics() + "], Expected: [" + diagContains + "]"); + diagnostics + "], Expected: [" + diagContains + "]"); } } } else { fail("Diagnostic message does not start with expected message. 
Found [" + - failedEvent.getDiagnostics() + "], Expected: [" + diagStart + "]"); + diagnostics + "], Expected: [" + diagStart + "]"); } } } @@ -425,18 +428,19 @@ public void verifyTaskKilledEvent(String diagStart, String diagContains) { if (event.getEvent() instanceof TaskAttemptKilledEvent) { TaskAttemptKilledEvent killedEvent = (TaskAttemptKilledEvent) event.getEvent(); - if (killedEvent.getDiagnostics().startsWith(diagStart)) { + String diagnostics = getDiagnosticsWithoutNodeIp(killedEvent.getDiagnostics()); + if (diagnostics.startsWith(diagStart)) { if (diagContains != null) { - if (killedEvent.getDiagnostics().contains(diagContains)) { + if (diagnostics.contains(diagContains)) { return; } else { fail("Diagnostic message does not contain expected message. Found [" + - killedEvent.getDiagnostics() + "], Expected: [" + diagContains + "]"); + diagnostics + "], Expected: [" + diagContains + "]"); } } } else { fail("Diagnostic message does not start with expected message. Found [" + - killedEvent.getDiagnostics() + "], Expected: [" + diagStart + "]"); + diagnostics + "], Expected: [" + diagStart + "]"); } } } @@ -518,6 +522,17 @@ public int getTaskInvocations() { } } + private static String getDiagnosticsWithoutNodeIp(String diagnostics) { + String diagnosticsWithoutIP = diagnostics; + if (diagnostics != null && diagnostics.startsWith("Node:")) { + diagnosticsWithoutIP = diagnostics.substring(diagnostics.indexOf(" : ") + 3); + String nodeIp = diagnostics.substring(5, diagnostics.indexOf(" : ")); + Assert.assertFalse(nodeIp.isEmpty()); + } + + return diagnosticsWithoutIP; + } + @SuppressWarnings("deprecation") public static ContainerId createContainerId(ApplicationId appId) { ApplicationAttemptId appAttemptId = ApplicationAttemptId.newInstance(appId, 1);