Skip to content

Commit

Permalink
chore: improve process hang visibility (#16485)
Browse files Browse the repository at this point in the history
Signed-off-by: Michael Tinker <[email protected]>
  • Loading branch information
tinker-michaelj authored Nov 7, 2024
1 parent 2601146 commit 700e95e
Show file tree
Hide file tree
Showing 7 changed files with 187 additions and 47 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@
import com.hedera.node.app.spi.metrics.StoreMetricsService;
import com.hedera.node.config.testfixtures.HederaTestConfigBuilder;
import com.hedera.pbj.runtime.io.buffer.Bytes;
import com.swirlds.config.api.Configuration;
import com.swirlds.platform.system.address.Address;
import com.swirlds.platform.test.fixtures.addressbook.RandomAddressBookBuilder;
import com.swirlds.state.spi.ReadableStates;
Expand Down Expand Up @@ -92,6 +93,7 @@ public class AddressBookTestBase {
KEY_BUILDER.apply(B_NAME).build(),
A_COMPLEX_KEY)))
.build();
public static final Configuration DEFAULT_CONFIG = HederaTestConfigBuilder.createConfig();
protected final Key key = A_COMPLEX_KEY;
protected final Key anotherKey = B_COMPLEX_KEY;

Expand Down Expand Up @@ -173,8 +175,7 @@ protected void refreshStoresWithCurrentNodeInReadable() {
given(readableStates.<EntityNumber, Node>get(NODES_KEY)).willReturn(readableNodeState);
given(writableStates.<EntityNumber, Node>get(NODES_KEY)).willReturn(writableNodeState);
readableStore = new ReadableNodeStoreImpl(readableStates);
final var configuration = HederaTestConfigBuilder.createConfig();
writableStore = new WritableNodeStore(writableStates, configuration, storeMetricsService);
writableStore = new WritableNodeStore(writableStates, DEFAULT_CONFIG, storeMetricsService);
}

protected void refreshStoresWithCurrentNodeInBothReadableAndWritable() {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,7 @@
import java.security.cert.CertificateEncodingException;
import java.security.cert.X509Certificate;
import java.util.List;
import org.junit.jupiter.api.BeforeAll;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.DisplayName;
import org.junit.jupiter.api.Test;
Expand All @@ -88,17 +89,21 @@ class NodeUpdateHandlerTest extends AddressBookTestBase {

private TransactionBody txn;
private NodeUpdateHandler subject;
private List<X509Certificate> certList;
private static List<X509Certificate> certList;

@BeforeAll
static void beforeAll() {
certList = generateX509Certificates(3);
}

@BeforeEach
void setUp() {
final var addressBookValidator = new AddressBookValidator();
subject = new NodeUpdateHandler(addressBookValidator);
certList = generateX509Certificates(3);
}

@Test
@DisplayName("pureChecks fail when nodeId is negagive")
@DisplayName("pureChecks fail when nodeId is negative")
void nodeIdCannotNegative() {
txn = new NodeUpdateBuilder().build();
final var msg = assertThrows(PreCheckException.class, () -> subject.pureChecks(txn));
Expand Down Expand Up @@ -142,7 +147,7 @@ void pureCheckPass() throws CertificateEncodingException {
}

@Test
void nodetIdMustInState() {
void nodeIdMustInState() {
txn = new NodeUpdateBuilder().withNodeId(2L).build();
given(handleContext.body()).willReturn(txn);
refreshStoresWithCurrentNodeInWritable();
Expand Down Expand Up @@ -399,14 +404,13 @@ void nothingHappensIfUpdateHasNoop() {
}

@Test
void preHandleWorksWhenAdminKeyValid() throws PreCheckException {
void preHandleRequiresAdminKeySigForNonAddressBookAdmin() throws PreCheckException {
txn = new NodeUpdateBuilder()
.withNodeId(nodeId.number())
.withAccountId(asAccount(53))
.withAdminKey(key)
.build();
final var context = setupPreHandle(true, txn);

subject.preHandle(context);
assertThat(txn).isEqualTo(context.body());
assertThat(context.payerKey()).isEqualTo(anotherKey);
Expand Down Expand Up @@ -515,19 +519,24 @@ private void setupHandle() {

private PreHandleContext setupPreHandle(boolean updateAccountIdAllowed, TransactionBody txn)
throws PreCheckException {
return setupPreHandle(updateAccountIdAllowed, txn, payerId);
}

private PreHandleContext setupPreHandle(
boolean updateAccountIdAllowed, TransactionBody txn, AccountID contextPayerId) throws PreCheckException {
final var config = HederaTestConfigBuilder.create()
.withValue("nodes.updateAccountIdAllowed", updateAccountIdAllowed)
.getOrCreateConfig();
mockPayerLookup(anotherKey);
mockPayerLookup(anotherKey, contextPayerId);
final var context = new FakePreHandleContext(accountStore, txn, config);
context.registerStore(ReadableNodeStore.class, readableStore);
return context;
}

private Key mockPayerLookup(Key key) {
private Key mockPayerLookup(Key key, AccountID contextPayerId) {
final var account = mock(Account.class);
given(account.key()).willReturn(key);
given(accountStore.getAccountById(payerId)).willReturn(account);
given(accountStore.getAccountById(contextPayerId)).willReturn(account);
return key;
}

Expand All @@ -543,38 +552,40 @@ private class NodeUpdateBuilder {

private Bytes grpcCertificateHash = null;
private Key adminKey = null;
private AccountID contextPayerId = payerId;

private NodeUpdateBuilder() {}

public TransactionBody build() {
final var txnId = TransactionID.newBuilder().accountID(payerId).transactionValidStart(consensusTimestamp);
final var txnBody = NodeUpdateTransactionBody.newBuilder();
txnBody.nodeId(nodeId);
final var txnId =
TransactionID.newBuilder().accountID(contextPayerId).transactionValidStart(consensusTimestamp);
final var op = NodeUpdateTransactionBody.newBuilder();
op.nodeId(nodeId);
if (accountId != null) {
txnBody.accountId(accountId);
op.accountId(accountId);
}
if (description != null) {
txnBody.description(description);
op.description(description);
}
if (gossipEndpoint != null) {
txnBody.gossipEndpoint(gossipEndpoint);
op.gossipEndpoint(gossipEndpoint);
}
if (serviceEndpoint != null) {
txnBody.serviceEndpoint(serviceEndpoint);
op.serviceEndpoint(serviceEndpoint);
}
if (gossipCaCertificate != null) {
txnBody.gossipCaCertificate(gossipCaCertificate);
op.gossipCaCertificate(gossipCaCertificate);
}
if (grpcCertificateHash != null) {
txnBody.grpcCertificateHash(grpcCertificateHash);
op.grpcCertificateHash(grpcCertificateHash);
}
if (adminKey != null) {
txnBody.adminKey(adminKey);
op.adminKey(adminKey);
}

return TransactionBody.newBuilder()
.transactionID(txnId)
.nodeUpdate(txnBody.build())
.nodeUpdate(op.build())
.build();
}

Expand All @@ -588,6 +599,11 @@ public NodeUpdateBuilder withAccountId(final AccountID accountId) {
return this;
}

public NodeUpdateBuilder withPayerId(final AccountID overridePayerId) {
this.contextPayerId = overridePayerId;
return this;
}

public NodeUpdateBuilder withDescription(final String description) {
this.description = description;
return this;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -136,4 +136,11 @@ default String hapiSpecInfo() {
* @return the metadata for this node
*/
NodeMetadata metadata();

/**
* Dumps the threads of the node, if applicable.
*/
default void dumpThreads() {
// No-op, for remote and embedded nodes thread dumps are not applicable
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,8 @@
import static com.hedera.services.bdd.junit.hedera.subprocess.ProcessUtils.destroyAnySubProcessNodeWithId;
import static com.hedera.services.bdd.junit.hedera.subprocess.ProcessUtils.startSubProcessNodeFrom;
import static com.hedera.services.bdd.junit.hedera.subprocess.StatusLookupAttempt.newLogAttempt;
import static com.hedera.services.bdd.junit.hedera.utils.WorkingDirUtils.ERROR_REDIRECT_FILE;
import static com.hedera.services.bdd.junit.hedera.utils.WorkingDirUtils.OUTPUT_DIR;
import static com.hedera.services.bdd.junit.hedera.utils.WorkingDirUtils.recreateWorkingDir;
import static com.hedera.services.bdd.spec.HapiPropertySource.asAccount;
import static com.swirlds.platform.system.status.PlatformStatus.ACTIVE;
Expand All @@ -41,10 +43,10 @@
import com.hedera.services.bdd.junit.hedera.NodeMetadata;
import com.hedera.services.bdd.junit.hedera.subprocess.NodeStatus.BindExceptionSeen;
import com.hedera.services.bdd.suites.regression.system.LifecycleTest;
import com.swirlds.base.function.BooleanFunction;
import com.swirlds.platform.system.status.PlatformStatus;
import edu.umd.cs.findbugs.annotations.NonNull;
import edu.umd.cs.findbugs.annotations.Nullable;
import java.io.File;
import java.io.IOException;
import java.io.UncheckedIOException;
import java.nio.file.Files;
Expand Down Expand Up @@ -163,19 +165,28 @@ public CompletableFuture<Void> stopFuture() {
return CompletableFuture.completedFuture(null);
}
log.info(
"Destroying node '{}' with PID '{}' (Alive? {})",
"Destroying node{} with PID '{}' (Alive? {})",
metadata.nodeId(),
processHandle.pid(),
processHandle.isAlive() ? "Yes" : "No");
if (!processHandle.destroy()) {
log.warn("May have failed to stop node '{}' with PID '{}'", metadata.nodeId(), processHandle.pid());
log.warn("May have failed to stop node{} with PID '{}'", metadata.nodeId(), processHandle.pid());
}
return processHandle.onExit().thenAccept(handle -> {
log.info("Destroyed PID {}", handle.pid());
this.processHandle = null;
});
}

@Override
public void dumpThreads() {
try {
triggerThreadDump();
} catch (Exception e) {
log.warn("Unable to dump threads for node{}", metadata.nodeId(), e);
}
}

@Override
public String toString() {
return "SubProcessNode{" + "metadata=" + metadata + ", workingDirInitialized=" + workingDirInitialized + '}';
Expand All @@ -198,7 +209,7 @@ public SubProcessNode startWithConfigVersion(final int configVersion) {
* Reassigns the ports used by this node.
*
* @param grpcPort the new gRPC port
* @param grpcNodeOperatorPort the new gRPC node operator port
* @param grpcNodeOperatorPort the new gRPC node operator port
* @param gossipPort the new gossip port
* @param tlsGossipPort the new TLS gossip port
* @param prometheusPort the new Prometheus port
Expand All @@ -214,6 +225,7 @@ public void reassignPorts(

/**
* Reassigns the account ID used by this node.
*
* @param memo the memo containing the new account ID to use
*/
public void reassignNodeAccountIdFrom(@NonNull final String memo) {
Expand All @@ -228,15 +240,6 @@ private boolean swirldsLogContains(@NonNull final String text) {
}
}

private boolean stopWith(@NonNull final BooleanFunction<ProcessHandle> stop) {
if (processHandle == null) {
return false;
}
final var result = stop.apply(processHandle);
processHandle = null;
return result;
}

private void assertStopped() {
if (processHandle != null) {
throw new IllegalStateException("Node is still running");
Expand All @@ -257,4 +260,29 @@ public enum ReassignPorts {
YES,
NO
}

private void triggerThreadDump() throws IOException {
final var javaHome = System.getProperty("java.home");
var jcmdPath = javaHome + File.separator + "bin" + File.separator + "jcmd";
if (System.getProperty("os.name").toLowerCase().contains("win")) {
jcmdPath += ".exe";
}
final long pid = requireNonNull(processHandle).pid();
final var errorRedirectPath =
metadata.workingDirOrThrow().resolve(OUTPUT_DIR).resolve(ERROR_REDIRECT_FILE);
final var processBuilder = new ProcessBuilder(jcmdPath, Long.toString(pid), "Thread.print")
.redirectOutput(errorRedirectPath.toFile())
.redirectErrorStream(true);
final var jcmd = processBuilder.start();
final int exitCode;
try {
exitCode = jcmd.waitFor();
} catch (InterruptedException e) {
Thread.currentThread().interrupt();
throw new IOException(e);
}
if (exitCode != 0) {
throw new IOException("jcmd exited with code " + exitCode);
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -81,8 +81,6 @@ default void registerOrCreateWith(@NonNull final HapiSpec spec) {
*
* <p>Always use with <code>&#64;LeakyHapiTest(requirement = NO_CONCURRENT_CREATIONS)</code>.
*
* <p>(See, e.g., {@link com.hedera.services.bdd.suites.contract.opcodes.SelfDestructSuite#selfDestructedContractIsDeletedInSameTx(String,SpecAccount,SpecContract,SpecContract)}.)
*
* @param spec the spec to use to create an entity if it is not already created
* @param entities - the entities to create, in order
*/
Expand Down
Loading

0 comments on commit 700e95e

Please sign in to comment.