From 24fe1ef4ddc036dfc4b5ecabc1a5801abbb3b16c Mon Sep 17 00:00:00 2001 From: Masatake Iwasaki Date: Sun, 22 Oct 2023 22:22:56 +0900 Subject: [PATCH 01/23] HADOOP-18942 addendum. update LICENSE-binary. --- LICENSE-binary | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/LICENSE-binary b/LICENSE-binary index 5daa51efd8945..c367abdff5742 100644 --- a/LICENSE-binary +++ b/LICENSE-binary @@ -337,7 +337,7 @@ org.apache.kerby:kerby-xdr:2.0.3 org.apache.kerby:token-provider:2.0.3 org.apache.solr:solr-solrj:8.11.2 org.apache.yetus:audience-annotations:0.5.0 -org.apache.zookeeper:zookeeper:3.6.3 +org.apache.zookeeper:zookeeper:3.7.2 org.codehaus.jettison:jettison:1.5.4 org.eclipse.jetty:jetty-annotations:9.4.51.v20230217 org.eclipse.jetty:jetty-http:9.4.51.v20230217 From d7d772d6841db24ea222241bf7cd747eb3c939f3 Mon Sep 17 00:00:00 2001 From: slfan1989 <55643692+slfan1989@users.noreply.github.com> Date: Sun, 22 Oct 2023 22:22:14 +0800 Subject: [PATCH 02/23] YARN-11595. Fix hadoop-yarn-client#java.lang.NoClassDefFoundError (#6210) Contributed by Shilun Fan. Reviewed-by: Ayush Saxena Signed-off-by: Shilun Fan --- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-client/pom.xml | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-client/pom.xml b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-client/pom.xml index dbe4e9048b2df..5eedf486121a3 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-client/pom.xml +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-client/pom.xml @@ -175,6 +175,12 @@ org.jline jline + + + org.junit.jupiter + junit-jupiter-api + test + From 3e0fcda7a5f48f5b36b8def4d177573e3933b5a0 Mon Sep 17 00:00:00 2001 From: Steve Loughran Date: Mon, 23 Oct 2023 14:24:30 +0100 Subject: [PATCH 03/23] HADOOP-18945. S3A. IAMInstanceCredentialsProvider failing. (#6202) This restores asynchronous retrieval/refresh of any AWS credentials provided by the EC2 instance/container in which the process is running. 
Contributed by Steve Loughran --- .../org/apache/hadoop/fs/s3a/S3AUtils.java | 4 +- .../auth/IAMInstanceCredentialsProvider.java | 94 ++++++++++++--- .../hadoop/fs/s3a/impl/ErrorTranslation.java | 6 +- .../TestIAMInstanceCredentialsProvider.java | 107 ++++++++++++++++++ .../fs/s3a/impl/TestErrorTranslation.java | 31 ++++- 5 files changed, 220 insertions(+), 22 deletions(-) create mode 100644 hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/auth/TestIAMInstanceCredentialsProvider.java diff --git a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3AUtils.java b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3AUtils.java index 6798a99c19e1e..7466690744e82 100644 --- a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3AUtils.java +++ b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3AUtils.java @@ -80,7 +80,7 @@ import static org.apache.hadoop.fs.s3a.impl.InstantiationIOException.isNotInstanceOf; import static org.apache.hadoop.fs.s3a.impl.InstantiationIOException.unsupportedConstructor; import static org.apache.hadoop.fs.s3a.impl.InternalConstants.*; -import static org.apache.hadoop.fs.s3a.impl.ErrorTranslation.maybeExtractNetworkException; +import static org.apache.hadoop.fs.s3a.impl.ErrorTranslation.maybeExtractIOException; import static org.apache.hadoop.io.IOUtils.cleanupWithLogger; import static org.apache.hadoop.util.functional.RemoteIterators.filteringRemoteIterator; @@ -194,7 +194,7 @@ public static IOException translateException(@Nullable String operation, return ioe; } // network problems covered by an IOE inside the exception chain. - ioe = maybeExtractNetworkException(path, exception); + ioe = maybeExtractIOException(path, exception); if (ioe != null) { return ioe; } diff --git a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/auth/IAMInstanceCredentialsProvider.java b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/auth/IAMInstanceCredentialsProvider.java index 2e39b275b4a4d..080b79e7f20d5 100644 --- a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/auth/IAMInstanceCredentialsProvider.java +++ b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/auth/IAMInstanceCredentialsProvider.java @@ -21,37 +21,69 @@ import java.io.Closeable; import java.io.IOException; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; import software.amazon.awssdk.auth.credentials.AwsCredentials; import software.amazon.awssdk.auth.credentials.AwsCredentialsProvider; import software.amazon.awssdk.auth.credentials.ContainerCredentialsProvider; +import software.amazon.awssdk.auth.credentials.HttpCredentialsProvider; import software.amazon.awssdk.auth.credentials.InstanceProfileCredentialsProvider; import software.amazon.awssdk.core.exception.SdkClientException; import org.apache.hadoop.classification.InterfaceAudience; import org.apache.hadoop.classification.InterfaceStability; +import static org.apache.hadoop.fs.s3a.impl.ErrorTranslation.maybeExtractIOException; + /** * This is an IAM credential provider which wraps * an {@code ContainerCredentialsProvider} * to provide credentials when the S3A connector is instantiated on AWS EC2 * or the AWS container services. *

- * When it fails to authenticate, it raises a - * {@link NoAwsCredentialsException} which can be recognized by retry handlers + * The provider is initialized with async credential refresh enabled to be less + * brittle against transient network issues. *
+ * If the ContainerCredentialsProvider fails to authenticate, then an instance of + * {@link InstanceProfileCredentialsProvider} is created and attempted to + * be used instead, again with async credential refresh enabled. *
+ * If both credential providers fail, a {@link NoAwsCredentialsException} + * is thrown, which can be recognized by retry handlers * as a non-recoverable failure. *
* It is implicitly public; marked evolving as we can change its semantics. - * */ @InterfaceAudience.Public @InterfaceStability.Evolving public class IAMInstanceCredentialsProvider implements AwsCredentialsProvider, Closeable { - private final AwsCredentialsProvider containerCredentialsProvider = - ContainerCredentialsProvider.builder().build(); + private static final Logger LOG = + LoggerFactory.getLogger(IAMInstanceCredentialsProvider.class); + + /** + * The credentials provider. + * Initially a container credentials provider, but if that fails + * fall back to the instance profile provider. + */ + private HttpCredentialsProvider iamCredentialsProvider; + + /** + * Is the container credentials provider in use? + */ + private boolean isContainerCredentialsProvider; + /** + * Constructor. + * Build credentials provider with async refresh, + * mark {@link #isContainerCredentialsProvider} as true. + */ public IAMInstanceCredentialsProvider() { + isContainerCredentialsProvider = true; + iamCredentialsProvider = ContainerCredentialsProvider.builder() + .asyncCredentialUpdateEnabled(true) + .build(); } /** @@ -65,9 +97,16 @@ public AwsCredentials resolveCredentials() { try { return getCredentials(); } catch (SdkClientException e) { + + // if the exception contains an IOE, extract it + // so its type is the immediate cause of this new exception. + Throwable t = e; + final IOException ioe = maybeExtractIOException("IAM endpoint", e); + if (ioe != null) { + t = ioe; + } throw new NoAwsCredentialsException("IAMInstanceCredentialsProvider", - e.getMessage(), - e); + e.getMessage(), t); } } @@ -78,23 +117,52 @@ public AwsCredentials resolveCredentials() { * * @return credentials */ - private AwsCredentials getCredentials() { + private synchronized AwsCredentials getCredentials() { try { - return containerCredentialsProvider.resolveCredentials(); + return iamCredentialsProvider.resolveCredentials(); } catch (SdkClientException e) { - return InstanceProfileCredentialsProvider.create().resolveCredentials(); + LOG.debug("Failed to get credentials from container provider,", e); + if (isContainerCredentialsProvider) { + // create instance profile provider + LOG.debug("Switching to instance provider", e); + + // close it to shut down any thread + iamCredentialsProvider.close(); + isContainerCredentialsProvider = false; + iamCredentialsProvider = InstanceProfileCredentialsProvider.builder() + .asyncCredentialUpdateEnabled(true) + .build(); + return iamCredentialsProvider.resolveCredentials(); + } else { + // already using instance profile provider, so fail + throw e; + } + } } + /** + * Is this a container credentials provider? + * @return true if the container credentials provider is in use; + * false for InstanceProfileCredentialsProvider + */ + public boolean isContainerCredentialsProvider() { + return isContainerCredentialsProvider; + } + @Override - public void close() throws IOException { - // no-op. + public synchronized void close() throws IOException { + // this be true but just for safety... 
+ if (iamCredentialsProvider != null) { + iamCredentialsProvider.close(); + } } @Override public String toString() { return "IAMInstanceCredentialsProvider{" + - "containerCredentialsProvider=" + containerCredentialsProvider + + "credentialsProvider=" + iamCredentialsProvider + + ", isContainerCredentialsProvider=" + isContainerCredentialsProvider + '}'; } } diff --git a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/impl/ErrorTranslation.java b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/impl/ErrorTranslation.java index 7b5190becc487..f8a1f907bb3b1 100644 --- a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/impl/ErrorTranslation.java +++ b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/impl/ErrorTranslation.java @@ -79,7 +79,7 @@ public static boolean isObjectNotFound(AwsServiceException e) { * @param thrown exception * @return a translated exception or null. */ - public static IOException maybeExtractNetworkException(String path, Throwable thrown) { + public static IOException maybeExtractIOException(String path, Throwable thrown) { if (thrown == null) { return null; @@ -100,7 +100,9 @@ public static IOException maybeExtractNetworkException(String path, Throwable th // as a new instance is created through reflection, the // class of the returned instance will be that of the innermost, // unless no suitable constructor is available. - return wrapWithInnerIOE(path, thrown, (IOException) cause); + final IOException ioe = (IOException) cause; + + return wrapWithInnerIOE(path, thrown, ioe); } diff --git a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/auth/TestIAMInstanceCredentialsProvider.java b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/auth/TestIAMInstanceCredentialsProvider.java new file mode 100644 index 0000000000000..c8986eab9b850 --- /dev/null +++ b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/auth/TestIAMInstanceCredentialsProvider.java @@ -0,0 +1,107 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.fs.s3a.auth; + +import java.io.IOException; + +import org.assertj.core.api.Assertions; +import org.junit.Test; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import software.amazon.awssdk.auth.credentials.AwsCredentials; + +import org.apache.hadoop.test.AbstractHadoopTestBase; + +/** + * Unit tests for IAMInstanceCredentials provider. + * This is a bit tricky as we don't want to require running in EC2, + * but nor do we want a test which doesn't work in EC2. 
+ */ +public class TestIAMInstanceCredentialsProvider extends AbstractHadoopTestBase { + + private static final Logger LOG = + LoggerFactory.getLogger(TestIAMInstanceCredentialsProvider.class); + + /** + * Error string from + * software.amazon.awssdk.auth.credentials.InstanceProfileCredentialsProvider, + * if IAM resolution has been disabled: {@value}. + */ + public static final String DISABLED = + "IMDS credentials have been disabled by environment variable or system property"; + + /** + * Test an immediate create/close. + */ + @Test + public void testIAMInstanceCredentialsProviderClose() throws Throwable { + new IAMInstanceCredentialsProvider().close(); + } + + /** + * Test instantiation. + * Multiple outcomes depending on host setup. + *

+ *   1. In EC2: credentials resolved. + * Assert the credentials comes with a key.
+ *   2. Not in EC2: NoAwsCredentialsException wraps network error trying + * to talk to the service. + * Assert wrapped exception is an IOE.
+ *   3. IMDS resolution disabled by env var/sysprop. + * NoAwsCredentialsException raised doesn't contain an IOE. + * Require the message to contain the {@link #DISABLED} text.
+ */ + @Test + public void testIAMInstanceCredentialsInstantiate() throws Throwable { + try (IAMInstanceCredentialsProvider provider = new IAMInstanceCredentialsProvider()) { + try { + final AwsCredentials credentials = provider.resolveCredentials(); + // if we get here this test suite is running in a container/EC2 + LOG.info("Credentials: retrieved from {}: key={}", + provider.isContainerCredentialsProvider() ? "container" : "EC2", + credentials.accessKeyId()); + Assertions.assertThat(credentials.accessKeyId()) + .describedAs("Access key from IMDS") + .isNotBlank(); + + // and if we get here, so does a second call + provider.resolveCredentials(); + } catch (NoAwsCredentialsException expected) { + // this is expected if the test is not running in a container/EC2 + LOG.info("Not running in a container/EC2"); + LOG.info("Exception raised", expected); + // and we expect to have fallen back to InstanceProfileCredentialsProvider + Assertions.assertThat(provider.isContainerCredentialsProvider()) + .describedAs("%s: shoud be using InstanceProfileCredentialsProvider") + .isFalse(); + final Throwable cause = expected.getCause(); + if (cause == null) { + throw expected; + } + if (!(cause instanceof IOException) + && !cause.toString().contains(DISABLED)) { + throw new AssertionError("Cause not a IOException", cause); + } + } + } + } + + +} diff --git a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/impl/TestErrorTranslation.java b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/impl/TestErrorTranslation.java index a1fdbabb94793..0f0b2c0c34bb5 100644 --- a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/impl/TestErrorTranslation.java +++ b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/impl/TestErrorTranslation.java @@ -19,8 +19,10 @@ package org.apache.hadoop.fs.s3a.impl; import java.io.IOException; +import java.io.UncheckedIOException; import java.net.ConnectException; import java.net.NoRouteToHostException; +import java.net.SocketTimeoutException; import java.net.UnknownHostException; import java.util.Collections; @@ -31,9 +33,10 @@ import software.amazon.awssdk.core.retry.RetryPolicyContext; import org.apache.hadoop.fs.PathIOException; +import org.apache.hadoop.fs.s3a.auth.NoAwsCredentialsException; import org.apache.hadoop.test.AbstractHadoopTestBase; -import static org.apache.hadoop.fs.s3a.impl.ErrorTranslation.maybeExtractNetworkException; +import static org.apache.hadoop.fs.s3a.impl.ErrorTranslation.maybeExtractIOException; import static org.apache.hadoop.test.LambdaTestUtils.intercept; import static org.junit.Assert.assertTrue; @@ -64,7 +67,7 @@ public void testUnknownHostExceptionExtraction() throws Throwable { new UnknownHostException("bottom"))); final IOException ioe = intercept(UnknownHostException.class, "top", () -> { - throw maybeExtractNetworkException("", thrown); + throw maybeExtractIOException("", thrown); }); // the wrapped exception is the top level one: no stack traces have @@ -79,7 +82,7 @@ public void testUnknownHostExceptionExtraction() throws Throwable { public void testNoRouteToHostExceptionExtraction() throws Throwable { intercept(NoRouteToHostException.class, "top", () -> { - throw maybeExtractNetworkException("p2", + throw maybeExtractIOException("p2", sdkException("top", sdkException("middle", new NoRouteToHostException("bottom")))); @@ -90,17 +93,35 @@ public void testNoRouteToHostExceptionExtraction() throws Throwable { public void testConnectExceptionExtraction() throws Throwable { 
intercept(ConnectException.class, "top", () -> { - throw maybeExtractNetworkException("p1", + throw maybeExtractIOException("p1", sdkException("top", sdkException("middle", new ConnectException("bottom")))); }); } + + /** + * When there is an UncheckedIOException, its inner class is + * extracted. + */ + @Test + public void testUncheckedIOExceptionExtraction() throws Throwable { + intercept(SocketTimeoutException.class, "top", + () -> { + final SdkClientException thrown = sdkException("top", + sdkException("middle", + new UncheckedIOException( + new SocketTimeoutException("bottom")))); + throw maybeExtractIOException("p1", + new NoAwsCredentialsException("IamProvider", thrown.toString(), thrown)); + }); + } + @Test public void testNoConstructorExtraction() throws Throwable { intercept(PathIOException.class, NoConstructorIOE.MESSAGE, () -> { - throw maybeExtractNetworkException("p1", + throw maybeExtractIOException("p1", sdkException("top", sdkException("middle", new NoConstructorIOE()))); From fbd653be9bdffa425c701322eaa9a73b823b282c Mon Sep 17 00:00:00 2001 From: Ayush Saxena Date: Mon, 23 Oct 2023 19:35:12 +0530 Subject: [PATCH 04/23] Revert "HDFS-17228. Improve documentation related to BlockManager. (#6195). Contributed by JiangHua Zhu." This reverts commit 81ba2e8484c4315bb9a765374df4bb2a05bc0ebd. --- .../hadoop/hdfs/server/blockmanagement/BlockManager.java | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockManager.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockManager.java index 848da7bd11542..783000bbef2f8 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockManager.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockManager.java @@ -150,10 +150,10 @@ * redundancy. * * For regular replication, # of min live replicas for maintenance is determined - * by {@link DFS_NAMENODE_MAINTENANCE_REPLICATION_MIN_KEY}. This number has to <= - * {@link DFS_NAMENODE_REPLICATION_MIN_KEY}. + * by DFS_NAMENODE_MAINTENANCE_REPLICATION_MIN_KEY. This number has to <= + * DFS_NAMENODE_REPLICATION_MIN_KEY. * For erasure encoding, # of min live replicas for maintenance is - * {@link BlockInfoStriped#getRealDataBlockNum}. + * BlockInfoStriped#getRealDataBlockNum. * * Another safety property is to satisfy the block placement policy. While the * policy is configurable, the replicas the policy is applied to are the live From 6e13e4addc14388450cdf6ac6e890d1c95d47e4d Mon Sep 17 00:00:00 2001 From: jianghuazhu <740087514@qq.com> Date: Wed, 18 Oct 2023 07:35:33 +0800 Subject: [PATCH 05/23] HDFS-17228. Improve documentation related to BlockManager. (#6195). Contributed by JiangHua Zhu. 
Reviewed-by: Inigo Goiri Signed-off-by: Ayush Saxena --- .../hadoop/hdfs/server/blockmanagement/BlockManager.java | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockManager.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockManager.java index 783000bbef2f8..848da7bd11542 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockManager.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockManager.java @@ -150,10 +150,10 @@ * redundancy. * * For regular replication, # of min live replicas for maintenance is determined - * by DFS_NAMENODE_MAINTENANCE_REPLICATION_MIN_KEY. This number has to <= - * DFS_NAMENODE_REPLICATION_MIN_KEY. + * by {@link DFS_NAMENODE_MAINTENANCE_REPLICATION_MIN_KEY}. This number has to <= + * {@link DFS_NAMENODE_REPLICATION_MIN_KEY}. * For erasure encoding, # of min live replicas for maintenance is - * BlockInfoStriped#getRealDataBlockNum. + * {@link BlockInfoStriped#getRealDataBlockNum}. * * Another safety property is to satisfy the block placement policy. While the * policy is configurable, the replicas the policy is applied to are the live From 5eeab5e1b9721326cfbb2e7f8fbc4f4b372a2697 Mon Sep 17 00:00:00 2001 From: huhaiyang Date: Mon, 23 Oct 2023 22:42:39 +0800 Subject: [PATCH 06/23] HDFS-17235. Fix javadoc errors in BlockManager (#6214). Contributed by Haiyang Hu. --- .../hadoop/hdfs/server/blockmanagement/BlockManager.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockManager.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockManager.java index 848da7bd11542..2351bb4782873 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockManager.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockManager.java @@ -150,8 +150,8 @@ * redundancy. * * For regular replication, # of min live replicas for maintenance is determined - * by {@link DFS_NAMENODE_MAINTENANCE_REPLICATION_MIN_KEY}. This number has to <= - * {@link DFS_NAMENODE_REPLICATION_MIN_KEY}. + * by {@link DFSConfigKeys#DFS_NAMENODE_MAINTENANCE_REPLICATION_MIN_KEY}. This number has to <= + * {@link DFSConfigKeys#DFS_NAMENODE_REPLICATION_MIN_KEY}. * For erasure encoding, # of min live replicas for maintenance is * {@link BlockInfoStriped#getRealDataBlockNum}. * From 4c04818d3da5d136a6e332b66b1268d8c46f4d35 Mon Sep 17 00:00:00 2001 From: Zita Dombi <50611074+dombizita@users.noreply.github.com> Date: Mon, 23 Oct 2023 20:03:15 +0200 Subject: [PATCH 07/23] HADOOP-18919. 
Zookeeper SSL/TLS support in HDFS ZKFC (#6194) --- .../hadoop/ha/ActiveStandbyElector.java | 32 +++++- .../hadoop/ha/ZKFailoverController.java | 6 +- .../apache/hadoop/security/SecurityUtil.java | 102 ++++++++++++++++++ .../hadoop/util/curator/ZKCuratorManager.java | 97 +---------------- .../org/apache/hadoop/ha/MiniZKFCCluster.java | 5 + .../hadoop/ha/TestActiveStandbyElector.java | 90 +++++++++++++++- .../ha/TestActiveStandbyElectorRealZK.java | 4 +- .../curator/TestSecureZKCuratorManager.java | 11 +- .../util/curator/TestZKCuratorManager.java | 3 +- .../org/apache/hadoop/hdfs/DFSConfigKeys.java | 3 + .../hdfs/tools/DFSZKFailoverController.java | 7 ++ .../src/main/resources/hdfs-default.xml | 8 ++ ...tiveStandbyElectorBasedElectorService.java | 7 +- 13 files changed, 263 insertions(+), 112 deletions(-) diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ha/ActiveStandbyElector.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ha/ActiveStandbyElector.java index edd15af534a76..b6907c672b5ad 100644 --- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ha/ActiveStandbyElector.java +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ha/ActiveStandbyElector.java @@ -29,8 +29,10 @@ import org.apache.hadoop.HadoopIllegalArgumentException; import org.apache.hadoop.classification.InterfaceAudience; import org.apache.hadoop.classification.InterfaceStability; +import org.apache.hadoop.security.SecurityUtil; import org.apache.hadoop.util.ZKUtil.ZKAuthInfo; import org.apache.hadoop.util.StringUtils; +import org.apache.zookeeper.client.ZKClientConfig; import org.apache.zookeeper.data.ACL; import org.apache.zookeeper.KeeperException; import org.apache.zookeeper.Watcher; @@ -48,6 +50,10 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import javax.naming.ConfigurationException; + +import org.apache.hadoop.security.SecurityUtil.TruststoreKeystore; + /** * * This class implements a simple library to perform leader election on top of @@ -170,6 +176,7 @@ enum State { private final int zkSessionTimeout; private final List zkAcl; private final List zkAuthInfo; + private TruststoreKeystore truststoreKeystore; private byte[] appData; private final String zkLockFilePath; private final String zkBreadCrumbPath; @@ -209,6 +216,7 @@ enum State { * @param app * reference to callback interface object * @param maxRetryNum maxRetryNum. + * @param truststoreKeystore truststore keystore, that we will use for ZK if SSL/TLS is enabled * @throws IOException raised on errors performing I/O. * @throws HadoopIllegalArgumentException * if valid data is not supplied. @@ -218,10 +226,10 @@ enum State { public ActiveStandbyElector(String zookeeperHostPorts, int zookeeperSessionTimeout, String parentZnodeName, List acl, List authInfo, ActiveStandbyElectorCallback app, - int maxRetryNum) throws IOException, HadoopIllegalArgumentException, - KeeperException { + int maxRetryNum, TruststoreKeystore truststoreKeystore) + throws IOException, HadoopIllegalArgumentException, KeeperException { this(zookeeperHostPorts, zookeeperSessionTimeout, parentZnodeName, acl, - authInfo, app, maxRetryNum, true); + authInfo, app, maxRetryNum, true, truststoreKeystore); } /** @@ -254,6 +262,7 @@ public ActiveStandbyElector(String zookeeperHostPorts, * @param failFast * whether need to add the retry when establishing ZK connection. 
* @param maxRetryNum max Retry Num + * @param truststoreKeystore truststore keystore, that we will use for ZK if SSL/TLS is enabled * @throws IOException * raised on errors performing I/O. * @throws HadoopIllegalArgumentException @@ -264,7 +273,7 @@ public ActiveStandbyElector(String zookeeperHostPorts, public ActiveStandbyElector(String zookeeperHostPorts, int zookeeperSessionTimeout, String parentZnodeName, List acl, List authInfo, ActiveStandbyElectorCallback app, - int maxRetryNum, boolean failFast) throws IOException, + int maxRetryNum, boolean failFast, TruststoreKeystore truststoreKeystore) throws IOException, HadoopIllegalArgumentException, KeeperException { if (app == null || acl == null || parentZnodeName == null || zookeeperHostPorts == null || zookeeperSessionTimeout <= 0) { @@ -279,6 +288,7 @@ public ActiveStandbyElector(String zookeeperHostPorts, zkLockFilePath = znodeWorkingDir + "/" + LOCK_FILENAME; zkBreadCrumbPath = znodeWorkingDir + "/" + BREADCRUMB_FILENAME; this.maxRetryNum = maxRetryNum; + this.truststoreKeystore = truststoreKeystore; // establish the ZK Connection for future API calls if (failFast) { @@ -740,7 +750,19 @@ protected synchronized ZooKeeper connectToZooKeeper() throws IOException, * @throws IOException raised on errors performing I/O. */ protected ZooKeeper createZooKeeper() throws IOException { - return new ZooKeeper(zkHostPort, zkSessionTimeout, watcher); + ZKClientConfig zkClientConfig = new ZKClientConfig(); + if (truststoreKeystore != null) { + try { + SecurityUtil.setSslConfiguration(zkClientConfig, truststoreKeystore); + } catch (ConfigurationException ce) { + throw new IOException(ce); + } + } + return initiateZookeeper(zkClientConfig); + } + + protected ZooKeeper initiateZookeeper(ZKClientConfig zkClientConfig) throws IOException { + return new ZooKeeper(zkHostPort, zkSessionTimeout, watcher, zkClientConfig); } private void fatalError(String errorMessage) { diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ha/ZKFailoverController.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ha/ZKFailoverController.java index d24d5630c5917..487d7b9409159 100644 --- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ha/ZKFailoverController.java +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ha/ZKFailoverController.java @@ -59,6 +59,8 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import org.apache.hadoop.security.SecurityUtil.TruststoreKeystore; + @InterfaceAudience.LimitedPrivate("HDFS") public abstract class ZKFailoverController { @@ -147,6 +149,7 @@ protected abstract void checkRpcAdminAccess() protected abstract InetSocketAddress getRpcAddressToBindTo(); protected abstract PolicyProvider getPolicyProvider(); protected abstract List getAllOtherNodes(); + protected abstract boolean isSSLEnabled(); /** * Return the name of a znode inside the configured parent znode in which @@ -372,9 +375,10 @@ private void initZK() throws HadoopIllegalArgumentException, IOException, int maxRetryNum = conf.getInt( CommonConfigurationKeys.HA_FC_ELECTOR_ZK_OP_RETRIES_KEY, CommonConfigurationKeys.HA_FC_ELECTOR_ZK_OP_RETRIES_DEFAULT); + TruststoreKeystore truststoreKeystore = isSSLEnabled() ? 
new TruststoreKeystore(conf) : null; elector = new ActiveStandbyElector(zkQuorum, zkTimeout, getParentZnode(), zkAcls, zkAuths, - new ElectorCallbacks(), maxRetryNum); + new ElectorCallbacks(), maxRetryNum, truststoreKeystore); } private String getParentZnode() { diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/security/SecurityUtil.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/security/SecurityUtil.java index d045a7f6fc488..fd3030e8a977d 100644 --- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/security/SecurityUtil.java +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/security/SecurityUtil.java @@ -35,6 +35,7 @@ import java.util.concurrent.TimeUnit; import javax.annotation.Nullable; +import javax.naming.ConfigurationException; import javax.security.auth.kerberos.KerberosPrincipal; import javax.security.auth.kerberos.KerberosTicket; @@ -53,6 +54,8 @@ import org.apache.hadoop.util.StopWatch; import org.apache.hadoop.util.StringUtils; import org.apache.hadoop.util.ZKUtil; +import org.apache.zookeeper.client.ZKClientConfig; +import org.apache.zookeeper.common.ClientX509Util; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.xbill.DNS.Name; @@ -786,4 +789,103 @@ public static List getZKAuthInfos(Configuration conf, throw e; } } + + public static void validateSslConfiguration(TruststoreKeystore truststoreKeystore) + throws ConfigurationException { + if (org.apache.commons.lang3.StringUtils.isEmpty(truststoreKeystore.keystoreLocation)) { + throw new ConfigurationException( + "The keystore location parameter is empty for the ZooKeeper client connection."); + } + if (org.apache.commons.lang3.StringUtils.isEmpty(truststoreKeystore.keystorePassword)) { + throw new ConfigurationException( + "The keystore password parameter is empty for the ZooKeeper client connection."); + } + if (org.apache.commons.lang3.StringUtils.isEmpty(truststoreKeystore.truststoreLocation)) { + throw new ConfigurationException( + "The truststore location parameter is empty for the ZooKeeper client connection."); + } + if (org.apache.commons.lang3.StringUtils.isEmpty(truststoreKeystore.truststorePassword)) { + throw new ConfigurationException( + "The truststore password parameter is empty for the ZooKeeper client connection."); + } + } + + /** + * Configure ZooKeeper Client with SSL/TLS connection. 
+ * @param zkClientConfig ZooKeeper Client configuration + * @param truststoreKeystore truststore keystore, that we use to set the SSL configurations + * @throws ConfigurationException if the SSL configs are empty + */ + public static void setSslConfiguration(ZKClientConfig zkClientConfig, + TruststoreKeystore truststoreKeystore) + throws ConfigurationException { + setSslConfiguration(zkClientConfig, truststoreKeystore, new ClientX509Util()); + } + + public static void setSslConfiguration(ZKClientConfig zkClientConfig, + TruststoreKeystore truststoreKeystore, + ClientX509Util x509Util) + throws ConfigurationException { + validateSslConfiguration(truststoreKeystore); + LOG.info("Configuring the ZooKeeper client to use SSL/TLS encryption for connecting to the " + + "ZooKeeper server."); + LOG.debug("Configuring the ZooKeeper client with {} location: {}.", + truststoreKeystore.keystoreLocation, + CommonConfigurationKeys.ZK_SSL_KEYSTORE_LOCATION); + LOG.debug("Configuring the ZooKeeper client with {} location: {}.", + truststoreKeystore.truststoreLocation, + CommonConfigurationKeys.ZK_SSL_TRUSTSTORE_LOCATION); + + zkClientConfig.setProperty(ZKClientConfig.SECURE_CLIENT, "true"); + zkClientConfig.setProperty(ZKClientConfig.ZOOKEEPER_CLIENT_CNXN_SOCKET, + "org.apache.zookeeper.ClientCnxnSocketNetty"); + zkClientConfig.setProperty(x509Util.getSslKeystoreLocationProperty(), + truststoreKeystore.keystoreLocation); + zkClientConfig.setProperty(x509Util.getSslKeystorePasswdProperty(), + truststoreKeystore.keystorePassword); + zkClientConfig.setProperty(x509Util.getSslTruststoreLocationProperty(), + truststoreKeystore.truststoreLocation); + zkClientConfig.setProperty(x509Util.getSslTruststorePasswdProperty(), + truststoreKeystore.truststorePassword); + } + + /** + * Helper class to contain the Truststore/Keystore paths for the ZK client connection over + * SSL/TLS. + */ + public static class TruststoreKeystore { + private final String keystoreLocation; + private final String keystorePassword; + private final String truststoreLocation; + private final String truststorePassword; + + /** + * Configuration for the ZooKeeper connection when SSL/TLS is enabled. + * When a value is not configured, ensure that empty string is set instead of null. 
+ * + * @param conf ZooKeeper Client configuration + */ + public TruststoreKeystore(Configuration conf) { + keystoreLocation = conf.get(CommonConfigurationKeys.ZK_SSL_KEYSTORE_LOCATION, ""); + keystorePassword = conf.get(CommonConfigurationKeys.ZK_SSL_KEYSTORE_PASSWORD, ""); + truststoreLocation = conf.get(CommonConfigurationKeys.ZK_SSL_TRUSTSTORE_LOCATION, ""); + truststorePassword = conf.get(CommonConfigurationKeys.ZK_SSL_TRUSTSTORE_PASSWORD, ""); + } + + public String getKeystoreLocation() { + return keystoreLocation; + } + + public String getKeystorePassword() { + return keystorePassword; + } + + public String getTruststoreLocation() { + return truststoreLocation; + } + + public String getTruststorePassword() { + return truststorePassword; + } + } } diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/util/curator/ZKCuratorManager.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/util/curator/ZKCuratorManager.java index 4df7977432918..3055e7bf659a6 100644 --- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/util/curator/ZKCuratorManager.java +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/util/curator/ZKCuratorManager.java @@ -40,7 +40,6 @@ import org.apache.zookeeper.Watcher; import org.apache.zookeeper.ZooKeeper; import org.apache.zookeeper.client.ZKClientConfig; -import org.apache.zookeeper.common.ClientX509Util; import org.apache.zookeeper.data.ACL; import org.apache.zookeeper.data.Stat; @@ -49,7 +48,7 @@ import org.apache.hadoop.util.Preconditions; -import javax.naming.ConfigurationException; +import org.apache.hadoop.security.SecurityUtil.TruststoreKeystore; /** * Helper class that provides utility methods specific to ZK operations. @@ -570,64 +569,12 @@ public ZooKeeper newZooKeeper(String connectString, int sessionTimeout, setJaasConfiguration(zkClientConfig); } if (sslEnabled) { - setSslConfiguration(zkClientConfig); + SecurityUtil.setSslConfiguration(zkClientConfig, truststoreKeystore); } return new ZooKeeper(connectString, sessionTimeout, watcher, canBeReadOnly, zkClientConfig); } - /** - * Configure ZooKeeper Client with SSL/TLS connection. 
- * @param zkClientConfig ZooKeeper Client configuration - */ - private void setSslConfiguration(ZKClientConfig zkClientConfig) throws ConfigurationException { - this.setSslConfiguration(zkClientConfig, new ClientX509Util()); - } - - private void setSslConfiguration(ZKClientConfig zkClientConfig, ClientX509Util x509Util) - throws ConfigurationException { - validateSslConfiguration(); - LOG.info("Configuring the ZooKeeper client to use SSL/TLS encryption for connecting to the " - + "ZooKeeper server."); - LOG.debug("Configuring the ZooKeeper client with {} location: {}.", - this.truststoreKeystore.keystoreLocation, - CommonConfigurationKeys.ZK_SSL_KEYSTORE_LOCATION); - LOG.debug("Configuring the ZooKeeper client with {} location: {}.", - this.truststoreKeystore.truststoreLocation, - CommonConfigurationKeys.ZK_SSL_TRUSTSTORE_LOCATION); - - zkClientConfig.setProperty(ZKClientConfig.SECURE_CLIENT, "true"); - zkClientConfig.setProperty(ZKClientConfig.ZOOKEEPER_CLIENT_CNXN_SOCKET, - "org.apache.zookeeper.ClientCnxnSocketNetty"); - zkClientConfig.setProperty(x509Util.getSslKeystoreLocationProperty(), - this.truststoreKeystore.keystoreLocation); - zkClientConfig.setProperty(x509Util.getSslKeystorePasswdProperty(), - this.truststoreKeystore.keystorePassword); - zkClientConfig.setProperty(x509Util.getSslTruststoreLocationProperty(), - this.truststoreKeystore.truststoreLocation); - zkClientConfig.setProperty(x509Util.getSslTruststorePasswdProperty(), - this.truststoreKeystore.truststorePassword); - } - - private void validateSslConfiguration() throws ConfigurationException { - if (StringUtils.isEmpty(this.truststoreKeystore.keystoreLocation)) { - throw new ConfigurationException( - "The keystore location parameter is empty for the ZooKeeper client connection."); - } - if (StringUtils.isEmpty(this.truststoreKeystore.keystorePassword)) { - throw new ConfigurationException( - "The keystore password parameter is empty for the ZooKeeper client connection."); - } - if (StringUtils.isEmpty(this.truststoreKeystore.truststoreLocation)) { - throw new ConfigurationException( - "The truststore location parameter is empty for the ZooKeeper client connection."); - } - if (StringUtils.isEmpty(this.truststoreKeystore.truststorePassword)) { - throw new ConfigurationException( - "The truststore password parameter is empty for the ZooKeeper client connection."); - } - } - private boolean isJaasConfigurationSet(ZKClientConfig zkClientConfig) { String clientConfig = zkClientConfig.getProperty(ZKClientConfig.LOGIN_CONTEXT_NAME_KEY, ZKClientConfig.LOGIN_CONTEXT_NAME_KEY_DEFAULT); @@ -649,44 +596,4 @@ private void setJaasConfiguration(ZKClientConfig zkClientConfig) throws IOExcept zkClientConfig.setProperty(ZKClientConfig.LOGIN_CONTEXT_NAME_KEY, JAAS_CLIENT_ENTRY); } } - - /** - * Helper class to contain the Truststore/Keystore paths for the ZK client connection over - * SSL/TLS. - */ - public static class TruststoreKeystore { - private final String keystoreLocation; - private final String keystorePassword; - private final String truststoreLocation; - private final String truststorePassword; - - /** - * Configuration for the ZooKeeper connection when SSL/TLS is enabled. - * When a value is not configured, ensure that empty string is set instead of null. 
- * - * @param conf ZooKeeper Client configuration - */ - public TruststoreKeystore(Configuration conf) { - keystoreLocation = conf.get(CommonConfigurationKeys.ZK_SSL_KEYSTORE_LOCATION, ""); - keystorePassword = conf.get(CommonConfigurationKeys.ZK_SSL_KEYSTORE_PASSWORD, ""); - truststoreLocation = conf.get(CommonConfigurationKeys.ZK_SSL_TRUSTSTORE_LOCATION, ""); - truststorePassword = conf.get(CommonConfigurationKeys.ZK_SSL_TRUSTSTORE_PASSWORD, ""); - } - - public String getKeystoreLocation() { - return keystoreLocation; - } - - public String getKeystorePassword() { - return keystorePassword; - } - - public String getTruststoreLocation() { - return truststoreLocation; - } - - public String getTruststorePassword() { - return truststorePassword; - } - } } \ No newline at end of file diff --git a/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/ha/MiniZKFCCluster.java b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/ha/MiniZKFCCluster.java index 7fc617c378950..8d3075f45263b 100644 --- a/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/ha/MiniZKFCCluster.java +++ b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/ha/MiniZKFCCluster.java @@ -370,5 +370,10 @@ protected List getAllOtherNodes() { } return services; } + + @Override + protected boolean isSSLEnabled() { + return false; + } } } \ No newline at end of file diff --git a/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/ha/TestActiveStandbyElector.java b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/ha/TestActiveStandbyElector.java index a68dad6509129..e8c57f1efd717 100644 --- a/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/ha/TestActiveStandbyElector.java +++ b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/ha/TestActiveStandbyElector.java @@ -22,6 +22,8 @@ import java.util.Collections; import java.util.List; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.security.SecurityUtil; import org.apache.zookeeper.CreateMode; import org.apache.zookeeper.KeeperException; import org.apache.zookeeper.KeeperException.Code; @@ -29,12 +31,15 @@ import org.apache.zookeeper.Watcher; import org.apache.zookeeper.ZooKeeper; import org.apache.zookeeper.Watcher.Event; +import org.apache.zookeeper.client.ZKClientConfig; +import org.apache.zookeeper.common.ClientX509Util; import org.apache.zookeeper.data.ACL; import org.apache.zookeeper.data.Stat; import org.apache.zookeeper.ZooDefs.Ids; import org.junit.Before; import org.junit.Test; import org.junit.Assert; +import org.mockito.ArgumentCaptor; import org.mockito.Mockito; import org.apache.hadoop.HadoopIllegalArgumentException; @@ -63,7 +68,7 @@ class ActiveStandbyElectorTester extends ActiveStandbyElector { KeeperException { super(hostPort, timeout, parent, acl, Collections . emptyList(), app, - CommonConfigurationKeys.HA_FC_ELECTOR_ZK_OP_RETRIES_DEFAULT); + CommonConfigurationKeys.HA_FC_ELECTOR_ZK_OP_RETRIES_DEFAULT, null); } @Override @@ -777,7 +782,7 @@ public void testWithoutZKServer() throws Exception { try { new ActiveStandbyElector("127.0.0.1", 2000, ZK_PARENT_NAME, Ids.OPEN_ACL_UNSAFE, Collections. 
emptyList(), mockApp, - CommonConfigurationKeys.HA_FC_ELECTOR_ZK_OP_RETRIES_DEFAULT) { + CommonConfigurationKeys.HA_FC_ELECTOR_ZK_OP_RETRIES_DEFAULT, null) { @Override protected ZooKeeper createZooKeeper() throws IOException { @@ -809,4 +814,85 @@ public void testBecomeActiveBeforeServiceHealthy() throws Exception { Mockito.verify(mockZK, Mockito.times(0)).create(ZK_LOCK_NAME, null, Ids.OPEN_ACL_UNSAFE, CreateMode.EPHEMERAL, elector, mockZK); } + + /** + * We want to test if we create an ActiveStandbyElector with null as a TruststoreKeystore, + * then we are creating a ZooKeeper without the SSL configs in ActiveStandbyElector and the other + * configs are the same as the default values. + * We do this by checking the ZKClientConfig properties. + * @throws Exception + */ + @Test + public void testWithoutTruststoreKeystore() throws Exception { + ZKClientConfig defaultConfig = new ZKClientConfig(); + ClientX509Util clientX509Util = new ClientX509Util(); + System.out.println(defaultConfig.getProperty(ZKClientConfig.ZOOKEEPER_CLIENT_CNXN_SOCKET)); + ActiveStandbyElector e = Mockito.spy(new ActiveStandbyElector("localhost", 1, "", + Collections.emptyList(), null, Mockito.mock(ActiveStandbyElectorCallback.class), + 1, null) { + @Override + protected synchronized ZooKeeper connectToZooKeeper() { + return null; + } + }); + + e.createZooKeeper(); + + ArgumentCaptor configArgumentCaptor + = ArgumentCaptor.forClass(ZKClientConfig.class); + Mockito.verify(e).initiateZookeeper(configArgumentCaptor.capture()); + ZKClientConfig clientConfig = configArgumentCaptor.getValue(); + Assert.assertEquals(defaultConfig.getProperty(ZKClientConfig.SECURE_CLIENT), + clientConfig.getProperty(ZKClientConfig.SECURE_CLIENT)); + Assert.assertEquals(defaultConfig.getProperty(ZKClientConfig.ZOOKEEPER_CLIENT_CNXN_SOCKET), + clientConfig.getProperty(ZKClientConfig.ZOOKEEPER_CLIENT_CNXN_SOCKET)); + Assert.assertNull(clientConfig.getProperty(clientX509Util.getSslKeystoreLocationProperty())); + Assert.assertNull(clientConfig.getProperty(clientX509Util.getSslKeystorePasswdProperty())); + Assert.assertNull(clientConfig.getProperty(clientX509Util.getSslTruststoreLocationProperty())); + Assert.assertNull(clientConfig.getProperty(clientX509Util.getSslTruststorePasswdProperty())); + } + + /** + * We want to test if we create an ActiveStandbyElector with a TruststoreKeystore, which already + * has the SSL configuration set, then we are creating a ZooKeeper with the correct SSL configs + * in ActiveStandbyElector. We do this by checking the ZKClientConfig properties. 
+ * @throws Exception + */ + @Test + public void testWithTruststoreKeystore() throws Exception { + Configuration conf = new Configuration(); + ClientX509Util clientX509Util = new ClientX509Util(); + conf.set(CommonConfigurationKeys.ZK_SSL_KEYSTORE_LOCATION, "keystore_location"); + conf.set(CommonConfigurationKeys.ZK_SSL_KEYSTORE_PASSWORD, "keystore_password"); + conf.set(CommonConfigurationKeys.ZK_SSL_TRUSTSTORE_LOCATION, "truststore_location"); + conf.set(CommonConfigurationKeys.ZK_SSL_TRUSTSTORE_PASSWORD, "truststore_password"); + SecurityUtil.TruststoreKeystore truststoreKeystore = new SecurityUtil.TruststoreKeystore(conf); + ActiveStandbyElector e = Mockito.spy(new ActiveStandbyElector("localhost", 1, "", + Collections.emptyList(), null, Mockito.mock(ActiveStandbyElectorCallback.class), + 1, truststoreKeystore) { + @Override + protected synchronized ZooKeeper connectToZooKeeper() { + return null; + } + }); + + e.createZooKeeper(); + + ArgumentCaptor configArgumentCaptor + = ArgumentCaptor.forClass(ZKClientConfig.class); + Mockito.verify(e).initiateZookeeper(configArgumentCaptor.capture()); + ZKClientConfig clientConfig = configArgumentCaptor.getValue(); + Assert.assertEquals("true", clientConfig.getProperty(ZKClientConfig.SECURE_CLIENT)); + Assert.assertEquals("org.apache.zookeeper.ClientCnxnSocketNetty", + clientConfig.getProperty(ZKClientConfig.ZOOKEEPER_CLIENT_CNXN_SOCKET)); + Assert.assertEquals("keystore_location", + clientConfig.getProperty(clientX509Util.getSslKeystoreLocationProperty())); + Assert.assertEquals("keystore_password", + clientConfig.getProperty(clientX509Util.getSslKeystorePasswdProperty())); + Assert.assertEquals("truststore_location", + clientConfig.getProperty(clientX509Util.getSslTruststoreLocationProperty())); + Assert.assertEquals("truststore_password", + clientConfig.getProperty(clientX509Util.getSslTruststorePasswdProperty())); + + } } diff --git a/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/ha/TestActiveStandbyElectorRealZK.java b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/ha/TestActiveStandbyElectorRealZK.java index badd5afc5e91b..7003e99f15382 100644 --- a/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/ha/TestActiveStandbyElectorRealZK.java +++ b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/ha/TestActiveStandbyElectorRealZK.java @@ -70,7 +70,7 @@ public void setUp() throws Exception { appDatas[i] = Ints.toByteArray(i); electors[i] = new ActiveStandbyElector(hostPort, 5000, PARENT_DIR, Ids.OPEN_ACL_UNSAFE, Collections. emptyList(), cbs[i], - CommonConfigurationKeys.HA_FC_ELECTOR_ZK_OP_RETRIES_DEFAULT); + CommonConfigurationKeys.HA_FC_ELECTOR_ZK_OP_RETRIES_DEFAULT, null); } } @@ -270,7 +270,7 @@ public void testSetZooKeeperACLsOnParentZnodeName() ActiveStandbyElector elector = new ActiveStandbyElector(hostPort, 5000, PARENT_DIR, Ids.READ_ACL_UNSAFE, Collections.emptyList(), cb, - CommonConfigurationKeys.HA_FC_ELECTOR_ZK_OP_RETRIES_DEFAULT); + CommonConfigurationKeys.HA_FC_ELECTOR_ZK_OP_RETRIES_DEFAULT, null); // Simulate the case by pre-creating znode 'parentZnodeName'. Then updates // znode's data so that data version will be increased to 1. 
Here znode's diff --git a/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/util/curator/TestSecureZKCuratorManager.java b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/util/curator/TestSecureZKCuratorManager.java index d83279a941462..4862c1c79838d 100644 --- a/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/util/curator/TestSecureZKCuratorManager.java +++ b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/util/curator/TestSecureZKCuratorManager.java @@ -22,6 +22,7 @@ import java.util.HashMap; import java.util.Map; +import org.apache.hadoop.security.SecurityUtil; import org.junit.After; import org.junit.Before; import org.junit.Test; @@ -146,7 +147,7 @@ public void testSecureZKConfiguration() throws Exception { // Validate that HadoopZooKeeperFactory will set ZKConfig with given principals ZKCuratorManager.HadoopZookeeperFactory factory = new ZKCuratorManager.HadoopZookeeperFactory(null, null, null, true, - new ZKCuratorManager.TruststoreKeystore(hadoopConf)); + new SecurityUtil.TruststoreKeystore(hadoopConf)); ZooKeeper zk = factory.newZooKeeper(this.server.getConnectString(), 1000, null, false); validateSSLConfiguration(this.hadoopConf.get(CommonConfigurationKeys.ZK_SSL_KEYSTORE_LOCATION), this.hadoopConf.get(CommonConfigurationKeys.ZK_SSL_KEYSTORE_PASSWORD), @@ -183,8 +184,8 @@ public void testTruststoreKeystoreConfiguration() { Validate that the null values are converted into empty strings by the class. */ Configuration conf = new Configuration(); - ZKCuratorManager.TruststoreKeystore truststoreKeystore = - new ZKCuratorManager.TruststoreKeystore(conf); + SecurityUtil.TruststoreKeystore truststoreKeystore = + new SecurityUtil.TruststoreKeystore(conf); assertEquals("Validate that null value is converted to empty string.", "", truststoreKeystore.getKeystoreLocation()); @@ -200,8 +201,8 @@ public void testTruststoreKeystoreConfiguration() { conf.set(CommonConfigurationKeys.ZK_SSL_KEYSTORE_PASSWORD, "keystorePassword"); conf.set(CommonConfigurationKeys.ZK_SSL_TRUSTSTORE_LOCATION, "/truststore.jks"); conf.set(CommonConfigurationKeys.ZK_SSL_TRUSTSTORE_PASSWORD, "truststorePassword"); - ZKCuratorManager.TruststoreKeystore truststoreKeystore1 = - new ZKCuratorManager.TruststoreKeystore(conf); + SecurityUtil.TruststoreKeystore truststoreKeystore1 = + new SecurityUtil.TruststoreKeystore(conf); assertEquals("Validate that non-null value kept intact.", "/keystore.jks", truststoreKeystore1.getKeystoreLocation()); assertEquals("Validate that null value is converted to empty string.", "keystorePassword", diff --git a/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/util/curator/TestZKCuratorManager.java b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/util/curator/TestZKCuratorManager.java index 4365e43e49139..aced6e8d28b27 100644 --- a/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/util/curator/TestZKCuratorManager.java +++ b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/util/curator/TestZKCuratorManager.java @@ -34,6 +34,7 @@ import org.apache.curator.test.TestingServer; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.CommonConfigurationKeys; +import org.apache.hadoop.security.SecurityUtil; import org.apache.hadoop.security.authentication.util.JaasConfiguration; import org.apache.hadoop.util.ZKUtil; import org.apache.zookeeper.CreateMode; @@ -220,7 +221,7 @@ public void testCuratorFrameworkFactory() throws 
Exception{ .authorization(new ArrayList<>()) .zookeeperFactory(new ZKCuratorManager.HadoopZookeeperFactory( "foo1", "bar1", "bar1.keytab", false, - new ZKCuratorManager.TruststoreKeystore(conf)) + new SecurityUtil.TruststoreKeystore(conf)) ).build(); client.start(); diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/DFSConfigKeys.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/DFSConfigKeys.java index c783fc76d091b..dd2731813bd77 100755 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/DFSConfigKeys.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/DFSConfigKeys.java @@ -1343,6 +1343,9 @@ public class DFSConfigKeys extends CommonConfigurationKeys { public static final int DFS_HA_ZKFC_PORT_DEFAULT = 8019; public static final String DFS_HA_ZKFC_NN_HTTP_TIMEOUT_KEY = "dfs.ha.zkfc.nn.http.timeout.ms"; public static final int DFS_HA_ZKFC_NN_HTTP_TIMEOUT_KEY_DEFAULT = 20000; + /** Enable Zookeeper SSL/TLS communication. */ + public static final String ZK_CLIENT_SSL_ENABLED = "dfs.ha.zkfc.client.ssl.enabled"; + public static final boolean DEFAULT_ZK_CLIENT_SSL_ENABLED = false; public static final String DFS_HA_NN_NOT_BECOME_ACTIVE_IN_SAFEMODE = "dfs.ha.nn.not-become-active-in-safemode"; public static final boolean DFS_HA_NN_NOT_BECOME_ACTIVE_IN_SAFEMODE_DEFAULT = diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/tools/DFSZKFailoverController.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/tools/DFSZKFailoverController.java index 4d67f20450dd9..2b09f81301dde 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/tools/DFSZKFailoverController.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/tools/DFSZKFailoverController.java @@ -294,4 +294,11 @@ public List getAllOtherNodes() { } return targets; } + + @Override + protected boolean isSSLEnabled() { + return conf.getBoolean( + DFSConfigKeys.ZK_CLIENT_SSL_ENABLED, + DFSConfigKeys.DEFAULT_ZK_CLIENT_SSL_ENABLED); + } } diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/resources/hdfs-default.xml b/hadoop-hdfs-project/hadoop-hdfs/src/main/resources/hdfs-default.xml index 4ff825d642d54..e73fc802a0453 100755 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/resources/hdfs-default.xml +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/resources/hdfs-default.xml @@ -3756,6 +3756,14 @@ + + dfs.ha.zkfc.client.ssl.enabled + false + + Enable SSL/TLS encryption for the ZooKeeper communication from ZKFC. 
+ + + dfs.ha.nn.not-become-active-in-safemode false diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/ActiveStandbyElectorBasedElectorService.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/ActiveStandbyElectorBasedElectorService.java index 564dbc181f1a9..989c9c53cd0a9 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/ActiveStandbyElectorBasedElectorService.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/ActiveStandbyElectorBasedElectorService.java @@ -18,6 +18,7 @@ package org.apache.hadoop.yarn.server.resourcemanager; import org.apache.hadoop.classification.VisibleForTesting; +import org.apache.hadoop.security.SecurityUtil; import org.apache.hadoop.thirdparty.protobuf.InvalidProtocolBufferException; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -104,8 +105,12 @@ protected void serviceInit(Configuration conf) conf.getInt(YarnConfiguration.RM_HA_FC_ELECTOR_ZK_RETRIES_KEY, conf .getInt(CommonConfigurationKeys.HA_FC_ELECTOR_ZK_OP_RETRIES_KEY, CommonConfigurationKeys.HA_FC_ELECTOR_ZK_OP_RETRIES_DEFAULT)); + boolean isSSLEnabled = conf.getBoolean(YarnConfiguration.RM_ZK_CLIENT_SSL_ENABLED, + YarnConfiguration.DEFAULT_RM_ZK_CLIENT_SSL_ENABLED); + SecurityUtil.TruststoreKeystore truststoreKeystore + = isSSLEnabled ? new SecurityUtil.TruststoreKeystore(conf) : null; elector = new ActiveStandbyElector(zkQuorum, (int) zkSessionTimeout, - electionZNode, zkAcls, zkAuths, this, maxRetryNum, false); + electionZNode, zkAcls, zkAuths, this, maxRetryNum, false, truststoreKeystore); elector.ensureParentZNode(); if (!isParentZnodeSafe(clusterId)) { From 9d48af8d7027fb2c5124f028356446acbe3e703f Mon Sep 17 00:00:00 2001 From: huhaiyang Date: Tue, 24 Oct 2023 05:06:02 +0800 Subject: [PATCH 08/23] HADOOP-18868. 
Optimize the configuration and use of callqueue overflow trigger failover (#5998) --- .../apache/hadoop/ipc/CallQueueManager.java | 72 ++++++++++++++++--- .../org/apache/hadoop/ipc/FairCallQueue.java | 26 +++++-- .../java/org/apache/hadoop/ipc/Server.java | 10 +++ .../src/main/resources/core-default.xml | 12 +++- .../conf/TestCommonConfigurationFields.java | 4 ++ .../hadoop/ipc/TestCallQueueManager.java | 26 +++++++ .../apache/hadoop/ipc/TestFairCallQueue.java | 12 ++-- .../apache/hadoop/TestRefreshCallQueue.java | 4 +- 8 files changed, 139 insertions(+), 27 deletions(-) diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ipc/CallQueueManager.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ipc/CallQueueManager.java index fa6f34adaf3bd..2cc96f4c16130 100644 --- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ipc/CallQueueManager.java +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ipc/CallQueueManager.java @@ -18,6 +18,9 @@ package org.apache.hadoop.ipc; +import static org.apache.hadoop.fs.CommonConfigurationKeys.IPC_CALLQUEUE_SERVER_FAILOVER_ENABLE; +import static org.apache.hadoop.fs.CommonConfigurationKeys.IPC_CALLQUEUE_SERVER_FAILOVER_ENABLE_DEFAULT; + import java.io.IOException; import java.lang.reflect.Constructor; import java.lang.reflect.InvocationTargetException; @@ -63,7 +66,7 @@ static Class convertSchedulerClass( } private volatile boolean clientBackOffEnabled; - private boolean serverFailOverEnabled; + private volatile boolean serverFailOverEnabled; // Atomic refs point to active callQueue // We have two so we can better control swapping @@ -81,18 +84,15 @@ public CallQueueManager(Class> backingClass, namespace, conf); int[] capacityWeights = parseCapacityWeights(priorityLevels, namespace, conf); + this.serverFailOverEnabled = getServerFailOverEnable(namespace, conf); BlockingQueue bq = createCallQueueInstance(backingClass, priorityLevels, maxQueueSize, namespace, capacityWeights, conf); this.clientBackOffEnabled = clientBackOffEnabled; - this.serverFailOverEnabled = conf.getBoolean( - namespace + "." + - CommonConfigurationKeys.IPC_CALLQUEUE_SERVER_FAILOVER_ENABLE, - CommonConfigurationKeys.IPC_CALLQUEUE_SERVER_FAILOVER_ENABLE_DEFAULT); this.putRef = new AtomicReference>(bq); this.takeRef = new AtomicReference>(bq); LOG.info("Using callQueue: {}, queueCapacity: {}, " + - "scheduler: {}, ipcBackoff: {}.", - backingClass, maxQueueSize, schedulerClass, clientBackOffEnabled); + "scheduler: {}, ipcBackoff: {}, ipcFailOver: {}.", + backingClass, maxQueueSize, schedulerClass, clientBackOffEnabled, serverFailOverEnabled); } @VisibleForTesting // only! @@ -105,6 +105,41 @@ public CallQueueManager(Class> backingClass, this.serverFailOverEnabled = serverFailOverEnabled; } + /** + * Return boolean value configured by property 'ipc..callqueue.overflow.trigger.failover' + * if it is present. If the config is not present, default config + * (without port) is used to derive class i.e 'ipc.callqueue.overflow.trigger.failover', + * and derived value is returned if configured. Otherwise, default value + * {@link CommonConfigurationKeys#IPC_CALLQUEUE_SERVER_FAILOVER_ENABLE_DEFAULT} is returned. + * + * @param namespace Namespace "ipc" + "." + Server's listener port. + * @param conf Configuration properties. + * @return Value returned based on configuration. + */ + private boolean getServerFailOverEnable(String namespace, Configuration conf) { + String propertyKey = namespace + "." 
+ + CommonConfigurationKeys.IPC_CALLQUEUE_SERVER_FAILOVER_ENABLE; + + if (conf.get(propertyKey) != null) { + return conf.getBoolean(propertyKey, + CommonConfigurationKeys.IPC_CALLQUEUE_SERVER_FAILOVER_ENABLE_DEFAULT); + } + + String[] nsPort = namespace.split("\\."); + if (nsPort.length == 2) { + // Only if ns is split with ".", we can separate namespace and port. + // In the absence of "ipc..callqueue.overflow.trigger.failover" property, + // we look up "ipc.callqueue.overflow.trigger.failover" property. + return conf.getBoolean(nsPort[0] + "." + + IPC_CALLQUEUE_SERVER_FAILOVER_ENABLE, IPC_CALLQUEUE_SERVER_FAILOVER_ENABLE_DEFAULT); + } + + // Otherwise return default value. + LOG.info("{} not specified set default value is {}", + IPC_CALLQUEUE_SERVER_FAILOVER_ENABLE, IPC_CALLQUEUE_SERVER_FAILOVER_ENABLE_DEFAULT); + return CommonConfigurationKeys.IPC_CALLQUEUE_SERVER_FAILOVER_ENABLE_DEFAULT; + } + private static T createScheduler( Class theClass, int priorityLevels, String ns, Configuration conf) { // Used for custom, configurable scheduler @@ -155,9 +190,9 @@ private > T createCallQueueInstance( // Used for custom, configurable callqueues try { Constructor ctor = theClass.getDeclaredConstructor(int.class, - int.class, String.class, int[].class, Configuration.class); - return ctor.newInstance(priorityLevels, maxLen, ns, - capacityWeights, conf); + int.class, String.class, int[].class, boolean.class, Configuration.class); + return ctor.newInstance(priorityLevels, maxLen, ns, capacityWeights, + this.serverFailOverEnabled, conf); } catch (RuntimeException e) { throw e; } catch (InvocationTargetException e) { @@ -199,6 +234,20 @@ boolean isClientBackoffEnabled() { return clientBackOffEnabled; } + @VisibleForTesting + public boolean isServerFailOverEnabled() { + return serverFailOverEnabled; + } + + @VisibleForTesting + public boolean isServerFailOverEnabledByQueue() { + BlockingQueue bq = putRef.get(); + if (bq instanceof FairCallQueue) { + return ((FairCallQueue) bq).isServerFailOverEnabled(); + } + return false; + } + // Based on policy to determine back off current call boolean shouldBackOff(Schedulable e) { return scheduler.shouldBackOff(e); @@ -421,6 +470,9 @@ public synchronized void swapQueue( RpcScheduler newScheduler = createScheduler(schedulerClass, priorityLevels, ns, conf); int[] capacityWeights = parseCapacityWeights(priorityLevels, ns, conf); + + // Update serverFailOverEnabled. 
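// Re-resolving the flag here, instead of reusing the value captured in the
// constructor, means a change to "ipc.[port_number].callqueue.overflow.trigger.failover"
// takes effect when the call queue is swapped (e.g. on a call-queue refresh),
// since createCallQueueInstance passes this field to the new queue's constructor.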
+ this.serverFailOverEnabled = getServerFailOverEnable(ns, conf); BlockingQueue newQ = createCallQueueInstance(queueClassToUse, priorityLevels, maxSize, ns, capacityWeights, conf); diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ipc/FairCallQueue.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ipc/FairCallQueue.java index d416e797fbeea..187a26bac8f45 100644 --- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ipc/FairCallQueue.java +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ipc/FairCallQueue.java @@ -35,7 +35,6 @@ import org.apache.hadoop.classification.VisibleForTesting; import org.apache.commons.lang3.NotImplementedException; import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.CommonConfigurationKeys; import org.apache.hadoop.ipc.CallQueueManager.CallQueueOverflowException; import org.apache.hadoop.metrics2.MetricsCollector; import org.apache.hadoop.metrics2.MetricsRecordBuilder; @@ -90,7 +89,16 @@ private void signalNotEmpty() { public FairCallQueue(int priorityLevels, int capacity, String ns, Configuration conf) { this(priorityLevels, capacity, ns, - CallQueueManager.getDefaultQueueCapacityWeights(priorityLevels), conf); + CallQueueManager.getDefaultQueueCapacityWeights(priorityLevels), + false, conf); + } + + @VisibleForTesting + public FairCallQueue(int priorityLevels, int capacity, String ns, boolean serverFailOverEnabled, + Configuration conf) { + this(priorityLevels, capacity, ns, + CallQueueManager.getDefaultQueueCapacityWeights(priorityLevels), + serverFailOverEnabled, conf); } /** @@ -101,18 +109,21 @@ public FairCallQueue(int priorityLevels, int capacity, String ns, * @param ns the prefix to use for configuration * @param capacityWeights the weights array for capacity allocation * among subqueues + * @param serverFailOverEnabled whether or not to enable callqueue overflow trigger failover + * for stateless servers when RPC call queue is filled * @param conf the configuration to read from * Notes: Each sub-queue has a capacity of `capacity / numSubqueues`. * The first or the highest priority sub-queue has an excess capacity * of `capacity % numSubqueues` */ public FairCallQueue(int priorityLevels, int capacity, String ns, - int[] capacityWeights, Configuration conf) { + int[] capacityWeights, boolean serverFailOverEnabled, Configuration conf) { if(priorityLevels < 1) { throw new IllegalArgumentException("Number of Priority Levels must be " + "at least 1"); } int numQueues = priorityLevels; + this.serverFailOverEnabled = serverFailOverEnabled; LOG.info("FairCallQueue is in use with " + numQueues + " queues with total capacity of " + capacity); @@ -135,10 +146,6 @@ public FairCallQueue(int priorityLevels, int capacity, String ns, } this.overflowedCalls.add(new AtomicLong(0)); } - this.serverFailOverEnabled = conf.getBoolean( - ns + "." 
+ - CommonConfigurationKeys.IPC_CALLQUEUE_SERVER_FAILOVER_ENABLE, - CommonConfigurationKeys.IPC_CALLQUEUE_SERVER_FAILOVER_ENABLE_DEFAULT); this.multiplexer = new WeightedRoundRobinMultiplexer(numQueues, ns, conf); // Make this the active source of metrics @@ -493,4 +500,9 @@ public long[] getOverflowedCalls() { public void setMultiplexer(RpcMultiplexer newMux) { this.multiplexer = newMux; } + + @VisibleForTesting + public boolean isServerFailOverEnabled() { + return serverFailOverEnabled; + } } diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ipc/Server.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ipc/Server.java index a594d2be01ccb..73c86c09fc79e 100644 --- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ipc/Server.java +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ipc/Server.java @@ -3857,6 +3857,16 @@ public void setClientBackoffEnabled(boolean value) { callQueue.setClientBackoffEnabled(value); } + @VisibleForTesting + public boolean isServerFailOverEnabled() { + return callQueue.isServerFailOverEnabled(); + } + + @VisibleForTesting + public boolean isServerFailOverEnabledByQueue() { + return callQueue.isServerFailOverEnabledByQueue(); + } + /** * The maximum size of the rpc call queue of this server. * @return The maximum size of the rpc call queue. diff --git a/hadoop-common-project/hadoop-common/src/main/resources/core-default.xml b/hadoop-common-project/hadoop-common/src/main/resources/core-default.xml index 24f6ca27847c2..6c3597a83fa69 100644 --- a/hadoop-common-project/hadoop-common/src/main/resources/core-default.xml +++ b/hadoop-common-project/hadoop-common/src/main/resources/core-default.xml @@ -2580,13 +2580,23 @@ The switch to turn S3A auditing on or off. - callqueue.overflow.trigger.failover + ipc.[port_number].callqueue.overflow.trigger.failover false Enable callqueue overflow trigger failover for stateless servers. + + ipc.callqueue.overflow.trigger.failover + false + + This property is used as fallback property in case + "ipc.[port_number].callqueue.overflow.trigger.failover" is not defined. + It determines whether or not to enable callqueue overflow trigger failover for stateless servers. + + + diff --git a/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/conf/TestCommonConfigurationFields.java b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/conf/TestCommonConfigurationFields.java index f7303fb0f5e1a..b07ba76e8eec0 100644 --- a/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/conf/TestCommonConfigurationFields.java +++ b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/conf/TestCommonConfigurationFields.java @@ -149,6 +149,10 @@ public void initializeMemberVariables() { xmlPropsToSkipCompare.add("fs.azure.saskey.usecontainersaskeyforallaccess"); xmlPropsToSkipCompare.add("fs.azure.user.agent.prefix"); + // Properties in enable callqueue overflow trigger failover for stateless servers. 
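// Both keys are declared in core-default.xml but neither literally matches a
// declared constant: the first carries the "[port_number]" placeholder and the
// second is the portless fallback, while the constant holds only the
// "callqueue.overflow.trigger.failover" suffix that gets prefixed with the
// per-port namespace. They are therefore skipped here, just like the
// ipc.[port_number].backoff.enable entries below.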
+ xmlPropsToSkipCompare.add("ipc.[port_number].callqueue.overflow.trigger.failover"); + xmlPropsToSkipCompare.add("ipc.callqueue.overflow.trigger.failover"); + // FairCallQueue configs that includes dynamic ports in its keys xmlPropsToSkipCompare.add("ipc.[port_number].backoff.enable"); xmlPropsToSkipCompare.add("ipc.backoff.enable"); diff --git a/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/ipc/TestCallQueueManager.java b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/ipc/TestCallQueueManager.java index 4a60520a364b6..545ddb40ff5fe 100644 --- a/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/ipc/TestCallQueueManager.java +++ b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/ipc/TestCallQueueManager.java @@ -19,6 +19,7 @@ package org.apache.hadoop.ipc; import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertFalse; import static org.junit.Assert.assertSame; import static org.junit.Assert.assertTrue; import static org.junit.Assert.fail; @@ -517,4 +518,29 @@ public void testCallQueueOverflowExceptions() throws Exception { verify(queue, times(0)).put(call); verify(queue, times(0)).add(call); } + + @Test + public void testCallQueueOverEnabled() { + // default ipc.callqueue.overflow.trigger.failover' configure false. + String ns = "ipc.8888"; + conf.setBoolean("ipc.callqueue.overflow.trigger.failover", false); + manager = new CallQueueManager<>(fcqueueClass, rpcSchedulerClass, false, + 10, ns, conf); + assertFalse(manager.isServerFailOverEnabled()); + assertFalse(manager.isServerFailOverEnabledByQueue()); + + // set ipc.8888.callqueue.overflow.trigger.failover configure true. + conf.setBoolean("ipc.8888.callqueue.overflow.trigger.failover", true); + manager = new CallQueueManager<>(fcqueueClass, rpcSchedulerClass, false, + 10, ns, conf); + assertTrue(manager.isServerFailOverEnabled()); + assertTrue(manager.isServerFailOverEnabledByQueue()); + + // set ipc.callqueue.overflow.trigger.failover' configure true. + conf.setBoolean("ipc.callqueue.overflow.trigger.failover", true); + manager = new CallQueueManager<>(fcqueueClass, rpcSchedulerClass, false, + 10, ns, conf); + assertTrue(manager.isServerFailOverEnabled()); + assertTrue(manager.isServerFailOverEnabledByQueue()); + } } \ No newline at end of file diff --git a/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/ipc/TestFairCallQueue.java b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/ipc/TestFairCallQueue.java index 1fed9a317642a..06b65dc4df3c5 100644 --- a/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/ipc/TestFairCallQueue.java +++ b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/ipc/TestFairCallQueue.java @@ -28,7 +28,6 @@ import static org.mockito.Mockito.when; import static org.mockito.Mockito.times; -import org.apache.hadoop.fs.CommonConfigurationKeys; import org.junit.Before; import org.junit.Test; import static org.junit.Assert.assertEquals; @@ -105,7 +104,7 @@ public void testTotalCapacityOfSubQueues() { fairCallQueue = new FairCallQueue(7, 1025, "ns", conf); assertThat(fairCallQueue.remainingCapacity()).isEqualTo(1025); fairCallQueue = new FairCallQueue(7, 1025, "ns", - new int[]{7, 6, 5, 4, 3, 2, 1}, conf); + new int[]{7, 6, 5, 4, 3, 2, 1}, false, conf); assertThat(fairCallQueue.remainingCapacity()).isEqualTo(1025); } @@ -170,7 +169,7 @@ public void testQueueCapacity() { // default weights i.e. 
all queues share capacity fcq = new FairCallQueue(numQueues, 4, "ns", conf); FairCallQueue fcq1 = new FairCallQueue( - numQueues, capacity, "ns", new int[]{1, 3}, conf); + numQueues, capacity, "ns", new int[]{1, 3}, false, conf); for (int i=0; i < capacity; i++) { Schedulable call = mockCall("u", i%2); @@ -221,11 +220,10 @@ public void testInsertionWithFailover() { Configuration conf = new Configuration(); // Config for server to throw StandbyException instead of the // regular RetriableException if call queue is full. - conf.setBoolean( - "ns." + CommonConfigurationKeys.IPC_CALLQUEUE_SERVER_FAILOVER_ENABLE, - true); + // 3 queues, 2 slots each. - fcq = Mockito.spy(new FairCallQueue<>(3, 6, "ns", conf)); + fcq = Mockito.spy(new FairCallQueue<>(3, 6, "ns", + true, conf)); Schedulable p0 = mockCall("a", 0); Schedulable p1 = mockCall("b", 1); diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/TestRefreshCallQueue.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/TestRefreshCallQueue.java index e21a5a307308a..873a524c988c3 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/TestRefreshCallQueue.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/TestRefreshCallQueue.java @@ -88,7 +88,7 @@ public void tearDown() throws IOException { @SuppressWarnings("serial") public static class MockCallQueue extends LinkedBlockingQueue { public MockCallQueue(int levels, int cap, String ns, int[] capacityWeights, - Configuration conf) { + boolean serverFailOverEnabled, Configuration conf) { super(cap); mockQueueConstructions++; } @@ -172,6 +172,6 @@ public void testRefreshCallQueueWithFairCallQueue() throws Exception { // check callQueueSize has changed assertEquals(150 * serviceHandlerCount, rpcServer.getClientRpcServer() .getMaxQueueSize()); - } + } } \ No newline at end of file From 80a22a736ee886d78ef809bc06aa27f1dff837cb Mon Sep 17 00:00:00 2001 From: slfan1989 <55643692+slfan1989@users.noreply.github.com> Date: Tue, 24 Oct 2023 09:28:05 +0800 Subject: [PATCH 09/23] YARN-11500. [Addendum] Fix typos in hadoop-yarn-server-common#federation. (#6212) Contributed by Shilun Fan. 
Reviewed-by: Inigo Goiri Signed-off-by: Shilun Fan --- .../policies/AbstractConfigurableFederationPolicy.java | 2 +- .../FederationPolicyInitializationContextValidator.java | 8 ++++---- .../policies/amrmproxy/BroadcastAMRMProxyPolicy.java | 2 +- .../amrmproxy/LocalityMulticastAMRMProxyPolicy.java | 2 +- .../policies/manager/AbstractPolicyManager.java | 8 ++++---- .../policies/manager/FederationPolicyManager.java | 4 ++-- 6 files changed, 13 insertions(+), 13 deletions(-) diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-common/src/main/java/org/apache/hadoop/yarn/server/federation/policies/AbstractConfigurableFederationPolicy.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-common/src/main/java/org/apache/hadoop/yarn/server/federation/policies/AbstractConfigurableFederationPolicy.java index 7234d46b61261..c70b7b5eb5435 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-common/src/main/java/org/apache/hadoop/yarn/server/federation/policies/AbstractConfigurableFederationPolicy.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-common/src/main/java/org/apache/hadoop/yarn/server/federation/policies/AbstractConfigurableFederationPolicy.java @@ -53,7 +53,7 @@ public void reinitialize( initializationContext.getSubClusterPolicyConfiguration().getParams()); // if nothing has changed skip the rest of initialization - // and signal to childs that the reinit is free via isDirty var. + // and signal to children that the reinit is free via isDirty var. if (policyInfo != null && policyInfo.equals(newPolicyInfo)) { isDirty = false; return; diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-common/src/main/java/org/apache/hadoop/yarn/server/federation/policies/FederationPolicyInitializationContextValidator.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-common/src/main/java/org/apache/hadoop/yarn/server/federation/policies/FederationPolicyInitializationContextValidator.java index da63bc1de468c..1d430751036af 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-common/src/main/java/org/apache/hadoop/yarn/server/federation/policies/FederationPolicyInitializationContextValidator.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-common/src/main/java/org/apache/hadoop/yarn/server/federation/policies/FederationPolicyInitializationContextValidator.java @@ -40,25 +40,25 @@ public static void validate( if (policyContext == null) { throw new FederationPolicyInitializationException( "The FederationPolicyInitializationContext provided is null. Cannot" - + " reinitalize " + "successfully."); + + " reinitialize " + "successfully."); } if (policyContext.getFederationStateStoreFacade() == null) { throw new FederationPolicyInitializationException( "The FederationStateStoreFacade provided is null. Cannot" - + " reinitalize successfully."); + + " reinitialize successfully."); } if (policyContext.getFederationSubclusterResolver() == null) { throw new FederationPolicyInitializationException( "The FederationSubclusterResolver provided is null. Cannot" - + " reinitalize successfully."); + + " reinitialize successfully."); } if (policyContext.getSubClusterPolicyConfiguration() == null) { throw new FederationPolicyInitializationException( "The SubClusterPolicyConfiguration provided is null. 
Cannot " - + "reinitalize successfully."); + + "reinitialize successfully."); } String intendedType = diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-common/src/main/java/org/apache/hadoop/yarn/server/federation/policies/amrmproxy/BroadcastAMRMProxyPolicy.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-common/src/main/java/org/apache/hadoop/yarn/server/federation/policies/amrmproxy/BroadcastAMRMProxyPolicy.java index 643bfa6da0117..36074f989fd1a 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-common/src/main/java/org/apache/hadoop/yarn/server/federation/policies/amrmproxy/BroadcastAMRMProxyPolicy.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-common/src/main/java/org/apache/hadoop/yarn/server/federation/policies/amrmproxy/BroadcastAMRMProxyPolicy.java @@ -41,7 +41,7 @@ public class BroadcastAMRMProxyPolicy extends AbstractAMRMProxyPolicy { public void reinitialize( FederationPolicyInitializationContext policyContext) throws FederationPolicyInitializationException { - // overrides initialize to avoid weight checks that do no apply for + // overrides initialize to avoid weight checks that do not apply for // this policy. FederationPolicyInitializationContextValidator .validate(policyContext, this.getClass().getCanonicalName()); diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-common/src/main/java/org/apache/hadoop/yarn/server/federation/policies/amrmproxy/LocalityMulticastAMRMProxyPolicy.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-common/src/main/java/org/apache/hadoop/yarn/server/federation/policies/amrmproxy/LocalityMulticastAMRMProxyPolicy.java index a98ec138f604a..2cd1eacaa665f 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-common/src/main/java/org/apache/hadoop/yarn/server/federation/policies/amrmproxy/LocalityMulticastAMRMProxyPolicy.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-common/src/main/java/org/apache/hadoop/yarn/server/federation/policies/amrmproxy/LocalityMulticastAMRMProxyPolicy.java @@ -394,7 +394,7 @@ private void splitAnyRequests(List originalResourceRequests, targetSubclusters = allocationBookkeeper.getActiveAndEnabledSC(); } - // SECOND: pick how much to ask to each RM for each request + // SECOND: pick how much to ask each RM for each request splitIndividualAny(resourceRequest, targetSubclusters, allocationBookkeeper); } diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-common/src/main/java/org/apache/hadoop/yarn/server/federation/policies/manager/AbstractPolicyManager.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-common/src/main/java/org/apache/hadoop/yarn/server/federation/policies/manager/AbstractPolicyManager.java index f7a89c614feaf..aa0742d090c2a 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-common/src/main/java/org/apache/hadoop/yarn/server/federation/policies/manager/AbstractPolicyManager.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-common/src/main/java/org/apache/hadoop/yarn/server/federation/policies/manager/AbstractPolicyManager.java @@ -53,10 +53,10 @@ public abstract class AbstractPolicyManager implements * @param federationPolicyContext the current context * @param oldInstance the existing (possibly null) instance. 
* - * @return a valid and fully reinitalized {@link FederationAMRMProxyPolicy} + * @return a valid and fully reinitialized {@link FederationAMRMProxyPolicy} * instance * - * @throws FederationPolicyInitializationException if the reinitalization is + * @throws FederationPolicyInitializationException if the reinitialization is * not valid, and ensure * previous state is preserved */ @@ -89,10 +89,10 @@ public FederationAMRMProxyPolicy getAMRMPolicy( * @param federationPolicyContext the current context * @param oldInstance the existing (possibly null) instance. * - * @return a valid and fully reinitalized {@link FederationRouterPolicy} + * @return a valid and fully reinitialized {@link FederationRouterPolicy} * instance * - * @throws FederationPolicyInitializationException if the reinitalization is + * @throws FederationPolicyInitializationException if the reinitialization is * not valid, and ensure * previous state is preserved */ diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-common/src/main/java/org/apache/hadoop/yarn/server/federation/policies/manager/FederationPolicyManager.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-common/src/main/java/org/apache/hadoop/yarn/server/federation/policies/manager/FederationPolicyManager.java index 23f7cf3ae38bc..3aeb7d718e2d4 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-common/src/main/java/org/apache/hadoop/yarn/server/federation/policies/manager/FederationPolicyManager.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-common/src/main/java/org/apache/hadoop/yarn/server/federation/policies/manager/FederationPolicyManager.java @@ -48,7 +48,7 @@ public interface FederationPolicyManager { * instance of {@link FederationAMRMProxyPolicy} reinitialized with the * current context, otherwise a new instance initialized with the current * context is provided. If the instance is compatible with the current class - * the implementors should attempt to reinitalize (retaining state). To affect + * the implementors should attempt to reinitialize (retaining state). To affect * a complete policy reset oldInstance should be null. * * @param policyContext the current context @@ -70,7 +70,7 @@ FederationAMRMProxyPolicy getAMRMPolicy( * instance of {@link FederationRouterPolicy} reinitialized with the current * context, otherwise a new instance initialized with the current context is * provided. If the instance is compatible with the current class the - * implementors should attempt to reinitalize (retaining state). To affect a + * implementors should attempt to reinitialize (retaining state). To affect a * complete policy reset oldInstance should be set to null. * * @param policyContext the current context From 9c7e5b66fa9af9c750df640fca9680d97d7ecdc5 Mon Sep 17 00:00:00 2001 From: slfan1989 <55643692+slfan1989@users.noreply.github.com> Date: Tue, 24 Oct 2023 09:36:06 +0800 Subject: [PATCH 10/23] YARN-11576. Improve FederationInterceptorREST AuditLog. (#6117) Contributed by Shilun Fan. 
Reviewed-by: Inigo Goiri Signed-off-by: Shilun Fan --- .../yarn/server/router/RouterAuditLogger.java | 27 ++ .../webapp/FederationInterceptorREST.java | 437 +++++++++++++++++- 2 files changed, 460 insertions(+), 4 deletions(-) diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-router/src/main/java/org/apache/hadoop/yarn/server/router/RouterAuditLogger.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-router/src/main/java/org/apache/hadoop/yarn/server/router/RouterAuditLogger.java index bb814b652831a..b0902a5e5c14e 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-router/src/main/java/org/apache/hadoop/yarn/server/router/RouterAuditLogger.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-router/src/main/java/org/apache/hadoop/yarn/server/router/RouterAuditLogger.java @@ -50,6 +50,7 @@ public static class AuditConstants { public static final String FORCE_KILL_APP = "Force Kill App"; public static final String GET_APP_REPORT = "Get Application Report"; public static final String TARGET_CLIENT_RM_SERVICE = "RouterClientRMService"; + public static final String TARGET_WEB_SERVICE = "RouterWebServices"; public static final String UNKNOWN = "UNKNOWN"; public static final String GET_APPLICATIONS = "Get Applications"; public static final String GET_CLUSTERMETRICS = "Get ClusterMetrics"; @@ -82,6 +83,32 @@ public static class AuditConstants { public static final String GET_ATTRIBUTESTONODES = "Get AttributesToNodes"; public static final String GET_CLUSTERNODEATTRIBUTES = "Get ClusterNodeAttributes"; public static final String GET_NODESTOATTRIBUTES = "Get NodesToAttributes"; + public static final String GET_CLUSTERINFO = "Get ClusterInfo"; + public static final String GET_CLUSTERUSERINFO = "Get ClusterUserInfo"; + public static final String GET_SCHEDULERINFO = "Get SchedulerInfo"; + public static final String DUMP_SCHEDULERLOGS = "Dump SchedulerLogs"; + public static final String GET_ACTIVITIES = "Get Activities"; + public static final String GET_BULKACTIVITIES = "Get BulkActivities"; + public static final String GET_APPACTIVITIES = "Get AppActivities"; + public static final String GET_APPSTATISTICS = "Get AppStatistics"; + public static final String GET_RMNODELABELS = "Get RMNodeLabels"; + public static final String REPLACE_LABELSONNODES = "Replace LabelsOnNodes"; + public static final String REPLACE_LABELSONNODE = "Replace LabelsOnNode"; + public static final String GET_CLUSTER_NODELABELS = "Get ClusterNodeLabels"; + public static final String ADD_TO_CLUSTER_NODELABELS = "Add To ClusterNodeLabels"; + public static final String REMOVE_FROM_CLUSTERNODELABELS = "Remove From ClusterNodeLabels"; + public static final String GET_LABELS_ON_NODE = "Get LabelsOnNode"; + public static final String GET_APP_PRIORITY = "Get AppPriority"; + public static final String UPDATE_APP_QUEUE = "Update AppQueue"; + public static final String POST_DELEGATION_TOKEN = "Post DelegationToken"; + public static final String POST_DELEGATION_TOKEN_EXPIRATION = "Post DelegationTokenExpiration"; + public static final String GET_APP_TIMEOUT = "Get App Timeout"; + public static final String GET_APP_TIMEOUTS = "Get App Timeouts"; + public static final String CHECK_USER_ACCESS_TO_QUEUE = "Check User AccessToQueue"; + public static final String GET_APP_ATTEMPT = "Get AppAttempt"; + public static final String GET_CONTAINER = "Get Container"; + public static final String UPDATE_SCHEDULER_CONFIGURATION = "Update 
SchedulerConfiguration"; + public static final String GET_SCHEDULER_CONFIGURATION = "Get SchedulerConfiguration"; } public static void logSuccess(String user, String operation, String target) { diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-router/src/main/java/org/apache/hadoop/yarn/server/router/webapp/FederationInterceptorREST.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-router/src/main/java/org/apache/hadoop/yarn/server/router/webapp/FederationInterceptorREST.java index 71268ac672d98..7f9446878b3e6 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-router/src/main/java/org/apache/hadoop/yarn/server/router/webapp/FederationInterceptorREST.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-router/src/main/java/org/apache/hadoop/yarn/server/router/webapp/FederationInterceptorREST.java @@ -120,6 +120,7 @@ import org.apache.hadoop.yarn.server.resourcemanager.webapp.dao.NodeLabelInfo; import org.apache.hadoop.yarn.server.resourcemanager.webapp.dao.ReservationDefinitionInfo; import org.apache.hadoop.yarn.server.resourcemanager.webapp.dao.NodeToLabelsEntry; +import org.apache.hadoop.yarn.server.router.RouterAuditLogger; import org.apache.hadoop.yarn.server.router.RouterMetrics; import org.apache.hadoop.yarn.server.router.RouterServerUtil; import org.apache.hadoop.yarn.server.router.clientrm.ClientMethod; @@ -151,6 +152,50 @@ import org.apache.hadoop.thirdparty.com.google.common.util.concurrent.ThreadFactoryBuilder; import static org.apache.hadoop.yarn.server.federation.utils.FederationStateStoreFacade.getRandomActiveSubCluster; +import static org.apache.hadoop.yarn.server.router.RouterAuditLogger.AuditConstants.GET_NEW_APP; +import static org.apache.hadoop.yarn.server.router.RouterAuditLogger.AuditConstants.SUBMIT_NEW_APP; +import static org.apache.hadoop.yarn.server.router.RouterAuditLogger.AuditConstants.GET_CLUSTERINFO; +import static org.apache.hadoop.yarn.server.router.RouterAuditLogger.AuditConstants.GET_CLUSTERUSERINFO; +import static org.apache.hadoop.yarn.server.router.RouterAuditLogger.AuditConstants.GET_SCHEDULERINFO; +import static org.apache.hadoop.yarn.server.router.RouterAuditLogger.AuditConstants.DUMP_SCHEDULERLOGS; +import static org.apache.hadoop.yarn.server.router.RouterAuditLogger.AuditConstants.GET_ACTIVITIES; +import static org.apache.hadoop.yarn.server.router.RouterAuditLogger.AuditConstants.GET_BULKACTIVITIES; +import static org.apache.hadoop.yarn.server.router.RouterAuditLogger.AuditConstants.GET_APPACTIVITIES; +import static org.apache.hadoop.yarn.server.router.RouterAuditLogger.AuditConstants.GET_APPSTATISTICS; +import static org.apache.hadoop.yarn.server.router.RouterAuditLogger.AuditConstants.GET_NODETOLABELS; +import static org.apache.hadoop.yarn.server.router.RouterAuditLogger.AuditConstants.GET_RMNODELABELS; +import static org.apache.hadoop.yarn.server.router.RouterAuditLogger.AuditConstants.GET_LABELSTONODES; +import static org.apache.hadoop.yarn.server.router.RouterAuditLogger.AuditConstants.UNKNOWN; +import static org.apache.hadoop.yarn.server.router.RouterAuditLogger.AuditConstants.TARGET_WEB_SERVICE; +import static org.apache.hadoop.yarn.server.router.RouterAuditLogger.AuditConstants.REPLACE_LABELSONNODES; +import static org.apache.hadoop.yarn.server.router.RouterAuditLogger.AuditConstants.REPLACE_LABELSONNODE; +import static org.apache.hadoop.yarn.server.router.RouterAuditLogger.AuditConstants.GET_CLUSTER_NODELABELS; +import 
static org.apache.hadoop.yarn.server.router.RouterAuditLogger.AuditConstants.ADD_TO_CLUSTER_NODELABELS; +import static org.apache.hadoop.yarn.server.router.RouterAuditLogger.AuditConstants.REMOVE_FROM_CLUSTERNODELABELS; +import static org.apache.hadoop.yarn.server.router.RouterAuditLogger.AuditConstants.GET_LABELS_ON_NODE; +import static org.apache.hadoop.yarn.server.router.RouterAuditLogger.AuditConstants.GET_APP_PRIORITY; +import static org.apache.hadoop.yarn.server.router.RouterAuditLogger.AuditConstants.GET_QUEUEINFO; +import static org.apache.hadoop.yarn.server.router.RouterAuditLogger.AuditConstants.UPDATE_APPLICATIONPRIORITY; +import static org.apache.hadoop.yarn.server.router.RouterAuditLogger.AuditConstants.UPDATE_APP_QUEUE; +import static org.apache.hadoop.yarn.server.router.RouterAuditLogger.AuditConstants.POST_DELEGATION_TOKEN; +import static org.apache.hadoop.yarn.server.router.RouterAuditLogger.AuditConstants.POST_DELEGATION_TOKEN_EXPIRATION; +import static org.apache.hadoop.yarn.server.router.RouterAuditLogger.AuditConstants.CANCEL_DELEGATIONTOKEN; +import static org.apache.hadoop.yarn.server.router.RouterAuditLogger.AuditConstants.GET_NEW_RESERVATION; +import static org.apache.hadoop.yarn.server.router.RouterAuditLogger.AuditConstants.SUBMIT_RESERVATION; +import static org.apache.hadoop.yarn.server.router.RouterAuditLogger.AuditConstants.UPDATE_RESERVATION; +import static org.apache.hadoop.yarn.server.router.RouterAuditLogger.AuditConstants.DELETE_RESERVATION; +import static org.apache.hadoop.yarn.server.router.RouterAuditLogger.AuditConstants.LIST_RESERVATIONS; +import static org.apache.hadoop.yarn.server.router.RouterAuditLogger.AuditConstants.GET_APP_TIMEOUT; +import static org.apache.hadoop.yarn.server.router.RouterAuditLogger.AuditConstants.GET_APP_TIMEOUTS; +import static org.apache.hadoop.yarn.server.router.RouterAuditLogger.AuditConstants.UPDATE_APPLICATIONTIMEOUTS; +import static org.apache.hadoop.yarn.server.router.RouterAuditLogger.AuditConstants.GET_APPLICATION_ATTEMPTS; +import static org.apache.hadoop.yarn.server.router.RouterAuditLogger.AuditConstants.CHECK_USER_ACCESS_TO_QUEUE; +import static org.apache.hadoop.yarn.server.router.RouterAuditLogger.AuditConstants.GET_APP_ATTEMPT; +import static org.apache.hadoop.yarn.server.router.RouterAuditLogger.AuditConstants.GET_CONTAINERS; +import static org.apache.hadoop.yarn.server.router.RouterAuditLogger.AuditConstants.GET_CONTAINER; +import static org.apache.hadoop.yarn.server.router.RouterAuditLogger.AuditConstants.UPDATE_SCHEDULER_CONFIGURATION; +import static org.apache.hadoop.yarn.server.router.RouterAuditLogger.AuditConstants.GET_SCHEDULER_CONFIGURATION; +import static org.apache.hadoop.yarn.server.router.RouterAuditLogger.AuditConstants.SIGNAL_TOCONTAINER; import static org.apache.hadoop.yarn.server.router.webapp.RouterWebServiceUtil.extractToken; import static org.apache.hadoop.yarn.server.router.webapp.RouterWebServiceUtil.getKerberosUserGroupInformation; @@ -360,9 +405,13 @@ public Response createNewApplication(HttpServletRequest hsr) } catch (FederationPolicyException e) { // If a FederationPolicyException is thrown, the service is unavailable. 
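// The failure is now audited (user, GET_NEW_APP operation, RouterWebServices
// target and the exception message) before the 503 response is returned,
// mirroring the pattern applied to the other REST methods in this patch.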
routerMetrics.incrAppsFailedCreated(); + RouterAuditLogger.logFailure(getUser().getShortUserName(), GET_NEW_APP, UNKNOWN, + TARGET_WEB_SERVICE, e.getLocalizedMessage()); return Response.status(Status.SERVICE_UNAVAILABLE).entity(e.getLocalizedMessage()).build(); } catch (Exception e) { routerMetrics.incrAppsFailedCreated(); + RouterAuditLogger.logFailure(getUser().getShortUserName(), GET_NEW_APP, UNKNOWN, + TARGET_WEB_SERVICE, e.getLocalizedMessage()); return Response.status(Status.INTERNAL_SERVER_ERROR).entity(e.getLocalizedMessage()).build(); } @@ -370,6 +419,8 @@ public Response createNewApplication(HttpServletRequest hsr) String errMsg = "Fail to create a new application."; LOG.error(errMsg); routerMetrics.incrAppsFailedCreated(); + RouterAuditLogger.logFailure(getUser().getShortUserName(), GET_NEW_APP, UNKNOWN, + TARGET_WEB_SERVICE, errMsg); return Response.status(Status.INTERNAL_SERVER_ERROR).entity(errMsg).build(); } @@ -400,6 +451,9 @@ private Response invokeGetNewApplication(Map subCl try { Response response = interceptor.createNewApplication(hsr); if (response != null && response.getStatus() == HttpServletResponse.SC_OK) { + ApplicationId applicationId = ApplicationId.fromString(response.getEntity().toString()); + RouterAuditLogger.logSuccess(getUser().getShortUserName(), GET_NEW_APP, + TARGET_WEB_SERVICE, applicationId, subClusterId); return response; } } catch (Exception e) { @@ -490,6 +544,8 @@ public Response submitApplication(ApplicationSubmissionContextInfo newApp, HttpS routerMetrics.incrAppsFailedSubmitted(); String errMsg = "Missing ApplicationSubmissionContextInfo or " + "applicationSubmissionContext information."; + RouterAuditLogger.logFailure(getUser().getShortUserName(), SUBMIT_NEW_APP, UNKNOWN, + TARGET_WEB_SERVICE, errMsg); return Response.status(Status.BAD_REQUEST).entity(errMsg).build(); } @@ -498,6 +554,8 @@ public Response submitApplication(ApplicationSubmissionContextInfo newApp, HttpS RouterServerUtil.validateApplicationId(applicationId); } catch (IllegalArgumentException e) { routerMetrics.incrAppsFailedSubmitted(); + RouterAuditLogger.logFailure(getUser().getShortUserName(), SUBMIT_NEW_APP, UNKNOWN, + TARGET_WEB_SERVICE, e.getMessage()); return Response.status(Status.BAD_REQUEST).entity(e.getLocalizedMessage()).build(); } @@ -515,6 +573,8 @@ public Response submitApplication(ApplicationSubmissionContextInfo newApp, HttpS } } catch (Exception e) { routerMetrics.incrAppsFailedSubmitted(); + RouterAuditLogger.logFailure(getUser().getShortUserName(), SUBMIT_NEW_APP, UNKNOWN, + TARGET_WEB_SERVICE, e.getMessage()); return Response.status(Status.SERVICE_UNAVAILABLE).entity(e.getLocalizedMessage()).build(); } @@ -522,6 +582,8 @@ public Response submitApplication(ApplicationSubmissionContextInfo newApp, HttpS String errMsg = String.format("Application %s with appId %s failed to be submitted.", newApp.getApplicationName(), newApp.getApplicationId()); LOG.error(errMsg); + RouterAuditLogger.logFailure(getUser().getShortUserName(), SUBMIT_NEW_APP, UNKNOWN, + TARGET_WEB_SERVICE, errMsg); return Response.status(Status.SERVICE_UNAVAILABLE).entity(errMsg).build(); } @@ -577,11 +639,15 @@ private Response invokeSubmitApplication(ApplicationSubmissionContextInfo submis if (response != null && response.getStatus() == HttpServletResponse.SC_ACCEPTED) { LOG.info("Application {} with appId {} submitted on {}.", context.getApplicationName(), applicationId, subClusterId); + RouterAuditLogger.logSuccess(getUser().getShortUserName(), SUBMIT_NEW_APP, + TARGET_WEB_SERVICE, 
applicationId, subClusterId); return response; } String msg = String.format("application %s failed to be submitted.", applicationId); throw new YarnException(msg); } catch (Exception e) { + RouterAuditLogger.logFailure(getUser().getShortUserName(), SUBMIT_NEW_APP, UNKNOWN, + TARGET_WEB_SERVICE, e.getMessage(), applicationId, subClusterId); LOG.warn("Unable to submit the application {} to SubCluster {}.", applicationId, subClusterId, e); if (subClusterId != null) { @@ -1103,16 +1169,24 @@ public ClusterInfo getClusterInfo() { federationClusterInfo.getList().add(clusterInfo); }); long stopTime = Time.now(); + RouterAuditLogger.logSuccess(getUser().getShortUserName(), GET_CLUSTERINFO, + TARGET_WEB_SERVICE); routerMetrics.succeededGetClusterInfoRetrieved(stopTime - startTime); return federationClusterInfo; } catch (NotFoundException e) { routerMetrics.incrGetClusterInfoFailedRetrieved(); + RouterAuditLogger.logFailure(getUser().getShortUserName(), GET_CLUSTERINFO, UNKNOWN, + TARGET_WEB_SERVICE, e.getLocalizedMessage()); RouterServerUtil.logAndThrowRunTimeException("Get all active sub cluster(s) error.", e); } catch (YarnException | IOException e) { routerMetrics.incrGetClusterInfoFailedRetrieved(); + RouterAuditLogger.logFailure(getUser().getShortUserName(), GET_CLUSTERINFO, UNKNOWN, + TARGET_WEB_SERVICE, e.getLocalizedMessage()); RouterServerUtil.logAndThrowRunTimeException("getClusterInfo error.", e); } routerMetrics.incrGetClusterInfoFailedRetrieved(); + RouterAuditLogger.logFailure(getUser().getShortUserName(), GET_CLUSTERINFO, UNKNOWN, + TARGET_WEB_SERVICE, "getClusterInfo error."); throw new RuntimeException("getClusterInfo error."); } @@ -1144,16 +1218,24 @@ public ClusterUserInfo getClusterUserInfo(HttpServletRequest hsr) { federationClusterUserInfo.getList().add(clusterUserInfo); }); long stopTime = Time.now(); + RouterAuditLogger.logSuccess(getUser().getShortUserName(), GET_CLUSTERUSERINFO, + TARGET_WEB_SERVICE); routerMetrics.succeededGetClusterUserInfoRetrieved(stopTime - startTime); return federationClusterUserInfo; } catch (NotFoundException e) { routerMetrics.incrGetClusterUserInfoFailedRetrieved(); + RouterAuditLogger.logFailure(getUser().getShortUserName(), GET_CLUSTERUSERINFO, UNKNOWN, + TARGET_WEB_SERVICE, e.getLocalizedMessage()); RouterServerUtil.logAndThrowRunTimeException("Get all active sub cluster(s) error.", e); } catch (YarnException | IOException e) { routerMetrics.incrGetClusterUserInfoFailedRetrieved(); + RouterAuditLogger.logFailure(getUser().getShortUserName(), GET_CLUSTERUSERINFO, UNKNOWN, + TARGET_WEB_SERVICE, e.getLocalizedMessage()); RouterServerUtil.logAndThrowRunTimeException("getClusterUserInfo error.", e); } routerMetrics.incrGetClusterUserInfoFailedRetrieved(); + RouterAuditLogger.logFailure(getUser().getShortUserName(), GET_CLUSTERUSERINFO, UNKNOWN, + TARGET_WEB_SERVICE, "getClusterUserInfo error."); throw new RuntimeException("getClusterUserInfo error."); } @@ -1182,16 +1264,24 @@ public SchedulerTypeInfo getSchedulerInfo() { federationSchedulerTypeInfo.getList().add(schedulerTypeInfo); }); long stopTime = Time.now(); + RouterAuditLogger.logSuccess(getUser().getShortUserName(), GET_SCHEDULERINFO, + TARGET_WEB_SERVICE); routerMetrics.succeededGetSchedulerInfoRetrieved(stopTime - startTime); return federationSchedulerTypeInfo; } catch (NotFoundException e) { routerMetrics.incrGetSchedulerInfoFailedRetrieved(); + RouterAuditLogger.logFailure(getUser().getShortUserName(), GET_SCHEDULERINFO, UNKNOWN, + TARGET_WEB_SERVICE, e.getLocalizedMessage()); 
RouterServerUtil.logAndThrowRunTimeException("Get all active sub cluster(s) error.", e); } catch (YarnException | IOException e) { routerMetrics.incrGetSchedulerInfoFailedRetrieved(); + RouterAuditLogger.logFailure(getUser().getShortUserName(), GET_SCHEDULERINFO, UNKNOWN, + TARGET_WEB_SERVICE, e.getLocalizedMessage()); RouterServerUtil.logAndThrowRunTimeException("getSchedulerInfo error.", e); } routerMetrics.incrGetSchedulerInfoFailedRetrieved(); + RouterAuditLogger.logFailure(getUser().getShortUserName(), GET_SCHEDULERINFO, UNKNOWN, + TARGET_WEB_SERVICE, "getSchedulerInfo error."); throw new RuntimeException("getSchedulerInfo error."); } @@ -1213,6 +1303,8 @@ public String dumpSchedulerLogs(String time, HttpServletRequest hsr) if (StringUtils.isBlank(time)) { routerMetrics.incrDumpSchedulerLogsFailedRetrieved(); + RouterAuditLogger.logFailure(getUser().getShortUserName(), DUMP_SCHEDULERLOGS, UNKNOWN, + TARGET_WEB_SERVICE, "Parameter error, the time is empty or null."); throw new IllegalArgumentException("Parameter error, the time is empty or null."); } @@ -1223,9 +1315,13 @@ public String dumpSchedulerLogs(String time, HttpServletRequest hsr) } } catch (NumberFormatException e) { routerMetrics.incrDumpSchedulerLogsFailedRetrieved(); + RouterAuditLogger.logFailure(getUser().getShortUserName(), DUMP_SCHEDULERLOGS, UNKNOWN, + TARGET_WEB_SERVICE, e.getLocalizedMessage()); throw new IllegalArgumentException("time must be a number."); } catch (IllegalArgumentException e) { routerMetrics.incrDumpSchedulerLogsFailedRetrieved(); + RouterAuditLogger.logFailure(getUser().getShortUserName(), DUMP_SCHEDULERLOGS, UNKNOWN, + TARGET_WEB_SERVICE, e.getLocalizedMessage()); throw e; } @@ -1246,19 +1342,27 @@ public String dumpSchedulerLogs(String time, HttpServletRequest hsr) .append(subClusterId).append(" : ").append(msg).append("; "); }); long stopTime = clock.getTime(); + RouterAuditLogger.logSuccess(getUser().getShortUserName(), DUMP_SCHEDULERLOGS, + TARGET_WEB_SERVICE); routerMetrics.succeededDumpSchedulerLogsRetrieved(stopTime - startTime); return stringBuilder.toString(); } catch (IllegalArgumentException e) { routerMetrics.incrDumpSchedulerLogsFailedRetrieved(); + RouterAuditLogger.logFailure(getUser().getShortUserName(), DUMP_SCHEDULERLOGS, UNKNOWN, + TARGET_WEB_SERVICE, e.getLocalizedMessage()); RouterServerUtil.logAndThrowRunTimeException(e, "Unable to dump SchedulerLogs by time: %s.", time); } catch (YarnException e) { routerMetrics.incrDumpSchedulerLogsFailedRetrieved(); + RouterAuditLogger.logFailure(getUser().getShortUserName(), DUMP_SCHEDULERLOGS, UNKNOWN, + TARGET_WEB_SERVICE, e.getLocalizedMessage()); RouterServerUtil.logAndThrowRunTimeException(e, "dumpSchedulerLogs by time = %s error .", time); } routerMetrics.incrDumpSchedulerLogsFailedRetrieved(); + RouterAuditLogger.logFailure(getUser().getShortUserName(), DUMP_SCHEDULERLOGS, UNKNOWN, + TARGET_WEB_SERVICE, "dumpSchedulerLogs Failed."); throw new RuntimeException("dumpSchedulerLogs Failed."); } @@ -1290,14 +1394,19 @@ public ActivitiesInfo getActivities(HttpServletRequest hsr, String nodeId, ActivitiesInfo activitiesInfo = interceptor.getActivities(hsrCopy, nodeId, groupBy); if (activitiesInfo != null) { long stopTime = clock.getTime(); + RouterAuditLogger.logSuccess(getUser().getShortUserName(), GET_ACTIVITIES, + TARGET_WEB_SERVICE); routerMetrics.succeededGetActivitiesLatencyRetrieved(stopTime - startTime); return activitiesInfo; } } catch (IllegalArgumentException | NotFoundException e) { 
routerMetrics.incrGetActivitiesFailedRetrieved(); + RouterAuditLogger.logFailure(getUser().getShortUserName(), GET_ACTIVITIES, UNKNOWN, + TARGET_WEB_SERVICE, e.getLocalizedMessage()); throw e; } - + RouterAuditLogger.logFailure(getUser().getShortUserName(), GET_ACTIVITIES, UNKNOWN, + TARGET_WEB_SERVICE, "getActivities Failed."); routerMetrics.incrGetActivitiesFailedRetrieved(); throw new RuntimeException("getActivities Failed."); } @@ -1337,28 +1446,40 @@ public BulkActivitiesInfo getBulkActivities(HttpServletRequest hsr, bulkActivitiesInfo.setSubClusterId(subClusterId.getId()); fedBulkActivitiesInfo.getList().add(bulkActivitiesInfo); }); + RouterAuditLogger.logSuccess(getUser().getShortUserName(), GET_BULKACTIVITIES, + TARGET_WEB_SERVICE); long stopTime = clock.getTime(); routerMetrics.succeededGetBulkActivitiesRetrieved(stopTime - startTime); return fedBulkActivitiesInfo; } catch (IllegalArgumentException e) { routerMetrics.incrGetBulkActivitiesFailedRetrieved(); + RouterAuditLogger.logFailure(getUser().getShortUserName(), GET_BULKACTIVITIES, UNKNOWN, + TARGET_WEB_SERVICE, e.getLocalizedMessage()); throw e; } catch (NotFoundException e) { routerMetrics.incrGetBulkActivitiesFailedRetrieved(); + RouterAuditLogger.logFailure(getUser().getShortUserName(), GET_BULKACTIVITIES, UNKNOWN, + TARGET_WEB_SERVICE, e.getLocalizedMessage()); RouterServerUtil.logAndThrowRunTimeException("get all active sub cluster(s) error.", e); } catch (IOException e) { routerMetrics.incrGetBulkActivitiesFailedRetrieved(); + RouterAuditLogger.logFailure(getUser().getShortUserName(), GET_BULKACTIVITIES, UNKNOWN, + TARGET_WEB_SERVICE, e.getLocalizedMessage()); RouterServerUtil.logAndThrowRunTimeException(e, "getBulkActivities by groupBy = %s, activitiesCount = %s with io error.", groupBy, String.valueOf(activitiesCount)); } catch (YarnException e) { routerMetrics.incrGetBulkActivitiesFailedRetrieved(); + RouterAuditLogger.logFailure(getUser().getShortUserName(), GET_BULKACTIVITIES, UNKNOWN, + TARGET_WEB_SERVICE, e.getLocalizedMessage()); RouterServerUtil.logAndThrowRunTimeException(e, "getBulkActivities by groupBy = %s, activitiesCount = %s with yarn error.", groupBy, String.valueOf(activitiesCount)); } routerMetrics.incrGetBulkActivitiesFailedRetrieved(); + RouterAuditLogger.logFailure(getUser().getShortUserName(), GET_BULKACTIVITIES, UNKNOWN, + TARGET_WEB_SERVICE, "getBulkActivities Failed."); throw new RuntimeException("getBulkActivities Failed."); } @@ -1377,17 +1498,25 @@ public AppActivitiesInfo getAppActivities(HttpServletRequest hsr, if (appActivitiesInfo != null) { long stopTime = clock.getTime(); routerMetrics.succeededGetAppActivitiesRetrieved(stopTime - startTime); + RouterAuditLogger.logSuccess(getUser().getShortUserName(), GET_APPACTIVITIES, + TARGET_WEB_SERVICE); return appActivitiesInfo; } } catch (IllegalArgumentException e) { routerMetrics.incrGetAppActivitiesFailedRetrieved(); + RouterAuditLogger.logFailure(getUser().getShortUserName(), GET_APPACTIVITIES, UNKNOWN, + TARGET_WEB_SERVICE, e.getLocalizedMessage()); RouterServerUtil.logAndThrowRunTimeException(e, "Unable to get subCluster by appId: %s.", appId); } catch (YarnException e) { routerMetrics.incrGetAppActivitiesFailedRetrieved(); + RouterAuditLogger.logFailure(getUser().getShortUserName(), GET_APPACTIVITIES, UNKNOWN, + TARGET_WEB_SERVICE, e.getLocalizedMessage()); RouterServerUtil.logAndThrowRunTimeException(e, "getAppActivities by appId = %s error .", appId); } + RouterAuditLogger.logFailure(getUser().getShortUserName(), GET_APPACTIVITIES, 
UNKNOWN, + TARGET_WEB_SERVICE, "getAppActivities Failed."); routerMetrics.incrGetAppActivitiesFailedRetrieved(); throw new RuntimeException("getAppActivities Failed."); } @@ -1409,23 +1538,33 @@ public ApplicationStatisticsInfo getAppStatistics(HttpServletRequest hsr, if (applicationStatisticsInfo != null) { long stopTime = clock.getTime(); routerMetrics.succeededGetAppStatisticsRetrieved(stopTime - startTime); + RouterAuditLogger.logSuccess(getUser().getShortUserName(), GET_APPSTATISTICS, + TARGET_WEB_SERVICE); return applicationStatisticsInfo; } } catch (NotFoundException e) { routerMetrics.incrGetAppStatisticsFailedRetrieved(); + RouterAuditLogger.logFailure(getUser().getShortUserName(), GET_APPSTATISTICS, UNKNOWN, + TARGET_WEB_SERVICE, e.getLocalizedMessage()); RouterServerUtil.logAndThrowRunTimeException("get all active sub cluster(s) error.", e); } catch (IOException e) { routerMetrics.incrGetAppStatisticsFailedRetrieved(); + RouterAuditLogger.logFailure(getUser().getShortUserName(), GET_APPSTATISTICS, UNKNOWN, + TARGET_WEB_SERVICE, e.getLocalizedMessage()); RouterServerUtil.logAndThrowRunTimeException(e, "getAppStatistics error by stateQueries = %s, typeQueries = %s with io error.", StringUtils.join(stateQueries, ","), StringUtils.join(typeQueries, ",")); } catch (YarnException e) { routerMetrics.incrGetAppStatisticsFailedRetrieved(); + RouterAuditLogger.logFailure(getUser().getShortUserName(), GET_APPSTATISTICS, UNKNOWN, + TARGET_WEB_SERVICE, e.getLocalizedMessage()); RouterServerUtil.logAndThrowRunTimeException(e, "getAppStatistics by stateQueries = %s, typeQueries = %s with yarn error.", StringUtils.join(stateQueries, ","), StringUtils.join(typeQueries, ",")); } routerMetrics.incrGetAppStatisticsFailedRetrieved(); + RouterAuditLogger.logFailure(getUser().getShortUserName(), GET_APPSTATISTICS, UNKNOWN, + TARGET_WEB_SERVICE, "getAppStatistics Failed."); throw RouterServerUtil.logAndReturnRunTimeException( "getAppStatistics by stateQueries = %s, typeQueries = %s Failed.", StringUtils.join(stateQueries, ","), StringUtils.join(typeQueries, ",")); @@ -1448,16 +1587,24 @@ public NodeToLabelsInfo getNodeToLabels(HttpServletRequest hsr) if (nodeToLabelsInfo != null) { long stopTime = clock.getTime(); routerMetrics.succeededGetNodeToLabelsRetrieved(stopTime - startTime); + RouterAuditLogger.logSuccess(getUser().getShortUserName(), GET_NODETOLABELS, + TARGET_WEB_SERVICE); return nodeToLabelsInfo; } } catch (NotFoundException e) { routerMetrics.incrNodeToLabelsFailedRetrieved(); + RouterAuditLogger.logFailure(getUser().getShortUserName(), GET_NODETOLABELS, UNKNOWN, + TARGET_WEB_SERVICE, e.getLocalizedMessage()); RouterServerUtil.logAndThrowIOException("get all active sub cluster(s) error.", e); } catch (YarnException e) { routerMetrics.incrNodeToLabelsFailedRetrieved(); + RouterAuditLogger.logFailure(getUser().getShortUserName(), GET_NODETOLABELS, UNKNOWN, + TARGET_WEB_SERVICE, e.getLocalizedMessage()); RouterServerUtil.logAndThrowIOException("getNodeToLabels error.", e); } routerMetrics.incrNodeToLabelsFailedRetrieved(); + RouterAuditLogger.logFailure(getUser().getShortUserName(), GET_NODETOLABELS, UNKNOWN, + TARGET_WEB_SERVICE, "getNodeToLabels Failed."); throw new RuntimeException("getNodeToLabels Failed."); } @@ -1477,16 +1624,24 @@ public NodeLabelsInfo getRMNodeLabels(HttpServletRequest hsr) throws IOException if (nodeToLabelsInfo != null) { long stopTime = clock.getTime(); routerMetrics.succeededGetRMNodeLabelsRetrieved(stopTime - startTime); + 
RouterAuditLogger.logSuccess(getUser().getShortUserName(), GET_RMNODELABELS, + TARGET_WEB_SERVICE); return nodeToLabelsInfo; } } catch (NotFoundException e) { routerMetrics.incrGetRMNodeLabelsFailedRetrieved(); + RouterAuditLogger.logFailure(getUser().getShortUserName(), GET_RMNODELABELS, UNKNOWN, + TARGET_WEB_SERVICE, e.getLocalizedMessage()); RouterServerUtil.logAndThrowIOException("get all active sub cluster(s) error.", e); } catch (YarnException e) { routerMetrics.incrGetRMNodeLabelsFailedRetrieved(); + RouterAuditLogger.logFailure(getUser().getShortUserName(), GET_RMNODELABELS, UNKNOWN, + TARGET_WEB_SERVICE, e.getLocalizedMessage()); RouterServerUtil.logAndThrowIOException("getRMNodeLabels error.", e); } routerMetrics.incrGetRMNodeLabelsFailedRetrieved(); + RouterAuditLogger.logFailure(getUser().getShortUserName(), GET_RMNODELABELS, UNKNOWN, + TARGET_WEB_SERVICE, "getRMNodeLabels Failed."); throw new RuntimeException("getRMNodeLabels Failed."); } @@ -1515,18 +1670,26 @@ public LabelsToNodesInfo getLabelsToNodes(Set labels) LabelsToNodesInfo labelsToNodesInfo = new LabelsToNodesInfo(labelToNodesMap); if (labelsToNodesInfo != null) { long stopTime = clock.getTime(); + RouterAuditLogger.logSuccess(getUser().getShortUserName(), GET_LABELSTONODES, + TARGET_WEB_SERVICE); routerMetrics.succeededGetLabelsToNodesRetrieved(stopTime - startTime); return labelsToNodesInfo; } } catch (NotFoundException e) { routerMetrics.incrLabelsToNodesFailedRetrieved(); + RouterAuditLogger.logFailure(getUser().getShortUserName(), GET_LABELSTONODES, UNKNOWN, + TARGET_WEB_SERVICE, e.getLocalizedMessage()); RouterServerUtil.logAndThrowIOException("get all active sub cluster(s) error.", e); } catch (YarnException e) { routerMetrics.incrLabelsToNodesFailedRetrieved(); + RouterAuditLogger.logFailure(getUser().getShortUserName(), GET_LABELSTONODES, UNKNOWN, + TARGET_WEB_SERVICE, e.getLocalizedMessage()); RouterServerUtil.logAndThrowIOException( e, "getLabelsToNodes by labels = %s with yarn error.", StringUtils.join(labels, ",")); } routerMetrics.incrLabelsToNodesFailedRetrieved(); + RouterAuditLogger.logFailure(getUser().getShortUserName(), GET_LABELSTONODES, UNKNOWN, + TARGET_WEB_SERVICE, "getLabelsToNodes Failed."); throw RouterServerUtil.logAndReturnRunTimeException( "getLabelsToNodes by labels = %s Failed.", StringUtils.join(labels, ",")); } @@ -1553,6 +1716,9 @@ public Response replaceLabelsOnNodes(NodeToLabelsEntryList newNodeToLabels, List nodeToLabelsEntries = newNodeToLabels.getNodeToLabels(); if (CollectionUtils.isEmpty(nodeToLabelsEntries)) { routerMetrics.incrReplaceLabelsOnNodesFailedRetrieved(); + RouterAuditLogger.logFailure(getUser().getShortUserName(), REPLACE_LABELSONNODES, UNKNOWN, + TARGET_WEB_SERVICE, "Parameter error, " + + "nodeToLabelsEntries must not be empty."); throw new IllegalArgumentException("Parameter error, " + "nodeToLabelsEntries must not be empty."); } @@ -1594,12 +1760,16 @@ public Response replaceLabelsOnNodes(NodeToLabelsEntryList newNodeToLabels, } }); long stopTime = clock.getTime(); + RouterAuditLogger.logSuccess(getUser().getShortUserName(), REPLACE_LABELSONNODES, + TARGET_WEB_SERVICE); routerMetrics.succeededReplaceLabelsOnNodesRetrieved(stopTime - startTime); // Step5. return call result. 
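// Success is audited only after the per-sub-cluster updates above have
// completed; any exception falls through to the catch block below, which
// bumps the failure metric and writes a failure audit record before rethrowing.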
return Response.status(Status.OK).entity(builder.toString()).build(); } catch (Exception e) { routerMetrics.incrReplaceLabelsOnNodesFailedRetrieved(); + RouterAuditLogger.logFailure(getUser().getShortUserName(), REPLACE_LABELSONNODES, UNKNOWN, + TARGET_WEB_SERVICE, e.getLocalizedMessage()); throw e; } } @@ -1623,10 +1793,14 @@ public Response replaceLabelsOnNode(Set newNodeLabelsName, // Step1. Check the parameters to ensure that the parameters are not empty. if (StringUtils.isBlank(nodeId)) { routerMetrics.incrReplaceLabelsOnNodeFailedRetrieved(); + RouterAuditLogger.logFailure(getUser().getShortUserName(), REPLACE_LABELSONNODE, UNKNOWN, + TARGET_WEB_SERVICE, "Parameter error, nodeId must not be null or empty."); throw new IllegalArgumentException("Parameter error, nodeId must not be null or empty."); } if (CollectionUtils.isEmpty(newNodeLabelsName)) { routerMetrics.incrReplaceLabelsOnNodeFailedRetrieved(); + RouterAuditLogger.logFailure(getUser().getShortUserName(), REPLACE_LABELSONNODE, UNKNOWN, + TARGET_WEB_SERVICE, "Parameter error, newNodeLabelsName must not be empty."); throw new IllegalArgumentException("Parameter error, newNodeLabelsName must not be empty."); } @@ -1641,11 +1815,15 @@ public Response replaceLabelsOnNode(Set newNodeLabelsName, // Step3. Return the response result. long stopTime = clock.getTime(); + RouterAuditLogger.logSuccess(getUser().getShortUserName(), REPLACE_LABELSONNODE, + TARGET_WEB_SERVICE); routerMetrics.succeededReplaceLabelsOnNodeRetrieved(stopTime - startTime); String msg = "subCluster#" + subClusterInfo.getSubClusterId().getId() + ":Success;"; return Response.status(Status.OK).entity(msg).build(); } catch (Exception e) { routerMetrics.incrReplaceLabelsOnNodeFailedRetrieved(); + RouterAuditLogger.logFailure(getUser().getShortUserName(), REPLACE_LABELSONNODE, UNKNOWN, + TARGET_WEB_SERVICE, e.getLocalizedMessage()); throw e; } } @@ -1668,16 +1846,24 @@ public NodeLabelsInfo getClusterNodeLabels(HttpServletRequest hsr) if (nodeLabelsInfo != null) { long stopTime = clock.getTime(); routerMetrics.succeededGetClusterNodeLabelsRetrieved(stopTime - startTime); + RouterAuditLogger.logSuccess(getUser().getShortUserName(), GET_CLUSTER_NODELABELS, + TARGET_WEB_SERVICE); return nodeLabelsInfo; } } catch (NotFoundException e) { routerMetrics.incrClusterNodeLabelsFailedRetrieved(); + RouterAuditLogger.logFailure(getUser().getShortUserName(), GET_CLUSTER_NODELABELS, UNKNOWN, + TARGET_WEB_SERVICE, e.getLocalizedMessage()); RouterServerUtil.logAndThrowIOException("get all active sub cluster(s) error.", e); } catch (YarnException e) { routerMetrics.incrClusterNodeLabelsFailedRetrieved(); + RouterAuditLogger.logFailure(getUser().getShortUserName(), GET_CLUSTER_NODELABELS, UNKNOWN, + TARGET_WEB_SERVICE, e.getLocalizedMessage()); RouterServerUtil.logAndThrowIOException("getClusterNodeLabels with yarn error.", e); } routerMetrics.incrClusterNodeLabelsFailedRetrieved(); + RouterAuditLogger.logFailure(getUser().getShortUserName(), GET_CLUSTER_NODELABELS, UNKNOWN, + TARGET_WEB_SERVICE, "getClusterNodeLabels Failed."); throw new RuntimeException("getClusterNodeLabels Failed."); } @@ -1697,12 +1883,16 @@ public Response addToClusterNodeLabels(NodeLabelsInfo newNodeLabels, if (newNodeLabels == null) { routerMetrics.incrAddToClusterNodeLabelsFailedRetrieved(); + RouterAuditLogger.logFailure(getUser().getShortUserName(), ADD_TO_CLUSTER_NODELABELS, UNKNOWN, + TARGET_WEB_SERVICE, "Parameter error, the newNodeLabels is null."); throw new IllegalArgumentException("Parameter error, the 
newNodeLabels is null."); } List nodeLabelInfos = newNodeLabels.getNodeLabelsInfo(); if (CollectionUtils.isEmpty(nodeLabelInfos)) { routerMetrics.incrAddToClusterNodeLabelsFailedRetrieved(); + RouterAuditLogger.logFailure(getUser().getShortUserName(), ADD_TO_CLUSTER_NODELABELS, UNKNOWN, + TARGET_WEB_SERVICE, "Parameter error, the nodeLabelsInfo is null or empty."); throw new IllegalArgumentException("Parameter error, the nodeLabelsInfo is null or empty."); } @@ -1720,17 +1910,25 @@ public Response addToClusterNodeLabels(NodeLabelsInfo newNodeLabels, responseInfoMap.forEach((subClusterInfo, response) -> buildAppendMsg(subClusterInfo, buffer, response)); long stopTime = clock.getTime(); + RouterAuditLogger.logSuccess(getUser().getShortUserName(), ADD_TO_CLUSTER_NODELABELS, + TARGET_WEB_SERVICE); routerMetrics.succeededAddToClusterNodeLabelsRetrieved((stopTime - startTime)); return Response.status(Status.OK).entity(buffer.toString()).build(); } catch (NotFoundException e) { routerMetrics.incrAddToClusterNodeLabelsFailedRetrieved(); + RouterAuditLogger.logFailure(getUser().getShortUserName(), ADD_TO_CLUSTER_NODELABELS, UNKNOWN, + TARGET_WEB_SERVICE, e.getLocalizedMessage()); RouterServerUtil.logAndThrowIOException("get all active sub cluster(s) error.", e); } catch (YarnException e) { routerMetrics.incrAddToClusterNodeLabelsFailedRetrieved(); + RouterAuditLogger.logFailure(getUser().getShortUserName(), ADD_TO_CLUSTER_NODELABELS, UNKNOWN, + TARGET_WEB_SERVICE, e.getLocalizedMessage()); RouterServerUtil.logAndThrowIOException("addToClusterNodeLabels with yarn error.", e); } routerMetrics.incrAddToClusterNodeLabelsFailedRetrieved(); + RouterAuditLogger.logFailure(getUser().getShortUserName(), ADD_TO_CLUSTER_NODELABELS, UNKNOWN, + TARGET_WEB_SERVICE, "addToClusterNodeLabels Failed."); throw new RuntimeException("addToClusterNodeLabels Failed."); } @@ -1750,6 +1948,8 @@ public Response removeFromClusterNodeLabels(Set oldNodeLabels, if (CollectionUtils.isEmpty(oldNodeLabels)) { routerMetrics.incrRemoveFromClusterNodeLabelsFailedRetrieved(); + RouterAuditLogger.logFailure(getUser().getShortUserName(), REMOVE_FROM_CLUSTERNODELABELS, + UNKNOWN, TARGET_WEB_SERVICE, "Parameter error, the oldNodeLabels is null or empty."); throw new IllegalArgumentException("Parameter error, the oldNodeLabels is null or empty."); } @@ -1768,13 +1968,19 @@ public Response removeFromClusterNodeLabels(Set oldNodeLabels, responseInfoMap.forEach((subClusterInfo, response) -> buildAppendMsg(subClusterInfo, buffer, response)); long stopTime = clock.getTime(); + RouterAuditLogger.logSuccess(getUser().getShortUserName(), REMOVE_FROM_CLUSTERNODELABELS, + TARGET_WEB_SERVICE); routerMetrics.succeededRemoveFromClusterNodeLabelsRetrieved(stopTime - startTime); return Response.status(Status.OK).entity(buffer.toString()).build(); } catch (NotFoundException e) { routerMetrics.incrRemoveFromClusterNodeLabelsFailedRetrieved(); + RouterAuditLogger.logFailure(getUser().getShortUserName(), REMOVE_FROM_CLUSTERNODELABELS, + UNKNOWN, TARGET_WEB_SERVICE, e.getLocalizedMessage()); RouterServerUtil.logAndThrowIOException("get all active sub cluster(s) error.", e); } catch (YarnException e) { routerMetrics.incrRemoveFromClusterNodeLabelsFailedRetrieved(); + RouterAuditLogger.logFailure(getUser().getShortUserName(), REMOVE_FROM_CLUSTERNODELABELS, + UNKNOWN, TARGET_WEB_SERVICE, e.getLocalizedMessage()); RouterServerUtil.logAndThrowIOException("removeFromClusterNodeLabels with yarn error.", e); } @@ -1819,17 +2025,25 @@ public NodeLabelsInfo 
getLabelsOnNode(HttpServletRequest hsr, String nodeId) if (nodeLabelsInfo != null) { long stopTime = clock.getTime(); routerMetrics.succeededGetLabelsToNodesRetrieved(stopTime - startTime); + RouterAuditLogger.logSuccess(getUser().getShortUserName(), GET_LABELS_ON_NODE, + TARGET_WEB_SERVICE); return nodeLabelsInfo; } } catch (NotFoundException e) { routerMetrics.incrLabelsToNodesFailedRetrieved(); + RouterAuditLogger.logFailure(getUser().getShortUserName(), GET_LABELS_ON_NODE, + UNKNOWN, TARGET_WEB_SERVICE, e.getLocalizedMessage()); RouterServerUtil.logAndThrowIOException("get all active sub cluster(s) error.", e); } catch (YarnException e) { routerMetrics.incrLabelsToNodesFailedRetrieved(); + RouterAuditLogger.logFailure(getUser().getShortUserName(), GET_LABELS_ON_NODE, + UNKNOWN, TARGET_WEB_SERVICE, e.getLocalizedMessage()); RouterServerUtil.logAndThrowIOException( e, "getLabelsOnNode nodeId = %s with yarn error.", nodeId); } routerMetrics.incrLabelsToNodesFailedRetrieved(); + RouterAuditLogger.logFailure(getUser().getShortUserName(), GET_LABELS_ON_NODE, + UNKNOWN, TARGET_WEB_SERVICE, "getLabelsOnNode by nodeId = " + nodeId + " Failed."); throw RouterServerUtil.logAndReturnRunTimeException( "getLabelsOnNode by nodeId = %s Failed.", nodeId); } @@ -1845,17 +2059,25 @@ public AppPriority getAppPriority(HttpServletRequest hsr, String appId) if (appPriority != null) { long stopTime = clock.getTime(); routerMetrics.succeededGetAppPriorityRetrieved(stopTime - startTime); + RouterAuditLogger.logSuccess(getUser().getShortUserName(), GET_APP_PRIORITY, + TARGET_WEB_SERVICE); return appPriority; } } catch (IllegalArgumentException e) { routerMetrics.incrGetAppPriorityFailedRetrieved(); + RouterAuditLogger.logFailure(getUser().getShortUserName(), GET_APP_PRIORITY, + UNKNOWN, TARGET_WEB_SERVICE, e.getLocalizedMessage()); RouterServerUtil.logAndThrowRunTimeException(e, "Unable to get the getAppPriority appId: %s.", appId); } catch (YarnException e) { routerMetrics.incrGetAppPriorityFailedRetrieved(); + RouterAuditLogger.logFailure(getUser().getShortUserName(), GET_APP_PRIORITY, + UNKNOWN, TARGET_WEB_SERVICE, e.getLocalizedMessage()); RouterServerUtil.logAndThrowRunTimeException("getAppPriority error.", e); } routerMetrics.incrGetAppPriorityFailedRetrieved(); + RouterAuditLogger.logFailure(getUser().getShortUserName(), GET_APP_PRIORITY, + UNKNOWN, TARGET_WEB_SERVICE, "getAppPriority Failed."); throw new RuntimeException("getAppPriority Failed."); } @@ -1866,6 +2088,8 @@ public Response updateApplicationPriority(AppPriority targetPriority, if (targetPriority == null) { routerMetrics.incrUpdateAppPriorityFailedRetrieved(); + RouterAuditLogger.logFailure(getUser().getShortUserName(), UPDATE_APPLICATIONPRIORITY, + UNKNOWN, TARGET_WEB_SERVICE, "Parameter error, the targetPriority is empty or null."); throw new IllegalArgumentException("Parameter error, the targetPriority is empty or null."); } @@ -1876,16 +2100,24 @@ public Response updateApplicationPriority(AppPriority targetPriority, if (response != null) { long stopTime = clock.getTime(); routerMetrics.succeededUpdateAppPriorityRetrieved(stopTime - startTime); + RouterAuditLogger.logSuccess(getUser().getShortUserName(), UPDATE_APPLICATIONPRIORITY, + TARGET_WEB_SERVICE); return response; } } catch (IllegalArgumentException e) { routerMetrics.incrUpdateAppPriorityFailedRetrieved(); + RouterAuditLogger.logFailure(getUser().getShortUserName(), UPDATE_APPLICATIONPRIORITY, + UNKNOWN, TARGET_WEB_SERVICE, e.getLocalizedMessage()); 
RouterServerUtil.logAndThrowRunTimeException(e, "Unable to get the updateApplicationPriority appId: %s.", appId); } catch (YarnException e) { routerMetrics.incrUpdateAppPriorityFailedRetrieved(); + RouterAuditLogger.logFailure(getUser().getShortUserName(), UPDATE_APPLICATIONPRIORITY, + UNKNOWN, TARGET_WEB_SERVICE, e.getLocalizedMessage()); RouterServerUtil.logAndThrowRunTimeException("updateApplicationPriority error.", e); } + RouterAuditLogger.logFailure(getUser().getShortUserName(), UPDATE_APPLICATIONPRIORITY, + UNKNOWN, TARGET_WEB_SERVICE, "getAppPriority Failed."); routerMetrics.incrUpdateAppPriorityFailedRetrieved(); throw new RuntimeException("updateApplicationPriority Failed."); } @@ -1901,15 +2133,23 @@ public AppQueue getAppQueue(HttpServletRequest hsr, String appId) if (queue != null) { long stopTime = clock.getTime(); routerMetrics.succeededGetAppQueueRetrieved((stopTime - startTime)); + RouterAuditLogger.logSuccess(getUser().getShortUserName(), GET_QUEUEINFO, + TARGET_WEB_SERVICE); return queue; } } catch (IllegalArgumentException e) { routerMetrics.incrGetAppQueueFailedRetrieved(); + RouterAuditLogger.logFailure(getUser().getShortUserName(), GET_QUEUEINFO, + UNKNOWN, TARGET_WEB_SERVICE, e.getLocalizedMessage()); RouterServerUtil.logAndThrowRunTimeException(e, "Unable to get queue by appId: %s.", appId); } catch (YarnException e) { routerMetrics.incrGetAppQueueFailedRetrieved(); + RouterAuditLogger.logFailure(getUser().getShortUserName(), GET_QUEUEINFO, + UNKNOWN, TARGET_WEB_SERVICE, e.getLocalizedMessage()); RouterServerUtil.logAndThrowRunTimeException("getAppQueue error.", e); } + RouterAuditLogger.logFailure(getUser().getShortUserName(), GET_QUEUEINFO, + UNKNOWN, TARGET_WEB_SERVICE, "getAppQueue Failed."); routerMetrics.incrGetAppQueueFailedRetrieved(); throw new RuntimeException("getAppQueue Failed."); } @@ -1921,6 +2161,8 @@ public Response updateAppQueue(AppQueue targetQueue, HttpServletRequest hsr, if (targetQueue == null) { routerMetrics.incrUpdateAppQueueFailedRetrieved(); + RouterAuditLogger.logFailure(getUser().getShortUserName(), UPDATE_APP_QUEUE, + UNKNOWN, TARGET_WEB_SERVICE, "Parameter error, the targetQueue is null."); throw new IllegalArgumentException("Parameter error, the targetQueue is null."); } @@ -1931,16 +2173,24 @@ public Response updateAppQueue(AppQueue targetQueue, HttpServletRequest hsr, if (response != null) { long stopTime = clock.getTime(); routerMetrics.succeededUpdateAppQueueRetrieved(stopTime - startTime); + RouterAuditLogger.logSuccess(getUser().getShortUserName(), UPDATE_APP_QUEUE, + TARGET_WEB_SERVICE); return response; } } catch (IllegalArgumentException e) { routerMetrics.incrUpdateAppQueueFailedRetrieved(); + RouterAuditLogger.logFailure(getUser().getShortUserName(), UPDATE_APP_QUEUE, + UNKNOWN, TARGET_WEB_SERVICE, e.getLocalizedMessage()); RouterServerUtil.logAndThrowRunTimeException(e, "Unable to update app queue by appId: %s.", appId); } catch (YarnException e) { routerMetrics.incrUpdateAppQueueFailedRetrieved(); + RouterAuditLogger.logFailure(getUser().getShortUserName(), UPDATE_APP_QUEUE, + UNKNOWN, TARGET_WEB_SERVICE, e.getLocalizedMessage()); RouterServerUtil.logAndThrowRunTimeException("updateAppQueue error.", e); } + RouterAuditLogger.logFailure(getUser().getShortUserName(), UPDATE_APP_QUEUE, + UNKNOWN, TARGET_WEB_SERVICE, "updateAppQueue Failed."); routerMetrics.incrUpdateAppQueueFailedRetrieved(); throw new RuntimeException("updateAppQueue Failed."); } @@ -1961,6 +2211,8 @@ public Response postDelegationToken(DelegationToken 
tokenData, HttpServletReques throws AuthorizationException, IOException, InterruptedException, Exception { if (tokenData == null || hsr == null) { + RouterAuditLogger.logFailure(getUser().getShortUserName(), POST_DELEGATION_TOKEN, + UNKNOWN, TARGET_WEB_SERVICE, "Parameter error, the tokenData or hsr is null."); throw new IllegalArgumentException("Parameter error, the tokenData or hsr is null."); } @@ -1973,6 +2225,8 @@ public Response postDelegationToken(DelegationToken tokenData, HttpServletReques return createDelegationToken(tokenData, callerUGI); } catch (YarnException e) { LOG.error("Create delegation token request failed.", e); + RouterAuditLogger.logFailure(getUser().getShortUserName(), POST_DELEGATION_TOKEN, + UNKNOWN, TARGET_WEB_SERVICE, e.getLocalizedMessage()); return Response.status(Status.FORBIDDEN).entity(e.getMessage()).build(); } } @@ -1997,6 +2251,8 @@ private Response createDelegationToken(DelegationToken dtoken, UserGroupInformat }); DelegationToken respToken = getDelegationToken(renewer, resp); + RouterAuditLogger.logSuccess(getUser().getShortUserName(), POST_DELEGATION_TOKEN, + TARGET_WEB_SERVICE); return Response.status(Status.OK).entity(respToken).build(); } @@ -2061,6 +2317,8 @@ public Response postDelegationTokenExpiration(HttpServletRequest hsr) throws AuthorizationException, IOException, InterruptedException, Exception { if (hsr == null) { + RouterAuditLogger.logFailure(getUser().getShortUserName(), POST_DELEGATION_TOKEN_EXPIRATION, + UNKNOWN, TARGET_WEB_SERVICE, "Parameter error, the hsr is null."); throw new IllegalArgumentException("Parameter error, the hsr is null."); } @@ -2071,6 +2329,8 @@ public Response postDelegationTokenExpiration(HttpServletRequest hsr) return renewDelegationToken(hsr, callerUGI); } catch (YarnException e) { LOG.error("Renew delegation token request failed.", e); + RouterAuditLogger.logFailure(getUser().getShortUserName(), POST_DELEGATION_TOKEN_EXPIRATION, + UNKNOWN, TARGET_WEB_SERVICE, e.getLocalizedMessage()); return Response.status(Status.FORBIDDEN).entity(e.getMessage()).build(); } } @@ -2109,6 +2369,8 @@ private Response renewDelegationToken(HttpServletRequest hsr, UserGroupInformati long renewTime = resp.getNextExpirationTime(); DelegationToken respToken = new DelegationToken(); respToken.setNextExpirationTime(renewTime); + RouterAuditLogger.logSuccess(getUser().getShortUserName(), POST_DELEGATION_TOKEN_EXPIRATION, + TARGET_WEB_SERVICE); return Response.status(Status.OK).entity(respToken).build(); } @@ -2142,9 +2404,13 @@ public Response cancelDelegationToken(HttpServletRequest hsr) return this.getRouterClientRMService().cancelDelegationToken(req); }); + RouterAuditLogger.logSuccess(getUser().getShortUserName(), CANCEL_DELEGATIONTOKEN, + TARGET_WEB_SERVICE); return Response.status(Status.OK).build(); } catch (YarnException e) { LOG.error("Cancel delegation token request failed.", e); + RouterAuditLogger.logFailure(getUser().getShortUserName(), CANCEL_DELEGATIONTOKEN, + UNKNOWN, TARGET_WEB_SERVICE, e.getLocalizedMessage()); return Response.status(Status.FORBIDDEN).entity(e.getMessage()).build(); } } @@ -2166,15 +2432,21 @@ public Response createNewReservation(HttpServletRequest hsr) // this request can be returned directly. 
if (response != null && response.getStatus() == HttpServletResponse.SC_OK) { long stopTime = clock.getTime(); + RouterAuditLogger.logSuccess(getUser().getShortUserName(), GET_NEW_RESERVATION, + TARGET_WEB_SERVICE); routerMetrics.succeededGetNewReservationRetrieved(stopTime - startTime); return response; } } catch (FederationPolicyException e) { // If a FederationPolicyException is thrown, the service is unavailable. routerMetrics.incrGetNewReservationFailedRetrieved(); + RouterAuditLogger.logFailure(getUser().getShortUserName(), GET_NEW_RESERVATION, + UNKNOWN, TARGET_WEB_SERVICE, e.getLocalizedMessage()); return Response.status(Status.SERVICE_UNAVAILABLE).entity(e.getLocalizedMessage()).build(); } catch (Exception e) { routerMetrics.incrGetNewReservationFailedRetrieved(); + RouterAuditLogger.logFailure(getUser().getShortUserName(), GET_NEW_RESERVATION, + UNKNOWN, TARGET_WEB_SERVICE, e.getLocalizedMessage()); return Response.status(Status.INTERNAL_SERVER_ERROR).entity(e.getLocalizedMessage()).build(); } @@ -2182,6 +2454,8 @@ public Response createNewReservation(HttpServletRequest hsr) String errMsg = "Fail to create a new reservation."; LOG.error(errMsg); routerMetrics.incrGetNewReservationFailedRetrieved(); + RouterAuditLogger.logFailure(getUser().getShortUserName(), GET_NEW_RESERVATION, + UNKNOWN, TARGET_WEB_SERVICE, errMsg); return Response.status(Status.INTERNAL_SERVER_ERROR).entity(errMsg).build(); } @@ -2217,6 +2491,8 @@ public Response submitReservation(ReservationSubmissionRequestInfo resContext, routerMetrics.incrSubmitReservationFailedRetrieved(); String errMsg = "Missing submitReservation resContext or reservationId " + "or reservation definition or queue."; + RouterAuditLogger.logFailure(getUser().getShortUserName(), SUBMIT_RESERVATION, + UNKNOWN, TARGET_WEB_SERVICE, errMsg); return Response.status(Status.BAD_REQUEST).entity(errMsg).build(); } @@ -2226,6 +2502,8 @@ public Response submitReservation(ReservationSubmissionRequestInfo resContext, RouterServerUtil.validateReservationId(resId); } catch (IllegalArgumentException e) { routerMetrics.incrSubmitReservationFailedRetrieved(); + RouterAuditLogger.logFailure(getUser().getShortUserName(), SUBMIT_RESERVATION, + UNKNOWN, TARGET_WEB_SERVICE, e.getLocalizedMessage()); throw e; } @@ -2238,16 +2516,22 @@ public Response submitReservation(ReservationSubmissionRequestInfo resContext, runWithRetries(actualRetryNums, submitIntervalTime); if (response != null) { long stopTime = clock.getTime(); + RouterAuditLogger.logSuccess(getUser().getShortUserName(), SUBMIT_RESERVATION, + TARGET_WEB_SERVICE); routerMetrics.succeededSubmitReservationRetrieved(stopTime - startTime); return response; } } catch (Exception e) { routerMetrics.incrSubmitReservationFailedRetrieved(); + RouterAuditLogger.logFailure(getUser().getShortUserName(), SUBMIT_RESERVATION, + UNKNOWN, TARGET_WEB_SERVICE, e.getLocalizedMessage()); return Response.status(Status.SERVICE_UNAVAILABLE).entity(e.getLocalizedMessage()).build(); } routerMetrics.incrSubmitReservationFailedRetrieved(); String msg = String.format("Reservation %s failed to be submitted.", resId); + RouterAuditLogger.logFailure(getUser().getShortUserName(), SUBMIT_RESERVATION, + UNKNOWN, TARGET_WEB_SERVICE, msg); return Response.status(Status.SERVICE_UNAVAILABLE).entity(msg).build(); } @@ -2311,6 +2595,8 @@ public Response updateReservation(ReservationUpdateRequestInfo resContext, routerMetrics.incrUpdateReservationFailedRetrieved(); String errMsg = "Missing updateReservation resContext or reservationId " + "or 
reservation definition."; + RouterAuditLogger.logFailure(getUser().getShortUserName(), UPDATE_RESERVATION, + UNKNOWN, TARGET_WEB_SERVICE, errMsg); return Response.status(Status.BAD_REQUEST).entity(errMsg).build(); } @@ -2322,6 +2608,8 @@ public Response updateReservation(ReservationUpdateRequestInfo resContext, RouterServerUtil.validateReservationId(reservationId); } catch (IllegalArgumentException e) { routerMetrics.incrUpdateReservationFailedRetrieved(); + RouterAuditLogger.logFailure(getUser().getShortUserName(), UPDATE_RESERVATION, + UNKNOWN, TARGET_WEB_SERVICE, e.getLocalizedMessage()); throw e; } @@ -2332,15 +2620,21 @@ public Response updateReservation(ReservationUpdateRequestInfo resContext, HttpServletRequest hsrCopy = clone(hsr); Response response = interceptor.updateReservation(resContext, hsrCopy); if (response != null) { + RouterAuditLogger.logSuccess(getUser().getShortUserName(), UPDATE_RESERVATION, + TARGET_WEB_SERVICE); return response; } } catch (Exception e) { routerMetrics.incrUpdateReservationFailedRetrieved(); + RouterAuditLogger.logFailure(getUser().getShortUserName(), UPDATE_RESERVATION, + UNKNOWN, TARGET_WEB_SERVICE, e.getLocalizedMessage()); RouterServerUtil.logAndThrowRunTimeException("updateReservation Failed.", e); } // throw an exception routerMetrics.incrUpdateReservationFailedRetrieved(); + RouterAuditLogger.logFailure(getUser().getShortUserName(), UPDATE_RESERVATION, + UNKNOWN, TARGET_WEB_SERVICE, "updateReservation Failed, reservationId = " + reservationId); throw new YarnRuntimeException("updateReservation Failed, reservationId = " + reservationId); } @@ -2353,6 +2647,8 @@ public Response deleteReservation(ReservationDeleteRequestInfo resContext, if (resContext == null || resContext.getReservationId() == null) { routerMetrics.incrDeleteReservationFailedRetrieved(); String errMsg = "Missing deleteReservation request or reservationId."; + RouterAuditLogger.logFailure(getUser().getShortUserName(), DELETE_RESERVATION, + UNKNOWN, TARGET_WEB_SERVICE, errMsg); return Response.status(Status.BAD_REQUEST).entity(errMsg).build(); } @@ -2364,6 +2660,8 @@ public Response deleteReservation(ReservationDeleteRequestInfo resContext, RouterServerUtil.validateReservationId(reservationId); } catch (IllegalArgumentException e) { routerMetrics.incrDeleteReservationFailedRetrieved(); + RouterAuditLogger.logFailure(getUser().getShortUserName(), DELETE_RESERVATION, + UNKNOWN, TARGET_WEB_SERVICE, e.getLocalizedMessage()); throw e; } @@ -2374,15 +2672,21 @@ public Response deleteReservation(ReservationDeleteRequestInfo resContext, HttpServletRequest hsrCopy = clone(hsr); Response response = interceptor.deleteReservation(resContext, hsrCopy); if (response != null) { + RouterAuditLogger.logSuccess(getUser().getShortUserName(), DELETE_RESERVATION, + TARGET_WEB_SERVICE); return response; } } catch (Exception e) { routerMetrics.incrDeleteReservationFailedRetrieved(); + RouterAuditLogger.logFailure(getUser().getShortUserName(), DELETE_RESERVATION, + UNKNOWN, TARGET_WEB_SERVICE, e.getLocalizedMessage()); RouterServerUtil.logAndThrowRunTimeException("deleteReservation Failed.", e); } // throw an exception routerMetrics.incrDeleteReservationFailedRetrieved(); + RouterAuditLogger.logFailure(getUser().getShortUserName(), DELETE_RESERVATION, + UNKNOWN, TARGET_WEB_SERVICE, "deleteReservation Failed, reservationId = " + reservationId); throw new YarnRuntimeException("deleteReservation Failed, reservationId = " + reservationId); } @@ -2393,11 +2697,15 @@ public Response listReservation(String 
queue, String reservationId, if (queue == null || queue.isEmpty()) { routerMetrics.incrListReservationFailedRetrieved(); + RouterAuditLogger.logFailure(getUser().getShortUserName(), LIST_RESERVATIONS, + UNKNOWN, TARGET_WEB_SERVICE, "Parameter error, the queue is empty or null."); throw new IllegalArgumentException("Parameter error, the queue is empty or null."); } if (reservationId == null || reservationId.isEmpty()) { routerMetrics.incrListReservationFailedRetrieved(); + RouterAuditLogger.logFailure(getUser().getShortUserName(), LIST_RESERVATIONS, + UNKNOWN, TARGET_WEB_SERVICE, "Parameter error, the reservationId is empty or null."); throw new IllegalArgumentException("Parameter error, the reservationId is empty or null."); } @@ -2406,6 +2714,8 @@ public Response listReservation(String queue, String reservationId, RouterServerUtil.validateReservationId(reservationId); } catch (IllegalArgumentException e) { routerMetrics.incrListReservationFailedRetrieved(); + RouterAuditLogger.logFailure(getUser().getShortUserName(), LIST_RESERVATIONS, + UNKNOWN, TARGET_WEB_SERVICE, e.getLocalizedMessage()); throw e; } @@ -2419,15 +2729,21 @@ public Response listReservation(String queue, String reservationId, includeResourceAllocations, hsrCopy); if (response != null) { long stopTime = clock.getTime(); + RouterAuditLogger.logSuccess(getUser().getShortUserName(), LIST_RESERVATIONS, + TARGET_WEB_SERVICE); routerMetrics.succeededListReservationRetrieved(stopTime - startTime1); return response; } } catch (YarnException e) { routerMetrics.incrListReservationFailedRetrieved(); + RouterAuditLogger.logFailure(getUser().getShortUserName(), LIST_RESERVATIONS, + UNKNOWN, TARGET_WEB_SERVICE, e.getLocalizedMessage()); RouterServerUtil.logAndThrowRunTimeException("listReservation error.", e); } routerMetrics.incrListReservationFailedRetrieved(); + RouterAuditLogger.logFailure(getUser().getShortUserName(), LIST_RESERVATIONS, + UNKNOWN, TARGET_WEB_SERVICE, "listReservation Failed."); throw new YarnException("listReservation Failed."); } @@ -2437,6 +2753,8 @@ public AppTimeoutInfo getAppTimeout(HttpServletRequest hsr, String appId, if (type == null || type.isEmpty()) { routerMetrics.incrGetAppTimeoutFailedRetrieved(); + RouterAuditLogger.logFailure(getUser().getShortUserName(), GET_APP_TIMEOUT, + UNKNOWN, TARGET_WEB_SERVICE, "Parameter error, the type is empty or null."); throw new IllegalArgumentException("Parameter error, the type is empty or null."); } @@ -2446,18 +2764,26 @@ public AppTimeoutInfo getAppTimeout(HttpServletRequest hsr, String appId, AppTimeoutInfo appTimeoutInfo = interceptor.getAppTimeout(hsr, appId, type); if (appTimeoutInfo != null) { long stopTime = clock.getTime(); + RouterAuditLogger.logSuccess(getUser().getShortUserName(), GET_APP_TIMEOUT, + TARGET_WEB_SERVICE); routerMetrics.succeededGetAppTimeoutRetrieved((stopTime - startTime)); return appTimeoutInfo; } } catch (IllegalArgumentException e) { routerMetrics.incrGetAppTimeoutFailedRetrieved(); + RouterAuditLogger.logFailure(getUser().getShortUserName(), GET_APP_TIMEOUT, + UNKNOWN, TARGET_WEB_SERVICE, e.getLocalizedMessage()); RouterServerUtil.logAndThrowRunTimeException(e, "Unable to get the getAppTimeout appId: %s.", appId); } catch (YarnException e) { routerMetrics.incrGetAppTimeoutFailedRetrieved(); + RouterAuditLogger.logFailure(getUser().getShortUserName(), GET_APP_TIMEOUT, + UNKNOWN, TARGET_WEB_SERVICE, e.getLocalizedMessage()); RouterServerUtil.logAndThrowRunTimeException("getAppTimeout error.", e); } 
routerMetrics.incrGetAppTimeoutFailedRetrieved(); + RouterAuditLogger.logFailure(getUser().getShortUserName(), GET_APP_TIMEOUT, + UNKNOWN, TARGET_WEB_SERVICE, "getAppTimeout Failed."); throw new RuntimeException("getAppTimeout Failed."); } @@ -2471,19 +2797,27 @@ public AppTimeoutsInfo getAppTimeouts(HttpServletRequest hsr, String appId) AppTimeoutsInfo appTimeoutsInfo = interceptor.getAppTimeouts(hsr, appId); if (appTimeoutsInfo != null) { long stopTime = clock.getTime(); + RouterAuditLogger.logSuccess(getUser().getShortUserName(), GET_APP_TIMEOUTS, + TARGET_WEB_SERVICE); routerMetrics.succeededGetAppTimeoutsRetrieved((stopTime - startTime)); return appTimeoutsInfo; } } catch (IllegalArgumentException e) { routerMetrics.incrGetAppTimeoutsFailedRetrieved(); + RouterAuditLogger.logFailure(getUser().getShortUserName(), GET_APP_TIMEOUTS, + UNKNOWN, TARGET_WEB_SERVICE, e.getLocalizedMessage()); RouterServerUtil.logAndThrowRunTimeException(e, "Unable to get the getAppTimeouts appId: %s.", appId); } catch (YarnException e) { routerMetrics.incrGetAppTimeoutsFailedRetrieved(); + RouterAuditLogger.logFailure(getUser().getShortUserName(), GET_APP_TIMEOUTS, + UNKNOWN, TARGET_WEB_SERVICE, e.getLocalizedMessage()); RouterServerUtil.logAndThrowRunTimeException("getAppTimeouts error.", e); } routerMetrics.incrGetAppTimeoutsFailedRetrieved(); + RouterAuditLogger.logFailure(getUser().getShortUserName(), GET_APP_TIMEOUTS, + UNKNOWN, TARGET_WEB_SERVICE, "getAppTimeouts Failed."); throw new RuntimeException("getAppTimeouts Failed."); } @@ -2494,6 +2828,8 @@ public Response updateApplicationTimeout(AppTimeoutInfo appTimeout, if (appTimeout == null) { routerMetrics.incrUpdateApplicationTimeoutsRetrieved(); + RouterAuditLogger.logFailure(getUser().getShortUserName(), UPDATE_APPLICATIONTIMEOUTS, + UNKNOWN, TARGET_WEB_SERVICE, "Parameter error, the appTimeout is null."); throw new IllegalArgumentException("Parameter error, the appTimeout is null."); } @@ -2503,19 +2839,27 @@ public Response updateApplicationTimeout(AppTimeoutInfo appTimeout, Response response = interceptor.updateApplicationTimeout(appTimeout, hsr, appId); if (response != null) { long stopTime = clock.getTime(); + RouterAuditLogger.logSuccess(getUser().getShortUserName(), UPDATE_APPLICATIONTIMEOUTS, + TARGET_WEB_SERVICE); routerMetrics.succeededUpdateAppTimeoutsRetrieved((stopTime - startTime)); return response; } } catch (IllegalArgumentException e) { routerMetrics.incrUpdateApplicationTimeoutsRetrieved(); + RouterAuditLogger.logFailure(getUser().getShortUserName(), UPDATE_APPLICATIONTIMEOUTS, + UNKNOWN, TARGET_WEB_SERVICE, e.getLocalizedMessage()); RouterServerUtil.logAndThrowRunTimeException(e, "Unable to get the updateApplicationTimeout appId: %s.", appId); } catch (YarnException e) { routerMetrics.incrUpdateApplicationTimeoutsRetrieved(); + RouterAuditLogger.logFailure(getUser().getShortUserName(), UPDATE_APPLICATIONTIMEOUTS, + UNKNOWN, TARGET_WEB_SERVICE, e.getLocalizedMessage()); RouterServerUtil.logAndThrowRunTimeException("updateApplicationTimeout error.", e); } routerMetrics.incrUpdateApplicationTimeoutsRetrieved(); + RouterAuditLogger.logFailure(getUser().getShortUserName(), UPDATE_APPLICATIONTIMEOUTS, + UNKNOWN, TARGET_WEB_SERVICE, "updateApplicationTimeout Failed."); throw new RuntimeException("updateApplicationTimeout Failed."); } @@ -2529,18 +2873,26 @@ public AppAttemptsInfo getAppAttempts(HttpServletRequest hsr, String appId) { if (appAttemptsInfo != null) { long stopTime = Time.now(); 
routerMetrics.succeededAppAttemptsRetrieved(stopTime - startTime); + RouterAuditLogger.logSuccess(getUser().getShortUserName(), GET_APPLICATION_ATTEMPTS, + TARGET_WEB_SERVICE); return appAttemptsInfo; } } catch (IllegalArgumentException e) { routerMetrics.incrAppAttemptsFailedRetrieved(); + RouterAuditLogger.logFailure(getUser().getShortUserName(), GET_APPLICATION_ATTEMPTS, + UNKNOWN, TARGET_WEB_SERVICE, e.getLocalizedMessage()); RouterServerUtil.logAndThrowRunTimeException(e, "Unable to get the AppAttempt appId: %s.", appId); } catch (YarnException e) { routerMetrics.incrAppAttemptsFailedRetrieved(); + RouterAuditLogger.logFailure(getUser().getShortUserName(), GET_APPLICATION_ATTEMPTS, + UNKNOWN, TARGET_WEB_SERVICE, e.getLocalizedMessage()); RouterServerUtil.logAndThrowRunTimeException("getAppAttempts error.", e); } routerMetrics.incrAppAttemptsFailedRetrieved(); + RouterAuditLogger.logFailure(getUser().getShortUserName(), GET_APPLICATION_ATTEMPTS, + UNKNOWN, TARGET_WEB_SERVICE, "getAppAttempts Failed."); throw new RuntimeException("getAppAttempts Failed."); } @@ -2551,16 +2903,22 @@ public RMQueueAclInfo checkUserAccessToQueue(String queue, String username, // Parameter Verification if (queue == null || queue.isEmpty()) { routerMetrics.incrCheckUserAccessToQueueFailedRetrieved(); + RouterAuditLogger.logFailure(getUser().getShortUserName(), CHECK_USER_ACCESS_TO_QUEUE, + UNKNOWN, TARGET_WEB_SERVICE, "Parameter error, the queue is empty or null."); throw new IllegalArgumentException("Parameter error, the queue is empty or null."); } if (username == null || username.isEmpty()) { routerMetrics.incrCheckUserAccessToQueueFailedRetrieved(); + RouterAuditLogger.logFailure(getUser().getShortUserName(), CHECK_USER_ACCESS_TO_QUEUE, + UNKNOWN, TARGET_WEB_SERVICE, "Parameter error, the username is empty or null."); throw new IllegalArgumentException("Parameter error, the username is empty or null."); } if (queueAclType == null || queueAclType.isEmpty()) { routerMetrics.incrCheckUserAccessToQueueFailedRetrieved(); + RouterAuditLogger.logFailure(getUser().getShortUserName(), CHECK_USER_ACCESS_TO_QUEUE, + UNKNOWN, TARGET_WEB_SERVICE, "Parameter error, the queueAclType is empty or null."); throw new IllegalArgumentException("Parameter error, the queueAclType is empty or null."); } @@ -2582,17 +2940,25 @@ public RMQueueAclInfo checkUserAccessToQueue(String queue, String username, aclInfo.getList().add(rMQueueAclInfo); }); long stopTime = Time.now(); + RouterAuditLogger.logSuccess(getUser().getShortUserName(), CHECK_USER_ACCESS_TO_QUEUE, + TARGET_WEB_SERVICE); routerMetrics.succeededCheckUserAccessToQueueRetrieved(stopTime - startTime); return aclInfo; } catch (NotFoundException e) { routerMetrics.incrCheckUserAccessToQueueFailedRetrieved(); + RouterAuditLogger.logFailure(getUser().getShortUserName(), CHECK_USER_ACCESS_TO_QUEUE, + UNKNOWN, TARGET_WEB_SERVICE, e.getLocalizedMessage()); RouterServerUtil.logAndThrowRunTimeException("Get all active sub cluster(s) error.", e); } catch (YarnException | IOException e) { routerMetrics.incrCheckUserAccessToQueueFailedRetrieved(); + RouterAuditLogger.logFailure(getUser().getShortUserName(), CHECK_USER_ACCESS_TO_QUEUE, + UNKNOWN, TARGET_WEB_SERVICE, e.getLocalizedMessage()); RouterServerUtil.logAndThrowRunTimeException("checkUserAccessToQueue error.", e); } routerMetrics.incrCheckUserAccessToQueueFailedRetrieved(); + RouterAuditLogger.logFailure(getUser().getShortUserName(), CHECK_USER_ACCESS_TO_QUEUE, + UNKNOWN, TARGET_WEB_SERVICE, "checkUserAccessToQueue error."); 
throw new RuntimeException("checkUserAccessToQueue error."); } @@ -2605,6 +2971,8 @@ public AppAttemptInfo getAppAttempt(HttpServletRequest req, RouterServerUtil.validateApplicationAttemptId(appAttemptId); } catch (IllegalArgumentException e) { routerMetrics.incrAppAttemptReportFailedRetrieved(); + RouterAuditLogger.logFailure(getUser().getShortUserName(), GET_APP_ATTEMPT, + UNKNOWN, TARGET_WEB_SERVICE, e.getLocalizedMessage()); throw e; } @@ -2615,20 +2983,28 @@ public AppAttemptInfo getAppAttempt(HttpServletRequest req, AppAttemptInfo appAttemptInfo = interceptor.getAppAttempt(req, res, appId, appAttemptId); if (appAttemptInfo != null) { long stopTime = Time.now(); + RouterAuditLogger.logSuccess(getUser().getShortUserName(), GET_APP_ATTEMPT, + TARGET_WEB_SERVICE); routerMetrics.succeededAppAttemptReportRetrieved(stopTime - startTime); return appAttemptInfo; } } catch (IllegalArgumentException e) { routerMetrics.incrAppAttemptReportFailedRetrieved(); + RouterAuditLogger.logFailure(getUser().getShortUserName(), GET_APP_ATTEMPT, + UNKNOWN, TARGET_WEB_SERVICE, e.getLocalizedMessage()); RouterServerUtil.logAndThrowRunTimeException(e, "Unable to getAppAttempt by appId: %s, appAttemptId: %s.", appId, appAttemptId); } catch (YarnException e) { routerMetrics.incrAppAttemptReportFailedRetrieved(); + RouterAuditLogger.logFailure(getUser().getShortUserName(), GET_APP_ATTEMPT, + UNKNOWN, TARGET_WEB_SERVICE, e.getLocalizedMessage()); RouterServerUtil.logAndThrowRunTimeException(e, "getAppAttempt error, appId: %s, appAttemptId: %s.", appId, appAttemptId); } routerMetrics.incrAppAttemptReportFailedRetrieved(); + RouterAuditLogger.logFailure(getUser().getShortUserName(), GET_APP_ATTEMPT, + UNKNOWN, TARGET_WEB_SERVICE, "getAppAttempt failed."); throw RouterServerUtil.logAndReturnRunTimeException( "getAppAttempt failed, appId: %s, appAttemptId: %s.", appId, appAttemptId); } @@ -2642,6 +3018,8 @@ public ContainersInfo getContainers(HttpServletRequest req, RouterServerUtil.validateApplicationId(appId); RouterServerUtil.validateApplicationAttemptId(appAttemptId); } catch (IllegalArgumentException e) { + RouterAuditLogger.logFailure(getUser().getShortUserName(), GET_CONTAINERS, + UNKNOWN, TARGET_WEB_SERVICE, e.getLocalizedMessage()); routerMetrics.incrGetContainersFailedRetrieved(); throw e; } @@ -2662,20 +3040,28 @@ public ContainersInfo getContainers(HttpServletRequest req, } if (containersInfo != null) { long stopTime = clock.getTime(); + RouterAuditLogger.logSuccess(getUser().getShortUserName(), GET_CONTAINERS, + TARGET_WEB_SERVICE); routerMetrics.succeededGetContainersRetrieved(stopTime - startTime); return containersInfo; } } catch (NotFoundException e) { routerMetrics.incrGetContainersFailedRetrieved(); + RouterAuditLogger.logFailure(getUser().getShortUserName(), GET_CONTAINERS, + UNKNOWN, TARGET_WEB_SERVICE, e.getLocalizedMessage()); RouterServerUtil.logAndThrowRunTimeException(e, "getContainers error, appId = %s, " + " appAttemptId = %s, Probably getActiveSubclusters error.", appId, appAttemptId); } catch (IOException | YarnException e) { routerMetrics.incrGetContainersFailedRetrieved(); + RouterAuditLogger.logFailure(getUser().getShortUserName(), GET_CONTAINERS, + UNKNOWN, TARGET_WEB_SERVICE, e.getLocalizedMessage()); RouterServerUtil.logAndThrowRunTimeException(e, "getContainers error, appId = %s, " + " appAttemptId = %s.", appId, appAttemptId); } routerMetrics.incrGetContainersFailedRetrieved(); + RouterAuditLogger.logFailure(getUser().getShortUserName(), GET_CONTAINERS, + UNKNOWN, 
TARGET_WEB_SERVICE, "getContainers failed."); throw RouterServerUtil.logAndReturnRunTimeException( "getContainers failed, appId: %s, appAttemptId: %s.", appId, appAttemptId); } @@ -2695,6 +3081,8 @@ public ContainerInfo getContainer(HttpServletRequest req, RouterServerUtil.validateContainerId(containerId); } catch (IllegalArgumentException e) { routerMetrics.incrGetContainerReportFailedRetrieved(); + RouterAuditLogger.logFailure(getUser().getShortUserName(), GET_CONTAINER, + UNKNOWN, TARGET_WEB_SERVICE, e.getLocalizedMessage()); throw e; } @@ -2706,6 +3094,8 @@ public ContainerInfo getContainer(HttpServletRequest req, if (containerInfo != null) { long stopTime = Time.now(); routerMetrics.succeededGetContainerReportRetrieved(stopTime - startTime); + RouterAuditLogger.logSuccess(getUser().getShortUserName(), GET_CONTAINER, + TARGET_WEB_SERVICE); return containerInfo; } } catch (IllegalArgumentException e) { @@ -2713,13 +3103,19 @@ public ContainerInfo getContainer(HttpServletRequest req, "Unable to get the AppAttempt appId: %s, appAttemptId: %s, containerId: %s.", appId, appAttemptId, containerId); routerMetrics.incrGetContainerReportFailedRetrieved(); + RouterAuditLogger.logFailure(getUser().getShortUserName(), GET_CONTAINER, + UNKNOWN, TARGET_WEB_SERVICE, e.getLocalizedMessage()); RouterServerUtil.logAndThrowRunTimeException(msg, e); } catch (YarnException e) { routerMetrics.incrGetContainerReportFailedRetrieved(); + RouterAuditLogger.logFailure(getUser().getShortUserName(), GET_CONTAINER, + UNKNOWN, TARGET_WEB_SERVICE, e.getLocalizedMessage()); RouterServerUtil.logAndThrowRunTimeException("getContainer Failed.", e); } routerMetrics.incrGetContainerReportFailedRetrieved(); + RouterAuditLogger.logFailure(getUser().getShortUserName(), GET_CONTAINER, + UNKNOWN, TARGET_WEB_SERVICE, "getContainer Failed."); throw new RuntimeException("getContainer Failed."); } @@ -2743,6 +3139,9 @@ public Response updateSchedulerConfiguration(SchedConfUpdateInfo mutationInfo, // Make Sure mutationInfo is not null. 
if (mutationInfo == null) { routerMetrics.incrUpdateSchedulerConfigurationFailedRetrieved(); + RouterAuditLogger.logFailure(getUser().getShortUserName(), UPDATE_SCHEDULER_CONFIGURATION, + UNKNOWN, TARGET_WEB_SERVICE, + "Parameter error, the schedConfUpdateInfo is empty or null."); throw new IllegalArgumentException( "Parameter error, the schedConfUpdateInfo is empty or null."); } @@ -2753,6 +3152,9 @@ public Response updateSchedulerConfiguration(SchedConfUpdateInfo mutationInfo, String pSubClusterId = mutationInfo.getSubClusterId(); if (StringUtils.isBlank(pSubClusterId)) { routerMetrics.incrUpdateSchedulerConfigurationFailedRetrieved(); + RouterAuditLogger.logFailure(getUser().getShortUserName(), UPDATE_SCHEDULER_CONFIGURATION, + UNKNOWN, TARGET_WEB_SERVICE, + "Parameter error, the subClusterId is empty or null."); throw new IllegalArgumentException("Parameter error, " + "the subClusterId is empty or null."); } @@ -2767,19 +3169,27 @@ public Response updateSchedulerConfiguration(SchedConfUpdateInfo mutationInfo, if (response != null) { long endTime = clock.getTime(); routerMetrics.succeededUpdateSchedulerConfigurationRetrieved(endTime - startTime); + RouterAuditLogger.logSuccess(getUser().getShortUserName(), UPDATE_SCHEDULER_CONFIGURATION, + TARGET_WEB_SERVICE); return Response.status(response.getStatus()).entity(response.getEntity()).build(); } } catch (NotFoundException e) { routerMetrics.incrUpdateSchedulerConfigurationFailedRetrieved(); + RouterAuditLogger.logFailure(getUser().getShortUserName(), UPDATE_SCHEDULER_CONFIGURATION, + UNKNOWN, TARGET_WEB_SERVICE, e.getLocalizedMessage()); RouterServerUtil.logAndThrowRunTimeException(e, "Get subCluster error. subClusterId = %s", pSubClusterId); } catch (Exception e) { routerMetrics.incrUpdateSchedulerConfigurationFailedRetrieved(); + RouterAuditLogger.logFailure(getUser().getShortUserName(), UPDATE_SCHEDULER_CONFIGURATION, + UNKNOWN, TARGET_WEB_SERVICE, e.getLocalizedMessage()); RouterServerUtil.logAndThrowRunTimeException(e, "UpdateSchedulerConfiguration error. subClusterId = %s", pSubClusterId); } routerMetrics.incrUpdateSchedulerConfigurationFailedRetrieved(); + RouterAuditLogger.logFailure(getUser().getShortUserName(), UPDATE_SCHEDULER_CONFIGURATION, + UNKNOWN, TARGET_WEB_SERVICE, "UpdateSchedulerConfiguration Failed."); throw new RuntimeException("UpdateSchedulerConfiguration error. 
subClusterId = " + pSubClusterId); } @@ -2822,18 +3232,25 @@ public Response getSchedulerConfiguration(HttpServletRequest hsr) }); long endTime = clock.getTime(); routerMetrics.succeededGetSchedulerConfigurationRetrieved(endTime - startTime); + RouterAuditLogger.logSuccess(getUser().getShortUserName(), GET_SCHEDULER_CONFIGURATION, + TARGET_WEB_SERVICE); return Response.status(Status.OK).entity(federationConfInfo).build(); } catch (NotFoundException e) { - RouterServerUtil.logAndThrowRunTimeException("get all active sub cluster(s) error.", e); + RouterAuditLogger.logFailure(getUser().getShortUserName(), GET_SCHEDULER_CONFIGURATION, + UNKNOWN, TARGET_WEB_SERVICE, e.getLocalizedMessage()); routerMetrics.incrGetSchedulerConfigurationFailedRetrieved(); + RouterServerUtil.logAndThrowRunTimeException("get all active sub cluster(s) error.", e); } catch (Exception e) { + RouterAuditLogger.logFailure(getUser().getShortUserName(), GET_SCHEDULER_CONFIGURATION, + UNKNOWN, TARGET_WEB_SERVICE, e.getLocalizedMessage()); routerMetrics.incrGetSchedulerConfigurationFailedRetrieved(); - RouterServerUtil.logAndThrowRunTimeException("getSchedulerConfiguration error.", e); return Response.status(Status.BAD_REQUEST).entity("getSchedulerConfiguration error.").build(); } routerMetrics.incrGetSchedulerConfigurationFailedRetrieved(); - throw new RuntimeException("getSchedulerConfiguration error."); + RouterAuditLogger.logFailure(getUser().getShortUserName(), GET_SCHEDULER_CONFIGURATION, + UNKNOWN, TARGET_WEB_SERVICE, "getSchedulerConfiguration Failed."); + throw new RuntimeException("getSchedulerConfiguration Failed."); } @Override @@ -2853,12 +3270,16 @@ public Response signalToContainer(String containerId, String command, RouterServerUtil.validateContainerId(containerId); } catch (IllegalArgumentException e) { routerMetrics.incrSignalToContainerFailedRetrieved(); + RouterAuditLogger.logFailure(getUser().getShortUserName(), SIGNAL_TOCONTAINER, + UNKNOWN, TARGET_WEB_SERVICE, e.getLocalizedMessage()); throw e; } // Check if command is empty or null if (command == null || command.isEmpty()) { routerMetrics.incrSignalToContainerFailedRetrieved(); + RouterAuditLogger.logFailure(getUser().getShortUserName(), SIGNAL_TOCONTAINER, + UNKNOWN, TARGET_WEB_SERVICE, "Parameter error, the command is empty or null."); throw new IllegalArgumentException("Parameter error, the command is empty or null."); } @@ -2874,18 +3295,26 @@ public Response signalToContainer(String containerId, String command, Response response = interceptor.signalToContainer(containerId, command, req); if (response != null) { long stopTime = Time.now(); + RouterAuditLogger.logSuccess(getUser().getShortUserName(), SIGNAL_TOCONTAINER, + TARGET_WEB_SERVICE); routerMetrics.succeededSignalToContainerRetrieved(stopTime - startTime); return response; } } catch (YarnException e) { routerMetrics.incrSignalToContainerFailedRetrieved(); + RouterAuditLogger.logFailure(getUser().getShortUserName(), SIGNAL_TOCONTAINER, + UNKNOWN, TARGET_WEB_SERVICE, e.getLocalizedMessage()); RouterServerUtil.logAndThrowRunTimeException("signalToContainer Failed.", e); } catch (AuthorizationException e) { routerMetrics.incrSignalToContainerFailedRetrieved(); + RouterAuditLogger.logFailure(getUser().getShortUserName(), SIGNAL_TOCONTAINER, + UNKNOWN, TARGET_WEB_SERVICE, e.getLocalizedMessage()); RouterServerUtil.logAndThrowRunTimeException("signalToContainer Author Failed.", e); } routerMetrics.incrSignalToContainerFailedRetrieved(); + RouterAuditLogger.logFailure(getUser().getShortUserName(), 
SIGNAL_TOCONTAINER, + UNKNOWN, TARGET_WEB_SERVICE, "signalToContainer Failed."); throw new RuntimeException("signalToContainer Failed."); } From 0042544bf2b3bcb89f1bbd3d792e489c28655432 Mon Sep 17 00:00:00 2001 From: PJ Fanning Date: Tue, 24 Oct 2023 12:28:40 +0100 Subject: [PATCH 11/23] HADOOP-18949. upgrade maven dependency plugin due to CVE-2021-26291. (#6219) Addresses CVE-2021-26291. "Origin Validation Error in Apache Maven" Contributed by PJ Fanning. --- hadoop-maven-plugins/pom.xml | 34 ++++++++++++++++++++++++++++++++-- 1 file changed, 32 insertions(+), 2 deletions(-) diff --git a/hadoop-maven-plugins/pom.xml b/hadoop-maven-plugins/pom.xml index 522c5a9468705..8765eb795b874 100644 --- a/hadoop-maven-plugins/pom.xml +++ b/hadoop-maven-plugins/pom.xml @@ -26,26 +26,56 @@ maven-plugin Apache Hadoop Maven Plugins - 3.0.5 - 3.6.0 + 3.9.5 + 3.10.1 + 2.7.0 + 0.3.5 org.apache.maven maven-plugin-api ${maven.dependency.version} + + + org.eclipse.sisu + org.eclipse.sisu.inject + + + org.codehaus.plexus + plexus-classworlds + + org.apache.maven maven-core ${maven.dependency.version} + + org.eclipse.sisu + org.eclipse.sisu.inject + org.sonatype.sisu sisu-inject-plexus + + org.codehaus.plexus + plexus-classworlds + + + org.codehaus.plexus + plexus-classworlds + ${plexus.classworlds.version} + + + org.eclipse.sisu + org.eclipse.sisu.inject + ${sisu.inject.version} + org.apache.maven.plugin-tools maven-plugin-annotations From 8b974bcc1f084ae77dccf99ebc243e7a571f2e11 Mon Sep 17 00:00:00 2001 From: Steve Loughran Date: Tue, 24 Oct 2023 18:17:52 +0100 Subject: [PATCH 12/23] HADOOP-18889. Third party storage followup. (#6186) Followup to HADOOP-18889 third party store support; Fix some minor review comments which came in after the merge. --- .../java/org/apache/hadoop/fs/s3a/S3AFileSystem.java | 2 +- .../org/apache/hadoop/fs/s3a/auth/SignerFactory.java | 10 ---------- 2 files changed, 1 insertion(+), 11 deletions(-) diff --git a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3AFileSystem.java b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3AFileSystem.java index 8ab8d22cc6d84..d7149d7dead6b 100644 --- a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3AFileSystem.java +++ b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3AFileSystem.java @@ -1357,7 +1357,7 @@ public String getBucketLocation() throws IOException { public String getBucketLocation(String bucketName) throws IOException { final String region = trackDurationAndSpan( STORE_EXISTS_PROBE, bucketName, null, () -> - once("getBucketLocation()", bucketName, () -> + invoker.retry("getBucketLocation()", bucketName, true, () -> // If accessPoint then region is known from Arn accessPoint != null ? accessPoint.getRegion() diff --git a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/auth/SignerFactory.java b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/auth/SignerFactory.java index 5d34688cebe14..21c390c07940b 100644 --- a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/auth/SignerFactory.java +++ b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/auth/SignerFactory.java @@ -81,16 +81,6 @@ public static void registerSigner( SIGNERS.put(signerType, signerClass); } - /** - * Check if the signer has already been registered. - * @param signerType signer to get - * @throws IllegalArgumentException if the signer type is unknown. 
- */ - public static void verifySignerRegistered(String signerType) { - checkArgument(isSignerRegistered(signerType), - "unknown signer type: %s", signerType); - } - /** * Check if the signer has already been registered. * @param signerType signer to get From 882f08b4bc1d23ac3b0d78b339ddd3a5af53abdd Mon Sep 17 00:00:00 2001 From: Stephen O'Donnell Date: Tue, 24 Oct 2023 21:39:03 +0100 Subject: [PATCH 13/23] HDFS-17237. Remove IPCLoggerChannelMetrics when the logger is closed (#6217) --- .../qjournal/client/IPCLoggerChannel.java | 1 + .../client/IPCLoggerChannelMetrics.java | 39 +++---------------- .../qjournal/client/TestIPCLoggerChannel.java | 21 +++++++++- 3 files changed, 26 insertions(+), 35 deletions(-) diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/qjournal/client/IPCLoggerChannel.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/qjournal/client/IPCLoggerChannel.java index 4b7e59c51f13e..67fc85810278d 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/qjournal/client/IPCLoggerChannel.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/qjournal/client/IPCLoggerChannel.java @@ -206,6 +206,7 @@ public void close() { // making any more calls after this point (eg clear the queue) RPC.stopProxy(proxy); } + metrics.unregister(); } protected QJournalProtocol getProxy() throws IOException { diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/qjournal/client/IPCLoggerChannelMetrics.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/qjournal/client/IPCLoggerChannelMetrics.java index 6eef8ffd38620..c1e27e2e98a71 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/qjournal/client/IPCLoggerChannelMetrics.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/qjournal/client/IPCLoggerChannelMetrics.java @@ -18,7 +18,6 @@ package org.apache.hadoop.hdfs.qjournal.client; import java.net.InetSocketAddress; -import java.util.Map; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.hdfs.DFSConfigKeys; @@ -29,8 +28,6 @@ import org.apache.hadoop.metrics2.lib.MetricsRegistry; import org.apache.hadoop.metrics2.lib.MutableQuantiles; -import org.apache.hadoop.thirdparty.com.google.common.collect.Maps; - /** * The metrics for a journal from the writer's perspective. */ @@ -43,21 +40,6 @@ class IPCLoggerChannelMetrics { private final MutableQuantiles[] writeEndToEndLatencyQuantiles; private final MutableQuantiles[] writeRpcLatencyQuantiles; - - /** - * In the case of the NN transitioning between states, edit logs are closed - * and reopened. Thus, the IPCLoggerChannel instance that writes to a - * given JournalNode may change over the lifetime of the process. - * However, metrics2 doesn't have a function to unregister a set of metrics - * and fails if a new metrics class is registered with the same name - * as the existing one. Hence, we have to maintain our own registry - * ("multiton") here, so that we have exactly one metrics instance - * per JournalNode, and switch out the pointer to the underlying - * IPCLoggerChannel instance. 
- */ - private static final Map REGISTRY = - Maps.newHashMap(); - private IPCLoggerChannelMetrics(IPCLoggerChannel ch) { this.ch = ch; @@ -81,25 +63,16 @@ private IPCLoggerChannelMetrics(IPCLoggerChannel ch) { writeRpcLatencyQuantiles = null; } } - - private void setChannel(IPCLoggerChannel ch) { - assert ch.getRemoteAddress().equals(this.ch.getRemoteAddress()); - this.ch = ch; + + public void unregister() { + DefaultMetricsSystem.instance().unregisterSource(getName(ch)); } static IPCLoggerChannelMetrics create(IPCLoggerChannel ch) { String name = getName(ch); - synchronized (REGISTRY) { - IPCLoggerChannelMetrics m = REGISTRY.get(name); - if (m != null) { - m.setChannel(ch); - } else { - m = new IPCLoggerChannelMetrics(ch); - DefaultMetricsSystem.instance().register(name, null, m); - REGISTRY.put(name, m); - } - return m; - } + IPCLoggerChannelMetrics m = new IPCLoggerChannelMetrics(ch); + DefaultMetricsSystem.instance().register(name, null, m); + return m; } private static String getName(IPCLoggerChannel ch) { diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/qjournal/client/TestIPCLoggerChannel.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/qjournal/client/TestIPCLoggerChannel.java index f2f46424cfd5a..06df99de1fe8a 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/qjournal/client/TestIPCLoggerChannel.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/qjournal/client/TestIPCLoggerChannel.java @@ -24,12 +24,13 @@ import java.util.concurrent.ExecutionException; import java.util.concurrent.TimeUnit; +import org.apache.hadoop.metrics2.MetricsSource; +import org.apache.hadoop.metrics2.MetricsSystem; +import org.apache.hadoop.metrics2.lib.DefaultMetricsSystem; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.hdfs.DFSConfigKeys; -import org.apache.hadoop.hdfs.qjournal.client.IPCLoggerChannel; -import org.apache.hadoop.hdfs.qjournal.client.LoggerTooFarBehindException; import org.apache.hadoop.hdfs.qjournal.protocol.QJournalProtocol; import org.apache.hadoop.hdfs.qjournal.protocol.RequestInfo; import org.apache.hadoop.hdfs.server.namenode.NameNodeLayoutVersion; @@ -178,4 +179,20 @@ public void testStopSendingEditsWhenOutOfSync() throws Exception { ch.sendEdits(3L, 3L, 1, FAKE_DATA).get(); } + + @Test + public void testMetricsRemovedOnClose() { + MetricsSystem metricsSystem = DefaultMetricsSystem.instance(); + String sourceName = "IPCLoggerChannel-" + + FAKE_ADDR.getAddress().getHostAddress() + + "-" + FAKE_ADDR.getPort(); + // Ensure the metrics exist + MetricsSource source = metricsSystem.getSource(sourceName); + assertNotNull(source); + + ch.close(); + // ensure the metrics are removed. + source = metricsSystem.getSource(sourceName); + assertNull(source); + } } From a170d58501cf9f1bca0f111007122caf3ebe9419 Mon Sep 17 00:00:00 2001 From: gp1314 <814085234@qq.com> Date: Wed, 25 Oct 2023 11:43:12 +0800 Subject: [PATCH 14/23] HDFS-17231. HA: Safemode should exit when resources are from low to available. (#6207). Contributed by Gu Peng. 
Reviewed-by: Xing Lin Signed-off-by: He Xiaoqiao --- .../hadoop/hdfs/server/namenode/FSNamesystem.java | 12 ++++++++++++ .../server/namenode/TestNameNodeResourceChecker.java | 8 ++++++++ 2 files changed, 20 insertions(+) diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSNamesystem.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSNamesystem.java index d9b165f96ee0c..3d360c6d0dd2a 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSNamesystem.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSNamesystem.java @@ -4534,6 +4534,11 @@ public void run () { LOG.warn(lowResourcesMsg + "Already in safe mode."); } enterSafeMode(true); + } else { + if (isNoManualAndResourceLowSafeMode()) { + LOG.info("Namenode has sufficient available resources, exiting safe mode."); + leaveSafeMode(false); + } } try { Thread.sleep(resourceRecheckInterval); @@ -5265,6 +5270,13 @@ private synchronized boolean isInManualOrResourceLowSafeMode() { return manualSafeMode || resourceLowSafeMode; } + /** + * @return true if it is not in manual safe mode and resource low safe mode. + */ + private synchronized boolean isNoManualAndResourceLowSafeMode() { + return !manualSafeMode && resourceLowSafeMode; + } + private synchronized void setManualAndResourceLowSafeMode(boolean manual, boolean resourceLow) { this.manualSafeMode = manual; diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestNameNodeResourceChecker.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestNameNodeResourceChecker.java index f86ce5fc06772..f3e187b5e3cd9 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestNameNodeResourceChecker.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestNameNodeResourceChecker.java @@ -130,6 +130,14 @@ public void testCheckThatNameNodeResourceMonitorIsRunning() assertTrue("NN should be in safe mode after resources crossed threshold", cluster.getNameNode().isInSafeMode()); + + mockResourceChecker.setResourcesAvailable(true); + while (cluster.getNameNode().isInSafeMode() && + Time.now() < startMillis + (60 * 1000)) { + Thread.sleep(1000); + } + assertTrue("NN should leave safe mode after resources not crossed threshold", + !cluster.getNameNode().isInSafeMode()); } finally { if (cluster != null) cluster.shutdown(); From f85ac5b60daf478c08b3301d7633db933f1d0834 Mon Sep 17 00:00:00 2001 From: huhaiyang Date: Wed, 25 Oct 2023 13:56:39 +0800 Subject: [PATCH 15/23] HADOOP-18920. 
RPC Metrics : Optimize logic for log slow RPCs (#6146) --- .../fs/CommonConfigurationKeysPublic.java | 4 ++ .../java/org/apache/hadoop/ipc/Server.java | 52 +++++++++++++------ .../src/main/resources/core-default.xml | 9 ++++ .../apache/hadoop/ipc/TestProtoBufRpc.java | 9 +++- .../hadoop/hdfs/server/namenode/NameNode.java | 48 ++++++++++++++++- .../namenode/TestNameNodeReconfigure.java | 46 ++++++++++++++++ .../hadoop/hdfs/tools/TestDFSAdmin.java | 2 +- 7 files changed, 150 insertions(+), 20 deletions(-) diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/CommonConfigurationKeysPublic.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/CommonConfigurationKeysPublic.java index 397d81f92f60b..006144e64ad15 100644 --- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/CommonConfigurationKeysPublic.java +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/CommonConfigurationKeysPublic.java @@ -504,6 +504,10 @@ public class CommonConfigurationKeysPublic { "ipc.server.log.slow.rpc"; public static final boolean IPC_SERVER_LOG_SLOW_RPC_DEFAULT = false; + public static final String IPC_SERVER_LOG_SLOW_RPC_THRESHOLD_MS_KEY = + "ipc.server.log.slow.rpc.threshold.ms"; + public static final long IPC_SERVER_LOG_SLOW_RPC_THRESHOLD_MS_DEFAULT = 0; + public static final String IPC_SERVER_PURGE_INTERVAL_MINUTES_KEY = "ipc.server.purge.interval"; public static final int IPC_SERVER_PURGE_INTERVAL_MINUTES_DEFAULT = 15; diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ipc/Server.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ipc/Server.java index 73c86c09fc79e..53497e9707807 100644 --- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ipc/Server.java +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ipc/Server.java @@ -516,16 +516,22 @@ protected ResponseBuffer initialValue() { private final long metricsUpdaterInterval; private final ScheduledExecutorService scheduledExecutorService; - private boolean logSlowRPC = false; + private volatile boolean logSlowRPC = false; + /** Threshold time for log slow rpc. */ + private volatile long logSlowRPCThresholdTime; /** * Checks if LogSlowRPC is set true. * @return true, if LogSlowRPC is set true, false, otherwise. */ - protected boolean isLogSlowRPC() { + public boolean isLogSlowRPC() { return logSlowRPC; } + public long getLogSlowRPCThresholdTime() { + return logSlowRPCThresholdTime; + } + public int getNumInProcessHandler() { return numInProcessHandler.get(); } @@ -543,10 +549,16 @@ public long getTotalRequestsPerSecond() { * @param logSlowRPCFlag input logSlowRPCFlag. */ @VisibleForTesting - protected void setLogSlowRPC(boolean logSlowRPCFlag) { + public void setLogSlowRPC(boolean logSlowRPCFlag) { this.logSlowRPC = logSlowRPCFlag; } + @VisibleForTesting + public void setLogSlowRPCThresholdTime(long logSlowRPCThresholdMs) { + this.logSlowRPCThresholdTime = rpcMetrics.getMetricsTimeUnit(). + convert(logSlowRPCThresholdMs, TimeUnit.MILLISECONDS); + } + private void setPurgeIntervalNanos(int purgeInterval) { int tmpPurgeInterval = CommonConfigurationKeysPublic. IPC_SERVER_PURGE_INTERVAL_MINUTES_DEFAULT; @@ -568,12 +580,15 @@ public long getPurgeIntervalNanos() { * @param methodName - RPC Request method name * @param details - Processing Detail. * - * if this request took too much time relative to other requests - * we consider that as a slow RPC. 
3 is a magic number that comes - * from 3 sigma deviation. A very simple explanation can be found - * by searching for 68-95-99.7 rule. We flag an RPC as slow RPC - * if and only if it falls above 99.7% of requests. We start this logic - * only once we have enough sample size. + * If a request took significant more time than other requests, + * and its processing time is at least `logSlowRPCThresholdMs` we consider that as a slow RPC. + * + * The definition rules for calculating whether the current request took too much time + * compared to other requests are as follows: + * 3 is a magic number that comes from 3 sigma deviation. + * A very simple explanation can be found by searching for 68-95-99.7 rule. + * We flag an RPC as slow RPC if and only if it falls above 99.7% of requests. + * We start this logic only once we have enough sample size. */ void logSlowRpcCalls(String methodName, Call call, ProcessingDetails details) { @@ -587,15 +602,14 @@ void logSlowRpcCalls(String methodName, Call call, final double threeSigma = rpcMetrics.getProcessingMean() + (rpcMetrics.getProcessingStdDev() * deviation); - long processingTime = - details.get(Timing.PROCESSING, rpcMetrics.getMetricsTimeUnit()); + final TimeUnit metricsTimeUnit = rpcMetrics.getMetricsTimeUnit(); + long processingTime = details.get(Timing.PROCESSING, metricsTimeUnit); if ((rpcMetrics.getProcessingSampleCount() > minSampleSize) && - (processingTime > threeSigma)) { - LOG.warn( - "Slow RPC : {} took {} {} to process from client {}," - + " the processing detail is {}", - methodName, processingTime, rpcMetrics.getMetricsTimeUnit(), call, - details.toString()); + (processingTime > threeSigma) && + (processingTime > getLogSlowRPCThresholdTime())) { + LOG.warn("Slow RPC : {} took {} {} to process from client {}, the processing detail is {}," + + " and the threshold time is {} {}.", methodName, processingTime, metricsTimeUnit, + call, details.toString(), getLogSlowRPCThresholdTime(), metricsTimeUnit); rpcMetrics.incrSlowRpc(); } } @@ -3359,6 +3373,10 @@ protected Server(String bindAddress, int port, CommonConfigurationKeysPublic.IPC_SERVER_LOG_SLOW_RPC, CommonConfigurationKeysPublic.IPC_SERVER_LOG_SLOW_RPC_DEFAULT)); + this.setLogSlowRPCThresholdTime(conf.getLong( + CommonConfigurationKeysPublic.IPC_SERVER_LOG_SLOW_RPC_THRESHOLD_MS_KEY, + CommonConfigurationKeysPublic.IPC_SERVER_LOG_SLOW_RPC_THRESHOLD_MS_DEFAULT)); + this.setPurgeIntervalNanos(conf.getInt( CommonConfigurationKeysPublic.IPC_SERVER_PURGE_INTERVAL_MINUTES_KEY, CommonConfigurationKeysPublic.IPC_SERVER_PURGE_INTERVAL_MINUTES_DEFAULT)); diff --git a/hadoop-common-project/hadoop-common/src/main/resources/core-default.xml b/hadoop-common-project/hadoop-common/src/main/resources/core-default.xml index 6c3597a83fa69..d64abf79407ae 100644 --- a/hadoop-common-project/hadoop-common/src/main/resources/core-default.xml +++ b/hadoop-common-project/hadoop-common/src/main/resources/core-default.xml @@ -2526,6 +2526,15 @@ The switch to turn S3A auditing on or off. + + ipc.server.log.slow.rpc.threshold.ms + 0 + The threshold in milliseconds for logging slow rpc when ipc.server.log.slow.rpc is enabled. + Besides of being much slower than other RPC requests, an RPC request has to take at least the threshold value + defined by this property before it can be considered as slow. By default, this threshold is set to 0 (disabled). 
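Taken together with the Server.java javadoc above, the rule is: a call is logged (and counted) as a slow RPC only when `ipc.server.log.slow.rpc` is true, its processing time exceeds the 3-sigma bound (mean + 3 × standard deviation, per the 68-95-99.7 rule, once enough samples exist) and it also takes at least `ipc.server.log.slow.rpc.threshold.ms`. A minimal sketch of setting both keys follows; the 500 ms value is hypothetical. Both keys are also added to the NameNode's reconfigurable properties later in this patch, so they can be changed at run time without a restart.

```java
import org.apache.hadoop.conf.Configuration;

public class SlowRpcLoggingExample {
  public static void main(String[] args) {
    Configuration conf = new Configuration();
    // Enable slow-RPC logging; a call must exceed both the 3-sigma bound
    // and this 500 ms floor (hypothetical value) before it is logged and
    // counted in the RpcSlowCalls metric.
    conf.setBoolean("ipc.server.log.slow.rpc", true);
    conf.setLong("ipc.server.log.slow.rpc.threshold.ms", 500L);
    System.out.println("slow RPC threshold = "
        + conf.getLong("ipc.server.log.slow.rpc.threshold.ms", 0L) + " ms");
  }
}
```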
+ + + ipc.server.purge.interval 15 diff --git a/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/ipc/TestProtoBufRpc.java b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/ipc/TestProtoBufRpc.java index 0740f056c8fc9..a9eaccb3bf3df 100644 --- a/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/ipc/TestProtoBufRpc.java +++ b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/ipc/TestProtoBufRpc.java @@ -355,6 +355,7 @@ public void testLogSlowRPC() throws IOException, ServiceException, TimeoutException, InterruptedException { //No test with legacy assumeFalse(testWithLegacy); + server.setLogSlowRPCThresholdTime(SLEEP_DURATION); TestRpcService2 client = getClient2(); // make 10 K fast calls for (int x = 0; x < 10000; x++) { @@ -370,7 +371,13 @@ public void testLogSlowRPC() throws IOException, ServiceException, assertThat(rpcMetrics.getProcessingSampleCount()).isGreaterThan(999L); long before = rpcMetrics.getRpcSlowCalls(); - // make a really slow call. Sleep sleeps for 1000ms + // Sleep sleeps for 500ms(less than `logSlowRPCThresholdTime`), + // make sure we never called into Log slow RPC routine. + client.sleep(null, newSleepRequest(SLEEP_DURATION / 2)); + long after = rpcMetrics.getRpcSlowCalls(); + assertThat(before).isEqualTo(after); + + // Make a really slow call. Sleep sleeps for 3000ms. client.sleep(null, newSleepRequest(SLEEP_DURATION * 3)); // Ensure slow call is logged. diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/NameNode.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/NameNode.java index bee7db315de5c..df490ea0d9fe0 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/NameNode.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/NameNode.java @@ -126,6 +126,10 @@ import static org.apache.hadoop.fs.CommonConfigurationKeysPublic.FS_DEFAULT_NAME_KEY; import static org.apache.hadoop.fs.CommonConfigurationKeysPublic.FS_TRASH_INTERVAL_DEFAULT; import static org.apache.hadoop.fs.CommonConfigurationKeysPublic.FS_TRASH_INTERVAL_KEY; +import static org.apache.hadoop.fs.CommonConfigurationKeysPublic.IPC_SERVER_LOG_SLOW_RPC; +import static org.apache.hadoop.fs.CommonConfigurationKeysPublic.IPC_SERVER_LOG_SLOW_RPC_DEFAULT; +import static org.apache.hadoop.fs.CommonConfigurationKeysPublic.IPC_SERVER_LOG_SLOW_RPC_THRESHOLD_MS_KEY; +import static org.apache.hadoop.fs.CommonConfigurationKeysPublic.IPC_SERVER_LOG_SLOW_RPC_THRESHOLD_MS_DEFAULT; import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_BLOCK_INVALIDATE_LIMIT_KEY; import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_DATANODE_MAX_NODES_TO_REPORT_DEFAULT; import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_DATANODE_MAX_NODES_TO_REPORT_KEY; @@ -365,7 +369,9 @@ public enum OperationCategory { DFS_NAMENODE_RECONSTRUCTION_PENDING_TIMEOUT_SEC_KEY, DFS_NAMENODE_DECOMMISSION_BACKOFF_MONITOR_PENDING_LIMIT, DFS_NAMENODE_DECOMMISSION_BACKOFF_MONITOR_PENDING_BLOCKS_PER_LOCK, - DFS_NAMENODE_BLOCKPLACEMENTPOLICY_MIN_BLOCKS_FOR_WRITE_KEY)); + DFS_NAMENODE_BLOCKPLACEMENTPOLICY_MIN_BLOCKS_FOR_WRITE_KEY, + IPC_SERVER_LOG_SLOW_RPC, + IPC_SERVER_LOG_SLOW_RPC_THRESHOLD_MS_KEY)); private static final String USAGE = "Usage: hdfs namenode [" + StartupOption.BACKUP.getName() + "] | \n\t[" @@ -2369,6 +2375,9 @@ protected String reconfigurePropertyImpl(String property, String newVal) newVal); } else if 
(property.equals(DFS_NAMENODE_BLOCKPLACEMENTPOLICY_MIN_BLOCKS_FOR_WRITE_KEY)) { return reconfigureMinBlocksForWrite(property, newVal); + } else if (property.equals(IPC_SERVER_LOG_SLOW_RPC) || + (property.equals(IPC_SERVER_LOG_SLOW_RPC_THRESHOLD_MS_KEY))) { + return reconfigureLogSlowRPC(property, newVal); } else { throw new ReconfigurationException(property, newVal, getConf().get( property)); @@ -2511,6 +2520,43 @@ String reconfigureIPCBackoffEnabled(String newVal) { return Boolean.toString(clientBackoffEnabled); } + String reconfigureLogSlowRPC(String property, String newVal) throws ReconfigurationException { + String result = null; + try { + if (property.equals(IPC_SERVER_LOG_SLOW_RPC)) { + if (newVal != null && !newVal.equalsIgnoreCase("true") && + !newVal.equalsIgnoreCase("false")) { + throw new IllegalArgumentException(newVal + " is not boolean value"); + } + boolean logSlowRPC = (newVal == null ? IPC_SERVER_LOG_SLOW_RPC_DEFAULT : + Boolean.parseBoolean(newVal)); + rpcServer.getClientRpcServer().setLogSlowRPC(logSlowRPC); + if (rpcServer.getServiceRpcServer() != null) { + rpcServer.getServiceRpcServer().setLogSlowRPC(logSlowRPC); + } + if (rpcServer.getLifelineRpcServer() != null) { + rpcServer.getLifelineRpcServer().setLogSlowRPC(logSlowRPC); + } + result = Boolean.toString(logSlowRPC); + } else if (property.equals(IPC_SERVER_LOG_SLOW_RPC_THRESHOLD_MS_KEY)) { + long logSlowRPCThresholdTime = (newVal == null ? + IPC_SERVER_LOG_SLOW_RPC_THRESHOLD_MS_DEFAULT : Long.parseLong(newVal)); + rpcServer.getClientRpcServer().setLogSlowRPCThresholdTime(logSlowRPCThresholdTime); + if (rpcServer.getServiceRpcServer() != null) { + rpcServer.getServiceRpcServer().setLogSlowRPCThresholdTime(logSlowRPCThresholdTime); + } + if (rpcServer.getLifelineRpcServer() != null) { + rpcServer.getLifelineRpcServer().setLogSlowRPCThresholdTime(logSlowRPCThresholdTime); + } + result = Long.toString(logSlowRPCThresholdTime); + } + LOG.info("RECONFIGURE* changed reconfigureLogSlowRPC {} to {}", property, result); + return result; + } catch (IllegalArgumentException e) { + throw new ReconfigurationException(property, newVal, getConf().get(property), e); + } + } + String reconfigureSPSModeEvent(String newVal, String property) throws ReconfigurationException { if (newVal == null diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestNameNodeReconfigure.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestNameNodeReconfigure.java index 63d3a45fff81e..5a0f62a8117e0 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestNameNodeReconfigure.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestNameNodeReconfigure.java @@ -29,6 +29,9 @@ import org.junit.Before; import org.junit.After; +import static org.apache.hadoop.fs.CommonConfigurationKeysPublic.IPC_SERVER_LOG_SLOW_RPC; +import static org.apache.hadoop.fs.CommonConfigurationKeysPublic.IPC_SERVER_LOG_SLOW_RPC_THRESHOLD_MS_DEFAULT; +import static org.apache.hadoop.fs.CommonConfigurationKeysPublic.IPC_SERVER_LOG_SLOW_RPC_THRESHOLD_MS_KEY; import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_DATANODE_MAX_NODES_TO_REPORT_KEY; import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_IMAGE_PARALLEL_LOAD_KEY; import static org.junit.Assert.*; @@ -701,6 +704,49 @@ public void testReconfigureMinBlocksForWrite() throws Exception { assertEquals(3, bm.getMinBlocksForWrite(BlockType.STRIPED)); } + @Test + 
public void testReconfigureLogSlowRPC() throws ReconfigurationException { + final NameNode nameNode = cluster.getNameNode(); + final NameNodeRpcServer nnrs = (NameNodeRpcServer) nameNode.getRpcServer(); + // verify default value. + assertFalse(nnrs.getClientRpcServer().isLogSlowRPC()); + assertEquals(IPC_SERVER_LOG_SLOW_RPC_THRESHOLD_MS_DEFAULT, + nnrs.getClientRpcServer().getLogSlowRPCThresholdTime()); + + // try invalid logSlowRPC. + try { + nameNode.reconfigurePropertyImpl(IPC_SERVER_LOG_SLOW_RPC, "non-boolean"); + fail("should not reach here"); + } catch (ReconfigurationException e) { + assertEquals( + "Could not change property ipc.server.log.slow.rpc from 'false' to 'non-boolean'", + e.getMessage()); + } + + // try correct logSlowRPC. + nameNode.reconfigurePropertyImpl(IPC_SERVER_LOG_SLOW_RPC, "True"); + assertTrue(nnrs.getClientRpcServer().isLogSlowRPC()); + + // revert to defaults. + nameNode.reconfigurePropertyImpl(IPC_SERVER_LOG_SLOW_RPC, null); + assertFalse(nnrs.getClientRpcServer().isLogSlowRPC()); + + // try invalid logSlowRPCThresholdTime. + try { + nameNode.reconfigureProperty(IPC_SERVER_LOG_SLOW_RPC_THRESHOLD_MS_KEY, + "non-numeric"); + fail("Should not reach here"); + } catch (ReconfigurationException e) { + assertEquals("Could not change property " + + "ipc.server.log.slow.rpc.threshold.ms from '0' to 'non-numeric'", e.getMessage()); + } + + // try correct logSlowRPCThresholdTime. + nameNode.reconfigureProperty(IPC_SERVER_LOG_SLOW_RPC_THRESHOLD_MS_KEY, + "20000"); + assertEquals(nnrs.getClientRpcServer().getLogSlowRPCThresholdTime(), 20000); + } + @After public void shutDown() throws IOException { if (cluster != null) { diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/tools/TestDFSAdmin.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/tools/TestDFSAdmin.java index 70a8bab8b0905..1712c620d2c82 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/tools/TestDFSAdmin.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/tools/TestDFSAdmin.java @@ -442,7 +442,7 @@ public void testNameNodeGetReconfigurableProperties() throws IOException, Interr final List outs = Lists.newArrayList(); final List errs = Lists.newArrayList(); getReconfigurableProperties("namenode", address, outs, errs); - assertEquals(23, outs.size()); + assertEquals(25, outs.size()); assertTrue(outs.get(0).contains("Reconfigurable properties:")); assertEquals(DFS_BLOCK_INVALIDATE_LIMIT_KEY, outs.get(1)); assertEquals(DFS_BLOCK_PLACEMENT_EC_CLASSNAME_KEY, outs.get(2)); From bbf905dc99bb8939f61bcb25fe158f56cc826352 Mon Sep 17 00:00:00 2001 From: PJ Fanning Date: Wed, 25 Oct 2023 14:06:13 +0100 Subject: [PATCH 16/23] HADOOP-18933. 
upgrade to netty 4.1.100 due to CVE (#6173) Mitigates Netty security advisory GHSA-xpw8-rcwv-8f8p "HTTP/2 Rapid Reset Attack - DDoS vector in the HTTP/2 protocol due RST frames" Contributed by PJ Fanning --- LICENSE-binary | 60 +++++++++++++++++++++--------------------- hadoop-project/pom.xml | 2 +- 2 files changed, 31 insertions(+), 31 deletions(-) diff --git a/LICENSE-binary b/LICENSE-binary index c367abdff5742..e2f61dc7cd84c 100644 --- a/LICENSE-binary +++ b/LICENSE-binary @@ -257,36 +257,36 @@ io.grpc:grpc-netty:1.26.0 io.grpc:grpc-protobuf:1.26.0 io.grpc:grpc-protobuf-lite:1.26.0 io.grpc:grpc-stub:1.26.0 -io.netty:netty-all:4.1.94.Final -io.netty:netty-buffer:4.1.94.Final -io.netty:netty-codec:4.1.94.Final -io.netty:netty-codec-dns:4.1.94.Final -io.netty:netty-codec-haproxy:4.1.94.Final -io.netty:netty-codec-http:4.1.94.Final -io.netty:netty-codec-http2:4.1.94.Final -io.netty:netty-codec-memcache:4.1.94.Final -io.netty:netty-codec-mqtt:4.1.94.Final -io.netty:netty-codec-redis:4.1.94.Final -io.netty:netty-codec-smtp:4.1.94.Final -io.netty:netty-codec-socks:4.1.94.Final -io.netty:netty-codec-stomp:4.1.94.Final -io.netty:netty-codec-xml:4.1.94.Final -io.netty:netty-common:4.1.94.Final -io.netty:netty-handler:4.1.94.Final -io.netty:netty-handler-proxy:4.1.94.Final -io.netty:netty-resolver:4.1.94.Final -io.netty:netty-resolver-dns:4.1.94.Final -io.netty:netty-transport:4.1.94.Final -io.netty:netty-transport-rxtx:4.1.94.Final -io.netty:netty-transport-sctp:4.1.94.Final -io.netty:netty-transport-udt:4.1.94.Final -io.netty:netty-transport-classes-epoll:4.1.94.Final -io.netty:netty-transport-native-unix-common:4.1.94.Final -io.netty:netty-transport-classes-kqueue:4.1.94.Final -io.netty:netty-resolver-dns-classes-macos:4.1.94.Final -io.netty:netty-transport-native-epoll:4.1.94.Final -io.netty:netty-transport-native-kqueue:4.1.94.Final -io.netty:netty-resolver-dns-native-macos:4.1.94.Final +io.netty:netty-all:4.1.100.Final +io.netty:netty-buffer:4.1.100.Final +io.netty:netty-codec:4.1.100.Final +io.netty:netty-codec-dns:4.1.100.Final +io.netty:netty-codec-haproxy:4.1.100.Final +io.netty:netty-codec-http:4.1.100.Final +io.netty:netty-codec-http2:4.1.100.Final +io.netty:netty-codec-memcache:4.1.100.Final +io.netty:netty-codec-mqtt:4.1.100.Final +io.netty:netty-codec-redis:4.1.100.Final +io.netty:netty-codec-smtp:4.1.100.Final +io.netty:netty-codec-socks:4.1.100.Final +io.netty:netty-codec-stomp:4.1.100.Final +io.netty:netty-codec-xml:4.1.100.Final +io.netty:netty-common:4.1.100.Final +io.netty:netty-handler:4.1.100.Final +io.netty:netty-handler-proxy:4.1.100.Final +io.netty:netty-resolver:4.1.100.Final +io.netty:netty-resolver-dns:4.1.100.Final +io.netty:netty-transport:4.1.100.Final +io.netty:netty-transport-rxtx:4.1.100.Final +io.netty:netty-transport-sctp:4.1.100.Final +io.netty:netty-transport-udt:4.1.100.Final +io.netty:netty-transport-classes-epoll:4.1.100.Final +io.netty:netty-transport-native-unix-common:4.1.100.Final +io.netty:netty-transport-classes-kqueue:4.1.100.Final +io.netty:netty-resolver-dns-classes-macos:4.1.100.Final +io.netty:netty-transport-native-epoll:4.1.100.Final +io.netty:netty-transport-native-kqueue:4.1.100.Final +io.netty:netty-resolver-dns-native-macos:4.1.100.Final io.opencensus:opencensus-api:0.12.3 io.opencensus:opencensus-contrib-grpc-metrics:0.12.3 io.reactivex:rxjava:1.3.8 diff --git a/hadoop-project/pom.xml b/hadoop-project/pom.xml index 9303d7ff4c807..25e48f293a64a 100644 --- a/hadoop-project/pom.xml +++ b/hadoop-project/pom.xml @@ -143,7 +143,7 @@ 5.2.0 2.9.0 
3.2.4 - 4.1.94.Final + 4.1.100.Final 1.1.10.4 1.7.1 From 8bd1f65efc42d9b93568666f10653b81dd53fc01 Mon Sep 17 00:00:00 2001 From: Steve Loughran Date: Wed, 25 Oct 2023 17:39:16 +0100 Subject: [PATCH 17/23] HADOOP-18948. S3A. Add option fs.s3a.directory.operations.purge.uploads to purge on rename/delete (#6218) S3A directory delete and rename will optionally abort all pending multipart uploads in their under their to-be-deleted paths when. fs.s3a.directory.operations.purge.upload is true It is off by default. The filesystems hasPathCapability("fs.s3a.directory.operations.purge.upload") probe will return true when this feature is enabled. Multipart uploads may accrue from interrupted data writes, uncommitted staging/magic committer jobs and other operations/applications. On AWS S3 lifecycle rules are the recommended way to clean these; this change improves support for stores which lack these rules. Contributed by Steve Loughran --- .../fs/statistics/StoreStatisticNames.java | 7 + .../org/apache/hadoop/fs/s3a/Constants.java | 15 ++ .../apache/hadoop/fs/s3a/MultipartUtils.java | 6 +- .../apache/hadoop/fs/s3a/S3AFileSystem.java | 152 ++++++++++++---- .../org/apache/hadoop/fs/s3a/Statistic.java | 5 +- .../hadoop/fs/s3a/impl/CallableSupplier.java | 33 ++-- .../hadoop/fs/s3a/impl/DeleteOperation.java | 46 ++++- .../fs/s3a/impl/OperationCallbacks.java | 12 ++ .../hadoop/fs/s3a/impl/RenameOperation.java | 39 ++++- .../hadoop/fs/s3a/s3guard/S3GuardTool.java | 4 +- .../tools/hadoop-aws/third_party_stores.md | 57 +++++- .../hadoop/fs/s3a/ITestS3AMultipartUtils.java | 5 +- .../hadoop/fs/s3a/MultipartTestUtils.java | 7 +- .../hadoop/fs/s3a/auth/ITestAssumeRole.java | 5 +- ...ITestUploadPurgeOnDirectoryOperations.java | 163 ++++++++++++++++++ .../s3a/performance/AbstractS3ACostTest.java | 7 + .../fs/s3a/s3guard/ITestS3GuardTool.java | 19 +- 17 files changed, 499 insertions(+), 83 deletions(-) create mode 100644 hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/impl/ITestUploadPurgeOnDirectoryOperations.java diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/statistics/StoreStatisticNames.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/statistics/StoreStatisticNames.java index c04c1bb47fcea..19ee9d1414ecf 100644 --- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/statistics/StoreStatisticNames.java +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/statistics/StoreStatisticNames.java @@ -244,6 +244,13 @@ public final class StoreStatisticNames { public static final String OBJECT_MULTIPART_UPLOAD_ABORTED = "object_multipart_aborted"; + /** + * Object multipart list request. + * Value :{@value}. + */ + public static final String OBJECT_MULTIPART_UPLOAD_LIST = + "object_multipart_list"; + /** * Object put/multipart upload count. * Value :{@value}. diff --git a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/Constants.java b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/Constants.java index d69d01f99450f..8b174e92b2911 100644 --- a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/Constants.java +++ b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/Constants.java @@ -1318,4 +1318,19 @@ private Constants() { * The bucket region header. */ public static final String BUCKET_REGION_HEADER = "x-amz-bucket-region"; + + /** + * Should directory operations purge uploads? 
+ * This adds at least one parallelized list operation to the call, + * plus the overhead of deletions. + * Value: {@value}. + */ + public static final String DIRECTORY_OPERATIONS_PURGE_UPLOADS = + "fs.s3a.directory.operations.purge.uploads"; + + /** + * Default value of {@link #DIRECTORY_OPERATIONS_PURGE_UPLOADS}: {@value}. + */ + public static final boolean DIRECTORY_OPERATIONS_PURGE_UPLOADS_DEFAULT = false; + } diff --git a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/MultipartUtils.java b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/MultipartUtils.java index efca093204c25..b2057c211da7b 100644 --- a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/MultipartUtils.java +++ b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/MultipartUtils.java @@ -36,7 +36,7 @@ import org.apache.hadoop.fs.s3a.impl.StoreContext; import org.apache.hadoop.fs.store.audit.AuditSpan; -import static org.apache.hadoop.fs.s3a.Statistic.MULTIPART_UPLOAD_LIST; +import static org.apache.hadoop.fs.s3a.Statistic.OBJECT_MULTIPART_UPLOAD_LIST; import static org.apache.hadoop.fs.statistics.impl.IOStatisticsBinding.trackDurationOfOperation; @@ -66,7 +66,7 @@ private MultipartUtils() { } * @param maxKeys maximum batch size to request at a time from S3. * @return an iterator of matching uploads */ - static MultipartUtils.UploadIterator listMultipartUploads( + static RemoteIterator listMultipartUploads( final StoreContext storeContext, S3Client s3, @Nullable String prefix, @@ -196,7 +196,7 @@ private void requestNextBatch() throws IOException { listing = invoker.retry("listMultipartUploads", prefix, true, trackDurationOfOperation(storeContext.getInstrumentation(), - MULTIPART_UPLOAD_LIST.getSymbol(), + OBJECT_MULTIPART_UPLOAD_LIST.getSymbol(), () -> s3.listMultipartUploads(requestBuilder.build()))); LOG.debug("Listing found {} upload(s)", listing.uploads().size()); diff --git a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3AFileSystem.java b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3AFileSystem.java index d7149d7dead6b..defbcd94a5b14 100644 --- a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3AFileSystem.java +++ b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3AFileSystem.java @@ -258,6 +258,7 @@ import static org.apache.hadoop.fs.statistics.impl.IOStatisticsBinding.trackDurationOfSupplier; import static org.apache.hadoop.io.IOUtils.cleanupWithLogger; import static org.apache.hadoop.util.Preconditions.checkArgument; +import static org.apache.hadoop.util.functional.RemoteIterators.foreach; import static org.apache.hadoop.util.functional.RemoteIterators.typeCastingRemoteIterator; /** @@ -384,6 +385,11 @@ public class S3AFileSystem extends FileSystem implements StreamCapabilities, private SignerManager signerManager; private S3AInternals s3aInternals; + /** + * Do directory operations purge pending uploads? + */ + private boolean dirOperationsPurgeUploads; + /** * Page size for deletions. */ @@ -565,6 +571,9 @@ public void initialize(URI name, Configuration originalConf) //check but do not store the block size longBytesOption(conf, FS_S3A_BLOCK_SIZE, DEFAULT_BLOCKSIZE, 1); enableMultiObjectsDelete = conf.getBoolean(ENABLE_MULTI_DELETE, true); + // should the delete also purge uploads. 
+ dirOperationsPurgeUploads = conf.getBoolean(DIRECTORY_OPERATIONS_PURGE_UPLOADS, + DIRECTORY_OPERATIONS_PURGE_UPLOADS_DEFAULT); this.prefetchEnabled = conf.getBoolean(PREFETCH_ENABLED_KEY, PREFETCH_ENABLED_DEFAULT); long prefetchBlockSizeLong = @@ -1230,7 +1239,7 @@ public void abortOutstandingMultipartUploads(long seconds) purgeBefore); invoker.retry("Purging multipart uploads", bucket, true, () -> { - MultipartUtils.UploadIterator uploadIterator = + RemoteIterator uploadIterator = MultipartUtils.listMultipartUploads(createStoreContext(), s3Client, null, maxKeys); while (uploadIterator.hasNext()) { @@ -2283,12 +2292,14 @@ private long innerRename(Path source, Path dest) // Initiate the rename. // this will call back into this class via the rename callbacks + final StoreContext storeContext = createStoreContext(); RenameOperation renameOperation = new RenameOperation( - createStoreContext(), + storeContext, src, srcKey, p.getLeft(), dst, dstKey, p.getRight(), - new OperationCallbacksImpl(), - pageSize); + new OperationCallbacksImpl(storeContext), + pageSize, + dirOperationsPurgeUploads); return renameOperation.execute(); } @@ -2309,8 +2320,19 @@ private final class OperationCallbacksImpl implements OperationCallbacks { /** Audit Span at time of creation. */ private final AuditSpan auditSpan; - private OperationCallbacksImpl() { - auditSpan = getActiveAuditSpan(); + private final StoreContext storeContext; + + private OperationCallbacksImpl(final StoreContext storeContext) { + this.storeContext = requireNonNull(storeContext); + this.auditSpan = storeContext.getActiveAuditSpan(); + } + + /** + * Get the audit span. + * @return the span + */ + private AuditSpan getAuditSpan() { + return auditSpan; } @Override @@ -2410,7 +2432,29 @@ public RemoteIterator listObjects( Listing.ACCEPT_ALL_BUT_S3N, auditSpan)); } - } + + /** + * Abort multipart uploads under a path. + * @param prefix prefix for uploads to abort + * @return a count of aborts + * @throws IOException trouble; FileNotFoundExceptions are swallowed. + */ + @Override + @Retries.RetryTranslated + public long abortMultipartUploadsUnderPrefix(String prefix) + throws IOException { + getAuditSpan().activate(); + // this deactivates the audit span somehow + final RemoteIterator uploads = + S3AFileSystem.this.listUploadsUnderPrefix(storeContext, prefix); + // so reactivate it. + getAuditSpan().activate(); + return foreach(uploads, upload -> + invoker.retry("Aborting multipart commit", upload.key(), true, () -> + S3AFileSystem.this.abortMultipartUpload(upload))); + } + + } // end OperationCallbacksImpl /** * Callbacks from {@link Listing}. @@ -3371,14 +3415,17 @@ protected boolean deleteWithoutCloseCheck(Path f, boolean recursive) throws IOEx // span covers delete, getFileStatus, fake directory operations. 
try (AuditSpan span = createSpan(INVOCATION_DELETE.getSymbol(), path.toString(), null)) { + // SC will include active span + final StoreContext storeContext = createStoreContext(); boolean outcome = trackDuration(getDurationTrackerFactory(), INVOCATION_DELETE.getSymbol(), new DeleteOperation( - createStoreContext(), + storeContext, innerGetFileStatus(path, true, StatusProbeEnum.ALL), recursive, - new OperationCallbacksImpl(), - pageSize)); + new OperationCallbacksImpl(storeContext), + pageSize, + dirOperationsPurgeUploads)); if (outcome) { try { maybeCreateFakeParentDirectory(path); @@ -5151,13 +5198,39 @@ S3ALocatedFileStatus toLocatedFileStatus(S3AFileStatus status) @InterfaceAudience.Private @Retries.RetryTranslated @AuditEntryPoint - public MultipartUtils.UploadIterator listUploads(@Nullable String prefix) + public RemoteIterator listUploads(@Nullable String prefix) + throws IOException { + // span is picked up retained in the listing. + checkNotClosed(); + try (AuditSpan span = createSpan(MULTIPART_UPLOAD_LIST.getSymbol(), + prefix, null)) { + return listUploadsUnderPrefix(createStoreContext(), prefix); + } + } + + /** + * List any pending multipart uploads whose keys begin with prefix, using + * an iterator that can handle an unlimited number of entries. + * See {@link #listMultipartUploads(String)} for a non-iterator version of + * this. + * @param storeContext store conext. + * @param prefix optional key prefix to search + * @return Iterator over multipart uploads. + * @throws IOException on failure + */ + @InterfaceAudience.Private + @Retries.RetryTranslated + public RemoteIterator listUploadsUnderPrefix( + final StoreContext storeContext, + final @Nullable String prefix) throws IOException { // span is picked up retained in the listing. - return trackDurationAndSpan(MULTIPART_UPLOAD_LIST, prefix, null, () -> - MultipartUtils.listMultipartUploads( - createStoreContext(), s3Client, prefix, maxKeys - )); + String p = prefix; + if (prefix != null && !prefix.isEmpty() && !prefix.endsWith("/")) { + p = prefix + "/"; + } + // duration tracking is done in iterator. + return MultipartUtils.listMultipartUploads(storeContext, s3Client, p, maxKeys); } /** @@ -5179,9 +5252,10 @@ public List listMultipartUploads(String prefix) } String p = prefix; return invoker.retry("listMultipartUploads", p, true, () -> { - ListMultipartUploadsRequest.Builder requestBuilder = getRequestFactory() - .newListMultipartUploadsRequestBuilder(p); - return s3Client.listMultipartUploads(requestBuilder.build()).uploads(); + final ListMultipartUploadsRequest request = getRequestFactory() + .newListMultipartUploadsRequestBuilder(p).build(); + return trackDuration(getInstrumentation(), MULTIPART_UPLOAD_LIST.getSymbol(), () -> + s3Client.listMultipartUploads(request).uploads()); }); } @@ -5190,37 +5264,35 @@ public List listMultipartUploads(String prefix) * Retry policy: none. 
* @param destKey destination key * @param uploadId Upload ID + * @throws IOException IO failure, including any uprated SdkException */ - @Retries.OnceRaw - void abortMultipartUpload(String destKey, String uploadId) { - LOG.info("Aborting multipart upload {} to {}", uploadId, destKey); - s3Client.abortMultipartUpload( - getRequestFactory().newAbortMultipartUploadRequestBuilder( - destKey, - uploadId).build()); + @Retries.OnceTranslated + public void abortMultipartUpload(String destKey, String uploadId) throws IOException { + LOG.debug("Aborting multipart upload {} to {}", uploadId, destKey); + trackDuration(getInstrumentation(), OBJECT_MULTIPART_UPLOAD_ABORTED.getSymbol(), () -> + s3Client.abortMultipartUpload( + getRequestFactory().newAbortMultipartUploadRequestBuilder( + destKey, + uploadId).build())); } /** * Abort a multipart upload. * Retry policy: none. * @param upload the listed upload to abort. + * @throws IOException IO failure, including any uprated SdkException */ - @Retries.OnceRaw - void abortMultipartUpload(MultipartUpload upload) { - String destKey; - String uploadId; - destKey = upload.key(); - uploadId = upload.uploadId(); - if (LOG.isInfoEnabled()) { + @Retries.OnceTranslated + public void abortMultipartUpload(MultipartUpload upload) throws IOException { + String destKey = upload.key(); + String uploadId = upload.uploadId(); + if (LOG.isDebugEnabled()) { DateFormat df = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss"); LOG.debug("Aborting multipart upload {} to {} initiated by {} on {}", uploadId, destKey, upload.initiator(), df.format(Date.from(upload.initiated()))); } - s3Client.abortMultipartUpload( - getRequestFactory().newAbortMultipartUploadRequestBuilder( - destKey, - uploadId).build()); + abortMultipartUpload(destKey, uploadId); } /** @@ -5266,13 +5338,17 @@ public boolean hasPathCapability(final Path path, final String capability) case STORE_CAPABILITY_DIRECTORY_MARKER_AWARE: return true; + // Do directory operations purge uploads. + case DIRECTORY_OPERATIONS_PURGE_UPLOADS: + return dirOperationsPurgeUploads; + // etags are avaialable in listings, but they // are not consistent across renames. // therefore, only availability is declared case CommonPathCapabilities.ETAGS_AVAILABLE: return true; - /* + /* * Marker policy capabilities are handed off. 
*/ case STORE_CAPABILITY_DIRECTORY_MARKER_POLICY_KEEP: @@ -5545,7 +5621,7 @@ public MarkerToolOperations createMarkerToolOperations(final String target) throws IOException { createSpan("marker-tool-scan", target, null); - return new MarkerToolOperationsImpl(new OperationCallbacksImpl()); + return new MarkerToolOperationsImpl(new OperationCallbacksImpl(createStoreContext())); } /** diff --git a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/Statistic.java b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/Statistic.java index f4e28aa62783e..72fc75b642415 100644 --- a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/Statistic.java +++ b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/Statistic.java @@ -242,7 +242,10 @@ public enum Statistic { StoreStatisticNames.OBJECT_MULTIPART_UPLOAD_ABORTED, "Object multipart upload aborted", TYPE_DURATION), - OBJECT_PUT_REQUESTS( + OBJECT_MULTIPART_UPLOAD_LIST( + StoreStatisticNames.OBJECT_MULTIPART_UPLOAD_LIST, + "Object multipart list request issued", + TYPE_DURATION), OBJECT_PUT_REQUESTS( StoreStatisticNames.OBJECT_PUT_REQUEST, "Object put/multipart upload count", TYPE_DURATION), diff --git a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/impl/CallableSupplier.java b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/impl/CallableSupplier.java index 0156207419210..e0580df08a76d 100644 --- a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/impl/CallableSupplier.java +++ b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/impl/CallableSupplier.java @@ -22,6 +22,7 @@ import java.io.IOException; import java.io.UncheckedIOException; import java.util.List; +import java.util.Optional; import java.util.concurrent.Callable; import java.util.concurrent.CancellationException; import java.util.concurrent.CompletableFuture; @@ -155,19 +156,21 @@ public static void waitForCompletion( * Wait for a single of future to complete, extracting IOEs afterwards. * @param future future to wait for. * @param type + * @return the result * @throws IOException if one of the called futures raised an IOE. * @throws RuntimeException if one of the futures raised one. */ - public static void waitForCompletion( + public static T waitForCompletion( final CompletableFuture future) throws IOException { try (DurationInfo ignore = new DurationInfo(LOG, false, "Waiting for task completion")) { - future.join(); + return future.join(); } catch (CancellationException e) { throw new IOException(e); } catch (CompletionException e) { raiseInnerCause(e); + return null; } } @@ -175,31 +178,35 @@ public static void waitForCompletion( * Wait for a single of future to complete, ignoring exceptions raised. * @param future future to wait for. * @param type + * @return the outcome if successfully retrieved. */ - public static void waitForCompletionIgnoringExceptions( + public static Optional waitForCompletionIgnoringExceptions( @Nullable final CompletableFuture future) { - if (future != null) { - try (DurationInfo ignore = - new DurationInfo(LOG, false, "Waiting for task completion")) { - future.join(); - } catch (Exception e) { - LOG.debug("Ignoring exception raised in task completion: "); - } + + try { + return maybeAwaitCompletion(future); + } catch (Exception e) { + LOG.debug("Ignoring exception raised in task completion: ", e); + return Optional.empty(); } } /** * Block awaiting completion for any non-null future passed in; * No-op if a null arg was supplied. 
+ * @param return type * @param future future + * @return the outcome; is empty if the future was null/had no return value * @throws IOException if one of the called futures raised an IOE. * @throws RuntimeException if one of the futures raised one. */ - public static void maybeAwaitCompletion( - @Nullable final CompletableFuture future) + public static Optional maybeAwaitCompletion( + @Nullable final CompletableFuture future) throws IOException { if (future != null) { - waitForCompletion(future); + return Optional.ofNullable(waitForCompletion(future)); + } else { + return Optional.empty(); } } } diff --git a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/impl/DeleteOperation.java b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/impl/DeleteOperation.java index 314d7cb82d1dd..11e73aeb750ae 100644 --- a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/impl/DeleteOperation.java +++ b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/impl/DeleteOperation.java @@ -21,6 +21,7 @@ import java.io.IOException; import java.util.ArrayList; import java.util.List; +import java.util.Optional; import java.util.concurrent.CompletableFuture; import java.util.stream.Collectors; @@ -41,6 +42,7 @@ import org.apache.hadoop.util.DurationInfo; +import static org.apache.hadoop.fs.s3a.impl.CallableSupplier.waitForCompletionIgnoringExceptions; import static org.apache.hadoop.fs.store.audit.AuditingFunctions.callableWithinAuditSpan; import static org.apache.hadoop.util.Preconditions.checkArgument; import static org.apache.hadoop.fs.s3a.impl.CallableSupplier.maybeAwaitCompletion; @@ -110,6 +112,16 @@ public class DeleteOperation extends ExecutingStoreOperation { */ private long filesDeleted; + /** + * Do directory operations purge pending uploads? + */ + private final boolean dirOperationsPurgeUploads; + + /** + * Count of uploads aborted. + */ + private Optional uploadsAborted = Optional.empty(); + /** * Constructor. * @param context store context @@ -117,12 +129,14 @@ public class DeleteOperation extends ExecutingStoreOperation { * @param recursive recursive delete? * @param callbacks callback provider * @param pageSize size of delete pages + * @param dirOperationsPurgeUploads Do directory operations purge pending uploads? */ public DeleteOperation(final StoreContext context, final S3AFileStatus status, final boolean recursive, final OperationCallbacks callbacks, - final int pageSize) { + final int pageSize, + final boolean dirOperationsPurgeUploads) { super(context); this.status = status; @@ -134,12 +148,22 @@ public DeleteOperation(final StoreContext context, this.pageSize = pageSize; executor = MoreExecutors.listeningDecorator( context.createThrottledExecutor(1)); + this.dirOperationsPurgeUploads = dirOperationsPurgeUploads; } public long getFilesDeleted() { return filesDeleted; } + /** + * Get the count of uploads aborted. + * Non-empty iff enabled, and the operations completed without errors. + * @return count of aborted uploads. + */ + public Optional getUploadsAborted() { + return uploadsAborted; + } + /** * Delete a file or directory tree. *
@@ -236,6 +260,17 @@ protected void deleteDirectoryTree(final Path path, try (DurationInfo ignored = new DurationInfo(LOG, false, "deleting %s", dirKey)) { + final CompletableFuture abortUploads; + if (dirOperationsPurgeUploads) { + final StoreContext sc = getStoreContext(); + final String key = sc.pathToKey(path) + "/"; + LOG.debug("All uploads under {} will be deleted", key); + abortUploads = submit(sc.getExecutor(), sc.getActiveAuditSpan(), () -> + callbacks.abortMultipartUploadsUnderPrefix(key)); + } else { + abortUploads = null; + } + // init the lists of keys and paths to delete resetDeleteList(); deleteFuture = null; @@ -257,10 +292,10 @@ protected void deleteDirectoryTree(final Path path, LOG.debug("Deleting final batch of listed files"); submitNextBatch(); maybeAwaitCompletion(deleteFuture); - + uploadsAborted = waitForCompletionIgnoringExceptions(abortUploads); } - LOG.debug("Delete \"{}\" completed; deleted {} objects", path, - filesDeleted); + LOG.debug("Delete \"{}\" completed; deleted {} objects and aborted {} uploads", path, + filesDeleted, uploadsAborted.orElse(0L)); } /** @@ -313,7 +348,8 @@ private void submitNextBatch() throws IOException { // delete a single page of keys and the metadata. // block for any previous batch. - maybeAwaitCompletion(deleteFuture); + maybeAwaitCompletion(deleteFuture).ifPresent(count -> + LOG.debug("Deleted {} uploads", count)); // delete the current page of keys and paths deleteFuture = submitDelete(keys); diff --git a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/impl/OperationCallbacks.java b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/impl/OperationCallbacks.java index e0d9c7c6aada7..9c88870633a35 100644 --- a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/impl/OperationCallbacks.java +++ b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/impl/OperationCallbacks.java @@ -164,4 +164,16 @@ RemoteIterator listObjects( Path path, String key) throws IOException; + + /** + * Abort multipart uploads under a path; paged. + * @param prefix prefix for uploads to abort + * @return a count of aborts + * @throws IOException trouble; FileNotFoundExceptions are swallowed. 
+ */ + @Retries.RetryTranslated + default long abortMultipartUploadsUnderPrefix(String prefix) + throws IOException { + return 0; + } } diff --git a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/impl/RenameOperation.java b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/impl/RenameOperation.java index 4bb15f74965a9..288b3c0aae585 100644 --- a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/impl/RenameOperation.java +++ b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/impl/RenameOperation.java @@ -22,6 +22,7 @@ import java.util.ArrayList; import java.util.List; import java.util.Map; +import java.util.Optional; import java.util.concurrent.CompletableFuture; import java.util.concurrent.atomic.AtomicLong; @@ -44,6 +45,7 @@ import org.apache.hadoop.util.OperationDuration; import static org.apache.hadoop.fs.s3a.S3AUtils.translateException; +import static org.apache.hadoop.fs.s3a.impl.CallableSupplier.waitForCompletionIgnoringExceptions; import static org.apache.hadoop.fs.store.audit.AuditingFunctions.callableWithinAuditSpan; import static org.apache.hadoop.fs.s3a.impl.CallableSupplier.submit; import static org.apache.hadoop.fs.s3a.impl.CallableSupplier.waitForCompletion; @@ -124,9 +126,18 @@ public class RenameOperation extends ExecutingStoreOperation { private final List keysToDelete = new ArrayList<>(); + /** + * Do directory operations purge pending uploads? + */ + private final boolean dirOperationsPurgeUploads; + + /** + * Count of uploads aborted. + */ + private Optional uploadsAborted = Optional.empty(); + /** * Initiate the rename. - * * @param storeContext store context * @param sourcePath source path * @param sourceKey key of source @@ -136,6 +147,7 @@ public class RenameOperation extends ExecutingStoreOperation { * @param destStatus destination status. * @param callbacks callback provider * @param pageSize size of delete requests + * @param dirOperationsPurgeUploads Do directory operations purge pending uploads? */ public RenameOperation( final StoreContext storeContext, @@ -146,7 +158,8 @@ public RenameOperation( final String destKey, final S3AFileStatus destStatus, final OperationCallbacks callbacks, - final int pageSize) { + final int pageSize, + final boolean dirOperationsPurgeUploads) { super(storeContext); this.sourcePath = sourcePath; this.sourceKey = sourceKey; @@ -159,6 +172,16 @@ public RenameOperation( && pageSize <= InternalConstants.MAX_ENTRIES_TO_DELETE, "page size out of range: %s", pageSize); this.pageSize = pageSize; + this.dirOperationsPurgeUploads = dirOperationsPurgeUploads; + } + + /** + * Get the count of uploads aborted. + * Non-empty iff enabled, and the operations completed without errors. + * @return count of aborted uploads. 
+ */ + public Optional getUploadsAborted() { + return uploadsAborted; } /** @@ -341,6 +364,16 @@ protected void recursiveDirectoryRename() throws IOException { throw new RenameFailedException(srcKey, dstKey, "cannot rename a directory to a subdirectory of itself "); } + // start the async dir cleanup + final CompletableFuture abortUploads; + if (dirOperationsPurgeUploads) { + final String key = srcKey; + LOG.debug("All uploads under {} will be deleted", key); + abortUploads = submit(getStoreContext().getExecutor(), () -> + callbacks.abortMultipartUploadsUnderPrefix(key)); + } else { + abortUploads = null; + } if (destStatus != null && destStatus.isEmptyDirectory() == Tristate.TRUE) { @@ -422,6 +455,8 @@ protected void recursiveDirectoryRename() throws IOException { // have been deleted. completeActiveCopiesAndDeleteSources("final copy and delete"); + // and if uploads were being aborted, wait for that to finish + uploadsAborted = waitForCompletionIgnoringExceptions(abortUploads); } /** diff --git a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/s3guard/S3GuardTool.java b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/s3guard/S3GuardTool.java index 22fc630dad1f5..ea1ea908486e2 100644 --- a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/s3guard/S3GuardTool.java +++ b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/s3guard/S3GuardTool.java @@ -47,8 +47,8 @@ import org.apache.hadoop.fs.FilterFileSystem; import org.apache.hadoop.fs.Options; import org.apache.hadoop.fs.Path; +import org.apache.hadoop.fs.RemoteIterator; import org.apache.hadoop.fs.s3a.Constants; -import org.apache.hadoop.fs.s3a.MultipartUtils; import org.apache.hadoop.fs.s3a.S3AFileSystem; import org.apache.hadoop.fs.s3a.WriteOperationHelper; import org.apache.hadoop.fs.s3a.auth.RolePolicies; @@ -683,7 +683,7 @@ private void promptBeforeAbort(PrintStream out) throws IOException { private void processUploads(PrintStream out) throws IOException { final S3AFileSystem fs = getFilesystem(); - MultipartUtils.UploadIterator uploads = fs.listUploads(prefix); + RemoteIterator uploads = fs.listUploads(prefix); // create a span so that the write operation helper // is within one AuditSpan span = diff --git a/hadoop-tools/hadoop-aws/src/site/markdown/tools/hadoop-aws/third_party_stores.md b/hadoop-tools/hadoop-aws/src/site/markdown/tools/hadoop-aws/third_party_stores.md index a7ea7b2e59024..0216e46014c7e 100644 --- a/hadoop-tools/hadoop-aws/src/site/markdown/tools/hadoop-aws/third_party_stores.md +++ b/hadoop-tools/hadoop-aws/src/site/markdown/tools/hadoop-aws/third_party_stores.md @@ -39,11 +39,12 @@ The features which may be unavailable include: * Optional Bucket Probes at startup (`fs.s3a.bucket.probe = 0`). This is now the default -do not change it. * List API to use (`fs.s3a.list.version = 1`) +* Bucket lifecycle rules to clean up pending uploads. ## Configuring s3a to connect to a third party store -### Connecting to a third party object store over HTTPS +## Connecting to a third party object store over HTTPS The core setting for a third party store is to change the endpoint in `fs.s3a.endpoint`. @@ -89,6 +90,57 @@ then these must be set, either in XML or (preferred) in a JCEKS file. If per-bucket settings are used here, then third-party stores and credentials may be used alongside an AWS store. + + +## Other issues + +### Coping without bucket lifecycle rules + +Not all third-party stores support bucket lifecycle rules to clean up buckets +of incomplete uploads. 
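Before looking at the options below: with this patch, an S3A client can both enable the rename/delete-time purge and verify that it is active through the new path capability. A minimal sketch, assuming a placeholder bucket `s3a://example-bucket/` and default credentials:

```java
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class PurgeUploadsProbe {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    // Opt in to aborting pending multipart uploads during rename/delete.
    conf.setBoolean("fs.s3a.directory.operations.purge.uploads", true);
    // "s3a://example-bucket/" is a placeholder; use a real bucket.
    Path root = new Path("s3a://example-bucket/");
    try (FileSystem fs = FileSystem.newInstance(root.toUri(), conf)) {
      // Returns true only when the purge-on-rename/delete feature is enabled.
      System.out.println("purge uploads enabled: "
          + fs.hasPathCapability(root, "fs.s3a.directory.operations.purge.uploads"));
    }
  }
}
```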
+ +This can be addressed in two ways +* Command line: `hadoop s3guard uploads -abort -force \`. +* With `fs.s3a.multipart.purge` and a purge age set in `fs.s3a.multipart.purge.age` +* In rename/delete `fs.s3a.directory.operations.purge.uploads = true`. + +#### S3Guard uploads command + +This can be executed on a schedule, or manually + +``` +hadoop s3guard uploads -abort -force s3a://bucket/ +``` + +Consult the [S3Guard documentation](s3guard.html) for the full set of parameters. + +#### In startup: `fs.s3a.multipart.purge` + +This lists all uploads in a bucket when a filesystem is created and deletes +all of those above a certain age. + +This can hurt performance on a large bucket, as the purge scans the entire tree, +and is executed whenever a filesystem is created -which can happen many times during +hive, spark, distcp jobs. + +For this reason, this option may be deleted in future, however it has long been +available in the S3A client and so guaranteed to work across versions. + +#### During rename and delete: `fs.s3a.directory.operations.purge.uploads` + +When `fs.s3a.directory.operations.purge.uploads` is set, when a directory is renamed +or deleted, then in parallel with the delete an attempt is made to list +all pending uploads. +If there are any, they are aborted (sequentially). + +* This is disabled by default: it adds overhead and extra cost. +* Because it only applies to the directories being processed, directories which + are not renamed or deleted will retain all incomplete uploads. +* There is no age checking: all uploads will be aborted. +* If any other process is writing to the same directory tree, their operations +will be cancelled. + + # Troubleshooting The most common problem when talking to third-party stores are @@ -412,4 +464,5 @@ It is also a way to regression test foundational S3A third-party store compatibi ``` _Note_ If anyone is set up to test this reguarly, please let the hadoop developer team know if regressions do surface, -as it is not a common test configuration. \ No newline at end of file +as it is not a common test configuration. +[] \ No newline at end of file diff --git a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/ITestS3AMultipartUtils.java b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/ITestS3AMultipartUtils.java index 263a857e03300..e0559b7c49edc 100644 --- a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/ITestS3AMultipartUtils.java +++ b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/ITestS3AMultipartUtils.java @@ -24,6 +24,7 @@ import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.Path; +import org.apache.hadoop.fs.RemoteIterator; import org.apache.hadoop.fs.store.audit.AuditSpan; import java.io.IOException; @@ -76,7 +77,7 @@ public void testListMultipartUploads() throws Exception { // 2. Verify all uploads are found listing by prefix describe("Verifying upload list by prefix"); - MultipartUtils.UploadIterator uploads = fs.listUploads(getPartPrefix(fs)); + RemoteIterator uploads = fs.listUploads(getPartPrefix(fs)); assertUploadsPresent(uploads, keySet); // 3. 
Verify all uploads are found listing without prefix @@ -97,7 +98,7 @@ public void testListMultipartUploads() throws Exception { * @param ourUploads set up uploads that should be present * @throws IOException on I/O error */ - private void assertUploadsPresent(MultipartUtils.UploadIterator list, + private void assertUploadsPresent(RemoteIterator list, Set ourUploads) throws IOException { // Don't modify passed-in set, use copy. diff --git a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/MultipartTestUtils.java b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/MultipartTestUtils.java index 3e343a9ea85f2..3f6870be46b2a 100644 --- a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/MultipartTestUtils.java +++ b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/MultipartTestUtils.java @@ -23,6 +23,7 @@ import software.amazon.awssdk.services.s3.model.UploadPartRequest; import software.amazon.awssdk.services.s3.model.UploadPartResponse; import org.apache.hadoop.fs.Path; +import org.apache.hadoop.fs.RemoteIterator; import org.apache.hadoop.fs.s3a.impl.PutObjectOptions; import org.apache.hadoop.fs.store.audit.AuditSpan; import org.apache.hadoop.io.IOUtils; @@ -96,7 +97,7 @@ public static void clearAnyUploads(S3AFileSystem fs, Path path) { String key = fs.pathToKey(path); AuditSpan span = null; try { - MultipartUtils.UploadIterator uploads = fs.listUploads(key); + RemoteIterator uploads = fs.listUploads(key); span = fs.createSpan("multipart", path.toString(), null); final WriteOperationHelper helper = fs.getWriteOperationHelper(); @@ -118,7 +119,7 @@ public static void clearAnyUploads(S3AFileSystem fs, Path path) { public static void assertNoUploadsAt(S3AFileSystem fs, Path path) throws Exception { String key = fs.pathToKey(path); - MultipartUtils.UploadIterator uploads = fs.listUploads(key); + RemoteIterator uploads = fs.listUploads(key); while (uploads.hasNext()) { MultipartUpload upload = uploads.next(); Assert.fail("Found unexpected upload " + upload.key() + " " + @@ -130,7 +131,7 @@ public static void assertNoUploadsAt(S3AFileSystem fs, Path path) throws public static int countUploadsAt(S3AFileSystem fs, Path path) throws IOException { String key = fs.pathToKey(path); - MultipartUtils.UploadIterator uploads = fs.listUploads(key); + RemoteIterator uploads = fs.listUploads(key); int count = 0; while (uploads.hasNext()) { MultipartUpload upload = uploads.next(); diff --git a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/auth/ITestAssumeRole.java b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/auth/ITestAssumeRole.java index 5534bb77c0ddb..12234301b50d8 100644 --- a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/auth/ITestAssumeRole.java +++ b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/auth/ITestAssumeRole.java @@ -27,6 +27,7 @@ import java.util.stream.IntStream; import software.amazon.awssdk.auth.credentials.AwsCredentials; +import software.amazon.awssdk.services.s3.model.MultipartUpload; import software.amazon.awssdk.services.sts.model.StsException; import com.fasterxml.jackson.core.JsonProcessingException; import org.assertj.core.api.Assertions; @@ -40,10 +41,10 @@ import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.LocatedFileStatus; import org.apache.hadoop.fs.Path; +import org.apache.hadoop.fs.RemoteIterator; import org.apache.hadoop.fs.contract.ContractTestUtils; import org.apache.hadoop.fs.s3a.AWSBadRequestException; import 
org.apache.hadoop.fs.s3a.AbstractS3ATestBase; -import org.apache.hadoop.fs.s3a.MultipartUtils; import org.apache.hadoop.fs.s3a.S3AFileSystem; import org.apache.hadoop.fs.s3a.S3ATestConstants; import org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider; @@ -463,7 +464,7 @@ public void testReadOnlyOperations() throws Throwable { // list multipart uploads. // This is part of the read policy. int counter = 0; - MultipartUtils.UploadIterator iterator = roleFS.listUploads("/"); + RemoteIterator iterator = roleFS.listUploads("/"); while (iterator.hasNext()) { counter++; iterator.next(); diff --git a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/impl/ITestUploadPurgeOnDirectoryOperations.java b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/impl/ITestUploadPurgeOnDirectoryOperations.java new file mode 100644 index 0000000000000..9e07027375989 --- /dev/null +++ b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/impl/ITestUploadPurgeOnDirectoryOperations.java @@ -0,0 +1,163 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.fs.s3a.impl; + +import java.io.IOException; +import java.nio.charset.StandardCharsets; + +import org.assertj.core.api.Assertions; +import org.junit.Test; +import software.amazon.awssdk.services.s3.model.MultipartUpload; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.fs.RemoteIterator; +import org.apache.hadoop.fs.s3a.S3AFileSystem; +import org.apache.hadoop.fs.s3a.performance.AbstractS3ACostTest; +import org.apache.hadoop.fs.store.audit.AuditSpan; + +import static org.apache.hadoop.fs.contract.ContractTestUtils.assertFileHasLength; +import static org.apache.hadoop.fs.contract.ContractTestUtils.assertHasPathCapabilities; +import static org.apache.hadoop.fs.contract.ContractTestUtils.createFile; +import static org.apache.hadoop.fs.s3a.Constants.DIRECTORY_OPERATIONS_PURGE_UPLOADS; +import static org.apache.hadoop.fs.s3a.MultipartTestUtils.clearAnyUploads; +import static org.apache.hadoop.fs.s3a.S3ATestUtils.removeBaseAndBucketOverrides; +import static org.apache.hadoop.fs.s3a.Statistic.MULTIPART_UPLOAD_LIST; +import static org.apache.hadoop.fs.s3a.Statistic.OBJECT_MULTIPART_UPLOAD_ABORTED; +import static org.apache.hadoop.fs.s3a.Statistic.OBJECT_MULTIPART_UPLOAD_LIST; +import static org.apache.hadoop.fs.s3a.commit.CommitConstants.MAGIC_COMMITTER_ENABLED; +import static org.apache.hadoop.fs.s3a.commit.CommitConstants.MAGIC_PATH_PREFIX; +import static org.apache.hadoop.util.functional.RemoteIterators.toList; + +/** + * Test behavior of purging uploads in rename and delete. 
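+ * The suite enables {@code fs.s3a.directory.operations.purge.uploads} and the
+ * magic committer, creates pending uploads through magic paths, and then
+ * verifies that rename and delete abort them, checking the abort/list
+ * statistics raised by each operation.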
+ */ +public class ITestUploadPurgeOnDirectoryOperations extends AbstractS3ACostTest { + + @Override + public Configuration createConfiguration() { + final Configuration conf = super.createConfiguration(); + removeBaseAndBucketOverrides(conf, + DIRECTORY_OPERATIONS_PURGE_UPLOADS, + MAGIC_COMMITTER_ENABLED); + conf.setBoolean(DIRECTORY_OPERATIONS_PURGE_UPLOADS, true); + conf.setBoolean(MAGIC_COMMITTER_ENABLED, true); + return conf; + } + + @Override + public void setup() throws Exception { + super.setup(); + final S3AFileSystem fs = getFileSystem(); + assertHasPathCapabilities(fs, new Path("/"), + DIRECTORY_OPERATIONS_PURGE_UPLOADS); + clearAnyUploads(fs, methodPath()); + } + + @Test + public void testDeleteWithPendingUpload() throws Throwable { + + final S3AFileSystem fs = getFileSystem(); + final Path dir = methodPath(); + + // create a magic file. + createMagicFile(fs, dir); + + // and there's a pending upload + assertUploadCount(dir, 1); + + // delete the dir, with a cost of 1 abort, 1 list. + verifyMetrics(() -> fs.delete(dir, true), + with(OBJECT_MULTIPART_UPLOAD_ABORTED, 1), // abort + with(OBJECT_MULTIPART_UPLOAD_LIST, 1), // HTTP request inside iterator + with(MULTIPART_UPLOAD_LIST, 0)); // api list call + + + // and the pending upload is gone + assertUploadCount(dir, 0); + } + + @Test + public void testRenameWithPendingUpload() throws Throwable { + + final S3AFileSystem fs = getFileSystem(); + final Path base = methodPath(); + final Path dir = new Path(base, "src"); + final Path dest = new Path(base, "dest"); + + // create a magic file. + createMagicFile(fs, dir); + + // and there's a pending upload + assertUploadCount(dir, 1); + + // rename the dir, with a cost of 1 abort, 1 list. + verifyMetrics(() -> fs.rename(dir, dest), + with(OBJECT_MULTIPART_UPLOAD_ABORTED, 1), // abort + with(OBJECT_MULTIPART_UPLOAD_LIST, 1), // HTTP request inside iterator + with(MULTIPART_UPLOAD_LIST, 0)); // api list call + + // and there isn't + assertUploadCount(dir, 0); + } + + /** + * Create a magic file of "real" length more than 0 bytes long. + * @param fs filesystem + * @param dir directory + * @return the path + * @throws IOException creation failure.p + */ + private static Path createMagicFile(final S3AFileSystem fs, final Path dir) throws IOException { + Path magicFile = new Path(dir, MAGIC_PATH_PREFIX + "001/file.txt"); + createFile(fs, magicFile, true, "123".getBytes(StandardCharsets.UTF_8)); + + // the file exists but is a 0 byte marker file. + assertFileHasLength(fs, magicFile, 0); + return magicFile; + } + + /** + * Assert the upload count under a dir is the expected value. + * Failure message will include the list of entries. + * @param dir dir + * @param expected expected count + * @throws IOException listing problem + */ + private void assertUploadCount(final Path dir, final int expected) throws IOException { + Assertions.assertThat(toList(listUploads(dir))) + .describedAs("uploads under %s", dir) + .hasSize(expected); + } + + /** + * List uploads; use the same APIs that the directory operations use, + * so implicitly validating them. 
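+   * The listing runs inside an audit span, matching the way the
+   * directory operations invoke it.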
+ * @param dir directory to list + * @return full list of entries + * @throws IOException listing problem + */ + private RemoteIterator listUploads(Path dir) throws IOException { + final S3AFileSystem fs = getFileSystem(); + try (AuditSpan ignored = span()) { + final StoreContext sc = fs.createStoreContext(); + return fs.listUploadsUnderPrefix(sc, sc.pathToKey(dir)); + } + } +} diff --git a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/performance/AbstractS3ACostTest.java b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/performance/AbstractS3ACostTest.java index 48378ce75dc9c..e37717bfa1e34 100644 --- a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/performance/AbstractS3ACostTest.java +++ b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/performance/AbstractS3ACostTest.java @@ -91,6 +91,13 @@ protected AbstractS3ACostTest( this.keepMarkers = keepMarkers; } + /** + * Constructor with markers kept. + */ + public AbstractS3ACostTest() { + this(true); + } + @Override public Configuration createConfiguration() { Configuration conf = super.createConfiguration(); diff --git a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/s3guard/ITestS3GuardTool.java b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/s3guard/ITestS3GuardTool.java index 844230e8bea9f..28bc2a246af1a 100644 --- a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/s3guard/ITestS3GuardTool.java +++ b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/s3guard/ITestS3GuardTool.java @@ -97,22 +97,22 @@ public void testStoreInfo() throws Throwable { LOG.info("Exec output=\n{}", output); } - private final static String UPLOAD_PREFIX = "test-upload-prefix"; private final static String UPLOAD_NAME = "test-upload"; @Test public void testUploads() throws Throwable { S3AFileSystem fs = getFileSystem(); - Path path = path(UPLOAD_PREFIX + "/" + UPLOAD_NAME); + Path path = methodPath(); + Path file = new Path(path, UPLOAD_NAME); describe("Cleaning up any leftover uploads from previous runs."); - final String key = fs.pathToKey(path); + final String key = fs.pathToKey(file); try { // 1. Make sure key doesn't already exist clearAnyUploads(fs, path); // 2. Confirm no uploads are listed via API - assertNoUploadsAt(fs, path.getParent()); + assertNoUploadsAt(fs, path); // 3. Confirm no uploads are listed via CLI describe("Confirming CLI lists nothing."); @@ -127,8 +127,6 @@ public void testUploads() throws Throwable { // 6. Confirm part exists via CLI, direct path and parent path describe("Confirming CLI lists one part"); assertNumUploads(path, 1); - assertNumUploads(path.getParent(), 1); - // 7. Use CLI to delete part, assert it worked describe("Deleting part via CLI"); assertNumDeleted(fs, path, 1); @@ -150,22 +148,23 @@ public void testUploads() throws Throwable { @Test public void testUploadListByAge() throws Throwable { S3AFileSystem fs = getFileSystem(); - Path path = path(UPLOAD_PREFIX + "/" + UPLOAD_NAME); + Path path = methodPath(); + Path file = new Path(path, UPLOAD_NAME); describe("Cleaning up any leftover uploads from previous runs."); + // 1. Make sure key doesn't already exist clearAnyUploads(fs, path); // 2. Create a upload part describe("Uploading single part."); - final String key = fs.pathToKey(path); + final String key = fs.pathToKey(file); createPartUpload(fs, key, 128, 1); //try (AuditSpan span = fs.startOperation("multipart", key, null)) { try { - // 3. Confirm it exists via API.. 
may want to wrap with - // LambdaTestUtils.eventually() ? + // 3. Confirm it exists via API assertEquals("Should be one upload", 1, countUploadsAt(fs, path)); // 4. Confirm part does appear in listing with long age filter From d18410221bd9d5357eba1af8377e82ac27a38c97 Mon Sep 17 00:00:00 2001 From: slfan1989 <55643692+slfan1989@users.noreply.github.com> Date: Thu, 26 Oct 2023 08:22:18 +0800 Subject: [PATCH 18/23] YARN-11593. [Federation] Improve command line help information. (#6199) Contributed by Shilun Fan. Reviewed-by: Inigo Goiri Signed-off-by: Shilun Fan --- .../hadoop/yarn/client/cli/RouterCLI.java | 294 +++++++++++++----- .../hadoop/yarn/client/cli/TestRouterCLI.java | 26 ++ .../src/site/markdown/Federation.md | 158 +++++++++- 3 files changed, 405 insertions(+), 73 deletions(-) diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-client/src/main/java/org/apache/hadoop/yarn/client/cli/RouterCLI.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-client/src/main/java/org/apache/hadoop/yarn/client/cli/RouterCLI.java index 0aa02c8124a3a..2da584f9d61bb 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-client/src/main/java/org/apache/hadoop/yarn/client/cli/RouterCLI.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-client/src/main/java/org/apache/hadoop/yarn/client/cli/RouterCLI.java @@ -23,7 +23,10 @@ import org.apache.commons.cli.Option; import org.apache.commons.cli.Options; import org.apache.commons.cli.ParseException; +import org.apache.commons.collections.CollectionUtils; +import org.apache.commons.collections.MapUtils; import org.apache.commons.lang3.StringUtils; +import org.apache.hadoop.classification.VisibleForTesting; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.conf.Configured; import org.apache.hadoop.ha.HAAdmin.UsageInfo; @@ -67,6 +70,7 @@ import java.util.ArrayList; import java.util.List; import java.util.Map; +import java.util.LinkedHashMap; import java.util.stream.Collectors; import static org.apache.hadoop.yarn.server.api.protocolrecords.FederationQueueWeight.checkHeadRoomAlphaValid; @@ -74,29 +78,8 @@ public class RouterCLI extends Configured implements Tool { - private static final Logger LOG = LoggerFactory.getLogger(RouterCLI.class); - protected final static Map ADMIN_USAGE = - ImmutableMap.builder() - // Command1: deregisterSubCluster - .put("-deregisterSubCluster", new UsageInfo( - "[-sc|--subClusterId [subCluster Id]]", - "Deregister SubCluster, If the interval between the heartbeat time of the subCluster " + - "and the current time exceeds the timeout period, " + - "set the state of the subCluster to SC_LOST.")) - // Command2: policy - .put("-policy", new UsageInfo( - "[-s|--save [queue;router weight;amrm weight;headroomalpha]] " + - "[-bs|--batch-save [--format xml] [-f|--input-file fileName]]" + - "[-l|--list [--pageSize][--currentPage][--queue][--queues]]", - "We provide a set of commands for Policy:" + - " Include list policies, save policies, batch save policies. " + - " (Note: The policy type will be directly read from the" + - " yarn.federation.policy-manager in the local yarn-site.xml.)" + - " eg. 
(routeradmin -policy [-s|--save] root.a;SC-1:0.7,SC-2:0.3;SC-1:0.7,SC-2:0.3;1.0)")) - .build(); - // Common Constant private static final String SEMICOLON = ";"; @@ -104,6 +87,7 @@ public class RouterCLI extends Configured implements Tool { private static final String CMD_EMPTY = ""; private static final int EXIT_SUCCESS = 0; private static final int EXIT_ERROR = -1; + private static final String CMD_HELP = "-help"; // Command1: deregisterSubCluster private static final String DEREGISTER_SUBCLUSTER_TITLE = @@ -115,17 +99,46 @@ public class RouterCLI extends Configured implements Tool { private static final String OPTION_SC = "sc"; private static final String OPTION_SUBCLUSTERID = "subClusterId"; private static final String CMD_DEREGISTERSUBCLUSTER = "-deregisterSubCluster"; - private static final String CMD_HELP = "-help"; + + // DeregisterSubCluster Command Parameters + protected final static UsageInfo SUBCLUSTER_ID = new UsageInfo("<-sc|--subClusterId>", + "'-sc' option allows you to specify the sub-cluster to operate on, " + + "while the '--subClusterId' option is the long format of -sc and serves the same purpose."); + + // DeregisterSubCluster Command Examples + protected final static String DEREGISTER_SUBCLUSTER_EXAMPLE_1 = + "yarn routeradmin -deregisterSubCluster -sc SC-1"; + protected final static String DEREGISTER_SUBCLUSTER_EXAMPLE_2 = + "yarn routeradmin -deregisterSubCluster --subClusterId SC-1"; + + // DeregisterSubCluster Command Help Information + protected final static String DEREGISTER_SUBCLUSTER_HELP_INFO = + "deregister subCluster, If the interval between the heartbeat time of the subCluster and" + + "the current time exceeds the timeout period, set the state of the subCluster to SC_LOST."; + + protected final static RouterCmdUsageInfos DEREGISTER_SUBCLUSTER_USAGEINFOS = + new RouterCmdUsageInfos() + .addUsageInfo(SUBCLUSTER_ID) + .addHelpInfo(DEREGISTER_SUBCLUSTER_HELP_INFO) + .addExampleDescs(CMD_DEREGISTERSUBCLUSTER, "If we want to deregisterSubCluster SC-1") + .addExample(CMD_DEREGISTERSUBCLUSTER, DEREGISTER_SUBCLUSTER_EXAMPLE_1) + .addExample(CMD_DEREGISTERSUBCLUSTER, DEREGISTER_SUBCLUSTER_EXAMPLE_2); // Command2: policy + + private static final String CMD_POLICY = "-policy"; + // save policy private static final String OPTION_S = "s"; - private static final String OPTION_BATCH_S = "bs"; private static final String OPTION_SAVE = "save"; + // batch save policy + private static final String OPTION_BATCH_S = "bs"; private static final String OPTION_BATCH_SAVE = "batch-save"; private static final String OPTION_FORMAT = "format"; + private static final String FORMAT_XML = "xml"; private static final String OPTION_FILE = "f"; private static final String OPTION_INPUT_FILE = "input-file"; + // list policy private static final String OPTION_L = "l"; private static final String OPTION_LIST = "list"; private static final String OPTION_PAGE_SIZE = "pageSize"; @@ -133,9 +146,6 @@ public class RouterCLI extends Configured implements Tool { private static final String OPTION_QUEUE = "queue"; private static final String OPTION_QUEUES = "queues"; - private static final String CMD_POLICY = "-policy"; - private static final String FORMAT_XML = "xml"; - private static final String FORMAT_JSON = "json"; private static final String XML_TAG_SUBCLUSTERIDINFO = "subClusterIdInfo"; private static final String XML_TAG_AMRMPOLICYWEIGHTS = "amrmPolicyWeights"; private static final String XML_TAG_ROUTERPOLICYWEIGHTS = "routerPolicyWeights"; @@ -146,10 +156,85 @@ public class RouterCLI extends 
Configured implements Tool { private static final String LIST_POLICIES_TITLE = "Yarn Federation Queue Policies"; + // Columns information private static final List LIST_POLICIES_HEADER = Arrays.asList( "Queue Name", "AMRM Weight", "Router Weight"); + // Policy Commands + protected final static UsageInfo POLICY_SAVE_USAGE = new UsageInfo( + "-s|--save ()", + "This command is used to save the policy information of the queue, " + + "including queue and weight information."); + + protected final static String POLICY_SAVE_USAGE_EXAMPLE_DESC = + "We have two sub-clusters, SC-1 and SC-2. \\" + + "We want to configure a weight policy for the 'root.a' queue. \\" + + "The Router Weight is set to SC-1 with a weight of 0.7 and SC-2 with a weight of 0.3. \\" + + "The AMRM Weight is set SC-1 to 0.6 and SC-2 to 0.4. \\" + + "We are using the default value of 0.1 for headroomalpha."; + + protected final static String POLICY_SAVE_USAGE_EXAMPLE_1 = + "yarn routeradmin -policy -s root.a;SC-1:0.7,SC-2:0.3;SC-1:0.6,SC-2:0.4;1.0"; + protected final static String POLICY_SAVE_USAGE_EXAMPLE_2 = + "yarn routeradmin -policy --save root.a;SC-1:0.7,SC-2:0.3;SC-1:0.6,SC-2:0.4;1.0"; + + protected final static UsageInfo POLICY_BATCH_SAVE_USAGE = new UsageInfo( + "-bs|--batch-save (--format ) (-f|--input-file )", + "This command can batch load weight information for queues " + + "based on the provided `federation-weights.xml` file."); + + protected final static String POLICY_BATCH_SAVE_USAGE_EXAMPLE_DESC = + "We have two sub-clusters, SC-1 and SC-2. \\" + + "We would like to configure weights for 'root.a' and 'root.b' queues. \\" + + "We can set the weights for 'root.a' and 'root.b' in the 'federation-weights.xml' file. \\" + + "and then use the batch-save command to save the configurations in bulk."; + + protected final static String POLICY_BATCH_SAVE_USAGE_EXAMPLE_1 = + "yarn routeradmin -policy -bs --format xml -f federation-weights.xml"; + protected final static String POLICY_BATCH_SAVE_USAGE_EXAMPLE_2 = + "yarn routeradmin -policy --batch-save --format xml -f federation-weights.xml"; + + protected final static UsageInfo POLICY_LIST_USAGE = new UsageInfo( + "-l|--list [--pageSize][--currentPage][--queue][--queues]", + "This command is used to display the configured queue weight information."); + + protected final static String POLICY_LIST_USAGE_EXAMPLE_DESC = + "We can display the list of already configured queue weight information. \\" + + "We can use the --queue option to query the weight information for a specific queue \\" + + " or use the --queues option to query the weight information for multiple queues. 
\\"; + + protected final static String POLICY_LIST_USAGE_EXAMPLE_1 = + "yarn routeradmin -policy -l --pageSize 20 --currentPage 1 --queue root.a"; + + protected final static String POLICY_LIST_USAGE_EXAMPLE_2 = + "yarn routeradmin -policy -list --pageSize 20 --currentPage 1 --queues root.a,root.b"; + + protected final static RouterCmdUsageInfos POLICY_USAGEINFOS = new RouterCmdUsageInfos() + // Policy Save + .addUsageInfo(POLICY_SAVE_USAGE) + .addExampleDescs(POLICY_SAVE_USAGE.args, POLICY_SAVE_USAGE_EXAMPLE_DESC) + .addExample(POLICY_SAVE_USAGE.args, POLICY_SAVE_USAGE_EXAMPLE_1) + .addExample(POLICY_SAVE_USAGE.args, POLICY_SAVE_USAGE_EXAMPLE_2) + // Policy Batch Save + .addUsageInfo(POLICY_BATCH_SAVE_USAGE) + .addExampleDescs(POLICY_BATCH_SAVE_USAGE.args, POLICY_BATCH_SAVE_USAGE_EXAMPLE_DESC) + .addExample(POLICY_BATCH_SAVE_USAGE.args, POLICY_BATCH_SAVE_USAGE_EXAMPLE_1) + .addExample(POLICY_BATCH_SAVE_USAGE.args, POLICY_BATCH_SAVE_USAGE_EXAMPLE_2) + // Policy List Save + .addUsageInfo(POLICY_LIST_USAGE) + .addExampleDescs(POLICY_LIST_USAGE.args, POLICY_LIST_USAGE_EXAMPLE_DESC) + .addExample(POLICY_LIST_USAGE.args, POLICY_LIST_USAGE_EXAMPLE_1) + .addExample(POLICY_LIST_USAGE.args, POLICY_LIST_USAGE_EXAMPLE_2); + + protected final static Map ADMIN_USAGE = + ImmutableMap.builder() + // Command1: deregisterSubCluster + .put(CMD_DEREGISTERSUBCLUSTER, DEREGISTER_SUBCLUSTER_USAGEINFOS) + // Command2: policy + .put(CMD_POLICY, POLICY_USAGEINFOS) + .build(); + public RouterCLI() { super(); } @@ -159,43 +244,66 @@ public RouterCLI(Configuration conf) { } private static void buildHelpMsg(String cmd, StringBuilder builder) { - UsageInfo usageInfo = ADMIN_USAGE.get(cmd); - if (usageInfo == null) { + RouterCmdUsageInfos routerUsageInfo = ADMIN_USAGE.get(cmd); + + if (routerUsageInfo == null) { return; } + builder.append("[").append(cmd).append("]\n"); - if (usageInfo.args != null) { - String space = (usageInfo.args == "") ? "" : " "; - builder.append(" ") - .append(cmd) - .append(space) - .append(usageInfo.args) - .append(": ") - .append(usageInfo.help); - } else { - builder.append(" ") - .append(cmd) - .append(": ") - .append(usageInfo.help); + if (!routerUsageInfo.helpInfos.isEmpty()) { + builder.append("\t Description: \n"); + for (String helpInfo : routerUsageInfo.helpInfos) { + builder.append("\t\t").append(helpInfo).append("\n\n"); + } } - } - private static void buildIndividualUsageMsg(String cmd, StringBuilder builder) { - UsageInfo usageInfo = ADMIN_USAGE.get(cmd); - if (usageInfo == null) { - return; + if (!routerUsageInfo.usageInfos.isEmpty()) { + builder.append("\t UsageInfos: \n"); + for (UsageInfo usageInfo : routerUsageInfo.usageInfos) { + builder.append("\t\t").append(usageInfo.args) + .append(": ") + .append("\n\t\t") + .append(usageInfo.help).append("\n\n"); + } } - if (usageInfo.args == null) { - builder.append("Usage: routeradmin [") - .append(cmd) - .append("]\n"); - } else { - String space = (usageInfo.args == "") ? "" : " "; - builder.append("Usage: routeradmin [") - .append(cmd) - .append(space) - .append(usageInfo.args) - .append("]\n"); + + if (MapUtils.isNotEmpty(routerUsageInfo.examples)) { + builder.append("\t Examples: \n"); + int count = 1; + for (Map.Entry> example : routerUsageInfo.examples.entrySet()) { + + String keyCmd = example.getKey(); + builder.append("\t\t") + .append("Cmd:").append(count) + .append(". 
").append(keyCmd) + .append(": \n\n"); + + // Print Command Description + List exampleDescs = routerUsageInfo.exampleDescs.get(keyCmd); + if (CollectionUtils.isNotEmpty(exampleDescs)) { + builder.append("\t\t").append("Cmd Requirement Description:\n"); + for (String value : exampleDescs) { + String[] valueDescs = StringUtils.split(value, "\\"); + for (String valueDesc : valueDescs) { + builder.append("\t\t").append(valueDesc).append("\n"); + } + } + } + + builder.append("\n"); + + // Print Command example + List valueExamples = example.getValue(); + if (CollectionUtils.isNotEmpty(valueExamples)) { + builder.append("\t\t").append("Cmd Examples:\n"); + for (String valueExample : valueExamples) { + builder.append("\t\t").append(valueExample).append("\n"); + } + } + builder.append("\n"); + count++; + } } } @@ -204,12 +312,7 @@ private static void printHelp() { summary.append("routeradmin is the command to execute ") .append("YARN Federation administrative commands.\n") .append("The full syntax is: \n\n") - .append("routeradmin\n") - .append(" [-deregisterSubCluster [-sc|--subClusterId [subCluster Id]]\n") - .append(" [-policy [-s|--save [queue;router weight;amrm weight;headroomalpha] " + - "[-bs|--batch-save [--format xml,json] [-f|--input-file fileName]]] " + - "[-l|--list [--pageSize][--currentPage][--queue][--queues]]\n") - .append(" [-help [cmd]]").append("\n"); + .append("routeradmin\n"); StringBuilder helpBuilder = new StringBuilder(); System.out.println(summary); @@ -235,13 +338,9 @@ protected ResourceManagerAdministrationProtocol createAdminProtocol() private static void buildUsageMsg(StringBuilder builder) { builder.append("routeradmin is only used in Yarn Federation Mode.\n"); builder.append("Usage: routeradmin\n"); - for (Map.Entry cmdEntry : ADMIN_USAGE.entrySet()) { - UsageInfo usageInfo = cmdEntry.getValue(); - builder.append(" ") - .append(cmdEntry.getKey()) - .append(" ") - .append(usageInfo.args) - .append("\n"); + for (String cmdKey : ADMIN_USAGE.keySet()) { + buildHelpMsg(cmdKey, builder); + builder.append("\n"); } builder.append(" -help [cmd]\n"); } @@ -249,7 +348,7 @@ private static void buildUsageMsg(StringBuilder builder) { private static void printUsage(String cmd) { StringBuilder usageBuilder = new StringBuilder(); if (ADMIN_USAGE.containsKey(cmd)) { - buildIndividualUsageMsg(cmd, usageBuilder); + buildHelpMsg(cmd, usageBuilder); } else { buildUsageMsg(usageBuilder); } @@ -353,7 +452,7 @@ private int handlePolicy(String[] args) saveOpt.setOptionalArg(true); Option batchSaveOpt = new Option(OPTION_BATCH_S, OPTION_BATCH_SAVE, false, "We will save queue policies in bulk, " + - "where users can provide XML or JSON files containing the policies. " + + "where users can provide XML files containing the policies. 
" + "This command will parse the file contents and store the results " + "in the FederationStateStore."); Option formatOpt = new Option(null, "format", true, @@ -748,8 +847,59 @@ public int run(String[] args) throws Exception { return EXIT_SUCCESS; } + public static UsageInfo getPolicyBatchSaveUsage() { + return POLICY_BATCH_SAVE_USAGE; + } + + static class RouterCmdUsageInfos { + private List usageInfos; + private List helpInfos; + private Map> examples; + protected Map> exampleDescs; + + RouterCmdUsageInfos() { + this.usageInfos = new ArrayList<>(); + this.helpInfos = new ArrayList<>(); + this.examples = new LinkedHashMap<>(); + this.exampleDescs = new LinkedHashMap<>(); + } + + public RouterCmdUsageInfos addUsageInfo(UsageInfo usageInfo) { + this.usageInfos.add(usageInfo); + return this; + } + + public RouterCmdUsageInfos addHelpInfo(String helpInfo) { + this.helpInfos.add(helpInfo); + return this; + } + + private RouterCmdUsageInfos addExample(String cmd, String example) { + List exampleList = this.examples.getOrDefault(cmd, new ArrayList<>()); + exampleList.add(example); + this.examples.put(cmd, exampleList); + return this; + } + + private RouterCmdUsageInfos addExampleDescs(String cmd, String exampleDesc) { + List exampleDescList = this.exampleDescs.getOrDefault(cmd, new ArrayList<>()); + exampleDescList.add(exampleDesc); + this.exampleDescs.put(cmd, exampleDescList); + return this; + } + + public Map> getExamples() { + return examples; + } + } + public static void main(String[] args) throws Exception { int result = ToolRunner.run(new RouterCLI(), args); System.exit(result); } + + @VisibleForTesting + public Map getAdminUsage(){ + return ADMIN_USAGE; + } } diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-client/src/test/java/org/apache/hadoop/yarn/client/cli/TestRouterCLI.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-client/src/test/java/org/apache/hadoop/yarn/client/cli/TestRouterCLI.java index 6ed83826dfa58..a86878dac3f81 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-client/src/test/java/org/apache/hadoop/yarn/client/cli/TestRouterCLI.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-client/src/test/java/org/apache/hadoop/yarn/client/cli/TestRouterCLI.java @@ -40,6 +40,7 @@ import java.util.ArrayList; import java.util.Date; import java.util.List; +import java.util.Map; import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertNotNull; @@ -266,4 +267,29 @@ public void testListPolicies() throws Exception { String[] args = {"-policy", "-l", "--queue", "root.a"}; assertEquals(0, rmAdminCLI.run(args)); } + + @Test + public void testBuildHelpMsg() throws Exception { + Map adminUsage = rmAdminCLI.getAdminUsage(); + assertEquals(2, adminUsage.size()); + + RouterCLI.RouterCmdUsageInfos deregisterSubClusterUsageInfos = + adminUsage.get("-deregisterSubCluster"); + assertNotNull(deregisterSubClusterUsageInfos); + Map> dsExamplesMap = deregisterSubClusterUsageInfos.getExamples(); + assertNotNull(dsExamplesMap); + assertEquals(1, dsExamplesMap.size()); + List dsExamples = dsExamplesMap.get("-deregisterSubCluster"); + assertNotNull(dsExamples); + assertEquals(2, dsExamples.size()); + + RouterCLI.RouterCmdUsageInfos policyUsageInfos = adminUsage.get("-policy"); + assertNotNull(policyUsageInfos); + Map> policyExamplesMap = policyUsageInfos.getExamples(); + assertNotNull(policyExamplesMap); + assertEquals(3, policyExamplesMap.size()); + policyExamplesMap.forEach((cmd, cmdExamples) -> { + assertEquals(2, cmdExamples.size()); + }); + } } diff --git 
a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-site/src/site/markdown/Federation.md b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-site/src/site/markdown/Federation.md index 66c79c94cc9f9..5d5dc786e13b4 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-site/src/site/markdown/Federation.md +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-site/src/site/markdown/Federation.md @@ -465,9 +465,165 @@ If we want to use JCache, we can configure `yarn.federation.cache.class` to `org This is a Cache implemented based on the Guava framework. If we want to use it, we can configure `yarn.federation.cache.class` to `org.apache.hadoop.yarn.server.federation.cache.FederationGuavaCache`. +Router command line: + +- deregisterSubCluster + +This command is used to `deregister subCluster`, If the interval between the heartbeat time of the subCluster, and the current time exceeds the timeout period, set the state of the subCluster to `SC_LOST`. + +Uasge: + +`yarn routeradmin -deregisterSubCluster [-sc|--subClusterId ]` + +Options: + +| Property | Description | +|:--------------------------------------|:-----------------------------------------------------------------------------------------------------------------------------------------------------------------| +| `-sc, --subClusterId [subCluster Id]` | `'-sc' option allows you to specify the sub-cluster to operate on, while the '--subClusterId' option is the long format of -sc and serves the same purpose.` | + +Examples: + +If we want to deregisterSubCluster `SC-1` + +- yarn routeradmin -deregisterSubCluster -sc SC-1 +- yarn routeradmin -deregisterSubCluster --subClusterId SC-1 + +- policy + +We provide a set of commands for Policy Include list policies, save policies, batch save policies. + +Uasge: + +`yarn routeradmin -policy -s|--save (queue;router weight;amrm weight;headroomalpha)` + +`yarn routeradmin -policy -bs|--batch-save (--format xml) (-f|--input-file fileName)` + +`yarn routeradmin -policy -l|--list ([--pageSize][--currentPage][--queue][--queues])` + +- -s|--save () + +This command is used to save the policy information of the queue, including queue and weight information. + +How to configure `queue;router weight;amrm weight;headroomalpha` + +the sum of weights for all sub-clusters in routerWeight/amrmWeight should be 1. + +| Property | Description | +|:----------------|:--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| `queue` | `Scheduled queue` | +| `router weight` | `Weight for routing applications to different subclusters.` | +| `amrm weight` | `Weight for resource request from ApplicationMaster (AM) to different subclusters' Resource Manager (RM).` | +| `headroomalpha` | `Used by policies that balance weight-based and load-based considerations in their decisions. It is recommended to use 1.0 because the load-base function is not yet complete.` | + +Example: + +We have two sub-clusters, `SC-1` and `SC-2`. We want to configure a weight policy for the `root.a` queue. The Router Weight is set to `SC-1` with a weight of `0.7` and `SC-2` with a weight of `0.3`. +The AMRM Weight is set `SC-1` to `0.6` and `SC-2` to `0.4`. We are using the default value of `0.1` for `headroomalpha`. 
+ +yarn routeradmin -policy --save root.a;SC-1:0.7,SC-2:0.3;SC-1:0.6,SC-2:0.4;1.0 + +yarn routeradmin -policy -s root.a;SC-1:0.7,SC-2:0.3;SC-1:0.6,SC-2:0.4;1.0 + +- -bs|--batch-save (--format xml) (-f|--input-file fileName) + +This command can batch load weight information for queues based on the provided `federation-weights.xml` file. + +| Property | Description | +|:--------------------------|:----------------------------------------------------------------------------------------------| +| `--format [xml]` | `Configuration file format, we currently only support xml format` | +| `-f, --input-file [path]` | `The path to the configuration file. Please use the absolute path of the configuration file.` | + +How to configure `federation-weights.xml` + ```xml + + + + root.a + + + SC-1 + 0.7 + + + SC-2 + 0.3 + + + + + SC-1 + 0.6 + + + SC-2 + 0.4 + + + 1.0 + + + + + root.b + + + SC-1 + 0.8 + + + SC-2 + 0.2 + + + + + SC-1 + 0.6 + + + SC-2 + 0.4 + + + 1.0 + + + + ``` + +Example: + +We have two sub-clusters, `SC-1` and `SC-2`. We would like to configure weights for `root.a` and `root.b` queues. We can set the weights for `root.a` and `root.b` in the `federation-weights.xml` file. +and then use the batch-save command to save the configurations in bulk. + +The file name can be any file name, but it is recommended to use `federation-weights.xml` + +yarn routeradmin -policy -bs --format xml -f /path/federation-weights.xml + +yarn routeradmin -policy --batch-save --format xml -f /path/federation-weights.xml + +- -l|--list (--pageSize --currentPage --queue --queues) + +This command is used to display the configured queue weight information. + +| Property | Description | +|:----------------|:-------------------------------------------------------------| +| `--pageSize` | `The number of policies displayed per page.` | +| `--currentPage` | `This parameter represents the page number to be displayed.` | +| `--queue` | `the queue we need to filter. example: root.a` | +| `--queues` | `list of queues to filter. example: root.a,root.b,root.c` | + +Example: + +We can display the list of already configured queue weight information. We can use the `--queue` option to query the weight information for a specific queue or use the `--queues` option to query the weight information for multiple queues. + +yarn routeradmin -policy -l --pageSize 20 --currentPage 1 --queue root.a + +yarn routeradmin -policy -list --pageSize 20 --currentPage 1 --queues root.a,root.b + ### ON GPG: -GlobalPolicyGenerator, abbreviated as "GPG," is used for the automatic generation of global policies for subClusters. +GlobalPolicyGenerator, abbreviated as "GPG", is used for the automatic generation of global policies for subClusters. These are extra configurations that should appear in the **conf/yarn-site.xml** for GPG. We allow only one GPG. From 821ed83873572d408d8d923f579cc56371a33001 Mon Sep 17 00:00:00 2001 From: Wei-Chiu Chuang Date: Thu, 26 Oct 2023 10:35:10 -0700 Subject: [PATCH 19/23] HDFS-15273. CacheReplicationMonitor hold lock for long time and lead to NN out of service. Contributed by Xiaoqiao He. 
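
The change follows a simple pattern: remember when the global write lock was
taken and, once a configurable maximum hold time is exceeded, release it, sleep
briefly so blocked operations can proceed, then reacquire it before resuming the
scan. Below is a minimal standalone sketch of that pattern only; it uses a plain
ReentrantReadWriteLock and hard-coded values matching the new defaults in place
of the NameNode lock and the CacheManager settings, so it is illustrative rather
than the actual implementation.

```java
import java.util.concurrent.TimeUnit;
import java.util.concurrent.locks.ReentrantReadWriteLock;

public class LockYieldSketch {
  private final ReentrantReadWriteLock lock = new ReentrantReadWriteLock();
  // assumed values, mirroring the dfs.namenode.crm.maxlocktime.ms / sleeptime.ms defaults
  private final long maxLockTimeMs = 1000;
  private final long sleepTimeMs = 300;

  /**
   * Yield the write lock if it has been held longer than maxLockTimeMs.
   * The caller must already hold the write lock.
   * @return the timestamp to track the current hold against.
   */
  long maybeYield(long lastAcquiredMs) throws InterruptedException {
    long now = System.currentTimeMillis();
    if (now - lastAcquiredMs <= maxLockTimeMs) {
      return lastAcquiredMs;                 // still within budget, keep the lock
    }
    lock.writeLock().unlock();               // let blocked operations proceed
    try {
      TimeUnit.MILLISECONDS.sleep(sleepTimeMs);
    } finally {
      lock.writeLock().lock();               // always reacquire before resuming
    }
    return System.currentTimeMillis();       // fresh baseline for the next check
  }
}
```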
--- .../org/apache/hadoop/hdfs/DFSConfigKeys.java | 12 +++++++ .../CacheReplicationMonitor.java | 31 +++++++++++++++++++ .../hdfs/server/namenode/CacheManager.java | 28 +++++++++++++++++ .../src/main/resources/hdfs-default.xml | 27 ++++++++++++++++ 4 files changed, 98 insertions(+) diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/DFSConfigKeys.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/DFSConfigKeys.java index dd2731813bd77..88a18d9cf0763 100755 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/DFSConfigKeys.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/DFSConfigKeys.java @@ -192,6 +192,18 @@ public class DFSConfigKeys extends CommonConfigurationKeys { "dfs.namenode.path.based.cache.block.map.allocation.percent"; public static final float DFS_NAMENODE_PATH_BASED_CACHE_BLOCK_MAP_ALLOCATION_PERCENT_DEFAULT = 0.25f; + public static final String DFS_NAMENODE_CRM_CHECKLOCKTIME_ENABLE = + "dfs.namenode.crm.checklocktime.enable"; + public static final boolean DFS_NAMENODE_CRM_CHECKLOCKTIME_DEFAULT = false; + + public static final String DFS_NAMENODE_CRM_MAXLOCKTIME_MS = + "dfs.namenode.crm.maxlocktime.ms"; + public static final long DFS_NAMENODE_CRM_MAXLOCKTIME_MS_DEFAULT = 1000; + + public static final String DFS_NAMENODE_CRM_SLEEP_TIME_MS = + "dfs.namenode.crm.sleeptime.ms"; + public static final long DFS_NAMENODE_CRM_SLEEP_TIME_MS_DEFAULT = 300; + public static final int DFS_NAMENODE_HTTP_PORT_DEFAULT = HdfsClientConfigKeys.DFS_NAMENODE_HTTP_PORT_DEFAULT; public static final String DFS_NAMENODE_HTTP_ADDRESS_KEY = diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/CacheReplicationMonitor.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/CacheReplicationMonitor.java index 1e5f952040d53..f9036c550e852 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/CacheReplicationMonitor.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/CacheReplicationMonitor.java @@ -140,6 +140,11 @@ public class CacheReplicationMonitor extends Thread implements Closeable { */ private long scannedBlocks; + /** + * Avoid to hold global lock for long times. + */ + private long lastScanTimeMs; + public CacheReplicationMonitor(FSNamesystem namesystem, CacheManager cacheManager, long intervalMs, ReentrantLock lock) { this.namesystem = namesystem; @@ -284,6 +289,7 @@ public void close() throws IOException { private void rescan() throws InterruptedException { scannedDirectives = 0; scannedBlocks = 0; + lastScanTimeMs = Time.monotonicNow(); try { namesystem.writeLock(); try { @@ -315,6 +321,19 @@ private void resetStatistics() { } } + private void reacquireLock(long last) { + long now = Time.monotonicNow(); + if (now - last > cacheManager.getMaxLockTimeMs()) { + try { + namesystem.writeUnlock(); + Thread.sleep(cacheManager.getSleepTimeMs()); + } catch (InterruptedException e) { + } finally { + namesystem.writeLock(); + } + } + } + /** * Scan all CacheDirectives. Use the information to figure out * what cache replication factor each block should have. 
@@ -447,6 +466,10 @@ private void rescanFile(CacheDirective directive, INodeFile file) { if (cachedTotal == neededTotal) { directive.addFilesCached(1); } + if (cacheManager.isCheckLockTimeEnable()) { + reacquireLock(lastScanTimeMs); + lastScanTimeMs = Time.monotonicNow(); + } LOG.debug("Directive {}: caching {}: {}/{} bytes", directive.getId(), file.getFullPathName(), cachedTotal, neededTotal); } @@ -518,6 +541,10 @@ private void rescanCachedBlockMap() { } } } + if (cacheManager.isCheckLockTimeEnable()) { + reacquireLock(lastScanTimeMs); + lastScanTimeMs = Time.monotonicNow(); + } for (Iterator cbIter = cachedBlocks.iterator(); cbIter.hasNext(); ) { scannedBlocks++; @@ -603,6 +630,10 @@ private void rescanCachedBlockMap() { ); cbIter.remove(); } + if (cacheManager.isCheckLockTimeEnable()) { + reacquireLock(lastScanTimeMs); + lastScanTimeMs = Time.monotonicNow(); + } } } diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/CacheManager.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/CacheManager.java index e71b057595952..24ccf45b91d29 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/CacheManager.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/CacheManager.java @@ -17,6 +17,12 @@ */ package org.apache.hadoop.hdfs.server.namenode; +import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_CRM_CHECKLOCKTIME_DEFAULT; +import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_CRM_CHECKLOCKTIME_ENABLE; +import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_CRM_MAXLOCKTIME_MS; +import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_CRM_MAXLOCKTIME_MS_DEFAULT; +import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_CRM_SLEEP_TIME_MS; +import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_CRM_SLEEP_TIME_MS_DEFAULT; import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_PATH_BASED_CACHE_BLOCK_MAP_ALLOCATION_PERCENT; import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_PATH_BASED_CACHE_BLOCK_MAP_ALLOCATION_PERCENT_DEFAULT; import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_LIST_CACHE_DIRECTIVES_NUM_RESPONSES; @@ -194,6 +200,9 @@ public class CacheManager { * The CacheReplicationMonitor. */ private CacheReplicationMonitor monitor; + private boolean isCheckLockTimeEnable; + private long maxLockTimeMs; + private long sleepTimeMs; public static final class PersistState { public final CacheManagerSection section; @@ -235,12 +244,31 @@ public PersistState(CacheManagerSection section, this.cachedBlocks = enabled ? new LightWeightGSet( LightWeightGSet.computeCapacity(cachedBlocksPercent, "cachedBlocks")) : new LightWeightGSet<>(0); + this.isCheckLockTimeEnable = conf.getBoolean( + DFS_NAMENODE_CRM_CHECKLOCKTIME_ENABLE, + DFS_NAMENODE_CRM_CHECKLOCKTIME_DEFAULT); + this.maxLockTimeMs = conf.getLong(DFS_NAMENODE_CRM_MAXLOCKTIME_MS, + DFS_NAMENODE_CRM_MAXLOCKTIME_MS_DEFAULT); + this.sleepTimeMs = conf.getLong(DFS_NAMENODE_CRM_SLEEP_TIME_MS, + DFS_NAMENODE_CRM_SLEEP_TIME_MS_DEFAULT); } public boolean isEnabled() { return enabled; } + public boolean isCheckLockTimeEnable() { + return isCheckLockTimeEnable; + } + + public long getMaxLockTimeMs() { + return this.maxLockTimeMs; + } + + public long getSleepTimeMs() { + return this.sleepTimeMs; + } + /** * Resets all tracked directives and pools. 
Called during 2NN checkpointing to * reset FSNamesystem state. See {@link FSNamesystem#clear()}. diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/resources/hdfs-default.xml b/hadoop-hdfs-project/hadoop-hdfs/src/main/resources/hdfs-default.xml index e73fc802a0453..52075a24f1e32 100755 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/resources/hdfs-default.xml +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/resources/hdfs-default.xml @@ -2940,6 +2940,33 @@ + + dfs.namenode.crm.checklocktime.enable + false + + Set to true to enable CacheManager to check amount of time to hold the + global rwlock. + + + + + dfs.namenode.crm.maxlocktime.ms + 1000 + + The maximum amount of time that CacheManager should hold the global rwlock. + This configuration enable when set `dfs.namenode.crm.checklocktime.enable`. + + + + + dfs.namenode.crm.sleeptime.ms + 300 + + The amount of time that CacheManager should relase the global rwlock. + This configuration enable when set `dfs.namenode.crm.checklocktime.enable`. + + + dfs.datanode.max.locked.memory 0 From 652908519eed5fe79b696e97cc62f2014387be31 Mon Sep 17 00:00:00 2001 From: slfan1989 <55643692+slfan1989@users.noreply.github.com> Date: Fri, 27 Oct 2023 04:39:06 +0800 Subject: [PATCH 20/23] YARN-11588. [Federation] [Addendum] Fix uncleaned threads in yarn router thread pool executor. (#6222) --- .../hadoop/yarn/conf/YarnConfiguration.java | 16 ++++++++++++++++ .../src/main/resources/yarn-default.xml | 17 +++++++++++++++++ .../clientrm/FederationClientInterceptor.java | 8 +++++++- .../rmadmin/FederationRMAdminInterceptor.java | 14 +++++++++++++- 4 files changed, 53 insertions(+), 2 deletions(-) diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java index 90a8978a228b2..2a204519228a4 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java @@ -4369,6 +4369,22 @@ public static boolean isAclEnabled(Configuration conf) { public static final long DEFAULT_ROUTER_USER_CLIENT_THREAD_POOL_KEEP_ALIVE_TIME = TimeUnit.SECONDS.toMillis(0); // 0s + /** + * This method configures the policy for core threads regarding termination + * when no tasks arrive within the keep-alive time. + * When set to false, core threads are never terminated due to a lack of tasks. + * When set to true, the same keep-alive policy + * that applies to non-core threads also applies to core threads. + * To prevent constant thread replacement, + * ensure that the keep-alive time is greater than zero when setting it to true. + * It's advisable to call this method before the pool becomes actively used. + */ + public static final String ROUTER_USER_CLIENT_THREAD_POOL_ALLOW_CORE_THREAD_TIMEOUT = + ROUTER_PREFIX + "interceptor.user-thread-pool.allow-core-thread-time-out"; + + public static final boolean DEFAULT_ROUTER_USER_CLIENT_THREAD_POOL_ALLOW_CORE_THREAD_TIMEOUT = + false; + /** The address of the Router web application. 
*/ public static final String ROUTER_WEBAPP_ADDRESS = ROUTER_WEBAPP_PREFIX + "address"; diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml index 9991e841d74b6..72e8cc70f8743 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml @@ -5139,6 +5139,23 @@ + + yarn.router.interceptor.user-thread-pool.allow-core-thread-time-out + false + + This method configures the policy for core threads regarding termination + when no tasks arrive within the keep-alive time. + When set to false, core threads are never terminated due to a lack of tasks. + When set to true, the same keep-alive policy + that applies to non-core threads also applies to core threads. + To prevent constant thread replacement, + ensure that the keep-alive time is greater than zero when setting it to true. + It's advisable to call this method before the pool becomes actively used. + We need to ensure that + yarn.router.interceptor.user-thread-pool.keep-alive-time is greater than 0. + + + yarn.router.submit.interval.time 10ms diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-router/src/main/java/org/apache/hadoop/yarn/server/router/clientrm/FederationClientInterceptor.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-router/src/main/java/org/apache/hadoop/yarn/server/router/clientrm/FederationClientInterceptor.java index 9c3f9971d8c77..35b3e6eeb2bd5 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-router/src/main/java/org/apache/hadoop/yarn/server/router/clientrm/FederationClientInterceptor.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-router/src/main/java/org/apache/hadoop/yarn/server/router/clientrm/FederationClientInterceptor.java @@ -231,7 +231,13 @@ public void init(String userName) { keepAliveTime, TimeUnit.MILLISECONDS, workQueue, threadFactory); // Adding this line so that unused user threads will exit and be cleaned up if idle for too long - this.executorService.allowCoreThreadTimeOut(true); + boolean allowCoreThreadTimeOut = getConf().getBoolean( + YarnConfiguration.ROUTER_USER_CLIENT_THREAD_POOL_ALLOW_CORE_THREAD_TIMEOUT, + YarnConfiguration.DEFAULT_ROUTER_USER_CLIENT_THREAD_POOL_ALLOW_CORE_THREAD_TIMEOUT); + + if (keepAliveTime > 0 && allowCoreThreadTimeOut) { + this.executorService.allowCoreThreadTimeOut(allowCoreThreadTimeOut); + } final Configuration conf = this.getConf(); diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-router/src/main/java/org/apache/hadoop/yarn/server/router/rmadmin/FederationRMAdminInterceptor.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-router/src/main/java/org/apache/hadoop/yarn/server/router/rmadmin/FederationRMAdminInterceptor.java index b7c1462a60d56..d269cfe0971cf 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-router/src/main/java/org/apache/hadoop/yarn/server/router/rmadmin/FederationRMAdminInterceptor.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-router/src/main/java/org/apache/hadoop/yarn/server/router/rmadmin/FederationRMAdminInterceptor.java @@ -130,9 +130,21 @@ public void init(String userName) { ThreadFactory threadFactory = new ThreadFactoryBuilder() 
.setNameFormat("RPC Router RMAdminClient-" + userName + "-%d ").build(); + long keepAliveTime = getConf().getTimeDuration( + YarnConfiguration.ROUTER_USER_CLIENT_THREAD_POOL_KEEP_ALIVE_TIME, + YarnConfiguration.DEFAULT_ROUTER_USER_CLIENT_THREAD_POOL_KEEP_ALIVE_TIME, TimeUnit.SECONDS); + BlockingQueue workQueue = new LinkedBlockingQueue<>(); this.executorService = new ThreadPoolExecutor(numThreads, numThreads, - 0L, TimeUnit.MILLISECONDS, workQueue, threadFactory); + keepAliveTime, TimeUnit.MILLISECONDS, workQueue, threadFactory); + + boolean allowCoreThreadTimeOut = getConf().getBoolean( + YarnConfiguration.ROUTER_USER_CLIENT_THREAD_POOL_ALLOW_CORE_THREAD_TIMEOUT, + YarnConfiguration.DEFAULT_ROUTER_USER_CLIENT_THREAD_POOL_ALLOW_CORE_THREAD_TIMEOUT); + + if (keepAliveTime > 0 && allowCoreThreadTimeOut) { + this.executorService.allowCoreThreadTimeOut(allowCoreThreadTimeOut); + } federationFacade = FederationStateStoreFacade.getInstance(this.getConf()); this.conf = this.getConf(); From 93a3c6e2cd4db4b395b3ec00a513dd3aceb3e306 Mon Sep 17 00:00:00 2001 From: Hiroaki Segawa Date: Fri, 27 Oct 2023 14:25:00 +0900 Subject: [PATCH 21/23] HDFS-17024. Potential data race introduced by HDFS-15865 (#6223) --- .../main/java/org/apache/hadoop/hdfs/DataStreamer.java | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/hadoop-hdfs-project/hadoop-hdfs-client/src/main/java/org/apache/hadoop/hdfs/DataStreamer.java b/hadoop-hdfs-project/hadoop-hdfs-client/src/main/java/org/apache/hadoop/hdfs/DataStreamer.java index 4fa578ab6c03f..d92f5943fd8a2 100644 --- a/hadoop-hdfs-project/hadoop-hdfs-client/src/main/java/org/apache/hadoop/hdfs/DataStreamer.java +++ b/hadoop-hdfs-project/hadoop-hdfs-client/src/main/java/org/apache/hadoop/hdfs/DataStreamer.java @@ -476,6 +476,7 @@ boolean doWaitForRestart() { private DataOutputStream blockStream; private DataInputStream blockReplyStream; private ResponseProcessor response = null; + private final Object nodesLock = new Object(); private volatile DatanodeInfo[] nodes = null; // list of targets for current block private volatile StorageType[] storageTypes = null; private volatile String[] storageIDs = null; @@ -619,7 +620,9 @@ private void setPipeline(LocatedBlock lb) { private void setPipeline(DatanodeInfo[] nodes, StorageType[] storageTypes, String[] storageIDs) { - this.nodes = nodes; + synchronized (nodesLock) { + this.nodes = nodes; + } this.storageTypes = storageTypes; this.storageIDs = storageIDs; } @@ -916,7 +919,10 @@ void waitForAckedSeqno(long seqno) throws IOException { try (TraceScope ignored = dfsClient.getTracer(). newScope("waitForAckedSeqno")) { LOG.debug("{} waiting for ack for: {}", this, seqno); - int dnodes = nodes != null ? nodes.length : 3; + int dnodes; + synchronized (nodesLock) { + dnodes = nodes != null ? nodes.length : 3; + } int writeTimeout = dfsClient.getDatanodeWriteTimeout(dnodes); long begin = Time.monotonicNow(); try { From 7ec636deec1751341e91453ab8051ab1fe48f37e Mon Sep 17 00:00:00 2001 From: Steve Loughran Date: Fri, 27 Oct 2023 12:23:55 +0100 Subject: [PATCH 22/23] HADOOP-18930. Make fs.s3a.create.performance a bucket-wide setting. (#6168) If fs.s3a.create.performance is set on a bucket - All file overwrite checks are skipped, even if the caller says otherwise. - All directory existence checks are skipped. - Marker deletion is *always* skipped. This eliminates a HEAD and a LIST for every creation. * New path capability "fs.s3a.create.performance.enabled" true if the option is enabled. 
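
As a sketch of how an application can probe for the bucket-wide setting and
explicitly request the performance path for a single file through the builder
API; the bucket and path here are illustrative, not taken from the patch:

```java
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public final class CreatePerformanceExample {
  public static void main(String[] args) throws Exception {
    Path path = new Path("s3a://example-bucket/output/part-0000");
    FileSystem fs = path.getFileSystem(new Configuration());

    // true when the filesystem instance was created with fs.s3a.create.performance set
    boolean bucketWide = fs.hasPathCapability(path, "fs.s3a.create.performance.enabled");
    System.out.println("create performance enabled on this bucket: " + bucketWide);

    // opt in for this one file: overwrite/directory checks are skipped on the write path
    try (FSDataOutputStream out = fs.createFile(path)
        .overwrite(true)
        .must("fs.s3a.create.performance", true)
        .build()) {
      out.writeUTF("example payload");
    }
  }
}
```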
* Parameterize ITestS3AContractCreate to expect the different outcomes * Parameterize ITestCreateFileCost similarly, with changed cost assertions there. * create(/) raises an IOE. existing bug only noticed here. Contributed by Steve Loughran --- .../filesystem/fsdataoutputstreambuilder.md | 6 + .../org/apache/hadoop/fs/s3a/Constants.java | 18 ++- .../apache/hadoop/fs/s3a/S3AFileSystem.java | 54 ++++++-- .../hadoop/fs/s3a/impl/CreateFileBuilder.java | 7 ++ .../contract/s3a/ITestS3AContractCreate.java | 93 ++++++++++++++ .../fs/s3a/ITestS3AFSMainOperations.java | 20 +++ .../fs/s3a/ITestS3AFileOperationCost.java | 18 ++- .../fs/s3a/ITestS3AFileSystemContract.java | 20 +++ .../apache/hadoop/fs/s3a/S3ATestUtils.java | 13 ++ .../hadoop/fs/s3a/impl/ITestXAttrCost.java | 7 +- .../s3a/performance/ITestCreateFileCost.java | 118 +++++++++++++++--- .../ITestDirectoryMarkerListing.java | 8 +- .../s3a/performance/ITestS3ADeleteCost.java | 11 ++ .../fs/s3a/tools/AbstractMarkerToolTest.java | 5 +- 14 files changed, 356 insertions(+), 42 deletions(-) diff --git a/hadoop-common-project/hadoop-common/src/site/markdown/filesystem/fsdataoutputstreambuilder.md b/hadoop-common-project/hadoop-common/src/site/markdown/filesystem/fsdataoutputstreambuilder.md index ad6d107d06cbc..5f24e75569786 100644 --- a/hadoop-common-project/hadoop-common/src/site/markdown/filesystem/fsdataoutputstreambuilder.md +++ b/hadoop-common-project/hadoop-common/src/site/markdown/filesystem/fsdataoutputstreambuilder.md @@ -224,6 +224,12 @@ be used as evidence at the inquest as proof that they made a conscious decision to choose speed over safety and that the outcome was their own fault. +Note: the option can be set for an entire filesystem. Again, the safety checks +are there to more closely match the semantics of a classic filesystem, +and to reduce the likelihood that the object store ends up in a state which +diverges so much from the classic directory + tree structur that applications +get confused. + Accordingly: *Use if and only if you are confident that the conditions are met.* ### `fs.s3a.create.header` User-supplied header support diff --git a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/Constants.java b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/Constants.java index 8b174e92b2911..f4aeccf1efd10 100644 --- a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/Constants.java +++ b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/Constants.java @@ -1192,12 +1192,26 @@ private Constants() { /** * Flag for create performance. - * This is *not* a configuration option; it is for use in the - * {code createFile()} builder. + * This can be set in the {code createFile()} builder. * Value {@value}. */ public static final String FS_S3A_CREATE_PERFORMANCE = "fs.s3a.create.performance"; + /** + * Default value for create performance in an S3A FS. + * Value {@value}. + */ + public static final boolean FS_S3A_CREATE_PERFORMANCE_DEFAULT = true; + + + /** + * Capability to indicate that the FS has been instantiated with + * {@link #FS_S3A_CREATE_PERFORMANCE} set to true. + * Value {@value}. + */ + public static final String FS_S3A_CREATE_PERFORMANCE_ENABLED = + FS_S3A_CREATE_PERFORMANCE + ".enabled"; + /** * Prefix for adding a header to the object when created. * The actual value must have a "." suffix and then the actual header. 
diff --git a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3AFileSystem.java b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3AFileSystem.java index defbcd94a5b14..f96a378b1cc92 100644 --- a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3AFileSystem.java +++ b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3AFileSystem.java @@ -235,6 +235,7 @@ import static org.apache.hadoop.fs.s3a.impl.CallableSupplier.submit; import static org.apache.hadoop.fs.s3a.impl.CreateFileBuilder.OPTIONS_CREATE_FILE_NO_OVERWRITE; import static org.apache.hadoop.fs.s3a.impl.CreateFileBuilder.OPTIONS_CREATE_FILE_OVERWRITE; +import static org.apache.hadoop.fs.s3a.impl.CreateFileBuilder.OPTIONS_CREATE_FILE_PERFORMANCE; import static org.apache.hadoop.fs.s3a.impl.ErrorTranslation.isObjectNotFound; import static org.apache.hadoop.fs.s3a.impl.ErrorTranslation.isUnknownBucket; import static org.apache.hadoop.fs.s3a.impl.InternalConstants.AP_REQUIRED_EXCEPTION; @@ -348,7 +349,8 @@ public class S3AFileSystem extends FileSystem implements StreamCapabilities, private S3AStatisticsContext statisticsContext; /** Storage Statistics Bonded to the instrumentation. */ private S3AStorageStatistics storageStatistics; - + /** Should all create files be "performance" unless unset. */ + private boolean performanceCreation; /** * Default input policy; may be overridden in * {@code openFile()}. @@ -660,6 +662,11 @@ public void initialize(URI name, Configuration originalConf) // verify there's no S3Guard in the store config. checkNoS3Guard(this.getUri(), getConf()); + // performance creation flag for code which wants performance + // at the risk of overwrites. + performanceCreation = conf.getBoolean(FS_S3A_CREATE_PERFORMANCE, + FS_S3A_CREATE_PERFORMANCE_DEFAULT); + LOG.debug("{} = {}", FS_S3A_CREATE_PERFORMANCE, performanceCreation); allowAuthoritativePaths = S3Guard.getAuthoritativePaths(this); // directory policy, which may look at authoritative paths @@ -1878,14 +1885,22 @@ public FSDataOutputStream create(Path f, FsPermission permission, Progressable progress) throws IOException { final Path path = qualify(f); + // work out the options to pass down + CreateFileBuilder.CreateFileOptions options; + if (performanceCreation) { + options = OPTIONS_CREATE_FILE_PERFORMANCE; + }else { + options = overwrite + ? OPTIONS_CREATE_FILE_OVERWRITE + : OPTIONS_CREATE_FILE_NO_OVERWRITE; + } + // the span will be picked up inside the output stream return trackDurationAndSpan(INVOCATION_CREATE, path, () -> innerCreateFile(path, progress, getActiveAuditSpan(), - overwrite - ? OPTIONS_CREATE_FILE_OVERWRITE - : OPTIONS_CREATE_FILE_NO_OVERWRITE)); + options)); } /** @@ -1912,14 +1927,19 @@ private FSDataOutputStream innerCreateFile( final CreateFileBuilder.CreateFileOptions options) throws IOException { auditSpan.activate(); String key = pathToKey(path); + if (key.isEmpty()) { + // no matter the creation options, root cannot be written to. + throw new PathIOException("/", "Can't create root path"); + } EnumSet flags = options.getFlags(); - boolean overwrite = flags.contains(CreateFlag.OVERWRITE); - boolean performance = options.isPerformance(); - boolean skipProbes = performance || isUnderMagicCommitPath(path); + + boolean skipProbes = options.isPerformance() || isUnderMagicCommitPath(path); if (skipProbes) { LOG.debug("Skipping existence/overwrite checks"); } else { try { + boolean overwrite = flags.contains(CreateFlag.OVERWRITE); + // get the status or throw an FNFE. 
        // when overwriting, there is no need to look for any existing file,
        // just a directory (for safety)
@@ -1951,7 +1971,7 @@ private FSDataOutputStream innerCreateFile(
 
     // put options are derived from the path and the
     // option builder.
-    boolean keep = performance || keepDirectoryMarkers(path);
+    boolean keep = options.isPerformance() || keepDirectoryMarkers(path);
     final PutObjectOptions putOptions =
         new PutObjectOptions(keep, null, options.getHeaders());
 
@@ -2034,11 +2054,14 @@ public FSDataOutputStreamBuilder createFile(final Path path) {
       final AuditSpan span = entryPoint(INVOCATION_CREATE_FILE,
          pathToKey(qualified),
          null);
-      return new CreateFileBuilder(this,
+      final CreateFileBuilder builder = new CreateFileBuilder(this,
           qualified,
-          new CreateFileBuilderCallbacksImpl(INVOCATION_CREATE_FILE, span))
-          .create()
-          .overwrite(true);
+          new CreateFileBuilderCallbacksImpl(INVOCATION_CREATE_FILE, span));
+      builder
+          .create()
+          .overwrite(true)
+          .must(FS_S3A_CREATE_PERFORMANCE, performanceCreation);
+      return builder;
     } catch (IOException e) {
       // catch any IOEs raised in span creation and convert to
      // an UncheckedIOException
@@ -2101,7 +2124,8 @@ public FSDataOutputStream createNonRecursive(Path p,
         .create()
         .withFlags(flags)
         .blockSize(blockSize)
-        .bufferSize(bufferSize);
+        .bufferSize(bufferSize)
+        .must(FS_S3A_CREATE_PERFORMANCE, performanceCreation);
     if (progress != null) {
       builder.progress(progress);
     }
@@ -5371,6 +5395,10 @@ public boolean hasPathCapability(final Path path, final String capability)
     case FS_S3A_CREATE_HEADER:
       return true;
 
+    // is the FS configured for create file performance
+    case FS_S3A_CREATE_PERFORMANCE_ENABLED:
+      return performanceCreation;
+
     default:
       return super.hasPathCapability(p, cap);
     }
diff --git a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/impl/CreateFileBuilder.java b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/impl/CreateFileBuilder.java
index 0392afac59d91..ae2945989ddd3 100644
--- a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/impl/CreateFileBuilder.java
+++ b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/impl/CreateFileBuilder.java
@@ -71,6 +71,12 @@ public class CreateFileBuilder extends
   public static final CreateFileOptions OPTIONS_CREATE_FILE_NO_OVERWRITE =
       new CreateFileOptions(CREATE_NO_OVERWRITE_FLAGS, true, false, null);
 
+  /**
+   * Performance create options.
+   */
+  public static final CreateFileOptions OPTIONS_CREATE_FILE_PERFORMANCE =
+      new CreateFileOptions(CREATE_OVERWRITE_FLAGS, true, true, null);
+
   /**
    * Callback interface.
   */
@@ -129,6 +135,7 @@ public FSDataOutputStream build() throws IOException {
     if (flags.contains(CreateFlag.APPEND)) {
       throw new UnsupportedOperationException("Append is not supported");
     }
+
     if (!flags.contains(CreateFlag.CREATE) &&
         !flags.contains(CreateFlag.OVERWRITE)) {
       throw new PathIOException(path.toString(),
diff --git a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/contract/s3a/ITestS3AContractCreate.java b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/contract/s3a/ITestS3AContractCreate.java
index d2a858f615ef6..7a2a10879dd8e 100644
--- a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/contract/s3a/ITestS3AContractCreate.java
+++ b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/contract/s3a/ITestS3AContractCreate.java
@@ -18,18 +18,111 @@
 package org.apache.hadoop.fs.contract.s3a;
 
+import java.util.Arrays;
+import java.util.Collection;
+
+import org.junit.runner.RunWith;
+import org.junit.runners.Parameterized;
+
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.contract.AbstractContractCreateTest;
 import org.apache.hadoop.fs.contract.AbstractFSContract;
+import org.apache.hadoop.fs.s3a.S3ATestUtils;
+
+import static org.apache.hadoop.fs.s3a.Constants.FS_S3A_CREATE_PERFORMANCE;
+import static org.apache.hadoop.fs.s3a.S3ATestUtils.removeBaseAndBucketOverrides;
 
 /**
  * S3A contract tests creating files.
+ * Parameterized on the create performance flag as all overwrite
+ * tests are required to fail in create performance mode.
  */
+@RunWith(Parameterized.class)
 public class ITestS3AContractCreate extends AbstractContractCreateTest {
 
+  /**
+   * This test suite is parameterized for the different create file
+   * options.
+   * @return a list of test parameters.
+   */
+  @Parameterized.Parameters
+  public static Collection<Object[]> params() {
+    return Arrays.asList(new Object[][]{
+        {false},
+        {true}
+    });
+  }
+
+  /**
+   * Is this test run in create performance mode?
+   */
+  private final boolean createPerformance;
+
+  public ITestS3AContractCreate(final boolean createPerformance) {
+    this.createPerformance = createPerformance;
+  }
+
   @Override
   protected AbstractFSContract createContract(Configuration conf) {
     return new S3AContract(conf);
   }
 
+  @Override
+  protected Configuration createConfiguration() {
+    final Configuration conf = super.createConfiguration();
+    removeBaseAndBucketOverrides(conf,
+        FS_S3A_CREATE_PERFORMANCE);
+    conf.setBoolean(FS_S3A_CREATE_PERFORMANCE, createPerformance);
+    S3ATestUtils.disableFilesystemCaching(conf);
+    return conf;
+  }
+
+  @Override
+  public void testOverwriteNonEmptyDirectory() throws Throwable {
+    try {
+      super.testOverwriteNonEmptyDirectory();
+      failWithCreatePerformance();
+    } catch (AssertionError e) {
+      swallowWithCreatePerformance(e);
+    }
+  }
+
+  @Override
+  public void testOverwriteEmptyDirectory() throws Throwable {
+    try {
+      super.testOverwriteEmptyDirectory();
+      failWithCreatePerformance();
+    } catch (AssertionError e) {
+      swallowWithCreatePerformance(e);
+    }
+  }
+
+  @Override
+  public void testCreateFileOverExistingFileNoOverwrite() throws Throwable {
+    try {
+      super.testCreateFileOverExistingFileNoOverwrite();
+      failWithCreatePerformance();
+    } catch (AssertionError e) {
+      swallowWithCreatePerformance(e);
+    }
+  }
+
+  private void failWithCreatePerformance() {
+    if (createPerformance) {
+      fail("expected an assertion error in create performance mode");
+    }
+  }
+
+  /**
+   * Swallow an assertion error if the create performance flag is set.
+   * @param e assertion error
+   */
+  private void swallowWithCreatePerformance(final AssertionError e) {
+    // this is expected in create performance mode
+    if (!createPerformance) {
+      // but if the create performance flag is not set, the failure
+      // is unexpected, so rethrow the assertion error
+      throw e;
+    }
+  }
 }
diff --git a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/ITestS3AFSMainOperations.java b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/ITestS3AFSMainOperations.java
index 6669e8426af0a..013ec901d0a77 100644
--- a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/ITestS3AFSMainOperations.java
+++ b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/ITestS3AFSMainOperations.java
@@ -18,6 +18,9 @@
 
 package org.apache.hadoop.fs.s3a;
 
+import java.io.IOException;
+
+import org.assertj.core.api.Assertions;
 import org.junit.Ignore;
 
 import org.apache.hadoop.conf.Configuration;
@@ -27,6 +30,7 @@
 import org.apache.hadoop.fs.contract.s3a.S3AContract;
 
 import static org.apache.hadoop.fs.s3a.S3ATestUtils.createTestPath;
+import static org.apache.hadoop.fs.s3a.S3ATestUtils.isCreatePerformanceEnabled;
 
 /**
  * S3A Test suite for the FSMainOperationsBaseTest tests.
@@ -70,4 +74,20 @@ public void testCopyToLocalWithUseRawLocalFileSystemOption()
       throws Exception {
   }
 
+  @Override
+  public void testOverwrite() throws IOException {
+    boolean createPerformance = isCreatePerformanceEnabled(fSys);
+    try {
+      super.testOverwrite();
+      Assertions.assertThat(createPerformance)
+          .describedAs("create performance enabled")
+          .isFalse();
+    } catch (AssertionError e) {
+      // swallow the exception if create performance is enabled,
+      // else rethrow
+      if (!createPerformance) {
+        throw e;
+      }
+    }
+  }
 }
diff --git a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/ITestS3AFileOperationCost.java b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/ITestS3AFileOperationCost.java
index dae6312d48098..0e4a8eda5b297 100644
--- a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/ITestS3AFileOperationCost.java
+++ b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/ITestS3AFileOperationCost.java
@@ -18,6 +18,7 @@
 
 package org.apache.hadoop.fs.s3a;
 
+import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.fs.s3a.impl.StatusProbeEnum;
@@ -39,6 +40,8 @@
 
 import static org.apache.hadoop.fs.contract.ContractTestUtils.*;
+import static org.apache.hadoop.fs.s3a.Constants.FS_S3A_CREATE_PERFORMANCE;
+import static org.apache.hadoop.fs.s3a.S3ATestUtils.removeBaseAndBucketOverrides;
 import static org.apache.hadoop.fs.s3a.Statistic.*;
 import static org.apache.hadoop.fs.s3a.performance.OperationCost.*;
 import static org.apache.hadoop.test.GenericTestUtils.getTestDir;
@@ -47,6 +50,9 @@
 /**
  * Use metrics to assert about the cost of file API calls.
  * Parameterized on directory marker keep vs delete.
+ * When the FS is instantiated with creation performance, things
+ * behave differently...its value is that of the marker keep flag,
+ * so deletion costs are the same.
  */
 @RunWith(Parameterized.class)
 public class ITestS3AFileOperationCost extends AbstractS3ACostTest {
@@ -71,6 +77,14 @@ public ITestS3AFileOperationCost(
     super(keepMarkers);
   }
 
+  @Override
+  public Configuration createConfiguration() {
+    final Configuration conf = super.createConfiguration();
+    removeBaseAndBucketOverrides(conf, FS_S3A_CREATE_PERFORMANCE);
+    conf.setBoolean(FS_S3A_CREATE_PERFORMANCE, isKeepingMarkers());
+    return conf;
+  }
+
   /**
    * Test the cost of {@code listLocatedStatus(file)}.
    */
@@ -377,7 +391,7 @@ public void testCostOfGlobStatus() throws Throwable {
     // create a bunch of files
     int filesToCreate = 10;
     for (int i = 0; i < filesToCreate; i++) {
-      create(basePath.suffix("/" + i));
+      file(basePath.suffix("/" + i));
     }
 
     fs.globStatus(basePath.suffix("/*"));
@@ -396,7 +410,7 @@ public void testCostOfGlobStatusNoSymlinkResolution() throws Throwable {
     // create a single file, globStatus returning a single file on a pattern
     // triggers attempts at symlinks resolution if configured
     String fileName = "/notASymlinkDOntResolveMeLikeOne";
-    create(basePath.suffix(fileName));
+    file(basePath.suffix(fileName));
     // unguarded: 2 head + 1 list from getFileStatus on path,
     // plus 1 list to match the glob pattern
     // no additional operations from symlink resolution
diff --git a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/ITestS3AFileSystemContract.java b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/ITestS3AFileSystemContract.java
index 7ce7b8385cec4..56827043c9b82 100644
--- a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/ITestS3AFileSystemContract.java
+++ b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/ITestS3AFileSystemContract.java
@@ -19,7 +19,9 @@
 package org.apache.hadoop.fs.s3a;
 
 import java.io.FileNotFoundException;
+import java.io.IOException;
 
+import org.assertj.core.api.Assertions;
 import org.junit.Before;
 import org.junit.Rule;
 import org.junit.Test;
@@ -32,6 +34,7 @@
 import org.apache.hadoop.fs.Path;
 
 import static org.apache.hadoop.fs.contract.ContractTestUtils.skip;
+import static org.apache.hadoop.fs.s3a.S3ATestUtils.isCreatePerformanceEnabled;
 import static org.apache.hadoop.test.LambdaTestUtils.intercept;
 import static org.junit.Assume.*;
 import static org.junit.Assert.*;
@@ -137,4 +140,21 @@ public void testRenameNonExistentPath() throws Exception {
         () -> super.testRenameNonExistentPath());
   }
 
+
+  @Override
+  public void testOverwrite() throws IOException {
+    boolean createPerformance = isCreatePerformanceEnabled(fs);
+    try {
+      super.testOverwrite();
+      Assertions.assertThat(createPerformance)
+          .describedAs("create performance enabled")
+          .isFalse();
+    } catch (AssertionError e) {
+      // swallow the exception if create performance is enabled,
+      // else rethrow
+      if (!createPerformance) {
+        throw e;
+      }
+    }
+  }
 }
diff --git a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/S3ATestUtils.java b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/S3ATestUtils.java
index 3382c300b9315..aa38186c65032 100644
--- a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/S3ATestUtils.java
+++ b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/S3ATestUtils.java
@@ -1554,4 +1554,17 @@ public static boolean isBulkDeleteEnabled(FileSystem fs) {
     return fs.getConf().getBoolean(Constants.ENABLE_MULTI_DELETE, true);
   }
 
+  /**
+   * Does this FS have create performance enabled?
+   * @param fs filesystem
+   * @return true if create performance is enabled
+   * @throws IOException IO problems
+   */
+  public static boolean isCreatePerformanceEnabled(FileSystem fs)
+      throws IOException {
+    return fs.hasPathCapability(new Path("/"), FS_S3A_CREATE_PERFORMANCE_ENABLED);
+  }
+
+
 }
diff --git a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/impl/ITestXAttrCost.java b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/impl/ITestXAttrCost.java
index f6fe68b8e6d33..71fdb7aeaa62c 100644
--- a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/impl/ITestXAttrCost.java
+++ b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/impl/ITestXAttrCost.java
@@ -33,6 +33,7 @@
 import org.apache.hadoop.fs.s3a.S3AFileSystem;
 import org.apache.hadoop.fs.s3a.performance.AbstractS3ACostTest;
 
+import static org.apache.hadoop.fs.s3a.S3ATestUtils.isCreatePerformanceEnabled;
 import static org.apache.hadoop.fs.s3a.Statistic.INVOCATION_OP_XATTR_LIST;
 import static org.apache.hadoop.fs.s3a.Statistic.INVOCATION_XATTR_GET_MAP;
 import static org.apache.hadoop.fs.s3a.Statistic.INVOCATION_XATTR_GET_NAMED;
@@ -43,6 +44,7 @@
 import static org.apache.hadoop.fs.s3a.impl.HeaderProcessing.XA_STANDARD_HEADERS;
 import static org.apache.hadoop.fs.s3a.impl.HeaderProcessing.decodeBytes;
 import static org.apache.hadoop.fs.s3a.performance.OperationCost.CREATE_FILE_OVERWRITE;
+import static org.apache.hadoop.fs.s3a.performance.OperationCost.NO_HEAD_OR_LIST;
 
 /**
  * Invoke XAttr API calls against objects in S3 and validate header
@@ -95,8 +97,11 @@ private void logXAttrs(final Map<String, byte[]> xAttrs) {
   public void testXAttrFile() throws Throwable {
     describe("Test xattr on a file");
     Path testFile = methodPath();
-    create(testFile, true, CREATE_FILE_OVERWRITE);
     S3AFileSystem fs = getFileSystem();
+    boolean createPerformance = isCreatePerformanceEnabled(fs);
+
+    create(testFile, true,
+        createPerformance ? NO_HEAD_OR_LIST : CREATE_FILE_OVERWRITE);
     Map<String, byte[]> xAttrs = verifyMetrics(() ->
         fs.getXAttrs(testFile),
         with(INVOCATION_XATTR_GET_MAP, GET_METADATA_ON_OBJECT));
diff --git a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/performance/ITestCreateFileCost.java b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/performance/ITestCreateFileCost.java
index 39530d97bf794..2d128cffc5af0 100644
--- a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/performance/ITestCreateFileCost.java
+++ b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/performance/ITestCreateFileCost.java
@@ -19,16 +19,22 @@
 package org.apache.hadoop.fs.s3a.performance;
 
 import java.io.IOException;
+import java.util.Arrays;
+import java.util.Collection;
 
 import org.assertj.core.api.Assertions;
 import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.junit.runners.Parameterized;
 
+import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.FSDataOutputStream;
 import org.apache.hadoop.fs.FSDataOutputStreamBuilder;
 import org.apache.hadoop.fs.FileAlreadyExistsException;
 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.fs.contract.ContractTestUtils;
 import org.apache.hadoop.fs.s3a.S3AFileSystem;
+import org.apache.hadoop.fs.s3a.S3ATestUtils;
 
 import static java.util.Objects.requireNonNull;
@@ -36,6 +42,7 @@
 import static org.apache.hadoop.fs.s3a.Constants.FS_S3A_CREATE_HEADER;
 import static org.apache.hadoop.fs.s3a.Constants.FS_S3A_CREATE_PERFORMANCE;
 import static org.apache.hadoop.fs.s3a.Constants.XA_HEADER_PREFIX;
+import static org.apache.hadoop.fs.s3a.S3ATestUtils.removeBaseAndBucketOverrides;
 import static org.apache.hadoop.fs.s3a.Statistic.OBJECT_BULK_DELETE_REQUEST;
 import static org.apache.hadoop.fs.s3a.Statistic.OBJECT_DELETE_REQUEST;
 import static org.apache.hadoop.fs.s3a.performance.OperationCost.CREATE_FILE_NO_OVERWRITE;
@@ -45,6 +52,7 @@
 import static org.apache.hadoop.fs.s3a.performance.OperationCost.GET_FILE_STATUS_ON_DIR_MARKER;
 import static org.apache.hadoop.fs.s3a.performance.OperationCost.GET_FILE_STATUS_ON_FILE;
 import static org.apache.hadoop.fs.s3a.performance.OperationCost.HEAD_OPERATION;
+import static org.apache.hadoop.fs.s3a.performance.OperationCost.LIST_OPERATION;
 import static org.apache.hadoop.fs.s3a.performance.OperationCost.NO_HEAD_OR_LIST;
 
@@ -52,13 +60,55 @@
 /**
  * Assert cost of createFile operations, especially
  * with the FS_S3A_CREATE_PERFORMANCE option.
  */
 @SuppressWarnings("resource")
+@RunWith(Parameterized.class)
 public class ITestCreateFileCost extends AbstractS3ACostTest {
 
+  /**
+   * This test suite is parameterized for the different create file
+   * options.
+   * @return a list of test parameters.
+   */
+  @Parameterized.Parameters
+  public static Collection<Object[]> params() {
+    return Arrays.asList(new Object[][]{
+        {false},
+        {true}
+    });
+  }
+
+  /**
+   * Flag for performance creation; all cost asserts need changing.
+   */
+  private final boolean createPerformance;
+
   /**
    * Create with markers kept, always.
    */
-  public ITestCreateFileCost() {
+  public ITestCreateFileCost(final boolean createPerformance) {
+    // keep markers to permit assertions that create performance
+    // always skips marker deletion.
     super(false);
+    this.createPerformance = createPerformance;
+  }
+
+  /**
+   * Determine the expected cost of a create operation;
+   * if {@link #createPerformance} is true, then the cost is always "no IO".
+   * @param source source cost
+   * @return cost to assert
+   */
+  private OperationCost expected(OperationCost source) {
+    return createPerformance ? NO_HEAD_OR_LIST : source;
+  }
+
+  @Override
+  public Configuration createConfiguration() {
+    final Configuration conf = super.createConfiguration();
+    removeBaseAndBucketOverrides(conf,
+        FS_S3A_CREATE_PERFORMANCE);
+    conf.setBoolean(FS_S3A_CREATE_PERFORMANCE, createPerformance);
+    S3ATestUtils.disableFilesystemCaching(conf);
+    return conf;
   }
 
   @Test
@@ -67,7 +117,7 @@ public void testCreateNoOverwrite() throws Throwable {
     Path testFile = methodPath();
     // when overwrite is false, the path is checked for existence.
     create(testFile, false,
-        CREATE_FILE_NO_OVERWRITE);
+        expected(CREATE_FILE_NO_OVERWRITE));
   }
 
   @Test
@@ -75,7 +125,7 @@ public void testCreateOverwrite() throws Throwable {
     describe("Test file creation with overwrite");
     Path testFile = methodPath();
     // when overwrite is true: only the directory checks take place.
-    create(testFile, true, CREATE_FILE_OVERWRITE);
+    create(testFile, true, expected(CREATE_FILE_OVERWRITE));
   }
 
   @Test
@@ -85,21 +135,45 @@ public void testCreateNoOverwriteFileExists() throws Throwable {
 
     // now there is a file there, an attempt with overwrite == false will
     // fail on the first HEAD.
-    interceptOperation(FileAlreadyExistsException.class, "",
-        FILE_STATUS_FILE_PROBE,
-        () -> file(testFile, false));
+    if (!createPerformance) {
+      interceptOperation(FileAlreadyExistsException.class, "",
+          FILE_STATUS_FILE_PROBE,
+          () -> file(testFile, false));
+    } else {
+      create(testFile, false, NO_HEAD_OR_LIST);
+    }
   }
 
   @Test
-  public void testCreateFileOverDir() throws Throwable {
-    describe("Test cost of create file failing with existing dir");
+  public void testCreateFileOverDirNoOverwrite() throws Throwable {
+    describe("Test cost of create file overwrite=false failing with existing dir");
     Path testFile = dir(methodPath());
 
-    // now there is a file there, an attempt with overwrite == false will
+    // now there is a dir marker there, an attempt with overwrite == false will
     // fail on the first HEAD.
-    interceptOperation(FileAlreadyExistsException.class, "",
-        GET_FILE_STATUS_ON_DIR_MARKER,
-        () -> file(testFile, false));
+    if (!createPerformance) {
+      interceptOperation(FileAlreadyExistsException.class, "",
+          GET_FILE_STATUS_ON_DIR_MARKER,
+          () -> file(testFile, false));
+    } else {
+      create(testFile, false, NO_HEAD_OR_LIST);
+    }
+  }
+
+  @Test
+  public void testCreateFileOverDirWithOverwrite() throws Throwable {
+    describe("Test cost of create file overwrite=true failing with existing dir");
+    Path testFile = dir(methodPath());
+
+    // now there is a dir marker there, an attempt with overwrite == true will
+    // fail on the LIST; no HEAD is issued.
+    if (!createPerformance) {
+      interceptOperation(FileAlreadyExistsException.class, "",
+          LIST_OPERATION,
+          () -> file(testFile, true));
+    } else {
+      create(testFile, true, NO_HEAD_OR_LIST);
+    }
   }
 
   /**
@@ -117,14 +191,18 @@ public void testCreateBuilderSequence() throws Throwable {
     // files and so briefly the path not being present
     // only make sure the dest path isn't a directory.
     buildFile(testFile, true, false,
-        FILE_STATUS_DIR_PROBE);
+        expected(FILE_STATUS_DIR_PROBE));
 
     // now there is a file there, an attempt with overwrite == false will
     // fail on the first HEAD.
-    interceptOperation(FileAlreadyExistsException.class, "",
-        GET_FILE_STATUS_ON_FILE,
-        () -> buildFile(testFile, false, true,
-            GET_FILE_STATUS_ON_FILE));
+    if (!createPerformance) {
+      interceptOperation(FileAlreadyExistsException.class, "",
+          GET_FILE_STATUS_ON_FILE,
+          () -> buildFile(testFile, false, true,
+              GET_FILE_STATUS_ON_FILE));
+    } else {
+      buildFile(testFile, false, true, NO_HEAD_OR_LIST);
+    }
   }
 
   @Test
@@ -162,7 +240,7 @@ public void testCreateFileRecursive() throws Throwable {
     builder.must(FS_S3A_CREATE_HEADER + ".h1", custom);
 
     verifyMetrics(() ->
         build(builder),
-        always(CREATE_FILE_NO_OVERWRITE));
+        always(expected(CREATE_FILE_NO_OVERWRITE)));
 
     // the header is there and the probe should be a single HEAD call.
     String header = verifyMetrics(() ->
@@ -181,7 +259,7 @@ public void testCreateFileNonRecursive() throws Throwable {
 
     verifyMetrics(() ->
         build(fs.createFile(methodPath()).overwrite(true)),
-        always(CREATE_FILE_OVERWRITE));
+        always(expected(CREATE_FILE_OVERWRITE)));
   }
 
@@ -196,7 +274,7 @@ public void testCreateNonRecursive() throws Throwable {
           .close();
       return "";
     },
-        always(CREATE_FILE_OVERWRITE));
+        always(expected(CREATE_FILE_OVERWRITE)));
   }
 
   private FSDataOutputStream build(final FSDataOutputStreamBuilder builder)
diff --git a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/performance/ITestDirectoryMarkerListing.java b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/performance/ITestDirectoryMarkerListing.java
index cd4ee44406676..8dbf5baaa6206 100644
--- a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/performance/ITestDirectoryMarkerListing.java
+++ b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/performance/ITestDirectoryMarkerListing.java
@@ -54,6 +54,7 @@
 import static org.apache.hadoop.fs.s3a.Constants.DIRECTORY_MARKER_POLICY;
 import static org.apache.hadoop.fs.s3a.Constants.DIRECTORY_MARKER_POLICY_DELETE;
 import static org.apache.hadoop.fs.s3a.Constants.DIRECTORY_MARKER_POLICY_KEEP;
+import static org.apache.hadoop.fs.s3a.Constants.FS_S3A_CREATE_PERFORMANCE;
 import static org.apache.hadoop.fs.s3a.S3ATestUtils.getTestBucketName;
 import static org.apache.hadoop.fs.s3a.S3ATestUtils.removeBaseAndBucketOverrides;
 import static org.apache.hadoop.test.LambdaTestUtils.intercept;
@@ -80,8 +81,7 @@
  *
  * Similarly: JUnit assertions over AssertJ.
  *
- * The tests work with unguarded buckets only -the bucket settings are changed
- * appropriately.
+ * s3a create performance is disabled for consistent assertions.
  */
 @RunWith(Parameterized.class)
 public class ITestDirectoryMarkerListing extends AbstractS3ATestBase {
@@ -199,11 +199,13 @@ protected Configuration createConfiguration() {
 
     // directory marker options
     removeBaseAndBucketOverrides(bucketName, conf,
-        DIRECTORY_MARKER_POLICY);
+        DIRECTORY_MARKER_POLICY,
+        FS_S3A_CREATE_PERFORMANCE);
     conf.set(DIRECTORY_MARKER_POLICY,
         keepMarkers
             ? DIRECTORY_MARKER_POLICY_KEEP
             : DIRECTORY_MARKER_POLICY_DELETE);
+    conf.setBoolean(FS_S3A_CREATE_PERFORMANCE, false);
     return conf;
   }
 
diff --git a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/performance/ITestS3ADeleteCost.java b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/performance/ITestS3ADeleteCost.java
index fae2df973af7f..97f51fe2c8dcd 100644
--- a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/performance/ITestS3ADeleteCost.java
+++ b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/performance/ITestS3ADeleteCost.java
@@ -30,6 +30,7 @@
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
+import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.FileStatus;
 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.fs.contract.ContractTestUtils;
@@ -38,6 +39,8 @@
 import org.apache.hadoop.fs.s3a.Tristate;
 import org.apache.hadoop.fs.s3a.impl.StatusProbeEnum;
 
+import static org.apache.hadoop.fs.s3a.Constants.FS_S3A_CREATE_PERFORMANCE;
+import static org.apache.hadoop.fs.s3a.S3ATestUtils.removeBaseAndBucketOverrides;
 import static org.apache.hadoop.fs.s3a.Statistic.*;
 import static org.apache.hadoop.fs.s3a.performance.OperationCost.*;
 import static org.apache.hadoop.fs.s3a.performance.OperationCostValidator.probe;
@@ -74,6 +77,14 @@ public ITestS3ADeleteCost(final String name,
     super(keepMarkers);
   }
 
+  @Override
+  public Configuration createConfiguration() {
+    Configuration conf = super.createConfiguration();
+    removeBaseAndBucketOverrides(conf, FS_S3A_CREATE_PERFORMANCE);
+    conf.setBoolean(FS_S3A_CREATE_PERFORMANCE, false);
+    return conf;
+  }
+
   @Override
   public void teardown() throws Exception {
     if (isKeepingMarkers()) {
diff --git a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/tools/AbstractMarkerToolTest.java b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/tools/AbstractMarkerToolTest.java
index 0c2473a5f61da..759a3bf129eef 100644
--- a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/tools/AbstractMarkerToolTest.java
+++ b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/tools/AbstractMarkerToolTest.java
@@ -73,12 +73,15 @@ protected Configuration createConfiguration() {
     removeBaseAndBucketOverrides(bucketName, conf,
         S3A_BUCKET_PROBE,
         DIRECTORY_MARKER_POLICY,
-        AUTHORITATIVE_PATH);
+        AUTHORITATIVE_PATH,
+        FS_S3A_CREATE_PERFORMANCE);
     // base FS is legacy
     conf.set(DIRECTORY_MARKER_POLICY, DIRECTORY_MARKER_POLICY_DELETE);
+    conf.setBoolean(FS_S3A_CREATE_PERFORMANCE, false);
     // turn off bucket probes for a bit of speedup in the connectors we create.
     conf.setInt(S3A_BUCKET_PROBE, 0);
+
     return conf;
   }
 

From e4eda40ac9966a7d8488bbb0630eaa02c3f2b0e0 Mon Sep 17 00:00:00 2001
From: Junfan Zhang
Date: Fri, 27 Oct 2023 22:11:01 +0800
Subject: [PATCH 23/23] YARN-11597. Fix NPE when loading static files in SLSWebApp (#6216)

Contributed by Junfan Zhang.
Reviewed-by: Shilun Fan
Signed-off-by: Shilun Fan
---
 .../src/main/java/org/apache/hadoop/yarn/sls/web/SLSWebApp.java | 1 +
 1 file changed, 1 insertion(+)

diff --git a/hadoop-tools/hadoop-sls/src/main/java/org/apache/hadoop/yarn/sls/web/SLSWebApp.java b/hadoop-tools/hadoop-sls/src/main/java/org/apache/hadoop/yarn/sls/web/SLSWebApp.java
index a2974615c4c67..ce36854bca2fd 100644
--- a/hadoop-tools/hadoop-sls/src/main/java/org/apache/hadoop/yarn/sls/web/SLSWebApp.java
+++ b/hadoop-tools/hadoop-sls/src/main/java/org/apache/hadoop/yarn/sls/web/SLSWebApp.java
@@ -136,6 +136,7 @@ public void start() throws Exception {
     String webRootDir = getClass().getClassLoader().getResource("html").
         toExternalForm();
     staticHandler.setResourceBase(webRootDir);
+    staticHandler.start();
     Handler handler = new AbstractHandler() {
       @Override