From 5c9d397f109f233db7159a9901358c6b2585a73e Mon Sep 17 00:00:00 2001 From: Pranshu Shukla Date: Tue, 3 Dec 2024 13:31:24 +0530 Subject: [PATCH 01/11] Skip remote-repositories validations for node-joins when RepositoriesService is not in sync with cluster-state Signed-off-by: Pranshu Shukla --- .../discovery/DiscoveryDisruptionIT.java | 93 +++++++++++++++++++ .../remotestore/RemoteStoreNodeService.java | 8 ++ .../repositories/RepositoriesService.java | 6 ++ .../opensearch/test/InternalTestCluster.java | 21 ++++- .../test/OpenSearchIntegTestCase.java | 38 ++++++++ .../test/transport/MockTransportService.java | 30 ++++++ 6 files changed, 195 insertions(+), 1 deletion(-) diff --git a/server/src/internalClusterTest/java/org/opensearch/discovery/DiscoveryDisruptionIT.java b/server/src/internalClusterTest/java/org/opensearch/discovery/DiscoveryDisruptionIT.java index 70124c8c46700..4938617a312fe 100644 --- a/server/src/internalClusterTest/java/org/opensearch/discovery/DiscoveryDisruptionIT.java +++ b/server/src/internalClusterTest/java/org/opensearch/discovery/DiscoveryDisruptionIT.java @@ -32,13 +32,19 @@ package org.opensearch.discovery; +import org.junit.Assert; import org.opensearch.cluster.ClusterState; +import org.opensearch.cluster.coordination.FailedToCommitClusterStateException; import org.opensearch.cluster.coordination.JoinHelper; import org.opensearch.cluster.coordination.PublicationTransportHandler; +import org.opensearch.cluster.metadata.RepositoriesMetadata; +import org.opensearch.cluster.metadata.RepositoryMetadata; import org.opensearch.cluster.node.DiscoveryNode; import org.opensearch.cluster.node.DiscoveryNodes; import org.opensearch.cluster.service.ClusterService; import org.opensearch.common.settings.Settings; +import org.opensearch.repositories.RepositoriesService; +import org.opensearch.repositories.fs.ReloadableFsRepository; import org.opensearch.test.OpenSearchIntegTestCase; import org.opensearch.test.disruption.NetworkDisruption; import org.opensearch.test.disruption.ServiceDisruptionScheme; @@ -50,6 +56,11 @@ import java.util.HashSet; import java.util.Set; import java.util.concurrent.CountDownLatch; +import java.util.stream.Collectors; +import java.util.Random; +import java.util.List; +import java.util.Arrays; + import static org.opensearch.cluster.metadata.IndexMetadata.INDEX_NUMBER_OF_REPLICAS_SETTING; import static org.opensearch.cluster.metadata.IndexMetadata.INDEX_NUMBER_OF_SHARDS_SETTING; @@ -250,4 +261,86 @@ public void testNodeNotReachableFromClusterManager() throws Exception { ensureStableCluster(3); } + /** + * Test Repositories Configured Node Join Commit failures. + */ + public void testElectClusterManagerRemotePublicationConfigurationNodeJoinCommitFails() throws Exception { + final String remoteStateRepoName = "remote-state-repo"; + final String remoteRoutingTableRepoName = "routing-table-repo"; + + + Settings remotePublicationSettings = buildRemotePublicationNodeAttributes( + remoteStateRepoName, + ReloadableFsRepository.TYPE, + remoteRoutingTableRepoName, + ReloadableFsRepository.TYPE + ); + internalCluster().startClusterManagerOnlyNodes(3); + internalCluster().startDataOnlyNodes(3); + + String clusterManagerNode = internalCluster().getClusterManagerName(); + List nonClusterManagerNodes = Arrays.stream(internalCluster().getNodeNames()).filter(node -> !node.equals(clusterManagerNode)).collect(Collectors.toList()); + + ensureStableCluster(6); + + MockTransportService clusterManagerTransportService = (MockTransportService) internalCluster().getInstance( + TransportService.class, + clusterManagerNode + ); + logger.info("Blocking Cluster Manager Commit Request on all nodes"); + nonClusterManagerNodes.forEach( + node -> { + TransportService targetTransportService = internalCluster().getInstance(TransportService.class, node); + clusterManagerTransportService.addOpenSearchFailureException( + targetTransportService, + new FailedToCommitClusterStateException("Blocking Commit"), + PublicationTransportHandler.COMMIT_STATE_ACTION_NAME + ); + } + ); + + logger.info("Starting Node with remote publication settings"); + internalCluster().startDataOnlyNodes(1, remotePublicationSettings, Boolean.TRUE); + + logger.info("Stopping current Cluster Manager"); + internalCluster().stopCurrentClusterManagerNode(); + ensureStableCluster(6); + + Random random = new Random(); + String randomNode = nonClusterManagerNodes.get(random.nextInt(nonClusterManagerNodes.size())); + + RepositoriesMetadata repositoriesMetadata = internalCluster().getInstance(ClusterService.class, randomNode).state().metadata().custom(RepositoriesMetadata.TYPE); + + Boolean isRemoteStateRepoConfigured = Boolean.FALSE; + Boolean isRemoteRoutingTableRepoConfigured = Boolean.FALSE; + + for (RepositoryMetadata repo : repositoriesMetadata.repositories()) { + if (repo.name().equals(remoteStateRepoName)) { + isRemoteStateRepoConfigured = Boolean.TRUE; + } else if (repo.name().equals(remoteRoutingTableRepoName)) { + isRemoteRoutingTableRepoConfigured = Boolean.TRUE; + } + } + + Assert.assertTrue("RemoteState Repo is not set in RepositoriesMetadata", isRemoteStateRepoConfigured); + Assert.assertTrue("RemoteRoutingTable Repo is not set in RepositoriesMetadata", isRemoteRoutingTableRepoConfigured); + + isRemoteStateRepoConfigured = Boolean.FALSE; + isRemoteRoutingTableRepoConfigured = Boolean.FALSE; + + RepositoriesService repositoriesService = internalCluster().getInstance(RepositoriesService.class, randomNode); + + if (repositoriesService.isRepositoryPresent(remoteStateRepoName)) { + isRemoteStateRepoConfigured = Boolean.TRUE; + } + if (repositoriesService.isRepositoryPresent(remoteRoutingTableRepoName)) { + isRemoteRoutingTableRepoConfigured = Boolean.TRUE; + } + + Assert.assertTrue("RemoteState Repo is not set in RepositoryService", isRemoteStateRepoConfigured); + Assert.assertTrue("RemoteRoutingTable Repo is not set in RepositoryService", isRemoteRoutingTableRepoConfigured); + + logger.info("Stopping current Cluster Manager"); + } + } diff --git a/server/src/main/java/org/opensearch/node/remotestore/RemoteStoreNodeService.java b/server/src/main/java/org/opensearch/node/remotestore/RemoteStoreNodeService.java index c1c041ce01198..c037cbf2b71c6 100644 --- a/server/src/main/java/org/opensearch/node/remotestore/RemoteStoreNodeService.java +++ b/server/src/main/java/org/opensearch/node/remotestore/RemoteStoreNodeService.java @@ -183,6 +183,14 @@ public RepositoriesMetadata updateRepositoriesMetadata(DiscoveryNode joiningNode boolean repositoryAlreadyPresent = false; for (RepositoryMetadata existingRepositoryMetadata : existingRepositories.repositories()) { if (newRepositoryMetadata.name().equals(existingRepositoryMetadata.name())) { + // This is to handle cases where-in the during a previous node-join attempt if the publish operation succeeded but + // the commit operation failed, the cluster-state may have the repository metadata which is not applied into the + // repository service. This may lead to assertion failures down the line. + if (!repositoriesService.get().isRepositoryPresent(newRepositoryMetadata.name())) { + logger.warn("remote repository [{}] in cluster-state but repository-service but not present " + + "in repository-service, skipping checks", newRepositoryMetadata.name()); + break; + } try { // This will help in handling two scenarios - // 1. When a fresh cluster is formed and a node tries to join the cluster, the repository diff --git a/server/src/main/java/org/opensearch/repositories/RepositoriesService.java b/server/src/main/java/org/opensearch/repositories/RepositoriesService.java index 9aec81536dbd0..13fb6ed8455d7 100644 --- a/server/src/main/java/org/opensearch/repositories/RepositoriesService.java +++ b/server/src/main/java/org/opensearch/repositories/RepositoriesService.java @@ -578,6 +578,10 @@ public Repository repository(String repositoryName) { throw new RepositoryMissingException(repositoryName); } + public Boolean isRepositoryPresent(final String repositoryName) { + return Objects.nonNull(repositories.get(repositoryName)); + } + public List repositoriesStats() { List activeRepoStats = getRepositoryStatsForActiveRepositories(); return activeRepoStats; @@ -904,6 +908,8 @@ public void ensureValidSystemRepositoryUpdate(RepositoryMetadata newRepositoryMe Settings newRepositoryMetadataSettings = newRepositoryMetadata.settings(); Settings currentRepositoryMetadataSettings = currentRepositoryMetadata.settings(); + assert Objects.nonNull(repository) : String.format("repository [%s] not present in RepositoryService", currentRepositoryMetadata.name()); + List restrictedSettings = repository.getRestrictedSystemRepositorySettings() .stream() .map(setting -> setting.getKey()) diff --git a/test/framework/src/main/java/org/opensearch/test/InternalTestCluster.java b/test/framework/src/main/java/org/opensearch/test/InternalTestCluster.java index fa5fb736f518f..c73620c40bb28 100644 --- a/test/framework/src/main/java/org/opensearch/test/InternalTestCluster.java +++ b/test/framework/src/main/java/org/opensearch/test/InternalTestCluster.java @@ -2322,10 +2322,25 @@ public List startNodes(int numOfNodes, Settings settings) { return startNodes(Collections.nCopies(numOfNodes, settings).toArray(new Settings[0])); } + + /** + * Starts multiple nodes with the given settings and returns their names + */ + public List startNodes(int numOfNodes, Settings settings, Boolean ignoreNodeJoin) { + return startNodes(ignoreNodeJoin, Collections.nCopies(numOfNodes, settings).toArray(new Settings[0])); + } + /** * Starts multiple nodes with the given settings and returns their names */ public synchronized List startNodes(Settings... extraSettings) { + return startNodes(false, extraSettings); + } + + /** + * Starts multiple nodes with the given settings and returns their names + */ + public synchronized List startNodes(Boolean ignoreNodeJoin, Settings... extraSettings) { final int newClusterManagerCount = Math.toIntExact(Stream.of(extraSettings).filter(DiscoveryNode::isClusterManagerNode).count()); final int defaultMinClusterManagerNodes; if (autoManageClusterManagerNodes) { @@ -2377,7 +2392,7 @@ public synchronized List startNodes(Settings... extraSettings) { nodes.add(nodeAndClient); } startAndPublishNodesAndClients(nodes); - if (autoManageClusterManagerNodes) { + if (autoManageClusterManagerNodes && !ignoreNodeJoin) { validateClusterFormed(); } return nodes.stream().map(NodeAndClient::getName).collect(Collectors.toList()); @@ -2422,6 +2437,10 @@ public List startDataOnlyNodes(int numNodes, Settings settings) { return startNodes(numNodes, Settings.builder().put(onlyRole(settings, DiscoveryNodeRole.DATA_ROLE)).build()); } + public List startDataOnlyNodes(int numNodes, Settings settings, Boolean ignoreNodeJoin) { + return startNodes(numNodes, Settings.builder().put(onlyRole(settings, DiscoveryNodeRole.DATA_ROLE)).build(), ignoreNodeJoin); + } + public List startSearchOnlyNodes(int numNodes) { return startSearchOnlyNodes(numNodes, Settings.EMPTY); } diff --git a/test/framework/src/main/java/org/opensearch/test/OpenSearchIntegTestCase.java b/test/framework/src/main/java/org/opensearch/test/OpenSearchIntegTestCase.java index 1ee856d3092f0..c76da83df6cc9 100644 --- a/test/framework/src/main/java/org/opensearch/test/OpenSearchIntegTestCase.java +++ b/test/framework/src/main/java/org/opensearch/test/OpenSearchIntegTestCase.java @@ -181,6 +181,7 @@ import org.junit.AfterClass; import org.junit.BeforeClass; import org.junit.Rule; +import reactor.util.annotation.NonNull; import java.io.IOException; import java.lang.Runtime.Version; @@ -2915,6 +2916,43 @@ protected static Settings buildRemoteStoreNodeAttributes( return settings.build(); } + protected Settings buildRemotePublicationNodeAttributes( + @NonNull String remoteStateRepoName, + @NonNull String remoteStateRepoType, + @NonNull String routingTableRepoName, + @NonNull String routingTableRepoType + ) { + String remoteStateRepositoryTypeAttributeKey = String.format( + Locale.getDefault(), + "node.attr." + REMOTE_STORE_REPOSITORY_TYPE_ATTRIBUTE_KEY_FORMAT, + remoteStateRepoName + ); + String routingTableRepositoryTypeAttributeKey = String.format( + Locale.getDefault(), + "node.attr." + REMOTE_STORE_REPOSITORY_TYPE_ATTRIBUTE_KEY_FORMAT, + routingTableRepoName + ); + String remoteStateRepositorySettingsAttributeKeyPrefix = String.format( + Locale.getDefault(), + "node.attr." + REMOTE_STORE_REPOSITORY_SETTINGS_ATTRIBUTE_KEY_PREFIX, + remoteStateRepoName + ); + String routingTableRepositorySettingsAttributeKeyPrefix = String.format( + Locale.getDefault(), + "node.attr." + REMOTE_STORE_REPOSITORY_SETTINGS_ATTRIBUTE_KEY_PREFIX, + routingTableRepoName + ); + + return Settings.builder() + .put("node.attr." + REMOTE_STORE_CLUSTER_STATE_REPOSITORY_NAME_ATTRIBUTE_KEY, remoteStateRepoName) + .put("node.attr." + REMOTE_STORE_ROUTING_TABLE_REPOSITORY_NAME_ATTRIBUTE_KEY, routingTableRepoName) + .put(remoteStateRepositoryTypeAttributeKey, remoteStateRepoType) + .put(routingTableRepositoryTypeAttributeKey, routingTableRepoType) + .put(remoteStateRepositorySettingsAttributeKeyPrefix+"location", randomRepoPath().toAbsolutePath()) + .put(routingTableRepositorySettingsAttributeKeyPrefix+"location", randomRepoPath().toAbsolutePath()) + .build(); + } + public static String resolvePath(IndexId indexId, String shardId) { PathType pathType = PathType.fromCode(indexId.getShardPathType()); RemoteStorePathStrategy.SnapshotShardPathInput shardPathInput = new RemoteStorePathStrategy.SnapshotShardPathInput.Builder() diff --git a/test/framework/src/main/java/org/opensearch/test/transport/MockTransportService.java b/test/framework/src/main/java/org/opensearch/test/transport/MockTransportService.java index 6bf5381b62cc9..89e90bb044cc4 100644 --- a/test/framework/src/main/java/org/opensearch/test/transport/MockTransportService.java +++ b/test/framework/src/main/java/org/opensearch/test/transport/MockTransportService.java @@ -34,6 +34,7 @@ import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; +import org.opensearch.OpenSearchException; import org.opensearch.Version; import org.opensearch.cluster.ClusterModule; import org.opensearch.cluster.node.DiscoveryNode; @@ -376,6 +377,35 @@ public void addFailToSendNoConnectRule(TransportAddress transportAddress, final }); } + /** + * Adds a rule that will cause matching operations to throw provided OpenSearch Exceptions + */ + public void addOpenSearchFailureException(TransportService transportService, final OpenSearchException exception, final String... blockedActions) { + addOpenSearchFailureException(transportService, exception, new HashSet<>(Arrays.asList(blockedActions))); + } + + /** + * Adds a rule that will cause matching operations to throw provided OpenSearch Exceptions + */ + public void addOpenSearchFailureException(TransportService transportService, OpenSearchException exception, final Set blockedActions) { + for (TransportAddress transportAddress : extractTransportAddresses(transportService)) { + addOpenSearchFailureException(transportAddress, exception, blockedActions); + } + } + + /** + * Adds a rule that will cause matching operations to throw provided OpenSearch Exceptions + */ + public void addOpenSearchFailureException(TransportAddress transportAddress, OpenSearchException exception, final Set blockedActions) { + transport().addSendBehavior(transportAddress, (connection, requestId, action, request, options) -> { + if (blockedActions.contains(action)) { + logger.info("--> preventing {} request", action); + throw exception; + } + connection.sendRequest(requestId, action, request, options); + }); + } + /** * Adds a rule that will cause ignores each send request, simulating an unresponsive node * and failing to connect once the rule was added. From b884cfc73b6f17d0431336d7f1f73b18e1eb2c08 Mon Sep 17 00:00:00 2001 From: Pranshu Shukla Date: Tue, 3 Dec 2024 14:18:32 +0530 Subject: [PATCH 02/11] spotless Signed-off-by: Pranshu Shukla --- .../discovery/DiscoveryDisruptionIT.java | 37 ++++++++++--------- .../remotestore/RemoteStoreNodeService.java | 7 +++- .../repositories/RepositoriesService.java | 5 ++- 3 files changed, 28 insertions(+), 21 deletions(-) diff --git a/server/src/internalClusterTest/java/org/opensearch/discovery/DiscoveryDisruptionIT.java b/server/src/internalClusterTest/java/org/opensearch/discovery/DiscoveryDisruptionIT.java index 4938617a312fe..a0305261d3374 100644 --- a/server/src/internalClusterTest/java/org/opensearch/discovery/DiscoveryDisruptionIT.java +++ b/server/src/internalClusterTest/java/org/opensearch/discovery/DiscoveryDisruptionIT.java @@ -32,7 +32,6 @@ package org.opensearch.discovery; -import org.junit.Assert; import org.opensearch.cluster.ClusterState; import org.opensearch.cluster.coordination.FailedToCommitClusterStateException; import org.opensearch.cluster.coordination.JoinHelper; @@ -52,15 +51,15 @@ import org.opensearch.test.transport.MockTransportService; import org.opensearch.transport.Transport; import org.opensearch.transport.TransportService; +import org.junit.Assert; +import java.util.Arrays; import java.util.HashSet; +import java.util.List; +import java.util.Random; import java.util.Set; import java.util.concurrent.CountDownLatch; import java.util.stream.Collectors; -import java.util.Random; -import java.util.List; -import java.util.Arrays; - import static org.opensearch.cluster.metadata.IndexMetadata.INDEX_NUMBER_OF_REPLICAS_SETTING; import static org.opensearch.cluster.metadata.IndexMetadata.INDEX_NUMBER_OF_SHARDS_SETTING; @@ -268,7 +267,6 @@ public void testElectClusterManagerRemotePublicationConfigurationNodeJoinCommitF final String remoteStateRepoName = "remote-state-repo"; final String remoteRoutingTableRepoName = "routing-table-repo"; - Settings remotePublicationSettings = buildRemotePublicationNodeAttributes( remoteStateRepoName, ReloadableFsRepository.TYPE, @@ -279,7 +277,9 @@ public void testElectClusterManagerRemotePublicationConfigurationNodeJoinCommitF internalCluster().startDataOnlyNodes(3); String clusterManagerNode = internalCluster().getClusterManagerName(); - List nonClusterManagerNodes = Arrays.stream(internalCluster().getNodeNames()).filter(node -> !node.equals(clusterManagerNode)).collect(Collectors.toList()); + List nonClusterManagerNodes = Arrays.stream(internalCluster().getNodeNames()) + .filter(node -> !node.equals(clusterManagerNode)) + .collect(Collectors.toList()); ensureStableCluster(6); @@ -288,16 +288,14 @@ public void testElectClusterManagerRemotePublicationConfigurationNodeJoinCommitF clusterManagerNode ); logger.info("Blocking Cluster Manager Commit Request on all nodes"); - nonClusterManagerNodes.forEach( - node -> { - TransportService targetTransportService = internalCluster().getInstance(TransportService.class, node); - clusterManagerTransportService.addOpenSearchFailureException( - targetTransportService, - new FailedToCommitClusterStateException("Blocking Commit"), - PublicationTransportHandler.COMMIT_STATE_ACTION_NAME - ); - } - ); + nonClusterManagerNodes.forEach(node -> { + TransportService targetTransportService = internalCluster().getInstance(TransportService.class, node); + clusterManagerTransportService.addOpenSearchFailureException( + targetTransportService, + new FailedToCommitClusterStateException("Blocking Commit"), + PublicationTransportHandler.COMMIT_STATE_ACTION_NAME + ); + }); logger.info("Starting Node with remote publication settings"); internalCluster().startDataOnlyNodes(1, remotePublicationSettings, Boolean.TRUE); @@ -309,7 +307,10 @@ public void testElectClusterManagerRemotePublicationConfigurationNodeJoinCommitF Random random = new Random(); String randomNode = nonClusterManagerNodes.get(random.nextInt(nonClusterManagerNodes.size())); - RepositoriesMetadata repositoriesMetadata = internalCluster().getInstance(ClusterService.class, randomNode).state().metadata().custom(RepositoriesMetadata.TYPE); + RepositoriesMetadata repositoriesMetadata = internalCluster().getInstance(ClusterService.class, randomNode) + .state() + .metadata() + .custom(RepositoriesMetadata.TYPE); Boolean isRemoteStateRepoConfigured = Boolean.FALSE; Boolean isRemoteRoutingTableRepoConfigured = Boolean.FALSE; diff --git a/server/src/main/java/org/opensearch/node/remotestore/RemoteStoreNodeService.java b/server/src/main/java/org/opensearch/node/remotestore/RemoteStoreNodeService.java index c037cbf2b71c6..10441a74e174c 100644 --- a/server/src/main/java/org/opensearch/node/remotestore/RemoteStoreNodeService.java +++ b/server/src/main/java/org/opensearch/node/remotestore/RemoteStoreNodeService.java @@ -187,8 +187,11 @@ public RepositoriesMetadata updateRepositoriesMetadata(DiscoveryNode joiningNode // the commit operation failed, the cluster-state may have the repository metadata which is not applied into the // repository service. This may lead to assertion failures down the line. if (!repositoriesService.get().isRepositoryPresent(newRepositoryMetadata.name())) { - logger.warn("remote repository [{}] in cluster-state but repository-service but not present " - + "in repository-service, skipping checks", newRepositoryMetadata.name()); + logger.warn( + "remote repository [{}] in cluster-state but repository-service but not present " + + "in repository-service, skipping checks", + newRepositoryMetadata.name() + ); break; } try { diff --git a/server/src/main/java/org/opensearch/repositories/RepositoriesService.java b/server/src/main/java/org/opensearch/repositories/RepositoriesService.java index 13fb6ed8455d7..08008b12d0929 100644 --- a/server/src/main/java/org/opensearch/repositories/RepositoriesService.java +++ b/server/src/main/java/org/opensearch/repositories/RepositoriesService.java @@ -908,7 +908,10 @@ public void ensureValidSystemRepositoryUpdate(RepositoryMetadata newRepositoryMe Settings newRepositoryMetadataSettings = newRepositoryMetadata.settings(); Settings currentRepositoryMetadataSettings = currentRepositoryMetadata.settings(); - assert Objects.nonNull(repository) : String.format("repository [%s] not present in RepositoryService", currentRepositoryMetadata.name()); + assert Objects.nonNull(repository) : String.format( + "repository [%s] not present in RepositoryService", + currentRepositoryMetadata.name() + ); List restrictedSettings = repository.getRestrictedSystemRepositorySettings() .stream() From 6ea3fd608a02bb6e7887ee3f1c54e8e5cdb6217a Mon Sep 17 00:00:00 2001 From: Pranshu Shukla Date: Tue, 3 Dec 2024 14:29:50 +0530 Subject: [PATCH 03/11] spotless - test framework Signed-off-by: Pranshu Shukla --- .../opensearch/test/InternalTestCluster.java | 1 - .../test/OpenSearchIntegTestCase.java | 9 +++++---- .../test/transport/MockTransportService.java | 18 +++++++++++++++--- 3 files changed, 20 insertions(+), 8 deletions(-) diff --git a/test/framework/src/main/java/org/opensearch/test/InternalTestCluster.java b/test/framework/src/main/java/org/opensearch/test/InternalTestCluster.java index c73620c40bb28..14f0a30ef3f84 100644 --- a/test/framework/src/main/java/org/opensearch/test/InternalTestCluster.java +++ b/test/framework/src/main/java/org/opensearch/test/InternalTestCluster.java @@ -2322,7 +2322,6 @@ public List startNodes(int numOfNodes, Settings settings) { return startNodes(Collections.nCopies(numOfNodes, settings).toArray(new Settings[0])); } - /** * Starts multiple nodes with the given settings and returns their names */ diff --git a/test/framework/src/main/java/org/opensearch/test/OpenSearchIntegTestCase.java b/test/framework/src/main/java/org/opensearch/test/OpenSearchIntegTestCase.java index c76da83df6cc9..1c26ea4ca2c91 100644 --- a/test/framework/src/main/java/org/opensearch/test/OpenSearchIntegTestCase.java +++ b/test/framework/src/main/java/org/opensearch/test/OpenSearchIntegTestCase.java @@ -181,7 +181,6 @@ import org.junit.AfterClass; import org.junit.BeforeClass; import org.junit.Rule; -import reactor.util.annotation.NonNull; import java.io.IOException; import java.lang.Runtime.Version; @@ -215,6 +214,8 @@ import java.util.function.Function; import java.util.stream.Collectors; +import reactor.util.annotation.NonNull; + import static org.opensearch.cluster.metadata.IndexMetadata.SETTING_NUMBER_OF_REPLICAS; import static org.opensearch.cluster.metadata.IndexMetadata.SETTING_NUMBER_OF_SHARDS; import static org.opensearch.common.unit.TimeValue.timeValueMillis; @@ -2919,7 +2920,7 @@ protected static Settings buildRemoteStoreNodeAttributes( protected Settings buildRemotePublicationNodeAttributes( @NonNull String remoteStateRepoName, @NonNull String remoteStateRepoType, - @NonNull String routingTableRepoName, + @NonNull String routingTableRepoName, @NonNull String routingTableRepoType ) { String remoteStateRepositoryTypeAttributeKey = String.format( @@ -2948,8 +2949,8 @@ protected Settings buildRemotePublicationNodeAttributes( .put("node.attr." + REMOTE_STORE_ROUTING_TABLE_REPOSITORY_NAME_ATTRIBUTE_KEY, routingTableRepoName) .put(remoteStateRepositoryTypeAttributeKey, remoteStateRepoType) .put(routingTableRepositoryTypeAttributeKey, routingTableRepoType) - .put(remoteStateRepositorySettingsAttributeKeyPrefix+"location", randomRepoPath().toAbsolutePath()) - .put(routingTableRepositorySettingsAttributeKeyPrefix+"location", randomRepoPath().toAbsolutePath()) + .put(remoteStateRepositorySettingsAttributeKeyPrefix + "location", randomRepoPath().toAbsolutePath()) + .put(routingTableRepositorySettingsAttributeKeyPrefix + "location", randomRepoPath().toAbsolutePath()) .build(); } diff --git a/test/framework/src/main/java/org/opensearch/test/transport/MockTransportService.java b/test/framework/src/main/java/org/opensearch/test/transport/MockTransportService.java index 89e90bb044cc4..d8146e6f7d540 100644 --- a/test/framework/src/main/java/org/opensearch/test/transport/MockTransportService.java +++ b/test/framework/src/main/java/org/opensearch/test/transport/MockTransportService.java @@ -380,14 +380,22 @@ public void addFailToSendNoConnectRule(TransportAddress transportAddress, final /** * Adds a rule that will cause matching operations to throw provided OpenSearch Exceptions */ - public void addOpenSearchFailureException(TransportService transportService, final OpenSearchException exception, final String... blockedActions) { + public void addOpenSearchFailureException( + TransportService transportService, + final OpenSearchException exception, + final String... blockedActions + ) { addOpenSearchFailureException(transportService, exception, new HashSet<>(Arrays.asList(blockedActions))); } /** * Adds a rule that will cause matching operations to throw provided OpenSearch Exceptions */ - public void addOpenSearchFailureException(TransportService transportService, OpenSearchException exception, final Set blockedActions) { + public void addOpenSearchFailureException( + TransportService transportService, + OpenSearchException exception, + final Set blockedActions + ) { for (TransportAddress transportAddress : extractTransportAddresses(transportService)) { addOpenSearchFailureException(transportAddress, exception, blockedActions); } @@ -396,7 +404,11 @@ public void addOpenSearchFailureException(TransportService transportService, Ope /** * Adds a rule that will cause matching operations to throw provided OpenSearch Exceptions */ - public void addOpenSearchFailureException(TransportAddress transportAddress, OpenSearchException exception, final Set blockedActions) { + public void addOpenSearchFailureException( + TransportAddress transportAddress, + OpenSearchException exception, + final Set blockedActions + ) { transport().addSendBehavior(transportAddress, (connection, requestId, action, request, options) -> { if (blockedActions.contains(action)) { logger.info("--> preventing {} request", action); From a0c1abda5270116fd8a815bd961c83cc3e86ba4f Mon Sep 17 00:00:00 2001 From: Pranshu Shukla Date: Tue, 3 Dec 2024 15:33:54 +0530 Subject: [PATCH 04/11] fix: forbiddenAPIs check Signed-off-by: Pranshu Shukla --- .../java/org/opensearch/repositories/RepositoriesService.java | 2 ++ 1 file changed, 2 insertions(+) diff --git a/server/src/main/java/org/opensearch/repositories/RepositoriesService.java b/server/src/main/java/org/opensearch/repositories/RepositoriesService.java index 08008b12d0929..0b9536e273f25 100644 --- a/server/src/main/java/org/opensearch/repositories/RepositoriesService.java +++ b/server/src/main/java/org/opensearch/repositories/RepositoriesService.java @@ -80,6 +80,7 @@ import java.util.Collections; import java.util.HashMap; import java.util.List; +import java.util.Locale; import java.util.Map; import java.util.Objects; import java.util.Set; @@ -909,6 +910,7 @@ public void ensureValidSystemRepositoryUpdate(RepositoryMetadata newRepositoryMe Settings currentRepositoryMetadataSettings = currentRepositoryMetadata.settings(); assert Objects.nonNull(repository) : String.format( + Locale.ROOT, "repository [%s] not present in RepositoryService", currentRepositoryMetadata.name() ); From ed9f5e7bb421f77e8ad8a1333729dc95b90b5f80 Mon Sep 17 00:00:00 2001 From: Pranshu Shukla Date: Tue, 3 Dec 2024 16:16:40 +0530 Subject: [PATCH 05/11] fix forbiddenApisInternalClusterTest Signed-off-by: Pranshu Shukla --- .../java/org/opensearch/discovery/DiscoveryDisruptionIT.java | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/server/src/internalClusterTest/java/org/opensearch/discovery/DiscoveryDisruptionIT.java b/server/src/internalClusterTest/java/org/opensearch/discovery/DiscoveryDisruptionIT.java index a0305261d3374..2f6ea600d895f 100644 --- a/server/src/internalClusterTest/java/org/opensearch/discovery/DiscoveryDisruptionIT.java +++ b/server/src/internalClusterTest/java/org/opensearch/discovery/DiscoveryDisruptionIT.java @@ -41,6 +41,7 @@ import org.opensearch.cluster.node.DiscoveryNode; import org.opensearch.cluster.node.DiscoveryNodes; import org.opensearch.cluster.service.ClusterService; +import org.opensearch.common.Randomness; import org.opensearch.common.settings.Settings; import org.opensearch.repositories.RepositoriesService; import org.opensearch.repositories.fs.ReloadableFsRepository; @@ -56,7 +57,6 @@ import java.util.Arrays; import java.util.HashSet; import java.util.List; -import java.util.Random; import java.util.Set; import java.util.concurrent.CountDownLatch; import java.util.stream.Collectors; @@ -304,8 +304,7 @@ public void testElectClusterManagerRemotePublicationConfigurationNodeJoinCommitF internalCluster().stopCurrentClusterManagerNode(); ensureStableCluster(6); - Random random = new Random(); - String randomNode = nonClusterManagerNodes.get(random.nextInt(nonClusterManagerNodes.size())); + String randomNode = nonClusterManagerNodes.get(Randomness.get().nextInt(nonClusterManagerNodes.size())); RepositoriesMetadata repositoriesMetadata = internalCluster().getInstance(ClusterService.class, randomNode) .state() From d009e5342c1d5ec1d1c8561e7686fcae51884fe0 Mon Sep 17 00:00:00 2001 From: Pranshu Shukla Date: Fri, 6 Dec 2024 14:02:29 +0530 Subject: [PATCH 06/11] Refactor Signed-off-by: Pranshu Shukla --- .../discovery/DiscoveryDisruptionIT.java | 48 ++++++++++--- .../remotestore/RemoteStoreNodeService.java | 17 +++-- .../repositories/RepositoriesService.java | 4 -- .../coordination/JoinTaskExecutorTests.java | 67 +++++++++++++++++++ .../test/transport/MockTransportService.java | 42 ------------ 5 files changed, 116 insertions(+), 62 deletions(-) diff --git a/server/src/internalClusterTest/java/org/opensearch/discovery/DiscoveryDisruptionIT.java b/server/src/internalClusterTest/java/org/opensearch/discovery/DiscoveryDisruptionIT.java index 2f6ea600d895f..21a395ad4c980 100644 --- a/server/src/internalClusterTest/java/org/opensearch/discovery/DiscoveryDisruptionIT.java +++ b/server/src/internalClusterTest/java/org/opensearch/discovery/DiscoveryDisruptionIT.java @@ -44,6 +44,8 @@ import org.opensearch.common.Randomness; import org.opensearch.common.settings.Settings; import org.opensearch.repositories.RepositoriesService; +import org.opensearch.repositories.Repository; +import org.opensearch.repositories.RepositoryMissingException; import org.opensearch.repositories.fs.ReloadableFsRepository; import org.opensearch.test.OpenSearchIntegTestCase; import org.opensearch.test.disruption.NetworkDisruption; @@ -57,6 +59,7 @@ import java.util.Arrays; import java.util.HashSet; import java.util.List; +import java.util.Objects; import java.util.Set; import java.util.concurrent.CountDownLatch; import java.util.stream.Collectors; @@ -261,7 +264,9 @@ public void testNodeNotReachableFromClusterManager() throws Exception { } /** - * Test Repositories Configured Node Join Commit failures. + * Tests the scenario where-in a cluster-state containing new repository meta-data as part of a node-join from a + * repository-configured node fails on a commit stag and has a master switch. This would lead to master nodes + * doing another round of node-joins with the new cluster-state as the previous attempt had a successful publish. */ public void testElectClusterManagerRemotePublicationConfigurationNodeJoinCommitFails() throws Exception { final String remoteStateRepoName = "remote-state-repo"; @@ -288,20 +293,32 @@ public void testElectClusterManagerRemotePublicationConfigurationNodeJoinCommitF clusterManagerNode ); logger.info("Blocking Cluster Manager Commit Request on all nodes"); + // This is to allow the new node to have commit failures on the nodes in the send path itself. This will lead to the + // nodes have a successful publish operation but failed commit operation. This will come into play once the new node joins nonClusterManagerNodes.forEach(node -> { TransportService targetTransportService = internalCluster().getInstance(TransportService.class, node); - clusterManagerTransportService.addOpenSearchFailureException( - targetTransportService, - new FailedToCommitClusterStateException("Blocking Commit"), - PublicationTransportHandler.COMMIT_STATE_ACTION_NAME - ); + clusterManagerTransportService.addSendBehavior(targetTransportService, (connection, requestId, action, request, options) -> { + if (action.equals(PublicationTransportHandler.COMMIT_STATE_ACTION_NAME)) { + logger.info("--> preventing {} request", PublicationTransportHandler.COMMIT_STATE_ACTION_NAME); + throw new FailedToCommitClusterStateException("Blocking Commit"); + } + connection.sendRequest(requestId, action, request, options); + }); }); logger.info("Starting Node with remote publication settings"); + // Start a node with remote-publication repositories configured. This will lead to the active cluster-manager create + // a new cluster-state event with the new node-join along with new repositories setup in the cluster meta-data. internalCluster().startDataOnlyNodes(1, remotePublicationSettings, Boolean.TRUE); logger.info("Stopping current Cluster Manager"); + // We stop the current cluster-manager whose outbound paths were blocked. This is to force a new election onto nodes + // we had the new cluster-state published but not commited. internalCluster().stopCurrentClusterManagerNode(); + + // We expect that the repositories validations are skipped in this case and node-joins succeeds as expected. The + // repositories validations are skipped because even though the cluster-state is updated in the persisted registry, + // the repository service will not be updated as the commit attempt failed. ensureStableCluster(6); String randomNode = nonClusterManagerNodes.get(Randomness.get().nextInt(nonClusterManagerNodes.size())); @@ -330,11 +347,22 @@ public void testElectClusterManagerRemotePublicationConfigurationNodeJoinCommitF RepositoriesService repositoriesService = internalCluster().getInstance(RepositoriesService.class, randomNode); - if (repositoriesService.isRepositoryPresent(remoteStateRepoName)) { - isRemoteStateRepoConfigured = Boolean.TRUE; + try { + Repository remoteStateRepo = repositoriesService.repository(remoteStateRepoName); + if (Objects.nonNull(remoteStateRepo)) { + isRemoteStateRepoConfigured = Boolean.TRUE; + } + } catch (RepositoryMissingException e) { + isRemoteStateRepoConfigured = Boolean.FALSE; } - if (repositoriesService.isRepositoryPresent(remoteRoutingTableRepoName)) { - isRemoteRoutingTableRepoConfigured = Boolean.TRUE; + + try { + Repository routingTableRepo = repositoriesService.repository(remoteRoutingTableRepoName); + if (Objects.nonNull(routingTableRepo)) { + isRemoteRoutingTableRepoConfigured = Boolean.TRUE; + } + } catch (RepositoryMissingException e) { + isRemoteRoutingTableRepoConfigured = Boolean.FALSE; } Assert.assertTrue("RemoteState Repo is not set in RepositoryService", isRemoteStateRepoConfigured); diff --git a/server/src/main/java/org/opensearch/node/remotestore/RemoteStoreNodeService.java b/server/src/main/java/org/opensearch/node/remotestore/RemoteStoreNodeService.java index 10441a74e174c..da94c8a7836b3 100644 --- a/server/src/main/java/org/opensearch/node/remotestore/RemoteStoreNodeService.java +++ b/server/src/main/java/org/opensearch/node/remotestore/RemoteStoreNodeService.java @@ -21,6 +21,7 @@ import org.opensearch.repositories.RepositoriesService; import org.opensearch.repositories.Repository; import org.opensearch.repositories.RepositoryException; +import org.opensearch.repositories.RepositoryMissingException; import org.opensearch.threadpool.ThreadPool; import java.util.ArrayList; @@ -183,13 +184,17 @@ public RepositoriesMetadata updateRepositoriesMetadata(DiscoveryNode joiningNode boolean repositoryAlreadyPresent = false; for (RepositoryMetadata existingRepositoryMetadata : existingRepositories.repositories()) { if (newRepositoryMetadata.name().equals(existingRepositoryMetadata.name())) { - // This is to handle cases where-in the during a previous node-join attempt if the publish operation succeeded but - // the commit operation failed, the cluster-state may have the repository metadata which is not applied into the - // repository service. This may lead to assertion failures down the line. - if (!repositoriesService.get().isRepositoryPresent(newRepositoryMetadata.name())) { + try { + // This is to handle cases where-in the during a previous node-join attempt if the publish operation succeeded + // but + // the commit operation failed, the cluster-state may have the repository metadata which is not applied into the + // repository service. This may lead to assertion failures down the line. + String repositoryName = newRepositoryMetadata.name(); + repositoriesService.get().repository(repositoryName); + } catch (RepositoryMissingException e) { logger.warn( - "remote repository [{}] in cluster-state but repository-service but not present " - + "in repository-service, skipping checks", + "Skipping repositories metadata checks: Remote repository [{}] is in the cluster state but not present " + + "in the repository service.", newRepositoryMetadata.name() ); break; diff --git a/server/src/main/java/org/opensearch/repositories/RepositoriesService.java b/server/src/main/java/org/opensearch/repositories/RepositoriesService.java index 0b9536e273f25..49065be0abb25 100644 --- a/server/src/main/java/org/opensearch/repositories/RepositoriesService.java +++ b/server/src/main/java/org/opensearch/repositories/RepositoriesService.java @@ -579,10 +579,6 @@ public Repository repository(String repositoryName) { throw new RepositoryMissingException(repositoryName); } - public Boolean isRepositoryPresent(final String repositoryName) { - return Objects.nonNull(repositories.get(repositoryName)); - } - public List repositoriesStats() { List activeRepoStats = getRepositoryStatsForActiveRepositories(); return activeRepoStats; diff --git a/server/src/test/java/org/opensearch/cluster/coordination/JoinTaskExecutorTests.java b/server/src/test/java/org/opensearch/cluster/coordination/JoinTaskExecutorTests.java index f6fb203bfe1a9..9590e5615d451 100644 --- a/server/src/test/java/org/opensearch/cluster/coordination/JoinTaskExecutorTests.java +++ b/server/src/test/java/org/opensearch/cluster/coordination/JoinTaskExecutorTests.java @@ -55,6 +55,7 @@ import org.opensearch.common.util.FeatureFlags; import org.opensearch.node.remotestore.RemoteStoreNodeService; import org.opensearch.repositories.RepositoriesService; +import org.opensearch.repositories.RepositoryMissingException; import org.opensearch.repositories.blobstore.BlobStoreRepository; import org.opensearch.test.OpenSearchTestCase; import org.opensearch.test.VersionUtils; @@ -1378,6 +1379,72 @@ public void testJoinRemoteStoreClusterWithRemotePublicationNodeInMixedMode() { JoinTaskExecutor.ensureNodesCompatibility(joiningNode, currentState.getNodes(), currentState.metadata()); } + public void testUpdatesClusterStateWithRepositoryMetadataNotInSync() throws Exception { + Map newNodeAttributes = new HashMap<>(); + newNodeAttributes.putAll(remoteStateNodeAttributes(CLUSTER_STATE_REPO)); + newNodeAttributes.putAll(remoteRoutingTableAttributes(ROUTING_TABLE_REPO)); + + final AllocationService allocationService = mock(AllocationService.class); + when(allocationService.adaptAutoExpandReplicas(any())).then(invocationOnMock -> invocationOnMock.getArguments()[0]); + final RerouteService rerouteService = (reason, priority, listener) -> listener.onResponse(null); + RepositoriesService repositoriesService = mock(RepositoriesService.class); + when(repositoriesService.repository(any())).thenThrow(RepositoryMissingException.class); + final RemoteStoreNodeService remoteStoreNodeService = new RemoteStoreNodeService(new SetOnce<>(repositoriesService)::get, null); + + final JoinTaskExecutor joinTaskExecutor = new JoinTaskExecutor( + Settings.EMPTY, + allocationService, + logger, + rerouteService, + remoteStoreNodeService + ); + + final DiscoveryNode clusterManagerNode = new DiscoveryNode( + UUIDs.base64UUID(), + buildNewFakeTransportAddress(), + newNodeAttributes, + DiscoveryNodeRole.BUILT_IN_ROLES, + Version.CURRENT + ); + + final RepositoryMetadata clusterStateRepo = buildRepositoryMetadata(clusterManagerNode, CLUSTER_STATE_REPO); + final RepositoryMetadata routingTableRepo = buildRepositoryMetadata(clusterManagerNode, ROUTING_TABLE_REPO); + List repositoriesMetadata = new ArrayList<>() { + { + add(clusterStateRepo); + add(routingTableRepo); + } + }; + + final ClusterState clusterState = ClusterState.builder(ClusterName.DEFAULT) + .nodes( + DiscoveryNodes.builder() + .add(clusterManagerNode) + .localNodeId(clusterManagerNode.getId()) + .clusterManagerNodeId(clusterManagerNode.getId()) + ) + .metadata(Metadata.builder().putCustom(RepositoriesMetadata.TYPE, new RepositoriesMetadata(repositoriesMetadata))) + .build(); + + final DiscoveryNode joiningNode = new DiscoveryNode( + UUIDs.base64UUID(), + buildNewFakeTransportAddress(), + newNodeAttributes, + DiscoveryNodeRole.BUILT_IN_ROLES, + Version.CURRENT + ); + + final ClusterStateTaskExecutor.ClusterTasksResult result = joinTaskExecutor.execute( + clusterState, + List.of(new JoinTaskExecutor.Task(joiningNode, "test")) + ); + assertThat(result.executionResults.entrySet(), hasSize(1)); + final ClusterStateTaskExecutor.TaskResult taskResult = result.executionResults.values().iterator().next(); + assertTrue(taskResult.isSuccess()); + validatePublicationRepositoryMetadata(result.resultingState, clusterManagerNode); + + } + private void validateRepositoryMetadata(ClusterState updatedState, DiscoveryNode existingNode, int expectedRepositories) throws Exception { diff --git a/test/framework/src/main/java/org/opensearch/test/transport/MockTransportService.java b/test/framework/src/main/java/org/opensearch/test/transport/MockTransportService.java index d8146e6f7d540..6bf5381b62cc9 100644 --- a/test/framework/src/main/java/org/opensearch/test/transport/MockTransportService.java +++ b/test/framework/src/main/java/org/opensearch/test/transport/MockTransportService.java @@ -34,7 +34,6 @@ import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; -import org.opensearch.OpenSearchException; import org.opensearch.Version; import org.opensearch.cluster.ClusterModule; import org.opensearch.cluster.node.DiscoveryNode; @@ -377,47 +376,6 @@ public void addFailToSendNoConnectRule(TransportAddress transportAddress, final }); } - /** - * Adds a rule that will cause matching operations to throw provided OpenSearch Exceptions - */ - public void addOpenSearchFailureException( - TransportService transportService, - final OpenSearchException exception, - final String... blockedActions - ) { - addOpenSearchFailureException(transportService, exception, new HashSet<>(Arrays.asList(blockedActions))); - } - - /** - * Adds a rule that will cause matching operations to throw provided OpenSearch Exceptions - */ - public void addOpenSearchFailureException( - TransportService transportService, - OpenSearchException exception, - final Set blockedActions - ) { - for (TransportAddress transportAddress : extractTransportAddresses(transportService)) { - addOpenSearchFailureException(transportAddress, exception, blockedActions); - } - } - - /** - * Adds a rule that will cause matching operations to throw provided OpenSearch Exceptions - */ - public void addOpenSearchFailureException( - TransportAddress transportAddress, - OpenSearchException exception, - final Set blockedActions - ) { - transport().addSendBehavior(transportAddress, (connection, requestId, action, request, options) -> { - if (blockedActions.contains(action)) { - logger.info("--> preventing {} request", action); - throw exception; - } - connection.sendRequest(requestId, action, request, options); - }); - } - /** * Adds a rule that will cause ignores each send request, simulating an unresponsive node * and failing to connect once the rule was added. From 3df57ce02d4d47220b785cbb819b1b38139d44ba Mon Sep 17 00:00:00 2001 From: Pranshu Shukla Date: Fri, 6 Dec 2024 21:44:56 +0530 Subject: [PATCH 07/11] address comments Signed-off-by: Pranshu Shukla --- .../discovery/DiscoveryDisruptionIT.java | 59 +++++++++++++------ .../remotestore/RemoteStoreNodeService.java | 9 ++- .../opensearch/test/InternalTestCluster.java | 8 +-- 3 files changed, 48 insertions(+), 28 deletions(-) diff --git a/server/src/internalClusterTest/java/org/opensearch/discovery/DiscoveryDisruptionIT.java b/server/src/internalClusterTest/java/org/opensearch/discovery/DiscoveryDisruptionIT.java index 21a395ad4c980..ed602a2cf4358 100644 --- a/server/src/internalClusterTest/java/org/opensearch/discovery/DiscoveryDisruptionIT.java +++ b/server/src/internalClusterTest/java/org/opensearch/discovery/DiscoveryDisruptionIT.java @@ -35,6 +35,7 @@ import org.opensearch.cluster.ClusterState; import org.opensearch.cluster.coordination.FailedToCommitClusterStateException; import org.opensearch.cluster.coordination.JoinHelper; +import org.opensearch.cluster.coordination.PersistedStateRegistry; import org.opensearch.cluster.coordination.PublicationTransportHandler; import org.opensearch.cluster.metadata.RepositoriesMetadata; import org.opensearch.cluster.metadata.RepositoryMetadata; @@ -311,6 +312,30 @@ public void testElectClusterManagerRemotePublicationConfigurationNodeJoinCommitF // a new cluster-state event with the new node-join along with new repositories setup in the cluster meta-data. internalCluster().startDataOnlyNodes(1, remotePublicationSettings, Boolean.TRUE); + // Checking if publish succeeded in the nodes before shutting down the blocked cluster-manager + assertBusy(() -> { + String randomNode = nonClusterManagerNodes.get(Randomness.get().nextInt(nonClusterManagerNodes.size())); + PersistedStateRegistry registry = internalCluster().getInstance(PersistedStateRegistry.class, randomNode); + + ClusterState state = registry.getPersistedState(PersistedStateRegistry.PersistedStateType.LOCAL).getLastAcceptedState(); + RepositoriesMetadata repositoriesMetadata = state.metadata().custom(RepositoriesMetadata.TYPE); + Boolean isRemoteStateRepoConfigured = Boolean.FALSE; + Boolean isRemoteRoutingTableRepoConfigured = Boolean.FALSE; + + assertNotNull(repositoriesMetadata); + assertNotNull(repositoriesMetadata.repositories()); + + for (RepositoryMetadata repo : repositoriesMetadata.repositories()) { + if (repo.name().equals(remoteStateRepoName)) { + isRemoteStateRepoConfigured = Boolean.TRUE; + } else if (repo.name().equals(remoteRoutingTableRepoName)) { + isRemoteRoutingTableRepoConfigured = Boolean.TRUE; + } + } + assertTrue(isRemoteStateRepoConfigured); + assertTrue(isRemoteRoutingTableRepoConfigured); + }); + logger.info("Stopping current Cluster Manager"); // We stop the current cluster-manager whose outbound paths were blocked. This is to force a new election onto nodes // we had the new cluster-state published but not commited. @@ -323,6 +348,7 @@ public void testElectClusterManagerRemotePublicationConfigurationNodeJoinCommitF String randomNode = nonClusterManagerNodes.get(Randomness.get().nextInt(nonClusterManagerNodes.size())); + // Checking if the final cluster-state is updated. RepositoriesMetadata repositoriesMetadata = internalCluster().getInstance(ClusterService.class, randomNode) .state() .metadata() @@ -342,33 +368,28 @@ public void testElectClusterManagerRemotePublicationConfigurationNodeJoinCommitF Assert.assertTrue("RemoteState Repo is not set in RepositoriesMetadata", isRemoteStateRepoConfigured); Assert.assertTrue("RemoteRoutingTable Repo is not set in RepositoriesMetadata", isRemoteRoutingTableRepoConfigured); - isRemoteStateRepoConfigured = Boolean.FALSE; - isRemoteRoutingTableRepoConfigured = Boolean.FALSE; - RepositoriesService repositoriesService = internalCluster().getInstance(RepositoriesService.class, randomNode); - try { - Repository remoteStateRepo = repositoriesService.repository(remoteStateRepoName); - if (Objects.nonNull(remoteStateRepo)) { - isRemoteStateRepoConfigured = Boolean.TRUE; - } - } catch (RepositoryMissingException e) { - isRemoteStateRepoConfigured = Boolean.FALSE; - } + isRemoteStateRepoConfigured = isRepoPresentInRepositoryService(repositoriesService, remoteStateRepoName); + isRemoteRoutingTableRepoConfigured = isRepoPresentInRepositoryService(repositoriesService, remoteRoutingTableRepoName); + + Assert.assertTrue("RemoteState Repo is not set in RepositoryService", isRemoteStateRepoConfigured); + Assert.assertTrue("RemoteRoutingTable Repo is not set in RepositoryService", isRemoteRoutingTableRepoConfigured); + + logger.info("Stopping current Cluster Manager"); + } + private Boolean isRepoPresentInRepositoryService(RepositoriesService repositoriesService, String repoName) { try { - Repository routingTableRepo = repositoriesService.repository(remoteRoutingTableRepoName); - if (Objects.nonNull(routingTableRepo)) { - isRemoteRoutingTableRepoConfigured = Boolean.TRUE; + Repository remoteStateRepo = repositoriesService.repository(repoName); + if (Objects.nonNull(remoteStateRepo)) { + return Boolean.TRUE; } } catch (RepositoryMissingException e) { - isRemoteRoutingTableRepoConfigured = Boolean.FALSE; + return Boolean.FALSE; } - Assert.assertTrue("RemoteState Repo is not set in RepositoryService", isRemoteStateRepoConfigured); - Assert.assertTrue("RemoteRoutingTable Repo is not set in RepositoryService", isRemoteRoutingTableRepoConfigured); - - logger.info("Stopping current Cluster Manager"); + return Boolean.FALSE; } } diff --git a/server/src/main/java/org/opensearch/node/remotestore/RemoteStoreNodeService.java b/server/src/main/java/org/opensearch/node/remotestore/RemoteStoreNodeService.java index da94c8a7836b3..fb97cf40d90d6 100644 --- a/server/src/main/java/org/opensearch/node/remotestore/RemoteStoreNodeService.java +++ b/server/src/main/java/org/opensearch/node/remotestore/RemoteStoreNodeService.java @@ -186,11 +186,9 @@ public RepositoriesMetadata updateRepositoriesMetadata(DiscoveryNode joiningNode if (newRepositoryMetadata.name().equals(existingRepositoryMetadata.name())) { try { // This is to handle cases where-in the during a previous node-join attempt if the publish operation succeeded - // but - // the commit operation failed, the cluster-state may have the repository metadata which is not applied into the - // repository service. This may lead to assertion failures down the line. - String repositoryName = newRepositoryMetadata.name(); - repositoriesService.get().repository(repositoryName); + // but the commit operation failed, the cluster-state may have the repository metadata which is not applied + // into the repository service. This may lead to assertion failures down the line. + repositoriesService.get().repository(newRepositoryMetadata.name()); } catch (RepositoryMissingException e) { logger.warn( "Skipping repositories metadata checks: Remote repository [{}] is in the cluster state but not present " @@ -199,6 +197,7 @@ public RepositoriesMetadata updateRepositoriesMetadata(DiscoveryNode joiningNode ); break; } + try { // This will help in handling two scenarios - // 1. When a fresh cluster is formed and a node tries to join the cluster, the repository diff --git a/test/framework/src/main/java/org/opensearch/test/InternalTestCluster.java b/test/framework/src/main/java/org/opensearch/test/InternalTestCluster.java index 14f0a30ef3f84..7b2c653e9bdb2 100644 --- a/test/framework/src/main/java/org/opensearch/test/InternalTestCluster.java +++ b/test/framework/src/main/java/org/opensearch/test/InternalTestCluster.java @@ -2325,8 +2325,8 @@ public List startNodes(int numOfNodes, Settings settings) { /** * Starts multiple nodes with the given settings and returns their names */ - public List startNodes(int numOfNodes, Settings settings, Boolean ignoreNodeJoin) { - return startNodes(ignoreNodeJoin, Collections.nCopies(numOfNodes, settings).toArray(new Settings[0])); + public List startNodes(int numOfNodes, Settings settings, Boolean waitForNodeJoin) { + return startNodes(waitForNodeJoin, Collections.nCopies(numOfNodes, settings).toArray(new Settings[0])); } /** @@ -2339,7 +2339,7 @@ public synchronized List startNodes(Settings... extraSettings) { /** * Starts multiple nodes with the given settings and returns their names */ - public synchronized List startNodes(Boolean ignoreNodeJoin, Settings... extraSettings) { + public synchronized List startNodes(Boolean waitForNodeJoin, Settings... extraSettings) { final int newClusterManagerCount = Math.toIntExact(Stream.of(extraSettings).filter(DiscoveryNode::isClusterManagerNode).count()); final int defaultMinClusterManagerNodes; if (autoManageClusterManagerNodes) { @@ -2391,7 +2391,7 @@ public synchronized List startNodes(Boolean ignoreNodeJoin, Settings... nodes.add(nodeAndClient); } startAndPublishNodesAndClients(nodes); - if (autoManageClusterManagerNodes && !ignoreNodeJoin) { + if (autoManageClusterManagerNodes && !waitForNodeJoin) { validateClusterFormed(); } return nodes.stream().map(NodeAndClient::getName).collect(Collectors.toList()); From 9f8753f555d960d47e2ae4d19ff0fc770ccb860a Mon Sep 17 00:00:00 2001 From: Pranshu Shukla Date: Mon, 9 Dec 2024 12:16:13 +0530 Subject: [PATCH 08/11] update test Signed-off-by: Pranshu Shukla --- .../opensearch/discovery/DiscoveryDisruptionIT.java | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/server/src/internalClusterTest/java/org/opensearch/discovery/DiscoveryDisruptionIT.java b/server/src/internalClusterTest/java/org/opensearch/discovery/DiscoveryDisruptionIT.java index ed602a2cf4358..377f99cd8b791 100644 --- a/server/src/internalClusterTest/java/org/opensearch/discovery/DiscoveryDisruptionIT.java +++ b/server/src/internalClusterTest/java/org/opensearch/discovery/DiscoveryDisruptionIT.java @@ -332,8 +332,18 @@ public void testElectClusterManagerRemotePublicationConfigurationNodeJoinCommitF isRemoteRoutingTableRepoConfigured = Boolean.TRUE; } } + // Asserting that the metadata is present in the persisted cluster-state assertTrue(isRemoteStateRepoConfigured); assertTrue(isRemoteRoutingTableRepoConfigured); + + RepositoriesService repositoriesService = internalCluster().getInstance(RepositoriesService.class, randomNode); + + isRemoteStateRepoConfigured = isRepoPresentInRepositoryService(repositoriesService, remoteStateRepoName); + isRemoteRoutingTableRepoConfigured = isRepoPresentInRepositoryService(repositoriesService, remoteRoutingTableRepoName); + + // Asserting that the metadata is not present in the repository service. + Assert.assertFalse(isRemoteStateRepoConfigured); + Assert.assertFalse(isRemoteRoutingTableRepoConfigured); }); logger.info("Stopping current Cluster Manager"); From dfb56d8ddf9ec4db19721a23462e5c9ad434bb3e Mon Sep 17 00:00:00 2001 From: Pranshu Shukla Date: Mon, 9 Dec 2024 15:16:58 +0530 Subject: [PATCH 09/11] Adding entry in CHANGELOG.md Signed-off-by: Pranshu Shukla --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 3c7c7eb7c5e8b..de177d4f00511 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -64,6 +64,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), - [Tiered Caching] Fix bug in cache stats API ([#16560](https://github.com/opensearch-project/OpenSearch/pull/16560)) - Bound the size of cache in deprecation logger ([16702](https://github.com/opensearch-project/OpenSearch/issues/16702)) - Ensure consistency of system flag on IndexMetadata after diff is applied ([#16644](https://github.com/opensearch-project/OpenSearch/pull/16644)) +- Skip remote-repositories validations for node-joins when RepositoriesService is not in sync with cluster-state ([#16763](https://github.com/opensearch-project/OpenSearch/pull/16763)) ### Security From e89ea10d113394197554463c18b799f18e084056 Mon Sep 17 00:00:00 2001 From: Pranshu Shukla Date: Mon, 9 Dec 2024 16:26:24 +0530 Subject: [PATCH 10/11] Retry Build Signed-off-by: Pranshu Shukla From de0b01a8e2cb19457db897b093f8edb7a83dfad5 Mon Sep 17 00:00:00 2001 From: Pranshu Shukla Date: Mon, 9 Dec 2024 17:27:06 +0530 Subject: [PATCH 11/11] Retry Build Signed-off-by: Pranshu Shukla