Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

revive remote cluster state auto restore integ tests #10503

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -353,7 +353,13 @@ public void assertRemoteStoreRepositoryOnAllNodes(String repositoryName) {
// Validated that all the restricted settings are entact on all the nodes.
repository.getRestrictedSystemRepositorySettings()
.stream()
.forEach(setting -> assertEquals(setting.get(actualRepository.settings()), setting.get(expectedRepository.settings())));
.forEach(
setting -> assertEquals(
String.format(Locale.ROOT, "Restricted Settings mismatch [%s]", setting.getKey()),
setting.get(actualRepository.settings()),
setting.get(expectedRepository.settings())
)
);
}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,23 +8,22 @@

package org.opensearch.remotestore;

import org.opensearch.action.admin.cluster.remotestore.restore.RestoreRemoteStoreResponse;
import org.opensearch.action.admin.cluster.settings.ClusterUpdateSettingsRequest;
import org.opensearch.action.support.PlainActionFuture;
import org.opensearch.cluster.ClusterState;
import org.opensearch.cluster.metadata.IndexMetadata;
import org.opensearch.cluster.metadata.Metadata;
import org.opensearch.common.settings.Settings;
import org.opensearch.gateway.remote.ClusterMetadataManifest;
import org.opensearch.gateway.remote.ClusterMetadataManifest.UploadedIndexMetadata;
import org.opensearch.gateway.remote.RemoteClusterStateService;
import org.opensearch.test.OpenSearchIntegTestCase;

import java.io.IOException;
import java.nio.file.Files;
import java.util.Locale;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import java.util.concurrent.ExecutionException;

import static org.opensearch.gateway.remote.RemoteClusterStateService.REMOTE_CLUSTER_STATE_ENABLED_SETTING;
import static org.opensearch.indices.ShardLimitValidator.SETTING_CLUSTER_MAX_SHARDS_PER_NODE;
import static org.opensearch.indices.ShardLimitValidator.SETTING_MAX_SHARDS_PER_CLUSTER_KEY;

@OpenSearchIntegTestCase.ClusterScope(scope = OpenSearchIntegTestCase.Scope.TEST, numDataNodes = 0)
public class RemoteStoreClusterStateRestoreIT extends BaseRemoteStoreRestoreIT {
Expand All @@ -48,47 +47,10 @@ private Map<String, Long> initialTestSetup(int shardCount, int replicaCount, int

private void resetCluster(int dataNodeCount, int clusterManagerNodeCount) {
internalCluster().stopAllNodes();
addNewNodes(dataNodeCount, clusterManagerNodeCount);
internalCluster().startClusterManagerOnlyNodes(clusterManagerNodeCount);
internalCluster().startDataOnlyNodes(dataNodeCount);
}

private void restoreAndValidate(String clusterUUID, Map<String, Long> indexStats) throws Exception {
restoreAndValidate(clusterUUID, indexStats, true);
}

private void restoreAndValidate(String clusterUUID, Map<String, Long> indexStats, boolean validate) throws Exception {
// TODO once auto restore is merged, the remote cluster state will be restored

if (validate) {
// Step - 4 validation restore is successful.
ensureGreen(INDEX_NAME);
verifyRestoredData(indexStats, INDEX_NAME);
}
}

private void restoreAndValidateFails(
String clusterUUID,
PlainActionFuture<RestoreRemoteStoreResponse> actionListener,
Class<? extends Throwable> clazz,
String errorSubString
) {

try {
restoreAndValidate(clusterUUID, null, false);
} catch (Exception e) {
assertTrue(
String.format(Locale.ROOT, "%s %s", clazz, e),
clazz.isAssignableFrom(e.getClass())
|| clazz.isAssignableFrom(e.getCause().getClass())
|| (e.getCause().getCause() != null && clazz.isAssignableFrom(e.getCause().getCause().getClass()))
);
assertTrue(
String.format(Locale.ROOT, "Error message mismatch. Expected: [%s]. Actual: [%s]", errorSubString, e.getMessage()),
e.getMessage().contains(errorSubString)
);
}
}

@AwaitsFix(bugUrl = "https://github.com/opensearch-project/OpenSearch/issues/9834")
public void testFullClusterRestore() throws Exception {
int shardCount = randomIntBetween(1, 2);
int replicaCount = 1;
Expand All @@ -106,10 +68,10 @@ public void testFullClusterRestore() throws Exception {
assert !Objects.equals(newClusterUUID, prevClusterUUID) : "cluster restart not successful. cluster uuid is same";

// Step - 3 Trigger full cluster restore and validate
restoreAndValidate(prevClusterUUID, indexStats);
validateMetadata(List.of(INDEX_NAME));
verifyRestoredData(indexStats, INDEX_NAME);
}

@AwaitsFix(bugUrl = "https://github.com/opensearch-project/OpenSearch/issues/9834")
public void testFullClusterRestoreMultipleIndices() throws Exception {
int shardCount = randomIntBetween(1, 2);
int replicaCount = 1;
Expand All @@ -134,155 +96,100 @@ public void testFullClusterRestoreMultipleIndices() throws Exception {
assert !Objects.equals(newClusterUUID, prevClusterUUID) : "cluster restart not successful. cluster uuid is same";

// Step - 3 Trigger full cluster restore
restoreAndValidate(prevClusterUUID, indexStats);
ensureGreen(secondIndexName);
verifyRestoredData(indexStats2, secondIndexName);
validateMetadata(List.of(INDEX_NAME, secondIndexName));
verifyRestoredData(indexStats, INDEX_NAME);
}

@AwaitsFix(bugUrl = "https://github.com/opensearch-project/OpenSearch/issues/9834")
public void testFullClusterRestoreFailureValidationFailures() throws Exception {
linuxpi marked this conversation as resolved.
Show resolved Hide resolved
public void testFullClusterRestoreManifestFilePointsToInvalidIndexMetadataPathThrowsException() throws Exception {
int shardCount = randomIntBetween(1, 2);
int replicaCount = 1;
int dataNodeCount = shardCount * (replicaCount + 1);
int clusterManagerNodeCount = 1;

// index some data to generate files in remote directory
Map<String, Long> indexStats = initialTestSetup(shardCount, replicaCount, dataNodeCount, clusterManagerNodeCount);
String prevClusterUUID = clusterService().state().metadata().clusterUUID();

// Start of Test - 1
// Test - 1 Trigger full cluster restore and validate it fails due to incorrect cluster UUID
PlainActionFuture<RestoreRemoteStoreResponse> future = PlainActionFuture.newFuture();
restoreAndValidateFails("randomUUID", future, IllegalStateException.class, "Remote Cluster State not found - randomUUID");
// End of Test - 1
// Step - 1 index some data to generate files in remote directory
initialTestSetup(shardCount, replicaCount, dataNodeCount, clusterManagerNodeCount);

// Start of Test - 3
// Test - 2 Trigger full cluster restore and validate it fails due to cluster UUID same as current cluster UUID
future = PlainActionFuture.newFuture();
restoreAndValidateFails(
clusterService().state().metadata().clusterUUID(),
future,
IllegalArgumentException.class,
"clusterUUID to restore from should be different from current cluster UUID"
);
// End of Test - 2
String prevClusterUUID = clusterService().state().metadata().clusterUUID();
String clusterName = clusterService().state().getClusterName().value();

// Start of Test - 3
// Step - 2 Replace all nodes in the cluster with new nodes. This ensures new cluster state doesn't have previous index metadata
// Restarting cluster with just 1 data node helps with applying cluster settings
resetCluster(1, clusterManagerNodeCount);
String newClusterUUID = clusterService().state().metadata().clusterUUID();
assert !Objects.equals(newClusterUUID, prevClusterUUID) : "cluster restart not successful. cluster uuid is same";

reduceShardLimits(1, 1);

// Step - 4 Trigger full cluster restore and validate it fails
future = PlainActionFuture.newFuture();
restoreAndValidateFails(
prevClusterUUID,
future,
IllegalArgumentException.class,
"this action would add [2] total shards, but this cluster currently has [0]/[1] maximum shards open"
);
resetShardLimits();
// End of Test - 3

// Start of Test - 4
// Test -4 Reset cluster and trigger full restore with same name index in the cluster
// Test -4 Add required nodes for this test after last reset.
addNewNodes(dataNodeCount - 1, 0);

newClusterUUID = clusterService().state().metadata().clusterUUID();
assert !Objects.equals(newClusterUUID, prevClusterUUID) : "cluster restart not successful. cluster uuid is same";

// Test -4 Step - 2 Create a new index with same name
createIndex(INDEX_NAME, remoteStoreIndexSettings(0, 1));
ensureYellowAndNoInitializingShards(INDEX_NAME);
ensureGreen(INDEX_NAME);

future = PlainActionFuture.newFuture();

// Test -4 Step - 3 Trigger full cluster restore and validate fails
restoreAndValidateFails(
prevClusterUUID,
future,
IllegalStateException.class,
"cannot restore index [remote-store-test-idx-1] because an open index with same name/uuid already exists in the cluster"
);
internalCluster().stopAllNodes();
// Step - 3 Delete index metadata file in remote
try {
Files.move(
segmentRepoPath.resolve(
RemoteClusterStateService.encodeString(clusterName) + "/cluster-state/" + prevClusterUUID + "/index"
),
segmentRepoPath.resolve("cluster-state/")
);
} catch (IOException e) {
throw new RuntimeException(e);
}
assertThrows(IllegalStateException.class, () -> addNewNodes(dataNodeCount, clusterManagerNodeCount));
// Test is complete

// Test -4 Step - 4 validation restore is successful.
ensureGreen(INDEX_NAME);
// End of Test - 4
// Starting a node without remote state to ensure test cleanup
internalCluster().startNode(Settings.builder().put(REMOTE_CLUSTER_STATE_ENABLED_SETTING.getKey(), false).build());
}

@AwaitsFix(bugUrl = "https://github.com/opensearch-project/OpenSearch/issues/9834")
public void testFullClusterRestoreManifestFilePointsToInvalidIndexMetadataPathThrowsException() throws Exception {
public void testRemoteStateFullRestart() throws Exception {
int shardCount = randomIntBetween(1, 2);
int replicaCount = 1;
int dataNodeCount = shardCount * (replicaCount + 1);
int clusterManagerNodeCount = 1;

// Step - 1 index some data to generate files in remote directory
initialTestSetup(shardCount, replicaCount, dataNodeCount, clusterManagerNodeCount);
int clusterManagerNodeCount = 3;

Map<String, Long> indexStats = initialTestSetup(shardCount, replicaCount, dataNodeCount, clusterManagerNodeCount);
String prevClusterUUID = clusterService().state().metadata().clusterUUID();

// Step - 2 Replace all nodes in the cluster with new nodes. This ensures new cluster state doesn't have previous index metadata
resetCluster(dataNodeCount, clusterManagerNodeCount);

String newClusterUUID = clusterService().state().metadata().clusterUUID();
assert !Objects.equals(newClusterUUID, prevClusterUUID) : "cluster restart not successful. cluster uuid is same";

// Step - 4 Delete index metadata file in remote
// Delete index metadata file in remote
try {
Files.move(
segmentRepoPath.resolve(
RemoteClusterStateService.encodeString(clusterService().state().getClusterName().value())
+ "/cluster-state/"
+ prevClusterUUID
+ "/index"
+ "/manifest"
),
segmentRepoPath.resolve("cluster-state/")
);
} catch (IOException e) {
throw new RuntimeException(e);
}

// Step - 5 Trigger full cluster restore and validate fails
PlainActionFuture<RestoreRemoteStoreResponse> future = PlainActionFuture.newFuture();
restoreAndValidateFails(prevClusterUUID, future, IllegalStateException.class, "asdsa");
internalCluster().fullRestart();
ensureGreen(INDEX_NAME);
String newClusterUUID = clusterService().state().metadata().clusterUUID();
assert Objects.equals(newClusterUUID, prevClusterUUID) : "Full restart not successful. cluster uuid has changed";
validateCurrentMetadata();
verifyRestoredData(indexStats, INDEX_NAME);
linuxpi marked this conversation as resolved.
Show resolved Hide resolved
}

private void reduceShardLimits(int maxShardsPerNode, int maxShardsPerCluster) {
// Step 3 - Reduce shard limits to hit shard limit with less no of shards
try {
client().admin()
.cluster()
.updateSettings(
new ClusterUpdateSettingsRequest().transientSettings(
Settings.builder()
.put(SETTING_CLUSTER_MAX_SHARDS_PER_NODE.getKey(), maxShardsPerNode)
.put(SETTING_MAX_SHARDS_PER_CLUSTER_KEY, maxShardsPerCluster)
)
)
.get();
} catch (InterruptedException | ExecutionException e) {
throw new RuntimeException(e);
private void validateMetadata(List<String> indexNames) {
assertEquals(clusterService().state().metadata().indices().size(), indexNames.size());
for (String indexName : indexNames) {
assertTrue(clusterService().state().metadata().hasIndex(indexName));
}
}

private void resetShardLimits() {
// Step - 5 Reset the cluster settings
ClusterUpdateSettingsRequest resetRequest = new ClusterUpdateSettingsRequest();
resetRequest.transientSettings(
Settings.builder().putNull(SETTING_CLUSTER_MAX_SHARDS_PER_NODE.getKey()).putNull(SETTING_MAX_SHARDS_PER_CLUSTER_KEY)
private void validateCurrentMetadata() throws Exception {
RemoteClusterStateService remoteClusterStateService = internalCluster().getInstance(
RemoteClusterStateService.class,
internalCluster().getClusterManagerName()
);

try {
client().admin().cluster().updateSettings(resetRequest).get();
} catch (InterruptedException | ExecutionException e) {
throw new RuntimeException(e);
}
assertBusy(() -> {
ClusterMetadataManifest manifest = remoteClusterStateService.getLatestClusterMetadataManifest(
getClusterState().getClusterName().value(),
getClusterState().metadata().clusterUUID()
).get();
ClusterState clusterState = getClusterState();
Metadata currentMetadata = clusterState.metadata();
assertEquals(currentMetadata.indices().size(), manifest.getIndices().size());
assertEquals(currentMetadata.coordinationMetadata().term(), manifest.getClusterTerm());
assertEquals(clusterState.version(), manifest.getStateVersion());
assertEquals(clusterState.stateUUID(), manifest.getStateUUID());
assertEquals(currentMetadata.clusterUUIDCommitted(), manifest.isClusterUUIDCommitted());
for (UploadedIndexMetadata uploadedIndexMetadata : manifest.getIndices()) {
IndexMetadata currentIndexMetadata = currentMetadata.index(uploadedIndexMetadata.getIndexName());
assertEquals(currentIndexMetadata.getIndex().getUUID(), uploadedIndexMetadata.getIndexUUID());
}
});
}

}
Original file line number Diff line number Diff line change
Expand Up @@ -1853,10 +1853,12 @@ public synchronized void stopRandomNodeNotCurrentMaster() throws IOException {
*/
public void stopAllNodes() {
try {
int totalDataNodes = numDataNodes();
while (totalDataNodes > 0) {
stopRandomDataNode();
totalDataNodes -= 1;
if (numDataAndClusterManagerNodes() != numClusterManagerNodes()) {
int totalDataNodes = numDataNodes();
while (totalDataNodes > 0) {
stopRandomDataNode();
totalDataNodes -= 1;
}
}
int totalClusterManagerNodes = numClusterManagerNodes();
while (totalClusterManagerNodes > 1) {
Expand Down