Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Backport 2.x] Fix flakiness with SegmentReplicationSuiteIT #13180

Merged
merged 1 commit into from
Apr 16, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -8,15 +8,13 @@

package org.opensearch.indices.replication;

import org.apache.lucene.tests.util.LuceneTestCase;
import org.opensearch.action.admin.indices.delete.DeleteIndexRequest;
import org.opensearch.cluster.metadata.IndexMetadata;
import org.opensearch.common.settings.Settings;
import org.opensearch.indices.replication.common.ReplicationType;
import org.opensearch.test.OpenSearchIntegTestCase;
import org.junit.Before;

@LuceneTestCase.AwaitsFix(bugUrl = "https://github.com/opensearch-project/OpenSearch/issues/9499")
@OpenSearchIntegTestCase.ClusterScope(scope = OpenSearchIntegTestCase.Scope.SUITE, minNumDataNodes = 2)
public class SegmentReplicationSuiteIT extends SegmentReplicationBaseIT {

Expand Down Expand Up @@ -64,6 +62,7 @@ public void testDropRandomNodeDuringReplication() throws Exception {
ensureYellow(INDEX_NAME);
client().prepareIndex(INDEX_NAME).setId(Integer.toString(docCount)).setSource("field", "value" + docCount).execute().get();
internalCluster().startDataOnlyNode();
ensureGreen(INDEX_NAME);
client().admin().indices().delete(new DeleteIndexRequest(INDEX_NAME)).actionGet();
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,12 @@ public CheckpointInfoResponse(
this.infosBytes = infosBytes;
}

public CheckpointInfoResponse(final ReplicationCheckpoint checkpoint, final byte[] infosBytes) {
this.checkpoint = checkpoint;
this.infosBytes = infosBytes;
this.metadataMap = checkpoint.getMetadataMap();
}

public CheckpointInfoResponse(StreamInput in) throws IOException {
this.checkpoint = new ReplicationCheckpoint(in);
this.metadataMap = in.readMap(StreamInput::readString, StoreFileMetadata::new);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,12 +21,10 @@
import org.opensearch.indices.IndicesService;
import org.opensearch.indices.recovery.FileChunkWriter;
import org.opensearch.indices.recovery.RecoverySettings;
import org.opensearch.indices.replication.checkpoint.ReplicationCheckpoint;
import org.opensearch.indices.replication.common.CopyState;

import java.io.IOException;
import java.io.UncheckedIOException;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
Expand All @@ -36,7 +34,6 @@
/**
* Manages references to ongoing segrep events on a node.
* Each replica will have a new {@link SegmentReplicationSourceHandler} created when starting replication.
* CopyStates will be cached for reuse between replicas and only released when all replicas have finished copying segments.
*
* @opensearch.internal
*/
Expand All @@ -45,7 +42,6 @@ class OngoingSegmentReplications {
private static final Logger logger = LogManager.getLogger(OngoingSegmentReplications.class);
private final RecoverySettings recoverySettings;
private final IndicesService indicesService;
private final Map<ReplicationCheckpoint, CopyState> copyStateMap;
private final Map<String, SegmentReplicationSourceHandler> allocationIdToHandlers;

/**
Expand All @@ -57,46 +53,9 @@ class OngoingSegmentReplications {
OngoingSegmentReplications(IndicesService indicesService, RecoverySettings recoverySettings) {
this.indicesService = indicesService;
this.recoverySettings = recoverySettings;
this.copyStateMap = Collections.synchronizedMap(new HashMap<>());
this.allocationIdToHandlers = ConcurrentCollections.newConcurrentMap();
}

/*
Operations on the {@link #copyStateMap} member.
*/

/**
* A synchronized method that checks {@link #copyStateMap} for the given {@link ReplicationCheckpoint} key
* and returns the cached value if one is present. If the key is not present, a {@link CopyState}
* object is constructed and stored in the map before being returned.
*/
synchronized CopyState getCachedCopyState(ReplicationCheckpoint checkpoint) throws IOException {
if (isInCopyStateMap(checkpoint)) {
final CopyState copyState = fetchFromCopyStateMap(checkpoint);
// we incref the copyState for every replica that is using this checkpoint.
// decref will happen when copy completes.
copyState.incRef();
return copyState;
} else {
// From the checkpoint's shard ID, fetch the IndexShard
ShardId shardId = checkpoint.getShardId();
final IndexService indexService = indicesService.indexServiceSafe(shardId.getIndex());
final IndexShard indexShard = indexService.getShard(shardId.id());
// build the CopyState object and cache it before returning
final CopyState copyState = new CopyState(checkpoint, indexShard);

/*
Use the checkpoint from the request as the key in the map, rather than
the checkpoint from the created CopyState. This maximizes cache hits
if replication targets make a request with an older checkpoint.
Replication targets are expected to fetch the checkpoint in the response
CopyState to bring themselves up to date.
*/
addToCopyStateMap(checkpoint, copyState);
return copyState;
}
}

/**
* Start sending files to the replica.
*
Expand All @@ -114,51 +73,43 @@ void startSegmentCopy(GetSegmentFilesRequest request, ActionListener<GetSegmentF
);
}
// update the given listener to release the CopyState before it resolves.
final ActionListener<GetSegmentFilesResponse> wrappedListener = ActionListener.runBefore(listener, () -> {
final SegmentReplicationSourceHandler sourceHandler = allocationIdToHandlers.remove(request.getTargetAllocationId());
if (sourceHandler != null) {
removeCopyState(sourceHandler.getCopyState());
}
});
final ActionListener<GetSegmentFilesResponse> wrappedListener = ActionListener.runBefore(
listener,
() -> allocationIdToHandlers.remove(request.getTargetAllocationId())
);
handler.sendFiles(request, wrappedListener);
} else {
listener.onResponse(new GetSegmentFilesResponse(Collections.emptyList()));
}
}

/**
* Prepare for a Replication event. This method constructs a {@link CopyState} holding files to be sent off of the current
* node's store. This state is intended to be sent back to Replicas before copy is initiated so the replica can perform a diff against its
* local store. It will then build a handler to orchestrate the segment copy that will be stored locally and started on a subsequent request from replicas
* with the list of required files.
* Prepare for a Replication event. This method constructs a {@link SegmentReplicationSourceHandler} that orchestrates segment copy and
* will internally incref files for copy.
*
* @param request {@link CheckpointInfoRequest}
* @param fileChunkWriter {@link FileChunkWriter} writer to handle sending files over the transport layer.
* @return {@link CopyState} the built CopyState for this replication event.
* @throws IOException - When there is an IO error building CopyState.
* @return {@link SegmentReplicationSourceHandler} the built CopyState for this replication event.
*/
CopyState prepareForReplication(CheckpointInfoRequest request, FileChunkWriter fileChunkWriter) throws IOException {
final CopyState copyState = getCachedCopyState(request.getCheckpoint());
final SegmentReplicationSourceHandler newHandler = createTargetHandler(
request.getTargetNode(),
copyState,
request.getTargetAllocationId(),
fileChunkWriter
);
final SegmentReplicationSourceHandler existingHandler = allocationIdToHandlers.putIfAbsent(
request.getTargetAllocationId(),
newHandler
);
// If we are already replicating to this allocation Id, cancel the old and replace with a new execution.
// This will clear the old handler & referenced copy state holding an incref'd indexCommit.
if (existingHandler != null) {
logger.warn("Override handler for allocation id {}", request.getTargetAllocationId());
cancelHandlers(handler -> handler.getAllocationId().equals(request.getTargetAllocationId()), "cancel due to retry");
assert allocationIdToHandlers.containsKey(request.getTargetAllocationId()) == false;
allocationIdToHandlers.put(request.getTargetAllocationId(), newHandler);
}
assert allocationIdToHandlers.containsKey(request.getTargetAllocationId());
return copyState;
SegmentReplicationSourceHandler prepareForReplication(CheckpointInfoRequest request, FileChunkWriter fileChunkWriter) {
return allocationIdToHandlers.computeIfAbsent(request.getTargetAllocationId(), aId -> {
try {
// From the checkpoint's shard ID, fetch the IndexShard
final ShardId shardId = request.getCheckpoint().getShardId();
final IndexService indexService = indicesService.indexServiceSafe(shardId.getIndex());
final IndexShard indexShard = indexService.getShard(shardId.id());
return new SegmentReplicationSourceHandler(
request.getTargetNode(),
fileChunkWriter,
indexShard,
request.getTargetAllocationId(),
Math.toIntExact(recoverySettings.getChunkSize().getBytes()),
recoverySettings.getMaxConcurrentFileChunks()
);
} catch (IOException e) {
throw new UncheckedIOException("Error creating replication handler", e);
}
});
}

/**
Expand All @@ -167,8 +118,8 @@ CopyState prepareForReplication(CheckpointInfoRequest request, FileChunkWriter f
* @param shard {@link IndexShard}
* @param reason {@link String} - Reason for the cancel
*/
synchronized void cancel(IndexShard shard, String reason) {
cancelHandlers(handler -> handler.getCopyState().getShard().shardId().equals(shard.shardId()), reason);
void cancel(IndexShard shard, String reason) {
cancelHandlers(handler -> handler.shardId().equals(shard.shardId()), reason);
}

/**
Expand All @@ -177,11 +128,10 @@ synchronized void cancel(IndexShard shard, String reason) {
* @param allocationId {@link String} - Allocation ID.
* @param reason {@link String} - Reason for the cancel
*/
synchronized void cancel(String allocationId, String reason) {
void cancel(String allocationId, String reason) {
final SegmentReplicationSourceHandler handler = allocationIdToHandlers.remove(allocationId);
if (handler != null) {
handler.cancel(reason);
removeCopyState(handler.getCopyState());
}
}

Expand All @@ -194,14 +144,6 @@ void cancelReplication(DiscoveryNode node) {
cancelHandlers(handler -> handler.getTargetNode().equals(node), "Node left");
}

/**
* Checks if the {@link #copyStateMap} has the input {@link ReplicationCheckpoint}
* as a key by invoking {@link Map#containsKey(Object)}.
*/
boolean isInCopyStateMap(ReplicationCheckpoint replicationCheckpoint) {
return copyStateMap.containsKey(replicationCheckpoint);
}

int size() {
return allocationIdToHandlers.size();
}
Expand All @@ -211,58 +153,20 @@ Map<String, SegmentReplicationSourceHandler> getHandlers() {
return allocationIdToHandlers;
}

int cachedCopyStateSize() {
return copyStateMap.size();
}

private SegmentReplicationSourceHandler createTargetHandler(
DiscoveryNode node,
CopyState copyState,
String allocationId,
FileChunkWriter fileChunkWriter
) {
return new SegmentReplicationSourceHandler(
node,
fileChunkWriter,
copyState.getShard().getThreadPool(),
copyState,
allocationId,
Math.toIntExact(recoverySettings.getChunkSize().getBytes()),
recoverySettings.getMaxConcurrentFileChunks()
);
}

/**
* Adds the input {@link CopyState} object to {@link #copyStateMap}.
* The key is the CopyState's {@link ReplicationCheckpoint} object.
*/
private void addToCopyStateMap(ReplicationCheckpoint checkpoint, CopyState copyState) {
copyStateMap.putIfAbsent(checkpoint, copyState);
}

/**
* Given a {@link ReplicationCheckpoint}, return the corresponding
* {@link CopyState} object, if any, from {@link #copyStateMap}.
*/
private CopyState fetchFromCopyStateMap(ReplicationCheckpoint replicationCheckpoint) {
return copyStateMap.get(replicationCheckpoint);
}

/**
* Remove a CopyState. Intended to be called after a replication event completes.
* This method will remove a copyState from the copyStateMap only if its refCount hits 0.
*
* @param copyState {@link CopyState}
* Clear handlers for any allocationIds not in sync.
* @param shardId {@link ShardId}
* @param inSyncAllocationIds {@link List} of in-sync allocation Ids.
*/
private synchronized void removeCopyState(CopyState copyState) {
if (copyState.decRef() == true) {
copyStateMap.remove(copyState.getRequestedReplicationCheckpoint());
}
void clearOutOfSyncIds(ShardId shardId, Set<String> inSyncAllocationIds) {
cancelHandlers(
(handler) -> handler.shardId().equals(shardId) && inSyncAllocationIds.contains(handler.getAllocationId()) == false,
"Shard is no longer in-sync with the primary"
);
}

/**
* Remove handlers from allocationIdToHandlers map based on a filter predicate.
* This will also decref the handler's CopyState reference.
*/
private void cancelHandlers(Predicate<? super SegmentReplicationSourceHandler> predicate, String reason) {
final List<String> allocationIds = allocationIdToHandlers.values()
Expand All @@ -278,17 +182,4 @@ private void cancelHandlers(Predicate<? super SegmentReplicationSourceHandler> p
cancel(allocationId, reason);
}
}

/**
* Clear copystate and target handlers for any non insync allocationIds.
* @param shardId {@link ShardId}
* @param inSyncAllocationIds {@link List} of in-sync allocation Ids.
*/
public void clearOutOfSyncIds(ShardId shardId, Set<String> inSyncAllocationIds) {
cancelHandlers(
(handler) -> handler.getCopyState().getShard().shardId().equals(shardId)
&& inSyncAllocationIds.contains(handler.getAllocationId()) == false,
"Shard is no longer in-sync with the primary"
);
}
}
Loading
Loading