Skip to content

Commit

Permalink
Replace multipart download with parallel file download
Browse files Browse the repository at this point in the history
There are a few open issues with the multi-stream download approach:
 - Recovery stats are not being reported correctly
 - It is incompatible (short of reopening and re-reading the entire
   file) with the existing Lucene checksum validation logic
 - There are some issues with integrating it with the pending client
   side encryption work

Given this, I attempted an experiment where I replaced with
multi-stream-within-a-single-file approach with simply parallelizing
downloads across files (this is how snapshot restore works). I actually
got better results with this approach: recovering a ~52GiB shard took
about 4.7 minutes with the multi-stream code versus 3.9 minutes with the
parallel file approach (r7g.4xlarge EC2 instance, 500MiB/s EBS volume,
S3 as remote repository).

I think this is the right approach as it leverages the more
battle-tested code path and addresses the three issues listed above. The
multi-stream approach still has promise as it will allow us to download
very large files faster (whereas this approach they can be the long poll
on the transfer operation). However, given that 5GB segments (made up of
multiple files in practice) are the norm, we generally aren't dealing with
huge files.

Signed-off-by: Andrew Ross <[email protected]>
  • Loading branch information
andrross committed Oct 10, 2023
1 parent 562e3b2 commit d4cca9a
Show file tree
Hide file tree
Showing 20 changed files with 333 additions and 312 deletions.
7 changes: 5 additions & 2 deletions server/src/main/java/org/opensearch/index/IndexModule.java
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,7 @@
import org.opensearch.indices.IndicesQueryCache;
import org.opensearch.indices.fielddata.cache.IndicesFieldDataCache;
import org.opensearch.indices.mapper.MapperRegistry;
import org.opensearch.indices.recovery.RecoverySettings;
import org.opensearch.indices.recovery.RecoveryState;
import org.opensearch.plugins.IndexStorePlugin;
import org.opensearch.repositories.RepositoriesService;
Expand Down Expand Up @@ -602,7 +603,8 @@ public IndexService newIndexService(
IndexStorePlugin.DirectoryFactory remoteDirectoryFactory,
BiFunction<IndexSettings, ShardRouting, TranslogFactory> translogFactorySupplier,
Supplier<TimeValue> clusterDefaultRefreshIntervalSupplier,
Supplier<TimeValue> clusterRemoteTranslogBufferIntervalSupplier
Supplier<TimeValue> clusterRemoteTranslogBufferIntervalSupplier,
RecoverySettings recoverySettings
) throws IOException {
final IndexEventListener eventListener = freeze();
Function<IndexService, CheckedFunction<DirectoryReader, DirectoryReader, IOException>> readerWrapperFactory = indexReaderWrapper
Expand Down Expand Up @@ -660,7 +662,8 @@ public IndexService newIndexService(
recoveryStateFactory,
translogFactorySupplier,
clusterDefaultRefreshIntervalSupplier,
clusterRemoteTranslogBufferIntervalSupplier
clusterRemoteTranslogBufferIntervalSupplier,
recoverySettings
);
success = true;
return indexService;
Expand Down
9 changes: 6 additions & 3 deletions server/src/main/java/org/opensearch/index/IndexService.java
Original file line number Diff line number Diff line change
Expand Up @@ -89,13 +89,13 @@
import org.opensearch.index.shard.ShardNotInPrimaryModeException;
import org.opensearch.index.shard.ShardPath;
import org.opensearch.index.similarity.SimilarityService;
import org.opensearch.index.store.RemoteSegmentStoreDirectoryFactory;
import org.opensearch.index.store.Store;
import org.opensearch.index.translog.Translog;
import org.opensearch.index.translog.TranslogFactory;
import org.opensearch.indices.cluster.IndicesClusterStateService;
import org.opensearch.indices.fielddata.cache.IndicesFieldDataCache;
import org.opensearch.indices.mapper.MapperRegistry;
import org.opensearch.indices.recovery.RecoverySettings;
import org.opensearch.indices.recovery.RecoveryState;
import org.opensearch.indices.replication.checkpoint.SegmentReplicationCheckpointPublisher;
import org.opensearch.plugins.IndexStorePlugin;
Expand Down Expand Up @@ -179,6 +179,7 @@ public class IndexService extends AbstractIndexComponent implements IndicesClust
private final BiFunction<IndexSettings, ShardRouting, TranslogFactory> translogFactorySupplier;
private final Supplier<TimeValue> clusterDefaultRefreshIntervalSupplier;
private final Supplier<TimeValue> clusterRemoteTranslogBufferIntervalSupplier;
private final RecoverySettings recoverySettings;

public IndexService(
IndexSettings indexSettings,
Expand Down Expand Up @@ -213,7 +214,8 @@ public IndexService(
IndexStorePlugin.RecoveryStateFactory recoveryStateFactory,
BiFunction<IndexSettings, ShardRouting, TranslogFactory> translogFactorySupplier,
Supplier<TimeValue> clusterDefaultRefreshIntervalSupplier,
Supplier<TimeValue> clusterRemoteTranslogBufferIntervalSupplier
Supplier<TimeValue> clusterRemoteTranslogBufferIntervalSupplier,
RecoverySettings recoverySettings
) {
super(indexSettings);
this.allowExpensiveQueries = allowExpensiveQueries;
Expand Down Expand Up @@ -290,6 +292,7 @@ public IndexService(
this.retentionLeaseSyncTask = new AsyncRetentionLeaseSyncTask(this);
this.translogFactorySupplier = translogFactorySupplier;
this.clusterRemoteTranslogBufferIntervalSupplier = clusterRemoteTranslogBufferIntervalSupplier;
this.recoverySettings = recoverySettings;
updateFsyncTaskIfNecessary();
}

Expand Down Expand Up @@ -522,7 +525,7 @@ public synchronized IndexShard createShard(
remoteStoreStatsTrackerFactory,
clusterRemoteTranslogBufferIntervalSupplier,
nodeEnv.nodeId(),
(RemoteSegmentStoreDirectoryFactory) remoteDirectoryFactory
recoverySettings
);
eventListener.indexShardStateChanged(indexShard, null, indexShard.state(), "shard created");
eventListener.afterIndexShardCreated(indexShard);
Expand Down
47 changes: 13 additions & 34 deletions server/src/main/java/org/opensearch/index/shard/IndexShard.java
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,6 @@
import org.opensearch.action.admin.indices.flush.FlushRequest;
import org.opensearch.action.admin.indices.forcemerge.ForceMergeRequest;
import org.opensearch.action.admin.indices.upgrade.post.UpgradeRequest;
import org.opensearch.action.support.PlainActionFuture;
import org.opensearch.action.support.replication.PendingReplicationActions;
import org.opensearch.action.support.replication.ReplicationResponse;
import org.opensearch.cluster.metadata.DataStream;
Expand Down Expand Up @@ -160,9 +159,8 @@
import org.opensearch.index.seqno.SequenceNumbers;
import org.opensearch.index.shard.PrimaryReplicaSyncer.ResyncTask;
import org.opensearch.index.similarity.SimilarityService;
import org.opensearch.index.store.DirectoryFileTransferTracker;
import org.opensearch.index.store.RemoteSegmentStoreDirectory;
import org.opensearch.index.store.RemoteSegmentStoreDirectoryFactory;
import org.opensearch.index.store.RemoteStoreFileDownloader;
import org.opensearch.index.store.Store;
import org.opensearch.index.store.Store.MetadataSnapshot;
import org.opensearch.index.store.StoreFileMetadata;
Expand All @@ -184,6 +182,7 @@
import org.opensearch.indices.recovery.PeerRecoveryTargetService;
import org.opensearch.indices.recovery.RecoveryFailedException;
import org.opensearch.indices.recovery.RecoveryListener;
import org.opensearch.indices.recovery.RecoverySettings;
import org.opensearch.indices.recovery.RecoveryState;
import org.opensearch.indices.recovery.RecoveryTarget;
import org.opensearch.indices.replication.checkpoint.ReplicationCheckpoint;
Expand All @@ -201,7 +200,6 @@
import java.nio.channels.FileChannel;
import java.nio.charset.StandardCharsets;
import java.nio.file.NoSuchFileException;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
Expand Down Expand Up @@ -342,7 +340,7 @@ Runnable getGlobalCheckpointSyncer() {
private final RemoteStoreStatsTrackerFactory remoteStoreStatsTrackerFactory;

private final List<ReferenceManager.RefreshListener> internalRefreshListener = new ArrayList<>();
private final RemoteSegmentStoreDirectoryFactory remoteSegmentStoreDirectoryFactory;
private final RemoteStoreFileDownloader fileDownloader;

public IndexShard(
final ShardRouting shardRouting,
Expand Down Expand Up @@ -371,10 +369,7 @@ public IndexShard(
final RemoteStoreStatsTrackerFactory remoteStoreStatsTrackerFactory,
final Supplier<TimeValue> clusterRemoteTranslogBufferIntervalSupplier,
final String nodeId,
// Wiring a directory factory here breaks some intended abstractions, but this remote directory
// factory is used not as a Lucene directory but instead to copy files from a remote store when
// restoring a shallow snapshot.
@Nullable final RemoteSegmentStoreDirectoryFactory remoteSegmentStoreDirectoryFactory
final RecoverySettings recoverySettings
) throws IOException {
super(shardRouting.shardId(), indexSettings);
assert shardRouting.initializing();
Expand Down Expand Up @@ -470,7 +465,7 @@ public boolean shouldCache(Query query) {
? false
: mapperService.documentMapper().mappers().containsTimeStampField();
this.remoteStoreStatsTrackerFactory = remoteStoreStatsTrackerFactory;
this.remoteSegmentStoreDirectoryFactory = remoteSegmentStoreDirectoryFactory;
this.fileDownloader = new RemoteStoreFileDownloader(shardRouting.shardId(), threadPool, recoverySettings);
}

public ThreadPool getThreadPool() {
Expand Down Expand Up @@ -568,6 +563,10 @@ public String getNodeId() {
return translogConfig.getNodeId();
}

public RemoteStoreFileDownloader getFileDownloader() {
return fileDownloader;
}

@Override
public void updateShardState(
final ShardRouting newRouting,
Expand Down Expand Up @@ -2706,15 +2705,15 @@ public void restoreFromRemoteStore(ActionListener<Boolean> listener) {

public void restoreFromSnapshotAndRemoteStore(
Repository repository,
RemoteSegmentStoreDirectoryFactory remoteSegmentStoreDirectoryFactory,
RepositoriesService repositoriesService,
ActionListener<Boolean> listener
) {
try {
assert shardRouting.primary() : "recover from store only makes sense if the shard is a primary shard";
assert recoveryState.getRecoverySource().getType() == RecoverySource.Type.SNAPSHOT : "invalid recovery type: "
+ recoveryState.getRecoverySource();
StoreRecovery storeRecovery = new StoreRecovery(shardId, logger);
storeRecovery.recoverFromSnapshotAndRemoteStore(this, repository, remoteSegmentStoreDirectoryFactory, listener);
storeRecovery.recoverFromSnapshotAndRemoteStore(this, repository, repositoriesService, listener, threadPool);
} catch (Exception e) {
listener.onFailure(e);
}
Expand Down Expand Up @@ -3554,7 +3553,7 @@ public void startRecovery(
"from snapshot and remote store",
recoveryState,
recoveryListener,
l -> restoreFromSnapshotAndRemoteStore(repositoriesService.repository(repo), remoteSegmentStoreDirectoryFactory, l)
l -> restoreFromSnapshotAndRemoteStore(repositoriesService.repository(repo), repositoriesService, l)
);
// indicesService.indexService(shardRouting.shardId().getIndex()).addMetadataListener();
} else {
Expand Down Expand Up @@ -4912,7 +4911,7 @@ private String copySegmentFiles(

if (toDownloadSegments.isEmpty() == false) {
try {
downloadSegments(storeDirectory, sourceRemoteDirectory, targetRemoteDirectory, toDownloadSegments, onFileSync);
fileDownloader.download(sourceRemoteDirectory, storeDirectory, targetRemoteDirectory, toDownloadSegments, onFileSync);
} catch (Exception e) {
throw new IOException("Error occurred when downloading segments from remote store", e);
}
Expand All @@ -4925,26 +4924,6 @@ private String copySegmentFiles(
return segmentNFile;
}

private void downloadSegments(
Directory storeDirectory,
RemoteSegmentStoreDirectory sourceRemoteDirectory,
RemoteSegmentStoreDirectory targetRemoteDirectory,
Set<String> toDownloadSegments,
final Runnable onFileSync
) throws IOException {
final Path indexPath = store.shardPath() == null ? null : store.shardPath().resolveIndex();
final DirectoryFileTransferTracker fileTransferTracker = store.getDirectoryFileTransferTracker();
for (String segment : toDownloadSegments) {
final PlainActionFuture<String> segmentListener = PlainActionFuture.newFuture();
sourceRemoteDirectory.copyTo(segment, storeDirectory, indexPath, fileTransferTracker, segmentListener);
segmentListener.actionGet();
onFileSync.run();
if (targetRemoteDirectory != null) {
targetRemoteDirectory.copyFrom(storeDirectory, segment, segment, IOContext.DEFAULT);
}
}
}

private boolean localDirectoryContains(Directory localDirectory, String file, long checksum) {
try (IndexInput indexInput = localDirectory.openInput(file, IOContext.DEFAULT)) {
if (checksum == CodecUtil.retrieveChecksum(indexInput)) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,9 @@
import org.opensearch.indices.recovery.RecoveryState;
import org.opensearch.indices.replication.common.ReplicationLuceneIndex;
import org.opensearch.repositories.IndexId;
import org.opensearch.repositories.RepositoriesService;
import org.opensearch.repositories.Repository;
import org.opensearch.threadpool.ThreadPool;

import java.io.IOException;
import java.nio.channels.FileChannel;
Expand Down Expand Up @@ -360,8 +362,9 @@ void recoverFromRepository(final IndexShard indexShard, Repository repository, A
void recoverFromSnapshotAndRemoteStore(
final IndexShard indexShard,
Repository repository,
RemoteSegmentStoreDirectoryFactory directoryFactory,
ActionListener<Boolean> listener
RepositoriesService repositoriesService,
ActionListener<Boolean> listener,
ThreadPool threadPool
) {
try {
if (canRecover(indexShard)) {
Expand Down Expand Up @@ -389,6 +392,10 @@ void recoverFromSnapshotAndRemoteStore(
remoteStoreRepository = shallowCopyShardMetadata.getRemoteStoreRepository();
}

RemoteSegmentStoreDirectoryFactory directoryFactory = new RemoteSegmentStoreDirectoryFactory(
() -> repositoriesService,
threadPool
);
RemoteSegmentStoreDirectory sourceRemoteDirectory = (RemoteSegmentStoreDirectory) directoryFactory.newDirectory(
remoteStoreRepository,
indexUUID,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -336,10 +336,6 @@ public boolean copyFrom(
return false;
}

protected UnaryOperator<InputStream> getDownloadRateLimiter() {
return downloadRateLimiter;
}

private void uploadBlob(
Directory from,
String src,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -24,8 +24,6 @@
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.util.Version;
import org.opensearch.common.UUIDs;
import org.opensearch.common.blobstore.AsyncMultiStreamBlobContainer;
import org.opensearch.common.blobstore.stream.read.listener.ReadContextListener;
import org.opensearch.common.collect.Tuple;
import org.opensearch.common.io.VersionedCodecStreamWrapper;
import org.opensearch.common.logging.Loggers;
Expand All @@ -38,15 +36,13 @@
import org.opensearch.index.store.lockmanager.RemoteStoreLockManager;
import org.opensearch.index.store.remote.metadata.RemoteSegmentMetadata;
import org.opensearch.index.store.remote.metadata.RemoteSegmentMetadataHandler;
import org.opensearch.indices.recovery.RecoverySettings;
import org.opensearch.indices.replication.checkpoint.ReplicationCheckpoint;
import org.opensearch.threadpool.ThreadPool;

import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.nio.file.NoSuchFileException;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
Expand Down Expand Up @@ -93,8 +89,6 @@ public final class RemoteSegmentStoreDirectory extends FilterDirectory implement

private final ThreadPool threadPool;

private final RecoverySettings recoverySettings;

/**
* Keeps track of local segment filename to uploaded filename along with other attributes like checksum.
* This map acts as a cache layer for uploaded segment filenames which helps avoid calling listAll() each time.
Expand Down Expand Up @@ -127,15 +121,13 @@ public RemoteSegmentStoreDirectory(
RemoteDirectory remoteMetadataDirectory,
RemoteStoreLockManager mdLockManager,
ThreadPool threadPool,
ShardId shardId,
RecoverySettings recoverySettings
ShardId shardId
) throws IOException {
super(remoteDataDirectory);
this.remoteDataDirectory = remoteDataDirectory;
this.remoteMetadataDirectory = remoteMetadataDirectory;
this.mdLockManager = mdLockManager;
this.threadPool = threadPool;
this.recoverySettings = recoverySettings;
this.logger = Loggers.getLogger(getClass(), shardId);
init();
}
Expand Down Expand Up @@ -473,70 +465,6 @@ public void copyFrom(Directory from, String src, IOContext context, ActionListen
}
}

/**
* Copies an existing {@code source} file from this directory to a non-existent file (also
* named {@code source}) in either {@code destinationDirectory} or {@code destinationPath}.
* If the blob container backing this directory supports multipart downloads, the {@code source}
* file will be downloaded (potentially in multiple concurrent parts) directly to
* {@code destinationPath}. This method will return immediately and {@code fileCompletionListener}
* will be notified upon completion.
* <p>
* If multipart downloads are not supported, then {@code source} file will be copied to a file named
* {@code source} in a single part to {@code destinationDirectory}. The download will happen on the
* calling thread and {@code fileCompletionListener} will be notified synchronously before this
* method returns.
*
* @param source The source file name
* @param destinationDirectory The destination directory (if multipart is not supported)
* @param destinationPath The destination path (if multipart is supported)
* @param fileTransferTracker Tracker used for file transfer stats
* @param fileCompletionListener The listener to notify of completion
*/
public void copyTo(
String source,
Directory destinationDirectory,
Path destinationPath,
DirectoryFileTransferTracker fileTransferTracker,
ActionListener<String> fileCompletionListener
) {
final String blobName = getExistingRemoteFilename(source);
if (destinationPath != null && remoteDataDirectory.getBlobContainer() instanceof AsyncMultiStreamBlobContainer) {
long length = 0L;
try {
length = fileLength(source);
} catch (IOException ex) {
logger.error("Unable to fetch segment length for stats tracking", ex);
}
final long fileLength = length;
final long startTime = System.currentTimeMillis();
fileTransferTracker.addTransferredBytesStarted(fileLength);
final AsyncMultiStreamBlobContainer blobContainer = (AsyncMultiStreamBlobContainer) remoteDataDirectory.getBlobContainer();
final Path destinationFilePath = destinationPath.resolve(source);
final ActionListener<String> completionListener = ActionListener.wrap(response -> {
fileTransferTracker.addTransferredBytesSucceeded(fileLength, startTime);
fileCompletionListener.onResponse(response);
}, e -> {
fileTransferTracker.addTransferredBytesFailed(fileLength, startTime);
fileCompletionListener.onFailure(e);
});
final ReadContextListener readContextListener = new ReadContextListener(
blobName,
destinationFilePath,
completionListener,
threadPool,
remoteDataDirectory.getDownloadRateLimiter(),
recoverySettings.getMaxConcurrentRemoteStoreStreams()
);
blobContainer.readBlobAsync(blobName, readContextListener);
} else {
// Fallback to older mechanism of downloading the file
ActionListener.completeWith(fileCompletionListener, () -> {
destinationDirectory.copyFrom(this, source, source, IOContext.DEFAULT);
return source;
});
}
}

/**
* This acquires a lock on a given commit by creating a lock file in lock directory using {@code FileLockInfo}
*
Expand Down
Loading

0 comments on commit d4cca9a

Please sign in to comment.