Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Backport 2.x] Replace multipart download with parallel file download #10548

Merged
merged 1 commit into from
Oct 11, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 5 additions & 2 deletions server/src/main/java/org/opensearch/index/IndexModule.java
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,7 @@
import org.opensearch.indices.IndicesQueryCache;
import org.opensearch.indices.fielddata.cache.IndicesFieldDataCache;
import org.opensearch.indices.mapper.MapperRegistry;
import org.opensearch.indices.recovery.RecoverySettings;
import org.opensearch.indices.recovery.RecoveryState;
import org.opensearch.plugins.IndexStorePlugin;
import org.opensearch.repositories.RepositoriesService;
Expand Down Expand Up @@ -602,7 +603,8 @@ public IndexService newIndexService(
IndexStorePlugin.DirectoryFactory remoteDirectoryFactory,
BiFunction<IndexSettings, ShardRouting, TranslogFactory> translogFactorySupplier,
Supplier<TimeValue> clusterDefaultRefreshIntervalSupplier,
Supplier<TimeValue> clusterRemoteTranslogBufferIntervalSupplier
Supplier<TimeValue> clusterRemoteTranslogBufferIntervalSupplier,
RecoverySettings recoverySettings
) throws IOException {
final IndexEventListener eventListener = freeze();
Function<IndexService, CheckedFunction<DirectoryReader, DirectoryReader, IOException>> readerWrapperFactory = indexReaderWrapper
Expand Down Expand Up @@ -660,7 +662,8 @@ public IndexService newIndexService(
recoveryStateFactory,
translogFactorySupplier,
clusterDefaultRefreshIntervalSupplier,
clusterRemoteTranslogBufferIntervalSupplier
clusterRemoteTranslogBufferIntervalSupplier,
recoverySettings
);
success = true;
return indexService;
Expand Down
9 changes: 6 additions & 3 deletions server/src/main/java/org/opensearch/index/IndexService.java
Original file line number Diff line number Diff line change
Expand Up @@ -89,13 +89,13 @@
import org.opensearch.index.shard.ShardNotInPrimaryModeException;
import org.opensearch.index.shard.ShardPath;
import org.opensearch.index.similarity.SimilarityService;
import org.opensearch.index.store.RemoteSegmentStoreDirectoryFactory;
import org.opensearch.index.store.Store;
import org.opensearch.index.translog.Translog;
import org.opensearch.index.translog.TranslogFactory;
import org.opensearch.indices.cluster.IndicesClusterStateService;
import org.opensearch.indices.fielddata.cache.IndicesFieldDataCache;
import org.opensearch.indices.mapper.MapperRegistry;
import org.opensearch.indices.recovery.RecoverySettings;
import org.opensearch.indices.recovery.RecoveryState;
import org.opensearch.indices.replication.checkpoint.SegmentReplicationCheckpointPublisher;
import org.opensearch.plugins.IndexStorePlugin;
Expand Down Expand Up @@ -179,6 +179,7 @@ public class IndexService extends AbstractIndexComponent implements IndicesClust
private final BiFunction<IndexSettings, ShardRouting, TranslogFactory> translogFactorySupplier;
private final Supplier<TimeValue> clusterDefaultRefreshIntervalSupplier;
private final Supplier<TimeValue> clusterRemoteTranslogBufferIntervalSupplier;
private final RecoverySettings recoverySettings;

public IndexService(
IndexSettings indexSettings,
Expand Down Expand Up @@ -213,7 +214,8 @@ public IndexService(
IndexStorePlugin.RecoveryStateFactory recoveryStateFactory,
BiFunction<IndexSettings, ShardRouting, TranslogFactory> translogFactorySupplier,
Supplier<TimeValue> clusterDefaultRefreshIntervalSupplier,
Supplier<TimeValue> clusterRemoteTranslogBufferIntervalSupplier
Supplier<TimeValue> clusterRemoteTranslogBufferIntervalSupplier,
RecoverySettings recoverySettings
) {
super(indexSettings);
this.allowExpensiveQueries = allowExpensiveQueries;
Expand Down Expand Up @@ -290,6 +292,7 @@ public IndexService(
this.retentionLeaseSyncTask = new AsyncRetentionLeaseSyncTask(this);
this.translogFactorySupplier = translogFactorySupplier;
this.clusterRemoteTranslogBufferIntervalSupplier = clusterRemoteTranslogBufferIntervalSupplier;
this.recoverySettings = recoverySettings;
updateFsyncTaskIfNecessary();
}

Expand Down Expand Up @@ -522,7 +525,7 @@ public synchronized IndexShard createShard(
remoteStoreStatsTrackerFactory,
clusterRemoteTranslogBufferIntervalSupplier,
nodeEnv.nodeId(),
(RemoteSegmentStoreDirectoryFactory) remoteDirectoryFactory
recoverySettings
);
eventListener.indexShardStateChanged(indexShard, null, indexShard.state(), "shard created");
eventListener.afterIndexShardCreated(indexShard);
Expand Down
47 changes: 13 additions & 34 deletions server/src/main/java/org/opensearch/index/shard/IndexShard.java
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,6 @@
import org.opensearch.action.admin.indices.flush.FlushRequest;
import org.opensearch.action.admin.indices.forcemerge.ForceMergeRequest;
import org.opensearch.action.admin.indices.upgrade.post.UpgradeRequest;
import org.opensearch.action.support.PlainActionFuture;
import org.opensearch.action.support.replication.PendingReplicationActions;
import org.opensearch.action.support.replication.ReplicationResponse;
import org.opensearch.cluster.metadata.DataStream;
Expand Down Expand Up @@ -162,9 +161,8 @@
import org.opensearch.index.seqno.SequenceNumbers;
import org.opensearch.index.shard.PrimaryReplicaSyncer.ResyncTask;
import org.opensearch.index.similarity.SimilarityService;
import org.opensearch.index.store.DirectoryFileTransferTracker;
import org.opensearch.index.store.RemoteSegmentStoreDirectory;
import org.opensearch.index.store.RemoteSegmentStoreDirectoryFactory;
import org.opensearch.index.store.RemoteStoreFileDownloader;
import org.opensearch.index.store.Store;
import org.opensearch.index.store.Store.MetadataSnapshot;
import org.opensearch.index.store.StoreFileMetadata;
Expand All @@ -185,6 +183,7 @@
import org.opensearch.indices.recovery.PeerRecoveryTargetService;
import org.opensearch.indices.recovery.RecoveryFailedException;
import org.opensearch.indices.recovery.RecoveryListener;
import org.opensearch.indices.recovery.RecoverySettings;
import org.opensearch.indices.recovery.RecoveryState;
import org.opensearch.indices.recovery.RecoveryTarget;
import org.opensearch.indices.replication.checkpoint.ReplicationCheckpoint;
Expand All @@ -202,7 +201,6 @@
import java.nio.channels.FileChannel;
import java.nio.charset.StandardCharsets;
import java.nio.file.NoSuchFileException;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
Expand Down Expand Up @@ -344,7 +342,7 @@
private final RemoteStoreStatsTrackerFactory remoteStoreStatsTrackerFactory;

private final List<ReferenceManager.RefreshListener> internalRefreshListener = new ArrayList<>();
private final RemoteSegmentStoreDirectoryFactory remoteSegmentStoreDirectoryFactory;
private final RemoteStoreFileDownloader fileDownloader;

public IndexShard(
final ShardRouting shardRouting,
Expand Down Expand Up @@ -373,10 +371,7 @@
final RemoteStoreStatsTrackerFactory remoteStoreStatsTrackerFactory,
final Supplier<TimeValue> clusterRemoteTranslogBufferIntervalSupplier,
final String nodeId,
// Wiring a directory factory here breaks some intended abstractions, but this remote directory
// factory is used not as a Lucene directory but instead to copy files from a remote store when
// restoring a shallow snapshot.
@Nullable final RemoteSegmentStoreDirectoryFactory remoteSegmentStoreDirectoryFactory
final RecoverySettings recoverySettings
) throws IOException {
super(shardRouting.shardId(), indexSettings);
assert shardRouting.initializing();
Expand Down Expand Up @@ -472,7 +467,7 @@
? false
: mapperService.documentMapper().mappers().containsTimeStampField();
this.remoteStoreStatsTrackerFactory = remoteStoreStatsTrackerFactory;
this.remoteSegmentStoreDirectoryFactory = remoteSegmentStoreDirectoryFactory;
this.fileDownloader = new RemoteStoreFileDownloader(shardRouting.shardId(), threadPool, recoverySettings);
}

public ThreadPool getThreadPool() {
Expand Down Expand Up @@ -570,6 +565,10 @@
return translogConfig.getNodeId();
}

public RemoteStoreFileDownloader getFileDownloader() {
return fileDownloader;
}

@Override
public void updateShardState(
final ShardRouting newRouting,
Expand Down Expand Up @@ -2711,15 +2710,15 @@

public void restoreFromSnapshotAndRemoteStore(
Repository repository,
RemoteSegmentStoreDirectoryFactory remoteSegmentStoreDirectoryFactory,
RepositoriesService repositoriesService,
ActionListener<Boolean> listener
) {
try {
assert shardRouting.primary() : "recover from store only makes sense if the shard is a primary shard";
assert recoveryState.getRecoverySource().getType() == RecoverySource.Type.SNAPSHOT : "invalid recovery type: "
+ recoveryState.getRecoverySource();
StoreRecovery storeRecovery = new StoreRecovery(shardId, logger);
storeRecovery.recoverFromSnapshotAndRemoteStore(this, repository, remoteSegmentStoreDirectoryFactory, listener);
storeRecovery.recoverFromSnapshotAndRemoteStore(this, repository, repositoriesService, listener, threadPool);

Check warning on line 2721 in server/src/main/java/org/opensearch/index/shard/IndexShard.java

View check run for this annotation

Codecov / codecov/patch

server/src/main/java/org/opensearch/index/shard/IndexShard.java#L2721

Added line #L2721 was not covered by tests
} catch (Exception e) {
listener.onFailure(e);
}
Expand Down Expand Up @@ -3565,7 +3564,7 @@
"from snapshot and remote store",
recoveryState,
recoveryListener,
l -> restoreFromSnapshotAndRemoteStore(repositoriesService.repository(repo), remoteSegmentStoreDirectoryFactory, l)
l -> restoreFromSnapshotAndRemoteStore(repositoriesService.repository(repo), repositoriesService, l)

Check warning on line 3567 in server/src/main/java/org/opensearch/index/shard/IndexShard.java

View check run for this annotation

Codecov / codecov/patch

server/src/main/java/org/opensearch/index/shard/IndexShard.java#L3567

Added line #L3567 was not covered by tests
);
// indicesService.indexService(shardRouting.shardId().getIndex()).addMetadataListener();
} else {
Expand Down Expand Up @@ -4921,7 +4920,7 @@

if (toDownloadSegments.isEmpty() == false) {
try {
downloadSegments(storeDirectory, sourceRemoteDirectory, targetRemoteDirectory, toDownloadSegments, onFileSync);
fileDownloader.download(sourceRemoteDirectory, storeDirectory, targetRemoteDirectory, toDownloadSegments, onFileSync);
} catch (Exception e) {
throw new IOException("Error occurred when downloading segments from remote store", e);
}
Expand All @@ -4934,26 +4933,6 @@
return segmentNFile;
}

private void downloadSegments(
Directory storeDirectory,
RemoteSegmentStoreDirectory sourceRemoteDirectory,
RemoteSegmentStoreDirectory targetRemoteDirectory,
Set<String> toDownloadSegments,
final Runnable onFileSync
) throws IOException {
final Path indexPath = store.shardPath() == null ? null : store.shardPath().resolveIndex();
final DirectoryFileTransferTracker fileTransferTracker = store.getDirectoryFileTransferTracker();
for (String segment : toDownloadSegments) {
final PlainActionFuture<String> segmentListener = PlainActionFuture.newFuture();
sourceRemoteDirectory.copyTo(segment, storeDirectory, indexPath, fileTransferTracker, segmentListener);
segmentListener.actionGet();
onFileSync.run();
if (targetRemoteDirectory != null) {
targetRemoteDirectory.copyFrom(storeDirectory, segment, segment, IOContext.DEFAULT);
}
}
}

private boolean localDirectoryContains(Directory localDirectory, String file, long checksum) {
try (IndexInput indexInput = localDirectory.openInput(file, IOContext.DEFAULT)) {
if (checksum == CodecUtil.retrieveChecksum(indexInput)) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,9 @@
import org.opensearch.indices.recovery.RecoveryState;
import org.opensearch.indices.replication.common.ReplicationLuceneIndex;
import org.opensearch.repositories.IndexId;
import org.opensearch.repositories.RepositoriesService;
import org.opensearch.repositories.Repository;
import org.opensearch.threadpool.ThreadPool;

import java.io.IOException;
import java.nio.channels.FileChannel;
Expand Down Expand Up @@ -360,8 +362,9 @@
void recoverFromSnapshotAndRemoteStore(
final IndexShard indexShard,
Repository repository,
RemoteSegmentStoreDirectoryFactory directoryFactory,
ActionListener<Boolean> listener
RepositoriesService repositoriesService,
ActionListener<Boolean> listener,
ThreadPool threadPool
) {
try {
if (canRecover(indexShard)) {
Expand Down Expand Up @@ -389,6 +392,10 @@
remoteStoreRepository = shallowCopyShardMetadata.getRemoteStoreRepository();
}

RemoteSegmentStoreDirectoryFactory directoryFactory = new RemoteSegmentStoreDirectoryFactory(
() -> repositoriesService,

Check warning on line 396 in server/src/main/java/org/opensearch/index/shard/StoreRecovery.java

View check run for this annotation

Codecov / codecov/patch

server/src/main/java/org/opensearch/index/shard/StoreRecovery.java#L395-L396

Added lines #L395 - L396 were not covered by tests
threadPool
);
RemoteSegmentStoreDirectory sourceRemoteDirectory = (RemoteSegmentStoreDirectory) directoryFactory.newDirectory(
remoteStoreRepository,
indexUUID,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -336,10 +336,6 @@ public boolean copyFrom(
return false;
}

protected UnaryOperator<InputStream> getDownloadRateLimiter() {
return downloadRateLimiter;
}

private void uploadBlob(
Directory from,
String src,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -24,8 +24,6 @@
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.util.Version;
import org.opensearch.common.UUIDs;
import org.opensearch.common.blobstore.AsyncMultiStreamBlobContainer;
import org.opensearch.common.blobstore.stream.read.listener.ReadContextListener;
import org.opensearch.common.collect.Tuple;
import org.opensearch.common.io.VersionedCodecStreamWrapper;
import org.opensearch.common.logging.Loggers;
Expand All @@ -38,15 +36,13 @@
import org.opensearch.index.store.lockmanager.RemoteStoreLockManager;
import org.opensearch.index.store.remote.metadata.RemoteSegmentMetadata;
import org.opensearch.index.store.remote.metadata.RemoteSegmentMetadataHandler;
import org.opensearch.indices.recovery.RecoverySettings;
import org.opensearch.indices.replication.checkpoint.ReplicationCheckpoint;
import org.opensearch.threadpool.ThreadPool;

import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.nio.file.NoSuchFileException;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
Expand Down Expand Up @@ -93,8 +89,6 @@ public final class RemoteSegmentStoreDirectory extends FilterDirectory implement

private final ThreadPool threadPool;

private final RecoverySettings recoverySettings;

/**
* Keeps track of local segment filename to uploaded filename along with other attributes like checksum.
* This map acts as a cache layer for uploaded segment filenames which helps avoid calling listAll() each time.
Expand Down Expand Up @@ -127,15 +121,13 @@ public RemoteSegmentStoreDirectory(
RemoteDirectory remoteMetadataDirectory,
RemoteStoreLockManager mdLockManager,
ThreadPool threadPool,
ShardId shardId,
RecoverySettings recoverySettings
ShardId shardId
) throws IOException {
super(remoteDataDirectory);
this.remoteDataDirectory = remoteDataDirectory;
this.remoteMetadataDirectory = remoteMetadataDirectory;
this.mdLockManager = mdLockManager;
this.threadPool = threadPool;
this.recoverySettings = recoverySettings;
this.logger = Loggers.getLogger(getClass(), shardId);
init();
}
Expand Down Expand Up @@ -473,70 +465,6 @@ public void copyFrom(Directory from, String src, IOContext context, ActionListen
}
}

/**
* Copies an existing {@code source} file from this directory to a non-existent file (also
* named {@code source}) in either {@code destinationDirectory} or {@code destinationPath}.
* If the blob container backing this directory supports multipart downloads, the {@code source}
* file will be downloaded (potentially in multiple concurrent parts) directly to
* {@code destinationPath}. This method will return immediately and {@code fileCompletionListener}
* will be notified upon completion.
* <p>
* If multipart downloads are not supported, then {@code source} file will be copied to a file named
* {@code source} in a single part to {@code destinationDirectory}. The download will happen on the
* calling thread and {@code fileCompletionListener} will be notified synchronously before this
* method returns.
*
* @param source The source file name
* @param destinationDirectory The destination directory (if multipart is not supported)
* @param destinationPath The destination path (if multipart is supported)
* @param fileTransferTracker Tracker used for file transfer stats
* @param fileCompletionListener The listener to notify of completion
*/
public void copyTo(
String source,
Directory destinationDirectory,
Path destinationPath,
DirectoryFileTransferTracker fileTransferTracker,
ActionListener<String> fileCompletionListener
) {
final String blobName = getExistingRemoteFilename(source);
if (destinationPath != null && remoteDataDirectory.getBlobContainer() instanceof AsyncMultiStreamBlobContainer) {
long length = 0L;
try {
length = fileLength(source);
} catch (IOException ex) {
logger.error("Unable to fetch segment length for stats tracking", ex);
}
final long fileLength = length;
final long startTime = System.currentTimeMillis();
fileTransferTracker.addTransferredBytesStarted(fileLength);
final AsyncMultiStreamBlobContainer blobContainer = (AsyncMultiStreamBlobContainer) remoteDataDirectory.getBlobContainer();
final Path destinationFilePath = destinationPath.resolve(source);
final ActionListener<String> completionListener = ActionListener.wrap(response -> {
fileTransferTracker.addTransferredBytesSucceeded(fileLength, startTime);
fileCompletionListener.onResponse(response);
}, e -> {
fileTransferTracker.addTransferredBytesFailed(fileLength, startTime);
fileCompletionListener.onFailure(e);
});
final ReadContextListener readContextListener = new ReadContextListener(
blobName,
destinationFilePath,
completionListener,
threadPool,
remoteDataDirectory.getDownloadRateLimiter(),
recoverySettings.getMaxConcurrentRemoteStoreStreams()
);
blobContainer.readBlobAsync(blobName, readContextListener);
} else {
// Fallback to older mechanism of downloading the file
ActionListener.completeWith(fileCompletionListener, () -> {
destinationDirectory.copyFrom(this, source, source, IOContext.DEFAULT);
return source;
});
}
}

/**
* This acquires a lock on a given commit by creating a lock file in lock directory using {@code FileLockInfo}
*
Expand Down
Loading
Loading