Skip to content

Commit

Permalink
sets default concurrency for blob upload for adlfs to 1
Browse files Browse the repository at this point in the history
  • Loading branch information
rudolfix committed Sep 2, 2024
1 parent 36c0d14 commit b6a6bf7
Show file tree
Hide file tree
Showing 2 changed files with 20 additions and 0 deletions.
11 changes: 11 additions & 0 deletions dlt/common/storages/fsspec_filesystem.py
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,16 @@ class FileItem(TypedDict, total=False):
CREDENTIALS_DISPATCH["abfss"] = CREDENTIALS_DISPATCH["az"]
CREDENTIALS_DISPATCH["gcs"] = CREDENTIALS_DISPATCH["gs"]

# Per-protocol default kwargs passed to the fsspec filesystem.
# Every Azure protocol alias maps to the SAME dict object, which caps blob
# upload concurrency at 1. These defaults are merged into the fsspec kwargs
# before the user-configured kwargs, so a user-supplied `max_concurrency`
# overrides the value set here.
DEFAULT_KWARGS = dict.fromkeys(
    ("az", "adl", "abfs", "azure", "abfss"),
    {"max_concurrency": 1},
)


def fsspec_filesystem(
protocol: str,
Expand Down Expand Up @@ -125,6 +135,7 @@ def prepare_fsspec_args(config: FilesystemConfiguration) -> DictStrAny:

register_implementation("gdrive", GoogleDriveFileSystem, "GoogleDriveFileSystem")

fs_kwargs.update(DEFAULT_KWARGS.get(protocol, {}))
if config.kwargs is not None:
fs_kwargs.update(config.kwargs)
if config.client_kwargs is not None:
Expand Down
9 changes: 9 additions & 0 deletions docs/website/docs/dlt-ecosystem/destinations/filesystem.md
Original file line number Diff line number Diff line change
Expand Up @@ -186,6 +186,15 @@ azure_client_secret = "client_secret"
azure_tenant_id = "tenant_id" # please set me up!
```

:::caution
**Concurrent blob uploads**
`dlt` limits the number of concurrent connections for a single uploaded blob to 1. By default, `adlfs` (the library we use) splits blobs into 4 MB chunks and uploads them concurrently, which leads to gigabytes of used memory and thousands of connections for larger load packages. You can increase the maximum concurrency as follows:
```toml
[destination.filesystem.kwargs]
max_concurrency=3
```
:::

### Local file system
If, for any reason, you want to have those files in a local folder, set up the `bucket_url` as follows (you are free to use `config.toml` for that, as there are no secrets required):

Expand Down

0 comments on commit b6a6bf7

Please sign in to comment.