Skip to content

Commit

Permalink
Fix the double slash namespace bug (#2436)
Browse files Browse the repository at this point in the history
  • Loading branch information
itaiad200 authored Sep 5, 2021
1 parent 462de1d commit a9f47dc
Show file tree
Hide file tree
Showing 6 changed files with 188 additions and 18 deletions.
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@

## Unreleased - XXXX-XX-XX

- Fix double slash bug in storage namespace (#2397)

## v0.49.0 - 2021-09-02

- Add search locations to load lakeFS configuration. More information on
Expand Down
45 changes: 36 additions & 9 deletions cmd/lakefs/cmd/run.go
Original file line number Diff line number Diff line change
Expand Up @@ -123,7 +123,7 @@ var runCmd = &cobra.Command{
os.Exit(1)
}
if !allowForeign {
checkForeignRepos(ctx, logger, authMetadataManager, blockStore, c)
checkRepos(ctx, logger, authMetadataManager, blockStore, c)
}

// update health info with installation ID
Expand Down Expand Up @@ -204,10 +204,8 @@ var runCmd = &cobra.Command{
},
}

// A foreign repo is a repository which namespace doesn't match the current block adapter.
// A foreign repo might exists if the lakeFS instance configuration changed after a repository was
// already created. The behaviour of lakeFS for foreign repos is undefined and should be blocked.
func checkForeignRepos(ctx context.Context, logger logging.Logger, authMetadataManager *auth.DBMetadataManager, blockStore block.Adapter, c *catalog.Catalog) {
// checkRepos iterates on all repos and validates that their settings are correct.
func checkRepos(ctx context.Context, logger logging.Logger, authMetadataManager *auth.DBMetadataManager, blockStore block.Adapter, c *catalog.Catalog) {
initialized, err := authMetadataManager.IsInitialized(ctx)
if err != nil {
logger.WithError(err).Fatal("Failed to check if lakeFS is initialized")
Expand Down Expand Up @@ -240,17 +238,46 @@ func checkForeignRepos(ctx context.Context, logger logging.Logger, authMetadataM
logger.WithError(err).Fatalf("Failed to parse to parse storage type '%s'", nsURL)
}

if adapterStorageType != repoStorageType.BlockstoreType() {
logger.Fatalf("Mismatched adapter detected. lakeFS started with adapter of type '%s', but repository '%s' is of type '%s'",
adapterStorageType, repo.Name, repoStorageType.BlockstoreType())
}
checkForeignRepo(repoStorageType, logger, adapterStorageType, repo.Name)
checkMetadataPrefix(ctx, repo, logger, blockStore, repoStorageType)

next = repo.Name
}
}
}
}

// checkMetadataPrefix detects repositories that were not migrated as part of
// issue #2397 (https://github.com/treeverse/lakeFS/issues/2397).
// It probes the storage namespace for the dummy object written at repository
// creation; a missing dummy object means the repository data still lives under
// the old (double-slash) prefix and lakeFS must not start.
func checkMetadataPrefix(ctx context.Context, repo *catalog.Repository, logger logging.Logger, adapter block.Adapter, repoStorageType block.StorageType) {
	// Only Google Storage and Azure namespaces were affected by the bug.
	switch repoStorageType {
	case block.StorageTypeGS, block.StorageTypeAzure:
		// fallthrough to the probe below
	default:
		return
	}

	const dummyFile = "dummy"
	objPtr := block.ObjectPointer{
		StorageNamespace: repo.StorageNamespace,
		Identifier:       dummyFile,
	}
	// NOTE(review): any Get error (not only "not found") is treated as a
	// missing migration — confirm this is the intended startup behaviour.
	_, err := adapter.Get(ctx, objPtr, -1)
	if err == nil {
		return
	}
	logger.WithFields(logging.Fields{
		"path":              dummyFile,
		"storage_namespace": repo.StorageNamespace,
	}).Fatal("Can't find dummy file in storage namespace, did you run the migration? " +
		"(http://docs.lakefs.io/reference/upgrade.html#data-migration-for-version-v0500)")
}

// checkForeignRepo verifies that a repository's storage namespace matches the
// block adapter lakeFS was started with.
// A foreign repo is a repository whose namespace doesn't match the current
// block adapter. One might exist if the lakeFS instance configuration changed
// after the repository was created. The behaviour of lakeFS for foreign repos
// is undefined and should be blocked.
func checkForeignRepo(repoStorageType block.StorageType, logger logging.Logger, adapterStorageType, repoName string) {
	repoBlockstoreType := repoStorageType.BlockstoreType()
	if adapterStorageType == repoBlockstoreType {
		return
	}
	logger.Fatalf("Mismatched adapter detected. lakeFS started with adapter of type '%s', but repository '%s' is of type '%s'",
		adapterStorageType, repoName, repoBlockstoreType)
}

const runBanner = `
██╗ █████╗ ██╗ ██╗███████╗███████╗███████╗
Expand Down
83 changes: 83 additions & 0 deletions docs/reference/upgrade.md
Original file line number Diff line number Diff line change
Expand Up @@ -66,3 +66,86 @@ To do so, add the following to your configuration (relevant **only** for 0.30.0)
cataloger:
type: rocks
```
## Data Migration for Version v0.50.0
We discovered a bug in the way lakeFS stores objects in the underlying object store.
It affects only some repositories on Azure and Google Cloud Storage.
[Issue #2397](https://github.com/treeverse/lakeFS/issues/2397#issuecomment-908397229) describes the repository storage namespaces patterns
which are affected by this bug.
When first upgrading to a version greater than or equal to v0.50.0, you must follow these steps:
1. Stop lakeFS.
1. Perform a data-migration (details below)
1. Start lakeFS with the new version.
1. After a successful run of the new version, and after validating the objects are accessible, you can delete the old data prefix.
Note: Migrating data is a delicate procedure. The lakeFS team is here to help, reach out to us on Slack.
We'll be happy to walk you through the process.
{: .note .pb-3 }
### Data migration
The following patterns have been impacted by the bug:
| Type | Storage Namespace pattern | Copy From | Copy To |
|-------|-----------------------------------------------------------|------------------------------------------------------------|------------------------------------------------------------|
| gs | gs://bucket/prefix | gs://bucket//prefix/* | gs://bucket/prefix/* |
| gs | gs://bucket/prefix/ | gs://bucket//prefix/* | gs://bucket/prefix/* |
| azure | https://account.blob.core.windows.net/containerid | https://account.blob.core.windows.net/containerid//* | https://account.blob.core.windows.net/containerid/* |
| azure | https://account.blob.core.windows.net/containerid/ | https://account.blob.core.windows.net/containerid//* | https://account.blob.core.windows.net/containerid/* |
| azure | https://account.blob.core.windows.net/containerid/prefix/ | https://account.blob.core.windows.net/containerid/prefix//* | https://account.blob.core.windows.net/containerid/prefix/* |
You can find the repositories storage namespaces with:
```shell
lakectl repo list
```

Or the settings tab in the UI.

#### Migrating Google Storage data with gsutil

[gsutil](https://cloud.google.com/storage/docs/gsutil) is a Python application that lets you access Cloud Storage from the command line.
We can use it for copying the data between the prefixes in the Google bucket, and later on removing it.

For every affected repository, copy its data with:
```shell
gsutil -m cp -r gs://<BUCKET>//<PREFIX>/ gs://<BUCKET>/
```

Note the double slash after the bucket name.

#### Migrating Azure Blob Storage data with AzCopy

[AzCopy](https://docs.microsoft.com/en-us/azure/storage/common/storage-use-azcopy-v10) is a command-line utility that you can use to copy blobs or files to or from a storage account.
We can use it for copying the data between the prefixes in the Azure storage account container, and later on removing it.

First, you need to acquire an [Account SAS](https://docs.microsoft.com/en-us/azure/storage/common/storage-sas-overview#account-sas).
Using the Azure CLI:
```shell
az storage container generate-sas \
--account-name <ACCOUNT> \
--name <CONTAINER> \
--permissions cdrw \
--auth-mode key \
--expiry 2021-12-31
```

With the resulted SAS, use AzCopy to copy the files.
If a prefix exists after the container:
```shell
azcopy copy \
"https://<ACCOUNT>.blob.core.windows.net/<CONTAINER>/<PREFIX>//?<SAS_TOKEN>" \
"https://<ACCOUNT>.blob.core.windows.net/<CONTAINER>?<SAS_TOKEN>" \
--recursive=true
```

Or when using the container without a prefix:

```shell
azcopy copy \
"https://<ACCOUNT>.blob.core.windows.net/<CONTAINER>//?<SAS_TOKEN>" \
"https://<ACCOUNT>.blob.core.windows.net/<CONTAINER>/./?<SAS_TOKEN>" \
--recursive=true
```
11 changes: 8 additions & 3 deletions pkg/block/azure/adapter.go
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,7 @@ func resolveBlobURLInfoFromURL(pathURL *url.URL) (BlobURLInfo, error) {
return qk, block.ErrInvalidNamespace
}
// In azure the first part of the path is part of the storage namespace
trimmedPath := strings.TrimLeft(pathURL.Path, "/")
trimmedPath := strings.Trim(pathURL.Path, "/")
parts := strings.Split(trimmedPath, "/")
if len(parts) == 0 {
return qk, block.ErrInvalidNamespace
Expand Down Expand Up @@ -100,10 +100,15 @@ func resolveBlobURLInfo(obj block.ObjectPointer) (BlobURLInfo, error) {
if err != nil {
return qk, err
}
return BlobURLInfo{
info := BlobURLInfo{
ContainerURL: qp.ContainerURL,
BlobURL: qp.BlobURL + "/" + key,
}, nil
}
if qp.BlobURL == "" {
info.BlobURL = key
}

return info, nil
}
return resolveBlobURLInfoFromURL(parsedKey)
}
Expand Down
5 changes: 2 additions & 3 deletions pkg/block/namespace.go
Original file line number Diff line number Diff line change
Expand Up @@ -89,10 +89,9 @@ func GetStorageType(namespaceURL *url.URL) (StorageType, error) {
}

// formatPathWithNamespace joins a storage-namespace path with a key path.
// Slashes surrounding the namespace are stripped so the result never contains
// an accidental double slash at the join point (issue #2397). When the
// namespace is empty, only a leading slash is trimmed from the key; otherwise
// the key is appended verbatim (leading slashes in the key are preserved).
func formatPathWithNamespace(namespacePath, keyPath string) string {
	ns := strings.Trim(namespacePath, "/")
	if ns == "" {
		return strings.TrimPrefix(keyPath, "/")
	}
	return ns + "/" + keyPath
}
Expand Down
60 changes: 57 additions & 3 deletions pkg/block/namespace_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,42 @@ func TestResolveNamespace(t *testing.T) {
Key: "bar/baz",
},
},
{
Name: "valid_namespace_with_prefix_and_trailing_slash",
DefaultNamespace: "gs://foo/bla/",
Key: "bar/baz",
Type: block.IdentifierTypeRelative,
ExpectedErr: nil,
Expected: block.QualifiedKey{
StorageType: block.StorageTypeGS,
StorageNamespace: "foo",
Key: "bla/bar/baz",
},
},
{
Name: "valid_namespace_with_prefix_and_no_trailing_slash",
DefaultNamespace: "gs://foo/bla",
Key: "bar/baz",
Type: block.IdentifierTypeRelative,
ExpectedErr: nil,
Expected: block.QualifiedKey{
StorageType: block.StorageTypeGS,
StorageNamespace: "foo",
Key: "bla/bar/baz",
},
},
{
Name: "valid_namespace_with_prefix_and_leading_key_slash",
DefaultNamespace: "gs://foo/bla",
Key: "/bar/baz",
Type: block.IdentifierTypeRelative,
ExpectedErr: nil,
Expected: block.QualifiedKey{
StorageType: block.StorageTypeGS,
StorageNamespace: "foo",
Key: "bla//bar/baz",
},
},
{
Name: "valid_fq_key",
DefaultNamespace: "mem://foo/",
Expand Down Expand Up @@ -148,7 +184,7 @@ func TestFormatQualifiedKey(t *testing.T) {
QualifiedKey: block.QualifiedKey{
StorageType: block.StorageTypeS3,
StorageNamespace: "some-bucket/",
Key: "/path/to/file",
Key: "path/to/file",
},
Expected: "s3://some-bucket/path/to/file",
},
Expand All @@ -157,16 +193,34 @@ func TestFormatQualifiedKey(t *testing.T) {
QualifiedKey: block.QualifiedKey{
StorageType: block.StorageTypeS3,
StorageNamespace: "some-bucket/prefix/",
Key: "/path/to/file",
Key: "path/to/file",
},
Expected: "s3://some-bucket/prefix/path/to/file",
},
{
Name: "path_with_prefix_leading_slash",
QualifiedKey: block.QualifiedKey{
StorageType: block.StorageTypeS3,
StorageNamespace: "some-bucket",
Key: "/path/to/file",
},
Expected: "s3://some-bucket//path/to/file",
},
{
Name: "bucket_with_prefix_leading_slash",
QualifiedKey: block.QualifiedKey{
StorageType: block.StorageTypeS3,
StorageNamespace: "some-bucket/prefix",
Key: "/path/to/file",
},
Expected: "s3://some-bucket/prefix//path/to/file",
},
{
Name: "dont_eliminate_dots",
QualifiedKey: block.QualifiedKey{
StorageType: block.StorageTypeS3,
StorageNamespace: "some-bucket/prefix/",
Key: "/path/to/../file",
Key: "path/to/../file",
},
Expected: "s3://some-bucket/prefix/path/to/../file",
},
Expand Down

0 comments on commit a9f47dc

Please sign in to comment.