From 7b211bd4b85f35629bc61a20dd137e3aa1e6bbfa Mon Sep 17 00:00:00 2001 From: Barak Amar Date: Wed, 20 Sep 2023 18:43:55 +0800 Subject: [PATCH 1/5] misc: code cleanup (#6627) * code cleanup * defer adapter close with check --- esti/webhook_server_test.go | 2 +- pkg/api/auth_middleware.go | 6 +-- pkg/api/controller.go | 6 +-- pkg/auth/acl/name.go | 12 +++--- pkg/auth/acl/permission.go | 24 +++++------ pkg/auth/acl/write.go | 13 +++--- pkg/auth/service_test.go | 20 ++++----- pkg/auth/setup/setup.go | 42 +++++++++---------- pkg/block/blocktest/adapter.go | 4 +- pkg/block/gs/adapter_test.go | 19 ++++++--- pkg/block/gs/main_test.go | 44 +++++++++----------- pkg/gateway/operations/listbuckets.go | 2 +- pkg/graveler/committed/diff_test.go | 4 +- pkg/graveler/committed/import_test.go | 8 ++-- pkg/graveler/committed/merge_test.go | 4 +- pkg/graveler/graveler_test.go | 10 ++--- pkg/graveler/ref/manager_test.go | 2 +- pkg/graveler/settings/manager_test.go | 2 +- pkg/kv/kvtest/store.go | 4 +- pkg/kv/migrations/migrations_test.go | 51 +++++++++++------------ pkg/kv/migrations/rbac_to_acl.go | 58 +++++++++++++-------------- pkg/loadtest/local_load_test.go | 2 +- pkg/samplerepo/samplecontent.go | 10 ++--- 23 files changed, 176 insertions(+), 173 deletions(-) diff --git a/esti/webhook_server_test.go b/esti/webhook_server_test.go index aa8f2b7636d..54313baa46e 100644 --- a/esti/webhook_server_test.go +++ b/esti/webhook_server_test.go @@ -63,7 +63,7 @@ func startWebhookServer() (*webhookServer, error) { } go func() { if err = s.Serve(listener); err != nil && !errors.Is(err, http.ErrServerClosed) { - log.Fatalf("listen:%s\n", err) + log.Fatalf("listen: %s\n", err) } }() diff --git a/pkg/api/auth_middleware.go b/pkg/api/auth_middleware.go index b09d7bed268..3b4b27cfd5c 100644 --- a/pkg/api/auth_middleware.go +++ b/pkg/api/auth_middleware.go @@ -15,7 +15,7 @@ import ( "github.com/treeverse/lakefs/pkg/api/apigen" "github.com/treeverse/lakefs/pkg/auth" "github.com/treeverse/lakefs/pkg/auth/model" - oidc_encoding "github.com/treeverse/lakefs/pkg/auth/oidc/encoding" + oidcencoding "github.com/treeverse/lakefs/pkg/auth/oidc/encoding" "github.com/treeverse/lakefs/pkg/logging" ) @@ -202,7 +202,7 @@ func enhanceWithFriendlyName(user *model.User, friendlyName string) *model.User // If the user doesn't exist on the lakeFS side, it is created. // This function does not make any calls to an external provider. func userFromSAML(ctx context.Context, logger logging.Logger, authService auth.Service, authSession *sessions.Session, cookieAuthConfig *CookieAuthConfig) (*model.User, error) { - idTokenClaims, ok := authSession.Values[SAMLTokenClaimsSessionKey].(oidc_encoding.Claims) + idTokenClaims, ok := authSession.Values[SAMLTokenClaimsSessionKey].(oidcencoding.Claims) if idTokenClaims == nil { return nil, nil } @@ -291,7 +291,7 @@ func userFromSAML(ctx context.Context, logger logging.Logger, authService auth.S // If the user doesn't exist on the lakeFS side, it is created. // This function does not make any calls to an external provider. 
func userFromOIDC(ctx context.Context, logger logging.Logger, authService auth.Service, authSession *sessions.Session, oidcConfig *OIDCConfig) (*model.User, error) { - idTokenClaims, ok := authSession.Values[IDTokenClaimsSessionKey].(oidc_encoding.Claims) + idTokenClaims, ok := authSession.Values[IDTokenClaimsSessionKey].(oidcencoding.Claims) if idTokenClaims == nil { return nil, nil } diff --git a/pkg/api/controller.go b/pkg/api/controller.go index d9f995a2f12..a1ba65d3b8c 100644 --- a/pkg/api/controller.go +++ b/pkg/api/controller.go @@ -557,7 +557,7 @@ func (c *Controller) GetGroup(w http.ResponseWriter, r *http.Request, groupID st } func (c *Controller) GetGroupACL(w http.ResponseWriter, r *http.Request, groupID string) { - aclPolicyName := acl.ACLPolicyName(groupID) + aclPolicyName := acl.PolicyName(groupID) if !c.authorize(w, r, permissions.Node{ Type: permissions.NodeTypeAnd, Nodes: []permissions.Node{ @@ -629,7 +629,7 @@ func (c *Controller) GetGroupACL(w http.ResponseWriter, r *http.Request, groupID } func (c *Controller) SetGroupACL(w http.ResponseWriter, r *http.Request, body apigen.SetGroupACLJSONRequestBody, groupID string) { - aclPolicyName := acl.ACLPolicyName(groupID) + aclPolicyName := acl.PolicyName(groupID) if !c.authorize(w, r, permissions.Node{ Type: permissions.NodeTypeAnd, Nodes: []permissions.Node{ @@ -1583,7 +1583,7 @@ func (c *Controller) CreateRepository(w http.ResponseWriter, r *http.Request, bo return } - err = samplerepo.SampleRepoAddBranchProtection(ctx, newRepo, c.Catalog) + err = samplerepo.AddBranchProtection(ctx, newRepo, c.Catalog) if err != nil { c.handleAPIError(ctx, w, r, fmt.Errorf("error adding branch protection to sample repository: %w", err)) return diff --git a/pkg/auth/acl/name.go b/pkg/auth/acl/name.go index be1e1028382..03802ddc11c 100644 --- a/pkg/auth/acl/name.go +++ b/pkg/auth/acl/name.go @@ -2,13 +2,13 @@ package acl import "strings" -const ACLPolicyPrefix = "ACL(_-_)" +const PolicyPrefix = "ACL(_-_)" -// ACLPolicyName returns the policy identifier for the ACL for groupID. -func ACLPolicyName(groupID string) string { - return ACLPolicyPrefix + groupID +// PolicyName returns the policy identifier for the ACL for groupID. +func PolicyName(groupID string) string { + return PolicyPrefix + groupID } -func IsACLPolicyName(policyName string) bool { - return strings.HasPrefix(policyName, ACLPolicyPrefix) +func IsPolicyName(policyName string) bool { + return strings.HasPrefix(policyName, PolicyPrefix) } diff --git a/pkg/auth/acl/permission.go b/pkg/auth/acl/permission.go index 218f455490c..099108d23cf 100644 --- a/pkg/auth/acl/permission.go +++ b/pkg/auth/acl/permission.go @@ -9,19 +9,19 @@ import ( ) const ( - // ACLRead allows reading the specified repositories, as well as + // ReadPermission allows reading the specified repositories, as well as // managing own credentials. - ACLRead model.ACLPermission = "Read" - // ACLWrite allows reading and writing the specified repositories, + ReadPermission model.ACLPermission = "Read" + // WritePermission allows reading and writing the specified repositories, // as well as managing own credentials. - ACLWrite model.ACLPermission = "Write" - // ACLSuper allows reading, writing, and all other actions on the + WritePermission model.ACLPermission = "Write" + // SuperPermission allows reading, writing, and all other actions on the // specified repositories, as well as managing own credentials. 
- ACLSuper model.ACLPermission = "Super" - // ACLAdmin allows all operations, including all reading, writing, + SuperPermission model.ACLPermission = "Super" + // AdminPermission allows all operations, including all reading, writing, // and all other actions on all repositories, and managing // authorization and credentials of all users. - ACLAdmin model.ACLPermission = "Admin" + AdminPermission model.ACLPermission = "Admin" ) var ( @@ -38,7 +38,7 @@ func ACLToStatement(acl model.ACL) (model.Statements, error) { ) switch acl.Permission { - case ACLRead: + case ReadPermission: statements, err = auth.MakeStatementForPolicyType("FSRead", all) if err != nil { return nil, fmt.Errorf("%s: %w", acl.Permission, ErrBadACLPermission) @@ -53,7 +53,7 @@ func ACLToStatement(acl model.ACL) (model.Statements, error) { return nil, err } statements = append(append(statements, readConfigStatement...), ownCredentialsStatement...) - case ACLWrite: + case WritePermission: statements, err = auth.MakeStatementForPolicyType("FSReadWrite", all) if err != nil { return nil, fmt.Errorf("%s: %w", acl.Permission, ErrBadACLPermission) @@ -70,7 +70,7 @@ func ACLToStatement(acl model.ACL) (model.Statements, error) { } statements = append(statements, append(ownCredentialsStatement, ciStatement...)...) - case ACLSuper: + case SuperPermission: statements, err = auth.MakeStatementForPolicyType("FSFullAccess", all) if err != nil { return nil, fmt.Errorf("%s: get FSFullAccess: %w", acl.Permission, ErrBadACLPermission) @@ -87,7 +87,7 @@ func ACLToStatement(acl model.ACL) (model.Statements, error) { } statements = append(statements, append(ownCredentialsStatement, ciStatement...)...) - case ACLAdmin: + case AdminPermission: statements, err = auth.MakeStatementForPolicyType("AllAccess", []string{permissions.All}) if err != nil { return nil, fmt.Errorf("%s: %w", acl.Permission, ErrBadACLPermission) diff --git a/pkg/auth/acl/write.go b/pkg/auth/acl/write.go index f461646ddb3..6a0912b4523 100644 --- a/pkg/auth/acl/write.go +++ b/pkg/auth/acl/write.go @@ -7,18 +7,17 @@ import ( "fmt" "time" + "github.com/hashicorp/go-multierror" "github.com/treeverse/lakefs/pkg/auth" "github.com/treeverse/lakefs/pkg/auth/model" "github.com/treeverse/lakefs/pkg/logging" - - "github.com/hashicorp/go-multierror" ) const ( - ACLAdminsGroup = "Admins" - ACLSupersGroup = "Supers" - ACLWritersGroup = "Writers" - ACLReadersGroup = "Readers" + AdminsGroup = "Admins" + SupersGroup = "Supers" + WritersGroup = "Writers" + ReadersGroup = "Readers" ) func WriteGroupACL(ctx context.Context, svc auth.Service, groupName string, acl model.ACL, creationTime time.Time, warnIfCreate bool) error { @@ -29,7 +28,7 @@ func WriteGroupACL(ctx context.Context, svc auth.Service, groupName string, acl return fmt.Errorf("%s: translate ACL %+v to statements: %w", groupName, acl, err) } - aclPolicyName := ACLPolicyName(groupName) + aclPolicyName := PolicyName(groupName) policy := &model.Policy{ CreatedAt: creationTime, diff --git a/pkg/auth/service_test.go b/pkg/auth/service_test.go index c9a0d5b3cd3..f94fceea5c3 100644 --- a/pkg/auth/service_test.go +++ b/pkg/auth/service_test.go @@ -22,7 +22,7 @@ import ( "github.com/treeverse/lakefs/pkg/auth/mock" "github.com/treeverse/lakefs/pkg/auth/model" authparams "github.com/treeverse/lakefs/pkg/auth/params" - auth_testutil "github.com/treeverse/lakefs/pkg/auth/testutil" + authtestutil "github.com/treeverse/lakefs/pkg/auth/testutil" "github.com/treeverse/lakefs/pkg/kv/kvtest" "github.com/treeverse/lakefs/pkg/logging" 
"github.com/treeverse/lakefs/pkg/permissions" @@ -212,7 +212,7 @@ func TestAuthService_DeleteUserWithRelations(t *testing.T) { policyNames := []string{"policy01", "policy02", "policy03", "policy04"} ctx := context.Background() - authService, _ := auth_testutil.SetupService(t, ctx, someSecret) + authService, _ := authtestutil.SetupService(t, ctx, someSecret) // create initial data set and verify users groups and policies are create and related as expected createInitialDataSet(t, ctx, authService, userNames, groupNames, policyNames) @@ -280,7 +280,7 @@ func TestAuthService_DeleteGroupWithRelations(t *testing.T) { policyNames := []string{"policy01", "policy02", "policy03", "policy04"} ctx := context.Background() - authService, _ := auth_testutil.SetupService(t, ctx, someSecret) + authService, _ := authtestutil.SetupService(t, ctx, someSecret) // create initial data set and verify users groups and policies are created and related as expected createInitialDataSet(t, ctx, authService, userNames, groupNames, policyNames) @@ -364,7 +364,7 @@ func TestAuthService_DeletePoliciesWithRelations(t *testing.T) { policyNames := []string{"policy01", "policy02", "policy03", "policy04"} ctx := context.Background() - authService, _ := auth_testutil.SetupService(t, ctx, someSecret) + authService, _ := authtestutil.SetupService(t, ctx, someSecret) // create initial data set and verify users groups and policies are create and related as expected createInitialDataSet(t, ctx, authService, userNames, groupNames, policyNames) @@ -585,7 +585,7 @@ func describeAllowed(allowed bool) string { } func TestACL(t *testing.T) { - hierarchy := []model.ACLPermission{acl.ACLRead, acl.ACLWrite, acl.ACLSuper, acl.ACLAdmin} + hierarchy := []model.ACLPermission{acl.ReadPermission, acl.WritePermission, acl.SuperPermission, acl.AdminPermission} type PermissionFrom map[model.ACLPermission][]permissions.Permission type TestCase struct { @@ -604,25 +604,25 @@ func TestACL(t *testing.T) { Name: "all repos", ACL: model.ACL{}, PermissionFrom: PermissionFrom{ - acl.ACLRead: []permissions.Permission{ + acl.ReadPermission: []permissions.Permission{ {Action: permissions.ReadObjectAction, Resource: permissions.ObjectArn("foo", "some/path")}, {Action: permissions.ListObjectsAction, Resource: permissions.ObjectArn("foo", "some/path")}, {Action: permissions.ListObjectsAction, Resource: permissions.ObjectArn("quux", "")}, {Action: permissions.CreateCredentialsAction, Resource: permissions.UserArn("${user}")}, }, - acl.ACLWrite: []permissions.Permission{ + acl.WritePermission: []permissions.Permission{ {Action: permissions.WriteObjectAction, Resource: permissions.ObjectArn("foo", "some/path")}, {Action: permissions.DeleteObjectAction, Resource: permissions.ObjectArn("foo", "some/path")}, {Action: permissions.CreateBranchAction, Resource: permissions.BranchArn("foo", "twig")}, {Action: permissions.CreateCommitAction, Resource: permissions.BranchArn("foo", "twig")}, {Action: permissions.CreateMetaRangeAction, Resource: permissions.RepoArn("foo")}, }, - acl.ACLSuper: []permissions.Permission{ + acl.SuperPermission: []permissions.Permission{ {Action: permissions.AttachStorageNamespaceAction, Resource: permissions.StorageNamespace("storage://bucket/path")}, {Action: permissions.ImportFromStorageAction, Resource: permissions.StorageNamespace("storage://bucket/path")}, {Action: permissions.ImportCancelAction, Resource: permissions.BranchArn("foo", "twig")}, }, - acl.ACLAdmin: []permissions.Permission{ + acl.AdminPermission: []permissions.Permission{ 
{Action: permissions.CreateUserAction, Resource: permissions.UserArn("you")}, }, }, @@ -633,7 +633,7 @@ func TestACL(t *testing.T) { for _, tt := range tests { t.Run(tt.Name, func(t *testing.T) { - s, _ := auth_testutil.SetupService(t, ctx, someSecret) + s, _ := authtestutil.SetupService(t, ctx, someSecret) userID := make(map[model.ACLPermission]string, len(hierarchy)) for _, aclPermission := range hierarchy { tt.ACL.Permission = aclPermission diff --git a/pkg/auth/setup/setup.go b/pkg/auth/setup/setup.go index 8b523f1ec4b..a75bd24794a 100644 --- a/pkg/auth/setup/setup.go +++ b/pkg/auth/setup/setup.go @@ -114,7 +114,7 @@ func attachPolicies(ctx context.Context, authService auth.Service, groupID strin return nil } -func SetupRBACBaseGroups(ctx context.Context, authService auth.Service, ts time.Time) error { +func CreateRBACBaseGroups(ctx context.Context, authService auth.Service, ts time.Time) error { err := createGroups(ctx, authService, []*model.Group{ {CreatedAt: ts, DisplayName: AdminsGroup}, {CreatedAt: ts, DisplayName: SuperUsersGroup}, @@ -150,43 +150,43 @@ func SetupRBACBaseGroups(ctx context.Context, authService auth.Service, ts time. return nil } -func SetupACLBaseGroups(ctx context.Context, authService auth.Service, ts time.Time) error { - if err := authService.CreateGroup(ctx, &model.Group{CreatedAt: ts, DisplayName: acl.ACLAdminsGroup}); err != nil { - return fmt.Errorf("setup: create base ACL group %s: %w", acl.ACLAdminsGroup, err) +func CreateACLBaseGroups(ctx context.Context, authService auth.Service, ts time.Time) error { + if err := authService.CreateGroup(ctx, &model.Group{CreatedAt: ts, DisplayName: acl.AdminsGroup}); err != nil { + return fmt.Errorf("setup: create base ACL group %s: %w", acl.AdminsGroup, err) } - if err := acl.WriteGroupACL(ctx, authService, acl.ACLAdminsGroup, model.ACL{Permission: acl.ACLAdmin}, ts, false); err != nil { + if err := acl.WriteGroupACL(ctx, authService, acl.AdminsGroup, model.ACL{Permission: acl.AdminPermission}, ts, false); err != nil { return fmt.Errorf("setup: %w", err) } - if err := authService.CreateGroup(ctx, &model.Group{CreatedAt: ts, DisplayName: acl.ACLSupersGroup}); err != nil { - return fmt.Errorf("setup: create base ACL group %s: %w", acl.ACLSupersGroup, err) + if err := authService.CreateGroup(ctx, &model.Group{CreatedAt: ts, DisplayName: acl.SupersGroup}); err != nil { + return fmt.Errorf("setup: create base ACL group %s: %w", acl.SupersGroup, err) } - if err := acl.WriteGroupACL(ctx, authService, acl.ACLSupersGroup, model.ACL{Permission: acl.ACLSuper}, ts, false); err != nil { + if err := acl.WriteGroupACL(ctx, authService, acl.SupersGroup, model.ACL{Permission: acl.SuperPermission}, ts, false); err != nil { return fmt.Errorf("setup: %w", err) } - if err := authService.CreateGroup(ctx, &model.Group{CreatedAt: ts, DisplayName: acl.ACLWritersGroup}); err != nil { - return fmt.Errorf("setup: create base ACL group %s: %w", acl.ACLWritersGroup, err) + if err := authService.CreateGroup(ctx, &model.Group{CreatedAt: ts, DisplayName: acl.WritersGroup}); err != nil { + return fmt.Errorf("setup: create base ACL group %s: %w", acl.WritersGroup, err) } - if err := acl.WriteGroupACL(ctx, authService, acl.ACLWritersGroup, model.ACL{Permission: acl.ACLWrite}, ts, false); err != nil { + if err := acl.WriteGroupACL(ctx, authService, acl.WritersGroup, model.ACL{Permission: acl.WritePermission}, ts, false); err != nil { return fmt.Errorf("setup: %w", err) } - if err := authService.CreateGroup(ctx, &model.Group{CreatedAt: ts, DisplayName: 
acl.ACLReadersGroup}); err != nil { - return fmt.Errorf("create base ACL group %s: %w", acl.ACLReadersGroup, err) + if err := authService.CreateGroup(ctx, &model.Group{CreatedAt: ts, DisplayName: acl.ReadersGroup}); err != nil { + return fmt.Errorf("create base ACL group %s: %w", acl.ReadersGroup, err) } - if err := acl.WriteGroupACL(ctx, authService, acl.ACLReadersGroup, model.ACL{Permission: acl.ACLRead}, ts, false); err != nil { + if err := acl.WriteGroupACL(ctx, authService, acl.ReadersGroup, model.ACL{Permission: acl.ReadPermission}, ts, false); err != nil { return fmt.Errorf("setup: %w", err) } return nil } -// SetupAdminUser setup base groups, policies and create admin user -func SetupAdminUser(ctx context.Context, authService auth.Service, cfg *config.Config, superuser *model.SuperuserConfiguration) (*model.Credential, error) { +// CreateAdminUser setup base groups, policies and create admin user +func CreateAdminUser(ctx context.Context, authService auth.Service, cfg *config.Config, superuser *model.SuperuserConfiguration) (*model.Credential, error) { // Set up the basic groups and policies now := time.Now() - err := SetupBaseGroups(ctx, authService, cfg, now) + err := CreateBaseGroups(ctx, authService, cfg, now) if err != nil { return nil, err } @@ -245,7 +245,7 @@ func CreateInitialAdminUserWithKeys(ctx context.Context, authService auth.Servic } // create first admin user - cred, err := SetupAdminUser(ctx, authService, cfg, adminUser) + cred, err := CreateAdminUser(ctx, authService, cfg, adminUser) if err != nil { return nil, err } @@ -257,9 +257,9 @@ func CreateInitialAdminUserWithKeys(ctx context.Context, authService auth.Servic return cred, err } -func SetupBaseGroups(ctx context.Context, authService auth.Service, cfg *config.Config, ts time.Time) error { +func CreateBaseGroups(ctx context.Context, authService auth.Service, cfg *config.Config, ts time.Time) error { if cfg.IsAuthUISimplified() { - return SetupACLBaseGroups(ctx, authService, ts) + return CreateACLBaseGroups(ctx, authService, ts) } - return SetupRBACBaseGroups(ctx, authService, ts) + return CreateRBACBaseGroups(ctx, authService, ts) } diff --git a/pkg/block/blocktest/adapter.go b/pkg/block/blocktest/adapter.go index 5289974ca49..c9a2bfe4db9 100644 --- a/pkg/block/blocktest/adapter.go +++ b/pkg/block/blocktest/adapter.go @@ -58,7 +58,9 @@ func testAdapterPutGet(t *testing.T, adapter block.Adapter, storageNamespace, ex reader, err := adapter.Get(ctx, obj, size) require.NoError(t, err) - defer func() { _ = reader.Close() }() + defer func() { + require.NoError(t, reader.Close()) + }() got, err := io.ReadAll(reader) require.NoError(t, err) require.Equal(t, contents, string(got)) diff --git a/pkg/block/gs/adapter_test.go b/pkg/block/gs/adapter_test.go index 28df73bdc6f..9afbb212904 100644 --- a/pkg/block/gs/adapter_test.go +++ b/pkg/block/gs/adapter_test.go @@ -10,12 +10,11 @@ import ( "github.com/treeverse/lakefs/pkg/block/gs" ) -func getAdapter() *gs.Adapter { - adapter := gs.NewAdapter(client) - return adapter +func newAdapter() *gs.Adapter { + return gs.NewAdapter(client) } -func TestS3Adapter(t *testing.T) { +func TestAdapter(t *testing.T) { basePath, err := url.JoinPath("gs://", bucketName) require.NoError(t, err) localPath, err := url.JoinPath(basePath, "lakefs") @@ -23,12 +22,20 @@ func TestS3Adapter(t *testing.T) { externalPath, err := url.JoinPath(basePath, "external") require.NoError(t, err) - adapter := getAdapter() + adapter := newAdapter() + defer func() { + require.NoError(t, adapter.Close()) + }() + 
blocktest.AdapterTest(t, adapter, localPath, externalPath) } func TestAdapterNamespace(t *testing.T) { - adapter := getAdapter() + adapter := newAdapter() + defer func() { + require.NoError(t, adapter.Close()) + }() + expr, err := regexp.Compile(adapter.GetStorageNamespaceInfo().ValidityRegex) require.NoError(t, err) diff --git a/pkg/block/gs/main_test.go b/pkg/block/gs/main_test.go index addf2e39ea5..7242b1d3965 100644 --- a/pkg/block/gs/main_test.go +++ b/pkg/block/gs/main_test.go @@ -13,32 +13,28 @@ import ( "google.golang.org/api/option" ) -const ( - emulatorContainerTimeoutSeconds = 10 * 60 // 10 min - bucketName = "bucket1" - emulatorTestEndpoint = "127.0.0.1" - emulatorTestPort = "4443" - gcsProjectID = "testProject" -) +const bucketName = "bucket1" -var ( - blockURL string - pool *dockertest.Pool - client *storage.Client -) +var client *storage.Client func TestMain(m *testing.M) { - var err error + const ( + emulatorContainerTimeoutSeconds = 10 * 60 // 10 min + emulatorTestEndpoint = "127.0.0.1" + emulatorTestPort = "4443" + gcsProjectID = "testProject" + ) + ctx := context.Background() - // External port required for -public-host configuration in docker cmd + // External port required for '-public-host' configuration in docker cmd endpoint := fmt.Sprintf("%s:%s", emulatorTestEndpoint, emulatorTestPort) - pool, err = dockertest.NewPool("") + pool, err := dockertest.NewPool("") if err != nil { log.Fatalf("Could not connect to Docker: %s", err) } resource, err := pool.RunWithOptions(&dockertest.RunOptions{ Repository: "fsouza/fake-gcs-server", - Tag: "1.45.2", + Tag: "1.47.4", Cmd: []string{ "-scheme", "http", @@ -55,32 +51,32 @@ func TestMain(m *testing.M) { }, }) if err != nil { - panic(err) + log.Fatalf("Could not start fake-gcs-server: %s", err) } // set cleanup closer := func() { err = pool.Purge(resource) if err != nil { - panic("could not purge emulator container: " + err.Error()) + log.Fatalf("Could not purge fake-gcs-server: %s", err) } } // expire, just to make sure err = resource.Expire(emulatorContainerTimeoutSeconds) if err != nil { - panic("could not expire emulator container: " + err.Error()) + log.Fatalf("Could not expire fake-gcs-server: %s", err) } - // Create test client and bucket - blockURL = fmt.Sprintf("http://%s/storage/v1/", endpoint) + // Create the test client and bucket + blockURL := fmt.Sprintf("http://%s/storage/v1/", endpoint) client, err = storage.NewClient(ctx, option.WithEndpoint(blockURL), option.WithoutAuthentication()) if err != nil { - log.Fatalf("create client: %s", err) + log.Fatalf("Could not create gs client: %s", err) } - if err = client.Bucket(bucketName).Create(ctx, gcsProjectID, nil); err != nil { - log.Fatalf("create bucket: %s", err) + if err := client.Bucket(bucketName).Create(ctx, gcsProjectID, nil); err != nil { + log.Fatalf("Could not create bucket '%s': %s", bucketName, err) } code := m.Run() diff --git a/pkg/gateway/operations/listbuckets.go b/pkg/gateway/operations/listbuckets.go index dc1dd8c0781..370bae99539 100644 --- a/pkg/gateway/operations/listbuckets.go +++ b/pkg/gateway/operations/listbuckets.go @@ -23,7 +23,7 @@ func (controller *ListBuckets) RequiredPermissions(_ *http.Request) (permissions func (controller *ListBuckets) Handle(w http.ResponseWriter, req *http.Request, o *AuthorizedOperation) { o.Incr("list_repos", o.Principal, "", "") - buckets := []serde.Bucket{} + buckets := make([]serde.Bucket, 0) var after string for { // list repositories diff --git a/pkg/graveler/committed/diff_test.go 
b/pkg/graveler/committed/diff_test.go index 4efddbdb78a..1be166d74ec 100644 --- a/pkg/graveler/committed/diff_test.go +++ b/pkg/graveler/committed/diff_test.go @@ -350,7 +350,7 @@ func TestNextRange(t *testing.T) { if it.NextRange() { t.Fatal("expected false from iterator after close") } - if err := it.Err(); err != committed.ErrNoRange { + if err := it.Err(); !errors.Is(err, committed.ErrNoRange) { t.Fatalf("expected to get err=%s, got: %s", committed.ErrNoRange, err) } }) @@ -388,7 +388,7 @@ func TestNextErr(t *testing.T) { val, rng := it.Value() t.Fatalf("unexptected result from it.NextRange(), expected false, got true with value=%v , rng=%v", val, rng) } - if err := it.Err(); err != committed.ErrNoRange { + if err := it.Err(); !errors.Is(err, committed.ErrNoRange) { t.Fatalf("expected to get err=%s, got: %s", committed.ErrNoRange, err) } } diff --git a/pkg/graveler/committed/import_test.go b/pkg/graveler/committed/import_test.go index 547f8b0ae8b..1096c42390d 100644 --- a/pkg/graveler/committed/import_test.go +++ b/pkg/graveler/committed/import_test.go @@ -2,11 +2,13 @@ package committed_test import ( "context" + "errors" + "testing" + "github.com/golang/mock/gomock" "github.com/treeverse/lakefs/pkg/graveler" "github.com/treeverse/lakefs/pkg/graveler/committed" "github.com/treeverse/lakefs/pkg/graveler/committed/mock" - "testing" ) func Test_import(t *testing.T) { @@ -371,8 +373,8 @@ func Test_import(t *testing.T) { writer.EXPECT().Close(gomock.Any()).Return(&metaRangeId, nil).AnyTimes() committedManager := committed.NewCommittedManager(metaRangeManager, rangeManager, params) _, err := committedManager.Import(ctx, "ns", destMetaRangeID, sourceMetaRangeID, tst.prefixes) - if err != expectedResult.expectedErr { - t.Fatal(err) + if !errors.Is(err, expectedResult.expectedErr) { + t.Fatalf("Import error = '%v', expected '%v'", err, expectedResult.expectedErr) } }) } diff --git a/pkg/graveler/committed/merge_test.go b/pkg/graveler/committed/merge_test.go index ad1e9bdc457..ec58c09ae7e 100644 --- a/pkg/graveler/committed/merge_test.go +++ b/pkg/graveler/committed/merge_test.go @@ -1730,8 +1730,8 @@ func runMergeTests(tests testCases, t *testing.T) { writer.EXPECT().Close(gomock.Any()).Return(&metaRangeId, nil).AnyTimes() committedManager := committed.NewCommittedManager(metaRangeManager, rangeManager, params) _, err := committedManager.Merge(ctx, "ns", destMetaRangeID, sourceMetaRangeID, baseMetaRangeID, mergeStrategy) - if err != expectedResult.expectedErr { - t.Fatal(err) + if !errors.Is(err, expectedResult.expectedErr) { + t.Fatalf("Merge error='%v', expected='%v'", err, expectedResult.expectedErr) } }) } diff --git a/pkg/graveler/graveler_test.go b/pkg/graveler/graveler_test.go index 60ec4fc46c2..46bad638af6 100644 --- a/pkg/graveler/graveler_test.go +++ b/pkg/graveler/graveler_test.go @@ -270,7 +270,7 @@ func TestGraveler_Get(t *testing.T) { for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { Value, err := tt.r.Get(context.Background(), repository, "", []byte("key")) - if err != tt.expectedErr { + if !errors.Is(err, tt.expectedErr) { t.Fatalf("wrong error, expected:%v got:%v", tt.expectedErr, err) } if err != nil { @@ -347,7 +347,7 @@ func TestGraveler_Set(t *testing.T) { t.Run(tt.name, func(t *testing.T) { store := newGraveler(t, tt.committedMgr, tt.stagingMgr, tt.refMgr, nil, testutil.NewProtectedBranchesManagerFake()) err := store.Set(ctx, repository, "branch-1", newSetVal.Key, *newSetVal.Value, graveler.WithIfAbsent(tt.ifAbsent)) - if err != tt.expectedErr { + if 
!errors.Is(err, tt.expectedErr) { t.Fatalf("Set() - error: %v, expected: %v", err, tt.expectedErr) } lastVal := tt.stagingMgr.LastSetValueRecord @@ -573,7 +573,7 @@ func TestGravelerGet_Advanced(t *testing.T) { for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { Value, err := tt.r.Get(context.Background(), repository, "", []byte("staged")) - if err != tt.expectedErr { + if !errors.Is(err, tt.expectedErr) { t.Fatalf("wrong error, expected:%v got:%v", tt.expectedErr, err) } if err != nil { @@ -811,7 +811,7 @@ func TestGraveler_Diff(t *testing.T) { t.Run(tt.name, func(t *testing.T) { ctx := context.Background() diff, err := tt.r.Diff(ctx, repository, "ref1", "b1") - if err != tt.expectedErr { + if !errors.Is(err, tt.expectedErr) { t.Fatalf("wrong error, expected:%s got:%s", tt.expectedErr, err) } if err != nil { @@ -924,7 +924,7 @@ func TestGraveler_DiffUncommitted(t *testing.T) { t.Run(tt.name, func(t *testing.T) { ctx := context.Background() diff, err := tt.r.DiffUncommitted(ctx, repository, "branch") - if err != tt.expectedErr { + if !errors.Is(err, tt.expectedErr) { t.Fatalf("wrong error, expected:%s got:%s", tt.expectedErr, err) } if err != nil { diff --git a/pkg/graveler/ref/manager_test.go b/pkg/graveler/ref/manager_test.go index d6cec54a643..c1e132068e6 100644 --- a/pkg/graveler/ref/manager_test.go +++ b/pkg/graveler/ref/manager_test.go @@ -852,7 +852,7 @@ func TestManager_LogGraph(t *testing.T) { nextCommitTS = nextCommitTS.Add(time.Minute) parentIDs := make([]graveler.CommitID, 0, len(parentNames)) for _, parentName := range parentNames { - parentIDs = append(parentIDs, graveler.CommitID(commitNameToID[parentName])) + parentIDs = append(parentIDs, commitNameToID[parentName]) } c := graveler.Commit{ Committer: "user1", diff --git a/pkg/graveler/settings/manager_test.go b/pkg/graveler/settings/manager_test.go index a969d4b1ad5..f7bfd58b0a4 100644 --- a/pkg/graveler/settings/manager_test.go +++ b/pkg/graveler/settings/manager_test.go @@ -208,7 +208,7 @@ func TestEmpty(t *testing.T) { emptySettings := &settings.ExampleSettings{} _, err := m.Get(ctx, repository, "settingKey", emptySettings) // the key was not set, an error should be returned - if err != graveler.ErrNotFound { + if !errors.Is(err, graveler.ErrNotFound) { t.Fatalf("expected error %v, got %v", graveler.ErrNotFound, err) } // when using Update on an unset key, the update function gets an empty setting object to operate on diff --git a/pkg/kv/kvtest/store.go b/pkg/kv/kvtest/store.go index 09ffba34e73..3ef4294c561 100644 --- a/pkg/kv/kvtest/store.go +++ b/pkg/kv/kvtest/store.go @@ -297,7 +297,7 @@ func testStoreSetIf(t *testing.T, ms MakeStore) { t.Run("predicate_empty_slice", func(t *testing.T) { key := uniqueKey("predicate-empty-slice") - val1 := []byte{} + val1 := make([]byte, 0) err := store.Set(ctx, []byte(testPartitionKey), key, val1) if err != nil { t.Fatalf("Set while testing predicate empty slice - key=%s value=%s: %s", key, val1, err) @@ -561,7 +561,7 @@ func testStoreContextCancelled(t *testing.T, ms MakeStore) { // cancel the context for all requests cancel() t.Run("Set", func(t *testing.T) { - // set test key with value1 + // set the test key with value1 err := store.Set(ctx, []byte(testPartitionKey), testKey, testValue1) if err != nil && !errors.Is(err, context.Canceled) { t.Fatalf("expected context cancellation error, got: %s", err) diff --git a/pkg/kv/migrations/migrations_test.go b/pkg/kv/migrations/migrations_test.go index b8b63c24341..5cccb6529b5 100644 --- a/pkg/kv/migrations/migrations_test.go 
+++ b/pkg/kv/migrations/migrations_test.go @@ -27,13 +27,13 @@ func TestGetMinPermission(t *testing.T) { Action string Permission model.ACLPermission }{ - {Action: permissions.ReadObjectAction, Permission: acl.ACLRead}, - {Action: "fs:Read*", Permission: acl.ACLRead}, - {Action: "fs:ReadO*", Permission: acl.ACLRead}, - {Action: permissions.ListObjectsAction, Permission: acl.ACLRead}, - {Action: "fs:List*", Permission: acl.ACLRead}, - {Action: permissions.ReadActionsAction, Permission: acl.ACLWrite}, - {Action: "fs:WriteO?ject", Permission: acl.ACLWrite}, + {Action: permissions.ReadObjectAction, Permission: acl.ReadPermission}, + {Action: "fs:Read*", Permission: acl.ReadPermission}, + {Action: "fs:ReadO*", Permission: acl.ReadPermission}, + {Action: permissions.ListObjectsAction, Permission: acl.ReadPermission}, + {Action: "fs:List*", Permission: acl.ReadPermission}, + {Action: permissions.ReadActionsAction, Permission: acl.WritePermission}, + {Action: "fs:WriteO?ject", Permission: acl.WritePermission}, } mig := migrations.NewACLsMigrator(nil, false) @@ -59,51 +59,51 @@ func TestComputePermission(t *testing.T) { { Name: "read-all", Actions: auth.GetActionsForPolicyTypeOrDie("FSRead"), - Permission: acl.ACLRead, + Permission: acl.ReadPermission, }, { Name: "read-one", Actions: []string{permissions.ReadRepositoryAction}, - Permission: acl.ACLRead, + Permission: acl.ReadPermission, }, { Name: "read-two", Actions: []string{permissions.ListObjectsAction, permissions.ReadTagAction}, - Permission: acl.ACLRead, + Permission: acl.ReadPermission, }, { Name: "only-own-credentials", Actions: auth.GetActionsForPolicyTypeOrDie("AuthManageOwnCredentials"), - Permission: acl.ACLRead, + Permission: acl.ReadPermission, }, { Name: "write-all", Actions: auth.GetActionsForPolicyTypeOrDie("FSReadWrite"), - Permission: acl.ACLWrite, + Permission: acl.WritePermission, }, { Name: "write-one", Actions: []string{permissions.WriteObjectAction}, - Permission: acl.ACLWrite, + Permission: acl.WritePermission, }, { Name: "write-one-read-one-create-one", Actions: []string{permissions.CreateCommitAction, permissions.ReadObjectAction, permissions.CreateMetaRangeAction}, - Permission: acl.ACLWrite, + Permission: acl.WritePermission, }, { Name: "super-all", Actions: auth.GetActionsForPolicyTypeOrDie("FSFullAccess"), - Permission: acl.ACLSuper, + Permission: acl.SuperPermission, }, { Name: "super-one", Actions: []string{permissions.AttachStorageNamespaceAction}, - Permission: acl.ACLSuper, + Permission: acl.SuperPermission, }, { Name: "super-one-write-one-read-two", Actions: []string{permissions.CreateTagAction, permissions.AttachStorageNamespaceAction, permissions.ReadConfigAction, permissions.ReadRepositoryAction}, - Permission: acl.ACLSuper, + Permission: acl.SuperPermission, }, { Name: "admin-all", Actions: auth.GetActionsForPolicyTypeOrDie("AllAccess"), - Permission: acl.ACLAdmin, + Permission: acl.AdminPermission, }, { Name: "admin-one", Actions: []string{permissions.SetGarbageCollectionRulesAction}, - Permission: acl.ACLAdmin, + Permission: acl.AdminPermission, }, } @@ -125,7 +125,7 @@ func TestComputePermission(t *testing.T) { } func TestBroaderPermission(t *testing.T) { - perms := []model.ACLPermission{"", acl.ACLRead, acl.ACLWrite, acl.ACLSuper, acl.ACLAdmin} + perms := []model.ACLPermission{"", acl.ReadPermission, acl.WritePermission, acl.SuperPermission, acl.AdminPermission} for i, a := range perms { for j, b := range perms { after := i > j @@ -164,7 +164,7 @@ func TestNewACLForPolicies_Generator(t *testing.T) { ctx 
:= context.Background() svc, _ := authtestutil.SetupService(t, ctx, []byte("shh...")) - err := setup.SetupRBACBaseGroups(ctx, svc, now) + err := setup.CreateRBACBaseGroups(ctx, svc, now) if err != nil { t.Fatal(err) } @@ -182,25 +182,25 @@ func TestNewACLForPolicies_Generator(t *testing.T) { Name: "ExactlyFSFullAccess", Policies: []*model.Policy{getPolicy(t, ctx, svc, "FSFullAccess")}, ACL: model.ACL{ - Permission: acl.ACLSuper, + Permission: acl.SuperPermission, }, }, { Name: "GroupSuperUsers", Policies: getPolicies(t, ctx, svc, "SuperUsers"), ACL: model.ACL{ - Permission: acl.ACLSuper, + Permission: acl.SuperPermission, }, }, { Name: "ExactlyFSReadAll", Policies: []*model.Policy{getPolicy(t, ctx, svc, "FSReadAll")}, ACL: model.ACL{ - Permission: acl.ACLRead, + Permission: acl.ReadPermission, }, }, { Name: "GroupViewers", Policies: getPolicies(t, ctx, svc, "Viewers"), ACL: model.ACL{ - Permission: acl.ACLRead, + Permission: acl.ReadPermission, }, }, } @@ -349,6 +349,7 @@ func TestMigrateImportPermissions(t *testing.T) { func createARN(name string) string { return fmt.Sprintf("arn:%s:this:is:an:arn", name) } + func verifyMigration(t *testing.T, ctx context.Context, authService *auth.AuthService, policies []model.Policy, cfg config.Config) { for _, prev := range policies { policy, err := authService.GetPolicy(ctx, prev.DisplayName) diff --git a/pkg/kv/migrations/rbac_to_acl.go b/pkg/kv/migrations/rbac_to_acl.go index 2b41038ecaa..6c2a92ad936 100644 --- a/pkg/kv/migrations/rbac_to_acl.go +++ b/pkg/kv/migrations/rbac_to_acl.go @@ -29,7 +29,7 @@ const ( var ( // ErrTooMany is returned when this migration does not support a // particular number of resources. It should not occur on any - // reasonably-sized installation. + // reasonably sized installation. ErrTooMany = errors.New("too many") ErrTooManyPolicies = fmt.Errorf("%w policies", ErrTooMany) ErrTooManyGroups = fmt.Errorf("%w groups", ErrTooMany) @@ -42,7 +42,7 @@ var ( // allPermissions lists all permissions, from most restrictive to // most permissive. It includes "" for some edge cases. - allPermissions = []model.ACLPermission{"", acl.ACLRead, acl.ACLWrite, acl.ACLSuper, acl.ACLAdmin} + allPermissions = []model.ACLPermission{"", acl.ReadPermission, acl.WritePermission, acl.SuperPermission, acl.AdminPermission} ) func MigrateToACL(ctx context.Context, kvStore kv.Store, cfg *config.Config, logger logging.Logger, version int, force bool) error { @@ -132,9 +132,9 @@ func reportACL(acl model.ACL) string { } // checkPolicyACLName fails if policy name is named as an ACL policy (start -// with ACLPolicyPrefix) but is not an ACL policy. +// with PolicyPrefix) but is not an ACL policy. 
func checkPolicyACLName(ctx context.Context, svc auth.Service, name string) error { - if !acl.IsACLPolicyName(name) { + if !acl.IsPolicyName(name) { return nil } @@ -187,7 +187,7 @@ func rbacToACL(ctx context.Context, svc auth.Service, doUpdate bool, creationTim "acl": fmt.Sprintf("%+v", newACL), }).Info("Computed ACL") - aclPolicyName := acl.ACLPolicyName(group.DisplayName) + aclPolicyName := acl.PolicyName(group.DisplayName) err = checkPolicyACLName(ctx, svc, aclPolicyName) if err != nil { warnings = multierror.Append(warnings, warn) @@ -259,7 +259,7 @@ type ACLsMigrator struct { } func makeSet(allEls ...[]string) map[string]struct{} { - ret := make(map[string]struct{}, 0) + ret := make(map[string]struct{}) for _, els := range allEls { for _, el := range els { ret[el] = struct{}{} @@ -277,10 +277,10 @@ func NewACLsMigrator(svc auth.Service, doUpdate bool) *ACLsMigrator { svc: svc, doUpdate: doUpdate, Actions: map[model.ACLPermission]map[string]struct{}{ - acl.ACLAdmin: makeSet(auth.GetActionsForPolicyTypeOrDie("AllAccess")), - acl.ACLSuper: makeSet(auth.GetActionsForPolicyTypeOrDie("FSFullAccess"), manageOwnCredentials, ciRead), - acl.ACLWrite: makeSet(auth.GetActionsForPolicyTypeOrDie("FSReadWrite"), manageOwnCredentials, ciRead), - acl.ACLRead: makeSet(auth.GetActionsForPolicyTypeOrDie("FSRead"), manageOwnCredentials), + acl.AdminPermission: makeSet(auth.GetActionsForPolicyTypeOrDie("AllAccess")), + acl.SuperPermission: makeSet(auth.GetActionsForPolicyTypeOrDie("FSFullAccess"), manageOwnCredentials, ciRead), + acl.WritePermission: makeSet(auth.GetActionsForPolicyTypeOrDie("FSReadWrite"), manageOwnCredentials, ciRead), + acl.ReadPermission: makeSet(auth.GetActionsForPolicyTypeOrDie("FSRead"), manageOwnCredentials), }, } } @@ -354,7 +354,7 @@ func (mig *ACLsMigrator) GetMinPermission(action string) model.ACLPermission { } // Try a wildcard match against all known actions: find the least - // permissions that allows all actions that the action pattern + // permission that allows all actions that the action pattern // matches. for _, permission := range allPermissions { // This loop is reasonably efficient only for small numbers @@ -363,11 +363,11 @@ func (mig *ACLsMigrator) GetMinPermission(action string) model.ACLPermission { permissionOK := true for _, a := range permissions.Actions { if !wildcard.Match(action, a) { - // a does not include action. + // 'a' does not include action. continue } if someActionMatches(a, actionsForPermission) { - // a is allowed at permission. + // 'a' is allowed at permission. 
continue } permissionOK = false @@ -402,7 +402,7 @@ func (mig *ACLsMigrator) ComputePermission(ctx context.Context, actions []string } } if permission == "" { - return permission, fmt.Errorf("%w actions", ErrEmpty) + return "", fmt.Errorf("%w actions", ErrEmpty) } return permission, nil @@ -413,15 +413,15 @@ func (mig *ACLsMigrator) ComputePermission(ctx context.Context, actions []string func (mig *ACLsMigrator) ComputeAddedActions(permission model.ACLPermission, alreadyAllowedActions map[string]struct{}) []string { var allAllowedActions map[string]struct{} switch permission { - case acl.ACLRead: - allAllowedActions = mig.Actions[acl.ACLRead] - case acl.ACLWrite: - allAllowedActions = mig.Actions[acl.ACLWrite] - case acl.ACLSuper: - allAllowedActions = mig.Actions[acl.ACLSuper] - case acl.ACLAdmin: + case acl.ReadPermission: + allAllowedActions = mig.Actions[acl.ReadPermission] + case acl.WritePermission: + allAllowedActions = mig.Actions[acl.WritePermission] + case acl.SuperPermission: + allAllowedActions = mig.Actions[acl.SuperPermission] + case acl.AdminPermission: default: - allAllowedActions = mig.Actions[acl.ACLAdmin] + allAllowedActions = mig.Actions[acl.AdminPermission] } addedActions := make(map[string]struct{}, len(allAllowedActions)) for _, action := range permissions.Actions { @@ -441,14 +441,14 @@ func BroaderPermission(a, b model.ACLPermission) bool { switch a { case "": return false - case acl.ACLRead: + case acl.ReadPermission: return b == "" - case acl.ACLWrite: - return b == "" || b == acl.ACLRead - case acl.ACLSuper: - return b == "" || b == acl.ACLRead || b == acl.ACLWrite - case acl.ACLAdmin: - return b == "" || b == acl.ACLRead || b == acl.ACLWrite || b == acl.ACLSuper + case acl.WritePermission: + return b == "" || b == acl.ReadPermission + case acl.SuperPermission: + return b == "" || b == acl.ReadPermission || b == acl.WritePermission + case acl.AdminPermission: + return b == "" || b == acl.ReadPermission || b == acl.WritePermission || b == acl.SuperPermission } panic(fmt.Sprintf("impossible comparison %s and %s", a, b)) } diff --git a/pkg/loadtest/local_load_test.go b/pkg/loadtest/local_load_test.go index a1b9797105f..751fc79afa0 100644 --- a/pkg/loadtest/local_load_test.go +++ b/pkg/loadtest/local_load_test.go @@ -73,7 +73,7 @@ func TestLocalLoad(t *testing.T) { actionsService := actions.NewService(ctx, actions.NewActionsKVStore(kvStore), source, outputWriter, &actions.DecreasingIDGenerator{}, &stats.NullCollector{}, actions.Config{Enabled: true}) c.SetHooksHandler(actionsService) - credentials, err := setup.SetupAdminUser(ctx, authService, conf, superuser) + credentials, err := setup.CreateAdminUser(ctx, authService, conf, superuser) testutil.Must(t, err) authenticator := auth.NewBuiltinAuthenticator(authService) diff --git a/pkg/samplerepo/samplecontent.go b/pkg/samplerepo/samplecontent.go index eb10a7ec854..cbda2829356 100644 --- a/pkg/samplerepo/samplecontent.go +++ b/pkg/samplerepo/samplecontent.go @@ -105,7 +105,6 @@ func PopulateSampleRepo(ctx context.Context, repo *catalog.Repository, cat catal return nil }) - if err != nil { return err } @@ -118,10 +117,7 @@ func PopulateSampleRepo(ctx context.Context, repo *catalog.Repository, cat catal return err } -func SampleRepoAddBranchProtection(ctx context.Context, repo *catalog.Repository, cat catalog.Interface) error { - // Set branch protection on main branch - - err := cat.CreateBranchProtectionRule(ctx, repo.Name, repo.DefaultBranch, 
[]graveler.BranchProtectionBlockedAction{graveler.BranchProtectionBlockedAction_COMMIT}) - - return err +func AddBranchProtection(ctx context.Context, repo *catalog.Repository, cat catalog.Interface) error { + // Set branch protection on the main branch + return cat.CreateBranchProtectionRule(ctx, repo.Name, repo.DefaultBranch, []graveler.BranchProtectionBlockedAction{graveler.BranchProtectionBlockedAction_COMMIT}) } From bc5bfaf564dd616594cccf7159b158edb452dd69 Mon Sep 17 00:00:00 2001 From: Barak Amar Date: Wed, 20 Sep 2023 18:56:13 +0800 Subject: [PATCH 2/5] Fix UI alert if quickstart repository failed on local (#6622) --- webui/src/pages/repositories/index.jsx | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/webui/src/pages/repositories/index.jsx b/webui/src/pages/repositories/index.jsx index 9e335494e57..52fe16cd7a3 100644 --- a/webui/src/pages/repositories/index.jsx +++ b/webui/src/pages/repositories/index.jsx @@ -26,6 +26,7 @@ import {useRouter} from "../../lib/hooks/router"; import {Route, Routes} from "react-router-dom"; import RepositoryPage from './repository'; import Button from "react-bootstrap/Button"; +import Alert from "react-bootstrap/Alert"; dayjs.extend(relativeTime); @@ -95,7 +96,7 @@ const CreateRepositoryModal = ({show, error, onSubmit, onCancel, inProgress, sam ); }; -const GetStarted = ({onCreateSampleRepo, onCreateEmptyRepo, creatingRepo}) => { +const GetStarted = ({onCreateSampleRepo, onCreateEmptyRepo, creatingRepo, createRepoError }) => { return (

Welcome to lakeFS!

@@ -113,16 +114,25 @@ const GetStarted = ({onCreateSampleRepo, onCreateEmptyRepo, creatingRepo}) => { } creatingRepo={creatingRepo} variant={"success"} enabled={true} onClick={onCreateSampleRepo} /> + {createRepoError && + + + {createRepoError.message} + + + } +
Already working with lakeFS and just need an empty repository?
+ getting-started
); }; -const RepositoryList = ({ onPaginate, prefix, after, refresh, onCreateSampleRepo, onCreateEmptyRepo, toggleShowActionsBar, creatingRepo }) => { +const RepositoryList = ({ onPaginate, prefix, after, refresh, onCreateSampleRepo, onCreateEmptyRepo, toggleShowActionsBar, creatingRepo, createRepoError }) => { const {results, loading, error, nextPage} = useAPIWithPagination(() => { return repositories.list(prefix, after); @@ -131,7 +141,7 @@ const RepositoryList = ({ onPaginate, prefix, after, refresh, onCreateSampleRepo if (loading) return ; if (error) return ; if (!after && !prefix && results.length === 0) { - return ; + return ; } toggleShowActionsBar(); @@ -272,6 +282,7 @@ const RepositoriesPage = () => { onCreateEmptyRepo={createRepositoryButtonCallback} toggleShowActionsBar={toggleShowActionsBar} creatingRepo={creatingRepo} + createRepoError={createRepoError} /> Date: Wed, 20 Sep 2023 15:09:38 +0300 Subject: [PATCH 3/5] do not use cache for navs (#6633) --- docs/_layouts/default.html | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/_layouts/default.html b/docs/_layouts/default.html index e76ee078e4a..0c279c8a921 100644 --- a/docs/_layouts/default.html +++ b/docs/_layouts/default.html @@ -114,7 +114,7 @@ }); - {% include_cached nav.html pages=site.html_pages %} + {% include nav.html pages=site.html_pages %}
{% include header_menu.html %}
From 7e1be21170530fa73a8447e45477588ecc0c489f Mon Sep 17 00:00:00 2001 From: Yoni Date: Wed, 20 Sep 2023 16:07:14 +0300 Subject: [PATCH 4/5] API cleanup: make createSymlinkFile internal (#6630) --- api/swagger.yml | 2 +- clients/java/README.md | 2 +- clients/java/api/openapi.yaml | 2 +- clients/java/docs/InternalApi.md | 96 ++++++++++++ clients/java/docs/MetadataApi.md | 96 ------------ .../io/lakefs/clients/api/InternalApi.java | 143 ++++++++++++++++++ .../io/lakefs/clients/api/MetadataApi.java | 142 ----------------- .../lakefs/clients/api/InternalApiTest.java | 18 +++ .../lakefs/clients/api/MetadataApiTest.java | 17 --- clients/python/README.md | 2 +- clients/python/docs/InternalApi.md | 121 +++++++++++++++ clients/python/docs/MetadataApi.md | 121 --------------- .../python/lakefs_client/api/internal_api.py | 137 +++++++++++++++++ .../python/lakefs_client/api/metadata_api.py | 136 ----------------- clients/python/test/test_internal_api.py | 7 + clients/python/test/test_metadata_api.py | 7 - docs/assets/js/swagger.yml | 2 +- 17 files changed, 527 insertions(+), 524 deletions(-) diff --git a/api/swagger.yml b/api/swagger.yml index 3b9ebba8be6..291499d100d 100644 --- a/api/swagger.yml +++ b/api/swagger.yml @@ -3810,7 +3810,7 @@ paths: description: path to the table data post: tags: - - metadata + - internal operationId: createSymlinkFile summary: creates symlink files corresponding to the given directory diff --git a/clients/java/README.md b/clients/java/README.md index 441a6dcf064..b2ca82ec224 100644 --- a/clients/java/README.md +++ b/clients/java/README.md @@ -190,6 +190,7 @@ Class | Method | HTTP request | Description *ImportApi* | [**importStart**](docs/ImportApi.md#importStart) | **POST** /repositories/{repository}/branches/{branch}/import | import data from object store *ImportApi* | [**importStatus**](docs/ImportApi.md#importStatus) | **GET** /repositories/{repository}/branches/{branch}/import | get import status *InternalApi* | [**createBranchProtectionRulePreflight**](docs/InternalApi.md#createBranchProtectionRulePreflight) | **GET** /repositories/{repository}/branch_protection/set_allowed | +*InternalApi* | [**createSymlinkFile**](docs/InternalApi.md#createSymlinkFile) | **POST** /repositories/{repository}/refs/{branch}/symlink | creates symlink files corresponding to the given directory *InternalApi* | [**getAuthCapabilities**](docs/InternalApi.md#getAuthCapabilities) | **GET** /auth/capabilities | list authentication capabilities supported *InternalApi* | [**getSetupState**](docs/InternalApi.md#getSetupState) | **GET** /setup_lakefs | check if the lakeFS installation is already set up *InternalApi* | [**postStatsEvents**](docs/InternalApi.md#postStatsEvents) | **POST** /statistics | post stats events, this endpoint is meant for internal use only @@ -197,7 +198,6 @@ Class | Method | HTTP request | Description *InternalApi* | [**setup**](docs/InternalApi.md#setup) | **POST** /setup_lakefs | setup lakeFS and create a first user *InternalApi* | [**setupCommPrefs**](docs/InternalApi.md#setupCommPrefs) | **POST** /setup_comm_prefs | setup communications preferences *InternalApi* | [**uploadObjectPreflight**](docs/InternalApi.md#uploadObjectPreflight) | **GET** /repositories/{repository}/branches/{branch}/objects/stage_allowed | -*MetadataApi* | [**createSymlinkFile**](docs/MetadataApi.md#createSymlinkFile) | **POST** /repositories/{repository}/refs/{branch}/symlink | creates symlink files corresponding to the given directory *MetadataApi* | 
[**getMetaRange**](docs/MetadataApi.md#getMetaRange) | **GET** /repositories/{repository}/metadata/meta_range/{meta_range} | return URI to a meta-range file *MetadataApi* | [**getRange**](docs/MetadataApi.md#getRange) | **GET** /repositories/{repository}/metadata/range/{range} | return URI to a range file *ObjectsApi* | [**copyObject**](docs/ObjectsApi.md#copyObject) | **POST** /repositories/{repository}/branches/{branch}/objects/copy | create a copy of an object diff --git a/clients/java/api/openapi.yaml b/clients/java/api/openapi.yaml index 1c53abe4a5d..939d4d31d5f 100644 --- a/clients/java/api/openapi.yaml +++ b/clients/java/api/openapi.yaml @@ -4418,7 +4418,7 @@ paths: description: Internal Server Error summary: creates symlink files corresponding to the given directory tags: - - metadata + - internal x-accepts: application/json /repositories/{repository}/actions/runs: get: diff --git a/clients/java/docs/InternalApi.md b/clients/java/docs/InternalApi.md index 9471c000d80..3e631593959 100644 --- a/clients/java/docs/InternalApi.md +++ b/clients/java/docs/InternalApi.md @@ -5,6 +5,7 @@ All URIs are relative to *http://localhost/api/v1* Method | HTTP request | Description ------------- | ------------- | ------------- [**createBranchProtectionRulePreflight**](InternalApi.md#createBranchProtectionRulePreflight) | **GET** /repositories/{repository}/branch_protection/set_allowed | +[**createSymlinkFile**](InternalApi.md#createSymlinkFile) | **POST** /repositories/{repository}/refs/{branch}/symlink | creates symlink files corresponding to the given directory [**getAuthCapabilities**](InternalApi.md#getAuthCapabilities) | **GET** /auth/capabilities | list authentication capabilities supported [**getSetupState**](InternalApi.md#getSetupState) | **GET** /setup_lakefs | check if the lakeFS installation is already set up [**postStatsEvents**](InternalApi.md#postStatsEvents) | **POST** /statistics | post stats events, this endpoint is meant for internal use only @@ -105,6 +106,101 @@ null (empty response body) **409** | Resource Conflicts With Target | - | **0** | Internal Server Error | - | + +# **createSymlinkFile** +> StorageURI createSymlinkFile(repository, branch, location) + +creates symlink files corresponding to the given directory + +### Example +```java +// Import classes: +import io.lakefs.clients.api.ApiClient; +import io.lakefs.clients.api.ApiException; +import io.lakefs.clients.api.Configuration; +import io.lakefs.clients.api.auth.*; +import io.lakefs.clients.api.models.*; +import io.lakefs.clients.api.InternalApi; + +public class Example { + public static void main(String[] args) { + ApiClient defaultClient = Configuration.getDefaultApiClient(); + defaultClient.setBasePath("http://localhost/api/v1"); + + // Configure HTTP basic authorization: basic_auth + HttpBasicAuth basic_auth = (HttpBasicAuth) defaultClient.getAuthentication("basic_auth"); + basic_auth.setUsername("YOUR USERNAME"); + basic_auth.setPassword("YOUR PASSWORD"); + + // Configure API key authorization: cookie_auth + ApiKeyAuth cookie_auth = (ApiKeyAuth) defaultClient.getAuthentication("cookie_auth"); + cookie_auth.setApiKey("YOUR API KEY"); + // Uncomment the following line to set a prefix for the API key, e.g. 
"Token" (defaults to null) + //cookie_auth.setApiKeyPrefix("Token"); + + // Configure HTTP bearer authorization: jwt_token + HttpBearerAuth jwt_token = (HttpBearerAuth) defaultClient.getAuthentication("jwt_token"); + jwt_token.setBearerToken("BEARER TOKEN"); + + // Configure API key authorization: oidc_auth + ApiKeyAuth oidc_auth = (ApiKeyAuth) defaultClient.getAuthentication("oidc_auth"); + oidc_auth.setApiKey("YOUR API KEY"); + // Uncomment the following line to set a prefix for the API key, e.g. "Token" (defaults to null) + //oidc_auth.setApiKeyPrefix("Token"); + + // Configure API key authorization: saml_auth + ApiKeyAuth saml_auth = (ApiKeyAuth) defaultClient.getAuthentication("saml_auth"); + saml_auth.setApiKey("YOUR API KEY"); + // Uncomment the following line to set a prefix for the API key, e.g. "Token" (defaults to null) + //saml_auth.setApiKeyPrefix("Token"); + + InternalApi apiInstance = new InternalApi(defaultClient); + String repository = "repository_example"; // String | + String branch = "branch_example"; // String | + String location = "location_example"; // String | path to the table data + try { + StorageURI result = apiInstance.createSymlinkFile(repository, branch, location); + System.out.println(result); + } catch (ApiException e) { + System.err.println("Exception when calling InternalApi#createSymlinkFile"); + System.err.println("Status code: " + e.getCode()); + System.err.println("Reason: " + e.getResponseBody()); + System.err.println("Response headers: " + e.getResponseHeaders()); + e.printStackTrace(); + } + } +} +``` + +### Parameters + +Name | Type | Description | Notes +------------- | ------------- | ------------- | ------------- + **repository** | **String**| | + **branch** | **String**| | + **location** | **String**| path to the table data | [optional] + +### Return type + +[**StorageURI**](StorageURI.md) + +### Authorization + +[basic_auth](../README.md#basic_auth), [cookie_auth](../README.md#cookie_auth), [jwt_token](../README.md#jwt_token), [oidc_auth](../README.md#oidc_auth), [saml_auth](../README.md#saml_auth) + +### HTTP request headers + + - **Content-Type**: Not defined + - **Accept**: application/json + +### HTTP response details +| Status code | Description | Response headers | +|-------------|-------------|------------------| +**201** | location created | - | +**401** | Unauthorized | - | +**404** | Resource Not Found | - | +**0** | Internal Server Error | - | + # **getAuthCapabilities** > AuthCapabilities getAuthCapabilities() diff --git a/clients/java/docs/MetadataApi.md b/clients/java/docs/MetadataApi.md index d9f666f7c27..f4d70ba1d10 100644 --- a/clients/java/docs/MetadataApi.md +++ b/clients/java/docs/MetadataApi.md @@ -4,106 +4,10 @@ All URIs are relative to *http://localhost/api/v1* Method | HTTP request | Description ------------- | ------------- | ------------- -[**createSymlinkFile**](MetadataApi.md#createSymlinkFile) | **POST** /repositories/{repository}/refs/{branch}/symlink | creates symlink files corresponding to the given directory [**getMetaRange**](MetadataApi.md#getMetaRange) | **GET** /repositories/{repository}/metadata/meta_range/{meta_range} | return URI to a meta-range file [**getRange**](MetadataApi.md#getRange) | **GET** /repositories/{repository}/metadata/range/{range} | return URI to a range file - -# **createSymlinkFile** -> StorageURI createSymlinkFile(repository, branch, location) - -creates symlink files corresponding to the given directory - -### Example -```java -// Import classes: -import 
io.lakefs.clients.api.ApiClient; -import io.lakefs.clients.api.ApiException; -import io.lakefs.clients.api.Configuration; -import io.lakefs.clients.api.auth.*; -import io.lakefs.clients.api.models.*; -import io.lakefs.clients.api.MetadataApi; - -public class Example { - public static void main(String[] args) { - ApiClient defaultClient = Configuration.getDefaultApiClient(); - defaultClient.setBasePath("http://localhost/api/v1"); - - // Configure HTTP basic authorization: basic_auth - HttpBasicAuth basic_auth = (HttpBasicAuth) defaultClient.getAuthentication("basic_auth"); - basic_auth.setUsername("YOUR USERNAME"); - basic_auth.setPassword("YOUR PASSWORD"); - - // Configure API key authorization: cookie_auth - ApiKeyAuth cookie_auth = (ApiKeyAuth) defaultClient.getAuthentication("cookie_auth"); - cookie_auth.setApiKey("YOUR API KEY"); - // Uncomment the following line to set a prefix for the API key, e.g. "Token" (defaults to null) - //cookie_auth.setApiKeyPrefix("Token"); - - // Configure HTTP bearer authorization: jwt_token - HttpBearerAuth jwt_token = (HttpBearerAuth) defaultClient.getAuthentication("jwt_token"); - jwt_token.setBearerToken("BEARER TOKEN"); - - // Configure API key authorization: oidc_auth - ApiKeyAuth oidc_auth = (ApiKeyAuth) defaultClient.getAuthentication("oidc_auth"); - oidc_auth.setApiKey("YOUR API KEY"); - // Uncomment the following line to set a prefix for the API key, e.g. "Token" (defaults to null) - //oidc_auth.setApiKeyPrefix("Token"); - - // Configure API key authorization: saml_auth - ApiKeyAuth saml_auth = (ApiKeyAuth) defaultClient.getAuthentication("saml_auth"); - saml_auth.setApiKey("YOUR API KEY"); - // Uncomment the following line to set a prefix for the API key, e.g. "Token" (defaults to null) - //saml_auth.setApiKeyPrefix("Token"); - - MetadataApi apiInstance = new MetadataApi(defaultClient); - String repository = "repository_example"; // String | - String branch = "branch_example"; // String | - String location = "location_example"; // String | path to the table data - try { - StorageURI result = apiInstance.createSymlinkFile(repository, branch, location); - System.out.println(result); - } catch (ApiException e) { - System.err.println("Exception when calling MetadataApi#createSymlinkFile"); - System.err.println("Status code: " + e.getCode()); - System.err.println("Reason: " + e.getResponseBody()); - System.err.println("Response headers: " + e.getResponseHeaders()); - e.printStackTrace(); - } - } -} -``` - -### Parameters - -Name | Type | Description | Notes -------------- | ------------- | ------------- | ------------- - **repository** | **String**| | - **branch** | **String**| | - **location** | **String**| path to the table data | [optional] - -### Return type - -[**StorageURI**](StorageURI.md) - -### Authorization - -[basic_auth](../README.md#basic_auth), [cookie_auth](../README.md#cookie_auth), [jwt_token](../README.md#jwt_token), [oidc_auth](../README.md#oidc_auth), [saml_auth](../README.md#saml_auth) - -### HTTP request headers - - - **Content-Type**: Not defined - - **Accept**: application/json - -### HTTP response details -| Status code | Description | Response headers | -|-------------|-------------|------------------| -**201** | location created | - | -**401** | Unauthorized | - | -**404** | Resource Not Found | - | -**0** | Internal Server Error | - | - # **getMetaRange** > StorageURI getMetaRange(repository, metaRange) diff --git a/clients/java/src/main/java/io/lakefs/clients/api/InternalApi.java 
b/clients/java/src/main/java/io/lakefs/clients/api/InternalApi.java index 554c1b6479a..90f2d882d65 100644 --- a/clients/java/src/main/java/io/lakefs/clients/api/InternalApi.java +++ b/clients/java/src/main/java/io/lakefs/clients/api/InternalApi.java @@ -34,6 +34,7 @@ import io.lakefs.clients.api.model.Setup; import io.lakefs.clients.api.model.SetupState; import io.lakefs.clients.api.model.StatsEventsList; +import io.lakefs.clients.api.model.StorageURI; import java.lang.reflect.Type; import java.util.ArrayList; @@ -184,6 +185,148 @@ public okhttp3.Call createBranchProtectionRulePreflightAsync(String repository, localVarApiClient.executeAsync(localVarCall, _callback); return localVarCall; } + /** + * Build call for createSymlinkFile + * @param repository (required) + * @param branch (required) + * @param location path to the table data (optional) + * @param _callback Callback for upload/download progress + * @return Call to execute + * @throws ApiException If fail to serialize the request body object + * @http.response.details + + + + + + +
Status Code | Description | Response Headers
201 | location created | -
401 | Unauthorized | -
404 | Resource Not Found | -
0 | Internal Server Error | -
+ */ + public okhttp3.Call createSymlinkFileCall(String repository, String branch, String location, final ApiCallback _callback) throws ApiException { + Object localVarPostBody = null; + + // create path and map variables + String localVarPath = "/repositories/{repository}/refs/{branch}/symlink" + .replaceAll("\\{" + "repository" + "\\}", localVarApiClient.escapeString(repository.toString())) + .replaceAll("\\{" + "branch" + "\\}", localVarApiClient.escapeString(branch.toString())); + + List localVarQueryParams = new ArrayList(); + List localVarCollectionQueryParams = new ArrayList(); + Map localVarHeaderParams = new HashMap(); + Map localVarCookieParams = new HashMap(); + Map localVarFormParams = new HashMap(); + + if (location != null) { + localVarQueryParams.addAll(localVarApiClient.parameterToPair("location", location)); + } + + final String[] localVarAccepts = { + "application/json" + }; + final String localVarAccept = localVarApiClient.selectHeaderAccept(localVarAccepts); + if (localVarAccept != null) { + localVarHeaderParams.put("Accept", localVarAccept); + } + + final String[] localVarContentTypes = { + + }; + final String localVarContentType = localVarApiClient.selectHeaderContentType(localVarContentTypes); + localVarHeaderParams.put("Content-Type", localVarContentType); + + String[] localVarAuthNames = new String[] { "basic_auth", "cookie_auth", "jwt_token", "oidc_auth", "saml_auth" }; + return localVarApiClient.buildCall(localVarPath, "POST", localVarQueryParams, localVarCollectionQueryParams, localVarPostBody, localVarHeaderParams, localVarCookieParams, localVarFormParams, localVarAuthNames, _callback); + } + + @SuppressWarnings("rawtypes") + private okhttp3.Call createSymlinkFileValidateBeforeCall(String repository, String branch, String location, final ApiCallback _callback) throws ApiException { + + // verify the required parameter 'repository' is set + if (repository == null) { + throw new ApiException("Missing the required parameter 'repository' when calling createSymlinkFile(Async)"); + } + + // verify the required parameter 'branch' is set + if (branch == null) { + throw new ApiException("Missing the required parameter 'branch' when calling createSymlinkFile(Async)"); + } + + + okhttp3.Call localVarCall = createSymlinkFileCall(repository, branch, location, _callback); + return localVarCall; + + } + + /** + * creates symlink files corresponding to the given directory + * + * @param repository (required) + * @param branch (required) + * @param location path to the table data (optional) + * @return StorageURI + * @throws ApiException If fail to call the API, e.g. server error or cannot deserialize the response body + * @http.response.details + + + + + + +
Status Code | Description | Response Headers
201 | location created | -
401 | Unauthorized | -
404 | Resource Not Found | -
0 | Internal Server Error | -
+ */ + public StorageURI createSymlinkFile(String repository, String branch, String location) throws ApiException { + ApiResponse localVarResp = createSymlinkFileWithHttpInfo(repository, branch, location); + return localVarResp.getData(); + } + + /** + * creates symlink files corresponding to the given directory + * + * @param repository (required) + * @param branch (required) + * @param location path to the table data (optional) + * @return ApiResponse<StorageURI> + * @throws ApiException If fail to call the API, e.g. server error or cannot deserialize the response body + * @http.response.details + + + + + + +
Status Code | Description | Response Headers
201 | location created | -
401 | Unauthorized | -
404 | Resource Not Found | -
0 | Internal Server Error | -
+ */ + public ApiResponse createSymlinkFileWithHttpInfo(String repository, String branch, String location) throws ApiException { + okhttp3.Call localVarCall = createSymlinkFileValidateBeforeCall(repository, branch, location, null); + Type localVarReturnType = new TypeToken(){}.getType(); + return localVarApiClient.execute(localVarCall, localVarReturnType); + } + + /** + * creates symlink files corresponding to the given directory (asynchronously) + * + * @param repository (required) + * @param branch (required) + * @param location path to the table data (optional) + * @param _callback The callback to be executed when the API call finishes + * @return The request call + * @throws ApiException If fail to process the API call, e.g. serializing the request body object + * @http.response.details + + + + + + +
Status Code | Description | Response Headers
201 | location created | -
401 | Unauthorized | -
404 | Resource Not Found | -
0 | Internal Server Error | -
+ */ + public okhttp3.Call createSymlinkFileAsync(String repository, String branch, String location, final ApiCallback _callback) throws ApiException { + + okhttp3.Call localVarCall = createSymlinkFileValidateBeforeCall(repository, branch, location, _callback); + Type localVarReturnType = new TypeToken(){}.getType(); + localVarApiClient.executeAsync(localVarCall, localVarReturnType, _callback); + return localVarCall; + } /** * Build call for getAuthCapabilities * @param _callback Callback for upload/download progress diff --git a/clients/java/src/main/java/io/lakefs/clients/api/MetadataApi.java b/clients/java/src/main/java/io/lakefs/clients/api/MetadataApi.java index 0985c15a17b..44607fa80ac 100644 --- a/clients/java/src/main/java/io/lakefs/clients/api/MetadataApi.java +++ b/clients/java/src/main/java/io/lakefs/clients/api/MetadataApi.java @@ -55,148 +55,6 @@ public void setApiClient(ApiClient apiClient) { this.localVarApiClient = apiClient; } - /** - * Build call for createSymlinkFile - * @param repository (required) - * @param branch (required) - * @param location path to the table data (optional) - * @param _callback Callback for upload/download progress - * @return Call to execute - * @throws ApiException If fail to serialize the request body object - * @http.response.details - - - - - - -
Status Code | Description | Response Headers
201 | location created | -
401 | Unauthorized | -
404 | Resource Not Found | -
0 | Internal Server Error | -
- */ - public okhttp3.Call createSymlinkFileCall(String repository, String branch, String location, final ApiCallback _callback) throws ApiException { - Object localVarPostBody = null; - - // create path and map variables - String localVarPath = "/repositories/{repository}/refs/{branch}/symlink" - .replaceAll("\\{" + "repository" + "\\}", localVarApiClient.escapeString(repository.toString())) - .replaceAll("\\{" + "branch" + "\\}", localVarApiClient.escapeString(branch.toString())); - - List localVarQueryParams = new ArrayList(); - List localVarCollectionQueryParams = new ArrayList(); - Map localVarHeaderParams = new HashMap(); - Map localVarCookieParams = new HashMap(); - Map localVarFormParams = new HashMap(); - - if (location != null) { - localVarQueryParams.addAll(localVarApiClient.parameterToPair("location", location)); - } - - final String[] localVarAccepts = { - "application/json" - }; - final String localVarAccept = localVarApiClient.selectHeaderAccept(localVarAccepts); - if (localVarAccept != null) { - localVarHeaderParams.put("Accept", localVarAccept); - } - - final String[] localVarContentTypes = { - - }; - final String localVarContentType = localVarApiClient.selectHeaderContentType(localVarContentTypes); - localVarHeaderParams.put("Content-Type", localVarContentType); - - String[] localVarAuthNames = new String[] { "basic_auth", "cookie_auth", "jwt_token", "oidc_auth", "saml_auth" }; - return localVarApiClient.buildCall(localVarPath, "POST", localVarQueryParams, localVarCollectionQueryParams, localVarPostBody, localVarHeaderParams, localVarCookieParams, localVarFormParams, localVarAuthNames, _callback); - } - - @SuppressWarnings("rawtypes") - private okhttp3.Call createSymlinkFileValidateBeforeCall(String repository, String branch, String location, final ApiCallback _callback) throws ApiException { - - // verify the required parameter 'repository' is set - if (repository == null) { - throw new ApiException("Missing the required parameter 'repository' when calling createSymlinkFile(Async)"); - } - - // verify the required parameter 'branch' is set - if (branch == null) { - throw new ApiException("Missing the required parameter 'branch' when calling createSymlinkFile(Async)"); - } - - - okhttp3.Call localVarCall = createSymlinkFileCall(repository, branch, location, _callback); - return localVarCall; - - } - - /** - * creates symlink files corresponding to the given directory - * - * @param repository (required) - * @param branch (required) - * @param location path to the table data (optional) - * @return StorageURI - * @throws ApiException If fail to call the API, e.g. server error or cannot deserialize the response body - * @http.response.details - - - - - - -
Status Code | Description | Response Headers
201 | location created | -
401 | Unauthorized | -
404 | Resource Not Found | -
0 | Internal Server Error | -
- */ - public StorageURI createSymlinkFile(String repository, String branch, String location) throws ApiException { - ApiResponse localVarResp = createSymlinkFileWithHttpInfo(repository, branch, location); - return localVarResp.getData(); - } - - /** - * creates symlink files corresponding to the given directory - * - * @param repository (required) - * @param branch (required) - * @param location path to the table data (optional) - * @return ApiResponse<StorageURI> - * @throws ApiException If fail to call the API, e.g. server error or cannot deserialize the response body - * @http.response.details - - - - - - -
Status Code | Description | Response Headers
201 | location created | -
401 | Unauthorized | -
404 | Resource Not Found | -
0 | Internal Server Error | -
- */ - public ApiResponse createSymlinkFileWithHttpInfo(String repository, String branch, String location) throws ApiException { - okhttp3.Call localVarCall = createSymlinkFileValidateBeforeCall(repository, branch, location, null); - Type localVarReturnType = new TypeToken(){}.getType(); - return localVarApiClient.execute(localVarCall, localVarReturnType); - } - - /** - * creates symlink files corresponding to the given directory (asynchronously) - * - * @param repository (required) - * @param branch (required) - * @param location path to the table data (optional) - * @param _callback The callback to be executed when the API call finishes - * @return The request call - * @throws ApiException If fail to process the API call, e.g. serializing the request body object - * @http.response.details - - - - - - -
Status Code | Description | Response Headers
201 | location created | -
401 | Unauthorized | -
404 | Resource Not Found | -
0 | Internal Server Error | -
- */ - public okhttp3.Call createSymlinkFileAsync(String repository, String branch, String location, final ApiCallback _callback) throws ApiException { - - okhttp3.Call localVarCall = createSymlinkFileValidateBeforeCall(repository, branch, location, _callback); - Type localVarReturnType = new TypeToken(){}.getType(); - localVarApiClient.executeAsync(localVarCall, localVarReturnType, _callback); - return localVarCall; - } /** * Build call for getMetaRange * @param repository (required) diff --git a/clients/java/src/test/java/io/lakefs/clients/api/InternalApiTest.java b/clients/java/src/test/java/io/lakefs/clients/api/InternalApiTest.java index 44dff9228e2..805eb7a0b64 100644 --- a/clients/java/src/test/java/io/lakefs/clients/api/InternalApiTest.java +++ b/clients/java/src/test/java/io/lakefs/clients/api/InternalApiTest.java @@ -21,6 +21,7 @@ import io.lakefs.clients.api.model.Setup; import io.lakefs.clients.api.model.SetupState; import io.lakefs.clients.api.model.StatsEventsList; +import io.lakefs.clients.api.model.StorageURI; import org.junit.Test; import org.junit.Ignore; @@ -53,6 +54,23 @@ public void createBranchProtectionRulePreflightTest() throws ApiException { // TODO: test validations } + /** + * creates symlink files corresponding to the given directory + * + * + * + * @throws ApiException + * if the Api call fails + */ + @Test + public void createSymlinkFileTest() throws ApiException { + String repository = null; + String branch = null; + String location = null; + StorageURI response = api.createSymlinkFile(repository, branch, location); + // TODO: test validations + } + /** * list authentication capabilities supported * diff --git a/clients/java/src/test/java/io/lakefs/clients/api/MetadataApiTest.java b/clients/java/src/test/java/io/lakefs/clients/api/MetadataApiTest.java index 56fcddc7aa7..b90cd6e4115 100644 --- a/clients/java/src/test/java/io/lakefs/clients/api/MetadataApiTest.java +++ b/clients/java/src/test/java/io/lakefs/clients/api/MetadataApiTest.java @@ -33,23 +33,6 @@ public class MetadataApiTest { private final MetadataApi api = new MetadataApi(); - /** - * creates symlink files corresponding to the given directory - * - * - * - * @throws ApiException - * if the Api call fails - */ - @Test - public void createSymlinkFileTest() throws ApiException { - String repository = null; - String branch = null; - String location = null; - StorageURI response = api.createSymlinkFile(repository, branch, location); - // TODO: test validations - } - /** * return URI to a meta-range file * diff --git a/clients/python/README.md b/clients/python/README.md index f3f42893be4..faeb5656224 100644 --- a/clients/python/README.md +++ b/clients/python/README.md @@ -171,6 +171,7 @@ Class | Method | HTTP request | Description *ImportApi* | [**import_start**](docs/ImportApi.md#import_start) | **POST** /repositories/{repository}/branches/{branch}/import | import data from object store *ImportApi* | [**import_status**](docs/ImportApi.md#import_status) | **GET** /repositories/{repository}/branches/{branch}/import | get import status *InternalApi* | [**create_branch_protection_rule_preflight**](docs/InternalApi.md#create_branch_protection_rule_preflight) | **GET** /repositories/{repository}/branch_protection/set_allowed | +*InternalApi* | [**create_symlink_file**](docs/InternalApi.md#create_symlink_file) | **POST** /repositories/{repository}/refs/{branch}/symlink | creates symlink files corresponding to the given directory *InternalApi* | 
[**get_auth_capabilities**](docs/InternalApi.md#get_auth_capabilities) | **GET** /auth/capabilities | list authentication capabilities supported *InternalApi* | [**get_setup_state**](docs/InternalApi.md#get_setup_state) | **GET** /setup_lakefs | check if the lakeFS installation is already set up *InternalApi* | [**post_stats_events**](docs/InternalApi.md#post_stats_events) | **POST** /statistics | post stats events, this endpoint is meant for internal use only @@ -178,7 +179,6 @@ Class | Method | HTTP request | Description *InternalApi* | [**setup**](docs/InternalApi.md#setup) | **POST** /setup_lakefs | setup lakeFS and create a first user *InternalApi* | [**setup_comm_prefs**](docs/InternalApi.md#setup_comm_prefs) | **POST** /setup_comm_prefs | setup communications preferences *InternalApi* | [**upload_object_preflight**](docs/InternalApi.md#upload_object_preflight) | **GET** /repositories/{repository}/branches/{branch}/objects/stage_allowed | -*MetadataApi* | [**create_symlink_file**](docs/MetadataApi.md#create_symlink_file) | **POST** /repositories/{repository}/refs/{branch}/symlink | creates symlink files corresponding to the given directory *MetadataApi* | [**get_meta_range**](docs/MetadataApi.md#get_meta_range) | **GET** /repositories/{repository}/metadata/meta_range/{meta_range} | return URI to a meta-range file *MetadataApi* | [**get_range**](docs/MetadataApi.md#get_range) | **GET** /repositories/{repository}/metadata/range/{range} | return URI to a range file *ObjectsApi* | [**copy_object**](docs/ObjectsApi.md#copy_object) | **POST** /repositories/{repository}/branches/{branch}/objects/copy | create a copy of an object diff --git a/clients/python/docs/InternalApi.md b/clients/python/docs/InternalApi.md index fd40783290c..ae6f8b321c5 100644 --- a/clients/python/docs/InternalApi.md +++ b/clients/python/docs/InternalApi.md @@ -5,6 +5,7 @@ All URIs are relative to *http://localhost/api/v1* Method | HTTP request | Description ------------- | ------------- | ------------- [**create_branch_protection_rule_preflight**](InternalApi.md#create_branch_protection_rule_preflight) | **GET** /repositories/{repository}/branch_protection/set_allowed | +[**create_symlink_file**](InternalApi.md#create_symlink_file) | **POST** /repositories/{repository}/refs/{branch}/symlink | creates symlink files corresponding to the given directory [**get_auth_capabilities**](InternalApi.md#get_auth_capabilities) | **GET** /auth/capabilities | list authentication capabilities supported [**get_setup_state**](InternalApi.md#get_setup_state) | **GET** /setup_lakefs | check if the lakeFS installation is already set up [**post_stats_events**](InternalApi.md#post_stats_events) | **POST** /statistics | post stats events, this endpoint is meant for internal use only @@ -119,6 +120,126 @@ void (empty response body) [[Back to top]](#) [[Back to API list]](../README.md#documentation-for-api-endpoints) [[Back to Model list]](../README.md#documentation-for-models) [[Back to README]](../README.md) +# **create_symlink_file** +> StorageURI create_symlink_file(repository, branch) + +creates symlink files corresponding to the given directory + +### Example + +* Basic Authentication (basic_auth): +* Api Key Authentication (cookie_auth): +* Bearer (JWT) Authentication (jwt_token): +* Api Key Authentication (oidc_auth): +* Api Key Authentication (saml_auth): + +```python +import time +import lakefs_client +from lakefs_client.api import internal_api +from lakefs_client.model.error import Error +from lakefs_client.model.storage_uri 
import StorageURI +from pprint import pprint +# Defining the host is optional and defaults to http://localhost/api/v1 +# See configuration.py for a list of all supported configuration parameters. +configuration = lakefs_client.Configuration( + host = "http://localhost/api/v1" +) + +# The client must configure the authentication and authorization parameters +# in accordance with the API server security policy. +# Examples for each auth method are provided below, use the example that +# satisfies your auth use case. + +# Configure HTTP basic authorization: basic_auth +configuration = lakefs_client.Configuration( + username = 'YOUR_USERNAME', + password = 'YOUR_PASSWORD' +) + +# Configure API key authorization: cookie_auth +configuration.api_key['cookie_auth'] = 'YOUR_API_KEY' + +# Uncomment below to setup prefix (e.g. Bearer) for API key, if needed +# configuration.api_key_prefix['cookie_auth'] = 'Bearer' + +# Configure Bearer authorization (JWT): jwt_token +configuration = lakefs_client.Configuration( + access_token = 'YOUR_BEARER_TOKEN' +) + +# Configure API key authorization: oidc_auth +configuration.api_key['oidc_auth'] = 'YOUR_API_KEY' + +# Uncomment below to setup prefix (e.g. Bearer) for API key, if needed +# configuration.api_key_prefix['oidc_auth'] = 'Bearer' + +# Configure API key authorization: saml_auth +configuration.api_key['saml_auth'] = 'YOUR_API_KEY' + +# Uncomment below to setup prefix (e.g. Bearer) for API key, if needed +# configuration.api_key_prefix['saml_auth'] = 'Bearer' + +# Enter a context with an instance of the API client +with lakefs_client.ApiClient(configuration) as api_client: + # Create an instance of the API class + api_instance = internal_api.InternalApi(api_client) + repository = "repository_example" # str | + branch = "branch_example" # str | + location = "location_example" # str | path to the table data (optional) + + # example passing only required values which don't have defaults set + try: + # creates symlink files corresponding to the given directory + api_response = api_instance.create_symlink_file(repository, branch) + pprint(api_response) + except lakefs_client.ApiException as e: + print("Exception when calling InternalApi->create_symlink_file: %s\n" % e) + + # example passing only required values which don't have defaults set + # and optional values + try: + # creates symlink files corresponding to the given directory + api_response = api_instance.create_symlink_file(repository, branch, location=location) + pprint(api_response) + except lakefs_client.ApiException as e: + print("Exception when calling InternalApi->create_symlink_file: %s\n" % e) +``` + + +### Parameters + +Name | Type | Description | Notes +------------- | ------------- | ------------- | ------------- + **repository** | **str**| | + **branch** | **str**| | + **location** | **str**| path to the table data | [optional] + +### Return type + +[**StorageURI**](StorageURI.md) + +### Authorization + +[basic_auth](../README.md#basic_auth), [cookie_auth](../README.md#cookie_auth), [jwt_token](../README.md#jwt_token), [oidc_auth](../README.md#oidc_auth), [saml_auth](../README.md#saml_auth) + +### HTTP request headers + + - **Content-Type**: Not defined + - **Accept**: application/json + + +### HTTP response details + +| Status code | Description | Response headers | +|-------------|-------------|------------------| +**201** | location created | - | +**401** | Unauthorized | - | +**404** | Resource Not Found | - | +**0** | Internal Server Error | - | + +[[Back to top]](#) [[Back to API 
list]](../README.md#documentation-for-api-endpoints) [[Back to Model list]](../README.md#documentation-for-models) [[Back to README]](../README.md) + # **get_auth_capabilities** > AuthCapabilities get_auth_capabilities() diff --git a/clients/python/docs/MetadataApi.md b/clients/python/docs/MetadataApi.md index 46180afdd82..f95a2aa2c03 100644 --- a/clients/python/docs/MetadataApi.md +++ b/clients/python/docs/MetadataApi.md @@ -4,131 +4,10 @@ All URIs are relative to *http://localhost/api/v1* Method | HTTP request | Description ------------- | ------------- | ------------- -[**create_symlink_file**](MetadataApi.md#create_symlink_file) | **POST** /repositories/{repository}/refs/{branch}/symlink | creates symlink files corresponding to the given directory [**get_meta_range**](MetadataApi.md#get_meta_range) | **GET** /repositories/{repository}/metadata/meta_range/{meta_range} | return URI to a meta-range file [**get_range**](MetadataApi.md#get_range) | **GET** /repositories/{repository}/metadata/range/{range} | return URI to a range file -# **create_symlink_file** -> StorageURI create_symlink_file(repository, branch) - -creates symlink files corresponding to the given directory - -### Example - -* Basic Authentication (basic_auth): -* Api Key Authentication (cookie_auth): -* Bearer (JWT) Authentication (jwt_token): -* Api Key Authentication (oidc_auth): -* Api Key Authentication (saml_auth): - -```python -import time -import lakefs_client -from lakefs_client.api import metadata_api -from lakefs_client.model.error import Error -from lakefs_client.model.storage_uri import StorageURI -from pprint import pprint -# Defining the host is optional and defaults to http://localhost/api/v1 -# See configuration.py for a list of all supported configuration parameters. -configuration = lakefs_client.Configuration( - host = "http://localhost/api/v1" -) - -# The client must configure the authentication and authorization parameters -# in accordance with the API server security policy. -# Examples for each auth method are provided below, use the example that -# satisfies your auth use case. - -# Configure HTTP basic authorization: basic_auth -configuration = lakefs_client.Configuration( - username = 'YOUR_USERNAME', - password = 'YOUR_PASSWORD' -) - -# Configure API key authorization: cookie_auth -configuration.api_key['cookie_auth'] = 'YOUR_API_KEY' - -# Uncomment below to setup prefix (e.g. Bearer) for API key, if needed -# configuration.api_key_prefix['cookie_auth'] = 'Bearer' - -# Configure Bearer authorization (JWT): jwt_token -configuration = lakefs_client.Configuration( - access_token = 'YOUR_BEARER_TOKEN' -) - -# Configure API key authorization: oidc_auth -configuration.api_key['oidc_auth'] = 'YOUR_API_KEY' - -# Uncomment below to setup prefix (e.g. Bearer) for API key, if needed -# configuration.api_key_prefix['oidc_auth'] = 'Bearer' - -# Configure API key authorization: saml_auth -configuration.api_key['saml_auth'] = 'YOUR_API_KEY' - -# Uncomment below to setup prefix (e.g. 
Bearer) for API key, if needed -# configuration.api_key_prefix['saml_auth'] = 'Bearer' - -# Enter a context with an instance of the API client -with lakefs_client.ApiClient(configuration) as api_client: - # Create an instance of the API class - api_instance = metadata_api.MetadataApi(api_client) - repository = "repository_example" # str | - branch = "branch_example" # str | - location = "location_example" # str | path to the table data (optional) - - # example passing only required values which don't have defaults set - try: - # creates symlink files corresponding to the given directory - api_response = api_instance.create_symlink_file(repository, branch) - pprint(api_response) - except lakefs_client.ApiException as e: - print("Exception when calling MetadataApi->create_symlink_file: %s\n" % e) - - # example passing only required values which don't have defaults set - # and optional values - try: - # creates symlink files corresponding to the given directory - api_response = api_instance.create_symlink_file(repository, branch, location=location) - pprint(api_response) - except lakefs_client.ApiException as e: - print("Exception when calling MetadataApi->create_symlink_file: %s\n" % e) -``` - - -### Parameters - -Name | Type | Description | Notes -------------- | ------------- | ------------- | ------------- - **repository** | **str**| | - **branch** | **str**| | - **location** | **str**| path to the table data | [optional] - -### Return type - -[**StorageURI**](StorageURI.md) - -### Authorization - -[basic_auth](../README.md#basic_auth), [cookie_auth](../README.md#cookie_auth), [jwt_token](../README.md#jwt_token), [oidc_auth](../README.md#oidc_auth), [saml_auth](../README.md#saml_auth) - -### HTTP request headers - - - **Content-Type**: Not defined - - **Accept**: application/json - - -### HTTP response details - -| Status code | Description | Response headers | -|-------------|-------------|------------------| -**201** | location created | - | -**401** | Unauthorized | - | -**404** | Resource Not Found | - | -**0** | Internal Server Error | - | - -[[Back to top]](#) [[Back to API list]](../README.md#documentation-for-api-endpoints) [[Back to Model list]](../README.md#documentation-for-models) [[Back to README]](../README.md) - # **get_meta_range** > StorageURI get_meta_range(repository, meta_range) diff --git a/clients/python/lakefs_client/api/internal_api.py b/clients/python/lakefs_client/api/internal_api.py index 6f4238cd944..19ea97f55af 100644 --- a/clients/python/lakefs_client/api/internal_api.py +++ b/clients/python/lakefs_client/api/internal_api.py @@ -29,6 +29,7 @@ from lakefs_client.model.setup import Setup from lakefs_client.model.setup_state import SetupState from lakefs_client.model.stats_events_list import StatsEventsList +from lakefs_client.model.storage_uri import StorageURI class InternalApi(object): @@ -97,6 +98,72 @@ def __init__(self, api_client=None): }, api_client=api_client ) + self.create_symlink_file_endpoint = _Endpoint( + settings={ + 'response_type': (StorageURI,), + 'auth': [ + 'basic_auth', + 'cookie_auth', + 'jwt_token', + 'oidc_auth', + 'saml_auth' + ], + 'endpoint_path': '/repositories/{repository}/refs/{branch}/symlink', + 'operation_id': 'create_symlink_file', + 'http_method': 'POST', + 'servers': None, + }, + params_map={ + 'all': [ + 'repository', + 'branch', + 'location', + ], + 'required': [ + 'repository', + 'branch', + ], + 'nullable': [ + ], + 'enum': [ + ], + 'validation': [ + ] + }, + root_map={ + 'validations': { + }, + 'allowed_values': { + }, + 
'openapi_types': { + 'repository': + (str,), + 'branch': + (str,), + 'location': + (str,), + }, + 'attribute_map': { + 'repository': 'repository', + 'branch': 'branch', + 'location': 'location', + }, + 'location_map': { + 'repository': 'path', + 'branch': 'path', + 'location': 'query', + }, + 'collection_format_map': { + } + }, + headers_map={ + 'accept': [ + 'application/json' + ], + 'content_type': [], + }, + api_client=api_client + ) self.get_auth_capabilities_endpoint = _Endpoint( settings={ 'response_type': (AuthCapabilities,), @@ -525,6 +592,76 @@ def create_branch_protection_rule_preflight( repository return self.create_branch_protection_rule_preflight_endpoint.call_with_http_info(**kwargs) + def create_symlink_file( + self, + repository, + branch, + **kwargs + ): + """creates symlink files corresponding to the given directory # noqa: E501 + + This method makes a synchronous HTTP request by default. To make an + asynchronous HTTP request, please pass async_req=True + + >>> thread = api.create_symlink_file(repository, branch, async_req=True) + >>> result = thread.get() + + Args: + repository (str): + branch (str): + + Keyword Args: + location (str): path to the table data. [optional] + _return_http_data_only (bool): response data without head status + code and headers. Default is True. + _preload_content (bool): if False, the urllib3.HTTPResponse object + will be returned without reading/decoding response data. + Default is True. + _request_timeout (int/float/tuple): timeout setting for this request. If + one number provided, it will be total request timeout. It can also + be a pair (tuple) of (connection, read) timeouts. + Default is None. + _check_input_type (bool): specifies if type checking + should be done one the data sent to the server. + Default is True. + _check_return_type (bool): specifies if type checking + should be done one the data received from the server. + Default is True. + _host_index (int/None): specifies the index of the server + that we want to use. + Default is read from the configuration. + async_req (bool): execute request asynchronously + + Returns: + StorageURI + If the method is called asynchronously, returns the request + thread. 
+ """ + kwargs['async_req'] = kwargs.get( + 'async_req', False + ) + kwargs['_return_http_data_only'] = kwargs.get( + '_return_http_data_only', True + ) + kwargs['_preload_content'] = kwargs.get( + '_preload_content', True + ) + kwargs['_request_timeout'] = kwargs.get( + '_request_timeout', None + ) + kwargs['_check_input_type'] = kwargs.get( + '_check_input_type', True + ) + kwargs['_check_return_type'] = kwargs.get( + '_check_return_type', True + ) + kwargs['_host_index'] = kwargs.get('_host_index') + kwargs['repository'] = \ + repository + kwargs['branch'] = \ + branch + return self.create_symlink_file_endpoint.call_with_http_info(**kwargs) + def get_auth_capabilities( self, **kwargs diff --git a/clients/python/lakefs_client/api/metadata_api.py b/clients/python/lakefs_client/api/metadata_api.py index e81d3b25b41..7e58219d1e9 100644 --- a/clients/python/lakefs_client/api/metadata_api.py +++ b/clients/python/lakefs_client/api/metadata_api.py @@ -37,72 +37,6 @@ def __init__(self, api_client=None): if api_client is None: api_client = ApiClient() self.api_client = api_client - self.create_symlink_file_endpoint = _Endpoint( - settings={ - 'response_type': (StorageURI,), - 'auth': [ - 'basic_auth', - 'cookie_auth', - 'jwt_token', - 'oidc_auth', - 'saml_auth' - ], - 'endpoint_path': '/repositories/{repository}/refs/{branch}/symlink', - 'operation_id': 'create_symlink_file', - 'http_method': 'POST', - 'servers': None, - }, - params_map={ - 'all': [ - 'repository', - 'branch', - 'location', - ], - 'required': [ - 'repository', - 'branch', - ], - 'nullable': [ - ], - 'enum': [ - ], - 'validation': [ - ] - }, - root_map={ - 'validations': { - }, - 'allowed_values': { - }, - 'openapi_types': { - 'repository': - (str,), - 'branch': - (str,), - 'location': - (str,), - }, - 'attribute_map': { - 'repository': 'repository', - 'branch': 'branch', - 'location': 'location', - }, - 'location_map': { - 'repository': 'path', - 'branch': 'path', - 'location': 'query', - }, - 'collection_format_map': { - } - }, - headers_map={ - 'accept': [ - 'application/json' - ], - 'content_type': [], - }, - api_client=api_client - ) self.get_meta_range_endpoint = _Endpoint( settings={ 'response_type': (StorageURI,), @@ -226,76 +160,6 @@ def __init__(self, api_client=None): api_client=api_client ) - def create_symlink_file( - self, - repository, - branch, - **kwargs - ): - """creates symlink files corresponding to the given directory # noqa: E501 - - This method makes a synchronous HTTP request by default. To make an - asynchronous HTTP request, please pass async_req=True - - >>> thread = api.create_symlink_file(repository, branch, async_req=True) - >>> result = thread.get() - - Args: - repository (str): - branch (str): - - Keyword Args: - location (str): path to the table data. [optional] - _return_http_data_only (bool): response data without head status - code and headers. Default is True. - _preload_content (bool): if False, the urllib3.HTTPResponse object - will be returned without reading/decoding response data. - Default is True. - _request_timeout (int/float/tuple): timeout setting for this request. If - one number provided, it will be total request timeout. It can also - be a pair (tuple) of (connection, read) timeouts. - Default is None. - _check_input_type (bool): specifies if type checking - should be done one the data sent to the server. - Default is True. - _check_return_type (bool): specifies if type checking - should be done one the data received from the server. - Default is True. 
- _host_index (int/None): specifies the index of the server - that we want to use. - Default is read from the configuration. - async_req (bool): execute request asynchronously - - Returns: - StorageURI - If the method is called asynchronously, returns the request - thread. - """ - kwargs['async_req'] = kwargs.get( - 'async_req', False - ) - kwargs['_return_http_data_only'] = kwargs.get( - '_return_http_data_only', True - ) - kwargs['_preload_content'] = kwargs.get( - '_preload_content', True - ) - kwargs['_request_timeout'] = kwargs.get( - '_request_timeout', None - ) - kwargs['_check_input_type'] = kwargs.get( - '_check_input_type', True - ) - kwargs['_check_return_type'] = kwargs.get( - '_check_return_type', True - ) - kwargs['_host_index'] = kwargs.get('_host_index') - kwargs['repository'] = \ - repository - kwargs['branch'] = \ - branch - return self.create_symlink_file_endpoint.call_with_http_info(**kwargs) - def get_meta_range( self, repository, diff --git a/clients/python/test/test_internal_api.py b/clients/python/test/test_internal_api.py index 96637ea841f..70a356f9016 100644 --- a/clients/python/test/test_internal_api.py +++ b/clients/python/test/test_internal_api.py @@ -30,6 +30,13 @@ def test_create_branch_protection_rule_preflight(self): """ pass + def test_create_symlink_file(self): + """Test case for create_symlink_file + + creates symlink files corresponding to the given directory # noqa: E501 + """ + pass + def test_get_auth_capabilities(self): """Test case for get_auth_capabilities diff --git a/clients/python/test/test_metadata_api.py b/clients/python/test/test_metadata_api.py index a4f276d8574..7ffc345a98a 100644 --- a/clients/python/test/test_metadata_api.py +++ b/clients/python/test/test_metadata_api.py @@ -24,13 +24,6 @@ def setUp(self): def tearDown(self): pass - def test_create_symlink_file(self): - """Test case for create_symlink_file - - creates symlink files corresponding to the given directory # noqa: E501 - """ - pass - def test_get_meta_range(self): """Test case for get_meta_range diff --git a/docs/assets/js/swagger.yml b/docs/assets/js/swagger.yml index 3b9ebba8be6..291499d100d 100644 --- a/docs/assets/js/swagger.yml +++ b/docs/assets/js/swagger.yml @@ -3810,7 +3810,7 @@ paths: description: path to the table data post: tags: - - metadata + - internal operationId: createSymlinkFile summary: creates symlink files corresponding to the given directory From 88ba18b4855d06998bdb374c198799860388c8c3 Mon Sep 17 00:00:00 2001 From: Yoni Date: Wed, 20 Sep 2023 16:16:39 +0300 Subject: [PATCH 5/5] docs/remove old gc (#6635) --- docs/howto/garbage-collection/committed.md | 308 ------------------- docs/howto/garbage-collection/gc.md | 261 ++++++++++++++++ docs/howto/garbage-collection/index.md | 249 --------------- docs/howto/garbage-collection/internals.md | 116 ------- docs/howto/garbage-collection/managed-gc.md | 2 +- docs/howto/garbage-collection/uncommitted.md | 127 -------- docs/understand/glossary.md | 2 +- 7 files changed, 263 insertions(+), 802 deletions(-) delete mode 100644 docs/howto/garbage-collection/committed.md create mode 100644 docs/howto/garbage-collection/gc.md delete mode 100644 docs/howto/garbage-collection/internals.md delete mode 100644 docs/howto/garbage-collection/uncommitted.md diff --git a/docs/howto/garbage-collection/committed.md b/docs/howto/garbage-collection/committed.md deleted file mode 100644 index 60803e7525a..00000000000 --- a/docs/howto/garbage-collection/committed.md +++ /dev/null @@ -1,308 +0,0 @@ ---- -title: (deprecated) Committed 
Objects -description: Clean up unnecessary objects using the garbage collection feature in lakeFS. -parent: Garbage Collection -grand_parent: How-To -nav_order: 98 -redirect_from: - - /howto/garbage-collection-committed.html ---- - -# Garbage Collection: committed objects - -{: .warning-title } -> Deprecation notice -> -> This feature will be available up to version 0.9.1 of the lakeFS metadata client. It will be discontinued in subsequent versions. -> Please visit the new [garbage collection documentation](./index.md). - -By default, lakeFS keeps all your objects forever. This allows you to travel back in time to previous versions of your data. -However, sometimes you may want to hard-delete your objects - namely, delete them from the underlying storage. -Reasons for this include cost-reduction and privacy policies. - -Garbage collection rules in lakeFS define for how long to retain objects after they have been deleted. -lakeFS provides a Spark program to hard-delete objects whose retention period has ended according to the GC rules. - -This program does not remove any commits: you will still be able to use commits containing hard-deleted objects, -but trying to read these objects from lakeFS will result in a `410 Gone` HTTP status. -{: .note} - -[lakeFS Cloud](https://lakefs.cloud) users enjoy a managed Garbage Collection service, and do not need to run this Spark program. -{: .note } - -{% include toc.html %} - -## Understanding Garbage Collection - -For every branch, the GC job retains deleted objects for the number of days defined for the branch. -In the absence of a branch-specific rule, the default rule for the repository is used. -If an object is present in more than one branch ancestry, it's retained according to the rule with the largest number of days between those branches. -That is, it's hard-deleted only after the retention period has ended for all relevant branches. - -Example GC rules for a repository: -```json -{ - "default_retention_days": 14, - "branches": [ - {"branch_id": "main", "retention_days": 21}, - {"branch_id": "dev", "retention_days": 7} - ] -} -``` - -In the above example, objects are retained for 14 days after deletion by default. However, if they are present in the branch `main`, they are retained for 21 days. -Objects present in the `dev` branch (but not in any other branch) are retained for 7 days after they are deleted. - -## Configuring GC rules - -To define garbage collection rules, either use the `lakectl` command or the lakeFS web UI: - -
- -
- -Create a JSON file with your GC rules: - -```bash -cat <> example_repo_gc_rules.json -{ - "default_retention_days": 14, - "branches": [ - {"branch_id": "main", "retention_days": 21}, - {"branch_id": "dev", "retention_days": 7} - ] -} -EOT -``` - -Set the GC rules using `lakectl`: -```bash -lakectl gc set-config lakefs://example-repo -f example_repo_gc_rules.json -``` - -
-
-From the lakeFS web UI: - -1. Navigate to the main page of your repository. -2. Go to _Settings_ -> _Retention_. -3. Click _Edit policy_ and paste your GC rule into the text box as a JSON. -4. Save your changes. - -![GC Rules From UI]({{ site.baseurl }}/assets/img/gc_rules_from_ui.png) -
-
- -## Running the GC job - -To run the job, use the following `spark-submit` command (or using your preferred method of running Spark programs). -The job will hard-delete objects that were deleted and whose retention period has ended according to the GC rules. - -
- -
- ```bash -spark-submit --class io.treeverse.clients.GarbageCollector \ - --packages org.apache.hadoop:hadoop-aws:2.7.7 \ - -c spark.hadoop.lakefs.api.url=https://lakefs.example.com:8000/api/v1 \ - -c spark.hadoop.lakefs.api.access_key= \ - -c spark.hadoop.lakefs.api.secret_key= \ - -c spark.hadoop.fs.s3a.access.key= \ - -c spark.hadoop.fs.s3a.secret.key= \ - http://treeverse-clients-us-east.s3-website-us-east-1.amazonaws.com/lakefs-spark-client-312-hadoop3/0.9.1/lakefs-spark-client-312-hadoop3-assembly-0.9.1.jar \ - example-repo us-east-1 - ``` -
-
- ```bash -spark-submit --class io.treeverse.clients.GarbageCollector \ - --packages org.apache.hadoop:hadoop-aws:2.7.7 \ - -c spark.hadoop.lakefs.api.url=https://lakefs.example.com:8000/api/v1 \ - -c spark.hadoop.lakefs.api.access_key= \ - -c spark.hadoop.lakefs.api.secret_key= \ - -c spark.hadoop.fs.s3a.access.key= \ - -c spark.hadoop.fs.s3a.secret.key= \ - http://treeverse-clients-us-east.s3-website-us-east-1.amazonaws.com/lakefs-spark-client-301/0.9.1/lakefs-spark-client-301-assembly-0.9.1.jar \ - example-repo us-east-1 - ``` -
- -
- - If you want to access your storage using the account key: - - ```bash -spark-submit --class io.treeverse.clients.GarbageCollector \ - --packages org.apache.hadoop:hadoop-aws:3.2.1 \ - -c spark.hadoop.lakefs.api.url=https://lakefs.example.com:8000/api/v1 \ - -c spark.hadoop.lakefs.api.access_key= \ - -c spark.hadoop.lakefs.api.secret_key= \ - -c spark.hadoop.fs.azure.account.key..dfs.core.windows.net= \ - http://treeverse-clients-us-east.s3-website-us-east-1.amazonaws.com/lakefs-spark-client-312-hadoop3/0.9.1/lakefs-spark-client-312-hadoop3-assembly-0.9.1.jar \ - example-repo - ``` - - Or, if you want to access your storage using an Azure service principal: - - ```bash -spark-submit --class io.treeverse.clients.GarbageCollector \ - --packages org.apache.hadoop:hadoop-aws:3.2.1 \ - -c spark.hadoop.lakefs.api.url=https://lakefs.example.com:8000/api/v1 \ - -c spark.hadoop.lakefs.api.access_key= \ - -c spark.hadoop.lakefs.api.secret_key= \ - -c spark.hadoop.fs.azure.account.auth.type..dfs.core.windows.net=OAuth \ - -c spark.hadoop.fs.azure.account.oauth.provider.type..dfs.core.windows.net=org.apache.hadoop.fs.azurebfs.oauth2.ClientCredsTokenProvider \ - -c spark.hadoop.fs.azure.account.oauth2.client.id..dfs.core.windows.net= \ - -c spark.hadoop.fs.azure.account.oauth2.client.secret..dfs.core.windows.net= \ - -c spark.hadoop.fs.azure.account.oauth2.client.endpoint..dfs.core.windows.net=https://login.microsoftonline.com//oauth2/token \ - http://treeverse-clients-us-east.s3-website-us-east-1.amazonaws.com/lakefs-spark-client-312-hadoop3/0.9.1/lakefs-spark-client-312-hadoop3-assembly-0.9.1.jar \ - example-repo - ``` - -**Notes:** -* On Azure, GC was tested only on Spark 3.3.0, but may work with other Spark and Hadoop versions. -* In case you don't have `hadoop-azure` package as part of your environment, you should add the package to your spark-submit with `--packages org.apache.hadoop:hadoop-azure:3.2.1` -* For GC to work on Azure blob, [soft delete](https://docs.microsoft.com/en-us/azure/storage/blobs/soft-delete-blob-overview) should be disabled. -
- -
-⚠️ At the moment, only the "mark" phase of the Garbage Collection is supported for GCP. -That is, this program will output a list of expired objects, and you will have to delete them manually. -We have [concrete plans](https://github.com/treeverse/lakeFS/issues/3626) to extend this support to actually delete the objects. -{: .note .note-warning } - -```bash -spark-submit --class io.treeverse.clients.GarbageCollector \ - --jars https://storage.googleapis.com/hadoop-lib/gcs/gcs-connector-hadoop3-latest.jar \ - -c spark.hadoop.lakefs.api.url=https://lakefs.example.com:8000/api/v1 \ - -c spark.hadoop.lakefs.api.access_key= \ - -c spark.hadoop.lakefs.api.secret_key= \ - -c spark.hadoop.google.cloud.auth.service.account.enable=true \ - -c spark.hadoop.google.cloud.auth.service.account.json.keyfile= \ - -c spark.hadoop.fs.gs.project.id= \ - -c spark.hadoop.fs.gs.impl=com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem \ - -c spark.hadoop.fs.AbstractFileSystem.gs.impl=com.google.cloud.hadoop.fs.gcs.GoogleHadoopFS \ - -c spark.hadoop.lakefs.gc.do_sweep=false \ - http://treeverse-clients-us-east.s3-website-us-east-1.amazonaws.com/lakefs-spark-client-312-hadoop3/0.9.1/lakefs-spark-client-312-hadoop3-assembly-0.9.1.jar \ - example-repo -``` - -This program will not delete anything. -Instead, it will find all the objects that are safe to delete and save a list containing all their keys, in Parquet format. -The list will then be found under the path: -``` -gs:///_lakefs/logs/gc/expired_addresses/ -``` - -Note that this is a path in your Google Storage bucket, and not in your lakeFS repository. -For example, if your repository's underlying storage is `gs://example-bucket/example-path`, you will find the list in: -``` -gs://example-bucket/example-path/_lakefs/logs/gc/expired_addresses/dt=/ -``` - -You can now delete the objects appearing in the list from your Google Storage bucket. -
-
- -You will find the list of objects hard-deleted by the job in the storage -namespace of the repository. It is saved in Parquet format under `_lakefs/logs/gc/deleted_objects`. - -### GC job options - -By default, GC first creates a list of expired objects according to your retention rules and then hard-deletes those objects. -However, you can use GC options to break the GC job down into two stages: -1. Mark stage: GC will mark the expired objects to hard-delete, **without** deleting them. -2. Sweep stage: GC will hard-delete objects marked by a previous mark-only GC run. - -By breaking GC into these stages, you can pause and create a backup of the objects that GC is about to sweep and later -restore them. You can use the [GC backup and restore](#backup-and-restore) utility to do that. - -#### Mark only mode - -To make GC run the mark stage only, add the following properties to your spark-submit command: -```properties -spark.hadoop.lakefs.gc.do_sweep=false -spark.hadoop.lakefs.gc.mark_id= # Replace with your own identification string. This MARK_ID will enable you to start a sweep (actual deletion) run later -``` -Running in mark only mode, GC will write the addresses of the expired objects to delete to the following location: `STORAGE_NAMESPACE/_lakefs/retention/gc/addresses/mark_id=/` as a parquet. - -**Notes:** -* Mark only mode is only available from v0.4.0 of lakeFS Spark client. -* The `spark.hadoop.lakefs.debug.gc.no_delete` property has been deprecated with v0.4.0. - -#### Sweep only mode - -To make GC run the sweep stage only, add the following properties to your spark-submit command: -```properties -spark.hadoop.lakefs.gc.do_mark=false -spark.hadoop.lakefs.gc.mark_id= # Replace with the identifier you used on a previous mark-only run -``` -Running in sweep only mode, GC will hard-delete the expired objects marked by a mark-only run and listed in: `STORAGE_NAMESPACE/_lakefs/retention/gc/addresses/mark_id=/`. - -**Note:** Mark only mode is only available from v0.4.0 of lakeFS Spark client. - -## Considerations - -1. In order for an object to be hard-deleted, it must be deleted from all branches. - You should remove stale branches to prevent them from retaining old objects. - For example, consider a branch that has been merged to `main` and has become stale. - An object which is later deleted from `main` will always be present in the stale branch, preventing it from being hard-deleted. - -1. lakeFS will never delete objects outside your repository's storage namespace. - In particular, objects that were imported using `lakectl ingest` or the UI import wizard will not be affected by GC jobs. - -1. In cases where deleted objects are brought back to life while a GC job is running, said objects may or may not be - deleted. Such actions include: - 1. Reverting a commit in which a file was deleted. - 1. Branching out from an old commit. - 1. Expanding the retention period of a branch. - 1. Creating a branch from an existing branch, where the new branch has a longer retention period. - -## Backup and restore - -GC was created to hard-delete objects from your underlying objects store according to your retention rules. However, when you start -using the feature you may want to first gain confidence in the decisions GC makes. The GC backup and restore utility helps you do that. - -**Use-cases:** -* Backup: copy expired objects from your repository's storage namespace to an external location before running GC in [sweep only mode](#sweep-only-mode). 
-* Restore: copy objects that were hard-deleted by GC from an external location you used for saving your backup into your repository's storage namespace. - -Follow [rclone documentation](https://rclone.org/docs/) to configure remote access to the underlying storage used by lakeFS. -Replace `LAKEFS_STORAGE_NAMESPACE` with remote:bucket/path which points to the lakeFS repository storage namespace. -The `BACKUP_STORAGE_LOCATION` attribute points to a storage location outside your lakeFS storage namespace into which you want to save the backup. - -### Backup command - -```shell -rclone --include "*.txt" cat "/_lakefs/retention/gc/addresses.text/mark_id=/" | \ - rclone -P --no-traverse --files-from - copy -``` - -### Restore command - -```shell -rclone --include "*.txt" cat "/_lakefs/retention/gc/addresses.text/mark_id=/" | \ - rclone -P --no-traverse --files-from - copy -``` - -### Example - -The following of commands used to backup/resource a configured remote 'azure' (Azure blob storage) to access example repository storage namespace `https://lakefs.blob.core.windows.net/repo/example/`: - -```shell -# Backup -rclone --include "*.txt" cat "azure://repo/example/_lakefs/retention/gc/addresses.text/mark_id=a64d1885-6202-431f-a0a3-8832e4a5865a/" | \ - rclone -P --no-traverse --files-from - copy azure://repo/example/ azure://backup/repo-example/ - -# Restore -rclone --include "*.txt" cat "azure://tal/azure-br/_lakefs/retention/gc/addresses.text/mark_id=a64d1885-6202-431f-a0a3-8832e4a5865a/" | \ - rclone -P --no-traverse --files-from - copy azure://backup/repo-example/ azure://repo/example/ -``` diff --git a/docs/howto/garbage-collection/gc.md b/docs/howto/garbage-collection/gc.md new file mode 100644 index 00000000000..37a60226d05 --- /dev/null +++ b/docs/howto/garbage-collection/gc.md @@ -0,0 +1,261 @@ +--- +title: Garbage Collection +description: Clean up expired objects using the garbage collection feature in lakeFS. +parent: Garbage Collection +grand_parent: How-To +nav_order: 1 +redirect_from: + - /howto/garbage-collection/index.html + - /howto/garbage-collection/committed.html + - /howto/garbage-collection/uncommitted.html + - /howto/garbage-collection/internals.html + - /reference/garbage-collection.html + - /howto/garbage-collection-index.html + - /howto/garbage-collection.html + - /reference/retention.html +--- + +# Garbage Collection + +[lakeFS Cloud](https://lakefs.cloud) users enjoy a [managed garbage collection]({% link howto/garbage-collection/managed-gc.md %}) service, and do not need to run this Spark program. +{: .tip } + + +By default, lakeFS keeps all your objects forever. This allows you to travel back in time to previous versions of your data. +However, sometimes you may want to remove the objects from the underlying storage completely. +Reasons for this include cost-reduction and privacy policies. + +The garbage collection (GC) job is a Spark program that removes the following from the underlying storage: +1. _Committed objects_ that have been deleted (or replaced) in lakeFS, and are considered expired according to [rules you define](#garbage-collection-rules). +2. _Uncommitted objects_ that are no longer accessible + * For example, objects deleted before ever being committed. + +{% include toc.html %} + +## Garbage collection rules + +{: .note } +These rules only apply to objects that have been _committed_ at some point. +Without retention rules, only inaccessible _uncommitted_ objects will be removed by the job. 
+

Garbage collection rules determine how long an object is kept in the storage after it is _deleted_ (or replaced) in lakeFS.
For every branch, the GC job retains deleted objects for the number of days defined for the branch.
In the absence of a branch-specific rule, the default rule for the repository is used.
If an object is present in more than one branch ancestry, it is removed only after the retention period has ended for
all relevant branches.

Example GC rules for a repository:
```json
{
  "default_retention_days": 14,
  "branches": [
    {"branch_id": "main", "retention_days": 21},
    {"branch_id": "dev", "retention_days": 7}
  ]
}
```

In the above example, objects will be retained for 14 days after deletion by default.
However, objects present in the `main` branch will be retained for 21 days.
Objects present _only_ in the `dev` branch will be retained for 7 days after they are deleted.

### How to configure garbage collection rules

To define retention rules, use the `lakectl` command, the lakeFS web UI, or the [API](/reference/api.html#/retention/set%20garbage%20collection%20rules):
+ +
+

Create a JSON file with your GC rules:

```bash
cat <<EOT > example_repo_gc_rules.json
{
  "default_retention_days": 14,
  "branches": [
    {"branch_id": "main", "retention_days": 21},
    {"branch_id": "dev", "retention_days": 7}
  ]
}
EOT
```

Set the GC rules using `lakectl`:
```bash
lakectl gc set-config lakefs://example-repo -f example_repo_gc_rules.json
```
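To confirm the rules took effect, you can read the configuration back. This is a minimal sketch and assumes your `lakectl` version provides the `gc get-config` subcommand:

```bash
# Print the repository's current GC rules as JSON (assumes `lakectl gc get-config` is available)
lakectl gc get-config lakefs://example-repo
```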
+
+
From the lakeFS web UI:

1. Navigate to the main page of your repository.
2. Go to _Settings_ -> _Garbage Collection_.
3. Click _Edit policy_ and paste your GC rules into the text box as JSON.
4. Save your changes.

![GC Rules From UI]({{ site.baseurl }}/assets/img/gc_rules_from_ui.png)
+
+

## How to run the garbage collection job

To run the job, use the following `spark-submit` command (or your preferred method of running Spark programs).
+ +
+ ```bash +spark-submit --class io.treeverse.gc.GarbageCollection \ + --packages org.apache.hadoop:hadoop-aws:2.7.7 \ + -c spark.hadoop.lakefs.api.url=https://lakefs.example.com:8000/api/v1 \ + -c spark.hadoop.lakefs.api.access_key= \ + -c spark.hadoop.lakefs.api.secret_key= \ + -c spark.hadoop.fs.s3a.access.key= \ + -c spark.hadoop.fs.s3a.secret.key= \ + http://treeverse-clients-us-east.s3-website-us-east-1.amazonaws.com/lakefs-spark-client/0.10.0/lakefs-spark-client-assembly-0.10.0.jar \ + example-repo us-east-1 + ``` +
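If you prefer not to hard-code credentials on the command line, the same invocation can take them from environment variables. This is a sketch only: the variable names are illustrative, while the configuration keys, jar URL, and arguments are copied from the command above:

```bash
# Supply lakeFS and S3 credentials via environment variables (names are illustrative)
export LAKEFS_ACCESS_KEY="AKIAlakefs..."       # lakeFS access key ID
export LAKEFS_SECRET_KEY="..."                 # lakeFS secret access key
export AWS_ACCESS_KEY_ID="AKIAaws..."          # S3 credentials used by the S3A connector
export AWS_SECRET_ACCESS_KEY="..."

spark-submit --class io.treeverse.gc.GarbageCollection \
  --packages org.apache.hadoop:hadoop-aws:2.7.7 \
  -c spark.hadoop.lakefs.api.url=https://lakefs.example.com:8000/api/v1 \
  -c spark.hadoop.lakefs.api.access_key="$LAKEFS_ACCESS_KEY" \
  -c spark.hadoop.lakefs.api.secret_key="$LAKEFS_SECRET_KEY" \
  -c spark.hadoop.fs.s3a.access.key="$AWS_ACCESS_KEY_ID" \
  -c spark.hadoop.fs.s3a.secret.key="$AWS_SECRET_ACCESS_KEY" \
  http://treeverse-clients-us-east.s3-website-us-east-1.amazonaws.com/lakefs-spark-client/0.10.0/lakefs-spark-client-assembly-0.10.0.jar \
  example-repo us-east-1
```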
+
+ ```bash +spark-submit --class io.treeverse.gc.GarbageCollection \ + --packages org.apache.hadoop:hadoop-aws:2.7.7 \ + -c spark.hadoop.lakefs.api.url=https://lakefs.example.com:8000/api/v1 \ + -c spark.hadoop.lakefs.api.access_key= \ + -c spark.hadoop.lakefs.api.secret_key= \ + -c spark.hadoop.fs.s3a.access.key= \ + -c spark.hadoop.fs.s3a.secret.key= \ + http://treeverse-clients-us-east.s3-website-us-east-1.amazonaws.com/lakefs-spark-client-301/0.10.0/lakefs-spark-client-301-assembly-0.10.0.jar \ + example-repo us-east-1 + ``` +
+ +
+

If you want to access your storage using the account key:

 ```bash
spark-submit --class io.treeverse.gc.GarbageCollection \
  --packages org.apache.hadoop:hadoop-aws:3.2.1 \
  -c spark.hadoop.lakefs.api.url=https://lakefs.example.com:8000/api/v1 \
  -c spark.hadoop.lakefs.api.access_key= \
  -c spark.hadoop.lakefs.api.secret_key= \
  -c spark.hadoop.fs.azure.account.key..dfs.core.windows.net= \
  http://treeverse-clients-us-east.s3-website-us-east-1.amazonaws.com/lakefs-spark-client/0.10.0/lakefs-spark-client-assembly-0.10.0.jar \
  example-repo
 ```

Or, if you want to access your storage using an Azure service principal:

 ```bash
spark-submit --class io.treeverse.gc.GarbageCollection \
  --packages org.apache.hadoop:hadoop-aws:3.2.1 \
  -c spark.hadoop.lakefs.api.url=https://lakefs.example.com:8000/api/v1 \
  -c spark.hadoop.lakefs.api.access_key= \
  -c spark.hadoop.lakefs.api.secret_key= \
  -c spark.hadoop.fs.azure.account.auth.type..dfs.core.windows.net=OAuth \
  -c spark.hadoop.fs.azure.account.oauth.provider.type..dfs.core.windows.net=org.apache.hadoop.fs.azurebfs.oauth2.ClientCredsTokenProvider \
  -c spark.hadoop.fs.azure.account.oauth2.client.id..dfs.core.windows.net= \
  -c spark.hadoop.fs.azure.account.oauth2.client.secret..dfs.core.windows.net= \
  -c spark.hadoop.fs.azure.account.oauth2.client.endpoint..dfs.core.windows.net=https://login.microsoftonline.com//oauth2/token \
  http://treeverse-clients-us-east.s3-website-us-east-1.amazonaws.com/lakefs-spark-client/0.10.0/lakefs-spark-client-assembly-0.10.0.jar \
  example-repo
 ```

**Notes:**
* On Azure, GC was tested only on Spark 3.3.0, but it may work with other Spark and Hadoop versions.
* If the `hadoop-azure` package is not part of your environment, add it to your spark-submit command with `--packages org.apache.hadoop:hadoop-azure:3.2.1` (see the sketch below).
* For GC to work on Azure Blob Storage, [soft delete](https://docs.microsoft.com/en-us/azure/storage/blobs/soft-delete-blob-overview) should be disabled.
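Combining the account-key variant with the `hadoop-azure` note above, a complete invocation might look like the following. This is a sketch only: the storage account name, credentials, and package versions are illustrative, and the configuration keys mirror the command above:

```bash
# Illustrative values; replace with your own storage account, key, and lakeFS credentials
STORAGE_ACCOUNT="examplestorageaccount"
ACCOUNT_KEY="..."
LAKEFS_ACCESS_KEY="AKIAlakefs..."
LAKEFS_SECRET_KEY="..."

spark-submit --class io.treeverse.gc.GarbageCollection \
  --packages org.apache.hadoop:hadoop-aws:3.2.1,org.apache.hadoop:hadoop-azure:3.2.1 \
  -c spark.hadoop.lakefs.api.url=https://lakefs.example.com:8000/api/v1 \
  -c spark.hadoop.lakefs.api.access_key="$LAKEFS_ACCESS_KEY" \
  -c spark.hadoop.lakefs.api.secret_key="$LAKEFS_SECRET_KEY" \
  -c "spark.hadoop.fs.azure.account.key.${STORAGE_ACCOUNT}.dfs.core.windows.net=${ACCOUNT_KEY}" \
  http://treeverse-clients-us-east.s3-website-us-east-1.amazonaws.com/lakefs-spark-client/0.10.0/lakefs-spark-client-assembly-0.10.0.jar \
  example-repo
```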
+ +
+⚠️ At the moment, only the "mark" phase of the Garbage Collection is supported for GCP. +That is, this program will output a list of expired objects, and you will have to delete them manually. +We have [concrete plans](https://github.com/treeverse/lakeFS/issues/3626) to extend this support to actually delete the objects. +{: .note .note-warning } + +```bash +spark-submit --class io.treeverse.gc.GarbageCollection \ + --jars https://storage.googleapis.com/hadoop-lib/gcs/gcs-connector-hadoop3-latest.jar \ + -c spark.hadoop.lakefs.api.url=https://lakefs.example.com:8000/api/v1 \ + -c spark.hadoop.lakefs.api.access_key= \ + -c spark.hadoop.lakefs.api.secret_key= \ + -c spark.hadoop.google.cloud.auth.service.account.enable=true \ + -c spark.hadoop.google.cloud.auth.service.account.json.keyfile= \ + -c spark.hadoop.fs.gs.project.id= \ + -c spark.hadoop.fs.gs.impl=com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem \ + -c spark.hadoop.fs.AbstractFileSystem.gs.impl=com.google.cloud.hadoop.fs.gcs.GoogleHadoopFS \ + -c spark.hadoop.lakefs.gc.do_sweep=false \ + http://treeverse-clients-us-east.s3-website-us-east-1.amazonaws.com/lakefs-spark-client/0.10.0/lakefs-spark-client-assembly-0.10.0.jar \ + example-repo +``` + +This program will not delete anything. +Instead, it will find all the objects that are safe to delete and save a list containing all their keys, in Parquet format. +The list will then be found under the path: +``` +gs:///_lakefs/retention/gc/unified//deleted/ +``` + +Note that this is a path in your Google Storage bucket, and not in your lakeFS repository. +It is now safe to remove the objects that appear in this list directly from the storage. + +
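Since the job only marks objects on GCP, the actual deletion is up to you. The sketch below shows one possible way to consume the Parquet listing and remove the objects with standard tools. All names are illustrative, and the column name (`address`) and the keys being relative to the storage namespace are assumptions; inspect the listing before deleting anything:

```bash
# Illustrative cleanup of objects marked by a GCP "mark" run (verify the listing first!)
NAMESPACE="gs://example-bucket/example-repo"   # your repository's storage namespace
MARK_ID="gmc6523jatlleurvdm30"                 # taken from the GC job output

# 1. Copy the Parquet listing produced by the mark run
mkdir -p deleted-listing
gsutil -m cp "${NAMESPACE}/_lakefs/retention/gc/unified/${MARK_ID}/deleted/*.parquet" ./deleted-listing/

# 2. Turn the listing into one object URL per line (DuckDB shown here; any Parquet reader works,
#    assuming the listed column is called "address" and is relative to the storage namespace)
duckdb -csv -noheader -c "SELECT '${NAMESPACE}/' || address FROM read_parquet('deleted-listing/*.parquet')" > to_delete.txt

# 3. Review to_delete.txt, then delete the listed objects
gsutil -m rm -I < to_delete.txt
```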
+
+ +You will find the list of objects removed by the job in the storage +namespace of the repository. It is saved in Parquet format under `_lakefs/retention/gc/unified//deleted/`. + +### Mark and Sweep stages + +You can break the job into two stages: +* _Mark_: find objects to remove, without actually removing them. +* _Sweep_: remove the objects. + +#### Mark-only mode + +To make GC run the mark stage only, add the following to your spark-submit command: +```properties +spark.hadoop.lakefs.gc.do_sweep=false +``` + +In mark-only mode, GC will write the keys of the expired objects under: `/_lakefs/retention/gc/unified//`. +_MARK_ID_ is generated by the job. You can find it in the driver's output: + +``` +Report for mark_id=gmc6523jatlleurvdm30 path=s3a://example-bucket/_lakefs/retention/gc/unified/gmc6523jatlleurvdm30 +``` + +#### Sweep-only mode + +To make GC run the sweep stage only, add the following properties to your spark-submit command: +```properties +spark.hadoop.lakefs.gc.do_mark=false +spark.hadoop.lakefs.gc.mark_id= # Replace with the identifier you obtained from a previous mark-only run +``` + +## Garbage collection notes + +1. In order for an object to be removed, it must not exist on the HEAD of any branch. + You should remove stale branches to prevent them from retaining old objects. + For example, consider a branch that has been merged to `main` and has become stale. + An object which is later deleted from `main` will always be present in the stale branch, preventing it from being removed. + +1. lakeFS will never delete objects outside your repository's storage namespace. + In particular, objects that were imported using `lakectl import` or the UI import wizard will not be affected by GC jobs. + +1. In cases where deleted objects are brought back to life while a GC job is running (for example, by reverting a commit), + the objects may or may not be deleted. + +1. Garbage collection does not remove any commits: you will still be able to use commits containing removed objects, + but trying to read these objects from lakeFS will result in a `410 Gone` HTTP status. diff --git a/docs/howto/garbage-collection/index.md b/docs/howto/garbage-collection/index.md index 497069244f9..0d2c6ff59d8 100644 --- a/docs/howto/garbage-collection/index.md +++ b/docs/howto/garbage-collection/index.md @@ -3,254 +3,5 @@ title: Garbage Collection description: Clean up expired objects using the garbage collection feature in lakeFS. parent: How-To has_children: true -redirect_from: - - /reference/garbage-collection.html - - /howto/garbage-collection-index.html - - /howto/garbage-collection.html - - /reference/retention.html --- - -# Garbage Collection - [lakeFS Cloud](https://lakefs.cloud) users enjoy a [managed garbage collection]({% link howto/garbage-collection/managed-gc.md %}) service, and do not need to run this Spark program. -{: .tip } - - -By default, lakeFS keeps all your objects forever. This allows you to travel back in time to previous versions of your data. -However, sometimes you may want to remove the objects from the underlying storage completely. -Reasons for this include cost-reduction and privacy policies. - -The garbage collection (GC) job is a Spark program that removes the following from the underlying storage: -1. _Committed objects_ that have been deleted (or replaced) in lakeFS, and are considered expired according to [rules you define](#garbage-collection-rules). -2. 
_Uncommitted objects_ that are no longer accessible - * For example, objects deleted before ever being committed. - -{% include toc.html %} - -## Garbage collection rules - -{: .note } -These rules only apply to objects that have been _committed_ at some point. -Without retention rules, only inaccessible _uncommitted_ objects will be removed by the job. - -Garbage collection rules determine for how long an object is kept in the storage after it is _deleted_ (or replaced) in lakeFS. -For every branch, the GC job retains deleted objects for the number of days defined for the branch. -In the absence of a branch-specific rule, the default rule for the repository is used. -If an object is present in more than one branch ancestry, it is removed only after the retention period has ended for -all relevant branches. - -Example GC rules for a repository: -```json -{ - "default_retention_days": 14, - "branches": [ - {"branch_id": "main", "retention_days": 21}, - {"branch_id": "dev", "retention_days": 7} - ] -} -``` - -In the above example, objects will be retained for 14 days after deletion by default. -However, if present in the branch `main`, objects will be retained for 21 days. -Objects present _only_ in the `dev` branch will be retained for 7 days after they are deleted. - -### How to configure garbage collection rules - -To define retention rules, either use the `lakectl` command, the lakeFS web UI, or [API](/reference/api.html#/retention/set%20garbage%20collection%20rules): - -
- -
- -Create a JSON file with your GC rules: - -```bash -cat <> example_repo_gc_rules.json -{ - "default_retention_days": 14, - "branches": [ - {"branch_id": "main", "retention_days": 21}, - {"branch_id": "dev", "retention_days": 7} - ] -} -EOT -``` - -Set the GC rules using `lakectl`: -```bash -lakectl gc set-config lakefs://example-repo -f example_repo_gc_rules.json -``` - -
-
-From the lakeFS web UI: - -1. Navigate to the main page of your repository. -2. Go to _Settings_ -> _Garbage Collection_. -3. Click _Edit policy_ and paste your GC rule into the text box as a JSON. -4. Save your changes. - -![GC Rules From UI]({{ site.baseurl }}/assets/img/gc_rules_from_ui.png) -
-
- -## How to run the garbage collection job - -To run the job, use the following `spark-submit` command (or using your preferred method of running Spark programs). - -
- -
- ```bash -spark-submit --class io.treeverse.gc.GarbageCollection \ - --packages org.apache.hadoop:hadoop-aws:2.7.7 \ - -c spark.hadoop.lakefs.api.url=https://lakefs.example.com:8000/api/v1 \ - -c spark.hadoop.lakefs.api.access_key= \ - -c spark.hadoop.lakefs.api.secret_key= \ - -c spark.hadoop.fs.s3a.access.key= \ - -c spark.hadoop.fs.s3a.secret.key= \ - http://treeverse-clients-us-east.s3-website-us-east-1.amazonaws.com/lakefs-spark-client/0.10.0/lakefs-spark-client-assembly-0.10.0.jar \ - example-repo us-east-1 - ``` -
-
- ```bash -spark-submit --class io.treeverse.gc.GarbageCollection \ - --packages org.apache.hadoop:hadoop-aws:2.7.7 \ - -c spark.hadoop.lakefs.api.url=https://lakefs.example.com:8000/api/v1 \ - -c spark.hadoop.lakefs.api.access_key= \ - -c spark.hadoop.lakefs.api.secret_key= \ - -c spark.hadoop.fs.s3a.access.key= \ - -c spark.hadoop.fs.s3a.secret.key= \ - http://treeverse-clients-us-east.s3-website-us-east-1.amazonaws.com/lakefs-spark-client-301/0.10.0/lakefs-spark-client-301-assembly-0.10.0.jar \ - example-repo us-east-1 - ``` -
- -
- -If you want to access your storage using the account key: - - ```bash -spark-submit --class io.treeverse.gc.GarbageCollection \ - --packages org.apache.hadoop:hadoop-aws:3.2.1 \ - -c spark.hadoop.lakefs.api.url=https://lakefs.example.com:8000/api/v1 \ - -c spark.hadoop.lakefs.api.access_key= \ - -c spark.hadoop.lakefs.api.secret_key= \ - -c spark.hadoop.fs.azure.account.key..dfs.core.windows.net= \ - http://treeverse-clients-us-east.s3-website-us-east-1.amazonaws.com/lakefs-spark-client/0.10.0/lakefs-spark-client-assembly-0.10.0.jar \ - example-repo - ``` - -Or, if you want to access your storage using an Azure service principal: - - ```bash -spark-submit --class io.treeverse.gc.GarbageCollection \ - --packages org.apache.hadoop:hadoop-aws:3.2.1 \ - -c spark.hadoop.lakefs.api.url=https://lakefs.example.com:8000/api/v1 \ - -c spark.hadoop.lakefs.api.access_key= \ - -c spark.hadoop.lakefs.api.secret_key= \ - -c spark.hadoop.fs.azure.account.auth.type..dfs.core.windows.net=OAuth \ - -c spark.hadoop.fs.azure.account.oauth.provider.type..dfs.core.windows.net=org.apache.hadoop.fs.azurebfs.oauth2.ClientCredsTokenProvider \ - -c spark.hadoop.fs.azure.account.oauth2.client.id..dfs.core.windows.net= \ - -c spark.hadoop.fs.azure.account.oauth2.client.secret..dfs.core.windows.net= \ - -c spark.hadoop.fs.azure.account.oauth2.client.endpoint..dfs.core.windows.net=https://login.microsoftonline.com//oauth2/token \ - http://treeverse-clients-us-east.s3-website-us-east-1.amazonaws.com/lakefs-spark-client/0.10.0/lakefs-spark-client-assembly-0.10.0.jar \ - example-repo - ``` - -**Notes:** -* On Azure, GC was tested only on Spark 3.3.0, but may work with other Spark and Hadoop versions. -* In case you don't have `hadoop-azure` package as part of your environment, you should add the package to your spark-submit with `--packages org.apache.hadoop:hadoop-azure:3.2.1` -* For GC to work on Azure blob, [soft delete](https://docs.microsoft.com/en-us/azure/storage/blobs/soft-delete-blob-overview) should be disabled. -
- -
-⚠️ At the moment, only the "mark" phase of the Garbage Collection is supported for GCP. -That is, this program will output a list of expired objects, and you will have to delete them manually. -We have [concrete plans](https://github.com/treeverse/lakeFS/issues/3626) to extend this support to actually delete the objects. -{: .note .note-warning } - -```bash -spark-submit --class io.treeverse.gc.GarbageCollection \ - --jars https://storage.googleapis.com/hadoop-lib/gcs/gcs-connector-hadoop3-latest.jar \ - -c spark.hadoop.lakefs.api.url=https://lakefs.example.com:8000/api/v1 \ - -c spark.hadoop.lakefs.api.access_key= \ - -c spark.hadoop.lakefs.api.secret_key= \ - -c spark.hadoop.google.cloud.auth.service.account.enable=true \ - -c spark.hadoop.google.cloud.auth.service.account.json.keyfile= \ - -c spark.hadoop.fs.gs.project.id= \ - -c spark.hadoop.fs.gs.impl=com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem \ - -c spark.hadoop.fs.AbstractFileSystem.gs.impl=com.google.cloud.hadoop.fs.gcs.GoogleHadoopFS \ - -c spark.hadoop.lakefs.gc.do_sweep=false \ - http://treeverse-clients-us-east.s3-website-us-east-1.amazonaws.com/lakefs-spark-client/0.10.0/lakefs-spark-client-assembly-0.10.0.jar \ - example-repo -``` - -This program will not delete anything. -Instead, it will find all the objects that are safe to delete and save a list containing all their keys, in Parquet format. -The list will then be found under the path: -``` -gs:///_lakefs/retention/gc/unified//deleted/ -``` - -Note that this is a path in your Google Storage bucket, and not in your lakeFS repository. -It is now safe to remove the objects that appear in this list directly from the storage. - -
-
- -You will find the list of objects removed by the job in the storage -namespace of the repository. It is saved in Parquet format under `_lakefs/retention/gc/unified//deleted/`. - -### Mark and Sweep stages - -You can break the job into two stages: -* _Mark_: find objects to remove, without actually removing them. -* _Sweep_: remove the objects. - -#### Mark-only mode - -To make GC run the mark stage only, add the following to your spark-submit command: -```properties -spark.hadoop.lakefs.gc.do_sweep=false -``` - -In mark-only mode, GC will write the keys of the expired objects under: `/_lakefs/retention/gc/unified//`. -_MARK_ID_ is generated by the job. You can find it in the driver's output: - -``` -Report for mark_id=gmc6523jatlleurvdm30 path=s3a://example-bucket/_lakefs/retention/gc/unified/gmc6523jatlleurvdm30 -``` - -#### Sweep-only mode - -To make GC run the sweep stage only, add the following properties to your spark-submit command: -```properties -spark.hadoop.lakefs.gc.do_mark=false -spark.hadoop.lakefs.gc.mark_id= # Replace with the identifier you obtained from a previous mark-only run -``` - -## Garbage collection notes - -1. In order for an object to be removed, it must not exist on the HEAD of any branch. - You should remove stale branches to prevent them from retaining old objects. - For example, consider a branch that has been merged to `main` and has become stale. - An object which is later deleted from `main` will always be present in the stale branch, preventing it from being removed. - -1. lakeFS will never delete objects outside your repository's storage namespace. - In particular, objects that were imported using `lakectl import` or the UI import wizard will not be affected by GC jobs. - -1. In cases where deleted objects are brought back to life while a GC job is running (for example, by reverting a commit), - the objects may or may not be deleted. - -1. Garbage collection does not remove any commits: you will still be able to use commits containing removed objects, - but trying to read these objects from lakeFS will result in a `410 Gone` HTTP status. diff --git a/docs/howto/garbage-collection/internals.md b/docs/howto/garbage-collection/internals.md deleted file mode 100644 index 60afc9fedb7..00000000000 --- a/docs/howto/garbage-collection/internals.md +++ /dev/null @@ -1,116 +0,0 @@ ---- -title: Internals -description: How Garbage Collection in lakeFS works -parent: Garbage Collection -grand_parent: How-To -nav_order: 2 -redirect_from: - - /howto/gc-internals.html ---- - -# Committed Garbage Collection Internals - -{: .warning-title } -> Deprecation notice -> -> This page describes a deprecated feature. Please visit the new [garbage collection documentation](./index.html). - - -## What gets collected - -Because each object in lakeFS may be accessible from multiple branches, it -might not be obvious which objects will be considered garbage and collected. - -Garbage collection is configured by specifying the number of days to retain -objects on each branch. If a branch is configured to retain objects for a -given number of days, any object that was accessible from the HEAD of a -branch in that past number of days will be retained. - -The garbage collection process proceeds in three main phases: - -1. **Discover which commits will retain their objects.** For every branch, - the garbage collection job looks at the HEAD of the branch that many days - ago; every commit at or since that HEAD must be retained. 
- - ```mermaid - %%{init: { 'theme': 'base', 'gitGraph': {'rotateCommitLabel': true}} }%% - gitGraph - commit id: "2022-02-27 🚮" - commit id: "2022-03-01 🚮" - commit id: "2022-03-09" - branch dev - checkout main - commit id: "2022-03-12" - checkout dev - commit id: "d: 2022-03-14 🚮" - commit id: "d: 2022-03-16 🚮" - checkout main - commit id: "2022-03-18" - checkout dev - commit id: "d: 2022-03-20 🚮" - commit id: "d: 2022-03-23" - checkout main - merge dev - commit id: "2022-03-26" - ``` - - Continuing the example, branch `main` retains for 21 days and branch `dev` - for 7. When running GC on 2022-03-31: - - - 7 days ago, on 2022-03-24 the head of branch `dev` was `d: - 2022-03-23`. So, that commit is retained (along with all more recent - commits on `dev`) but all older commits `d: *` will be collected. - - 21 days ago, on 2022-03-10, the head of branch `main` was - `2022-03-09`. So that commit is retained (along with all more recent - commits on `main`) but commits `2022-02-27` and `2022-03-01` will be - collected. - -1. **Discover which objects need to be garbage collected.** Hold (_only_) - objects accessible on some retained commits. - - In the example, all objects of commit `2022-03-12`, for instance, are - retained. This _includes_ objects added in previous commits. However, - objects added in commit `d: 2022-03-14` which were overwritten or - deleted in commit `d: 2022-03-20` are not visible in any retained commit - and will be garbage collected. - -1. **Garbage collect those objects by deleting them.** The data of any - deleted object will no longer be accessible. lakeFS retains all metadata - about the object, but attempting to read it via the lakeFS API or the S3 - gateway will return HTTP status 410 ("Gone"). - -## What does _not_ get collected - -Some objects will _not_ be collected regardless of configured GC rules: -* Any object that is accessible from any branch's HEAD. -* Objects stored outside the repository's [storage namespace][storage-namespace]. - For example, objects imported using the lakeFS import UI are not collected. -* Uncommitted objects, see [Uncommitted Garbage Collection](./uncommitted.html), - -## Performance - -Garbage collection reads many commits. It uses Spark to spread the load of -reading the contents of all of these commits. For very large jobs running -on very large clusters, you may want to tweak this load. To do this: - -* Add `-c spark.hadoop.lakefs.gc.range.num_partitions=RANGE_PARTITIONS` - (default 50) to spread the initial load of reading commits across more - Spark executors. -* Add `-c spark.hadoop.lakefs.gc.address.num_partitions=RANGE_PARTITIONS` - (default 200) to spread the load of reading all objects included in a - commit across more Spark executors. - -Normally this should not be needed. - -## Networking - -Garbage collection communicates with the lakeFS server. Very large -repositories may require increasing a read timeout. If you run into timeout errors during communication from the Spark job to lakeFS consider increasing these timeouts: - -* Add `-c spark.hadoop.lakefs.api.read.timeout_seconds=TIMEOUT_IN_SECONDS` - (default 10) to allow lakeFS more time to respond to requests. -* Add `-c - spark.hadoop.lakefs.api.connection.timeout_seconds=TIMEOUT_IN_SECONDS` - (default 10) to wait longer for lakeFS to accept connections. 
- -[storage-namespace]: {% link understand/glossary.md %}#storage-namespace diff --git a/docs/howto/garbage-collection/managed-gc.md b/docs/howto/garbage-collection/managed-gc.md index 9d950101f04..254c3eee658 100644 --- a/docs/howto/garbage-collection/managed-gc.md +++ b/docs/howto/garbage-collection/managed-gc.md @@ -2,7 +2,7 @@ title: Managed Garbage Collection description: Reduce the operational overhead of running garbage collection manually. parent: Garbage Collection -nav_order: 1 +nav_order: 5 grand_parent: How-To redirect_from: - /cloud/managed-gc.html diff --git a/docs/howto/garbage-collection/uncommitted.md b/docs/howto/garbage-collection/uncommitted.md deleted file mode 100644 index 74ccfda49a2..00000000000 --- a/docs/howto/garbage-collection/uncommitted.md +++ /dev/null @@ -1,127 +0,0 @@ ---- -title: (deprecated) Uncommitted Objects -description: Clean up uncommitted objects that are no longer needed. -parent: Garbage Collection -grand_parent: How-To -nav_order: 99 -redirect_from: - - /howto/garbage-collection-uncommitted.html ---- - -# Garbage collection: uncommitted objects - -{: .warning-title } -> Deprecation notice -> -> This feature will be available up to version 0.9.1 of the lakeFS metadata client. It will be discontinued in subsequent versions. -> Please visit the new [garbage collection documentation](./index.md). - -Deletion of objects that were never committed was always a difficulty for lakeFS, see -[#1933](https://github.com/treeverse/lakeFS/issues/1933) for more details. Examples for -objects that will be collected as part of the uncommitted GC job: -1. Objects that were uploaded to lakeFS and deleted. -2. Objects that were uploaded to lakeFS and were overridden. - -{% include toc.html %} - -See discussion on the original [design PR](https://github.com/treeverse/lakeFS/pull/4015) to understand why we didn't go with a server-only solution. -{: .note} - -The uncommitted GC will not clean: -1. Committed objects. See [Committed Garbage Collection](./committed.md) -2. Everything mentioned in [what does not get collected]( {% link howto/garbage-collection/internals.md %}#what-does-_not_-get-collected) - -## Prerequisites - -1. lakeFS server version must be at least [v0.87.0](https://github.com/treeverse/lakeFS/releases/tag/v0.87.0). - If your version is lower, you should first upgrade. -2. Read the [limitations](#limitations) section. -3. Setup [rclone](https://rclone.org/) to access underlying bucket for backup and restore. - - -## Running the uncommitted GC - -1. Mark the files to delete - summary and report will be generated under `/_lakefs/retention/gc/uncommitted//`. - By listing the bucket under 'uncommitted' the last entry represents the last mark ID of the uncommitted GC. - The GC job prints out "Report for mark_id=..." which includes the mark ID with the run summary. - - ```bash - spark-submit \ - --conf spark.hadoop.lakefs.gc.do_sweep=false \ - --conf spark.hadoop.lakefs.api.url= \ - --conf spark.hadoop.fs.s3a.access.key= \ - --conf spark.hadoop.fs.s3a.secret.key= \ - --conf spark.hadoop.lakefs.api.access_key= \ - --conf spark.hadoop.lakefs.api.secret_key= \ - --class io.treeverse.gc.UncommittedGarbageCollector \ - --packages org.apache.hadoop:hadoop-aws:2.7.7 \ - - ``` - -2. Backup (optional but recommended) - when you start using the feature you may want to first gain confidence in the decisions uncommitted GC makes. Backup will copy the objects marked to be deleted for run ID to a specified location. 
- Follow [rclone documentation](https://rclone.org/docs/) to configure remote access to lakeFS storage. - Note that the lakeFS and backup locations are specified as `remote:path` based on how rclone was configured. - - ```shell - rclone --include "*.txt" cat "/_lakefs/retention/gc/uncommitted//deleted.text/" | \ - rclone -P --no-traverse --files-from - copy - ``` - -4. Sweep - delete reported objects to delete based on mark ID - - ```bash - spark-submit \ - --conf spark.hadoop.lakefs.gc.mark_id= \ - --conf spark.hadoop.lakefs.gc.do_mark=false \ - --conf spark.hadoop.lakefs.api.url= \ - --conf spark.hadoop.fs.s3a.access.key= \ - --conf spark.hadoop.fs.s3a.secret.key= \ - --conf spark.hadoop.lakefs.api.access_key= \ - --conf spark.hadoop.lakefs.api.secret_key= \ - --class io.treeverse.gc.UncommittedGarbageCollector \ - --packages org.apache.hadoop:hadoop-aws:2.7.7 \ - - ``` - -5. Restore - in any case we would like to undo and restore the data from from our backup. The following command will copy the objects back from the backup location using the information stored under the specific mark ID. - Note that the lakeFS and backup locations are specified as `remote:path` based on how rclone was configured. - - ```shell - rclone --include "*.txt" cat "remote:/_lakefs/retention/gc/uncommitted//deleted.text/" | \ - rclone -P --no-traverse --files-from - copy - ``` - - -## Uncommitted GC job options - -Similar to the [committed GC option]({% link howto/garbage-collection/committed.md %}#gc-job-options). - -## Limitations - -The uncommitted GC job has several limitations in its Beta version: -1. Support is limited to S3 repositories, it was not tested on ABS, GS or MinIO. -1. Scale may be limited, see performance results below. -1. [Issue](https://github.com/treeverse/lakeFS/issues/5088) associated to commit during copy object. - -## Next steps - -The uncommitted GC is under development, next releases will include: - -1. Incorporation of committed & uncommitted GC into a single job. We understand the friction - of having 2 garbage collection jobs for a lakeFS installation and working to creating a - single job for it. -2. Removing the limitation of a read-only lakeFS during the job run. -3. Performance improvements: - 1. Better parallelization of the storage namespace traversal. - 2. Optimized Run: GC will only iterate over objects that were written to the - repository since the last GC run. For more information see the [proposal](https://github.com/treeverse/lakeFS/blob/master/design/accepted/gc_plus/uncommitted-gc.md#flow-2-optimized-run). -4. Backup & Restore, similar to [committed GC]({% link howto/garbage-collection/committed.md %}#backup-and-restore). -5. Support for non-S3 repositories. - -## Performance - -The uncommitted GC job was tested on a repository with 1K branches, -25K uncommitted objects and 2K commits. -The storage namespace number of objects prior to the cleanup was 103K objects. -The job ran on a Spark cluster with a single master and 2 workers of type [i3.2xlarge](https://aws.amazon.com/ec2/instance-types/i3/) -The job finished after 5 minutes deleting 15K objects. diff --git a/docs/understand/glossary.md b/docs/understand/glossary.md index b24db3316d1..dc2696266c4 100644 --- a/docs/understand/glossary.md +++ b/docs/understand/glossary.md @@ -36,7 +36,7 @@ It is sometimes referred as multi-table transactions. That is, lakeFS offers tra ## Data Lake Governance ### -The goal of data lake governance is to apply policies, standards and processes on the data. 
This allows creating high-quality data and ensuring that it’s used appropriately across the organization. Data lake governance improves the data quality and increases data usage for business decision-making, leading to operational improvements, better-informed business strategies, and stronger financial performance. lakeFS Cloud offers advanced data lake management features such as: [Role-Based Access Control]({% link reference/security/rbac.md %}), [Branch Aware Managed Garbage Collection]({% link howto/garbage-collection/committed.md %}), [Data Lineage and Audit log]({% link reference/auditing.md %}). +The goal of data lake governance is to apply policies, standards and processes on the data. This allows creating high-quality data and ensuring that it’s used appropriately across the organization. Data lake governance improves the data quality and increases data usage for business decision-making, leading to operational improvements, better-informed business strategies, and stronger financial performance. lakeFS Cloud offers advanced data lake management features such as: [Role-Based Access Control]({% link reference/security/rbac.md %}), [Branch Aware Managed Garbage Collection]({% link howto/garbage-collection/gc.md %}), [Data Lineage and Audit log]({% link reference/auditing.md %}). ## Data Lifecycle Management In data-intensive applications, data should be managed through its entire lifecycle similar to how teams manage code. By doing so, we could leverage the best practices and tools from application lifecycle management (like CI/CD operations) and apply them to data. lakeFS offers data lifecycle management via [isolated data development environments]({% link understand/use_cases/etl_testing.md %}) instead of shared buckets.