From fa73053268006f3e545273cdbe3d5d612f457f6f Mon Sep 17 00:00:00 2001
From: adilkhan
Date: Fri, 1 Sep 2023 19:11:15 +0600
Subject: [PATCH 01/22] first changes

---
 deeplake/core/vectorstore/deeplake_vectorstore.py |  2 ++
 deeplake/core/vectorstore/vector_search/utils.py  | 14 +++++++-------
 2 files changed, 9 insertions(+), 7 deletions(-)

diff --git a/deeplake/core/vectorstore/deeplake_vectorstore.py b/deeplake/core/vectorstore/deeplake_vectorstore.py
index debdda12e1..32a0945101 100644
--- a/deeplake/core/vectorstore/deeplake_vectorstore.py
+++ b/deeplake/core/vectorstore/deeplake_vectorstore.py
@@ -15,6 +15,7 @@ from deeplake.constants import (
     DEFAULT_VECTORSTORE_TENSORS,
 )
+from deeplake.client.client import DeepLakeBackendClient
 from deeplake.core.vectorstore import utils
 from deeplake.core.vectorstore.vector_search import vector_search
 from deeplake.core.vectorstore.vector_search import dataset as dataset_utils
@@ -127,6 +128,7 @@ def __init__(
         self.ingestion_batch_size = ingestion_batch_size
         self.num_workers = num_workers
+        token = token or DeepLakeBackendClient().get_token()
 
         if creds is None:
             creds = {}
diff --git a/deeplake/core/vectorstore/vector_search/utils.py b/deeplake/core/vectorstore/vector_search/utils.py
index f513332a6e..c3bdb1bcaf 100644
--- a/deeplake/core/vectorstore/vector_search/utils.py
+++ b/deeplake/core/vectorstore/vector_search/utils.py
@@ -25,13 +25,13 @@ def parse_exec_option(dataset, exec_option, indra_installed):
     """Select the best available exec_option for the given dataset and environment"""
 
     if exec_option is None or exec_option == "auto":
-        if isinstance(dataset, DeepLakeCloudDataset):
-            if "vectordb/" in dataset.base_storage.path:
-                return "tensor_db"
-            elif indra_installed:
-                return "compute_engine"
-            else:
-                return "python"
+        if (
+            isinstance(dataset, DeepLakeCloudDataset)
+            and "vectordb/" in dataset.base_storage.path
+        ):
+            return "tensor_db"
+        elif isinstance(dataset, DeepLakeCloudDataset) and indra_installed:
+            return "compute_engine"
         else:
             return "python"
     else:

From 3008160556fd785beac9ef54d0baf9c08d6d7de8 Mon Sep 17 00:00:00 2001
From: adilkhan
Date: Fri, 1 Sep 2023 21:09:19 +0600
Subject: [PATCH 02/22] few more changes

---
 deeplake/core/vectorstore/vector_search/utils.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/deeplake/core/vectorstore/vector_search/utils.py b/deeplake/core/vectorstore/vector_search/utils.py
index c3bdb1bcaf..1cb62477db 100644
--- a/deeplake/core/vectorstore/vector_search/utils.py
+++ b/deeplake/core/vectorstore/vector_search/utils.py
@@ -2,7 +2,7 @@
 from deeplake.enterprise.util import raise_indra_installation_error
 from deeplake.util.warnings import always_warn
 
-from deeplake.core.dataset import DeepLakeCloudDataset
+from deeplake.core.dataset import DeepLakeCloudDataset, Dataset
 
 import numpy as np
@@ -30,7 +30,7 @@ def parse_exec_option(dataset, exec_option, indra_installed):
             and "vectordb/" in dataset.base_storage.path
         ):
             return "tensor_db"
-        elif isinstance(dataset, DeepLakeCloudDataset) and indra_installed:
+        elif isinstance(dataset, (DeepLakeCloudDataset, Dataset)) and indra_installed:
             return "compute_engine"
         else:
             return "python"
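Note: after PATCH 02 the automatic selection reduces to three rules. A standalone sketch of the equivalent decision logic (not part of the patch; `Dataset` and `DeepLakeCloudDataset` come from `deeplake.core.dataset`):

    from deeplake.core.dataset import Dataset, DeepLakeCloudDataset

    def resolve_exec_option(dataset, exec_option, indra_installed):
        # An explicit exec_option always wins over auto-detection.
        if exec_option not in (None, "auto"):
            return exec_option
        # Managed tensor database: a cloud dataset stored under "vectordb/".
        if (
            isinstance(dataset, DeepLakeCloudDataset)
            and "vectordb/" in dataset.base_storage.path
        ):
            return "tensor_db"
        # Any dataset can use compute_engine once libdeeplake (indra) is installed.
        if isinstance(dataset, (DeepLakeCloudDataset, Dataset)) and indra_installed:
            return "compute_engine"
        # Plain Python search is the universal fallback.
        return "python"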
From c8796b3d43cc4158d8307fdbfdd56dde250eebc0 Mon Sep 17 00:00:00 2001
From: adilkhan
Date: Mon, 4 Sep 2023 17:28:06 +0600
Subject: [PATCH 03/22] added tests

---
 .../core/vectorstore/deeplake_vectorstore.py       |  2 +-
 .../vectorstore/test_deeplake_vectorstore.py       | 89 +++++++++++++++++++
 .../core/vectorstore/vector_search/utils.py        |  9 +-
 3 files changed, 97 insertions(+), 3 deletions(-)

diff --git a/deeplake/core/vectorstore/deeplake_vectorstore.py b/deeplake/core/vectorstore/deeplake_vectorstore.py
index 32a0945101..a357b538e0 100644
--- a/deeplake/core/vectorstore/deeplake_vectorstore.py
+++ b/deeplake/core/vectorstore/deeplake_vectorstore.py
@@ -148,7 +148,7 @@ def __init__(
         )
         self.embedding_function = embedding_function
         self.exec_option = utils.parse_exec_option(
-            self.dataset, exec_option, _INDRA_INSTALLED
+            self.dataset, exec_option, _INDRA_INSTALLED, token
         )
         self.verbose = verbose
         self.tensor_params = tensor_params
diff --git a/deeplake/core/vectorstore/test_deeplake_vectorstore.py b/deeplake/core/vectorstore/test_deeplake_vectorstore.py
index 85b4ad4f72..25162b00e7 100644
--- a/deeplake/core/vectorstore/test_deeplake_vectorstore.py
+++ b/deeplake/core/vectorstore/test_deeplake_vectorstore.py
@@ -23,6 +23,7 @@
     DatasetHandlerError,
 )
 from deeplake.core.vectorstore.vector_search import dataset as dataset_utils
+from deeplake.cli.auth import login, logout
 
 EMBEDDING_DIM = 100
@@ -1801,3 +1802,91 @@ def test_read_only():
 def test_delete_by_path_wrong_path():
     with pytest.raises(DatasetHandlerError):
         VectorStore.delete_by_path("some_path")
+
+
+@requires_libdeeplake
+def test_exec_option_with_auth(local_path, hub_cloud_path, hub_cloud_dev_token):
+    db = VectorStore(path=local_path)
+    assert db.exec_option == "python"
+
+    db = VectorStore(
+        path=local_path,
+        token=hub_cloud_dev_token,
+    )
+    assert db.exec_option == "compute_engine"
+
+    db = VectorStore(
+        path=hub_cloud_path,
+        token=hub_cloud_dev_token,
+    )
+    assert db.exec_option == "compute_engine"
+
+    db = VectorStore(
+        path=hub_cloud_path,
+        token=hub_cloud_dev_token,
+        runtime={"tensor_db": True},
+    )
+    assert db.exec_option == "tensor_db"
+
+
+@requires_libdeeplake
+def test_exec_option_cli(
+    local_path,
+    hub_cloud_path,
+    username,
+    password,
+    runner,
+):
+    # Testing exec_option with cli login and logout commands are executed
+    runner.invoke(login, f"-u {username} -p {password}")
+
+    # local dataset and logged in with cli
+    db = VectorStore(
+        path=local_path,
+    )
+    assert db.exec_option == "compute_engine"
+
+    # hub cloud dataset and logged in with cli
+    db = VectorStore(
+        path=hub_cloud_path,
+    )
+    assert db.exec_option == "compute_engine"
+
+    # logging out with cli
+    runner.invoke(logout)
+
+    # local dataset and logged out with cli
+    db = VectorStore(
+        path=hub_cloud_path,
+    )
+    assert db.exec_option == "python"
+
+
+@requires_libdeeplake
+@pytest.mark.parametrize(
+    "generator",
+    [
+        "s3_ds_generator",
+        "gcs_ds_generator",
+        "azure_ds_generator",
+    ],
+)
+def test_exec_option_with_connected_datasets(
+    hub_cloud_dev_token,
+    hub_cloud_path,
+    hub_cloud_dev_managed_creds_key,
+    generator,
+):
+    ds = generator()
+    ds.create_tensor("x")
+    ds.x.append(10)
+
+    ds.connect(
+        creds_key=hub_cloud_dev_managed_creds_key,
+        dest_path=hub_cloud_path,
+        token=hub_cloud_dev_token,
+    )
+    ds.add_creds_key(hub_cloud_dev_managed_creds_key, managed=True)
+
+    db = VectorStore(path=hub_cloud_path)
+    assert db.exec_option == "compute_engine"
diff --git a/deeplake/core/vectorstore/vector_search/utils.py b/deeplake/core/vectorstore/vector_search/utils.py
index 1cb62477db..8b98810206 100644
--- a/deeplake/core/vectorstore/vector_search/utils.py
+++ b/deeplake/core/vectorstore/vector_search/utils.py
@@ -21,16 +21,21 @@ def parse_tensor_return(tensor):
     return tensor.data(aslist=True)["value"]
 
 
-def parse_exec_option(dataset, exec_option, indra_installed):
+def parse_exec_option(dataset, exec_option, indra_installed, token):
     """Select the best available exec_option for the given dataset and environment"""
 
     if exec_option is None or exec_option == "auto":
         if (
             isinstance(dataset, DeepLakeCloudDataset)
             and "vectordb/" in dataset.base_storage.path
+            and token is not None
         ):
             return "tensor_db"
-        elif isinstance(dataset, (DeepLakeCloudDataset, Dataset)) and indra_installed:
+        elif (
+            isinstance(dataset, (DeepLakeCloudDataset, Dataset))
+            and indra_installed
+            and token is not None
+        ):
             return "compute_engine"
         else:
             return "python"
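Note: PATCH 03 gates both accelerated options on a token, so an anonymous caller now always falls back to "python". A hedged usage sketch (the `VectorStore` import path and token value are assumptions, not taken from the patch):

    from deeplake.core.vectorstore import VectorStore

    db = VectorStore(path="./vectors")                       # token=None -> "python"
    db = VectorStore(path="./vectors", token="<api-token>")  # with libdeeplake -> "compute_engine"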
From 71dab4e421b3db47bff5da2514fc83ca88f09e6e Mon Sep 17 00:00:00 2001
From: adilkhan
Date: Mon, 4 Sep 2023 20:49:07 +0600
Subject: [PATCH 04/22] Changes

---
 deeplake/core/vectorstore/deeplake_vectorstore.py      | 4 +++-
 deeplake/core/vectorstore/test_deeplake_vectorstore.py | 8 ++++++++
 2 files changed, 11 insertions(+), 1 deletion(-)

diff --git a/deeplake/core/vectorstore/deeplake_vectorstore.py b/deeplake/core/vectorstore/deeplake_vectorstore.py
index b1af457870..7f1e7dd8c5 100644
--- a/deeplake/core/vectorstore/deeplake_vectorstore.py
+++ b/deeplake/core/vectorstore/deeplake_vectorstore.py
@@ -128,7 +128,9 @@ def __init__(
         self.ingestion_batch_size = ingestion_batch_size
         self.num_workers = num_workers
-        token = token or DeepLakeBackendClient().get_token()
+        user_profile = DeepLakeBackendClient().get_user_profile()
+        if user_profile["name"] != "public":
+            token = token or DeepLakeBackendClient().get_token()
 
         if creds is None:
             creds = {}
diff --git a/deeplake/core/vectorstore/test_deeplake_vectorstore.py b/deeplake/core/vectorstore/test_deeplake_vectorstore.py
index a9815198fb..6f19354fad 100644
--- a/deeplake/core/vectorstore/test_deeplake_vectorstore.py
+++ b/deeplake/core/vectorstore/test_deeplake_vectorstore.py
@@ -1819,6 +1819,7 @@ def test_exec_option_cli(
     local_path,
     hub_cloud_path,
     username,
     password,
     runner,
+    hub_cloud_dev_token,
 ):
     # Testing exec_option with cli login and logout commands are executed
     runner.invoke(login, f"-u {username} -p {password}")
@@ -1844,6 +1845,13 @@
     )
     assert db.exec_option == "python"
 
+    # logging in with cli token
+    runner.invoke(login, f"-t {hub_cloud_dev_token}")
+    db = VectorStore(
+        path=local_path,
+    )
+    assert db.exec_option == "compute_engine"
+
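Note: the new assertion in PATCH 04 relies on the CLI storing credentials that the VectorStore later discovers. A sketch mirroring the test flow (fixture values replaced with placeholders):

    from click.testing import CliRunner
    from deeplake.cli.auth import login, logout

    runner = CliRunner()
    runner.invoke(login, "-t <api-token>")  # token-based login, as in the test
    # VectorStore(path=local_path).exec_option -> "compute_engine"
    runner.invoke(logout)
    # VectorStore(path=local_path).exec_option -> "python"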
From 0616fdfb4ab834c9d7895d07320f49871b48c3bc Mon Sep 17 00:00:00 2001
From: adilkhan
Date: Wed, 6 Sep 2023 16:53:41 +0600
Subject: [PATCH 05/22] WIP

---
 deeplake/api/dataset.py                            |   8 ++
 deeplake/client/client.py                          |  18 +++
 deeplake/client/config.py                          |   1 +
 deeplake/core/dataset/dataset.py                   |   3 +
 .../core/vectorstore/deeplake_vectorstore.py       |  21 ++-
 .../vector_search/dataset/dataset.py               |   9 ++
 .../core/vectorstore/vector_search/utils.py        | 120 ++++++++++++++++--
 deeplake/util/bugout_reporter.py                   |   1 +
 8 files changed, 165 insertions(+), 16 deletions(-)

diff --git a/deeplake/api/dataset.py b/deeplake/api/dataset.py
index 1e9be1e2f4..a1ddf1a376 100644
--- a/deeplake/api/dataset.py
+++ b/deeplake/api/dataset.py
@@ -12,6 +12,7 @@
 from deeplake.auto.unstructured.yolo.yolo import YoloDataset
 from deeplake.client.client import DeepLakeBackendClient
 from deeplake.client.log import logger
+from deeplake.client.utils import read_token
 from deeplake.core.dataset import Dataset, dataset_factory
 from deeplake.core.tensor import Tensor
 from deeplake.core.meta.dataset_meta import DatasetMeta
@@ -424,6 +425,12 @@ def empty(
                 local_cache_size=local_cache_size,
             )
 
+            token = token or read_token(from_env=True)
+            if token is not None and org_id is None:
+                # for local datasets
+                client = DeepLakeBackendClient(token=token)
+                org_id = client.get_user_profile()["name"]
+
             feature_report_path(
                 path,
                 "empty",
@@ -434,6 +441,7 @@ def empty(
                     "lock_timeout": lock_timeout,
                 },
                 token=token,
+                username=org_id or "public",
            )
        except Exception as e:
            if isinstance(e, UserNotLoggedInException):
diff --git a/deeplake/client/client.py b/deeplake/client/client.py
index c551dfe254..33dfe29287 100644
--- a/deeplake/client/client.py
+++ b/deeplake/client/client.py
@@ -39,6 +39,7 @@
     GET_PRESIGNED_URL_SUFFIX,
     CONNECT_DATASET_SUFFIX,
     REMOTE_QUERY_SUFFIX,
+    ORG_PERMISSION_SUFFIX,
 )
 from deeplake.client.log import logger
 import jwt  # should add it to requirements.txt
@@ -509,3 +510,20 @@ def remote_query(
         ).json()
 
         return response
+
+    def has_indra_org_permission(self, org_id: str) -> Dict[str, Any]:
+        """Checks whether the given organization has access to the indra (Compute Engine) query feature.
+
+        Args:
+            org_id (str): The organization to which the dataset belongs.
+
+        Returns:
+            Dict[str, Any]: The json response containing org permissions.
+        """
+        response = self.request(
+            "GET",
+            ORG_PERMISSION_SUFFIX.format(org_id),
+            endpoint=self.endpoint(),
+        ).json()
+
+        return response
diff --git a/deeplake/client/config.py b/deeplake/client/config.py
index d3280f2b42..326a4f5e5b 100644
--- a/deeplake/client/config.py
+++ b/deeplake/client/config.py
@@ -30,3 +30,4 @@
 DEFAULT_REQUEST_TIMEOUT = 170
 
 DEEPLAKE_AUTH_TOKEN = "ACTIVELOOP_TOKEN"
+ORG_PERMISSION_SUFFIX = "/api/organizations/{}/features/dataset_query"
diff --git a/deeplake/core/dataset/dataset.py b/deeplake/core/dataset/dataset.py
index fbb24341e7..56fa613478 100644
--- a/deeplake/core/dataset/dataset.py
+++ b/deeplake/core/dataset/dataset.py
@@ -4681,3 +4681,6 @@ def _temp_write_access(self):
 
     def _get_storage_repository(self) -> Optional[str]:
         return getattr(self.base_storage, "repository", None)
+
+    def get_user_name(self) -> Optional[str]:
+        return getattr(self.base_storage, "user_name", None)
diff --git a/deeplake/core/vectorstore/deeplake_vectorstore.py b/deeplake/core/vectorstore/deeplake_vectorstore.py
index 7f1e7dd8c5..992c12e12a 100644
--- a/deeplake/core/vectorstore/deeplake_vectorstore.py
+++ b/deeplake/core/vectorstore/deeplake_vectorstore.py
@@ -16,10 +16,12 @@ from deeplake.constants import (
     DEFAULT_VECTORSTORE_TENSORS,
 )
 from deeplake.client.client import DeepLakeBackendClient
+from deeplake.client.utils import read_token
 from deeplake.core.vectorstore import utils
 from deeplake.core.vectorstore.vector_search import vector_search
 from deeplake.core.vectorstore.vector_search import dataset as dataset_utils
 from deeplake.core.vectorstore.vector_search import filter as filter_utils
+from time import time
 
 from deeplake.util.bugout_reporter import (
     feature_report_path,
@@ -46,6 +48,7 @@ def __init__(
         verbose: bool = True,
         runtime: Optional[Dict] = None,
         creds: Optional[Union[Dict, str]] = None,
+        org_id: Optional[str] = None,
         **kwargs: Any,
     ) -> None:
         """Creates an empty VectorStore or loads an existing one if it exists at the specified ``path``.
@@ -106,6 +109,13 @@ def __init__(
         Danger:
             Setting ``overwrite`` to ``True`` will delete all of your data if the Vector Store exists! Be very careful when setting this parameter.
         """
+        token = token or read_token(from_env=True)
+        if token is not None and org_id is None:
+            # for local datasets
+            client = DeepLakeBackendClient(token=token)
+            org_id = client.get_user_profile()["name"]
+        # if org_id is None and
+
         feature_report_path(
             path,
             "vs.initialize",
@@ -124,13 +134,11 @@ def __init__(
                 "runtime": runtime,
             },
             token=token,
+            username=org_id,
         )
 
         self.ingestion_batch_size = ingestion_batch_size
         self.num_workers = num_workers
-        user_profile = DeepLakeBackendClient().get_user_profile()
-        if user_profile["name"] != "public":
-            token = token or DeepLakeBackendClient().get_token()
 
         if creds is None:
             creds = {}
@@ -146,12 +154,17 @@ def __init__(
             embedding_function,
             overwrite,
             runtime,
+            org_id,
             **kwargs,
         )
         self.embedding_function = embedding_function
+        start = time()
         self.exec_option = utils.parse_exec_option(
-            self.dataset, exec_option, _INDRA_INSTALLED, token
+            self.dataset, exec_option, _INDRA_INSTALLED, token, org_id
         )
+        # self.exec_option = "python"
+        end = time()
+        print("finished parsing exec option in ", end - start)
         self.verbose = verbose
         self.tensor_params = tensor_params
diff --git a/deeplake/core/vectorstore/vector_search/dataset/dataset.py b/deeplake/core/vectorstore/vector_search/dataset/dataset.py
index e52b09d8b8..4e563816ff 100644
--- a/deeplake/core/vectorstore/vector_search/dataset/dataset.py
+++ b/deeplake/core/vectorstore/vector_search/dataset/dataset.py
@@ -15,6 +15,7 @@
     _INDRA_INSTALLED = False  # pragma: no cover
 
 import deeplake
+from deeplake.util.path import get_path_type
 from deeplake.core.vectorstore.vector_search import utils
 from deeplake.core.vectorstore.vector_search.ingestion import ingest_data
 from deeplake.constants import (
@@ -39,11 +40,13 @@ def create_or_load_dataset(
     embedding_function,
     overwrite,
     runtime,
+    org_id,
     **kwargs,
 ):
     utils.check_indra_installation(
         exec_option=exec_option, indra_installed=_INDRA_INSTALLED
     )
+    org_id = org_id if get_path_type(dataset_path) == "local" else None
 
     if not overwrite and dataset_exists(dataset_path, token, creds, **kwargs):
         if tensor_params is not None and tensor_params != DEFAULT_VECTORSTORE_TENSORS:
@@ -58,6 +61,7 @@ def create_or_load_dataset(
             logger,
             read_only,
             runtime,
+            org_id,
             **kwargs,
         )
 
@@ -71,6 +75,7 @@ def create_or_load_dataset(
         overwrite,
         creds,
         runtime,
+        org_id,
         **kwargs,
     )
 
@@ -89,6 +94,7 @@ def load_dataset(
     logger,
     read_only,
     runtime,
+    org_id,
     **kwargs,
 ):
     if dataset_path == DEFAULT_VECTORSTORE_DEEPLAKE_PATH:
@@ -103,6 +109,7 @@ def load_dataset(
         read_only=read_only,
         creds=creds,
         verbose=False,
+        org_id=org_id,
         **kwargs,
     )
     check_tensors(dataset)
@@ -171,6 +178,7 @@ def create_dataset(
     overwrite,
     creds,
     runtime,
+    org_id,
     **kwargs,
 ):
     if exec_option == "tensor_db" and (
@@ -191,6 +199,7 @@ def create_dataset(
         verbose=False,
         overwrite=overwrite,
         creds=creds,
+        org_id=org_id,
         **kwargs,
     )
     create_tensors(tensor_params, dataset, logger, embedding_function)
diff --git a/deeplake/core/vectorstore/vector_search/utils.py b/deeplake/core/vectorstore/vector_search/utils.py
index 8b98810206..f3bd9d50e2 100644
--- a/deeplake/core/vectorstore/vector_search/utils.py
+++ b/deeplake/core/vectorstore/vector_search/utils.py
@@ -1,11 +1,15 @@
+from abc import ABC, abstractmethod
+
 from deeplake.constants import MB
 from deeplake.enterprise.util import raise_indra_installation_error
 from deeplake.util.warnings import always_warn
-
+from deeplake.client.utils import read_token
 from deeplake.core.dataset import DeepLakeCloudDataset, Dataset
+from deeplake.client.client import DeepLakeBackendClient
 
 import numpy as np
+import jwt
 import random
 import string
 from typing import Optional, List, Dict, Callable
@@ -21,16 +25,118 @@ def parse_tensor_return(tensor):
     return tensor.data(aslist=True)["value"]
 
 
-def parse_exec_option(dataset, exec_option, indra_installed, token):
-    """Select the best available exec_option for the given dataset and environment"""
-
-    if exec_option is None or exec_option == "auto":
-        if (
-            isinstance(dataset, DeepLakeCloudDataset)
-            and "vectordb/" in dataset.base_storage.path
-            and token is not None
-        ):
-            return "tensor_db"
-        elif (
-            isinstance(dataset, (DeepLakeCloudDataset, Dataset))
-            and indra_installed
-            and token is not None
-        ):
-            return "compute_engine"
-        else:
-            return "python"
-    else:
-        return exec_option
+def parse_exec_option_for_cloud_dataset(dataset, indra_installed, token, org_id):
+    client = dataset.client
+    user_profile = client.get_user_profile()
+    if user_profile["name"] != "public":
+        token = token or client.get_token()
+    response = client.has_indra_org_permission(org_id)
+    has_access_to_indra = response.get("available", False)
+
+    # option 1: dataset is created in vector_db:
+    if (
+        isinstance(dataset, DeepLakeCloudDataset)
+        and "vectordb/" in dataset.base_storage.path
+        and token is not None
+    ):
+        return "tensor_db"
+    # option 2: dataset is created in a linked storage or locally,
+    # indra is installed and the user/org has access to indra
+    elif (
+        isinstance(dataset, (DeepLakeCloudDataset))
+        and indra_installed
+        and has_access_to_indra
+    ):
+        return "compute_engine"
+    else:
+        return "python"
+
+
+class ExecOptionBase(ABC):
+    def get_token(self, token):
+        user_profile = self.client.get_user_profile()
+        if user_profile["name"] != "public":
+            token = token or self.client.get_token()
+        return token
+
+    def get_response(self):
+        response = self.client.has_indra_org_permission(self.org_id)
+        return response.get("available", False)
+
+    @abstractmethod
+    def get_exec_option(self):
+        raise NotImplementedError()
+
+    @abstractmethod
+    def get_org_id(self):
+        raise NotImplementedError()
+
+
+class ExecOptionCloudDataset(ExecOptionBase):
+    def __init__(self, dataset, indra_installed, org_id):
+        self.dataset = dataset
+        self.indra_installed = indra_installed
+        self.client = dataset.client
+        self.token = self.dataset.token
+        self.org_id = self.get_org_id()
+
+    def get_exec_option(self):
+        # option 1: dataset is created in vector_db:
+        if (
+            isinstance(self.dataset, DeepLakeCloudDataset)
+            and "vectordb/" in self.dataset.base_storage.path
+            and self.token is not None
+        ):
+            return "tensor_db"
+        # option 2: dataset is created in a linked storage or locally,
+        # indra is installed and the user/org has access to indra
+        elif (
+            isinstance(self.dataset, (DeepLakeCloudDataset))
+            and self.indra_installed
+            and self.get_response()
+        ):
+            return "compute_engine"
+        else:
+            return "python"
+
+    def get_org_id(self):
+        return self.dataset.org_id
+
+
+class ExecOptionLocalDataset(ExecOptionBase):
+    def __init__(self, dataset, indra_installed, org_id):
+        self.dataset = dataset
+        self.indra_installed = indra_installed
+        self.token = self.dataset.token
+        self.org_id = org_id
+
+    def get_org_id(self):
+        if self.org_id is None and self.token:
+            return jwt.decode(self.token, options={"verify_signature": False})["id"]
+        return self.org_id
+
+    def get_exec_option(self):
+        if self.token is None:
+            return "python"
+
+        self.org_id = self.get_org_id()
+
+        if self.indra_installed and self.get_response():
+            return "compute_engine"
+        return "python"
+
+
+def exec_option_factory(dataset, indra_installed, org_id):
+    if dataset.client is None:
+        return ExecOptionLocalDataset(dataset, indra_installed, org_id)
+    return ExecOptionCloudDataset(dataset, indra_installed, org_id)
+
+
+def parse_exec_option(dataset, exec_option, indra_installed, org_id):
+    if exec_option is None or exec_option == "auto":
+        exec_option = exec_option_factory(dataset, indra_installed, org_id)
+        return exec_option.get_exec_option()
+    return exec_option
 
 
 def parse_return_tensors(dataset, return_tensors, embedding_tensor, return_view):
diff --git a/deeplake/util/bugout_reporter.py b/deeplake/util/bugout_reporter.py
index 1cc0b15006..2929b473fd 100644
--- a/deeplake/util/bugout_reporter.py
+++ b/deeplake/util/bugout_reporter.py
@@ -151,6 +151,7 @@ def feature_report_path(
     parameters: dict,
     starts_with: str = "hub://",
     token: Optional[str] = None,
+    username: str = "public",
 ):
     """Helper function for generating humbug feature reports depending on the path"""
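Note: PATCH 05 replaces the single if/elif chain with a small strategy hierarchy; the factory picks the local or cloud strategy by whether the dataset has a backend client. A usage sketch under the names introduced above (the dataset, flag, and org values are illustrative):

    # dataset and indra_installed come from the surrounding VectorStore setup.
    strategy = exec_option_factory(dataset, indra_installed, org_id="my_org")
    exec_option = strategy.get_exec_option()  # "tensor_db", "compute_engine" or "python"

This keeps the permission lookup (has_indra_org_permission) in one place while letting local and cloud datasets resolve their org/user identity differently.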
From 7325267cf4b64c39e6e2d90b76cfd4510ffd52ac Mon Sep 17 00:00:00 2001
From: adilkhan
Date: Thu, 7 Sep 2023 17:50:20 +0600
Subject: [PATCH 06/22] Changes + tests

---
 deeplake/api/dataset.py                            | 18 ++++-----
 .../core/vectorstore/deeplake_vectorstore.py       | 20 ++++------
 .../vectorstore/test_deeplake_vectorstore.py       | 21 +++++-----
 .../vector_search/dataset/dataset.py               |  7 ++++
 .../core/vectorstore/vector_search/utils.py        | 40 +++++--------------
 deeplake/util/bugout_reporter.py                   |  9 +++--
 6 files changed, 49 insertions(+), 66 deletions(-)

diff --git a/deeplake/api/dataset.py b/deeplake/api/dataset.py
index a1ddf1a376..3d4eba57b7 100644
--- a/deeplake/api/dataset.py
+++ b/deeplake/api/dataset.py
@@ -1,6 +1,7 @@
 import os
 
 import deeplake
+import jwt
 import pathlib
 import posixpath
 from typing import Dict, Optional, Union, List
@@ -11,7 +12,7 @@
 from deeplake.auto.unstructured.yolo.yolo import YoloDataset
 from deeplake.client.client import DeepLakeBackendClient
 from deeplake.client.log import logger
-from deeplake.client.utils import read_token
+from deeplake.client.utils import get_user_name, read_token
 from deeplake.core.dataset import Dataset, dataset_factory
 from deeplake.core.tensor import Tensor
 from deeplake.core.meta.dataset_meta import DatasetMeta
@@ -362,6 +363,7 @@ def empty(
         lock_enabled: Optional[bool] = True,
         lock_timeout: Optional[int] = 0,
         verbose: bool = True,
+        username: str = "public",
     ) -> Dataset:
         """Creates an empty dataset
@@ -384,7 +386,8 @@ def empty(
             org_id (str, Optional): Organization id to be used for enabling enterprise features. Only applicable for local datasets.
             verbose (bool): If True, logs will be printed. Defaults to True.
             lock_timeout (int): Number of seconds to wait before throwing a LockException. If None, wait indefinitely
-            lock_enabled (bool): If true, the dataset manages a write lock. NOTE: Only set to False if you are managing concurrent access externally
+            lock_enabled (bool): If true, the dataset manages a write lock. NOTE: Only set to False if you are managing concurrent access externally.
+            username (str): Username to be used for creating a dataset in the Deep Lake Tensor Database.
 
         Returns:
             Dataset: Dataset created using the arguments provided.
@@ -425,12 +428,6 @@ def empty(
                 local_cache_size=local_cache_size,
             )
 
-            token = token or read_token(from_env=True)
-            if token is not None and org_id is None:
-                # for local datasets
-                client = DeepLakeBackendClient(token=token)
-                org_id = client.get_user_profile()["name"]
-
             feature_report_path(
                 path,
                 "empty",
@@ -441,7 +438,7 @@ def empty(
                     "lock_timeout": lock_timeout,
                 },
                 token=token,
-                username=org_id or "public",
+                username=username,
             )
         except Exception as e:
             if isinstance(e, UserNotLoggedInException):
@@ -488,6 +485,7 @@ def load(
         check_integrity: bool = True,
         lock_timeout: Optional[int] = 0,
         lock_enabled: Optional[bool] = True,
+        username: str = "public",
     ) -> Dataset:
         """Loads an existing dataset
@@ -553,6 +551,7 @@ def load(
             reset (bool): If the specified dataset cannot be loaded due to a corrupted HEAD state of the branch being loaded,
                 setting ``reset=True`` will reset HEAD changes and load the previous version.
             check_integrity (bool): If the param is True it will do integrity check during dataset loading otherwise the check is not performed
+            username (str): Username to be used for creating a dataset in the Deep Lake Tensor Database.
 
         ..
             # noqa: DAR101
@@ -605,6 +604,7 @@ def load(
                 "load",
                 {"lock_enabled": lock_enabled, "lock_timeout": lock_timeout},
                 token=token,
+                username=username,
             )
         except Exception as e:
             if isinstance(e, UserNotLoggedInException):
diff --git a/deeplake/core/vectorstore/deeplake_vectorstore.py b/deeplake/core/vectorstore/deeplake_vectorstore.py
index 992c12e12a..efb8113222 100644
--- a/deeplake/core/vectorstore/deeplake_vectorstore.py
+++ b/deeplake/core/vectorstore/deeplake_vectorstore.py
@@ -1,6 +1,7 @@
 import logging
 import pathlib
 from typing import Optional, Any, Iterable, List, Dict, Union, Callable
+import jwt
 
 import numpy as np
@@ -16,7 +17,7 @@ from deeplake.constants import (
     DEFAULT_VECTORSTORE_TENSORS,
 )
 from deeplake.client.client import DeepLakeBackendClient
-from deeplake.client.utils import read_token
+from deeplake.client.utils import get_user_name, read_token
 from deeplake.core.vectorstore import utils
 from deeplake.core.vectorstore.vector_search import vector_search
 from deeplake.core.vectorstore.vector_search import dataset as dataset_utils
@@ -110,11 +111,9 @@ def __init__(
             Setting ``overwrite`` to ``True`` will delete all of your data if the Vector Store exists! Be very careful when setting this parameter.
         """
         token = token or read_token(from_env=True)
-        if token is not None and org_id is None:
-            # for local datasets
-            client = DeepLakeBackendClient(token=token)
-            org_id = client.get_user_profile()["name"]
-        # if org_id is None and
+        username = "public"
+        if token is not None:
+            username = jwt.decode(token, options={"verify_signature": False})["id"]
 
         feature_report_path(
             path,
@@ -134,7 +133,7 @@ def __init__(
                 "runtime": runtime,
             },
             token=token,
-            username=org_id,
+            username=username,
         )
 
         self.ingestion_batch_size = ingestion_batch_size
@@ -155,16 +154,13 @@ def __init__(
             overwrite,
             runtime,
             org_id,
+            username,
             **kwargs,
         )
         self.embedding_function = embedding_function
-        start = time()
         self.exec_option = utils.parse_exec_option(
-            self.dataset, exec_option, _INDRA_INSTALLED, token, org_id
+            self.dataset, exec_option, _INDRA_INSTALLED, username
         )
-        # self.exec_option = "python"
-        end = time()
-        print("finished parsing exec option in ", end - start)
         self.verbose = verbose
         self.tensor_params = tensor_params
diff --git a/deeplake/core/vectorstore/test_deeplake_vectorstore.py b/deeplake/core/vectorstore/test_deeplake_vectorstore.py
index 6f19354fad..f43265d89b 100644
--- a/deeplake/core/vectorstore/test_deeplake_vectorstore.py
+++ b/deeplake/core/vectorstore/test_deeplake_vectorstore.py
@@ -1855,29 +1855,26 @@
 @requires_libdeeplake
 @pytest.mark.parametrize(
-    "generator",
+    "path",
     [
-        "s3_ds_generator",
-        "gcs_ds_generator",
-        "azure_ds_generator",
+        "s3_path",
+        "gcs_path",
+        "azure_path",
     ],
+    indirect=True,
 )
 def test_exec_option_with_connected_datasets(
     hub_cloud_dev_token,
     hub_cloud_path,
     hub_cloud_dev_managed_creds_key,
-    generator,
+    path,
 ):
-    ds = generator()
-    ds.create_tensor("x")
-    ds.x.append(10)
+    db = VectorStore(path, overwrite=True)
 
-    ds.connect(
+    db.dataset.connect(
         creds_key=hub_cloud_dev_managed_creds_key,
         dest_path=hub_cloud_path,
         token=hub_cloud_dev_token,
     )
-    ds.add_creds_key(hub_cloud_dev_managed_creds_key, managed=True)
-
-    db = VectorStore(path=hub_cloud_path)
+    db.dataset.add_creds_key(hub_cloud_dev_managed_creds_key, managed=True)
     assert db.exec_option == "compute_engine"
diff --git a/deeplake/core/vectorstore/vector_search/dataset/dataset.py b/deeplake/core/vectorstore/vector_search/dataset/dataset.py
index 4e563816ff..4797d63fb3 100644
--- a/deeplake/core/vectorstore/vector_search/dataset/dataset.py
+++ b/deeplake/core/vectorstore/vector_search/dataset/dataset.py
@@ -41,6 +41,7 @@ def create_or_load_dataset(
     overwrite,
     runtime,
     org_id,
+    username,
     **kwargs,
 ):
     utils.check_indra_installation(
@@ -62,6 +63,7 @@ def create_or_load_dataset(
             read_only,
             runtime,
             org_id,
+            username,
             **kwargs,
         )
 
@@ -76,6 +78,7 @@ def create_or_load_dataset(
         creds,
         runtime,
         org_id,
+        username,
         **kwargs,
     )
 
@@ -95,6 +98,7 @@ def load_dataset(
     read_only,
     runtime,
     org_id,
+    username,
     **kwargs,
 ):
     if dataset_path == DEFAULT_VECTORSTORE_DEEPLAKE_PATH:
@@ -110,6 +114,7 @@ def load_dataset(
         creds=creds,
         verbose=False,
         org_id=org_id,
+        username=username,
         **kwargs,
     )
     check_tensors(dataset)
@@ -179,6 +184,7 @@ def create_dataset(
     creds,
     runtime,
     org_id,
+    username,
     **kwargs,
 ):
     if exec_option == "tensor_db" and (
@@ -200,6 +206,7 @@ def create_dataset(
         overwrite=overwrite,
         creds=creds,
         org_id=org_id,
+        username=username,
         **kwargs,
     )
     create_tensors(tensor_params, dataset, logger, embedding_function)
diff --git a/deeplake/core/vectorstore/vector_search/utils.py b/deeplake/core/vectorstore/vector_search/utils.py
index f3bd9d50e2..7b0f7a7534 100644
--- a/deeplake/core/vectorstore/vector_search/utils.py
+++ b/deeplake/core/vectorstore/vector_search/utils.py
@@ -59,26 +59,18 @@ def get_token(self, token):
             token = token or self.client.get_token()
         return token
 
-    def get_response(self):
-        response = self.client.has_indra_org_permission(self.org_id)
-        return response.get("available", False)
-
     @abstractmethod
     def get_exec_option(self):
         raise NotImplementedError()
 
-    @abstractmethod
-    def get_org_id(self):
-        raise NotImplementedError()
-
 
 class ExecOptionCloudDataset(ExecOptionBase):
-    def __init__(self, dataset, indra_installed, org_id):
+    def __init__(self, dataset, indra_installed, username):
         self.dataset = dataset
         self.indra_installed = indra_installed
         self.client = dataset.client
         self.token = self.dataset.token
-        self.org_id = self.get_org_id()
+        self.username = username
 
     def get_exec_option(self):
         # option 1: dataset is created in vector_db:
@@ -93,48 +85,38 @@ def get_exec_option(self):
         elif (
             isinstance(self.dataset, (DeepLakeCloudDataset))
             and self.indra_installed
-            and self.get_response()
+            and self.username != "public"
         ):
             return "compute_engine"
         else:
             return "python"
 
-    def get_org_id(self):
-        return self.dataset.org_id
-
 
 class ExecOptionLocalDataset(ExecOptionBase):
-    def __init__(self, dataset, indra_installed, org_id):
+    def __init__(self, dataset, indra_installed, username):
         self.dataset = dataset
         self.indra_installed = indra_installed
         self.token = self.dataset.token
-        self.org_id = org_id
-
-    def get_org_id(self):
-        if self.org_id is None and self.token:
-            return jwt.decode(self.token, options={"verify_signature": False})["id"]
-        return self.org_id
+        self.username = username
 
     def get_exec_option(self):
         if self.token is None:
             return "python"
 
-        self.org_id = self.get_org_id()
-
-        if self.indra_installed and self.get_response():
+        if self.indra_installed and self.username != "public":
             return "compute_engine"
         return "python"
 
 
-def exec_option_factory(dataset, indra_installed, org_id):
+def exec_option_factory(dataset, indra_installed, username):
     if dataset.client is None:
-        return ExecOptionLocalDataset(dataset, indra_installed, org_id)
-    return ExecOptionCloudDataset(dataset, indra_installed, org_id)
+        return ExecOptionLocalDataset(dataset, indra_installed, username)
+    return ExecOptionCloudDataset(dataset, indra_installed, username)
 
 
-def parse_exec_option(dataset, exec_option, indra_installed, org_id):
+def parse_exec_option(dataset, exec_option, indra_installed, username):
     if exec_option is None or exec_option == "auto":
-        exec_option = exec_option_factory(dataset, indra_installed, org_id)
+        exec_option = exec_option_factory(dataset, indra_installed, username)
         return exec_option.get_exec_option()
     return exec_option
diff --git a/deeplake/util/bugout_reporter.py b/deeplake/util/bugout_reporter.py
index 2929b473fd..e04050306a 100644
--- a/deeplake/util/bugout_reporter.py
+++ b/deeplake/util/bugout_reporter.py
@@ -1,4 +1,5 @@
 import json
+import jwt
 import os
 from pathlib import Path
 from platform import machine
@@ -7,7 +8,7 @@
 from deeplake.client.config import REPORTING_CONFIG_FILE_PATH
 from deeplake.client.client import DeepLakeBackendClient
-from deeplake.client.utils import get_user_name
+from deeplake.client.utils import get_user_name, read_token
 from deeplake.util.bugout_token import BUGOUT_TOKEN
 from humbug.consent import HumbugConsent
 from humbug.report import HumbugReporter
@@ -163,10 +164,10 @@ def feature_report_path(
     if path.startswith(starts_with):
         parameters["Path"] = path
 
-    if token is not None:
-        client = DeepLakeBackendClient(token=token)
-        username = client.get_user_profile()["name"]
+    token = token or read_token(from_env=True)
+
+    if token is not None and username is "public":
+        username = jwt.decode(token, options={"verify_signature": False})["id"]
     set_username(username)
 
     deeplake_reporter.feature_report(
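Note: from PATCH 06 on, identity is taken from the JWT payload instead of a profile request. A minimal sketch of the derivation (signature verification is deliberately skipped, so this identifies the caller but does not authenticate them):

    import jwt

    def username_from_token(token):
        # No token means an anonymous, "public" caller.
        if token is None:
            return "public"
        # Read the "id" claim from the token payload without verifying it.
        return jwt.decode(token, options={"verify_signature": False})["id"]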
From ae8072f23999889da10309e6acf37affa16305309 Mon Sep 17 00:00:00 2001
From: adilkhan
Date: Thu, 7 Sep 2023 17:58:43 +0600
Subject: [PATCH 07/22] code clean up

---
 deeplake/core/vectorstore/deeplake_vectorstore.py |  7 ++---
 .../core/vectorstore/vector_search/utils.py       | 27 -------------------
 2 files changed, 2 insertions(+), 32 deletions(-)

diff --git a/deeplake/core/vectorstore/deeplake_vectorstore.py b/deeplake/core/vectorstore/deeplake_vectorstore.py
index efb8113222..5c26c9eca6 100644
--- a/deeplake/core/vectorstore/deeplake_vectorstore.py
+++ b/deeplake/core/vectorstore/deeplake_vectorstore.py
@@ -1,6 +1,6 @@
 import logging
 import pathlib
-from typing import Optional, Any, Iterable, List, Dict, Union, Callable
+from typing import Optional, Any, List, Dict, Union, Callable
 import jwt
 
 import numpy as np
@@ -16,13 +16,10 @@ from deeplake.constants import (
     DEFAULT_VECTORSTORE_TENSORS,
 )
-from deeplake.client.client import DeepLakeBackendClient
-from deeplake.client.utils import get_user_name, read_token
+from deeplake.client.utils import read_token
 from deeplake.core.vectorstore import utils
 from deeplake.core.vectorstore.vector_search import vector_search
 from deeplake.core.vectorstore.vector_search import dataset as dataset_utils
-from deeplake.core.vectorstore.vector_search import filter as filter_utils
-from time import time
 
 from deeplake.util.bugout_reporter import (
     feature_report_path,
diff --git a/deeplake/core/vectorstore/vector_search/utils.py b/deeplake/core/vectorstore/vector_search/utils.py
index 7b0f7a7534..c0ede8ff9c 100644
--- a/deeplake/core/vectorstore/vector_search/utils.py
+++ b/deeplake/core/vectorstore/vector_search/utils.py
@@ -25,33 +25,6 @@ def parse_tensor_return(tensor):
     return tensor.data(aslist=True)["value"]
 
 
-def parse_exec_option_for_cloud_dataset(dataset, indra_installed, token, org_id):
-    client = dataset.client
-    user_profile = client.get_user_profile()
-    if user_profile["name"] != "public":
-        token = token or client.get_token()
-    response = client.has_indra_org_permission(org_id)
-    has_access_to_indra = response.get("available", False)
-
-    # option 1: dataset is created in vector_db:
-    if (
-        isinstance(dataset, DeepLakeCloudDataset)
-        and "vectordb/" in dataset.base_storage.path
-        and token is not None
-    ):
-        return "tensor_db"
-    # option 2: dataset is created in a linked storage or locally,
-    # indra is installed and the user/org has access to indra
-    elif (
-        isinstance(dataset, (DeepLakeCloudDataset))
-        and indra_installed
-        and has_access_to_indra
-    ):
-        return "compute_engine"
-    else:
-        return "python"
-
-
 class ExecOptionBase(ABC):
     def get_token(self, token):

From 2e70b958b0277914a60fe9d75e8e1754639bf693 Mon Sep 17 00:00:00 2001
From: adilkhan
Date: Thu, 7 Sep 2023 18:50:18 +0600
Subject: [PATCH 08/22] Changed feature_report_path

---
 .../core/vectorstore/deeplake_vectorstore.py       | 40 ++++++++++++++-----
 1 file changed, 29 insertions(+), 11 deletions(-)

diff --git a/deeplake/core/vectorstore/deeplake_vectorstore.py b/deeplake/core/vectorstore/deeplake_vectorstore.py
index 5c26c9eca6..c9fdf6291d 100644
--- a/deeplake/core/vectorstore/deeplake_vectorstore.py
+++ b/deeplake/core/vectorstore/deeplake_vectorstore.py
@@ -107,10 +107,12 @@ def __init__(
         Danger:
             Setting ``overwrite`` to ``True`` will delete all of your data if the Vector Store exists! Be very careful when setting this parameter.
         """
-        token = token or read_token(from_env=True)
-        username = "public"
-        if token is not None:
-            username = jwt.decode(token, options={"verify_signature": False})["id"]
+        self.token = token or read_token(from_env=True)
+        self.username = "public"
+        if self.token is not None:
+            self.username = jwt.decode(self.token, options={"verify_signature": False})[
+                "id"
+            ]
 
         feature_report_path(
             path,
@@ -125,12 +127,12 @@ def __init__(
                 "read_only": read_only,
                 "ingestion_batch_size": ingestion_batch_size,
                 "exec_option": exec_option,
-                "token": token,
+                "token": self.token,
                 "verbose": verbose,
                 "runtime": runtime,
             },
-            token=token,
-            username=username,
+            token=self.token,
+            username=self.username,
         )
 
         self.ingestion_batch_size = ingestion_batch_size
@@ -142,7 +144,7 @@ def __init__(
         self.dataset = dataset_utils.create_or_load_dataset(
             tensor_params,
             path,
-            token,
+            self.token,
             creds,
             logger,
             read_only,
@@ -151,12 +153,12 @@ def __init__(
             overwrite,
             runtime,
             org_id,
-            username,
+            self.username,
             **kwargs,
         )
         self.embedding_function = embedding_function
         self.exec_option = utils.parse_exec_option(
-            self.dataset, exec_option, _INDRA_INSTALLED, username
+            self.dataset, exec_option, _INDRA_INSTALLED, self.username
         )
         self.verbose = verbose
         self.tensor_params = tensor_params
@@ -252,6 +254,8 @@ def add(
             "embedding_function": True if embedding_function is not None else False,
             "embedding_data": True if embedding_data is not None else False,
         },
+        token=self.token,
+        username=self.username,
     )
 
     (
@@ -392,6 +396,8 @@ def search(
             "return_tensors": return_tensors,
             "return_view": return_view,
         },
+        token=self.token,
+        username=self.username,
     )
 
     if exec_option is None and self.exec_option != "python" and callable(filter):
@@ -501,6 +507,8 @@ def delete(
             "exec_option": exec_option,
             "delete_all": delete_all,
         },
+        token=self.token,
+        username=self.username,
     )
 
     if not row_ids:
@@ -594,6 +602,8 @@ def update_embedding(
             "filter": True if filter is not None else False,
             "exec_option": exec_option,
         },
+        token=self.token,
+        username=self.username,
    )
 
     (
@@ -645,12 +655,20 @@ def delete_by_path(
         Danger:
             This method permanently deletes all of your data if the Vector Store exists! Be very careful when using this method.
         """
+        token = token or read_token(from_env=True)
+        if token:
+            username = jwt.decode(token, options={"verify_signature": False})["id"]
 
         feature_report_path(
             path,
             "vs.delete_by_path",
-            {},
+            parameters={
+                "path": path,
+                "token": token,
+                "force": force,
+            },
             token=token,
+            username=username,
         )
         deeplake.delete(path, large_ok=True, token=token, force=force)
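Note: with PATCH 08, every public VectorStore entry point reports through the path-aware helper with the resolved token and username. A hedged call sketch for the method above (path and token values are illustrative, not from the patch):

    VectorStore.delete_by_path("hub://my_org/old_vectors", token="<api-token>", force=True)
    # reports {"path": ..., "token": ..., "force": ...} under "vs.delete_by_path",
    # then removes the dataset via deeplake.delete(..., large_ok=True)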
""" + token = token or read_token(from_env=True) + if token: + username = jwt.decode(token, options={"verify_signature": False})["id"] feature_report_path( path, "vs.delete_by_path", - {}, + parameters={ + "path": path, + "token": token, + "force": force, + }, token=token, + username=username, ) deeplake.delete(path, large_ok=True, token=token, force=force) From 6a966e4648633ab1a5402ebb75592ec0849983bc Mon Sep 17 00:00:00 2001 From: adilkhan Date: Thu, 7 Sep 2023 19:12:10 +0600 Subject: [PATCH 09/22] Added path --- deeplake/core/vectorstore/deeplake_vectorstore.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/deeplake/core/vectorstore/deeplake_vectorstore.py b/deeplake/core/vectorstore/deeplake_vectorstore.py index c9fdf6291d..ba34688b24 100644 --- a/deeplake/core/vectorstore/deeplake_vectorstore.py +++ b/deeplake/core/vectorstore/deeplake_vectorstore.py @@ -109,6 +109,7 @@ def __init__( """ self.token = token or read_token(from_env=True) self.username = "public" + self.path = path if self.token is not None: self.username = jwt.decode(self.token, options={"verify_signature": False})[ "id" @@ -245,7 +246,8 @@ def add( Optional[List[str]]: List of ids if ``return_ids`` is set to True. Otherwise, None. """ - deeplake_reporter.feature_report( + feature_report_path( + path=self.path, feature_name="vs.add", parameters={ "tensors": list(tensors.keys()) if tensors else None, @@ -381,7 +383,8 @@ def search( Dict: Dictionary where keys are tensor names and values are the results of the search """ - deeplake_reporter.feature_report( + feature_report_path( + path=self.path, feature_name="vs.search", parameters={ "embedding_data": True if embedding_data is not None else False, @@ -497,7 +500,8 @@ def delete( ValueError: If neither ``ids``, ``filter``, ``query``, nor ``delete_all`` are specified, or if an invalid ``exec_option`` is provided. """ - deeplake_reporter.feature_report( + feature_report_path( + path=self.path, feature_name="vs.delete", parameters={ "ids": True if ids is not None else False, @@ -593,7 +597,8 @@ def update_embedding( embedding_source_tensor (Union[str, List[str]], optional): Name of tensor with data that needs to be converted to embeddings. Defaults to `text`. embedding_tensor (Optional[Union[str, List[str]]], optional): Name of the tensor with embeddings. Defaults to None. """ - deeplake_reporter.feature_report( + feature_report_path( + path=self.path, feature_name="vs.delete", parameters={ "ids": True if ids is not None else False, From 9becdbcebb900440691d5a0c4825f87a7e44c3d2 Mon Sep 17 00:00:00 2001 From: adilkhan Date: Thu, 7 Sep 2023 19:37:24 +0600 Subject: [PATCH 10/22] tests fix --- deeplake/core/vectorstore/deeplake_vectorstore.py | 1 + deeplake/core/vectorstore/test_deeplake_vectorstore.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/deeplake/core/vectorstore/deeplake_vectorstore.py b/deeplake/core/vectorstore/deeplake_vectorstore.py index ba34688b24..15a7b03ae9 100644 --- a/deeplake/core/vectorstore/deeplake_vectorstore.py +++ b/deeplake/core/vectorstore/deeplake_vectorstore.py @@ -661,6 +661,7 @@ def delete_by_path( This method permanently deletes all of your data if the Vector Store exists! Be very careful when using this method. 
""" token = token or read_token(from_env=True) + username = "public" if token: username = jwt.decode(token, options={"verify_signature": False})["id"] diff --git a/deeplake/core/vectorstore/test_deeplake_vectorstore.py b/deeplake/core/vectorstore/test_deeplake_vectorstore.py index f43265d89b..4348357a5d 100644 --- a/deeplake/core/vectorstore/test_deeplake_vectorstore.py +++ b/deeplake/core/vectorstore/test_deeplake_vectorstore.py @@ -1805,7 +1805,7 @@ def test_exec_option_with_auth(local_path, hub_cloud_path, hub_cloud_dev_token): assert db.exec_option == "compute_engine" db = VectorStore( - path=hub_cloud_path, + path=hub_cloud_path + "_tensor_db", token=hub_cloud_dev_token, runtime={"tensor_db": True}, ) From f780e1715764feba4679ef270c0d4fb8af7c1d01 Mon Sep 17 00:00:00 2001 From: adilkhan Date: Thu, 7 Sep 2023 19:41:35 +0600 Subject: [PATCH 11/22] Code smell fix --- deeplake/util/bugout_reporter.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/deeplake/util/bugout_reporter.py b/deeplake/util/bugout_reporter.py index e04050306a..33084a7938 100644 --- a/deeplake/util/bugout_reporter.py +++ b/deeplake/util/bugout_reporter.py @@ -166,7 +166,7 @@ def feature_report_path( token = token or read_token(from_env=True) - if token is not None and username is "public": + if token is not None and username == "public": username = jwt.decode(token, options={"verify_signature": False})["id"] set_username(username) From 29ceb26bb898bd67a02acc55f94c15a444226204 Mon Sep 17 00:00:00 2001 From: adilkhan Date: Thu, 7 Sep 2023 20:19:29 +0600 Subject: [PATCH 12/22] Fix CLI test --- deeplake/core/vectorstore/test_deeplake_vectorstore.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/deeplake/core/vectorstore/test_deeplake_vectorstore.py b/deeplake/core/vectorstore/test_deeplake_vectorstore.py index 4348357a5d..cda7ac9904 100644 --- a/deeplake/core/vectorstore/test_deeplake_vectorstore.py +++ b/deeplake/core/vectorstore/test_deeplake_vectorstore.py @@ -24,6 +24,7 @@ ) from deeplake.core.vectorstore.vector_search import dataset as dataset_utils from deeplake.cli.auth import login, logout +from click.testing import CliRunner EMBEDDING_DIM = 100 @@ -1816,11 +1817,12 @@ def test_exec_option_with_auth(local_path, hub_cloud_path, hub_cloud_dev_token): def test_exec_option_cli( local_path, hub_cloud_path, - username, - password, - runner, hub_cloud_dev_token, + hub_cloud_dev_credentials, ): + runner = CliRunner() + username, password = hub_cloud_dev_credentials + runner.invoke(login, f"-u {username} -p {password}") # Testing exec_option with cli login and logout commands are executed runner.invoke(login, f"-u {username} -p {password}") From ce886650dcbad3fa18ac2de7f1b371b56fab0079 Mon Sep 17 00:00:00 2001 From: adilkhan Date: Thu, 7 Sep 2023 20:19:59 +0600 Subject: [PATCH 13/22] remove duplicate --- deeplake/core/vectorstore/test_deeplake_vectorstore.py | 1 - 1 file changed, 1 deletion(-) diff --git a/deeplake/core/vectorstore/test_deeplake_vectorstore.py b/deeplake/core/vectorstore/test_deeplake_vectorstore.py index cda7ac9904..013f3cde82 100644 --- a/deeplake/core/vectorstore/test_deeplake_vectorstore.py +++ b/deeplake/core/vectorstore/test_deeplake_vectorstore.py @@ -1822,7 +1822,6 @@ def test_exec_option_cli( ): runner = CliRunner() username, password = hub_cloud_dev_credentials - runner.invoke(login, f"-u {username} -p {password}") # Testing exec_option with cli login and logout commands are executed runner.invoke(login, f"-u {username} -p {password}") From 
From 68885b55a99889da10309e6acf37affa16305309 Mon Sep 17 00:00:00 2001
From: adilkhan
Date: Thu, 7 Sep 2023 21:22:26 +0600
Subject: [PATCH 14/22] upd

---
 deeplake/core/vectorstore/test_deeplake_vectorstore.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/deeplake/core/vectorstore/test_deeplake_vectorstore.py b/deeplake/core/vectorstore/test_deeplake_vectorstore.py
index 013f3cde82..4e326815d3 100644
--- a/deeplake/core/vectorstore/test_deeplake_vectorstore.py
+++ b/deeplake/core/vectorstore/test_deeplake_vectorstore.py
@@ -1842,7 +1842,7 @@ def test_exec_option_cli(
 
     # local dataset and logged out with cli
     db = VectorStore(
-        path=hub_cloud_path,
+        path=local_path,
     )
     assert db.exec_option == "python"

From 2095402e948dfa00f34a05f3715fc5a432c8db47 Mon Sep 17 00:00:00 2001
From: adilkhan
Date: Fri, 8 Sep 2023 00:00:03 +0600
Subject: [PATCH 15/22] test fix

---
 .../vector_search/dataset/test_dataset.py          | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/deeplake/core/vectorstore/vector_search/dataset/test_dataset.py b/deeplake/core/vectorstore/vector_search/dataset/test_dataset.py
index 43c4fdb0e5..668e529b4d 100644
--- a/deeplake/core/vectorstore/vector_search/dataset/test_dataset.py
+++ b/deeplake/core/vectorstore/vector_search/dataset/test_dataset.py
@@ -42,6 +42,8 @@ def test_create(caplog, hub_cloud_path, hub_cloud_dev_token):
         embedding_function=Embedding,
         runtime=None,
         exec_option="python",
+        org_id=None,
+        username="public",
     )
     assert len(dataset) == 0
     assert set(dataset.tensors.keys()) == {
@@ -69,6 +71,8 @@ def test_create(caplog, hub_cloud_path, hub_cloud_dev_token):
         runtime={"tensor_db": True},
         overwrite=True,
         embedding_function=Embedding,
+        org_id=None,
+        username="testingacc2",
     )
     assert len(dataset) == 0
     assert set(dataset.tensors.keys()) == {
@@ -102,6 +106,8 @@ def test_create(caplog, hub_cloud_path, hub_cloud_dev_token):
             runtime=None,
             overwrite=True,
             embedding_function=Embedding,
+            org_id=None,
+            username="testingacc2",
         )
@@ -119,6 +125,8 @@ def test_load(caplog, hub_cloud_dev_token):
         token=hub_cloud_dev_token,
         embedding_function=None,
         runtime=None,
+        org_id=None,
+        username="testingacc2",
     )
     assert dataset.max_len == 10
@@ -140,6 +148,8 @@ def test_load(caplog, hub_cloud_dev_token):
         embedding_function=None,
         overwrite=False,
         runtime=None,
+        org_id=None,
+        username="public",
     )
     assert (
         f"The default deeplake path location is used: {DEFAULT_VECTORSTORE_DEEPLAKE_PATH}"
@@ -159,6 +169,8 @@ def test_load(caplog, hub_cloud_dev_token):
         embedding_function=None,
         overwrite=False,
         runtime=None,
+        org_id=None,
+        username="public",
     )
 
     with pytest.raises(ValueError):
From 6014d4b9eb1496af046f3dea1a9b1979b493bcdd Mon Sep 17 00:00:00 2001
From: adilkhan
Date: Sat, 9 Sep 2023 18:47:03 +0600
Subject: [PATCH 16/22] removing username from params

---
 deeplake/api/dataset.py                            |  6 ------
 deeplake/core/dataset/dataset.py                   |  5 +----
 deeplake/core/vectorstore/deeplake_vectorstore.py  | 12 ------------
 .../vectorstore/vector_search/dataset/dataset.py   |  7 -------
 deeplake/util/bugout_reporter.py                   |  6 +++---
 5 files changed, 4 insertions(+), 32 deletions(-)

diff --git a/deeplake/api/dataset.py b/deeplake/api/dataset.py
index 3d4eba57b7..c1832cf63a 100644
--- a/deeplake/api/dataset.py
+++ b/deeplake/api/dataset.py
@@ -363,7 +363,6 @@ def empty(
         lock_enabled: Optional[bool] = True,
         lock_timeout: Optional[int] = 0,
         verbose: bool = True,
-        username: str = "public",
     ) -> Dataset:
         """Creates an empty dataset
@@ -386,7 +386,6 @@ def empty(
             verbose (bool): If True, logs will be printed. Defaults to True.
             lock_timeout (int): Number of seconds to wait before throwing a LockException. If None, wait indefinitely
             lock_enabled (bool): If true, the dataset manages a write lock. NOTE: Only set to False if you are managing concurrent access externally.
-            username (str): Username to be used for creating a dataset in the Deep Lake Tensor Database.
 
         Returns:
             Dataset: Dataset created using the arguments provided.
@@ -438,7 +436,6 @@ def empty(
                     "lock_timeout": lock_timeout,
                 },
                 token=token,
-                username=username,
             )
         except Exception as e:
             if isinstance(e, UserNotLoggedInException):
@@ -485,7 +482,6 @@ def load(
         check_integrity: bool = True,
         lock_timeout: Optional[int] = 0,
         lock_enabled: Optional[bool] = True,
-        username: str = "public",
     ) -> Dataset:
         """Loads an existing dataset
@@ -551,7 +547,6 @@ def load(
             reset (bool): If the specified dataset cannot be loaded due to a corrupted HEAD state of the branch being loaded,
                 setting ``reset=True`` will reset HEAD changes and load the previous version.
             check_integrity (bool): If the param is True it will do integrity check during dataset loading otherwise the check is not performed
-            username (str): Username to be used for creating a dataset in the Deep Lake Tensor Database.
 
         ..
             # noqa: DAR101
@@ -604,7 +599,6 @@ def load(
                 "load",
                 {"lock_enabled": lock_enabled, "lock_timeout": lock_timeout},
                 token=token,
-                username=username,
             )
         except Exception as e:
             if isinstance(e, UserNotLoggedInException):
diff --git a/deeplake/core/dataset/dataset.py b/deeplake/core/dataset/dataset.py
index 5209d4e690..b1410b5eb1 100644
--- a/deeplake/core/dataset/dataset.py
+++ b/deeplake/core/dataset/dataset.py
@@ -35,7 +35,7 @@
 from deeplake.util.tensor_db import parse_runtime_parameters
 from deeplake.api.info import load_info
 from deeplake.client.log import logger
-from deeplake.client.utils import get_user_name
+from deeplake.client.utils import read_token
 from deeplake.client.client import DeepLakeBackendClient
 from deeplake.constants import (
     FIRST_COMMIT_ID,
@@ -4684,6 +4684,3 @@ def _temp_write_access(self):
 
     def _get_storage_repository(self) -> Optional[str]:
         return getattr(self.base_storage, "repository", None)
-
-    def get_user_name(self) -> Optional[str]:
-        return getattr(self.base_storage, "user_name", None)
diff --git a/deeplake/core/vectorstore/deeplake_vectorstore.py b/deeplake/core/vectorstore/deeplake_vectorstore.py
index 15a7b03ae9..4b4fa2bd2f 100644
--- a/deeplake/core/vectorstore/deeplake_vectorstore.py
+++ b/deeplake/core/vectorstore/deeplake_vectorstore.py
@@ -108,12 +108,7 @@ def __init__(
         """
         self.token = token or read_token(from_env=True)
-        self.username = "public"
         self.path = path
-        if self.token is not None:
-            self.username = jwt.decode(self.token, options={"verify_signature": False})[
-                "id"
-            ]
 
         feature_report_path(
             path,
@@ -133,7 +128,6 @@ def __init__(
                 "runtime": runtime,
             },
             token=self.token,
-            username=self.username,
         )
 
         self.ingestion_batch_size = ingestion_batch_size
@@ -154,7 +148,6 @@ def __init__(
             overwrite,
             runtime,
             org_id,
-            self.username,
             **kwargs,
         )
         self.embedding_function = embedding_function
         self.exec_option = utils.parse_exec_option(
@@ -257,7 +250,6 @@ def add(
             "embedding_data": True if embedding_data is not None else False,
         },
         token=self.token,
-        username=self.username,
     )
 
     (
@@ -400,7 +392,6 @@ def search(
             "return_view": return_view,
         },
         token=self.token,
-        username=self.username,
    )
 
     if exec_option is None and self.exec_option != "python" and callable(filter):
@@ -512,7 +503,6 @@ def delete(
             "delete_all": delete_all,
         },
         token=self.token,
-        username=self.username,
     )
 
     if not row_ids:
@@ -608,7 +598,6 @@ def update_embedding(
             "exec_option": exec_option,
         },
         token=self.token,
-        username=self.username,
     )
 
     (
@@ -674,7 +663,6 @@ def delete_by_path(
                 "force": force,
             },
             token=token,
-            username=username,
         )
         deeplake.delete(path, large_ok=True, token=token, force=force)
diff --git a/deeplake/core/vectorstore/vector_search/dataset/dataset.py b/deeplake/core/vectorstore/vector_search/dataset/dataset.py
index 0f37bb5746..3236bb3a19 100644
--- a/deeplake/core/vectorstore/vector_search/dataset/dataset.py
+++ b/deeplake/core/vectorstore/vector_search/dataset/dataset.py
@@ -41,7 +41,6 @@ def create_or_load_dataset(
     overwrite,
     runtime,
     org_id,
-    username,
     **kwargs,
 ):
     utils.check_indra_installation(
@@ -63,7 +62,6 @@ def create_or_load_dataset(
             read_only,
             runtime,
             org_id,
-            username,
             **kwargs,
         )
 
@@ -78,7 +76,6 @@ def create_or_load_dataset(
         creds,
         runtime,
         org_id,
-        username,
         **kwargs,
     )
 
@@ -98,7 +95,6 @@ def load_dataset(
     read_only,
     runtime,
     org_id,
-    username,
     **kwargs,
 ):
     if dataset_path == DEFAULT_VECTORSTORE_DEEPLAKE_PATH:
@@ -114,7 +110,6 @@ def load_dataset(
         creds=creds,
         verbose=False,
         org_id=org_id,
-        username=username,
         **kwargs,
     )
     check_tensors(dataset)
@@ -184,7 +179,6 @@ def create_dataset(
     creds,
     runtime,
     org_id,
-    username,
     **kwargs,
 ):
     if exec_option == "tensor_db" and (
@@ -206,7 +200,6 @@ def create_dataset(
         overwrite=overwrite,
         creds=creds,
         org_id=org_id,
-        username=username,
         **kwargs,
     )
     create_tensors(tensor_params, dataset, logger, embedding_function)
diff --git a/deeplake/util/bugout_reporter.py b/deeplake/util/bugout_reporter.py
index 33084a7938..f5d9e6f730 100644
--- a/deeplake/util/bugout_reporter.py
+++ b/deeplake/util/bugout_reporter.py
@@ -152,7 +152,6 @@ def feature_report_path(
     parameters: dict,
     starts_with: str = "hub://",
     token: Optional[str] = None,
-    username: str = "public",
 ):
     """Helper function for generating humbug feature reports depending on the path"""
@@ -166,9 +165,10 @@ def feature_report_path(
 
     token = token or read_token(from_env=True)
 
-    if token is not None and username == "public":
+    username = "public"
+    if token is not None:
         username = jwt.decode(token, options={"verify_signature": False})["id"]
-    set_username(username)
+        set_username(username)
 
     deeplake_reporter.feature_report(
From cace8ab4d72966739277ff08261242f9099d268e Mon Sep 17 00:00:00 2001
From: adilkhan
Date: Sat, 9 Sep 2023 19:09:53 +0600
Subject: [PATCH 17/22] Changes

---
 deeplake/core/vectorstore/deeplake_vectorstore.py | 13 ++++++++++---
 deeplake/util/bugout_reporter.py                  |  1 +
 2 files changed, 11 insertions(+), 3 deletions(-)

diff --git a/deeplake/core/vectorstore/deeplake_vectorstore.py b/deeplake/core/vectorstore/deeplake_vectorstore.py
index 4b4fa2bd2f..349b4aa2c6 100644
--- a/deeplake/core/vectorstore/deeplake_vectorstore.py
+++ b/deeplake/core/vectorstore/deeplake_vectorstore.py
@@ -109,6 +109,11 @@ def __init__(
         """
         self.token = token or read_token(from_env=True)
         self.path = path
+        self.username = "public"
+        if self.token is not None:
+            self.username = jwt.decode(self.token, options={"verify_signature": False})[
+                "id"
+            ]
 
         feature_report_path(
             path,
@@ -128,6 +133,7 @@ def __init__(
                 "runtime": runtime,
             },
             token=self.token,
+            username=self.username,
         )
 
         self.ingestion_batch_size = ingestion_batch_size
@@ -250,6 +256,7 @@ def add(
             "embedding_data": True if embedding_data is not None else False,
         },
         token=self.token,
+        username=self.username,
     )
 
     (
@@ -392,6 +399,7 @@ def search(
             "return_view": return_view,
         },
         token=self.token,
+        username=self.username,
     )
 
     if exec_option is None and self.exec_option != "python" and callable(filter):
@@ -503,6 +511,7 @@ def delete(
             "delete_all": delete_all,
         },
         token=self.token,
+        username=self.username,
     )
 
     if not row_ids:
@@ -598,6 +607,7 @@ def update_embedding(
             "exec_option": exec_option,
         },
         token=self.token,
+        username=self.username,
     )
 
     (
@@ -650,9 +660,6 @@ def delete_by_path(
             This method permanently deletes all of your data if the Vector Store exists! Be very careful when using this method.
         """
         token = token or read_token(from_env=True)
-        username = "public"
-        if token:
-            username = jwt.decode(token, options={"verify_signature": False})["id"]
 
         feature_report_path(
             path,
diff --git a/deeplake/util/bugout_reporter.py b/deeplake/util/bugout_reporter.py
index f5d9e6f730..c8a89c0e68 100644
--- a/deeplake/util/bugout_reporter.py
+++ b/deeplake/util/bugout_reporter.py
@@ -152,6 +152,7 @@ def feature_report_path(
     parameters: dict,
     starts_with: str = "hub://",
     token: Optional[str] = None,
+    username: str = "public",
 ):
     """Helper function for generating humbug feature reports depending on the path"""

From 06f455f2d004cc376e6292ca3e631c27279aa7af Mon Sep 17 00:00:00 2001
From: adilkhan
Date: Mon, 11 Sep 2023 15:28:44 +0600
Subject: [PATCH 18/22] Sonar fix

---
 .../vector_search/dataset/test_dataset.py          | 6 ------
 .../core/vectorstore/vector_search/utils.py        | 7 +++----
 deeplake/util/bugout_reporter.py                   | 1 -
 3 files changed, 3 insertions(+), 11 deletions(-)

diff --git a/deeplake/core/vectorstore/vector_search/dataset/test_dataset.py b/deeplake/core/vectorstore/vector_search/dataset/test_dataset.py
index 668e529b4d..597e0c0732 100644
--- a/deeplake/core/vectorstore/vector_search/dataset/test_dataset.py
+++ b/deeplake/core/vectorstore/vector_search/dataset/test_dataset.py
@@ -43,7 +43,6 @@ def test_create(caplog, hub_cloud_path, hub_cloud_dev_token):
         runtime=None,
         exec_option="python",
         org_id=None,
-        username="public",
     )
     assert len(dataset) == 0
     assert set(dataset.tensors.keys()) == {
@@ -72,7 +71,6 @@ def test_create(caplog, hub_cloud_path, hub_cloud_dev_token):
         overwrite=True,
         embedding_function=Embedding,
         org_id=None,
-        username="testingacc2",
     )
     assert len(dataset) == 0
     assert set(dataset.tensors.keys()) == {
@@ -107,7 +105,6 @@ def test_create(caplog, hub_cloud_path, hub_cloud_dev_token):
             overwrite=True,
             embedding_function=Embedding,
             org_id=None,
-            username="testingacc2",
         )
@@ -126,7 +123,6 @@ def test_load(caplog, hub_cloud_dev_token):
         embedding_function=None,
         runtime=None,
         org_id=None,
-        username="testingacc2",
     )
     assert dataset.max_len == 10
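Note: usage sketch for the parameterized test helper from PATCH 18 (shown in the utils.py hunk just below); the metadata key becomes injectable instead of being hard-coded to "abc":

    texts, embeddings, ids, metadata, images = create_data(
        number_of_data=3, embedding_dim=100, metadata_key="abc"
    )
    # metadata == [{"abc": 0}, {"abc": 1}, {"abc": 2}]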
 .../core/vectorstore/vector_search/dataset/test_dataset.py | 6 ------
 deeplake/core/vectorstore/vector_search/utils.py           | 7 +++----
 deeplake/util/bugout_reporter.py                           | 1 -
 3 files changed, 3 insertions(+), 11 deletions(-)

diff --git a/deeplake/core/vectorstore/vector_search/dataset/test_dataset.py b/deeplake/core/vectorstore/vector_search/dataset/test_dataset.py
index 668e529b4d..597e0c0732 100644
--- a/deeplake/core/vectorstore/vector_search/dataset/test_dataset.py
+++ b/deeplake/core/vectorstore/vector_search/dataset/test_dataset.py
@@ -43,7 +43,6 @@ def test_create(caplog, hub_cloud_path, hub_cloud_dev_token):
         runtime=None,
         exec_option="python",
         org_id=None,
-        username="public",
     )
     assert len(dataset) == 0
     assert set(dataset.tensors.keys()) == {
@@ -72,7 +71,6 @@ def test_create(caplog, hub_cloud_path, hub_cloud_dev_token):
         overwrite=True,
         embedding_function=Embedding,
         org_id=None,
-        username="testingacc2",
     )
     assert len(dataset) == 0
     assert set(dataset.tensors.keys()) == {
@@ -107,7 +105,6 @@ def test_create(caplog, hub_cloud_path, hub_cloud_dev_token):
         overwrite=True,
         embedding_function=Embedding,
         org_id=None,
-        username="testingacc2",
     )
 
 
@@ -126,7 +123,6 @@ def test_load(caplog, hub_cloud_dev_token):
         embedding_function=None,
         runtime=None,
         org_id=None,
-        username="testingacc2",
     )
     assert dataset.max_len == 10
 
@@ -149,7 +145,6 @@ def test_load(caplog, hub_cloud_dev_token):
         overwrite=False,
         runtime=None,
         org_id=None,
-        username="public",
     )
     assert (
         f"The default deeplake path location is used: {DEFAULT_VECTORSTORE_DEEPLAKE_PATH}"
@@ -170,7 +165,6 @@ def test_load(caplog, hub_cloud_dev_token):
         overwrite=False,
         runtime=None,
         org_id=None,
-        username="public",
     )
 
     with pytest.raises(ValueError):
diff --git a/deeplake/core/vectorstore/vector_search/utils.py b/deeplake/core/vectorstore/vector_search/utils.py
index c0ede8ff9c..a7fd7033de 100644
--- a/deeplake/core/vectorstore/vector_search/utils.py
+++ b/deeplake/core/vectorstore/vector_search/utils.py
@@ -154,18 +154,17 @@ def generate_random_string(length):
     return random_string
 
 
-def generate_json(value):
-    key = "abc"
+def generate_json(value, key):
     return {key: value}
 
 
-def create_data(number_of_data, embedding_dim=100):
+def create_data(number_of_data, embedding_dim=100, metadata_key="abc"):
     embeddings = np.random.uniform(
         low=-10, high=10, size=(number_of_data, embedding_dim)
     ).astype(np.float32)
     texts = [generate_random_string(1000) for i in range(number_of_data)]
     ids = [f"{i}" for i in range(number_of_data)]
-    metadata = [generate_json(i) for i in range(number_of_data)]
+    metadata = [generate_json(i, metadata_key) for i in range(number_of_data)]
     images = ["deeplake/tests/dummy_data/images/car.jpg" for i in range(number_of_data)]
     return texts, embeddings, ids, metadata, images
diff --git a/deeplake/util/bugout_reporter.py b/deeplake/util/bugout_reporter.py
index c8a89c0e68..8808ace4ed 100644
--- a/deeplake/util/bugout_reporter.py
+++ b/deeplake/util/bugout_reporter.py
@@ -166,7 +166,6 @@ def feature_report_path(
 
     token = token or read_token(from_env=True)
 
-    username = "public"
     if token is not None:
         username = jwt.decode(token, options={"verify_signature": False})["id"]
     set_username(username)

From 181d9779f67e2ca72032a88fdeae039bd6c098ab Mon Sep 17 00:00:00 2001
From: adilkhan
Date: Mon, 11 Sep 2023 18:33:04 +0600
Subject: [PATCH 19/22] Tests fix

---
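Note (reviewer, not part of the commit): these assertions change because a
local dataset opened with a valid token now resolves exec_option to
"compute_engine" when libdeeplake is installed, rather than falling back to
"python". Roughly, under those assumptions (hub_cloud_dev_token and
local_path are test fixtures):

    db = VectorStore(path=local_path, token=hub_cloud_dev_token)
    assert db.exec_option == "compute_engine"  # token + libdeeplake present

    db = VectorStore(path=local_path)  # anonymous, not logged in
    assert db.exec_option == "python"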
 deeplake/core/vectorstore/test_deeplake_vectorstore.py | 4 ++--
 deeplake/core/vectorstore/vector_search/utils.py       | 5 +++++
 2 files changed, 7 insertions(+), 2 deletions(-)

diff --git a/deeplake/core/vectorstore/test_deeplake_vectorstore.py b/deeplake/core/vectorstore/test_deeplake_vectorstore.py
index 4e326815d3..a0825b7f32 100644
--- a/deeplake/core/vectorstore/test_deeplake_vectorstore.py
+++ b/deeplake/core/vectorstore/test_deeplake_vectorstore.py
@@ -239,7 +239,7 @@ def test_search_basic(local_path, hub_cloud_dev_token):
         token=hub_cloud_dev_token,
     )
 
-    assert vector_store.exec_option == "python"
+    assert vector_store.exec_option == "compute_engine"
 
     vector_store.add(embedding=embeddings, text=texts, metadata=metadatas)
 
@@ -395,7 +395,7 @@ def filter_fn(x):
         path=local_path, overwrite=True, token=hub_cloud_dev_token, exec_option=None
     )
 
-    assert vector_store_none_exec.exec_option == "python"
+    assert vector_store_none_exec.exec_option == "compute_engine"
 
     # Check that filter_fn with cloud dataset (and therefore "compute_engine" exec option) switches to "python" automatically.
     with pytest.warns(None):
diff --git a/deeplake/core/vectorstore/vector_search/utils.py b/deeplake/core/vectorstore/vector_search/utils.py
index a7fd7033de..4522b7514c 100644
--- a/deeplake/core/vectorstore/vector_search/utils.py
+++ b/deeplake/core/vectorstore/vector_search/utils.py
@@ -2,6 +2,7 @@
 from deeplake.constants import MB
 from deeplake.enterprise.util import raise_indra_installation_error
+from deeplake.util.exceptions import TensorDoesNotExistError
 from deeplake.util.warnings import always_warn
 from deeplake.client.utils import read_token
 from deeplake.core.dataset import DeepLakeCloudDataset, Dataset
@@ -105,6 +106,10 @@ def parse_return_tensors(dataset, return_tensors, embedding_tensor, return_view)
         for tensor in dataset.tensors
         if (tensor != embedding_tensor or return_tensors == "*")
     ]
+    for tensor in return_tensors:
+        if tensor not in dataset.tensors:
+            raise TensorDoesNotExistError(tensor)
+
     return return_tensors

From 61559794c158efac39ed628a0c831b3684e406a1 Mon Sep 17 00:00:00 2001
From: adilkhan
Date: Mon, 11 Sep 2023 18:39:12 +0600
Subject: [PATCH 20/22] darglint fix

---
 deeplake/api/dataset.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/deeplake/api/dataset.py b/deeplake/api/dataset.py
index c1832cf63a..3d941e4837 100644
--- a/deeplake/api/dataset.py
+++ b/deeplake/api/dataset.py
@@ -386,6 +386,7 @@ def empty(
             verbose (bool): If True, logs will be printed. Defaults to True.
             lock_timeout (int): Number of seconds to wait before throwing a LockException. If None, wait indefinitely
             lock_enabled (bool): If true, the dataset manages a write lock. NOTE: Only set to False if you are managing concurrent access externally.
+
         Returns:
             Dataset: Dataset created using the arguments provided.
 
From 84700a6477b513a0731b7c39cd234752fd4176d2 Mon Sep 17 00:00:00 2001
From: adilkhan
Date: Mon, 11 Sep 2023 19:24:48 +0600
Subject: [PATCH 21/22] Extended VectorStore functionality

---
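Note (reviewer, not part of the commit): token, username, and exec_option
become read-only properties here, so nothing is frozen at construction time
and a CLI login or logout between calls is reflected on the next access
(this is what the new logout assertions in the tests rely on). A minimal
sketch of the pattern, reusing the same read_token helper the diff imports:

    from deeplake.client.utils import read_token

    class LazyAuth:
        def __init__(self, token=None):
            self._token = token  # only the explicit argument is stored

        @property
        def token(self):
            # re-reads the environment on every access, so a later CLI
            # login/logout changes the result without rebuilding the object
            return self._token or read_token(from_env=True)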
""" - self.token = token or read_token(from_env=True) + self._token = token self.path = path - self.username = "public" - if self.token is not None: - self.username = jwt.decode(self.token, options={"verify_signature": False})[ - "id" - ] feature_report_path( path, @@ -157,12 +152,27 @@ def __init__( **kwargs, ) self.embedding_function = embedding_function - self.exec_option = utils.parse_exec_option( - self.dataset, exec_option, _INDRA_INSTALLED, self.username - ) + self._exec_option = exec_option self.verbose = verbose self.tensor_params = tensor_params + @property + def token(self): + return self._token or read_token(from_env=True) + + @property + def exec_option(self) -> str: + return utils.parse_exec_option( + self.dataset, self._exec_option, _INDRA_INSTALLED, self.username + ) + + @property + def username(self) -> str: + username = "public" + if self.token is not None: + username = jwt.decode(self.token, options={"verify_signature": False})["id"] + return username + def add( self, embedding_function: Optional[Union[Callable, List[Callable]]] = None, diff --git a/deeplake/core/vectorstore/test_deeplake_vectorstore.py b/deeplake/core/vectorstore/test_deeplake_vectorstore.py index a0825b7f32..7286a0bfeb 100644 --- a/deeplake/core/vectorstore/test_deeplake_vectorstore.py +++ b/deeplake/core/vectorstore/test_deeplake_vectorstore.py @@ -1837,6 +1837,12 @@ def test_exec_option_cli( ) assert db.exec_option == "compute_engine" + # hub cloud dataset and logged in with cli + db = VectorStore( + path="mem://abc", + ) + assert db.exec_option == "python" + # logging out with cli runner.invoke(logout) @@ -1846,12 +1852,28 @@ def test_exec_option_cli( ) assert db.exec_option == "python" + # Check whether after logging out exec_option changes to python # logging in with cli token runner.invoke(login, f"-t {hub_cloud_dev_token}") db = VectorStore( path=local_path, ) assert db.exec_option == "compute_engine" + # logging out with cli + runner.invoke(logout) + assert db.exec_option == "python" + + # Check whether after logging out when token specified exec_option doesn't change + # logging in with cli token + runner.invoke(login, f"-t {hub_cloud_dev_token}") + db = VectorStore( + path=local_path, + token=hub_cloud_dev_token, + ) + assert db.exec_option == "compute_engine" + # logging out with cli + runner.invoke(logout) + assert db.exec_option == "compute_engine" @requires_libdeeplake @@ -1870,7 +1892,13 @@ def test_exec_option_with_connected_datasets( hub_cloud_dev_managed_creds_key, path, ): + runner = CliRunner() + db = VectorStore(path, overwrite=True) + assert db.exec_option == "python" + + runner.invoke(login, f"-t {hub_cloud_dev_token}") + assert db.exec_option == "python" db.dataset.connect( creds_key=hub_cloud_dev_managed_creds_key, diff --git a/deeplake/core/vectorstore/vector_search/utils.py b/deeplake/core/vectorstore/vector_search/utils.py index 4522b7514c..ede0ad6ce1 100644 --- a/deeplake/core/vectorstore/vector_search/utils.py +++ b/deeplake/core/vectorstore/vector_search/utils.py @@ -7,6 +7,7 @@ from deeplake.client.utils import read_token from deeplake.core.dataset import DeepLakeCloudDataset, Dataset from deeplake.client.client import DeepLakeBackendClient +from deeplake.util.path import get_path_type import numpy as np @@ -39,12 +40,13 @@ def get_exec_option(self): class ExecOptionCloudDataset(ExecOptionBase): - def __init__(self, dataset, indra_installed, username): + def __init__(self, dataset, indra_installed, username, path_type): self.dataset = dataset self.indra_installed = 
 deeplake/api/tests/test_api.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/deeplake/api/tests/test_api.py b/deeplake/api/tests/test_api.py
index b97e31cc0b..62ab22504d 100644
--- a/deeplake/api/tests/test_api.py
+++ b/deeplake/api/tests/test_api.py
@@ -2871,6 +2871,7 @@ def test_extend_rollbacks(local_ds, lfpw_links):
     ds.commit()
 
 
+@pytest.mark.slow
 @pytest.mark.parametrize(
     "compression_args",
     [