[AL-2409] Enable local dataset usage with Indra #2573

Merged: 25 commits, Sep 12, 2023
11 changes: 9 additions & 2 deletions deeplake/api/dataset.py
@@ -1,6 +1,7 @@
import os

import deeplake
import jwt
import pathlib
import posixpath
from typing import Dict, Optional, Union, List
@@ -11,6 +12,7 @@
from deeplake.auto.unstructured.yolo.yolo import YoloDataset
from deeplake.client.client import DeepLakeBackendClient
from deeplake.client.log import logger
from deeplake.client.utils import get_user_name, read_token
from deeplake.core.dataset import Dataset, dataset_factory
from deeplake.core.tensor import Tensor
from deeplake.core.meta.dataset_meta import DatasetMeta
@@ -361,6 +363,7 @@ def empty(
lock_enabled: Optional[bool] = True,
lock_timeout: Optional[int] = 0,
verbose: bool = True,
username: str = "public",
Contributor: Why do we have a username parameter when it is inferred from credentials?

Author: Does a token count as creds?

(See the sketch below for how the username is inferred from the token.)

) -> Dataset:
"""Creates an empty dataset

@@ -383,8 +386,8 @@ def empty(
org_id (str, Optional): Organization id to be used for enabling enterprise features. Only applicable for local datasets.
verbose (bool): If True, logs will be printed. Defaults to True.
lock_timeout (int): Number of seconds to wait before throwing a LockException. If None, wait indefinitely
lock_enabled (bool): If true, the dataset manages a write lock. NOTE: Only set to False if you are managing concurrent access externally. Defaults to ``True``.

lock_enabled (bool): If true, the dataset manages a write lock. NOTE: Only set to False if you are managing concurrent access externally.
username (str): Username to be used for creating a dataset in the Deep Lake Tensor Database.
Returns:
Dataset: Dataset created using the arguments provided.

@@ -435,6 +438,7 @@ def empty(
"lock_timeout": lock_timeout,
},
token=token,
username=username,
)
except Exception as e:
if isinstance(e, UserNotLoggedInException):
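
Re the username discussion above: this PR infers the username client-side by decoding the JWT token, as the VectorStore changes further down show. A minimal sketch of that inference, assuming a standard Activeloop JWT that carries an "id" claim:

    import jwt  # PyJWT

    def username_from_token(token):
        # Fallback used throughout this PR when no token is available.
        if token is None:
            return "public"
        # Decode without verifying the signature, mirroring the PR's own code.
        return jwt.decode(token, options={"verify_signature": False})["id"]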
@@ -481,6 +485,7 @@ def load(
check_integrity: bool = True,
lock_timeout: Optional[int] = 0,
lock_enabled: Optional[bool] = True,
username: str = "public",
) -> Dataset:
"""Loads an existing dataset

@@ -546,6 +551,7 @@ def load(
reset (bool): If the specified dataset cannot be loaded due to a corrupted HEAD state of the branch being loaded,
setting ``reset=True`` will reset HEAD changes and load the previous version.
check_integrity (bool): If the param is True it will do integrity check during dataset loading otherwise the check is not performed
username (str): Username to be used for loading a dataset in the Deep Lake Tensor Database.

..
# noqa: DAR101
@@ -598,6 +604,7 @@
"load",
{"lock_enabled": lock_enabled, "lock_timeout": lock_timeout},
token=token,
username=username,
)
except Exception as e:
if isinstance(e, UserNotLoggedInException):
18 changes: 18 additions & 0 deletions deeplake/client/client.py
@@ -39,6 +39,7 @@
GET_PRESIGNED_URL_SUFFIX,
CONNECT_DATASET_SUFFIX,
REMOTE_QUERY_SUFFIX,
ORG_PERMISSION_SUFFIX,
)
from deeplake.client.log import logger
import jwt # should add it to requirements.txt
@@ -509,3 +510,20 @@ def remote_query(
).json()

return response

def has_indra_org_permission(self, org_id: str) -> Dict[str, Any]:
"""Queries a remote dataset.

Args:
org_id (str): The organization to which the dataset belongs.

Returns:
Dict[str, Any]: The json response containing org permissions.
"""
response = self.request(
"GET",
ORG_PERMISSION_SUFFIX.format(org_id),
endpoint=self.endpoint(),
).json()

return response
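
A hedged usage sketch of the new helper; the client constructor arguments and the response shape are assumptions, not confirmed by this diff:

    from deeplake.client.client import DeepLakeBackendClient

    client = DeepLakeBackendClient(token=token)  # constructor args assumed
    perms = client.has_indra_org_permission("my_org")
    # The response shape is not shown here; inspect it before relying on keys.
    print(perms)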
1 change: 1 addition & 0 deletions deeplake/client/config.py
@@ -30,3 +30,4 @@
DEFAULT_REQUEST_TIMEOUT = 170

DEEPLAKE_AUTH_TOKEN = "ACTIVELOOP_TOKEN"
ORG_PERMISSION_SUFFIX = "/api/organizations/{}/features/dataset_query"
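
The new suffix is a plain format string, so the request path is built with str.format, e.g.:

    ORG_PERMISSION_SUFFIX.format("my_org")
    # -> "/api/organizations/my_org/features/dataset_query"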
3 changes: 3 additions & 0 deletions deeplake/core/dataset/dataset.py
@@ -4684,3 +4684,6 @@ def _temp_write_access(self):

def _get_storage_repository(self) -> Optional[str]:
return getattr(self.base_storage, "repository", None)

def get_user_name(self) -> Optional[str]:
Contributor: Is this used?

return getattr(self.base_storage, "user_name", None)
56 changes: 45 additions & 11 deletions deeplake/core/vectorstore/deeplake_vectorstore.py
@@ -1,6 +1,7 @@
import logging
import pathlib
from typing import Optional, Any, Iterable, List, Dict, Union, Callable
from typing import Optional, Any, List, Dict, Union, Callable
import jwt

import numpy as np

@@ -15,10 +16,10 @@
from deeplake.constants import (
DEFAULT_VECTORSTORE_TENSORS,
)
from deeplake.client.utils import read_token
from deeplake.core.vectorstore import utils
from deeplake.core.vectorstore.vector_search import vector_search
from deeplake.core.vectorstore.vector_search import dataset as dataset_utils
from deeplake.core.vectorstore.vector_search import filter as filter_utils

from deeplake.util.bugout_reporter import (
feature_report_path,
@@ -45,6 +46,7 @@ def __init__(
verbose: bool = True,
runtime: Optional[Dict] = None,
creds: Optional[Union[Dict, str]] = None,
org_id: Optional[str] = None,
**kwargs: Any,
) -> None:
"""Creates an empty VectorStore or loads an existing one if it exists at the specified ``path``.
@@ -105,6 +107,14 @@ def __init__(
Danger:
Setting ``overwrite`` to ``True`` will delete all of your data if the Vector Store exists! Be very careful when setting this parameter.
"""
self.token = token or read_token(from_env=True)
self.username = "public"
self.path = path
if self.token is not None:
self.username = jwt.decode(self.token, options={"verify_signature": False})[
"id"
]

feature_report_path(
path,
"vs.initialize",
@@ -118,11 +128,12 @@
"read_only": read_only,
"ingestion_batch_size": ingestion_batch_size,
"exec_option": exec_option,
"token": token,
"token": self.token,
"verbose": verbose,
"runtime": runtime,
},
token=token,
token=self.token,
username=self.username,
)

self.ingestion_batch_size = ingestion_batch_size
@@ -134,19 +145,21 @@
self.dataset = dataset_utils.create_or_load_dataset(
tensor_params,
path,
token,
self.token,
creds,
logger,
read_only,
exec_option,
embedding_function,
overwrite,
runtime,
org_id,
self.username,
**kwargs,
)
self.embedding_function = embedding_function
self.exec_option = utils.parse_exec_option(
self.dataset, exec_option, _INDRA_INSTALLED
self.dataset, exec_option, _INDRA_INSTALLED, self.username
)
self.verbose = verbose
self.tensor_params = tensor_params
@@ -233,7 +246,8 @@ def add(
Optional[List[str]]: List of ids if ``return_ids`` is set to True. Otherwise, None.
"""

deeplake_reporter.feature_report(
feature_report_path(
path=self.path,
feature_name="vs.add",
parameters={
"tensors": list(tensors.keys()) if tensors else None,
@@ -242,6 +256,8 @@
"embedding_function": True if embedding_function is not None else False,
"embedding_data": True if embedding_data is not None else False,
},
token=self.token,
username=self.username,
)

(
@@ -367,7 +383,8 @@ def search(
Dict: Dictionary where keys are tensor names and values are the results of the search
"""

deeplake_reporter.feature_report(
feature_report_path(
path=self.path,
feature_name="vs.search",
parameters={
"embedding_data": True if embedding_data is not None else False,
@@ -382,6 +399,8 @@
"return_tensors": return_tensors,
"return_view": return_view,
},
token=self.token,
username=self.username,
)

if exec_option is None and self.exec_option != "python" and callable(filter):
@@ -481,7 +500,8 @@ def delete(
ValueError: If neither ``ids``, ``filter``, ``query``, nor ``delete_all`` are specified, or if an invalid ``exec_option`` is provided.
"""

deeplake_reporter.feature_report(
feature_report_path(
path=self.path,
feature_name="vs.delete",
parameters={
"ids": True if ids is not None else False,
@@ -491,6 +511,8 @@
"exec_option": exec_option,
"delete_all": delete_all,
},
token=self.token,
username=self.username,
)

if not row_ids:
@@ -575,7 +597,8 @@ def update_embedding(
embedding_source_tensor (Union[str, List[str]], optional): Name of tensor with data that needs to be converted to embeddings. Defaults to `text`.
embedding_tensor (Optional[Union[str, List[str]]], optional): Name of the tensor with embeddings. Defaults to None.
"""
deeplake_reporter.feature_report(
feature_report_path(
path=self.path,
feature_name="vs.delete",
parameters={
"ids": True if ids is not None else False,
@@ -584,6 +607,8 @@
"filter": True if filter is not None else False,
"exec_option": exec_option,
},
token=self.token,
username=self.username,
)

(
@@ -635,12 +660,21 @@ def delete_by_path(
Danger:
This method permanently deletes all of your data if the Vector Store exists! Be very careful when using this method.
"""
token = token or read_token(from_env=True)
username = "public"
if token:
username = jwt.decode(token, options={"verify_signature": False})["id"]

feature_report_path(
path,
"vs.delete_by_path",
{},
parameters={
"path": path,
"token": token,
"force": force,
},
token=token,
username=username,
)
deeplake.delete(path, large_ok=True, token=token, force=force)

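Putting the VectorStore changes together with the tests below, a minimal sketch of the new behavior (the import path is assumed; the class name follows the tests):

    from deeplake.core.vectorstore.deeplake_vectorstore import VectorStore

    # Without a token, a local store falls back to the "python" exec option;
    # with a token, compute_engine becomes available for local datasets.
    db = VectorStore(path="./my_local_vectorstore", token=my_token)  # my_token assumed
    print(db.exec_option)  # expected: "compute_engine"
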
95 changes: 95 additions & 0 deletions deeplake/core/vectorstore/test_deeplake_vectorstore.py
@@ -23,6 +23,8 @@
DatasetHandlerError,
)
from deeplake.core.vectorstore.vector_search import dataset as dataset_utils
from deeplake.cli.auth import login, logout
from click.testing import CliRunner


EMBEDDING_DIM = 100
@@ -1784,3 +1786,96 @@ def test_read_only():
def test_delete_by_path_wrong_path():
with pytest.raises(DatasetHandlerError):
VectorStore.delete_by_path("some_path")


@requires_libdeeplake
def test_exec_option_with_auth(local_path, hub_cloud_path, hub_cloud_dev_token):
db = VectorStore(path=local_path)
assert db.exec_option == "python"

db = VectorStore(
path=local_path,
token=hub_cloud_dev_token,
)
assert db.exec_option == "compute_engine"

db = VectorStore(
path=hub_cloud_path,
token=hub_cloud_dev_token,
)
assert db.exec_option == "compute_engine"

db = VectorStore(
path=hub_cloud_path + "_tensor_db",
token=hub_cloud_dev_token,
runtime={"tensor_db": True},
)
assert db.exec_option == "tensor_db"


@requires_libdeeplake
def test_exec_option_cli(
local_path,
hub_cloud_path,
hub_cloud_dev_token,
hub_cloud_dev_credentials,
):
runner = CliRunner()
username, password = hub_cloud_dev_credentials
# Testing exec_option when the cli login and logout commands are executed
runner.invoke(login, f"-u {username} -p {password}")

# local dataset and logged in with cli
db = VectorStore(
path=local_path,
)
assert db.exec_option == "compute_engine"

# hub cloud dataset and logged in with cli
db = VectorStore(
path=hub_cloud_path,
)
assert db.exec_option == "compute_engine"

# logging out with cli
runner.invoke(logout)

# local dataset and logged out with cli
db = VectorStore(
path=local_path,
)
assert db.exec_option == "python"

# logging in with cli token
runner.invoke(login, f"-t {hub_cloud_dev_token}")
db = VectorStore(
path=local_path,
)
assert db.exec_option == "compute_engine"


@requires_libdeeplake
@pytest.mark.parametrize(
"path",
[
"s3_path",
"gcs_path",
"azure_path",
],
indirect=True,
)
def test_exec_option_with_connected_datasets(
hub_cloud_dev_token,
hub_cloud_path,
hub_cloud_dev_managed_creds_key,
path,
):
db = VectorStore(path, overwrite=True)

db.dataset.connect(
creds_key=hub_cloud_dev_managed_creds_key,
dest_path=hub_cloud_path,
token=hub_cloud_dev_token,
)
db.dataset.add_creds_key(hub_cloud_dev_managed_creds_key, managed=True)
assert db.exec_option == "compute_engine"
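
One more hedged note tying back to config.py above: read_token(from_env=True) presumably falls back to the ACTIVELOOP_TOKEN environment variable (DEEPLAKE_AUTH_TOKEN), so the token can also come from the environment rather than an explicit argument. A sketch, with the exact fallback behavior an assumption:

    import os

    os.environ["ACTIVELOOP_TOKEN"] = "<your token>"  # hypothetical value
    db = VectorStore(path="./my_local_vectorstore")  # token picked up from env (assumed)
    assert db.exec_option == "compute_engine"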