diff --git a/oxen/Cargo.lock b/oxen/Cargo.lock index 5b089bb..b807c26 100644 --- a/oxen/Cargo.lock +++ b/oxen/Cargo.lock @@ -866,9 +866,9 @@ dependencies = [ [[package]] name = "base64" -version = "0.21.3" +version = "0.21.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "414dcefbc63d77c526a76b3afcf6fbb9b5e2791c19c3aa2297733208750c6e53" +checksum = "9ba43ea6f343b788c8764558649e08df62f86c6ef251fdaeb1ffd010a9ae50a2" [[package]] name = "bindgen" @@ -2575,9 +2575,9 @@ checksum = "f7012b1bbb0719e1097c47611d3898568c546d597c2e74d66f6087edd5233ff4" [[package]] name = "liboxen" -version = "0.9.1" +version = "0.9.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f5c336147545f23b80f22ff06faf3fc9e766043c7a6a87151e3fff2eed824868" +checksum = "c2897f48f0de5764e5044abeea0e2e396dde28e57b9c96d80627eecef6ba191c" dependencies = [ "actix-files", "actix-web", @@ -2725,9 +2725,9 @@ dependencies = [ [[package]] name = "lofty" -version = "0.15.0" +version = "0.16.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8143c1ac799df98778738e48e403990dcae27c9843f89ae0bd79967ddd512448" +checksum = "cfa7a62ede7d634892901a2be8bb32f3e13d0418f276d2a391a509afe050f01b" dependencies = [ "base64", "byteorder", @@ -3128,7 +3128,7 @@ checksum = "04744f49eae99ab78e0d5c0b603ab218f515ea8cfe5a456d7629ad883a3b6e7d" [[package]] name = "oxen" -version = "0.2.0" +version = "0.3.0" dependencies = [ "bindgen 0.66.1", "cc", diff --git a/oxen/Cargo.toml b/oxen/Cargo.toml index bdcb510..b83a1c3 100644 --- a/oxen/Cargo.toml +++ b/oxen/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "oxen" -version = "0.2.0" +version = "0.3.0" edition = "2021" # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html @@ -15,7 +15,7 @@ log = "0.4.17" pyo3-log = "0.8.1" tokio = { version = "1", features = ["full"] } pyo3-polars = "0.6.0" -liboxen = "0.9.1" +liboxen = "0.9.3" # liboxen = { path = "../../rust/Oxen/src/lib" } [build-dependencies] diff --git a/oxen/python/oxen/__init__.py b/oxen/python/oxen/__init__.py index 05acdda..9d78c11 100644 --- a/oxen/python/oxen/__init__.py +++ b/oxen/python/oxen/__init__.py @@ -12,6 +12,8 @@ from oxen.op import Op from oxen import auth from oxen import loaders +from oxen.clone import clone +from oxen.init import init # Names of public modules we want to expose __all__ = [ @@ -23,9 +25,11 @@ "PyLocalRepo", "PyStagedData", "Op", + "clone", + "init", "RemoteRepo", "LocalRepo", + "auth", "loaders", "util", - "auth", ] diff --git a/oxen/python/oxen/auth.py b/oxen/python/oxen/auth.py index 4623e16..fa0e68a 100644 --- a/oxen/python/oxen/auth.py +++ b/oxen/python/oxen/auth.py @@ -1,7 +1,20 @@ from .oxen import auth, util from typing import Optional -def config_auth(host: str, token: str, path: Optional[str] = None): + +def config_auth(token: str, host: str = "hub.oxen.ai", path: Optional[str] = None): + """ + Configures authentication for a host. + + Args: + token: `str` + The token to use for authentication. + host: `str` + The host to configure authentication for. Default: 'hub.oxen.ai' + path: `Optional[str]` + The path to save the authentication config to. + Defaults to $HOME/.config/oxen/user_config.toml + """ if path is None: path = f"{util.get_oxen_config_dir()}/user_config.toml" if not path.endswith(".toml"): diff --git a/oxen/python/oxen/clone.py b/oxen/python/oxen/clone.py new file mode 100644 index 0000000..ac4a315 --- /dev/null +++ b/oxen/python/oxen/clone.py @@ -0,0 +1,50 @@ +from typing import Optional +from oxen.local_repo import LocalRepo + + +def clone( + repo_id: str, + path: Optional[str] = None, + host: str = "hub.oxen.ai", + branch: str = "main", + protocol: str = "https", + shallow=False, + all=False, +): + """ + Clone a repository + + Args: + repo_id: `str` + Name of the repository in the format 'namespace/repo_name'. + For example 'ox/chatbot' + path: `Optional[str]` + The path to clone the repo to. Defaults to the name of the repository. + host: `str` + The host to connect to. Defaults to 'hub.oxen.ai' + branch: `str` + The branch name id to clone. Defaults to 'main' + protocol: `str` + The protocol to use. Defaults to 'https' + shallow: `bool` + Whether to do a shallow clone or not. Default: False + all: `bool` + Whether to clone the full commit history or not. Default: False + Returns: + [LocalRepo](/python-api/local_repo) + A LocalRepo object that can be used to interact with the cloned repo. + """ + # Verify repo_id format + if not "/" in repo_id: + raise ValueError(f"Invalid repo_id format: {repo_id}") + # Get repo name from repo_id + repo_name = repo_id.split("/")[1] + # Get path from repo_name if not provided + if path is None: + path = repo_name + # Get repo url + repo_url = f"{protocol}://{host}/{repo_id}" + # Clone repo + local_repo = LocalRepo(path) + local_repo.clone(repo_url, branch=branch, shallow=shallow, all=all) + return local_repo diff --git a/oxen/python/oxen/init.py b/oxen/python/oxen/init.py new file mode 100644 index 0000000..e37ee3a --- /dev/null +++ b/oxen/python/oxen/init.py @@ -0,0 +1,19 @@ +from oxen.local_repo import LocalRepo + + +def init( + path: str, +): + """ + Initialize a [LocalRepo](/python-api/local_repo) at the given path. + + Args: + path: `str` + The path to initialize the repo at. + Returns: + [LocalRepo](/python-api/local_repo) + A LocalRepo object that can be used to interact with the repo. + """ + # Init Repo + local_repo = LocalRepo(path) + return local_repo.init() diff --git a/oxen/python/oxen/local_repo.py b/oxen/python/oxen/local_repo.py index e546307..5a80ec2 100644 --- a/oxen/python/oxen/local_repo.py +++ b/oxen/python/oxen/local_repo.py @@ -1,6 +1,7 @@ from oxen import PyLocalRepo import os + class LocalRepo: """ The LocalRepo class that allows you to interact with your local oxen repo. @@ -43,8 +44,6 @@ def __init__(self, path: str = "", mkdir=False): path = os.path.abspath(path) if not os.path.exists(path) and mkdir: os.makedirs(path) - elif not os.path.exists(path): - raise Exception(f"Path {path} does not exist.") self._repo = PyLocalRepo(path) @@ -57,6 +56,7 @@ def init(self): Will create a .oxen folder to store all the versions and metadata. """ self._repo.init() + return self def clone(self, url: str, branch: str = "main", shallow=False, all=False): """ @@ -148,6 +148,9 @@ def set_remote(self, name: str, url: str): """ self._repo.set_remote(name, url) + def create_remote(self, name: str): + self._repo.create_remote(name) + def push(self, remote_name: str = "origin", branch: str = "main"): """ Push data to a remote repo from a local repo. @@ -186,4 +189,4 @@ def current_branch(self): """ Returns the current branch. """ - return self._repo.current_branch() \ No newline at end of file + return self._repo.current_branch() diff --git a/oxen/python/oxen/remote_repo.py b/oxen/python/oxen/remote_repo.py index 7b26d32..70eb90c 100644 --- a/oxen/python/oxen/remote_repo.py +++ b/oxen/python/oxen/remote_repo.py @@ -2,7 +2,50 @@ import os from typing import Optional -from oxen import PyRemoteRepo +from typing import List, Tuple +from .oxen import PyRemoteRepo, remote + + +def get_repo(name: str, host: str = "hub.oxen.ai"): + """ + Get a RemoteRepo object for the specified name. For example 'ox/CatDogBBox'. + + Args: + name: `str` + Name of the repository in the format 'namespace/repo_name'. + host: `str` + The host to connect to. Defaults to 'hub.oxen.ai' + Returns: + [RemoteRepo](/python-api/remote_repo) + """ + return remote.get_repo(name, host) + + +def create_repo( + name: str, + description="", + is_public: bool = True, + host: str = "hub.oxen.ai", + files: List[Tuple[str, str]] = [], +): + """ + Create a new repository on the remote server. + + Args: + name: `str` + Name of the repository in the format 'namespace/repo_name'. + description: `str` + Description of the repository. Only applicable to [OxenHub](https://oxen.ai). + is_public: `bool` + Whether the repository is public or private. Only applicable to [OxenHub](https://oxen.ai). + host: `str` + The host to connect to. Defaults to 'hub.oxen.ai' + files: `List[Tuple[str, str]]` + A list of tuples containing the path to the file and the contents of the file that you would like to seed the repository with. + Returns: + [RemoteRepo](/python-api/remote_repo) + """ + return remote.create_repo(name, description, is_public, host, files) class RemoteRepo: @@ -115,7 +158,9 @@ def ls( return self._repo.ls(directory, page_num, page_size) - def download(self, remote_path: str, local_path: Optional[str] = None, revision: str = ""): + def download( + self, remote_path: str, local_path: Optional[str] = None, revision: str = "" + ): """ Download a file or directory from the remote repo. diff --git a/oxen/python/oxen/user.py b/oxen/python/oxen/user.py index 3ab9526..3e05c9d 100644 --- a/oxen/python/oxen/user.py +++ b/oxen/python/oxen/user.py @@ -1,16 +1,38 @@ from .oxen import user, util from typing import Optional + def config_user(name: str, email: str, path: Optional[str] = None): + """ + Configures user for a host. + + Args: + name: `str` + The name to use for user. + email: `str` + The email to use for user. + path: `Optional[str]` + The path to save the user config to. + Defaults to $HOME/.config/oxen/user_config.toml + """ if path is None: path = f"{util.get_oxen_config_dir()}/user_config.toml" if not path.endswith(".toml"): raise ValueError(f"Path {path} must end with .toml") return user.config_user(name, email, path) + def current_user(path: Optional[str] = None): + """ + Gets the current user. + + Args: + path: `Optional[str]` + The path to load the user config from. + Defaults to $HOME/.config/oxen/user_config.toml + """ if path is None: path = f"{util.get_oxen_config_dir()}/user_config.toml" if not path.endswith(".toml"): raise ValueError(f"Path {path} must end with .toml") - return user.current_user(path) \ No newline at end of file + return user.current_user(path) diff --git a/oxen/src/lib.rs b/oxen/src/lib.rs index ba6235f..2af1483 100644 --- a/oxen/src/lib.rs +++ b/oxen/src/lib.rs @@ -1,8 +1,4 @@ -use std::path::PathBuf; -use error::PyOxenError; -use liboxen::{config::UserConfig, model::{repository::local_repository::FileNew, RepositoryNew}}; -use py_remote_repo::PyRemoteRepo; use pyo3::prelude::*; pub mod error; @@ -18,6 +14,7 @@ pub mod py_remote_repo; pub mod py_paginated_dir_entries; pub mod py_staged_data; pub mod py_user; +pub mod remote; pub mod user; pub mod util; @@ -70,25 +67,11 @@ fn oxen(py: Python, m: &PyModule) -> PyResult<()> { user_module.add_function(wrap_pyfunction!(user::current_user, user_module)?)?; m.add_submodule(user_module)?; - Ok(()) -} + // Remote Module + let remote_module = PyModule::new(py, "remote")?; + remote_module.add_function(wrap_pyfunction!(remote::get_repo, remote_module)?)?; + remote_module.add_function(wrap_pyfunction!(remote::create_repo, remote_module)?)?; + m.add_submodule(remote_module)?; -// TODO: be able to pass in file list and contents from python -#[pyfunction] -pub fn create_remote_repo(namespace: String, name: String, host: String) -> Result { - let result = pyo3_asyncio::tokio::get_runtime().block_on(async { - let config = UserConfig::get()?; - let user = config.to_user(); - let files: Vec = vec![FileNew { - path: PathBuf::from("README.md"), - contents: format!("# {}\n", &name), - }]; - let repo = RepositoryNew::from_files(&namespace, &name, files, user); - liboxen::api::remote::repositories::create(repo, &host).await - })?; - Ok(PyRemoteRepo { - repo: result.clone(), - host: host.clone(), - revision: "main".to_string(), - }) + Ok(()) } diff --git a/oxen/src/py_diff.rs b/oxen/src/py_diff.rs index 103cf38..d3011d0 100644 --- a/oxen/src/py_diff.rs +++ b/oxen/src/py_diff.rs @@ -10,21 +10,21 @@ pub struct PyDiff { #[pymethods] impl PyDiff { fn __repr__(&self) -> String { - format!("PyDiff(type={})", self.get_type()) + format!("PyDiff(type=TODO)") } - #[getter] - pub fn get_type(&self) -> String { - match &self.diff { - GenericDiff::DirDiff(_diff) => { - "dir".to_string() - }, - GenericDiff::TabularDiff(_diff) => { - "tabular".to_string() - }, - // GenericDiff::TextDiff(_diff) => { - // "text".to_string() - // }, - } - } + // #[getter] + // pub fn get_type(&self) -> String { + // match &self.diff { + // GenericDiff::DirDiff(_diff) => { + // "dir".to_string() + // }, + // GenericDiff::TabularDiff(_diff) => { + // "tabular".to_string() + // }, + // // GenericDiff::TextDiff(_diff) => { + // // "text".to_string() + // // }, + // } + // } } \ No newline at end of file diff --git a/oxen/src/py_remote_repo.rs b/oxen/src/py_remote_repo.rs index 6254ace..6803c11 100644 --- a/oxen/src/py_remote_repo.rs +++ b/oxen/src/py_remote_repo.rs @@ -1,9 +1,9 @@ use liboxen::model::entry::mod_entry::ModType; -use liboxen::model::repository::local_repository::FileNew; +use liboxen::model::file::FileNew; use pyo3::prelude::*; use liboxen::config::UserConfig; -use liboxen::model::{NewCommitBody, ContentType, Remote, RemoteRepository, RepositoryNew}; +use liboxen::model::{NewCommitBody, ContentType, Remote, RemoteRepository, RepoNew}; use liboxen::{api, command}; use pyo3::exceptions::PyValueError; @@ -54,6 +54,14 @@ impl PyRemoteRepo { }) } + fn __repr__(&self) -> String { + format!("RemoteRepo(namespace='{}', name='{}', url='{}')", self.namespace(), self.name(), self.url()) + } + + fn __str__(&self) -> String { + format!("{}/{}", self.namespace(), self.name()) + } + fn url(&self) -> &str { self.repo.url() } @@ -77,21 +85,22 @@ impl PyRemoteRepo { fn create(&mut self, empty: bool) -> Result { let result = pyo3_asyncio::tokio::get_runtime().block_on(async { if empty { - api::remote::repositories::create_empty( - &self.repo.namespace, - &self.repo.name, - &self.host, - ) - .await + let repo = RepoNew::from_namespace_name_host( + self.repo.namespace.clone(), + self.repo.name.clone(), + self.host.clone(), + ); + api::remote::repositories::create_empty(repo).await } else { let config = UserConfig::get()?; let user = config.to_user(); let files: Vec = vec![FileNew { path: PathBuf::from("README.md"), contents: format!("# {}\n", &self.repo.name), + user: user.clone() }]; - let repo = RepositoryNew::from_files(&self.repo.namespace, &self.repo.name, files, user); - api::remote::repositories::create(repo, &self.host).await + let repo = RepoNew::from_files(&self.repo.namespace, &self.repo.name, files); + api::remote::repositories::create(repo).await } })?; diff --git a/oxen/src/py_staged_data.rs b/oxen/src/py_staged_data.rs index d8ddd05..af12d39 100644 --- a/oxen/src/py_staged_data.rs +++ b/oxen/src/py_staged_data.rs @@ -21,6 +21,14 @@ impl PyStagedData { self.data.to_string() } + pub fn is_dirty(&self) -> bool { + return !self.data.is_clean(); + } + + pub fn is_clean(&self) -> bool { + return self.data.is_clean(); + } + pub fn added_files(&self) -> PyResult> { Ok(self .data diff --git a/oxen/src/remote.rs b/oxen/src/remote.rs new file mode 100644 index 0000000..00c69d4 --- /dev/null +++ b/oxen/src/remote.rs @@ -0,0 +1,82 @@ + +use pyo3::prelude::*; +use std::path::PathBuf; + +use liboxen::config::UserConfig; +use liboxen::constants::DEFAULT_BRANCH_NAME; +use liboxen::model::{file::FileNew, RepoNew}; +use liboxen::error::OxenError; +use crate::error::PyOxenError; +use crate::py_remote_repo::PyRemoteRepo; + +#[pyfunction] +pub fn get_repo(name: String, host: String) -> Result, PyOxenError> { + let result = pyo3_asyncio::tokio::get_runtime().block_on(async { + liboxen::api::remote::repositories::get_by_name_and_host(name, &host).await + })?; + + if let Some(repo) = result { + return Ok(Some(PyRemoteRepo { + repo: repo.clone(), + host: host.clone(), + revision: DEFAULT_BRANCH_NAME.to_string(), + })); + } + + Ok(None) +} + +#[pyfunction] +pub fn create_repo( + name: String, + description: String, + is_public: bool, + host: String, + files: Vec<(String, String)> +) -> Result { + // Check that name is valid ex: :namespace/:repo_name + if !name.contains("/") { + return Err(OxenError::basic_str(format!( + "Invalid repository name: {}", + name + )).into()); + } + + let namespace = name.split("/").collect::>()[0].to_string(); + let repo_name = name.split("/").collect::>()[1].to_string(); + + let result = pyo3_asyncio::tokio::get_runtime().block_on(async { + let config = UserConfig::get()?; + let user = config.to_user(); + + if files.is_empty() { + let mut repo = RepoNew::from_namespace_name_host(namespace, repo_name, host.clone()); + if !description.is_empty() { + repo.description = Some(description); + } + repo.is_public = Some(is_public); + + liboxen::api::remote::repositories::create_empty(repo).await + } else { + let files: Vec = files.iter().map(|(path, contents)| { + FileNew { + path: PathBuf::from(path), + contents: contents.to_string(), + user: user.clone() + } + }).collect(); + let mut repo = RepoNew::from_files(&namespace, &repo_name, files); + if !description.is_empty() { + repo.description = Some(description); + } + repo.is_public = Some(is_public); + + liboxen::api::remote::repositories::create(repo).await + } + })?; + Ok(PyRemoteRepo { + repo: result.clone(), + host: host.clone(), + revision: DEFAULT_BRANCH_NAME.to_string(), + }) +} diff --git a/oxen/tests/test_download.py b/oxen/tests/test_download.py index fe8b2ea..1fb7159 100644 --- a/oxen/tests/test_download.py +++ b/oxen/tests/test_download.py @@ -1,4 +1,3 @@ - # def test_download_directory_with_slash( # celeba_remote_repo_fully_pushed, empty_local_dir # ): @@ -9,6 +8,7 @@ # # download the annotations directory # remote_repo.download("annotations/") + def test_download_directory_without_slash( celeba_remote_repo_fully_pushed, empty_local_dir ): diff --git a/scripts/hf2oxen.py b/scripts/hf2oxen.py index 071a39d..5b630ba 100644 --- a/scripts/hf2oxen.py +++ b/scripts/hf2oxen.py @@ -1,14 +1,142 @@ -import datasets + import argparse +import os +from huggingface_hub import list_repo_refs, HfApi +from datasets import load_dataset +from oxen.remote_repo import create_repo, get_repo +from oxen import LocalRepo + +def human_size(bytes, units=[' bytes','KB','MB','GB','TB', 'PB', 'EB']): + """ Returns a human readable string representation of bytes """ + return str(bytes) + units[0] if bytes < 1024 else human_size(bytes>>10, units[1:]) + +def get_dataset_info(dataset_name): + import requests + # headers = {"Authorization": f"Bearer {API_TOKEN}"} + headers = {} + API_URL = f"https://datasets-server.huggingface.co/info?dataset={dataset_name}" + def query(): + response = requests.get(API_URL, headers=headers) + return response.json() + data = query() + return data # argparse the name of the dataset parser = argparse.ArgumentParser(description='Download a dataset from hugging face and upload to Oxen.') # parse dataset as -d or --dataset parser.add_argument('-d','--dataset', dest="dataset", required=True, help="Name of the dataset to download from hugging face") +parser.add_argument('-o','--output', dest="output", required=True, help="The output directory to save the dataset to") +parser.add_argument('-n', '--namespace', dest="namespace", default="ox", help="The oxen namespace to upload to") +parser.add_argument('--host', dest="host", default="hub.oxen.ai", help="The host to upload to") args = parser.parse_args() -# download the dataset from hugging face -hf_dataset = datasets.load_dataset(args.dataset) +dataset_name = args.dataset +output_dir = args.output +namespace = args.namespace +host = args.host + +api = HfApi() + +info = api.repo_info(dataset_name, repo_type="dataset") +print(info) +print(info.description) +commits = api.list_repo_commits(dataset_name, repo_type="dataset") +commits.reverse() +print(f"Got {len(commits)} commits") + +info = get_dataset_info(dataset_name) +print(info) +sizes = [] +for key in info['dataset_info'].keys(): + info_obj = info['dataset_info'][key] + if 'size_in_bytes' in info_obj: + size_in_bytes = info_obj['size_in_bytes'] + else: + size_in_bytes = info_obj['dataset_size'] + print(f"{key}: {human_size(size_in_bytes)}") + sizes.append(size_in_bytes) +sum_sizes = sum(sizes) +print(f"Dataset size: {human_size(sum_sizes)}") + +if sum_sizes > 5_000_000_000: + print(f"Dataset size is {human_size(sum_sizes)}, this is greater than 5GB, do not continue") + exit(1) + +# if dir exists, do not continue +output_dir = os.path.join(output_dir, dataset_name) +if os.path.exists(output_dir): + print(f"Directory {output_dir} exists, do not continue") + exit(1) + +clean_name = dataset_name +if "/" in clean_name: + clean_name = dataset_name.replace("/", "_") + +name = f"{namespace}/{clean_name}" +# Create Remote Repo +if get_repo(name, host=host): + print(f"Repo {name} exists, do not continue") + exit(1) + +# create dir +os.makedirs(output_dir) + +# TODO: Create repo with description and README.md based off of contents of dataset info +remote_repo = create_repo(name, host=host) +local_repo = LocalRepo(output_dir) +local_repo.init() +local_repo.set_remote("origin", remote_repo.url()) + +for commit in commits: + print(f"Loading commit: {commit}...") + + # download the dataset from hugging face + try: + hf_dataset = load_dataset(dataset_name, revision=commit.commit_id) + print(hf_dataset) + for key, dataset in hf_dataset.items(): + filename = os.path.join(output_dir, f"{key}.parquet") + dataset.to_parquet(filename) + local_repo.add(filename) + + except Exception as e: + print(f"Got Exception: {e}") + error_str = f"{e}" + split_str = "Please pick one among the available configs: [" + if split_str in error_str: + config_options = error_str.split(split_str)[-1] + config_options = config_options.split("]")[0] + print(f"Available configs for {dataset_name}: {config_options}") + options = config_options.split(",") + for option in options: + option = option.replace("'", "").strip() + print(f"Download dataset {dataset_name} with option {option}") + hf_dataset = load_dataset(dataset_name, option, revision=commit.commit_id) + print(hf_dataset) + + # info = hf_dataset.info + # print(info) + + for key, dataset in hf_dataset.items(): + filename = os.path.join(output_dir, f"{key}_{option}.parquet") + dataset.to_parquet(filename) + local_repo.add(filename) + except: + print(f"Failed to download dataset {dataset_name} with commit {commit}") + continue + + status = local_repo.status() + commit_message = f"{commit.title}\n\n{commit.message}" + if status.is_dirty(): + print(f"✅ Committing with message: {commit_message}...") + + if commit_message == "": + commit.message = f"Update dataset {commit.commit_id}" + + local_repo.commit(commit_message) + else: + print(f"🤷‍♂️ Skipping commit with message: {commit_message}...") + +print(f"Uploading {dataset_name} to {host}...") +local_repo.push() -# export the dataset to a parquet file -hf_dataset.export_to_file(f"{args.dataset}.parquet") \ No newline at end of file