Cloud Uploader #12

Merged

merged 3 commits on Dec 20, 2024
5 changes: 4 additions & 1 deletion .gitignore
@@ -162,4 +162,7 @@ cython_debug/
#.idea/

construe/datasets/fixtures
construe/models/fixtures
construe/models/fixtures

# For credentials management
.secret
71 changes: 70 additions & 1 deletion README.md
@@ -103,7 +103,76 @@ Options:
7. [Movie Scenes](https://huggingface.co/datasets/unography/movie-scenes): An image dataset that contains stills from commercial movies and can be used for image classification and content-moderation tasks.


## Releases
## Developer Information

If you are a construe developer, there are several helper utilities built into the library that allow you to manage datasets and models both locally and in the cloud. First, however, you must install some additional dependencies.

In `requirements.txt`, uncomment the section labeled `# Packaging Dependencies` so that your requirements include a section similar to:

```
# Packaging Dependencies
black==24.10.0
build==1.2.2.post1
datasets==3.1.0
flake8==7.1.1
google-cloud-storage==2.19.0
packaging==24.2
pip==24.3.1
setuptools==75.3.0
twine==5.1.1
wheel==0.45.0
```

**NOTE:** this README might not list every required dependency, so make sure you work from the latest `requirements.txt`.

Then install these dependencies and the test dependencies:

```
$ pip install -r requirements.txt
$ pip install -r tests/requirements.txt
```

### Tests and Linting

All tests are in the `tests` folder, structured similarly to the `construe` module, and can be run with `pytest`:

```
$ pytest
```

We use `flake8` for linting as configured in `setup.cfg`. Note that the `.flake8` file is for IDEs only and is not used when running tests. If you want to use `black` to automatically format your files:

```
$ black path/to/file.py
```

### Dataset Management

The `python -m construe.datasets` utility provides helper functionality for managing datasets, including the following commands:

- **manifest**: Generate a manifest file from local fixtures.
- **originals**: Download original datasets and store them in fixtures.
- **sample**: Create a sample dataset from the original that is smaller.
- **upload**: Upload datasets to GCP for user downloads.

To regenerate the datasets, first run the `originals` command to download the datasets from HuggingFace or elsewhere on the web, then run `sample` to create smaller statistical samples of those datasets. Run `manifest` to generate a new manifest with SHA256 signatures for the datasets, then run `upload` to save them to our GCP bucket.

You must have valid GCP service account credentials to upload datasets.
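Putting these steps together, a typical regeneration session might look like the sketch below, where `DATASET` is a placeholder for the dataset to sample and the `-c` credentials path is illustrative (credentials can also be supplied via `$GOOGLE_APPLICATION_CREDENTIALS` or the `.secret` folder):

```
$ python -m construe.datasets originals
$ python -m construe.datasets sample DATASET
$ python -m construe.datasets manifest
$ python -m construe.datasets upload -c .secret/service-account.json
```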

### Models Management

The `python -m construe.models` utility provides helpers for managing models and converting them to the tflite format, including the following commands:

- **convert**: Convert source models to the tflite format for use in embedded systems.
- **manifest**: Generate a manifest file from local fixtures.
- **originals**: Download original models and store them in fixtures.
- **upload**: Upload converted models to GCP for user downloads.

To regenerate the models, first run the `originals` command to download the models from HuggingFace, then run `convert` to transform them into the tflite format. Run `manifest` to generate a new manifest with SHA256 signatures for the models, then run `upload` to save them to our GCP bucket.

You must have valid GCP service account credentials to upload models.
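The model workflow is analogous; a sketch, assuming the `construe.models` flags mirror the datasets utility:

```
$ python -m construe.models originals
$ python -m construe.models convert
$ python -m construe.models manifest
$ python -m construe.models upload -c .secret/service-account.json
```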

### Releases

To release the construe library and deploy to PyPI, run the following commands:

7 changes: 7 additions & 0 deletions construe/cloud/__init__.py
@@ -0,0 +1,7 @@
"""
The cloud module helps developers create manifests and upload datasets, models,
and manifests to cloud storage, where they're cached for use with the LLM
benchmark utility.

This module should not be used by benchmark users; use the datasets and models
utilities directly instead.
"""
74 changes: 74 additions & 0 deletions construe/cloud/gcp.py
@@ -0,0 +1,74 @@
"""
Upload datasets and models to the Google Cloud bucket (developers only).
"""

import os
import glob
import json

try:
    from google.cloud import storage
except ImportError:
    storage = None


CONSTRUE_BUCKET = "construe"
GOOGLE_CREDENTIALS = "GOOGLE_APPLICATION_CREDENTIALS"


def upload(name, path, client=None, bucket=CONSTRUE_BUCKET):
    """
    Upload data from source path to a bucket with destination name.
    """
    if client is None:
        client = connect_storage()

    if not os.path.isfile(path):
        raise ValueError("no zip file exists at " + path)

    bucket = client.get_bucket(bucket)
    blob = bucket.blob(name)
    blob.upload_from_filename(path)

    return blob.public_url


def connect_storage(credentials=None):
    """
    Create a google cloud storage client and connect.
    """
    # Attempt to fetch credentials from the environment
    credentials = credentials or os.environ.get(GOOGLE_CREDENTIALS, None)

    # Attempt to get credentials from the .secret folder
    credentials = credentials or find_service_account()

    if credentials is None:
        raise RuntimeError(
            "could not find service account credentials: "
            "set either $GOOGLE_APPLICATION_CREDENTIALS to the path "
            "or store the credentials in the .secret folder"
        )

    # Cannot connect without the storage library.
    if storage is None:
        raise ImportError(
            "the google.cloud.storage module is required, install using pip"
        )

    return storage.Client.from_service_account_json(credentials)


def find_service_account():
    # Look for service account JSON files in the project's .secret directory.
    secret = os.path.abspath(os.path.join(
        os.path.dirname(__file__),
        "..", "..", ".secret", "*.json"
    ))

    for path in glob.glob(secret):
        with open(path, "r") as f:
            data = json.load(f)
        # Service account keys identify themselves by their universe domain.
        if data.get("universe_domain") == "googleapis.com":
            return path

    return None
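For developers scripting uploads directly, a minimal usage sketch of this module follows; the blob name and local fixture path are illustrative, and `connect_storage` resolves credentials from `$GOOGLE_APPLICATION_CREDENTIALS` or the `.secret` folder as above:

```python
from construe.cloud.gcp import connect_storage, upload

# Connect using service account credentials from the environment or .secret/
client = connect_storage()

# Illustrative names: store a local fixture archive as a versioned blob
url = upload(
    "v0.3.0/datasets/dialects.zip",
    "construe/datasets/fixtures/dialects.zip",
    client=client,
)
print(url)  # the public URL of the uploaded blob
```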
51 changes: 51 additions & 0 deletions construe/cloud/manifest.py
@@ -0,0 +1,51 @@
"""
Manifest handlers for downloading cloud resources and checking signatures.
"""

import os
import json
import glob

from urllib.parse import urljoin

from .signature import sha256sum
from ..version import get_version


BUCKET = "construe"
BASE_URL = "https://storage.googleapis.com/"

MODELS = "models"
DATASETS = "datasets"


def load_manifest(path):
    with open(path, "r") as f:
        return json.load(f)


def generate_manifest(fixtures, out, upload_type):
    manifest = {}
    version = get_version(short=True)

    for path in glob.glob(os.path.join(fixtures, "*.zip")):
        fname = os.path.basename(path)
        name, _ = os.path.splitext(fname)

        manifest[name] = {
            "url": make_fixture_url(fname, upload_type=upload_type, version=version),
            "signature": sha256sum(path),
        }

    with open(out, "w") as o:
        json.dump(manifest, o, indent=2)


def make_fixture_url(fname, upload_type, version=None):
    path = make_fixture_path(fname, upload_type, version)
    return urljoin(BASE_URL, path)


def make_fixture_path(fname, upload_type, version=None):
    version = version or get_version(short=True)
    # Join with "/" rather than os.path.join so URL paths are portable.
    return "/".join([BUCKET, f"v{version}", upload_type, fname])
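A minimal usage sketch with illustrative paths: scan the zipped fixtures into a manifest, then preview the URL a fixture resolves to:

```python
from construe.cloud.manifest import generate_manifest, make_fixture_url

# Write {"<name>": {"url": ..., "signature": ...}} for each *.zip fixture
generate_manifest(
    "construe/datasets/fixtures", "construe/datasets/manifest.json", "datasets"
)

# e.g. https://storage.googleapis.com/construe/v<version>/datasets/dialects.zip
print(make_fixture_url("dialects.zip", upload_type="datasets"))
```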
File renamed without changes.
51 changes: 49 additions & 2 deletions construe/datasets/__main__.py
@@ -9,6 +9,7 @@
from .source import download_source_datasets, SOURCE_DATASETS
from .source import sample_source_datasets
from .manifest import generate_manifest
from .upload import upload_datasets
from .path import FIXTURES


@@ -63,11 +64,19 @@ def manifest(fixtures=FIXTURES, out=None):
    multiple=True,
    help="specify datasets to exclude from source download",
)
def originals(fixtures=FIXTURES, exclude=None):
@click.option(
    "-i",
    "--include",
    type=click.Choice(SOURCE_DATASETS, case_sensitive=False),
    default=None,
    multiple=True,
    help="specify datasets to explicitly include in source download",
)
def originals(fixtures=FIXTURES, exclude=None, include=None):
    """
    Download original datasets and store them in fixtures.
    """
    download_source_datasets(out=FIXTURES, exclude=exclude)
    download_source_datasets(out=fixtures, exclude=exclude, include=include)


@main.command(epilog=EPILOG)
@@ -112,6 +121,44 @@ def sample(dataset, fixtures=FIXTURES, out=FIXTURES, size=0.25, suffix="-sample"
    sample_source_datasets(dataset, fixtures, out, size, suffix)


@main.command(epilog=EPILOG)
@click.option(
    "-f",
    "--fixtures",
    type=str,
    default=FIXTURES,
    help="path to fixtures directory where source datasets have been downloaded",
)
@click.option(
    "-e",
    "--exclude",
    type=click.Choice(SOURCE_DATASETS, case_sensitive=False),
    default=None,
    multiple=True,
    help="specify datasets to exclude from upload",
)
@click.option(
    "-i",
    "--include",
    type=click.Choice(SOURCE_DATASETS, case_sensitive=False),
    default=None,
    multiple=True,
    help="specify datasets to explicitly include in upload",
)
@click.option(
    "-c",
    "--credentials",
    type=str,
    default=None,
    help="path to service account json credentials for upload",
)
def upload(**kwargs):
    """
    Upload datasets to GCP for user downloads.
    """
    upload_datasets(**kwargs)


if __name__ == "__main__":
    main(
        obj={},
2 changes: 1 addition & 1 deletion construe/datasets/download.py
@@ -9,7 +9,7 @@
from functools import partial
from urllib.request import urlopen

from .signature import sha256sum
from ..cloud.signature import sha256sum
from .manifest import load_manifest
from .path import get_data_home, cleanup_dataset
from .path import DIALECTS, LOWLIGHT, REDDIT, MOVIES, ESSAYS, AEGIS, NSFW
28 changes: 14 additions & 14 deletions construe/datasets/manifest.json
@@ -1,58 +1,58 @@
{
"dialects": {
"url": "https://storage.googleapis.com/construe/v0.3.0/dialects.zip",
"url": "https://storage.googleapis.com/construe/v0.3.0/datasets/dialects.zip",
"signature": "0e6767047e05f618560d097dfa0587530636c52fc19507c087bdff556b389489"
},
"lowlight": {
"url": "https://storage.googleapis.com/construe/v0.3.0/lowlight.zip",
"url": "https://storage.googleapis.com/construe/v0.3.0/datasets/lowlight.zip",
"signature": "ddc36eb7f0443efa5e71939e503d0834fd48451281d9658d5cb7ead30143b98f"
},
"dialects-sample": {
"url": "https://storage.googleapis.com/construe/v0.3.0/dialects-sample.zip",
"url": "https://storage.googleapis.com/construe/v0.3.0/datasets/dialects-sample.zip",
"signature": "9e9509f4d82468c896bede36b16c6de218a1dce28a56ae49d1fb75933bf770c5"
},
"reddit": {
"url": "https://storage.googleapis.com/construe/v0.3.0/reddit.zip",
"url": "https://storage.googleapis.com/construe/v0.3.0/datasets/reddit.zip",
"signature": "d97419403f0d940970b2542d5b188570dacedae3c2a68ada3520cfa95c52f75c"
},
"movies-sample": {
"url": "https://storage.googleapis.com/construe/v0.3.0/movies-sample.zip",
"url": "https://storage.googleapis.com/construe/v0.3.0/datasets/movies-sample.zip",
"signature": "2d3d9294ad875e7489db94fc2ab02c1ad6dfdc15a2bf1a5037be36a6defc8168"
},
"essays-sample": {
"url": "https://storage.googleapis.com/construe/v0.3.0/essays-sample.zip",
"url": "https://storage.googleapis.com/construe/v0.3.0/datasets/essays-sample.zip",
"signature": "a77fc1c2c2718d79132598e6c873fd5b08c40c2e4049d995317747fb76b96631"
},
"aegis-sample": {
"url": "https://storage.googleapis.com/construe/v0.3.0/aegis-sample.zip",
"url": "https://storage.googleapis.com/construe/v0.3.0/datasets/aegis-sample.zip",
"signature": "a2b3ae9c5a19833cc594fc4c14a6bfce35ab9c6086f0c2836d2719ab788119bd"
},
"aegis": {
"url": "https://storage.googleapis.com/construe/v0.3.0/aegis.zip",
"url": "https://storage.googleapis.com/construe/v0.3.0/datasets/aegis.zip",
"signature": "c846f20d893461525839cd2f61f85faf0dcbff03e1998fd8f747506ff65bec69"
},
"nsfw-sample": {
"url": "https://storage.googleapis.com/construe/v0.3.0/nsfw-sample.zip",
"url": "https://storage.googleapis.com/construe/v0.3.0/datasets/nsfw-sample.zip",
"signature": "d5044f30769d3a6e9ba639312120dc955bdfcf4d8aa8a6f3ee493334644b9fcd"
},
"essays": {
"url": "https://storage.googleapis.com/construe/v0.3.0/essays.zip",
"url": "https://storage.googleapis.com/construe/v0.3.0/datasets/essays.zip",
"signature": "3a7b260dd5baec9134c7398ac7b9b297d7b1a387bce1a9f99cd8d3e0a7ceb9cc"
},
"reddit-sample": {
"url": "https://storage.googleapis.com/construe/v0.3.0/reddit-sample.zip",
"url": "https://storage.googleapis.com/construe/v0.3.0/datasets/reddit-sample.zip",
"signature": "24088c648b8c3497d0b682102c3fa965d46ca22abe8f94695287e09bf82db991"
},
"lowlight-sample": {
"url": "https://storage.googleapis.com/construe/v0.3.0/lowlight-sample.zip",
"url": "https://storage.googleapis.com/construe/v0.3.0/datasets/lowlight-sample.zip",
"signature": "f34bafa588441b8e240b0932e9ac446d9f805bdfdb22640c036c441258220eaf"
},
"movies": {
"url": "https://storage.googleapis.com/construe/v0.3.0/movies.zip",
"url": "https://storage.googleapis.com/construe/v0.3.0/datasets/movies.zip",
"signature": "618f7aa8aa103192ee8b76fc701ff182b2a41e5e78675a4d6af707e490d36f45"
},
"nsfw": {
"url": "https://storage.googleapis.com/construe/v0.3.0/nsfw.zip",
"url": "https://storage.googleapis.com/construe/v0.3.0/datasets/nsfw.zip",
"signature": "7ac498e8f17428c51a5c8c366aaf10b47663a9eb8a560fd8abe01366eaf60139"
}
}