From 5e951aa30c21e519cab8fc691720340002c6ee11 Mon Sep 17 00:00:00 2001 From: tall-josh Date: Mon, 20 May 2024 15:27:13 +1000 Subject: [PATCH 1/2] download and write Highlighter Datasets --- download-and-writer-datasets-locally.md | 95 +++++++++++++++++++++++++ 1 file changed, 95 insertions(+) create mode 100644 download-and-writer-datasets-locally.md diff --git a/download-and-writer-datasets-locally.md b/download-and-writer-datasets-locally.md new file mode 100644 index 0000000..89632a4 --- /dev/null +++ b/download-and-writer-datasets-locally.md @@ -0,0 +1,95 @@ ++++ +title = "Download Datasets From Highlighter" +description = "Download a Dataset from Highlighter and save it to a local file system in a selected format" +date = 2024-05-20T08:00:00+00:00 +updated = 2024-05-20T08:00:00+00:00 +draft = false +weight = 100 +sort_by = "weight" +template = "docs/page.html" + +[extra] +toc = true +top = false ++++ + +The Highlighter SDK allows you to download your Datasets from your Highlighter +account and save them in some common formats. + +When converting to common formats such as Coco or Yolo, things like `entity_id` +will not be preserved. Only the information necessary for training will end up +in the resulting saved dataset. If you want to save a dataset locally and not +lose this information you must use the `hdf` or `json` format. + +## CLI + +```console +hl dataset read --help +Usage: hl dataset read [OPTIONS] COMMAND [ARGS]... + +Options: + -i, --dataset-ids TEXT integet or : + --page-size INTEGER [default: 200] + --help Show this message and exit. 
+ +Commands: + coco + hdf + yolo +``` + +**Use `--help` to see the format-specific CLI options** + +For example, the following will: + - download datasets 123 and 456 + - save the images to `/my/image/cache/` + - save the annotations as a `coco` dataset to `my_dataset/` + +```console +hl dataset read -i 123:train -i 456:test coco --annotations-dir my_dataset/ --data-file-dir /my/image/cache/ + +ls my_dataset/ +> test.json train.json +``` + +Use `--help` to see the format-specific CLI options + +```console +hl dataset read -i 123 yolo --help +``` + +## Python API + +The following will do the same download and conversion as in the CLI example: + +```python +from pathlib import Path +from highlighter.datasets import Dataset +from highlighter.datasets.formats.coco import CocoWriter +from highlighter import HLClient + +client = HLClient.from_env() + +train_ds = Dataset.read_highlighter_dataset_assessments( + client, 123 + ) +train_ds.data_files_df.loc[:, "split"] = "train" + +test_ds = Dataset.read_highlighter_dataset_assessments( + client, 456 + ) +test_ds.data_files_df.loc[:, "split"] = "test" + +combined_ds = Dataset.combine([train_ds, test_ds]) + +annotations_dir = Path("my_dataset/") +writer = CocoWriter(annotations_dir) +writer.write(combined_ds) + +images_dir = Path("/my/image/cache/") +Dataset.download_dataset_files( + client, + images_dir, + combined_ds.data_files_df, + ) +``` From 022f1b026969228c273ff1940bb69dec5a8c083c Mon Sep 17 00:00:00 2001 From: tall-josh Date: Mon, 20 May 2024 15:45:10 +1000 Subject: [PATCH 2/2] Move download-and-write-datasets-locally.md --- .../docs/how-to-guides/download-and-writer-datasets-locally.md | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename download-and-writer-datasets-locally.md => content/docs/how-to-guides/download-and-writer-datasets-locally.md (100%) diff --git a/download-and-writer-datasets-locally.md b/content/docs/how-to-guides/download-and-writer-datasets-locally.md similarity index 100% rename from 
download-and-writer-datasets-locally.md rename to content/docs/how-to-guides/download-and-writer-datasets-locally.md