From b91e496359fef5d49512536b8d227683571111b4 Mon Sep 17 00:00:00 2001
From: Nikhil Thorat
Date: Thu, 15 Feb 2024 11:15:24 -0500
Subject: [PATCH 1/7] formats

---
 lilac/formats/sharegpt.py                     |   2 +-
 lilac/router_dataset.py                       |  10 ++
 lilac/router_dataset_signals.py               |  49 ++++++-
 notebooks/Clustering copy.ipynb               | 120 ++++++++++++++++++
 .../lib/components/ComputeClusterModal.svelte | 119 ++++++++++++-----
 .../src/lib/queries/datasetQueries.ts         |   4 +
 web/lib/fastapi_client/index.ts               |   1 +
 .../models/ClusterInputSelectorConfig.ts      |  13 ++
 .../fastapi_client/models/ClusterOptions.ts   |   5 +-
 .../services/DatasetsService.ts               |  25 ++++
 10 files changed, 304 insertions(+), 44 deletions(-)
 create mode 100644 notebooks/Clustering copy.ipynb
 create mode 100644 web/lib/fastapi_client/models/ClusterInputSelectorConfig.ts

diff --git a/lilac/formats/sharegpt.py b/lilac/formats/sharegpt.py
index 75c9f2e91..49a6b1038 100644
--- a/lilac/formats/sharegpt.py
+++ b/lilac/formats/sharegpt.py
@@ -59,5 +59,5 @@ class ShareGPT(DatasetFormat):
 
   input_selectors: ClassVar[dict[str, DatasetFormatInputSelector]] = {
     selector.name: selector
-    for selector in [_SYSTEM_SELECTOR, _HUMAN_SELECTOR, _GPT_SELECTOR, _TOOL_SELECTOR]
+    for selector in [_HUMAN_SELECTOR, _SYSTEM_SELECTOR, _GPT_SELECTOR, _TOOL_SELECTOR]
   }
diff --git a/lilac/router_dataset.py b/lilac/router_dataset.py
index fbef6181f..76a0b08e2 100644
--- a/lilac/router_dataset.py
+++ b/lilac/router_dataset.py
@@ -534,3 +534,13 @@ def restore_rows(
     searches=options.searches,
     filters=sanitized_filters,
   )
+
+
+@router.get('/{namespace}/{dataset_name}/format_selectors')
+def get_format_selectors(namespace: str, dataset_name: str) -> list[str]:
+  """Get format selectors for the dataset if a format has been inferred."""
+  dataset = get_dataset(namespace, dataset_name)
+  manifest = dataset.manifest()
+  if manifest.dataset_format:
+    return list(manifest.dataset_format.input_selectors.keys())
+  return []
diff --git a/lilac/router_dataset_signals.py b/lilac/router_dataset_signals.py
index 7c90335cc..b20455a60 100644
--- a/lilac/router_dataset_signals.py
+++ b/lilac/router_dataset_signals.py
@@ -1,5 +1,5 @@
 """Routing endpoints for running signals on datasets."""
-from typing import Annotated, Optional
+from typing import Annotated, Optional, Union
 
 from fastapi import APIRouter, HTTPException
 from fastapi.params import Depends
@@ -7,9 +7,11 @@
 from pydantic import Field as PydanticField
 
 from .auth import UserInfo, get_session_user, get_user_access
+from .config import ClusterInputSelectorConfig
+from .dataset_format import DatasetFormatInputSelector, get_dataset_format_cls
 from .db_manager import get_dataset
 from .router_utils import RouteErrorHandler
-from .schema import Path
+from .schema import Path, PathTuple, normalize_path
 from .signal import Signal, resolve_signal
 from .tasks import TaskId, get_task_manager, launch_task
 
@@ -82,7 +84,9 @@ def run() -> None:
 class ClusterOptions(BaseModel):
   """The request for the cluster endpoint."""
 
-  input: Path
+  input: Optional[Path] = None
+  input_selector: Optional[ClusterInputSelectorConfig] = None
+  output_path: Optional[Path] = None
 
   use_garden: bool = PydanticField(
     default=False, description='Accelerate computation by running remotely on Lilac Garden.'
@@ -107,14 +111,45 @@ def cluster(
   if not get_user_access(user).dataset.compute_signals:
     raise HTTPException(401, 'User does not have access to compute clusters over this dataset.')
 
-  path_str = '.'.join(map(str, options.input))
-  task_name = f'[{namespace}/{dataset_name}] Clustering "{path_str}"'
-  task_id = get_task_manager().task_id(name=task_name)
+  if options.input is None and options.input_selector is None:
+    raise HTTPException(400, 'Either input or input_selector must be provided.')
+
   dataset = get_dataset(namespace, dataset_name)
+  manifest = dataset.manifest()
+
+  cluster_input: Optional[Union[DatasetFormatInputSelector, PathTuple]] = None
+  if options.input:
+    path_str = '.'.join(map(str, options.input))
+    task_name = f'[{namespace}/{dataset_name}] Clustering "{path_str}"'
+    cluster_input = normalize_path(options.input)
+  elif options.input_selector:
+    dataset_format = manifest.dataset_format
+    if dataset_format is None:
+      raise ValueError('Dataset format is not defined.')
+
+    format_cls = get_dataset_format_cls(dataset_format.name)
+    if format_cls is None:
+      raise ValueError(f'Unknown format: {dataset_format.name}')
+
+    format = format_cls()
+    if format != manifest.dataset_format:
+      raise ValueError(
+        f'Cluster input format {options.input_selector.format} does not match '
+        f'dataset format {manifest.dataset_format}'
+      )
+
+    cluster_input = format_cls.input_selectors[options.input_selector.selector]
+
+    task_name = (
+      f'[{namespace}/{dataset_name}] Clustering using input selector '
+      f'"{options.input_selector.selector}"'
+    )
+
+  task_id = get_task_manager().task_id(name=task_name)
 
   def run() -> None:
     dataset.cluster(
-      options.input,
+      cluster_input,
       options.output_path,
       use_garden=options.use_garden,
       overwrite=options.overwrite,
diff --git a/notebooks/Clustering copy.ipynb b/notebooks/Clustering copy.ipynb
new file mode 100644
index 000000000..84a3ffbca
--- /dev/null
+++ b/notebooks/Clustering copy.ipynb
@@ -0,0 +1,120 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Clustering\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "This notebook accompanies the [Cluster a dataset](https://docs.lilacml.com/datasets/dataset_cluster.html) guide.\n",
+    "Let's start by loading a small dataset of multi-turn conversations between a human and a chatbot:\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Dataset \"capybara\" written to ./datasets/local/capybara\n"
+     ]
+    }
+   ],
+   "source": [
+    "import lilac as ll\n",
+    "\n",
+    "ds = ll.get_dataset('local', 'OpenHermes-2.5-100k')"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "We can cluster the `input` field under the `conversation` array by calling:\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "[local/capybara][1 shards] map \"extract_text\" to \"('conversation_input__cluster',)\": 100%|██████████| 16006/16006 [00:00<00:00, 30424.61it/s]\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Wrote map output to conversation_input__cluster-00000-of-00001.parquet\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "[local/capybara][1 shards] map \"compute_clusters\" to \"('conversation_input__cluster',)\": 0%|          | 0/16006 [00:00
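
Taken together, the server changes above let a client start clustering by naming a
dataset-format input selector (for example the "human" side of ShareGPT
conversations) instead of an explicit input path. Below is a minimal client
sketch of that flow. It is not part of the patch: the '/api/v1/datasets' URL
prefix, the POST '/cluster' route, and the local port are assumptions based on
how the other dataset routes appear to be mounted, and the
{'format': ..., 'selector': ...} payload shape is inferred from the
ClusterInputSelectorConfig usage in the handler above.

# Hypothetical client sketch for the two endpoints touched by this patch.
# Assumptions (not confirmed by the patch): dataset routes mounted under
# /api/v1/datasets, clustering exposed as POST .../cluster, Lilac server
# running locally on port 5432.
import requests

BASE = 'http://localhost:5432/api/v1/datasets'
namespace, name = 'local', 'OpenHermes-2.5-100k'

# 1) Ask which input selectors the inferred dataset format exposes. The new
#    GET /format_selectors endpoint returns [] when no format was inferred.
selectors = requests.get(f'{BASE}/{namespace}/{name}/format_selectors').json()
print(selectors)  # For ShareGPT data: ['human', 'system', 'gpt', 'tool']

# 2) Launch clustering over a selector instead of an explicit input path.
#    The handler requires at least one of `input` or `input_selector`;
#    when both are given, `input` takes precedence.
response = requests.post(
  f'{BASE}/{namespace}/{name}/cluster',
  json={'input_selector': {'format': 'sharegpt', 'selector': 'human'}},
)
response.raise_for_status()  # The handler launches a background task.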