From 5f5e4b7b62dff93ee6096dd9a7933b596ed60775 Mon Sep 17 00:00:00 2001
From: Gavrish Prabhu
Date: Tue, 5 Dec 2023 23:03:53 +0530
Subject: [PATCH 01/14] Create generate_rest.yaml

Propose generate rest api endpoints

Signed-off-by: Gavrish Prabhu
---
 specification/protocol/generate_rest.yaml | 130 ++++++++++++++++++++++
 1 file changed, 130 insertions(+)
 create mode 100644 specification/protocol/generate_rest.yaml

diff --git a/specification/protocol/generate_rest.yaml b/specification/protocol/generate_rest.yaml
new file mode 100644
index 0000000..4459eff
--- /dev/null
+++ b/specification/protocol/generate_rest.yaml
@@ -0,0 +1,130 @@
+openapi: 3.1.0
+info:
+  title: Open Inference API for text generation
+  description: Open Inference API for text generation
+  version: 1.0.0
+components:
+  schemas:
+    GenerateRequest:
+      type: object
+      required:
+        - text_input
+      properties:
+        text_input:
+          type: string
+        parameters:
+          $ref: '#/components/schemas/GenerateParameters'
+    GenerateParameters:
+      type: object
+      properties:
+        temperature:
+          type: number
+          "format": "float"
+          "default": "null"
+          "examples": [0.5]
+          minimum: 0
+          description: |-
+            What sampling temperature to use, between 0 and 2. Higher values like 0.8 will make the output more random, while lower values like 0.2 will make it more focused and deterministic.
+        top_p:
+          type: number
+          "format": "float"
+          maximum: 1.0
+          minimum: 0
+          description: |-
+            An alternative to sampling with temperature, called nucleus sampling, where the model considers the results of the tokens with top_p probability mass. So 0.1 means only the tokens comprising the top 10% probability mass are considered.
+        max_token:
+          type: integer
+          "format": "int32"
+          default: 20
+          minimum: 0
+          maximum: 512
+          description: The maximum number of tokens to generate in the completion.
+        best_of:
+          type: integer
+          "format": "int32"
+          minimum: 0
+          examples: [1]
+          description: |-
+            Generates best_of completions server-side and returns the "best" (the one with the highest log probability per token).
+        stop:
+          type: array
+          items:
+            type: string
+          maxItems: 4
+          description: |-
+            Up to 4 sequences where the API will stop generating further tokens.
+        frequency_penalty:
+          type: number
+          format: "float"
+          examples: [1.03]
+          minimum: 0
+          description: |-
+            Number between -2.0 and 2.0. Positive values penalize new tokens based on their existing frequency in the text so far, decreasing the model's likelihood to repeat the same line verbatim.
+    GenerateResponse:
+      type: object
+      required:
+        - text_output
+        - model_name
+        - model_version
+      properties:
+        text_output:
+          type: string
+        model_name:
+          type: string
+        model_version:
+          type: string
+    GenerateStreamResponse:
+      type: object
+      required:
+        - text_output
+        - model_name
+        - model_version
+        - done
+      properties:
+        text_output:
+          type: string
+        model_name:
+          type: string
+        model_version:
+          type: string
+paths:
+  /v2/models/{model_name}[/versions/${MODEL_VERSION}]/generate:
+    post:
+      parameters:
+        - name: model_name
+          required: true
+          in: path
+          schema:
+            type: string
+      requestBody:
+        content:
+          application/json:
+            schema:
+              $ref: '#/components/schemas/GenerateRequest'
+      responses:
+        '200':
+          description: generated text
+          content:
+            application/json:
+              schema:
+                $ref: '#/components/schemas/GenerateResponse'
+  /v2/models/{model_name}[/versions/${MODEL_VERSION}]/generate_stream:
+    post:
+      parameters:
+        - name: model_name
+          required: true
+          in: path
+          schema:
+            type: string
+      requestBody:
+        content:
+          application/json:
+            schema:
+              $ref: '#/components/schemas/GenerateRequest'
+      responses:
+        '200':
+          description: generated text stream
+          content:
+            text/event-stream:
+              schema:
+                $ref: '#/components/schemas/GenerateStreamResponse'

From 19973d4b5108e1756767b8f65a9363b00f866bb4 Mon Sep 17 00:00:00 2001
From: Gavrish Prabhu
Date: Thu, 7 Dec 2023 22:27:03 +0530
Subject: [PATCH 02/14] Update generate_rest.yaml

Signed-off-by: Gavrish Prabhu
---
 specification/protocol/generate_rest.yaml | 49 ++---------------------
 1 file changed, 4 insertions(+), 45 deletions(-)

diff --git a/specification/protocol/generate_rest.yaml b/specification/protocol/generate_rest.yaml
index 4459eff..d608b43 100644
--- a/specification/protocol/generate_rest.yaml
+++ b/specification/protocol/generate_rest.yaml
@@ -17,49 +17,8 @@ components:
     GenerateParameters:
       type: object
       properties:
-        temperature:
-          type: number
-          "format": "float"
-          "default": "null"
-          "examples": [0.5]
-          minimum: 0
-          description: |-
-            What sampling temperature to use, between 0 and 2. Higher values like 0.8 will make the output more random, while lower values like 0.2 will make it more focused and deterministic.
-        top_p:
-          type: number
-          "format": "float"
-          maximum: 1.0
-          minimum: 0
-          description: |-
-            An alternative to sampling with temperature, called nucleus sampling, where the model considers the results of the tokens with top_p probability mass. So 0.1 means only the tokens comprising the top 10% probability mass are considered.
-        max_token:
-          type: integer
-          "format": "int32"
-          default: 20
-          minimum: 0
-          maximum: 512
-          description: The maximum number of tokens to generate in the completion.
-        best_of:
-          type: integer
-          "format": "int32"
-          minimum: 0
-          examples: [1]
-          description: |-
-            Generates best_of completions server-side and returns the "best" (the one with the highest log probability per token).
-        stop:
-          type: array
-          items:
-            type: string
-          maxItems: 4
-          description: |-
-            Up to 4 sequences where the API will stop generating further tokens.
-        frequency_penalty:
-          type: number
-          format: "float"
-          examples: [1.03]
-          minimum: 0
-          description: |-
-            Number between -2.0 and 2.0. Positive values penalize new tokens based on their existing frequency in the text so far, decreasing the model's likelihood to repeat the same line verbatim.
+        parameter:
+          description: Parameters are framework specific. An optional object containing zero or more parameters for this generate request expressed as key/value pairs
     GenerateResponse:
       type: object
       required:
@@ -88,7 +47,7 @@ components:
       model_version:
         type: string
 paths:
-  /v2/models/{model_name}[/versions/${MODEL_VERSION}]/generate:
+  /v2/models/${MODEL_NAME}[/versions/${MODEL_VERSION}]/generate:
     post:
       parameters:
         - name: model_name
@@ -108,7 +67,7 @@ paths:
             application/json:
               schema:
                 $ref: '#/components/schemas/GenerateResponse'
-  /v2/models/{model_name}[/versions/${MODEL_VERSION}]/generate_stream:
+  /v2/models/${MODEL_NAME}[/versions/${MODEL_VERSION}]/generate_stream:
     post:
       parameters:
         - name: model_name

From 725b5ec9cf91d81bd5296f305252fbb6e8379da9 Mon Sep 17 00:00:00 2001
From: Gavrish Prabhu
Date: Tue, 12 Dec 2023 02:23:28 +0530
Subject: [PATCH 03/14] Update generate_rest.yaml

Signed-off-by: Gavrish Prabhu
---
 specification/protocol/generate_rest.yaml | 44 +++++++++++++++++++++++
 1 file changed, 44 insertions(+)

diff --git a/specification/protocol/generate_rest.yaml b/specification/protocol/generate_rest.yaml
index d608b43..e3cbbbf 100644
--- a/specification/protocol/generate_rest.yaml
+++ b/specification/protocol/generate_rest.yaml
@@ -19,6 +19,41 @@ components:
       properties:
         parameter:
           description: Parameters are framework specific. An optional object containing zero or more parameters for this generate request expressed as key/value pairs
+        "temperature":
+          "type": "number"
+          "format": "float"
+          "default": "null"
+          "minimum": 0
+          "description": "What sampling temperature to use, between 0 and 2. Higher values like 0.8 will make the output more random, while lower values like 0.2 will make it more focused and deterministic."
+        "top_p":
+          "type": "number"
+          "format": "float"
+          "maximum": 1
+          "minimum": 0
+          "description": "An alternative to sampling with temperature, called nucleus sampling, where the model considers the results of the tokens with top_p probability mass. So 0.1 means only the tokens comprising the top 10% probability mass are considered."
+        "max_token":
+          "type": "integer"
+          "format": "int32"
+          "default": 20
+          "minimum": 0
+          "maximum": 512
+          "description": "The maximum number of tokens to generate in the completion."
+        "best_of":
+          "type": "integer"
+          "format": "int32"
+          "minimum": 0
+          "description": "Generates best_of completions server-side and returns the \"best\" (the one with the highest log probability per token)."
+        "stop":
+          "type": "array"
+          "items":
+            "type": "string"
+          "maxItems": 5
+          "description": "Up to 5 sequences where the API will stop generating further tokens."
+        "frequency_penalty":
+          "type": "number"
+          "format": "float"
+          "minimum": 0
+          "description": "Number between -2.0 and 2.0. Positive values penalize new tokens based on their existing frequency in the text so far, decreasing the model's likelihood to repeat the same line verbatim."
     GenerateResponse:
       type: object
       required:
@@ -46,6 +81,15 @@ components:
         type: string
       model_version:
         type: string
+      done:
+        type: boolean
+    GenerateErrorResponse:
+      type: object
+      required:
+        - error
+      properties:
+        error:
+          type: string
 paths:
   /v2/models/${MODEL_NAME}[/versions/${MODEL_VERSION}]/generate:
     post:

From 8e729f013a97e16a860c31b3e212e56ae6c337dc Mon Sep 17 00:00:00 2001
From: Gavrish Prabhu
Date: Tue, 12 Dec 2023 02:26:13 +0530
Subject: [PATCH 04/14] Update generate_rest.yaml

Signed-off-by: Gavrish Prabhu
---
 specification/protocol/generate_rest.yaml | 70 +++++++++++------------
 1 file changed, 35 insertions(+), 35 deletions(-)

diff --git a/specification/protocol/generate_rest.yaml b/specification/protocol/generate_rest.yaml
index e3cbbbf..744c33d 100644
--- a/specification/protocol/generate_rest.yaml
+++ b/specification/protocol/generate_rest.yaml
@@ -19,41 +19,41 @@ components:
       properties:
         parameter:
           description: Parameters are framework specific. An optional object containing zero or more parameters for this generate request expressed as key/value pairs
-        "temperature":
-          "type": "number"
-          "format": "float"
-          "default": "null"
-          "minimum": 0
-          "description": "What sampling temperature to use, between 0 and 2. Higher values like 0.8 will make the output more random, while lower values like 0.2 will make it more focused and deterministic."
-        "top_p":
-          "type": "number"
-          "format": "float"
-          "maximum": 1
-          "minimum": 0
-          "description": "An alternative to sampling with temperature, called nucleus sampling, where the model considers the results of the tokens with top_p probability mass. So 0.1 means only the tokens comprising the top 10% probability mass are considered."
-        "max_token":
-          "type": "integer"
-          "format": "int32"
-          "default": 20
-          "minimum": 0
-          "maximum": 512
-          "description": "The maximum number of tokens to generate in the completion."
-        "best_of":
-          "type": "integer"
-          "format": "int32"
-          "minimum": 0
-          "description": "Generates best_of completions server-side and returns the \"best\" (the one with the highest log probability per token)."
-        "stop":
-          "type": "array"
-          "items":
-            "type": "string"
-          "maxItems": 5
-          "description": "Up to 5 sequences where the API will stop generating further tokens."
-        "frequency_penalty":
-          "type": "number"
-          "format": "float"
-          "minimum": 0
-          "description": "Number between -2.0 and 2.0. Positive values penalize new tokens based on their existing frequency in the text so far, decreasing the model's likelihood to repeat the same line verbatim."
+        temperature:
+          type: number
+          format: float
+          default: null
+          minimum: 0
+          description: What sampling temperature to use, between 0 and 2. Higher values like 0.8 will make the output more random, while lower values like 0.2 will make it more focused and deterministic.
+        top_p:
+          type: number
+          format: float
+          maximum: 1
+          minimum: 0
+          description: An alternative to sampling with temperature, called nucleus sampling, where the model considers the results of the tokens with top_p probability mass. So 0.1 means only the tokens comprising the top 10% probability mass are considered.
+        max_token:
+          type: integer
+          format: int32
+          default: 20
+          minimum: 0
+          maximum: 512
+          description: The maximum number of tokens to generate in the completion.
+        best_of:
+          type: integer
+          format: int32
+          minimum: 0
+          description: Generates best_of completions server-side and returns the \best\ (the one with the highest log probability per token).
+        stop:
+          type: array
+          items:
+            type: string
+          maxItems: 5
+          description: Up to 5 sequences where the API will stop generating further tokens.
+        frequency_penalty:
+          type: number
+          format: float
+          minimum: 0
+          description: Number between -2.0 and 2.0. Positive values penalize new tokens based on their existing frequency in the text so far, decreasing the model's likelihood to repeat the same line verbatim.
     GenerateResponse:
       type: object
       required:

From e300fe38a7f6ddaa4afbe05134b1526a8e0e8a40 Mon Sep 17 00:00:00 2001
From: Gavrish Prabhu
Date: Tue, 12 Dec 2023 12:00:02 +0530
Subject: [PATCH 05/14] Update generate_rest.yaml

Signed-off-by: Gavrish Prabhu
---
 specification/protocol/generate_rest.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/specification/protocol/generate_rest.yaml b/specification/protocol/generate_rest.yaml
index 744c33d..d7c3fe7 100644
--- a/specification/protocol/generate_rest.yaml
+++ b/specification/protocol/generate_rest.yaml
@@ -31,7 +31,7 @@ components:
           maximum: 1
           minimum: 0
           description: An alternative to sampling with temperature, called nucleus sampling, where the model considers the results of the tokens with top_p probability mass. So 0.1 means only the tokens comprising the top 10% probability mass are considered.
-        max_token:
+        max_tokens:
           type: integer
           format: int32
           default: 20

From 04686fa4471259cea12969b74bdd2e2b6f2ee0fb Mon Sep 17 00:00:00 2001
From: Gavrish Prabhu
Date: Tue, 2 Jan 2024 15:16:08 +0530
Subject: [PATCH 06/14] Update generate_rest.yaml

Signed-off-by: Gavrish Prabhu
---
 specification/protocol/generate_rest.yaml | 146 ++++++++++++++++------
 1 file changed, 105 insertions(+), 41 deletions(-)

diff --git a/specification/protocol/generate_rest.yaml b/specification/protocol/generate_rest.yaml
index d7c3fe7..5429544 100644
--- a/specification/protocol/generate_rest.yaml
+++ b/specification/protocol/generate_rest.yaml
@@ -15,51 +15,51 @@ components:
         parameters:
           $ref: '#/components/schemas/GenerateParameters'
     GenerateParameters:
-      type: object
-      properties:
-        parameter:
-          description: Parameters are framework specific. An optional object containing zero or more parameters for this generate request expressed as key/value pairs
-        temperature:
-          type: number
-          format: float
-          default: null
-          minimum: 0
-          description: What sampling temperature to use, between 0 and 2. Higher values like 0.8 will make the output more random, while lower values like 0.2 will make it more focused and deterministic.
-        top_p:
-          type: number
-          format: float
-          maximum: 1
-          minimum: 0
-          description: An alternative to sampling with temperature, called nucleus sampling, where the model considers the results of the tokens with top_p probability mass. So 0.1 means only the tokens comprising the top 10% probability mass are considered.
-        max_tokens:
-          type: integer
-          format: int32
-          default: 20
-          minimum: 0
-          maximum: 512
-          description: The maximum number of tokens to generate in the completion.
-        best_of:
-          type: integer
-          format: int32
-          minimum: 0
-          description: Generates best_of completions server-side and returns the \best\ (the one with the highest log probability per token).
-        stop:
-          type: array
-          items:
-            type: string
-          maxItems: 5
-          description: Up to 5 sequences where the API will stop generating further tokens.
-        frequency_penalty:
-          type: number
-          format: float
-          minimum: 0
-          description: Number between -2.0 and 2.0. Positive values penalize new tokens based on their existing frequency in the text so far, decreasing the model's likelihood to repeat the same line verbatim.
+      allOf:
+      $ref: '#/components/schemas/GenerateParameters'
+      type: object
+      additionalProperties: {}
+      properties:
+        temperature:
+          type: number
+          format: float
+          default: null
+          minimum: 0
+          description: What sampling temperature to use, between 0 and 2. Higher values like 0.8 will make the output more random, while lower values like 0.2 will make it more focused and deterministic.
+        top_p:
+          type: number
+          format: float
+          maximum: 1
+          minimum: 0
+          description: An alternative to sampling with temperature, called nucleus sampling, where the model considers the results of the tokens with top_p probability mass. So 0.1 means only the tokens comprising the top 10% probability mass are considered.
+        max_tokens:
+          type: integer
+          format: int32
+          default: 20
+          minimum: 0
+          maximum: 512
+          description: The maximum number of tokens to generate in the completion.
+        best_of:
+          type: integer
+          format: int32
+          minimum: 0
+          description: Generates best_of completions server-side and returns the \best\ (the one with the highest log probability per token).
+        stop:
+          type: array
+          items:
+            type: string
+          maxItems: 5
+          description: Up to 5 sequences where the API will stop generating further tokens.
+        frequency_penalty:
+          type: number
+          format: float
+          minimum: 0
+          description: Number between -2.0 and 2.0. Positive values penalize new tokens based on their existing frequency in the text so far, decreasing the model's likelihood to repeat the same line verbatim.
     GenerateResponse:
       type: object
       required:
         - text_output
         - model_name
-        - model_version
       properties:
         text_output:
           type: string
@@ -72,7 +72,6 @@ components:
       required:
         - text_output
         - model_name
-        - model_version
         - done
       properties:
         text_output:
@@ -111,6 +110,39 @@ paths:
             application/json:
               schema:
                 $ref: '#/components/schemas/GenerateResponse'
+        '422':
+          description: Input validation error
+          content:
+            application/json:
+              schema:
+                $ref: '#/components/schemas/GenerateErrorResponse'
+              example:
+                error: Input validation error
+        '424':
+          description: Generation Error
+          content:
+            application/json:
+              schema:
+                $ref: '#/components/schemas/GenerateErrorResponse'
+              example:
+                error: Request failed during generation
+        '429':
+          description: Model is overloaded
+          content:
+            application/json:
+              schema:
+                $ref: '#/components/schemas/GenerateErrorResponse'
+              example:
+                error: Model is overloaded
+        '500':
+          description: Incomplete generation
+          content:
+            application/json:
+              schema:
+                $ref: '#/components/schemas/GenerateErrorResponse'
+              example:
+                error: Incomplete generation
+
   /v2/models/${MODEL_NAME}[/versions/${MODEL_VERSION}]/generate_stream:
     post:
       parameters:
@@ -131,3 +163,35 @@ paths:
             text/event-stream:
               schema:
                 $ref: '#/components/schemas/GenerateStreamResponse'
+        '422':
+          description: Input validation error
+          content:
+            text/event-stream:
+              schema:
+                $ref: '#/components/schemas/GenerateErrorResponse'
+              example:
+                error: Input validation error
+        '424':
+          description: Generation Error
+          content:
+            text/event-stream:
+              schema:
+                $ref: '#/components/schemas/GenerateErrorResponse'
+              example:
+                error: Request failed during generation
+        '429':
+          description: Model is overloaded
+          content:
+            text/event-stream:
+              schema:
+                $ref: '#/components/schemas/GenerateErrorResponse'
+              example:
+                error: Model is overloaded
+        '500':
+          description: Incomplete generation
+          content:
+            text/event-stream:
+              schema:
+                $ref: '#/components/schemas/GenerateErrorResponse'
+              example:
+                error: Incomplete generation

From d977938fad72f783930a8c33a1dcc30fd8ef5e86 Mon Sep 17 00:00:00 2001
From: Gavrish Prabhu
Date: Wed, 3 Jan 2024 17:52:31 +0530
Subject: [PATCH 07/14] Update generate_rest.yaml

Signed-off-by: Gavrish Prabhu
---
 specification/protocol/generate_rest.yaml | 15 ++-------------
 1 file changed, 2 insertions(+), 13 deletions(-)

diff --git a/specification/protocol/generate_rest.yaml b/specification/protocol/generate_rest.yaml
index 5429544..6b655f5 100644
--- a/specification/protocol/generate_rest.yaml
+++ b/specification/protocol/generate_rest.yaml
@@ -39,22 +39,12 @@ components:
           minimum: 0
           maximum: 512
           description: The maximum number of tokens to generate in the completion.
-        best_of:
-          type: integer
-          format: int32
-          minimum: 0
-          description: Generates best_of completions server-side and returns the \best\ (the one with the highest log probability per token).
         stop:
           type: array
           items:
            type: string
          maxItems: 5
          description: Up to 5 sequences where the API will stop generating further tokens.
-        frequency_penalty:
-          type: number
-          format: float
-          minimum: 0
-          description: Number between -2.0 and 2.0. Positive values penalize new tokens based on their existing frequency in the text so far, decreasing the model's likelihood to repeat the same line verbatim.
     GenerateResponse:
       type: object
       required:
@@ -72,16 +62,15 @@ components:
       required:
         - text_output
         - model_name
-        - done
       properties:
         text_output:
           type: string
        model_name:
          type: string
        model_version:
          type: string
-      done:
-        type: boolean
+      finish_reason:
+        type: string
     GenerateErrorResponse:
       type: object
       required:

From 50efd57512c49fa0dfc538121e0cf0d3d47094e8 Mon Sep 17 00:00:00 2001
From: Gavrish Prabhu
Date: Tue, 9 Jan 2024 11:45:09 +0530
Subject: [PATCH 08/14] Update generate_rest.yaml

Signed-off-by: Gavrish Prabhu
---
 specification/protocol/generate_rest.yaml | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/specification/protocol/generate_rest.yaml b/specification/protocol/generate_rest.yaml
index 6b655f5..3076e5d 100644
--- a/specification/protocol/generate_rest.yaml
+++ b/specification/protocol/generate_rest.yaml
@@ -23,7 +23,7 @@ components:
         temperature:
           type: number
           format: float
-          default: null
+          default: 1
           minimum: 0
           description: What sampling temperature to use, between 0 and 2. Higher values like 0.8 will make the output more random, while lower values like 0.2 will make it more focused and deterministic.
@@ -36,14 +36,13 @@ components:
           type: integer
           format: int32
           default: 20
-          minimum: 0
+          minimum: 1
           maximum: 512
           description: The maximum number of tokens to generate in the completion.
         stop:
           type: array
           items:
             type: string
-          maxItems: 5
           description: Up to 5 sequences where the API will stop generating further tokens.

From 058b57be6b2ba8cb0903fa74edb61b012756c1cf Mon Sep 17 00:00:00 2001
From: Gavrish Prabhu
Date: Tue, 9 Jan 2024 11:47:45 +0530
Subject: [PATCH 09/14] Update generate_rest.yaml

Signed-off-by: Gavrish Prabhu
---
 specification/protocol/generate_rest.yaml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/specification/protocol/generate_rest.yaml b/specification/protocol/generate_rest.yaml
index 3076e5d..eecc607 100644
--- a/specification/protocol/generate_rest.yaml
+++ b/specification/protocol/generate_rest.yaml
@@ -25,7 +25,7 @@ components:
           format: float
           default: 1
           minimum: 0
-          description: What sampling temperature to use, between 0 and 2. Higher values like 0.8 will make the output more random, while lower values like 0.2 will make it more focused and deterministic.
+          description: What sampling temperature to use, higher values like 0.8 will make the output more random, while lower values like 0.2 will make it more focused and deterministic.
         top_p:
           type: number
           format: float
@@ -43,7 +43,7 @@ components:
           type: array
           items:
             type: string
-          description: Up to 5 sequences where the API will stop generating further tokens.
+          description: Sequences where the API will stop generating further tokens.

From e6977a699a4b9c4c561e64d9cf3e0ac4d2dcbbad Mon Sep 17 00:00:00 2001
From: Gavrish Prabhu
Date: Tue, 9 Jan 2024 12:04:13 +0530
Subject: [PATCH 10/14] Update generate_rest.yaml

Signed-off-by: Gavrish Prabhu
---
 specification/protocol/generate_rest.yaml | 1 -
 1 file changed, 1 deletion(-)

diff --git a/specification/protocol/generate_rest.yaml b/specification/protocol/generate_rest.yaml
index eecc607..6eab662 100644
--- a/specification/protocol/generate_rest.yaml
+++ b/specification/protocol/generate_rest.yaml
@@ -37,7 +37,6 @@ components:
           format: int32
           default: 20
           minimum: 1
-          maximum: 512
           description: The maximum number of tokens to generate in the completion.
         stop:
           type: array

From 24d9129cefcbab8474fba77ae6ad3438d2791273 Mon Sep 17 00:00:00 2001
From: Gavrish Prabhu
Date: Tue, 16 Jan 2024 18:41:57 +0530
Subject: [PATCH 11/14] Update generate_rest.yaml

Signed-off-by: Gavrish Prabhu
---
 specification/protocol/generate_rest.yaml | 112 +++++++++++++++------
 1 file changed, 75 insertions(+), 37 deletions(-)

diff --git a/specification/protocol/generate_rest.yaml b/specification/protocol/generate_rest.yaml
index 6eab662..d80e0c8 100644
--- a/specification/protocol/generate_rest.yaml
+++ b/specification/protocol/generate_rest.yaml
@@ -5,6 +5,40 @@ info:
   version: 1.0.0
 components:
   schemas:
+    GenerateErrorResponse:
+      type: object
+      required:
+        - error
+      properties:
+        error:
+          type: string
+    GenerateParameters:
+      type: object
+      additionalProperties: {}
+      properties:
+        temperature:
+          type: number
+          format: float
+          default: 1
+          minimum: 0
+          description: What sampling temperature to use, higher values like 0.8 will make the output more random, while lower values like 0.2 will make it more focused and deterministic.
- top_p: - type: number - format: float - maximum: 1 - minimum: 0 - description: An alternative to sampling with temperature, called nucleus sampling, where the model considers the results of the tokens with top_p probability mass. So 0.1 means only the tokens comprising the top 10% probability mass are considered. - max_tokens: - type: integer - format: int32 - default: 20 - minimum: 1 - description: The maximum number of tokens to generate in the completion. - stop: - type: array - items: - type: string - description: Sequences where the API will stop generating further tokens. + allOf: + - $ref: '#/components/schemas/GenerateParameters' + logprob: + type: boolean GenerateResponse: type: object required: @@ -55,6 +63,8 @@ components: type: string model_version: type: string + logprobs: + $ref: '#/components/schemas/Logprobs' GenerateStreamResponse: type: object required: @@ -69,18 +79,41 @@ components: type: string finish_reason: type: string - GenerateErrorResponse: + logprobs: + $ref: '#/components/schemas/Logprobs' + Logprobs: + type: array + items: + $ref: '#/components/schemas/Token' + Token: type: object required: - - error + - id + - text + - logprob + - special properties: - error: + id: + type: integer + format: int32 + minimum: 0 + logprob: + type: number + format: float + special: + type: boolean + text: type: string paths: - /v2/models/${MODEL_NAME}[/versions/${MODEL_VERSION}]/generate: + /v2/models/${MODEL_NAME}/versions/${MODEL_VERSION}/generate: post: parameters: - - name: model_name + - name: MODEL_NAME + required: true + in: path + schema: + type: string + - name: MODEL_VERSION required: true in: path schema: @@ -130,10 +163,15 @@ paths: example: error: Incomplete generation - /v2/models/${MODEL_NAME}[/versions/${MODEL_VERSION}]/generate_stream: + /v2/models/${MODEL_NAME}/versions/${MODEL_VERSION}/generate_stream: post: parameters: - - name: model_name + - name: MODEL_NAME + required: true + in: path + schema: + type: string + - name: MODEL_VERSION required: true in: path schema: From 642f018bbe05f0d8aac7adf57627532389eea28b Mon Sep 17 00:00:00 2001 From: Gavrish Prabhu Date: Wed, 17 Jan 2024 19:37:10 +0530 Subject: [PATCH 12/14] Update generate_rest.yaml Signed-off-by: Gavrish Prabhu --- specification/protocol/generate_rest.yaml | 30 +++++++++++++++++------ 1 file changed, 22 insertions(+), 8 deletions(-) diff --git a/specification/protocol/generate_rest.yaml b/specification/protocol/generate_rest.yaml index d80e0c8..244d338 100644 --- a/specification/protocol/generate_rest.yaml +++ b/specification/protocol/generate_rest.yaml @@ -5,6 +5,14 @@ info: version: 1.0.0 components: schemas: + Details: + type: object + additionalProperties: {} + properties: + finish_reason: + type: string + logprobs: + $ref: '#/components/schemas/Logprobs' GenerateErrorResponse: type: object required: @@ -39,6 +47,8 @@ components: items: type: string description: Sequences where the API will stop generating further tokens. 
+        logprob:
+          type: boolean
     GenerateRequest:
       type: object
       required:
@@ -49,8 +59,6 @@ components:
         parameters:
           allOf:
           - $ref: '#/components/schemas/GenerateParameters'
-        logprob:
-          type: boolean
     GenerateResponse:
       type: object
       required:
@@ -63,8 +71,8 @@ components:
           type: string
         model_version:
           type: string
-        logprobs:
-          $ref: '#/components/schemas/Logprobs'
+        details:
+          $ref: '#/components/schemas/Details'
     GenerateStreamResponse:
       type: object
       required:
@@ -77,14 +85,20 @@ components:
           type: string
         model_version:
           type: string
-        finish_reason:
-          type: string
-        logprobs:
-          $ref: '#/components/schemas/Logprobs'
+        details:
+          $ref: '#/components/schemas/StreamDetails'
     Logprobs:
       type: array
       items:
         $ref: '#/components/schemas/Token'
+    StreamDetails:
+      type: object
+      additionalProperties: {}
+      properties:
+        finish_reason:
+          type: string
+        token:
+          $ref: '#/components/schemas/Token'
     Token:
       type: object
       required:

From 5902baebb55dd8122a0cb4223fb93eab1c3fa8f5 Mon Sep 17 00:00:00 2001
From: Gavrish Prabhu
Date: Tue, 23 Jan 2024 16:58:06 +0530
Subject: [PATCH 13/14] Update generate_rest.yaml

Signed-off-by: Gavrish Prabhu
---
 specification/protocol/generate_rest.yaml | 24 +++++++++++++++++++----
 1 file changed, 20 insertions(+), 4 deletions(-)

diff --git a/specification/protocol/generate_rest.yaml b/specification/protocol/generate_rest.yaml
index 244d338..930a754 100644
--- a/specification/protocol/generate_rest.yaml
+++ b/specification/protocol/generate_rest.yaml
@@ -7,12 +7,22 @@ components:
   schemas:
     Details:
       type: object
+      required:
+        - finish_reason
+        - logprobs
       additionalProperties: {}
       properties:
         finish_reason:
-          type: string
+          $ref: '#/components/schemas/Finish_Reason'
         logprobs:
           $ref: '#/components/schemas/Logprobs'
+    Finish_Reason:
+      type: string
+      enum:
+        - length
+        - eos_token
+        - stop_sequence
+      description: The reason the model stopped generating tokens. `length` if number of generated tokens == `max_tokens`. `eos_token` if the model generated its end of sequence token and `stop_sequence` if the model generated a text included in `stop` array
     GenerateErrorResponse:
       type: object
      required:
@@ -47,8 +57,9 @@ components:
           items:
             type: string
           description: Sequences where the API will stop generating further tokens.
-        logprob:
-          type: boolean
+        details:
+          type: boolean
+          description: Flag to request for detailed response body that would include finish_reason and logprobs.
     GenerateRequest:
       type: object
       required:
@@ -91,12 +102,13 @@ components:
       type: array
       items:
         $ref: '#/components/schemas/Token'
+      description: Log probability information for the tokens.
     StreamDetails:
       type: object
       additionalProperties: {}
       properties:
         finish_reason:
-          type: string
+          $ref: '#/components/schemas/Finish_Reason'
         token:
           $ref: '#/components/schemas/Token'
     Token:
       type: object
       required:
@@ -111,13 +123,17 @@ components:
         id:
           type: integer
           format: int32
           minimum: 0
+          description: Id of the token.
         logprob:
           type: number
           format: float
+          description: The log probability of this token.
         special:
           type: boolean
+          description: Describes if the token is a special token. Can be used to ignore tokens when concatenating
         text:
           type: string
+          description: The token text value.
 paths:
   /v2/models/${MODEL_NAME}/versions/${MODEL_VERSION}/generate:
     post:
       parameters:
         - name: MODEL_NAME
           required: true
           in: path
           schema:
             type: string
         - name: MODEL_VERSION
           required: true
           in: path
           schema:
             type: string

From 23fb361dce69463e6fb9454a5be1f9f1b749e637 Mon Sep 17 00:00:00 2001
From: Gavrish Prabhu
Date: Tue, 23 Jan 2024 16:59:21 +0530
Subject: [PATCH 14/14] Update generate_rest.yaml

Signed-off-by: Gavrish Prabhu
---
 specification/protocol/generate_rest.yaml | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/specification/protocol/generate_rest.yaml b/specification/protocol/generate_rest.yaml
index 930a754..68061ee 100644
--- a/specification/protocol/generate_rest.yaml
+++ b/specification/protocol/generate_rest.yaml
@@ -105,6 +105,9 @@ components:
       description: Log probability information for the tokens.
     StreamDetails:
       type: object
+      required:
+        - finish_reason
+        - token
       additionalProperties: {}
       properties:
         finish_reason:
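
---

Editor's note: to make the proposed protocol concrete, the sketch below exercises both endpoints as they stand after PATCH 14: a unary POST to /v2/models/${MODEL_NAME}/versions/${MODEL_VERSION}/generate returning one GenerateResponse, and a streaming POST to .../generate_stream returning GenerateStreamResponse objects over text/event-stream. This is a minimal illustration, not part of the spec: the base URL, model name/version, and the assumption that each SSE `data:` line carries one JSON GenerateStreamResponse are hypothetical, and any HTTP client could stand in for `requests`.

import json
import requests  # assumed available; any HTTP client works

BASE = "http://localhost:8080"    # hypothetical server address
MODEL, VERSION = "my-model", "1"  # hypothetical model name and version

payload = {
    "text_input": "What is the Open Inference Protocol?",
    "parameters": {          # GenerateParameters; additionalProperties allowed
        "temperature": 0.7,  # float >= 0, default 1
        "top_p": 0.9,        # float in [0, 1]
        "max_tokens": 64,    # int >= 1, default 20
        "stop": ["\n\n"],    # sequences where generation stops
        "details": True,     # request finish_reason/logprobs in `details`
    },
}

# Unary endpoint: a single GenerateResponse JSON body.
r = requests.post(f"{BASE}/v2/models/{MODEL}/versions/{VERSION}/generate",
                  json=payload, timeout=60)
r.raise_for_status()  # 422/424/429/500 carry GenerateErrorResponse: {"error": "..."}
body = r.json()
print(body["text_output"])
if "details" in body:  # populated only when the details flag was set
    print("finish_reason:", body["details"].get("finish_reason"))

# Streaming endpoint: text/event-stream; assuming one GenerateStreamResponse
# JSON object per SSE `data:` line.
with requests.post(f"{BASE}/v2/models/{MODEL}/versions/{VERSION}/generate_stream",
                   json=payload, stream=True, timeout=60) as s:
    s.raise_for_status()
    for line in s.iter_lines():
        if line.startswith(b"data:"):
            chunk = json.loads(line[len(b"data:"):])
            print(chunk["text_output"], end="", flush=True)

Error handling mirrors the spec's response codes: 422 (input validation), 424 (generation failure), 429 (model overloaded), and 500 (incomplete generation) all return a GenerateErrorResponse with a single `error` string, so a client can branch on status code and surface `error` directly.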