From 5f5e4b7b62dff93ee6096dd9a7933b596ed60775 Mon Sep 17 00:00:00 2001
From: Gavrish Prabhu
Date: Tue, 5 Dec 2023 23:03:53 +0530
Subject: [PATCH 01/14] Create generate_rest.yaml

Propose generate rest api endpoints

Signed-off-by: Gavrish Prabhu
---
 specification/protocol/generate_rest.yaml | 130 ++++++++++++++++++++++
 1 file changed, 130 insertions(+)
 create mode 100644 specification/protocol/generate_rest.yaml

diff --git a/specification/protocol/generate_rest.yaml b/specification/protocol/generate_rest.yaml
new file mode 100644
index 0000000..4459eff
--- /dev/null
+++ b/specification/protocol/generate_rest.yaml
@@ -0,0 +1,130 @@
+openapi: 3.1.0
+info:
+  title: Open Inference API for text generation
+  description: Open Inference API for text generation
+  version: 1.0.0
+components:
+  schemas:
+    GenerateRequest:
+      type: object
+      required:
+        - text_input
+      properties:
+        text_input:
+          type: string
+        parameters:
+          $ref: '#/components/schemas/GenerateParameters'
+    GenerateParameters:
+      type: object
+      properties:
+        temperature:
+          type: number
+          "format": "float"
+          "default": "null"
+          "examples": [0.5]
+          minimum: 0
+          description: |-
+            What sampling temperature to use, between 0 and 2. Higher values like 0.8 will make the output more random, while lower values like 0.2 will make it more focused and deterministic.
+        top_p:
+          type: number
+          "format": "float"
+          maximum: 1.0
+          minimum: 0
+          description: |-
+            An alternative to sampling with temperature, called nucleus sampling, where the model considers the results of the tokens with top_p probability mass. So 0.1 means only the tokens comprising the top 10% probability mass are considered.
+        max_token:
+          type: integer
+          "format": "int32"
+          default: 20
+          minimum: 0
+          maximum: 512
+          description: The maximum number of tokens to generate in the completion.
+        best_of:
+          type: integer
+          "format": "int32"
+          minimum: 0
+          examples: [1]
+          description: |-
+            Generates best_of completions server-side and returns the "best" (the one with the highest log probability per token).
+        stop:
+          type: array
+          items:
+            type: string
+          maxItems: 4
+          description: |-
+            Up to 4 sequences where the API will stop generating further tokens.
+        frequency_penalty:
+          type: number
+          format: "float"
+          examples: [1.03]
+          minimum: 0
+          description: |-
+            Number between -2.0 and 2.0. Positive values penalize new tokens based on their existing frequency in the text so far, decreasing the model's likelihood to repeat the same line verbatim.
+    GenerateResponse:
+      type: object
+      required:
+        - text_output
+        - model_name
+        - model_version
+      properties:
+        text_output:
+          type: string
+        model_name:
+          type: string
+        model_version:
+          type: string
+    GenerateStreamResponse:
+      type: object
+      required:
+        - text_output
+        - model_name
+        - model_version
+        - done
+      properties:
+        text_output:
+          type: string
+        model_name:
+          type: string
+        model_version:
+          type: string
+paths:
+  /v2/models/{model_name}[/versions/${MODEL_VERSION}]/generate:
+    post:
+      parameters:
+        - name: model_name
+          required: true
+          in: path
+          schema:
+            type: string
+      requestBody:
+        content:
+          application/json:
+            schema:
+              $ref: '#/components/schemas/GenerateRequest'
+      responses:
+        '200':
+          description: generated text
+          content:
+            application/json:
+              schema:
+                $ref: '#/components/schemas/GenerateResponse'
+  /v2/models/{model_name}[/versions/${MODEL_VERSION}]/generate_stream:
+    post:
+      parameters:
+        - name: model_name
+          required: true
+          in: path
+          schema:
+            type: string
+      requestBody:
+        content:
+          application/json:
+            schema:
+              $ref: '#/components/schemas/GenerateRequest'
+      responses:
+        '200':
+          description: generated text stream
+          content:
+            text/event-stream:
+              schema:
+                $ref: '#/components/schemas/GenerateStreamResponse'

From 19973d4b5108e1756767b8f65a9363b00f866bb4 Mon Sep 17 00:00:00 2001
From: Gavrish Prabhu
Date: Thu, 7 Dec 2023 22:27:03 +0530
Subject: [PATCH 02/14] Update generate_rest.yaml

Signed-off-by: Gavrish Prabhu
---
 specification/protocol/generate_rest.yaml | 49 ++---------------------
 1 file changed, 4 insertions(+), 45 deletions(-)

diff --git a/specification/protocol/generate_rest.yaml b/specification/protocol/generate_rest.yaml
index 4459eff..d608b43 100644
--- a/specification/protocol/generate_rest.yaml
+++ b/specification/protocol/generate_rest.yaml
@@ -17,49 +17,8 @@ components:
     GenerateParameters:
       type: object
       properties:
-        temperature:
-          type: number
-          "format": "float"
-          "default": "null"
-          "examples": [0.5]
-          minimum: 0
-          description: |-
-            What sampling temperature to use, between 0 and 2. Higher values like 0.8 will make the output more random, while lower values like 0.2 will make it more focused and deterministic.
-        top_p:
-          type: number
-          "format": "float"
-          maximum: 1.0
-          minimum: 0
-          description: |-
-            An alternative to sampling with temperature, called nucleus sampling, where the model considers the results of the tokens with top_p probability mass. So 0.1 means only the tokens comprising the top 10% probability mass are considered.
-        max_token:
-          type: integer
-          "format": "int32"
-          default: 20
-          minimum: 0
-          maximum: 512
-          description: The maximum number of tokens to generate in the completion.
-        best_of:
-          type: integer
-          "format": "int32"
-          minimum: 0
-          examples: [1]
-          description: |-
-            Generates best_of completions server-side and returns the "best" (the one with the highest log probability per token).
-        stop:
-          type: array
-          items:
-            type: string
-          maxItems: 4
-          description: |-
-            Up to 4 sequences where the API will stop generating further tokens.
-        frequency_penalty:
-          type: number
-          format: "float"
-          examples: [1.03]
-          minimum: 0
-          description: |-
-            Number between -2.0 and 2.0. Positive values penalize new tokens based on their existing frequency in the text so far, decreasing the model's likelihood to repeat the same line verbatim.
+        parameter:
+          description: Parameters are framework specific. An optional object containing zero or more parameters for this generate request expressed as key/value pairs
     GenerateResponse:
       type: object
       required:
@@ -88,7 +47,7 @@ components:
       model_version:
         type: string
 paths:
-  /v2/models/{model_name}[/versions/${MODEL_VERSION}]/generate:
+  /v2/models/${MODEL_NAME}[/versions/${MODEL_VERSION}]/generate:
     post:
       parameters:
         - name: model_name
@@ -108,7 +67,7 @@ paths:
             application/json:
               schema:
                 $ref: '#/components/schemas/GenerateResponse'
-  /v2/models/{model_name}[/versions/${MODEL_VERSION}]/generate_stream:
+  /v2/models/${MODEL_NAME}[/versions/${MODEL_VERSION}]/generate_stream:
     post:
       parameters:
         - name: model_name

From 725b5ec9cf91d81bd5296f305252fbb6e8379da9 Mon Sep 17 00:00:00 2001
From: Gavrish Prabhu
Date: Tue, 12 Dec 2023 02:23:28 +0530
Subject: [PATCH 03/14] Update generate_rest.yaml

Signed-off-by: Gavrish Prabhu
---
 specification/protocol/generate_rest.yaml | 44 +++++++++++++++++++++++
 1 file changed, 44 insertions(+)

diff --git a/specification/protocol/generate_rest.yaml b/specification/protocol/generate_rest.yaml
index d608b43..e3cbbbf 100644
--- a/specification/protocol/generate_rest.yaml
+++ b/specification/protocol/generate_rest.yaml
@@ -19,6 +19,41 @@ components:
       properties:
         parameter:
           description: Parameters are framework specific. An optional object containing zero or more parameters for this generate request expressed as key/value pairs
+        "temperature":
+          "type": "number"
+          "format": "float"
+          "default": "null"
+          "minimum": 0
+          "description": "What sampling temperature to use, between 0 and 2. Higher values like 0.8 will make the output more random, while lower values like 0.2 will make it more focused and deterministic."
+        "top_p":
+          "type": "number"
+          "format": "float"
+          "maximum": 1
+          "minimum": 0
+          "description": "An alternative to sampling with temperature, called nucleus sampling, where the model considers the results of the tokens with top_p probability mass. So 0.1 means only the tokens comprising the top 10% probability mass are considered."
+        "max_token":
+          "type": "integer"
+          "format": "int32"
+          "default": 20
+          "minimum": 0
+          "maximum": 512
+          "description": "The maximum number of tokens to generate in the completion."
+        "best_of":
+          "type": "integer"
+          "format": "int32"
+          "minimum": 0
+          "description": "Generates best_of completions server-side and returns the \"best\" (the one with the highest log probability per token)."
+        "stop":
+          "type": "array"
+          "items":
+            "type": "string"
+          "maxItems": 5
+          "description": "Up to 5 sequences where the API will stop generating further tokens."
+        "frequency_penalty":
+          "type": "number"
+          "format": "float"
+          "minimum": 0
+          "description": "Number between -2.0 and 2.0. Positive values penalize new tokens based on their existing frequency in the text so far, decreasing the model's likelihood to repeat the same line verbatim."
     GenerateResponse:
       type: object
       required:
@@ -46,6 +81,15 @@ components:
         type: string
       model_version:
         type: string
+      done:
+        type: boolean
+    GenerateErrorResponse:
+      type: object
+      required:
+        - error
+      properties:
+        error:
+          type: string
 paths:
   /v2/models/${MODEL_NAME}[/versions/${MODEL_VERSION}]/generate:
     post:

From 8e729f013a97e16a860c31b3e212e56ae6c337dc Mon Sep 17 00:00:00 2001
From: Gavrish Prabhu
Date: Tue, 12 Dec 2023 02:26:13 +0530
Subject: [PATCH 04/14] Update generate_rest.yaml

Signed-off-by: Gavrish Prabhu
---
 specification/protocol/generate_rest.yaml | 70 +++++++++++------------
 1 file changed, 35 insertions(+), 35 deletions(-)

diff --git a/specification/protocol/generate_rest.yaml b/specification/protocol/generate_rest.yaml
index e3cbbbf..744c33d 100644
--- a/specification/protocol/generate_rest.yaml
+++ b/specification/protocol/generate_rest.yaml
@@ -19,41 +19,41 @@ components:
       properties:
         parameter:
           description: Parameters are framework specific. An optional object containing zero or more parameters for this generate request expressed as key/value pairs
-        "temperature":
-          "type": "number"
-          "format": "float"
-          "default": "null"
-          "minimum": 0
-          "description": "What sampling temperature to use, between 0 and 2. Higher values like 0.8 will make the output more random, while lower values like 0.2 will make it more focused and deterministic."
-        "top_p":
-          "type": "number"
-          "format": "float"
-          "maximum": 1
-          "minimum": 0
-          "description": "An alternative to sampling with temperature, called nucleus sampling, where the model considers the results of the tokens with top_p probability mass. So 0.1 means only the tokens comprising the top 10% probability mass are considered."
-        "max_token":
-          "type": "integer"
-          "format": "int32"
-          "default": 20
-          "minimum": 0
-          "maximum": 512
-          "description": "The maximum number of tokens to generate in the completion."
-        "best_of":
-          "type": "integer"
-          "format": "int32"
-          "minimum": 0
-          "description": "Generates best_of completions server-side and returns the \"best\" (the one with the highest log probability per token)."
-        "stop":
-          "type": "array"
-          "items":
-            "type": "string"
-          "maxItems": 5
-          "description": "Up to 5 sequences where the API will stop generating further tokens."
-        "frequency_penalty":
-          "type": "number"
-          "format": "float"
-          "minimum": 0
-          "description": "Number between -2.0 and 2.0. Positive values penalize new tokens based on their existing frequency in the text so far, decreasing the model's likelihood to repeat the same line verbatim."
+        temperature:
+          type: number
+          format: float
+          default: null
+          minimum: 0
+          description: What sampling temperature to use, between 0 and 2. Higher values like 0.8 will make the output more random, while lower values like 0.2 will make it more focused and deterministic.
+        top_p:
+          type: number
+          format: float
+          maximum: 1
+          minimum: 0
+          description: An alternative to sampling with temperature, called nucleus sampling, where the model considers the results of the tokens with top_p probability mass. So 0.1 means only the tokens comprising the top 10% probability mass are considered.
+        max_token:
+          type: integer
+          format: int32
+          default: 20
+          minimum: 0
+          maximum: 512
+          description: The maximum number of tokens to generate in the completion.
+        best_of:
+          type: integer
+          format: int32
+          minimum: 0
+          description: Generates best_of completions server-side and returns the \best\ (the one with the highest log probability per token).
+        stop:
+          type: array
+          items:
+            type: string
+          maxItems: 5
+          description: Up to 5 sequences where the API will stop generating further tokens.
+        frequency_penalty:
+          type: number
+          format: float
+          minimum: 0
+          description: Number between -2.0 and 2.0. Positive values penalize new tokens based on their existing frequency in the text so far, decreasing the model's likelihood to repeat the same line verbatim.
     GenerateResponse:
       type: object
       required:

From e300fe38a7f6ddaa4afbe05134b1526a8e0e8a40 Mon Sep 17 00:00:00 2001
From: Gavrish Prabhu
Date: Tue, 12 Dec 2023 12:00:02 +0530
Subject: [PATCH 05/14] Update generate_rest.yaml

Signed-off-by: Gavrish Prabhu
---
 specification/protocol/generate_rest.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/specification/protocol/generate_rest.yaml b/specification/protocol/generate_rest.yaml
index 744c33d..d7c3fe7 100644
--- a/specification/protocol/generate_rest.yaml
+++ b/specification/protocol/generate_rest.yaml
@@ -31,7 +31,7 @@ components:
           maximum: 1
           minimum: 0
           description: An alternative to sampling with temperature, called nucleus sampling, where the model considers the results of the tokens with top_p probability mass. So 0.1 means only the tokens comprising the top 10% probability mass are considered.
-        max_token:
+        max_tokens:
           type: integer
           format: int32
           default: 20

From 04686fa4471259cea12969b74bdd2e2b6f2ee0fb Mon Sep 17 00:00:00 2001
From: Gavrish Prabhu
Date: Tue, 2 Jan 2024 15:16:08 +0530
Subject: [PATCH 06/14] Update generate_rest.yaml

Signed-off-by: Gavrish Prabhu
---
 specification/protocol/generate_rest.yaml | 146 ++++++++++++++++------
 1 file changed, 105 insertions(+), 41 deletions(-)

diff --git a/specification/protocol/generate_rest.yaml b/specification/protocol/generate_rest.yaml
index d7c3fe7..5429544 100644
--- a/specification/protocol/generate_rest.yaml
+++ b/specification/protocol/generate_rest.yaml
@@ -15,51 +15,51 @@ components:
         parameters:
           $ref: '#/components/schemas/GenerateParameters'
     GenerateParameters:
-      type: object
-      properties:
-        parameter:
-          description: Parameters are framework specific. An optional object containing zero or more parameters for this generate request expressed as key/value pairs
-        temperature:
-          type: number
-          format: float
-          default: null
-          minimum: 0
-          description: What sampling temperature to use, between 0 and 2. Higher values like 0.8 will make the output more random, while lower values like 0.2 will make it more focused and deterministic.
-        top_p:
-          type: number
-          format: float
-          maximum: 1
-          minimum: 0
-          description: An alternative to sampling with temperature, called nucleus sampling, where the model considers the results of the tokens with top_p probability mass. So 0.1 means only the tokens comprising the top 10% probability mass are considered.
-        max_tokens:
-          type: integer
-          format: int32
-          default: 20
-          minimum: 0
-          maximum: 512
-          description: The maximum number of tokens to generate in the completion.
-        best_of:
-          type: integer
-          format: int32
-          minimum: 0
-          description: Generates best_of completions server-side and returns the \best\ (the one with the highest log probability per token).
-        stop:
-          type: array
-          items:
-            type: string
-          maxItems: 5
-          description: Up to 5 sequences where the API will stop generating further tokens.
-        frequency_penalty:
-          type: number
-          format: float
-          minimum: 0
-          description: Number between -2.0 and 2.0. Positive values penalize new tokens based on their existing frequency in the text so far, decreasing the model's likelihood to repeat the same line verbatim.
+      allOf:
+      $ref: '#/components/schemas/GenerateParameters'
+      type: object
+      additionalProperties: {}
+      properties:
+        temperature:
+          type: number
+          format: float
+          default: null
+          minimum: 0
+          description: What sampling temperature to use, between 0 and 2. Higher values like 0.8 will make the output more random, while lower values like 0.2 will make it more focused and deterministic.
+        top_p:
+          type: number
+          format: float
+          maximum: 1
+          minimum: 0
+          description: An alternative to sampling with temperature, called nucleus sampling, where the model considers the results of the tokens with top_p probability mass. So 0.1 means only the tokens comprising the top 10% probability mass are considered.
+        max_tokens:
+          type: integer
+          format: int32
+          default: 20
+          minimum: 0
+          maximum: 512
+          description: The maximum number of tokens to generate in the completion.
+        best_of:
+          type: integer
+          format: int32
+          minimum: 0
+          description: Generates best_of completions server-side and returns the \best\ (the one with the highest log probability per token).
+        stop:
+          type: array
+          items:
+            type: string
+          maxItems: 5
+          description: Up to 5 sequences where the API will stop generating further tokens.
+        frequency_penalty:
+          type: number
+          format: float
+          minimum: 0
+          description: Number between -2.0 and 2.0. Positive values penalize new tokens based on their existing frequency in the text so far, decreasing the model's likelihood to repeat the same line verbatim.
     GenerateResponse:
       type: object
       required:
         - text_output
         - model_name
-        - model_version
       properties:
         text_output:
           type: string
@@ -72,7 +72,6 @@ components:
       required:
         - text_output
         - model_name
-        - model_version
         - done
       properties:
         text_output:
@@ -111,6 +110,39 @@ paths:
             application/json:
               schema:
                 $ref: '#/components/schemas/GenerateResponse'
+        '422':
+          description: Input validation error
+          content:
+            application/json:
+              schema:
+                $ref: '#/components/schemas/GenerateErrorResponse'
+              example:
+                error: Input validation error
+        '424':
+          description: Generation Error
+          content:
+            application/json:
+              schema:
+                $ref: '#/components/schemas/GenerateErrorResponse'
+              example:
+                error: Request failed during generation
+        '429':
+          description: Model is overloaded
+          content:
+            application/json:
+              schema:
+                $ref: '#/components/schemas/GenerateErrorResponse'
+              example:
+                error: Model is overloaded
+        '500':
+          description: Incomplete generation
+          content:
+            application/json:
+              schema:
+                $ref: '#/components/schemas/GenerateErrorResponse'
+              example:
+                error: Incomplete generation
+
   /v2/models/${MODEL_NAME}[/versions/${MODEL_VERSION}]/generate_stream:
     post:
       parameters:
@@ -131,3 +163,35 @@ paths:
             text/event-stream:
               schema:
                 $ref: '#/components/schemas/GenerateStreamResponse'
+        '422':
+          description: Input validation error
+          content:
+            text/event-stream:
+              schema:
+                $ref: '#/components/schemas/GenerateErrorResponse'
+              example:
+                error: Input validation error
+        '424':
+          description: Generation Error
+          content:
+            text/event-stream:
+              schema:
+                $ref: '#/components/schemas/GenerateErrorResponse'
+              example:
+                error: Request failed during generation
+        '429':
+          description: Model is overloaded
+          content:
+            text/event-stream:
+              schema:
+                $ref: '#/components/schemas/GenerateErrorResponse'
+              example:
+                error: Model is overloaded
+        '500':
+          description: Incomplete generation
+          content:
+            text/event-stream:
+              schema:
+                $ref: '#/components/schemas/GenerateErrorResponse'
+              example:
+                error: Incomplete generation

From d977938fad72f783930a8c33a1dcc30fd8ef5e86 Mon Sep 17 00:00:00 2001
From: Gavrish Prabhu
Date: Wed, 3 Jan 2024 17:52:31 +0530
Subject: [PATCH 07/14] Update generate_rest.yaml

Signed-off-by: Gavrish Prabhu
---
 specification/protocol/generate_rest.yaml | 15 ++-------------
 1 file changed, 2 insertions(+), 13 deletions(-)

diff --git a/specification/protocol/generate_rest.yaml b/specification/protocol/generate_rest.yaml
index 5429544..6b655f5 100644
--- a/specification/protocol/generate_rest.yaml
+++ b/specification/protocol/generate_rest.yaml
@@ -39,22 +39,12 @@ components:
           minimum: 0
           maximum: 512
           description: The maximum number of tokens to generate in the completion.
-        best_of:
-          type: integer
-          format: int32
-          minimum: 0
-          description: Generates best_of completions server-side and returns the \best\ (the one with the highest log probability per token).
         stop:
           type: array
           items:
            type: string
          maxItems: 5
          description: Up to 5 sequences where the API will stop generating further tokens.
-        frequency_penalty:
-          type: number
-          format: float
-          minimum: 0
-          description: Number between -2.0 and 2.0. Positive values penalize new tokens based on their existing frequency in the text so far, decreasing the model's likelihood to repeat the same line verbatim.
     GenerateResponse:
       type: object
       required:
@@ -72,16 +62,15 @@ components:
       required:
         - text_output
         - model_name
-        - done
       properties:
         text_output:
           type: string
        model_name:
          type: string
        model_version:
          type: string
-      done:
-        type: boolean
+      finish_reason:
+        type: string
     GenerateErrorResponse:
       type: object
       required:

From 50efd57512c49fa0dfc538121e0cf0d3d47094e8 Mon Sep 17 00:00:00 2001
From: Gavrish Prabhu
Date: Tue, 9 Jan 2024 11:45:09 +0530
Subject: [PATCH 08/14] Update generate_rest.yaml

Signed-off-by: Gavrish Prabhu
---
 specification/protocol/generate_rest.yaml | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/specification/protocol/generate_rest.yaml b/specification/protocol/generate_rest.yaml
index 6b655f5..3076e5d 100644
--- a/specification/protocol/generate_rest.yaml
+++ b/specification/protocol/generate_rest.yaml
@@ -23,7 +23,7 @@ components:
         temperature:
           type: number
           format: float
-          default: null
+          default: 1
           minimum: 0
           description: What sampling temperature to use, between 0 and 2. Higher values like 0.8 will make the output more random, while lower values like 0.2 will make it more focused and deterministic.
@@ -36,14 +36,13 @@ components:
           type: integer
           format: int32
           default: 20
-          minimum: 0
+          minimum: 1
           maximum: 512
           description: The maximum number of tokens to generate in the completion.
         stop:
           type: array
           items:
             type: string
-          maxItems: 5
           description: Up to 5 sequences where the API will stop generating further tokens.

From 058b57be6b2ba8cb0903fa74edb61b012756c1cf Mon Sep 17 00:00:00 2001
From: Gavrish Prabhu
Date: Tue, 9 Jan 2024 11:47:45 +0530
Subject: [PATCH 09/14] Update generate_rest.yaml

Signed-off-by: Gavrish Prabhu
---
 specification/protocol/generate_rest.yaml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/specification/protocol/generate_rest.yaml b/specification/protocol/generate_rest.yaml
index 3076e5d..eecc607 100644
--- a/specification/protocol/generate_rest.yaml
+++ b/specification/protocol/generate_rest.yaml
@@ -25,7 +25,7 @@ components:
           format: float
           default: 1
           minimum: 0
-          description: What sampling temperature to use, between 0 and 2. Higher values like 0.8 will make the output more random, while lower values like 0.2 will make it more focused and deterministic.
+          description: What sampling temperature to use, higher values like 0.8 will make the output more random, while lower values like 0.2 will make it more focused and deterministic.
         top_p:
           type: number
           format: float
@@ -43,7 +43,7 @@ components:
           type: array
           items:
             type: string
-          description: Up to 5 sequences where the API will stop generating further tokens.
+          description: Sequences where the API will stop generating further tokens.

From e6977a699a4b9c4c561e64d9cf3e0ac4d2dcbbad Mon Sep 17 00:00:00 2001
From: Gavrish Prabhu
Date: Tue, 9 Jan 2024 12:04:13 +0530
Subject: [PATCH 10/14] Update generate_rest.yaml

Signed-off-by: Gavrish Prabhu
---
 specification/protocol/generate_rest.yaml | 1 -
 1 file changed, 1 deletion(-)

diff --git a/specification/protocol/generate_rest.yaml b/specification/protocol/generate_rest.yaml
index eecc607..6eab662 100644
--- a/specification/protocol/generate_rest.yaml
+++ b/specification/protocol/generate_rest.yaml
@@ -37,7 +37,6 @@ components:
           format: int32
           default: 20
           minimum: 1
-          maximum: 512
           description: The maximum number of tokens to generate in the completion.
         stop:
           type: array

From 24d9129cefcbab8474fba77ae6ad3438d2791273 Mon Sep 17 00:00:00 2001
From: Gavrish Prabhu
Date: Tue, 16 Jan 2024 18:41:57 +0530
Subject: [PATCH 11/14] Update generate_rest.yaml

Signed-off-by: Gavrish Prabhu
---
 specification/protocol/generate_rest.yaml | 112 +++++++++++++++------
 1 file changed, 75 insertions(+), 37 deletions(-)

diff --git a/specification/protocol/generate_rest.yaml b/specification/protocol/generate_rest.yaml
index 6eab662..d80e0c8 100644
--- a/specification/protocol/generate_rest.yaml
+++ b/specification/protocol/generate_rest.yaml
@@ -5,6 +5,40 @@ info:
   version: 1.0.0
 components:
   schemas:
+    GenerateErrorResponse:
+      type: object
+      required:
+        - error
+      properties:
+        error:
+          type: string
+    GenerateParameters:
+      type: object
+      additionalProperties: {}
+      properties:
+        temperature:
+          type: number
+          format: float
+          default: 1
+          minimum: 0
+          description: What sampling temperature to use, higher values like 0.8 will make the output more random, while lower values like 0.2 will make it more focused and deterministic.
- top_p: - type: number - format: float - maximum: 1 - minimum: 0 - description: An alternative to sampling with temperature, called nucleus sampling, where the model considers the results of the tokens with top_p probability mass. So 0.1 means only the tokens comprising the top 10% probability mass are considered. - max_tokens: - type: integer - format: int32 - default: 20 - minimum: 1 - description: The maximum number of tokens to generate in the completion. - stop: - type: array - items: - type: string - description: Sequences where the API will stop generating further tokens. + allOf: + - $ref: '#/components/schemas/GenerateParameters' + logprob: + type: boolean GenerateResponse: type: object required: @@ -55,6 +63,8 @@ components: type: string model_version: type: string + logprobs: + $ref: '#/components/schemas/Logprobs' GenerateStreamResponse: type: object required: @@ -69,18 +79,41 @@ components: type: string finish_reason: type: string - GenerateErrorResponse: + logprobs: + $ref: '#/components/schemas/Logprobs' + Logprobs: + type: array + items: + $ref: '#/components/schemas/Token' + Token: type: object required: - - error + - id + - text + - logprob + - special properties: - error: + id: + type: integer + format: int32 + minimum: 0 + logprob: + type: number + format: float + special: + type: boolean + text: type: string paths: - /v2/models/${MODEL_NAME}[/versions/${MODEL_VERSION}]/generate: + /v2/models/${MODEL_NAME}/versions/${MODEL_VERSION}/generate: post: parameters: - - name: model_name + - name: MODEL_NAME + required: true + in: path + schema: + type: string + - name: MODEL_VERSION required: true in: path schema: @@ -130,10 +163,15 @@ paths: example: error: Incomplete generation - /v2/models/${MODEL_NAME}[/versions/${MODEL_VERSION}]/generate_stream: + /v2/models/${MODEL_NAME}/versions/${MODEL_VERSION}/generate_stream: post: parameters: - - name: model_name + - name: MODEL_NAME + required: true + in: path + schema: + type: string + - name: MODEL_VERSION required: true in: path schema: From 642f018bbe05f0d8aac7adf57627532389eea28b Mon Sep 17 00:00:00 2001 From: Gavrish Prabhu Date: Wed, 17 Jan 2024 19:37:10 +0530 Subject: [PATCH 12/14] Update generate_rest.yaml Signed-off-by: Gavrish Prabhu --- specification/protocol/generate_rest.yaml | 30 +++++++++++++++++------ 1 file changed, 22 insertions(+), 8 deletions(-) diff --git a/specification/protocol/generate_rest.yaml b/specification/protocol/generate_rest.yaml index d80e0c8..244d338 100644 --- a/specification/protocol/generate_rest.yaml +++ b/specification/protocol/generate_rest.yaml @@ -5,6 +5,14 @@ info: version: 1.0.0 components: schemas: + Details: + type: object + additionalProperties: {} + properties: + finish_reason: + type: string + logprobs: + $ref: '#/components/schemas/Logprobs' GenerateErrorResponse: type: object required: @@ -39,6 +47,8 @@ components: items: type: string description: Sequences where the API will stop generating further tokens. 
+        logprob:
+          type: boolean
     GenerateRequest:
       type: object
       required:
@@ -49,8 +59,6 @@ components:
         parameters:
           allOf:
           - $ref: '#/components/schemas/GenerateParameters'
-        logprob:
-          type: boolean
     GenerateResponse:
       type: object
       required:
@@ -63,8 +71,8 @@ components:
           type: string
         model_version:
           type: string
-        logprobs:
-          $ref: '#/components/schemas/Logprobs'
+        details:
+          $ref: '#/components/schemas/Details'
     GenerateStreamResponse:
       type: object
       required:
@@ -77,14 +85,20 @@ components:
           type: string
         model_version:
           type: string
-        finish_reason:
-          type: string
-        logprobs:
-          $ref: '#/components/schemas/Logprobs'
+        details:
+          $ref: '#/components/schemas/StreamDetails'
     Logprobs:
       type: array
       items:
         $ref: '#/components/schemas/Token'
+    StreamDetails:
+      type: object
+      additionalProperties: {}
+      properties:
+        finish_reason:
+          type: string
+        token:
+          $ref: '#/components/schemas/Token'
     Token:
       type: object
       required:

From 5902baebb55dd8122a0cb4223fb93eab1c3fa8f5 Mon Sep 17 00:00:00 2001
From: Gavrish Prabhu
Date: Tue, 23 Jan 2024 16:58:06 +0530
Subject: [PATCH 13/14] Update generate_rest.yaml

Signed-off-by: Gavrish Prabhu
---
 specification/protocol/generate_rest.yaml | 24 +++++++++++++++++++----
 1 file changed, 20 insertions(+), 4 deletions(-)

diff --git a/specification/protocol/generate_rest.yaml b/specification/protocol/generate_rest.yaml
index 244d338..930a754 100644
--- a/specification/protocol/generate_rest.yaml
+++ b/specification/protocol/generate_rest.yaml
@@ -7,12 +7,22 @@ components:
   schemas:
     Details:
       type: object
+      required:
+        - finish_reason
+        - logprobs
       additionalProperties: {}
       properties:
         finish_reason:
-          type: string
+          $ref: '#/components/schemas/Finish_Reason'
         logprobs:
           $ref: '#/components/schemas/Logprobs'
+    Finish_Reason:
+      type: string
+      enum:
+        - length
+        - eos_token
+        - stop_sequence
+      description: The reason the model stopped generating tokens. `length` if number of generated tokens == `max_tokens`. `eos_token` if the model generated its end of sequence token and `stop_sequence` if the model generated a text included in `stop` array
     GenerateErrorResponse:
       type: object
      required:
@@ -47,8 +57,9 @@ components:
           items:
             type: string
           description: Sequences where the API will stop generating further tokens.
-        logprob:
-          type: boolean
+        details:
+          type: boolean
+          description: Flag to request for detailed response body that would include finish_reason and logprobs.
     GenerateRequest:
       type: object
       required:
@@ -91,12 +102,13 @@ components:
       type: array
       items:
         $ref: '#/components/schemas/Token'
+      description: Log probability information for the tokens.
     StreamDetails:
       type: object
       additionalProperties: {}
       properties:
         finish_reason:
-          type: string
+          $ref: '#/components/schemas/Finish_Reason'
         token:
           $ref: '#/components/schemas/Token'
     Token:
       type: object
       required:
@@ -111,13 +123,17 @@ components:
         id:
           type: integer
           format: int32
           minimum: 0
+          description: Id of the token.
         logprob:
           type: number
           format: float
+          description: The log probability of this token.
         special:
           type: boolean
+          description: Describes if the token is a special token. Can be used to ignore tokens when concatenating
         text:
           type: string
+          description: The token text value.
 paths:
   /v2/models/${MODEL_NAME}/versions/${MODEL_VERSION}/generate:
     post:
       parameters:
         - name: MODEL_NAME
           required: true
           in: path
           schema:
             type: string
         - name: MODEL_VERSION
           required: true
           in: path
           schema:
             type: string

From 23fb361dce69463e6fb9454a5be1f9f1b749e637 Mon Sep 17 00:00:00 2001
From: Gavrish Prabhu
Date: Tue, 23 Jan 2024 16:59:21 +0530
Subject: [PATCH 14/14] Update generate_rest.yaml

Signed-off-by: Gavrish Prabhu
---
 specification/protocol/generate_rest.yaml | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/specification/protocol/generate_rest.yaml b/specification/protocol/generate_rest.yaml
index 930a754..68061ee 100644
--- a/specification/protocol/generate_rest.yaml
+++ b/specification/protocol/generate_rest.yaml
@@ -105,6 +105,9 @@ components:
       description: Log probability information for the tokens.
     StreamDetails:
       type: object
+      required:
+        - finish_reason
+        - token
       additionalProperties: {}
       properties:
         finish_reason:
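
---

Editor's note: to make the proposed protocol concrete, the sketch below exercises both endpoints as they stand after PATCH 14: a unary POST to /v2/models/${MODEL_NAME}/versions/${MODEL_VERSION}/generate returning one GenerateResponse, and a streaming POST to .../generate_stream returning GenerateStreamResponse objects over text/event-stream. This is a minimal illustration, not part of the spec: the base URL, model name/version, and the assumption that each SSE `data:` line carries one JSON GenerateStreamResponse are hypothetical, and any HTTP client could stand in for `requests`.

import json
import requests  # assumed available; any HTTP client works

BASE = "http://localhost:8080"    # hypothetical server address
MODEL, VERSION = "my-model", "1"  # hypothetical model name and version

payload = {
    "text_input": "What is the Open Inference Protocol?",
    "parameters": {          # GenerateParameters; additionalProperties allowed
        "temperature": 0.7,  # float >= 0, default 1
        "top_p": 0.9,        # float in [0, 1]
        "max_tokens": 64,    # int >= 1, default 20
        "stop": ["\n\n"],    # sequences where generation stops
        "details": True,     # request finish_reason/logprobs in `details`
    },
}

# Unary endpoint: a single GenerateResponse JSON body.
r = requests.post(f"{BASE}/v2/models/{MODEL}/versions/{VERSION}/generate",
                  json=payload, timeout=60)
r.raise_for_status()  # 422/424/429/500 carry GenerateErrorResponse: {"error": "..."}
body = r.json()
print(body["text_output"])
if "details" in body:  # populated only when the details flag was set
    print("finish_reason:", body["details"].get("finish_reason"))

# Streaming endpoint: text/event-stream; assuming one GenerateStreamResponse
# JSON object per SSE `data:` line.
with requests.post(f"{BASE}/v2/models/{MODEL}/versions/{VERSION}/generate_stream",
                   json=payload, stream=True, timeout=60) as s:
    s.raise_for_status()
    for line in s.iter_lines():
        if line.startswith(b"data:"):
            chunk = json.loads(line[len(b"data:"):])
            print(chunk["text_output"], end="", flush=True)

Error handling mirrors the spec's response codes: 422 (input validation), 424 (generation failure), 429 (model overloaded), and 500 (incomplete generation) all return a GenerateErrorResponse with a single `error` string, so a client can branch on status code and surface `error` directly.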