From cd7cad8f2c51a9d7c54c3f71f4a31292357d43cd Mon Sep 17 00:00:00 2001 From: Jayr Date: Wed, 21 Feb 2024 13:11:57 +0800 Subject: [PATCH] added new embedding models --- serverless_openai/apis.py | 18 ++++++++++------ serverless_openai/helpers.py | 26 +++++++++++++---------- setup.py | 2 +- test.ipynb | 40 +++++++++++++++++++++++++++++------- 4 files changed, 61 insertions(+), 25 deletions(-) diff --git a/serverless_openai/apis.py b/serverless_openai/apis.py index b7ba346..37b70f6 100644 --- a/serverless_openai/apis.py +++ b/serverless_openai/apis.py @@ -290,16 +290,22 @@ def vision_longimage( def embeddings( self, prompt: EmbeddingPrompts, - model: EmbeddingModels = EmbeddingModels.ada2, + model: EmbeddingModels = EmbeddingModels.te_ada2, dimensions: int = 1536, tries: int = 5, timeout: int = 500, ) -> OpenAIResults: - data = { - "model": model, - "input": prompt, - "dimensions": dimensions - } + if model == "text-embedding-ada-002": + data = { + "model": model, + "input": prompt, + } + else: + data = { + "model": model, + "input": prompt, + "dimensions": dimensions + } res = {} for _ in range(tries): diff --git a/serverless_openai/helpers.py b/serverless_openai/helpers.py index a0a3c04..573558d 100644 --- a/serverless_openai/helpers.py +++ b/serverless_openai/helpers.py @@ -64,7 +64,9 @@ class Config: arbitrary_types_allowed = True class EmbeddingModels(str, ExtendedEnum): - ada2 : str = "text-embedding-ada-002" + te_ada2 : str = "text-embedding-ada-002" + te3_small: str = "text-embedding-3-small" + te3_large: str = "text-embedding-3-large" class EmbeddingPrompts(BaseModel): prompt: Union[str, List[str]] @@ -154,29 +156,31 @@ def crop_image( def cosine_similarity( sim: Similarity, - data_list: List[str], - get_scores: bool = False - ) -> list: + data_list: List[str] + ) -> dict: vector = np.array(sim.vector) matrix = np.array(sim.matrix) scores = (np.sum(vector*matrix,axis=1) / ( np.sqrt(np.sum(matrix**2,axis=1)) * np.sqrt(np.sum(vector**2)) ) ) - if get_scores: - return scores - result_list = get_similarity_result(scores, data_list) - return result_list + res_dict = get_similarity_result(scores, data_list) + return res_dict def get_similarity_result( scores: list, all_data: list, topn: int = 5 - ) -> list: + ) -> dict: idx = (-scores).argsort() result_list = [] + scores_llist = [] for ix in idx[:topn]: p = all_data[ix] - scr = scores[ix] + score = scores[ix] + scores_llist.append(score) result_list.append(p) - return result_list + return { + "result_list": result_list, + "scores_llist": scores_llist + } def get_token_count( text: str diff --git a/setup.py b/setup.py index 519067d..ff9428a 100644 --- a/setup.py +++ b/setup.py @@ -1,6 +1,6 @@ from setuptools import setup, find_packages -VERSION = '1.3.1' +VERSION = '1.3.2' DESCRIPTION = "A package for using Openai in serverless environment" LONG_DESCRIPTION = 'A package for using Openai with scraping and etc. in serverless application such as AWS Lambda and GCP Cloud Function' diff --git a/test.ipynb b/test.ipynb index 71dea12..753bb52 100644 --- a/test.ipynb +++ b/test.ipynb @@ -457,7 +457,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 2, "metadata": {}, "outputs": [ { @@ -466,14 +466,27 @@ "text": [ "# of embeddings for the first vector: 1\n", "# of embeddings for the second vector: 4\n", - "Results: ['Black cat', 'white Cat', 'Red Cat', 'apple']\n" + "Results: {\n", + " \"result_list\": [\n", + " \"Black cat\",\n", + " \"white Cat\",\n", + " \"Red Cat\",\n", + " \"apple\"\n", + " ],\n", + " \"scores_llist\": [\n", + " 1.0,\n", + " 0.9070401864448968,\n", + " 0.9044700016527735,\n", + " 0.79191801054189\n", + " ]\n", + "}\n" ] } ], "source": [ "from serverless_openai import OpenAIAPI, Similarity\n", "from serverless_openai.helpers import cosine_similarity\n", - "import os\n", + "import os, json\n", "\n", "# Initialize OpenAIAPI with API Key\n", "openai_api_key = os.getenv(\"openai_api_key\")\n", @@ -493,7 +506,7 @@ "\n", " sim = Similarity(vector=res1.result, matrix=res2.result)\n", " result_list = cosine_similarity(sim, prompt_list)\n", - " print(\"Results:\", result_list)" + " print(\"Results:\", json.dumps(result_list, indent=4))" ] }, { @@ -505,7 +518,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 3, "metadata": {}, "outputs": [ { @@ -515,7 +528,20 @@ "512\n", "# of embeddings for the first vector: 1\n", "# of embeddings for the second vector: 4\n", - "Results: ['Black cat', 'white Cat', 'Red Cat', 'apple']\n" + "Results: {\n", + " \"result_list\": [\n", + " \"Black cat\",\n", + " \"white Cat\",\n", + " \"Red Cat\",\n", + " \"apple\"\n", + " ],\n", + " \"scores_llist\": [\n", + " 0.9999999999999999,\n", + " 0.7667772992427558,\n", + " 0.7071684587870847,\n", + " 0.2702694570913964\n", + " ]\n", + "}\n" ] } ], @@ -546,7 +572,7 @@ "\n", " sim = Similarity(vector=res1.result, matrix=res2.result)\n", " result_list = cosine_similarity(sim, prompt_list)\n", - " print(\"Results:\", result_list)" + " print(\"Results:\", json.dumps(result_list, indent=4))" ] }, {