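Added new embedding models (text-embedding-3-small, text-embedding-3-large); cosine_similarity now returns a dict with the top results and their scores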

jrcastropy · Feb 21, 2024 · cd7cad8
1 parent bfffb9b
commit cd7cad8
Show file tree

Hide file tree

Showing 4 changed files with 61 additions and 25 deletions.
diff --git a/serverless_openai/apis.py b/serverless_openai/apis.py
@@ -290,16 +290,22 @@ def vision_longimage(
     def embeddings(
             self,
             prompt: EmbeddingPrompts,
-            model: EmbeddingModels = EmbeddingModels.ada2,
+            model: EmbeddingModels = EmbeddingModels.te_ada2,
             dimensions: int = 1536,
             tries: int = 5,
             timeout: int = 500,
         ) -> OpenAIResults:
-        data = {
-            "model": model,
-            "input": prompt,
-            "dimensions": dimensions
-        }
+        if model == "text-embedding-ada-002":
+            data = {
+                "model": model,
+                "input": prompt,
+            }
+        else:
+            data = {
+                "model": model,
+                "input": prompt,
+                "dimensions": dimensions
+            }
 
         res = {}
         for _ in range(tries):

diff --git a/serverless_openai/helpers.py b/serverless_openai/helpers.py
@@ -64,7 +64,9 @@ class Config:
         arbitrary_types_allowed = True
 
 class EmbeddingModels(str, ExtendedEnum):
-    ada2 : str = "text-embedding-ada-002"
+    te_ada2 : str = "text-embedding-ada-002"
+    te3_small: str = "text-embedding-3-small"
+    te3_large: str = "text-embedding-3-large"
 
 class EmbeddingPrompts(BaseModel):
     prompt: Union[str, List[str]]
@@ -154,29 +156,31 @@ def crop_image(
 
 def cosine_similarity(
         sim: Similarity,
-        data_list: List[str],
-        get_scores: bool = False
-    ) -> list:
+        data_list: List[str]
+    ) -> dict:
     vector = np.array(sim.vector)
     matrix = np.array(sim.matrix)
     scores = (np.sum(vector*matrix,axis=1) / ( np.sqrt(np.sum(matrix**2,axis=1)) * np.sqrt(np.sum(vector**2)) ) )
-    if get_scores:
-        return scores
-    result_list = get_similarity_result(scores, data_list)
-    return result_list
+    res_dict = get_similarity_result(scores, data_list)
+    return res_dict
 
 def get_similarity_result(
         scores: list, 
         all_data: list, 
         topn: int = 5
-    ) -> list:
+    ) -> dict:
     idx = (-scores).argsort()
     result_list = []
+    scores_llist = []
     for ix in idx[:topn]:
         p = all_data[ix]
-        scr = scores[ix]
+        score = scores[ix]
+        scores_llist.append(score)
         result_list.append(p)
-    return result_list
+    return {
+        "result_list": result_list,
+        "scores_llist": scores_llist
+    }
 
 def get_token_count(
         text: str

diff --git a/setup.py b/setup.py
@@ -1,6 +1,6 @@
 from setuptools import setup, find_packages
 
-VERSION = '1.3.1'
+VERSION = '1.3.2'
 DESCRIPTION = "A package for using Openai in serverless environment"
 LONG_DESCRIPTION = 'A package for using Openai with scraping and etc. in serverless application such as AWS Lambda and GCP Cloud Function'
 

diff --git a/test.ipynb b/test.ipynb
@@ -457,7 +457,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": 2,
    "metadata": {},
    "outputs": [
     {
@@ -466,14 +466,27 @@
      "text": [
       "# of embeddings for the first vector: 1\n",
       "# of embeddings for the second vector: 4\n",
-      "Results: ['Black cat', 'white Cat', 'Red Cat', 'apple']\n"
+      "Results: {\n",
+      "    \"result_list\": [\n",
+      "        \"Black cat\",\n",
+      "        \"white Cat\",\n",
+      "        \"Red Cat\",\n",
+      "        \"apple\"\n",
+      "    ],\n",
+      "    \"scores_llist\": [\n",
+      "        1.0,\n",
+      "        0.9070401864448968,\n",
+      "        0.9044700016527735,\n",
+      "        0.79191801054189\n",
+      "    ]\n",
+      "}\n"
      ]
     }
    ],
    "source": [
     "from serverless_openai import OpenAIAPI, Similarity\n",
     "from serverless_openai.helpers import cosine_similarity\n",
-    "import os\n",
+    "import os, json\n",
     "\n",
     "# Initialize OpenAIAPI with API Key\n",
     "openai_api_key = os.getenv(\"openai_api_key\")\n",
@@ -493,7 +506,7 @@
     "\n",
     "    sim = Similarity(vector=res1.result, matrix=res2.result)\n",
     "    result_list = cosine_similarity(sim, prompt_list)\n",
-    "    print(\"Results:\", result_list)"
+    "    print(\"Results:\", json.dumps(result_list, indent=4))"
    ]
   },
   {
@@ -505,7 +518,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": 3,
    "metadata": {},
    "outputs": [
     {
@@ -515,7 +528,20 @@
       "512\n",
       "# of embeddings for the first vector: 1\n",
       "# of embeddings for the second vector: 4\n",
-      "Results: ['Black cat', 'white Cat', 'Red Cat', 'apple']\n"
+      "Results: {\n",
+      "    \"result_list\": [\n",
+      "        \"Black cat\",\n",
+      "        \"white Cat\",\n",
+      "        \"Red Cat\",\n",
+      "        \"apple\"\n",
+      "    ],\n",
+      "    \"scores_llist\": [\n",
+      "        0.9999999999999999,\n",
+      "        0.7667772992427558,\n",
+      "        0.7071684587870847,\n",
+      "        0.2702694570913964\n",
+      "    ]\n",
+      "}\n"
      ]
     }
    ],
@@ -546,7 +572,7 @@
     "\n",
     "    sim = Similarity(vector=res1.result, matrix=res2.result)\n",
     "    result_list = cosine_similarity(sim, prompt_list)\n",
-    "    print(\"Results:\", result_list)"
+    "    print(\"Results:\", json.dumps(result_list, indent=4))"
    ]
   },
   {