Skip to content

Commit

Permalink
added new embedding models
Browse files (browse the repository at this point in the history)
  • Loading branch information
jrcastropy committed Feb 21, 2024
1 parent bfffb9b commit cd7cad8
Show file tree
Hide file tree
Showing 4 changed files with 61 additions and 25 deletions.
18 changes: 12 additions & 6 deletions serverless_openai/apis.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -290,16 +290,22 @@ def vision_longimage(
def embeddings(
self,
prompt: EmbeddingPrompts,
model: EmbeddingModels = EmbeddingModels.ada2,
model: EmbeddingModels = EmbeddingModels.te_ada2,
dimensions: int = 1536,
tries: int = 5,
timeout: int = 500,
) -> OpenAIResults:
data = {
"model": model,
"input": prompt,
"dimensions": dimensions
}
if model == "text-embedding-ada-002":
data = {
"model": model,
"input": prompt,
}
else:
data = {
"model": model,
"input": prompt,
"dimensions": dimensions
}

res = {}
for _ in range(tries):
Expand Down
26 changes: 15 additions & 11 deletions serverless_openai/helpers.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -64,7 +64,9 @@ class Config:
arbitrary_types_allowed = True

class EmbeddingModels(str, ExtendedEnum):
ada2 : str = "text-embedding-ada-002"
te_ada2 : str = "text-embedding-ada-002"
te3_small: str = "text-embedding-3-small"
te3_large: str = "text-embedding-3-large"

class EmbeddingPrompts(BaseModel):
prompt: Union[str, List[str]]
Expand Down Expand Up @@ -154,29 +156,31 @@ def crop_image(

def cosine_similarity(
sim: Similarity,
data_list: List[str],
get_scores: bool = False
) -> list:
data_list: List[str]
) -> dict:
vector = np.array(sim.vector)
matrix = np.array(sim.matrix)
scores = (np.sum(vector*matrix,axis=1) / ( np.sqrt(np.sum(matrix**2,axis=1)) * np.sqrt(np.sum(vector**2)) ) )
if get_scores:
return scores
result_list = get_similarity_result(scores, data_list)
return result_list
res_dict = get_similarity_result(scores, data_list)
return res_dict

def get_similarity_result(
scores: list,
all_data: list,
topn: int = 5
) -> list:
) -> dict:
idx = (-scores).argsort()
result_list = []
scores_llist = []
for ix in idx[:topn]:
p = all_data[ix]
scr = scores[ix]
score = scores[ix]
scores_llist.append(score)
result_list.append(p)
return result_list
return {
"result_list": result_list,
"scores_llist": scores_llist
}

def get_token_count(
text: str
Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,6 @@
from setuptools import setup, find_packages

VERSION = '1.3.1'
VERSION = '1.3.2'
DESCRIPTION = "A package for using Openai in serverless environment"
LONG_DESCRIPTION = 'A package for using Openai with scraping and etc. in serverless application such as AWS Lambda and GCP Cloud Function'

Expand Down
40 changes: 33 additions & 7 deletions test.ipynb
Original file line number | Diff line number | Diff line change
Expand Up @@ -457,7 +457,7 @@
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": 2,
"metadata": {},
"outputs": [
{
Expand All @@ -466,14 +466,27 @@
"text": [
"# of embeddings for the first vector: 1\n",
"# of embeddings for the second vector: 4\n",
"Results: ['Black cat', 'white Cat', 'Red Cat', 'apple']\n"
"Results: {\n",
" \"result_list\": [\n",
" \"Black cat\",\n",
" \"white Cat\",\n",
" \"Red Cat\",\n",
" \"apple\"\n",
" ],\n",
" \"scores_llist\": [\n",
" 1.0,\n",
" 0.9070401864448968,\n",
" 0.9044700016527735,\n",
" 0.79191801054189\n",
" ]\n",
"}\n"
]
}
],
"source": [
"from serverless_openai import OpenAIAPI, Similarity\n",
"from serverless_openai.helpers import cosine_similarity\n",
"import os\n",
"import os, json\n",
"\n",
"# Initialize OpenAIAPI with API Key\n",
"openai_api_key = os.getenv(\"openai_api_key\")\n",
Expand All @@ -493,7 +506,7 @@
"\n",
" sim = Similarity(vector=res1.result, matrix=res2.result)\n",
" result_list = cosine_similarity(sim, prompt_list)\n",
" print(\"Results:\", result_list)"
" print(\"Results:\", json.dumps(result_list, indent=4))"
]
},
{
Expand All @@ -505,7 +518,7 @@
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": 3,
"metadata": {},
"outputs": [
{
Expand All @@ -515,7 +528,20 @@
"512\n",
"# of embeddings for the first vector: 1\n",
"# of embeddings for the second vector: 4\n",
"Results: ['Black cat', 'white Cat', 'Red Cat', 'apple']\n"
"Results: {\n",
" \"result_list\": [\n",
" \"Black cat\",\n",
" \"white Cat\",\n",
" \"Red Cat\",\n",
" \"apple\"\n",
" ],\n",
" \"scores_llist\": [\n",
" 0.9999999999999999,\n",
" 0.7667772992427558,\n",
" 0.7071684587870847,\n",
" 0.2702694570913964\n",
" ]\n",
"}\n"
]
}
],
Expand Down Expand Up @@ -546,7 +572,7 @@
"\n",
" sim = Similarity(vector=res1.result, matrix=res2.result)\n",
" result_list = cosine_similarity(sim, prompt_list)\n",
" print(\"Results:\", result_list)"
" print(\"Results:\", json.dumps(result_list, indent=4))"
]
},
{
Expand Down

0 comments on commit cd7cad8

Please sign in to comment.