Commit

initial commit
Sosek1 committed Sep 29, 2024
1 parent 0252bcf commit 16a7bd2
Showing 4 changed files with 160,416 additions and 0 deletions.
125 changes: 125 additions & 0 deletions api.py
@@ -0,0 +1,125 @@

from flask import Flask, request, jsonify

import pandas as pd
import pickle
import json
from embeddings_utils import (
    get_embedding,
    distances_from_embeddings,
    indices_of_nearest_neighbors_from_distances,
)

# Model
EMBEDDING_MODEL = "text-embedding-3-small"

# Dataset
input_json_path = "./ngos_list.json"

with open(input_json_path, 'r', encoding='utf-8') as file:
json_data = json.load(file)
df = pd.DataFrame(json_data)
embedding_cache_path = "recommendations_embeddings_cache.pkl"
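# Load the embedding cache if it exists; otherwise start empty and create the file on disk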
try:
embedding_cache = pd.read_pickle(embedding_cache_path)
except FileNotFoundError:
embedding_cache = {}
with open(embedding_cache_path, "wb") as embedding_cache_file:
pickle.dump(embedding_cache, embedding_cache_file)

def embedding_from_string(
string: str,
model: str = EMBEDDING_MODEL,
embedding_cache=embedding_cache
) -> list:
"""Return embedding of given string, using a cache to avoid recomputing."""
    if (string, model) not in embedding_cache:
embedding_cache[(string, model)] = get_embedding(string, model)
with open(embedding_cache_path, "wb") as embedding_cache_file:
pickle.dump(embedding_cache, embedding_cache_file)
return embedding_cache[(string, model)]
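
# A minimal usage sketch: the cache key is the (string, model) tuple, so a
# repeated call with the same text is served from the pickle-backed cache
# instead of hitting the API ("Pomoc dla seniorów" is a hypothetical string):
#
#   vec = embedding_from_string("Pomoc dla seniorów")
#   len(vec)  # 1536 dimensions for text-embedding-3-small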

def print_recommendations_from_strings(
    strings: list[str],
    index_of_source_string: int,
    k_nearest_neighbors: int = 1,
    model=EMBEDDING_MODEL,
) -> dict:
    """Return a dict mapping the IDs of the k nearest strings to their cosine distances."""
embeddings = [embedding_from_string(string, model=model) for string in strings]
query_embedding = embeddings[index_of_source_string]
distances = distances_from_embeddings(query_embedding, embeddings, distance_metric="cosine")
indices_of_nearest_neighbors = indices_of_nearest_neighbors_from_distances(distances)
query_string = strings[index_of_source_string]
k_counter = 0
nearest_dict = {}

for i in indices_of_nearest_neighbors:
if query_string == strings[i]:
continue
if k_counter >= k_nearest_neighbors:
break
k_counter += 1
nearest_id = df.iloc[i]['ID'] # Assuming 'ID' is the column name in your DataFrame

# Format the distance
distance = f'{distances[i]:0.3f}'

# Store ID and distance in the dictionary
nearest_dict[nearest_id] = distance
return nearest_dict


# Example query used during development
# (Polish: "An elderly, ailing lady in need of social assistance"):
# prompt = 'Starsza schorowana pani potrzebująca pomocy socjalnej'
article_descriptions = df["combined"].tolist()

# nearest_ngos = print_recommendations_from_strings(
#     strings=article_descriptions + [prompt],
#     index_of_source_string=len(article_descriptions),
#     k_nearest_neighbors=5,
# )


app = Flask(__name__)

@app.route('/search', methods=['GET'])
def knn_search():
try:

k = request.args.get('k', type=int)
prompt = request.args.get('prompt', type=str)

if k is None or prompt is None:
return jsonify({"error": "Missing required parameters: 'k' and 'prompt'."}), 400

        # Append the prompt to a copy of the descriptions so repeated
        # requests do not grow the shared list
        strings = article_descriptions + [prompt]
        index_of_source_string = len(strings) - 1  # the index of the prompt

        # Call the recommendation function
        nearest_neighbors = print_recommendations_from_strings(strings, index_of_source_string, k)

response = {
"k_nearest_neighbours": nearest_neighbors
}
return jsonify(response)

except ValueError as ve:
return jsonify({"error": str(ve)}), 400
    except Exception:
return jsonify({"error": "An unexpected error occurred."}), 500


if __name__ == '__main__':
app.run(host='0.0.0.0', port=5000, debug=True)
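
# A hypothetical request against the running server (parameter values are
# examples only):
#
#   curl "http://localhost:5000/search?k=3&prompt=pomoc%20socjalna"
#
# expected response shape:
#
#   {"k_nearest_neighbours": {"12": "0.412", "7": "0.455", "31": "0.489"}}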


258 changes: 258 additions & 0 deletions embeddings_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,258 @@
import textwrap as tr
from typing import List, Optional

import matplotlib.pyplot as plt
import plotly.express as px
from scipy import spatial
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.metrics import average_precision_score, precision_recall_curve

from openai import AsyncOpenAI, OpenAI
import numpy as np
import pandas as pd
from dotenv import load_dotenv
import os

load_dotenv()

api_key = os.getenv('OPENAI_API_KEY')
client = OpenAI(api_key=api_key, max_retries=5)
aclient = AsyncOpenAI(api_key=api_key, max_retries=5)


def get_embedding(text: str, model="text-embedding-3-small", **kwargs) -> List[float]:

# replace newlines, which can negatively affect performance.
text = text.replace("\n", " ")

response = client.embeddings.create(input=[text], model=model, **kwargs)

return response.data[0].embedding


async def aget_embedding(
text: str, model="text-embedding-3-small", **kwargs
) -> List[float]:
# replace newlines, which can negatively affect performance.
text = text.replace("\n", " ")

    response = await aclient.embeddings.create(input=[text], model=model, **kwargs)
    return response.data[0].embedding


def get_embeddings(
list_of_text: List[str], model="text-embedding-3-small", **kwargs
) -> List[List[float]]:
assert len(list_of_text) <= 2048, "The batch size should not be larger than 2048."

# replace newlines, which can negatively affect performance.
list_of_text = [text.replace("\n", " ") for text in list_of_text]

data = client.embeddings.create(input=list_of_text, model=model, **kwargs).data
return [d.embedding for d in data]
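
# Batch sketch (hypothetical inputs; a single request may embed up to 2048
# strings, per the assert above):
#
#   vecs = get_embeddings(["pomoc seniorom", "zbiórka żywności"])
#   len(vecs), len(vecs[0])  # (2, 1536)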


async def aget_embeddings(
list_of_text: List[str], model="text-embedding-3-small", **kwargs
) -> List[List[float]]:
assert len(list_of_text) <= 2048, "The batch size should not be larger than 2048."

# replace newlines, which can negatively affect performance.
list_of_text = [text.replace("\n", " ") for text in list_of_text]

    response = await aclient.embeddings.create(input=list_of_text, model=model, **kwargs)
    return [d.embedding for d in response.data]


def cosine_similarity(a, b):
return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))
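
# Worked example: identical directions score 1.0, orthogonal ones 0.0:
#
#   cosine_similarity([1, 0], [1, 0])  # 1.0
#   cosine_similarity([1, 0], [0, 1])  # 0.0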


def plot_multiclass_precision_recall(
y_score, y_true_untransformed, class_list, classifier_name
):
"""
Precision-Recall plotting for a multiclass problem. It plots average precision-recall, per class precision recall and reference f1 contours.
Code slightly modified, but heavily based on https://scikit-learn.org/stable/auto_examples/model_selection/plot_precision_recall.html
"""
n_classes = len(class_list)
y_true = pd.concat(
[(y_true_untransformed == class_list[i]) for i in range(n_classes)], axis=1
).values

# For each class
precision = dict()
recall = dict()
average_precision = dict()
for i in range(n_classes):
precision[i], recall[i], _ = precision_recall_curve(y_true[:, i], y_score[:, i])
average_precision[i] = average_precision_score(y_true[:, i], y_score[:, i])

# A "micro-average": quantifying score on all classes jointly
precision_micro, recall_micro, _ = precision_recall_curve(
y_true.ravel(), y_score.ravel()
)
average_precision_micro = average_precision_score(y_true, y_score, average="micro")
print(
str(classifier_name)
+ " - Average precision score over all classes: {0:0.2f}".format(
average_precision_micro
)
)

# setup plot details
plt.figure(figsize=(9, 10))
f_scores = np.linspace(0.2, 0.8, num=4)
lines = []
labels = []
for f_score in f_scores:
x = np.linspace(0.01, 1)
y = f_score * x / (2 * x - f_score)
(l,) = plt.plot(x[y >= 0], y[y >= 0], color="gray", alpha=0.2)
plt.annotate("f1={0:0.1f}".format(f_score), xy=(0.9, y[45] + 0.02))

lines.append(l)
labels.append("iso-f1 curves")
(l,) = plt.plot(recall_micro, precision_micro, color="gold", lw=2)
lines.append(l)
    labels.append(
        "average Precision-recall (auprc = {0:0.2f})".format(average_precision_micro)
    )

for i in range(n_classes):
(l,) = plt.plot(recall[i], precision[i], lw=2)
lines.append(l)
        labels.append(
            "Precision-recall for class `{0}` (auprc = {1:0.2f})".format(
                class_list[i], average_precision[i]
            )
        )

fig = plt.gcf()
fig.subplots_adjust(bottom=0.25)
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel("Recall")
plt.ylabel("Precision")
plt.title(f"{classifier_name}: Precision-Recall curve for each class")
plt.legend(lines, labels)


def distances_from_embeddings(
query_embedding: List[float],
embeddings: List[List[float]],
distance_metric="cosine",
) -> List[float]:
"""Return the distances between a query embedding and a list of embeddings."""
distance_metrics = {
"cosine": spatial.distance.cosine,
"L1": spatial.distance.cityblock,
"L2": spatial.distance.euclidean,
"Linf": spatial.distance.chebyshev,
}
distances = [
distance_metrics[distance_metric](query_embedding, embedding)
for embedding in embeddings
]
return distances


def indices_of_nearest_neighbors_from_distances(distances) -> np.ndarray:
"""Return a list of indices of nearest neighbors from a list of distances."""
return np.argsort(distances)
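
# Example: the smallest distance comes first, so index 1 is the nearest:
#
#   indices_of_nearest_neighbors_from_distances([0.3, 0.1, 0.2])
#   # -> array([1, 2, 0])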


def pca_components_from_embeddings(
embeddings: List[List[float]], n_components=2
) -> np.ndarray:
"""Return the PCA components of a list of embeddings."""
pca = PCA(n_components=n_components)
array_of_embeddings = np.array(embeddings)
return pca.fit_transform(array_of_embeddings)


def tsne_components_from_embeddings(
embeddings: List[List[float]], n_components=2, **kwargs
) -> np.ndarray:
"""Returns t-SNE components of a list of embeddings."""
# use better defaults if not specified
if "init" not in kwargs.keys():
kwargs["init"] = "pca"
if "learning_rate" not in kwargs.keys():
kwargs["learning_rate"] = "auto"
tsne = TSNE(n_components=n_components, **kwargs)
array_of_embeddings = np.array(embeddings)
return tsne.fit_transform(array_of_embeddings)


def chart_from_components(
components: np.ndarray,
labels: Optional[List[str]] = None,
strings: Optional[List[str]] = None,
x_title="Component 0",
y_title="Component 1",
mark_size=5,
**kwargs,
):
"""Return an interactive 2D chart of embedding components."""
empty_list = ["" for _ in components]
data = pd.DataFrame(
{
x_title: components[:, 0],
y_title: components[:, 1],
"label": labels if labels else empty_list,
"string": ["<br>".join(tr.wrap(string, width=30)) for string in strings]
if strings
else empty_list,
}
)
chart = px.scatter(
data,
x=x_title,
y=y_title,
color="label" if labels else None,
symbol="label" if labels else None,
hover_data=["string"] if strings else None,
**kwargs,
).update_traces(marker=dict(size=mark_size))
return chart
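
# A sketch of combining the helpers above (embeddings and texts are assumed
# variables; t-SNE's default perplexity of 30 needs more than 30 rows):
#
#   components = tsne_components_from_embeddings(embeddings)
#   chart_from_components(components, strings=texts).show()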


def chart_from_components_3D(
components: np.ndarray,
labels: Optional[List[str]] = None,
strings: Optional[List[str]] = None,
x_title: str = "Component 0",
y_title: str = "Component 1",
z_title: str = "Compontent 2",
mark_size: int = 5,
**kwargs,
):
"""Return an interactive 3D chart of embedding components."""
empty_list = ["" for _ in components]
data = pd.DataFrame(
{
x_title: components[:, 0],
y_title: components[:, 1],
z_title: components[:, 2],
"label": labels if labels else empty_list,
"string": ["<br>".join(tr.wrap(string, width=30)) for string in strings]
if strings
else empty_list,
}
)
chart = px.scatter_3d(
data,
x=x_title,
y=y_title,
z=z_title,
color="label" if labels else None,
symbol="label" if labels else None,
hover_data=["string"] if strings else None,
**kwargs,
).update_traces(marker=dict(size=mark_size))
return chart
