Commit

initial commit
Sosek1 committed Sep 29, 2024
1 parent 0252bcf commit 16a7bd2
Showing 4 changed files with 160,416 additions and 0 deletions.
125 changes: 125 additions & 0 deletions api.py
@@ -0,0 +1,125 @@

from flask import Flask, request, jsonify

import pandas as pd
import pickle
import json
from embeddings_utils import (
    get_embedding,
    distances_from_embeddings,
    indices_of_nearest_neighbors_from_distances,
)

# Model
EMBEDDING_MODEL = "text-embedding-3-small"

# Dataset
input_json_path = "./ngos_list.json"

with open(input_json_path, 'r', encoding='utf-8') as file:
json_data = json.load(file)
df = pd.DataFrame(json_data)
embedding_cache_path = "recommendations_embeddings_cache.pkl"
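# Load the embedding cache if it exists; otherwise start empty and create the file on disk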
try:
embedding_cache = pd.read_pickle(embedding_cache_path)
except FileNotFoundError:
embedding_cache = {}
with open(embedding_cache_path, "wb") as embedding_cache_file:
pickle.dump(embedding_cache, embedding_cache_file)

def embedding_from_string(
string: str,
model: str = EMBEDDING_MODEL,
embedding_cache=embedding_cache
) -> list:
"""Return embedding of given string, using a cache to avoid recomputing."""
    if (string, model) not in embedding_cache:
embedding_cache[(string, model)] = get_embedding(string, model)
with open(embedding_cache_path, "wb") as embedding_cache_file:
pickle.dump(embedding_cache, embedding_cache_file)
return embedding_cache[(string, model)]
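
# A minimal usage sketch: the cache key is the (string, model) tuple, so a
# repeated call with the same text is served from the pickle-backed cache
# instead of hitting the API ("Pomoc dla seniorów" is a hypothetical string):
#
#   vec = embedding_from_string("Pomoc dla seniorów")
#   len(vec)  # 1536 dimensions for text-embedding-3-small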

def print_recommendations_from_strings(
    strings: list[str],
    index_of_source_string: int,
    k_nearest_neighbors: int = 1,
    model=EMBEDDING_MODEL,
) -> dict:
    """Return a dict mapping the IDs of the k nearest strings to their cosine distances."""
embeddings = [embedding_from_string(string, model=model) for string in strings]
query_embedding = embeddings[index_of_source_string]
distances = distances_from_embeddings(query_embedding, embeddings, distance_metric="cosine")
indices_of_nearest_neighbors = indices_of_nearest_neighbors_from_distances(distances)
query_string = strings[index_of_source_string]
k_counter = 0
nearest_dict = {}

for i in indices_of_nearest_neighbors:
if query_string == strings[i]:
continue
if k_counter >= k_nearest_neighbors:
break
k_counter += 1
nearest_id = df.iloc[i]['ID'] # Assuming 'ID' is the column name in your DataFrame

# Format the distance
distance = f'{distances[i]:0.3f}'

# Store ID and distance in the dictionary
nearest_dict[nearest_id] = distance
return nearest_dict


# Example query used during development
# (Polish: "An elderly, ailing lady in need of social assistance"):
# prompt = 'Starsza schorowana pani potrzebująca pomocy socjalnej'
article_descriptions = df["combined"].tolist()

# nearest_ngos = print_recommendations_from_strings(
#     strings=article_descriptions + [prompt],
#     index_of_source_string=len(article_descriptions),
#     k_nearest_neighbors=5,
# )


app = Flask(__name__)

@app.route('/search', methods=['GET'])
def knn_search():
try:

k = request.args.get('k', type=int)
prompt = request.args.get('prompt', type=str)

if k is None or prompt is None:
return jsonify({"error": "Missing required parameters: 'k' and 'prompt'."}), 400

        # Append the prompt to a copy of the descriptions so repeated
        # requests do not grow the shared list
        strings = article_descriptions + [prompt]
        index_of_source_string = len(strings) - 1  # the index of the prompt

        # Call the recommendation function
        nearest_neighbors = print_recommendations_from_strings(strings, index_of_source_string, k)

response = {
"k_nearest_neighbours": nearest_neighbors
}
return jsonify(response)

except ValueError as ve:
return jsonify({"error": str(ve)}), 400
    except Exception:
return jsonify({"error": "An unexpected error occurred."}), 500


if __name__ == '__main__':
app.run(host='0.0.0.0', port=5000, debug=True)
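
# A hypothetical request against the running server (parameter values are
# examples only):
#
#   curl "http://localhost:5000/search?k=3&prompt=pomoc%20socjalna"
#
# expected response shape:
#
#   {"k_nearest_neighbours": {"12": "0.412", "7": "0.455", "31": "0.489"}}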


258 changes: 258 additions & 0 deletions embeddings_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,258 @@
import textwrap as tr
from typing import List, Optional

import matplotlib.pyplot as plt
import plotly.express as px
from scipy import spatial
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.metrics import average_precision_score, precision_recall_curve

from openai import AsyncOpenAI, OpenAI
import numpy as np
import pandas as pd
from dotenv import load_dotenv
import os

load_dotenv()

api_key = os.getenv('OPENAI_API_KEY')
client = OpenAI(api_key=api_key, max_retries=5)
aclient = AsyncOpenAI(api_key=api_key, max_retries=5)


def get_embedding(text: str, model="text-embedding-3-small", **kwargs) -> List[float]:

# replace newlines, which can negatively affect performance.
text = text.replace("\n", " ")

response = client.embeddings.create(input=[text], model=model, **kwargs)

return response.data[0].embedding


async def aget_embedding(
text: str, model="text-embedding-3-small", **kwargs
) -> List[float]:
# replace newlines, which can negatively affect performance.
text = text.replace("\n", " ")

    response = await aclient.embeddings.create(input=[text], model=model, **kwargs)
    return response.data[0].embedding


def get_embeddings(
list_of_text: List[str], model="text-embedding-3-small", **kwargs
) -> List[List[float]]:
assert len(list_of_text) <= 2048, "The batch size should not be larger than 2048."

# replace newlines, which can negatively affect performance.
list_of_text = [text.replace("\n", " ") for text in list_of_text]

data = client.embeddings.create(input=list_of_text, model=model, **kwargs).data
return [d.embedding for d in data]
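
# Batch sketch (hypothetical inputs; a single request may embed up to 2048
# strings, per the assert above):
#
#   vecs = get_embeddings(["pomoc seniorom", "zbiórka żywności"])
#   len(vecs), len(vecs[0])  # (2, 1536)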


async def aget_embeddings(
list_of_text: List[str], model="text-embedding-3-small", **kwargs
) -> List[List[float]]:
assert len(list_of_text) <= 2048, "The batch size should not be larger than 2048."

# replace newlines, which can negatively affect performance.
list_of_text = [text.replace("\n", " ") for text in list_of_text]

    response = await aclient.embeddings.create(input=list_of_text, model=model, **kwargs)
    return [d.embedding for d in response.data]


def cosine_similarity(a, b):
return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))
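
# Worked example: identical directions score 1.0, orthogonal ones 0.0:
#
#   cosine_similarity([1, 0], [1, 0])  # 1.0
#   cosine_similarity([1, 0], [0, 1])  # 0.0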


def plot_multiclass_precision_recall(
y_score, y_true_untransformed, class_list, classifier_name
):
"""
Precision-Recall plotting for a multiclass problem. It plots average precision-recall, per class precision recall and reference f1 contours.
Code slightly modified, but heavily based on https://scikit-learn.org/stable/auto_examples/model_selection/plot_precision_recall.html
"""
n_classes = len(class_list)
y_true = pd.concat(
[(y_true_untransformed == class_list[i]) for i in range(n_classes)], axis=1
).values

# For each class
precision = dict()
recall = dict()
average_precision = dict()
for i in range(n_classes):
precision[i], recall[i], _ = precision_recall_curve(y_true[:, i], y_score[:, i])
average_precision[i] = average_precision_score(y_true[:, i], y_score[:, i])

# A "micro-average": quantifying score on all classes jointly
precision_micro, recall_micro, _ = precision_recall_curve(
y_true.ravel(), y_score.ravel()
)
average_precision_micro = average_precision_score(y_true, y_score, average="micro")
print(
str(classifier_name)
+ " - Average precision score over all classes: {0:0.2f}".format(
average_precision_micro
)
)

# setup plot details
plt.figure(figsize=(9, 10))
f_scores = np.linspace(0.2, 0.8, num=4)
lines = []
labels = []
for f_score in f_scores:
x = np.linspace(0.01, 1)
y = f_score * x / (2 * x - f_score)
(l,) = plt.plot(x[y >= 0], y[y >= 0], color="gray", alpha=0.2)
plt.annotate("f1={0:0.1f}".format(f_score), xy=(0.9, y[45] + 0.02))

lines.append(l)
labels.append("iso-f1 curves")
(l,) = plt.plot(recall_micro, precision_micro, color="gold", lw=2)
lines.append(l)
    labels.append(
        "average Precision-recall (auprc = {0:0.2f})".format(average_precision_micro)
    )

for i in range(n_classes):
(l,) = plt.plot(recall[i], precision[i], lw=2)
lines.append(l)
        labels.append(
            "Precision-recall for class `{0}` (auprc = {1:0.2f})".format(
                class_list[i], average_precision[i]
            )
        )

fig = plt.gcf()
fig.subplots_adjust(bottom=0.25)
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel("Recall")
plt.ylabel("Precision")
plt.title(f"{classifier_name}: Precision-Recall curve for each class")
plt.legend(lines, labels)


def distances_from_embeddings(
query_embedding: List[float],
embeddings: List[List[float]],
distance_metric="cosine",
) -> List[float]:
"""Return the distances between a query embedding and a list of embeddings."""
distance_metrics = {
"cosine": spatial.distance.cosine,
"L1": spatial.distance.cityblock,
"L2": spatial.distance.euclidean,
"Linf": spatial.distance.chebyshev,
}
distances = [
distance_metrics[distance_metric](query_embedding, embedding)
for embedding in embeddings
]
return distances


def indices_of_nearest_neighbors_from_distances(distances) -> np.ndarray:
"""Return a list of indices of nearest neighbors from a list of distances."""
return np.argsort(distances)
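
# Example: the smallest distance comes first, so index 1 is the nearest:
#
#   indices_of_nearest_neighbors_from_distances([0.3, 0.1, 0.2])
#   # -> array([1, 2, 0])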


def pca_components_from_embeddings(
embeddings: List[List[float]], n_components=2
) -> np.ndarray:
"""Return the PCA components of a list of embeddings."""
pca = PCA(n_components=n_components)
array_of_embeddings = np.array(embeddings)
return pca.fit_transform(array_of_embeddings)


def tsne_components_from_embeddings(
embeddings: List[List[float]], n_components=2, **kwargs
) -> np.ndarray:
"""Returns t-SNE components of a list of embeddings."""
# use better defaults if not specified
if "init" not in kwargs.keys():
kwargs["init"] = "pca"
if "learning_rate" not in kwargs.keys():
kwargs["learning_rate"] = "auto"
tsne = TSNE(n_components=n_components, **kwargs)
array_of_embeddings = np.array(embeddings)
return tsne.fit_transform(array_of_embeddings)


def chart_from_components(
components: np.ndarray,
labels: Optional[List[str]] = None,
strings: Optional[List[str]] = None,
x_title="Component 0",
y_title="Component 1",
mark_size=5,
**kwargs,
):
"""Return an interactive 2D chart of embedding components."""
empty_list = ["" for _ in components]
data = pd.DataFrame(
{
x_title: components[:, 0],
y_title: components[:, 1],
"label": labels if labels else empty_list,
"string": ["<br>".join(tr.wrap(string, width=30)) for string in strings]
if strings
else empty_list,
}
)
chart = px.scatter(
data,
x=x_title,
y=y_title,
color="label" if labels else None,
symbol="label" if labels else None,
hover_data=["string"] if strings else None,
**kwargs,
).update_traces(marker=dict(size=mark_size))
return chart
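
# A sketch of combining the helpers above (embeddings and texts are assumed
# variables; t-SNE's default perplexity of 30 needs more than 30 rows):
#
#   components = tsne_components_from_embeddings(embeddings)
#   chart_from_components(components, strings=texts).show()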


def chart_from_components_3D(
components: np.ndarray,
labels: Optional[List[str]] = None,
strings: Optional[List[str]] = None,
x_title: str = "Component 0",
y_title: str = "Component 1",
z_title: str = "Compontent 2",
mark_size: int = 5,
**kwargs,
):
"""Return an interactive 3D chart of embedding components."""
empty_list = ["" for _ in components]
data = pd.DataFrame(
{
x_title: components[:, 0],
y_title: components[:, 1],
z_title: components[:, 2],
"label": labels if labels else empty_list,
"string": ["<br>".join(tr.wrap(string, width=30)) for string in strings]
if strings
else empty_list,
}
)
chart = px.scatter_3d(
data,
x=x_title,
y=y_title,
z=z_title,
color="label" if labels else None,
symbol="label" if labels else None,
hover_data=["string"] if strings else None,
**kwargs,
).update_traces(marker=dict(size=mark_size))
return chart
