From a511791c9afd432340465719cd1d7c3fc1042e3f Mon Sep 17 00:00:00 2001 From: paul Date: Tue, 5 Dec 2023 16:17:21 +0100 Subject: [PATCH] Update markdown cell in notebook --- tasks/clustering/LaserClusteringExample.ipynb | 48 +++++++++---------- 1 file changed, 24 insertions(+), 24 deletions(-) diff --git a/tasks/clustering/LaserClusteringExample.ipynb b/tasks/clustering/LaserClusteringExample.ipynb index 6bcd4c43..0bfe5249 100644 --- a/tasks/clustering/LaserClusteringExample.ipynb +++ b/tasks/clustering/LaserClusteringExample.ipynb @@ -11311,7 +11311,7 @@ "source": [ "# Clustering Multilingual Embeddings using LASER\n", "\n", - "In this tutorial, we'll explore the power of Language-Agnostic SEntence Representations ([LASER](https://github.com/facebookresearch/LASER)) for generating multilingual embeddings. We'll then use these embeddings to perform clustering on the [MASSIVE](https://github.com/alexa/massive) dataset. Our goal is to show that LASER embeddings can effectively group texts not only by their thematic content but also across different languages. LASER can encode sentences from multiple languages into a shared embedding space, allowing for cross-lingual understanding and comparison. We'll see how this capability is useful for tasks like multilingual clustering.\n" + "In this tutorial, we'll explore the power of Language-Agnostic SEntence Representations ([LASER](https://github.com/facebookresearch/LASER)) for generating multilingual embeddings. We'll then use these embeddings to perform clustering on the [MASSIVE](https://github.com/alexa/massive) dataset. Our goal is to show that LASER embeddings can effectively group texts not only by their semantic content or meaning but also across different languages. LASER can encode sentences from multiple languages into a shared embedding space, allowing for cross-lingual understanding and comparison. 
We'll see how this capability is useful for tasks like multilingual clustering.\n" ], "metadata": { "id": "EqqG01vB6E9H" @@ -11344,7 +11344,7 @@ }, "outputId": "41b40ab7-a601-407e-cbca-18ce1dc4137e" }, - "execution_count": 1, + "execution_count": null, "outputs": [ { "output_type": "stream", @@ -11385,7 +11385,7 @@ "base_uri": "https://localhost:8080/" } }, - "execution_count": 2, + "execution_count": null, "outputs": [ { "output_type": "stream", @@ -11406,7 +11406,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": null, "metadata": { "id": "vWPUBx6u6A2u" }, @@ -11830,7 +11830,7 @@ }, "outputId": "5848fa25-fe82-4bbf-acdd-e12d4c069d3c" }, - "execution_count": 4, + "execution_count": null, "outputs": [ { "output_type": "display_data", @@ -12315,7 +12315,7 @@ "metadata": { "id": "sn-hexAdkkne" }, - "execution_count": 5, + "execution_count": null, "outputs": [] }, { @@ -12340,7 +12340,7 @@ "id": "nwjzEPUqk4-9", "outputId": "0d30eb56-cccd-4525-ae9b-9641a53a44bf" }, - "execution_count": 6, + "execution_count": null, "outputs": [ { "output_type": "stream", @@ -12388,7 +12388,7 @@ "id": "_skE8eHnkpXV", "outputId": "6676a615-49e7-4428-9f93-d57ad0c0d07d" }, - "execution_count": 7, + "execution_count": null, "outputs": [ { "output_type": "stream", @@ -12421,7 +12421,7 @@ }, "outputId": "2ab966c1-88a5-4bdf-b542-7d79bf5237a8" }, - "execution_count": 8, + "execution_count": null, "outputs": [ { "output_type": "stream", @@ -12441,7 +12441,7 @@ "With our LASER embeddings, we can now apply a clustering algorithm. K-Means is a good starting point for its simplicity and effectiveness:\n", "\n", "## Choice of number of clusters.\n", - "In our case, we have `20` parallel sentences in each of 5 languages, meaning these sentences convey the same meanings or topics in different languages. If each sentence represents a unique topic or thematic content, then ideally, We would expect to see around 20 clusters. 
This is because LASER is designed to map semantically similar sentences to nearby points in the embedding space, regardless of the language." + "In our case, we have `20` parallel sentences in each of 5 languages, meaning these sentences convey the same meanings or topics in different languages. If each sentence represents unique semantic content, then ideally, we would expect to see around 20 clusters. This is because LASER is designed to map semantically similar sentences to nearby points in the embedding space, regardless of the language." ], "metadata": { "id": "qtXbMQ1_zz9y" @@ -12457,7 +12457,7 @@ "metadata": { "id": "kYvltGE_qpXy" }, - "execution_count": 9, + "execution_count": null, "outputs": [] }, { @@ -12481,7 +12481,7 @@ "metadata": { "id": "iYoDifv0lMNl" }, - "execution_count": 10, + "execution_count": null, "outputs": [] }, { @@ -12497,7 +12497,7 @@ "metadata": { "id": "vMS07RohFWnj" }, - "execution_count": 11, + "execution_count": null, "outputs": [] }, { @@ -12525,7 +12525,7 @@ "id": "-tkDQp4YFSpD", "outputId": "52f76d3e-f95c-4ce2-e16e-3411ef920e7d" }, - "execution_count": 12, + "execution_count": null, "outputs": [ { "output_type": "stream", @@ -12569,7 +12569,7 @@ "id": "w7zgwToT2N9N", "outputId": "4c874a19-591a-4906-f70a-97c12f458145" }, - "execution_count": 13, + "execution_count": null, "outputs": [ { "output_type": "stream", @@ -12632,7 +12632,7 @@ "metadata": { "id": "VVft-ZUMEtPP" }, - "execution_count": 14, + "execution_count": null, "outputs": [] }, { @@ -12654,7 +12654,7 @@ "metadata": { "id": "6vu1UTQVmLu9" }, - "execution_count": 15, + "execution_count": null, "outputs": [] }, { @@ -12675,7 +12675,7 @@ "id": "0JG_MA7-lXMW", "outputId": "bc556d12-39d5-4b64-fb1e-d73bba7f5894" }, - "execution_count": 16, + "execution_count": null, "outputs": [ { "output_type": "display_data", @@ -12697,7 +12697,7 @@ "metadata": { "id": "FbKkpx5Eue0o" }, - "execution_count": 17, + "execution_count": null, "outputs": [] }, { @@ -12739,7 +12739,7 @@ "id": 
"mkpQlPP4unaU", "outputId": "53c66cee-c70b-492e-9f1d-bf8f582069b8" }, - "execution_count": 18, + "execution_count": null, "outputs": [ { "output_type": "display_data", @@ -12769,7 +12769,7 @@ "metadata": { "id": "s9JqzeO5HCBK" }, - "execution_count": 20, + "execution_count": null, "outputs": [] }, { @@ -12787,7 +12787,7 @@ "id": "lMzYKIMfFl4s", "outputId": "ab3fe9b9-c226-4cee-f796-5898835a4b0f" }, - "execution_count": 21, + "execution_count": null, "outputs": [ { "output_type": "stream", @@ -12839,7 +12839,7 @@ "id": "3_gTVXAJFsG9", "outputId": "26bb54c7-9486-4635-8599-0235ed551281" }, - "execution_count": 22, + "execution_count": null, "outputs": [ { "output_type": "display_data", @@ -12886,7 +12886,7 @@ "id": "knwAXi4EFwx_", "outputId": "1e23469d-1757-4835-9bef-42e49dcb22d8" }, - "execution_count": 25, + "execution_count": null, "outputs": [ { "output_type": "display_data",