added CSV creation for token labelling

delphi-suite · Feb 22, 2024 · d970d8f · d970d8f
1 parent 50604fe
commit d970d8f
Show file tree

Hide file tree

Showing 3 changed files with 4,170 additions and 43 deletions.
diff --git a/notebooks/token_labelling.ipynb b/notebooks/token_labelling.ipynb
@@ -425,13 +425,14 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "import pandas\n",
-    "import matplotlib.pyplot as plt  # install matplotlib, if necessary"
+    "import pandas as pd\n",
+    "import matplotlib.pyplot as plt  # install matplotlib, if necessary\n",
+    "from tqdm.autonotebook import tqdm"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 6,
+   "execution_count": 8,
    "metadata": {},
    "outputs": [
     {
@@ -643,7 +644,7 @@
        "[5 rows x 21 columns]"
       ]
      },
-     "execution_count": 6,
+     "execution_count": 8,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -657,18 +658,54 @@
     "# label1, label2 ... labelN are the keys of the label dictionary\n",
     "# the values of the label dictionary are the probabilities of the label\n",
     "# here we go:\n",
-    "df = pandas.DataFrame(labelled_token_ids_dict.items(), columns=[\"token_id\", \"label\"])\n",
+    "df = pd.DataFrame(labelled_token_ids_dict.items(), columns=[\"token_id\", \"label\"])\n",
     "# split the label column into multiple columns\n",
-    "df = df.join(pandas.DataFrame(df.pop('label').tolist()))\n",
+    "df = df.join(pd.DataFrame(df.pop('label').tolist()))\n",
     "# Change datatype of columns to float\n",
     "df = df.astype(int)\n",
     "\n",
     "df.head()"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "We perform a **sanity check** to ensure that the DataFrame above was built correctly."
+   ]
+  },
   {
    "cell_type": "code",
-   "execution_count": 7,
+   "execution_count": 52,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "aac7894b3f61477bb96b9818757be9f4",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "0it [00:00, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    }
+   ],
+   "source": [
+    "# Perform sanity check, that the table was created correctly\n",
+    "for (row_index, row_values) in tqdm(df.iterrows()):\n",
+    "    token_id = row_values.iloc[0]\n",
+    "    label_pandas = list(row_values.iloc[1:]) # we exclude the token_id from the columns\n",
+    "    label_dict = list(labelled_token_ids_dict[token_id].values())[:]\n",
+    "    assert label_pandas == label_dict, f\"The dataframes are not equal for row {token_id}\\n{label_pandas}\\n{label_dict}\""
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 53,
    "metadata": {},
    "outputs": [
     {
@@ -697,42 +734,6 @@
     "# rotate x labels\n",
     "_ = plt.xticks(rotation=90)\n"
    ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 48,
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "{'Capitalized': False,\n",
-      " 'Is Adjective': False,\n",
-      " 'Is Adposition': False,\n",
-      " 'Is Adverb': False,\n",
-      " 'Is Auxiliary': False,\n",
-      " 'Is Coordinating conjuction': False,\n",
-      " 'Is Determiner': False,\n",
-      " 'Is Interjunction': False,\n",
-      " 'Is Named Entity': False,\n",
-      " 'Is Noun': True,\n",
-      " 'Is Numeral': False,\n",
-      " 'Is Other': False,\n",
-      " 'Is Particle': False,\n",
-      " 'Is Pronoun': False,\n",
-      " 'Is Proper Noun': False,\n",
-      " 'Is Punctuation': False,\n",
-      " 'Is Subordinating conjuction': False,\n",
-      " 'Is Symbol': False,\n",
-      " 'Is Verb': False,\n",
-      " 'Starts with space': False}\n"
-     ]
-    }
-   ],
-   "source": [
-    "pprint(labelled_token_ids_dict[1000])"
-   ]
   }
  ],
  "metadata": {

diff --git a/scripts/label_all_tokens.py b/scripts/label_all_tokens.py
@@ -2,6 +2,7 @@
 import pickle
 from pathlib import Path
 
+import pandas as pd
 from tqdm.auto import tqdm
 from transformers import AutoTokenizer, PreTrainedTokenizer, PreTrainedTokenizerFast
 
@@ -103,6 +104,34 @@ def main():
     assert labelled_token_ids_dict == pickled
     print(" completed.")
 
+    # ----------- CSV ------------------------
+    print("\nCreating the CSV ...")
+    # Create a pandas dataframe / CSV from the label dict
+    df = pd.DataFrame(labelled_token_ids_dict.items(), columns=["token_id", "label"])
+    # split the label column into multiple columns
+    df = df.join(pd.DataFrame(df.pop("label").tolist()))
+    # Change datatype of columns to int (boolean labels become 0/1)
+    df = df.astype(int)
+
+    print("Sanity check pandas csv ...", end="")
+    # Perform sanity check, that the table was created correctly
+    for row_index, row_values in df.iterrows():
+        token_id = row_values.iloc[0]
+        label_pandas = list(
+            row_values.iloc[1:]
+        )  # we exclude the token_id from the columns
+        label_dict = list(labelled_token_ids_dict[token_id].values())[:]
+        assert (
+            label_pandas == label_dict
+        ), f"The dataframes are not equal for row {token_id}\n{label_pandas}\n{label_dict}"
+    print(" completed.")
+
+    # save the dataframe to a csv
+    filename = "labelled_token_ids_df.csv"
+    filepath = SAVE_DIR / filename
+    df.to_csv(filepath, index=False)
+    print(f"Saved the labelled tokens as CSV to:\n\t{filepath}\n")
+
     print(" END ".center(50, "="))