diff --git a/notebooks/eval_notebook.ipynb b/notebooks/eval_notebook.ipynb
index 864a1f1e..daed1f52 100644
--- a/notebooks/eval_notebook.ipynb
+++ b/notebooks/eval_notebook.ipynb
@@ -68,12 +68,12 @@
    "data": {
     "application/vnd.holoviews_exec.v0+json": "",
     "text/html": [
     [rendered holoviews/panel output cell: container div ids regenerated on re-execution; markup not preserved in this excerpt]
\n", " + """ + display(HTML(html_str)) + + +def token_selector( + vocab_map: dict[str, int] +) -> tuple[pn.widgets.MultiChoice, list[int]]: + tokens = list(vocab_map.keys()) + token_selector_ = pn.widgets.MultiChoice(name="Tokens", options=tokens) + token_ids = [vocab_map[token] for token in cast(list[str], token_selector_.value)] + + def update_tokens(event): + token_ids.clear() + token_ids.extend([vocab_map[token] for token in event.new]) + + token_selector_.param.watch(update_tokens, "value") + return token_selector_, token_ids + + +def calc_model_group_stats( + tokenized_corpus_dataset: Dataset, + logprobs_by_dataset: dict[str, torch.Tensor], + selected_tokens: list[int], +) -> dict[str, dict[str, float]]: + """ + For each (model, token group) pair, calculate useful stats (for visualization) + + args: + - tokenized_corpus_dataset: a list of the tokenized corpus datasets, e.g. load_dataset(constants.tokenized_corpus_dataset))["validation"] + - logprob_datasets: a dict of lists of logprobs, e.g. {"llama2": load_dataset("transcendingvictor/llama2-validation-logprobs")["validation"]["logprobs"]} + - selected_tokens: a list of selected token IDs, e.g. [46, 402, ...] + + returns: a dict of model names as keys and stats dict as values + e.g. {"100k": {"mean": -0.5, "median": -0.4, "min": -0.1, "max": -0.9, "25th": -0.3, "75th": -0.7}, ...} + + Stats calculated: mean, median, min, max, 25th percentile, 75th percentile + """ + model_group_stats = {} + for model in logprobs_by_dataset: + model_logprobs = [] + print(f"Processing model {model}") + dataset = logprobs_by_dataset[model] + for ix_doc_lp, document_lps in enumerate(dataset): + tokens = tokenized_corpus_dataset[ix_doc_lp]["tokens"] + for ix_token, token in enumerate(tokens): + if ix_token == 0: # skip the first token, which isn't predicted + continue + logprob = document_lps[ix_token].item() + if token in selected_tokens: + model_logprobs.append(logprob) + + if model_logprobs: + model_group_stats[model] = { + "mean": np.mean(model_logprobs), + "median": np.median(model_logprobs), + "min": np.min(model_logprobs), + "max": np.max(model_logprobs), + "25th": np.percentile(model_logprobs, 25), + "75th": np.percentile(model_logprobs, 75), + } + return model_group_stats + + +def dict_filter_quantile( + d: dict[Any, float], q_start: float, q_end: float +) -> dict[Any, float]: + if not (0 <= q_start < q_end <= 1): + raise ValueError("Invalid quantile range") + q_start_val = np.nanquantile(list(d.values()), q_start) + q_end_val = np.nanquantile(list(d.values()), q_end) + return { + k: v for k, v in d.items() if q_start_val <= v <= q_end_val and not np.isnan(v) + } + + +def get_all_tok_metrics_in_label( + token_ids: Int[torch.Tensor, "prompt pos"], + selected_tokens: list[int], + metrics: torch.Tensor, + q_start: Optional[float] = None, + q_end: Optional[float] = None, +) -> dict[tuple[int, int], float]: + """ + From the token_map, get all the positions of the tokens that have a certain label. + We don't use the token_map because for sampling purposes, iterating through token_ids is more efficient. + Optionally, filter the tokens based on the quantile range of the metrics. + + Args: + - token_ids (Dataset): token_ids dataset e.g. token_ids[0] = {"tokens": [[1, 2, ...], [2, 5, ...], ...]} + - selected_tokens (list[int]): list of token IDs to search for e.g. [46, 402, ...] + - metrics (torch.Tensor): tensor of metrics to search through e.g. 
torch.tensor([[0.1, 0.2, ...], [0.3, 0.4, ...], ...]) + - q_start (float): the start of the quantile range to filter the metrics e.g. 0.1 + - q_end (float): the end of the quantile range to filter the metrics e.g. 0.9 + + Returns: + - tok_positions (dict[tuple[int, int], Number]): dictionary of token positions and their corresponding metrics + """ + + # check if metrics have the same dimensions as token_ids + if metrics.shape != token_ids.shape: + raise ValueError( + f"Expected metrics to have the same shape as token_ids, but got {metrics.shape} and {token_ids.shape} instead." + ) + + tok_positions = {} + for prompt_pos, prompt in enumerate(token_ids.numpy()): + for tok_pos, tok in enumerate(prompt): + if tok in selected_tokens: + tok_positions[(prompt_pos, tok_pos)] = metrics[ + prompt_pos, tok_pos + ].item() + + if q_start is not None and q_end is not None: + tok_positions = dict_filter_quantile(tok_positions, q_start, q_end) + + return tok_positions + + +def visualize_selected_tokens( + input: dict[str | int, tuple[float, float, float]], + log_scale=False, + line_metric="Means", + checkpoint_mode=True, + shade_color="rgba(68, 68, 68, 0.3)", + line_color="rgb(31, 119, 180)", + bar_color="purple", + marker_color="SkyBlue", + background_color="AliceBlue", +) -> go.FigureWidget: + input_x = list(input.keys()) + + def get_hovertexts(mid: np.ndarray, lo: np.ndarray, hi: np.ndarray) -> list[str]: + return [f"Loss: {m:.3f} ({l:.3f}, {h:.3f})" for m, l, h in zip(mid, lo, hi)] + + def get_plot_values() -> tuple[np.ndarray, np.ndarray, np.ndarray]: + x = np.array([input[x] for x in input_x]).T + means, err_lo, err_hi = x[0], x[1], x[2] + return means, err_lo, err_hi + + means, err_lo, err_hi = get_plot_values() + + if checkpoint_mode: + scatter_plot = go.Figure( + [ + go.Scatter( + name="Upper Bound", + x=input_x, + y=means + err_hi, + mode="lines", + marker=dict(color=shade_color), + line=dict(width=0), + showlegend=False, + ), + go.Scatter( + name="Lower Bound", + x=input_x, + y=means - err_lo, + marker=dict(color=shade_color), + line=dict(width=0), + mode="lines", + fillcolor=shade_color, + fill="tonexty", + showlegend=False, + ), + go.Scatter( + name=line_metric, + x=input_x, + y=means, + mode="lines", + marker=dict( + color=line_color, + size=0, + line=dict(color=line_color, width=1), + ), + ), + ] + ) + else: + scatter_plot = go.Scatter( + x=input_x, + y=means, + error_y=dict( + type="data", + symmetric=False, + array=err_hi, + arrayminus=err_lo, + color=bar_color, + ), + marker=dict( + color=marker_color, + size=15, + line=dict(color=line_color, width=2), + ), + hovertext=get_hovertexts(means, err_lo, err_hi), + hoverinfo="text+x", + ) + g = go.FigureWidget( + data=scatter_plot, + layout=go.Layout( + yaxis=dict( + title="Loss", + type="log" if log_scale else "linear", + ), + plot_bgcolor=background_color, + ), + ) + + return g diff --git a/src/delphi/eval/__init__.py b/src/delphi/eval/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/src/delphi/eval/calc_model_group_stats.py b/src/delphi/eval/calc_model_group_stats.py deleted file mode 100644 index faab8a02..00000000 --- a/src/delphi/eval/calc_model_group_stats.py +++ /dev/null @@ -1,48 +0,0 @@ -import numpy as np -import torch -from datasets import Dataset -from jaxtyping import Float - - -def calc_model_group_stats( - tokenized_corpus_dataset: Dataset, - logprobs_by_dataset: dict[str, torch.Tensor], - selected_tokens: list[int], -) -> dict[str, dict[str, float]]: - """ - For each (model, token group) pair, 
calculate useful stats (for visualization) - - args: - - tokenized_corpus_dataset: a list of the tokenized corpus datasets, e.g. load_dataset(constants.tokenized_corpus_dataset))["validation"] - - logprob_datasets: a dict of lists of logprobs, e.g. {"llama2": load_dataset("transcendingvictor/llama2-validation-logprobs")["validation"]["logprobs"]} - - selected_tokens: a list of selected token IDs, e.g. [46, 402, ...] - - returns: a dict of model names as keys and stats dict as values - e.g. {"100k": {"mean": -0.5, "median": -0.4, "min": -0.1, "max": -0.9, "25th": -0.3, "75th": -0.7}, ...} - - Stats calculated: mean, median, min, max, 25th percentile, 75th percentile - """ - model_group_stats = {} - for model in logprobs_by_dataset: - model_logprobs = [] - print(f"Processing model {model}") - dataset = logprobs_by_dataset[model] - for ix_doc_lp, document_lps in enumerate(dataset): - tokens = tokenized_corpus_dataset[ix_doc_lp]["tokens"] - for ix_token, token in enumerate(tokens): - if ix_token == 0: # skip the first token, which isn't predicted - continue - logprob = document_lps[ix_token].item() - if token in selected_tokens: - model_logprobs.append(logprob) - - if model_logprobs: - model_group_stats[model] = { - "mean": np.mean(model_logprobs), - "median": np.median(model_logprobs), - "min": np.min(model_logprobs), - "max": np.max(model_logprobs), - "25th": np.percentile(model_logprobs, 25), - "75th": np.percentile(model_logprobs, 75), - } - return model_group_stats diff --git a/src/delphi/eval/token_positions.py b/src/delphi/eval/token_positions.py deleted file mode 100644 index a98af761..00000000 --- a/src/delphi/eval/token_positions.py +++ /dev/null @@ -1,49 +0,0 @@ -from typing import Optional - -import torch -from jaxtyping import Int - -from delphi.eval.utils import dict_filter_quantile - - -def get_all_tok_metrics_in_label( - token_ids: Int[torch.Tensor, "prompt pos"], - selected_tokens: list[int], - metrics: torch.Tensor, - q_start: Optional[float] = None, - q_end: Optional[float] = None, -) -> dict[tuple[int, int], float]: - """ - From the token_map, get all the positions of the tokens that have a certain label. - We don't use the token_map because for sampling purposes, iterating through token_ids is more efficient. - Optionally, filter the tokens based on the quantile range of the metrics. - - Args: - - token_ids (Dataset): token_ids dataset e.g. token_ids[0] = {"tokens": [[1, 2, ...], [2, 5, ...], ...]} - - selected_tokens (list[int]): list of token IDs to search for e.g. [46, 402, ...] - - metrics (torch.Tensor): tensor of metrics to search through e.g. torch.tensor([[0.1, 0.2, ...], [0.3, 0.4, ...], ...]) - - q_start (float): the start of the quantile range to filter the metrics e.g. 0.1 - - q_end (float): the end of the quantile range to filter the metrics e.g. 0.9 - - Returns: - - tok_positions (dict[tuple[int, int], Number]): dictionary of token positions and their corresponding metrics - """ - - # check if metrics have the same dimensions as token_ids - if metrics.shape != token_ids.shape: - raise ValueError( - f"Expected metrics to have the same shape as token_ids, but got {metrics.shape} and {token_ids.shape} instead." 
- ) - - tok_positions = {} - for prompt_pos, prompt in enumerate(token_ids.numpy()): - for tok_pos, tok in enumerate(prompt): - if tok in selected_tokens: - tok_positions[(prompt_pos, tok_pos)] = metrics[ - prompt_pos, tok_pos - ].item() - - if q_start is not None and q_end is not None: - tok_positions = dict_filter_quantile(tok_positions, q_start, q_end) - - return tok_positions diff --git a/src/delphi/eval/utils.py b/src/delphi/eval/utils.py deleted file mode 100644 index 0026e7a7..00000000 --- a/src/delphi/eval/utils.py +++ /dev/null @@ -1,76 +0,0 @@ -from collections.abc import Callable -from typing import Any - -import numpy as np -import torch -from jaxtyping import Float, Int -from transformers import PreTrainedModel - - -def get_all_logprobs( - model: Callable, input_ids: Int[torch.Tensor, "batch seq"] -) -> Float[torch.Tensor, "batch seq vocab"]: - # batch, seq, vocab - logits = model(input_ids).logits - return torch.log_softmax(logits, dim=-1) - - -# convenience wrapper for calling on a single sample -def get_single_logprobs( - model: Callable, input_ids: Int[torch.Tensor, "seq"] -) -> Float[torch.Tensor, "seq vocab"]: - return get_all_logprobs(model, input_ids.unsqueeze(0))[0] - - -def gather_logprobs( - logprobs: Float[torch.Tensor, "batch seq vocab"], - tokens: Int[torch.Tensor, "batch seq"], -) -> Float[torch.Tensor, "batch seq"]: - return torch.gather(logprobs, -1, tokens.unsqueeze(-1)).squeeze(-1) - - -def get_all_and_next_logprobs( - model: Callable, - input_ids: Int[torch.Tensor, "batch seq"], -) -> tuple[ - Float[torch.Tensor, "batch shorter_seq vocab"], - Float[torch.Tensor, "batch shorter_seq"], -]: - logprobs = get_all_logprobs(model, input_ids[:, :-1]) - next_tokens = input_ids[:, 1:] - return logprobs, gather_logprobs(logprobs, next_tokens) - - -def get_all_and_next_logprobs_single( - model: Callable, - input_ids: Int[torch.Tensor, "seq"], -) -> tuple[ - Float[torch.Tensor, "shorter_seq vocab"], - Float[torch.Tensor, "shorter_seq"], -]: - all_logprobs, next_logprobs = get_all_and_next_logprobs( - model, input_ids.unsqueeze(0) - ) - return all_logprobs[0], next_logprobs[0] - - -def get_next_and_top_k_probs( - model: PreTrainedModel, input_ids: Int[torch.Tensor, "seq"], k: int = 3 -) -> tuple[Float[torch.Tensor, "shorter_seq"], torch.return_types.topk,]: - all_logprobs, next_logprobs = get_all_and_next_logprobs_single(model, input_ids) - all_probs = torch.exp(all_logprobs) - next_probs = torch.exp(next_logprobs) - top_k = torch.topk(all_probs, k, dim=-1) - return next_probs, top_k - - -def dict_filter_quantile( - d: dict[Any, float], q_start: float, q_end: float -) -> dict[Any, float]: - if not (0 <= q_start < q_end <= 1): - raise ValueError("Invalid quantile range") - q_start_val = np.nanquantile(list(d.values()), q_start) - q_end_val = np.nanquantile(list(d.values()), q_end) - return { - k: v for k, v in d.items() if q_start_val <= v <= q_end_val and not np.isnan(v) - } diff --git a/src/delphi/eval/vis.py b/src/delphi/eval/vis.py deleted file mode 100644 index cea76e88..00000000 --- a/src/delphi/eval/vis.py +++ /dev/null @@ -1,192 +0,0 @@ -import math -import random -import uuid -from typing import cast - -import numpy as np -import panel as pn -import torch -from IPython.core.display import HTML -from IPython.core.display_functions import display -from jaxtyping import Float, Int -from transformers import PreTrainedTokenizerBase - - -def single_loss_diff_to_color(loss_diff: float) -> str: - # if loss_diff is negative, we want the color to be red - # if loss_diff is 
positive, we want the color to be green - # if loss_diff is 0, we want the color to be white - # the color should be more intense the larger the absolute value of loss_diff - - def sigmoid(x: float) -> float: - return 1 / (1 + math.exp(-x)) - - scaled_loss_diff = sigmoid(loss_diff) # scale to 0-1 - - if scaled_loss_diff < 0.5: # red - red_val = 255 - green_blue_val = min(int(255 * 2 * scaled_loss_diff), 255) - return f"rgb({red_val}, {green_blue_val}, {green_blue_val})" - else: # green - green_val = 255 - red_blue_val = min(int(255 * 2 * (1 - scaled_loss_diff)), 255) - return f"rgb({red_blue_val}, {green_val}, {red_blue_val})" - - -def to_tok_prob_str(tok: int, prob: float, tokenizer: PreTrainedTokenizerBase) -> str: - tok_str = tokenizer.decode(tok).replace(" ", " ").replace("\n", r"\n") - prob_str = f"{prob:.2%}" - return f"{prob_str:>6} |{tok_str}|" - - -def token_to_html( - token: int, - tokenizer: PreTrainedTokenizerBase, - bg_color: str, - data: dict, - class_name: str = "token", -) -> str: - data = data or {} # equivalent to if not data: data = {} - # non-breakable space, w/o it leading spaces wouldn't be displayed - str_token = tokenizer.decode(token).replace(" ", " ") - - # background or user-select (for \n) goes here - specific_styles = {} - # for now just adds line break or doesn't - br = "" - - if bg_color: - specific_styles["background-color"] = bg_color - if str_token == "\n": - # replace new line character with two characters: \ and n - str_token = r"\n" - # add line break in html - br += "
" - # this is so we can copy the prompt without "\n"s - specific_styles["user-select"] = "none" - str_token = str_token.replace("<", "<").replace(">", ">") - - style_str = data_str = "" - # converting style dict into the style attribute - if specific_styles: - inside_style_str = "; ".join(f"{k}: {v}" for k, v in specific_styles.items()) - style_str = f" style='{inside_style_str}'" - if data: - data_str = "".join( - f" data-{k}='{v.replace(' ', ' ')}'" for k, v in data.items() - ) - return f"
<div class='{class_name}'{style_str}{data_str}>{str_token}</div>
{br}" - - -_token_style = { - "border": "1px solid #888", - "display": "inline-block", - # each character of the same width, so we can easily spot a space - "font-family": "monospace", - "font-size": "14px", - "color": "black", - "background-color": "white", - "margin": "1px 0px 1px 1px", - "padding": "0px 1px 1px 1px", -} -_token_emphasized_style = { - "border": "3px solid #888", - "display": "inline-block", - "font-family": "monospace", - "font-size": "14px", - "color": "black", - "background-color": "white", - "margin": "1px 0px 1px 1px", - "padding": "0px 1px 1px 1px", -} -_token_style_str = " ".join([f"{k}: {v};" for k, v in _token_style.items()]) -_token_emphasized_style_str = " ".join( - [f"{k}: {v};" for k, v in _token_emphasized_style.items()] -) - - -def vis_pos_map( - pos_list: list[tuple[int, int]], - selected_tokens: list[int], - metrics: Float[torch.Tensor, "prompt pos"], - token_ids: Int[torch.Tensor, "prompt pos"], - tokenizer: PreTrainedTokenizerBase, -): - """ - Randomly sample from pos_map and visualize the loss diff at the corresponding position. - """ - - token_htmls = [] - unique_id = str(uuid.uuid4()) - token_class = f"pretoken_{unique_id}" - selected_token_class = f"token_{unique_id}" - hover_div_id = f"hover_info_{unique_id}" - - # choose a random keys from pos_map - key = random.choice(pos_list) - - prompt, pos = key - all_toks = token_ids[prompt][: pos + 1] - - for i in range(all_toks.shape[0]): - token_id = cast(int, all_toks[i].item()) - value = metrics[prompt][i].item() - token_htmls.append( - token_to_html( - token_id, - tokenizer, - bg_color="white" - if np.isnan(value) - else single_loss_diff_to_color(value), - data={"loss-diff": f"{value:.2f}"}, - class_name=token_class - if token_id not in selected_tokens - else selected_token_class, - ) - ) - - # add break line - token_htmls.append("
<br><br>
") - - html_str = f""" - - {"".join(token_htmls)}
- - """ - display(HTML(html_str)) - - -def token_selector( - vocab_map: dict[str, int] -) -> tuple[pn.widgets.MultiChoice, list[int]]: - tokens = list(vocab_map.keys()) - token_selector_ = pn.widgets.MultiChoice(name="Tokens", options=tokens) - token_ids = [vocab_map[token] for token in cast(list[str], token_selector_.value)] - - def update_tokens(event): - token_ids.clear() - token_ids.extend([vocab_map[token] for token in event.new]) - - token_selector_.param.watch(update_tokens, "value") - return token_selector_, token_ids diff --git a/src/delphi/eval/vis_per_token_model.py b/src/delphi/eval/vis_per_token_model.py deleted file mode 100644 index e5d735f4..00000000 --- a/src/delphi/eval/vis_per_token_model.py +++ /dev/null @@ -1,96 +0,0 @@ -from typing import Union - -import numpy as np -import plotly.graph_objects as go - - -def visualize_selected_tokens( - input: dict[Union[str, int], tuple[float, float, float]], - log_scale=False, - line_metric="Means", - checkpoint_mode=True, - shade_color="rgba(68, 68, 68, 0.3)", - line_color="rgb(31, 119, 180)", - bar_color="purple", - marker_color="SkyBlue", - background_color="AliceBlue", -) -> go.FigureWidget: - input_x = list(input.keys()) - - def get_hovertexts(mid: np.ndarray, lo: np.ndarray, hi: np.ndarray) -> list[str]: - return [f"Loss: {m:.3f} ({l:.3f}, {h:.3f})" for m, l, h in zip(mid, lo, hi)] - - def get_plot_values() -> tuple[np.ndarray, np.ndarray, np.ndarray]: - x = np.array([input[x] for x in input_x]).T - means, err_lo, err_hi = x[0], x[1], x[2] - return means, err_lo, err_hi - - means, err_lo, err_hi = get_plot_values() - - if checkpoint_mode: - scatter_plot = go.Figure( - [ - go.Scatter( - name="Upper Bound", - x=input_x, - y=means + err_hi, - mode="lines", - marker=dict(color=shade_color), - line=dict(width=0), - showlegend=False, - ), - go.Scatter( - name="Lower Bound", - x=input_x, - y=means - err_lo, - marker=dict(color=shade_color), - line=dict(width=0), - mode="lines", - fillcolor=shade_color, - fill="tonexty", - showlegend=False, - ), - go.Scatter( - name=line_metric, - x=input_x, - y=means, - mode="lines", - marker=dict( - color=line_color, - size=0, - line=dict(color=line_color, width=1), - ), - ), - ] - ) - else: - scatter_plot = go.Scatter( - x=input_x, - y=means, - error_y=dict( - type="data", - symmetric=False, - array=err_hi, - arrayminus=err_lo, - color=bar_color, - ), - marker=dict( - color=marker_color, - size=15, - line=dict(color=line_color, width=2), - ), - hovertext=get_hovertexts(means, err_lo, err_hi), - hoverinfo="text+x", - ) - g = go.FigureWidget( - data=scatter_plot, - layout=go.Layout( - yaxis=dict( - title="Loss", - type="log" if log_scale else "linear", - ), - plot_bgcolor=background_color, - ), - ) - - return g diff --git a/src/delphi/dataset/tokenization.py b/src/delphi/tokenization.py similarity index 100% rename from src/delphi/dataset/tokenization.py rename to src/delphi/tokenization.py diff --git a/src/delphi/utils.py b/src/delphi/utils.py index 30325cb2..7ffbfc88 100644 --- a/src/delphi/utils.py +++ b/src/delphi/utils.py @@ -1,6 +1,9 @@ +from collections.abc import Callable from typing import cast +import torch from datasets import Dataset, Features, Sequence, Value, load_dataset +from jaxtyping import Float, Int def hf_split_to_split_name(split: str) -> str: @@ -55,3 +58,30 @@ def get_all_hf_branch_names(repo_id: str) -> list[str]: api = HfApi() refs = api.list_repo_refs(repo_id) return [branch.name for branch in refs.branches] + + +def gather_logprobs( + logprobs: Float[torch.Tensor, "batch 
seq vocab"], + tokens: Int[torch.Tensor, "batch seq"], +) -> Float[torch.Tensor, "batch seq"]: + return torch.gather(logprobs, -1, tokens.unsqueeze(-1)).squeeze(-1) + + +def get_all_logprobs( + model: Callable, input_ids: Int[torch.Tensor, "batch seq"] +) -> Float[torch.Tensor, "batch seq vocab"]: + # batch, seq, vocab + logits = model(input_ids).logits + return torch.log_softmax(logits, dim=-1) + + +def get_all_and_next_logprobs( + model: Callable, + input_ids: Int[torch.Tensor, "batch seq"], +) -> tuple[ + Float[torch.Tensor, "batch shorter_seq vocab"], + Float[torch.Tensor, "batch shorter_seq"], +]: + logprobs = get_all_logprobs(model, input_ids[:, :-1]) + next_tokens = input_ids[:, 1:] + return logprobs, gather_logprobs(logprobs, next_tokens) diff --git a/tests/eval/test_token_positions.py b/tests/eval/test_token_positions.py deleted file mode 100644 index c584b931..00000000 --- a/tests/eval/test_token_positions.py +++ /dev/null @@ -1,54 +0,0 @@ -from math import isclose -from typing import cast - -import pytest -from datasets import Dataset - -from delphi.eval.token_positions import * - - -@pytest.fixture -def mock_data(): - token_ids = Dataset.from_dict( - {"tokens": [[1, 2, 3], [4, 5, 6], [7, 8, 9]]} - ).with_format("torch") - selected_tokens = [2, 4, 6, 8] - metrics = torch.tensor([[-1, 0.45, -0.33], [-1.31, 2.3, 0.6], [0.2, 0.8, 0.1]]) - return token_ids, selected_tokens, metrics - - -def test_get_all_tok_metrics_in_label(mock_data): - token_ids, selected_tokens, metrics = mock_data - result = get_all_tok_metrics_in_label( - token_ids["tokens"], - selected_tokens, - metrics, - ) - # key: (prompt_pos, tok_pos), value: logprob - expected = { - (0, 1): 0.45, - (1, 0): -1.31, - (1, 2): 0.6, - (2, 1): 0.8, - } - - # compare keys - assert result.keys() == expected.keys() - # compare values - for k in result: - assert isclose(cast(float, result[k]), expected[k], rel_tol=1e-6) # type: ignore - - # test with quantile filtering - result_q = get_all_tok_metrics_in_label( - token_ids["tokens"], selected_tokens, metrics, q_start=0.6, q_end=1.0 - ) - expected_q = { - (1, 2): 0.6, - (2, 1): 0.8, - } - - # compare keys - assert result_q.keys() == expected_q.keys() - # compare values - for k in result_q: - assert isclose(cast(float, result_q[k]), expected_q[k], rel_tol=1e-6) # type: ignore diff --git a/tests/eval/test_utils_eval.py b/tests/eval/test_utils_eval.py deleted file mode 100644 index 54e0034a..00000000 --- a/tests/eval/test_utils_eval.py +++ /dev/null @@ -1,84 +0,0 @@ -from math import isclose - -import pytest -import torch - -from delphi.eval.utils import dict_filter_quantile, gather_logprobs - - -def test_gather_logprobs(): - # vocab size = 3 - logprobs = torch.tensor( - [ - # batch 0 - [ - # seq 0 - [0.00, 0.01, 0.02], - # seq 1 - [0.10, 0.11, 0.12], - ], - # batch 1 - [ - # seq 0 - [1.00, 1.01, 1.02], - # seq 1 - [1.10, 1.11, 1.12], - ], - ] - ) - tokens = torch.tensor( - [ - # batch 0 - [0, 2], - # batch 1 - [1, 2], - ] - ) - expected_output = torch.tensor( - [ - # batch 0 - [0.00, 0.12], - # batch 1 - [1.01, 1.12], - ] - ) - result = gather_logprobs(logprobs, tokens) - assert torch.allclose(result, expected_output) - - -@pytest.mark.filterwarnings( - "ignore::RuntimeWarning" -) # ignore warnings from numpy empty slice -def test_dict_filter_quantile(): - d = {1: 0.1, 2: 0.2, 3: 0.3, 4: 0.4, 5: 0.5} - result = dict_filter_quantile(d, 0.2, 0.6) - expected = {2: 0.2, 3: 0.3} - - # compare keys - assert result.keys() == expected.keys() - # compare values - for k in result: - assert 
isclose(result[k], expected[k], rel_tol=1e-6) - - # test with negative values - d = {1: -0.1, 2: -0.2, 3: -0.3, 4: -0.4, 5: -0.5} - result = dict_filter_quantile(d, 0.2, 0.6) - expected = {3: -0.3, 4: -0.4} - - # compare keys - assert result.keys() == expected.keys() - # compare values - for k in result: - assert isclose(result[k], expected[k], rel_tol=1e-6) - - # test invalid quantile range - with pytest.raises(ValueError): - dict_filter_quantile(d, 0.6, 0.2) - with pytest.raises(ValueError): - dict_filter_quantile(d, 0.1, 1.1) - with pytest.raises(ValueError): - dict_filter_quantile(d, -0.1, 0.6) - - # test empty dict, will raise a warning - result = dict_filter_quantile({}, 0.2, 0.6) - assert result == {} diff --git a/tests/test_eval.py b/tests/test_eval.py new file mode 100644 index 00000000..cdf88413 --- /dev/null +++ b/tests/test_eval.py @@ -0,0 +1,91 @@ +from math import isclose +from typing import cast + +import pytest +import torch +from datasets import Dataset + +from delphi.eval import dict_filter_quantile, get_all_tok_metrics_in_label + + +@pytest.mark.filterwarnings( + "ignore::RuntimeWarning" +) # ignore warnings from numpy empty slice +def test_dict_filter_quantile(): + d = {1: 0.1, 2: 0.2, 3: 0.3, 4: 0.4, 5: 0.5} + result = dict_filter_quantile(d, 0.2, 0.6) + expected = {2: 0.2, 3: 0.3} + + # compare keys + assert result.keys() == expected.keys() + # compare values + for k in result: + assert isclose(result[k], expected[k], rel_tol=1e-6) + + # test with negative values + d = {1: -0.1, 2: -0.2, 3: -0.3, 4: -0.4, 5: -0.5} + result = dict_filter_quantile(d, 0.2, 0.6) + expected = {3: -0.3, 4: -0.4} + + # compare keys + assert result.keys() == expected.keys() + # compare values + for k in result: + assert isclose(result[k], expected[k], rel_tol=1e-6) + + # test invalid quantile range + with pytest.raises(ValueError): + dict_filter_quantile(d, 0.6, 0.2) + with pytest.raises(ValueError): + dict_filter_quantile(d, 0.1, 1.1) + with pytest.raises(ValueError): + dict_filter_quantile(d, -0.1, 0.6) + + # test empty dict, will raise a warning + result = dict_filter_quantile({}, 0.2, 0.6) + assert result == {} + + +def test_get_all_tok_metrics_in_label(): + token_ids = Dataset.from_dict( + {"tokens": [[1, 2, 3], [4, 5, 6], [7, 8, 9]]} + ).with_format("torch") + selected_tokens = [2, 4, 6, 8] + metrics = torch.tensor([[-1, 0.45, -0.33], [-1.31, 2.3, 0.6], [0.2, 0.8, 0.1]]) + result = get_all_tok_metrics_in_label( + token_ids["tokens"], # type: ignore + selected_tokens, + metrics, + ) + # key: (prompt_pos, tok_pos), value: logprob + expected = { + (0, 1): 0.45, + (1, 0): -1.31, + (1, 2): 0.6, + (2, 1): 0.8, + } + + # compare keys + assert result.keys() == expected.keys() + # compare values + for k in result: + assert isclose(cast(float, result[k]), expected[k], rel_tol=1e-6) # type: ignore + + # test with quantile filtering + result_q = get_all_tok_metrics_in_label( + token_ids["tokens"], # type: ignore + selected_tokens, + metrics, + q_start=0.6, + q_end=1.0, + ) + expected_q = { + (1, 2): 0.6, + (2, 1): 0.8, + } + + # compare keys + assert result_q.keys() == expected_q.keys() + # compare values + for k in result_q: + assert isclose(cast(float, result_q[k]), expected_q[k], rel_tol=1e-6) # type: ignore diff --git a/tests/dataset/test_tokeniation.py b/tests/test_tokeniation.py similarity index 97% rename from tests/dataset/test_tokeniation.py rename to tests/test_tokeniation.py index bb4180ba..cc9494b2 100644 --- a/tests/dataset/test_tokeniation.py +++ b/tests/test_tokeniation.py @@ -5,7 
+5,7 @@ from datasets import Dataset from transformers import AutoTokenizer -from delphi.dataset.tokenization import extend_deque, make_new_sample, tokenize_dataset +from delphi.tokenization import extend_deque, make_new_sample, tokenize_dataset @pytest.fixture diff --git a/tests/test_utils.py b/tests/test_utils.py index 597438ca..79b639ad 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -1,6 +1,13 @@ -from delphi.utils import hf_split_to_split_name +import random +import string -from .utils import random_string +import torch + +from delphi.utils import gather_logprobs, hf_split_to_split_name + + +def random_string(length: int) -> str: + return "".join(random.choices(string.ascii_lowercase, k=length)) def test_hf_split_to_split_name(): @@ -12,3 +19,43 @@ def test_hf_split_to_split_name(): assert hf_split_to_split_name(f"{random_split_name}[:200]") == random_split_name assert hf_split_to_split_name(f"{random_split_name}[200:]") == random_split_name assert hf_split_to_split_name(f"{random_split_name}[200:400]") == random_split_name + + +def test_gather_logprobs(): + # vocab size = 3 + logprobs = torch.tensor( + [ + # batch 0 + [ + # seq 0 + [0.00, 0.01, 0.02], + # seq 1 + [0.10, 0.11, 0.12], + ], + # batch 1 + [ + # seq 0 + [1.00, 1.01, 1.02], + # seq 1 + [1.10, 1.11, 1.12], + ], + ] + ) + tokens = torch.tensor( + [ + # batch 0 + [0, 2], + # batch 1 + [1, 2], + ] + ) + expected_output = torch.tensor( + [ + # batch 0 + [0.00, 0.12], + # batch 1 + [1.01, 1.12], + ] + ) + result = gather_logprobs(logprobs, tokens) + assert torch.allclose(result, expected_output) diff --git a/tests/train/test_train_step.py b/tests/train/test_train_step.py index 9a7f3456..e06fa1af 100644 --- a/tests/train/test_train_step.py +++ b/tests/train/test_train_step.py @@ -8,7 +8,6 @@ from transformers import PreTrainedModel from delphi import TEST_CONFIGS_DIR -from delphi.eval.utils import get_all_and_next_logprobs from delphi.train.config import TrainingConfig from delphi.train.config.utils import build_config_from_files_and_overrides from delphi.train.train_step import accumulate_gradients, train_step @@ -18,6 +17,7 @@ init_model, setup_determinism, ) +from delphi.utils import get_all_and_next_logprobs def load_test_config(preset_name: str) -> TrainingConfig: diff --git a/tests/utils.py b/tests/utils.py deleted file mode 100644 index ed81b58a..00000000 --- a/tests/utils.py +++ /dev/null @@ -1,6 +0,0 @@ -import random -import string - - -def random_string(length: int) -> str: - return "".join(random.choices(string.ascii_lowercase, k=length))
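For reviewers who want to sanity-check the consolidated helpers outside the package, the sketch below restates the two functions exercised by the new tests (gather_logprobs, now in delphi.utils, and dict_filter_quantile, now in delphi.eval) using only torch and numpy, with the same toy data as tests/test_utils.py and tests/test_eval.py. It is an illustration of the intended behavior, not part of this diff.

# Standalone sketch (illustration only, not part of the patch above): mirrors the
# behavior of delphi.utils.gather_logprobs and delphi.eval.dict_filter_quantile
# as exercised by the new tests, without importing the delphi package.
from typing import Any

import numpy as np
import torch


def gather_logprobs(logprobs: torch.Tensor, tokens: torch.Tensor) -> torch.Tensor:
    # Select, at every (batch, seq) position, the logprob assigned to the token
    # that actually occurred there.
    return torch.gather(logprobs, -1, tokens.unsqueeze(-1)).squeeze(-1)


def dict_filter_quantile(
    d: dict[Any, float], q_start: float, q_end: float
) -> dict[Any, float]:
    # Keep entries whose value lies within the [q_start, q_end] quantile band.
    if not (0 <= q_start < q_end <= 1):
        raise ValueError("Invalid quantile range")
    lo = np.nanquantile(list(d.values()), q_start)
    hi = np.nanquantile(list(d.values()), q_end)
    return {k: v for k, v in d.items() if lo <= v <= hi and not np.isnan(v)}


# Toy data copied from the tests added in this diff.
logprobs = torch.tensor(
    [
        [[0.00, 0.01, 0.02], [0.10, 0.11, 0.12]],  # batch 0
        [[1.00, 1.01, 1.02], [1.10, 1.11, 1.12]],  # batch 1
    ]
)
tokens = torch.tensor([[0, 2], [1, 2]])
print(gather_logprobs(logprobs, tokens))  # tensor([[0.0000, 0.1200], [1.0100, 1.1200]])

metrics = {1: 0.1, 2: 0.2, 3: 0.3, 4: 0.4, 5: 0.5}
print(dict_filter_quantile(metrics, 0.2, 0.6))  # {2: 0.2, 3: 0.3}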