Merge branch 'training-script' of https://github.com/delphi-suite/delphi into training-script
SrGonao committed Mar 8, 2024
2 parents e61f483 + e542237 commit cb653cb
Showing 50 changed files with 2,976 additions and 332 deletions.
13 changes: 11 additions & 2 deletions .github/workflows/checks.yml
@@ -6,7 +6,7 @@ on:
- main
pull_request:
branches:
- '*'
- "*"

permissions:
actions: write
@@ -17,10 +17,19 @@ jobs:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
with:
submodules: recursive
- name: setup python
uses: actions/setup-python@v5
with:
python-version: "3.10"
cache: "pip"
- name: cache models and datasets
uses: actions/cache@v3
with:
path: |
~/.cache/huggingface
key: ${{ runner.os }}-huggingface-cache-v1 # increment this key to invalidate the cache when new models/datasets are added
- name: dependencies
run: |
python -m pip install --upgrade pip
@@ -31,4 +40,4 @@ jobs:
- name: isort
run: isort --check .
- name: pytest
run: pytest
run: pytest
17 changes: 17 additions & 0 deletions .gitignore
@@ -6,6 +6,11 @@ __pycache__/
# C extensions
*.so

bin
include
lib64
pyvenv.cfg

# Distribution / packaging
.Python
build/
@@ -158,3 +163,15 @@ cython_debug/
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/

# ignore wandb files
**/wandb/*
**/*.wandb
**/wandb-summary.json
**/wandb-metadata.json

# scratch notebook
notebooks/scratch.ipynb

# dsstore
.DS_Store
4 changes: 2 additions & 2 deletions .gitmodules
@@ -1,3 +1,3 @@
[submodule "src/delphi/train/llama2c"]
path = src/delphi/train/llama2c
[submodule "src/llama2c"]
path = src/llama2c
url = https://github.com/delphi-suite/llama2.c.git
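
This hunk renames the submodule section and path from `src/delphi/train/llama2c` to `src/llama2c`. Since `.gitmodules` is INI-style, the updated entry can be sanity-checked with Python's `configparser`. A standalone sketch, not part of the commit; the file contents below are reproduced from the diff:

```python
import configparser

# Updated .gitmodules entry as it appears after this commit.
gitmodules_text = """
[submodule "src/llama2c"]
path = src/llama2c
url = https://github.com/delphi-suite/llama2.c.git
"""

parser = configparser.ConfigParser()
parser.read_string(gitmodules_text)

# Section names keep git's quoted-submodule form verbatim.
section = parser['submodule "src/llama2c"']
print(section["path"])  # src/llama2c
print(section["url"])   # https://github.com/delphi-suite/llama2.c.git
```

After editing `.gitmodules` like this, `git submodule sync` propagates the change into the local repository config.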
1 change: 0 additions & 1 deletion .pre-commit-config.yaml
@@ -8,4 +8,3 @@ repos:
rev: 5.13.2
hooks:
- id: isort
name: isort (python)
1 change: 1 addition & 0 deletions .vscode/settings.json
@@ -7,4 +7,5 @@
"source.organizeImports": "explicit"
},
"python.analysis.typeCheckingMode": "basic",
"black-formatter.importStrategy": "fromEnvironment",
}
58 changes: 35 additions & 23 deletions README.md
@@ -1,13 +1,24 @@
# Delphi

Interpreting Small Language Models Across Time and Scale

# setup
1. make python 3.10 virtual env in `.venv`
2. install dependencies `pip install -r requirements.txt`
3. install the project in editable state `pip install -e .`
4. run tests `pytest`
# Setup

1. Clone this repo and submodules: `git clone https://github.com/delphi-suite/delphi.git --recurse-submodules`
2. make python 3.10 virtual env in `.venv`
3. install dependencies `pip install -r requirements.txt`
4. install the project in editable state `pip install -e .`
5. run tests `pytest`

## Submodule Setup
If you cloned without `--recurse-submodules`, you can still install the submodules later with:
```bash
git submodule init
git submodule update
```

# Formatting

# formatting
We're using black & isort to format the code. To make sure your changes adhere to the rules:

1. follow setup instructions above
@@ -16,24 +27,25 @@ We're using black & isort to format the code. To make sure your changes adhere t

When you save a file vscode should automatically format it. Otherwise, pre-commit will do that, but you will need to add the changes and commit again.

# pull requests
# Pull Requests

1. make a branch
- if it relates to an existing issue
- go to the issue page and click *Create a branch* under *Development*
- if the default name is not very long, keep it; otherwise, make it shorter, but keep the issue number in the front
- otherwise pick a short but descriptive name, a few hyphen-separated-words
- if it relates to an existing issue
- go to the issue page and click _Create a branch_ under _Development_
- if the default name is not very long, keep it; otherwise, make it shorter, but keep the issue number in the front
- otherwise pick a short but descriptive name, a few hyphen-separated-words
2. make your changes
- include unit tests
- update README if needed
- include unit tests
- update README if needed
- if new huggingface datasets/models are added to testing, increment the cache number in `.github/workflows/checks.yml`
3. make a pull request
- if it isn't ready for review yet, mark it as draft
- check if CI is passing
- if the change is big, try to keep the commit history clean using interactive rebase
- don't push more often than it's needed, we're running github actions on a free tier
- if there were any changes to the main branch, rebase on top of it
- explain the change
- provide short description; focus on things that were not mentioned in the relevant issue
- comment important sections of the code in *Files changed* tab
- when it's ready, add the relevant stakeholders as reviewers
4. after the comments are resolved and PR is approved, merge it using *Squash and merge*
- if it isn't ready for review yet, mark it as draft
- check if CI is passing
- if the change is big, try to keep the commit history clean using interactive rebase
- don't push more often than it's needed, we're running github actions on a free tier
- if there were any changes to the main branch, rebase on top of it
- explain the change
- provide short description; focus on things that were not mentioned in the relevant issue
- comment important sections of the code in _Files changed_ tab
- when it's ready, add the relevant stakeholders as reviewers
4. after the comments are resolved and PR is approved, merge it using _Squash and merge_
133 changes: 133 additions & 0 deletions notebooks/end2end_demo.ipynb
@@ -0,0 +1,133 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Imports"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"from typing import cast\n",
"import pickle\n",
"from collections import defaultdict\n",
"\n",
"from datasets import load_dataset, Dataset\n",
"\n",
"from delphi.constants import STATIC_ASSETS_DIR\n",
"from delphi.eval import utils\n",
"from delphi.eval import constants\n",
"from delphi.eval.vis_per_token_model import visualize_per_token_category\n",
"\n",
"# from delphi.eval.calc_model_group_stats import calc_model_group_stats\n",
"from delphi.eval.token_labelling import TOKEN_LABELS"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Data"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"# load data\n",
"tokenized_corpus_dataset = cast(Dataset, load_dataset(constants.tokenized_corpus_dataset))[\"validation\"]\n",
"\n",
"# TODO: convert to use static paths\n",
"# with open(\"../src/delphi/eval/labelled_token_ids_dict.pkl\", \"rb\") as f:\n",
"# token_groups = pickle.load(f)\n",
"# model_group_stats = calc_model_group_stats(\n",
"# tokenized_corpus_dataset, logprob_datasets, token_groups, token_groups[0].keys()\n",
"# )\n",
"with open(f\"{STATIC_ASSETS_DIR}/model_group_stats.pkl\", \"rb\") as f:\n",
" model_group_stats = pickle.load(f)\n",
"\n",
"logprob_datasets = utils.load_logprob_datasets(\"validation\")\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Visualization"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "0f8846898fbb4a1b9e872ff6511acd3d",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"VBox(children=(Dropdown(description='Token Category:', options=('Capitalized', 'Is Determiner', 'Is Interjunct…"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"performance_data = defaultdict(dict)\n",
"for model in constants.LLAMA2_MODELS:\n",
" for token_group_desc in TOKEN_LABELS:\n",
" if (model, token_group_desc) not in model_group_stats:\n",
" continue\n",
" stats = model_group_stats[(model, token_group_desc)]\n",
" performance_data[model][token_group_desc] = (\n",
" -stats[\"median\"],\n",
" -stats[\"75th\"],\n",
" -stats[\"25th\"],\n",
" )\n",
"\n",
"visualize_per_token_category(\n",
" performance_data,\n",
" log_scale=True,\n",
" bg_color=\"LightGrey\",\n",
" line_color=\"Red\",\n",
" marker_color=\"Orange\",\n",
" bar_color=\"Green\",\n",
")"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "tinyevals",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.13"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
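
The visualization cell above negates the median and quartile log-probabilities for each (model, token group) pair before plotting. A self-contained sketch of that aggregation step, using hypothetical model names and made-up stats (the real values are loaded from `model_group_stats.pkl`):

```python
from collections import defaultdict

# Hypothetical stats keyed by (model, token_group_desc), mimicking
# the shape of model_group_stats in the notebook.
model_group_stats = {
    ("llama2-100k", "Capitalized"): {"median": -2.0, "75th": -1.0, "25th": -3.0},
    ("llama2-100k", "Is Determiner"): {"median": -0.5, "75th": -0.2, "25th": -1.1},
}

performance_data = defaultdict(dict)
for (model, token_group_desc), stats in model_group_stats.items():
    # Negate log-probs so lower loss plots as a smaller positive value.
    performance_data[model][token_group_desc] = (
        -stats["median"],
        -stats["75th"],
        -stats["25th"],
    )

print(performance_data["llama2-100k"]["Capitalized"])  # (2.0, 1.0, 3.0)
```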
100 changes: 100 additions & 0 deletions notebooks/per_token_plot.ipynb
@@ -0,0 +1,100 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "696575431f65420e9dc22c3b3476bfbb",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"VBox(children=(Dropdown(description='Token Category:', options=('nouns', 'verbs', 'prepositions', 'adjectives'…"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from collections import defaultdict\n",
"import math\n",
"import random\n",
"import numpy as np\n",
"\n",
"from delphi.eval.vis_per_token_model import visualize_per_token_category\n",
"\n",
"\n",
"random.seed(0)\n",
"\n",
"# generate mock data\n",
"model_names = ['llama2-100k', 'llama2-200k', 'llama2-1m', 'llama2-10m']\n",
"categories = ['nouns', 'verbs', 'prepositions', 'adjectives']\n",
"entries = [200, 100, 150, 300]\n",
"performance_data = defaultdict()\n",
"for i, model in enumerate(model_names):\n",
" performance_data[model] = defaultdict()\n",
" for cat in categories:\n",
" x = [math.log2(random.random()) for _ in range(entries[i])]\n",
" means = np.mean(x)\n",
" err_low = means - np.percentile(x, 25)\n",
" err_hi = np.percentile(x, 75) - means\n",
" performance_data[model][cat] = (-means, err_low, err_hi)\n",
"\n",
"\n",
"visualize_per_token_category(performance_data, log_scale=True, bg_color='LightGrey', line_color=\"Red\", marker_color='Orange', bar_color='Green')"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "cb3af5248a4a40118c36a527c927289d",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"VBox(children=(Dropdown(description='Token Category:', options=('nouns', 'verbs', 'prepositions', 'adjectives'…"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"visualize_per_token_category(performance_data, log_scale=False)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": ".venv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.0"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
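
The mock-data cell draws `log2(random())` samples and turns the 25th/75th percentiles into asymmetric error bars around the mean. A condensed sketch of just that computation (same formulas as the cell, standalone):

```python
import math
import random

import numpy as np

random.seed(0)
x = [math.log2(random.random()) for _ in range(200)]

mean = np.mean(x)                      # log2 of a uniform draw is negative
err_low = mean - np.percentile(x, 25)  # distance down to the 25th percentile
err_hi = np.percentile(x, 75) - mean   # distance up to the 75th percentile

# The plotted value is -mean, with (err_low, err_hi) as the error bar.
print(-mean > 0, err_low > 0, err_hi > 0)
```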
