Merge pull request #78 from graphcore-research/docker-dev

Add docker dev setup and update requirements
graphcore-research · Nov 5, 2024 · bcc3d5e · bcc3d5e
2 parents 445ff24 + 7a1f767
commit bcc3d5e
Show file tree

Hide file tree

Showing 19 changed files with 127 additions and 210 deletions.
diff --git a/.devcontainer.json b/.devcontainer.json
@@ -0,0 +1,24 @@
+{
+    "build": {
+        "dockerfile": "Dockerfile"
+    },
+    "workspaceFolder": "/home/developer/unit-scaling",
+    "customizations": {
+        "vscode": {
+            "extensions": [
+                "ms-python.python",
+                "ms-toolsai.jupyter"
+            ],
+	    "settings": {
+                "terminal.integrated.defaultProfile.linux": "zsh",
+                "terminal.integrated.profiles.linux": { "zsh": { "path": "/bin/zsh" } }
+            }
+        }
+    },
+    "mounts": [
+        "source=${localEnv:HOME}/.ssh,target=/home/developer/.ssh,type=bind,readonly=true",
+        "source=${localEnv:HOME}/.gitconfig,target=/home/developer/.gitconfig,type=bind,readonly=true",
+        "source=${localWorkspaceFolder},target=/home/developer/unit-scaling,type=bind"
+    ],
+    "remoteUser": "developer"
+}
diff --git a/.dockerignore b/.dockerignore
@@ -0,0 +1,2 @@
+*
+!requirements*.txt
diff --git a/.github/workflows/ci-ipu.yaml b/.github/workflows/ci-ipu.yaml
diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml
@@ -15,14 +15,16 @@ jobs:
     runs-on: ubuntu-latest
     timeout-minutes: 10
     steps:
-      - uses: actions/checkout@v3
-      - name: Install dependencies
+      - name: Checkout code
+        uses: actions/checkout@v3
+
+      - name: Build Docker Image
         run: |
-          sudo apt-get update
-          sudo apt-get install -y git
-          pip install -r requirements-dev.txt
+          docker build -t unit-scaling-dev:latest . 
+
       - name: Run CI
-        run: ./dev ci
+        run: docker run --rm -v $(pwd):/home/developer/unit-scaling unit-scaling-dev:latest ./dev ci
+
       - name: Publish documentation
         if: ${{github.ref == 'refs/heads/main'}}
         uses: Cecilapp/GitHub-Pages-deploy@v3

diff --git a/Dockerfile b/Dockerfile
@@ -0,0 +1,39 @@
+# Development image for unit-scaling: PyTorch base image, dev tooling and a
+# non-root user whose home contains the (bind-mounted) repository.
+#
+# NOTE: Dockerfile comments are only recognised at the start of a line; a `#`
+# later in a line is passed through as an argument to the instruction. All
+# comments below therefore sit on their own lines.
+
+# Use PyTorch base image
+# NOTE(review): `latest` is a moving tag; consider pinning it for reproducible builds.
+FROM pytorch/pytorch:latest
+
+# Install additional dependencies, set zsh as the shell of the current build
+# user, then remove the apt caches (cleanup => smaller image).
+RUN apt-get update && apt-get install -y \
+    git \
+    vim \
+    sudo \
+    make \
+    g++ \
+    zsh \
+    && chsh -s /bin/zsh \
+    && apt-get clean && rm -rf /var/lib/apt/lists/*
+
+# Configure a non-root user with sudo privileges.
+# Change USERNAME to a preferred username with `--build-arg USERNAME=<name>`.
+# NOTE(review): UID 1001 presumably matches the CI runner's user so that the
+# bind-mounted checkout is writable from inside the container - confirm.
+ARG USERNAME=developer
+ARG USER_UID=1001
+ARG USER_GID=$USER_UID
+RUN groupadd --gid $USER_GID $USERNAME \
+    && useradd --uid $USER_UID --gid $USER_GID -m $USERNAME \
+    && echo "$USERNAME ALL=(root) NOPASSWD:ALL" > /etc/sudoers.d/$USERNAME \
+    && chmod 0440 /etc/sudoers.d/$USERNAME
+USER $USERNAME
+
+# Set working directory
+WORKDIR /home/$USERNAME/unit-scaling
+
+# Puts pip install libs on $PATH & sets correct locale
+ENV PATH="$PATH:/home/$USERNAME/.local/bin" \
+    LC_ALL=C.UTF-8
+
+# Install Python dependencies.
+# --no-cache-dir stops pip's download cache being baked into the image layer.
+COPY requirements-dev.txt .
+RUN pip install --no-cache-dir -r requirements-dev.txt
+
+# Creates basic .zshrc
+RUN sudo cp /etc/zsh/newuser.zshrc.recommended /home/$USERNAME/.zshrc
+
+# Default to an interactive zsh session
+CMD ["/bin/zsh"]
diff --git a/README.md b/README.md
@@ -17,7 +17,22 @@ To install the `unit-scaling` library, run:
 pip install git+https://github.com/graphcore-research/unit-scaling.git
 ```
 
-For development on this repository, see [docs/development.md](docs/development.md).
+## Development
+
+For development in this repository, we recommend using the provided docker container.
+This image can be built and entered interactively using:
+
+```sh
+docker build -t unit-scaling-dev:latest .
+docker run -it --rm --user developer:developer -v $(pwd):/home/developer/unit-scaling unit-scaling-dev:latest
+# To use git within the container, add `-v ~/.ssh:/home/developer/.ssh:ro -v ~/.gitconfig:/home/developer/.gitconfig:ro`.
+```
+
+For VS Code users, this repo also contains a `.devcontainer.json` file, which enables the container to be used as a full-featured IDE (see the [Dev Container docs](https://code.visualstudio.com/docs/devcontainers/containers) for details on how to use this feature).
+
+Key development functionality is contained within the `./dev` script. This includes running unit tests, linting, formatting, documentation generation and more. Run `./dev --help` for the available options. Running `./dev` without arguments is equivalent to running `./dev ci`, which runs all of the available dev checks. This is the same check that GitHub CI runs.
+
+We encourage pull requests from the community. Please reach out to us with any questions about contributing.
 
 ## What is u-μP?
 

diff --git a/analysis/almost_scaled_dot_product_attention/demo_transformer.py b/analysis/almost_scaled_dot_product_attention/demo_transformer.py
@@ -10,13 +10,6 @@
 from torch import nn, Tensor
 import tqdm
 
-try:
-    import poptorch
-
-    poptorch_available = True
-except ModuleNotFoundError:
-    poptorch_available = False
-
 
 class Config(dict):
     def __init__(self, *args: Any, **kwargs: Any):
@@ -132,7 +125,7 @@ def forward(self, indices: Tensor) -> Tensor:
         )
 
 
-def train_cpu() -> Tensor:
+def train() -> Tensor:
     model = Model()
     opt = torch.optim.Adam(model.parameters(), lr=CONFIG.lr)
     losses = []
@@ -143,26 +136,3 @@ def train_cpu() -> Tensor:
         opt.step()
         losses.append(float(loss))
     return torch.tensor(losses)
-
-
-def train_ipu() -> Tensor:
-    model = Model()
-    options = poptorch.Options()
-    options.showCompilationProgressBar(False)
-    opt = torch.optim.Adam(model.parameters(), lr=CONFIG.lr)
-    session = poptorch.trainingModel(model, options, opt)
-    try:
-        return torch.tensor(
-            [
-                float(session(batch.int()))
-                for batch in tqdm.tqdm(
-                    islice(batches(), CONFIG.steps), total=CONFIG.steps
-                )
-            ]
-        )
-    finally:
-        session.destroy()
-
-
-def train() -> Tensor:
-    return train_ipu() if poptorch_available else train_cpu()
diff --git a/docs/development.md b/docs/development.md
diff --git a/docs/user_guide.rst b/docs/user_guide.rst
@@ -55,13 +55,6 @@ The advantage of using a unit-scaled model is as follows:
    scales have stayed within range for all unit-scaled models tested thus far.
 3. This can enable the use of smaller, more efficient number formats out-of-the-box,
    such as FP16 and even FP8.
-4. As the behaviour of some ops depends on scale, unit-scaling a model can change its
-   training dynamics slightly. In some experiments this has been shown to lead to
-   loss decreasing faster, though further work is needed to validate this.
-
-For a more in-depth treatment of unit scaling, see our paper
-`Unit Scaling: Out-of-the-Box Low-Precision Training (ICML, 2023)
-<https://arxiv.org/abs/2303.11257>`_.
 
 
 How to unit-scale a model

diff --git a/requirements-dev-ipu.txt b/requirements-dev-ipu.txt
diff --git a/requirements-dev.txt b/requirements-dev.txt
@@ -1,14 +1,28 @@
--r requirements.txt
-black==24.3.0
-flake8==6.0.0
-isort==5.12.0
-mypy==1.2.0
-myst-parser==1.0.0
-pandas-stubs==2.0.2.230605
-pytest==7.3.1
-pytest-cov==4.0.0
-sphinx==6.2.1
-sphinx-rtd-theme==1.2.0
-transformers==4.38.0
-types-Pygments==2.15.0.0
-types-tabulate==0.9.0.2
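+# Search both the PyTorch CPU wheel index and PyPI. Note that pip gives no
+# priority to one index over the other; the `+cpu` pin on torch below is what
+# selects the CPU-only build.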
+--index-url https://download.pytorch.org/whl/cpu
+--extra-index-url=https://pypi.org/simple
+
+# Same as requirements.txt, but with versions locked-in
+datasets==3.1.0
+docstring-parser==0.16
+einops==0.8.0
+numpy==2.1.3
+seaborn==0.13.2
+tabulate==0.9.0
+torch==2.5.1+cpu
+
+# Additional dev requirements
+black==24.10.0
+flake8==7.1.1
+isort==5.13.2
+mypy==1.13.0
+myst-parser==4.0.0
+pandas-stubs==2.2.3.241009
+pytest==8.3.3
+pytest-cov==6.0.0
+sphinx==8.1.3
+sphinx-rtd-theme==3.0.1
+transformers==4.46.1
+triton==3.1.0
+types-Pygments==2.18.0.20240506
+types-tabulate==0.9.0.20240106
diff --git a/requirements.txt b/requirements.txt
@@ -1,7 +1,7 @@
 datasets
 docstring-parser
 einops
-numpy<2.0
+numpy
 seaborn
 tabulate
 torch>=2.2
diff --git a/setup.cfg b/setup.cfg
@@ -8,12 +8,6 @@ show_error_codes = true
 strict = true
 check_untyped_defs = true
 
-[mypy-poptorch.*]
-ignore_missing_imports = True
-
-[mypy-poptorch_experimental_addons.*]
-ignore_missing_imports = True
-
 # As torch.fx doesn't explicitly export many of its useful modules.
 [mypy-torch.fx]
 implicit_reexport = True

diff --git a/setup.py b/setup.py
@@ -5,15 +5,6 @@
 import setuptools
 
 requirements = Path("requirements.txt").read_text().rstrip("\n").split("\n")
-try:
-    import poptorch
-
-    # This should match requirements-dev-ipu.txt
-    requirements.append(
-        "poptorch-experimental-addons @ git+https://github.com/graphcore-research/poptorch-experimental-addons@beb12678d1e7ea2c033bd061d32167be262dfa58"
-    )
-except ImportError:
-    pass
 
 version = re.search("__version__ = \"(.+)\"", Path("unit_scaling/_version.py").read_text()).group(1)
 

diff --git a/unit_scaling/analysis.py b/unit_scaling/analysis.py
@@ -12,8 +12,8 @@
 import matplotlib.colors
 import matplotlib.pyplot as plt
 import pandas as pd
-import seaborn as sns  # type: ignore[import]
-from datasets import load_dataset  # type: ignore[import]
+import seaborn as sns  # type: ignore[import-untyped]
+from datasets import load_dataset  # type: ignore[import-untyped]
 from torch import Tensor, nn
 from torch.fx.graph import Graph
 from torch.fx.node import Node