Merge branch 'stable' into new_pytorch_stable

mila-iqia · Feb 2, 2024 · 85d500e · 85d500e
2 parents 8772404 + 2dd2b14
commit 85d500e
Show file tree

Hide file tree

Showing 8 changed files with 158 additions and 5 deletions.
diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
@@ -27,8 +27,8 @@ jobs:
         include:
           - arch: cuda
             exclude : "no-cuda"
-          - arch: rocm
-            exclude : "no-rocm"
+          # - arch: rocm
+          #   exclude : "no-rocm"
 
     runs-on: [self-hosted, "${{ matrix.arch }}"]
 

diff --git a/.gitignore b/.gitignore
@@ -11,7 +11,13 @@ dist/
 .envrc
 docs/_build
 .pin-constraints-*
+
 scripts/inventory.yaml
 output/
 sqlite.db
 .ruff_cache/
+
+test.out
+output/
+workspace/
+.pin/tmp-*
diff --git a/.no_report b/.no_report
diff --git a/benchmarks/accelerate_opt/main.py b/benchmarks/accelerate_opt/main.py
@@ -145,17 +145,19 @@ def mblog(data):
     validation_split_percentage = config["validation_split_percentage"]
     dataset_name = config["dataset_name"]
     dataset_config_name = config["dataset_config_name"]
-    raw_datasets = load_dataset(dataset_name, dataset_config_name)
+    raw_datasets = load_dataset(dataset_name, dataset_config_name, revision="f5562967961a45407fa15044c5535a607200983f")
     if "validation" not in raw_datasets.keys():
         raw_datasets["validation"] = load_dataset(
             dataset_name,
             dataset_config_name,
             split=f"train[:{validation_split_percentage}%]",
+            revision="f5562967961a45407fa15044c5535a607200983f",
         )
         raw_datasets["train"] = load_dataset(
             dataset_name,
             dataset_config_name,
             split=f"train[{validation_split_percentage}%:]",
+            revision="f5562967961a45407fa15044c5535a607200983f",
         )
 
     model_name = config["model_name"]

diff --git a/benchmarks/stable_baselines3/requirements.cuda.txt b/benchmarks/stable_baselines3/requirements.cuda.txt
@@ -304,7 +304,7 @@ urllib3==1.26.15
     #   sentry-sdk
 varname==0.10.0
     # via giving
-voir==0.2.9
+voir==0.2.12
     # via -r benchmarks/stable_baselines3/requirements.in
 wandb==0.14.0
     # via -r benchmarks/stable_baselines3/requirements.in

diff --git a/benchmarks/stable_baselines3/requirements.rocm.txt b/benchmarks/stable_baselines3/requirements.rocm.txt
@@ -304,7 +304,7 @@ urllib3==1.26.15
     #   sentry-sdk
 varname==0.10.0
     # via giving
-voir==0.2.9
+voir==0.2.12
     # via -r benchmarks/stable_baselines3/requirements.in
 wandb==0.14.0
     # via -r benchmarks/stable_baselines3/requirements.in

diff --git a/run.sh b/run.sh
@@ -0,0 +1,16 @@
+#!/bin/bash
+#
+# Submit slurm.sh as a single-GPU batch job and follow its log.
+#
+# The job's output is written to $OUTPUT (sbatch -o); the arguments after
+# slurm.sh (-a cuda -b stable_update) are passed through to that script.
+#
+OUTPUT="test.out"
+
+# Start from an empty log file so `tail -f` below has something to follow
+# even before the job begins writing.
+rm -rf "$OUTPUT"
+touch "$OUTPUT"
+
+sbatch  --ntasks=1\
+        --gpus-per-task=rtx8000:1\
+        --cpus-per-task=4\
+        --time=01:30:00\
+        --ntasks-per-node=1\
+        --mem=64G\
+        -o "$OUTPUT"\
+        slurm.sh\
+        -a cuda\
+        -b stable_update
+
+# Follows the log until interrupted (Ctrl-C); it does not stop on its own
+# when the job ends.
+tail -f "$OUTPUT"
diff --git a/slurm.sh b/slurm.sh
@@ -0,0 +1,129 @@
+#!/bin/bash
+
+# Print the command-line help and terminate the script.
+# Always exits with status 1, including when reached through -h.
+function usage() {
+  echo "Usage: $0 [-h] [-a arch] [-b BRANCH] [-o ORIGIN] [-c CONFIG] [-e ENV] [-p PYTHON] [-l LOC] [ARGUMENT...]"
+  echo "  -h              Display this help message."
+  echo "  -a arch         GPU arch           (default: cuda)"
+  echo "  -b BRANCH       Branch to checkout (default: master)"
+  echo "  -o ORIGIN       Origin to use      (default: https://github.com/mila-iqia/milabench.git)"
+  echo "  -c CONFIG       Configuration      (default: LOC/milabench/config/standard.yaml)"
+  echo "  -e ENV          Environment        (default: ./env)"
+  echo "  -p PYTHON       Python version     (default: 3.9)"
+  echo "  -l LOC          Working directory  (default: \$SLURM_TMPDIR)"
+  echo "  ARGUMENT        Any additional argument you want to process."
+  exit 1
+}
+
+# Defaults; each one can be overridden by the matching option below.
+ARCH="cuda"
+PYTHON="3.9"
+BRANCH="master"
+ORIGIN="https://github.com/mila-iqia/milabench.git"
+LOC="$SLURM_TMPDIR"
+# CONFIG and BASE depend on LOC, so they are resolved after option parsing.
+# An empty CONFIG means "not given on the command line".
+CONFIG=""
+BASE=""
+ENV="./env"
+REMAINING_ARGS=""
+
+
+# The leading ':' selects getopts' silent mode: a missing argument is
+# reported through the ':' branch and an unknown option through '?'.
+while getopts ":hm:p:e:a:b:o:c:l:" opt; do
+  case $opt in
+    h)
+      usage
+      ;;
+    m)
+        # NOTE(review): -m is still accepted (with a value) but has no
+        # effect; confirm whether it can be dropped.
+        ;;
+    p)
+        PYTHON="$OPTARG"
+        ;;
+    b)
+        BRANCH="$OPTARG"
+        ;;
+    o)
+        ORIGIN="$OPTARG"
+        ;;
+    c)
+        CONFIG="$OPTARG"
+        ;;
+    e)
+        ENV="$OPTARG"
+        ;;
+    a)
+        ARCH="$OPTARG"
+        ;;
+    l)
+        LOC="$OPTARG"
+        ;;
+    :)
+        echo "Option -$OPTARG requires an argument." >&2
+        usage
+        ;;
+    \?)
+        # Without this branch an unknown option is silently consumed, and a
+        # long option such as --foo is parsed as a cluster of short options.
+        echo "Unknown option -$OPTARG." >&2
+        usage
+        ;;
+  esac
+done
+
+# Resolve the LOC-dependent paths once, so that -c and -l can be given in
+# any order without -l overwriting an explicit -c.
+BASE="$LOC/base"
+if [ -z "$CONFIG" ]; then
+  CONFIG="$LOC/milabench/config/standard.yaml"
+fi
+
+# Drop the options consumed by getopts; whatever is left is appended to the
+# milabench commands further down.
+shift $((OPTIND - 1))
+REMAINING_ARGS="$*"
+
+# Print the effective settings so they appear at the top of the job output.
+printf '  PYTHON: %s\n' "$PYTHON"
+printf '  branch: %s\n' "$BRANCH"
+printf '  origin: %s\n' "$ORIGIN"
+printf '  config: %s\n' "$CONFIG"
+printf '     env: %s\n' "$ENV"
+printf '    args: %s\n' "$REMAINING_ARGS"
+#
+#   Fix problem with conda saying it is not "init properly":
+#   source conda's shell hook from the installation that owns the
+#   `conda` executable found on PATH.
+#
+CONDA_EXEC="$(which conda)"
+if [ -z "$CONDA_EXEC" ]; then
+  # Without this check, dirname "" yields "." and a bogus relative path
+  # would be sourced below.
+  echo "conda was not found in PATH." >&2
+  exit 1
+fi
+CONDA_BASE="$(dirname "$CONDA_EXEC")"
+source "$CONDA_BASE/../etc/profile.d/conda.sh"
+
+# Optional per-user settings; skipped when the file does not exist.
+if [ -e "$HOME/.credentials.env" ]; then
+  source "$HOME/.credentials.env"
+fi
+
+# An empty LOC would make an unquoted `cd` jump to $HOME and run the whole
+# benchmark setup there, so refuse to continue instead.
+if [ -z "$LOC" ] || ! cd "$LOC"; then
+  echo "Cannot enter working directory '$LOC' (is SLURM_TMPDIR set?)." >&2
+  exit 1
+fi
+#
+#   Create a new environment
+#
+# Only created when ENV is neither an existing directory, nor "base", nor a
+# directory under $CONDA_ENVS.
+# NOTE(review): CONDA_ENVS is not set anywhere in this script; presumably it
+# comes from the caller's environment or .credentials.env - confirm.
+if [ ! -d "$ENV" ] && [ "$ENV" != "base" ] && [ ! -d "$CONDA_ENVS/$ENV" ]; then
+     conda create --prefix "$ENV" python="$PYTHON" -y
+fi
+conda activate "$ENV"
+
+# Point every cache location variable at the same directory, $BASE/cache.
+export HF_HOME="$BASE/cache" HF_DATASETS_CACHE="$BASE/cache"
+export TORCH_HOME="$BASE/cache" XDG_CACHE_HOME="$BASE/cache"
+
+# MILABENCH_* variables mirror the values selected on the command line.
+export MILABENCH_BASE="$BASE" MILABENCH_CONFIG="$CONFIG"
+export MILABENCH_GPU_ARCH="$ARCH" MILABENCH_DASH=no
+
+# Unbuffered Python output.
+export PYTHONUNBUFFERED=1
+#
+# Fetch the repo
+#
+# Clone into an explicit "milabench" directory: the editable install below
+# uses ./milabench, which would not exist if ORIGIN's repository had a
+# different name.
+git clone --single-branch --depth 1 -b "$BRANCH" "$ORIGIN" milabench
+python -m pip install -e ./milabench
+
+# Compiler and CUDA toolchain versions are pinned here.
+module load gcc/9.3.0
+module load cuda/11.8
+
+# Print a blank line, then a section title ($1) and its underline ($2).
+banner() {
+  printf '\n%s\n%s\n' "$1" "$2"
+}
+
+# The three milabench stages run in order with the same config, base
+# directory and extra arguments. The variables are left unquoted on purpose
+# so REMAINING_ARGS splits back into separate words.
+banner "Install" "-------"
+milabench install --config $CONFIG --base $BASE $REMAINING_ARGS
+
+
+banner "Prepare" "-------"
+milabench prepare --config $CONFIG --base $BASE $REMAINING_ARGS
+
+banner "Run" "---"
+milabench run     --config $CONFIG --base $BASE $REMAINING_ARGS
+
+# SECONDS is bash's counter of seconds elapsed since the shell started.
+printf '%s\n%s\n\n' "----" "Done after $SECONDS"