diff --git a/milabench/executors.py b/milabench/executors.py
index eae0107dc..41d308919 100644
--- a/milabench/executors.py
+++ b/milabench/executors.py
@@ -660,7 +660,7 @@ def _argv(self, **_) -> List:
 
         ngpu = len(get_gpu_info()["gpus"].values())
         nproc = ngpu * num_machines
-        assert nproc > 0
+        assert nproc > 0, f"nproc: {nproc} num_machines: {num_machines} ngpu: {ngpu}"
 
         deepspeed_argv = (
             [
diff --git a/milabench/schedule.py b/milabench/schedule.py
index 3c887ef0f..7a494469c 100644
--- a/milabench/schedule.py
+++ b/milabench/schedule.py
@@ -131,7 +131,7 @@ def launch_milabench(sbatch_args=None, dry: bool = False, sync: bool = False):
     if sbatch_args is None:
         sbatch_args = [
             "--ntasks=1",
-            "--gpus-per-task=1",
+            "--gpus-per-task=rtx8000:1",
             "--cpus-per-task=4",
             "--time=01:00:00",
             "--ntasks-per-node=1",
diff --git a/milabench/scripts/milabench.bash b/milabench/scripts/milabench.bash
index 9032ff832..54abc46c7 100755
--- a/milabench/scripts/milabench.bash
+++ b/milabench/scripts/milabench.bash
@@ -71,6 +71,11 @@ if [ ! -d "$ENV" ] && [ "$ENV" != "base" ] && [ ! -d "$CONDA_ENVS/$ENV" ]; then
 fi
 
 conda activate $ENV
+export HF_HOME=$BASE/cache
+export HF_DATASETS_CACHE=$BASE/cache
+export TORCH_HOME=$BASE/cache
+export XDG_CACHE_HOME=$BASE/cache
+
 #
 # Fetch the repo
 #
@@ -80,11 +85,35 @@ python -m pip install ./milabench
 
 SYSTEM="$SLURM_TMPDIR/system.yaml"
 
+echo ""
+echo "System"
+echo "------"
+
 milabench slurm_system
 milabench slurm_system > $SYSTEM
 
+module load cuda/11.8
+
+echo ""
+echo "Install"
+echo "-------"
 milabench install --config $CONFIG --system $SYSTEM --base $BASE $REMAINING_ARGS
+
+echo ""
+echo "Prepare"
+echo "-------"
 milabench prepare --config $CONFIG --system $SYSTEM --base $BASE $REMAINING_ARGS
+
+echo ""
+echo "Run"
+echo "---"
 milabench run --config $CONFIG --system $SYSTEM --base $BASE $REMAINING_ARGS
 
+echo ""
+echo "Report"
+echo "------"
 milabench summary $SLURM_TMPDIR/base/runs/
+
+echo "----"
+echo "Done"
+echo ""