Skip to content

Commit

Permalink
Add TorchVision example
Browse files Browse the repository at this point in the history
  • Loading branch information
satyaog committed Mar 15, 2023
1 parent 47e7860 commit f861529
Show file tree
Hide file tree
Showing 16 changed files with 584 additions and 3 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -2,3 +2,4 @@
_build
.idea
**/__pycache__
/docs/examples/**/*.diff
6 changes: 4 additions & 2 deletions docs/Minimal_examples.rst
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
.. ***************************
.. ****************
.. Minimal Examples
.. ***************************
.. ****************
.. include:: examples/frameworks/README.rst
.. include:: examples/distributed/README.rst
.. include:: examples/data/README.rst
16 changes: 15 additions & 1 deletion docs/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,8 @@
from __future__ import division, print_function, unicode_literals

from datetime import datetime

import subprocess
from pathlib import Path
import sphinx_theme

extensions = [
Expand Down Expand Up @@ -90,5 +91,18 @@
# Include CNAME file so GitHub Pages can set Custom Domain name
html_extra_path = ['CNAME']


# Generate the diffs that are shown in the examples.
# NOTE: this runs whenever this configuration module is executed, so that the
# `*.diff` files referenced by the examples' `literalinclude` directives exist.
file_dir = Path(__file__).parent / "examples/generate_diffs.sh"
try:
    proc = subprocess.run(str(file_dir), shell=True, capture_output=True, check=True)
except subprocess.CalledProcessError as err:
    # Decode leniently: a stray non-UTF-8 byte in the script's output must not
    # hide the real failure behind a UnicodeDecodeError.
    # `from err` keeps the original return code and command in the traceback.
    raise RuntimeError(
        "Could not build the diff files for the examples:\n"
        + str(err.output, encoding="utf-8", errors="replace")
        + str(err.stderr, encoding="utf-8", errors="replace")
    ) from err


def setup(app):
app.add_css_file('custom.css')
6 changes: 6 additions & 0 deletions docs/examples/data/README.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
*****************************
Data Handling during Training
*****************************


.. include:: examples/data/torchvision/README.rst
31 changes: 31 additions & 0 deletions docs/examples/data/torchvision/README.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
TorchVision
===========


**Prerequisites**

Make sure to read the following sections of the documentation before using this example:

* :ref:`pytorch_setup`
* :ref:`001 - Single GPU Job`

The full source code for this example is available on `the mila-docs GitHub repository. <https://github.com/mila-iqia/mila-docs/tree/master/docs/examples/data/torchvision>`_


**job.sh**

.. literalinclude:: examples/data/torchvision/job.sh.diff
:language: diff


**main.py**

.. literalinclude:: examples/data/torchvision/main.py.diff
:language: diff


**Running this example**

.. code-block:: bash
$ sbatch job.sh
12 changes: 12 additions & 0 deletions docs/examples/data/torchvision/data.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
"""Make sure the data is available"""
import sys
import time

from torchvision.datasets import ImageNet


# Instantiate both ImageNet splits under the root directory given as the first
# command-line argument, and report how long that took.
start = time.time()
for split in ("train", "val"):
    ImageNet(root=sys.argv[1], split=split)
elapsed = time.time() - start
print(f"Prepared data in {elapsed/60:.2f}m")
15 changes: 15 additions & 0 deletions docs/examples/data/torchvision/data.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
#!/bin/bash
set -o errexit

# Fail early with a clear message when SLURM_TMPDIR is not set:
# otherwise the paths below would silently resolve to "/data".
: "${SLURM_TMPDIR:?SLURM_TMPDIR is not set; run this script from within a Slurm job}"

# Stage dataset into $SLURM_TMPDIR for extraction
# 'ln' will avoid a useless copy of the archives before they are to be extracted
mkdir -p "$SLURM_TMPDIR/data"
ln -sft "$SLURM_TMPDIR/data" "/network/datasets/imagenet"/*

# The following 3 lines will save you ~7min in the case of ImageNet
# Each member of the train archive is itself a '.tar' file: tar pipes it to the
# '--to-command' below, which unpacks it into a directory named after the member
# (without the '.tar' suffix). 'mkdir -p' and the quotes keep this safe to re-run
# and robust to unusual member names.
mkdir -p "$SLURM_TMPDIR/data/train"
pushd "$SLURM_TMPDIR/data/train"
tar -xf ../ILSVRC2012_img_train.tar --to-command='mkdir -p "${TAR_REALNAME%.tar}"; tar -xC "${TAR_REALNAME%.tar}"'
popd

# Let torchvision process whatever is left, from the staged directory
python3 data.py "$SLURM_TMPDIR/data"
37 changes: 37 additions & 0 deletions docs/examples/data/torchvision/job.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
#!/bin/bash
#SBATCH --gpus-per-task=rtx8000:1
#SBATCH --cpus-per-task=4
#SBATCH --ntasks-per-node=1
#SBATCH --mem=16G
#SBATCH --time=01:30:00
# Abort the job as soon as any command fails.
set -o errexit


# Echo time and hostname into log
echo "Date:     $(date)"
echo "Hostname: $(hostname)"


# Ensure only anaconda/3 module loaded.
module purge
# This example uses Conda to manage package dependencies.
# See https://docs.mila.quebec/Userguide.html#conda for more information.
module load anaconda/3


# Creating the environment for the first time:
# conda create -y -n pytorch python=3.9 pytorch torchvision torchaudio \
#     pytorch-cuda=11.6 scipy -c pytorch -c nvidia
# Other conda packages:
# conda install -y -n pytorch -c conda-forge rich tqdm

# Activate pre-existing environment.
conda activate pytorch


# Prepare data
# Runs data.sh once on every node of the job (one task per node); data.sh stages
# and extracts the dataset into $SLURM_TMPDIR/data and reports the time taken.
srun --ntasks=$SLURM_JOB_NUM_NODES --ntasks-per-node=1 time bash data.sh


# Execute Python script
# main.py reads the dataset from $SLURM_TMPDIR/data.
python main.py
187 changes: 187 additions & 0 deletions docs/examples/data/torchvision/main.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,187 @@
"""Torchvision training example."""
import logging
import os

import rich.logging
import torch
from torch import Tensor, nn
from torch.nn import functional as F
from torch.utils.data import DataLoader, random_split
from torchvision import transforms
from torchvision.datasets import ImageNet
from torchvision.models import resnet18
from tqdm import tqdm


def main():
    """Train a ResNet-18 on ImageNet on a single GPU.

    The dataset is read from ``$SLURM_TMPDIR/data`` when the ``SLURM_TMPDIR`` environment
    variable is set, and from ``../dataset`` otherwise. A validation pass is run at the end of
    every epoch and its loss and accuracy are logged.
    """
    training_epochs = 1
    learning_rate = 5e-4
    weight_decay = 1e-4
    batch_size = 256

    # Check that the GPU is available
    assert torch.cuda.is_available() and torch.cuda.device_count() > 0
    device = torch.device("cuda", 0)

    # Setup logging (optional, but much better than using print statements)
    logging.basicConfig(
        level=logging.INFO,
        handlers=[rich.logging.RichHandler(markup=True)],  # Very pretty, uses the `rich` package.
    )

    logger = logging.getLogger(__name__)

    # Create a model and move it to the GPU.
    model = resnet18()
    model.to(device=device)

    optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=weight_decay)

    # Setup ImageNet
    num_workers = get_num_workers()
    try:
        dataset_path = f"{os.environ['SLURM_TMPDIR']}/data"
    except KeyError:
        dataset_path = "../dataset"
    train_dataset, valid_dataset, test_dataset = make_datasets(dataset_path)
    train_dataloader = DataLoader(
        train_dataset,
        batch_size=batch_size,
        num_workers=num_workers,
        shuffle=True,
    )
    valid_dataloader = DataLoader(
        valid_dataset,
        batch_size=batch_size,
        num_workers=num_workers,
        shuffle=False,
    )
    test_dataloader = DataLoader(  # NOTE: Not used in this example.
        test_dataset,
        batch_size=batch_size,
        num_workers=num_workers,
        shuffle=False,
    )

    # Checkout the "checkpointing and preemption" example for more info!
    logger.debug("Starting training from scratch.")

    for epoch in range(training_epochs):
        logger.debug(f"Starting epoch {epoch}/{training_epochs}")

        # Set the model in training mode (this is important for e.g. BatchNorm and Dropout layers)
        model.train()

        # NOTE: using a progress bar from tqdm because it's nicer than using `print`.
        # The `with` block closes the bar even if a training step raises.
        with tqdm(
            total=len(train_dataloader),
            desc=f"Train epoch {epoch}",
        ) as progress_bar:
            # Training loop
            for batch in train_dataloader:
                # Move the batch to the GPU before we pass it to the model
                batch = tuple(item.to(device) for item in batch)
                x, y = batch

                # Forward pass
                logits: Tensor = model(x)

                loss = F.cross_entropy(logits, y)

                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

                # Calculate some metrics:
                n_correct_predictions = logits.detach().argmax(-1).eq(y).sum()
                n_samples = y.shape[0]
                accuracy = n_correct_predictions / n_samples

                # Convert to Python floats once and reuse them for both logging and the
                # progress bar.
                loss_value = loss.item()
                accuracy_value = accuracy.item()

                logger.debug(f"Accuracy: {accuracy_value:.2%}")
                logger.debug(f"Average Loss: {loss_value}")

                # Advance the progress bar one step, and update its "postfix" (the metrics
                # displayed at the end of the bar).
                progress_bar.update(1)
                progress_bar.set_postfix(loss=loss_value, accuracy=accuracy_value)

        val_loss, val_accuracy = validation_loop(model, valid_dataloader, device)
        logger.info(f"Epoch {epoch}: Val loss: {val_loss:.3f} accuracy: {val_accuracy:.2%}")

    print("Done!")


@torch.no_grad()
def validation_loop(model: nn.Module, dataloader: DataLoader, device: torch.device):
    """Evaluate `model` on every batch of `dataloader`, without tracking gradients.

    The model is switched to evaluation mode and is left in that mode; call `model.train()`
    again before resuming training.

    Args:
        model: The network to evaluate.
        dataloader: Iterable yielding `(inputs, labels)` batches.
        device: Device the batches are moved to before the forward pass.

    Returns:
        A `(loss, accuracy)` tuple: the mean cross-entropy per sample (a float) and the
        fraction of correctly classified samples (a 0-dim tensor).

    Raises:
        ZeroDivisionError: If `dataloader` yields no batches.
    """
    model.eval()

    total_loss = 0.0
    n_samples = 0
    correct_predictions = 0

    for batch in dataloader:
        batch = tuple(item.to(device) for item in batch)
        x, y = batch

        logits: Tensor = model(x)
        # Sum (rather than average) over the batch, so that the final division by `n_samples`
        # gives a true per-sample mean that does not depend on the number of batches or on
        # the last batch being smaller than the others.
        loss = F.cross_entropy(logits, y, reduction="sum")

        batch_n_samples = x.shape[0]
        batch_correct_predictions = logits.argmax(-1).eq(y).sum()

        total_loss += loss.item()
        n_samples += batch_n_samples
        correct_predictions += batch_correct_predictions

    accuracy = correct_predictions / n_samples
    return total_loss / n_samples, accuracy


def make_datasets(
    dataset_path: str,
    val_split: float = 0.1,
    val_split_seed: int = 42,
):
    """Returns the training, validation, and test splits for ImageNet.

    The validation set is a random `val_split` fraction held out of the ImageNet "train" split
    (reproducible through `val_split_seed`); the ImageNet "val" split is returned as the test
    set.

    NOTE: The same preprocessing (resize to 256, center-crop to 224, convert to tensor) is
    applied to all three splits for simplicity.
    Having different transformations for train and validation would complicate things a bit.
    Later examples will show how to do the train/val/test split properly when using transforms.

    Args:
        dataset_path: Root directory handed to `torchvision.datasets.ImageNet`.
        val_split: Fraction of the "train" split held out for validation.
        val_split_seed: Seed of the generator used to draw the train/validation split.

    Returns:
        A `(train_dataset, valid_dataset, test_dataset)` tuple.
    """
    # Single pipeline shared by every split, so they cannot drift apart by accident.
    preprocessing = transforms.Compose([
        transforms.Resize(256),
        transforms.CenterCrop(224),
        transforms.ToTensor(),
    ])
    train_dataset = ImageNet(root=dataset_path, transform=preprocessing, split="train")
    test_dataset = ImageNet(root=dataset_path, transform=preprocessing, split="val")
    # Split the training dataset into a training and validation set.
    train_dataset, valid_dataset = random_split(
        train_dataset, ((1 - val_split), val_split), torch.Generator().manual_seed(val_split_seed)
    )
    return train_dataset, valid_dataset, test_dataset


def get_num_workers() -> int:
    """Return the number of DataLoader workers to use in the current job.

    Looks, in order, at the `SLURM_CPUS_PER_TASK` environment variable, at the CPU affinity
    of the current process (where the platform exposes it), and finally at the CPU count.
    """
    slurm_cpus = os.environ.get("SLURM_CPUS_PER_TASK")
    if slurm_cpus is not None:
        return int(slurm_cpus)

    query_affinity = getattr(os, "sched_getaffinity", None)
    if query_affinity is None:
        return torch.multiprocessing.cpu_count()
    return len(query_affinity(0))


# Only start training when this file is run as a script, not when it is imported.
if __name__ == "__main__":
    main()
29 changes: 29 additions & 0 deletions docs/examples/distributed/001_single_gpu/README.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
001 - Single GPU Job
====================


**Prerequisites**

Make sure to read the following sections of the documentation before using this example:

* :ref:`pytorch_setup`

The full source code for this example is available on `the mila-docs GitHub repository. <https://github.com/lebrice/mila-docs/tree/pytorch_distributed_training_examples/docs/examples/distributed/001_single_gpu>`_

**job.sh**

.. literalinclude:: examples/distributed/001_single_gpu/job.sh
:language: bash


**main.py**

.. literalinclude:: examples/distributed/001_single_gpu/main.py
:language: python


**Running this example**


.. code-block:: bash
$ sbatch job.sh
Loading

0 comments on commit f861529

Please sign in to comment.