Skip to content

Commit

Permalink
-
Browse files Browse the repository at this point in the history
  • Loading branch information
Your Name committed Oct 9, 2024
1 parent ef7d912 commit a04de74
Show file tree
Hide file tree
Showing 2 changed files with 117 additions and 2 deletions.
114 changes: 114 additions & 0 deletions benchmarks/lightning/main.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,114 @@
#!/usr/bin/env python

import argparse
import os

import torch
import torch.nn.functional as F
import lightning as L
import torchvision.models as torchvision_models

# HPU fix: make sure the distributed-launch variables are always present in
# the environment. A value already exported by the launcher is kept as-is;
# otherwise the fallback listed below is written.
for _name, _fallback in (
    ("RANK", "-1"),
    ("LOCAL_RANK", "-1"),
    ("WORLD_SIZE", "1"),
    ("LOCAL_WORLD_SIZE", "1"),
):
    os.environ[_name] = os.getenv(_name, _fallback)


from benchmate.dataloader import imagenet_dataloader, dataloader_arguments


def criterion():
    """Return the loss callable used for training (``F.cross_entropy``)."""
    loss_fn = F.cross_entropy
    return loss_fn


class TorchvisionLightning(L.LightningModule):
    """LightningModule wrapper around an already-built model.

    The wrapped model is stored on ``self.model`` and the loss callable
    returned by ``criterion()`` on ``self.criterion``.
    """

    def __init__(self, model):
        super().__init__()
        self.model = model
        self.criterion = criterion()

    def training_step(self, batch, batch_idx):
        """Run the model on one batch and return the loss.

        ``batch`` is unpacked as an ``(inputs, targets)`` pair; ``batch_idx``
        is accepted but not used.
        """
        inputs, targets = batch
        predictions = self.model(inputs)
        return self.criterion(predictions, targets)

    def configure_optimizers(self):
        """Return an Adam optimizer over all module parameters (lr=1e-3)."""
        return torch.optim.Adam(self.parameters(), lr=1e-3)



def prepare_voir():
    """Build the benchmark observer and return it with the monitor factory.

    Returns:
        A ``(observer, bench_monitor)`` tuple, where ``observer`` is a
        ``BenchObserver`` constructed with the accelerator's ``Event`` type.
    """
    # Imported lazily, so these packages are only loaded when this is called.
    from benchmate.observer import BenchObserver
    from benchmate.monitor import bench_monitor
    import torchcompat.core as accelerator

    def _batch_size(batch):
        # Length of the first element of the batch.
        return len(batch[0])

    observer_options = dict(
        earlystop=100,
        batch_size_fn=_batch_size,
        raise_stop_program=False,
        stdout=True,
    )
    observer = BenchObserver(accelerator.Event, **observer_options)
    return observer, bench_monitor

def main():
    """Train a torchvision model with Lightning on HPU under the benchmark monitor.

    Reads ``RANK``, ``WORLD_SIZE`` and ``LOCAL_WORLD_SIZE`` from the
    environment, parses the command line (``--epochs``, ``--model`` plus the
    options registered by ``dataloader_arguments``), instantiates the named
    torchvision model, wraps it in ``TorchvisionLightning`` and fits it with
    an ``L.Trainer`` configured for the ``hpu`` accelerator.
    """
    rank = int(os.getenv("RANK", 0))
    world_size = int(os.getenv("WORLD_SIZE", 1))
    local_world_size = int(os.getenv("LOCAL_WORLD_SIZE", 1))

    parser = argparse.ArgumentParser(description="simple distributed training job")
    parser.add_argument(
        "--epochs",
        default=10,
        type=int,
        metavar="N",
        help="number of epochs to train (default: 10)",
    )
    parser.add_argument(
        "--model",
        required=True,
        type=str,
        help="torchvision model name",
    )
    dataloader_arguments(parser)
    args = parser.parse_args()

    # Look the constructor up by name and build the model with its defaults.
    model_factory = getattr(torchvision_models, args.model)
    backbone = model_factory()

    import torchcompat.core as accelerator

    # The per-node device count comes from LOCAL_WORLD_SIZE rather than from
    # accelerator.device_count().
    devices_per_node = local_world_size
    node_count = world_size // local_world_size

    module = TorchvisionLightning(backbone)

    accelerator.set_enable_tf32(True)

    observer, monitor = prepare_voir()
    raw_loader = imagenet_dataloader(args, module, rank, world_size)
    loader = observer.loader(raw_loader)

    # An explicit HPU strategy is passed to the Trainer; the error previously
    # recorded here was:
    #   ValueError: The `HPUAccelerator` can only be used with a
    #   `SingleHPUStrategy` or `HPUParallelStrategy`, found DDPStrategy.
    from pytorch_lightning.strategies import HPUParallelStrategy

    # Trainer configuration, gathered in one place before construction.
    trainer_options = {
        "accelerator": "hpu",
        "devices": devices_per_node,
        "num_nodes": node_count,
        "strategy": HPUParallelStrategy(),
        "max_epochs": args.epochs,
        "precision": "bf16-mixed",
        "enable_checkpointing": False,
        "enable_progress_bar": False,
        "reload_dataloaders_every_n_epochs": 1,
        "max_steps": 120,
    }
    trainer = L.Trainer(**trainer_options)

    # Train the model while the monitor is active.
    with monitor(poll_interval=0.1):
        trainer.fit(model=module, train_dataloaders=loader)
    print("finished: ", rank)


# Script entry point: run the benchmark only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()
5 changes: 3 additions & 2 deletions docker/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,9 @@ hpu:
git add --all
git commit -m "-"
git push origin hpu
docker system prune -a -f
docker image prune -a -f
# Remove dangling images only. The `$` is doubled so that make passes a
# literal `$(...)` command substitution to the shell instead of expanding it
# as an (undefined, hence empty) make variable.
docker rmi $$(docker images --filter "dangling=true" -q --no-trunc)
# docker system prune -a -f
# docker image prune -a -f
docker build --build-arg CACHEBUST=`git rev-parse hpu` -f Dockerfile-hpu -t dockerfile-hpu .
docker run -it --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --shm-size 50G --cap-add=sys_nice --net=host dockerfile-hpu:latest bash

0 comments on commit a04de74

Please sign in to comment.