-
Notifications
You must be signed in to change notification settings - Fork 26
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Your Name
committed
Oct 9, 2024
1 parent
ef7d912
commit a04de74
Showing
2 changed files
with
117 additions
and
2 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,114 @@ | ||
#!/usr/bin/env python | ||
|
||
import argparse | ||
import os | ||
|
||
import torch | ||
import torch.nn.functional as F | ||
import lightning as L | ||
import torchvision.models as torchvision_models | ||
|
||
# HPU fix: make sure the distributed-launch variables are always defined.
# Values already present in the environment are kept as they are; a missing
# rank defaults to "-1" and a missing world size defaults to "1".
os.environ.setdefault("RANK", "-1")
os.environ.setdefault("LOCAL_RANK", "-1")
os.environ.setdefault("WORLD_SIZE", "1")
os.environ.setdefault("LOCAL_WORLD_SIZE", "1")
|
||
|
||
from benchmate.dataloader import imagenet_dataloader, dataloader_arguments | ||
|
||
|
||
def criterion():
    """Return the loss callable used for training: ``F.cross_entropy``."""
    loss_fn = F.cross_entropy
    return loss_fn
|
||
|
||
class TorchvisionLightning(L.LightningModule):
    """Lightning module that trains a wrapped model with the shared criterion.

    Args:
        model: Callable module applied to the input half of every batch.
        lr: Learning rate handed to the Adam optimizer. Defaults to ``1e-3``,
            the value that used to be hard-coded, so existing callers keep
            the same behaviour.
    """

    def __init__(self, model, *, lr=1e-3):
        super().__init__()
        self.model = model
        self.lr = lr
        self.criterion = criterion()

    def training_step(self, batch, batch_idx):
        """Compute and return the loss for one ``(inputs, targets)`` batch.

        ``batch_idx`` is accepted as part of the hook signature but not used.
        """
        x, y = batch
        p = self.model(x)
        return self.criterion(p, y)

    def configure_optimizers(self):
        """Return an Adam optimizer over this module's parameters."""
        return torch.optim.Adam(self.parameters(), lr=self.lr)
|
||
|
||
|
||
def prepare_voir():
    """Build the benchmark observer and return it with the monitor factory.

    The observer is created from the accelerator's ``Event`` type with
    ``earlystop=100``, ``raise_stop_program=False`` and ``stdout=True``.
    Batch size is measured as the length of the first element of a batch.

    Returns:
        A ``(observer, bench_monitor)`` tuple.
    """
    from benchmate.observer import BenchObserver
    from benchmate.monitor import bench_monitor
    import torchcompat.core as accelerator

    def first_element_length(batch):
        # A batch is indexable; its first element carries the samples.
        return len(batch[0])

    observer_options = {
        "earlystop": 100,
        "batch_size_fn": first_element_length,
        "raise_stop_program": False,
        "stdout": True,
    }
    observer = BenchObserver(accelerator.Event, **observer_options)

    return observer, bench_monitor
|
||
def _build_parser():
    """Create the command-line parser: ``--epochs``, ``--model`` and the dataloader options."""
    parser = argparse.ArgumentParser(description='simple distributed training job')
    parser.add_argument(
        "--epochs",
        type=int,
        default=10,
        metavar="N",
        help="number of epochs to train (default: 10)",
    )
    parser.add_argument(
        "--model",
        type=str,
        help="torchvision model name",
        required=True,
    )
    # Let the shared dataloader helper register its own options.
    dataloader_arguments(parser)
    return parser


def main():
    """Train a torchvision model on HPU with Lightning while being observed.

    Reads ``RANK``, ``WORLD_SIZE`` and ``LOCAL_WORLD_SIZE`` from the
    environment, parses the command line, wraps the requested torchvision
    model in ``TorchvisionLightning``, and fits it through a Lightning
    ``Trainer`` using the observed ImageNet dataloader. Prints
    ``finished:  <rank>`` once fitting has returned.
    """
    rank = int(os.getenv("RANK", 0))
    world_size = int(os.getenv("WORLD_SIZE", 1))
    local_world_size = int(os.getenv("LOCAL_WORLD_SIZE", 1))

    args = _build_parser().parse_args()
    # Look the constructor up by name in torchvision.models and call it
    # without arguments.
    backbone = getattr(torchvision_models, args.model)()

    import torchcompat.core as accelerator

    # Devices per node come from the local world size (the
    # accelerator.device_count() alternative is left disabled); the node
    # count is the global world size divided by the local one.
    devices_per_node = local_world_size
    node_count = world_size // local_world_size

    module = TorchvisionLightning(backbone)

    accelerator.set_enable_tf32(True)

    observer, monitor = prepare_voir()
    batches = imagenet_dataloader(args, module, rank, world_size)
    loader = observer.loader(batches)

    # An explicit HPU strategy is passed because the default led to:
    # ValueError: The `HPUAccelerator` can only be used with a `SingleHPUStrategy` or `HPUParallelStrategy`, found DDPStrategy.
    # NOTE(review): the Trainer comes from `lightning` while the strategy is
    # imported from `pytorch_lightning` -- confirm both packages are
    # compatible in the target environment.
    from pytorch_lightning.strategies import HPUParallelStrategy

    # train model
    trainer_options = {
        "accelerator": "hpu",
        "devices": devices_per_node,
        "num_nodes": node_count,
        "strategy": HPUParallelStrategy(),
        "max_epochs": args.epochs,
        "precision": "bf16-mixed",
        "enable_checkpointing": False,
        "enable_progress_bar": False,
        "reload_dataloaders_every_n_epochs": 1,
        "max_steps": 120,
    }
    trainer = L.Trainer(**trainer_options)

    with monitor(poll_interval=0.1):
        trainer.fit(model=module, train_dataloaders=loader)
    print("finished: ", rank)
|
||
|
||
# Run the training job only when this file is executed as a script.
if __name__ == "__main__":
    main()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters