diff --git a/benchmarks/llama/main.py b/benchmarks/llama/main.py index e53d6c760..fb347ada9 100755 --- a/benchmarks/llama/main.py +++ b/benchmarks/llama/main.py @@ -231,4 +231,8 @@ def main(): if __name__ == "__main__": - main() + try: + main() + except Exception as err: + # Habana likes to eat exceptions + print(err) \ No newline at end of file diff --git a/benchmarks/torchvision/main.py b/benchmarks/torchvision/main.py index 7446e5276..5bbbea806 100644 --- a/benchmarks/torchvision/main.py +++ b/benchmarks/torchvision/main.py @@ -71,8 +71,10 @@ def train_epoch(model, criterion, optimizer, loader, device, dtype, scaler=None) def toiterator(loader): with timeit("loader"): return iter(loader) - - for inp, target in timeiterator(voir.iterate("train", toiterator(loader), True)): + + iterator = timeiterator(voir.iterate("train", toiterator(loader), True)) + + for inp, target in iterator: with timeit("batch"): inp = inp.to(device, dtype=dtype) @@ -150,7 +152,10 @@ def toiterator(loader): with given() as gv: for epoch in voir.iterate("main", range(args.epochs)): with timeit("epoch"): - for inp, target in timeiterator(voir.iterate("train", toiterator(loader), True)): + + iterator = timeiterator(voir.iterate("train", toiterator(loader), True)) + + for inp, target in iterator: with timeit("batch"): inp = inp.to(device, dtype=dtype) target = target.to(device) @@ -251,7 +256,7 @@ def _main(): if args.iobench: iobench(args) else: - trainbench() + trainbench(args) def trainbench(args): if args.fixed_batch: @@ -277,7 +282,7 @@ def trainbench(args): optimizer = torch.optim.SGD(model.parameters(), args.lr) - model, optimizer = accelerator.optimizer(model, optimizer=optimizer, dtype=float_dtype(args.precision)) + model, optimizer = accelerator.optimize(model, optimizer=optimizer, dtype=float_dtype(args.precision)) if args.data: train_loader = dataloader(args) diff --git a/benchmarks/torchvision_ddp/main.py b/benchmarks/torchvision_ddp/main.py index 101d6c929..117cfe083 100755 --- a/benchmarks/torchvision_ddp/main.py +++ b/benchmarks/torchvision_ddp/main.py @@ -167,13 +167,16 @@ def image_transforms(): return data_transforms def prepare_dataloader(dataset: Dataset, args): + dsampler = DistributedSampler(dataset) + # next(iter(dsampler)) + return DataLoader( dataset, batch_size=args.batch_size, num_workers=args.num_workers if not args.noio else 0, pin_memory=not args.noio, shuffle=False, - sampler=DistributedSampler(dataset) + sampler=dsampler ) class FakeDataset: diff --git a/config/base.yaml b/config/base.yaml index cda4b0f94..5982a1cd7 100644 --- a/config/base.yaml +++ b/config/base.yaml @@ -97,6 +97,7 @@ _timm: --amp-dtype: bfloat16 ## FIXME --device: hpu + --dist-backend: hccl _sb3: inherits: _defaults