From 2c27dfafaab94dacf92cc7bbcf610d8b4886692c Mon Sep 17 00:00:00 2001 From: Pierre Delaunay Date: Tue, 2 Jul 2024 20:04:26 +0000 Subject: [PATCH] Try to bypass HPU hangs --- benchmarks/accelerate_opt/main.py | 10 ---------- milabench/commands/executors.py | 24 +++++++++++++----------- 2 files changed, 13 insertions(+), 21 deletions(-) diff --git a/benchmarks/accelerate_opt/main.py b/benchmarks/accelerate_opt/main.py index 9c003eda1..244050b4c 100644 --- a/benchmarks/accelerate_opt/main.py +++ b/benchmarks/accelerate_opt/main.py @@ -126,16 +126,6 @@ class CustomInitProcessGroupKwargs(InitProcessGroupKwargs): world_size=int(os.environ["WORLD_SIZE"]), ) - # Accelerator SUCK, it is impossible to make it use hccl - # We can bypass Accelerator logic by initializing the group ourselves - if acc.device_type == "hpu": - acc.init_process_group( - init_method=f"tcp://{MASTER_ADDR}:{MASTER_PORT}", - timeout=timedelta(seconds=60), - rank=int(os.environ["RANK"]), - world_size=int(os.environ["WORLD_SIZE"]), - ) - accelerator = Accelerator(kwargs_handlers=[init_process_group_kwargs]) else: accelerator = Accelerator() diff --git a/milabench/commands/executors.py b/milabench/commands/executors.py index 11d5ddb72..c359e19d0 100644 --- a/milabench/commands/executors.py +++ b/milabench/commands/executors.py @@ -69,16 +69,18 @@ async def execute_command( fut = execute(pack, *argv, **{**_kwargs, **kwargs}) coro.append(fut) - warden.extend(pack.processes) - - if timeout: - delay = pack.config.get("max_duration", timeout_delay) - timeout_task = asyncio.create_task(force_terminate(pack, delay)) - timeout_tasks.append(timeout_task) - - results = await asyncio.gather(*coro) + warden.add_process(*pack.processes) if timeout: - for task in timeout_tasks: - task.cancel() - return results + delay = pack.config.get("max_duration", timeout_delay) + + try: + async with asyncio.timeout(delay): + return await asyncio.gather(*coro) + + except TimeoutError: + await force_terminate(pack, delay) + return [-1 for _ in coro] + + return await asyncio.gather(*coro) +