Skip to content

Commit

Permalink
Try to bypass HPU hangs
Browse files Browse the repository at this point in the history
  • Loading branch information
Pierre Delaunay committed Jul 2, 2024
1 parent d669feb commit 2c27dfa
Show file tree
Hide file tree
Showing 2 changed files with 13 additions and 21 deletions.
10 changes: 0 additions & 10 deletions benchmarks/accelerate_opt/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -126,16 +126,6 @@ class CustomInitProcessGroupKwargs(InitProcessGroupKwargs):
world_size=int(os.environ["WORLD_SIZE"]),
)

# Accelerator gives us no way to make it use hccl.
# We can bypass Accelerator's logic by initializing the process group ourselves.
if acc.device_type == "hpu":
acc.init_process_group(
init_method=f"tcp://{MASTER_ADDR}:{MASTER_PORT}",
timeout=timedelta(seconds=60),
rank=int(os.environ["RANK"]),
world_size=int(os.environ["WORLD_SIZE"]),
)

accelerator = Accelerator(kwargs_handlers=[init_process_group_kwargs])
else:
accelerator = Accelerator()
Expand Down
24 changes: 13 additions & 11 deletions milabench/commands/executors.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,16 +69,18 @@ async def execute_command(

fut = execute(pack, *argv, **{**_kwargs, **kwargs})
coro.append(fut)
warden.extend(pack.processes)

if timeout:
delay = pack.config.get("max_duration", timeout_delay)
timeout_task = asyncio.create_task(force_terminate(pack, delay))
timeout_tasks.append(timeout_task)

results = await asyncio.gather(*coro)
warden.add_process(*pack.processes)

if timeout:
for task in timeout_tasks:
task.cancel()
return results
delay = pack.config.get("max_duration", timeout_delay)

try:
async with asyncio.timeout(delay):
return await asyncio.gather(*coro)

except TimeoutError:
await force_terminate(pack, delay)
return [-1 for _ in coro]

return await asyncio.gather(*coro)

0 comments on commit 2c27dfa

Please sign in to comment.