Skip to content

Commit

Permalink
Allow starting environments in single (serial) mode without mpirun if required. Par…
Browse files Browse the repository at this point in the history
…ameter added to configuration file.
  • Loading branch information
m-kurz committed Aug 4, 2022
1 parent 3380227 commit e2c50b5
Show file tree
Hide file tree
Showing 4 changed files with 42 additions and 21 deletions.
3 changes: 2 additions & 1 deletion examples/HIT_24_DOF/prm.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ environment:
mesh_file: ./simulation_files/CART_HEX_PERIODIC_004_mesh.h5
#local_dir: /var/tmp
mpi_launch_mpmd: False
env_launcher: local

# All parameters for setting up the reward
reward:
Expand All @@ -28,7 +29,7 @@ reward:
training:
train_num_epochs: 5
train_learning_rate: 1.e-4
train_num_iterations: 10
train_num_iterations: 1
train_buffer_capacity: 2000
train_files:
- ./simulation_files/run_f200_N5_4Elems_State_0000003.000000000.h5
Expand Down
11 changes: 6 additions & 5 deletions examples/HIT_32_DOF/prm.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ environment:
mesh_file: ./simulation_files/CART_HEX_PERIODIC_004_mesh.h5
#local_dir: /var/tmp
mpi_launch_mpmd: False
env_launcher: local

# All parameters for setting up the reward
reward:
Expand All @@ -31,10 +32,10 @@ training:
train_num_iterations: 10
train_buffer_capacity: 2000
train_files:
- ./simulation_files/N7_4Elems/run_f200_N7_4Elems_State_0000003.000000000.h5
- ./simulation_files/N7_4Elems/run_f200_N7_4Elems_State_0000004.000000000.h5
- ./simulation_files/N7_4Elems/run_f200_N7_4Elems_State_0000005.000000000.h5
- ./simulation_files/N7_4Elems/run_f200_N7_4Elems_State_0000006.000000000.h5
- ./simulation_files/run_f200_N7_4Elems_State_0000003.000000000.h5
- ./simulation_files/run_f200_N7_4Elems_State_0000004.000000000.h5
- ./simulation_files/run_f200_N7_4Elems_State_0000005.000000000.h5
- ./simulation_files/run_f200_N7_4Elems_State_0000006.000000000.h5

# All parameters for the agent
agent:
Expand All @@ -53,7 +54,7 @@ evaluation:
eval_num_episodes: 1
eval_interval: 10
eval_files:
- ./simulation_files/N7_4Elems/run_f200_N7_4Elems_State_0000008.000000000.h5
- ./simulation_files/run_f200_N7_4Elems_State_0000008.000000000.h5

# Performance
performance:
Expand Down
44 changes: 30 additions & 14 deletions src/flexiEnvSmartSim.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@

from smartsim import Experiment
from smartsim.database import PBSOrchestrator,Orchestrator
from smartsim.settings import MpirunSettings
from smartsim.settings import MpirunSettings,RunSettings

from smartredis import Client

Expand Down Expand Up @@ -67,6 +67,7 @@ def __init__( self
,hosts = None
,rankfiles = None
,mpi_launch_mpmd = False
,env_launcher = 'mpirun'
):
"""Initialize TF and FLEXI specific properties"""

Expand All @@ -83,7 +84,18 @@ def __init__( self
self.flexi_path = flexi_path
self.hosts = hosts
self.rankfiles = rankfiles
self.mpi_launch_mpmd = mpi_launch_mpmd

# Sanity Check Launcher
self.env_launcher = env_launcher
if ((self.env_launcher == 'local') and (n_procs != 1)):
printWarning("For env_launcher 'local', only single execution is allowed! Setting 'n_procs=1'!")
printWarning("To run evironments in parallel with MPI, use env_launcher='mpi'!")
n_procs = 1

if (self.env_launcher == 'mpirun'):
self.mpi_launch_mpmd = mpi_launch_mpmd
else:
self.mpi_launch_mpmd = False

# Save list of restart files
self.random_restart_file = random_restart_file
Expand Down Expand Up @@ -216,21 +228,25 @@ def _start_flexi(self,exp,n_procs,n_envs):
args.append('--tag')
args.append(self.tag[i])

run_args = {"rankfile" : self.rankfiles[i] ,"report-bindings" : ""}
mpi = MpirunSettings(exe=self.flexi_path, exe_args=args, run_args=run_args)
mpi.set_tasks(n_procs)
mpi.set_hostlist(self.hosts[hosts_per_flexi[i,0]:hosts_per_flexi[i,1]+1])
if (self.env_launcher == 'mpirun'):
run_args = {"rankfile" : self.rankfiles[i] ,"report-bindings" : ""}
run = MpirunSettings(exe=self.flexi_path, exe_args=args, run_args=run_args)
run.set_tasks(n_procs)
run.set_hostlist(self.hosts[hosts_per_flexi[i,0]:hosts_per_flexi[i,1]+1])

# Create MPMD Settings and start later in single command
if self.mpi_launch_mpmd:
if i==0:
f_mpmd = mpi
else:
f_mpmd.make_mpmd(mpi)
# Create MPMD Settings and start later in single command
if self.mpi_launch_mpmd:
if i==0:
f_mpmd = run
else:
f_mpmd.make_mpmd(run)

else: # Otherwise do not use launcher
run = RunSettings(exe=self.flexi_path, exe_args=args)

# Create and directly start FLEXI instances
else:
flexi_instance = exp.create_model(self.tag[i]+"flexi", mpi)
if not self.mpi_launch_mpmd:
flexi_instance = exp.create_model(self.tag[i]+"flexi", run)
exp.start(flexi_instance, block=False,summary=False)
flexi.append(flexi_instance)

Expand Down
5 changes: 4 additions & 1 deletion src/train.py
Original file line number Diff line number Diff line change
Expand Up @@ -361,6 +361,7 @@ def train( config_file
,smartsim_num_dbs = 1
,smartsim_launcher = "local"
,smartsim_orchestrator = "local"
,env_launcher = "mpirun"
,mpi_launch_mpmd = False
,local_dir = None
,n_procs_per_node=128 # Hawk
Expand Down Expand Up @@ -483,6 +484,7 @@ def train( config_file
,reward_scale = reward_scale
,restart_files = train_files
,rankfiles = rank_files
,env_launcher = env_launcher
,mpi_launch_mpmd = mpi_launch_mpmd
,debug = debug
))
Expand All @@ -509,8 +511,9 @@ def train( config_file
,reward_scale = reward_scale
,restart_files = eval_files
,random_restart_file = False
,debug = debug
,rankfiles = rank_files
,env_launcher = env_launcher
,debug = debug
))


Expand Down

0 comments on commit e2c50b5

Please sign in to comment.