Skip to content

Commit

Permalink
Improve Slurm support
Browse files Browse the repository at this point in the history
Signed-off-by: Geoffroy Vallee <[email protected]>
  • Loading branch information
gvallee committed Jun 16, 2021
1 parent a7e337d commit dd1fdbf
Show file tree
Hide file tree
Showing 2 changed files with 32 additions and 5 deletions.
33 changes: 28 additions & 5 deletions pkg/jm/jobmgr_slurm.go
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ import (
"path/filepath"
"strconv"
"strings"
"time"

"github.com/gvallee/go_exec/pkg/advexec"
"github.com/gvallee/go_hpc_jobmgr/internal/pkg/network"
Expand Down Expand Up @@ -183,10 +184,13 @@ func slurmLoad(jobmgr *JM, sysCfg *sys.Config) error {
}

// getJobOutFilenamePrefix builds the file-name prefix for the job j.
//
// It returns an empty string while j.ExecutionTimestamp is unset. Otherwise the
// prefix is "<Name>-<ExecutionTimestamp>"; when j.MPICfg is set and its
// implementation ID is non-empty, "-<ID><Version>" is appended.
func getJobOutFilenamePrefix(j *job.Job) string {
	// Without a timestamp there is no prefix to build.
	if j.ExecutionTimestamp == "" {
		return ""
	}

	prefix := j.Name + "-" + j.ExecutionTimestamp
	if j.MPICfg != nil && j.MPICfg.Implem.ID != "" {
		// NOTE(review): ID and Version are joined without a "-" separator,
		// unlike the other components of the prefix — confirm this is intended.
		prefix += "-" + j.MPICfg.Implem.ID + j.MPICfg.Implem.Version
	}
	return prefix
}

func getJobOutputFilePath(j *job.Job, sysCfg *sys.Config) string {
Expand All @@ -203,7 +207,7 @@ func generateBatchScriptContent(j *job.Job, sysCfg *sys.Config) (string, error)
return "", fmt.Errorf("batch script path is undefined")
}

scriptText := "#!/bin/bash\n#\n"
scriptText := "#!/bin/bash -l\n#\n"
if j.Partition != "" {
scriptText += slurm.ScriptCmdPrefix + " -p " + j.Partition + "\n"
}
Expand All @@ -220,6 +224,8 @@ func generateBatchScriptContent(j *job.Job, sysCfg *sys.Config) (string, error)
}
*/

now := time.Now()
j.ExecutionTimestamp = string(now.Format("060102150405"))
scriptText += slurm.ScriptCmdPrefix + " --error=" + getJobErrorFilePath(j, sysCfg) + "\n"
scriptText += slurm.ScriptCmdPrefix + " --output=" + getJobOutputFilePath(j, sysCfg) + "\n"
scriptText += "\n"
Expand All @@ -228,6 +234,12 @@ func generateBatchScriptContent(j *job.Job, sysCfg *sys.Config) (string, error)
scriptText += "\nmodule purge\nmodule load " + strings.Join(j.RequiredModules, " ") + "\n"
}

if j.CustomEnv != nil {
for envvar, val := range j.CustomEnv {
scriptText += fmt.Sprintf("export %s=%s\n", envvar, val)
}
}

return scriptText, nil
}

Expand All @@ -240,6 +252,12 @@ func setupMpiJob(j *job.Job, sysCfg *sys.Config) error {
netCfg := new(network.Config)
netCfg.Device = j.Device

if j.CustomEnv != nil {
for envvar, val := range j.CustomEnv {
scriptText += fmt.Sprintf("export %s=%s\n", envvar, val)
}
}

// Add the mpirun command
if j.MPICfg != nil && len(j.RequiredModules) == 0 {
scriptText += "\nMPI_DIR=" + j.MPICfg.Implem.InstallDir + "\n"
Expand All @@ -251,6 +269,8 @@ func setupMpiJob(j *job.Job, sysCfg *sys.Config) error {
return fmt.Errorf("unable to get mpirun arguments: %s", err)
}

scriptText += "\nwhich mpirun\n"

scriptText += "\nmpirun "
if j.NP > 0 {
scriptText += fmt.Sprintf("-np %d ", j.NP)
Expand Down Expand Up @@ -383,8 +403,11 @@ func slurmSubmit(j *job.Job, jobmgr *JM, sysCfg *sys.Config) advexec.Result {
jobmgr.CmdArgs = append(jobmgr.CmdArgs, "-W")
}

cmd.CmdArgs = append(cmd.CmdArgs, jobmgr.CmdArgs...)
cmd.CmdArgs = append(cmd.CmdArgs, j.BatchScript)
if len(jobmgr.CmdArgs) > 0 {
cmd.CmdArgs = append(cmd.CmdArgs, jobmgr.CmdArgs...)
}
//cmd.CmdArgs = append(cmd.CmdArgs, j.BatchScript)
cmd.CmdArgs = []string{j.BatchScript}

j.SetOutputFn(slurmGetOutput)
j.SetErrorFn(slurmGetError)
Expand Down
4 changes: 4 additions & 0 deletions pkg/job/job.go
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,10 @@ type Job struct {
RequiredModules []string

NonBlocking bool

CustomEnv map[string]string

ExecutionTimestamp string
}

// GetOutput is the function to call to gather the output (stdout) of the application after execution of the job
Expand Down

0 comments on commit dd1fdbf

Please sign in to comment.