From 552f2605fe1575ff31185a9c135e927cf0ecc712 Mon Sep 17 00:00:00 2001 From: Geoffroy Vallee Date: Mon, 28 Jun 2021 13:09:31 -0700 Subject: [PATCH] Create predictable and unique job script names Signed-off-by: Geoffroy Vallee --- go.mod | 4 ++-- go.sum | 7 ++++--- internal/pkg/openmpi/openmpi.go | 9 +++++++-- pkg/jm/jm.go | 6 ++---- pkg/jm/jobmgr_slurm.go | 16 ++++++++++------ pkg/job/job.go | 9 +++++++++ 6 files changed, 34 insertions(+), 17 deletions(-) diff --git a/go.mod b/go.mod index cddaf08..13d6b0d 100644 --- a/go.mod +++ b/go.mod @@ -3,6 +3,6 @@ module github.com/gvallee/go_hpc_jobmgr go 1.13 require ( - github.com/gvallee/go_exec v1.0.0 - github.com/gvallee/go_util v1.1.0 + github.com/gvallee/go_exec v1.1.0 + github.com/gvallee/go_util v1.5.0 ) diff --git a/go.sum b/go.sum index 9cb9a8b..f7aef86 100644 --- a/go.sum +++ b/go.sum @@ -1,4 +1,5 @@ -github.com/gvallee/go_exec v1.0.0 h1:43nuopP/x1+34yeZX0Z/tv7eI68SgLDwHygQMwVtAkE= -github.com/gvallee/go_exec v1.0.0/go.mod h1:s9fSkVniLYlQUZb99zuioVH45nkvnYxw6vmGEaZCUPk= -github.com/gvallee/go_util v1.1.0 h1:qSwjCPTejJ8zbhFmpJ3WQtzpNOHWQwc4ijFsTbCJK7s= +github.com/gvallee/go_exec v1.1.0 h1:3xhwLIcxY4VNGGPL5YZYxnWeTgLYU0PSOEMcO0gfi+Q= +github.com/gvallee/go_exec v1.1.0/go.mod h1:s9fSkVniLYlQUZb99zuioVH45nkvnYxw6vmGEaZCUPk= github.com/gvallee/go_util v1.1.0/go.mod h1:fTexpwdH/n05Ziu0TXJIQsr7E+46QpBxNdeOOsyC0/s= +github.com/gvallee/go_util v1.5.0 h1:xxAQR2v6csFQdMX18dt9J0DATIUvkBV1zu2VW3yU3wo= +github.com/gvallee/go_util v1.5.0/go.mod h1:rhmrHriih4is1E3KbQUyn+o8J6wrT6j2pLfAsugaJMY= diff --git a/internal/pkg/openmpi/openmpi.go b/internal/pkg/openmpi/openmpi.go index 0a324ac..bfec797 100644 --- a/internal/pkg/openmpi/openmpi.go +++ b/internal/pkg/openmpi/openmpi.go @@ -72,8 +72,13 @@ func DetectFromDir(dir string) (string, string, error) { versionCmd.Env = []string{"LD_LIBRARY_PATH=" + newLDPath, "PATH=" + newPath} res := versionCmd.Run() if res.Err != nil { - log.Printf("unable to run ompi_info: %s; stdout: %s; 
stderr: %s", res.Err, res.Stdout, res.Stderr) - return "", "", res.Err + // If it fails we try with OPAL_PREFIX set + versionCmd.Env = append(versionCmd.Env, "OPAL_PREFIX="+dir) + res = versionCmd.Run() + if res.Err != nil { + log.Printf("unable to run ompi_info: %s; stdout: %s; stderr: %s", res.Err, res.Stdout, res.Stderr) + return "", "", res.Err + } } version, err := parseOmpiInfoOutputForVersion(res.Stdout) if err != nil { diff --git a/pkg/jm/jm.go b/pkg/jm/jm.go index 82e9eec..219c544 100644 --- a/pkg/jm/jm.go +++ b/pkg/jm/jm.go @@ -168,10 +168,8 @@ func getBatchScriptPath(j *job.Job, sysCfg *sys.Config, batchScriptFilenamePrefi // TempFile creates a temporary file that is used to store a batch script func TempFile(j *job.Job, sysCfg *sys.Config) error { - filePrefix := "sbatch-" - if j.Partition != "" { - filePrefix += j.Partition + "-" - } + j.SetTimestamp() + filePrefix := "sbatch-" + j.ExecutionTimestamp + "-" filePrefix += j.Name var err error j.BatchScript, err = getBatchScriptPath(j, sysCfg, filePrefix) diff --git a/pkg/jm/jobmgr_slurm.go b/pkg/jm/jobmgr_slurm.go index 98073e6..ec8c375 100644 --- a/pkg/jm/jobmgr_slurm.go +++ b/pkg/jm/jobmgr_slurm.go @@ -14,7 +14,6 @@ import ( "path/filepath" "strconv" "strings" - "time" "github.com/gvallee/go_exec/pkg/advexec" "github.com/gvallee/go_hpc_jobmgr/internal/pkg/network" @@ -216,7 +215,11 @@ func generateBatchScriptContent(j *job.Job, sysCfg *sys.Config) (string, error) scriptText += slurm.ScriptCmdPrefix + " -N " + strconv.Itoa(j.NNodes) + "\n" } - scriptText += slurm.ScriptCmdPrefix + " -t 0:30:0\n" + if j.MaxExecTime == "" { + scriptText += slurm.ScriptCmdPrefix + " -t 0:30:0\n" + } else { + scriptText += slurm.ScriptCmdPrefix + " -t " + j.MaxExecTime + "\n" + } /* if j.NP > 0 { @@ -224,8 +227,7 @@ func generateBatchScriptContent(j *job.Job, sysCfg *sys.Config) (string, error) } */ - now := time.Now() - j.ExecutionTimestamp = string(now.Format("060102150405")) + j.SetTimestamp() scriptText += 
slurm.ScriptCmdPrefix + " --error=" + getJobErrorFilePath(j, sysCfg) + "\n" scriptText += slurm.ScriptCmdPrefix + " --output=" + getJobOutputFilePath(j, sysCfg) + "\n" scriptText += "\n" @@ -322,8 +324,10 @@ func generateJobScript(j *job.Job, sysCfg *sys.Config) error { return fmt.Errorf("undefined scratch directory") } - if j.App.BinPath == "" { - return fmt.Errorf("application binary is undefined") + // If we know nothing about the app and there is no batch script to use, we do + // not know how to launch the application + if j.App.BinPath == "" && j.BatchScript == "" { + return fmt.Errorf("application binary and batch script are undefined") } // Create the batch script if the user did not specify a batch script to use. diff --git a/pkg/job/job.go b/pkg/job/job.go index e628eaa..7f68b1d 100644 --- a/pkg/job/job.go +++ b/pkg/job/job.go @@ -12,6 +12,7 @@ import ( "github.com/gvallee/go_hpc_jobmgr/pkg/app" "github.com/gvallee/go_hpc_jobmgr/pkg/mpi" "github.com/gvallee/go_hpc_jobmgr/pkg/sys" + "github.com/gvallee/go_util/pkg/timestamp" ) // CleanUpFn is a "function pointer" to call to clean up the system after the completion of a job @@ -81,6 +82,8 @@ type Job struct { CustomEnv map[string]string ExecutionTimestamp string + + MaxExecTime string } // GetOutput is the function to call to gather the output (stdout) of the application after execution of the job @@ -102,3 +105,9 @@ func (j *Job) SetOutputFn(fn GetOutputFn) { func (j *Job) SetErrorFn(fn GetErrorFn) { j.internalGetError = fn } + +func (j *Job) SetTimestamp() { + if j.ExecutionTimestamp == "" { + j.ExecutionTimestamp = timestamp.Now() + } +}