Skip to content

Commit

Permalink
Create predictable and unique job script names
Browse files Browse the repository at this point in the history
Signed-off-by: Geoffroy Vallee <[email protected]>
  • Loading branch information
gvallee committed Jun 28, 2021
1 parent dd1fdbf commit 552f260
Show file tree
Hide file tree
Showing 6 changed files with 34 additions and 17 deletions.
4 changes: 2 additions & 2 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,6 @@ module github.com/gvallee/go_hpc_jobmgr
go 1.13

require (
github.com/gvallee/go_exec v1.0.0
github.com/gvallee/go_util v1.1.0
github.com/gvallee/go_exec v1.1.0
github.com/gvallee/go_util v1.5.0
)
7 changes: 4 additions & 3 deletions go.sum
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
github.com/gvallee/go_exec v1.0.0 h1:43nuopP/x1+34yeZX0Z/tv7eI68SgLDwHygQMwVtAkE=
github.com/gvallee/go_exec v1.0.0/go.mod h1:s9fSkVniLYlQUZb99zuioVH45nkvnYxw6vmGEaZCUPk=
github.com/gvallee/go_util v1.1.0 h1:qSwjCPTejJ8zbhFmpJ3WQtzpNOHWQwc4ijFsTbCJK7s=
github.com/gvallee/go_exec v1.1.0 h1:3xhwLIcxY4VNGGPL5YZYxnWeTgLYU0PSOEMcO0gfi+Q=
github.com/gvallee/go_exec v1.1.0/go.mod h1:s9fSkVniLYlQUZb99zuioVH45nkvnYxw6vmGEaZCUPk=
github.com/gvallee/go_util v1.1.0/go.mod h1:fTexpwdH/n05Ziu0TXJIQsr7E+46QpBxNdeOOsyC0/s=
github.com/gvallee/go_util v1.5.0 h1:xxAQR2v6csFQdMX18dt9J0DATIUvkBV1zu2VW3yU3wo=
github.com/gvallee/go_util v1.5.0/go.mod h1:rhmrHriih4is1E3KbQUyn+o8J6wrT6j2pLfAsugaJMY=
9 changes: 7 additions & 2 deletions internal/pkg/openmpi/openmpi.go
Original file line number Diff line number Diff line change
Expand Up @@ -72,8 +72,13 @@ func DetectFromDir(dir string) (string, string, error) {
versionCmd.Env = []string{"LD_LIBRARY_PATH=" + newLDPath, "PATH=" + newPath}
res := versionCmd.Run()
if res.Err != nil {
log.Printf("unable to run ompi_info: %s; stdout: %s; stderr: %s", res.Err, res.Stdout, res.Stderr)
return "", "", res.Err
// If it fails we try with OPAL_PREFIX set
versionCmd.Env = append(versionCmd.Env, "OPAL_PREFIX="+dir)
res = versionCmd.Run()
if res.Err != nil {
log.Printf("unable to run ompi_info: %s; stdout: %s; stderr: %s", res.Err, res.Stdout, res.Stderr)
return "", "", res.Err
}
}
version, err := parseOmpiInfoOutputForVersion(res.Stdout)
if err != nil {
Expand Down
6 changes: 2 additions & 4 deletions pkg/jm/jm.go
Original file line number Diff line number Diff line change
Expand Up @@ -168,10 +168,8 @@ func getBatchScriptPath(j *job.Job, sysCfg *sys.Config, batchScriptFilenamePrefi

// TempFile creates a temporary file that is used to store a batch script
func TempFile(j *job.Job, sysCfg *sys.Config) error {
filePrefix := "sbatch-"
if j.Partition != "" {
filePrefix += j.Partition + "-"
}
j.SetTimestamp()
filePrefix := "sbatch-" + j.ExecutionTimestamp + "-"
filePrefix += j.Name
var err error
j.BatchScript, err = getBatchScriptPath(j, sysCfg, filePrefix)
Expand Down
16 changes: 10 additions & 6 deletions pkg/jm/jobmgr_slurm.go
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,6 @@ import (
"path/filepath"
"strconv"
"strings"
"time"

"github.com/gvallee/go_exec/pkg/advexec"
"github.com/gvallee/go_hpc_jobmgr/internal/pkg/network"
Expand Down Expand Up @@ -216,16 +215,19 @@ func generateBatchScriptContent(j *job.Job, sysCfg *sys.Config) (string, error)
scriptText += slurm.ScriptCmdPrefix + " -N " + strconv.Itoa(j.NNodes) + "\n"
}

scriptText += slurm.ScriptCmdPrefix + " -t 0:30:0\n"
if j.MaxExecTime == "" {
scriptText += slurm.ScriptCmdPrefix + " -t 0:30:0\n"
} else {
scriptText += slurm.ScriptCmdPrefix + " -t " + j.MaxExecTime + "\n"
}

/*
if j.NP > 0 {
scriptText += slurm.ScriptCmdPrefix + " --ntasks=" + strconv.Itoa(j.NP) + "\n"
}
*/

now := time.Now()
j.ExecutionTimestamp = string(now.Format("060102150405"))
j.SetTimestamp()
scriptText += slurm.ScriptCmdPrefix + " --error=" + getJobErrorFilePath(j, sysCfg) + "\n"
scriptText += slurm.ScriptCmdPrefix + " --output=" + getJobOutputFilePath(j, sysCfg) + "\n"
scriptText += "\n"
Expand Down Expand Up @@ -322,8 +324,10 @@ func generateJobScript(j *job.Job, sysCfg *sys.Config) error {
return fmt.Errorf("undefined scratch directory")
}

if j.App.BinPath == "" {
return fmt.Errorf("application binary is undefined")
// If we know nothing about the app and there is no batch script to use, we do
// not know how to launch the application
if j.App.BinPath == "" && j.BatchScript == "" {
return fmt.Errorf("application binary and batch script are undefined")
}

// Create the batch script if the user did not specify a batch script to use.
Expand Down
9 changes: 9 additions & 0 deletions pkg/job/job.go
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ import (
"github.com/gvallee/go_hpc_jobmgr/pkg/app"
"github.com/gvallee/go_hpc_jobmgr/pkg/mpi"
"github.com/gvallee/go_hpc_jobmgr/pkg/sys"
"github.com/gvallee/go_util/pkg/timestamp"
)

// CleanUpFn is a "function pointer" to call to clean up the system after the completion of a job
Expand Down Expand Up @@ -81,6 +82,8 @@ type Job struct {
CustomEnv map[string]string

ExecutionTimestamp string

MaxExecTime string
}

// GetOutput is the function to call to gather the output (stdout) of the application after execution of the job
Expand All @@ -102,3 +105,9 @@ func (j *Job) SetOutputFn(fn GetOutputFn) {
func (j *Job) SetErrorFn(fn GetErrorFn) {
// Only record the callback on the job; fn is stored here, not invoked.
j.internalGetError = fn
}

// SetTimestamp fills in the job's ExecutionTimestamp with timestamp.Now() when
// the field is still empty. If a timestamp has already been recorded, the call
// is a no-op and the existing value is preserved, so repeated calls on the
// same job keep a single value.
func (j *Job) SetTimestamp() {
	if j.ExecutionTimestamp != "" {
		// Already stamped: keep the existing value.
		return
	}
	j.ExecutionTimestamp = timestamp.Now()
}

0 comments on commit 552f260

Please sign in to comment.