Skip to content

Commit

Permalink
Add basic support for Intel-Slurm (not fully supported) + fix unit tests
Browse files Browse the repository at this point in the history
Signed-off-by: Geoffroy Vallee <[email protected]>
  • Loading branch information
gvallee committed Aug 16, 2021
1 parent b91e791 commit bf3341a
Show file tree
Hide file tree
Showing 4 changed files with 127 additions and 1 deletion.
3 changes: 3 additions & 0 deletions pkg/jm/jm.go
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,9 @@ const (
// SlurmID is the value set to JM.ID when Slurm shall be used to submit a job
SlurmID = "slurm"

// IntelSlurmID is the value set to JM.ID when Intel-Slurm shall be used to submit a job
IntelSlurmID = "intel-slurm"

// PrunID is the value set to JM.ID when prun shall be used to submit a job
PrunID = "prun"
)
Expand Down
117 changes: 117 additions & 0 deletions pkg/jm/jm_intel_slurm.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,117 @@
// Copyright (c) 2019, Sylabs Inc. All rights reserved.
// Copyright (c) 2020-2021, NVIDIA CORPORATION. All rights reserved.
// This software is licensed under a 3-clause BSD license. Please consult the
// LICENSE.md file distributed with the sources of this project regarding your
// rights to use or distribute this software.

package jm

import (
"fmt"
"log"
"os/exec"
"strconv"
"strings"

"github.com/gvallee/go_exec/pkg/advexec"
"github.com/gvallee/go_hpc_jobmgr/pkg/job"
"github.com/gvallee/go_hpc_jobmgr/pkg/sys"
"github.com/gvallee/go_util/pkg/util"
)

// IntelSlurmDetect probes the current platform for Intel-Slurm support on behalf of the
// job management framework.
//
// Detection requires both the "bsub" and the "squeue" commands to be found in PATH. When
// both are available, the returned JM has its ID set to IntelSlurmID, its BinPath set to
// the resolved "bsub" path, and its internal callbacks wired to the Intel-Slurm submit/load
// functions and to the generic Slurm status, job-count and post-run functions. Otherwise
// false is returned together with a JM that must not be used.
//
// NOTE(review): "bsub" is the command looked up here as the submission binary; confirm this
// is the submission front-end actually shipped with Intel-Slurm.
func IntelSlurmDetect() (bool, JM) {
	var detected JM

	// The submission binary path is recorded even before we know whether detection succeeds.
	submitBin, lookupErr := exec.LookPath("bsub")
	detected.BinPath = submitBin
	if lookupErr != nil {
		log.Println("* Intel-Slurm not detected")
		return false, detected
	}

	// squeue is only required to be present; its path is not stored.
	if _, lookupErr = exec.LookPath("squeue"); lookupErr != nil {
		log.Println("* Intel-Slurm not detected (no squeue command available)")
		return false, detected
	}

	// Everything needed is available: wire up the callbacks.
	detected.ID = IntelSlurmID
	detected.loadJM = intelSlurmLoad
	detected.submitJM = intelSlurmSubmit
	detected.postRunJM = slurmPostJob
	detected.numJobsJM = slurmGetNumJobs
	detected.jobStatusJM = slurmJobStatus

	return true, detected
}

// intelSlurmLoad is the function called when trying to load the Intel-Slurm JM module.
//
// It is installed as the loadJM callback by IntelSlurmDetect. Nothing needs to be done at
// load time, so both arguments are ignored and the function always returns nil.
func intelSlurmLoad(jobmgr *JM, sysCfg *sys.Config) error {
// jobmgr.BinPath has already been set by IntelSlurmDetect(), there is nothing left to load
return nil
}

// intelSlurmSubmit prepares the batch script necessary to start a given job and submits it
// with the submission binary found during detection (jobmgr.BinPath).
//
// Parameters:
//   - j: the job to submit; its batch script is generated here and its output/error
//     getters are set to the Slurm ones.
//   - jobmgr: the job manager returned by IntelSlurmDetect; it is not modified.
//   - sysCfg: the system configuration; its scratch directory must exist.
//
// The returned advexec.Result has Err set when a sanity check fails, when the batch script
// cannot be generated, or when the job ID printed by the submission command cannot be
// parsed. For blocking jobs (the default) the result of slurmPostJob is returned; for
// non-blocking jobs the raw result of the submission command is returned.
//
// Note that a script does not need any specific environment to be submitted.
func intelSlurmSubmit(j *job.Job, jobmgr *JM, sysCfg *sys.Config) advexec.Result {
	var cmd advexec.Advcmd
	var resExec advexec.Result

	// Sanity checks; each failure is reported with a message naming the actual problem.
	if j == nil {
		resExec.Err = fmt.Errorf("job is undefined")
		return resExec
	}
	if jobmgr == nil {
		resExec.Err = fmt.Errorf("job manager is undefined")
		return resExec
	}
	if sysCfg == nil {
		resExec.Err = fmt.Errorf("system configuration is undefined")
		return resExec
	}
	if !util.FileExists(jobmgr.BinPath) {
		resExec.Err = fmt.Errorf("job submission binary %q does not exist", jobmgr.BinPath)
		return resExec
	}

	err := generateJobScript(j, sysCfg)
	if err != nil {
		resExec.Err = fmt.Errorf("unable to generate Slurm script: %w", err)
		return resExec
	}
	if j.BatchScript == "" {
		resExec.Err = fmt.Errorf("undefined batch script path")
		return resExec
	}

	cmd.BinPath = jobmgr.BinPath
	cmd.ExecDir = j.RunDir
	// Only the batch script is passed to the submission command. jobmgr.CmdArgs is
	// deliberately neither forwarded nor modified: the job manager is shared between
	// submissions, so appending per-job flags to it would make it grow on every call.
	// NOTE(review): no "wait for completion" flag is passed for blocking jobs; confirm which
	// flag, if any, the Intel-Slurm submission binary expects (sbatch's -W may not apply).
	cmd.CmdArgs = []string{j.BatchScript}

	j.SetOutputFn(slurmGetOutput)
	j.SetErrorFn(slurmGetError)

	if !util.PathExists(sysCfg.ScratchDir) {
		resExec.Err = fmt.Errorf("scratch directory does not exist")
		return resExec
	}

	cmdRes := cmd.Run()
	// When the command prints the Slurm job ID banner, extract the numeric job ID from it.
	if strings.HasPrefix(cmdRes.Stdout, slurmJobIDPrefix) {
		jobIDStr := strings.TrimPrefix(cmdRes.Stdout, slurmJobIDPrefix)
		// Tolerate any surrounding whitespace (e.g. "\r\n"), not just a trailing newline.
		jobIDStr = strings.TrimSpace(jobIDStr)
		j.ID, err = strconv.Atoi(jobIDStr)
		if err != nil {
			resExec.Err = fmt.Errorf("unable to get job ID: %w", err)
			return resExec
		}
	}

	// Blocking jobs are post-processed right away; non-blocking ones return the raw result.
	if !j.NonBlocking {
		return slurmPostJob(&cmdRes, j, sysCfg)
	}

	return cmdRes
}
5 changes: 4 additions & 1 deletion pkg/jm/jobmgr_slurm_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,9 @@ func isDateCmdOutput(output string) bool {
}

func setupSlurm(t *testing.T) (JM, job.Job, sys.Config, string) {
if *partition == "" {
t.Skip("no partition specified, skipping...")
}
loaded, jobmgr := SlurmDetect()
if !loaded {
t.Skip("slurm cannot be used on this platform")
Expand Down Expand Up @@ -134,7 +137,7 @@ func TestSlurmSubmitMPI(t *testing.T) {
}
var mpiImplem implem.Info
mpiImplem.InstallDir = *mpiDir
err := mpiImplem.Load()
err := mpiImplem.Load(nil)
if err != nil {
t.Fatalf("unable to detect the MPI implementation in %s: %s", *mpiDir, err)
}
Expand Down
3 changes: 3 additions & 0 deletions pkg/launcher/launcher_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,9 @@ var partition = flag.String("partition", "", "Name of Slurm partition to use to
var scratchDir = flag.String("scratch", "", "Scratch directory to use to execute the test")

func TestSlurmLaunch(t *testing.T) {
if *partition == "" {
t.Skip("partition not defined, skipping...")
}
var j job.Job
var err error
j.App.BinPath, err = exec.LookPath("date")
Expand Down

0 comments on commit bf3341a

Please sign in to comment.