diff --git a/README.md b/README.md
index e3e1214..ff3327e 100644
--- a/README.md
+++ b/README.md
@@ -8,7 +8,6 @@ Fluence enables HPC-grade pod scheduling in Kubernetes via the [Kubernetes Sched
 
 ## TODO
 
-- On init, need to load in resource graph that accounts for running stuff
 - Need to allow for restart / crashes and looking up existing jobid, updating maps in PodGroup
 - Since AskFlux is done on level of pod group, refactor function to account for specific resources of all pods (not just one pod)
 - Figure out if EventsToRegister replaces old informer
diff --git a/hack/quick-build-gke.sh b/hack/quick-build-gke.sh
new file mode 100755
index 0000000..875360a
--- /dev/null
+++ b/hack/quick-build-gke.sh
@@ -0,0 +1,33 @@
+#!/bin/bash
+
+# Before running this, you should:
+# 1. Create the GKE cluster (needs more than one node, fluence does not schedule to the control plane)
+# 2. Install cert-manager
+# 3. Customize the script to point to your registry if you intend to push
+
+REGISTRY="${1:-ghcr.io/vsoch}"
+HERE=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
+ROOT=$(dirname ${HERE})
+
+# Go to the script directory
+cd ${ROOT}
+
+# This builds each of the images. The sidecar is separate from the other two in src/
+make REGISTRY=${REGISTRY} SCHEDULER_IMAGE=fluence SIDECAR_IMAGE=fluence-sidecar CONTROLLER_IMAGE=fluence-controller
+
+# This is what it might look like to push
+# docker push ghcr.io/vsoch/fluence-sidecar && docker push ghcr.io/vsoch/fluence-controller && docker push ghcr.io/vsoch/fluence:latest
+
+# Push the images to the registry so the cluster can pull them
+docker push ${REGISTRY}/fluence-sidecar:latest
+docker push ${REGISTRY}/fluence-controller:latest
+docker push ${REGISTRY}/fluence:latest
+
+# And then install using the charts
+cd ${ROOT}/upstream/manifests/install/charts
+helm uninstall fluence || true
+helm install \
+  --set scheduler.image=${REGISTRY}/fluence:latest \
+  --set controller.image=${REGISTRY}/fluence-controller:latest \
+  --set scheduler.sidecarimage=${REGISTRY}/fluence-sidecar:latest \
+  fluence as-a-second-scheduler/
diff --git a/hack/quick-build.sh b/hack/quick-build.sh
index b3ccefe..23a5c87 100755
--- a/hack/quick-build.sh
+++ b/hack/quick-build.sh
@@ -33,4 +33,4 @@ helm install \
   --set controller.pullPolicy=Never \
   --set controller.image=${REGISTRY}/fluence-controller:latest \
   --set scheduler.sidecarimage=${REGISTRY}/fluence-sidecar:latest \
-  fluence as-a-second-scheduler/
\ No newline at end of file
+  fluence as-a-second-scheduler/
diff --git a/sig-scheduler-plugins/pkg/fluence/core/core.go b/sig-scheduler-plugins/pkg/fluence/core/core.go
index 8b08468..1e75814 100644
--- a/sig-scheduler-plugins/pkg/fluence/core/core.go
+++ b/sig-scheduler-plugins/pkg/fluence/core/core.go
@@ -71,7 +71,7 @@ type PodGroupManager struct {
 	scheduleTimeout *time.Duration
 	// permittedPG stores the podgroup name which has passed the pre resource check.
 	permittedPG *gochache.Cache
-	// backedOffPG stores the podgorup name which failed scheudling recently.
+	// backedOffPG stores the podgroup name which failed scheduling recently.
 	backedOffPG *gochache.Cache
 	// podLister is pod lister
 	podLister listerv1.PodLister
@@ -111,12 +111,25 @@ func NewPodGroupManager(
 }
 
 // GetStatuses string (of all pods) to show for debugging purposes
-func (pgMgr *PodGroupManager) GetStatuses(pods []*corev1.Pod) string {
+// Since we loop here, we also determine if the first pod is the one
+// we are considering
+func (pgMgr *PodGroupManager) GetStatusesAndIndex(
+	pods []*corev1.Pod,
+	pod *corev1.Pod,
+) (string, bool, int) {
 	statuses := ""
-	for _, pod := range pods {
-		statuses += " " + fmt.Sprintf("%s", pod.Status.Phase)
+
+	// We need to distinguish 0 from the default and not finding anything
+	foundIndex := false
+	index := 0
+	for i, p := range pods {
+		if p.Name == pod.Name {
+			foundIndex = true
+			index = i
+		}
+		statuses += " " + fmt.Sprintf("%s", p.Status.Phase)
 	}
-	return statuses
+	return statuses, foundIndex, index
 }
 
 // GetPodNode is a quick lookup to see if we have a node
@@ -153,8 +166,10 @@ func (pgMgr *PodGroupManager) PreFilter(
 		return fmt.Errorf("podLister list pods failed: %w", err)
 	}
 
+	// Only allow scheduling the first in the group so the others come after
+
 	// Get statuses to show for debugging
-	statuses := pgMgr.GetStatuses(pods)
+	statuses, found, idx := pgMgr.GetStatusesAndIndex(pods, pod)
 
 	// This shows us the number of pods we have in the set and their states
 	pgMgr.log.Info("[PodGroup PreFilter] group: %s pods: %s MinMember: %d Size: %d", pgFullName, statuses, pg.Spec.MinMember, len(pods))
@@ -163,6 +178,18 @@
 		"current pods number: %v, minMember of group: %v", pod.Name, len(pods), pg.Spec.MinMember)
 	}
 
+	if !found {
+		return fmt.Errorf("pod %s was not found in group - this should not happen", pod.Name)
+	}
+
+	// We will only AskFlux for the first pod in the group.
+	// This assumes the order listed is the order in the queue, which may not
+	// hold in practice. This is the one case with retry, and the design
+	// likely needs more thought and work.
+	if idx != 0 {
+		return fmt.Errorf("pod %s is not first in the list, will wait to schedule", pod.Name)
+	}
+
 	// TODO we likely can take advantage of these resources or other custom
 	// attributes we add. For now ignore and calculate based on pod needs (above)
 	// if pg.Spec.MinResources == nil {
diff --git a/sig-scheduler-plugins/pkg/logger/logger.go b/sig-scheduler-plugins/pkg/logger/logger.go
index 522be61..053021a 100644
--- a/sig-scheduler-plugins/pkg/logger/logger.go
+++ b/sig-scheduler-plugins/pkg/logger/logger.go
@@ -19,7 +19,6 @@ const (
 	LevelDebug
 )
 
-// TODO try saving state here when we can close
 type DebugLogger struct {
 	level    int
 	Filename string
@@ -28,7 +27,7 @@ func NewDebugLogger(level int, filename string) *DebugLogger {
 	return &DebugLogger{
-		level:    LevelNone,
+		level:    level,
 		Filename: filename,
 	}
 }
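
The PreFilter change above gates a pod group on its first listed pod: collect the phases of every pod in the group once, find the candidate pod's index in that listing, admit only index 0, and return an error for every other pod so the scheduler retries it after the first pod has been admitted. Below is a minimal, self-contained Go sketch of that pattern; the pod type, names, and helper functions are illustrative stand-ins, not the plugin's real types.

package main

import "fmt"

// pod is a stand-in for *corev1.Pod with just the fields the sketch needs.
type pod struct {
	Name  string
	Phase string
}

// statusesAndIndex mirrors the shape of GetStatusesAndIndex: it gathers the
// phases of every pod in the group and reports whether and where the
// candidate pod appears in the listed order.
func statusesAndIndex(pods []pod, candidate string) (string, bool, int) {
	statuses := ""
	found := false
	index := 0
	for i, p := range pods {
		if p.Name == candidate {
			found = true
			index = i
		}
		statuses += " " + p.Phase
	}
	return statuses, found, index
}

// admitFirstOnly lets only the first listed pod of a group proceed; any other
// pod gets an error so the scheduler retries it after the first is admitted.
func admitFirstOnly(pods []pod, candidate string) error {
	statuses, found, idx := statusesAndIndex(pods, candidate)
	fmt.Printf("group statuses:%s\n", statuses)
	if !found {
		return fmt.Errorf("pod %s was not found in group", candidate)
	}
	if idx != 0 {
		return fmt.Errorf("pod %s is not first in the list, will wait to schedule", candidate)
	}
	return nil
}

func main() {
	group := []pod{{Name: "job-0", Phase: "Pending"}, {Name: "job-1", Phase: "Pending"}}
	fmt.Println(admitFirstOnly(group, "job-0")) // <nil>: first pod is admitted
	fmt.Println(admitFirstOnly(group, "job-1")) // error: waits for job-0
}

Rejecting every pod other than the first relies on the scheduler retrying those pods later, and on the listed order matching queue order, which is exactly the assumption the in-tree comment flags as needing more thought.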