diff --git a/tpu-provisioner/cmd/main.go b/tpu-provisioner/cmd/main.go
index 896c7bdcc..2a841beb7 100644
--- a/tpu-provisioner/cmd/main.go
+++ b/tpu-provisioner/cmd/main.go
@@ -84,6 +84,8 @@ func main() {
 		// the node to become Ready and for a pending Pod to be scheduled on it.
 		NodeMinLifespan time.Duration `envconfig:"NODE_MIN_LIFESPAN" default:"3m"`
 
+		NodepoolDeletionDelay time.Duration `envconfig:"NODEPOOL_DELETION_DELAY" default:"30s"`
+
 		PodResourceType string `envconfig:"POD_RESOURCE_TYPE" default:"google.com/tpu"`
 
 		Concurrency int `envconfig:"CONCURRENCY" default:"3"`
@@ -229,7 +231,8 @@ func main() {
 		Recorder: mgr.GetEventRecorderFor("tpu-provisioner"),
 		Provider: provider,
 		NodeCriteria: controller.NodeCriteria{
-			MinLifetime: cfg.NodeMinLifespan,
+			MinLifetime:       cfg.NodeMinLifespan,
+			PoolDeletionDelay: cfg.NodepoolDeletionDelay,
 		},
 	}).SetupWithManager(mgr); err != nil {
 		setupLog.Error(err, "unable to create controller", "controller", "DeletionReconciler")
diff --git a/tpu-provisioner/internal/cloud/constants.go b/tpu-provisioner/internal/cloud/common.go
similarity index 61%
rename from tpu-provisioner/internal/cloud/constants.go
rename to tpu-provisioner/internal/cloud/common.go
index 6f1d819e4..f4ac2b6c1 100644
--- a/tpu-provisioner/internal/cloud/constants.go
+++ b/tpu-provisioner/internal/cloud/common.go
@@ -1,5 +1,14 @@
 package cloud
 
+import (
+	"errors"
+	"time"
+
+	corev1 "k8s.io/api/core/v1"
+	"k8s.io/apimachinery/pkg/types"
+	"sigs.k8s.io/controller-runtime/pkg/client"
+)
+
 const (
 	keyPrefix = "google.com/"
 
@@ -23,3 +32,24 @@
 
 	EventNodePoolNotFound = "NodePoolNotFound"
 )
+
+type Provider interface {
+	NodePoolLabelKey() string
+	EnsureNodePoolForPod(*corev1.Pod, string) error
+	DeleteNodePoolForNode(*corev1.Node, string) error
+	DeleteNodePool(string, client.Object, string) error
+	ListNodePools() ([]NodePoolRef, error)
+}
+
+var ErrDuplicateRequest = errors.New("duplicate request")
+
+type NodePoolRef struct {
+	Name string
+
+	CreationTime time.Time
+
+	CreatedForPod types.NamespacedName
+
+	Error   bool
+	Message string
+}
diff --git a/tpu-provisioner/internal/cloud/gke.go b/tpu-provisioner/internal/cloud/gke.go
index 0ab873d06..f1f7fab79 100644
--- a/tpu-provisioner/internal/cloud/gke.go
+++ b/tpu-provisioner/internal/cloud/gke.go
@@ -114,9 +114,9 @@ func (g *GKE) ListNodePools() ([]NodePoolRef, error) {
 
 	for _, np := range resp.NodePools {
 		refs = append(refs, NodePoolRef{
-			Name:     np.Name,
-			Error:    np.Status == "ERROR",
-			ErrorMsg: np.StatusMessage,
+			Name:    np.Name,
+			Error:   np.Status == "ERROR",
+			Message: np.StatusMessage,
 			CreatedForPod: types.NamespacedName{
 				Name:      np.Config.Labels[LabelPodName],
 				Namespace: np.Config.Labels[LabelPodNamespace],
diff --git a/tpu-provisioner/internal/cloud/interface.go b/tpu-provisioner/internal/cloud/interface.go
deleted file mode 100644
index b4359215a..000000000
--- a/tpu-provisioner/internal/cloud/interface.go
+++ /dev/null
@@ -1,31 +0,0 @@
-package cloud
-
-import (
-	"errors"
-	"time"
-
-	corev1 "k8s.io/api/core/v1"
-	"k8s.io/apimachinery/pkg/types"
-	"sigs.k8s.io/controller-runtime/pkg/client"
-)
-
-type Provider interface {
-	NodePoolLabelKey() string
-	EnsureNodePoolForPod(*corev1.Pod, string) error
-	DeleteNodePoolForNode(*corev1.Node, string) error
-	DeleteNodePool(string, client.Object, string) error
-	ListNodePools() ([]NodePoolRef, error)
-}
-
-var ErrDuplicateRequest = errors.New("duplicate request")
-
-type NodePoolRef struct {
-	Name string
-
-	CreationTime time.Time
-
-	CreatedForPod types.NamespacedName
-
-	Error    bool
-	ErrorMsg string
-}
diff --git a/tpu-provisioner/internal/controller/deletion_controller.go b/tpu-provisioner/internal/controller/deletion_controller.go
index 3f6a2e4bb..8b2ed9e4b 100644
--- a/tpu-provisioner/internal/controller/deletion_controller.go
+++ b/tpu-provisioner/internal/controller/deletion_controller.go
@@ -23,13 +23,6 @@ import (
 	"sigs.k8s.io/controller-runtime/pkg/source"
 )
 
-// nodePoolDeletionCheckInterval is the interval between the first and
-// second node pool deletion checks. Once the node pool deletion check
-// has passed twice, the node pool can be safely deleted. This second
-// check is ensure the node pool is not prematurely deleted, in the case
-// where a JobSet is restarted, but no pods have been created yet.
-var nodePoolDeletionCheckInterval = 30 * time.Second
-
 // DeletionReconciler watches Pods and Nodes and deletes Node Pools.
 type DeletionReconciler struct {
 	client.Client
@@ -43,6 +36,13 @@ type DeletionReconciler struct {
 
 type NodeCriteria struct {
 	MinLifetime time.Duration
+
+	// PoolDeletionDelay is the interval between the first and
+	// second node pool deletion checks. Once the node pool deletion check
+	// has passed twice, the node pool can be safely deleted. This second
+	// check is to ensure the node pool is not prematurely deleted in the case
+	// where a JobSet is restarted, but no pods have been created yet.
+	PoolDeletionDelay time.Duration
 }
 
 //+kubebuilder:rbac:groups="",resources=nodes,verbs=get;list;watch;create;update;patch;delete
@@ -128,14 +128,14 @@ func (r *DeletionReconciler) Reconcile(ctx context.Context, req ctrl.Request) (c
 	if !exists {
 		lg.Info(fmt.Sprintf("Node pool %q passed deletion check once", nodePoolName))
 		r.NodePoolsMarkedForDeletion.Store(nodePoolName, time.Now())
-		return ctrl.Result{RequeueAfter: nodePoolDeletionCheckInterval}, nil
+		return ctrl.Result{RequeueAfter: r.NodeCriteria.PoolDeletionDelay}, nil
 	}
 
 	// If we haven't reached the node pool deletion check interval, this reconcile was
 	// caused by something else, we can return early, and wait for the manually requeued
 	// reconcile we did after the first deletion check passed.
 	firstDeletionCheckTime := value.(time.Time)
-	if time.Now().Sub(firstDeletionCheckTime) < nodePoolDeletionCheckInterval {
+	if time.Now().Sub(firstDeletionCheckTime) < r.NodeCriteria.PoolDeletionDelay {
 		return ctrl.Result{}, nil
 	}
 
diff --git a/tpu-provisioner/internal/controller/deletion_controller_test.go b/tpu-provisioner/internal/controller/deletion_controller_test.go
index 13bfc7bb7..232ea7fac 100644
--- a/tpu-provisioner/internal/controller/deletion_controller_test.go
+++ b/tpu-provisioner/internal/controller/deletion_controller_test.go
@@ -67,7 +67,7 @@ var _ = Describe("Deletion controller", func() {
 
 		By("Checking the first deletion attempt only occurred after the node had existed for >= nodeDeletionInterval")
 		actualDuration := deletionTimestamp.Sub(createdNode.CreationTimestamp.Time)
-		requiredDuration := nodePoolDeletionCheckInterval + minNodeLifetime
+		requiredDuration := nodepoolDeletionDelay + minNodeLifetime
 		Expect(actualDuration).Should(BeNumerically(">=", requiredDuration))
 
 		By("Checking that other Nodes were ignored")
diff --git a/tpu-provisioner/internal/controller/nodepool_garbage_collector.go b/tpu-provisioner/internal/controller/nodepool_garbage_collector.go
index 873c3c73e..0fb4337c1 100644
--- a/tpu-provisioner/internal/controller/nodepool_garbage_collector.go
+++ b/tpu-provisioner/internal/controller/nodepool_garbage_collector.go
@@ -12,6 +12,10 @@ import (
 	ctrllog "sigs.k8s.io/controller-runtime/pkg/log"
 )
 
+// NodePoolGarbageCollector deletes node pools that have no Nodes,
+// are in an errored state, and where the Pod that created the node pool
+// no longer exists (the deletion reconciler would not see these b/c there
+// are no Node objects).
 type NodePoolGarbageCollector struct {
 	Interval time.Duration
 	client.Client
@@ -84,7 +88,7 @@ func (g *NodePoolGarbageCollector) Run(ctx context.Context) {
 		log.Info("garbage collecting node pool in error state")
 		// TODO: Lookup namespace from env with downward API.
 		whyDelete := fmt.Sprintf("the node pool has no corresponding Nodes, the Pod (%s/%s) that triggered its creation no longer exists, and node pool is in an error state: %s",
-			np.CreatedForPod.Namespace, np.CreatedForPod.Name, np.ErrorMsg)
+			np.CreatedForPod.Namespace, np.CreatedForPod.Name, np.Message)
 		if err := g.Provider.DeleteNodePool(np.Name, &v1.Namespace{ObjectMeta: metav1.ObjectMeta{Name: "tpu-provisioner-system"}}, whyDelete); err != nil {
 			log.Error(err, "failed to garbage collect node pool")
 			continue
diff --git a/tpu-provisioner/internal/controller/suite_test.go b/tpu-provisioner/internal/controller/suite_test.go
index c81307c5a..39b46c9a3 100644
--- a/tpu-provisioner/internal/controller/suite_test.go
+++ b/tpu-provisioner/internal/controller/suite_test.go
@@ -51,8 +51,9 @@ var (
 )
 
 const (
-	resourceName    = "test.com/tpu"
-	minNodeLifetime = time.Second
+	resourceName          = "test.com/tpu"
+	minNodeLifetime       = time.Second
+	nodepoolDeletionDelay = 5 * time.Second
 )
 
 func TestAPIs(t *testing.T) {
@@ -103,7 +104,8 @@ var _ = BeforeSuite(func() {
 		Recorder: mgr.GetEventRecorderFor("tpu-provisioner-deleter"),
 		Provider: provider,
 		NodeCriteria: NodeCriteria{
-			MinLifetime: minNodeLifetime,
+			MinLifetime:       minNodeLifetime,
+			PoolDeletionDelay: nodepoolDeletionDelay,
 		},
 	}).SetupWithManager(mgr)
 	Expect(err).ToNot(HaveOccurred())
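
Note: for reviewers unfamiliar with the two-pass deletion check that NODEPOOL_DELETION_DELAY now configures, the sketch below illustrates the pattern in isolation. It is not code from this repo; the checker type and markDeletable function are invented for the example, and the real reconciler additionally relies on requeued reconciles rather than a direct boolean return.

package main

import (
	"fmt"
	"sync"
	"time"
)

// checker mimics the reconciler's NodePoolsMarkedForDeletion map:
// a pool may only be deleted after it has passed the deletion check
// twice, with at least `delay` between the two passes.
type checker struct {
	delay  time.Duration
	marked sync.Map // pool name -> time the first check passed
}

// markDeletable returns true only on a check that occurs at least
// `delay` after the first passed check for the same pool. The first
// pass just records a timestamp; checks arriving before the delay
// has elapsed are treated as no-ops.
func (c *checker) markDeletable(pool string) bool {
	value, exists := c.marked.Load(pool)
	if !exists {
		// First pass: remember when the pool first looked deletable.
		c.marked.Store(pool, time.Now())
		return false
	}
	// Second pass: only allow deletion once the delay has elapsed,
	// giving a restarted JobSet time to recreate its Pods.
	return time.Since(value.(time.Time)) >= c.delay
}

func main() {
	c := &checker{delay: 50 * time.Millisecond}
	fmt.Println(c.markDeletable("pool-a")) // false: first check only records
	fmt.Println(c.markDeletable("pool-a")) // false: delay not yet elapsed
	time.Sleep(60 * time.Millisecond)
	fmt.Println(c.markDeletable("pool-a")) // true: second check after delay
}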