[k8s] Fix --purge not cleaning up cluster in stale k8s context #4514

Status: Open. Wants to merge 2 commits into base: master.
sky/backends/cloud_vm_ray_backend.py: 58 changes (32 additions, 26 deletions)
```diff
@@ -4195,34 +4195,40 @@ def post_teardown_cleanup(self,
         # that successfully call this function but do not first call
         # teardown_cluster or terminate_instances. See
         # https://github.com/skypilot-org/skypilot/pull/4443#discussion_r1872798032
+        # If purge is set, we do not care about instance status and should skip
+        # the check because it may fail if the cluster is not reachable.
         attempts = 0
-        while True:
-            logger.debug(f'instance statuses attempt {attempts + 1}')
-            node_status_dict = provision_lib.query_instances(
-                repr(cloud),
-                cluster_name_on_cloud,
-                config['provider'],
-                non_terminated_only=False)
-
-            unexpected_node_state: Optional[Tuple[str, str]] = None
-            for node_id, node_status in node_status_dict.items():
-                logger.debug(f'{node_id} status: {node_status}')
-                # FIXME(cooperc): Some clouds (e.g. GCP) do not distinguish
-                # between "stopping/stopped" and "terminating/terminated", so we
-                # allow for either status instead of casing on `terminate`.
-                if node_status not in [None, status_lib.ClusterStatus.STOPPED]:
-                    unexpected_node_state = (node_id, node_status)
-
-            if unexpected_node_state is None:
-                break
+        if not purge:
+            while True:
+                logger.debug(f'instance statuses attempt {attempts + 1}')
+                node_status_dict = provision_lib.query_instances(
+                    repr(cloud),
+                    cluster_name_on_cloud,
+                    config['provider'],
+                    non_terminated_only=False)
+
+                unexpected_node_state: Optional[Tuple[str, str]] = None
+                for node_id, node_status in node_status_dict.items():
+                    logger.debug(f'{node_id} status: {node_status}')
+                    # FIXME(cooperc): Some clouds (e.g. GCP) do not distinguish
+                    # between "stopping/stopped" and "terminating/terminated",
+                    # so we allow for either status instead of casing
+                    # on `terminate`.
+                    if node_status not in [
+                            None, status_lib.ClusterStatus.STOPPED
+                    ]:
+                        unexpected_node_state = (node_id, node_status)
+
+                if unexpected_node_state is None:
+                    break
 
-            attempts += 1
-            if attempts < _TEARDOWN_WAIT_MAX_ATTEMPTS:
-                time.sleep(_TEARDOWN_WAIT_BETWEEN_ATTEMPS_SECONDS)
-            else:
-                (node_id, node_status) = unexpected_node_state
-                raise RuntimeError(f'Instance {node_id} in unexpected state '
-                                   f'{node_status}.')
+                attempts += 1
+                if attempts < _TEARDOWN_WAIT_MAX_ATTEMPTS:
+                    time.sleep(_TEARDOWN_WAIT_BETWEEN_ATTEMPS_SECONDS)
+                else:
+                    (node_id, node_status) = unexpected_node_state
+                    raise RuntimeError(f'Instance {node_id} in unexpected '
+                                       f'state {node_status}.')
```
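For readers skimming the hunk: the change wraps the existing status-polling loop in `if not purge:`, so a `--purge` teardown no longer queries instances that may live in an unreachable (e.g. stale Kubernetes) context. Below is a minimal standalone sketch of that control flow; the constant values and the `query_instances` callable are illustrative stand-ins, not SkyPilot's actual module.

```python
import time
from typing import Callable, Dict, Optional, Tuple

# Values assumed for illustration; SkyPilot defines its own constants.
_TEARDOWN_WAIT_MAX_ATTEMPTS = 10
_TEARDOWN_WAIT_BETWEEN_ATTEMPS_SECONDS = 1


def wait_for_teardown(query_instances: Callable[[], Dict[str, Optional[str]]],
                      purge: bool) -> None:
    """Poll until every node is gone or STOPPED, unless purging."""
    if purge:
        # --purge: skip the check entirely. Querying a cluster whose
        # k8s context no longer exists would raise and block cleanup.
        return
    attempts = 0
    while True:
        unexpected: Optional[Tuple[str, str]] = None
        for node_id, status in query_instances().items():
            if status not in (None, 'STOPPED'):
                unexpected = (node_id, status)
        if unexpected is None:
            return
        attempts += 1
        if attempts < _TEARDOWN_WAIT_MAX_ATTEMPTS:
            time.sleep(_TEARDOWN_WAIT_BETWEEN_ATTEMPS_SECONDS)
        else:
            node_id, status = unexpected
            raise RuntimeError(
                f'Instance {node_id} in unexpected state {status}.')
```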
Collaborator commented on lines +4230 to +4231:
Would it be possible to have the purge check here instead, so that we can correctly print warnings for clusters that were not cleaned up, even when --purge is specified?


The referenced lines:

```python
global_user_state.remove_cluster(handle.cluster_name,
                                 terminate=terminate)
```
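One possible reading of this suggestion, as a sketch rather than code from the PR (the helper name, wrapper shape, and warning text are all assumptions): keep running the status check when `--purge` is set, but demote any failure to a warning so teardown of an unreachable cluster still proceeds and the user learns what was left behind.

```python
import logging

logger = logging.getLogger(__name__)


def check_or_warn(check_instances, purge: bool, cluster_name: str) -> None:
    """Run the status check; under purge, warn on failure instead of raising."""
    try:
        check_instances()  # e.g. the polling loop from the diff above
    except Exception as exc:  # pylint: disable=broad-except
        if not purge:
            raise
        logger.warning(
            'Instance status check failed for %s; continuing since '
            '--purge is set: %s', cluster_name, exc)
```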