From 91214a05ebb5993e06a362bc1cfd745b221d0701 Mon Sep 17 00:00:00 2001 From: Andy Lee Date: Thu, 9 Jan 2025 08:55:45 +0000 Subject: [PATCH 1/7] feat: basic recovery support --- sky/serve/serve_state.py | 22 +++++++++++++ sky/serve/service.py | 70 +++++++++++++++++++++++----------------- 2 files changed, 63 insertions(+), 29 deletions(-) diff --git a/sky/serve/serve_state.py b/sky/serve/serve_state.py index f3e8fbf1e53..005a89135df 100644 --- a/sky/serve/serve_state.py +++ b/sky/serve/serve_state.py @@ -548,3 +548,25 @@ def delete_all_versions(service_name: str) -> None: """\ DELETE FROM version_specs WHERE service_name=(?)""", (service_name,)) + + +def get_service_controller_port(service_name: str) -> int: + """Gets the controller port of a service.""" + with db_utils.safe_cursor(_DB_PATH) as cursor: + cursor.execute('SELECT controller_port FROM services WHERE name = ?', + (service_name,)) + row = cursor.fetchone() + if row is None: + raise ValueError(f'Service {service_name} does not exist.') + return row[0] + + +def get_service_load_balancer_port(service_name: str) -> int: + """Gets the load balancer port of a service.""" + with db_utils.safe_cursor(_DB_PATH) as cursor: + cursor.execute('SELECT load_balancer_port FROM services WHERE name = ?', + (service_name,)) + row = cursor.fetchone() + if row is None: + raise ValueError(f'Service {service_name} does not exist.') + return row[0] diff --git a/sky/serve/service.py b/sky/serve/service.py index dbfc57b22bf..5d58a35b9bc 100644 --- a/sky/serve/service.py +++ b/sky/serve/service.py @@ -130,7 +130,7 @@ def cleanup_version_storage(version: int) -> bool: return failed -def _start(service_name: str, tmp_task_yaml: str, job_id: int): +def _start(service_name: str, tmp_task_yaml: str, job_id: int, is_recovery: bool = False): """Starts the service.""" # Generate ssh key pair to avoid race condition when multiple sky.launch # are executed at the same time. @@ -141,27 +141,28 @@ def _start(service_name: str, tmp_task_yaml: str, job_id: int): # Already checked before submit to controller. assert task.service is not None, task service_spec = task.service - if len(serve_state.get_services()) >= serve_utils.NUM_SERVICE_THRESHOLD: - cleanup_storage(tmp_task_yaml) - with ux_utils.print_exception_no_traceback(): - raise RuntimeError('Max number of services reached.') - success = serve_state.add_service( - service_name, - controller_job_id=job_id, - policy=service_spec.autoscaling_policy_str(), - requested_resources_str=backend_utils.get_task_resources_str(task), - load_balancing_policy=service_spec.load_balancing_policy, - status=serve_state.ServiceStatus.CONTROLLER_INIT) - # Directly throw an error here. See sky/serve/api.py::up - # for more details. - if not success: - cleanup_storage(tmp_task_yaml) - with ux_utils.print_exception_no_traceback(): - raise ValueError(f'Service {service_name} already exists.') - - # Add initial version information to the service state. - serve_state.add_or_update_version(service_name, constants.INITIAL_VERSION, - service_spec) + if not is_recovery: + if len(serve_state.get_services()) >= serve_utils.NUM_SERVICE_THRESHOLD: + cleanup_storage(tmp_task_yaml) + with ux_utils.print_exception_no_traceback(): + raise RuntimeError('Max number of services reached.') + success = serve_state.add_service( + service_name, + controller_job_id=job_id, + policy=service_spec.autoscaling_policy_str(), + requested_resources_str=backend_utils.get_task_resources_str(task), + load_balancing_policy=service_spec.load_balancing_policy, + status=serve_state.ServiceStatus.CONTROLLER_INIT) + # Directly throw an error here. See sky/serve/api.py::up + # for more details. + if not success: + cleanup_storage(tmp_task_yaml) + with ux_utils.print_exception_no_traceback(): + raise ValueError(f'Service {service_name} already exists.') + + # Add initial version information to the service state. + serve_state.add_or_update_version(service_name, constants.INITIAL_VERSION, + service_spec) # Create the service working directory. service_dir = os.path.expanduser( @@ -187,8 +188,13 @@ def _start(service_name: str, tmp_task_yaml: str, job_id: int): try: with filelock.FileLock( os.path.expanduser(constants.PORT_SELECTION_FILE_LOCK_PATH)): - controller_port = common_utils.find_free_port( - constants.CONTROLLER_PORT_START) + if is_recovery: + # In recovery mode, use the ports from the database + controller_port = serve_state.get_service_controller_port(service_name) + load_balancer_port = serve_state.get_service_load_balancer_port(service_name) + else: + controller_port = common_utils.find_free_port( + constants.CONTROLLER_PORT_START) # We expose the controller to the public network when running # inside a kubernetes cluster to allow external load balancers @@ -211,14 +217,16 @@ def _get_host(): args=(service_name, service_spec, task_yaml, controller_host, controller_port)) controller_process.start() - serve_state.set_service_controller_port(service_name, + if not is_recovery: + serve_state.set_service_controller_port(service_name, controller_port) # TODO(tian): Support HTTPS. controller_addr = f'http://{controller_host}:{controller_port}' - load_balancer_port = common_utils.find_free_port( - constants.LOAD_BALANCER_PORT_START) + if not is_recovery: + load_balancer_port = common_utils.find_free_port( + constants.LOAD_BALANCER_PORT_START) # Extract the load balancing policy from the service spec policy_name = service_spec.load_balancing_policy @@ -233,7 +241,8 @@ def _get_host(): load_balancer_log_file).run, args=(controller_addr, load_balancer_port, policy_name)) load_balancer_process.start() - serve_state.set_service_load_balancer_port(service_name, + if not is_recovery: + serve_state.set_service_load_balancer_port(service_name, load_balancer_port) while True: @@ -279,8 +288,11 @@ def _get_host(): required=True, type=int, help='Job id for the service job.') + parser.add_argument('--is-recovery', + action='store_true', + help='Whether this is a recovery mode start.') args = parser.parse_args() # We start process with 'spawn', because 'fork' could result in weird # behaviors; 'spawn' is also cross-platform. multiprocessing.set_start_method('spawn', force=True) - _start(args.service_name, args.task_yaml, args.job_id) + _start(args.service_name, args.task_yaml, args.job_id, args.is_recovery) From d61600f60a168cf16f2ec540f6144888d9ed2df5 Mon Sep 17 00:00:00 2001 From: Andy Lee Date: Wed, 15 Jan 2025 13:54:39 +0800 Subject: [PATCH 2/7] refactor: get_head_pod_name --- sky/provision/kubernetes/instance.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/sky/provision/kubernetes/instance.py b/sky/provision/kubernetes/instance.py index c431b023ab9..187400d79ff 100644 --- a/sky/provision/kubernetes/instance.py +++ b/sky/provision/kubernetes/instance.py @@ -35,12 +35,9 @@ def _get_head_pod_name(pods: Dict[str, Any]) -> Optional[str]: - head_pod_name = None - for pod_name, pod in pods.items(): - if pod.metadata.labels[constants.TAG_RAY_NODE_KIND] == 'head': - head_pod_name = pod_name - break - return head_pod_name + return next((pod_name for pod_name, pod in pods.items() + if pod.metadata.labels[constants.TAG_RAY_NODE_KIND] == 'head'), + None) def head_service_selector(cluster_name: str) -> Dict[str, str]: From 7550e892eddf1881ce58f001d11c4d09875b859d Mon Sep 17 00:00:00 2001 From: Andy Lee Date: Wed, 15 Jan 2025 14:00:36 +0800 Subject: [PATCH 3/7] refactor: extract `is_head` --- sky/provision/kubernetes/instance.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/sky/provision/kubernetes/instance.py b/sky/provision/kubernetes/instance.py index 187400d79ff..d4e1a7d5a0f 100644 --- a/sky/provision/kubernetes/instance.py +++ b/sky/provision/kubernetes/instance.py @@ -34,9 +34,13 @@ TAG_POD_INITIALIZED = 'skypilot-initialized' +def _is_head(pod) -> bool: + return pod.metadata.labels.get(constants.TAG_RAY_NODE_KIND) == 'head' + + def _get_head_pod_name(pods: Dict[str, Any]) -> Optional[str]: return next((pod_name for pod_name, pod in pods.items() - if pod.metadata.labels[constants.TAG_RAY_NODE_KIND] == 'head'), + if _is_head(pod)), None) @@ -806,8 +810,7 @@ def _create_pod_thread(i: int): # Process created pods for pod in pods: created_pods[pod.metadata.name] = pod - if head_pod_name is None and pod.metadata.labels.get( - constants.TAG_RAY_NODE_KIND) == 'head': + if head_pod_name is None and _is_head(pod): head_pod_name = pod.metadata.name networking_mode = network_utils.get_networking_mode( @@ -961,9 +964,6 @@ def terminate_instances( logger.warning('terminate_instances: Error occurred when analyzing ' f'SSH Jump pod: {e}') - def _is_head(pod) -> bool: - return pod.metadata.labels[constants.TAG_RAY_NODE_KIND] == 'head' - def _terminate_pod_thread(pod_info): pod_name, pod = pod_info if _is_head(pod) and worker_only: @@ -1019,7 +1019,7 @@ def get_cluster_info( tags=pod.metadata.labels, ) ] - if pod.metadata.labels[constants.TAG_RAY_NODE_KIND] == 'head': + if _is_head(pod): head_pod_name = pod_name head_spec = pod.spec assert head_spec is not None, pod From 0f223d10219f5c3816fc6a7813ff46fbced5e8c1 Mon Sep 17 00:00:00 2001 From: Andy Lee Date: Wed, 15 Jan 2025 22:59:58 +0800 Subject: [PATCH 4/7] feat: k8s deployment for skyserve controller --- sky/provision/kubernetes/instance.py | 143 ++++++++++++++++++++++++++- sky/serve/serve_state.py | 4 +- 2 files changed, 142 insertions(+), 5 deletions(-) diff --git a/sky/provision/kubernetes/instance.py b/sky/provision/kubernetes/instance.py index d4e1a7d5a0f..f012626e54b 100644 --- a/sky/provision/kubernetes/instance.py +++ b/sky/provision/kubernetes/instance.py @@ -39,8 +39,7 @@ def _is_head(pod) -> bool: def _get_head_pod_name(pods: Dict[str, Any]) -> Optional[str]: - return next((pod_name for pod_name, pod in pods.items() - if _is_head(pod)), + return next((pod_name for pod_name, pod in pods.items() if _is_head(pod)), None) @@ -651,6 +650,124 @@ def _create_namespaced_pod_with_retries(namespace: str, pod_spec: dict, raise e +def _is_serve_controller(cluster_name_on_cloud: str) -> bool: + return cluster_name_on_cloud.startswith('sky-serve-controller-') + + +def _create_persistent_volume_claim(namespace: str, context: Optional[str], + pvc_name: str) -> None: + """Creates a persistent volume claim for SkyServe controller.""" + try: + kubernetes.core_api(context).read_namespaced_persistent_volume_claim( + name=pvc_name, namespace=namespace) + return + except kubernetes.api_exception() as e: + if e.status != 404: # Not found + raise + + pvc_spec = { + 'apiVersion': 'v1', + 'kind': 'PersistentVolumeClaim', + 'metadata': { + 'name': pvc_name, + }, + 'spec': { + 'accessModes': ['ReadWriteOnce'], + 'resources': { + 'requests': { + 'storage': '10Gi' # TODO(andyl): use a constant here + } + } + } + } + + kubernetes.core_api(context).create_namespaced_persistent_volume_claim( + namespace=namespace, body=pvc_spec) + + +def _create_serve_controller_deployment( + pod_spec: Dict[str, Any], cluster_name_on_cloud: str, namespace: str, + context: Optional[str]) -> Dict[str, Any]: + """Creates a deployment for SkyServe controller with persistence.""" + pvc_name = f'{cluster_name_on_cloud}-data' + _create_persistent_volume_claim(namespace, context, pvc_name) + + mount_path = '/root/.sky/serve' # TODO(andyl): use a constant here + volume_mounts = [{'name': 'serve-data', 'mountPath': mount_path}] + + volumes = [{ + 'name': 'serve-data', + 'persistentVolumeClaim': { + 'claimName': pvc_name + } + }] + + if 'volumes' in pod_spec['spec']: + pod_spec['spec']['volumes'].extend(volumes) + else: + pod_spec['spec']['volumes'] = volumes + + for container in pod_spec['spec']['containers']: + if 'volumeMounts' in container: + container['volumeMounts'].extend(volume_mounts) + else: + container['volumeMounts'] = volume_mounts + + template_metadata = pod_spec.pop('metadata') + + deployment_labels = { + 'app': cluster_name_on_cloud, + } + template_metadata['labels'].update(deployment_labels) + + # The pod template part of pod_spec is used in the deployment + # spec.template.spec + + deployment_spec = { + 'apiVersion': 'apps/v1', + 'kind': 'Deployment', + 'metadata': { + 'name': f'{cluster_name_on_cloud}-deployment', + 'namespace': namespace, + }, + 'spec': { + 'replicas': 1, + 'selector': { + 'matchLabels': deployment_labels + }, + 'template': { + 'metadata': template_metadata, + 'spec': { + **pod_spec['spec'], 'restartPolicy': 'Always' + } + } + } + } + + return deployment_spec + + +@timeline.event +def _wait_for_deployment_pod(context, namespace, deployment, timeout=60): + label_selector = ','.join([ + f'{key}={value}' + for key, value in deployment.spec.selector.match_labels.items() + ]) + target_replicas = deployment.spec.replicas + start_time = time.time() + while time.time() - start_time < timeout: + pods = kubernetes.core_api(context).list_namespaced_pod( + namespace, label_selector=label_selector).items + # TODO(andyl): not sure if this necessary + if len(pods) == target_replicas: + return pods + time.sleep(2) + + raise TimeoutError( + f'Timeout: Not all Pods for Deployment {deployment.metadata.name!r}' + ' are created.') + + @timeline.event def _create_pods(region: str, cluster_name_on_cloud: str, config: common.ProvisionConfig) -> common.ProvisionRecord: @@ -662,6 +779,7 @@ def _create_pods(region: str, cluster_name_on_cloud: str, tags = { TAG_RAY_CLUSTER_NAME: cluster_name_on_cloud, } + pod_spec['metadata']['namespace'] = namespace if 'labels' in pod_spec['metadata']: pod_spec['metadata']['labels'].update(tags) @@ -749,7 +867,8 @@ def _create_pod_thread(i: int): pod_spec_copy['metadata']['labels'].update(constants.HEAD_NODE_TAGS) head_selector = head_service_selector(cluster_name_on_cloud) pod_spec_copy['metadata']['labels'].update(head_selector) - pod_spec_copy['metadata']['name'] = f'{cluster_name_on_cloud}-head' + pod_spec_copy['metadata'][ + 'name'] = f'{cluster_name_on_cloud}-head' #! else: # Worker pods pod_spec_copy['metadata']['labels'].update( @@ -800,6 +919,17 @@ def _create_pod_thread(i: int): } pod_spec_copy['spec']['tolerations'] = [tpu_toleration] + if _is_serve_controller(cluster_name_on_cloud): + deployment_spec = _create_serve_controller_deployment( + pod_spec_copy, cluster_name_on_cloud, namespace, context) + try: + return kubernetes.apps_api( + context).create_namespaced_deployment( + namespace, deployment_spec) + except Exception as e: + print('Deployment failed', e) + raise e + return _create_namespaced_pod_with_retries(namespace, pod_spec_copy, context) @@ -807,6 +937,13 @@ def _create_pod_thread(i: int): pods = subprocess_utils.run_in_parallel(_create_pod_thread, range(to_start_count), _NUM_THREADS) + if _is_serve_controller(cluster_name_on_cloud): + deployments = copy.deepcopy(pods) + pods.clear() # Since it's not pods. What created here are true pods. + for deployment in deployments: + pods.extend(_wait_for_deployment_pod(context, namespace, + deployment)) + # Process created pods for pod in pods: created_pods[pod.metadata.name] = pod diff --git a/sky/serve/serve_state.py b/sky/serve/serve_state.py index 005a89135df..d6dc5d441db 100644 --- a/sky/serve/serve_state.py +++ b/sky/serve/serve_state.py @@ -554,7 +554,7 @@ def get_service_controller_port(service_name: str) -> int: """Gets the controller port of a service.""" with db_utils.safe_cursor(_DB_PATH) as cursor: cursor.execute('SELECT controller_port FROM services WHERE name = ?', - (service_name,)) + (service_name,)) row = cursor.fetchone() if row is None: raise ValueError(f'Service {service_name} does not exist.') @@ -565,7 +565,7 @@ def get_service_load_balancer_port(service_name: str) -> int: """Gets the load balancer port of a service.""" with db_utils.safe_cursor(_DB_PATH) as cursor: cursor.execute('SELECT load_balancer_port FROM services WHERE name = ?', - (service_name,)) + (service_name,)) row = cursor.fetchone() if row is None: raise ValueError(f'Service {service_name} does not exist.') From 0abc2c80cb15539a340b1e269d7cd5d6bb13c192 Mon Sep 17 00:00:00 2001 From: Andy Lee Date: Wed, 15 Jan 2025 23:27:35 +0800 Subject: [PATCH 5/7] feat: auto-detect recovering --- sky/serve/service.py | 40 ++++++++++++++++++++++++++++------------ 1 file changed, 28 insertions(+), 12 deletions(-) diff --git a/sky/serve/service.py b/sky/serve/service.py index 5d58a35b9bc..8d1cd7d0b86 100644 --- a/sky/serve/service.py +++ b/sky/serve/service.py @@ -130,7 +130,20 @@ def cleanup_version_storage(version: int) -> bool: return failed -def _start(service_name: str, tmp_task_yaml: str, job_id: int, is_recovery: bool = False): +def is_recovery_mode(service_name: str) -> bool: + """Check if service exists in database to determine recovery mode. + + Args: + service_name: Name of the service to check + + Returns: + True if service exists in database, indicating recovery mode + """ + service = serve_state.get_service_from_name(service_name) + return service is not None + + +def _start(service_name: str, tmp_task_yaml: str, job_id: int): """Starts the service.""" # Generate ssh key pair to avoid race condition when multiple sky.launch # are executed at the same time. @@ -141,6 +154,9 @@ def _start(service_name: str, tmp_task_yaml: str, job_id: int, is_recovery: bool # Already checked before submit to controller. assert task.service is not None, task service_spec = task.service + + is_recovery = is_recovery_mode(service_name) + if not is_recovery: if len(serve_state.get_services()) >= serve_utils.NUM_SERVICE_THRESHOLD: cleanup_storage(tmp_task_yaml) @@ -161,8 +177,9 @@ def _start(service_name: str, tmp_task_yaml: str, job_id: int, is_recovery: bool raise ValueError(f'Service {service_name} already exists.') # Add initial version information to the service state. - serve_state.add_or_update_version(service_name, constants.INITIAL_VERSION, - service_spec) + serve_state.add_or_update_version(service_name, + constants.INITIAL_VERSION, + service_spec) # Create the service working directory. service_dir = os.path.expanduser( @@ -190,8 +207,10 @@ def _start(service_name: str, tmp_task_yaml: str, job_id: int, is_recovery: bool os.path.expanduser(constants.PORT_SELECTION_FILE_LOCK_PATH)): if is_recovery: # In recovery mode, use the ports from the database - controller_port = serve_state.get_service_controller_port(service_name) - load_balancer_port = serve_state.get_service_load_balancer_port(service_name) + controller_port = serve_state.get_service_controller_port( + service_name) + load_balancer_port = serve_state.get_service_load_balancer_port( + service_name) else: controller_port = common_utils.find_free_port( constants.CONTROLLER_PORT_START) @@ -219,7 +238,7 @@ def _get_host(): controller_process.start() if not is_recovery: serve_state.set_service_controller_port(service_name, - controller_port) + controller_port) # TODO(tian): Support HTTPS. controller_addr = f'http://{controller_host}:{controller_port}' @@ -242,8 +261,8 @@ def _get_host(): args=(controller_addr, load_balancer_port, policy_name)) load_balancer_process.start() if not is_recovery: - serve_state.set_service_load_balancer_port(service_name, - load_balancer_port) + serve_state.set_service_load_balancer_port( + service_name, load_balancer_port) while True: _handle_signal(service_name) @@ -288,11 +307,8 @@ def _get_host(): required=True, type=int, help='Job id for the service job.') - parser.add_argument('--is-recovery', - action='store_true', - help='Whether this is a recovery mode start.') args = parser.parse_args() # We start process with 'spawn', because 'fork' could result in weird # behaviors; 'spawn' is also cross-platform. multiprocessing.set_start_method('spawn', force=True) - _start(args.service_name, args.task_yaml, args.job_id, args.is_recovery) + _start(args.service_name, args.task_yaml, args.job_id) From 21e8507b65863112d442de27094d58091c827a46 Mon Sep 17 00:00:00 2001 From: Andy Lee Date: Fri, 17 Jan 2025 16:13:08 +0800 Subject: [PATCH 6/7] fix: mount path --- sky/provision/kubernetes/instance.py | 37 +++++++++++++++++++++++++++- 1 file changed, 36 insertions(+), 1 deletion(-) diff --git a/sky/provision/kubernetes/instance.py b/sky/provision/kubernetes/instance.py index f012626e54b..61f93f3e3ac 100644 --- a/sky/provision/kubernetes/instance.py +++ b/sky/provision/kubernetes/instance.py @@ -692,7 +692,7 @@ def _create_serve_controller_deployment( pvc_name = f'{cluster_name_on_cloud}-data' _create_persistent_volume_claim(namespace, context, pvc_name) - mount_path = '/root/.sky/serve' # TODO(andyl): use a constant here + mount_path = '/home/sky/.sky/serve' # TODO(andyl): use a constant here volume_mounts = [{'name': 'serve-data', 'mountPath': mount_path}] volumes = [{ @@ -713,6 +713,37 @@ def _create_serve_controller_deployment( else: container['volumeMounts'] = volume_mounts + # pod_spec['spec']['securityContext'] = { + # 'fsGroup': 1000 # Ensure group permissions for mounted volumes + # } + # for container in pod_spec['spec']['containers']: + # container['securityContext'] = { + # 'runAsUser': 1000, # Run as sky user + # 'runAsGroup': 1000 + # } + + # init_container = { + # 'name': 'init-fix-permissions', + # 'image': 'busybox', + # 'command': [ + # 'sh', '-c', + # ( + # 'chown -R 1000:1000 /home/sky/.sky && ' + # 'chmod -R 755 /home/sky/.sky && ' + # 'mkdir -p /home/sky/.sky/.runtime_files && ' + # 'chown -R 1000:1000 /home/sky/.sky/.runtime_files && ' + # 'chmod -R 755 /home/sky/.sky/.runtime_files' + # ) + # ], + # 'volumeMounts': [ + # {'name': 'serve-data', 'mountPath': '/home/sky/.sky/serve'} + # ], + # } + # if 'initContainers' in pod_spec['spec']: + # pod_spec['spec']['initContainers'].append(init_container) + # else: + # pod_spec['spec']['initContainers'] = [init_container] + template_metadata = pod_spec.pop('metadata') deployment_labels = { @@ -922,6 +953,10 @@ def _create_pod_thread(i: int): if _is_serve_controller(cluster_name_on_cloud): deployment_spec = _create_serve_controller_deployment( pod_spec_copy, cluster_name_on_cloud, namespace, context) + print('try to create deployment') + import yaml + with open('deployment_spec.yaml', 'w') as f: + yaml.dump(deployment_spec, f) try: return kubernetes.apps_api( context).create_namespaced_deployment( From 8aee49479de540c8225adb2b6912a46a5f165a88 Mon Sep 17 00:00:00 2001 From: Andy Lee Date: Fri, 17 Jan 2025 16:55:03 +0800 Subject: [PATCH 7/7] fix: use `~/.sky` for mounting --- sky/provision/kubernetes/instance.py | 42 +++++----------------------- 1 file changed, 7 insertions(+), 35 deletions(-) diff --git a/sky/provision/kubernetes/instance.py b/sky/provision/kubernetes/instance.py index 61f93f3e3ac..5ddb2f0708a 100644 --- a/sky/provision/kubernetes/instance.py +++ b/sky/provision/kubernetes/instance.py @@ -692,7 +692,13 @@ def _create_serve_controller_deployment( pvc_name = f'{cluster_name_on_cloud}-data' _create_persistent_volume_claim(namespace, context, pvc_name) - mount_path = '/home/sky/.sky/serve' # TODO(andyl): use a constant here + # The reason we mount the whole /home/sky/.sky instead of just + # /home/sky/.sky/serve is that k8s changes the ownership of the + # mounted directory to root:root. If we only mount /home/sky/.sky/serve, + # the serve controller will not be able to create the serve directory. + # pylint: disable=line-too-long + # See https://stackoverflow.com/questions/50818029/mounted-folder-created-as-root-instead-of-current-user-in-docker/50820023#50820023. + mount_path = '/home/sky/.sky' # TODO(andyl): use a constant here volume_mounts = [{'name': 'serve-data', 'mountPath': mount_path}] volumes = [{ @@ -713,37 +719,6 @@ def _create_serve_controller_deployment( else: container['volumeMounts'] = volume_mounts - # pod_spec['spec']['securityContext'] = { - # 'fsGroup': 1000 # Ensure group permissions for mounted volumes - # } - # for container in pod_spec['spec']['containers']: - # container['securityContext'] = { - # 'runAsUser': 1000, # Run as sky user - # 'runAsGroup': 1000 - # } - - # init_container = { - # 'name': 'init-fix-permissions', - # 'image': 'busybox', - # 'command': [ - # 'sh', '-c', - # ( - # 'chown -R 1000:1000 /home/sky/.sky && ' - # 'chmod -R 755 /home/sky/.sky && ' - # 'mkdir -p /home/sky/.sky/.runtime_files && ' - # 'chown -R 1000:1000 /home/sky/.sky/.runtime_files && ' - # 'chmod -R 755 /home/sky/.sky/.runtime_files' - # ) - # ], - # 'volumeMounts': [ - # {'name': 'serve-data', 'mountPath': '/home/sky/.sky/serve'} - # ], - # } - # if 'initContainers' in pod_spec['spec']: - # pod_spec['spec']['initContainers'].append(init_container) - # else: - # pod_spec['spec']['initContainers'] = [init_container] - template_metadata = pod_spec.pop('metadata') deployment_labels = { @@ -954,9 +929,6 @@ def _create_pod_thread(i: int): deployment_spec = _create_serve_controller_deployment( pod_spec_copy, cluster_name_on_cloud, namespace, context) print('try to create deployment') - import yaml - with open('deployment_spec.yaml', 'w') as f: - yaml.dump(deployment_spec, f) try: return kubernetes.apps_api( context).create_namespaced_deployment(