forked from AI-Hypercomputer/xpk
-
Notifications
You must be signed in to change notification settings - Fork 0
/
xpk-large-scale-guide.sh
561 lines (461 loc) · 23.4 KB
/
xpk-large-scale-guide.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
#!/bin/bash
######### OVERVIEW OF SCRIPT #########
## This script is intended to guide one through the steps needed to create a large scale
## (>5k VMs) cluster of v5e-256 slices with xpk and GKE.
## This script was run by manually copying commands per step and verifying the
## output of each step.
## We recommend you manually copy commands per step and verify the outputs of
## each step.
## Step Summary is:
## Step 1: Cluster Networking setup.
## Step 2: Create your cluster with xpk.
## Step 3: Move from KubeDNS to CoreDNS. This is necessary past 1000 VMs.
## Step 4: Pass Cluster name and Project ID to Google POCs to setup your cluster
## for large scale and high throughput. This is necessary past 5000 VMs.
## Step 5: Scale up your cluster.
## Step 6 (OPTIONAL): Run a simple sample job on all slices in the cluster.
## Shows how to run jobs and view jobs with xpk.
## Step 7 (OPTIONAL): Run maxtext training on slices in the cluster.
## Shows how to create custom docker images and use cacheimage to improve start up time.
## XPK offers cacheimage as a wrapper around daemonset.
######### OVERVIEW OF SCRIPT #########
### USER VARIABLES:
# TODO(USER): Replace at least the PROJECT and CLUSTER placeholder values below.
# PREFIX is taken from ${USER}; the network resource names defined further down
# are built from it.
PREFIX="${USER}"
PROJECT="PROJECT_NAME"
REGION="us-central2"
ZONE="us-central2-b"
CLUSTER="CLUSTER-NAME"
export PREFIX PROJECT REGION ZONE CLUSTER
# Make gcloud default to the project, zone and region exported above.
# The expansions are quoted so a value can never be word-split or globbed.
gcloud config set project "${PROJECT}"
gcloud config set compute/zone "${ZONE}"
gcloud config set compute/region "${REGION}"
### INTERNAL VARIABLES:
# Network resource names, all built from PREFIX, plus the initial slice count.
NETWORK_NAME="${PREFIX}-privatenetwork"
SUBNET_NAME="${PREFIX}-privatesubnet"
FIREWALL_RULE_NAME="${PREFIX}-privatefirewall"
ROUTER_NAME="${PREFIX}-network"
NAT_CONFIG="${PREFIX}-natconfig"
NUMSLICES=4
export NETWORK_NAME SUBNET_NAME FIREWALL_RULE_NAME ROUTER_NAME NAT_CONFIG NUMSLICES
##### STEP 1 #################
##### Cluster Networking #####
##############################
##### 1A #####################
# Create the custom-subnet-mode network (MTU 8896, regional BGP routing)
# for the cluster.
##### 1A #####################
gcloud compute networks create "$NETWORK_NAME" \
  --mtu=8896 \
  --bgp-routing-mode=regional \
  --subnet-mode=custom
# Created [https://www.googleapis.com/compute/v1/projects/PROJECT/global/networks/PREFIX-privatenetwork].
# NAME SUBNET_MODE BGP_ROUTING_MODE IPV4_RANGE GATEWAY_IPV4
# PREFIX-privatenetwork CUSTOM REGIONAL
# Instances on this network will not be reachable until firewall rules
# are created. As an example, you can allow all internal traffic between
# instances as well as SSH, RDP, and ICMP by running:
##### 1B #####################
# Create the cluster's subnetwork (range 10.10.0.0/18) in that network and
# region.
##### 1B #####################
gcloud compute networks subnets create "$SUBNET_NAME" \
  --network="$NETWORK_NAME" \
  --range=10.10.0.0/18 \
  --region="$REGION"
# Created [https://www.googleapis.com/compute/v1/projects/PROJECT/regions/us-central2/subnetworks/PREFIX-privatesubnet].
# NAME REGION NETWORK RANGE STACK_TYPE IPV6_ACCESS_TYPE INTERNAL_IPV6_PREFIX EXTERNAL_IPV6_PREFIX
# PREFIX-privatesubnet us-central2 PREFIX-privatenetwork 10.10.0.0/18 IPV4_ONLY
##### 1C #####################
# Create the firewall rule for the private network (allows tcp, icmp and udp).
##### 1C #####################
# NOTE(review): no --source-ranges is passed, so gcloud's default source range
# applies - confirm that this is acceptable for your environment.
gcloud compute firewall-rules create "$FIREWALL_RULE_NAME" \
  --network "$NETWORK_NAME" \
  --allow tcp,icmp,udp \
  --project="$PROJECT"
# Creating firewall...⠹Created [https://www.googleapis.com/compute/v1/projects/PROJECT/global/firewalls/PREFIX-privatefirewall].
# Creating firewall...done.
# NAME NETWORK DIRECTION PRIORITY ALLOW DENY DISABLED
# PREFIX-privatefirewall PREFIX-privatenetwork INGRESS 1000 tcp,icmp,udp False
##### 1D #####################
# Create the router for the network in the chosen region.
##### 1D #####################
gcloud compute routers create "$ROUTER_NAME" \
  --project="$PROJECT" \
  --network="$NETWORK_NAME" \
  --region="$REGION"
# Creating router [PREFIX-network]...done.
# NAME REGION NETWORK
# PREFIX-network us-central2 PREFIX-privatenetwork
##### 1E #####################
# Create the NAT configuration on that router: auto-allocated external IPs,
# all subnet IP ranges, logging enabled.
##### 1E #####################
gcloud compute routers nats create "$NAT_CONFIG" \
  --router="$ROUTER_NAME" \
  --region="$REGION" \
  --auto-allocate-nat-external-ips \
  --nat-all-subnet-ip-ranges \
  --enable-logging
# Creating NAT [PREFIX-natconfig] in router [PREFIX-network]...done.
##### STEP 2 ############################
##### Create your cluster with xpk. #####
#########################################
##### 2A #####################
# Export the extra cluster and TPU node pool flags that are handed to xpk.
##### 2A #####################
# Each variable holds one flag string: flags separated by single spaces, with
# one leading and one trailing space. The strings are built piece by piece.
CLUSTER_ARGUMENTS=" --network=${NETWORK_NAME}"
CLUSTER_ARGUMENTS+=" --subnetwork=${SUBNET_NAME}"
CLUSTER_ARGUMENTS+=" --scopes=storage-full,gke-default"
CLUSTER_ARGUMENTS+=" --enable-ip-alias"
CLUSTER_ARGUMENTS+=" --enable-private-nodes"
CLUSTER_ARGUMENTS+=" --master-ipv4-cidr 172.16.0.32/28"
CLUSTER_ARGUMENTS+=" --cluster-ipv4-cidr=10.224.0.0/12"
CLUSTER_ARGUMENTS+=" --no-enable-master-authorized-networks "
export CLUSTER_ARGUMENTS
TPU_NODEPOOL_ARGUMENTS=" --scopes=storage-full,gke-default"
TPU_NODEPOOL_ARGUMENTS+=" --enable-gvnic"
TPU_NODEPOOL_ARGUMENTS+=" --max-pods-per-node 15"
TPU_NODEPOOL_ARGUMENTS+=" --disk-size=50 "
export TPU_NODEPOOL_ARGUMENTS
##### 2B #####################
# Clone the xpk repository and, only if the clone succeeded, enter it.
##### 2B #####################
git clone https://github.com/google/xpk.git \
  && cd xpk
##### 2C #####################
# Print the cluster create command (nothing is created yet) so the expanded
# variables can be checked first.
##### 2C #####################
echo python3 xpk.py cluster create \
  --cluster "$CLUSTER" \
  --tpu-type=v5litepod-256 \
  --num-slices="$NUMSLICES" \
  --host-maintenance-interval=PERIODIC \
  --custom-cluster-arguments="$CLUSTER_ARGUMENTS" \
  --custom-tpu-nodepool-arguments="$TPU_NODEPOOL_ARGUMENTS"
# python3 xpk.py cluster create --cluster NAME \
# --tpu-type=v5litepod-256 --num-slices=4 \
# --host-maintenance-interval=PERIODIC \
# --custom-cluster-arguments= --network=NETWORK --subnetwork=SUBNET --scopes=storage-full,gke-default --enable-ip-alias --enable-private-nodes --master-ipv4-cidr 172.16.0.32/28 --cluster-ipv4-cidr=10.224.0.0/12 --no-enable-master-authorized-networks
# --custom-tpu-nodepool-arguments= --scopes=storage-full,gke-default --enable-gvnic --max-pods-per-node 15 --disk-size=50
##### 2D #####################
# Run Cluster Create.
##### 2D #####################
# Run the same command again to update the cluster (for example with a new
# slice count) or to retry after a failed create.
python3 xpk.py cluster create \
  --cluster "$CLUSTER" \
  --tpu-type=v5litepod-256 \
  --num-slices="$NUMSLICES" \
  --host-maintenance-interval=PERIODIC \
  --custom-cluster-arguments="$CLUSTER_ARGUMENTS" \
  --custom-tpu-nodepool-arguments="$TPU_NODEPOOL_ARGUMENTS"
# This process takes around 4 minutes with 4 slices of v5e-256.
###############################
##### 2D - TIPS ###############
###############################
# 1) View other examples of xpk.py here: https://github.com/google/maxtext/blob/main/xpk/README.md
# 2) xpk create command will update the cluster. If you adjust the num-slices and call create again,
# xpk will intelligently adjust the number of node pools and execute the required create / delete commands.
# 3) If xpk create command fails, the first step is to try running create again.
##### STEP 3 ##############################
##### MOVE From KubeDNS to CoreDNS ########
###########################################
##### 3A #####################
# Install jq command-line JSON processor (skipped when jq is already on PATH).
##### 3A #####################
# apt-get is used instead of apt because apt's command-line interface is not
# guaranteed to stay stable for scripted use.
if ! command -v jq >/dev/null 2>&1; then
  sudo apt-get install -y jq
fi
##### 3B #####################
# Clone the coredns deployment repository into the current directory.
##### 3B #####################
git clone https://github.com/coredns/deployment.git
##### 3C #####################
# Enter the repository's kubernetes directory and feed the output of deploy.sh
# to kubectl apply.
##### 3C #####################
cd deployment/kubernetes
./deploy.sh \
  | kubectl apply -f -
# serviceaccount/coredns created
# clusterrole.rbac.authorization.k8s.io/system:coredns created
# clusterrolebinding.rbac.authorization.k8s.io/system:coredns created
# configmap/coredns created
# deployment.apps/coredns created
# Warning: resource services/kube-dns is missing the kubectl.kubernetes.io/last-applied-configuration annotation which is required by kubectl apply. kubectl apply should only be used on resources created declaratively by either kubectl create --save-config or kubectl apply. The missing annotation will be patched automatically.
# service/kube-dns configured
##### 3D #####################
# Scale the kube-dns-autoscaler deployment down to zero replicas.
##### 3D #####################
kubectl scale deployment \
  --replicas=0 \
  kube-dns-autoscaler \
  --namespace=kube-system
# deployment.apps/kube-dns-autoscaler scaled
##### 3E #####################
# Scale the kube-dns deployment down to zero replicas.
##### 3E #####################
kubectl scale deployment \
  --replicas=0 \
  kube-dns \
  --namespace=kube-system
# Warning: spec.template.metadata.annotations[scheduler.alpha.kubernetes.io/critical-pod]: non-functional in v1.16+; use the "priorityClassName" field instead
# Warning: spec.template.metadata.annotations[seccomp.security.alpha.kubernetes.io/pod]: non-functional in v1.27+; use the "seccompProfile" field instead
# deployment.apps/kube-dns scaled
##### 3F #####################
# Scale the coredns deployment up.
##### 3F #####################
# 15 or more replicas are recommended.
kubectl scale deployment coredns \
  --replicas=15 \
  -n kube-system
# deployment.apps/coredns scaled
##### 3G #####################
# Verify that the kube-dns pods have stopped.
# Verify that the coredns pods have started.
##### 3G #####################
# Each `watch` keeps re-running its quoted pipeline; interrupt it (Ctrl-C)
# to continue with the next command.
# First pipeline: kube-system pod lines containing "dns" and, ignoring case,
# "kube".
watch 'kubectl get pods -n kube-system -o=wide | grep dns | grep -i kube'
# These should be terminated / disappear soon.
# Second pipeline: the same listing, filtered for "core" instead of "kube".
watch 'kubectl get pods -n kube-system -o=wide | grep dns | grep -i core'
# These should create at least one coredns pod.
##### 3H #####################
# Rerun xpk cluster create to plumb coredns changes to the cluster.
##### 3H #####################
# Move back up two directory levels, undoing the earlier
# `cd deployment/kubernetes`.
cd ../..
# This is the same cluster create command as in step 2D. It does not recreate
# the cluster; it only updates it.
python3 xpk.py cluster create \
  --cluster "$CLUSTER" \
  --tpu-type=v5litepod-256 \
  --num-slices="$NUMSLICES" \
  --host-maintenance-interval=PERIODIC \
  --custom-cluster-arguments="$CLUSTER_ARGUMENTS" \
  --custom-tpu-nodepool-arguments="$TPU_NODEPOOL_ARGUMENTS"
##### STEP 4 ###################################################
##### PASS Cluster name and Project ID to Google POCs ##########
################################################################
# 4A. Tell Google POCs to setup GKE Cluster for large scale.
# Prints the request to pass on, with a blank line before and after it.
# printf is used so the output does not depend on echo's -e option and so any
# backslashes in CLUSTER or PROJECT are printed literally.
printf '\nTell Google POCS: We want %s in %s to be set up for high throughput scheduling and large scale.\n\n' \
  "${CLUSTER}" "${PROJECT}"
##### STEP 5 ###############################
##### Begin Scale Up of GKE Cluster ########
############################################
##### 5A #####################
# TODO(USER): Set NUMSLICES to the number of slices you wish to scale to.
##### 5A #####################
# Scaling incrementally is fine: cluster create can be run repeatedly with a
# different `--num-slices` each time.
NUMSLICES=64
### USER VARIABLES:
# TODO(USER): Replace at least the PROJECT and CLUSTER placeholder values below.
PREFIX="${USER}"
PROJECT="PROJECT_NAME"
REGION="us-central2"
ZONE="us-central2-b"
CLUSTER="CLUSTER-NAME"
export NUMSLICES PREFIX PROJECT REGION ZONE CLUSTER
# Make gcloud default to the project, zone and region exported just above.
# The expansions are quoted so a value can never be word-split or globbed.
gcloud config set project "${PROJECT}"
gcloud config set compute/zone "${ZONE}"
gcloud config set compute/region "${REGION}"
### INTERNAL VARIABLES:
# Network resource names, all built from PREFIX.
NETWORK_NAME="${PREFIX}-privatenetwork"
SUBNET_NAME="${PREFIX}-privatesubnet"
FIREWALL_RULE_NAME="${PREFIX}-privatefirewall"
ROUTER_NAME="${PREFIX}-network"
NAT_CONFIG="${PREFIX}-natconfig"
export NETWORK_NAME SUBNET_NAME FIREWALL_RULE_NAME ROUTER_NAME NAT_CONFIG
# Extra cluster and TPU node pool flags handed to xpk. Each variable holds one
# flag string: flags separated by single spaces, with one leading and one
# trailing space. The strings are built piece by piece.
CLUSTER_ARGUMENTS=" --network=${NETWORK_NAME}"
CLUSTER_ARGUMENTS+=" --subnetwork=${SUBNET_NAME}"
CLUSTER_ARGUMENTS+=" --scopes=storage-full,gke-default"
CLUSTER_ARGUMENTS+=" --enable-ip-alias"
CLUSTER_ARGUMENTS+=" --enable-private-nodes"
CLUSTER_ARGUMENTS+=" --master-ipv4-cidr 172.16.0.32/28"
CLUSTER_ARGUMENTS+=" --cluster-ipv4-cidr=10.224.0.0/12"
CLUSTER_ARGUMENTS+=" --no-enable-master-authorized-networks "
export CLUSTER_ARGUMENTS
TPU_NODEPOOL_ARGUMENTS=" --scopes=storage-full,gke-default"
TPU_NODEPOOL_ARGUMENTS+=" --enable-gvnic"
TPU_NODEPOOL_ARGUMENTS+=" --max-pods-per-node 15"
TPU_NODEPOOL_ARGUMENTS+=" --disk-size=50 "
export TPU_NODEPOOL_ARGUMENTS
##### 5B #####################
# Print the cluster create command (nothing is changed yet) so the expanded
# variables can be checked first.
##### 5B #####################
echo python3 xpk.py cluster create \
  --cluster "$CLUSTER" \
  --tpu-type=v5litepod-256 \
  --num-slices="$NUMSLICES" \
  --host-maintenance-interval=PERIODIC \
  --custom-cluster-arguments="$CLUSTER_ARGUMENTS" \
  --custom-tpu-nodepool-arguments="$TPU_NODEPOOL_ARGUMENTS"
# python3 xpk.py cluster create --cluster NAME \
# --tpu-type=v5litepod-256 --num-slices=64 \
# --host-maintenance-interval=PERIODIC \
# --custom-cluster-arguments= --network=NETWORK --subnetwork=SUBNET --scopes=storage-full,gke-default --enable-ip-alias --enable-private-nodes --master-ipv4-cidr 172.16.0.32/28 --cluster-ipv4-cidr=10.224.0.0/12 --no-enable-master-authorized-networks
# --custom-tpu-nodepool-arguments= --scopes=storage-full,gke-default --enable-gvnic --max-pods-per-node 15 --disk-size=50
##### 5C #####################
# Scale the cluster up to NUMSLICES v5e-256 slices (64 with the value above).
##### 5C #####################
python3 xpk.py cluster create \
  --cluster "$CLUSTER" \
  --tpu-type=v5litepod-256 \
  --num-slices="$NUMSLICES" \
  --host-maintenance-interval=PERIODIC \
  --custom-cluster-arguments="$CLUSTER_ARGUMENTS" \
  --custom-tpu-nodepool-arguments="$TPU_NODEPOOL_ARGUMENTS"
###############################
##### 5C - POTENTIAL ERRORS ###
###############################
# If you see failures, the first step is to retry the cluster create command.
# [XPK] Terminating all Create and Delete Nodepools processes since at least one failed.
# [XPK] Failure is NodepoolCreate-PREFIX-CLUSTER-np-53 and logfile /tmp/NodepoolCreate-PREFIX-CLUSTER-np-53-pqfbm5nl
# [XPK] Create and Delete Nodepools returned ERROR 1
# [XPK] XPK failed, error code 1
# Auto-repairing nodepools on creation.
# Node pools can go into auto repair if there is an issue with their creation. For example,
# in the DEADLINE error, the node pool will automatically repair itself. It took ~20 minutes for the
# node pool to repair in the above example. You can continue to rerun cluster create
# commands while it is auto-repairing so that the rest of the cluster continues to
# be created while the repair occurs.
# It took 20 minutes for the above internal example to go from 4 to 64 NPs with
# a series of incremental cluster create steps.
##### (OPTIONAL) STEP 6 (OPTIONAL) #########################################
##### Run a simple multislice sample job ###################################
############################################################################
##### 6A #####################
# Verify in Cloud Console that you have NUMSLICES node pools.
##### 6A #####################
# Open the printed link and check the NODES tab for NUMSLICES node pools.
# The location segment of the URL follows REGION (falling back to us-central2
# when REGION is unset) instead of being hard-coded.
echo "https://console.cloud.google.com/kubernetes/clusters/details/${REGION:-us-central2}/${CLUSTER}/details?project=${PROJECT}"
##### 6B #####################
# Run a multislice workload on all slices.
##### 6B #####################
# Set --scheduler=gke.io/high-throughput-scheduler to use the high throughput scheduler.
# CLUSTER and NUMSLICES are quoted so their values cannot be word-split.
python3 xpk.py workload create \
  --scheduler=gke.io/high-throughput-scheduler \
  --workload xpk-test-workload \
  --command "echo hello world" \
  --cluster "${CLUSTER}" \
  --tpu-type=v5litepod-256 \
  --num-slices="${NUMSLICES}"
# [XPK] Starting xpk
# [XPK] Working on args.project='PROJECT' and us-central2-b
# [XPK] Task: `Set Cluster` is implemented by `gcloud container clusters get-credentials CLUSTER --region=us-central2 --project=PROJECT && kubectl config view`, hiding output unless there is an error.
# [XPK] Task: `Set Cluster` succeeded.
# [XPK] Task: `Check if Workload Already Exists` is implemented by `kubectl get workloads -o=custom-columns='Jobset:.metadata.ownerReferences[0].name'`, hiding output unless there is an error.
# [XPK] Starting workload create
# [XPK] Task: `Creating Workload` is implemented by `kubectl apply -f /tmp/tmpk7599zd9`, streaming output live.
# [XPK] Waiting for `Creating Workload`, for 0 seconds
# [XPK] Waiting for `Creating Workload`, for 1 seconds
# jobset.jobset.x-k8s.io/xpk-test-workload created
# [XPK] Task: `Creating Workload` terminated with code `0`
# [XPK] Follow your workload here: WORKLOAD_LOGS_LINK
# [XPK] Exiting XPK cleanly
# ###########################################
# #### Logs expected from the above link ####
# ###########################################
# 2023-10-03 11:34:54.621 PDT
# XPK Start: Tue Oct 3 18:34:54 UTC 2023
# 2023-10-03 11:34:54.622 PDT
# hello world
# 2023-10-03 11:34:54.622 PDT
# XPK End: Tue Oct 3 18:34:54 UTC 2023
# 2023-10-03 11:34:54.622 PDT
# EXIT_CODE=0
# 2023-10-03 11:34:54.779 PDT
# XPK Start: Tue Oct 3 18:34:54 UTC 2023
# 2023-10-03 11:34:54.779 PDT
# hello world
# ...
##### 6C #####################
# Verify workload.
# Use the link in the above "WORKLOAD_LOGS_LINK" to view logs.
# Run xpk workload list to view all workloads on the cluster.
##### 6C #####################
# Use the link in the above "WORKLOAD_LOGS_LINK" to view logs. You should see
# the echo command in cloud logs.
# CLUSTER is quoted so its value cannot be word-split.
python3 xpk.py workload list \
  --cluster "${CLUSTER}"
###############################
##### 6C - TIPS ###############
###############################
# If you see `Not all pods are ready or succeeded` then the workload is still running.
# If you see `JobSet finished successfully` then the workload is finished successfully.
# [XPK] Starting xpk
# Namespace(xpk_subcommands='workload', func=<function workload_list at 0x7fa23a059da0>, xpk_workload_subcommands='list', cluster='CLUSTER', project=None, zone=None, dry_run=False)
# [XPK] Starting workload list
# [XPK] Working on args.project='PROJECT' and us-central2-b
# [XPK] Task: `Set Cluster` is implemented by `gcloud container clusters get-credentials CLUSTER --region=us-central2 --project=PROJECT && kubectl config view`, hiding output unless there is an error.
# [XPK] Task: `Set Cluster` succeeded.
# [XPK] Task: `List Jobs` is implemented by `kubectl get workloads -o=custom-columns='Jobset:.metadata.ownerReferences[0].name,Created Time:.metadata.creationTimestamp,Priority:.spec.priorityClassName,TPU VMs Needed:.spec.podSets[0].count,Last Status Verbose:.status.conditions[-1].message,Last Status:.status.conditions[-1].status,Last Transition:.status.conditions[-1].lastTransitionTime,Current Queue:.status.admission.clusterQueue,All Done:.status.reclaimablePods[0].count'`, streaming output live.
# [XPK] Waiting for `List Jobs`, for 0 seconds
# [XPK] Waiting for `List Jobs`, for 1 seconds
# Jobset Created Time Priority TPU VMs Needed Last Status Verbose Last Status Last Transition Current Queue All Done
# xpk-test-workload 2023-09-27T18:30:47Z medium 5120 Not all pods are ready or succeeded False 2023-09-27T18:30:47Z cluster-queue 192
# [XPK] Task: `List Jobs` terminated with code `0`
# [XPK] Exiting XPK cleanly
##### (OPTIONAL) STEP 7 (OPTIONAL) ###############################
##### XPK with Maxtext ###########################################
##################################################################
# These instructions show you how to build a docker image related to Maxtext
# and cache the docker image for faster job start times.
##### 7A #####################
# Build local Maxtext docker image.
##### 7A #####################
# Clone the maxtext repository into the current directory, enter it, and run
# its docker_build_dependency_image.sh script with bash.
git clone https://github.com/google/maxtext.git
cd maxtext
bash docker_build_dependency_image.sh
##### 7B #####################
# Upload the image to the GCP project under the name ${USER}_runner.
##### 7B #####################
bash docker_upload_runner.sh \
  "CLOUD_IMAGE_NAME=${USER}_runner"
###############################
##### 7B - TIPS ###############
###############################
# Potential permissions you need for your account or service account:
# Storage Admin
# Kubernetes Engine Admin
##### 7C #####################
# Cluster cacheimage to enable faster start times.
# XPK offers cacheimage as a wrapper around daemonset.
##### 7C #####################
# Return to the xpk repository root before calling xpk.py. When the steps are
# followed in order, maxtext is cloned from inside the xpk root, so the root is
# the parent directory; `../xpk` only exists if maxtext was cloned next to xpk.
if [ -f ../xpk.py ]; then
  cd ..
else
  cd ../xpk
fi
# CLUSTER, PROJECT and USER are quoted so their values cannot be word-split.
python3 xpk.py cluster cacheimage \
  --cluster "${CLUSTER}" \
  --docker-image "gcr.io/${PROJECT}/${USER}_runner" \
  --tpu-type=v5litepod-256
# [XPK] Starting xpk
# [XPK] Starting cluster cacheimage for cluster: xpk-test
# [XPK] Working on args.project='PROJECT' and us-central2-b
# [XPK] Task: `Deleting Cached Image` is implemented by `kubectl delete -f /tmp/tmpypvl4dn_ --ignore-not-found=true`, streaming output live.
# [XPK] Waiting for `Deleting Cached Image`, for 0 seconds
# [XPK] Waiting for `Deleting Cached Image`, for 1 seconds
# daemonset.apps "containerimage" deleted
# [XPK] Task: `Deleting Cached Image` terminated with code `0`
# [XPK] Task: `Creating Cached Image` is implemented by `kubectl apply -f /tmp/tmpypvl4dn_`, streaming output live.
# [XPK] Waiting for `Creating Cached Image`, for 0 seconds
# daemonset.apps/containerimage created
# [XPK] Task: `Creating Cached Image` terminated with code `0`
# [XPK] Exiting XPK cleanly
##### 7D #####################
# Run workload in the cluster.
##### 7D #####################
export NUMSLICES=64
# Make sure you are in the xpk github root directory when running this command
# All variable expansions are quoted so their values cannot be word-split.
# NOTE(review): the gs:// paths inside --command look like example buckets -
# confirm or replace them before running.
python3 xpk.py workload create \
  --cluster "${CLUSTER}" \
  --docker-image "gcr.io/${PROJECT}/${USER}_runner" \
  --workload "${USER}-first-job" \
  --tpu-type=v5litepod-256 \
  --num-slices="${NUMSLICES}" \
  --scheduler=gke.io/high-throughput-scheduler \
  --command "python3 MaxText/train.py MaxText/configs/base.yml base_output_directory=gs://maxtext-experiments-tpem/ dataset_path=gs://max-datasets-rogue/ steps=100 per_device_batch_size=1"
# [XPK] Starting xpk
# [XPK] Working on args.project='PROJECT' and us-central2-b
# [XPK] Task: `Set Cluster` is implemented by `gcloud container clusters get-credentials CLUSTER --region=us-central2 --project=PROJECT && kubectl config view`, hiding output unless there is an error.
# [XPK] Task: `Set Cluster` succeeded.
# [XPK] Task: `Check if Workload Already Exists` is implemented by `kubectl get workloads -o=custom-columns='Jobset:.metadata.ownerReferences[0].name'`, hiding output unless there is an error.
# [XPK] Starting workload create
# [XPK] Task: `Validate Docker Image` is implemented by `gcloud container images describe gcr.io/PROJECT/docker_build_dependency_image --project PROJECT`, hiding output unless there is an error.
# [XPK] Task: `Validate Docker Image` succeeded.
# [XPK] Task: `Creating Workload` is implemented by `kubectl apply -f /tmp/tmpiwxlv270`, streaming output live.
# [XPK] Waiting for `Creating Workload`, for 0 seconds
# jobset.jobset.x-k8s.io/JOB_NAME created
# [XPK] Task: `Creating Workload` terminated with code `0`
# [XPK] Follow your workload here: https://console.cloud.google.com/kubernetes/service/us-central2/CLUSTER/default/JOB_NAME/details?project=PROJECT
# [XPK] Exiting XPK cleanly
#
# ####################################################
# # Logs expected from the above link showcasing the #
# ### model is training and losses are decreasing ###
# ####################################################
# ...
# 2023-10-03 11:48:40.025 PDT
# completed step: 1, seconds: 8.693, TFLOP/s: 1.731, loss: 9.206
# 2023-10-03 11:48:40.308 PDT
# completed step: 2, seconds: 0.206, TFLOP/s: 72.958, loss: 9.352
# 2023-10-03 11:48:40.589 PDT
# completed step: 3, seconds: 0.282, TFLOP/s: 53.464, loss: 9.768
# 2023-10-03 11:48:40.871 PDT
# completed step: 4, seconds: 0.285, TFLOP/s: 52.876, loss: 8.903
# 2023-10-03 11:48:41.152 PDT
# completed step: 5, seconds: 0.278, TFLOP/s: 54.058, loss: 9.663
# ...
# ...
# completed step: 95, seconds: 0.281, TFLOP/s: 53.562, loss: 6.490
# 2023-10-03 11:49:06.768 PDT
# completed step: 96, seconds: 0.282, TFLOP/s: 53.430, loss: 6.532
# 2023-10-03 11:49:07.049 PDT
# completed step: 97, seconds: 0.281, TFLOP/s: 53.572, loss: 6.578
# 2023-10-03 11:49:07.330 PDT
# completed step: 98, seconds: 0.281, TFLOP/s: 53.596, loss: 6.731
# 2023-10-03 11:49:07.612 PDT
# completed step: 99, seconds: 0.281, TFLOP/s: 53.532, loss: 6.880
# ...
# ...