forked from allenai/OLMo
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Makefile
125 lines (109 loc) · 4.08 KB
/
Makefile
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
# If you update this, also update BEAKER_IMAGE in .github/workflows/main.yml
IMAGE_NAME_BASE = olmo-torch2
# If you update this, also update BEAKER_WORKSPACE in .github/workflows/main.yml
BEAKER_WORKSPACE = ai2/llm-testing
BEAKER_USER = $(shell beaker account whoami --format=json | jq -r '.[0].name')
GANTRY_IMAGE = $(shell beaker workspace images $(BEAKER_WORKSPACE) --format=json | jq -r -c '.[] | select( .name == "$(IMAGE_NAME_BASE)-gantry" ) | .fullName')
TEST_IMAGE = $(shell beaker workspace images $(BEAKER_WORKSPACE) --format=json | jq -r -c '.[] | select( .name == "$(IMAGE_NAME_BASE)-test" ) | .fullName')
.PHONY : run-checks
run-checks :
isort --check .
black --check .
ruff check .
mypy .
CUDA_VISIBLE_DEVICES='' pytest -v --color=yes tests/
.PHONY : beaker-info
beaker-info :
@echo "Beaker user: $(BEAKER_USER)"
@echo "Gantry image: $(GANTRY_IMAGE)"
@echo "Testing image: $(TEST_IMAGE)"
.PHONY : images
images : gantry-image test-image
.PHONY : base-image
base-image :
docker build -f docker/Dockerfile.base -t $(IMAGE_NAME_BASE)-base .
.PHONY : gantry-image
gantry-image :
docker build -f docker/Dockerfile.gantry -t $(IMAGE_NAME_BASE)-gantry .
beaker image create $(IMAGE_NAME_BASE)-gantry --name $(IMAGE_NAME_BASE)-gantry-tmp --workspace $(BEAKER_WORKSPACE)
beaker image delete $(GANTRY_IMAGE) || true
beaker image rename $(BEAKER_USER)/$(IMAGE_NAME_BASE)-gantry-tmp $(IMAGE_NAME_BASE)-gantry
.PHONY : test-image
test-image : base-image
docker build -f docker/Dockerfile.test -t $(IMAGE_NAME_BASE)-test .
beaker image create $(IMAGE_NAME_BASE)-test --name $(IMAGE_NAME_BASE)-test-tmp --workspace $(BEAKER_WORKSPACE)
beaker image delete $(TEST_IMAGE) || true
beaker image rename $(BEAKER_USER)/$(IMAGE_NAME_BASE)-test-tmp $(IMAGE_NAME_BASE)-test
.PHONY : lumi-image
lumi-image :
docker build -f docker/Dockerfile.lumi -t ghcr.io/allenai/llm-lumi:$(shell git log -1 --pretty=format:%h) .
docker push ghcr.io/allenai/llm-lumi:$(shell git log -1 --pretty=format:%h)
.PHONY : singularity-pull
singularity-pull :
singularity pull $PROJECT_DIR/containers/llm-lumi_$(TAG).sif docker://ghcr.io/allenai/llm-lumi:$(TAG)
.PHONY : show-test-image
show-test-image :
@echo $(TEST_IMAGE)
.PHONY : show-gantry-image
show-gantry-image :
@echo $(GANTRY_IMAGE)
.PHONY : show-beaker-workspace
show-beaker-workspace :
@echo $(BEAKER_WORKSPACE)
.PHONY : gantry-test
gantry-test :
gantry run \
--workspace "$(BEAKER_WORKSPACE)" \
--priority "normal" \
--preemptible \
--beaker-image "$(GANTRY_IMAGE)" \
--gpus 1 \
--description "Test run" \
--cluster ai2/allennlp-cirrascale \
--cluster ai2/aristo-cirrascale \
--cluster ai2/mosaic-cirrascale \
--cluster ai2/mosaic-cirrascale-a100 \
--cluster ai2/prior-cirrascale \
--cluster ai2/s2-cirrascale \
--cluster ai2/general-cirrascale \
--cluster ai2/general-cirrascale-a100-80g-ib \
--cluster ai2/jupiter-cirrascale \
--cluster ai2/pluto-cirrascale \
--allow-dirty \
--venv base \
--timeout -1 \
--yes \
-- make check-cuda-install
.PHONY : gantry-run-ib
gantry-run-ib :
gantry run \
--workspace "$(BEAKER_WORKSPACE)" \
--priority "normal" \
--preemptible \
--beaker-image "$(GANTRY_IMAGE)" \
--gpus 8 \
--description "LLM Beaker IB Cluster Run" \
--cluster ai2/general-cirrascale-a100-80g-ib \
--cluster ai2/jupiter-cirrascale \
--cluster ai2/pluto-cirrascale \
--nfs \
--env WORLD_SIZE=32 \
--env GPUS=8 \
--env NCCL_DEBUG=INFO \
--env SCRATCH_DIR=/tmp/scratch \
--env FLASH_DIR=/tmp/flash \
--env WANDB_PROJECT=olmo-beaker-ib \
--env-secret WANDB_API_KEY=WANDB_API_KEY \
--replicas 4 \
--leader-selection \
--host-networking \
--allow-dirty \
--venv base \
--yes \
-- /bin/bash -c 'torchrun --master-addr $$BEAKER_LEADER_REPLICA_HOSTNAME --master_port 1234 --nnodes 4 --node-rank $$BEAKER_REPLICA_RANK --nproc-per-node 8 scripts/train.py configs/c4-large.yaml'
.PHONY : check-cpu-install
check-cpu-install :
@python -c 'from olmo import check_install; check_install(cuda=False)'
.PHONY : check-cuda-install
check-cuda-install :
@python -c 'from olmo import check_install; check_install(cuda=True)'