
Commit f96c947

dino-giant working
pierre.delaunay committed Jul 26, 2024
1 parent: bebcf9e
Showing 6 changed files with 87 additions and 77 deletions.
8 changes: 4 additions & 4 deletions benchmarks/dinov2/Makefile
@@ -5,7 +5,7 @@ endif

export MILABENCH_BASE

-BENCH_NAME=dinov2-large
+BENCH_NAME=dinov2-giant
MILABENCH_CONFIG=dev.yaml
MILABENCH_ARGS=--config $(MILABENCH_CONFIG) --base $(MILABENCH_BASE)

@@ -22,10 +22,10 @@ tests: install prepare
	milabench run $(MILABENCH_ARGS)

single:
-	CUDA_VISIBLE_DEVICES=0,1 milabench run $(MILABENCH_ARGS) --select $(BENCH_NAME)-single
+	CUDA_VISIBLE_DEVICES=0 milabench run $(MILABENCH_ARGS) --select $(BENCH_NAME)

gpus:
-	milabench run $(MILABENCH_ARGS) --select $(BENCH_NAME)-gpus
+	milabench run $(MILABENCH_ARGS) --select $(BENCH_NAME)

nodes:
-	milabench run $(MILABENCH_ARGS) --select $(BENCH_NAME)-nodes
+	milabench run $(MILABENCH_ARGS) --select $(BENCH_NAME)
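
With this change the single, gpus, and nodes targets all run the same dinov2-giant bench; they differ only in how many GPUs the launched process can see. A minimal sketch of how CUDA_VISIBLE_DEVICES narrows device visibility (the torch calls are illustrative, not part of this commit):

    import os

    # Must be set before CUDA initializes for it to take effect.
    os.environ["CUDA_VISIBLE_DEVICES"] = "0"

    import torch

    # Only device 0 is visible, so this prints 1 even on a multi-GPU node.
    print(torch.cuda.device_count())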
14 changes: 9 additions & 5 deletions benchmarks/dinov2/dev.yaml
@@ -11,19 +11,23 @@ _dinov2:
    --output-dir: "{milabench_extra}/output"
    --no-resume: true

-dinov2-large-single:
+dinov2-large:
  inherits: _dinov2
  argv:
    --config-file: src/dinov2/configs/train/vitl14.yaml
-    # THIS NEEDS TO BE LAST
+    # THOSE NEED TO BE LAST
    train.dataset_path=ImageNet:split=TRAIN:root={milabench_data}/FakeImageNet:extra={milabench_data}/FakeImageNet: true
+    train.batch_size_per_gpu=32: true
+    train.saveckp_freq=100: true
+    train.num_workers=10: true


dinov2-giant:
  inherits: _dinov2
  argv:
    --config-file: src/dinov2/configs/train/vitg14.yaml
-    # THIS NEEDS TO BE LAST
+    # THOSE NEED TO BE LAST
    train.dataset_path=ImageNet:split=TRAIN:root={milabench_data}/FakeImageNet:extra={milabench_data}/FakeImageNet: true
+    train.batch_size_per_gpu=32: true
+    train.saveckp_freq=100: true
+    train.num_workers=10: true
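
Each entry under argv becomes one command-line token: a value of true passes the key bare, which is how the trailing train.* overrides reach dinov2's config parser after --config-file. A hypothetical helper illustrating the mapping (not milabench's actual implementation):

    def argv_from_config(argv: dict) -> list[str]:
        # Dict insertion order is preserved in Python, which is what lets
        # the train.* overrides stay last, as the comments above require.
        tokens = []
        for key, value in argv.items():
            if value is True:
                tokens.append(key)                # bare token, e.g. "--no-resume"
            else:
                tokens.extend([key, str(value)])  # option followed by its value
        return tokens

    # -> ['--config-file', 'src/dinov2/configs/train/vitg14.yaml', ...,
    #     'train.batch_size_per_gpu=32', 'train.saveckp_freq=100', ...]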
6 changes: 1 addition & 5 deletions benchmarks/dinov2/main.py
@@ -1,16 +1,12 @@
#!/usr/bin/env python

import os
-from benchmate.datagen import generate_fakeimagenet
-
-
-


if __name__ == "__main__":
    import sys
    sys.path.append(os.path.dirname(__file__) + "/src/")
-    from dinov2.train.train import main, get_args_parser

+    from dinov2.train.train import main, get_args_parser
    args = get_args_parser(add_help=True).parse_args()
    main(args)
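
Combined with dev.yaml above, the resulting launch looks roughly like this (a hypothetical reconstruction; milabench assembles the real command line, with placeholders such as {milabench_data} already resolved):

    import subprocess

    # One worker's command line, reconstructed for illustration.
    subprocess.run([
        "python", "benchmarks/dinov2/main.py",
        "--config-file", "src/dinov2/configs/train/vitg14.yaml",
        "--output-dir", "/tmp/extra/output",
        "--no-resume",
        "train.dataset_path=ImageNet:split=TRAIN:root=/data/FakeImageNet:extra=/data/FakeImageNet",
        "train.batch_size_per_gpu=32",
        "train.saveckp_freq=100",
        "train.num_workers=10",
    ], check=True)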
36 changes: 18 additions & 18 deletions benchmarks/dinov2/prepare.py
@@ -1,31 +1,31 @@
#!/usr/bin/env python

-import os
-from benchmate.datagen import generate_fakeimagenet
-
-
-
+from benchmate.datagen import generate_fakeimagenet, device_count


if __name__ == "__main__":
+    import os
    import sys
    sys.path.append(os.path.dirname(__file__) + "/src/")
    from dinov2.data.datasets import ImageNet

    data_directory = os.environ["MILABENCH_DIR_DATA"]
    dest = os.path.join(data_directory, f"FakeImageNet")
+    if job_id := os.getenv("SLURM_JOB_ID"):
+        del os.environ["SLURM_JOB_ID"]

+    from argparse import Namespace
+    from dinov2.train.train import setup, get_args_parser

-    # class_id, class_name
-    with open(dest + "/labels.txt", "w") as fp:
-        for i in range(1000):
-            fp.write(f"{i}, {i}\n")
+    args = get_args_parser(add_help=True).parse_args()
+    cfg = setup(args)

+    args = Namespace(
+        batch_size=cfg["train"]["batch_size_per_gpu"],
+        batch_count=60,
+        device_count=device_count(),
+        device=None,
+        image_size=[3, 384, 384],
+        val=0.1,
+        test=0.1
+    )
-    #
-    # generate_fakeimagenet()
-
-

    for split in ImageNet.Split:
        dataset = ImageNet(split=split, root=dest, extra=dest)
        dataset.dump_extra()
+    generate_fakeimagenet(args)
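
prepare.py now sizes the fake dataset from the same training config the benchmark will run: setup() resolves the config selected by --config-file, and its batch_size_per_gpu feeds generate_fakeimagenet, while deleting SLURM_JOB_ID keeps dinov2's setup() from treating data preparation as a SLURM-launched distributed job. A hedged sketch of what a generator consuming that Namespace might do (benchmate.datagen.generate_fakeimagenet is not shown in this diff, so the body below is an assumption):

    import os
    from argparse import Namespace

    def sketch_generate(args: Namespace, dest: str) -> None:
        # Assumption: scale the dataset with the per-GPU batch size so every
        # device can draw `batch_count` full batches without repeating data.
        total = args.batch_size * args.batch_count * args.device_count
        n_val = int(total * args.val)
        n_test = int(total * args.test)
        n_train = total - n_val - n_test
        for split, count in (("train", n_train), ("val", n_val), ("test", n_test)):
            os.makedirs(os.path.join(dest, split), exist_ok=True)
            # ... write `count` random images of shape args.image_size ...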
79 changes: 34 additions & 45 deletions benchmarks/dinov2/voirfile.py
@@ -2,9 +2,8 @@

from voir.phase import StopProgram
from voir import configurable
-from voir.instruments import dash, early_stop, log
from benchmate.observer import BenchObserver
-from benchmate.monitor import monitor_monogpu, multigpu_monitor
+from benchmate.monitor import voirfile_monitor


@dataclass
@@ -37,26 +36,45 @@ def instrument_main(ov, options: Config):

    yield ov.phases.load_script

-    if options.dash:
-        ov.require(dash)
+    # GPU monitor, rate, loss etc...
+    voirfile_monitor(ov, options)

-    monitor = monitor_monogpu
-    if os.getenv("RANK", -1) != -1:
-        monitor = multigpu_monitor
+    code_patch(ov)

-    instruments = [
-        log(
-            "value", "progress", "rate", "units", "loss", "gpudata", context="task"
-        ),
-        monitor(poll_interval=options.gpu_poll)
-    ]
+    #
+    # Insert milabench tools
+    #
+    def batch_size(x):
+        return x["collated_global_crops"].shape[0]
+
+    observer = BenchObserver(
+        earlystop=options.stop + options.skip,
+        batch_size_fn=batch_size,
+    )
+
+    probe = ov.probe("/dinov2.data.loaders/make_data_loader() as loader", overridable=True)
+    probe['loader'].override(observer.loader)
+
+    probe = ov.probe("/dinov2.train.train/do_train > losses_reduced", overridable=True)
+    probe["losses_reduced"].override(observer.record_loss)
+
+    probe = ov.probe("/dinov2.train.train/build_optimizer() as optimizer", overridable=True)
+    probe['optimizer'].override(observer.optimizer)
+
+    #
+    # Run the benchmark
+    #
+    try:
+        yield ov.phases.run_script
+    except StopProgram:
+        print("early stopped")

-    if int(os.getenv("RANK", 0)) == 0:
-        instruments.append(early_stop(n=options.stop, key="rate", task="train", signal="stop"))
-
-    ov.require(*instruments)
-
+
+def code_patch(ov):
+    # FIX dinov2 code using ptera
    import os

    from torchvision.datasets import ImageFolder
    import torch
    import dinov2.train.train
@@ -80,32 +98,3 @@ def override_parsed_dataset(results):

    probe = ov.probe("/dinov2.data.loaders/_parse_dataset_str() as dataset_kwargs", overridable=True)
    probe['dataset_kwargs'].override(override_parsed_dataset)
-
-
-    #
-    # Insert milabench tools
-    #
-    def batch_size(x):
-        return x["collated_global_crops"].shape[0]
-
-    observer = BenchObserver(
-        earlystop=options.stop + options.skip,
-        batch_size_fn=batch_size,
-    )
-
-    probe = ov.probe("/dinov2.data.loaders/make_data_loader() as loader", overridable=True)
-    probe['loader'].override(observer.loader)
-
-    probe = ov.probe("/dinov2.train.train/do_train > losses_reduced", overridable=True)
-    probe["losses_reduced"].override(observer.record_loss)
-
-    probe = ov.probe("/dinov2.train.train/build_optimizer() as optimizer", overridable=True)
-    probe['optimizer'].override(observer.optimizer)
-
-    #
-    # Run the benchmark
-    #
-    try:
-        yield ov.phases.run_script
-    except StopProgram:
-        print("early stopped")
21 changes: 21 additions & 0 deletions benchmate/benchmate/monitor.py
@@ -135,3 +135,24 @@ def milabench_sys_monitor(monogpu=False):
    return setupvoir(monogpu)


+
+def voirfile_monitor(ov, options):
+    from voir.instruments import early_stop, log, dash
+
+    if options.dash:
+        ov.require(dash)
+
+    instruments = [
+        log(
+            "value", "progress", "rate", "units", "loss", "gpudata", context="task"
+        )
+    ]
+
+    if int(os.getenv("RANK", 0)) == 0:
+        instruments.append(early_stop(n=options.stop, key="rate", task="train", signal="stop"))
+        instruments.append(monitor_node(poll_interval=options.gpu_poll))
+
+    if os.getenv("RANK", -1) == -1:
+        instruments.append(monitor_monogpu(poll_interval=options.gpu_poll))
+
+    ov.require(*instruments)
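
The RANK checks split the instruments between processes: rank 0, or a run with no RANK at all, gets early stopping plus node-level monitoring, and a run without RANK also gets the single-GPU monitor (os.getenv returns a string when RANK is set, so comparing with the int default -1 is only true when it is unset). A small mirror of that gating which returns instrument names instead of requiring them (illustrative only):

    import os

    def planned_instruments() -> list[str]:
        rank = os.getenv("RANK")
        plan = ["log"]
        if int(rank or 0) == 0:   # rank 0 of a distributed run, or no RANK
            plan += ["early_stop", "monitor_node"]
        if rank is None:          # RANK unset: single-process run
            plan.append("monitor_monogpu")
        return plan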
