Attempt fix on dinov2-giant-nodes
satyaog committed Sep 4, 2024
1 parent ef4f437 commit 5a83c2c
Showing 5 changed files with 72 additions and 8 deletions.
3 changes: 3 additions & 0 deletions benchmarks/dinov2/benchfile.py
@@ -3,6 +3,9 @@


SOURCE_DIR = "src"
# Fix https://github.com/facebookresearch/dinov2/pull/281
# REPO_URL = "https://github.com/pathologywatch/dinov2"
# BRANCH = "733d285dccf407ebd2c0e97f899d67206ede44a5"
REPO_URL = "https://github.com/Delaunay/dinov2"
BRANCH = "451bc15a084f42cc97c21e3bc0be9e9158f9049c"

40 changes: 40 additions & 0 deletions benchmarks/dinov2/prepare.py
@@ -1,7 +1,14 @@
#!/usr/bin/env python

from pathlib import Path
import os
from benchmate.datagen import generate_fakeimagenet, device_count
from tqdm import tqdm


def loop_on(iterable: list):
    while True:
        yield from iterable


if __name__ == "__main__":
@@ -13,6 +20,7 @@
    del os.environ["SLURM_JOB_ID"]

    from argparse import Namespace
    from dinov2.data.loaders import ImageNet, _parse_dataset_str
    from dinov2.train.train import get_args_parser
    from dinov2.utils.config import get_cfg_from_args, apply_scaling_rules_to_cfg

@@ -33,3 +41,35 @@
    )
    #
    generate_fakeimagenet(args)

    # Generate metadata
    class_, kwargs = _parse_dataset_str(cfg.train.dataset_path)
    dataset = class_(**kwargs)
    root = Path(dataset.root)
    for split in class_.Split:
        dirs = sorted(entry for entry in root.glob(f"{split.value}/*/") if entry.is_dir())
        first_files = [
            sorted(entry for entry in _dir.glob("*") if not entry.is_dir())[0]
            for _dir in dirs
        ]
        files_cnt = len([entry for entry in root.glob(f"{split.value}/*/*") if not entry.is_dir()])
        missings_cnt = split.length - files_cnt

        for linkname, first_file in zip(
            (
                root / split.get_image_relpath(i, _dir.name)
                for i, _dir in zip(
                    tqdm(range(split.length), total=split.length),
                    loop_on(dirs),
                )
            ),
            loop_on(first_files)
        ):
            if missings_cnt <= 0:
                break
            if linkname.exists():
                continue
            linkname.hardlink_to(first_file)
            missings_cnt -= 1

    dataset.dump_extra()
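
The padding loop above exists because DINOv2's ImageNet dataset class expects each split to contain exactly split.length images laid out with ImageNet-style relative paths; generate_fakeimagenet() produces fewer, so every missing name is hard-linked to an existing image from the same class directory, which costs no extra disk space. A minimal, self-contained sketch of the same idea on a toy split directory; the function name and the padded_ filename pattern are illustrative, not part of the commit:

from pathlib import Path


def pad_with_hardlinks(split_dir: Path, target: int) -> None:
    # Hard-link the first file under new names until split_dir holds `target` files.
    files = sorted(p for p in split_dir.iterdir() if p.is_file())
    first = files[0]
    for i in range(target - len(files)):
        link = split_dir / f"padded_{i:08d}{first.suffix}"
        if not link.exists():
            # A hard link shares the inode of the original, so no image data is duplicated.
            link.hardlink_to(first)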
30 changes: 25 additions & 5 deletions benchmate/benchmate/datagen.py
@@ -15,29 +15,35 @@
def write(args):
    import torchvision.transforms as transforms

    offset, outdir, size = args
    offset, outdir, prefix, size = args

    img = torch.randn(*size)
    target = offset % 1000  # torch.randint(0, 1000, size=(1,), dtype=torch.long)[0]
    img = transforms.ToPILImage()(img)

    class_val = int(target)
    image_name = f"{offset}.jpeg"

    # Some benches need filenames to match those of imagenet:
    # https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/data/datasets/image_net.py#L40-L43
    if not prefix:  # train
        image_name = f"{class_val}_{offset}"
    else:  # val, test
        image_name = f"{prefix}{int(offset):08d}"

    path = os.path.join(outdir, str(class_val))
    os.makedirs(path, exist_ok=True)

    image_path = os.path.join(path, image_name)
    image_path = os.path.join(path, f"{image_name}.JPEG")
    img.save(image_path)


def generate(image_size, n, outdir, start=0):
def generate(image_size, n, outdir, prefix="", start=0):
    work_items = []
    for i in range(n):
        work_items.append(
            [
                start + i,
                outdir,
                prefix,
                image_size,
            ]
        )
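
To make the naming rule concrete, here is a tiny standalone helper, written for this note rather than taken from benchmate, that reproduces write()'s file naming for both branches:

def fake_image_name(offset: int, prefix: str) -> str:
    # Train images are named "<class>_<offset>"; val/test use the ImageNet-style
    # "<prefix><offset padded to 8 digits>". Both get a .JPEG suffix, as above.
    class_val = offset % 1000
    name = f"{class_val}_{offset}" if not prefix else f"{prefix}{offset:08d}"
    return f"{name}.JPEG"


assert fake_image_name(4007, "") == "7_4007.JPEG"
assert fake_image_name(4007, "ILSVRC2012_val_") == "ILSVRC2012_val_00004007.JPEG"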
@@ -67,12 +73,20 @@ def generate_sets(root, sets, shape):
    for split, count in sets.items():
        current_count = total_images.get(split, 0)

        # Some benches need filenames to match those of imagenet:
        # https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/data/datasets/image_net.py#L40-L43
        if split == "train":
            prefix = ""
        else:  # split in (val, test):
            prefix = f"ILSVRC2012_{split}_"

        if current_count < count:
            print(f"Generating {split} (current {current_count}) (target: {count})")
            generate(
                shape,
                count - current_count,
                os.path.join(root, split),
                prefix=prefix,
                start=current_count,
            )

@@ -123,6 +137,12 @@ def generate_fakeimagenet(args=None):
    }

    generate_sets(dest, size_spec, args.image_size)

    labels = set([int(entry.name) for entry in Path(dest).glob("*/*/")])
    with open(os.path.join(dest, "labels.txt"), "wt") as _f:
        # class_id,class_name
        _f.writelines([f"{l},{l}\n" for l in sorted(labels)])

    print("Done!")


2 changes: 1 addition & 1 deletion config/base.yaml
@@ -488,7 +488,7 @@ dinov2-giant-nodes:
  argv:
    --config-file: "{benchmark_folder}/src/dinov2/configs/train/vitg14.yaml"
    # THOSE NEED TO BE LAST
    train.dataset_path=ImageFolder:root={milabench_data}/FakeImageNet: true
    train.dataset_path=ImageNet:split=TRAIN:root={milabench_data}/FakeImageNet:extra={milabench_data}/FakeImageNet: true
    train.batch_size_per_gpu=12: true
    train.saveckp_freq=100: true
    train.num_workers=10: true
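
The new dataset_path value is a dinov2-style dataset string: the token before the first ':' names the dataset class and the remaining key=value tokens become constructor arguments, which is what _parse_dataset_str consumes in prepare.py above. A rough sketch of that decoding, under the assumption that the real parser in dinov2.data.loaders additionally maps the split string onto the dataset's Split enum; the paths are illustrative:

def parse_dataset_str_sketch(dataset_str: str):
    # "ImageNet:split=TRAIN:root=/data/FakeImageNet:extra=/data/FakeImageNet"
    # -> ("ImageNet", {"split": "TRAIN", "root": "/data/FakeImageNet", "extra": "/data/FakeImageNet"})
    name, *tokens = dataset_str.split(":")
    kwargs = dict(token.split("=", 1) for token in tokens)
    return name, kwargs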
5 changes: 3 additions & 2 deletions milabench/remote.py
@@ -67,11 +67,12 @@ def rsync(node, src=None, remote_src=None, dest=None, force=False) -> list:
    return [
        "rsync",
        *(["--force"] if force else []),
        "-av",
        "-aHv",
        "--del",
        "-e",
        f"ssh {key} -oCheckHostIP=no -oStrictHostKeyChecking=no",
        "--include=*/.git/*",
        *[f"--exclude=*/{_dir}/*" for _dir in (".*", "tmp") ],
        *[f"--exclude=*/{_dir}/*" for _dir in (".*", "tmp")],
        *src, *remote_src,
        dest,
    ]
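
The switch from -av to -aHv adds rsync's --hard-links behaviour: without it, rsync recreates every hard link on the receiving side as an independent full copy, presumably defeating the hard-linked FakeImageNet padding created by prepare.py when the tree is synced to other nodes. A hedged illustration of the kind of argument list rsync() now returns; the host, key file and paths are made up:

example_cmd = [
    "rsync",
    "-aHv",  # archive mode, preserve hard links, verbose
    "--del",
    "-e", "ssh -i /home/user/.ssh/id_milabench -oCheckHostIP=no -oStrictHostKeyChecking=no",
    "--include=*/.git/*",
    "--exclude=*/.*/*",
    "--exclude=*/tmp/*",
    "/home/user/milabench/",        # src (illustrative)
    "node1:/home/user/milabench/",  # dest (illustrative)
]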
