diff --git a/benchmarks/dinov2/benchfile.py b/benchmarks/dinov2/benchfile.py index 214a013f8..0b4f10e6c 100644 --- a/benchmarks/dinov2/benchfile.py +++ b/benchmarks/dinov2/benchfile.py @@ -3,6 +3,9 @@ SOURCE_DIR = "src" +# Fix https://github.com/facebookresearch/dinov2/pull/281 +# REPO_URL = "https://github.com/pathologywatch/dinov2" +# BRANCH = "733d285dccf407ebd2c0e97f899d67206ede44a5" REPO_URL = "https://github.com/Delaunay/dinov2" BRANCH = "451bc15a084f42cc97c21e3bc0be9e9158f9049c" diff --git a/benchmarks/dinov2/prepare.py b/benchmarks/dinov2/prepare.py index 8eaf83f52..60bfa6813 100755 --- a/benchmarks/dinov2/prepare.py +++ b/benchmarks/dinov2/prepare.py @@ -1,7 +1,14 @@ #!/usr/bin/env python +from pathlib import Path import os from benchmate.datagen import generate_fakeimagenet, device_count +from tqdm import tqdm + + +def loop_on(iterable:list): + while 1: + yield from iterable if __name__ == "__main__": @@ -13,6 +20,7 @@ del os.environ["SLURM_JOB_ID"] from argparse import Namespace + from dinov2.data.loaders import ImageNet, _parse_dataset_str from dinov2.train.train import get_args_parser from dinov2.utils.config import get_cfg_from_args, apply_scaling_rules_to_cfg @@ -33,3 +41,35 @@ ) # generate_fakeimagenet(args) + + # Generate metadata + class_, kwargs = _parse_dataset_str(cfg.train.dataset_path) + dataset = class_(**kwargs) + root = Path(dataset.root) + for split in class_.Split: + dirs = sorted(entry for entry in root.glob(f"{split.value}/*/") if entry.is_dir()) + first_files = [ + sorted(entry for entry in _dir.glob(f"*") if not entry.is_dir())[0] + for _dir in dirs + ] + files_cnt = len([entry for entry in root.glob(f"{split.value}/*/*") if not entry.is_dir()]) + missings_cnt = split.length - files_cnt + + for linkname, first_file in zip( + ( + root / split.get_image_relpath(i, _dir.name) + for i, _dir in zip( + tqdm(range(split.length), total=split.length), + loop_on(dirs), + ) + ), + loop_on(first_files) + ): + if missings_cnt <= 0: + break + if linkname.exists(): + continue + linkname.hardlink_to(first_file) + missings_cnt -= 1 + + dataset.dump_extra() diff --git a/benchmate/benchmate/datagen.py b/benchmate/benchmate/datagen.py index a7a753099..1840667a0 100644 --- a/benchmate/benchmate/datagen.py +++ b/benchmate/benchmate/datagen.py @@ -15,29 +15,35 @@ def write(args): import torchvision.transforms as transforms - offset, outdir, size = args + offset, outdir, prefix, size = args img = torch.randn(*size) target = offset % 1000 # torch.randint(0, 1000, size=(1,), dtype=torch.long)[0] img = transforms.ToPILImage()(img) - class_val = int(target) - image_name = f"{offset}.jpeg" + + # Some benches need filenames to match those of imagenet: + # https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/data/datasets/image_net.py#L40-L43 + if not prefix: # train + image_name = f"{class_val}_{offset}" + else: # val, test + image_name = f"{prefix}{int(offset):08d}" path = os.path.join(outdir, str(class_val)) os.makedirs(path, exist_ok=True) - image_path = os.path.join(path, image_name) + image_path = os.path.join(path, f"{image_name}.JPEG") img.save(image_path) -def generate(image_size, n, outdir, start=0): +def generate(image_size, n, outdir, prefix="", start=0): work_items = [] for i in range(n): work_items.append( [ start + i, outdir, + prefix, image_size, ] ) @@ -67,12 +73,20 @@ def generate_sets(root, sets, shape): for split, count in sets.items(): current_count = total_images.get(split, 0) + # Some benches need filenames to match those of imagenet: + # https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/data/datasets/image_net.py#L40-L43 + if split == "train": + prefix = "" + else: # split in (val, test): + prefix = f"ILSVRC2012_{split}_" + if current_count < count: print(f"Generating {split} (current {current_count}) (target: {count})") generate( shape, count - current_count, os.path.join(root, split), + prefix=prefix, start=current_count, ) @@ -123,6 +137,12 @@ def generate_fakeimagenet(args=None): } generate_sets(dest, size_spec, args.image_size) + + labels = set([int(entry.name) for entry in Path(dest).glob("*/*/")]) + with open(os.path.join(dest, "labels.txt"), "wt") as _f: + # class_id,class_name + _f.writelines([f"{l},{l}\n" for l in sorted(labels)]) + print("Done!") diff --git a/config/base.yaml b/config/base.yaml index 6801c288f..b49a9a125 100644 --- a/config/base.yaml +++ b/config/base.yaml @@ -488,7 +488,7 @@ dinov2-giant-nodes: argv: --config-file: "{benchmark_folder}/src/dinov2/configs/train/vitg14.yaml" # THOSE NEED TO BE LAST - train.dataset_path=ImageFolder:root={milabench_data}/FakeImageNet: true + train.dataset_path=ImageNet:split=TRAIN:root={milabench_data}/FakeImageNet:extra={milabench_data}/FakeImageNet: true train.batch_size_per_gpu=12: true train.saveckp_freq=100: true train.num_workers=10: true diff --git a/milabench/remote.py b/milabench/remote.py index 6e54521a1..3a4b348d4 100644 --- a/milabench/remote.py +++ b/milabench/remote.py @@ -67,11 +67,12 @@ def rsync(node, src=None, remote_src=None, dest=None, force=False) -> list: return [ "rsync", *(["--force"] if force else []), - "-av", + "-aHv", + "--del", "-e", f"ssh {key} -oCheckHostIP=no -oStrictHostKeyChecking=no", "--include=*/.git/*", - *[f"--exclude=*/{_dir}/*" for _dir in (".*", "tmp") ], + *[f"--exclude=*/{_dir}/*" for _dir in (".*", "tmp")], *src, *remote_src, dest, ]