Attempt fix on dinov2-giant-nodes
satyaog committed Sep 4, 2024
1 parent ef4f437 commit 5a83c2c
Showing 5 changed files with 72 additions and 8 deletions.
3 changes: 3 additions & 0 deletions benchmarks/dinov2/benchfile.py
@@ -3,6 +3,9 @@


SOURCE_DIR = "src"
# Fix https://github.com/facebookresearch/dinov2/pull/281
# REPO_URL = "https://github.com/pathologywatch/dinov2"
# BRANCH = "733d285dccf407ebd2c0e97f899d67206ede44a5"
REPO_URL = "https://github.com/Delaunay/dinov2"
BRANCH = "451bc15a084f42cc97c21e3bc0be9e9158f9049c"

40 changes: 40 additions & 0 deletions benchmarks/dinov2/prepare.py
@@ -1,7 +1,14 @@
#!/usr/bin/env python

from pathlib import Path
import os
from benchmate.datagen import generate_fakeimagenet, device_count
from tqdm import tqdm


def loop_on(iterable: list):
    while True:
        yield from iterable


if __name__ == "__main__":
@@ -13,6 +20,7 @@
    del os.environ["SLURM_JOB_ID"]

    from argparse import Namespace
    from dinov2.data.loaders import ImageNet, _parse_dataset_str
    from dinov2.train.train import get_args_parser
    from dinov2.utils.config import get_cfg_from_args, apply_scaling_rules_to_cfg

@@ -33,3 +41,35 @@
    )
    #
    generate_fakeimagenet(args)

    # Generate metadata
    class_, kwargs = _parse_dataset_str(cfg.train.dataset_path)
    dataset = class_(**kwargs)
    root = Path(dataset.root)
    for split in class_.Split:
        dirs = sorted(entry for entry in root.glob(f"{split.value}/*/") if entry.is_dir())
        first_files = [
            sorted(entry for entry in _dir.glob("*") if not entry.is_dir())[0]
            for _dir in dirs
        ]
        files_cnt = len([entry for entry in root.glob(f"{split.value}/*/*") if not entry.is_dir()])
        missings_cnt = split.length - files_cnt

        for linkname, first_file in zip(
            (
                root / split.get_image_relpath(i, _dir.name)
                for i, _dir in zip(
                    tqdm(range(split.length), total=split.length),
                    loop_on(dirs),
                )
            ),
            loop_on(first_files)
        ):
            if missings_cnt <= 0:
                break
            if linkname.exists():
                continue
            linkname.hardlink_to(first_file)
            missings_cnt -= 1

    dataset.dump_extra()
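
The padding loop above exists because DINOv2's ImageNet dataset class expects each split to contain exactly split.length images laid out with ImageNet-style relative paths; generate_fakeimagenet() produces fewer, so every missing name is hard-linked to an existing image from the same class directory, which costs no extra disk space. A minimal, self-contained sketch of the same idea on a toy split directory; the function name and the padded_ filename pattern are illustrative, not part of the commit:

from pathlib import Path


def pad_with_hardlinks(split_dir: Path, target: int) -> None:
    # Hard-link the first file under new names until split_dir holds `target` files.
    files = sorted(p for p in split_dir.iterdir() if p.is_file())
    first = files[0]
    for i in range(target - len(files)):
        link = split_dir / f"padded_{i:08d}{first.suffix}"
        if not link.exists():
            # A hard link shares the inode of the original, so no image data is duplicated.
            link.hardlink_to(first)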
30 changes: 25 additions & 5 deletions benchmate/benchmate/datagen.py
@@ -15,29 +15,35 @@
def write(args):
    import torchvision.transforms as transforms

    offset, outdir, size = args
    offset, outdir, prefix, size = args

    img = torch.randn(*size)
    target = offset % 1000  # torch.randint(0, 1000, size=(1,), dtype=torch.long)[0]
    img = transforms.ToPILImage()(img)

    class_val = int(target)
    image_name = f"{offset}.jpeg"

    # Some benches need filenames to match those of imagenet:
    # https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/data/datasets/image_net.py#L40-L43
    if not prefix:  # train
        image_name = f"{class_val}_{offset}"
    else:  # val, test
        image_name = f"{prefix}{int(offset):08d}"

    path = os.path.join(outdir, str(class_val))
    os.makedirs(path, exist_ok=True)

    image_path = os.path.join(path, image_name)
    image_path = os.path.join(path, f"{image_name}.JPEG")
    img.save(image_path)


def generate(image_size, n, outdir, start=0):
def generate(image_size, n, outdir, prefix="", start=0):
    work_items = []
    for i in range(n):
        work_items.append(
            [
                start + i,
                outdir,
                prefix,
                image_size,
            ]
        )
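
To make the naming rule concrete, here is a tiny standalone helper, written for this note rather than taken from benchmate, that reproduces write()'s file naming for both branches:

def fake_image_name(offset: int, prefix: str) -> str:
    # Train images are named "<class>_<offset>"; val/test use the ImageNet-style
    # "<prefix><offset padded to 8 digits>". Both get a .JPEG suffix, as above.
    class_val = offset % 1000
    name = f"{class_val}_{offset}" if not prefix else f"{prefix}{offset:08d}"
    return f"{name}.JPEG"


assert fake_image_name(4007, "") == "7_4007.JPEG"
assert fake_image_name(4007, "ILSVRC2012_val_") == "ILSVRC2012_val_00004007.JPEG"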
@@ -67,12 +73,20 @@ def generate_sets(root, sets, shape):
    for split, count in sets.items():
        current_count = total_images.get(split, 0)

        # Some benches need filenames to match those of imagenet:
        # https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/data/datasets/image_net.py#L40-L43
        if split == "train":
            prefix = ""
        else:  # split in (val, test):
            prefix = f"ILSVRC2012_{split}_"

        if current_count < count:
            print(f"Generating {split} (current {current_count}) (target: {count})")
            generate(
                shape,
                count - current_count,
                os.path.join(root, split),
                prefix=prefix,
                start=current_count,
            )

@@ -123,6 +137,12 @@ def generate_fakeimagenet(args=None):
    }

    generate_sets(dest, size_spec, args.image_size)

    labels = set([int(entry.name) for entry in Path(dest).glob("*/*/")])
    with open(os.path.join(dest, "labels.txt"), "wt") as _f:
        # class_id,class_name
        _f.writelines([f"{l},{l}\n" for l in sorted(labels)])

    print("Done!")


2 changes: 1 addition & 1 deletion config/base.yaml
@@ -488,7 +488,7 @@ dinov2-giant-nodes:
  argv:
    --config-file: "{benchmark_folder}/src/dinov2/configs/train/vitg14.yaml"
    # THOSE NEED TO BE LAST
    train.dataset_path=ImageFolder:root={milabench_data}/FakeImageNet: true
    train.dataset_path=ImageNet:split=TRAIN:root={milabench_data}/FakeImageNet:extra={milabench_data}/FakeImageNet: true
    train.batch_size_per_gpu=12: true
    train.saveckp_freq=100: true
    train.num_workers=10: true
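
The new dataset_path value is a dinov2-style dataset string: the token before the first ':' names the dataset class and the remaining key=value tokens become constructor arguments, which is what _parse_dataset_str consumes in prepare.py above. A rough sketch of that decoding, under the assumption that the real parser in dinov2.data.loaders additionally maps the split string onto the dataset's Split enum; the paths are illustrative:

def parse_dataset_str_sketch(dataset_str: str):
    # "ImageNet:split=TRAIN:root=/data/FakeImageNet:extra=/data/FakeImageNet"
    # -> ("ImageNet", {"split": "TRAIN", "root": "/data/FakeImageNet", "extra": "/data/FakeImageNet"})
    name, *tokens = dataset_str.split(":")
    kwargs = dict(token.split("=", 1) for token in tokens)
    return name, kwargs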
5 changes: 3 additions & 2 deletions milabench/remote.py
@@ -67,11 +67,12 @@ def rsync(node, src=None, remote_src=None, dest=None, force=False) -> list:
    return [
        "rsync",
        *(["--force"] if force else []),
        "-av",
        "-aHv",
        "--del",
        "-e",
        f"ssh {key} -oCheckHostIP=no -oStrictHostKeyChecking=no",
        "--include=*/.git/*",
        *[f"--exclude=*/{_dir}/*" for _dir in (".*", "tmp") ],
        *[f"--exclude=*/{_dir}/*" for _dir in (".*", "tmp")],
        *src, *remote_src,
        dest,
    ]
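
The switch from -av to -aHv adds rsync's --hard-links behaviour: without it, rsync recreates every hard link on the receiving side as an independent full copy, presumably defeating the hard-linked FakeImageNet padding created by prepare.py when the tree is synced to other nodes. A hedged illustration of the kind of argument list rsync() now returns; the host, key file and paths are made up:

example_cmd = [
    "rsync",
    "-aHv",  # archive mode, preserve hard links, verbose
    "--del",
    "-e", "ssh -i /home/user/.ssh/id_milabench -oCheckHostIP=no -oStrictHostKeyChecking=no",
    "--include=*/.git/*",
    "--exclude=*/.*/*",
    "--exclude=*/tmp/*",
    "/home/user/milabench/",        # src (illustrative)
    "node1:/home/user/milabench/",  # dest (illustrative)
]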
