From 88ab14871c14888d1495007573f0f270cb04f333 Mon Sep 17 00:00:00 2001
From: wildsnowman
Date: Wed, 7 Sep 2022 11:02:52 +0900
Subject: [PATCH 1/2] use multiple split settings when creating the data

---
 benchmarks/set_matching_pytorch/train_sm.py | 25 ++++----------
 benchmarks/set_matching_pytorch/train_we.py | 19 +++--------
 shift15m/datasets/outfitfeature.py          | 38 ++++++---------------
 3 files changed, 21 insertions(+), 61 deletions(-)

diff --git a/benchmarks/set_matching_pytorch/train_sm.py b/benchmarks/set_matching_pytorch/train_sm.py
index 8257c20..7575bd3 100644
--- a/benchmarks/set_matching_pytorch/train_sm.py
+++ b/benchmarks/set_matching_pytorch/train_sm.py
@@ -29,7 +29,7 @@ def get_train_val_loader(
 ) -> Tuple[Any, Any]:
     label_dir_name = f"{train_year}-{valid_year}-split{split}"
 
-    iqon_outfits = IQONOutfits(root=root, split=split)
+    iqon_outfits = IQONOutfits(root=root)
 
     train, valid = iqon_outfits.get_trainval_data(label_dir_name)
     feature_dir = iqon_outfits.feature_dir
@@ -118,18 +118,14 @@ def eval_process(engine, batch):
 
     # early stopping
     handler = EarlyStopping(
-        patience=5,
-        score_function=exfn.stopping_score_function,
-        trainer=trainer,
+        patience=5, score_function=exfn.stopping_score_function, trainer=trainer,
     )
     valid_evaluator.add_event_handler(Events.COMPLETED, handler)
 
     # lr scheduler
     lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=4, gamma=0.7)
     trainer.add_event_handler(
-        Events.EPOCH_COMPLETED,
-        exfn.lr_step,
-        lr_scheduler,
+        Events.EPOCH_COMPLETED, exfn.lr_step, lr_scheduler,
     )
 
     # logging
@@ -170,16 +166,11 @@ def eval_process(engine, batch):
         save_handler=DiskSaver(args.log_dir, require_empty=False),
     )
     trainer.add_event_handler(
-        Events.EPOCH_COMPLETED(every=args.checkpoint_interval),
-        trainer_checkpointer,
+        Events.EPOCH_COMPLETED(every=args.checkpoint_interval), trainer_checkpointer,
     )
 
     model_checkpointer = ModelCheckpoint(
-        args.log_dir,
-        "modelckpt",
-        n_saved=1,
-        create_dir=True,
-        require_empty=False,
+        args.log_dir, "modelckpt", n_saved=1, create_dir=True, require_empty=False,
     )
     trainer.add_event_handler(
         Events.EPOCH_COMPLETED(every=args.checkpoint_interval),
@@ -206,11 +197,7 @@ def eval_process(engine, batch):
     parser.add_argument(
         "--model",
         "-m",
-        choices=[
-            "set_matching_sim",
-            "cov_mean",
-            "cov_max",
-        ],
+        choices=["set_matching_sim", "cov_mean", "cov_max",],
         default="cov_max",
     )
     parser.add_argument("--batchsize", "-b", type=int, default=32)
diff --git a/benchmarks/set_matching_pytorch/train_we.py b/benchmarks/set_matching_pytorch/train_we.py
index 1af9ac8..177b770 100644
--- a/benchmarks/set_matching_pytorch/train_we.py
+++ b/benchmarks/set_matching_pytorch/train_we.py
@@ -33,7 +33,7 @@ def get_train_val_loader(
 ) -> Tuple[Any, Any]:
     label_dir_name = f"{train_year}-{valid_year}-split{split}"
 
-    iqon_outfits = IQONOutfits(root=root, split=split)
+    iqon_outfits = IQONOutfits(root=root)
 
     train, valid = iqon_outfits.get_trainval_data(label_dir_name)
     feature_dir = iqon_outfits.feature_dir
@@ -125,18 +125,14 @@ def eval_process(engine, batch):
 
     # early stopping
     handler = EarlyStopping(
-        patience=5,
-        score_function=exfn.stopping_score_function,
-        trainer=trainer,
+        patience=5, score_function=exfn.stopping_score_function, trainer=trainer,
     )
     valid_evaluator.add_event_handler(Events.COMPLETED, handler)
 
     # lr scheduler
     lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=4, gamma=0.7)
     trainer.add_event_handler(
-        Events.EPOCH_COMPLETED,
-        exfn.lr_step,
-        lr_scheduler,
+        Events.EPOCH_COMPLETED, exfn.lr_step, lr_scheduler,
     )
 
     # logging
@@ -177,16 +173,11 @@ def eval_process(engine, batch):
         save_handler=DiskSaver(args.log_dir, require_empty=False),
     )
     trainer.add_event_handler(
-        Events.EPOCH_COMPLETED(every=args.checkpoint_interval),
-        trainer_checkpointer,
+        Events.EPOCH_COMPLETED(every=args.checkpoint_interval), trainer_checkpointer,
    )
 
     model_checkpointer = ModelCheckpoint(
-        args.log_dir,
-        "modelckpt",
-        n_saved=1,
-        create_dir=True,
-        require_empty=False,
+        args.log_dir, "modelckpt", n_saved=1, create_dir=True, require_empty=False,
    )
     trainer.add_event_handler(
         Events.EPOCH_COMPLETED(every=args.checkpoint_interval),
diff --git a/shift15m/datasets/outfitfeature.py b/shift15m/datasets/outfitfeature.py
index 7bccc21..4a82063 100644
--- a/shift15m/datasets/outfitfeature.py
+++ b/shift15m/datasets/outfitfeature.py
@@ -114,14 +114,10 @@ def __init__(
         self.root = root
         self.n_cand_sets = n_cand_sets
         self.transform_q = FeatureListTransform(
-            max_set_size=max_set_size_query,
-            apply_shuffle=False,
-            apply_padding=True,
+            max_set_size=max_set_size_query, apply_shuffle=False, apply_padding=True,
         )
         self.transform_a = FeatureListTransform(
-            max_set_size=max_set_size_answer,
-            apply_shuffle=False,
-            apply_padding=True,
+            max_set_size=max_set_size_answer, apply_shuffle=False, apply_padding=True,
         )
 
     def __len__(self):
@@ -178,11 +174,7 @@ def _read_feature(self, path):
 
 
 class IQONOutfits:
-    def __init__(
-        self,
-        root: str = C.ROOT,
-        split: int = 0,
-    ) -> None:
+    def __init__(self, root: str = C.ROOT, split: int = 0,) -> None:  # not used
         self.root = pathlib.Path(root)
         self.root.mkdir(parents=True, exist_ok=True)
         if not (self.root / "iqon_outfits.json").exists():
@@ -191,8 +183,11 @@ def __init__(
 
         self._label_dir = self.root / "set_matching/labels"
         if not self._label_dir.exists():
+            print("Making train/val dataset.")
             self._label_dir.mkdir(parents=True, exist_ok=True)
-            self._make_trainval_dataset(seed=split)
+            splits = [0, 1, 2]
+            for _s in splits:
+                self._make_trainval_dataset(seed=_s)
 
         self._feature_dir = self.root / "features"
         if not self._feature_dir.exists():
@@ -226,13 +221,8 @@ def _download_outfit_label(self):
         res.check_returncode()
 
     def _make_trainval_dataset(
-        self,
-        min_num_categories: int = 4,
-        min_like_num: int = 50,
-        seed: int = 0,
+        self, min_num_categories: int = 4, min_like_num: int = 50, seed: int = 0,
     ):
-        print("Make train/val dataset.")
-
         np.random.seed(seed)
         num_train, num_val, num_test = 30816, 3851, 3851  # max size
 
@@ -285,11 +275,7 @@ def get_test_data(self, label_dir_name: str) -> List[Dict]:
         return test
 
     def get_fitb_data(
-        self,
-        label_dir_name: str,
-        n_comb: int = 1,
-        n_cands: int = 8,
-        seed: int = 0,
+        self, label_dir_name: str, n_comb: int = 1, n_cands: int = 8, seed: int = 0,
     ) -> List:
         dir_name = self._label_dir / label_dir_name
         path = dir_name / f"test_examples_ncomb_{n_comb}_ncands_{n_cands}.json"
@@ -299,11 +285,7 @@ def get_fitb_data(
         return test_examples
 
     def _make_test_examples(
-        self,
-        path: pathlib.Path,
-        n_comb: int = 1,
-        n_cands: int = 8,
-        seed: int = 0,
+        self, path: pathlib.Path, n_comb: int = 1, n_cands: int = 8, seed: int = 0,
     ):
         print("Make test dataset.")
         np.random.seed(seed)

From d632dfd6c31cde124e8a8eec8b332a9b9ead220b Mon Sep 17 00:00:00 2001
From: wildsnowman
Date: Wed, 7 Sep 2022 11:10:18 +0900
Subject: [PATCH 2/2] black

---
 benchmarks/set_matching_pytorch/train_sm.py | 23 +++++++++++----
 benchmarks/set_matching_pytorch/train_we.py | 17 ++++++---
 shift15m/datasets/outfitfeature.py          | 31 +++++++++++++++++----
 3 files changed, 56 insertions(+), 15 deletions(-)
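Note: this second patch is formatting only; it re-runs black over the files touched by PATCH 1/2 and changes no behaviour. The churn is black's "magic trailing comma" at work: PATCH 1/2 collapsed the argument lists onto single lines but kept their trailing commas, and black's default mode expands any call that ends in a trailing comma back to one argument per line. The sketch below illustrates that mechanism; it is not code from this repository and assumes only that the black package is installed.

import black

# A call collapsed onto one line but still ending in a trailing comma,
# as produced by PATCH 1/2 (the names here are illustrative).
collapsed = (
    "handler = EarlyStopping(\n"
    "    patience=5, score_function=score_fn, trainer=trainer,\n"
    ")\n"
)

# black.Mode() enables the magic trailing comma by default, so format_str
# expands the call to one argument per line, reproducing PATCH 2/2.
print(black.format_str(collapsed, mode=black.Mode()))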
diff --git a/benchmarks/set_matching_pytorch/train_sm.py b/benchmarks/set_matching_pytorch/train_sm.py
index 7575bd3..8613534 100644
--- a/benchmarks/set_matching_pytorch/train_sm.py
+++ b/benchmarks/set_matching_pytorch/train_sm.py
@@ -118,14 +118,18 @@ def eval_process(engine, batch):
 
     # early stopping
     handler = EarlyStopping(
-        patience=5, score_function=exfn.stopping_score_function, trainer=trainer,
+        patience=5,
+        score_function=exfn.stopping_score_function,
+        trainer=trainer,
     )
     valid_evaluator.add_event_handler(Events.COMPLETED, handler)
 
     # lr scheduler
     lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=4, gamma=0.7)
     trainer.add_event_handler(
-        Events.EPOCH_COMPLETED, exfn.lr_step, lr_scheduler,
+        Events.EPOCH_COMPLETED,
+        exfn.lr_step,
+        lr_scheduler,
     )
 
     # logging
@@ -166,11 +170,16 @@ def eval_process(engine, batch):
         save_handler=DiskSaver(args.log_dir, require_empty=False),
     )
     trainer.add_event_handler(
-        Events.EPOCH_COMPLETED(every=args.checkpoint_interval), trainer_checkpointer,
+        Events.EPOCH_COMPLETED(every=args.checkpoint_interval),
+        trainer_checkpointer,
     )
 
     model_checkpointer = ModelCheckpoint(
-        args.log_dir, "modelckpt", n_saved=1, create_dir=True, require_empty=False,
+        args.log_dir,
+        "modelckpt",
+        n_saved=1,
+        create_dir=True,
+        require_empty=False,
     )
     trainer.add_event_handler(
         Events.EPOCH_COMPLETED(every=args.checkpoint_interval),
@@ -197,7 +206,11 @@ def eval_process(engine, batch):
     parser.add_argument(
         "--model",
         "-m",
-        choices=["set_matching_sim", "cov_mean", "cov_max",],
+        choices=[
+            "set_matching_sim",
+            "cov_mean",
+            "cov_max",
+        ],
         default="cov_max",
     )
     parser.add_argument("--batchsize", "-b", type=int, default=32)
diff --git a/benchmarks/set_matching_pytorch/train_we.py b/benchmarks/set_matching_pytorch/train_we.py
index 177b770..7cbf06f 100644
--- a/benchmarks/set_matching_pytorch/train_we.py
+++ b/benchmarks/set_matching_pytorch/train_we.py
@@ -125,14 +125,18 @@ def eval_process(engine, batch):
 
     # early stopping
     handler = EarlyStopping(
-        patience=5, score_function=exfn.stopping_score_function, trainer=trainer,
+        patience=5,
+        score_function=exfn.stopping_score_function,
+        trainer=trainer,
     )
     valid_evaluator.add_event_handler(Events.COMPLETED, handler)
 
     # lr scheduler
     lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=4, gamma=0.7)
     trainer.add_event_handler(
-        Events.EPOCH_COMPLETED, exfn.lr_step, lr_scheduler,
+        Events.EPOCH_COMPLETED,
+        exfn.lr_step,
+        lr_scheduler,
     )
 
     # logging
@@ -173,11 +177,16 @@ def eval_process(engine, batch):
         save_handler=DiskSaver(args.log_dir, require_empty=False),
     )
     trainer.add_event_handler(
-        Events.EPOCH_COMPLETED(every=args.checkpoint_interval), trainer_checkpointer,
+        Events.EPOCH_COMPLETED(every=args.checkpoint_interval),
+        trainer_checkpointer,
     )
 
     model_checkpointer = ModelCheckpoint(
-        args.log_dir, "modelckpt", n_saved=1, create_dir=True, require_empty=False,
+        args.log_dir,
+        "modelckpt",
+        n_saved=1,
+        create_dir=True,
+        require_empty=False,
     )
     trainer.add_event_handler(
         Events.EPOCH_COMPLETED(every=args.checkpoint_interval),
diff --git a/shift15m/datasets/outfitfeature.py b/shift15m/datasets/outfitfeature.py
index 4a82063..af3ab5d 100644
--- a/shift15m/datasets/outfitfeature.py
+++ b/shift15m/datasets/outfitfeature.py
@@ -114,10 +114,14 @@ def __init__(
         self.root = root
         self.n_cand_sets = n_cand_sets
         self.transform_q = FeatureListTransform(
-            max_set_size=max_set_size_query, apply_shuffle=False, apply_padding=True,
+            max_set_size=max_set_size_query,
+            apply_shuffle=False,
+            apply_padding=True,
         )
         self.transform_a = FeatureListTransform(
-            max_set_size=max_set_size_answer, apply_shuffle=False, apply_padding=True,
+            max_set_size=max_set_size_answer,
+            apply_shuffle=False,
+            apply_padding=True,
         )
 
     def __len__(self):
@@ -174,7 +178,11 @@ def _read_feature(self, path):
 
 
 class IQONOutfits:
-    def __init__(self, root: str = C.ROOT, split: int = 0,) -> None:  # not used
+    def __init__(
+        self,
+        root: str = C.ROOT,
+        split: int = 0,
+    ) -> None:  # not used
         self.root = pathlib.Path(root)
         self.root.mkdir(parents=True, exist_ok=True)
         if not (self.root / "iqon_outfits.json").exists():
@@ -221,7 +229,10 @@ def _download_outfit_label(self):
         res.check_returncode()
 
     def _make_trainval_dataset(
-        self, min_num_categories: int = 4, min_like_num: int = 50, seed: int = 0,
+        self,
+        min_num_categories: int = 4,
+        min_like_num: int = 50,
+        seed: int = 0,
     ):
         np.random.seed(seed)
         num_train, num_val, num_test = 30816, 3851, 3851  # max size
@@ -275,7 +286,11 @@ def get_test_data(self, label_dir_name: str) -> List[Dict]:
         return test
 
     def get_fitb_data(
-        self, label_dir_name: str, n_comb: int = 1, n_cands: int = 8, seed: int = 0,
+        self,
+        label_dir_name: str,
+        n_comb: int = 1,
+        n_cands: int = 8,
+        seed: int = 0,
     ) -> List:
         dir_name = self._label_dir / label_dir_name
         path = dir_name / f"test_examples_ncomb_{n_comb}_ncands_{n_cands}.json"
@@ -285,7 +300,11 @@ def get_fitb_data(
         return test_examples
 
     def _make_test_examples(
-        self, path: pathlib.Path, n_comb: int = 1, n_cands: int = 8, seed: int = 0,
+        self,
+        path: pathlib.Path,
+        n_comb: int = 1,
+        n_cands: int = 8,
+        seed: int = 0,
     ):
         print("Make test dataset.")
         np.random.seed(seed)
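Closing note: the combined effect of the two patches on dataset creation can be exercised as below. This is an illustrative sketch, not code from the repository; it assumes the shift15m package is installed and its data downloads succeed, and the year values are placeholders for the {train_year}-{valid_year} pair a training script would pass.

from shift15m.datasets.outfitfeature import IQONOutfits

# The first construction now builds label directories for every seed in
# [0, 1, 2] via _make_trainval_dataset, so one pass prepares all splits;
# the `split` constructor argument remains but is no longer used.
iqon_outfits = IQONOutfits()

# As in train_sm.py and train_we.py, a split is then selected purely
# through the label directory name.
for split in (0, 1, 2):
    label_dir_name = f"2013-2014-split{split}"  # placeholder years
    train, valid = iqon_outfits.get_trainval_data(label_dir_name)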