From 779f4ea2732a0ef911bf60de6dcff4666f5c95c3 Mon Sep 17 00:00:00 2001
From: ShawnXuan <xiexuanx2@gmail.com>
Date: Tue, 28 May 2024 22:29:34 +0800
Subject: [PATCH 1/9] cuda -> device

---
 .../classification/image/resnet50/check/check.py  | 14 ++++++++------
 Vision/classification/image/resnet50/config.py    |  1 +
 Vision/classification/image/resnet50/graph.py     | 10 ++++++----
 Vision/classification/image/resnet50/infer.py     |  4 ++--
 .../classification/image/resnet50/models/data.py  | 15 ++++++++++-----
 Vision/classification/image/resnet50/train.py     | 13 +++++++------
 6 files changed, 34 insertions(+), 23 deletions(-)

diff --git a/Vision/classification/image/resnet50/check/check.py b/Vision/classification/image/resnet50/check/check.py
index 2708f0c0a..a822f6e9b 100644
--- a/Vision/classification/image/resnet50/check/check.py
+++ b/Vision/classification/image/resnet50/check/check.py
@@ -14,6 +14,7 @@
 
 def _parse_args():
     parser = argparse.ArgumentParser("flags for train resnet50")
+    parser.add_argument("--device", type=str, default="cuda", help="device: cpu, cuda...")
     parser.add_argument(
         "--save_checkpoint_path",
         type=str,
@@ -68,8 +69,8 @@ def setup(args):
     graph_model = resnet50()
     graph_model.load_state_dict(eager_model.state_dict())
 
-    eager_model.to("cuda")
-    graph_model.to("cuda")
+    eager_model.to(args.device)
+    graph_model.to(args.device)
     # optimizer setup
     eager_optimizer = flow.optim.SGD(
         eager_model.parameters(), lr=args.learning_rate, momentum=args.mom
@@ -80,7 +81,7 @@ def setup(args):
 
     # criterion setup
     criterion = flow.nn.CrossEntropyLoss()
-    criterion = criterion.to("cuda")
+    criterion = criterion.to(args.device)
 
     class ModelTrainGraph(flow.nn.Graph):
         def __init__(self):
@@ -145,6 +146,7 @@ def __init__(self, args):
         self.graph_eval_total_time = 0.0
         self.eager_val_total_time = 0.0
 
+        self.device = args.device
         self.args = args
 
     def compare_eager_graph(self, compare_dic):
@@ -167,8 +169,8 @@ def compare_eager_graph(self, compare_dic):
 
             for b in range(len(train_data_loader)):
                 image, label = train_data_loader()
-                image = image.to("cuda")
-                label = label.to("cuda")
+                image = image.to(self.device)
+                label = label.to(self.device)
 
                 # oneflow graph train
                 graph_iter_start_time = time.time()
@@ -224,7 +226,7 @@ def compare_eager_graph(self, compare_dic):
             total_graph_infer_time, total_eager_infer_time = 0, 0
             for b in tqdm(range(len(val_data_loader))):
                 image, label = val_data_loader()
-                image = image.to("cuda")
+                image = image.to(self.device)
 
                 # graph val
                 graph_infer_time = time.time()
diff --git a/Vision/classification/image/resnet50/config.py b/Vision/classification/image/resnet50/config.py
index 63f3e25e2..129c8968c 100644
--- a/Vision/classification/image/resnet50/config.py
+++ b/Vision/classification/image/resnet50/config.py
@@ -26,6 +26,7 @@ def parse_args(ignore_unknown_args=False):
     parser = argparse.ArgumentParser(
         description="OneFlow ResNet50 Arguments", allow_abbrev=False
     )
+    parser.add_argument("--device", type=str, default="cuda", help="device: cpu, cuda...")
     parser.add_argument(
         "--save",
         type=str,
diff --git a/Vision/classification/image/resnet50/graph.py b/Vision/classification/image/resnet50/graph.py
index dcad741ba..58ab63689 100644
--- a/Vision/classification/image/resnet50/graph.py
+++ b/Vision/classification/image/resnet50/graph.py
@@ -51,11 +51,12 @@ def __init__(
         self.cross_entropy = cross_entropy
         self.data_loader = data_loader
         self.add_optimizer(optimizer, lr_sch=lr_scheduler)
+        self.device = args.device
 
     def build(self):
         image, label = self.data_loader()
-        image = image.to("cuda")
-        label = label.to("cuda")
+        image = image.to(self.device)
+        label = label.to(self.device)
         logits = self.model(image)
         loss = self.cross_entropy(logits, label)
         if self.return_pred_and_label:
@@ -79,11 +80,12 @@ def __init__(self, model, data_loader):
 
         self.data_loader = data_loader
         self.model = model
+        self.device = args.device
 
     def build(self):
         image, label = self.data_loader()
-        image = image.to("cuda")
-        label = label.to("cuda")
+        image = image.to(self.device)
+        label = label.to(self.device)
         logits = self.model(image)
         pred = logits.softmax()
         return pred, label
diff --git a/Vision/classification/image/resnet50/infer.py b/Vision/classification/image/resnet50/infer.py
index 85f19ed6a..8837ec39a 100644
--- a/Vision/classification/image/resnet50/infer.py
+++ b/Vision/classification/image/resnet50/infer.py
@@ -55,7 +55,7 @@ def main(args):
     print("***** Model Init *****")
     model = resnet50()
     model.load_state_dict(flow.load(args.model_path))
-    model = model.to("cuda")
+    model = model.to(args.device)
     model.eval()
     end_t = time.perf_counter()
     print(f"***** Model Init Finish, time escapled {end_t - start_t:.6f} s *****")
@@ -65,7 +65,7 @@ def main(args):
 
     start_t = end_t
     image = load_image(args.image_path)
-    image = flow.Tensor(image, device=flow.device("cuda"))
+    image = flow.Tensor(image, device=flow.device(args.device))
     if args.graph:
         pred = model_graph(image)
     else:
diff --git a/Vision/classification/image/resnet50/models/data.py b/Vision/classification/image/resnet50/models/data.py
index ee8da362f..ca7e40c23 100644
--- a/Vision/classification/image/resnet50/models/data.py
+++ b/Vision/classification/image/resnet50/models/data.py
@@ -31,8 +31,9 @@ def make_data_loader(args, mode, is_global=False, synthetic=False):
             placement=placement,
             sbp=sbp,
             channel_last=args.channel_last,
+            device=args.device,
         )
-        return data_loader.to("cuda")
+        return data_loader.to(args.device)
 
     ofrecord_data_loader = OFRecordDataLoader(
         ofrecord_dir=args.ofrecord_path,
@@ -45,6 +46,7 @@ def make_data_loader(args, mode, is_global=False, synthetic=False):
         placement=placement,
         sbp=sbp,
         use_gpu_decode=args.use_gpu_decode,
+        device=args.device,
     )
     return ofrecord_data_loader
 
@@ -62,6 +64,7 @@ def __init__(
         placement=None,
         sbp=None,
         use_gpu_decode=False,
+        device="cuda",
     ):
         super().__init__()
 
@@ -71,6 +74,7 @@ def __init__(
         self.total_batch_size = total_batch_size
         self.dataset_size = dataset_size
         self.mode = mode
+        self.device = device
 
         random_shuffle = True if mode == "train" else False
         shuffle_after_epoch = True if mode == "train" else False
@@ -159,11 +163,11 @@ def forward(self):
             else:
                 image_raw_bytes = self.image_decoder(record)
                 image = self.resize(image_raw_bytes)[0]
-                image = image.to("cuda")
+                image = image.to(self.device)
 
             label = self.label_decoder(record)
             flip_code = self.flip()
-            flip_code = flip_code.to("cuda")
+            flip_code = flip_code.to(self.device)
             image = self.crop_mirror_norm(image, flip_code)
         else:
             record = self.ofrecord_reader()
@@ -184,6 +188,7 @@ def __init__(
         placement=None,
         sbp=None,
         channel_last=False,
+        device="cuda",
     ):
         super().__init__()
 
@@ -220,10 +225,10 @@ def __init__(
             )
         else:
             self.image = flow.randint(
-                0, high=256, size=self.image_shape, dtype=flow.float32, device="cuda"
+                0, high=256, size=self.image_shape, dtype=flow.float32, device=device,
             )
             self.label = flow.randint(
-                0, high=self.num_classes, size=self.label_shape, device="cuda",
+                0, high=self.num_classes, size=self.label_shape, device=device,
             ).to(dtype=flow.int32)
 
     def forward(self):
diff --git a/Vision/classification/image/resnet50/train.py b/Vision/classification/image/resnet50/train.py
index c1ba49ba4..3ae575040 100644
--- a/Vision/classification/image/resnet50/train.py
+++ b/Vision/classification/image/resnet50/train.py
@@ -26,6 +26,7 @@
 class Trainer(object):
     def __init__(self):
         args = get_args()
+        self.device = args.device
         for k, v in args.__dict__.items():
             setattr(self, k, v)
 
@@ -89,12 +90,12 @@ def init_model(self):
         start_t = time.perf_counter()
 
         if self.is_global:
-            placement = flow.env.all_device_placement("cuda")
+            placement = flow.env.all_device_placement(self.device)
             self.model = self.model.to_global(
                 placement=placement, sbp=flow.sbp.broadcast
             )
         else:
-            self.model = self.model.to("cuda")
+            self.model = self.model.to(self.device)
 
         if self.load_path is None:
             self.legacy_init_parameters()
@@ -311,8 +312,8 @@ def eval(self):
 
     def forward(self):
         image, label = self.train_data_loader()
-        image = image.to("cuda")
-        label = label.to("cuda")
+        image = image.to(self.device)
+        label = label.to(self.device)
         logits = self.model(image)
         loss = self.cross_entropy(logits, label)
         if self.metric_train_acc:
@@ -323,8 +324,8 @@ def forward(self):
 
     def inference(self):
         image, label = self.val_data_loader()
-        image = image.to("cuda")
-        label = label.to("cuda")
+        image = image.to(self.device)
+        label = label.to(self.device)
         with flow.no_grad():
             logits = self.model(image)
             pred = logits.softmax()

From 8571548b69c22b83551a392def8855f7d6e82505 Mon Sep 17 00:00:00 2001
From: ShawnXuan <xiexuanx2@gmail.com>
Date: Tue, 28 May 2024 22:34:49 +0800
Subject: [PATCH 2/9] recover

---
 .../classification/image/resnet50/check/check.py   | 14 ++++++--------
 1 file changed, 6 insertions(+), 8 deletions(-)

diff --git a/Vision/classification/image/resnet50/check/check.py b/Vision/classification/image/resnet50/check/check.py
index a822f6e9b..2708f0c0a 100644
--- a/Vision/classification/image/resnet50/check/check.py
+++ b/Vision/classification/image/resnet50/check/check.py
@@ -14,7 +14,6 @@
 
 def _parse_args():
     parser = argparse.ArgumentParser("flags for train resnet50")
-    parser.add_argument("--device", type=str, default="cuda", help="device: cpu, cuda...")
     parser.add_argument(
         "--save_checkpoint_path",
         type=str,
@@ -69,8 +68,8 @@ def setup(args):
     graph_model = resnet50()
     graph_model.load_state_dict(eager_model.state_dict())
 
-    eager_model.to(args.device)
-    graph_model.to(args.device)
+    eager_model.to("cuda")
+    graph_model.to("cuda")
     # optimizer setup
     eager_optimizer = flow.optim.SGD(
         eager_model.parameters(), lr=args.learning_rate, momentum=args.mom
@@ -81,7 +80,7 @@ def setup(args):
 
     # criterion setup
     criterion = flow.nn.CrossEntropyLoss()
-    criterion = criterion.to(args.device)
+    criterion = criterion.to("cuda")
 
     class ModelTrainGraph(flow.nn.Graph):
         def __init__(self):
@@ -146,7 +145,6 @@ def __init__(self, args):
         self.graph_eval_total_time = 0.0
         self.eager_val_total_time = 0.0
 
-        self.device = args.device
         self.args = args
 
     def compare_eager_graph(self, compare_dic):
@@ -169,8 +167,8 @@ def compare_eager_graph(self, compare_dic):
 
             for b in range(len(train_data_loader)):
                 image, label = train_data_loader()
-                image = image.to(self.device)
-                label = label.to(self.device)
+                image = image.to("cuda")
+                label = label.to("cuda")
 
                 # oneflow graph train
                 graph_iter_start_time = time.time()
@@ -226,7 +224,7 @@ def compare_eager_graph(self, compare_dic):
             total_graph_infer_time, total_eager_infer_time = 0, 0
             for b in tqdm(range(len(val_data_loader))):
                 image, label = val_data_loader()
-                image = image.to(self.device)
+                image = image.to("cuda")
 
                 # graph val
                 graph_infer_time = time.time()

From caa6d4ba09b6d3a4758d022bc5bc8837af4ad210 Mon Sep 17 00:00:00 2001
From: ShawnXuan <xiexuanx2@gmail.com>
Date: Wed, 29 May 2024 04:17:54 +0000
Subject: [PATCH 3/9] update

---
 Vision/classification/image/resnet50/models/data.py | 4 ++--
 Vision/classification/image/resnet50/train.py       | 1 +
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/Vision/classification/image/resnet50/models/data.py b/Vision/classification/image/resnet50/models/data.py
index ca7e40c23..c5b3e6958 100644
--- a/Vision/classification/image/resnet50/models/data.py
+++ b/Vision/classification/image/resnet50/models/data.py
@@ -163,11 +163,11 @@ def forward(self):
             else:
                 image_raw_bytes = self.image_decoder(record)
                 image = self.resize(image_raw_bytes)[0]
-                image = image.to(self.device)
 
             label = self.label_decoder(record)
             flip_code = self.flip()
-            flip_code = flip_code.to(self.device)
+            if self.use_gpu_decode:
+                flip_code = flip_code.to(self.device)
             image = self.crop_mirror_norm(image, flip_code)
         else:
             record = self.ofrecord_reader()
diff --git a/Vision/classification/image/resnet50/train.py b/Vision/classification/image/resnet50/train.py
index 3ae575040..2e2fded8f 100644
--- a/Vision/classification/image/resnet50/train.py
+++ b/Vision/classification/image/resnet50/train.py
@@ -9,6 +9,7 @@
 import time
 
 import oneflow as flow
+import oneflow_npu
 from oneflow.nn.parallel import DistributedDataParallel as ddp
 
 from config import get_args

From ac2ecb7cd37659b28e90546fe29bb8d13054f2b9 Mon Sep 17 00:00:00 2001
From: ShawnXuan <xiexuanx2@gmail.com>
Date: Wed, 29 May 2024 04:19:38 +0000
Subject: [PATCH 4/9] update

---
 Vision/classification/image/resnet50/examples/train_eager.sh | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/Vision/classification/image/resnet50/examples/train_eager.sh b/Vision/classification/image/resnet50/examples/train_eager.sh
index 46461d208..e75b485a0 100644
--- a/Vision/classification/image/resnet50/examples/train_eager.sh
+++ b/Vision/classification/image/resnet50/examples/train_eager.sh
@@ -26,6 +26,8 @@ VAL_BATCH_SIZE=50
 SRC_DIR=$(realpath $(dirname $0)/..)
 
 python3 $SRC_DIR/train.py \
+    --device npu \
+    --label-smoothing 0 \
     --ofrecord-path $OFRECORD_PATH \
     --ofrecord-part-num $OFRECORD_PART_NUM \
     --num-devices-per-node 1 \

From b1908dda7b9669b216e007e739d8f75944d10bc0 Mon Sep 17 00:00:00 2001
From: ShawnXuan <xiexuanx2@gmail.com>
Date: Sun, 2 Jun 2024 12:41:47 +0000
Subject: [PATCH 5/9] eager fp32

---
 .../examples/train_eager_distributed_fp32.sh  | 54 +++++++++++++++++++
 .../image/resnet50/models/data.py             |  1 +
 Vision/classification/image/resnet50/train.py |  2 +-
 3 files changed, 56 insertions(+), 1 deletion(-)
 create mode 100644 Vision/classification/image/resnet50/examples/train_eager_distributed_fp32.sh

diff --git a/Vision/classification/image/resnet50/examples/train_eager_distributed_fp32.sh b/Vision/classification/image/resnet50/examples/train_eager_distributed_fp32.sh
new file mode 100644
index 000000000..cfbd8c09c
--- /dev/null
+++ b/Vision/classification/image/resnet50/examples/train_eager_distributed_fp32.sh
@@ -0,0 +1,54 @@
+# set -aux
+
+DEVICE_NUM_PER_NODE=8
+MASTER_ADDR=127.0.0.1
+NUM_NODES=1
+NODE_RANK=0
+
+export PYTHONUNBUFFERED=1
+echo PYTHONUNBUFFERED=$PYTHONUNBUFFERED
+export NCCL_LAUNCH_MODE=PARALLEL
+echo NCCL_LAUNCH_MODE=$NCCL_LAUNCH_MODE
+# export NCCL_DEBUG=INFO
+# export ONEFLOW_DEBUG_MODE=True
+
+CHECKPOINT_SAVE_PATH="./graph_distributed_fp32_checkpoints"
+if [ ! -d "$CHECKPOINT_SAVE_PATH" ]; then
+    mkdir $CHECKPOINT_SAVE_PATH
+fi
+
+#OFRECORD_PATH=PATH_TO_IMAGENET_OFRECORD
+OFRECORD_PATH="/data0/datasets/ImageNet/ofrecord"
+
+OFRECORD_PART_NUM=256
+LEARNING_RATE=0.768
+MOM=0.875
+EPOCH=50
+TRAIN_BATCH_SIZE=96
+VAL_BATCH_SIZE=50
+
+# SRC_DIR=/path/to/models/resnet50
+SRC_DIR=$(realpath $(dirname $0)/..)
+
+python3 -m oneflow.distributed.launch \
+    --nproc_per_node $DEVICE_NUM_PER_NODE \
+    --nnodes $NUM_NODES \
+    --node_rank $NODE_RANK \
+    --master_addr $MASTER_ADDR \
+    $SRC_DIR/train.py \
+        --device npu \
+        --label-smoothing 0 \
+	--print-interval 100 \
+        --save $CHECKPOINT_SAVE_PATH \
+        --ofrecord-path $OFRECORD_PATH \
+        --ofrecord-part-num $OFRECORD_PART_NUM \
+        --num-devices-per-node $DEVICE_NUM_PER_NODE \
+        --lr $LEARNING_RATE \
+        --momentum $MOM \
+        --num-epochs $EPOCH \
+        --train-batch-size $TRAIN_BATCH_SIZE \
+        --val-batch-size $VAL_BATCH_SIZE \
+        --scale-grad \
+        #--graph \
+        #--fuse-bn-relu \
+        #--fuse-bn-add-relu \
diff --git a/Vision/classification/image/resnet50/models/data.py b/Vision/classification/image/resnet50/models/data.py
index c5b3e6958..2f3cbefa9 100644
--- a/Vision/classification/image/resnet50/models/data.py
+++ b/Vision/classification/image/resnet50/models/data.py
@@ -167,6 +167,7 @@ def forward(self):
             label = self.label_decoder(record)
             flip_code = self.flip()
             if self.use_gpu_decode:
+                # TODO(NPU): the image will be downgraded to CPU
                 flip_code = flip_code.to(self.device)
             image = self.crop_mirror_norm(image, flip_code)
         else:
diff --git a/Vision/classification/image/resnet50/train.py b/Vision/classification/image/resnet50/train.py
index 2e2fded8f..43d54617d 100644
--- a/Vision/classification/image/resnet50/train.py
+++ b/Vision/classification/image/resnet50/train.py
@@ -278,7 +278,7 @@ def train_eager(self):
                     param.grad /= self.world_size
         else:
             loss.backward()
-            loss = loss / self.world_size
+            #loss = loss / self.world_size
 
         self.optimizer.step()
         self.optimizer.zero_grad()

From de581e1d2d78fe68f04e1043796007d1aae76426 Mon Sep 17 00:00:00 2001
From: ShawnXuan <xiexuanx2@gmail.com>
Date: Mon, 23 Sep 2024 10:48:06 +0000
Subject: [PATCH 6/9] npu OK

---
 .../image/resnet50/examples/train_graph.sh            | 11 +++++++----
 Vision/classification/image/resnet50/models/data.py   |  3 ++-
 .../classification/image/resnet50/models/optimizer.py |  3 ++-
 3 files changed, 11 insertions(+), 6 deletions(-)

diff --git a/Vision/classification/image/resnet50/examples/train_graph.sh b/Vision/classification/image/resnet50/examples/train_graph.sh
index 3e267e0bf..450e37189 100644
--- a/Vision/classification/image/resnet50/examples/train_graph.sh
+++ b/Vision/classification/image/resnet50/examples/train_graph.sh
@@ -8,7 +8,8 @@ if [ ! -d "$CHECKPOINT_SAVE_PATH" ]; then
     mkdir $CHECKPOINT_SAVE_PATH
 fi
 
-OFRECORD_PATH="./mini-imagenet/ofrecord"
+#OFRECORD_PATH="./mini-imagenet/ofrecord"
+OFRECORD_PATH="/data0/datasets/ImageNet/ofrecord/"
 
 if [ ! -d "$OFRECORD_PATH" ]; then
     wget https://oneflow-public.oss-cn-beijing.aliyuncs.com/online_document/dataset/imagenet/mini-imagenet.zip
@@ -36,8 +37,10 @@ python3 $SRC_DIR/train.py \
     --train-batch-size $TRAIN_BATCH_SIZE \
     --val-batch-size $VAL_BATCH_SIZE \
     --save $CHECKPOINT_SAVE_PATH \
-    --samples-per-epoch 50 \
-    --val-samples-per-epoch 50 \
-    --use-gpu-decode \
     --scale-grad \
     --graph \
+    --device npu
+    #--print-interval 1 \
+    #--use-gpu-decode \
+    #--samples-per-epoch 50 \
+    #--val-samples-per-epoch 50 \
diff --git a/Vision/classification/image/resnet50/models/data.py b/Vision/classification/image/resnet50/models/data.py
index 2f3cbefa9..5fce27f7b 100644
--- a/Vision/classification/image/resnet50/models/data.py
+++ b/Vision/classification/image/resnet50/models/data.py
@@ -46,7 +46,8 @@ def make_data_loader(args, mode, is_global=False, synthetic=False):
         placement=placement,
         sbp=sbp,
         use_gpu_decode=args.use_gpu_decode,
-        device=args.device,
+        device="cpu",
+        #device=args.device,
     )
     return ofrecord_data_loader
 
diff --git a/Vision/classification/image/resnet50/models/optimizer.py b/Vision/classification/image/resnet50/models/optimizer.py
index 13b172992..877d9d32a 100644
--- a/Vision/classification/image/resnet50/models/optimizer.py
+++ b/Vision/classification/image/resnet50/models/optimizer.py
@@ -83,5 +83,6 @@ def forward(self, input, label):
         # log_prob = input.softmax(dim=-1).log()
         # onehot_label = flow.F.cast(onehot_label, log_prob.dtype)
         # loss = flow.mul(log_prob * -1, onehot_label).sum(dim=-1).mean()
-        loss = flow._C.softmax_cross_entropy(input, onehot_label.to(dtype=input.dtype))
+        #loss = flow._C.softmax_cross_entropy(input, onehot_label.to(dtype=input.dtype))
+        loss = flow._C.cross_entropy(input, onehot_label.to(dtype=input.dtype), reduction='none')
         return loss.mean()

From affb3df0b80b4a21dd54f33c33057a3ecc2abcbe Mon Sep 17 00:00:00 2001
From: ShawnXuan <xiexuanx2@gmail.com>
Date: Tue, 24 Sep 2024 08:30:44 +0000
Subject: [PATCH 7/9] align with eager and cuda

---
 .../image/resnet50/examples/npu_eager.sh      | 46 ++++++++++++++++++
 .../image/resnet50/examples/npu_graph.sh      | 47 +++++++++++++++++++
 .../image/resnet50/models/optimizer.py        | 24 ++++++++++
 Vision/classification/image/resnet50/train.py |  9 +++-
 4 files changed, 125 insertions(+), 1 deletion(-)
 create mode 100755 Vision/classification/image/resnet50/examples/npu_eager.sh
 create mode 100755 Vision/classification/image/resnet50/examples/npu_graph.sh

diff --git a/Vision/classification/image/resnet50/examples/npu_eager.sh b/Vision/classification/image/resnet50/examples/npu_eager.sh
new file mode 100755
index 000000000..f7e078ee6
--- /dev/null
+++ b/Vision/classification/image/resnet50/examples/npu_eager.sh
@@ -0,0 +1,46 @@
+# Single-device eager-mode ResNet50 training with --device npu.
+
+export PYTHONUNBUFFERED=1
+echo PYTHONUNBUFFERED=$PYTHONUNBUFFERED
+
+CHECKPOINT_SAVE_PATH="./graph_checkpoints"
+mkdir -p "$CHECKPOINT_SAVE_PATH"
+
+# Preferred dataset location; mini-imagenet is fetched below when it is absent.
+OFRECORD_PATH="/data0/datasets/ImageNet/ofrecord/"
+
+if [ ! -d "$OFRECORD_PATH" ]; then
+    wget https://oneflow-public.oss-cn-beijing.aliyuncs.com/online_document/dataset/imagenet/mini-imagenet.zip
+    unzip mini-imagenet.zip
+    # Train on the data that was just downloaded instead of the missing path.
+    OFRECORD_PATH="./mini-imagenet/ofrecord"
+fi
+
+OFRECORD_PART_NUM=1
+LEARNING_RATE=0.256
+MOM=0.875
+EPOCH=90
+TRAIN_BATCH_SIZE=50
+VAL_BATCH_SIZE=50
+
+# SRC_DIR=/path/to/models/resnet50
+SRC_DIR=$(realpath "$(dirname "$0")/..")
+
+python3 "$SRC_DIR/train.py" \
+    --ofrecord-path "$OFRECORD_PATH" \
+    --ofrecord-part-num "$OFRECORD_PART_NUM" \
+    --num-devices-per-node 1 \
+    --lr "$LEARNING_RATE" \
+    --momentum "$MOM" \
+    --num-epochs "$EPOCH" \
+    --warmup-epochs 5 \
+    --train-batch-size "$TRAIN_BATCH_SIZE" \
+    --val-batch-size "$VAL_BATCH_SIZE" \
+    --save "$CHECKPOINT_SAVE_PATH" \
+    --scale-grad \
+    --print-interval 1 \
+    --load checkpoints/init \
+    --device npu
+    #--use-gpu-decode \
+    #--samples-per-epoch 50 \
+    #--val-samples-per-epoch 50 \
diff --git a/Vision/classification/image/resnet50/examples/npu_graph.sh b/Vision/classification/image/resnet50/examples/npu_graph.sh
new file mode 100755
index 000000000..58ffdfe2c
--- /dev/null
+++ b/Vision/classification/image/resnet50/examples/npu_graph.sh
@@ -0,0 +1,47 @@
+# Single-device graph-mode ResNet50 training with --device npu.
+
+export PYTHONUNBUFFERED=1
+echo PYTHONUNBUFFERED=$PYTHONUNBUFFERED
+
+CHECKPOINT_SAVE_PATH="./graph_checkpoints"
+mkdir -p "$CHECKPOINT_SAVE_PATH"
+
+# Preferred dataset location; mini-imagenet is fetched below when it is absent.
+OFRECORD_PATH="/data0/datasets/ImageNet/ofrecord/"
+
+if [ ! -d "$OFRECORD_PATH" ]; then
+    wget https://oneflow-public.oss-cn-beijing.aliyuncs.com/online_document/dataset/imagenet/mini-imagenet.zip
+    unzip mini-imagenet.zip
+    # Train on the data that was just downloaded instead of the missing path.
+    OFRECORD_PATH="./mini-imagenet/ofrecord"
+fi
+
+OFRECORD_PART_NUM=1
+LEARNING_RATE=0.256
+MOM=0.875
+EPOCH=90
+TRAIN_BATCH_SIZE=50
+VAL_BATCH_SIZE=50
+
+# SRC_DIR=/path/to/models/resnet50
+SRC_DIR=$(realpath "$(dirname "$0")/..")
+
+python3 "$SRC_DIR/train.py" \
+    --ofrecord-path "$OFRECORD_PATH" \
+    --ofrecord-part-num "$OFRECORD_PART_NUM" \
+    --num-devices-per-node 1 \
+    --lr "$LEARNING_RATE" \
+    --momentum "$MOM" \
+    --num-epochs "$EPOCH" \
+    --warmup-epochs 5 \
+    --train-batch-size "$TRAIN_BATCH_SIZE" \
+    --val-batch-size "$VAL_BATCH_SIZE" \
+    --save "$CHECKPOINT_SAVE_PATH" \
+    --scale-grad \
+    --print-interval 1 \
+    --load checkpoints/init \
+    --graph \
+    --device npu
+    #--use-gpu-decode \
+    #--samples-per-epoch 50 \
+    #--val-samples-per-epoch 50 \
diff --git a/Vision/classification/image/resnet50/models/optimizer.py b/Vision/classification/image/resnet50/models/optimizer.py
index 877d9d32a..df2a40706 100644
--- a/Vision/classification/image/resnet50/models/optimizer.py
+++ b/Vision/classification/image/resnet50/models/optimizer.py
@@ -86,3 +86,27 @@ def forward(self, input, label):
         #loss = flow._C.softmax_cross_entropy(input, onehot_label.to(dtype=input.dtype))
         loss = flow._C.cross_entropy(input, onehot_label.to(dtype=input.dtype), reduction='none')
         return loss.mean()
+
+class oldLabelSmoothLoss(flow.nn.Module):
+    """NLL Loss with label smoothing
+    """
+
+    # smooth_rate weights the mean-log-prob term in forward() and must be < 1.0;
+    # num_classes is accepted but is not read anywhere in this class.
+    def __init__(self, num_classes=-1, smooth_rate=0.0):
+        super().__init__()
+        assert smooth_rate < 1.0
+        self.smoothing = smooth_rate
+        self.confidence = 1.0 - smooth_rate
+
+    def forward(self, x: flow.Tensor, target: flow.Tensor) -> flow.Tensor:
+        # Log-probabilities over the last dimension of x.
+        logprobs = flow.log_softmax(x, dim=-1)
+        # TODO: fix gather bug when dim < 0
+        # FIXME: only support cls task now
+        nll_loss = -logprobs.gather(dim=1, index=target.unsqueeze(1))
+        nll_loss = nll_loss.squeeze(1)
+        smooth_loss = -logprobs.mean(dim=-1)
+        loss = self.confidence * nll_loss + self.smoothing * smooth_loss
+        return loss.mean()
+
diff --git a/Vision/classification/image/resnet50/train.py b/Vision/classification/image/resnet50/train.py
index 43d54617d..5d70ca424 100644
--- a/Vision/classification/image/resnet50/train.py
+++ b/Vision/classification/image/resnet50/train.py
@@ -58,7 +58,7 @@ def __init__(self):
         self.cross_entropy = make_cross_entropy(args)
 
         self.train_data_loader = make_data_loader(
-            args, "train", self.is_global, self.synthetic_data
+            args, "validation", self.is_global, self.synthetic_data
         )
         self.val_data_loader = make_data_loader(
             args, "validation", self.is_global, self.synthetic_data
@@ -249,6 +249,13 @@ def train_one_epoch(self):
             else:
                 loss, pred, label = self.train_eager()
 
+            print("loss")
+            print(loss)
+            print("pred")
+            print(pred)
+            print("label")
+            print(label)
+            exit()
             self.cur_iter += 1
 
             loss = tol(loss, self.metric_local)

From 8ba3cd856ac69708cbee7a0baeee967e6e495fde Mon Sep 17 00:00:00 2001
From: ShawnXuan <xiexuanx2@gmail.com>
Date: Tue, 24 Sep 2024 08:38:28 +0000
Subject: [PATCH 8/9] update

---
 .../image/resnet50/examples/train_eager.sh            |  2 --
 .../image/resnet50/examples/train_graph.sh            | 11 ++++-------
 2 files changed, 4 insertions(+), 9 deletions(-)

diff --git a/Vision/classification/image/resnet50/examples/train_eager.sh b/Vision/classification/image/resnet50/examples/train_eager.sh
index e75b485a0..46461d208 100644
--- a/Vision/classification/image/resnet50/examples/train_eager.sh
+++ b/Vision/classification/image/resnet50/examples/train_eager.sh
@@ -26,8 +26,6 @@ VAL_BATCH_SIZE=50
 SRC_DIR=$(realpath $(dirname $0)/..)
 
 python3 $SRC_DIR/train.py \
-    --device npu \
-    --label-smoothing 0 \
     --ofrecord-path $OFRECORD_PATH \
     --ofrecord-part-num $OFRECORD_PART_NUM \
     --num-devices-per-node 1 \
diff --git a/Vision/classification/image/resnet50/examples/train_graph.sh b/Vision/classification/image/resnet50/examples/train_graph.sh
index 450e37189..3e267e0bf 100644
--- a/Vision/classification/image/resnet50/examples/train_graph.sh
+++ b/Vision/classification/image/resnet50/examples/train_graph.sh
@@ -8,8 +8,7 @@ if [ ! -d "$CHECKPOINT_SAVE_PATH" ]; then
     mkdir $CHECKPOINT_SAVE_PATH
 fi
 
-#OFRECORD_PATH="./mini-imagenet/ofrecord"
-OFRECORD_PATH="/data0/datasets/ImageNet/ofrecord/"
+OFRECORD_PATH="./mini-imagenet/ofrecord"
 
 if [ ! -d "$OFRECORD_PATH" ]; then
     wget https://oneflow-public.oss-cn-beijing.aliyuncs.com/online_document/dataset/imagenet/mini-imagenet.zip
@@ -37,10 +36,8 @@ python3 $SRC_DIR/train.py \
     --train-batch-size $TRAIN_BATCH_SIZE \
     --val-batch-size $VAL_BATCH_SIZE \
     --save $CHECKPOINT_SAVE_PATH \
+    --samples-per-epoch 50 \
+    --val-samples-per-epoch 50 \
+    --use-gpu-decode \
     --scale-grad \
     --graph \
-    --device npu
-    #--print-interval 1 \
-    #--use-gpu-decode \
-    #--samples-per-epoch 50 \
-    #--val-samples-per-epoch 50 \

From 82620cc0c15a2dad6c88f7cde55b7f4c8fbcd40c Mon Sep 17 00:00:00 2001
From: ShawnXuan <xiexuanx2@gmail.com>
Date: Tue, 24 Sep 2024 08:39:30 +0000
Subject: [PATCH 9/9] update

---
 .../examples/train_eager_distributed_fp32.sh  | 54 -------------------
 1 file changed, 54 deletions(-)
 delete mode 100644 Vision/classification/image/resnet50/examples/train_eager_distributed_fp32.sh

diff --git a/Vision/classification/image/resnet50/examples/train_eager_distributed_fp32.sh b/Vision/classification/image/resnet50/examples/train_eager_distributed_fp32.sh
deleted file mode 100644
index cfbd8c09c..000000000
--- a/Vision/classification/image/resnet50/examples/train_eager_distributed_fp32.sh
+++ /dev/null
@@ -1,54 +0,0 @@
-# set -aux
-
-DEVICE_NUM_PER_NODE=8
-MASTER_ADDR=127.0.0.1
-NUM_NODES=1
-NODE_RANK=0
-
-export PYTHONUNBUFFERED=1
-echo PYTHONUNBUFFERED=$PYTHONUNBUFFERED
-export NCCL_LAUNCH_MODE=PARALLEL
-echo NCCL_LAUNCH_MODE=$NCCL_LAUNCH_MODE
-# export NCCL_DEBUG=INFO
-# export ONEFLOW_DEBUG_MODE=True
-
-CHECKPOINT_SAVE_PATH="./graph_distributed_fp32_checkpoints"
-if [ ! -d "$CHECKPOINT_SAVE_PATH" ]; then
-    mkdir $CHECKPOINT_SAVE_PATH
-fi
-
-#OFRECORD_PATH=PATH_TO_IMAGENET_OFRECORD
-OFRECORD_PATH="/data0/datasets/ImageNet/ofrecord"
-
-OFRECORD_PART_NUM=256
-LEARNING_RATE=0.768
-MOM=0.875
-EPOCH=50
-TRAIN_BATCH_SIZE=96
-VAL_BATCH_SIZE=50
-
-# SRC_DIR=/path/to/models/resnet50
-SRC_DIR=$(realpath $(dirname $0)/..)
-
-python3 -m oneflow.distributed.launch \
-    --nproc_per_node $DEVICE_NUM_PER_NODE \
-    --nnodes $NUM_NODES \
-    --node_rank $NODE_RANK \
-    --master_addr $MASTER_ADDR \
-    $SRC_DIR/train.py \
-        --device npu \
-        --label-smoothing 0 \
-	--print-interval 100 \
-        --save $CHECKPOINT_SAVE_PATH \
-        --ofrecord-path $OFRECORD_PATH \
-        --ofrecord-part-num $OFRECORD_PART_NUM \
-        --num-devices-per-node $DEVICE_NUM_PER_NODE \
-        --lr $LEARNING_RATE \
-        --momentum $MOM \
-        --num-epochs $EPOCH \
-        --train-batch-size $TRAIN_BATCH_SIZE \
-        --val-batch-size $VAL_BATCH_SIZE \
-        --scale-grad \
-        #--graph \
-        #--fuse-bn-relu \
-        #--fuse-bn-add-relu \