From 779f4ea2732a0ef911bf60de6dcff4666f5c95c3 Mon Sep 17 00:00:00 2001 From: ShawnXuan Date: Tue, 28 May 2024 22:29:34 +0800 Subject: [PATCH 1/9] cuda -> device --- .../classification/image/resnet50/check/check.py | 14 ++++++++------ Vision/classification/image/resnet50/config.py | 1 + Vision/classification/image/resnet50/graph.py | 10 ++++++---- Vision/classification/image/resnet50/infer.py | 4 ++-- .../classification/image/resnet50/models/data.py | 15 ++++++++++----- Vision/classification/image/resnet50/train.py | 13 +++++++------ 6 files changed, 34 insertions(+), 23 deletions(-) diff --git a/Vision/classification/image/resnet50/check/check.py b/Vision/classification/image/resnet50/check/check.py index 2708f0c0a..a822f6e9b 100644 --- a/Vision/classification/image/resnet50/check/check.py +++ b/Vision/classification/image/resnet50/check/check.py @@ -14,6 +14,7 @@ def _parse_args(): parser = argparse.ArgumentParser("flags for train resnet50") + parser.add_argument("--device", type=str, default="cuda", help="device: cpu, cuda...") parser.add_argument( "--save_checkpoint_path", type=str, @@ -68,8 +69,8 @@ def setup(args): graph_model = resnet50() graph_model.load_state_dict(eager_model.state_dict()) - eager_model.to("cuda") - graph_model.to("cuda") + eager_model.to(args.device) + graph_model.to(args.device) # optimizer setup eager_optimizer = flow.optim.SGD( eager_model.parameters(), lr=args.learning_rate, momentum=args.mom @@ -80,7 +81,7 @@ def setup(args): # criterion setup criterion = flow.nn.CrossEntropyLoss() - criterion = criterion.to("cuda") + criterion = criterion.to(args.device) class ModelTrainGraph(flow.nn.Graph): def __init__(self): @@ -145,6 +146,7 @@ def __init__(self, args): self.graph_eval_total_time = 0.0 self.eager_val_total_time = 0.0 + self.device = args.device self.args = args def compare_eager_graph(self, compare_dic): @@ -167,8 +169,8 @@ def compare_eager_graph(self, compare_dic): for b in range(len(train_data_loader)): image, label = train_data_loader() - image = image.to("cuda") - label = label.to("cuda") + image = image.to(self.device) + label = label.to(self.device) # oneflow graph train graph_iter_start_time = time.time() @@ -224,7 +226,7 @@ def compare_eager_graph(self, compare_dic): total_graph_infer_time, total_eager_infer_time = 0, 0 for b in tqdm(range(len(val_data_loader))): image, label = val_data_loader() - image = image.to("cuda") + image = image.to(self.device) # graph val graph_infer_time = time.time() diff --git a/Vision/classification/image/resnet50/config.py b/Vision/classification/image/resnet50/config.py index 63f3e25e2..129c8968c 100644 --- a/Vision/classification/image/resnet50/config.py +++ b/Vision/classification/image/resnet50/config.py @@ -26,6 +26,7 @@ def parse_args(ignore_unknown_args=False): parser = argparse.ArgumentParser( description="OneFlow ResNet50 Arguments", allow_abbrev=False ) + parser.add_argument("--device", type=str, default="cuda", help="device: cpu, cuda...") parser.add_argument( "--save", type=str, diff --git a/Vision/classification/image/resnet50/graph.py b/Vision/classification/image/resnet50/graph.py index dcad741ba..58ab63689 100644 --- a/Vision/classification/image/resnet50/graph.py +++ b/Vision/classification/image/resnet50/graph.py @@ -51,11 +51,12 @@ def __init__( self.cross_entropy = cross_entropy self.data_loader = data_loader self.add_optimizer(optimizer, lr_sch=lr_scheduler) + self.device = args.device def build(self): image, label = self.data_loader() - image = image.to("cuda") - label = label.to("cuda") + image = image.to(self.device) + label = label.to(self.device) logits = self.model(image) loss = self.cross_entropy(logits, label) if self.return_pred_and_label: @@ -79,11 +80,12 @@ def __init__(self, model, data_loader): self.data_loader = data_loader self.model = model + self.device = args.device def build(self): image, label = self.data_loader() - image = image.to("cuda") - label = label.to("cuda") + image = image.to(self.device) + label = label.to(self.device) logits = self.model(image) pred = logits.softmax() return pred, label diff --git a/Vision/classification/image/resnet50/infer.py b/Vision/classification/image/resnet50/infer.py index 85f19ed6a..8837ec39a 100644 --- a/Vision/classification/image/resnet50/infer.py +++ b/Vision/classification/image/resnet50/infer.py @@ -55,7 +55,7 @@ def main(args): print("***** Model Init *****") model = resnet50() model.load_state_dict(flow.load(args.model_path)) - model = model.to("cuda") + model = model.to(args.device) model.eval() end_t = time.perf_counter() print(f"***** Model Init Finish, time escapled {end_t - start_t:.6f} s *****") @@ -65,7 +65,7 @@ def main(args): start_t = end_t image = load_image(args.image_path) - image = flow.Tensor(image, device=flow.device("cuda")) + image = flow.Tensor(image, device=flow.device(args.device)) if args.graph: pred = model_graph(image) else: diff --git a/Vision/classification/image/resnet50/models/data.py b/Vision/classification/image/resnet50/models/data.py index ee8da362f..ca7e40c23 100644 --- a/Vision/classification/image/resnet50/models/data.py +++ b/Vision/classification/image/resnet50/models/data.py @@ -31,8 +31,9 @@ def make_data_loader(args, mode, is_global=False, synthetic=False): placement=placement, sbp=sbp, channel_last=args.channel_last, + device=args.device, ) - return data_loader.to("cuda") + return data_loader.to(args.device) ofrecord_data_loader = OFRecordDataLoader( ofrecord_dir=args.ofrecord_path, @@ -45,6 +46,7 @@ def make_data_loader(args, mode, is_global=False, synthetic=False): placement=placement, sbp=sbp, use_gpu_decode=args.use_gpu_decode, + device=args.device, ) return ofrecord_data_loader @@ -62,6 +64,7 @@ def __init__( placement=None, sbp=None, use_gpu_decode=False, + device="cuda", ): super().__init__() @@ -71,6 +74,7 @@ def __init__( self.total_batch_size = total_batch_size self.dataset_size = dataset_size self.mode = mode + self.device = device random_shuffle = True if mode == "train" else False shuffle_after_epoch = True if mode == "train" else False @@ -159,11 +163,11 @@ def forward(self): else: image_raw_bytes = self.image_decoder(record) image = self.resize(image_raw_bytes)[0] - image = image.to("cuda") + image = image.to(self.device) label = self.label_decoder(record) flip_code = self.flip() - flip_code = flip_code.to("cuda") + flip_code = flip_code.to(self.device) image = self.crop_mirror_norm(image, flip_code) else: record = self.ofrecord_reader() @@ -184,6 +188,7 @@ def __init__( placement=None, sbp=None, channel_last=False, + device="cuda", ): super().__init__() @@ -220,10 +225,10 @@ def __init__( ) else: self.image = flow.randint( - 0, high=256, size=self.image_shape, dtype=flow.float32, device="cuda" + 0, high=256, size=self.image_shape, dtype=flow.float32, device=device, ) self.label = flow.randint( - 0, high=self.num_classes, size=self.label_shape, device="cuda", + 0, high=self.num_classes, size=self.label_shape, device=device, ).to(dtype=flow.int32) def forward(self): diff --git a/Vision/classification/image/resnet50/train.py b/Vision/classification/image/resnet50/train.py index c1ba49ba4..3ae575040 100644 --- a/Vision/classification/image/resnet50/train.py +++ b/Vision/classification/image/resnet50/train.py @@ -26,6 +26,7 @@ class Trainer(object): def __init__(self): args = get_args() + self.device = args.device for k, v in args.__dict__.items(): setattr(self, k, v) @@ -89,12 +90,12 @@ def init_model(self): start_t = time.perf_counter() if self.is_global: - placement = flow.env.all_device_placement("cuda") + placement = flow.env.all_device_placement(self.device) self.model = self.model.to_global( placement=placement, sbp=flow.sbp.broadcast ) else: - self.model = self.model.to("cuda") + self.model = self.model.to(self.device) if self.load_path is None: self.legacy_init_parameters() @@ -311,8 +312,8 @@ def eval(self): def forward(self): image, label = self.train_data_loader() - image = image.to("cuda") - label = label.to("cuda") + image = image.to(self.device) + label = label.to(self.device) logits = self.model(image) loss = self.cross_entropy(logits, label) if self.metric_train_acc: @@ -323,8 +324,8 @@ def forward(self): def inference(self): image, label = self.val_data_loader() - image = image.to("cuda") - label = label.to("cuda") + image = image.to(self.device) + label = label.to(self.device) with flow.no_grad(): logits = self.model(image) pred = logits.softmax() From 8571548b69c22b83551a392def8855f7d6e82505 Mon Sep 17 00:00:00 2001 From: ShawnXuan Date: Tue, 28 May 2024 22:34:49 +0800 Subject: [PATCH 2/9] recover --- .../classification/image/resnet50/check/check.py | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/Vision/classification/image/resnet50/check/check.py b/Vision/classification/image/resnet50/check/check.py index a822f6e9b..2708f0c0a 100644 --- a/Vision/classification/image/resnet50/check/check.py +++ b/Vision/classification/image/resnet50/check/check.py @@ -14,7 +14,6 @@ def _parse_args(): parser = argparse.ArgumentParser("flags for train resnet50") - parser.add_argument("--device", type=str, default="cuda", help="device: cpu, cuda...") parser.add_argument( "--save_checkpoint_path", type=str, @@ -69,8 +68,8 @@ def setup(args): graph_model = resnet50() graph_model.load_state_dict(eager_model.state_dict()) - eager_model.to(args.device) - graph_model.to(args.device) + eager_model.to("cuda") + graph_model.to("cuda") # optimizer setup eager_optimizer = flow.optim.SGD( eager_model.parameters(), lr=args.learning_rate, momentum=args.mom @@ -81,7 +80,7 @@ def setup(args): # criterion setup criterion = flow.nn.CrossEntropyLoss() - criterion = criterion.to(args.device) + criterion = criterion.to("cuda") class ModelTrainGraph(flow.nn.Graph): def __init__(self): @@ -146,7 +145,6 @@ def __init__(self, args): self.graph_eval_total_time = 0.0 self.eager_val_total_time = 0.0 - self.device = args.device self.args = args def compare_eager_graph(self, compare_dic): @@ -169,8 +167,8 @@ def compare_eager_graph(self, compare_dic): for b in range(len(train_data_loader)): image, label = train_data_loader() - image = image.to(self.device) - label = label.to(self.device) + image = image.to("cuda") + label = label.to("cuda") # oneflow graph train graph_iter_start_time = time.time() @@ -226,7 +224,7 @@ def compare_eager_graph(self, compare_dic): total_graph_infer_time, total_eager_infer_time = 0, 0 for b in tqdm(range(len(val_data_loader))): image, label = val_data_loader() - image = image.to(self.device) + image = image.to("cuda") # graph val graph_infer_time = time.time() From caa6d4ba09b6d3a4758d022bc5bc8837af4ad210 Mon Sep 17 00:00:00 2001 From: ShawnXuan Date: Wed, 29 May 2024 04:17:54 +0000 Subject: [PATCH 3/9] update --- Vision/classification/image/resnet50/models/data.py | 4 ++-- Vision/classification/image/resnet50/train.py | 1 + 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/Vision/classification/image/resnet50/models/data.py b/Vision/classification/image/resnet50/models/data.py index ca7e40c23..c5b3e6958 100644 --- a/Vision/classification/image/resnet50/models/data.py +++ b/Vision/classification/image/resnet50/models/data.py @@ -163,11 +163,11 @@ def forward(self): else: image_raw_bytes = self.image_decoder(record) image = self.resize(image_raw_bytes)[0] - image = image.to(self.device) label = self.label_decoder(record) flip_code = self.flip() - flip_code = flip_code.to(self.device) + if self.use_gpu_decode: + flip_code = flip_code.to(self.device) image = self.crop_mirror_norm(image, flip_code) else: record = self.ofrecord_reader() diff --git a/Vision/classification/image/resnet50/train.py b/Vision/classification/image/resnet50/train.py index 3ae575040..2e2fded8f 100644 --- a/Vision/classification/image/resnet50/train.py +++ b/Vision/classification/image/resnet50/train.py @@ -9,6 +9,7 @@ import time import oneflow as flow +import oneflow_npu from oneflow.nn.parallel import DistributedDataParallel as ddp from config import get_args From ac2ecb7cd37659b28e90546fe29bb8d13054f2b9 Mon Sep 17 00:00:00 2001 From: ShawnXuan Date: Wed, 29 May 2024 04:19:38 +0000 Subject: [PATCH 4/9] update --- Vision/classification/image/resnet50/examples/train_eager.sh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Vision/classification/image/resnet50/examples/train_eager.sh b/Vision/classification/image/resnet50/examples/train_eager.sh index 46461d208..e75b485a0 100644 --- a/Vision/classification/image/resnet50/examples/train_eager.sh +++ b/Vision/classification/image/resnet50/examples/train_eager.sh @@ -26,6 +26,8 @@ VAL_BATCH_SIZE=50 SRC_DIR=$(realpath $(dirname $0)/..) python3 $SRC_DIR/train.py \ + --device npu \ + --label-smoothing 0 \ --ofrecord-path $OFRECORD_PATH \ --ofrecord-part-num $OFRECORD_PART_NUM \ --num-devices-per-node 1 \ From b1908dda7b9669b216e007e739d8f75944d10bc0 Mon Sep 17 00:00:00 2001 From: ShawnXuan Date: Sun, 2 Jun 2024 12:41:47 +0000 Subject: [PATCH 5/9] eager fp32 --- .../examples/train_eager_distributed_fp32.sh | 54 +++++++++++++++++++ .../image/resnet50/models/data.py | 1 + Vision/classification/image/resnet50/train.py | 2 +- 3 files changed, 56 insertions(+), 1 deletion(-) create mode 100644 Vision/classification/image/resnet50/examples/train_eager_distributed_fp32.sh diff --git a/Vision/classification/image/resnet50/examples/train_eager_distributed_fp32.sh b/Vision/classification/image/resnet50/examples/train_eager_distributed_fp32.sh new file mode 100644 index 000000000..cfbd8c09c --- /dev/null +++ b/Vision/classification/image/resnet50/examples/train_eager_distributed_fp32.sh @@ -0,0 +1,54 @@ +# set -aux + +DEVICE_NUM_PER_NODE=8 +MASTER_ADDR=127.0.0.1 +NUM_NODES=1 +NODE_RANK=0 + +export PYTHONUNBUFFERED=1 +echo PYTHONUNBUFFERED=$PYTHONUNBUFFERED +export NCCL_LAUNCH_MODE=PARALLEL +echo NCCL_LAUNCH_MODE=$NCCL_LAUNCH_MODE +# export NCCL_DEBUG=INFO +# export ONEFLOW_DEBUG_MODE=True + +CHECKPOINT_SAVE_PATH="./graph_distributed_fp32_checkpoints" +if [ ! -d "$CHECKPOINT_SAVE_PATH" ]; then + mkdir $CHECKPOINT_SAVE_PATH +fi + +#OFRECORD_PATH=PATH_TO_IMAGENET_OFRECORD +OFRECORD_PATH="/data0/datasets/ImageNet/ofrecord" + +OFRECORD_PART_NUM=256 +LEARNING_RATE=0.768 +MOM=0.875 +EPOCH=50 +TRAIN_BATCH_SIZE=96 +VAL_BATCH_SIZE=50 + +# SRC_DIR=/path/to/models/resnet50 +SRC_DIR=$(realpath $(dirname $0)/..) + +python3 -m oneflow.distributed.launch \ + --nproc_per_node $DEVICE_NUM_PER_NODE \ + --nnodes $NUM_NODES \ + --node_rank $NODE_RANK \ + --master_addr $MASTER_ADDR \ + $SRC_DIR/train.py \ + --device npu \ + --label-smoothing 0 \ + --print-interval 100 \ + --save $CHECKPOINT_SAVE_PATH \ + --ofrecord-path $OFRECORD_PATH \ + --ofrecord-part-num $OFRECORD_PART_NUM \ + --num-devices-per-node $DEVICE_NUM_PER_NODE \ + --lr $LEARNING_RATE \ + --momentum $MOM \ + --num-epochs $EPOCH \ + --train-batch-size $TRAIN_BATCH_SIZE \ + --val-batch-size $VAL_BATCH_SIZE \ + --scale-grad \ + #--graph \ + #--fuse-bn-relu \ + #--fuse-bn-add-relu \ diff --git a/Vision/classification/image/resnet50/models/data.py b/Vision/classification/image/resnet50/models/data.py index c5b3e6958..2f3cbefa9 100644 --- a/Vision/classification/image/resnet50/models/data.py +++ b/Vision/classification/image/resnet50/models/data.py @@ -167,6 +167,7 @@ def forward(self): label = self.label_decoder(record) flip_code = self.flip() if self.use_gpu_decode: + # todo NPU: image will down grade to cpu flip_code = flip_code.to(self.device) image = self.crop_mirror_norm(image, flip_code) else: diff --git a/Vision/classification/image/resnet50/train.py b/Vision/classification/image/resnet50/train.py index 2e2fded8f..43d54617d 100644 --- a/Vision/classification/image/resnet50/train.py +++ b/Vision/classification/image/resnet50/train.py @@ -278,7 +278,7 @@ def train_eager(self): param.grad /= self.world_size else: loss.backward() - loss = loss / self.world_size + #loss = loss / self.world_size self.optimizer.step() self.optimizer.zero_grad() From de581e1d2d78fe68f04e1043796007d1aae76426 Mon Sep 17 00:00:00 2001 From: ShawnXuan Date: Mon, 23 Sep 2024 10:48:06 +0000 Subject: [PATCH 6/9] npu OK --- .../image/resnet50/examples/train_graph.sh | 11 +++++++---- Vision/classification/image/resnet50/models/data.py | 3 ++- .../classification/image/resnet50/models/optimizer.py | 3 ++- 3 files changed, 11 insertions(+), 6 deletions(-) diff --git a/Vision/classification/image/resnet50/examples/train_graph.sh b/Vision/classification/image/resnet50/examples/train_graph.sh index 3e267e0bf..450e37189 100644 --- a/Vision/classification/image/resnet50/examples/train_graph.sh +++ b/Vision/classification/image/resnet50/examples/train_graph.sh @@ -8,7 +8,8 @@ if [ ! -d "$CHECKPOINT_SAVE_PATH" ]; then mkdir $CHECKPOINT_SAVE_PATH fi -OFRECORD_PATH="./mini-imagenet/ofrecord" +#OFRECORD_PATH="./mini-imagenet/ofrecord" +OFRECORD_PATH="/data0/datasets/ImageNet/ofrecord/" if [ ! -d "$OFRECORD_PATH" ]; then wget https://oneflow-public.oss-cn-beijing.aliyuncs.com/online_document/dataset/imagenet/mini-imagenet.zip @@ -36,8 +37,10 @@ python3 $SRC_DIR/train.py \ --train-batch-size $TRAIN_BATCH_SIZE \ --val-batch-size $VAL_BATCH_SIZE \ --save $CHECKPOINT_SAVE_PATH \ - --samples-per-epoch 50 \ - --val-samples-per-epoch 50 \ - --use-gpu-decode \ --scale-grad \ --graph \ + --device npu + #--print-interval 1 \ + #--use-gpu-decode \ + #--samples-per-epoch 50 \ + #--val-samples-per-epoch 50 \ diff --git a/Vision/classification/image/resnet50/models/data.py b/Vision/classification/image/resnet50/models/data.py index 2f3cbefa9..5fce27f7b 100644 --- a/Vision/classification/image/resnet50/models/data.py +++ b/Vision/classification/image/resnet50/models/data.py @@ -46,7 +46,8 @@ def make_data_loader(args, mode, is_global=False, synthetic=False): placement=placement, sbp=sbp, use_gpu_decode=args.use_gpu_decode, - device=args.device, + device="cpu", + #device=args.device, ) return ofrecord_data_loader diff --git a/Vision/classification/image/resnet50/models/optimizer.py b/Vision/classification/image/resnet50/models/optimizer.py index 13b172992..877d9d32a 100644 --- a/Vision/classification/image/resnet50/models/optimizer.py +++ b/Vision/classification/image/resnet50/models/optimizer.py @@ -83,5 +83,6 @@ def forward(self, input, label): # log_prob = input.softmax(dim=-1).log() # onehot_label = flow.F.cast(onehot_label, log_prob.dtype) # loss = flow.mul(log_prob * -1, onehot_label).sum(dim=-1).mean() - loss = flow._C.softmax_cross_entropy(input, onehot_label.to(dtype=input.dtype)) + #loss = flow._C.softmax_cross_entropy(input, onehot_label.to(dtype=input.dtype)) + loss = flow._C.cross_entropy(input, onehot_label.to(dtype=input.dtype), reduction='none') return loss.mean() From affb3df0b80b4a21dd54f33c33057a3ecc2abcbe Mon Sep 17 00:00:00 2001 From: ShawnXuan Date: Tue, 24 Sep 2024 08:30:44 +0000 Subject: [PATCH 7/9] align with eager and cuda --- .../image/resnet50/examples/npu_eager.sh | 46 ++++++++++++++++++ .../image/resnet50/examples/npu_graph.sh | 47 +++++++++++++++++++ .../image/resnet50/models/optimizer.py | 24 ++++++++++ Vision/classification/image/resnet50/train.py | 9 +++- 4 files changed, 125 insertions(+), 1 deletion(-) create mode 100755 Vision/classification/image/resnet50/examples/npu_eager.sh create mode 100755 Vision/classification/image/resnet50/examples/npu_graph.sh diff --git a/Vision/classification/image/resnet50/examples/npu_eager.sh b/Vision/classification/image/resnet50/examples/npu_eager.sh new file mode 100755 index 000000000..f7e078ee6 --- /dev/null +++ b/Vision/classification/image/resnet50/examples/npu_eager.sh @@ -0,0 +1,46 @@ +# set -aux + +export PYTHONUNBUFFERED=1 +echo PYTHONUNBUFFERED=$PYTHONUNBUFFERED + +CHECKPOINT_SAVE_PATH="./graph_checkpoints" +if [ ! -d "$CHECKPOINT_SAVE_PATH" ]; then + mkdir $CHECKPOINT_SAVE_PATH +fi + +#OFRECORD_PATH="./mini-imagenet/ofrecord" +OFRECORD_PATH="/data0/datasets/ImageNet/ofrecord/" + +if [ ! -d "$OFRECORD_PATH" ]; then + wget https://oneflow-public.oss-cn-beijing.aliyuncs.com/online_document/dataset/imagenet/mini-imagenet.zip + unzip mini-imagenet.zip +fi + +OFRECORD_PART_NUM=1 +LEARNING_RATE=0.256 +MOM=0.875 +EPOCH=90 +TRAIN_BATCH_SIZE=50 +VAL_BATCH_SIZE=50 + +# SRC_DIR=/path/to/models/resnet50 +SRC_DIR=$(realpath $(dirname $0)/..) + +python3 $SRC_DIR/train.py \ + --ofrecord-path $OFRECORD_PATH \ + --ofrecord-part-num $OFRECORD_PART_NUM \ + --num-devices-per-node 1 \ + --lr $LEARNING_RATE \ + --momentum $MOM \ + --num-epochs $EPOCH \ + --warmup-epochs 5 \ + --train-batch-size $TRAIN_BATCH_SIZE \ + --val-batch-size $VAL_BATCH_SIZE \ + --save $CHECKPOINT_SAVE_PATH \ + --scale-grad \ + --print-interval 1 \ + --load checkpoints/init \ + --device npu + #--use-gpu-decode \ + #--samples-per-epoch 50 \ + #--val-samples-per-epoch 50 \ diff --git a/Vision/classification/image/resnet50/examples/npu_graph.sh b/Vision/classification/image/resnet50/examples/npu_graph.sh new file mode 100755 index 000000000..58ffdfe2c --- /dev/null +++ b/Vision/classification/image/resnet50/examples/npu_graph.sh @@ -0,0 +1,47 @@ +# set -aux + +export PYTHONUNBUFFERED=1 +echo PYTHONUNBUFFERED=$PYTHONUNBUFFERED + +CHECKPOINT_SAVE_PATH="./graph_checkpoints" +if [ ! -d "$CHECKPOINT_SAVE_PATH" ]; then + mkdir $CHECKPOINT_SAVE_PATH +fi + +#OFRECORD_PATH="./mini-imagenet/ofrecord" +OFRECORD_PATH="/data0/datasets/ImageNet/ofrecord/" + +if [ ! -d "$OFRECORD_PATH" ]; then + wget https://oneflow-public.oss-cn-beijing.aliyuncs.com/online_document/dataset/imagenet/mini-imagenet.zip + unzip mini-imagenet.zip +fi + +OFRECORD_PART_NUM=1 +LEARNING_RATE=0.256 +MOM=0.875 +EPOCH=90 +TRAIN_BATCH_SIZE=50 +VAL_BATCH_SIZE=50 + +# SRC_DIR=/path/to/models/resnet50 +SRC_DIR=$(realpath $(dirname $0)/..) + +python3 $SRC_DIR/train.py \ + --ofrecord-path $OFRECORD_PATH \ + --ofrecord-part-num $OFRECORD_PART_NUM \ + --num-devices-per-node 1 \ + --lr $LEARNING_RATE \ + --momentum $MOM \ + --num-epochs $EPOCH \ + --warmup-epochs 5 \ + --train-batch-size $TRAIN_BATCH_SIZE \ + --val-batch-size $VAL_BATCH_SIZE \ + --save $CHECKPOINT_SAVE_PATH \ + --scale-grad \ + --print-interval 1 \ + --load checkpoints/init \ + --graph \ + --device npu + #--use-gpu-decode \ + #--samples-per-epoch 50 \ + #--val-samples-per-epoch 50 \ diff --git a/Vision/classification/image/resnet50/models/optimizer.py b/Vision/classification/image/resnet50/models/optimizer.py index 877d9d32a..df2a40706 100644 --- a/Vision/classification/image/resnet50/models/optimizer.py +++ b/Vision/classification/image/resnet50/models/optimizer.py @@ -86,3 +86,27 @@ def forward(self, input, label): #loss = flow._C.softmax_cross_entropy(input, onehot_label.to(dtype=input.dtype)) loss = flow._C.cross_entropy(input, onehot_label.to(dtype=input.dtype), reduction='none') return loss.mean() + +class oldLabelSmoothLoss(flow.nn.Module): + """NLL Loss with label smoothing + """ + + #def __init__(self, smoothing=0.1): + #super(LabelSmoothingCrossEntropy, self).__init__() + def __init__(self, num_classes=-1, smooth_rate=0.0): + super().__init__() + assert smooth_rate < 1.0 + self.smoothing = smooth_rate + self.confidence = 1.0 - smooth_rate + + def forward(self, x: flow.Tensor, target: flow.Tensor) -> flow.Tensor: + # TODO: register F.log_softmax() function and switch flow.log(flow.softmax()) to F.log_softmax() + logprobs = flow.log_softmax(x, dim=-1) + # TODO: fix gather bug when dim < 0 + # FIXME: only support cls task now + nll_loss = -logprobs.gather(dim=1, index=target.unsqueeze(1)) + nll_loss = nll_loss.squeeze(1) + smooth_loss = -logprobs.mean(dim=-1) + loss = self.confidence * nll_loss + self.smoothing * smooth_loss + return loss.mean() + diff --git a/Vision/classification/image/resnet50/train.py b/Vision/classification/image/resnet50/train.py index 43d54617d..5d70ca424 100644 --- a/Vision/classification/image/resnet50/train.py +++ b/Vision/classification/image/resnet50/train.py @@ -58,7 +58,7 @@ def __init__(self): self.cross_entropy = make_cross_entropy(args) self.train_data_loader = make_data_loader( - args, "train", self.is_global, self.synthetic_data + args, "validation", self.is_global, self.synthetic_data ) self.val_data_loader = make_data_loader( args, "validation", self.is_global, self.synthetic_data @@ -249,6 +249,13 @@ def train_one_epoch(self): else: loss, pred, label = self.train_eager() + print("loss") + print(loss) + print("pred") + print(pred) + print("label") + print(label) + exit() self.cur_iter += 1 loss = tol(loss, self.metric_local) From 8ba3cd856ac69708cbee7a0baeee967e6e495fde Mon Sep 17 00:00:00 2001 From: ShawnXuan Date: Tue, 24 Sep 2024 08:38:28 +0000 Subject: [PATCH 8/9] update --- .../image/resnet50/examples/train_eager.sh | 2 -- .../image/resnet50/examples/train_graph.sh | 11 ++++------- 2 files changed, 4 insertions(+), 9 deletions(-) diff --git a/Vision/classification/image/resnet50/examples/train_eager.sh b/Vision/classification/image/resnet50/examples/train_eager.sh index e75b485a0..46461d208 100644 --- a/Vision/classification/image/resnet50/examples/train_eager.sh +++ b/Vision/classification/image/resnet50/examples/train_eager.sh @@ -26,8 +26,6 @@ VAL_BATCH_SIZE=50 SRC_DIR=$(realpath $(dirname $0)/..) python3 $SRC_DIR/train.py \ - --device npu \ - --label-smoothing 0 \ --ofrecord-path $OFRECORD_PATH \ --ofrecord-part-num $OFRECORD_PART_NUM \ --num-devices-per-node 1 \ diff --git a/Vision/classification/image/resnet50/examples/train_graph.sh b/Vision/classification/image/resnet50/examples/train_graph.sh index 450e37189..3e267e0bf 100644 --- a/Vision/classification/image/resnet50/examples/train_graph.sh +++ b/Vision/classification/image/resnet50/examples/train_graph.sh @@ -8,8 +8,7 @@ if [ ! -d "$CHECKPOINT_SAVE_PATH" ]; then mkdir $CHECKPOINT_SAVE_PATH fi -#OFRECORD_PATH="./mini-imagenet/ofrecord" -OFRECORD_PATH="/data0/datasets/ImageNet/ofrecord/" +OFRECORD_PATH="./mini-imagenet/ofrecord" if [ ! -d "$OFRECORD_PATH" ]; then wget https://oneflow-public.oss-cn-beijing.aliyuncs.com/online_document/dataset/imagenet/mini-imagenet.zip @@ -37,10 +36,8 @@ python3 $SRC_DIR/train.py \ --train-batch-size $TRAIN_BATCH_SIZE \ --val-batch-size $VAL_BATCH_SIZE \ --save $CHECKPOINT_SAVE_PATH \ + --samples-per-epoch 50 \ + --val-samples-per-epoch 50 \ + --use-gpu-decode \ --scale-grad \ --graph \ - --device npu - #--print-interval 1 \ - #--use-gpu-decode \ - #--samples-per-epoch 50 \ - #--val-samples-per-epoch 50 \ From 82620cc0c15a2dad6c88f7cde55b7f4c8fbcd40c Mon Sep 17 00:00:00 2001 From: ShawnXuan Date: Tue, 24 Sep 2024 08:39:30 +0000 Subject: [PATCH 9/9] update --- .../examples/train_eager_distributed_fp32.sh | 54 ------------------- 1 file changed, 54 deletions(-) delete mode 100644 Vision/classification/image/resnet50/examples/train_eager_distributed_fp32.sh diff --git a/Vision/classification/image/resnet50/examples/train_eager_distributed_fp32.sh b/Vision/classification/image/resnet50/examples/train_eager_distributed_fp32.sh deleted file mode 100644 index cfbd8c09c..000000000 --- a/Vision/classification/image/resnet50/examples/train_eager_distributed_fp32.sh +++ /dev/null @@ -1,54 +0,0 @@ -# set -aux - -DEVICE_NUM_PER_NODE=8 -MASTER_ADDR=127.0.0.1 -NUM_NODES=1 -NODE_RANK=0 - -export PYTHONUNBUFFERED=1 -echo PYTHONUNBUFFERED=$PYTHONUNBUFFERED -export NCCL_LAUNCH_MODE=PARALLEL -echo NCCL_LAUNCH_MODE=$NCCL_LAUNCH_MODE -# export NCCL_DEBUG=INFO -# export ONEFLOW_DEBUG_MODE=True - -CHECKPOINT_SAVE_PATH="./graph_distributed_fp32_checkpoints" -if [ ! -d "$CHECKPOINT_SAVE_PATH" ]; then - mkdir $CHECKPOINT_SAVE_PATH -fi - -#OFRECORD_PATH=PATH_TO_IMAGENET_OFRECORD -OFRECORD_PATH="/data0/datasets/ImageNet/ofrecord" - -OFRECORD_PART_NUM=256 -LEARNING_RATE=0.768 -MOM=0.875 -EPOCH=50 -TRAIN_BATCH_SIZE=96 -VAL_BATCH_SIZE=50 - -# SRC_DIR=/path/to/models/resnet50 -SRC_DIR=$(realpath $(dirname $0)/..) - -python3 -m oneflow.distributed.launch \ - --nproc_per_node $DEVICE_NUM_PER_NODE \ - --nnodes $NUM_NODES \ - --node_rank $NODE_RANK \ - --master_addr $MASTER_ADDR \ - $SRC_DIR/train.py \ - --device npu \ - --label-smoothing 0 \ - --print-interval 100 \ - --save $CHECKPOINT_SAVE_PATH \ - --ofrecord-path $OFRECORD_PATH \ - --ofrecord-part-num $OFRECORD_PART_NUM \ - --num-devices-per-node $DEVICE_NUM_PER_NODE \ - --lr $LEARNING_RATE \ - --momentum $MOM \ - --num-epochs $EPOCH \ - --train-batch-size $TRAIN_BATCH_SIZE \ - --val-batch-size $VAL_BATCH_SIZE \ - --scale-grad \ - #--graph \ - #--fuse-bn-relu \ - #--fuse-bn-add-relu \