Commit
feat: update imagenet training script
avik-pal committed Sep 15, 2024
1 parent 5200f58 commit 95fdb5e
Showing 10 changed files with 515 additions and 49,871 deletions.
8 changes: 0 additions & 8 deletions docs/src/ecosystem.md
@@ -162,14 +162,6 @@ const autodiff = [
];
const dataload = [
{
avatar: 'https://github.com/evizero.png',
name: 'Augmentor.jl',
desc: 'Data augmentation for machine learning',
links: [
{ icon: 'github', link: 'https://github.com/evizero/Augmentor.jl' }
]
},
{
avatar: 'https://github.com/JuliaML.png',
name: 'MLUtils.jl',
53 changes: 53 additions & 0 deletions examples/ImageNet/Project copy.toml
@@ -0,0 +1,53 @@
# [deps]
# AMDGPU = "21141c5a-9bdb-4563-92ae-f87d6854732e"
# Augmentor = "02898b10-1f73-11ea-317c-6393d7073e15"
# Boltz = "4544d5e4-abc5-4dea-817f-29e4c205d9c8"
# Configurations = "5218b696-f38b-4ac9-8b61-a12ec717816d"
# Dates = "ade2ca70-3891-5945-98fb-dc099432e06a"
# FLoops = "cc61a311-1640-44b5-9fba-1b764f453329"
# FileIO = "5789e2e9-d7fb-5bc7-8068-2c6fae9b9549"
# Format = "1fa38f19-a742-5d3f-a2b9-30dd87b9d5f8"
# Functors = "d9f16b24-f501-4c13-a1f2-28368ffc5196"
# Images = "916415d5-f1e6-5110-898d-aaa5f9f070e0"
# JLD2 = "033835bb-8acc-5ee8-8aae-3f567f8a3819"
# JpegTurbo = "b835a17e-a41a-41e7-81f0-2f016b05efe0"
# Lux = "b2108857-7c20-44ae-9111-449ecde12c47"
# LuxCUDA = "d0bbae9a-e099-4d5b-a835-1c6931763bda"
# MLUtils = "f1d291b0-491e-4a28-83b9-f70985020b54"
# MPI = "da04e1cc-30fd-572f-bb4f-1f8673147195"
# Metalhead = "dbeba491-748d-5e0e-a39e-b530a07fa0cc"
# NCCL = "3fe64909-d7a1-4096-9b7d-7a0f12cf0f6b"
# OneHotArrays = "0b1bfda6-eb8a-41d2-88d8-f5af5cad476f"
# Optimisers = "3bd65402-5787-11e9-1adc-39752487f4e2"
# ParameterSchedulers = "d7d3b36b-41b8-4d0d-a2bf-768c6151755e"
# Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
# Setfield = "efcf1570-3423-57d1-acb7-fd33fddbac46"
# SimpleConfig = "f2d95530-262a-480f-aff0-1c0431e662a7"
# Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
# Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f"

# [compat]
# AMDGPU = "1"
# Augmentor = "0.6"
# Boltz = "1"
# Configurations = "0.17"
# FLoops = "0.2"
# FileIO = "1.16"
# Format = "1.3"
# Functors = "0.4"
# Images = "0.26"
# JLD2 = "0.4.46, 0.5"
# JpegTurbo = "0.1"
# Lux = "1"
# LuxCUDA = "0.3"
# MLUtils = "0.4"
# MPI = "0.20.19"
# Metalhead = "0.9"
# NCCL = "0.1.1"
# OneHotArrays = "0.2"
# Optimisers = "0.3"
# ParameterSchedulers = "0.4"
# Setfield = "1"
# SimpleConfig = "0.1"
# Statistics = "1"
# Zygote = "0.6"
42 changes: 7 additions & 35 deletions examples/ImageNet/Project.toml
@@ -1,53 +1,25 @@
[deps]
AMDGPU = "21141c5a-9bdb-4563-92ae-f87d6854732e"
Augmentor = "02898b10-1f73-11ea-317c-6393d7073e15"
Boltz = "4544d5e4-abc5-4dea-817f-29e4c205d9c8"
Configurations = "5218b696-f38b-4ac9-8b61-a12ec717816d"
Comonicon = "863f3e99-da2a-4334-8734-de3dacbe5542"
DataAugmentation = "88a5189c-e7ff-4f85-ac6b-e6158070f02e"
Dates = "ade2ca70-3891-5945-98fb-dc099432e06a"
FLoops = "cc61a311-1640-44b5-9fba-1b764f453329"
FileIO = "5789e2e9-d7fb-5bc7-8068-2c6fae9b9549"
Format = "1fa38f19-a742-5d3f-a2b9-30dd87b9d5f8"
Functors = "d9f16b24-f501-4c13-a1f2-28368ffc5196"
Images = "916415d5-f1e6-5110-898d-aaa5f9f070e0"
ImageIO = "82e4d734-157c-48bb-816b-45c225c6df19"
ImageMagick = "6218d12a-5da1-5696-b52f-db25d2ecc6d1"
JLD2 = "033835bb-8acc-5ee8-8aae-3f567f8a3819"
JpegTurbo = "b835a17e-a41a-41e7-81f0-2f016b05efe0"
Lux = "b2108857-7c20-44ae-9111-449ecde12c47"
LuxCUDA = "d0bbae9a-e099-4d5b-a835-1c6931763bda"
MLDataDevices = "7e8f7934-dd98-4c1a-8fe8-92b47a384d40"
MLUtils = "f1d291b0-491e-4a28-83b9-f70985020b54"
MPI = "da04e1cc-30fd-572f-bb4f-1f8673147195"
Metalhead = "dbeba491-748d-5e0e-a39e-b530a07fa0cc"
NCCL = "3fe64909-d7a1-4096-9b7d-7a0f12cf0f6b"
OneHotArrays = "0b1bfda6-eb8a-41d2-88d8-f5af5cad476f"
Optimisers = "3bd65402-5787-11e9-1adc-39752487f4e2"
ParameterSchedulers = "d7d3b36b-41b8-4d0d-a2bf-768c6151755e"
Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
Setfield = "efcf1570-3423-57d1-acb7-fd33fddbac46"
SimpleConfig = "f2d95530-262a-480f-aff0-1c0431e662a7"
Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f"

[compat]
AMDGPU = "1"
Augmentor = "0.6"
Boltz = "0.1, 0.2, 0.3"
Configurations = "0.17"
FLoops = "0.2"
FileIO = "1.16"
Format = "1.3"
Functors = "0.4"
Images = "0.26"
JLD2 = "0.4.46, 0.5"
JpegTurbo = "0.1"
Lux = "1"
LuxCUDA = "0.3"
MLUtils = "0.4"
MPI = "0.20.19"
Metalhead = "0.9"
NCCL = "0.1.1"
OneHotArrays = "0.2"
Optimisers = "0.3"
ParameterSchedulers = "0.4"
Setfield = "1"
SimpleConfig = "0.1"
Statistics = "1"
Zygote = "0.6"
[extras]
CUDA_Runtime_jll = "76a88914-d11a-5bdc-97e0-2f5a05c973a2"
163 changes: 67 additions & 96 deletions examples/ImageNet/README.md
@@ -14,20 +14,27 @@ the ImageNet dataset.
## Training

To train a model, run `main.jl` with the necessary parameters. See
[Boltz documentation](https://lux.csail.mit.edu/dev/api/Domain_Specific_Modeling/Boltz) for
the model configuration.
[Boltz documentation](https://luxdl.github.io/Boltz.jl/stable/) for the model configuration.

```bash
julia --project=examples/ImageNet -t auto examples/ImageNet/main.jl \
    --cfg.dataset.data_root=/home/avik-pal/data/ImageNet/ \
    --cfg.dataset.train_batchsize=256 --cfg.dataset.eval_batchsize=256 \
    --cfg.optimizer.learning_rate=0.5

julia --project=examples/ImageNet -t auto examples/ImageNet/main.jl \
    --cfg.model.name=alexnet --cfg.model.arch=alexnet \
    --cfg.dataset.data_root=/home/avik-pal/data/ImageNet/ \
    --cfg.dataset.train_batchsize=256 --cfg.dataset.eval_batchsize=256 \
    --cfg.optimizer.learning_rate=0.01

julia --startup=no --project=examples/ImageNet -t auto examples/ImageNet/main.jl \
    --model-name="VGG" \
    --depth=19 \
    --train-batchsize=256 \
    --val-batchsize=256 \
    --optimizer-kind="sgd" \
    --learning-rate=0.01 \
    --base-path="/home/avik-pal/data/ImageNet/"

julia --startup=no --project=examples/ImageNet -t auto examples/ImageNet/main.jl \
    --model-name="ViT" \
    --model-kind="tiny" \
    --train-batchsize=256 \
    --val-batchsize=256 \
    --optimizer-kind="sgd" \
    --learning-rate=0.01 \
    --base-path="/home/avik-pal/data/ImageNet/"
```
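
The full set of options shown in the Usage section below can also be printed directly via the script's `--help` flag:

```bash
julia --startup=no --project=examples/ImageNet examples/ImageNet/main.jl --help
```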

## Distributed Data Parallel Training
@@ -37,93 +37,44 @@
If your system has a functional NCCL installation, it is used for all CUDA communications;
otherwise MPI is used for all communications (a sketch of this backend selection follows the command below).

```bash
mpiexecjl -np 4 julia --project=examples/ImageNet -t auto examples/ImageNet/main.jl \
    --cfg.dataset.data_root=/home/avik-pal/data/ImageNet/ \
    --cfg.dataset.train_batchsize=256 --cfg.dataset.eval_batchsize=256 \
    --cfg.optimizer.learning_rate=0.01

mpiexecjl -np 4 julia --startup=no --project=examples/ImageNet -t auto \
    examples/ImageNet/main.jl \
    --model-name="ViT" \
    --model-kind="tiny" \
    --train-batchsize=256 \
    --val-batchsize=256 \
    --optimizer-kind="sgd" \
    --learning-rate=0.01 \
    --base-path="/home/avik-pal/data/ImageNet/"
```
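
For reference, below is a minimal sketch of how this NCCL-first preference could be written against Lux's `DistributedUtils` API (names follow the Lux distributed-training docs). It is an illustration only: the functional-CUDA check and the exact calls are assumptions, not code taken from `main.jl`.

```julia
# Sketch only: prefer NCCL when CUDA is functional, otherwise fall back to MPI.
# The real selection logic lives in examples/ImageNet/main.jl and may differ.
using Lux, LuxCUDA, MLDataDevices, MPI, NCCL

function select_distributed_backend()
    if MLDataDevices.functional(CUDADevice)
        # NCCL is only useful with a working CUDA setup.
        DistributedUtils.initialize(NCCLBackend)
        return DistributedUtils.get_distributed_backend(NCCLBackend)
    else
        DistributedUtils.initialize(MPIBackend)
        return DistributedUtils.get_distributed_backend(MPIBackend)
    end
end

backend = select_distributed_backend()
@info "Distributed setup" rank=DistributedUtils.local_rank(backend) workers=DistributedUtils.total_workers(backend)
```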

## Usage

```bash
usage: main.jl [--cfg.seed CFG.SEED] [--cfg.model.name CFG.MODEL.NAME]
[--cfg.model.arch CFG.MODEL.ARCH]
[--cfg.model.pretrained CFG.MODEL.PRETRAINED]
[--cfg.optimizer.name CFG.OPTIMIZER.NAME]
[--cfg.optimizer.learning_rate CFG.OPTIMIZER.LEARNING_RATE]
[--cfg.optimizer.nesterov CFG.OPTIMIZER.NESTEROV]
[--cfg.optimizer.momentum CFG.OPTIMIZER.MOMENTUM]
[--cfg.optimizer.weight_decay CFG.OPTIMIZER.WEIGHT_DECAY]
[--cfg.optimizer.scheduler.name CFG.OPTIMIZER.SCHEDULER.NAME]
[--cfg.optimizer.scheduler.cycle_length CFG.OPTIMIZER.SCHEDULER.CYCLE_LENGTH]
[--cfg.optimizer.scheduler.damp_factor CFG.OPTIMIZER.SCHEDULER.DAMP_FACTOR]
[--cfg.optimizer.scheduler.lr_step CFG.OPTIMIZER.SCHEDULER.LR_STEP]
[--cfg.optimizer.scheduler.lr_step_decay CFG.OPTIMIZER.SCHEDULER.LR_STEP_DECAY]
[--cfg.train.total_steps CFG.TRAIN.TOTAL_STEPS]
[--cfg.train.evaluate_every CFG.TRAIN.EVALUATE_EVERY]
[--cfg.train.resume CFG.TRAIN.RESUME]
[--cfg.train.evaluate CFG.TRAIN.EVALUATE]
[--cfg.train.checkpoint_dir CFG.TRAIN.CHECKPOINT_DIR]
[--cfg.train.log_dir CFG.TRAIN.LOG_DIR]
[--cfg.train.expt_subdir CFG.TRAIN.EXPT_SUBDIR]
[--cfg.train.expt_id CFG.TRAIN.EXPT_ID]
[--cfg.train.print_frequency CFG.TRAIN.PRINT_FREQUENCY]
[--cfg.dataset.data_root CFG.DATASET.DATA_ROOT]
[--cfg.dataset.eval_batchsize CFG.DATASET.EVAL_BATCHSIZE]
[--cfg.dataset.train_batchsize CFG.DATASET.TRAIN_BATCHSIZE]
[-h]

optional arguments:
--cfg.seed CFG.SEED (type: Int64, default: 12345)
--cfg.model.name CFG.MODEL.NAME
(default: "resnet")
--cfg.model.arch CFG.MODEL.ARCH
(default: "resnet18")
--cfg.model.pretrained CFG.MODEL.PRETRAINED
(type: Bool, default: false)
--cfg.optimizer.name CFG.OPTIMIZER.NAME
(default: "adam")
--cfg.optimizer.learning_rate CFG.OPTIMIZER.LEARNING_RATE
(type: Float32, default: 0.01)
--cfg.optimizer.nesterov CFG.OPTIMIZER.NESTEROV
(type: Bool, default: false)
--cfg.optimizer.momentum CFG.OPTIMIZER.MOMENTUM
(type: Float32, default: 0.0)
--cfg.optimizer.weight_decay CFG.OPTIMIZER.WEIGHT_DECAY
(type: Float32, default: 0.0)
--cfg.optimizer.scheduler.name CFG.OPTIMIZER.SCHEDULER.NAME
(default: "step")
--cfg.optimizer.scheduler.cycle_length CFG.OPTIMIZER.SCHEDULER.CYCLE_LENGTH
(type: Int64, default: 50000)
--cfg.optimizer.scheduler.damp_factor CFG.OPTIMIZER.SCHEDULER.DAMP_FACTOR
(type: Float32, default: 1.2)
--cfg.optimizer.scheduler.lr_step CFG.OPTIMIZER.SCHEDULER.LR_STEP
(type: Vector{Int64}, default: [100000, 250000, 500000])
--cfg.optimizer.scheduler.lr_step_decay CFG.OPTIMIZER.SCHEDULER.LR_STEP_DECAY
(type: Float32, default: 0.1)
--cfg.train.total_steps CFG.TRAIN.TOTAL_STEPS
(type: Int64, default: 800000)
--cfg.train.evaluate_every CFG.TRAIN.EVALUATE_EVERY
(type: Int64, default: 10000)
--cfg.train.resume CFG.TRAIN.RESUME
(default: "")
--cfg.train.evaluate CFG.TRAIN.EVALUATE
(type: Bool, default: false)
--cfg.train.checkpoint_dir CFG.TRAIN.CHECKPOINT_DIR
(default: "checkpoints")
--cfg.train.log_dir CFG.TRAIN.LOG_DIR
(default: "logs")
--cfg.train.expt_subdir CFG.TRAIN.EXPT_SUBDIR
(default: "")
--cfg.train.expt_id CFG.TRAIN.EXPT_ID
(default: "")
--cfg.train.print_frequency CFG.TRAIN.PRINT_FREQUENCY
(type: Int64, default: 100)
--cfg.dataset.data_root CFG.DATASET.DATA_ROOT
(default: "")
--cfg.dataset.eval_batchsize CFG.DATASET.EVAL_BATCHSIZE
(type: Int64, default: 64)
--cfg.dataset.train_batchsize CFG.DATASET.TRAIN_BATCHSIZE
(type: Int64, default: 64)
-h, --help show this help message and exit
main

Usage

main [options] [flags]

Options

--seed <0::Integer>
--model-name <String>
--model-kind <nokind::String>
--depth <-1::Int>
--base-path <::String>
--train-batchsize <64::Int>
--val-batchsize <64::Int>
--image-size <-1::Int>
--optimizer-kind <sgd::String>
--learning-rate <0.01::Float32>
--momentum <0.0::Float32>
--weight-decay <0.0::Float32>
--scheduler-kind <step::String>
--cycle-length <50000::Int>
--damp-factor <1.2::Float32>
--lr-step-decay <0.1::Float32>
--lr-step <[100000...::Vector{Int64}>
--expt-id <::String>
--expt-subdir <#= /home...::String>
--resume <::String>
--total-steps <800000::Int>
--evaluate-every <10000::Integer>
--print-frequency <100::Integer>

Flags

--pretrained
--nesterov
--evaluate
-h, --help Print this help message.
--version Print version.
```
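
The help output above comes from Comonicon.jl, which this commit adds to the example's dependencies. As a rough, hypothetical sketch (the actual `main.jl` signature is not part of this diff), a Comonicon entry point producing options and flags of this shape could look like:

```julia
# Hypothetical sketch of a Comonicon CLI; see examples/ImageNet/main.jl for the real entry point.
using Comonicon

"""
Train an ImageNet model.

# Options

- `--model-name <String>`: model family, e.g. "VGG" or "ViT".
- `--depth <Int>`: model depth, e.g. 19 for VGG19.

# Flags

- `--pretrained`: start from pretrained weights.
"""
@main function main(;
        model_name::String, model_kind::String="nokind", depth::Int=-1,
        base_path::String="", train_batchsize::Int=64, val_batchsize::Int=64,
        optimizer_kind::String="sgd", learning_rate::Float32=0.01f0,
        pretrained::Bool=false, nesterov::Bool=false, evaluate::Bool=false)
    # Keyword arguments become `--kebab-case` options; Bool keywords become flags.
    # ... build the model, data loaders, optimizer, and training loop here ...
end
```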
48 changes: 0 additions & 48 deletions examples/ImageNet/config.jl

This file was deleted.

