
Commit

tweak running examples without cuda (Taeung#794)
* tweak running examples without cuda

* rework dry_run handling in mnist, mnist_hogwild
mattip authored Jul 1, 2020
1 parent 59caa16 commit 49ec0bd
Showing 10 changed files with 120 additions and 57 deletions.
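
Taken together, the file changes apply one recurring pattern: each training script gains a --dry-run flag that caps the run at a single batch (and, where useful, a single epoch), and any CUDA use is made conditional on torch.cuda.is_available(), so the integration script can exercise every example on a CPU-only machine. Below is a minimal, self-contained sketch of that pattern; it is not lifted from any of the changed files, and the toy dataset and linear model are stand-ins:

import argparse
import torch
from torch.utils.data import DataLoader, TensorDataset

parser = argparse.ArgumentParser()
parser.add_argument('--epochs', type=int, default=10)
parser.add_argument('--dry-run', action='store_true',
                    help='quickly check a single pass')
args = parser.parse_args()

# Run on CUDA when present, otherwise fall back to the CPU instead of failing.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Stand-in data and model; the real examples build their own datasets and nets.
dataset = TensorDataset(torch.randn(64, 10), torch.randint(0, 2, (64,)))
loader = DataLoader(dataset, batch_size=8, shuffle=True)
model = torch.nn.Linear(10, 2).to(device)
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)

epochs = 1 if args.dry_run else args.epochs     # dcgan-style: cap the epoch count
for epoch in range(epochs):
    for x, y in loader:
        x, y = x.to(device), y.to(device)
        optimizer.zero_grad()
        loss = torch.nn.functional.cross_entropy(model(x), y)
        loss.backward()
        optimizer.step()
        if args.dry_run:                        # mnist-style: stop after one batch
            break
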
6 changes: 6 additions & 0 deletions dcgan/main.py
@@ -26,6 +26,7 @@
parser.add_argument('--lr', type=float, default=0.0002, help='learning rate, default=0.0002')
parser.add_argument('--beta1', type=float, default=0.5, help='beta1 for adam. default=0.5')
parser.add_argument('--cuda', action='store_true', help='enables cuda')
parser.add_argument('--dry-run', action='store_true', help='check a single training cycle works')
parser.add_argument('--ngpu', type=int, default=1, help='number of GPUs to use')
parser.add_argument('--netG', default='', help="path to netG (to continue training)")
parser.add_argument('--netD', default='', help="path to netD (to continue training)")
@@ -211,6 +212,9 @@ def forward(self, input):
optimizerD = optim.Adam(netD.parameters(), lr=opt.lr, betas=(opt.beta1, 0.999))
optimizerG = optim.Adam(netG.parameters(), lr=opt.lr, betas=(opt.beta1, 0.999))

if opt.dry_run:
opt.niter = 1

for epoch in range(opt.niter):
for i, data in enumerate(dataloader, 0):
############################
@@ -261,6 +265,8 @@ def forward(self, input):
'%s/fake_samples_epoch_%03d.png' % (opt.outf, epoch),
normalize=True)

if opt.dry_run:
break
# do checkpointing
torch.save(netG.state_dict(), '%s/netG_epoch_%d.pth' % (opt.outf, epoch))
torch.save(netD.state_dict(), '%s/netD_epoch_%d.pth' % (opt.outf, epoch))
10 changes: 7 additions & 3 deletions imagenet/main.py
@@ -136,7 +136,9 @@ def main_worker(gpu, ngpus_per_node, args):
print("=> creating model '{}'".format(args.arch))
model = models.__dict__[args.arch]()

if args.distributed:
if not torch.cuda.is_available():
print('using CPU, this will be slow')
elif args.distributed:
# For multiprocessing distributed, DistributedDataParallel constructor
# should always set the single device scope, otherwise,
# DistributedDataParallel will use all available devices.
@@ -281,7 +283,8 @@ def train(train_loader, model, criterion, optimizer, epoch, args):

if args.gpu is not None:
images = images.cuda(args.gpu, non_blocking=True)
target = target.cuda(args.gpu, non_blocking=True)
if torch.cuda.is_available():
target = target.cuda(args.gpu, non_blocking=True)

# compute output
output = model(images)
@@ -324,7 +327,8 @@ def validate(val_loader, model, criterion, args):
for i, (images, target) in enumerate(val_loader):
if args.gpu is not None:
images = images.cuda(args.gpu, non_blocking=True)
target = target.cuda(args.gpu, non_blocking=True)
if torch.cuda.is_available():
target = target.cuda(args.gpu, non_blocking=True)

# compute output
output = model(images)
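
The imagenet hunks keep the explicit .cuda() calls and simply guard them with torch.cuda.is_available() so a CPU-only run no longer crashes on the target tensor. A device-agnostic alternative (not what this commit does, just a sketch for comparison) resolves a device once and moves both tensors with .to():

import torch

# Choose the device once; .to() returns the tensor unchanged if it is already there.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

images = torch.randn(4, 3, 224, 224)
target = torch.randint(0, 1000, (4,))

# Works unchanged on CPU-only machines and on GPU boxes.
images = images.to(device, non_blocking=True)
target = target.to(device, non_blocking=True)
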
35 changes: 21 additions & 14 deletions mnist/main.py
@@ -47,6 +47,8 @@ def train(args, model, device, train_loader, optimizer, epoch):
print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
epoch, batch_idx * len(data), len(train_loader.dataset),
100. * batch_idx / len(train_loader), loss.item()))
if args.dry_run:
break


def test(model, device, test_loader):
@@ -83,6 +85,8 @@ def main():
help='Learning rate step gamma (default: 0.7)')
parser.add_argument('--no-cuda', action='store_true', default=False,
help='disables CUDA training')
parser.add_argument('--dry-run', action='store_true', default=False,
help='quickly check a single pass')
parser.add_argument('--seed', type=int, default=1, metavar='S',
help='random seed (default: 1)')
parser.add_argument('--log-interval', type=int, default=10, metavar='N',
@@ -96,20 +100,23 @@ def main():

device = torch.device("cuda" if use_cuda else "cpu")

kwargs = {'num_workers': 1, 'pin_memory': True} if use_cuda else {}
train_loader = torch.utils.data.DataLoader(
datasets.MNIST('../data', train=True, download=True,
transform=transforms.Compose([
transforms.ToTensor(),
transforms.Normalize((0.1307,), (0.3081,))
])),
batch_size=args.batch_size, shuffle=True, **kwargs)
test_loader = torch.utils.data.DataLoader(
datasets.MNIST('../data', train=False, transform=transforms.Compose([
transforms.ToTensor(),
transforms.Normalize((0.1307,), (0.3081,))
])),
batch_size=args.test_batch_size, shuffle=True, **kwargs)
kwargs = {'batch_size': args.batch_size}
if use_cuda:
kwargs.update({'num_workers': 1,
'pin_memory': True,
'shuffle': True},
)

transform=transforms.Compose([
transforms.ToTensor(),
transforms.Normalize((0.1307,), (0.3081,))
])
dataset1 = datasets.MNIST('../data', train=True, download=True,
transform=transform)
dataset2 = datasets.MNIST('../data', train=False,
transform=transform)
train_loader = torch.utils.data.DataLoader(dataset1,**kwargs)
test_loader = torch.utils.data.DataLoader(dataset2, **kwargs)

model = Net().to(device)
optimizer = optim.Adadelta(model.parameters(), lr=args.lr)
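
After this rework both MNIST loaders share one options dict: batch_size is always set, while num_workers, pin_memory, and shuffle are only added when CUDA is in use. A small sketch of the resulting DataLoader call, with a stand-in dataset in place of MNIST:

import torch
from torch.utils.data import DataLoader, TensorDataset

use_cuda = torch.cuda.is_available()
dataset = TensorDataset(torch.randn(16, 1, 28, 28), torch.randint(0, 10, (16,)))

kwargs = {'batch_size': 4}
if use_cuda:
    # Worker processes and pinned memory only pay off when batches move to a GPU.
    kwargs.update({'num_workers': 1, 'pin_memory': True, 'shuffle': True})

loader = DataLoader(dataset, **kwargs)
for data, target in loader:
    pass  # the training step would go here
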
25 changes: 22 additions & 3 deletions mnist_hogwild/main.py
@@ -4,6 +4,8 @@
import torch.nn as nn
import torch.nn.functional as F
import torch.multiprocessing as mp
from torch.utils.data.sampler import Sampler
from torchvision import datasets, transforms

from train import train, test

@@ -27,6 +29,8 @@
help='how many training processes to use (default: 2)')
parser.add_argument('--cuda', action='store_true', default=False,
help='enables CUDA training')
parser.add_argument('--dry-run', action='store_true', default=False,
help='quickly check a single pass')

class Net(nn.Module):
def __init__(self):
@@ -46,12 +50,26 @@ def forward(self, x):
x = self.fc2(x)
return F.log_softmax(x, dim=1)


if __name__ == '__main__':
args = parser.parse_args()

use_cuda = args.cuda and torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")
dataloader_kwargs = {'pin_memory': True} if use_cuda else {}
transform=transforms.Compose([
transforms.ToTensor(),
transforms.Normalize((0.1307,), (0.3081,))
])
dataset1 = datasets.MNIST('../data', train=True, download=True,
transform=transform)
dataset2 = datasets.MNIST('../data', train=False,
transform=transform)
kwargs = {'batch_size': args.batch_size,
'shuffle': True}
if use_cuda:
kwargs.update({'num_workers': 1,
'pin_memory': True,
})

torch.manual_seed(args.seed)
mp.set_start_method('spawn')
@@ -61,12 +79,13 @@ def forward(self, x):

processes = []
for rank in range(args.num_processes):
p = mp.Process(target=train, args=(rank, args, model, device, dataloader_kwargs))
p = mp.Process(target=train, args=(rank, args, model, device,
dataset1, kwargs))
# We first train the model across `num_processes` processes
p.start()
processes.append(p)
for p in processes:
p.join()

# Once training is complete, we can test the model
test(args, model, device, dataloader_kwargs)
test(args, model, device, dataset2, kwargs)
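
With this change the Hogwild driver builds the MNIST datasets once in the parent and hands them, along with the loader options, to each spawned process; the workers (see mnist_hogwild/train.py below) then construct their own DataLoader around the shared model. A condensed, self-contained sketch of that flow, using a toy dataset and linear model rather than the MNIST ones:

import torch
import torch.multiprocessing as mp
from torch.utils.data import DataLoader, TensorDataset

def train(rank, model, dataset, loader_kwargs):
    torch.manual_seed(rank)
    # Each worker builds its own loader over the dataset it was handed.
    loader = DataLoader(dataset, **loader_kwargs)
    optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
    for x, y in loader:
        optimizer.zero_grad()
        torch.nn.functional.cross_entropy(model(x), y).backward()
        optimizer.step()  # Hogwild: lock-free updates to the shared parameters

if __name__ == '__main__':
    mp.set_start_method('spawn')
    dataset = TensorDataset(torch.randn(64, 10), torch.randint(0, 2, (64,)))
    kwargs = {'batch_size': 8, 'shuffle': True}
    model = torch.nn.Linear(10, 2)
    model.share_memory()  # all workers update the same parameter storage
    processes = [mp.Process(target=train, args=(rank, model, dataset, kwargs))
                 for rank in range(2)]
    for p in processes:
        p.start()
    for p in processes:
        p.join()
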
24 changes: 6 additions & 18 deletions mnist_hogwild/train.py
@@ -2,36 +2,22 @@
import torch
import torch.optim as optim
import torch.nn.functional as F
from torchvision import datasets, transforms


def train(rank, args, model, device, dataloader_kwargs):
def train(rank, args, model, device, dataset, dataloader_kwargs):
torch.manual_seed(args.seed + rank)

train_loader = torch.utils.data.DataLoader(
datasets.MNIST('../data', train=True, download=True,
transform=transforms.Compose([
transforms.ToTensor(),
transforms.Normalize((0.1307,), (0.3081,))
])),
batch_size=args.batch_size, shuffle=True, num_workers=1,
**dataloader_kwargs)
train_loader = torch.utils.data.DataLoader(dataset, **dataloader_kwargs)

optimizer = optim.SGD(model.parameters(), lr=args.lr, momentum=args.momentum)
for epoch in range(1, args.epochs + 1):
train_epoch(epoch, args, model, device, train_loader, optimizer)


def test(args, model, device, dataloader_kwargs):
def test(args, model, device, dataset, dataloader_kwargs):
torch.manual_seed(args.seed)

test_loader = torch.utils.data.DataLoader(
datasets.MNIST('../data', train=False, transform=transforms.Compose([
transforms.ToTensor(),
transforms.Normalize((0.1307,), (0.3081,))
])),
batch_size=args.batch_size, shuffle=True, num_workers=1,
**dataloader_kwargs)
test_loader = torch.utils.data.DataLoader(dataset, **dataloader_kwargs)

test_epoch(model, device, test_loader)

@@ -49,6 +35,8 @@ def train_epoch(epoch, args, model, device, data_loader, optimizer):
print('{}\tTrain Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
pid, epoch, batch_idx * len(data), len(data_loader.dataset),
100. * batch_idx / len(data_loader), loss.item()))
if args.dry_run:
break


def test_epoch(model, device, data_loader):
63 changes: 45 additions & 18 deletions run_python_examples.sh
@@ -3,27 +3,34 @@
# This script runs through the code in each of the python examples.
# The purpose is just as an integration test, not to actually train
# models in any meaningful way. For that reason, most of these set
# epochs = 1.
# epochs = 1 and --dry-run.
#
# Optionally specify a comma separated list of examples to run.
# can be run as:
# ./run_python_examples.sh "install_deps,run_all,clean"
# to pip install dependencies (other than pytorch), run all examples,
# and remove temporary/changed data files.
# Expects pytorch to be installed.
# Expects pytorch, torchvision to be installed.

BASE_DIR=`pwd`"/"`dirname $0`
EXAMPLES=`echo $1 | sed -e 's/ //g'`

if which nvcc ; then
echo "using cuda"
CUDA=1
CUDA_FLAG="--cuda"
else
echo "not using cuda"
CUDA=0
CUDA_FLAG=""
fi
USE_CUDA=$(python -c "import torchvision, torch; print(torch.cuda.is_available())")
case $USE_CUDA in
"True")
echo "using cuda"
CUDA=1
CUDA_FLAG="--cuda"
;;
"False")
echo "not using cuda"
CUDA=0
CUDA_FLAG=""
;;
"")
exit 1;
;;
esac

ERRORS=""

@@ -63,7 +70,7 @@ function dcgan() {
unzip ${DATACLASS}_train_lmdb.zip || { error "couldn't unzip $DATACLASS"; return; }
popd
fi
python main.py --dataset lsun --dataroot lsun --classes $DATACLASS --niter 1 $CUDA_FLAG || error "dcgan failed"
python main.py --dataset lsun --dataroot lsun --classes $DATACLASS --niter 1 $CUDA_FLAG --dry-run || error "dcgan failed"
}

function fast_neural_style() {
@@ -92,12 +99,12 @@ function imagenet() {

function mnist() {
start
python main.py --epochs 1 || error "mnist example failed"
python main.py --epochs 1 --dry-run || error "mnist example failed"
}

function mnist_hogwild() {
start
python main.py --epochs 1 $CUDA_FLAG || error "mnist hogwild failed"
python main.py --epochs 1 --dry-run $CUDA_FLAG || error "mnist hogwild failed"
}

function regression() {
@@ -115,7 +122,7 @@ function snli() {
echo "installing 'en' model if not installed"
python -m spacy download en || { error "couldn't download 'en' model needed for snli"; return; }
echo "training..."
python train.py --epochs 1 --no-bidirectional || error "couldn't train snli"
python train.py --epochs 1 --dev_every 1 --no-bidirectional --dry-run || error "couldn't train snli"
}

function super_resolution() {
@@ -126,7 +133,7 @@ function super_resolution() {
function time_sequence_prediction() {
start
python generate_sine_wave.py || { error "generate sine wave failed"; return; }
python train.py || error "time sequence prediction training failed"
python train.py --steps 2 || error "time sequence prediction training failed"
}

function vae() {
@@ -136,18 +143,38 @@ function vae() {

function word_language_model() {
start
python main.py --epochs 1 $CUDA_FLAG || error "word_language_model failed"
python main.py --epochs 1 --dry-run $CUDA_FLAG || error "word_language_model failed"
}

function clean() {
cd $BASE_DIR
rm -rf dcgan/_cache_lsun_classroom_train_lmdb dcgan/fake_samples_epoch_000.png dcgan/lsun/ dcgan/netD_epoch_0.pth dcgan/netG_epoch_0.pth dcgan/real_samples.png fast_neural_style/saved_models.zip fast_neural_style/saved_models/ imagenet/checkpoint.pth.tar imagenet/lsun/ imagenet/model_best.pth.tar imagenet/sample/ snli/.data/ snli/.vector_cache/ snli/results/ super_resolution/dataset/ super_resolution/model_epoch_1.pth word_language_model/model.pt || error "couldn't clean up some files"
rm -rf dcgan/_cache_lsun_classroom_train_lmdb \
dcgan/fake_samples_epoch_000.png dcgan/lsun/ \
dcgan/_cache_lsunclassroomtrainlmdb \
dcgan/netD_epoch_0.pth dcgan/netG_epoch_0.pth \
dcgan/real_samples.png \
fast_neural_style/saved_models.zip \
fast_neural_style/saved_models/ \
imagenet/checkpoint.pth.tar \
imagenet/lsun/ \
imagenet/model_best.pth.tar \
imagenet/sample/ \
snli/.data/ \
snli/.vector_cache/ \
snli/results/ \
super_resolution/dataset/ \
super_resolution/model_epoch_1.pth \
time_sequence_prediction/predict*.pdf \
time_sequence_prediction/traindata.pt \
word_language_model/model.pt || error "couldn't clean up some files"

git checkout fast_neural_style/images/output-images/amber-candy.jpg || error "couldn't clean up fast neural style image"
}

function run_all() {
# cpp
dcgan
# distributed
fast_neural_style
imagenet
mnist
2 changes: 2 additions & 0 deletions snli/train.py
@@ -140,3 +140,5 @@
print(log_template.format(time.time()-start,
epoch, iterations, 1+batch_idx, len(train_iter),
100. * (1+batch_idx) / len(train_iter), loss.item(), ' '*8, n_correct/n_total*100, ' '*12))
if args.dry_run:
break
2 changes: 2 additions & 0 deletions snli/util.py
@@ -65,5 +65,7 @@ def get_args():
'glove.6B.50d glove.6B.100d glove.6B.200d glove.6B.300d')
parser.add_argument('--resume_snapshot', type=str, default='',
help='model snapshot to resume.')
parser.add_argument('--dry-run', action='store_true',
help='run only a few iterations')
args = parser.parse_args()
return args
6 changes: 5 additions & 1 deletion time_sequence_prediction/train.py
@@ -1,4 +1,5 @@
from __future__ import print_function
import argparse
import torch
import torch.nn as nn
import torch.optim as optim
@@ -36,6 +37,9 @@ def forward(self, input, future = 0):


if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument('--steps', type=int, default=15, help='steps to run')
opt = parser.parse_args()
# set random seed to 0
np.random.seed(0)
torch.manual_seed(0)
@@ -52,7 +56,7 @@ def forward(self, input, future = 0):
# use LBFGS as optimizer since we can load the whole data to train
optimizer = optim.LBFGS(seq.parameters(), lr=0.8)
#begin to train
for i in range(15):
for i in range(opt.steps):
print('STEP: ', i)
def closure():
optimizer.zero_grad()
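
Each value of --steps here is one outer LBFGS iteration, and LBFGS may re-evaluate the model several times inside a single .step() call, which is why the loop body defines a closure. A generic sketch of that closure pattern on a toy regression problem (not the sequence model in this example):

import torch

# Toy problem: fit y = 3x with a single linear layer using LBFGS.
x = torch.linspace(-1, 1, 100).unsqueeze(1)
y = 3 * x
model = torch.nn.Linear(1, 1)
optimizer = torch.optim.LBFGS(model.parameters(), lr=0.8)

steps = 2  # mirrors the --steps option the hunk above introduces
for i in range(steps):
    def closure():
        # LBFGS calls this repeatedly within a single optimizer.step()
        optimizer.zero_grad()
        loss = torch.nn.functional.mse_loss(model(x), y)
        loss.backward()
        return loss
    loss = optimizer.step(closure)
    print('STEP:', i, 'loss:', float(loss))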