diff --git a/pytorch.ipynb b/pytorch.ipynb index 3242323..9aa26b0 100644 --- a/pytorch.ipynb +++ b/pytorch.ipynb @@ -24,270 +24,198 @@ "metadata": {}, "outputs": [], "source": [ - "import os\n", + "\"\"\"\n", + "This example was adapted from the following PyTorch tutorial\n", + "https://pytorch.org/tutorials/beginner/introyt/trainingyt.html\n", + "\"\"\"\n", "\n", - "import optuna\n", - "from optuna.trial import TrialState\n", + "import os\n", "import torch\n", "import torch.nn as nn\n", "import torch.nn.functional as F\n", - "import torch.optim as optim\n", - "import torch.utils.data\n", - "from torchvision import datasets\n", - "from torchvision import transforms\n", - "\n", - "\n", - "BATCHSIZE = 128\n", - "CLASSES = 10\n", - "EPOCHS = 10\n", - "N_TRAIN_EXAMPLES = BATCHSIZE * 30\n", - "N_VALID_EXAMPLES = BATCHSIZE * 10\n", - "\n", - "\n", - "def define_model(trial):\n", - " # We optimize the number of layers, hidden units and dropout ratio in each layer.\n", - " n_layers = trial.suggest_int(\"n_layers\", 1, 3)\n", - " layers = []\n", - "\n", - " in_features = 28 * 28\n", - " for i in range(n_layers):\n", - " out_features = trial.suggest_int(\"n_units_l{}\".format(i), 4, 128)\n", - " layers.append(nn.Linear(in_features, out_features))\n", - " layers.append(nn.ReLU())\n", - " p = trial.suggest_float(\"dropout_l{}\".format(i), 0.2, 0.5)\n", - " layers.append(nn.Dropout(p))\n", - "\n", - " in_features = out_features\n", - " layers.append(nn.Linear(in_features, CLASSES))\n", - " layers.append(nn.LogSoftmax(dim=1))\n", - "\n", - " return nn.Sequential(*layers)\n", - "\n", - "\n", - "def get_mnist():\n", - " # Load FashionMNIST dataset.\n", - " train_loader = torch.utils.data.DataLoader(\n", - " datasets.FashionMNIST(os.getcwd(), train=True, download=True, transform=transforms.ToTensor()),\n", - " batch_size=BATCHSIZE,\n", - " shuffle=True,\n", - " )\n", - " valid_loader = torch.utils.data.DataLoader(\n", - " datasets.FashionMNIST(os.getcwd(), train=False, 
transform=transforms.ToTensor()),\n", - " batch_size=BATCHSIZE,\n", - " shuffle=True,\n", - " )\n", - "\n", - " return train_loader, valid_loader\n", - "\n", - "\n", - "def objective(trial):\n", - " DEVICE = torch.device(\"cuda\")\n", - "\n", - " # Generate the model.\n", - " model = define_model(trial).to(DEVICE)\n", - "\n", - " # Generate the optimizers.\n", - " optimizer_name = trial.suggest_categorical(\"optimizer\", [\"Adam\", \"RMSprop\", \"SGD\"])\n", - " lr = trial.suggest_float(\"lr\", 1e-5, 1e-1, log=True)\n", - " optimizer = getattr(optim, optimizer_name)(model.parameters(), lr=lr)\n", - "\n", - " # Get the FashionMNIST dataset.\n", - " train_loader, valid_loader = get_mnist()\n", - "\n", - " # Training of the model.\n", - " for epoch in range(EPOCHS):\n", - " model.train()\n", - " for batch_idx, (data, target) in enumerate(train_loader):\n", - " # Limiting training data for faster epochs.\n", - " if batch_idx * BATCHSIZE >= N_TRAIN_EXAMPLES:\n", - " break\n", - "\n", - " data, target = data.view(data.size(0), -1).to(DEVICE), target.to(DEVICE)\n", - "\n", - " optimizer.zero_grad()\n", - " output = model(data)\n", - " loss = F.nll_loss(output, target)\n", - " loss.backward()\n", - " optimizer.step()\n", - "\n", - " # Validation of the model.\n", + "from torch.optim import SGD\n", + "from torchvision import datasets, transforms\n", + "from dask.distributed import print\n", + "\n", + "def load_data():\n", + " transform = transforms.Compose(\n", + " [transforms.ToTensor(),\n", + " transforms.Normalize((0.5,), (0.5,))])\n", + "\n", + " # Create datasets for training & validation, download if necessary\n", + " training_set = datasets.FashionMNIST(os.getcwd(), train=True, transform=transform, download=True)\n", + " validation_set = datasets.FashionMNIST(os.getcwd(), train=False, transform=transform, download=True)\n", + "\n", + " # Create data loaders for our datasets; shuffle for training, not for validation\n", + " training_loader = 
torch.utils.data.DataLoader(training_set, batch_size=4, shuffle=True)\n", + " validation_loader = torch.utils.data.DataLoader(validation_set, batch_size=4, shuffle=False)\n", + "\n", + " # Report split sizes\n", + " print('Training set has {} instances'.format(len(training_set)))\n", + " print('Validation set has {} instances'.format(len(validation_set)))\n", + "\n", + " return training_loader, validation_loader\n", + "\n", + "\n", + "class GarmentClassifier(nn.Module):\n", + " def __init__(self):\n", + " super(GarmentClassifier, self).__init__()\n", + " self.conv1 = nn.Conv2d(1, 6, 5)\n", + " self.pool = nn.MaxPool2d(2, 2)\n", + " self.conv2 = nn.Conv2d(6, 16, 5)\n", + " self.fc1 = nn.Linear(16 * 4 * 4, 120)\n", + " self.fc2 = nn.Linear(120, 84)\n", + " self.fc3 = nn.Linear(84, 10)\n", + "\n", + " def forward(self, x):\n", + " x = self.pool(F.relu(self.conv1(x)))\n", + " x = self.pool(F.relu(self.conv2(x)))\n", + " x = x.view(-1, 16 * 4 * 4)\n", + " x = F.relu(self.fc1(x))\n", + " x = F.relu(self.fc2(x))\n", + " x = self.fc3(x)\n", + " return x\n", + "\n", + "\n", + "def train_one_epoch(model, loss_fn, optimizer, training_loader, device):\n", + " running_loss = 0.\n", + " last_loss = 0.\n", + "\n", + " # Here, we use enumerate(training_loader) instead of\n", + " # iter(training_loader) so that we can track the batch\n", + " # index and do some intra-epoch reporting\n", + " for i, data in enumerate(training_loader):\n", + " # Every data instance is an input + label pair\n", + " inputs, labels = data\n", + "\n", + " # Move to GPU\n", + " inputs, labels = inputs.to(device), labels.to(device)\n", + "\n", + " # Zero your gradients for every batch!\n", + " optimizer.zero_grad()\n", + "\n", + " # Make predictions for this batch\n", + " outputs = model(inputs)\n", + "\n", + " # Compute the loss and its gradients\n", + " loss = loss_fn(outputs, labels)\n", + " loss.backward()\n", + "\n", + " # Adjust learning weights\n", + " optimizer.step()\n", + "\n", + " # Gather 
data\n", + " running_loss += loss.item()\n", + " if i % 1000 == 999:\n", + " last_loss = running_loss / 1000 # loss per batch\n", + " print(' batch {} loss: {}'.format(i + 1, last_loss))\n", + " running_loss = 0.\n", + "\n", + " return last_loss\n", + "\n", + "\n", + "def train_all_epochs():\n", + " # Confirm that GPU shows up\n", + " if torch.cuda.is_available():\n", + " device = \"cuda\"\n", + " print(f\"Using GPU {torch.cuda.get_device_name(torch.cuda.current_device())} 😎\\n\")\n", + " else:\n", + " device = \"cpu\"\n", + " print(\"Using CPU 😔\\n\")\n", + "\n", + " training_loader, validation_loader = load_data()\n", + " model = GarmentClassifier().to(device)\n", + " loss_fn = nn.CrossEntropyLoss()\n", + " optimizer = SGD(model.parameters(), lr=0.001, momentum=0.9)\n", + "\n", + " epochs = 5\n", + " best_vloss = 1_000_000.\n", + "\n", + " for epoch in range(epochs):\n", + " print(f'EPOCH {epoch + 1}:')\n", + "\n", + " # Make sure gradient tracking is on, and do a pass over the data\n", + " model.train(True)\n", + " avg_loss = train_one_epoch(model, loss_fn, optimizer, training_loader, device)\n", + "\n", + " running_vloss = 0.0\n", + " # Set the model to evaluation mode, disabling dropout and using population\n", + " # statistics for batch normalization.\n", " model.eval()\n", - " correct = 0\n", - " with torch.no_grad():\n", - " for batch_idx, (data, target) in enumerate(valid_loader):\n", - " # Limiting validation data.\n", - " if batch_idx * BATCHSIZE >= N_VALID_EXAMPLES:\n", - " break\n", - " data, target = data.view(data.size(0), -1).to(DEVICE), target.to(DEVICE)\n", - " output = model(data)\n", - " # Get the index of the max log-probability.\n", - " pred = output.argmax(dim=1, keepdim=True)\n", - " correct += pred.eq(target.view_as(pred)).sum().item()\n", - "\n", - " accuracy = correct / min(len(valid_loader.dataset), N_VALID_EXAMPLES)\n", "\n", - " trial.report(accuracy, epoch)\n", + " # Disable gradient computation and reduce memory consumption.\n", + " 
with torch.no_grad():\n", + " for i, vdata in enumerate(validation_loader):\n", + " vinputs, vlabels = vdata\n", "\n", - " # Handle pruning based on the intermediate value.\n", - " if trial.should_prune():\n", - " raise optuna.exceptions.TrialPruned()\n", + " # Move to GPU\n", + " vinputs, vlabels = vinputs.to(device), vlabels.to(device)\n", "\n", - " return accuracy\n" - ] - }, - { - "cell_type": "markdown", - "id": "e566c680-e604-48fb-944b-71309a84aeb2", - "metadata": {}, - "source": [ - "## Run things locally, just to see that everything works\n", + " voutputs = model(vinputs)\n", + " vloss = loss_fn(voutputs, vlabels)\n", + " running_vloss += vloss\n", "\n", + " avg_vloss = running_vloss / (i + 1)\n", + " print('LOSS train {} valid {}'.format(avg_loss, avg_vloss))\n", "\n", - "```python\n", - "study = optuna.create_study(direction=\"maximize\")\n", - "study.optimize(objective, n_trials=1, timeout=600)\n", - "```" - ] - }, - { - "cell_type": "markdown", - "id": "710fe6d4-d6e9-460a-a68f-9bb0e99fa4cb", - "metadata": {}, - "source": [ - "## Create GPU software environment\n", + " # Return the best model\n", + " if avg_vloss < best_vloss:\n", + " best_vloss = avg_vloss\n", + " best_model = model\n", "\n", - "We don't want to run this most of the time." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "a5dfe2ad-4586-4266-8dcc-4791a8064508", - "metadata": {}, - "outputs": [], - "source": [ - "%%time\n", + " print(f\"Model on CUDA device: {next(best_model.parameters()).is_cuda}\")\n", "\n", - "import coiled\n", + " # Move model to CPU so it can be serialized and returned to local machine\n", "\n", - "coiled.create_software_environment(\n", - " name=\"pytorch\",\n", - " conda={\n", - " \"channels\": [\"pytorch\", \"nvidia\", \"conda-forge\", \"defaults\"],\n", - " \"dependencies\": [\"dask=2023.2\", \"pytorch\", \"optuna\", \"torchvision\", \"cudatoolkit\", \"pynvml\"],\n", - " },\n", - " gpu_enabled=True,\n", - ")" + " return best_model\n", + "\n" ] }, { "cell_type": "markdown", - "id": "88faca73-1a7b-49f8-bfb6-c9063e0fddc0", + "id": "95510b24-123d-4bb4-8249-a0edb003582b", "metadata": {}, "source": [ - "## Create Cluster" + "## Run on CPU" ] }, { "cell_type": "code", "execution_count": null, - "id": "96747c85-9a80-477b-9a63-66afaca68440", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "%%time\n", - "\n", - "import coiled\n", - "\n", - "cluster = coiled.Cluster(\n", - " n_workers=10,\n", - " worker_gpu=True,\n", - " software=\"pytorch\",\n", - " worker_options={\"nthreads\": 1},\n", - " name=\"pytorch-gpus\",\n", - ")\n", - "\n", - "client = cluster.get_client()" - ] - }, - { - "cell_type": "markdown", - "id": "2887125f-6ee1-460c-96c4-72fbe0e2aba8", - "metadata": {}, - "source": [ - "## Train" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "a4a51be7-2ad5-4a1a-90e9-dfb126183670", + "id": "9ade8f94-9008-443c-93e1-6da90d80efbc", "metadata": {}, "outputs": [], "source": [ - "%%time\n", - "\n", - "import optuna\n", - "from optuna.integration.dask import DaskStorage\n", - "\n", - "study = optuna.create_study(\n", - " direction=\"maximize\",\n", - " storage=DaskStorage(),\n", - ")\n", - "\n", - "from dask.distributed import wait\n", - "\n", - "futures = [\n", 
- " client.submit(study.optimize, objective, n_trials=1, pure=False)\n", - " for _ in range(100)\n", - "]\n", - "\n", - "_ = wait(futures)" + "model = train_all_epochs()" ] }, { "cell_type": "markdown", - "id": "0024b619-4612-4c66-89d1-1972b1a00fa6", + "id": "6ebb91d8-9c2d-408d-9c3c-a0e9535106f1", "metadata": {}, "source": [ - "## Analyze results" + "## Run on GPU\n" ] }, { "cell_type": "code", "execution_count": null, - "id": "c1f7c5c0-05fb-4ab0-beaa-2f20f799f72c", + "id": "2b59f138-f84f-4319-9a54-821dd0525d65", "metadata": {}, "outputs": [], "source": [ - "pruned_trials = study.get_trials(deepcopy=False, states=[TrialState.PRUNED])\n", - "complete_trials = study.get_trials(deepcopy=False, states=[TrialState.COMPLETE])\n", - "\n", - "print(\"Study statistics: \")\n", - "print(\" Number of finished trials: \", len(study.trials))\n", - "print(\" Number of pruned trials: \", len(pruned_trials))\n", - "print(\" Number of complete trials: \", len(complete_trials))\n", - "\n", - "print(\"Best trial:\")\n", - "trial = study.best_trial\n", + "import coiled\n", "\n", - "print(\" Value: \", trial.value)\n", + "@coiled.function(\n", + " vm_type=\"g5.xlarge\",\n", + " region=\"us-east-2\",\n", + " keepalive=\"1 hour\",\n", + ")\n", + "def train_on_gpu():\n", + " model = train_all_epochs()\n", + " return model.to(\"cpu\")\n", "\n", - "print(\" Params: \")\n", - "for key, value in trial.params.items():\n", - " print(\" {}: {}\".format(key, value))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "8dd0c44e-0e6f-41f8-964a-a45861b8bd73", - "metadata": {}, - "outputs": [], - "source": [ - "cluster.shutdown()" + "model = train_on_gpu()" ] } ], @@ -307,7 +235,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.0" + "version": "3.10.14" } }, "nbformat": 4,