New talktorial: Molecular Transformer #397

Open · wants to merge 1 commit into base: master
97 changes: 97 additions & 0 deletions teachopencadd/talktorials/T039_molecular_transformers/code/data.py
@@ -0,0 +1,97 @@
import math
import random

import numpy as np
import torch
from torch.nn.utils.rnn import pad_sequence

import tokenizer


def generate_random_data(n):
    """Generate a small toy dataset of token sequences with scalar labels."""
    SOS_token = np.array([2])
    EOS_token = np.array([3])
    length = 8

    data = []

    # all-ones sequence -> label 1
    for i in range(n // 3):
        X = np.concatenate((SOS_token, np.ones(length), EOS_token))
        y = 1
        data.append([X, y])

    # all-zeros sequence -> label 0
    for i in range(n // 3):
        X = np.concatenate((SOS_token, np.zeros(length), EOS_token))
        y = 0
        data.append([X, y])

    # alternating 1,0,1,0,... sequence -> label 1
    for i in range(n // 3):
        X = np.zeros(length)
        start = random.randint(0, 1)
        X[start::2] = 1
        X = np.concatenate((SOS_token, X, EOS_token))
        y = 1
        data.append([X, y])

    np.random.shuffle(data)

    return data


def batchify_data(data, batch_size=64, padding=True, padding_token=30):
    """Group samples into batches, padding each sequence to the longest one in its batch."""
    batches = []
    for idx in range(0, len(data), batch_size):
        # Drop the last chunk if it is smaller than batch_size
        if idx + batch_size < len(data):
            if padding:
                # Find the longest sequence in the batch ...
                max_batch_length = 0
                for seq in data[idx : idx + batch_size]:
                    if len(seq[0]) > max_batch_length:
                        max_batch_length = len(seq[0])
                # ... and append padding tokens to every sequence until it reaches that length
                for seq_idx in range(batch_size):
                    remaining_length = max_batch_length - len(data[idx + seq_idx][0])
                    data[idx + seq_idx][0] = np.concatenate(
                        [
                            data[idx + seq_idx][0],
                            np.array([padding_token] * remaining_length, dtype=np.int64),
                        ],
                        axis=0,
                    )
            # dtype=object keeps each (sequence, label) pair in a (batch_size, 2) object array
            batches.append(np.array(data[idx : idx + batch_size], dtype=object))

    print(f"{len(batches)} batches of size {batch_size}")
    return batches


def generate_dataset(smiles, y, token, vocab):
    """Tokenize SMILES strings and pair them with their targets, skipping NaN targets."""
    data = []
    smiles = [tokenizer.smiles_to_ohe(smi, token, vocab) for smi in smiles]
    for smi, tar in zip(smiles, y):
        if not math.isnan(tar):
            smi = np.array(smi)
            data.append([smi, tar])

    np.random.shuffle(data)
    return data


def add_padding(data):
    """Pad a list of variable-length sequences to a common length."""
    data = pad_sequence(
        sequences=[torch.tensor(seq) for seq in data],
        batch_first=True,
        padding_value=0,
    )
    return data
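For orientation, a minimal sketch of how these helpers fit together. The SMILES list and random targets are purely illustrative (in the talktorial they come from the QM9 csv loaded in main.py), and the object-array batch layout follows the padding logic above:

    import numpy as np
    import data
    import tokenizer

    smiles = ["CCO", "CCN", "c1ccccc1", "CC(=O)O"] * 64   # illustrative input molecules
    targets = np.random.rand(len(smiles))                  # illustrative regression targets

    token = tokenizer.SmilesTokenizer()
    vocab = tokenizer.build_vocab(smiles, token, max_vocab_size=30)

    dataset = data.generate_dataset(smiles, targets, token, vocab)
    batches = data.batchify_data(dataset, batch_size=64)   # list of (64, 2) object arrays
    X, y = batches[0][:, 0], batches[0][:, 1]              # padded token-id sequences, targets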
@@ -0,0 +1,42 @@
import torch
import torch.nn as nn
import math


class PositionalEncoding(nn.Module):
    def __init__(self, dim_model, dropout_p, max_len):
        super().__init__()
        # Modified version of: https://pytorch.org/tutorials/beginner/transformer_tutorial.html
        # max_len determines how far a position can influence a token (window)

        self.dropout = nn.Dropout(dropout_p)

        # Encoding, computed from the sinusoidal formula:
        # PE(pos, 2i)     = sin(pos / 10000^(2i / dim_model))
        # PE(pos, 2i + 1) = cos(pos / 10000^(2i / dim_model))
        pos_encoding = torch.zeros(max_len, 1, dim_model)
        factor = -math.log(10000.0) / dim_model  # constant factor, kept outside the loop
        for pos in range(max_len):  # position of the token in the sequence
            for i in range(0, dim_model, 2):  # index within the embedding dimension
                div_term = math.exp(i * factor)
                pos_encoding[pos, 0, i] = math.sin(pos * div_term)
                pos_encoding[pos, 0, i + 1] = math.cos(pos * div_term)

        # Save as buffer (like a parameter, but without gradients)
        self.register_buffer("pos_encoding", pos_encoding)

    def forward(self, token_embedding: torch.Tensor) -> torch.Tensor:
        # Residual connection + positional encoding
        pos_enc = self.pos_encoding[: token_embedding.size(0), :]
        token_embedding = token_embedding + pos_enc
        return self.dropout(token_embedding)
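The nested loop can equivalently be written in vectorized form. A minimal sketch of that construction (same formula, same (max_len, 1, dim_model) buffer shape; the helper name sinusoidal_encoding is illustrative):

    import math
    import torch

    def sinusoidal_encoding(max_len: int, dim_model: int) -> torch.Tensor:
        # Vectorized equivalent of the double loop in PositionalEncoding.__init__
        positions = torch.arange(max_len, dtype=torch.float).unsqueeze(1)  # (max_len, 1)
        div_term = torch.exp(
            torch.arange(0, dim_model, 2).float() * (-math.log(10000.0) / dim_model)
        )
        pos_encoding = torch.zeros(max_len, 1, dim_model)
        pos_encoding[:, 0, 0::2] = torch.sin(positions * div_term)  # even indices
        pos_encoding[:, 0, 1::2] = torch.cos(positions * div_term)  # odd indices
        return pos_encoding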
@@ -0,0 +1,47 @@
import torch
import numpy as np


def predict(model, input_sequence):
    """
    Method from "A detailed guide to PyTorch's nn.Transformer() module." by
    Daniel Melchor: https://medium.com/@danielmelchor/a-detailed-guide-to-pytorchs-nn-transformer-module-c80afbc9ffb1
    """
    model.eval()
    pred = model(input_sequence)
    return pred


def test_loop(model, loss_fn, dataloader, device):
    """Evaluate the model on a dataloader; return mean loss, predictions, and ground truth."""
    total_loss = 0
    prediction = np.empty(0)
    ground_truth = np.empty(0)
    model.eval()

    for batch in dataloader:
        with torch.no_grad():
            X, y = batch[:, 0], batch[:, 1]
            X = np.array([arr.astype(np.int64) for arr in X])
            X, y = torch.tensor(X).to(device), torch.tensor(y.astype(np.float32)).to(device)

            pred = model(X)
            loss = loss_fn(pred, y.float().unsqueeze(1))

            total_loss += loss.detach().item()
            prediction = np.concatenate((prediction, pred.cpu().detach().numpy()[:, 0]))
            ground_truth = np.concatenate((ground_truth, y.cpu().detach().numpy()))

    return total_loss / len(dataloader), prediction, ground_truth
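A minimal sketch of how predict could be used on a single molecule after training. It assumes the model, token, vocab, device, y_mean, and y_std objects created in main.py, and that the model accepts a (batch, sequence_length) tensor of token ids as in test_loop; the de-normalization step mirrors how y is standardized in main.py:

    import torch
    import tokenizer
    import inference

    smi = "CCO"  # illustrative molecule
    token_ids = tokenizer.smiles_to_ohe(smi, token, vocab)
    X = torch.tensor([token_ids], dtype=torch.int64).to(device)  # shape (1, n_tokens)
    with torch.no_grad():
        pred = inference.predict(model, X)
    mu_hat = pred.item() * y_std + y_mean  # undo the normalization applied in main.py
    print(f"Predicted mu for {smi}: {mu_hat:.3f}")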

70 changes: 70 additions & 0 deletions teachopencadd/talktorials/T039_molecular_transformers/code/main.py
@@ -0,0 +1,70 @@
import os
from pathlib import Path

import pandas as pd
import torch
import torch.nn as nn

import data
import inference
import plot
import tokenizer
import training
import transformer


HERE = Path(__file__).parent.resolve()
DATA = HERE / "data"

# load the dataset
df = pd.read_csv(os.path.join(DATA, "qm9.csv.gz"), compression="gzip")
df = df.sample(frac=1).reset_index(drop=True)

smiles = df["smiles"].tolist()
y = df["mu"]



sample_size = len(y)  # use the full dataset; reduce (e.g., to 50000) for faster runs
train_index = int(sample_size * 0.8)
test_index = train_index + int(sample_size * 0.1)

# normalize data
y_mean = y[:train_index].mean()
y_std = y[:train_index].std()
y = (y - y_mean) / y_std


max_vocab_size = 30
token = tokenizer.SmilesTokenizer()
vocab = tokenizer.build_vocab(smiles[:sample_size], token, max_vocab_size)
vocab_size = len(vocab)

train_data = data.generate_dataset(smiles[:train_index], y[:train_index], token, vocab)
val_data = data.generate_dataset(smiles[train_index:test_index], y[train_index:test_index], token, vocab)
test_data = data.generate_dataset(smiles[test_index:sample_size], y[test_index:sample_size], token, vocab)

train_dataloader = data.batchify_data(train_data)
val_dataloader = data.batchify_data(val_data)
test_dataloader = data.batchify_data(test_data)


# Prefer CUDA, then Apple MPS, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available() and torch.backends.mps.is_built():
    device = "mps"
else:
    device = "cpu"
print(device)

model = transformer.Transformer(
num_tokens=vocab_size, dim_model=100, num_heads=4, num_encoder_layers=3, dropout_p=0.2
).to(device)
opt = torch.optim.SGD(model.parameters(), lr=0.01)
# loss_fn = nn.CrossEntropyLoss()
loss_fn = nn.MSELoss()

train_loss_list, val_loss_list = training.fit(model, opt, loss_fn, train_dataloader, val_dataloader, 50, device)

plot.plot_loss(train_loss_list, val_loss_list)

test_loss, predictions, ground_truth = inference.test_loop(model, loss_fn, test_dataloader, device)
print(f"Test loss: {test_loss:.4f}")
plot.plot_targets(predictions, ground_truth)
44 changes: 44 additions & 0 deletions teachopencadd/talktorials/T039_molecular_transformers/code/plot.py
@@ -0,0 +1,44 @@
import matplotlib.pyplot as plt
from matplotlib.ticker import MaxNLocator


def plot_loss(train_loss, val_loss):
    """Plot the training and validation loss for each epoch

    Args:
        train_loss (array): training losses for each epoch
        val_loss (array): validation losses for each epoch
    """
    plt.plot(train_loss, label="Training loss")
    plt.plot(val_loss, label="Validation loss")
    plt.legend()
    plt.ylabel("loss")
    plt.xlabel("epoch")
    plt.title("Model Loss")
    plt.gca().xaxis.set_major_locator(MaxNLocator(integer=True))
    # plt.show()
    plt.savefig("plots/loss_4.png")




def plot_targets(pred, ground_truth):
    """Plot predicted vs. ground truth values in a scatter plot

    Args:
        pred (array): predicted values
        ground_truth (array): ground truth values
    """
    f, ax = plt.subplots(figsize=(6, 6))
    ax.scatter(pred, ground_truth, s=0.5)
    plt.xlim(-2, 7)
    plt.ylim(-2, 7)
    ax.axline((1, 1), slope=1)  # identity line
    plt.xlabel("Predicted value")
    plt.ylabel("Ground truth")
    plt.title("Ground truth vs. prediction")
    # plt.show()
    plt.savefig("plots/scatter_4.png")


@@ -0,0 +1,81 @@
import re
from collections import Counter


class SmilesTokenizer(object):
    """
    A simple regex-based tokenizer adapted from the deepchem smiles_tokenizer package.
    The SMILES regex pattern used for tokenization was designed by Schwaller et al., ACS Cent. Sci. 5 (2019).
    """

    def __init__(self):
        self.regex_pattern = (
            r"(\[[^\]]+]|Br?|Cl?|N|O|S|P|F|I|b|c|n|o|s|p|\(|\)|\."
            r"|=|#|-|\+|\\|\/|:|~|@|\?|>>?|\*|\$|\%[0-9]{2}|[0-9])"
        )
        self.regex = re.compile(self.regex_pattern)

    def tokenize(self, smiles):
        """
        Tokenize a SMILES string.

        Parameters
        ----------
        smiles : str
            Input SMILES string.

        Returns
        -------
        List[str]
            A list of tokens.
        """
        tokens = self.regex.findall(smiles)
        return tokens

def build_vocab(smiles_list, tokenizer, max_vocab_size):
    """
    Build a vocabulary of the N=max_vocab_size most common tokens from a list of SMILES strings.

    Parameters
    ----------
    smiles_list : List[str]
        List of SMILES strings.
    tokenizer : SmilesTokenizer
    max_vocab_size : int
        Maximum size of the vocabulary.

    Returns
    -------
    Dict[str, int]
        A dictionary that maps each token to its index in the vocabulary.
    """
    tokenized_smiles = [tokenizer.tokenize(s) for s in smiles_list]
    token_counter = Counter(c for s in tokenized_smiles for c in s)
    tokens = [token for token, _ in token_counter.most_common(max_vocab_size)]
    vocab = {token: idx for idx, token in enumerate(tokens)}
    return vocab


def smiles_to_ohe(smiles, tokenizer, vocab):
    """
    Transform a SMILES string into a list of token indices.

    Parameters
    ----------
    smiles : str
        Input SMILES string.
    tokenizer : SmilesTokenizer
    vocab : Dict[str, int]
        A dictionary that maps each token to its index in the vocabulary.

    Returns
    -------
    List[int]
        The vocabulary index of each token in the input string; tokens that are
        not in the vocabulary are mapped to the last vocabulary index.
    """
    unknown_token_id = len(vocab) - 1
    token_ids = [vocab.get(token, unknown_token_id) for token in tokenizer.tokenize(smiles)]
    # A true one-hot encoding would be: torch.eye(len(vocab))[token_ids]
    return token_ids
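A minimal usage sketch of this module; the SMILES strings and vocabulary size are illustrative:

    import tokenizer

    smiles_list = ["CCO", "c1ccccc1", "CC(=O)O"]          # illustrative molecules
    token = tokenizer.SmilesTokenizer()
    print(token.tokenize("CC(=O)O"))                       # ['C', 'C', '(', '=', 'O', ')', 'O']
    vocab = tokenizer.build_vocab(smiles_list, token, max_vocab_size=30)
    print(vocab)                                           # token -> index, most frequent first
    print(tokenizer.smiles_to_ohe("CCO", token, vocab))    # e.g. [1, 1, 2]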