New talktorial: Molecular Transformer #397

Open · wants to merge 1 commit into base: master
97 changes: 97 additions & 0 deletions teachopencadd/talktorials/T039_molecular_transformers/code/data.py
@@ -0,0 +1,97 @@
import math
import random

import numpy as np
import torch
from torch.nn.utils.rnn import pad_sequence

import tokenizer


def generate_random_data(n):
    """Generate a small toy dataset of token sequences with scalar labels."""
    SOS_token = np.array([2])
    EOS_token = np.array([3])
    length = 8

    data = []

    # all-ones sequence -> label 1
    for i in range(n // 3):
        X = np.concatenate((SOS_token, np.ones(length), EOS_token))
        y = 1
        data.append([X, y])

    # all-zeros sequence -> label 0
    for i in range(n // 3):
        X = np.concatenate((SOS_token, np.zeros(length), EOS_token))
        y = 0
        data.append([X, y])

    # alternating 1,0,1,0,... sequence -> label 1
    for i in range(n // 3):
        X = np.zeros(length)
        start = random.randint(0, 1)
        X[start::2] = 1
        X = np.concatenate((SOS_token, X, EOS_token))
        y = 1
        data.append([X, y])

    np.random.shuffle(data)

    return data


def batchify_data(data, batch_size=64, padding=True, padding_token=30):
    """Group samples into batches, padding each sequence to the longest one in its batch."""
    batches = []
    for idx in range(0, len(data), batch_size):
        # Drop the last chunk if it is smaller than batch_size
        if idx + batch_size < len(data):
            if padding:
                # Find the longest sequence in the batch ...
                max_batch_length = 0
                for seq in data[idx : idx + batch_size]:
                    if len(seq[0]) > max_batch_length:
                        max_batch_length = len(seq[0])
                # ... and append padding tokens to every sequence until it reaches that length
                for seq_idx in range(batch_size):
                    remaining_length = max_batch_length - len(data[idx + seq_idx][0])
                    data[idx + seq_idx][0] = np.concatenate(
                        [
                            data[idx + seq_idx][0],
                            np.array([padding_token] * remaining_length, dtype=np.int64),
                        ],
                        axis=0,
                    )
            # dtype=object keeps each (sequence, label) pair in a (batch_size, 2) object array
            batches.append(np.array(data[idx : idx + batch_size], dtype=object))

    print(f"{len(batches)} batches of size {batch_size}")
    return batches


def generate_dataset(smiles, y, token, vocab):
    """Tokenize SMILES strings and pair them with their targets, skipping NaN targets."""
    data = []
    smiles = [tokenizer.smiles_to_ohe(smi, token, vocab) for smi in smiles]
    for smi, tar in zip(smiles, y):
        if not math.isnan(tar):
            smi = np.array(smi)
            data.append([smi, tar])

    np.random.shuffle(data)
    return data


def add_padding(data):
    """Pad a list of variable-length sequences to a common length."""
    data = pad_sequence(
        sequences=[torch.tensor(seq) for seq in data],
        batch_first=True,
        padding_value=0,
    )
    return data
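For orientation, a minimal sketch of how these helpers fit together. The SMILES list and random targets are purely illustrative (in the talktorial they come from the QM9 csv loaded in main.py), and the object-array batch layout follows the padding logic above:

    import numpy as np
    import data
    import tokenizer

    smiles = ["CCO", "CCN", "c1ccccc1", "CC(=O)O"] * 64   # illustrative input molecules
    targets = np.random.rand(len(smiles))                  # illustrative regression targets

    token = tokenizer.SmilesTokenizer()
    vocab = tokenizer.build_vocab(smiles, token, max_vocab_size=30)

    dataset = data.generate_dataset(smiles, targets, token, vocab)
    batches = data.batchify_data(dataset, batch_size=64)   # list of (64, 2) object arrays
    X, y = batches[0][:, 0], batches[0][:, 1]              # padded token-id sequences, targets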
@@ -0,0 +1,42 @@
import torch
import torch.nn as nn
import math


class PositionalEncoding(nn.Module):
    def __init__(self, dim_model, dropout_p, max_len):
        super().__init__()
        # Modified version of: https://pytorch.org/tutorials/beginner/transformer_tutorial.html
        # max_len determines how far a position can influence a token (window)

        self.dropout = nn.Dropout(dropout_p)

        # Encoding, computed from the sinusoidal formula:
        # PE(pos, 2i)     = sin(pos / 10000^(2i / dim_model))
        # PE(pos, 2i + 1) = cos(pos / 10000^(2i / dim_model))
        pos_encoding = torch.zeros(max_len, 1, dim_model)
        factor = -math.log(10000.0) / dim_model  # constant factor, kept outside the loop
        for pos in range(max_len):  # position of the token in the sequence
            for i in range(0, dim_model, 2):  # index within the embedding dimension
                div_term = math.exp(i * factor)
                pos_encoding[pos, 0, i] = math.sin(pos * div_term)
                pos_encoding[pos, 0, i + 1] = math.cos(pos * div_term)

        # Save as buffer (like a parameter, but without gradients)
        self.register_buffer("pos_encoding", pos_encoding)

    def forward(self, token_embedding: torch.Tensor) -> torch.Tensor:
        # Residual connection + positional encoding
        pos_enc = self.pos_encoding[: token_embedding.size(0), :]
        token_embedding = token_embedding + pos_enc
        return self.dropout(token_embedding)
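The nested loop can equivalently be written in vectorized form. A minimal sketch of that construction (same formula, same (max_len, 1, dim_model) buffer shape; the helper name sinusoidal_encoding is illustrative):

    import math
    import torch

    def sinusoidal_encoding(max_len: int, dim_model: int) -> torch.Tensor:
        # Vectorized equivalent of the double loop in PositionalEncoding.__init__
        positions = torch.arange(max_len, dtype=torch.float).unsqueeze(1)  # (max_len, 1)
        div_term = torch.exp(
            torch.arange(0, dim_model, 2).float() * (-math.log(10000.0) / dim_model)
        )
        pos_encoding = torch.zeros(max_len, 1, dim_model)
        pos_encoding[:, 0, 0::2] = torch.sin(positions * div_term)  # even indices
        pos_encoding[:, 0, 1::2] = torch.cos(positions * div_term)  # odd indices
        return pos_encoding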
@@ -0,0 +1,47 @@
import torch
import numpy as np


def predict(model, input_sequence):
    """
    Method from "A detailed guide to PyTorch's nn.Transformer() module." by
    Daniel Melchor: https://medium.com/@danielmelchor/a-detailed-guide-to-pytorchs-nn-transformer-module-c80afbc9ffb1
    """
    model.eval()
    pred = model(input_sequence)
    return pred


def test_loop(model, loss_fn, dataloader, device):
    """Evaluate the model on a dataloader; return mean loss, predictions, and ground truth."""
    total_loss = 0
    prediction = np.empty(0)
    ground_truth = np.empty(0)
    model.eval()

    for batch in dataloader:
        with torch.no_grad():
            X, y = batch[:, 0], batch[:, 1]
            X = np.array([arr.astype(np.int64) for arr in X])
            X, y = torch.tensor(X).to(device), torch.tensor(y.astype(np.float32)).to(device)

            pred = model(X)
            loss = loss_fn(pred, y.float().unsqueeze(1))

            total_loss += loss.detach().item()
            prediction = np.concatenate((prediction, pred.cpu().detach().numpy()[:, 0]))
            ground_truth = np.concatenate((ground_truth, y.cpu().detach().numpy()))

    return total_loss / len(dataloader), prediction, ground_truth
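A minimal sketch of how predict could be used on a single molecule after training. It assumes the model, token, vocab, device, y_mean, and y_std objects created in main.py, and that the model accepts a (batch, sequence_length) tensor of token ids as in test_loop; the de-normalization step mirrors how y is standardized in main.py:

    import torch
    import tokenizer
    import inference

    smi = "CCO"  # illustrative molecule
    token_ids = tokenizer.smiles_to_ohe(smi, token, vocab)
    X = torch.tensor([token_ids], dtype=torch.int64).to(device)  # shape (1, n_tokens)
    with torch.no_grad():
        pred = inference.predict(model, X)
    mu_hat = pred.item() * y_std + y_mean  # undo the normalization applied in main.py
    print(f"Predicted mu for {smi}: {mu_hat:.3f}")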

70 changes: 70 additions & 0 deletions teachopencadd/talktorials/T039_molecular_transformers/code/main.py
@@ -0,0 +1,70 @@
import os
from pathlib import Path

import pandas as pd
import torch
import torch.nn as nn

import data
import inference
import plot
import tokenizer
import training
import transformer


HERE = Path(__file__).parent.resolve()
DATA = HERE / "data"

# load the dataset
df = pd.read_csv(os.path.join(DATA, "qm9.csv.gz"), compression="gzip")
df = df.sample(frac=1).reset_index(drop=True)

smiles = df["smiles"].tolist()
y = df["mu"]



sample_size = len(y)  # use the full dataset; reduce (e.g., to 50000) for faster runs
train_index = int(sample_size * 0.8)
test_index = train_index + int(sample_size * 0.1)

# normalize data
y_mean = y[:train_index].mean()
y_std = y[:train_index].std()
y = (y - y_mean) / y_std


max_vocab_size = 30
token = tokenizer.SmilesTokenizer()
vocab = tokenizer.build_vocab(smiles[:sample_size], token, max_vocab_size)
vocab_size = len(vocab)

train_data = data.generate_dataset(smiles[:train_index], y[:train_index], token, vocab)
val_data = data.generate_dataset(smiles[train_index:test_index], y[train_index:test_index], token, vocab)
test_data = data.generate_dataset(smiles[test_index:sample_size], y[test_index:sample_size], token, vocab)

train_dataloader = data.batchify_data(train_data)
val_dataloader = data.batchify_data(val_data)
test_dataloader = data.batchify_data(test_data)


# Prefer CUDA, then Apple MPS, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available() and torch.backends.mps.is_built():
    device = "mps"
else:
    device = "cpu"
print(device)

model = transformer.Transformer(
num_tokens=vocab_size, dim_model=100, num_heads=4, num_encoder_layers=3, dropout_p=0.2
).to(device)
opt = torch.optim.SGD(model.parameters(), lr=0.01)
# loss_fn = nn.CrossEntropyLoss()
loss_fn = nn.MSELoss()

train_loss_list, val_loss_list = training.fit(model, opt, loss_fn, train_dataloader, val_dataloader, 50, device)

plot.plot_loss(train_loss_list, val_loss_list)

test_loss, predictions, ground_truth = inference.test_loop(model, loss_fn, test_dataloader, device)
print(f"Test loss: {test_loss:.4f}")
plot.plot_targets(predictions, ground_truth)
44 changes: 44 additions & 0 deletions teachopencadd/talktorials/T039_molecular_transformers/code/plot.py
@@ -0,0 +1,44 @@
import matplotlib.pyplot as plt
from matplotlib.ticker import MaxNLocator


def plot_loss(train_loss, val_loss):
    """Plot the training and validation loss for each epoch

    Args:
        train_loss (array): training losses for each epoch
        val_loss (array): validation losses for each epoch
    """
    plt.plot(train_loss, label="Training loss")
    plt.plot(val_loss, label="Validation loss")
    plt.legend()
    plt.ylabel("loss")
    plt.xlabel("epoch")
    plt.title("Model Loss")
    plt.gca().xaxis.set_major_locator(MaxNLocator(integer=True))
    # plt.show()
    plt.savefig("plots/loss_4.png")




def plot_targets(pred, ground_truth):
    """Plot predicted vs. ground truth values in a scatter plot

    Args:
        pred (array): predicted values
        ground_truth (array): ground truth values
    """
    f, ax = plt.subplots(figsize=(6, 6))
    ax.scatter(pred, ground_truth, s=0.5)
    plt.xlim(-2, 7)
    plt.ylim(-2, 7)
    ax.axline((1, 1), slope=1)  # identity line
    plt.xlabel("Predicted value")
    plt.ylabel("Ground truth")
    plt.title("Ground truth vs. prediction")
    # plt.show()
    plt.savefig("plots/scatter_4.png")


@@ -0,0 +1,81 @@
import re
from collections import Counter


class SmilesTokenizer(object):
    """
    A simple regex-based tokenizer adapted from the deepchem smiles_tokenizer package.
    The SMILES regex pattern used for tokenization was designed by Schwaller et al., ACS Cent. Sci. 5 (2019).
    """

    def __init__(self):
        self.regex_pattern = (
            r"(\[[^\]]+]|Br?|Cl?|N|O|S|P|F|I|b|c|n|o|s|p|\(|\)|\."
            r"|=|#|-|\+|\\|\/|:|~|@|\?|>>?|\*|\$|\%[0-9]{2}|[0-9])"
        )
        self.regex = re.compile(self.regex_pattern)

    def tokenize(self, smiles):
        """
        Tokenize a SMILES string.

        Parameters
        ----------
        smiles : str
            Input SMILES string.

        Returns
        -------
        List[str]
            A list of tokens.
        """
        tokens = self.regex.findall(smiles)
        return tokens

def build_vocab(smiles_list, tokenizer, max_vocab_size):
    """
    Build a vocabulary of the N=max_vocab_size most common tokens from a list of SMILES strings.

    Parameters
    ----------
    smiles_list : List[str]
        List of SMILES strings.
    tokenizer : SmilesTokenizer
    max_vocab_size : int
        Maximum size of the vocabulary.

    Returns
    -------
    Dict[str, int]
        A dictionary that maps each token to its index in the vocabulary.
    """
    tokenized_smiles = [tokenizer.tokenize(s) for s in smiles_list]
    token_counter = Counter(c for s in tokenized_smiles for c in s)
    tokens = [token for token, _ in token_counter.most_common(max_vocab_size)]
    vocab = {token: idx for idx, token in enumerate(tokens)}
    return vocab


def smiles_to_ohe(smiles, tokenizer, vocab):
    """
    Transform a SMILES string into a list of token indices.

    Parameters
    ----------
    smiles : str
        Input SMILES string.
    tokenizer : SmilesTokenizer
    vocab : Dict[str, int]
        A dictionary that maps each token to its index in the vocabulary.

    Returns
    -------
    List[int]
        The vocabulary index of each token in the input string; tokens that are
        not in the vocabulary are mapped to the last vocabulary index.
    """
    unknown_token_id = len(vocab) - 1
    token_ids = [vocab.get(token, unknown_token_id) for token in tokenizer.tokenize(smiles)]
    # A true one-hot encoding would be: torch.eye(len(vocab))[token_ids]
    return token_ids
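A minimal usage sketch of this module; the SMILES strings and vocabulary size are illustrative:

    import tokenizer

    smiles_list = ["CCO", "c1ccccc1", "CC(=O)O"]          # illustrative molecules
    token = tokenizer.SmilesTokenizer()
    print(token.tokenize("CC(=O)O"))                       # ['C', 'C', '(', '=', 'O', ')', 'O']
    vocab = tokenizer.build_vocab(smiles_list, token, max_vocab_size=30)
    print(vocab)                                           # token -> index, most frequent first
    print(tokenizer.smiles_to_ohe("CCO", token, vocab))    # e.g. [1, 1, 2]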