diff --git a/clr_parameters_finder.py b/clr_parameters_finder.py index 80d794e..d786ecd 100644 --- a/clr_parameters_finder.py +++ b/clr_parameters_finder.py @@ -1,4 +1,4 @@ -''' +""" This script allows to find the optimal parameters for a learning rate scheduling: - min_lr @@ -20,7 +20,7 @@ reference: https://towardsdatascience.com/adaptive-and-cyclical-learning-rates-using-pytorch-2bf904d18dee -''' +""" import math import os @@ -51,17 +51,21 @@ def run(args): batch_size = args.batch_size - training_params = {"batch_size": batch_size, - "shuffle": True, - "num_workers": args.workers} + training_params = { + "batch_size": batch_size, + "shuffle": True, + "num_workers": args.workers, + } texts, labels, number_of_classes, sample_weights = load_data(args) - train_texts, _, train_labels, _, _, _ = train_test_split(texts, - labels, - sample_weights, - test_size=args.validation_split, - random_state=42, - stratify=labels) + train_texts, _, train_labels, _, _, _ = train_test_split( + texts, + labels, + sample_weights, + test_size=args.validation_split, + random_state=42, + stratify=labels, + ) training_set = MyDataset(train_texts, train_labels, args) training_generator = DataLoader(training_set, **training_params) @@ -74,31 +78,31 @@ def run(args): criterion = nn.CrossEntropyLoss() - if args.optimizer == 'sgd': - optimizer = torch.optim.SGD( - model.parameters(), lr=args.start_lr, momentum=0.9 - ) - elif args.optimizer == 'adam': - optimizer = torch.optim.Adam( - model.parameters(), lr=args.start_lr - ) + if args.optimizer == "sgd": + optimizer = torch.optim.SGD(model.parameters(), lr=args.start_lr, momentum=0.9) + elif args.optimizer == "adam": + optimizer = torch.optim.Adam(model.parameters(), lr=args.start_lr) start_lr = args.start_lr end_lr = args.end_lr lr_find_epochs = args.epochs smoothing = args.smoothing - def lr_lambda(x): return math.exp( - x * math.log(end_lr / start_lr) / (lr_find_epochs * len(training_generator))) + def lr_lambda(x): + return math.exp( + x * math.log(end_lr / start_lr) / (lr_find_epochs * len(training_generator)) + ) + scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda) losses = [] learning_rates = [] for epoch in range(lr_find_epochs): - print(f'[epoch {epoch + 1} / {lr_find_epochs}]') - progress_bar = tqdm(enumerate(training_generator), - total=len(training_generator)) + print(f"[epoch {epoch + 1} / {lr_find_epochs}]") + progress_bar = tqdm( + enumerate(training_generator), total=len(training_generator) + ) for iter, batch in progress_bar: features, labels = batch if torch.cuda.is_available(): @@ -124,41 +128,42 @@ def lr_lambda(x): return math.exp( losses.append(loss) plt.semilogx(learning_rates, losses) - plt.savefig('./plots/losses_vs_lr.png') + plt.savefig("./plots/losses_vs_lr.png") if __name__ == "__main__": - parser = argparse.ArgumentParser( - 'Character Based CNN for text classification') - parser.add_argument('--data_path', type=str, - default='./data/train.csv') - parser.add_argument('--validation_split', type=float, default=0.2) - parser.add_argument('--label_column', type=str, default='Sentiment') - parser.add_argument('--text_column', type=str, default='SentimentText') - parser.add_argument('--max_rows', type=int, default=None) - parser.add_argument('--chunksize', type=int, default=50000) - parser.add_argument('--encoding', type=str, default='utf-8') - parser.add_argument('--sep', type=str, default=',') - parser.add_argument('--steps', nargs='+', default=['lower']) - parser.add_argument('--group_labels', type=str, - default=None, choices=[None, 'binarize']) - parser.add_argument('--ratio', type=float, default=1) - - parser.add_argument('--alphabet', type=str, - default='abcdefghijklmnopqrstuvwxyz0123456789-,;.!?:\'"\\/|_@#$%^&*~`+-=<>()[]{}') - parser.add_argument('--number_of_characters', type=int, default=69) - parser.add_argument('--extra_characters', type=str, default='') - parser.add_argument('--max_length', type=int, default=150) - parser.add_argument('--batch_size', type=int, default=128) - parser.add_argument('--optimizer', type=str, - choices=['adam', 'sgd'], default='sgd') - parser.add_argument('--learning_rate', type=float, default=0.01) - parser.add_argument('--workers', type=int, default=1) - - parser.add_argument('--start_lr', type=float, default=1e-5) - parser.add_argument('--end_lr', type=float, default=1e-2) - parser.add_argument('--smoothing', type=float, default=0.05) - parser.add_argument('--epochs', type=int, default=1) + parser = argparse.ArgumentParser("Character Based CNN for text classification") + parser.add_argument("--data_path", type=str, default="./data/train.csv") + parser.add_argument("--validation_split", type=float, default=0.2) + parser.add_argument("--label_column", type=str, default="Sentiment") + parser.add_argument("--text_column", type=str, default="SentimentText") + parser.add_argument("--max_rows", type=int, default=None) + parser.add_argument("--chunksize", type=int, default=50000) + parser.add_argument("--encoding", type=str, default="utf-8") + parser.add_argument("--sep", type=str, default=",") + parser.add_argument("--steps", nargs="+", default=["lower"]) + parser.add_argument( + "--group_labels", type=str, default=None, choices=[None, "binarize"] + ) + parser.add_argument("--ratio", type=float, default=1) + + parser.add_argument( + "--alphabet", + type=str, + default="abcdefghijklmnopqrstuvwxyz0123456789-,;.!?:'\"\\/|_@#$%^&*~`+-=<>()[]{}", + ) + parser.add_argument("--number_of_characters", type=int, default=69) + parser.add_argument("--extra_characters", type=str, default="") + parser.add_argument("--max_length", type=int, default=150) + parser.add_argument("--batch_size", type=int, default=128) + parser.add_argument("--optimizer", type=str, choices=["adam", "sgd"], default="sgd") + parser.add_argument("--learning_rate", type=float, default=0.01) + parser.add_argument("--workers", type=int, default=1) + + parser.add_argument("--start_lr", type=float, default=1e-5) + parser.add_argument("--end_lr", type=float, default=1e-2) + parser.add_argument("--smoothing", type=float, default=0.05) + parser.add_argument("--epochs", type=int, default=1) args = parser.parse_args() run(args) diff --git a/config.json b/config.json index 9ece17a..a7d6d7b 100644 --- a/config.json +++ b/config.json @@ -1,71 +1,49 @@ { - "alphabet": { - "en": { - "lower": { - "alphabet": "abcdefghijklmnopqrstuvwxyz0123456789-,;.!?:'\"/\\|_@#$%^&*~`+-=<>()[]{}", - "number_of_characters": 69 - }, - "both": { - "alphabet": "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-,;.!?:'\"/\\|_@#$%^&*~`+-=<>()[]{}", - "number_of_characters": 95 - } - } - }, + "alphabet": { + "en": { + "lower": { + "alphabet": "abcdefghijklmnopqrstuvwxyz0123456789-,;.!?:'\"/\\|_@#$%^&*~`+-=<>()[]{}", + "number_of_characters": 69 + }, + "both": { + "alphabet": "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-,;.!?:'\"/\\|_@#$%^&*~`+-=<>()[]{}", + "number_of_characters": 95 + } + } + }, - "model_parameters": { - "small": { - "conv": [ - [ - 256, - 7, - 3 - ], - [ - 256, - 7, - 3 - ], - [ - 256, - 3, - -1 - ], - [ - 256, - 3, - -1 - ], - [ - 256, - 3, - -1 - ], - [ - 256, - 3, - 3 - ] - ], - "fc": [ - 1024, - 1024 - ] - } - }, - "data": { - "text_column": "SentimentText", - "label_column": "Sentiment", - "max_length": 150, - "num_of_classes": 2, - "encoding": null, - "chunksize": 50000, - "max_rows": 100000, - "preprocessing_steps": ["lower", "remove_hashtags", "remove_urls", "remove_user_mentions"] - }, - "training": { - "batch_size": 128, - "learning_rate": 0.01, - "epochs": 10, - "optimizer": "sgd" + "model_parameters": { + "small": { + "conv": [ + [256, 7, 3], + [256, 7, 3], + [256, 3, -1], + [256, 3, -1], + [256, 3, -1], + [256, 3, 3] + ], + "fc": [1024, 1024] } -} \ No newline at end of file + }, + "data": { + "text_column": "SentimentText", + "label_column": "Sentiment", + "max_length": 150, + "num_of_classes": 2, + "encoding": null, + "chunksize": 50000, + "max_rows": 100000, + "preprocessing_steps": [ + "lower", + "remove_hashtags", + "remove_urls", + "remove_user_mentions" + ] + }, + "training": { + "batch_size": 128, + "learning_rate": 0.01, + "epochs": 10, + "optimizer": "sgd" + } +} diff --git a/predict.py b/predict.py index a0eabbb..763f37e 100644 --- a/predict.py +++ b/predict.py @@ -6,18 +6,19 @@ use_cuda = torch.cuda.is_available() + def predict(args): model = CharacterLevelCNN(args, args.number_of_classes) state = torch.load(args.model) model.load_state_dict(state) model.eval() - + processed_input = utils.preprocess_input(args) processed_input = torch.tensor(processed_input) processed_input = processed_input.unsqueeze(0) if use_cuda: - processed_input = processed_input.to('cuda') - model = model.to('cuda') + processed_input = processed_input.to("cuda") + model = model.to("cuda") prediction = model(processed_input) probabilities = F.softmax(prediction, dim=1) probabilities = probabilities.detach().cpu().numpy() @@ -26,22 +27,25 @@ def predict(args): if __name__ == "__main__": parser = argparse.ArgumentParser( - 'Testing a pretrained Character Based CNN for text classification') - parser.add_argument('--model', type=str, help='path for pre-trained model') - parser.add_argument('--text', type=str, - default='I love pizza!', help='text string') - parser.add_argument('--steps', nargs="+", default=['lower']) + "Testing a pretrained Character Based CNN for text classification" + ) + parser.add_argument("--model", type=str, help="path for pre-trained model") + parser.add_argument("--text", type=str, default="I love pizza!", help="text string") + parser.add_argument("--steps", nargs="+", default=["lower"]) # arguments needed for the predicition - parser.add_argument('--alphabet', type=str, - default="abcdefghijklmnopqrstuvwxyz0123456789-,;.!?:'\"/\\|_@#$%^&*~`+ =<>()[]{}") - parser.add_argument('--number_of_characters', type=int, default=69) - parser.add_argument('--extra_characters', type=str, default="éàèùâêîôûçëïü") - parser.add_argument('--max_length', type=int, default=300) - parser.add_argument('--number_of_classes', type=int, default=2) + parser.add_argument( + "--alphabet", + type=str, + default="abcdefghijklmnopqrstuvwxyz0123456789-,;.!?:'\"/\\|_@#$%^&*~`+ =<>()[]{}", + ) + parser.add_argument("--number_of_characters", type=int, default=69) + parser.add_argument("--extra_characters", type=str, default="éàèùâêîôûçëïü") + parser.add_argument("--max_length", type=int, default=300) + parser.add_argument("--number_of_classes", type=int, default=2) args = parser.parse_args() prediction = predict(args) - - print('input : {}'.format(args.text)) - print('prediction : {}'.format(prediction)) + + print("input : {}".format(args.text)) + print("prediction : {}".format(prediction)) diff --git a/src/data_loader.py b/src/data_loader.py index b51caec..9aa949f 100644 --- a/src/data_loader.py +++ b/src/data_loader.py @@ -21,12 +21,14 @@ def get_sample_weights(labels): def load_data(args): # chunk your dataframes in small portions - chunks = pd.read_csv(args.data_path, - usecols=[args.text_column, args.label_column], - chunksize=args.chunksize, - encoding=args.encoding, - nrows=args.max_rows, - sep=args.sep) + chunks = pd.read_csv( + args.data_path, + usecols=[args.text_column, args.label_column], + chunksize=args.chunksize, + encoding=args.encoding, + nrows=args.max_rows, + sep=args.sep, + ) texts = [] labels = [] for df_chunk in tqdm(chunks): @@ -34,9 +36,10 @@ def load_data(args): aux_df = aux_df.sample(frac=1) aux_df = aux_df[~aux_df[args.text_column].isnull()] aux_df = aux_df[(aux_df[args.text_column].map(len) > 1)] - aux_df['processed_text'] = (aux_df[args.text_column] - .map(lambda text: utils.process_text(args.steps, text))) - texts += aux_df['processed_text'].tolist() + aux_df["processed_text"] = aux_df[args.text_column].map( + lambda text: utils.process_text(args.steps, text) + ) + texts += aux_df["processed_text"].tolist() labels += aux_df[args.label_column].tolist() if bool(args.group_labels): @@ -45,19 +48,20 @@ def load_data(args): label_ignored = args.label_ignored - clean_data = [(text, label) for (text, label) in zip( - texts, labels) if label not in [label_ignored]] + clean_data = [ + (text, label) + for (text, label) in zip(texts, labels) + if label not in [label_ignored] + ] texts = [text for (text, label) in clean_data] labels = [label for (text, label) in clean_data] - labels = list( - map(lambda l: {1: 0, 2: 0, 4: 1, 5: 1}[l], labels)) + labels = list(map(lambda l: {1: 0, 2: 0, 4: 1, 5: 1}[l], labels)) else: - labels = list( - map(lambda l: {1: 0, 2: 0, 3: 1, 4: 2, 5: 2}[l], labels)) - + labels = list(map(lambda l: {1: 0, 2: 0, 3: 1, 4: 2, 5: 2}[l], labels)) + if bool(args.balance): counter = Counter(labels) @@ -68,9 +72,13 @@ def load_data(args): balanced_labels = [] balanced_texts = [] - for key in keys: - balanced_texts += [text for text, label in zip(texts, labels) if label == key][:int(args.ratio * count_minority)] - balanced_labels += [label for text, label in zip(texts, labels) if label == key][:int(args.ratio * count_minority)] + for key in keys: + balanced_texts += [ + text for text, label in zip(texts, labels) if label == key + ][: int(args.ratio * count_minority)] + balanced_labels += [ + label for text, label in zip(texts, labels) if label == key + ][: int(args.ratio * count_minority)] texts = balanced_texts labels = balanced_labels @@ -78,8 +86,9 @@ def load_data(args): number_of_classes = len(set(labels)) print( - f'data loaded successfully with {len(texts)} rows and {number_of_classes} labels') - print('Distribution of the classes', Counter(labels)) + f"data loaded successfully with {len(texts)} rows and {number_of_classes} labels" + ) + print("Distribution of the classes", Counter(labels)) sample_weights = get_sample_weights(labels) @@ -93,8 +102,9 @@ def __init__(self, texts, labels, args): self.length = len(self.texts) self.vocabulary = args.alphabet + args.extra_characters - self.number_of_characters = args.number_of_characters + \ - len(args.extra_characters) + self.number_of_characters = args.number_of_characters + len( + args.extra_characters + ) self.max_length = args.max_length self.preprocessing_steps = args.steps self.identity_mat = np.identity(self.number_of_characters) @@ -105,16 +115,30 @@ def __len__(self): def __getitem__(self, index): raw_text = self.texts[index] - data = np.array([self.identity_mat[self.vocabulary.index(i)] for i in list(raw_text)[::-1] if i in self.vocabulary], - dtype=np.float32) + data = np.array( + [ + self.identity_mat[self.vocabulary.index(i)] + for i in list(raw_text)[::-1] + if i in self.vocabulary + ], + dtype=np.float32, + ) if len(data) > self.max_length: - data = data[:self.max_length] + data = data[: self.max_length] elif 0 < len(data) < self.max_length: data = np.concatenate( - (data, np.zeros((self.max_length - len(data), self.number_of_characters), dtype=np.float32))) + ( + data, + np.zeros( + (self.max_length - len(data), self.number_of_characters), + dtype=np.float32, + ), + ) + ) elif len(data) == 0: data = np.zeros( - (self.max_length, self.number_of_characters), dtype=np.float32) + (self.max_length, self.number_of_characters), dtype=np.float32 + ) label = self.labels[index] data = torch.Tensor(data) diff --git a/src/focal_loss.py b/src/focal_loss.py index ce3b49e..65d4f1a 100644 --- a/src/focal_loss.py +++ b/src/focal_loss.py @@ -3,13 +3,14 @@ import torch.nn.functional as F import torch.nn as nn + class FocalLoss(nn.Module): def __init__(self, gamma=0, alpha=None, size_average=True): super(FocalLoss, self).__init__() self.gamma = gamma self.alpha = alpha if isinstance(alpha, (float, int)): - self.alpha = torch.Tensor([alpha, 1-alpha]) + self.alpha = torch.Tensor([alpha, 1 - alpha]) if isinstance(alpha, list): self.alpha = torch.Tensor(alpha) self.size_average = size_average @@ -18,8 +19,8 @@ def forward(self, input, target): if input.dim() > 2: # N,C,H,W => N,C,H*W input = input.view(input.size(0), input.size(1), -1) - input = input.transpose(1, 2) # N,C,H*W => N,H*W,C - input = input.contiguous().view(-1, input.size(2)) # N,H*W,C => N*H*W,C + input = input.transpose(1, 2) # N,C,H*W => N,H*W,C + input = input.contiguous().view(-1, input.size(2)) # N,H*W,C => N*H*W,C target = target.view(-1, 1) logpt = F.log_softmax(input, dim=1) @@ -33,7 +34,7 @@ def forward(self, input, target): at = self.alpha.gather(0, target.data.view(-1)) logpt = logpt * Variable(at) - loss = -1 * (1-pt)**self.gamma * logpt + loss = -1 * (1 - pt) ** self.gamma * logpt if self.size_average: return loss.mean() else: diff --git a/src/model.py b/src/model.py index a765045..0897ab8 100644 --- a/src/model.py +++ b/src/model.py @@ -11,56 +11,53 @@ def __init__(self, args, number_of_classes): self.dropout_input = nn.Dropout2d(args.dropout_input) - self.conv1 = nn.Sequential(nn.Conv1d(args.number_of_characters + len(args.extra_characters), - 256, - kernel_size=7, - padding=0), - nn.ReLU(), - nn.MaxPool1d(3) - ) - - self.conv2 = nn.Sequential(nn.Conv1d(256, 256, kernel_size=7, padding=0), - nn.ReLU(), - nn.MaxPool1d(3) - ) - - self.conv3 = nn.Sequential(nn.Conv1d(256, 256, kernel_size=3, padding=0), - nn.ReLU() - ) - - self.conv4 = nn.Sequential(nn.Conv1d(256, 256, kernel_size=3, padding=0), - nn.ReLU() - ) - - self.conv5 = nn.Sequential(nn.Conv1d(256, 256, kernel_size=3, padding=0), - nn.ReLU() - ) - - self.conv6 = nn.Sequential(nn.Conv1d(256, 256, kernel_size=3, padding=0), - nn.ReLU(), - nn.MaxPool1d(3) - ) + self.conv1 = nn.Sequential( + nn.Conv1d( + args.number_of_characters + len(args.extra_characters), + 256, + kernel_size=7, + padding=0, + ), + nn.ReLU(), + nn.MaxPool1d(3), + ) + + self.conv2 = nn.Sequential( + nn.Conv1d(256, 256, kernel_size=7, padding=0), nn.ReLU(), nn.MaxPool1d(3) + ) + + self.conv3 = nn.Sequential( + nn.Conv1d(256, 256, kernel_size=3, padding=0), nn.ReLU() + ) + + self.conv4 = nn.Sequential( + nn.Conv1d(256, 256, kernel_size=3, padding=0), nn.ReLU() + ) + + self.conv5 = nn.Sequential( + nn.Conv1d(256, 256, kernel_size=3, padding=0), nn.ReLU() + ) + + self.conv6 = nn.Sequential( + nn.Conv1d(256, 256, kernel_size=3, padding=0), nn.ReLU(), nn.MaxPool1d(3) + ) # compute the output shape after forwarding an input to the conv layers - input_shape = (128, - args.max_length, - args.number_of_characters + len(args.extra_characters)) + input_shape = ( + 128, + args.max_length, + args.number_of_characters + len(args.extra_characters), + ) self.output_dimension = self._get_conv_output(input_shape) # define linear layers self.fc1 = nn.Sequential( - nn.Linear(self.output_dimension, 1024), - nn.ReLU(), - nn.Dropout(0.5) + nn.Linear(self.output_dimension, 1024), nn.ReLU(), nn.Dropout(0.5) ) - self.fc2 = nn.Sequential( - nn.Linear(1024, 1024), - nn.ReLU(), - nn.Dropout(0.5) - ) + self.fc2 = nn.Sequential(nn.Linear(1024, 1024), nn.ReLU(), nn.Dropout(0.5)) self.fc3 = nn.Linear(1024, number_of_classes) @@ -75,7 +72,6 @@ def _create_weights(self, mean=0.0, std=0.05): if isinstance(module, nn.Conv1d) or isinstance(module, nn.Linear): module.weight.data.normal_(mean, std) - def _get_conv_output(self, shape): x = torch.rand(shape) x = x.transpose(1, 2) @@ -105,4 +101,3 @@ def forward(self, x): x = self.fc2(x) x = self.fc3(x) return x - diff --git a/src/utils.py b/src/utils.py index 04b3603..e160324 100644 --- a/src/utils.py +++ b/src/utils.py @@ -12,25 +12,25 @@ def lower(text): def remove_hashtags(text): - clean_text = re.sub(r'#[A-Za-z0-9_]+', "", text) + clean_text = re.sub(r"#[A-Za-z0-9_]+", "", text) return clean_text def remove_user_mentions(text): - clean_text = re.sub(r'@[A-Za-z0-9_]+', "", text) + clean_text = re.sub(r"@[A-Za-z0-9_]+", "", text) return clean_text def remove_urls(text): - clean_text = re.sub(r'^https?:\/\/.*[\r\n]*', '', text, flags=re.MULTILINE) + clean_text = re.sub(r"^https?:\/\/.*[\r\n]*", "", text, flags=re.MULTILINE) return clean_text preprocessing_setps = { - 'remove_hashtags': remove_hashtags, - 'remove_urls': remove_urls, - 'remove_user_mentions': remove_user_mentions, - 'lower': lower + "remove_hashtags": remove_hashtags, + "remove_urls": remove_urls, + "remove_user_mentions": remove_user_mentions, + "lower": lower, } @@ -40,16 +40,17 @@ def process_text(steps, text): text = preprocessing_setps[step](text) return text + # metrics // model evaluations def get_evaluation(y_true, y_prob, list_metrics): y_pred = np.argmax(y_prob, -1) output = {} - if 'accuracy' in list_metrics: - output['accuracy'] = metrics.accuracy_score(y_true, y_pred) - if 'f1' in list_metrics: - output['f1'] = metrics.f1_score(y_true, y_pred, average='weighted') + if "accuracy" in list_metrics: + output["accuracy"] = metrics.accuracy_score(y_true, y_pred) + if "f1" in list_metrics: + output["f1"] = metrics.f1_score(y_true, y_pred, average="weighted") return output @@ -88,6 +89,7 @@ def accuracy(output, target, topk=(1,)): res.append(correct_k.mul_(100.0 / batch_size)) return res + # preprocess input for prediction @@ -97,35 +99,50 @@ def preprocess_input(args): for step in steps: raw_text = preprocessing_setps[step](raw_text) - number_of_characters = args.number_of_characters + \ - len(args.extra_characters) + number_of_characters = args.number_of_characters + len(args.extra_characters) identity_mat = np.identity(number_of_characters) vocabulary = list(args.alphabet) + list(args.extra_characters) max_length = args.max_length - processed_output = np.array([identity_mat[vocabulary.index(i)] for i in list( - raw_text[::-1]) if i in vocabulary], dtype=np.float32) + processed_output = np.array( + [ + identity_mat[vocabulary.index(i)] + for i in list(raw_text[::-1]) + if i in vocabulary + ], + dtype=np.float32, + ) if len(processed_output) > max_length: processed_output = processed_output[:max_length] elif 0 < len(processed_output) < max_length: - processed_output = np.concatenate((processed_output, np.zeros( - (max_length - len(processed_output), number_of_characters), dtype=np.float32))) + processed_output = np.concatenate( + ( + processed_output, + np.zeros( + (max_length - len(processed_output), number_of_characters), + dtype=np.float32, + ), + ) + ) elif len(processed_output) == 0: processed_output = np.zeros( - (max_length, number_of_characters), dtype=np.float32) + (max_length, number_of_characters), dtype=np.float32 + ) return processed_output # cyclic learning rate scheduling + def cyclical_lr(stepsize, min_lr=1.7e-3, max_lr=1e-2): # Scaler: we can adapt this if we do not want the triangular CLR - def scaler(x): return 1. + def scaler(x): + return 1.0 # Lambda function to calculate the LR - def lr_lambda(it): return min_lr + (max_lr - - min_lr) * relative(it, stepsize) + def lr_lambda(it): + return min_lr + (max_lr - min_lr) * relative(it, stepsize) # Additional function to see where on the cycle we are def relative(it, stepsize): diff --git a/train.py b/train.py index 49867a6..177f6dd 100644 --- a/train.py +++ b/train.py @@ -23,14 +23,25 @@ from src.focal_loss import FocalLoss -def train(model, training_generator, optimizer, criterion, epoch, writer, log_file, scheduler, class_names, args, print_every=25): +def train( + model, + training_generator, + optimizer, + criterion, + epoch, + writer, + log_file, + scheduler, + class_names, + args, + print_every=25, +): model.train() losses = utils.AverageMeter() accuracies = utils.AverageMeter() num_iter_per_epoch = len(training_generator) - progress_bar = tqdm(enumerate(training_generator), - total=num_iter_per_epoch) + progress_bar = tqdm(enumerate(training_generator), total=num_iter_per_epoch) y_true = [] y_pred = [] @@ -50,75 +61,75 @@ def train(model, training_generator, optimizer, criterion, epoch, writer, log_fi loss = criterion(predictions, labels) loss.backward() - if args.scheduler == 'clr': + if args.scheduler == "clr": scheduler.step() optimizer.step() - training_metrics = utils.get_evaluation(labels.cpu().numpy(), - predictions.cpu().detach().numpy(), - list_metrics=["accuracy", "f1"]) + training_metrics = utils.get_evaluation( + labels.cpu().numpy(), + predictions.cpu().detach().numpy(), + list_metrics=["accuracy", "f1"], + ) losses.update(loss.data, features.size(0)) accuracies.update(training_metrics["accuracy"], features.size(0)) - f1 = training_metrics['f1'] + f1 = training_metrics["f1"] - writer.add_scalar('Train/Loss', - loss.item(), - epoch * num_iter_per_epoch + iter) + writer.add_scalar("Train/Loss", loss.item(), epoch * num_iter_per_epoch + iter) - writer.add_scalar('Train/Accuracy', - training_metrics['accuracy'], - epoch * num_iter_per_epoch + iter) + writer.add_scalar( + "Train/Accuracy", + training_metrics["accuracy"], + epoch * num_iter_per_epoch + iter, + ) - writer.add_scalar('Train/f1', - f1, - epoch * num_iter_per_epoch + iter) + writer.add_scalar("Train/f1", f1, epoch * num_iter_per_epoch + iter) lr = optimizer.state_dict()["param_groups"][0]["lr"] if (iter % print_every == 0) and (iter > 0): - print("[Training - Epoch: {}], LR: {} , Iteration: {}/{} , Loss: {}, Accuracy: {}".format( - epoch + 1, - lr, - iter, - num_iter_per_epoch, - losses.avg, - accuracies.avg - )) + print( + "[Training - Epoch: {}], LR: {} , Iteration: {}/{} , Loss: {}, Accuracy: {}".format( + epoch + 1, lr, iter, num_iter_per_epoch, losses.avg, accuracies.avg + ) + ) if bool(args.log_f1): intermediate_report = classification_report( - y_true, y_pred, output_dict=True) + y_true, y_pred, output_dict=True + ) - f1_by_class = 'F1 Scores by class: ' + f1_by_class = "F1 Scores by class: " for class_name in class_names: f1_by_class += f"{class_name} : {np.round(intermediate_report[class_name]['f1-score'], 4)} |" print(f1_by_class) - f1_train = f1_score(y_true, y_pred, average='weighted') + f1_train = f1_score(y_true, y_pred, average="weighted") - writer.add_scalar('Train/loss/epoch', losses.avg, epoch + iter) - writer.add_scalar('Train/acc/epoch', accuracies.avg, epoch + iter) - writer.add_scalar('Train/f1/epoch', f1_train, epoch + iter) + writer.add_scalar("Train/loss/epoch", losses.avg, epoch + iter) + writer.add_scalar("Train/acc/epoch", accuracies.avg, epoch + iter) + writer.add_scalar("Train/f1/epoch", f1_train, epoch + iter) report = classification_report(y_true, y_pred) print(report) - with open(log_file, 'a') as f: - f.write(f'Training on Epoch {epoch} \n') - f.write(f'Average loss: {losses.avg.item()} \n') - f.write(f'Average accuracy: {accuracies.avg.item()} \n') - f.write(f'F1 score: {f1_train} \n\n') + with open(log_file, "a") as f: + f.write(f"Training on Epoch {epoch} \n") + f.write(f"Average loss: {losses.avg.item()} \n") + f.write(f"Average accuracy: {accuracies.avg.item()} \n") + f.write(f"F1 score: {f1_train} \n\n") f.write(report) - f.write('*' * 25) - f.write('\n') + f.write("*" * 25) + f.write("\n") return losses.avg.item(), accuracies.avg.item(), f1_train -def evaluate(model, validation_generator, criterion, epoch, writer, log_file, print_every=25): +def evaluate( + model, validation_generator, criterion, epoch, writer, log_file, print_every=25 +): model.eval() losses = utils.AverageMeter() accuracies = utils.AverageMeter() @@ -139,53 +150,47 @@ def evaluate(model, validation_generator, criterion, epoch, writer, log_file, pr y_true += labels.cpu().numpy().tolist() y_pred += torch.max(predictions, 1)[1].cpu().numpy().tolist() - validation_metrics = utils.get_evaluation(labels.cpu().numpy(), - predictions.cpu().detach().numpy(), - list_metrics=["accuracy", "f1"]) - accuracy = validation_metrics['accuracy'] - f1 = validation_metrics['f1'] + validation_metrics = utils.get_evaluation( + labels.cpu().numpy(), + predictions.cpu().detach().numpy(), + list_metrics=["accuracy", "f1"], + ) + accuracy = validation_metrics["accuracy"] + f1 = validation_metrics["f1"] losses.update(loss.data, features.size(0)) accuracies.update(validation_metrics["accuracy"], features.size(0)) - writer.add_scalar('Test/Loss', - loss.item(), - epoch * num_iter_per_epoch + iter) + writer.add_scalar("Test/Loss", loss.item(), epoch * num_iter_per_epoch + iter) - writer.add_scalar('Test/Accuracy', - accuracy, - epoch * num_iter_per_epoch + iter) + writer.add_scalar("Test/Accuracy", accuracy, epoch * num_iter_per_epoch + iter) - writer.add_scalar('Test/f1', - f1, - epoch * num_iter_per_epoch + iter) + writer.add_scalar("Test/f1", f1, epoch * num_iter_per_epoch + iter) if (iter % print_every == 0) and (iter > 0): - print("[Validation - Epoch: {}] , Iteration: {}/{} , Loss: {}, Accuracy: {}".format( - epoch + 1, - iter, - num_iter_per_epoch, - losses.avg, - accuracies.avg - )) + print( + "[Validation - Epoch: {}] , Iteration: {}/{} , Loss: {}, Accuracy: {}".format( + epoch + 1, iter, num_iter_per_epoch, losses.avg, accuracies.avg + ) + ) - f1_test = f1_score(y_true, y_pred, average='weighted') + f1_test = f1_score(y_true, y_pred, average="weighted") - writer.add_scalar('Test/loss/epoch', losses.avg, epoch + iter) - writer.add_scalar('Test/acc/epoch', accuracies.avg, epoch + iter) - writer.add_scalar('Test/f1/epoch', f1_test, epoch + iter) + writer.add_scalar("Test/loss/epoch", losses.avg, epoch + iter) + writer.add_scalar("Test/acc/epoch", accuracies.avg, epoch + iter) + writer.add_scalar("Test/f1/epoch", f1_test, epoch + iter) report = classification_report(y_true, y_pred) print(report) - with open(log_file, 'a') as f: - f.write(f'Validation on Epoch {epoch} \n') - f.write(f'Average loss: {losses.avg.item()} \n') - f.write(f'Average accuracy: {accuracies.avg.item()} \n') - f.write(f'F1 score {f1_test} \n\n') + with open(log_file, "a") as f: + f.write(f"Validation on Epoch {epoch} \n") + f.write(f"Average loss: {losses.avg.item()} \n") + f.write(f"Average accuracy: {accuracies.avg.item()} \n") + f.write(f"F1 score {f1_test} \n\n") f.write(report) - f.write('=' * 50) - f.write('\n') + f.write("=" * 50) + f.write("\n") return losses.avg.item(), accuracies.avg.item(), f1_test @@ -201,41 +206,55 @@ def run(args, both_cases=False): now = datetime.now() logdir = args.log_path + now.strftime("%Y%m%d-%H%M%S") + "/" os.makedirs(logdir) - log_file = logdir + 'log.txt' + log_file = logdir + "log.txt" writer = SummaryWriter(logdir) batch_size = args.batch_size - training_params = {"batch_size": batch_size, - "shuffle": True, - "num_workers": args.workers, - "drop_last": True} + training_params = { + "batch_size": batch_size, + "shuffle": True, + "num_workers": args.workers, + "drop_last": True, + } - validation_params = {"batch_size": batch_size, - "shuffle": False, - "num_workers": args.workers, - "drop_last": True} + validation_params = { + "batch_size": batch_size, + "shuffle": False, + "num_workers": args.workers, + "drop_last": True, + } texts, labels, number_of_classes, sample_weights = load_data(args) class_names = sorted(list(set(labels))) class_names = [str(class_name) for class_name in class_names] - train_texts, val_texts, train_labels, val_labels, train_sample_weights, _ = train_test_split(texts, - labels, - sample_weights, - test_size=args.validation_split, - random_state=42, - stratify=labels) + ( + train_texts, + val_texts, + train_labels, + val_labels, + train_sample_weights, + _, + ) = train_test_split( + texts, + labels, + sample_weights, + test_size=args.validation_split, + random_state=42, + stratify=labels, + ) training_set = MyDataset(train_texts, train_labels, args) validation_set = MyDataset(val_texts, val_labels, args) if bool(args.use_sampler): train_sample_weights = torch.from_numpy(train_sample_weights) - sampler = WeightedRandomSampler(train_sample_weights.type( - 'torch.DoubleTensor'), len(train_sample_weights)) - training_params['sampler'] = sampler - training_params['shuffle'] = False + sampler = WeightedRandomSampler( + train_sample_weights.type("torch.DoubleTensor"), len(train_sample_weights) + ) + training_params["sampler"] = sampler + training_params["shuffle"] = False training_generator = DataLoader(training_set, **training_params) validation_generator = DataLoader(validation_set, **validation_params) @@ -257,7 +276,7 @@ def run(args, both_cases=False): weights = torch.Tensor(weights) if torch.cuda.is_available(): weights = weights.cuda() - print(f'passing weights to CrossEntropyLoss : {weights}') + print(f"passing weights to CrossEntropyLoss : {weights}") criterion = nn.CrossEntropyLoss(weight=weights) else: criterion = nn.CrossEntropyLoss() @@ -266,11 +285,12 @@ def run(args, both_cases=False): if args.alpha is None: criterion = FocalLoss(gamma=args.gamma, alpha=None) else: - criterion = FocalLoss(gamma=args.gamma, - alpha=[args.alpha] * number_of_classes) + criterion = FocalLoss( + gamma=args.gamma, alpha=[args.alpha] * number_of_classes + ) - if args.optimizer == 'sgd': - if args.scheduler == 'clr': + if args.optimizer == "sgd": + if args.scheduler == "clr": optimizer = torch.optim.SGD( model.parameters(), lr=1, momentum=0.9, weight_decay=0.00001 ) @@ -278,15 +298,13 @@ def run(args, both_cases=False): optimizer = torch.optim.SGD( model.parameters(), lr=args.learning_rate, momentum=0.9 ) - elif args.optimizer == 'adam': - optimizer = torch.optim.Adam( - model.parameters(), lr=args.learning_rate - ) + elif args.optimizer == "adam": + optimizer = torch.optim.Adam(model.parameters(), lr=args.learning_rate) best_f1 = 0 best_epoch = 0 - if args.scheduler == 'clr': + if args.scheduler == "clr": stepsize = int(args.stepsize * len(training_generator)) clr = utils.cyclical_lr(stepsize, args.min_lr, args.max_lr) scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, [clr]) @@ -294,39 +312,51 @@ def run(args, both_cases=False): scheduler = None for epoch in range(args.epochs): - training_loss, training_accuracy, train_f1 = train(model, - training_generator, - optimizer, - criterion, - epoch, - writer, - log_file, - scheduler, - class_names, - args, - args.log_every) - - validation_loss, validation_accuracy, validation_f1 = evaluate(model, - validation_generator, - criterion, - epoch, - writer, - log_file, - args.log_every) - - print('[Epoch: {} / {}]\ttrain_loss: {:.4f} \ttrain_acc: {:.4f} \tval_loss: {:.4f} \tval_acc: {:.4f}'. - format(epoch + 1, args.epochs, training_loss, training_accuracy, validation_loss, validation_accuracy)) + training_loss, training_accuracy, train_f1 = train( + model, + training_generator, + optimizer, + criterion, + epoch, + writer, + log_file, + scheduler, + class_names, + args, + args.log_every, + ) + + validation_loss, validation_accuracy, validation_f1 = evaluate( + model, + validation_generator, + criterion, + epoch, + writer, + log_file, + args.log_every, + ) + + print( + "[Epoch: {} / {}]\ttrain_loss: {:.4f} \ttrain_acc: {:.4f} \tval_loss: {:.4f} \tval_acc: {:.4f}".format( + epoch + 1, + args.epochs, + training_loss, + training_accuracy, + validation_loss, + validation_accuracy, + ) + ) print("=" * 50) # learning rate scheduling - if args.scheduler == 'step': - if args.optimizer == 'sgd' and ((epoch + 1) % 3 == 0) and epoch > 0: - current_lr = optimizer.state_dict()['param_groups'][0]['lr'] + if args.scheduler == "step": + if args.optimizer == "sgd" and ((epoch + 1) % 3 == 0) and epoch > 0: + current_lr = optimizer.state_dict()["param_groups"][0]["lr"] current_lr /= 2 - print('Decreasing learning rate to {0}'.format(current_lr)) + print("Decreasing learning rate to {0}".format(current_lr)) for param_group in optimizer.param_groups: - param_group['lr'] = current_lr + param_group["lr"] = current_lr # model checkpoint @@ -334,82 +364,82 @@ def run(args, both_cases=False): best_f1 = validation_f1 best_epoch = epoch if args.checkpoint == 1: - torch.save(model.state_dict(), args.output + 'model_{}_epoch_{}_maxlen_{}_lr_{}_loss_{}_acc_{}_f1_{}.pth'.format(args.model_name, - epoch, - args.max_length, - optimizer.state_dict()[ - 'param_groups'][0]['lr'], - round( - validation_loss, 4), - round( - validation_accuracy, 4), - round( - validation_f1, 4) - )) + torch.save( + model.state_dict(), + args.output + + "model_{}_epoch_{}_maxlen_{}_lr_{}_loss_{}_acc_{}_f1_{}.pth".format( + args.model_name, + epoch, + args.max_length, + optimizer.state_dict()["param_groups"][0]["lr"], + round(validation_loss, 4), + round(validation_accuracy, 4), + round(validation_f1, 4), + ), + ) if bool(args.early_stopping): if epoch - best_epoch > args.patience > 0: - print("Stop training at epoch {}. The lowest loss achieved is {} at epoch {}".format( - epoch, validation_loss, best_epoch)) + print( + "Stop training at epoch {}. The lowest loss achieved is {} at epoch {}".format( + epoch, validation_loss, best_epoch + ) + ) break if __name__ == "__main__": - parser = argparse.ArgumentParser( - 'Character Based CNN for text classification') - parser.add_argument('--data_path', type=str, - default='./data/train.csv') - parser.add_argument('--validation_split', type=float, default=0.2) - parser.add_argument('--label_column', type=str, default='Sentiment') - parser.add_argument('--text_column', type=str, default='SentimentText') - parser.add_argument('--max_rows', type=int, default=None) - parser.add_argument('--chunksize', type=int, default=50000) - parser.add_argument('--encoding', type=str, default='utf-8') - parser.add_argument('--sep', type=str, default=',') - parser.add_argument('--steps', nargs='+', default=['lower']) - parser.add_argument('--group_labels', type=int, default=1, choices=[0, 1]) - parser.add_argument('--ignore_center', type=int, default=1, choices=[0, 1]) - parser.add_argument('--label_ignored', type=int, default=None) - parser.add_argument('--ratio', type=float, default=1) - parser.add_argument('--balance', type=int, default=0, choices=[0, 1]) - parser.add_argument('--use_sampler', type=int, - default=0, choices=[0, 1]) - - parser.add_argument('--alphabet', type=str, - default="abcdefghijklmnopqrstuvwxyz0123456789-,;.!?:'\"/\\|_@#$%^&*~`+ =<>()[]{}") - parser.add_argument('--number_of_characters', type=int, default=69) - parser.add_argument('--extra_characters', type=str, default='') - parser.add_argument('--max_length', type=int, default=150) - parser.add_argument('--dropout_input', type=float, default=0.1) - parser.add_argument('--epochs', type=int, default=10) - parser.add_argument('--batch_size', type=int, default=128) - parser.add_argument('--optimizer', type=str, - choices=['adam', 'sgd'], default='sgd') - parser.add_argument('--learning_rate', type=float, default=0.01) - parser.add_argument('--class_weights', type=int, - default=0, choices=[0, 1]) - parser.add_argument('--focal_loss', type=int, default=0, choices=[0, 1]) - parser.add_argument('--gamma', type=float, default=2) - parser.add_argument('--alpha', type=float, default=None) - - parser.add_argument('--scheduler', type=str, - default='step', choices=['clr', 'step']) - parser.add_argument('--min_lr', type=float, default=1.7e-3) - parser.add_argument('--max_lr', type=float, default=1e-2) - parser.add_argument('--stepsize', type=float, default=4) - parser.add_argument('--patience', type=int, default=3) - parser.add_argument('--early_stopping', type=int, - default=0, choices=[0, 1]) - parser.add_argument('--checkpoint', type=int, - choices=[0, 1], default=1) - parser.add_argument('--workers', type=int, default=1) - parser.add_argument('--log_path', type=str, default='./logs/') - parser.add_argument('--log_every', type=int, default=100) - parser.add_argument('--log_f1', type=int, default=1, choices=[0, 1]) - parser.add_argument('--flush_history', type=int, - default=1, choices=[0, 1]) - parser.add_argument('--output', type=str, default='./models/') - parser.add_argument('--model_name', type=str, default='') + parser = argparse.ArgumentParser("Character Based CNN for text classification") + parser.add_argument("--data_path", type=str, default="./data/train.csv") + parser.add_argument("--validation_split", type=float, default=0.2) + parser.add_argument("--label_column", type=str, default="Sentiment") + parser.add_argument("--text_column", type=str, default="SentimentText") + parser.add_argument("--max_rows", type=int, default=None) + parser.add_argument("--chunksize", type=int, default=50000) + parser.add_argument("--encoding", type=str, default="utf-8") + parser.add_argument("--sep", type=str, default=",") + parser.add_argument("--steps", nargs="+", default=["lower"]) + parser.add_argument("--group_labels", type=int, default=1, choices=[0, 1]) + parser.add_argument("--ignore_center", type=int, default=1, choices=[0, 1]) + parser.add_argument("--label_ignored", type=int, default=None) + parser.add_argument("--ratio", type=float, default=1) + parser.add_argument("--balance", type=int, default=0, choices=[0, 1]) + parser.add_argument("--use_sampler", type=int, default=0, choices=[0, 1]) + + parser.add_argument( + "--alphabet", + type=str, + default="abcdefghijklmnopqrstuvwxyz0123456789-,;.!?:'\"/\\|_@#$%^&*~`+ =<>()[]{}", + ) + parser.add_argument("--number_of_characters", type=int, default=69) + parser.add_argument("--extra_characters", type=str, default="") + parser.add_argument("--max_length", type=int, default=150) + parser.add_argument("--dropout_input", type=float, default=0.1) + parser.add_argument("--epochs", type=int, default=10) + parser.add_argument("--batch_size", type=int, default=128) + parser.add_argument("--optimizer", type=str, choices=["adam", "sgd"], default="sgd") + parser.add_argument("--learning_rate", type=float, default=0.01) + parser.add_argument("--class_weights", type=int, default=0, choices=[0, 1]) + parser.add_argument("--focal_loss", type=int, default=0, choices=[0, 1]) + parser.add_argument("--gamma", type=float, default=2) + parser.add_argument("--alpha", type=float, default=None) + + parser.add_argument( + "--scheduler", type=str, default="step", choices=["clr", "step"] + ) + parser.add_argument("--min_lr", type=float, default=1.7e-3) + parser.add_argument("--max_lr", type=float, default=1e-2) + parser.add_argument("--stepsize", type=float, default=4) + parser.add_argument("--patience", type=int, default=3) + parser.add_argument("--early_stopping", type=int, default=0, choices=[0, 1]) + parser.add_argument("--checkpoint", type=int, choices=[0, 1], default=1) + parser.add_argument("--workers", type=int, default=1) + parser.add_argument("--log_path", type=str, default="./logs/") + parser.add_argument("--log_every", type=int, default=100) + parser.add_argument("--log_f1", type=int, default=1, choices=[0, 1]) + parser.add_argument("--flush_history", type=int, default=1, choices=[0, 1]) + parser.add_argument("--output", type=str, default="./models/") + parser.add_argument("--model_name", type=str, default="") args = parser.parse_args() run(args)