From c7a4fa94315e706563c34c6652ea12bd03027f63 Mon Sep 17 00:00:00 2001 From: Jeremy Costello Date: Mon, 14 Aug 2023 13:44:21 -0230 Subject: [PATCH 1/6] add model from reinforcement learning for topic models --- tests/test_octis.py | 73 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 73 insertions(+) diff --git a/tests/test_octis.py b/tests/test_octis.py index 5dd107c4..7c70378c 100644 --- a/tests/test_octis.py +++ b/tests/test_octis.py @@ -9,6 +9,7 @@ from octis.models.LDA_tomopy import LDA_tomopy as LDATOMOTO from octis.models.ETM import ETM from octis.models.CTM import CTM +from octis.models.RLTM import RLTM from octis.models.NMF import NMF from octis.models.NMF_scikit import NMF_scikit from octis.models.ProdLDA import ProdLDA @@ -574,3 +575,75 @@ def test_model_output_prodlda_not_partitioned(data_dir): assert type(output['topic-document-matrix']) == np.ndarray assert output['topic-document-matrix'].shape == ( num_topics, len(dataset.get_corpus())) + + +def test_model_output_rltm_seeded(data_dir): + dataset = Dataset() + dataset.load_custom_dataset_from_folder(data_dir + '/M10') + num_topics = 3 + model = RLTM( + num_topics=num_topics, seed=10, num_epochs=5) + output = model.train_model(dataset) + + model_2 = RLTM( + num_topics=num_topics, seed=10, num_epochs=5) + output_2 = model_2.train_model(dataset) + + assert output['topics'] == output_2['topics'] + + +def test_model_output_rltm(data_dir): + dataset = Dataset() + dataset.load_custom_dataset_from_folder(data_dir + '/M10') + num_topics = 3 + model = RLTM(num_topics=num_topics, num_epochs=5) + output = model.train_model(dataset) + assert 'topics' in output.keys() + assert 'topic-word-matrix' in output.keys() + assert 'test-topic-document-matrix' in output.keys() + + # check topics format + assert type(output['topics']) == list + assert len(output['topics']) == num_topics + + # check topic-word-matrix format + assert type(output['topic-word-matrix']) == np.ndarray + assert output['topic-word-matrix'].shape == (num_topics, len( + dataset.get_vocabulary())) + + # check topic-document-matrix format + assert type(output['topic-document-matrix']) == np.ndarray + assert output['topic-document-matrix'].shape == ( + num_topics, len(dataset.get_partitioned_corpus()[0])) + + # check test-topic-document-matrix format + assert type(output['test-topic-document-matrix']) == np.ndarray + assert output['test-topic-document-matrix'].shape == ( + num_topics, len(dataset.get_partitioned_corpus()[2])) + + +def test_model_output_rltm_not_partition(data_dir): + dataset = Dataset() + dataset.load_custom_dataset_from_folder(data_dir + '/M10') + num_topics = 3 + model = RLTM( + num_topics=num_topics, num_epochs=5, + use_partitions=False, bert_path='./not_part') + output = model.train_model(dataset) + assert 'topics' in output.keys() + assert 'topic-word-matrix' in output.keys() + assert 'test-topic-document-matrix' not in output.keys() + + # check topics format + assert type(output['topics']) == list + assert len(output['topics']) == num_topics + + # check topic-word-matrix format + assert type(output['topic-word-matrix']) == np.ndarray + assert output['topic-word-matrix'].shape == ( + num_topics, len(dataset.get_vocabulary())) + + # check topic-document-matrix format + assert type(output['topic-document-matrix']) == np.ndarray + assert output['topic-document-matrix'].shape == ( + num_topics, len(dataset.get_corpus())) From b9ddee7472fc0f104d8fc6587b5af08656763472 Mon Sep 17 00:00:00 2001 From: Jeremy Costello Date: Mon, 14 Aug 2023 13:50:57 -0230 
Subject: [PATCH 2/6] add model from reinforcement learning for topic models --- octis/models/RLTM.py | 255 +++++++++ .../rl_for_topic_models/datasets/__init__.py | 0 .../rl_for_topic_models/datasets/dataset.py | 37 ++ .../rl_for_topic_models/models/__init__.py | 0 .../models/rl_for_topic_models/models/rltm.py | 485 ++++++++++++++++++ .../rl_for_topic_models/networks/__init__.py | 0 .../networks/decoder_network.py | 131 +++++ .../networks/inference_network.py | 71 +++ .../rl_for_topic_models/utils/__init__.py | 0 .../utils/data_preparation.py | 151 ++++++ .../utils/preprocessing.py | 58 +++ 11 files changed, 1188 insertions(+) create mode 100644 octis/models/RLTM.py create mode 100644 octis/models/rl_for_topic_models/datasets/__init__.py create mode 100644 octis/models/rl_for_topic_models/datasets/dataset.py create mode 100644 octis/models/rl_for_topic_models/models/__init__.py create mode 100644 octis/models/rl_for_topic_models/models/rltm.py create mode 100644 octis/models/rl_for_topic_models/networks/__init__.py create mode 100644 octis/models/rl_for_topic_models/networks/decoder_network.py create mode 100644 octis/models/rl_for_topic_models/networks/inference_network.py create mode 100644 octis/models/rl_for_topic_models/utils/__init__.py create mode 100644 octis/models/rl_for_topic_models/utils/data_preparation.py create mode 100644 octis/models/rl_for_topic_models/utils/preprocessing.py diff --git a/octis/models/RLTM.py b/octis/models/RLTM.py new file mode 100644 index 00000000..916cc165 --- /dev/null +++ b/octis/models/RLTM.py @@ -0,0 +1,255 @@ +from sklearn.feature_extraction.text import CountVectorizer + +from octis.models.model import AbstractModel +from octis.models.rl_for_topic_models.datasets import dataset +from octis.models.rl_for_topic_models.models import rltm +from octis.models.rl_for_topic_models.utils.data_preparation import ( + bert_embeddings_from_list) + +import os +import pickle as pkl +import torch +import numpy as np +import random + + +class RLTM(AbstractModel): + + def __init__( + self, num_topics=10, activation='gelu', num_layers=2, num_neurons=128, + inference_dropout=0.2, policy_dropout=0.0, batch_size=256, lr=3e-4, + momentum=0.9, solver='adamw', num_epochs=200, num_samples=10, + seed=None, use_partitions=True, reduce_on_plateau=False, bert_path="", + bert_model="all-MiniLM-L6-v2", weight_decay=0.01, kl_multiplier=1.0): + """ + initialization of RLTM + + :param num_topics : int, number of topic components, (default 10) + :param activation : string, 'softplus', 'relu', 'sigmoid', + 'swish', 'tanh', 'leakyrelu', 'rrelu', 'elu', 'selu', + 'gelu' (default 'gelu') + :param num_layers : int, number of layers (default 2) + :param num_neurons : int, number of neurons per layer (default 128) + :param inference_dropout : float, inference dropout to use (default 0.2) + :param policy_dropout : float, policy dropout to use (default 0.0) + :param batch_size : int, size of batch to use for training (default 256) + :param lr : float, learning rate to use for training (default 3e-4) + :param momentum : float, momentum to use for training (default 0.9) + :param solver: string, optimizer 'adagrad', 'adam', 'sgd', 'adadelta', + 'rmsprop', 'adamw' (default 'adamw') + :param num_epochs : int, number of epochs to train for, (default 200) + :param num_samples: int, number of times theta needs to be sampled + (default: 10) + :param seed : int, the random seed. Not used if None (default None). 
+ :param use_partitions: bool, if true the model will be trained on the + training set and evaluated on the test set (default: true) + :param reduce_on_plateau : bool, reduce learning rate by 10x on + plateau of 10 epochs (default False) + :param bert_path: path to store the document contextualized + representations + :param bert_model: name of the contextualized model + (default: all-MiniLM-L6-v2). + see https://www.sbert.net/docs/pretrained_models.html + :param weight_decay: float, L2 regularization on model weights + :param kl_multiplier: float or int, multiplier on the KL + divergence (default 1.0) + """ + + super().__init__() + + self.hyperparameters['num_topics'] = num_topics + self.hyperparameters['activation'] = activation + self.hyperparameters['inference_dropout'] = inference_dropout + self.hyperparameters['policy_dropout'] = policy_dropout + self.hyperparameters['batch_size'] = batch_size + self.hyperparameters['lr'] = lr + self.hyperparameters['num_samples'] = num_samples + self.hyperparameters['momentum'] = momentum + self.hyperparameters['solver'] = solver + self.hyperparameters['num_epochs'] = num_epochs + self.hyperparameters['reduce_on_plateau'] = reduce_on_plateau + self.hyperparameters["num_neurons"] = num_neurons + self.hyperparameters["bert_path"] = bert_path + self.hyperparameters["num_layers"] = num_layers + self.hyperparameters["bert_model"] = bert_model + self.hyperparameters["seed"] = seed + self.hyperparameters["weight_decay"] = weight_decay + self.hyperparameters['kl_multiplier'] = kl_multiplier + self.use_partitions = use_partitions + + hidden_sizes = tuple([num_neurons for _ in range(num_layers)]) + self.hyperparameters['hidden_sizes'] = tuple(hidden_sizes) + + self.model = None + self.vocab = None + + def train_model(self, dataset, hyperparameters=None, top_words=10): + """ + trains RLTM model + + :param dataset: octis Dataset for training the model + :param hyperparameters: dict, with (optionally) the hyperparameters + :param top_words: number of top-n words of the topics (default 10) + + """ + if hyperparameters is None: + hyperparameters = {} + + self.set_params(hyperparameters) + self.vocab = dataset.get_vocabulary() + self.set_seed(seed=self.hyperparameters['seed']) + + if self.use_partitions: + train, validation, test = dataset.get_partitioned_corpus( + use_validation=True) + + data_corpus_train = [' '.join(i) for i in train] + data_corpus_test = [' '.join(i) for i in test] + data_corpus_validation = [' '.join(i) for i in validation] + + x_train, x_test, x_valid, input_size = self.preprocess( + self.vocab, data_corpus_train, test=data_corpus_test, + validation=data_corpus_validation, + bert_train_path=( + self.hyperparameters['bert_path'] + "_train.pkl"), + bert_test_path=self.hyperparameters['bert_path'] + "_test.pkl", + bert_val_path=self.hyperparameters['bert_path'] + "_val.pkl", + bert_model=self.hyperparameters["bert_model"]) + + self.model = rltm.RLTM( + input_size=input_size, bert_size=x_train.X_bert.shape[1], + num_topics=self.hyperparameters['num_topics'], + hidden_sizes=self.hyperparameters['hidden_sizes'], + activation=self.hyperparameters['activation'], + inference_dropout=self.hyperparameters['inference_dropout'], + policy_dropout=self.hyperparameters['policy_dropout'], + batch_size=self.hyperparameters['batch_size'], + lr=self.hyperparameters['lr'], + momentum=self.hyperparameters['momentum'], + solver=self.hyperparameters['solver'], + num_epochs=self.hyperparameters['num_epochs'], + num_samples=self.hyperparameters['num_samples'], + 
reduce_on_plateau=self.hyperparameters['reduce_on_plateau'], + weight_decay=self.hyperparameters['weight_decay'], + kl_multiplier=self.hyperparameters['kl_multiplier'], + top_words=top_words) + + self.model.fit(x_train, x_valid, verbose=False) + result = self.inference(x_test) + return result + + else: + data_corpus = [' '.join(i) for i in dataset.get_corpus()] + x_train, input_size = self.preprocess( + self.vocab, train=data_corpus, + bert_train_path=( + self.hyperparameters['bert_path'] + "_train.pkl"), + bert_model=self.hyperparameters["bert_model"]) + + self.model = rltm.RLTM( + input_size=input_size, bert_size=x_train.X_bert.shape[1], + num_topics=self.hyperparameters['num_topics'], + hidden_sizes=self.hyperparameters['hidden_sizes'], + activation=self.hyperparameters['activation'], + inference_dropout=self.hyperparameters['inference_dropout'], + policy_dropout=self.hyperparameters['policy_dropout'], + batch_size=self.hyperparameters['batch_size'], + lr=self.hyperparameters['lr'], + momentum=self.hyperparameters['momentum'], + solver=self.hyperparameters['solver'], + num_epochs=self.hyperparameters['num_epochs'], + num_samples=self.hyperparameters['num_samples'], + reduce_on_plateau=self.hyperparameters['reduce_on_plateau'], + weight_decay=self.hyperparameters['weight_decay'], + kl_multiplier=self.hyperparameters['kl_multiplier'], + top_words=top_words) + + self.model.fit(x_train, None, verbose=False) + result = self.model.get_info() + return result + + def set_params(self, hyperparameters): + for k in hyperparameters.keys(): + if k in self.hyperparameters.keys() and k != 'hidden_sizes': + self.hyperparameters[k] = hyperparameters.get( + k, self.hyperparameters[k]) + + self.hyperparameters['hidden_sizes'] = tuple( + [self.hyperparameters["num_neurons"] for _ in range( + self.hyperparameters["num_layers"])]) + + def inference(self, x_test): + assert isinstance(self.use_partitions, bool) and self.use_partitions + results = self.model.predict(x_test) + return results + + def partitioning(self, use_partitions=False): + self.use_partitions = use_partitions + + @staticmethod + def set_seed(seed=None): + if seed is not None: + torch.manual_seed(seed) + torch.cuda.manual_seed(seed) + np.random.seed(seed) + random.seed(seed) + torch.backends.cudnn.enabled = False + torch.backends.cudnn.deterministic = True + + @staticmethod + def preprocess( + vocab, train, bert_model, test=None, validation=None, + bert_train_path=None, bert_test_path=None, bert_val_path=None): + vocab2id = {w: i for i, w in enumerate(vocab)} + vec = CountVectorizer( + vocabulary=vocab2id, token_pattern=r'(?u)\b[\w+|\-]+\b') + entire_dataset = train.copy() + if test is not None: + entire_dataset.extend(test) + if validation is not None: + entire_dataset.extend(validation) + + vec.fit(entire_dataset) + idx2token = {v: k for (k, v) in vec.vocabulary_.items()} + + x_train = vec.transform(train) + b_train = RLTM.load_bert_data(bert_train_path, train, bert_model) + + train_data = dataset.RLTMDataset(x_train.toarray(), b_train, idx2token) + input_size = len(idx2token.keys()) + + if test is not None and validation is not None: + x_test = vec.transform(test) + b_test = RLTM.load_bert_data(bert_test_path, test, bert_model) + test_data = dataset.RLTMDataset(x_test.toarray(), b_test, idx2token) + + x_valid = vec.transform(validation) + b_val = RLTM.load_bert_data(bert_val_path, validation, bert_model) + valid_data = dataset.RLTMDataset( + x_valid.toarray(), b_val, idx2token) + return train_data, test_data, valid_data, input_size + if 
test is None and validation is not None: + x_valid = vec.transform(validation) + b_val = RLTM.load_bert_data(bert_val_path, validation, bert_model) + valid_data = dataset.RLTMDataset( + x_valid.toarray(), b_val, idx2token) + return train_data, valid_data, input_size + if test is not None and validation is None: + x_test = vec.transform(test) + b_test = RLTM.load_bert_data(bert_test_path, test, bert_model) + test_data = dataset.RLTMDataset(x_test.toarray(), b_test, idx2token) + return train_data, test_data, input_size + if test is None and validation is None: + return train_data, input_size + + @staticmethod + def load_bert_data(bert_path, texts, bert_model): + if bert_path is not None: + if os.path.exists(bert_path): + bert_ouput = pkl.load(open(bert_path, 'rb')) + else: + bert_ouput = bert_embeddings_from_list(texts, bert_model) + pkl.dump(bert_ouput, open(bert_path, 'wb')) + else: + bert_ouput = bert_embeddings_from_list(texts, bert_model) + return bert_ouput diff --git a/octis/models/rl_for_topic_models/datasets/__init__.py b/octis/models/rl_for_topic_models/datasets/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/octis/models/rl_for_topic_models/datasets/dataset.py b/octis/models/rl_for_topic_models/datasets/dataset.py new file mode 100644 index 00000000..67a4f206 --- /dev/null +++ b/octis/models/rl_for_topic_models/datasets/dataset.py @@ -0,0 +1,37 @@ +import torch +from torch.utils.data import Dataset +import scipy.sparse + + +class RLTMDataset(Dataset): + + """Class to load RLTM dataset.""" + + def __init__(self, X, X_bert, idx2token): + """ + Args + X : array-like, shape=(n_samples, n_features) + Document word matrix. + """ + if X.shape[0] != len(X_bert): + raise Exception("Wait! BoW and Contextual Embeddings have different sizes! " + "You might want to check if the BoW preparation method has removed some documents. 
") + + self.X = X + self.X_bert = X_bert + self.idx2token = idx2token + + def __len__(self): + """Return length of dataset.""" + return self.X.shape[0] + + def __getitem__(self, i): + """Return sample from dataset at index i.""" + if type(self.X[i]) == scipy.sparse.csr.csr_matrix: + X = torch.FloatTensor(self.X[i].todense()) + X_bert = torch.FloatTensor(self.X_bert[i]) + else: + X = torch.FloatTensor(self.X[i]) + X_bert = torch.FloatTensor(self.X_bert[i]) + + return {'X': X, 'X_bert': X_bert} diff --git a/octis/models/rl_for_topic_models/models/__init__.py b/octis/models/rl_for_topic_models/models/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/octis/models/rl_for_topic_models/models/rltm.py b/octis/models/rl_for_topic_models/models/rltm.py new file mode 100644 index 00000000..d53698f7 --- /dev/null +++ b/octis/models/rl_for_topic_models/models/rltm.py @@ -0,0 +1,485 @@ +import datetime +import os +from collections import defaultdict + +import numpy as np +import torch +from torch import nn +from torch import optim +from torch.nn import functional as F +from torch.optim.lr_scheduler import ReduceLROnPlateau +from torch.utils.data import DataLoader + +from octis.models.rl_for_topic_models.networks.decoder_network import ( + DecoderNetwork) +from octis.models.early_stopping.pytorchtools import EarlyStopping + + +class RLTM(object): + """Class to train the reinforcement learning topic model + """ + + def __init__( + self, input_size, bert_size, num_topics=10, hidden_sizes=(128, 128), + activation='gelu', inference_dropout=0.2, policy_dropout=0.0, + batch_size=256, lr=3e-4, momentum=0.9, solver='adamw', num_epochs=200, + num_samples=10, reduce_on_plateau=False, top_words=10, + num_data_loader_workers=0, weight_decay=0.01, kl_multiplier=1.0): + + """ + :param input_size: int, dimension of input + :param bert_input_size: int, dimension of BERT input + :param num_topics: int, number of topic components, (default 10) + :param hidden_sizes: tuple, length = n_layers, (default (128, 128)) + :param activation: string, 'softplus', 'relu', 'sigmoid', 'swish', + 'tanh', 'leakyrelu', 'rrelu', 'elu', 'selu', 'gelu' (default 'gelu') + :param inference_dropout: float, inference dropout to use (default 0.2) + :param policy_dropout: float, policy dropout to use (default 0.0) + :param batch_size: int, size of batch to use for training (default 256) + :param lr: float, learning rate to use for training (default 3e-4) + :param momentum: float, momentum to use for training (default 0.9) + :param solver: string, optimizer 'adagrad', 'adam', 'sgd', 'adadelta', + 'rmsprop', 'adamw' (default 'adamw') + :param num_samples: int, number of times theta needs to be sampled + :param num_epochs: int, number of epochs to train for, (default 200) + :param reduce_on_plateau: bool, reduce learning rate by 10x on plateau + of 10 epochs (default False) + :param num_data_loader_workers: int, number of data loader workers + (default cpu_count). set it to 0 if you are using Windows + :param weight_decay: float, L2 regularization on model weights (default 0.01) + :param kl_multiplier: float or int, multiplier on the KL + divergence (default 1.0) + """ + + assert isinstance(input_size, int) and input_size > 0, \ + "input_size must by type int > 0." + assert isinstance(bert_size, int) and bert_size > 0, \ + "bert_size must by type int > 0." + assert (isinstance(num_topics, int) or isinstance( + num_topics, np.int64)) and num_topics > 0, \ + "num_topics must by type int > 0." 
+        assert isinstance(hidden_sizes, tuple), \
+            "hidden_sizes must be type tuple."
+        assert activation in ['softplus', 'relu', 'sigmoid', 'tanh', 'leakyrelu',
+                              'rrelu', 'elu', 'selu', 'gelu'], \
+            "activation must be 'softplus', 'relu', 'sigmoid', 'tanh'," \
+            " 'leakyrelu', 'rrelu', 'elu', 'selu', or 'gelu'."
+        assert inference_dropout >= 0, "inference dropout must be >= 0."
+        assert policy_dropout >= 0, "policy dropout must be >= 0."
+        assert isinstance(batch_size, int) and batch_size > 0, \
+            "batch_size must be int > 0."
+        assert lr > 0, "lr must be > 0."
+        assert isinstance(
+            momentum, float) and momentum > 0 and momentum <= 1, \
+            "momentum must be 0 < float <= 1."
+        assert solver in ['adagrad', 'adam', 'sgd', 'adadelta', 'rmsprop',
+                          'adamw'], "solver must be 'adam', 'adadelta', \
+            'sgd', 'rmsprop', 'adagrad', or 'adamw'"
+        assert isinstance(num_epochs, int) and num_epochs > 0, \
+            "num_epochs must be int > 0"
+        assert isinstance(num_samples, int) and num_samples > 0, \
+            "num_samples must be int > 0"
+        assert isinstance(reduce_on_plateau, bool), \
+            "reduce_on_plateau must be type bool."
+        assert isinstance(top_words, int) and top_words > 0, \
+            "top_words must be int > 0"
+        assert isinstance(num_data_loader_workers, int) \
+            and num_data_loader_workers >= 0, \
+            "num_data_loader_workers must be int >= 0"
+        assert weight_decay >= 0, "weight_decay must be >= 0"
+        assert isinstance(kl_multiplier, float) or isinstance(kl_multiplier, int), \
+            "kl_multiplier must be a float or int"
+
+        self.input_size = input_size
+        self.num_topics = num_topics
+        self.hidden_sizes = hidden_sizes
+        self.activation = activation
+        self.inference_dropout = inference_dropout
+        self.policy_dropout = policy_dropout
+        self.batch_size = batch_size
+        self.lr = lr
+        self.num_samples = num_samples
+        self.top_words = top_words
+        self.momentum = momentum
+        self.solver = solver
+        self.num_epochs = num_epochs
+        self.reduce_on_plateau = reduce_on_plateau
+        self.num_data_loader_workers = num_data_loader_workers
+
+        # init decoder network
+        model = DecoderNetwork(
+            input_size, bert_size, num_topics, hidden_sizes, activation,
+            inference_dropout, policy_dropout, kl_multiplier)
+        self.early_stopping = EarlyStopping(patience=5, verbose=False)
+
+        # init optimizer
+        if self.solver == 'adamw':
+            self.optimizer = self._configure_adamw(model, weight_decay, lr,
+                                                   betas=(self.momentum, 0.999))
+        if self.solver == 'adam':
+            self.optimizer = optim.Adam(model.parameters(), lr=lr, betas=(
+                self.momentum, 0.999))
+        elif self.solver == 'sgd':
+            self.optimizer = optim.SGD(
+                model.parameters(), lr=lr, momentum=self.momentum)
+        elif self.solver == 'adagrad':
+            self.optimizer = optim.Adagrad(model.parameters(), lr=lr)
+        elif self.solver == 'adadelta':
+            self.optimizer = optim.Adadelta(model.parameters(), lr=lr)
+        elif self.solver == 'rmsprop':
+            self.optimizer = optim.RMSprop(
+                model.parameters(), lr=lr, momentum=self.momentum)
+
+        # init lr scheduler
+        if self.reduce_on_plateau:
+            self.scheduler = ReduceLROnPlateau(self.optimizer, patience=10)
+
+        # performance attributes
+        self.best_loss_train = float('inf')
+
+        # training attributes
+        self.model_dir = None
+        self.train_data = None
+        self.nn_epoch = None
+
+        # learned topics
+        self.best_components = None
+
+        # Use cuda if available
+        self.model = model
+        if torch.cuda.is_available():
+            self.USE_CUDA = True
+            self.model = self.model.cuda()
+        else:
+            self.USE_CUDA = False
+
+    @staticmethod
+    def _configure_adamw(model, weight_decay, lr, betas):
+
whitelist_weight_modules = (nn.Linear,) + blacklist_weight_modules = (nn.LayerNorm, nn.Embedding) + whitelist_weight_names = () + blacklist_weight_names = ('prior_mean', 'prior_variance', 'beta') + + decay = set() + no_decay = set() + for mn, m in model.named_modules(): + for pn, p in m.named_parameters(): + fpn = f'{mn}.{pn}' if mn else pn + # don't decay biases + if pn.endswith('bias'): + no_decay.add(fpn) + # decay weights according to white/blacklist + elif pn.endswith('weight'): + if isinstance(m, whitelist_weight_modules): + decay.add(fpn) + elif isinstance(m, blacklist_weight_modules): + no_decay.add(fpn) + else: + if fpn in whitelist_weight_names: + decay.add(fpn) + elif fpn in blacklist_weight_names: + no_decay.add(fpn) + param_dict = {pn: p for pn, p in model.named_parameters()} + + # for decay and no decay sets, ensure no intersection and union contains all parameters + inter_params = decay & no_decay + union_params = decay | no_decay + assert len(inter_params) == 0, f'parameters {inter_params} made it into both decay and no_decay set' + assert len(param_dict.keys() - union_params) == 0, \ + f'parameters {param_dict.keys() - union_params} were not separated into either decay or no_decay set' + + optim_groups = [ + {'params': [param_dict[pn] for pn in sorted(list(decay))], 'weight_decay': weight_decay}, + {'params': [param_dict[pn] for pn in sorted(list(no_decay))], 'weight_decay': 0.0} + ] + + return optim.AdamW(optim_groups, lr=lr, betas=betas) + + def _train_epoch(self, loader): + """Train epoch.""" + self.model.train() + train_loss = 0 + samples_processed = 0 + topic_doc_list = [] + for batch_samples in loader: + # batch_size x vocab_size + X = batch_samples['X'] + X = X.reshape(X.shape[0], -1) + X_bert = batch_samples['X_bert'] + if self.USE_CUDA: + X = X.cuda() + X_bert = X_bert.cuda() + + # forward pass + self.model.zero_grad() + loss = self.model(X, X_bert) + topic_doc_list.extend(F.softmax(self.model.action, dim=-1)) + + # backward pass + loss.backward() + self.optimizer.step() + + # compute train loss + samples_processed += X.size()[0] + train_loss += loss.item() + + train_loss /= samples_processed + + return samples_processed, train_loss, self.model.beta, topic_doc_list + + def _validation(self, loader): + """Train epoch.""" + self.model.eval() + val_loss = 0 + samples_processed = 0 + for batch_samples in loader: + # batch_size x vocab_size + X = batch_samples['X'] + X = X.reshape(X.shape[0], -1) + X_bert = batch_samples['X_bert'] + + if self.USE_CUDA: + X = X.cuda() + X_bert = X_bert.cuda() + + # forward pass + self.model.zero_grad() + loss = self.model(X, X_bert) + + # compute train loss + samples_processed += X.size()[0] + val_loss += loss.item() + + val_loss /= samples_processed + + return samples_processed, val_loss + + def fit(self, train_dataset, validation_dataset=None, + save_dir=None, verbose=True): + """ + Train the RLTM model. + + :param train_dataset: PyTorch Dataset class for training data. + :param validation_dataset: PyTorch Dataset class for validation data + :param save_dir: directory to save checkpoint models to. 
+ :param verbose: verbose + """ + # Print settings to output file + if verbose: + print("Settings: \n\ + N Components: {}\n\ + Hidden Sizes: {}\n\ + Activation: {}\n\ + Inference Dropout: {}\n\ + Policy Dropout: {}\n\ + Batch Size: {}\n\ + Learning Rate: {}\n\ + Momentum: {}\n\ + Reduce On Plateau: {}\n\ + Save Dir: {}".format( + self.num_topics, self.hidden_sizes, self.activation, + self.inference_dropout, self.policy_dropout, self.batch_size, + self.lr, self.momentum, self.reduce_on_plateau, save_dir)) + + self.model_dir = save_dir + self.train_data = train_dataset + self.validation_data = validation_dataset + + train_loader = DataLoader( + self.train_data, batch_size=self.batch_size, shuffle=True, + num_workers=self.num_data_loader_workers) + + # init training variables + train_loss = 0 + samples_processed = 0 + + # train loop + for epoch in range(self.num_epochs): + self.nn_epoch = epoch + # train epoch + s = datetime.datetime.now() + sp, train_loss, topic_word, topic_document = self._train_epoch( + train_loader) + samples_processed += sp + e = datetime.datetime.now() + + if verbose: + print("Epoch: [{}/{}]\tSamples: [{}/{}]\tTrain Loss: {}\tTime: {}".format( + epoch + 1, self.num_epochs, samples_processed, + len(self.train_data) * self.num_epochs, train_loss, e - s)) + + self.best_components = self.model.beta + self.final_topic_word = topic_word + self.final_topic_document = topic_document + self.best_loss_train = train_loss + if self.validation_data is not None: + validation_loader = DataLoader( + self.validation_data, batch_size=self.batch_size, + shuffle=True, num_workers=self.num_data_loader_workers) + # train epoch + s = datetime.datetime.now() + val_samples_processed, val_loss = self._validation( + validation_loader) + e = datetime.datetime.now() + + if verbose: + print( + "Epoch: [{}/{}]\tSamples: [{}/{}]" + "\tValidation Loss: {}\tTime: {}".format( + epoch + 1, self.num_epochs, val_samples_processed, + len(self.validation_data) * self.num_epochs, + val_loss, e - s)) + + if np.isnan(val_loss) or np.isnan(train_loss): + break + else: + self.early_stopping(val_loss, self.model) + if self.early_stopping.early_stop: + if verbose: + print("Early stopping") + if save_dir is not None: + self.save(save_dir) + break + + def predict(self, dataset): + """Predict input.""" + self.model.eval() + + loader = DataLoader(dataset, batch_size=self.batch_size, shuffle=False, + num_workers=self.num_data_loader_workers) + + topic_document_mat = [] + with torch.no_grad(): + for batch_samples in loader: + # batch_size x vocab_size + X = batch_samples['X'] + X = X.reshape(X.shape[0], -1) + X_bert = batch_samples['X_bert'] + + if self.USE_CUDA: + X = X.cuda() + X_bert = X_bert.cuda() + # forward pass + self.model.zero_grad() + _ = self.model(X, X_bert) + topic_document_mat.append(F.softmax(self.model.action, dim=-1)) + + results = self.get_info() + results['test-topic-document-matrix'] = np.asarray( + self.get_thetas(dataset)).T + + return results + + def get_topic_word_mat(self): + top_wor = self.final_topic_word.cpu().detach().numpy() + return top_wor + + def get_topic_document_mat(self): + top_doc = self.final_topic_document + top_doc_arr = np.array([i.cpu().detach().numpy() for i in top_doc]) + return top_doc_arr + + def get_topics(self): + """ + Retrieve topic words. + + """ + assert self.top_words <= self.input_size, "top_words must be <= input size." 
# noqa + component_dists = self.best_components + topics = defaultdict(list) + topics_list = [] + if self.num_topics is not None: + for i in range(self.num_topics): + _, idxs = torch.topk(component_dists[i], self.top_words) + component_words = [self.train_data.idx2token[idx] + for idx in idxs.cpu().numpy()] + topics[i] = component_words + topics_list.append(component_words) + + return topics_list + + def get_info(self): + info = {} + topic_word = self.get_topics() + topic_word_dist = self.get_topic_word_mat() + topic_document_dist = self.get_topic_document_mat() + info['topics'] = topic_word + + info['topic-document-matrix'] = np.asarray( + self.get_thetas(self.train_data)).T + + info['topic-word-matrix'] = topic_word_dist + return info + + def _format_file(self): + model_dir = ( + "RLTM_nc_{}_hs_{}_ac_{}_id_{}_" + "pd_{}_bs_{}_lr_{}_mo_{}_rp_{}".format( + self.num_topics, self.hidden_sizes, self.activation, + self.inference_dropout, self.policy_dropout, self.batch_size, + self.lr, self.momentum, self.reduce_on_plateau)) + return model_dir + + def save(self, models_dir=None): + """ + Save model. + + :param models_dir: path to directory for saving NN models. + """ + if (self.model is not None) and (models_dir is not None): + + model_dir = self._format_file() + if not os.path.isdir(os.path.join(models_dir, model_dir)): + os.makedirs(os.path.join(models_dir, model_dir)) + + filename = "epoch_{}".format(self.nn_epoch) + '.pth' + fileloc = os.path.join(models_dir, model_dir, filename) + with open(fileloc, 'wb') as file: + torch.save({'state_dict': self.model.state_dict(), + 'dcue_dict': self.__dict__}, file) + + def load(self, model_dir, epoch): + """ + Load a previously trained model. + + :param model_dir: directory where models are saved. + :param epoch: epoch of model to load. + """ + epoch_file = "epoch_" + str(epoch) + ".pth" + model_file = os.path.join(model_dir, epoch_file) + with open(model_file, 'rb') as model_dict: + checkpoint = torch.load(model_dict) + + for (k, v) in checkpoint['dcue_dict'].items(): + setattr(self, k, v) + + self.model.load_state_dict(checkpoint['state_dict']) + + def get_thetas(self, dataset): + """ + Get the document-topic distribution for a dataset of topics. + Includes multiple sampling to reduce variation via + the parameter num_samples. 
+ :param dataset: a PyTorch Dataset containing the documents + """ + self.model.eval() + + loader = DataLoader( + dataset, batch_size=self.batch_size, shuffle=False, + num_workers=self.num_data_loader_workers) + final_thetas = [] + for sample_index in range(self.num_samples): + with torch.no_grad(): + collect_theta = [] + for batch_samples in loader: + # batch_size x vocab_size + x_bert = batch_samples['X_bert'] + if self.USE_CUDA: + x_bert = x_bert.cuda() + # forward pass + self.model.zero_grad() + collect_theta.extend( + self.model.get_topic_distribution(x_bert).cpu().numpy().tolist()) + + final_thetas.append(np.array(collect_theta)) + return np.sum(final_thetas, axis=0) / self.num_samples diff --git a/octis/models/rl_for_topic_models/networks/__init__.py b/octis/models/rl_for_topic_models/networks/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/octis/models/rl_for_topic_models/networks/decoder_network.py b/octis/models/rl_for_topic_models/networks/decoder_network.py new file mode 100644 index 00000000..761c3ca5 --- /dev/null +++ b/octis/models/rl_for_topic_models/networks/decoder_network.py @@ -0,0 +1,131 @@ +"""PyTorch class for feed-forward decoder network.""" + +import math +import torch +from torch import nn +from torch.nn import functional as F +from torch.distributions.normal import Normal +import numpy as np + +from octis.models.rl_for_topic_models.networks.inference_network import InferenceNetwork + + +class DecoderNetwork(nn.Module): + + """AVITM Network.""" + + def __init__(self, input_size, bert_size, n_components=10, + hidden_sizes=(128, 128), activation='gelu', inference_dropout=0.2, + policy_dropout=0.0, kl_multiplier=1.0): + """ + Initialize InferenceNetwork. + + Args + input_size : int, dimension of input + bert_size : int, dimension of BERT input + n_components : int, number of topic components, (default 10) + hidden_sizes : tuple, length = n_layers, (default (128, 128)) + activation : string, default 'gelu' + inference_dropout : float, inference dropout to use (default 0.2) + policy_dropout : float, policy dropout to use (default 0.0) + kl_multiplier : float or int, multiplier on the KL divergence (default 1.0) + """ + super(DecoderNetwork, self).__init__() + assert isinstance(input_size, int), "input_size must by type int." + assert isinstance(bert_size, int), "input_size must by type int." + assert (isinstance(n_components, int) or isinstance(n_components, np.int64)) and n_components > 0, \ + "n_components must be type int > 0." + assert isinstance(hidden_sizes, tuple), \ + "hidden_sizes must be type tuple." + assert activation in ['softplus', 'relu', 'sigmoid', 'tanh', 'leakyrelu', + 'rrelu', 'elu', 'selu', 'gelu'], \ + "activation must be 'softplus', 'relu', 'sigmoid', 'tanh'," \ + " 'leakyrelu', 'rrelu', 'elu', 'selu', or 'gelu'." + assert inference_dropout >= 0, "inference dropout must be >= 0." + assert policy_dropout >= 0, "policy dropout must be >= 0." 
+ assert isinstance(kl_multiplier, float) or isinstance(kl_multiplier, int), \ + "kl_multiplier must be a float or int" + + self.n_components = n_components + self.kl_multiplier = float(kl_multiplier) + + self.mu_inference = InferenceNetwork( + bert_size, n_components, hidden_sizes, activation=activation, dropout=inference_dropout) + self.log_sigma_inference = InferenceNetwork( + bert_size, n_components, hidden_sizes, activation=activation, dropout=inference_dropout) + + if torch.cuda.is_available(): + self.mu_inference = self.mu_inference.cuda() + self.log_sigma_inference = self.log_sigma_inference.cuda() + + self.prior_mean = torch.Tensor(torch.zeros(n_components)) + if torch.cuda.is_available(): + self.prior_mean = nn.Parameter(self.prior_mean.cuda()) + self.prior_mean = nn.Parameter(self.prior_mean) + + self.prior_variance = torch.Tensor(torch.ones(n_components)) + if torch.cuda.is_available(): + self.prior_variance = self.prior_variance.cuda() + self.prior_variance = nn.Parameter(self.prior_variance) + + self.beta = 0.02 * torch.randn((n_components, input_size)) + if torch.cuda.is_available(): + self.beta = self.beta.cuda() + self.beta = nn.Parameter(self.beta) + + self.posterior_log_sigma_norm = nn.LayerNorm(n_components, elementwise_affine=False) + self.dropout = nn.Dropout(p=policy_dropout) + self.beta_norm = nn.LayerNorm(input_size, elementwise_affine=False) + + def kl_divergence(self, p_mean, p_variance, q_mean, q_variance): + var_division = torch.sum(p_variance ** 2 / q_variance ** 2, dim=-1) + diff_term = torch.sum((q_mean - p_mean) ** 2 / q_variance ** 2, dim=-1) + logvar_det_division = torch.sum(torch.log(q_variance ** 2) - torch.log(p_variance ** 2), dim=-1) + return 0.5 * (var_division + diff_term - self.n_components + logvar_det_division) + + def loss_fn(self, bow, word_dist, posterior_mu, posterior_log_sigma, epsilon=1e-8): + # forward KL divergence + unscaled_kl = self.kl_divergence(posterior_mu, torch.exp(posterior_log_sigma), + self.prior_mean, self.prior_variance) + + kl = self.kl_multiplier * unscaled_kl + + # reconstruction loss (log likelihood) + nll = -1.0 * torch.sum(bow * torch.log(word_dist + epsilon), dim=-1) + + reward = nll + kl + return reward.sum() + + def forward(self, x_bow, x_bert): + """Forward pass.""" + # inference networks + posterior_mu = self.mu_inference(x_bert) + posterior_log_sigma_unnormalized = self.log_sigma_inference(x_bert) + posterior_log_sigma = self.posterior_log_sigma_norm(posterior_log_sigma_unnormalized) + posterior_distribution = Normal(posterior_mu, torch.exp(posterior_log_sigma)) + + # RL policy + action = posterior_distribution.rsample() + self.action = action + policy = (1 / (torch.exp(posterior_log_sigma) * math.sqrt(2 * math.pi))) \ + * torch.exp(-1.0 * (action - posterior_mu) ** 2 / (2 * torch.exp(posterior_log_sigma) ** 2)) + policy = self.dropout(policy) + + # product of experts + word_dist = F.softmax(self.beta_norm(torch.matmul(policy, self.beta)), dim=-1) + + # loss + loss = self.loss_fn(x_bow, word_dist, posterior_mu, posterior_log_sigma) + return loss + + def get_topic_distribution(self, x_bert): + with torch.no_grad(): + # inference networks + posterior_mu = self.mu_inference(x_bert) + posterior_log_sigma_unnormalized = self.log_sigma_inference(x_bert) + posterior_log_sigma = self.posterior_log_sigma_norm(posterior_log_sigma_unnormalized) + posterior_distribution = Normal(posterior_mu, torch.exp(posterior_log_sigma)) + + action = posterior_distribution.rsample() + softmax_action = F.softmax(action, dim=-1) + return 
softmax_action diff --git a/octis/models/rl_for_topic_models/networks/inference_network.py b/octis/models/rl_for_topic_models/networks/inference_network.py new file mode 100644 index 00000000..7c6ebd0b --- /dev/null +++ b/octis/models/rl_for_topic_models/networks/inference_network.py @@ -0,0 +1,71 @@ +"""PyTorch class for feed-forward inference network.""" + +from collections import OrderedDict +from torch import nn +import torch +import numpy as np + +class InferenceNetwork(nn.Module): + + """Inference Network.""" + + def __init__(self, bert_size, output_size, hidden_sizes, + activation='gelu', dropout=0.2): + """ + Initialize InferenceNetwork. + + Args + bert_size : int, dimension of BERT input + output_size : int, dimension of output + hidden_sizes : tuple, length = n_layers + activation : string, default 'gelu' + dropout : float, default 0.2 + """ + super(InferenceNetwork, self).__init__() + assert isinstance(bert_size, int), "input_size must by type int." + assert isinstance(output_size, int) or isinstance(output_size, np.int64), "output_size must be type int." + assert isinstance(hidden_sizes, tuple), \ + "hidden_sizes must be type tuple." + assert activation in ['softplus', 'relu', 'sigmoid', 'tanh', 'leakyrelu', + 'rrelu', 'elu', 'selu', 'gelu'], \ + "activation must be 'softplus', 'relu', 'sigmoid', 'tanh'," \ + " 'leakyrelu', 'rrelu', 'elu', 'selu', or 'gelu'." + assert dropout >= 0, "dropout must be >= 0." + + if activation == 'softplus': + self.activation = nn.Softplus() + elif activation == 'relu': + self.activation = nn.ReLU() + elif activation == 'sigmoid': + self.activation = nn.Sigmoid() + elif activation == 'tanh': + self.activation = nn.Tanh() + elif activation == 'leakyrelu': + self.activation = nn.LeakyReLU() + elif activation == 'rrelu': + self.activation = nn.RReLU() + elif activation == 'elu': + self.activation = nn.ELU() + elif activation == 'selu': + self.activation = nn.SELU() + elif activation == 'gelu': + self.activation = nn.GELU() + + self.adapt_bert = nn.Linear(bert_size, hidden_sizes[0]) + self.dropout = nn.Dropout(p=dropout) + + self.hiddens = nn.Sequential(OrderedDict([ + (f"l_{i}", nn.Sequential(nn.Linear(h_in, h_out), self.activation, nn.Dropout(p=dropout))) + for i, (h_in, h_out) in enumerate(zip(hidden_sizes[:-1], hidden_sizes[1:]))])) + + self.output = nn.Linear(hidden_sizes[-1], output_size) + + def forward(self, x_bert): + """Forward pass.""" + x_bert = self.adapt_bert(x_bert) + x = self.activation(x_bert) + x = self.dropout(x) + x = self.hiddens(x) + out = self.output(x) + + return out diff --git a/octis/models/rl_for_topic_models/utils/__init__.py b/octis/models/rl_for_topic_models/utils/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/octis/models/rl_for_topic_models/utils/data_preparation.py b/octis/models/rl_for_topic_models/utils/data_preparation.py new file mode 100644 index 00000000..aa7a517a --- /dev/null +++ b/octis/models/rl_for_topic_models/utils/data_preparation.py @@ -0,0 +1,151 @@ +import numpy as np +from sentence_transformers import SentenceTransformer +import scipy.sparse +import warnings +from octis.models.rl_for_topic_models.datasets.dataset import RLTMDataset +import os +import pickle as pkl + +def get_bag_of_words(data, min_length): + """ + Creates the bag of words + """ + vect = [np.bincount(x[x != np.array(None)].astype('int'), minlength=min_length) + for x in data if np.sum(x[x != np.array(None)]) != 0] + + vect = scipy.sparse.csr_matrix(vect) + return vect + +def bert_embeddings_from_file(text_file, 
sbert_model_to_load, batch_size=200):
+    """
+    Creates SBERT Embeddings from an input file
+    """
+    model = SentenceTransformer(sbert_model_to_load)
+    with open(text_file, encoding="utf-8") as filino:
+        train_text = list(map(lambda x: x, filino.readlines()))
+
+    return np.array(model.encode(train_text, show_progress_bar=True, batch_size=batch_size))
+
+
+def bert_embeddings_from_list(texts, sbert_model_to_load="bert-base-nli-mean-tokens", batch_size=100):
+    """
+    Creates SBERT Embeddings from a list
+    """
+    model = SentenceTransformer(sbert_model_to_load)
+    return np.array(model.encode(texts, show_progress_bar=True, batch_size=batch_size))
+
+
+class QuickText:
+    """
+    Integrated class to handle all the text preprocessing needed
+    """
+    def __init__(self, bert_model, text_for_bow, text_for_bert=None, bert_path=None):
+        """
+        :param bert_model: string, bert model to use
+        :param text_for_bert: list, list of sentences with the unpreprocessed text
+        :param text_for_bow: list, list of sentences with the preprocessed text
+        """
+        self.vocab_dict = {}
+        self.vocab = []
+        self.index_dd = None
+        self.idx2token = None
+        self.bow = None
+        self.bert_model = bert_model
+        self.text_handler = ""
+        self.data_bert = None
+        self.text_for_bow = text_for_bow
+
+        if text_for_bert is not None:
+            self.text_for_bert = text_for_bert
+        else:
+            self.text_for_bert = None
+        self.bert_path = bert_path
+
+    def prepare_bow(self):
+        indptr = [0]
+        indices = []
+        data = []
+        vocabulary = {}
+
+        if self.text_for_bow is not None:
+            docs = self.text_for_bow
+        else:
+            docs = self.text_for_bert
+
+        for d in docs:
+            for term in d.split():
+                index = vocabulary.setdefault(term, len(vocabulary))
+                indices.append(index)
+                data.append(1)
+            indptr.append(len(indices))
+
+        self.vocab_dict = vocabulary
+        self.vocab = list(vocabulary.keys())
+
+        self.idx2token = {v: k for (k, v) in self.vocab_dict.items()}
+        self.bow = scipy.sparse.csr_matrix((data, indices, indptr), dtype=int)
+
+    def load_contextualized_embeddings(self, embeddings):
+        self.data_bert = embeddings
+
+    def load_dataset(self):
+        self.prepare_bow()
+        if self.bert_path is not None:
+            if os.path.exists(self.bert_path):
+                self.data_bert = pkl.load(open(self.bert_path, 'rb'))
+            else:
+                if self.data_bert is None:
+                    if self.text_for_bert is not None:
+                        self.data_bert = bert_embeddings_from_list(self.text_for_bert, self.bert_model)
+                    else:
+                        self.data_bert = bert_embeddings_from_list(self.text_for_bow, self.bert_model)
+                pkl.dump(self.data_bert, open(self.bert_path, 'wb'))
+
+        training_dataset = RLTMDataset(self.bow, self.data_bert, self.idx2token)
+        return training_dataset
+
+class TextHandler:
+    """
+    Class used to handle the text preparation and the BagOfWord
+    """
+    def __init__(self, file_name=None, sentences=None):
+        self.file_name = file_name
+        self.sentences = sentences
+        self.vocab_dict = {}
+        self.vocab = []
+        self.index_dd = None
+        self.idx2token = None
+        self.bow = None
+
+        warnings.simplefilter('always', DeprecationWarning)
+        warnings.warn("TextHandler class is deprecated and will be removed in version 2.0.
Use QuickText.", Warning) + + def prepare(self): + indptr = [0] + indices = [] + data = [] + vocabulary = {} + + if self.sentences is None and self.file_name is None: + raise Exception("Sentences and file_names cannot both be none") + + if self.sentences is not None: + docs = self.sentences + elif self.file_name is not None: + with open(self.file_name, encoding="utf-8") as filino: + docs = filino.readlines() + else: + raise Exception("One parameter between sentences and file_name should be selected") + + for d in docs: + for term in d.split(): + index = vocabulary.setdefault(term, len(vocabulary)) + indices.append(index) + data.append(1) + indptr.append(len(indices)) + + self.vocab_dict = vocabulary + self.vocab = list(vocabulary.keys()) + + self.idx2token = {v: k for (k, v) in self.vocab_dict.items()} + self.bow = scipy.sparse.csr_matrix((data, indices, indptr), dtype=int) diff --git a/octis/models/rl_for_topic_models/utils/preprocessing.py b/octis/models/rl_for_topic_models/utils/preprocessing.py new file mode 100644 index 00000000..c6cc5062 --- /dev/null +++ b/octis/models/rl_for_topic_models/utils/preprocessing.py @@ -0,0 +1,58 @@ +from sklearn.feature_extraction.text import CountVectorizer +import string +from nltk.corpus import stopwords as stop_words +import warnings + +class WhiteSpacePreprocessing(): + """ + Provides a very simple preprocessing script that filters infrequent tokens from text + """ + def __init__(self, documents, stopwords_language="english", vocabulary_size=2000): + """ + + :param documents: list of strings + :param stopwords_language: string of the language of the stopwords (see nltk stopwords) + :param vocabulary_size: the number of most frequent words to include in the documents. Infrequent words will be discarded from the list of preprocessed documents + """ + self.documents = documents + self.stopwords = set(stop_words.words(stopwords_language)) + self.vocabulary_size = vocabulary_size + + def preprocess(self): + """ + Note that if after filtering some documents do not contain words we remove them. That is why we return also the + list of unpreprocessed documents. 
+ + :return: preprocessed documents, unpreprocessed documents and the vocabulary list + """ + preprocessed_docs_tmp = self.documents + preprocessed_docs_tmp = [doc.lower() for doc in preprocessed_docs_tmp] + preprocessed_docs_tmp = [doc.translate( + str.maketrans(string.punctuation, ' ' * len(string.punctuation))) for doc in preprocessed_docs_tmp] + preprocessed_docs_tmp = [' '.join([w for w in doc.split() if len(w) > 0 and w not in self.stopwords]) + for doc in preprocessed_docs_tmp] + + vectorizer = CountVectorizer(max_features=self.vocabulary_size, token_pattern=r'\b[a-zA-Z]{2,}\b') + vectorizer.fit_transform(preprocessed_docs_tmp) + vocabulary = set(vectorizer.get_feature_names()) + preprocessed_docs_tmp = [' '.join([w for w in doc.split() if w in vocabulary]) + for doc in preprocessed_docs_tmp] + + preprocessed_docs, unpreprocessed_docs = [], [] + for i, doc in enumerate(preprocessed_docs_tmp): + if len(doc) > 0: + preprocessed_docs.append(doc) + unpreprocessed_docs.append(self.documents[i]) + + return preprocessed_docs, unpreprocessed_docs, list(vocabulary) + + +class SimplePreprocessing(WhiteSpacePreprocessing): + def __init__(self, documents, stopwords_language="english"): + super().__init__(documents, stopwords_language) + warnings.simplefilter('always', DeprecationWarning) + + if self.__class__.__name__ == "CTM": + + warnings.warn("SimplePrepocessing is deprecated and will be removed in version 2.0, " + "use WhiteSpacePreprocessing", DeprecationWarning) From fe2935ba93e6a882c08acc10fec7bf568ea49ac7 Mon Sep 17 00:00:00 2001 From: Jeremy Costello Date: Mon, 14 Aug 2023 14:03:31 -0230 Subject: [PATCH 3/6] update comment --- octis/models/rl_for_topic_models/networks/decoder_network.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/octis/models/rl_for_topic_models/networks/decoder_network.py b/octis/models/rl_for_topic_models/networks/decoder_network.py index 761c3ca5..8a150ef0 100644 --- a/octis/models/rl_for_topic_models/networks/decoder_network.py +++ b/octis/models/rl_for_topic_models/networks/decoder_network.py @@ -12,7 +12,7 @@ class DecoderNetwork(nn.Module): - """AVITM Network.""" + """RLTM Network.""" def __init__(self, input_size, bert_size, n_components=10, hidden_sizes=(128, 128), activation='gelu', inference_dropout=0.2, From dd6b053d6113250c11dbd0017e632ddf7260a948 Mon Sep 17 00:00:00 2001 From: Jeremy Costello Date: Mon, 14 Aug 2023 16:18:11 -0230 Subject: [PATCH 4/6] some changes --- octis/models/RLTM.py | 35 +++++++++++-------- .../models/rl_for_topic_models/models/rltm.py | 22 ++++++------ .../networks/decoder_network.py | 13 +++++++ .../utils/data_preparation.py | 2 +- tests/test_octis.py | 3 ++ 5 files changed, 48 insertions(+), 27 deletions(-) diff --git a/octis/models/RLTM.py b/octis/models/RLTM.py index 916cc165..8e34837c 100644 --- a/octis/models/RLTM.py +++ b/octis/models/RLTM.py @@ -20,7 +20,8 @@ def __init__( inference_dropout=0.2, policy_dropout=0.0, batch_size=256, lr=3e-4, momentum=0.9, solver='adamw', num_epochs=200, num_samples=10, seed=None, use_partitions=True, reduce_on_plateau=False, bert_path="", - bert_model="all-MiniLM-L6-v2", weight_decay=0.01, kl_multiplier=1.0): + bert_model="all-MiniLM-L6-v2", weight_decay=0.01, kl_multiplier=1.0, + grad_norm_clip=1.0): """ initialization of RLTM @@ -53,28 +54,30 @@ def __init__( :param weight_decay: float, L2 regularization on model weights :param kl_multiplier: float or int, multiplier on the KL divergence (default 1.0) + :param grad_norm_clip: float or None; clip gradient norms (default 
1.0) """ super().__init__() - self.hyperparameters['num_topics'] = num_topics - self.hyperparameters['activation'] = activation - self.hyperparameters['inference_dropout'] = inference_dropout - self.hyperparameters['policy_dropout'] = policy_dropout - self.hyperparameters['batch_size'] = batch_size - self.hyperparameters['lr'] = lr - self.hyperparameters['num_samples'] = num_samples - self.hyperparameters['momentum'] = momentum - self.hyperparameters['solver'] = solver - self.hyperparameters['num_epochs'] = num_epochs - self.hyperparameters['reduce_on_plateau'] = reduce_on_plateau + self.hyperparameters["num_topics"] = num_topics + self.hyperparameters["activation"] = activation + self.hyperparameters["inference_dropout"] = inference_dropout + self.hyperparameters["policy_dropout"] = policy_dropout + self.hyperparameters["batch_size"] = batch_size + self.hyperparameters["lr"] = lr + self.hyperparameters["num_samples"] = num_samples + self.hyperparameters["momentum"] = momentum + self.hyperparameters["solver"] = solver + self.hyperparameters["num_epochs"] = num_epochs + self.hyperparameters["reduce_on_plateau"] = reduce_on_plateau self.hyperparameters["num_neurons"] = num_neurons self.hyperparameters["bert_path"] = bert_path self.hyperparameters["num_layers"] = num_layers self.hyperparameters["bert_model"] = bert_model self.hyperparameters["seed"] = seed self.hyperparameters["weight_decay"] = weight_decay - self.hyperparameters['kl_multiplier'] = kl_multiplier + self.hyperparameters["kl_multiplier"] = kl_multiplier + self.hyperparameters["grad_norm_clip"] = grad_norm_clip self.use_partitions = use_partitions hidden_sizes = tuple([num_neurons for _ in range(num_layers)]) @@ -83,7 +86,7 @@ def __init__( self.model = None self.vocab = None - def train_model(self, dataset, hyperparameters=None, top_words=10): + def train_model(self, dataset, hyperparameters=None, top_words=10, verbose=False): """ trains RLTM model @@ -132,9 +135,10 @@ def train_model(self, dataset, hyperparameters=None, top_words=10): reduce_on_plateau=self.hyperparameters['reduce_on_plateau'], weight_decay=self.hyperparameters['weight_decay'], kl_multiplier=self.hyperparameters['kl_multiplier'], + grad_norm_clip=self.hyperparameters['grad_norm_clip'], top_words=top_words) - self.model.fit(x_train, x_valid, verbose=False) + self.model.fit(x_train, x_valid, verbose=verbose) result = self.inference(x_test) return result @@ -162,6 +166,7 @@ def train_model(self, dataset, hyperparameters=None, top_words=10): reduce_on_plateau=self.hyperparameters['reduce_on_plateau'], weight_decay=self.hyperparameters['weight_decay'], kl_multiplier=self.hyperparameters['kl_multiplier'], + grad_norm_clip=self.hyperparameters['grad_norm_clip'], top_words=top_words) self.model.fit(x_train, None, verbose=False) diff --git a/octis/models/rl_for_topic_models/models/rltm.py b/octis/models/rl_for_topic_models/models/rltm.py index d53698f7..3a4d00df 100644 --- a/octis/models/rl_for_topic_models/models/rltm.py +++ b/octis/models/rl_for_topic_models/models/rltm.py @@ -12,7 +12,6 @@ from octis.models.rl_for_topic_models.networks.decoder_network import ( DecoderNetwork) -from octis.models.early_stopping.pytorchtools import EarlyStopping class RLTM(object): @@ -24,7 +23,8 @@ def __init__( activation='gelu', inference_dropout=0.2, policy_dropout=0.0, batch_size=256, lr=3e-4, momentum=0.9, solver='adamw', num_epochs=200, num_samples=10, reduce_on_plateau=False, top_words=10, - num_data_loader_workers=0, weight_decay=0.01, kl_multiplier=1.0): + 
num_data_loader_workers=0, weight_decay=0.01, kl_multiplier=1.0, + grad_norm_clip=1.0): """ :param input_size: int, dimension of input @@ -49,6 +49,7 @@ def __init__( :param weight_decay: float, L2 regularization on model weights (default 0.01) :param kl_multiplier: float or int, multiplier on the KL divergence (default 1.0) + :param grad_norm_clip: float or None; clip gradient norms (default 1.0) """ assert isinstance(input_size, int) and input_size > 0, \ @@ -89,6 +90,10 @@ def __init__( assert weight_decay >= 0, "weight_decay must be >= 0" assert isinstance(kl_multiplier, float) or isinstance(kl_multiplier, int), \ "kl_multiplier must be a float or int" + assert isinstance(grad_norm_clip, float) or grad_norm_clip is None, \ + "grad_norm_clip must be a float or None" + if grad_norm_clip is not None: + assert grad_norm_clip > 0, "grad_norm_clip must be > 0" self.input_size = input_size self.num_topics = num_topics @@ -105,12 +110,12 @@ def __init__( self.num_epochs = num_epochs self.reduce_on_plateau = reduce_on_plateau self.num_data_loader_workers = num_data_loader_workers + self.grad_norm_clip = grad_norm_clip # init decoder network model = DecoderNetwork( input_size, bert_size, num_topics, hidden_sizes, activation, inference_dropout, policy_dropout, kl_multiplier) - self.early_stopping = EarlyStopping(patience=5, verbose=False) # init optimizer if self.solver == 'adamw': @@ -217,6 +222,8 @@ def _train_epoch(self, loader): # backward pass loss.backward() + if self.grad_norm_clip is not None: + nn.utils.clip_grad_norm_(self.model.parameters(), self.grad_norm_clip) self.optimizer.step() # compute train loss @@ -331,15 +338,8 @@ def fit(self, train_dataset, validation_dataset=None, val_loss, e - s)) if np.isnan(val_loss) or np.isnan(train_loss): + print("loss is NaN") break - else: - self.early_stopping(val_loss, self.model) - if self.early_stopping.early_stop: - if verbose: - print("Early stopping") - if save_dir is not None: - self.save(save_dir) - break def predict(self, dataset): """Predict input.""" diff --git a/octis/models/rl_for_topic_models/networks/decoder_network.py b/octis/models/rl_for_topic_models/networks/decoder_network.py index 8a150ef0..6c0e151d 100644 --- a/octis/models/rl_for_topic_models/networks/decoder_network.py +++ b/octis/models/rl_for_topic_models/networks/decoder_network.py @@ -76,6 +76,19 @@ def __init__(self, input_size, bert_size, n_components=10, self.posterior_log_sigma_norm = nn.LayerNorm(n_components, elementwise_affine=False) self.dropout = nn.Dropout(p=policy_dropout) self.beta_norm = nn.LayerNorm(input_size, elementwise_affine=False) + + self.apply(self._init_weights) + + def _init_weights(self, module): + if isinstance(module, nn.Linear): + nn.init.normal_(module.weight, mean=0.0, std=0.02) + if module.bias is not None: + nn.init.zeros_(module.bias) + elif isinstance(module, nn.LayerNorm): + if module.bias is not None: + nn.init.zeros_(module.bias) + if module.weight is not None: + nn.init.ones_(module.weight) def kl_divergence(self, p_mean, p_variance, q_mean, q_variance): var_division = torch.sum(p_variance ** 2 / q_variance ** 2, dim=-1) diff --git a/octis/models/rl_for_topic_models/utils/data_preparation.py b/octis/models/rl_for_topic_models/utils/data_preparation.py index aa7a517a..9a55fd59 100644 --- a/octis/models/rl_for_topic_models/utils/data_preparation.py +++ b/octis/models/rl_for_topic_models/utils/data_preparation.py @@ -27,7 +27,7 @@ def bert_embeddings_from_file(text_file, sbert_model_to_load, batch_size=200): return 
np.array(model.encode(train_text, show_progress_bar=True, batch_size=batch_size)) -def bert_embeddings_from_list(texts, sbert_model_to_load="bert-base-nli-mean-tokens", batch_size=100): +def bert_embeddings_from_list(texts, sbert_model_to_load="all-MiniLM-L6-v2", batch_size=100): """ Creates SBERT Embeddings from a list """ diff --git a/tests/test_octis.py b/tests/test_octis.py index 7c70378c..e96e3f9f 100644 --- a/tests/test_octis.py +++ b/tests/test_octis.py @@ -577,6 +577,7 @@ def test_model_output_prodlda_not_partitioned(data_dir): num_topics, len(dataset.get_corpus())) +@pytest.mark.rltm def test_model_output_rltm_seeded(data_dir): dataset = Dataset() dataset.load_custom_dataset_from_folder(data_dir + '/M10') @@ -592,6 +593,7 @@ def test_model_output_rltm_seeded(data_dir): assert output['topics'] == output_2['topics'] +@pytest.mark.rltm def test_model_output_rltm(data_dir): dataset = Dataset() dataset.load_custom_dataset_from_folder(data_dir + '/M10') @@ -622,6 +624,7 @@ def test_model_output_rltm(data_dir): num_topics, len(dataset.get_partitioned_corpus()[2])) +@pytest.mark.rltm def test_model_output_rltm_not_partition(data_dir): dataset = Dataset() dataset.load_custom_dataset_from_folder(data_dir + '/M10') From 6dca30e57bb53c1cd11b48079aca4120d7d585e6 Mon Sep 17 00:00:00 2001 From: jeremy-costello <37478736+jeremy-costello@users.noreply.github.com> Date: Wed, 6 Sep 2023 14:17:57 -0230 Subject: [PATCH 5/6] Add RLTM to README.rst --- README.rst | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/README.rst b/README.rst index 3d49a267..fed2855f 100644 --- a/README.rst +++ b/README.rst @@ -263,6 +263,8 @@ Available Models +-------------------------------------------+-----------------------------------------------------------+ | ProdLda `(Srivastava and Sutton 2017)`_ | https://github.com/estebandito22/PyTorchAVITM | +-------------------------------------------+-----------------------------------------------------------+ +| RLTM `(Costello and Reformat 2023)`_ | https://github.com/jeremy-costello/rl-for-topic-models | ++-------------------------------------------+-----------------------------------------------------------+ .. _(Bianchi et al. 2021): https://www.aclweb.org/anthology/2021.eacl-main.143/ @@ -271,7 +273,8 @@ Available Models .. _(Blei et al. 2003): https://www.jmlr.org/papers/volume3/blei03a/blei03a.pdf .. _(Landauer et al. 1998): http://lsa.colorado.edu/papers/dp1.LSAintro.pdf .. _(Lee and Seung 2000): https://papers.nips.cc/paper/1861-algorithms-for-non-negative-matrix-factorization -.. _(Srivastava and Sutton 2017): https://arxiv.org/abs/1703.01488 +.. _(Srivastava and Sutton 2017): https://arxiv.org/abs/1703.01488 +.. _(Costello and Reformat 2023): https://aclanthology.org/2023.findings-acl.265/ If you use one of these implementations, make sure to cite the right paper. 
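
For context before the final cleanup commit, here is a minimal usage sketch of the RLTM wrapper added in this series. It mirrors the calls exercised in the tests above (Dataset loading, RLTM construction, train_model); the dataset folder path and the num_topics/num_epochs values are illustrative rather than taken from the patches, and the SBERT model name simply matches the new default.

    from octis.dataset.dataset import Dataset
    from octis.models.RLTM import RLTM

    # Load a preprocessed OCTIS dataset (folder path here is illustrative).
    dataset = Dataset()
    dataset.load_custom_dataset_from_folder("preprocessed_datasets/M10")

    # Example hyperparameter values; bert_model matches the default used in this series.
    model = RLTM(num_topics=10, num_epochs=50, bert_model="all-MiniLM-L6-v2")
    output = model.train_model(dataset, top_words=10)

    # The output dict carries 'topics', 'topic-word-matrix' and, when the dataset
    # is partitioned, the test document-topic matrix as well.
    print(output["topics"][:3])
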
From eea59710db4f8dd0ac7bc0283630808646c26223 Mon Sep 17 00:00:00 2001 From: Jeremy Costello Date: Wed, 6 Sep 2023 15:58:15 -0230 Subject: [PATCH 6/6] changes to pass tests --- MANIFEST.in | 1 + octis/models/RLTM.py | 28 ++++--- .../rl_for_topic_models/datasets/dataset.py | 6 +- .../models/rl_for_topic_models/models/rltm.py | 82 +++++++++++-------- .../networks/decoder_network.py | 82 ++++++++++++------- .../networks/inference_network.py | 18 ++-- .../utils/data_preparation.py | 49 ++++++++--- .../utils/preprocessing.py | 59 +++++++++---- tests/test_octis.py | 8 +- 9 files changed, 218 insertions(+), 115 deletions(-) diff --git a/MANIFEST.in b/MANIFEST.in index 965b2dda..33c63472 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -3,6 +3,7 @@ include CONTRIBUTING.rst include HISTORY.rst include LICENSE include README.rst +include requirements.txt recursive-include tests * recursive-exclude * __pycache__ diff --git a/octis/models/RLTM.py b/octis/models/RLTM.py index 8e34837c..3a938fef 100644 --- a/octis/models/RLTM.py +++ b/octis/models/RLTM.py @@ -16,12 +16,13 @@ class RLTM(AbstractModel): def __init__( - self, num_topics=10, activation='gelu', num_layers=2, num_neurons=128, - inference_dropout=0.2, policy_dropout=0.0, batch_size=256, lr=3e-4, - momentum=0.9, solver='adamw', num_epochs=200, num_samples=10, - seed=None, use_partitions=True, reduce_on_plateau=False, bert_path="", - bert_model="all-MiniLM-L6-v2", weight_decay=0.01, kl_multiplier=1.0, - grad_norm_clip=1.0): + self, num_topics=10, activation='gelu', num_layers=2, + num_neurons=128, inference_dropout=0.2, policy_dropout=0.0, + batch_size=256, lr=3e-4, momentum=0.9, solver='adamw', + num_epochs=200, num_samples=10, seed=None, use_partitions=True, + reduce_on_plateau=False, bert_path="", + bert_model="all-MiniLM-L6-v2", weight_decay=0.01, + kl_multiplier=1.0, grad_norm_clip=1.0): """ initialization of RLTM @@ -31,9 +32,11 @@ def __init__( 'gelu' (default 'gelu') :param num_layers : int, number of layers (default 2) :param num_neurons : int, number of neurons per layer (default 128) - :param inference_dropout : float, inference dropout to use (default 0.2) + :param inference_dropout : float, inference dropout to use + (default 0.2) :param policy_dropout : float, policy dropout to use (default 0.0) - :param batch_size : int, size of batch to use for training (default 256) + :param batch_size : int, size of batch to use for training + (default 256) :param lr : float, learning rate to use for training (default 3e-4) :param momentum : float, momentum to use for training (default 0.9) :param solver: string, optimizer 'adagrad', 'adam', 'sgd', 'adadelta', @@ -86,7 +89,8 @@ def __init__( self.model = None self.vocab = None - def train_model(self, dataset, hyperparameters=None, top_words=10, verbose=False): + def train_model(self, dataset, hyperparameters=None, + top_words=10, verbose=False): """ trains RLTM model @@ -226,7 +230,8 @@ def preprocess( if test is not None and validation is not None: x_test = vec.transform(test) b_test = RLTM.load_bert_data(bert_test_path, test, bert_model) - test_data = dataset.RLTMDataset(x_test.toarray(), b_test, idx2token) + test_data = dataset.RLTMDataset( + x_test.toarray(), b_test, idx2token) x_valid = vec.transform(validation) b_val = RLTM.load_bert_data(bert_val_path, validation, bert_model) @@ -242,7 +247,8 @@ def preprocess( if test is not None and validation is None: x_test = vec.transform(test) b_test = RLTM.load_bert_data(bert_test_path, test, bert_model) - test_data = 
dataset.RLTMDataset(x_test.toarray(), b_test, idx2token) + test_data = dataset.RLTMDataset( + x_test.toarray(), b_test, idx2token) return train_data, test_data, input_size if test is None and validation is None: return train_data, input_size diff --git a/octis/models/rl_for_topic_models/datasets/dataset.py b/octis/models/rl_for_topic_models/datasets/dataset.py index 67a4f206..633de20f 100644 --- a/octis/models/rl_for_topic_models/datasets/dataset.py +++ b/octis/models/rl_for_topic_models/datasets/dataset.py @@ -14,8 +14,10 @@ def __init__(self, X, X_bert, idx2token): Document word matrix. """ if X.shape[0] != len(X_bert): - raise Exception("Wait! BoW and Contextual Embeddings have different sizes! " - "You might want to check if the BoW preparation method has removed some documents. ") + raise Exception("Wait! BoW and Contextual Embeddings \ + have different sizes! You might want \ + to check if the BoW preparation method \ + has removed some documents.") self.X = X self.X_bert = X_bert diff --git a/octis/models/rl_for_topic_models/models/rltm.py b/octis/models/rl_for_topic_models/models/rltm.py index 3a4d00df..1915438f 100644 --- a/octis/models/rl_for_topic_models/models/rltm.py +++ b/octis/models/rl_for_topic_models/models/rltm.py @@ -19,20 +19,21 @@ class RLTM(object): """ def __init__( - self, input_size, bert_size, num_topics=10, hidden_sizes=(128, 128), - activation='gelu', inference_dropout=0.2, policy_dropout=0.0, - batch_size=256, lr=3e-4, momentum=0.9, solver='adamw', num_epochs=200, - num_samples=10, reduce_on_plateau=False, top_words=10, - num_data_loader_workers=0, weight_decay=0.01, kl_multiplier=1.0, - grad_norm_clip=1.0): - + self, input_size, bert_size, num_topics=10, + hidden_sizes=(128, 128), activation='gelu', + inference_dropout=0.2, policy_dropout=0.0, + batch_size=256, lr=3e-4, momentum=0.9, solver='adamw', + num_epochs=200, num_samples=10, reduce_on_plateau=False, + top_words=10, num_data_loader_workers=0, weight_decay=0.01, + kl_multiplier=1.0, grad_norm_clip=1.0): """ :param input_size: int, dimension of input :param bert_input_size: int, dimension of BERT input :param num_topics: int, number of topic components, (default 10) :param hidden_sizes: tuple, length = n_layers, (default (128, 128)) - :param activation: string, 'softplus', 'relu', 'sigmoid', 'swish', - 'tanh', 'leakyrelu', 'rrelu', 'elu', 'selu', 'gelu' (default 'gelu') + :param activation: string, 'softplus', 'relu', 'sigmoid', + 'swish', 'tanh', 'leakyrelu', 'rrelu', 'elu', 'selu', + 'gelu' (default 'gelu') :param inference_dropout: float, inference dropout to use (default 0.2) :param policy_dropout: float, policy dropout to use (default 0.0) :param batch_size: int, size of batch to use for training (default 256) @@ -46,7 +47,8 @@ def __init__( of 10 epochs (default False) :param num_data_loader_workers: int, number of data loader workers (default cpu_count). set it to 0 if you are using Windows - :param weight_decay: float, L2 regularization on model weights (default 0.01) + :param weight_decay: float, L2 regularizationon model weights + (default 0.01) :param kl_multiplier: float or int, multiplier on the KL divergence (default 1.0) :param grad_norm_clip: float or None; clip gradient norms (default 1.0) @@ -61,8 +63,9 @@ def __init__( "num_topics must by type int > 0." assert isinstance(hidden_sizes, tuple), \ "hidden_sizes must be type tuple." 
- assert activation in ['softplus', 'relu', 'sigmoid', 'tanh', 'leakyrelu', - 'rrelu', 'elu', 'selu', 'gelu'], \ + assert activation in ['softplus', 'relu', 'sigmoid', + 'tanh', 'leakyrelu', 'rrelu', + 'elu', 'selu', 'gelu'], \ "activation must be 'softplus', 'relu', 'sigmoid', 'tanh'," \ " 'leakyrelu', 'rrelu', 'elu', 'selu', or 'gelu'." assert inference_dropout >= 0, "inference dropout must be >= 0." @@ -88,7 +91,8 @@ def __init__( and num_data_loader_workers >= 0, \ "num_data_loader_workers must be int >= 0" assert weight_decay >= 0, "weight_decay must be >= 0" - assert isinstance(kl_multiplier, float) or isinstance(kl_multiplier, int), \ + assert isinstance(kl_multiplier, float) \ + or isinstance(kl_multiplier, int), \ "kl_multiplier must be a float or int" assert isinstance(grad_norm_clip, float) or grad_norm_clip is None, \ "grad_norm_clip must be a float or None" @@ -119,8 +123,8 @@ def __init__( # init optimizer if self.solver == 'adamw': - self.optimizer = self._configure_adamw(model, weight_decay, lr, - betas=(self.momentum, 0.999)) + self.optimizer = self._configure_adamw( + model, weight_decay, lr, betas=(self.momentum, 0.999)) if self.solver == 'adam': self.optimizer = optim.Adam(self.model.parameters(), lr=lr, betas=( self.momentum, 0.999)) @@ -134,7 +138,7 @@ def __init__( elif self.solver == 'rmsprop': self.optimizer = optim.RMSprop( self.model.parameters(), lr=lr, momentum=self.momentum) - + # init lr scheduler if self.reduce_on_plateau: self.scheduler = ReduceLROnPlateau(self.optimizer, patience=10) @@ -156,8 +160,8 @@ def __init__( self.USE_CUDA = True self.model = self.model.cuda() else: - self.USE_CUDA = False - + self.USE_CUDA = False + @staticmethod def _configure_adamw(model, weight_decay, lr, betas): whitelist_weight_modules = (nn.Linear,) @@ -186,16 +190,26 @@ def _configure_adamw(model, weight_decay, lr, betas): no_decay.add(fpn) param_dict = {pn: p for pn, p in model.named_parameters()} - # for decay and no decay sets, ensure no intersection and union contains all parameters + # for decay and no decay sets, ensure no intersection + # and union contains all parameters inter_params = decay & no_decay union_params = decay | no_decay - assert len(inter_params) == 0, f'parameters {inter_params} made it into both decay and no_decay set' + assert len(inter_params) == 0, \ + f'parameters {inter_params} made it into \ + both decay and no_decay set' assert len(param_dict.keys() - union_params) == 0, \ - f'parameters {param_dict.keys() - union_params} were not separated into either decay or no_decay set' + f'parameters {param_dict.keys() - union_params} \ + were not separated into either decay or no_decay set' optim_groups = [ - {'params': [param_dict[pn] for pn in sorted(list(decay))], 'weight_decay': weight_decay}, - {'params': [param_dict[pn] for pn in sorted(list(no_decay))], 'weight_decay': 0.0} + { + 'params': [param_dict[pn] for pn in sorted(list(decay))], + 'weight_decay': weight_decay + }, + { + 'params': [param_dict[pn] for pn in sorted(list(no_decay))], + 'weight_decay': 0.0 + } ] return optim.AdamW(optim_groups, lr=lr, betas=betas) @@ -223,12 +237,13 @@ def _train_epoch(self, loader): # backward pass loss.backward() if self.grad_norm_clip is not None: - nn.utils.clip_grad_norm_(self.model.parameters(), self.grad_norm_clip) + nn.utils.clip_grad_norm_( + self.model.parameters(), self.grad_norm_clip) self.optimizer.step() # compute train loss samples_processed += X.size()[0] - train_loss += loss.item() + train_loss += loss.item() * X.size()[0] train_loss /= 
samples_processed @@ -255,7 +270,7 @@ def _validation(self, loader): # compute train loss samples_processed += X.size()[0] - val_loss += loss.item() + val_loss += loss.item() * X.size()[0] val_loss /= samples_processed @@ -311,9 +326,12 @@ def fit(self, train_dataset, validation_dataset=None, e = datetime.datetime.now() if verbose: - print("Epoch: [{}/{}]\tSamples: [{}/{}]\tTrain Loss: {}\tTime: {}".format( - epoch + 1, self.num_epochs, samples_processed, - len(self.train_data) * self.num_epochs, train_loss, e - s)) + print( + "Epoch: [{}/{}]\tSamples: [{}/{}]\ + \tTrain Loss: {}\tTime: {}".format( + epoch + 1, self.num_epochs, samples_processed, + len(self.train_data) * self.num_epochs, + train_loss, e - s)) self.best_components = self.model.beta self.final_topic_word = topic_word @@ -402,7 +420,6 @@ def get_info(self): info = {} topic_word = self.get_topics() topic_word_dist = self.get_topic_word_mat() - topic_document_dist = self.get_topic_document_mat() info['topics'] = topic_word info['topic-document-matrix'] = np.asarray( @@ -457,7 +474,7 @@ def load(self, model_dir, epoch): def get_thetas(self, dataset): """ - Get the document-topic distribution for a dataset of topics. + Get the document-topic distribution for a dataset of topics. Includes multiple sampling to reduce variation via the parameter num_samples. :param dataset: a PyTorch Dataset containing the documents @@ -479,7 +496,8 @@ def get_thetas(self, dataset): # forward pass self.model.zero_grad() collect_theta.extend( - self.model.get_topic_distribution(x_bert).cpu().numpy().tolist()) + self.model.get_topic_distribution( + x_bert).cpu().numpy().tolist()) final_thetas.append(np.array(collect_theta)) return np.sum(final_thetas, axis=0) / self.num_samples diff --git a/octis/models/rl_for_topic_models/networks/decoder_network.py b/octis/models/rl_for_topic_models/networks/decoder_network.py index 6c0e151d..5a93df89 100644 --- a/octis/models/rl_for_topic_models/networks/decoder_network.py +++ b/octis/models/rl_for_topic_models/networks/decoder_network.py @@ -7,16 +7,18 @@ from torch.distributions.normal import Normal import numpy as np -from octis.models.rl_for_topic_models.networks.inference_network import InferenceNetwork +from octis.models.rl_for_topic_models.networks.inference_network \ + import InferenceNetwork class DecoderNetwork(nn.Module): """RLTM Network.""" - def __init__(self, input_size, bert_size, n_components=10, - hidden_sizes=(128, 128), activation='gelu', inference_dropout=0.2, - policy_dropout=0.0, kl_multiplier=1.0): + def __init__( + self, input_size, bert_size, n_components=10, + hidden_sizes=(128, 128), activation='gelu', + inference_dropout=0.2, policy_dropout=0.0, kl_multiplier=1.0): """ Initialize InferenceNetwork. @@ -28,31 +30,36 @@ def __init__(self, input_size, bert_size, n_components=10, activation : string, default 'gelu' inference_dropout : float, inference dropout to use (default 0.2) policy_dropout : float, policy dropout to use (default 0.0) - kl_multiplier : float or int, multiplier on the KL divergence (default 1.0) + kl_multiplier : float or int, multiplier on the KL divergence + (default 1.0) """ super(DecoderNetwork, self).__init__() assert isinstance(input_size, int), "input_size must by type int." assert isinstance(bert_size, int), "input_size must by type int." - assert (isinstance(n_components, int) or isinstance(n_components, np.int64)) and n_components > 0, \ - "n_components must be type int > 0." 
+ assert (isinstance(n_components, int) or + isinstance(n_components, np.int64)) \ + and n_components > 0, "n_components must be type int > 0." assert isinstance(hidden_sizes, tuple), \ "hidden_sizes must be type tuple." - assert activation in ['softplus', 'relu', 'sigmoid', 'tanh', 'leakyrelu', - 'rrelu', 'elu', 'selu', 'gelu'], \ + assert activation in ['softplus', 'relu', 'sigmoid', 'tanh', + 'leakyrelu', 'rrelu', 'elu', 'selu', 'gelu'], \ "activation must be 'softplus', 'relu', 'sigmoid', 'tanh'," \ " 'leakyrelu', 'rrelu', 'elu', 'selu', or 'gelu'." assert inference_dropout >= 0, "inference dropout must be >= 0." assert policy_dropout >= 0, "policy dropout must be >= 0." - assert isinstance(kl_multiplier, float) or isinstance(kl_multiplier, int), \ + assert isinstance(kl_multiplier, float) \ + or isinstance(kl_multiplier, int), \ "kl_multiplier must be a float or int" self.n_components = n_components self.kl_multiplier = float(kl_multiplier) self.mu_inference = InferenceNetwork( - bert_size, n_components, hidden_sizes, activation=activation, dropout=inference_dropout) + bert_size, n_components, hidden_sizes, + activation=activation, dropout=inference_dropout) self.log_sigma_inference = InferenceNetwork( - bert_size, n_components, hidden_sizes, activation=activation, dropout=inference_dropout) + bert_size, n_components, hidden_sizes, + activation=activation, dropout=inference_dropout) if torch.cuda.is_available(): self.mu_inference = self.mu_inference.cuda() @@ -62,7 +69,7 @@ def __init__(self, input_size, bert_size, n_components=10, if torch.cuda.is_available(): self.prior_mean = nn.Parameter(self.prior_mean.cuda()) self.prior_mean = nn.Parameter(self.prior_mean) - + self.prior_variance = torch.Tensor(torch.ones(n_components)) if torch.cuda.is_available(): self.prior_variance = self.prior_variance.cuda() @@ -73,12 +80,13 @@ def __init__(self, input_size, bert_size, n_components=10, self.beta = self.beta.cuda() self.beta = nn.Parameter(self.beta) - self.posterior_log_sigma_norm = nn.LayerNorm(n_components, elementwise_affine=False) + self.posterior_log_sigma_norm = nn.LayerNorm( + n_components, elementwise_affine=False) self.dropout = nn.Dropout(p=policy_dropout) self.beta_norm = nn.LayerNorm(input_size, elementwise_affine=False) self.apply(self._init_weights) - + def _init_weights(self, module): if isinstance(module, nn.Linear): nn.init.normal_(module.weight, mean=0.0, std=0.02) @@ -89,46 +97,56 @@ def _init_weights(self, module): nn.init.zeros_(module.bias) if module.weight is not None: nn.init.ones_(module.weight) - + def kl_divergence(self, p_mean, p_variance, q_mean, q_variance): var_division = torch.sum(p_variance ** 2 / q_variance ** 2, dim=-1) diff_term = torch.sum((q_mean - p_mean) ** 2 / q_variance ** 2, dim=-1) - logvar_det_division = torch.sum(torch.log(q_variance ** 2) - torch.log(p_variance ** 2), dim=-1) - return 0.5 * (var_division + diff_term - self.n_components + logvar_det_division) + logvar_det_division = torch.sum( + torch.log(q_variance ** 2) - torch.log(p_variance ** 2), dim=-1) + return 0.5 * (var_division + diff_term + - self.n_components + logvar_det_division) - def loss_fn(self, bow, word_dist, posterior_mu, posterior_log_sigma, epsilon=1e-8): + def loss_fn(self, bow, word_dist, posterior_mu, + posterior_log_sigma, epsilon=1e-8): # forward KL divergence - unscaled_kl = self.kl_divergence(posterior_mu, torch.exp(posterior_log_sigma), - self.prior_mean, self.prior_variance) - + unscaled_kl = self.kl_divergence( + posterior_mu, torch.exp(posterior_log_sigma), + 
self.prior_mean, self.prior_variance) + kl = self.kl_multiplier * unscaled_kl # reconstruction loss (log likelihood) nll = -1.0 * torch.sum(bow * torch.log(word_dist + epsilon), dim=-1) reward = nll + kl - return reward.sum() + return reward.mean() def forward(self, x_bow, x_bert): """Forward pass.""" # inference networks posterior_mu = self.mu_inference(x_bert) posterior_log_sigma_unnormalized = self.log_sigma_inference(x_bert) - posterior_log_sigma = self.posterior_log_sigma_norm(posterior_log_sigma_unnormalized) - posterior_distribution = Normal(posterior_mu, torch.exp(posterior_log_sigma)) + posterior_log_sigma = self.posterior_log_sigma_norm( + posterior_log_sigma_unnormalized) + posterior_distribution = Normal( + posterior_mu, torch.exp(posterior_log_sigma)) # RL policy action = posterior_distribution.rsample() self.action = action - policy = (1 / (torch.exp(posterior_log_sigma) * math.sqrt(2 * math.pi))) \ - * torch.exp(-1.0 * (action - posterior_mu) ** 2 / (2 * torch.exp(posterior_log_sigma) ** 2)) + policy = (1 / (torch.exp(posterior_log_sigma) + * math.sqrt(2 * math.pi))) \ + * torch.exp(-1.0 * (action - posterior_mu) ** 2 + / (2 * torch.exp(posterior_log_sigma) ** 2)) policy = self.dropout(policy) # product of experts - word_dist = F.softmax(self.beta_norm(torch.matmul(policy, self.beta)), dim=-1) + word_dist = F.softmax( + self.beta_norm(torch.matmul(policy, self.beta)), dim=-1) # loss - loss = self.loss_fn(x_bow, word_dist, posterior_mu, posterior_log_sigma) + loss = self.loss_fn( + x_bow, word_dist, posterior_mu, posterior_log_sigma) return loss def get_topic_distribution(self, x_bert): @@ -136,8 +154,10 @@ def get_topic_distribution(self, x_bert): # inference networks posterior_mu = self.mu_inference(x_bert) posterior_log_sigma_unnormalized = self.log_sigma_inference(x_bert) - posterior_log_sigma = self.posterior_log_sigma_norm(posterior_log_sigma_unnormalized) - posterior_distribution = Normal(posterior_mu, torch.exp(posterior_log_sigma)) + posterior_log_sigma = self.posterior_log_sigma_norm( + posterior_log_sigma_unnormalized) + posterior_distribution = Normal( + posterior_mu, torch.exp(posterior_log_sigma)) action = posterior_distribution.rsample() softmax_action = F.softmax(action, dim=-1) diff --git a/octis/models/rl_for_topic_models/networks/inference_network.py b/octis/models/rl_for_topic_models/networks/inference_network.py index 7c6ebd0b..c4c10b94 100644 --- a/octis/models/rl_for_topic_models/networks/inference_network.py +++ b/octis/models/rl_for_topic_models/networks/inference_network.py @@ -2,9 +2,9 @@ from collections import OrderedDict from torch import nn -import torch import numpy as np + class InferenceNetwork(nn.Module): """Inference Network.""" @@ -23,11 +23,13 @@ def __init__(self, bert_size, output_size, hidden_sizes, """ super(InferenceNetwork, self).__init__() assert isinstance(bert_size, int), "input_size must by type int." - assert isinstance(output_size, int) or isinstance(output_size, np.int64), "output_size must be type int." + assert isinstance(output_size, int) \ + or isinstance(output_size, np.int64), \ + "output_size must be type int." assert isinstance(hidden_sizes, tuple), \ "hidden_sizes must be type tuple." - assert activation in ['softplus', 'relu', 'sigmoid', 'tanh', 'leakyrelu', - 'rrelu', 'elu', 'selu', 'gelu'], \ + assert activation in ['softplus', 'relu', 'sigmoid', 'tanh', + 'leakyrelu', 'rrelu', 'elu', 'selu', 'gelu'], \ "activation must be 'softplus', 'relu', 'sigmoid', 'tanh'," \ " 'leakyrelu', 'rrelu', 'elu', 'selu', or 'gelu'." 
assert dropout >= 0, "dropout must be >= 0." @@ -55,8 +57,12 @@ def __init__(self, bert_size, output_size, hidden_sizes, self.dropout = nn.Dropout(p=dropout) self.hiddens = nn.Sequential(OrderedDict([ - (f"l_{i}", nn.Sequential(nn.Linear(h_in, h_out), self.activation, nn.Dropout(p=dropout))) - for i, (h_in, h_out) in enumerate(zip(hidden_sizes[:-1], hidden_sizes[1:]))])) + (f"l_{i}", nn.Sequential( + nn.Linear(h_in, h_out), + self.activation, + nn.Dropout(p=dropout))) + for i, (h_in, h_out) in enumerate(zip( + hidden_sizes[:-1], hidden_sizes[1:]))])) self.output = nn.Linear(hidden_sizes[-1], output_size) diff --git a/octis/models/rl_for_topic_models/utils/data_preparation.py b/octis/models/rl_for_topic_models/utils/data_preparation.py index 9a55fd59..a46073c4 100644 --- a/octis/models/rl_for_topic_models/utils/data_preparation.py +++ b/octis/models/rl_for_topic_models/utils/data_preparation.py @@ -6,16 +6,23 @@ import os import pickle as pkl + def get_bag_of_words(data, min_length): """ Creates the bag of words """ - vect = [np.bincount(x[x != np.array(None)].astype('int'), minlength=min_length) - for x in data if np.sum(x[x != np.array(None)]) != 0] + vect = [ + np.bincount( + x[x != np.array(None)].astype('int'), + minlength=min_length + ) + for x in data if np.sum(x[x != np.array(None)]) != 0 + ] vect = scipy.sparse.csr_matrix(vect) return vect + def bert_embeddings_from_file(text_file, sbert_model_to_load, batch_size=200): """ Creates SBERT Embeddings from an input file @@ -24,26 +31,35 @@ def bert_embeddings_from_file(text_file, sbert_model_to_load, batch_size=200): with open(text_file, encoding="utf-8") as filino: train_text = list(map(lambda x: x, filino.readlines())) - return np.array(model.encode(train_text, show_progress_bar=True, batch_size=batch_size)) + return np.array( + model.encode( + train_text, show_progress_bar=True, batch_size=batch_size)) -def bert_embeddings_from_list(texts, sbert_model_to_load="all-MiniLM-L6-v2", batch_size=100): +def bert_embeddings_from_list( + texts, sbert_model_to_load="all-MiniLM-L6-v2", batch_size=100): """ Creates SBERT Embeddings from a list """ model = SentenceTransformer(sbert_model_to_load) - return np.array(model.encode(texts, show_progress_bar=True, batch_size=batch_size)) + return np.array( + model.encode( + texts, show_progress_bar=True, batch_size=batch_size)) class QuickText: """ Integrated class to handle all the text preprocessing needed """ - def __init__(self, bert_model, text_for_bow, text_for_bert=None, bert_path=None): + def __init__( + self, bert_model, text_for_bow, + text_for_bert=None, bert_path=None): """ :param bert_model: string, bert model to use - :param text_for_bert: list, list of sentences with the unpreprocessed text - :param text_for_bow: list, list of sentences with the preprocessed text + :param text_for_bert: list, list of sentences + with the unpreprocessed text + :param text_for_bow: list, list of sentences + with the preprocessed text """ self.vocab_dict = {} self.vocab = [] @@ -96,14 +112,18 @@ def load_dataset(self): else: if self.data_bert is None: if self.text_for_bert is not None: - self.data_bert = bert_embeddings_from_list(self.text_for_bert, self.bert_model) + self.data_bert = bert_embeddings_from_list( + self.text_for_bert, self.bert_model) else: - self.data_bert = bert_embeddings_from_list(self.text_for_bow, self.bert_model) + self.data_bert = bert_embeddings_from_list( + self.text_for_bow, self.bert_model) pkl.dump(self.data_bert, open(self.bert_path, 'w')) - training_dataset = 
RLTMDataset(self.bow, self.data_bert, self.idx2token) + training_dataset = RLTMDataset( + self.bow, self.data_bert, self.idx2token) return training_dataset + class TextHandler: """ Class used to handle the text preparation and the BagOfWord @@ -118,7 +138,9 @@ def __init__(self, file_name=None, sentences=None): self.bow = None warnings.simplefilter('always', DeprecationWarning) - warnings.warn("TextHandler class is deprecated and will be removed in version 2.0. Use QuickText.", Warning) + warnings.warn("TextHandler class is deprecated and \ + will be removed in version 2.0. Use \ + QuickText.", Warning) def prepare(self): indptr = [0] @@ -135,7 +157,8 @@ def prepare(self): with open(self.file_name, encoding="utf-8") as filino: docs = filino.readlines() else: - raise Exception("One parameter between sentences and file_name should be selected") + raise Exception("One parameter between sentences and \ + file_name should be selected") for d in docs: for term in d.split(): diff --git a/octis/models/rl_for_topic_models/utils/preprocessing.py b/octis/models/rl_for_topic_models/utils/preprocessing.py index c6cc5062..61d2f600 100644 --- a/octis/models/rl_for_topic_models/utils/preprocessing.py +++ b/octis/models/rl_for_topic_models/utils/preprocessing.py @@ -3,16 +3,22 @@ from nltk.corpus import stopwords as stop_words import warnings + class WhiteSpacePreprocessing(): """ - Provides a very simple preprocessing script that filters infrequent tokens from text + Provides a very simple preprocessing script + that filters infrequent tokens from text """ - def __init__(self, documents, stopwords_language="english", vocabulary_size=2000): + def __init__( + self, documents, stopwords_language="english", + vocabulary_size=2000): """ - :param documents: list of strings - :param stopwords_language: string of the language of the stopwords (see nltk stopwords) - :param vocabulary_size: the number of most frequent words to include in the documents. Infrequent words will be discarded from the list of preprocessed documents + :param stopwords_language: string of the language + of the stopwords(see nltk stopwords) + :param vocabulary_size: the number of most frequent words + to include in the documents. Infrequent words will be + discarded from the list of preprocessed documents """ self.documents = documents self.stopwords = set(stop_words.words(stopwords_language)) @@ -20,23 +26,41 @@ def __init__(self, documents, stopwords_language="english", vocabulary_size=2000 def preprocess(self): """ - Note that if after filtering some documents do not contain words we remove them. That is why we return also the - list of unpreprocessed documents. + Note that if after filtering some documents do not + contain words we remove them. That is why we return + also the list of unpreprocessed documents. 
- :return: preprocessed documents, unpreprocessed documents and the vocabulary list + :return: preprocessed documents, unpreprocessed + documents and the vocabulary list """ preprocessed_docs_tmp = self.documents preprocessed_docs_tmp = [doc.lower() for doc in preprocessed_docs_tmp] - preprocessed_docs_tmp = [doc.translate( - str.maketrans(string.punctuation, ' ' * len(string.punctuation))) for doc in preprocessed_docs_tmp] - preprocessed_docs_tmp = [' '.join([w for w in doc.split() if len(w) > 0 and w not in self.stopwords]) - for doc in preprocessed_docs_tmp] + preprocessed_docs_tmp = [ + doc.translate( + str.maketrans(string.punctuation, + ' ' * len(string.punctuation)) + ) + for doc in preprocessed_docs_tmp + ] + preprocessed_docs_tmp = [ + ' '.join( + [w for w in doc.split() + if len(w) > 0 and w not in self.stopwords] + ) + for doc in preprocessed_docs_tmp + ] - vectorizer = CountVectorizer(max_features=self.vocabulary_size, token_pattern=r'\b[a-zA-Z]{2,}\b') + vectorizer = CountVectorizer( + max_features=self.vocabulary_size, + token_pattern=r'\b[a-zA-Z]{2,}\b') vectorizer.fit_transform(preprocessed_docs_tmp) vocabulary = set(vectorizer.get_feature_names()) - preprocessed_docs_tmp = [' '.join([w for w in doc.split() if w in vocabulary]) - for doc in preprocessed_docs_tmp] + preprocessed_docs_tmp = [ + ' '.join( + [w for w in doc.split() if w in vocabulary] + ) + for doc in preprocessed_docs_tmp + ] preprocessed_docs, unpreprocessed_docs = [], [] for i, doc in enumerate(preprocessed_docs_tmp): @@ -54,5 +78,6 @@ def __init__(self, documents, stopwords_language="english"): if self.__class__.__name__ == "CTM": - warnings.warn("SimplePrepocessing is deprecated and will be removed in version 2.0, " - "use WhiteSpacePreprocessing", DeprecationWarning) + warnings.warn("SimplePrepocessing is deprecated and \ + will be removed in version 2.0, use \ + WhiteSpacePreprocessing", DeprecationWarning) diff --git a/tests/test_octis.py b/tests/test_octis.py index e96e3f9f..45ceb1f8 100644 --- a/tests/test_octis.py +++ b/tests/test_octis.py @@ -221,6 +221,7 @@ def test_model_output_etm_with_binary_word2vec_embeddings_file( assert output['test-topic-document-matrix'].shape == ( num_topics, len(dataset.get_partitioned_corpus()[2])) + def test_model_output_etm_with_text_word2vec_embeddings_file( data_dir, embeddings_dir): dataset = Dataset() @@ -229,7 +230,7 @@ def test_model_output_etm_with_text_word2vec_embeddings_file( model = ETM( num_topics=num_topics, num_epochs=5, train_embeddings=False, embeddings_type='word2vec', - embeddings_path=embeddings_dir +'/test_example/example.txt', + embeddings_path=embeddings_dir + '/test_example/example.txt', binary_embeddings=False) output = model.train_model(dataset) assert 'topics' in output.keys() @@ -264,7 +265,8 @@ def test_model_output_etm_with_headerless_text_word2vec_embeddings_file( model = ETM( num_topics=num_topics, num_epochs=5, train_embeddings=False, embeddings_type='word2vec', - embeddings_path=embeddings_dir + '/test_example/headerless_example.txt', + embeddings_path=embeddings_dir + + '/test_example/headerless_example.txt', binary_embeddings=False, headerless_embeddings=True) output = model.train_model(dataset) assert 'topics' in output.keys() @@ -299,7 +301,7 @@ def test_model_output_etm_with_keyedvectors_embeddings_file( model = ETM( num_topics=num_topics, num_epochs=5, train_embeddings=False, embeddings_type='keyedvectors', - embeddings_path=embeddings_dir +'/test_example/example.keyedvectors') + embeddings_path=embeddings_dir + 
'/test_example/example.keyedvectors') output = model.train_model(dataset) assert 'topics' in output.keys() assert 'topic-word-matrix' in output.keys()
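
Taken together, the training-loop changes in this last patch follow a standard pattern: loss_fn now returns a per-batch mean, the running loss is re-weighted by batch size and normalized over samples_processed, and gradients are clipped by norm before the optimizer step whenever grad_norm_clip is set. The sketch below condenses that step; the batch unpacking is illustrative, since the loader format is not shown in these hunks.

    from torch import nn

    def train_epoch(model, loader, optimizer, grad_norm_clip=1.0):
        model.train()
        train_loss, samples_processed = 0.0, 0
        for x_bow, x_bert in loader:  # batch layout is illustrative
            optimizer.zero_grad()
            loss = model(x_bow, x_bert)  # forward() returns the mean loss over the batch
            loss.backward()
            if grad_norm_clip is not None:
                nn.utils.clip_grad_norm_(model.parameters(), grad_norm_clip)
            optimizer.step()
            batch_size = x_bow.size(0)
            train_loss += loss.item() * batch_size  # undo the mean to accumulate a sum
            samples_processed += batch_size
        return train_loss / samples_processed  # average loss per document
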