From 7498e5e34aaf167386b5ca3fb7744735bd6b5bde Mon Sep 17 00:00:00 2001 From: Haluk Dogan Date: Thu, 25 Feb 2021 19:45:14 -0600 Subject: [PATCH 1/2] containerization and code clean up --- .gitignore | 209 +++++++++++++++++ Dockerfile | 31 +++ Makefile | 27 +++ README.md | 28 ++- bin/pyls | 4 + bin/pythondocker | 3 + data.py | 71 +++--- docker-compose.yml | 13 ++ etm.py | 96 ++++---- main.py | 536 ++++++++++++++++++++++++++++--------------- requirements.txt | 11 + scripts/data_20ng.py | 248 +++++++++++++------- scripts/data_nyt.py | 221 ++++++++++++------ skipgram.py | 72 ++++-- utils.py | 46 ++-- 15 files changed, 1156 insertions(+), 460 deletions(-) create mode 100644 .gitignore create mode 100644 Dockerfile create mode 100644 Makefile create mode 100755 bin/pyls create mode 100755 bin/pythondocker create mode 100644 docker-compose.yml create mode 100644 requirements.txt diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..a61809f --- /dev/null +++ b/.gitignore @@ -0,0 +1,209 @@ +# Created by https://www.toptal.com/developers/gitignore/api/linux,python,emacs +# Edit at https://www.toptal.com/developers/gitignore?templates=linux,python,emacs + +### Emacs ### +# -*- mode: gitignore; -*- +*~ +\#*\# +/.emacs.desktop +/.emacs.desktop.lock +*.elc +auto-save-list +tramp +.\#* + +# Org-mode +.org-id-locations +*_archive +ltximg/** + +# flymake-mode +*_flymake.* + +# eshell files +/eshell/history +/eshell/lastdir + +# elpa packages +/elpa/ + +# reftex files +*.rel + +# AUCTeX auto folder +/auto/ + +# cask packages +.cask/ +dist/ + +# Flycheck +flycheck_*.el + +# server auth directory +/server/ + +# projectiles files +.projectile + +# directory configuration +.dir-locals.el + +# network security +/network-security.data + + +### Linux ### + +# temporary files which can be created if a process still has a handle open of a deleted file +.fuse_hidden* + +# KDE directory preferences +.directory + +# Linux trash folder which might appear on any partition or disk +.Trash-* + +# .nfs files are created when an open file is removed but is still being accessed +.nfs* + +### Python ### +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +pip-wheel-metadata/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ +pytestdebug.log + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ +doc/_build/ + +# PyBuilder +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +.python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 
+# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# PEP 582; used by e.g. github.com/David-OConnor/pyflow +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ +pythonenv* + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# pytype static type analyzer +.pytype/ + +# profiling data +.prof + +# End of https://www.toptal.com/developers/gitignore/api/linux,python,emacs diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..8ced8ed --- /dev/null +++ b/Dockerfile @@ -0,0 +1,31 @@ +ARG TAG=3.8 + +FROM python:$TAG +ENV PYTHONUNBUFFERED 1 + +ARG USER +ARG USER_ID +ARG GROUP_ID +ARG WORKDIR + +RUN apt-get update \ + && apt-get clean \ + && apt-get update -qqq \ + && apt-get install -y -q build-essential graphviz graphviz-dev \ + && apt-get install -y -q ffmpeg libsm6 libxext6 \ + && pip install --upgrade pip \ + && pip install Cython scipy + +RUN groupadd --gid 1000 $USER +RUN useradd --create-home --uid $USER_ID --gid $GROUP_ID $USER + +USER ${USER} +ENV PATH "$PATH:/home/$USER/.local/bin" + +COPY ./requirements.txt requirements.txt +RUN pip install --user -r requirements.txt + +RUN pip install --user torch==1.7.1+cpu torchvision==0.8.2+cpu torchaudio==0.7.2 \ + -f https://download.pytorch.org/whl/torch_stable.html + +WORKDIR $WORKDIR diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..594fbd6 --- /dev/null +++ b/Makefile @@ -0,0 +1,27 @@ +## +# ETM +# +# @file +# @version 0.1 + +user := $(shell whoami) +userid := $(shell id -u) +groupid := $(shell id -g) +workdir := $(shell pwd) + +.PHONY: build +build: + docker-compose build \ + --build-arg USER=$(user) \ + --build-arg USER_ID=${userid} \ + --build-arg GROUP_ID=$(groupid) \ + --build-arg WORKDIR=$(workdir) + +.PHONY: clean-container +clean-container: + docker rmi etm_etm + +.PHONY: clean-python +clean-python: + rm -rf __pycache__ +# end diff --git a/README.md b/README.md index 6bc2ee6..529ed33 100644 --- a/README.md +++ b/README.md @@ -6,8 +6,19 @@ ETM defines words and topics in the same embedding space. The likelihood of a wo ## Dependencies -+ python 3.6.7 -+ pytorch 1.1.0 ++ python 3.8.7 ++ pytorch 1.7.1 + +## Optional + ++ docker ++ docker-compose + +### Build Docker Image + +``` sh +make build +``` ## Datasets @@ -20,7 +31,7 @@ All the datasets are pre-processed and can be found below: + https://bitbucket.org/franrruiz/data_stopwords_largev_2/src/master/ (this one contains stop words and was used to showcase robustness of ETM to stop words.) + https://bitbucket.org/franrruiz/data_20ng_largev/src/master/ -All the scripts to pre-process a given dataset for ETM can be found in the folder 'scripts'. The script for 20NewsGroup is self-contained as it uses scikit-learn. If you want to run ETM on your own dataset, follow the script for New York Times (given as example) called data_nyt.py +All the scripts to pre-process a given dataset for ETM can be found in the folder 'scripts'. The script for 20NewsGroup is self-contained as it uses scikit-learn. 
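For example, the 20NewsGroup files can be rebuilt with something along these lines (a sketch: it assumes the `stops.txt` stopword list sits next to the script, and the output lands in a `min_df_10` folder per the script's default `min_df = 10`):

``` sh
cd scripts
python data_20ng.py
```
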
If you want to run ETM on your own dataset, follow the script for New York Times (given as example) called data_nyt.py ## To Run @@ -38,14 +49,20 @@ To learn interpretable topics using ETM with pre-fitted word embeddings (called + first fit the word embeddings. For example to use simple skipgram you can run ``` -python skipgram.py --data_file PATH_TO_DATA --emb_file PATH_TO_EMBEDDINGS --dim_rho 300 --iters 50 --window_size 4 +python skipgram.py --data_file PATH_TO_DATA --emb_file PATH_TO_EMBEDDINGS --dim_rho 300 --iters 50 --window_size 4 ``` -+ then run the following ++ then run the following ``` python main.py --mode train --dataset 20ng --data_path data/20ng --emb_path PATH_TO_EMBEDDINGS --num_topics 50 --train_embeddings 0 --epochs 1000 ``` +## To Run in a Container + +``` sh +docker-compose run --rm etm python main.py --mode train --dataset 20ng --data_path data/20ng --num_topics 50 --train_embeddings 1 --epochs 1000 +``` + ## Citation ``` @@ -56,4 +73,3 @@ python main.py --mode train --dataset 20ng --data_path data/20ng --emb_path PATH year={2019} } ``` - diff --git a/bin/pyls b/bin/pyls new file mode 100755 index 0000000..bb5c5d8 --- /dev/null +++ b/bin/pyls @@ -0,0 +1,4 @@ +#!/bin/bash +set -e +# docker-compose build ml &> /dev/null +exec docker-compose run --name etm_python --rm --no-deps -T etm pyls diff --git a/bin/pythondocker b/bin/pythondocker new file mode 100755 index 0000000..cb8fe4a --- /dev/null +++ b/bin/pythondocker @@ -0,0 +1,3 @@ +#!/bin/bash +set -e +exec docker exec -it etm_python python $@ diff --git a/data.py b/data.py index 2f3afe9..277423b 100644 --- a/data.py +++ b/data.py @@ -1,56 +1,63 @@ import os -import random import pickle + import numpy as np -import torch import scipy.io +import torch + def _fetch(path, name): - if name == 'train': - token_file = os.path.join(path, 'bow_tr_tokens.mat') - count_file = os.path.join(path, 'bow_tr_counts.mat') - elif name == 'valid': - token_file = os.path.join(path, 'bow_va_tokens.mat') - count_file = os.path.join(path, 'bow_va_counts.mat') + if name == "train": + token_file = os.path.join(path, "bow_tr_tokens.mat") + count_file = os.path.join(path, "bow_tr_counts.mat") + elif name == "valid": + token_file = os.path.join(path, "bow_va_tokens.mat") + count_file = os.path.join(path, "bow_va_counts.mat") else: - token_file = os.path.join(path, 'bow_ts_tokens.mat') - count_file = os.path.join(path, 'bow_ts_counts.mat') - tokens = scipy.io.loadmat(token_file)['tokens'].squeeze() - counts = scipy.io.loadmat(count_file)['counts'].squeeze() - if name == 'test': - token_1_file = os.path.join(path, 'bow_ts_h1_tokens.mat') - count_1_file = os.path.join(path, 'bow_ts_h1_counts.mat') - token_2_file = os.path.join(path, 'bow_ts_h2_tokens.mat') - count_2_file = os.path.join(path, 'bow_ts_h2_counts.mat') - tokens_1 = scipy.io.loadmat(token_1_file)['tokens'].squeeze() - counts_1 = scipy.io.loadmat(count_1_file)['counts'].squeeze() - tokens_2 = scipy.io.loadmat(token_2_file)['tokens'].squeeze() - counts_2 = scipy.io.loadmat(count_2_file)['counts'].squeeze() - return {'tokens': tokens, 'counts': counts, - 'tokens_1': tokens_1, 'counts_1': counts_1, - 'tokens_2': tokens_2, 'counts_2': counts_2} - return {'tokens': tokens, 'counts': counts} + token_file = os.path.join(path, "bow_ts_tokens.mat") + count_file = os.path.join(path, "bow_ts_counts.mat") + tokens = scipy.io.loadmat(token_file)["tokens"].squeeze() + counts = scipy.io.loadmat(count_file)["counts"].squeeze() + if name == "test": + token_1_file = os.path.join(path, 
"bow_ts_h1_tokens.mat") + count_1_file = os.path.join(path, "bow_ts_h1_counts.mat") + token_2_file = os.path.join(path, "bow_ts_h2_tokens.mat") + count_2_file = os.path.join(path, "bow_ts_h2_counts.mat") + tokens_1 = scipy.io.loadmat(token_1_file)["tokens"].squeeze() + counts_1 = scipy.io.loadmat(count_1_file)["counts"].squeeze() + tokens_2 = scipy.io.loadmat(token_2_file)["tokens"].squeeze() + counts_2 = scipy.io.loadmat(count_2_file)["counts"].squeeze() + return { + "tokens": tokens, + "counts": counts, + "tokens_1": tokens_1, + "counts_1": counts_1, + "tokens_2": tokens_2, + "counts_2": counts_2, + } + return {"tokens": tokens, "counts": counts} + def get_data(path): - with open(os.path.join(path, 'vocab.pkl'), 'rb') as f: + with open(os.path.join(path, "vocab.pkl"), "rb") as f: vocab = pickle.load(f) - train = _fetch(path, 'train') - valid = _fetch(path, 'valid') - test = _fetch(path, 'test') + train = _fetch(path, "train") + valid = _fetch(path, "valid") + test = _fetch(path, "test") return vocab, train, valid, test + def get_batch(tokens, counts, ind, vocab_size, device, emsize=300): """fetch input data by batch.""" batch_size = len(ind) data_batch = np.zeros((batch_size, vocab_size)) - + for i, doc_id in enumerate(ind): doc = tokens[doc_id] count = counts[doc_id] - L = count.shape[1] - if len(doc) == 1: + if len(doc) == 1: doc = [doc.squeeze()] count = [count.squeeze()] else: diff --git a/docker-compose.yml b/docker-compose.yml new file mode 100644 index 0000000..08c3b61 --- /dev/null +++ b/docker-compose.yml @@ -0,0 +1,13 @@ +version: '3.8' + +services: + etm: + build: + context: . + args: + - USER + - USER_ID + - GROUP_ID + - WORKDIR + volumes: + - ./:/home/hd/git/ETM diff --git a/etm.py b/etm.py index 81c00e6..03c1846 100644 --- a/etm.py +++ b/etm.py @@ -1,18 +1,26 @@ import torch -import torch.nn.functional as F -import numpy as np -import math - +import torch.nn.functional as F from torch import nn device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + class ETM(nn.Module): - def __init__(self, num_topics, vocab_size, t_hidden_size, rho_size, emsize, - theta_act, embeddings=None, train_embeddings=True, enc_drop=0.5): + def __init__( + self, + num_topics, + vocab_size, + t_hidden_size, + rho_size, + emsize, + theta_act, + embeddings=None, + train_embeddings=True, + enc_drop=0.5, + ): super(ETM, self).__init__() - ## define hyperparameters + # define hyperparameters self.num_topics = num_topics self.vocab_size = vocab_size self.t_hidden_size = t_hidden_size @@ -22,55 +30,55 @@ def __init__(self, num_topics, vocab_size, t_hidden_size, rho_size, emsize, self.t_drop = nn.Dropout(enc_drop) self.theta_act = self.get_activation(theta_act) - - ## define the word embedding matrix \rho + + # define the word embedding matrix \rho if train_embeddings: self.rho = nn.Linear(rho_size, vocab_size, bias=False) else: num_embeddings, emsize = embeddings.size() - rho = nn.Embedding(num_embeddings, emsize) self.rho = embeddings.clone().float().to(device) - ## define the matrix containing the topic embeddings - self.alphas = nn.Linear(rho_size, num_topics, bias=False)#nn.Parameter(torch.randn(rho_size, num_topics)) - - ## define variational distribution for \theta_{1:D} via amortizartion + # define the matrix containing the topic embeddings + self.alphas = nn.Linear( + rho_size, num_topics, bias=False + ) # nn.Parameter(torch.randn(rho_size, num_topics)) + + # define variational distribution for \theta_{1:D} via amortizartion self.q_theta = nn.Sequential( - nn.Linear(vocab_size, 
t_hidden_size), - self.theta_act, - nn.Linear(t_hidden_size, t_hidden_size), - self.theta_act, - ) + nn.Linear(vocab_size, t_hidden_size), + self.theta_act, + nn.Linear(t_hidden_size, t_hidden_size), + self.theta_act, + ) self.mu_q_theta = nn.Linear(t_hidden_size, num_topics, bias=True) self.logsigma_q_theta = nn.Linear(t_hidden_size, num_topics, bias=True) def get_activation(self, act): - if act == 'tanh': + if act == "tanh": act = nn.Tanh() - elif act == 'relu': + elif act == "relu": act = nn.ReLU() - elif act == 'softplus': + elif act == "softplus": act = nn.Softplus() - elif act == 'rrelu': + elif act == "rrelu": act = nn.RReLU() - elif act == 'leakyrelu': + elif act == "leakyrelu": act = nn.LeakyReLU() - elif act == 'elu': + elif act == "elu": act = nn.ELU() - elif act == 'selu': + elif act == "selu": act = nn.SELU() - elif act == 'glu': + elif act == "glu": act = nn.GLU() else: - print('Defaulting to tanh activations...') + print("Defaulting to tanh activations...") act = nn.Tanh() - return act + return act def reparameterize(self, mu, logvar): - """Returns a sample from a Gaussian distribution via reparameterization. - """ + """Returns a sample from a Gaussian distribution via reparameterization.""" if self.training: - std = torch.exp(0.5 * logvar) + std = torch.exp(0.5 * logvar) eps = torch.randn_like(std) return eps.mul_(std).add_(mu) else: @@ -88,42 +96,46 @@ def encode(self, bows): q_theta = self.t_drop(q_theta) mu_theta = self.mu_q_theta(q_theta) logsigma_theta = self.logsigma_q_theta(q_theta) - kl_theta = -0.5 * torch.sum(1 + logsigma_theta - mu_theta.pow(2) - logsigma_theta.exp(), dim=-1).mean() + kl_theta = ( + -0.5 + * torch.sum( + 1 + logsigma_theta - mu_theta.pow(2) - logsigma_theta.exp(), dim=-1 + ).mean() + ) return mu_theta, logsigma_theta, kl_theta def get_beta(self): try: - logit = self.alphas(self.rho.weight) # torch.mm(self.rho, self.alphas) + logit = self.alphas(self.rho.weight) # torch.mm(self.rho, self.alphas) except: logit = self.alphas(self.rho) - beta = F.softmax(logit, dim=0).transpose(1, 0) ## softmax over vocab dimension + beta = F.softmax(logit, dim=0).transpose(1, 0) # softmax over vocab dimension return beta def get_theta(self, normalized_bows): mu_theta, logsigma_theta, kld_theta = self.encode(normalized_bows) z = self.reparameterize(mu_theta, logsigma_theta) - theta = F.softmax(z, dim=-1) + theta = F.softmax(z, dim=-1) return theta, kld_theta def decode(self, theta, beta): res = torch.mm(theta, beta) - preds = torch.log(res+1e-6) - return preds + preds = torch.log(res + 1e-6) + return preds def forward(self, bows, normalized_bows, theta=None, aggregate=True): - ## get \theta + # get \theta if theta is None: theta, kld_theta = self.get_theta(normalized_bows) else: kld_theta = None - ## get \beta + # get \beta beta = self.get_beta() - ## get prediction loss + # get prediction loss preds = self.decode(theta, beta) recon_loss = -(preds * bows).sum(1) if aggregate: recon_loss = recon_loss.mean() return recon_loss, kld_theta - diff --git a/main.py b/main.py index 2b710eb..9a2bad3 100644 --- a/main.py +++ b/main.py @@ -1,108 +1,171 @@ -#/usr/bin/python - -from __future__ import print_function +# /usr/bin/python import argparse -import torch -import pickle -import numpy as np -import os -import math -import random -import sys -import matplotlib.pyplot as plt -import data -import scipy.io +import math +import os -from torch import nn, optim -from torch.nn import functional as F +import numpy as np +import torch +from torch import optim +import data from etm 
import ETM -from utils import nearest_neighbors, get_topic_coherence, get_topic_diversity - -parser = argparse.ArgumentParser(description='The Embedded Topic Model') - -### data and file related arguments -parser.add_argument('--dataset', type=str, default='20ng', help='name of corpus') -parser.add_argument('--data_path', type=str, default='data/20ng', help='directory containing data') -parser.add_argument('--emb_path', type=str, default='data/20ng_embeddings.txt', help='directory containing word embeddings') -parser.add_argument('--save_path', type=str, default='./results', help='path to save results') -parser.add_argument('--batch_size', type=int, default=1000, help='input batch size for training') - -### model-related arguments -parser.add_argument('--num_topics', type=int, default=50, help='number of topics') -parser.add_argument('--rho_size', type=int, default=300, help='dimension of rho') -parser.add_argument('--emb_size', type=int, default=300, help='dimension of embeddings') -parser.add_argument('--t_hidden_size', type=int, default=800, help='dimension of hidden space of q(theta)') -parser.add_argument('--theta_act', type=str, default='relu', help='tanh, softplus, relu, rrelu, leakyrelu, elu, selu, glu)') -parser.add_argument('--train_embeddings', type=int, default=0, help='whether to fix rho or train it') - -### optimization-related arguments -parser.add_argument('--lr', type=float, default=0.005, help='learning rate') -parser.add_argument('--lr_factor', type=float, default=4.0, help='divide learning rate by this...') -parser.add_argument('--epochs', type=int, default=20, help='number of epochs to train...150 for 20ng 100 for others') -parser.add_argument('--mode', type=str, default='train', help='train or eval model') -parser.add_argument('--optimizer', type=str, default='adam', help='choice of optimizer') -parser.add_argument('--seed', type=int, default=2019, help='random seed (default: 1)') -parser.add_argument('--enc_drop', type=float, default=0.0, help='dropout rate on encoder') -parser.add_argument('--clip', type=float, default=0.0, help='gradient clipping') -parser.add_argument('--nonmono', type=int, default=10, help='number of bad hits allowed') -parser.add_argument('--wdecay', type=float, default=1.2e-6, help='some l2 regularization') -parser.add_argument('--anneal_lr', type=int, default=0, help='whether to anneal the learning rate or not') -parser.add_argument('--bow_norm', type=int, default=1, help='normalize the bows or not') - -### evaluation, visualization, and logging-related arguments -parser.add_argument('--num_words', type=int, default=10, help='number of words for topic viz') -parser.add_argument('--log_interval', type=int, default=2, help='when to log training') -parser.add_argument('--visualize_every', type=int, default=10, help='when to visualize results') -parser.add_argument('--eval_batch_size', type=int, default=1000, help='input batch size for evaluation') -parser.add_argument('--load_from', type=str, default='', help='the name of the ckpt to eval from') -parser.add_argument('--tc', type=int, default=0, help='whether to compute topic coherence or not') -parser.add_argument('--td', type=int, default=0, help='whether to compute topic diversity or not') +from utils import get_topic_coherence, get_topic_diversity, nearest_neighbors + +parser = argparse.ArgumentParser(description="The Embedded Topic Model") + +# data and file related arguments +parser.add_argument("--dataset", type=str, default="20ng", + help="name of corpus") +parser.add_argument( + 
"--data_path", type=str, default="data/20ng", + help="directory containing data") + +parser.add_argument( + "--emb_path", + type=str, + default="data/20ng_embeddings.txt", + help="directory containing word embeddings", +) +parser.add_argument( + "--save_path", type=str, default="results", help="path to save results" +) +parser.add_argument( + "--batch_size", type=int, default=1000, help="input batch size for training" +) + +# model-related arguments +parser.add_argument("--num_topics", type=int, default=50, + help="number of topics") +parser.add_argument("--rho_size", type=int, default=300, + help="dimension of rho") +parser.add_argument("--emb_size", type=int, default=300, + help="dimension of embeddings") +parser.add_argument( + "--t_hidden_size", + type=int, + default=800, + help="dimension of hidden space of q(theta)", +) +parser.add_argument( + "--theta_act", + type=str, + default="relu", + help="tanh, softplus, relu, rrelu, leakyrelu, elu, selu, glu)", +) +parser.add_argument( + "--train_embeddings", type=int, default=0, + help="whether to fix rho or train it" +) + +# optimization-related arguments +parser.add_argument("--lr", type=float, default=0.005, help="learning rate") +parser.add_argument( + "--lr_factor", type=float, default=4.0, + help="divide learning rate by this..." +) +parser.add_argument( + "--epochs", + type=int, + default=20, + help="number of epochs to train...150 for 20ng 100 for others", +) +parser.add_argument("--mode", type=str, default="train", + help="train or eval model") +parser.add_argument("--optimizer", type=str, default="adam", + help="choice of optimizer") +parser.add_argument("--seed", type=int, default=2019, + help="random seed (default: 1)") +parser.add_argument( + "--enc_drop", type=float, default=0.0, help="dropout rate on encoder" +) +parser.add_argument("--clip", type=float, default=0.0, + help="gradient clipping") +parser.add_argument( + "--nonmono", type=int, default=10, help="number of bad hits allowed" +) +parser.add_argument( + "--wdecay", type=float, default=1.2e-6, help="some l2 regularization" +) +parser.add_argument( + "--anneal_lr", + type=int, + default=0, + help="whether to anneal the learning rate or not", +) +parser.add_argument("--bow_norm", type=int, default=1, + help="normalize the bows or not") + +# evaluation, visualization, and logging-related arguments +parser.add_argument( + "--num_words", type=int, default=10, help="number of words for topic viz" +) +parser.add_argument("--log_interval", type=int, default=2, + help="when to log training") +parser.add_argument( + "--visualize_every", type=int, default=10, help="when to visualize results" +) +parser.add_argument( + "--eval_batch_size", type=int, default=1000, + help="input batch size for evaluation" +) +parser.add_argument( + "--load_from", type=str, default="", + help="the name of the ckpt to eval from" +) +parser.add_argument( + "--tc", type=int, default=0, + help="whether to compute topic coherence or not" +) +parser.add_argument( + "--td", type=int, default=0, + help="whether to compute topic diversity or not" +) args = parser.parse_args() +root = os.path.dirname(__file__) device = torch.device("cuda" if torch.cuda.is_available() else "cpu") -print('\n') +print("\n") np.random.seed(args.seed) torch.manual_seed(args.seed) if torch.cuda.is_available(): torch.cuda.manual_seed(args.seed) -## get data +# get data # 1. vocabulary vocab, train, valid, test = data.get_data(os.path.join(args.data_path)) vocab_size = len(vocab) args.vocab_size = vocab_size # 1. 
training data -train_tokens = train['tokens'] -train_counts = train['counts'] +train_tokens = train["tokens"] +train_counts = train["counts"] args.num_docs_train = len(train_tokens) # 2. dev set -valid_tokens = valid['tokens'] -valid_counts = valid['counts'] +valid_tokens = valid["tokens"] +valid_counts = valid["counts"] args.num_docs_valid = len(valid_tokens) # 3. test data -test_tokens = test['tokens'] -test_counts = test['counts'] +test_tokens = test["tokens"] +test_counts = test["counts"] args.num_docs_test = len(test_tokens) -test_1_tokens = test['tokens_1'] -test_1_counts = test['counts_1'] +test_1_tokens = test["tokens_1"] +test_1_counts = test["counts_1"] args.num_docs_test_1 = len(test_1_tokens) -test_2_tokens = test['tokens_2'] -test_2_counts = test['counts_2'] +test_2_tokens = test["tokens_2"] +test_2_counts = test["counts_2"] args.num_docs_test_2 = len(test_2_tokens) embeddings = None if not args.train_embeddings: emb_path = args.emb_path - vect_path = os.path.join(args.data_path.split('/')[0], 'embeddings.pkl') + vect_path = os.path.join(args.data_path.split("/")[0], "embeddings.pkl") vectors = {} - with open(emb_path, 'rb') as f: + with open(emb_path, "rb") as f: for l in f: line = l.decode().split() word = line[0] @@ -112,50 +175,83 @@ embeddings = np.zeros((vocab_size, args.emb_size)) words_found = 0 for i, word in enumerate(vocab): - try: + try: embeddings[i] = vectors[word] words_found += 1 except KeyError: - embeddings[i] = np.random.normal(scale=0.6, size=(args.emb_size, )) + embeddings[i] = np.random.normal(scale=0.6, size=(args.emb_size,)) embeddings = torch.from_numpy(embeddings).to(device) args.embeddings_dim = embeddings.size() -print('=*'*100) -print('Training an Embedded Topic Model on {} with the following settings: {}'.format(args.dataset.upper(), args)) -print('=*'*100) +print("=*" * 100) +print( + "Training an Embedded Topic Model on {} with the following settings: {}".format( + args.dataset.upper(), args + ) +) +print("=*" * 100) -## define checkpoint +# define checkpoint if not os.path.exists(args.save_path): - os.makedirs(args.save_path) + os.makedirs(os.path.join(root, args.save_path)) -if args.mode == 'eval': +if args.mode == "eval": ckpt = args.load_from else: - ckpt = os.path.join(args.save_path, - 'etm_{}_K_{}_Htheta_{}_Optim_{}_Clip_{}_ThetaAct_{}_Lr_{}_Bsz_{}_RhoSize_{}_trainEmbeddings_{}'.format( - args.dataset, args.num_topics, args.t_hidden_size, args.optimizer, args.clip, args.theta_act, - args.lr, args.batch_size, args.rho_size, args.train_embeddings)) - -## define model and optimizer -model = ETM(args.num_topics, vocab_size, args.t_hidden_size, args.rho_size, args.emb_size, - args.theta_act, embeddings, args.train_embeddings, args.enc_drop).to(device) - -print('model: {}'.format(model)) - -if args.optimizer == 'adam': - optimizer = optim.Adam(model.parameters(), lr=args.lr, weight_decay=args.wdecay) -elif args.optimizer == 'adagrad': - optimizer = optim.Adagrad(model.parameters(), lr=args.lr, weight_decay=args.wdecay) -elif args.optimizer == 'adadelta': - optimizer = optim.Adadelta(model.parameters(), lr=args.lr, weight_decay=args.wdecay) -elif args.optimizer == 'rmsprop': - optimizer = optim.RMSprop(model.parameters(), lr=args.lr, weight_decay=args.wdecay) -elif args.optimizer == 'asgd': - optimizer = optim.ASGD(model.parameters(), lr=args.lr, t0=0, lambd=0., weight_decay=args.wdecay) + ckpt = os.path.join( + root, + args.save_path, + "etm_{}_K_{}_Htheta_{}_Optim_{}_Clip_{}_ThetaAct_{}_Lr_{}_Bsz_{}_RhoSize_{}_trainEmbeddings_{}".format( + 
args.dataset, + args.num_topics, + args.t_hidden_size, + args.optimizer, + args.clip, + args.theta_act, + args.lr, + args.batch_size, + args.rho_size, + args.train_embeddings, + ), + ) + +# define model and optimizer +model = ETM( + args.num_topics, + vocab_size, + args.t_hidden_size, + args.rho_size, + args.emb_size, + args.theta_act, + embeddings, + args.train_embeddings, + args.enc_drop, +).to(device) + +print("model: {}".format(model)) + +if args.optimizer == "adam": + optimizer = optim.Adam(model.parameters(), lr=args.lr, + weight_decay=args.wdecay) +elif args.optimizer == "adagrad": + optimizer = optim.Adagrad(model.parameters(), lr=args.lr, + weight_decay=args.wdecay) +elif args.optimizer == "adadelta": + optimizer = optim.Adadelta(model.parameters(), lr=args.lr, + weight_decay=args.wdecay) +elif args.optimizer == "rmsprop": + optimizer = optim.RMSprop(model.parameters(), lr=args.lr, + weight_decay=args.wdecay) +elif args.optimizer == "asgd": + optimizer = optim.ASGD( + model.parameters(), lr=args.lr, t0=0, lambd=0.0, + weight_decay=args.wdecay + ) else: - print('Defaulting to vanilla SGD') + print("Defaulting to vanilla SGD") optimizer = optim.SGD(model.parameters(), lr=args.lr) + def train(epoch): model.train() acc_loss = 0 @@ -166,7 +262,9 @@ def train(epoch): for idx, ind in enumerate(indices): optimizer.zero_grad() model.zero_grad() - data_batch = data.get_batch(train_tokens, train_counts, ind, args.vocab_size, device) + data_batch = data.get_batch( + train_tokens, train_counts, ind, args.vocab_size, device + ) sums = data_batch.sum(1).unsqueeze(1) if args.bow_norm: normalized_data_batch = data_batch / sums @@ -185,81 +283,121 @@ def train(epoch): cnt += 1 if idx % args.log_interval == 0 and idx > 0: - cur_loss = round(acc_loss / cnt, 2) - cur_kl_theta = round(acc_kl_theta_loss / cnt, 2) + cur_loss = round(acc_loss / cnt, 2) + cur_kl_theta = round(acc_kl_theta_loss / cnt, 2) cur_real_loss = round(cur_loss + cur_kl_theta, 2) - print('Epoch: {} .. batch: {}/{} .. LR: {} .. KL_theta: {} .. Rec_loss: {} .. NELBO: {}'.format( - epoch, idx, len(indices), optimizer.param_groups[0]['lr'], cur_kl_theta, cur_loss, cur_real_loss)) - - cur_loss = round(acc_loss / cnt, 2) - cur_kl_theta = round(acc_kl_theta_loss / cnt, 2) + print( + "Epoch: {} .. batch: {}/{} .. LR: {} .. KL_theta: {} .. Rec_loss: {} .. NELBO: {}".format( + epoch, + idx, + len(indices), + optimizer.param_groups[0]["lr"], + cur_kl_theta, + cur_loss, + cur_real_loss, + ) + ) + + cur_loss = round(acc_loss / cnt, 2) + cur_kl_theta = round(acc_kl_theta_loss / cnt, 2) cur_real_loss = round(cur_loss + cur_kl_theta, 2) - print('*'*100) - print('Epoch----->{} .. LR: {} .. KL_theta: {} .. Rec_loss: {} .. NELBO: {}'.format( - epoch, optimizer.param_groups[0]['lr'], cur_kl_theta, cur_loss, cur_real_loss)) - print('*'*100) + print("*" * 100) + print( + "Epoch----->{} .. LR: {} .. KL_theta: {} .. Rec_loss: {} .. 
NELBO: {}".format( + epoch, + optimizer.param_groups[0]["lr"], + cur_kl_theta, + cur_loss, + cur_real_loss, + ) + ) + print("*" * 100) + def visualize(m, show_emb=True): - if not os.path.exists('./results'): - os.makedirs('./results') + if not os.path.exists(os.path.join(root, "results")): + os.makedirs(os.path.join(root, "results")) m.eval() - queries = ['andrew', 'computer', 'sports', 'religion', 'man', 'love', - 'intelligence', 'money', 'politics', 'health', 'people', 'family'] - - ## visualize topics using monte carlo + queries = [ + "andrew", + "computer", + "sports", + "religion", + "man", + "love", + "intelligence", + "money", + "politics", + "health", + "people", + "family", + ] + + # visualize topics using monte carlo with torch.no_grad(): - print('#'*100) - print('Visualize topics...') + print("#" * 100) + print("Visualize topics...") topics_words = [] gammas = m.get_beta() for k in range(args.num_topics): gamma = gammas[k] - top_words = list(gamma.cpu().numpy().argsort()[-args.num_words+1:][::-1]) + top_words = list(gamma.cpu().numpy().argsort()[-args.num_words + 1 :][::-1]) topic_words = [vocab[a] for a in top_words] - topics_words.append(' '.join(topic_words)) - print('Topic {}: {}'.format(k, topic_words)) + topics_words.append(" ".join(topic_words)) + print("Topic {}: {}".format(k, topic_words)) if show_emb: - ## visualize word embeddings by using V to get nearest neighbors - print('#'*100) - print('Visualize word embeddings by using output embedding matrix') + # visualize word embeddings by using V to get nearest neighbors + print("#" * 100) + print("Visualize word embeddings by using output embedding matrix") try: embeddings = m.rho.weight # Vocab_size x E except: - embeddings = m.rho # Vocab_size x E - neighbors = [] + embeddings = m.rho # Vocab_size x E + # neighbors = [] for word in queries: - print('word: {} .. neighbors: {}'.format( - word, nearest_neighbors(word, embeddings, vocab))) - print('#'*100) + print( + "word: {} .. neighbors: {}".format( + word, nearest_neighbors(word, embeddings, vocab) + ) + ) + print("#" * 100) + def evaluate(m, source, tc=False, td=False): - """Compute perplexity on document completion. 
- """ + """Compute perplexity on document completion.""" m.eval() with torch.no_grad(): - if source == 'val': - indices = torch.split(torch.tensor(range(args.num_docs_valid)), args.eval_batch_size) + if source == "val": + indices = torch.split( + torch.tensor(range(args.num_docs_valid)), args.eval_batch_size + ) tokens = valid_tokens counts = valid_counts - else: - indices = torch.split(torch.tensor(range(args.num_docs_test)), args.eval_batch_size) + else: + indices = torch.split( + torch.tensor(range(args.num_docs_test)), args.eval_batch_size + ) tokens = test_tokens counts = test_counts - ## get \beta here + # get \beta here beta = m.get_beta() - ### do dc and tc here + # do dc and tc here acc_loss = 0 cnt = 0 - indices_1 = torch.split(torch.tensor(range(args.num_docs_test_1)), args.eval_batch_size) + indices_1 = torch.split( + torch.tensor(range(args.num_docs_test_1)), args.eval_batch_size + ) for idx, ind in enumerate(indices_1): - ## get theta from first half of docs - data_batch_1 = data.get_batch(test_1_tokens, test_1_counts, ind, args.vocab_size, device) + # get theta from first half of docs + data_batch_1 = data.get_batch( + test_1_tokens, test_1_counts, ind, args.vocab_size, device + ) sums_1 = data_batch_1.sum(1).unsqueeze(1) if args.bow_norm: normalized_data_batch_1 = data_batch_1 / sums_1 @@ -267,79 +405,88 @@ def evaluate(m, source, tc=False, td=False): normalized_data_batch_1 = data_batch_1 theta, _ = m.get_theta(normalized_data_batch_1) - ## get prediction loss using second half - data_batch_2 = data.get_batch(test_2_tokens, test_2_counts, ind, args.vocab_size, device) + # get prediction loss using second half + data_batch_2 = data.get_batch( + test_2_tokens, test_2_counts, ind, args.vocab_size, device + ) sums_2 = data_batch_2.sum(1).unsqueeze(1) res = torch.mm(theta, beta) preds = torch.log(res) recon_loss = -(preds * data_batch_2).sum(1) - + loss = recon_loss / sums_2.squeeze() loss = loss.mean().item() acc_loss += loss cnt += 1 cur_loss = acc_loss / cnt ppl_dc = round(math.exp(cur_loss), 1) - print('*'*100) - print('{} Doc Completion PPL: {}'.format(source.upper(), ppl_dc)) - print('*'*100) + print("*" * 100) + print("{} Doc Completion PPL: {}".format(source.upper(), ppl_dc)) + print("*" * 100) if tc or td: beta = beta.data.cpu().numpy() if tc: - print('Computing topic coherence...') + print("Computing topic coherence...") get_topic_coherence(beta, train_tokens, vocab) if td: - print('Computing topic diversity...') + print("Computing topic diversity...") get_topic_diversity(beta, 25) return ppl_dc -if args.mode == 'train': - ## train model on data + +if args.mode == "train": + # train model on data best_epoch = 0 best_val_ppl = 1e9 all_val_ppls = [] - print('\n') - print('Visualizing model quality before training...') + print("\n") + print("Visualizing model quality before training...") visualize(model) - print('\n') + print("\n") for epoch in range(1, args.epochs): train(epoch) - val_ppl = evaluate(model, 'val') + val_ppl = evaluate(model, "val") if val_ppl < best_val_ppl: - with open(ckpt, 'wb') as f: + with open(ckpt, "wb") as f: torch.save(model, f) best_epoch = epoch best_val_ppl = val_ppl else: - ## check whether to anneal lr - lr = optimizer.param_groups[0]['lr'] - if args.anneal_lr and (len(all_val_ppls) > args.nonmono and val_ppl > min(all_val_ppls[:-args.nonmono]) and lr > 1e-5): - optimizer.param_groups[0]['lr'] /= args.lr_factor + # check whether to anneal lr + lr = optimizer.param_groups[0]["lr"] + if args.anneal_lr and ( + len(all_val_ppls) > args.nonmono 
+ and val_ppl > min(all_val_ppls[: -args.nonmono]) + and lr > 1e-5 + ): + optimizer.param_groups[0]["lr"] /= args.lr_factor if epoch % args.visualize_every == 0: visualize(model) all_val_ppls.append(val_ppl) - with open(ckpt, 'rb') as f: + with open(ckpt, "rb") as f: model = torch.load(f) model = model.to(device) - val_ppl = evaluate(model, 'val') -else: - with open(ckpt, 'rb') as f: + val_ppl = evaluate(model, "val") +else: + with open(ckpt, "rb") as f: model = torch.load(f) model = model.to(device) model.eval() with torch.no_grad(): - ## get document completion perplexities - test_ppl = evaluate(model, 'test', tc=args.tc, td=args.td) + # get document completion perplexities + test_ppl = evaluate(model, "test", tc=args.tc, td=args.td) - ## get most used topics + # get most used topics indices = torch.tensor(range(args.num_docs_train)) indices = torch.split(indices, args.batch_size) thetaAvg = torch.zeros(1, args.num_topics).to(device) thetaWeightedAvg = torch.zeros(1, args.num_topics).to(device) cnt = 0 for idx, ind in enumerate(indices): - data_batch = data.get_batch(train_tokens, train_counts, ind, args.vocab_size, device) + data_batch = data.get_batch( + train_tokens, train_counts, ind, args.vocab_size, device + ) sums = data_batch.sum(1).unsqueeze(1) cnt += sums.sum(0).squeeze().cpu().numpy() if args.bow_norm: @@ -351,30 +498,51 @@ def evaluate(m, source, tc=False, td=False): weighed_theta = sums * theta thetaWeightedAvg += weighed_theta.sum(0).unsqueeze(0) if idx % 100 == 0 and idx > 0: - print('batch: {}/{}'.format(idx, len(indices))) + print("batch: {}/{}".format(idx, len(indices))) thetaWeightedAvg = thetaWeightedAvg.squeeze().cpu().numpy() / cnt - print('\nThe 10 most used topics are {}'.format(thetaWeightedAvg.argsort()[::-1][:10])) + print( + "\nThe 10 most used topics are {}".format( + thetaWeightedAvg.argsort()[::-1][:10] + ) + ) - ## show topics + # show topics beta = model.get_beta() - topic_indices = list(np.random.choice(args.num_topics, 10)) # 10 random topics - print('\n') - for k in range(args.num_topics):#topic_indices: + topic_indices = list(np.random.choice(args.num_topics, 10)) # 10 random topics + print("\n") + for k in range(args.num_topics): # topic_indices: gamma = beta[k] - top_words = list(gamma.cpu().numpy().argsort()[-args.num_words+1:][::-1]) + top_words = list(gamma.cpu().numpy().argsort()[-args.num_words + 1 :][::-1]) topic_words = [vocab[a] for a in top_words] - print('Topic {}: {}'.format(k, topic_words)) + print("Topic {}: {}".format(k, topic_words)) if args.train_embeddings: - ## show etm embeddings + # show etm embeddings try: rho_etm = model.rho.weight.cpu() except: rho_etm = model.rho.cpu() - queries = ['andrew', 'woman', 'computer', 'sports', 'religion', 'man', 'love', - 'intelligence', 'money', 'politics', 'health', 'people', 'family'] - print('\n') - print('ETM embeddings...') + queries = [ + "andrew", + "woman", + "computer", + "sports", + "religion", + "man", + "love", + "intelligence", + "money", + "politics", + "health", + "people", + "family", + ] + print("\n") + print("ETM embeddings...") for word in queries: - print('word: {} .. etm neighbors: {}'.format(word, nearest_neighbors(word, rho_etm, vocab))) - print('\n') + print( + "word: {} .. 
etm neighbors: {}".format( + word, nearest_neighbors(word, rho_etm, vocab) + ) + ) + print("\n") diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..b9d8531 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,11 @@ +black +gensim +ipdb +isort +matplotlib +pyls-black +pyls-isort +pyls-mypy +python-language-server[all]==0.36.2 +scikit-learn +scipy diff --git a/scripts/data_20ng.py b/scripts/data_20ng.py index f6528af..f01af92 100644 --- a/scripts/data_20ng.py +++ b/scripts/data_20ng.py @@ -1,58 +1,73 @@ -from sklearn.feature_extraction.text import CountVectorizer -from sklearn.datasets import fetch_20newsgroups -import numpy as np +import os import pickle -import random -from scipy import sparse -import itertools -from scipy.io import savemat, loadmat import re import string +import numpy as np +from scipy import sparse +from scipy.io import savemat +from sklearn.datasets import fetch_20newsgroups +from sklearn.feature_extraction.text import CountVectorizer + # Maximum / minimum document frequency max_df = 0.7 min_df = 10 # choose desired value for min_df # Read stopwords -with open('stops.txt', 'r') as f: - stops = f.read().split('\n') +with open("stops.txt", "r") as f: + stops = f.read().split("\n") # Read data -print('reading data...') -train_data = fetch_20newsgroups(subset='train') -test_data = fetch_20newsgroups(subset='test') +print("reading data...") +train_data = fetch_20newsgroups(subset="train") +test_data = fetch_20newsgroups(subset="test") + +init_docs_tr = [ + re.findall(r"""[\w']+|[.,!?;-~{}`´_<=>:/@*()&'$%#"]""", train_data.data[doc]) + for doc in range(len(train_data.data)) +] +init_docs_ts = [ + re.findall(r"""[\w']+|[.,!?;-~{}`´_<=>:/@*()&'$%#"]""", test_data.data[doc]) + for doc in range(len(test_data.data)) +] -init_docs_tr = [re.findall(r'''[\w']+|[.,!?;-~{}`´_<=>:/@*()&'$%#"]''', train_data.data[doc]) for doc in range(len(train_data.data))] -init_docs_ts = [re.findall(r'''[\w']+|[.,!?;-~{}`´_<=>:/@*()&'$%#"]''', test_data.data[doc]) for doc in range(len(test_data.data))] def contains_punctuation(w): return any(char in string.punctuation for char in w) + def contains_numeric(w): return any(char.isdigit() for char in w) - + + init_docs = init_docs_tr + init_docs_ts -init_docs = [[w.lower() for w in init_docs[doc] if not contains_punctuation(w)] for doc in range(len(init_docs))] -init_docs = [[w for w in init_docs[doc] if not contains_numeric(w)] for doc in range(len(init_docs))] -init_docs = [[w for w in init_docs[doc] if len(w)>1] for doc in range(len(init_docs))] +init_docs = [ + [w.lower() for w in init_docs[doc] if not contains_punctuation(w)] + for doc in range(len(init_docs)) +] +init_docs = [ + [w for w in init_docs[doc] if not contains_numeric(w)] + for doc in range(len(init_docs)) +] +init_docs = [[w for w in init_docs[doc] if len(w) > 1] for doc in range(len(init_docs))] init_docs = [" ".join(init_docs[doc]) for doc in range(len(init_docs))] -# Create count vectorizer -print('counting document frequency of words...') +#  Create count vectorizer +print("counting document frequency of words...") cvectorizer = CountVectorizer(min_df=min_df, max_df=max_df, stop_words=None) cvz = cvectorizer.fit_transform(init_docs).sign() -# Get vocabulary -print('building the vocabulary...') +#  Get vocabulary +print("building the vocabulary...") sum_counts = cvz.sum(axis=0) v_size = sum_counts.shape[1] sum_counts_np = np.zeros(v_size, dtype=int) for v in range(v_size): - sum_counts_np[v] = sum_counts[0,v] + sum_counts_np[v] = sum_counts[0, v] word2id 
= dict([(w, cvectorizer.vocabulary_.get(w)) for w in cvectorizer.vocabulary_]) id2word = dict([(cvectorizer.vocabulary_.get(w), w) for w in cvectorizer.vocabulary_]) del cvectorizer -print(' initial vocabulary size: {}'.format(v_size)) +print(" initial vocabulary size: {}".format(v_size)) # Sort elements in vocabulary idx_sort = np.argsort(sum_counts_np) @@ -60,7 +75,7 @@ def contains_numeric(w): # Filter out stopwords (if any) vocab_aux = [w for w in vocab_aux if w not in stops] -print(' vocabulary size after removing stopwords from list: {}'.format(len(vocab_aux))) +print(" vocabulary size after removing stopwords from list: {}".format(len(vocab_aux))) # Create dictionary and inverse dictionary vocab = vocab_aux @@ -68,83 +83,143 @@ def contains_numeric(w): word2id = dict([(w, j) for j, w in enumerate(vocab)]) id2word = dict([(j, w) for j, w in enumerate(vocab)]) -# Split in train/test/valid -print('tokenizing documents and splitting into train/test/valid...') +#  Split in train/test/valid +print("tokenizing documents and splitting into train/test/valid...") num_docs_tr = len(init_docs_tr) -trSize = num_docs_tr-100 +trSize = num_docs_tr - 100 tsSize = len(init_docs_ts) vaSize = 100 idx_permute = np.random.permutation(num_docs_tr).astype(int) -# Remove words not in train_data -vocab = list(set([w for idx_d in range(trSize) for w in init_docs[idx_permute[idx_d]].split() if w in word2id])) +#  Remove words not in train_data +vocab = list( + set( + [ + w + for idx_d in range(trSize) + for w in init_docs[idx_permute[idx_d]].split() + if w in word2id + ] + ) +) word2id = dict([(w, j) for j, w in enumerate(vocab)]) id2word = dict([(j, w) for j, w in enumerate(vocab)]) -print(' vocabulary after removing words not in train: {}'.format(len(vocab))) +print(" vocabulary after removing words not in train: {}".format(len(vocab))) + +#  Split in train/test/valid +docs_tr = [ + [word2id[w] for w in init_docs[idx_permute[idx_d]].split() if w in word2id] + for idx_d in range(trSize) +] +docs_va = [ + [word2id[w] for w in init_docs[idx_permute[idx_d + trSize]].split() if w in word2id] + for idx_d in range(vaSize) +] +docs_ts = [ + [word2id[w] for w in init_docs[idx_d + num_docs_tr].split() if w in word2id] + for idx_d in range(tsSize) +] + +print( + " number of documents (train): {} [this should be equal to {}]".format( + len(docs_tr), trSize + ) +) +print( + " number of documents (test): {} [this should be equal to {}]".format( + len(docs_ts), tsSize + ) +) +print( + " number of documents (valid): {} [this should be equal to {}]".format( + len(docs_va), vaSize + ) +) + +#  Remove empty documents +print("removing empty documents...") -# Split in train/test/valid -docs_tr = [[word2id[w] for w in init_docs[idx_permute[idx_d]].split() if w in word2id] for idx_d in range(trSize)] -docs_va = [[word2id[w] for w in init_docs[idx_permute[idx_d+trSize]].split() if w in word2id] for idx_d in range(vaSize)] -docs_ts = [[word2id[w] for w in init_docs[idx_d+num_docs_tr].split() if w in word2id] for idx_d in range(tsSize)] - -print(' number of documents (train): {} [this should be equal to {}]'.format(len(docs_tr), trSize)) -print(' number of documents (test): {} [this should be equal to {}]'.format(len(docs_ts), tsSize)) -print(' number of documents (valid): {} [this should be equal to {}]'.format(len(docs_va), vaSize)) - -# Remove empty documents -print('removing empty documents...') def remove_empty(in_docs): - return [doc for doc in in_docs if doc!=[]] + return [doc for doc in in_docs if doc != []] + docs_tr = 
remove_empty(docs_tr) docs_ts = remove_empty(docs_ts) docs_va = remove_empty(docs_va) # Remove test documents with length=1 -docs_ts = [doc for doc in docs_ts if len(doc)>1] +docs_ts = [doc for doc in docs_ts if len(doc) > 1] # Split test set in 2 halves -print('splitting test documents in 2 halves...') -docs_ts_h1 = [[w for i,w in enumerate(doc) if i<=len(doc)/2.0-1] for doc in docs_ts] -docs_ts_h2 = [[w for i,w in enumerate(doc) if i>len(doc)/2.0-1] for doc in docs_ts] +print("splitting test documents in 2 halves...") +docs_ts_h1 = [ + [w for i, w in enumerate(doc) if i <= len(doc) / 2.0 - 1] for doc in docs_ts +] +docs_ts_h2 = [ + [w for i, w in enumerate(doc) if i > len(doc) / 2.0 - 1] for doc in docs_ts +] # Getting lists of words and doc_indices -print('creating lists of words...') +print("creating lists of words...") + def create_list_words(in_docs): return [x for y in in_docs for x in y] + words_tr = create_list_words(docs_tr) words_ts = create_list_words(docs_ts) words_ts_h1 = create_list_words(docs_ts_h1) words_ts_h2 = create_list_words(docs_ts_h2) words_va = create_list_words(docs_va) -print(' len(words_tr): ', len(words_tr)) -print(' len(words_ts): ', len(words_ts)) -print(' len(words_ts_h1): ', len(words_ts_h1)) -print(' len(words_ts_h2): ', len(words_ts_h2)) -print(' len(words_va): ', len(words_va)) +print(" len(words_tr): ", len(words_tr)) +print(" len(words_ts): ", len(words_ts)) +print(" len(words_ts_h1): ", len(words_ts_h1)) +print(" len(words_ts_h2): ", len(words_ts_h2)) +print(" len(words_va): ", len(words_va)) # Get doc indices -print('getting doc indices...') +print("getting doc indices...") + def create_doc_indices(in_docs): aux = [[j for i in range(len(doc))] for j, doc in enumerate(in_docs)] return [int(x) for y in aux for x in y] + doc_indices_tr = create_doc_indices(docs_tr) doc_indices_ts = create_doc_indices(docs_ts) doc_indices_ts_h1 = create_doc_indices(docs_ts_h1) doc_indices_ts_h2 = create_doc_indices(docs_ts_h2) doc_indices_va = create_doc_indices(docs_va) -print(' len(np.unique(doc_indices_tr)): {} [this should be {}]'.format(len(np.unique(doc_indices_tr)), len(docs_tr))) -print(' len(np.unique(doc_indices_ts)): {} [this should be {}]'.format(len(np.unique(doc_indices_ts)), len(docs_ts))) -print(' len(np.unique(doc_indices_ts_h1)): {} [this should be {}]'.format(len(np.unique(doc_indices_ts_h1)), len(docs_ts_h1))) -print(' len(np.unique(doc_indices_ts_h2)): {} [this should be {}]'.format(len(np.unique(doc_indices_ts_h2)), len(docs_ts_h2))) -print(' len(np.unique(doc_indices_va)): {} [this should be {}]'.format(len(np.unique(doc_indices_va)), len(docs_va))) +print( + " len(np.unique(doc_indices_tr)): {} [this should be {}]".format( + len(np.unique(doc_indices_tr)), len(docs_tr) + ) +) +print( + " len(np.unique(doc_indices_ts)): {} [this should be {}]".format( + len(np.unique(doc_indices_ts)), len(docs_ts) + ) +) +print( + " len(np.unique(doc_indices_ts_h1)): {} [this should be {}]".format( + len(np.unique(doc_indices_ts_h1)), len(docs_ts_h1) + ) +) +print( + " len(np.unique(doc_indices_ts_h2)): {} [this should be {}]".format( + len(np.unique(doc_indices_ts_h2)), len(docs_ts_h2) + ) +) +print( + " len(np.unique(doc_indices_va)): {} [this should be {}]".format( + len(np.unique(doc_indices_va)), len(docs_va) + ) +) # Number of documents in each set n_docs_tr = len(docs_tr) @@ -161,10 +236,14 @@ def create_doc_indices(in_docs): del docs_va # Create bow representation -print('creating bow representation...') +print("creating bow representation...") + def 
create_bow(doc_indices, words, n_docs, vocab_size): - return sparse.coo_matrix(([1]*len(doc_indices),(doc_indices, words)), shape=(n_docs, vocab_size)).tocsr() + return sparse.coo_matrix( + ([1] * len(doc_indices), (doc_indices, words)), shape=(n_docs, vocab_size) + ).tocsr() + bow_tr = create_bow(doc_indices_tr, words_tr, n_docs_tr, len(vocab)) bow_ts = create_bow(doc_indices_ts, words_ts, n_docs_ts, len(vocab)) @@ -184,57 +263,66 @@ def create_bow(doc_indices, words, n_docs, vocab_size): del doc_indices_va # Write the vocabulary to a file -path_save = './min_df_' + str(min_df) + '/' +path_save = "./min_df_" + str(min_df) + "/" if not os.path.isdir(path_save): - os.system('mkdir -p ' + path_save) + os.system("mkdir -p " + path_save) -with open(path_save + 'vocab.pkl', 'wb') as f: +with open(path_save + "vocab.pkl", "wb") as f: pickle.dump(vocab, f) del vocab # Split bow intro token/value pairs -print('splitting bow intro token/value pairs and saving to disk...') +print("splitting bow intro token/value pairs and saving to disk...") + def split_bow(bow_in, n_docs): - indices = [[w for w in bow_in[doc,:].indices] for doc in range(n_docs)] - counts = [[c for c in bow_in[doc,:].data] for doc in range(n_docs)] + indices = [[w for w in bow_in[doc, :].indices] for doc in range(n_docs)] + counts = [[c for c in bow_in[doc, :].data] for doc in range(n_docs)] return indices, counts + bow_tr_tokens, bow_tr_counts = split_bow(bow_tr, n_docs_tr) -savemat(path_save + 'bow_tr_tokens', {'tokens': bow_tr_tokens}, do_compression=True) -savemat(path_save + 'bow_tr_counts', {'counts': bow_tr_counts}, do_compression=True) +savemat(path_save + "bow_tr_tokens", {"tokens": bow_tr_tokens}, do_compression=True) +savemat(path_save + "bow_tr_counts", {"counts": bow_tr_counts}, do_compression=True) del bow_tr del bow_tr_tokens del bow_tr_counts bow_ts_tokens, bow_ts_counts = split_bow(bow_ts, n_docs_ts) -savemat(path_save + 'bow_ts_tokens', {'tokens': bow_ts_tokens}, do_compression=True) -savemat(path_save + 'bow_ts_counts', {'counts': bow_ts_counts}, do_compression=True) +savemat(path_save + "bow_ts_tokens", {"tokens": bow_ts_tokens}, do_compression=True) +savemat(path_save + "bow_ts_counts", {"counts": bow_ts_counts}, do_compression=True) del bow_ts del bow_ts_tokens del bow_ts_counts bow_ts_h1_tokens, bow_ts_h1_counts = split_bow(bow_ts_h1, n_docs_ts_h1) -savemat(path_save + 'bow_ts_h1_tokens', {'tokens': bow_ts_h1_tokens}, do_compression=True) -savemat(path_save + 'bow_ts_h1_counts', {'counts': bow_ts_h1_counts}, do_compression=True) +savemat( + path_save + "bow_ts_h1_tokens", {"tokens": bow_ts_h1_tokens}, do_compression=True +) +savemat( + path_save + "bow_ts_h1_counts", {"counts": bow_ts_h1_counts}, do_compression=True +) del bow_ts_h1 del bow_ts_h1_tokens del bow_ts_h1_counts bow_ts_h2_tokens, bow_ts_h2_counts = split_bow(bow_ts_h2, n_docs_ts_h2) -savemat(path_save + 'bow_ts_h2_tokens', {'tokens': bow_ts_h2_tokens}, do_compression=True) -savemat(path_save + 'bow_ts_h2_counts', {'counts': bow_ts_h2_counts}, do_compression=True) +savemat( + path_save + "bow_ts_h2_tokens", {"tokens": bow_ts_h2_tokens}, do_compression=True +) +savemat( + path_save + "bow_ts_h2_counts", {"counts": bow_ts_h2_counts}, do_compression=True +) del bow_ts_h2 del bow_ts_h2_tokens del bow_ts_h2_counts bow_va_tokens, bow_va_counts = split_bow(bow_va, n_docs_va) -savemat(path_save + 'bow_va_tokens', {'tokens': bow_va_tokens}, do_compression=True) -savemat(path_save + 'bow_va_counts', {'counts': bow_va_counts}, do_compression=True) 
+savemat(path_save + "bow_va_tokens", {"tokens": bow_va_tokens}, do_compression=True) +savemat(path_save + "bow_va_counts", {"counts": bow_va_counts}, do_compression=True) del bow_va del bow_va_tokens del bow_va_counts -print('Data ready !!') -print('*************') - +print("Data ready !!") +print("*************") diff --git a/scripts/data_nyt.py b/scripts/data_nyt.py index 01d657f..2af3f68 100644 --- a/scripts/data_nyt.py +++ b/scripts/data_nyt.py @@ -1,41 +1,41 @@ -from sklearn.feature_extraction.text import CountVectorizer -import numpy as np +import os import pickle -import random + +import numpy as np from scipy import sparse -import itertools -from scipy.io import savemat, loadmat +from scipy.io import savemat +from sklearn.feature_extraction.text import CountVectorizer # Maximum / minimum document frequency max_df = 0.7 min_df = 100 # choose desired value for min_df # Read stopwords -with open('stops.txt', 'r') as f: - stops = f.read().split('\n') +with open("stops.txt", "r") as f: + stops = f.read().split("\n") # Read data -print('reading text file...') -data_file = 'raw/new_york_times_text/nyt_docs.txt' -with open(data_file, 'r') as f: +print("reading text file...") +data_file = "raw/new_york_times_text/nyt_docs.txt" +with open(data_file, "r") as f: docs = f.readlines() -# Create count vectorizer -print('counting document frequency of words...') +#  Create count vectorizer +print("counting document frequency of words...") cvectorizer = CountVectorizer(min_df=min_df, max_df=max_df, stop_words=None) cvz = cvectorizer.fit_transform(docs).sign() -# Get vocabulary -print('building the vocabulary...') +#  Get vocabulary +print("building the vocabulary...") sum_counts = cvz.sum(axis=0) v_size = sum_counts.shape[1] sum_counts_np = np.zeros(v_size, dtype=int) for v in range(v_size): - sum_counts_np[v] = sum_counts[0,v] + sum_counts_np[v] = sum_counts[0, v] word2id = dict([(w, cvectorizer.vocabulary_.get(w)) for w in cvectorizer.vocabulary_]) id2word = dict([(cvectorizer.vocabulary_.get(w), w) for w in cvectorizer.vocabulary_]) del cvectorizer -print(' initial vocabulary size: {}'.format(v_size)) +print(" initial vocabulary size: {}".format(v_size)) # Sort elements in vocabulary idx_sort = np.argsort(sum_counts_np) @@ -43,8 +43,8 @@ # Filter out stopwords (if any) vocab_aux = [w for w in vocab_aux if w not in stops] -print(' vocabulary size after removing stopwords from list: {}'.format(len(vocab_aux))) -print(' vocabulary after removing stopwords: {}'.format(len(vocab_aux))) +print(" vocabulary size after removing stopwords from list: {}".format(len(vocab_aux))) +print(" vocabulary after removing stopwords: {}".format(len(vocab_aux))) # Create dictionary and inverse dictionary vocab = vocab_aux @@ -52,84 +52,148 @@ word2id = dict([(w, j) for j, w in enumerate(vocab)]) id2word = dict([(j, w) for j, w in enumerate(vocab)]) -# Split in train/test/valid -print('tokenizing documents and splitting into train/test/valid...') +#  Split in train/test/valid +print("tokenizing documents and splitting into train/test/valid...") num_docs = cvz.shape[0] -trSize = int(np.floor(0.85*num_docs)) -tsSize = int(np.floor(0.10*num_docs)) +trSize = int(np.floor(0.85 * num_docs)) +tsSize = int(np.floor(0.10 * num_docs)) vaSize = int(num_docs - trSize - tsSize) del cvz idx_permute = np.random.permutation(num_docs).astype(int) -# Remove words not in train_data -vocab = list(set([w for idx_d in range(trSize) for w in docs[idx_permute[idx_d]].split() if w in word2id])) +#  Remove words not in train_data +vocab = 
list( + set( + [ + w + for idx_d in range(trSize) + for w in docs[idx_permute[idx_d]].split() + if w in word2id + ] + ) +) word2id = dict([(w, j) for j, w in enumerate(vocab)]) id2word = dict([(j, w) for j, w in enumerate(vocab)]) -print(' vocabulary after removing words not in train: {}'.format(len(vocab))) - -docs_tr = [[word2id[w] for w in docs[idx_permute[idx_d]].split() if w in word2id] for idx_d in range(trSize)] -docs_ts = [[word2id[w] for w in docs[idx_permute[idx_d+trSize]].split() if w in word2id] for idx_d in range(tsSize)] -docs_va = [[word2id[w] for w in docs[idx_permute[idx_d+trSize+tsSize]].split() if w in word2id] for idx_d in range(vaSize)] +print(" vocabulary after removing words not in train: {}".format(len(vocab))) + +docs_tr = [ + [word2id[w] for w in docs[idx_permute[idx_d]].split() if w in word2id] + for idx_d in range(trSize) +] +docs_ts = [ + [word2id[w] for w in docs[idx_permute[idx_d + trSize]].split() if w in word2id] + for idx_d in range(tsSize) +] +docs_va = [ + [ + word2id[w] + for w in docs[idx_permute[idx_d + trSize + tsSize]].split() + if w in word2id + ] + for idx_d in range(vaSize) +] del docs -print(' number of documents (train): {} [this should be equal to {}]'.format(len(docs_tr), trSize)) -print(' number of documents (test): {} [this should be equal to {}]'.format(len(docs_ts), tsSize)) -print(' number of documents (valid): {} [this should be equal to {}]'.format(len(docs_va), vaSize)) +print( + " number of documents (train): {} [this should be equal to {}]".format( + len(docs_tr), trSize + ) +) +print( + " number of documents (test): {} [this should be equal to {}]".format( + len(docs_ts), tsSize + ) +) +print( + " number of documents (valid): {} [this should be equal to {}]".format( + len(docs_va), vaSize + ) +) + +#  Remove empty documents +print("removing empty documents...") -# Remove empty documents -print('removing empty documents...') def remove_empty(in_docs): - return [doc for doc in in_docs if doc!=[]] + return [doc for doc in in_docs if doc != []] + docs_tr = remove_empty(docs_tr) docs_ts = remove_empty(docs_ts) docs_va = remove_empty(docs_va) # Remove test documents with length=1 -docs_ts = [doc for doc in docs_ts if len(doc)>1] +docs_ts = [doc for doc in docs_ts if len(doc) > 1] # Split test set in 2 halves -print('splitting test documents in 2 halves...') -docs_ts_h1 = [[w for i,w in enumerate(doc) if i<=len(doc)/2.0-1] for doc in docs_ts] -docs_ts_h2 = [[w for i,w in enumerate(doc) if i>len(doc)/2.0-1] for doc in docs_ts] +print("splitting test documents in 2 halves...") +docs_ts_h1 = [ + [w for i, w in enumerate(doc) if i <= len(doc) / 2.0 - 1] for doc in docs_ts +] +docs_ts_h2 = [ + [w for i, w in enumerate(doc) if i > len(doc) / 2.0 - 1] for doc in docs_ts +] # Getting lists of words and doc_indices -print('creating lists of words...') +print("creating lists of words...") + def create_list_words(in_docs): return [x for y in in_docs for x in y] + words_tr = create_list_words(docs_tr) words_ts = create_list_words(docs_ts) words_ts_h1 = create_list_words(docs_ts_h1) words_ts_h2 = create_list_words(docs_ts_h2) words_va = create_list_words(docs_va) -print(' len(words_tr): ', len(words_tr)) -print(' len(words_ts): ', len(words_ts)) -print(' len(words_ts_h1): ', len(words_ts_h1)) -print(' len(words_ts_h2): ', len(words_ts_h2)) -print(' len(words_va): ', len(words_va)) +print(" len(words_tr): ", len(words_tr)) +print(" len(words_ts): ", len(words_ts)) +print(" len(words_ts_h1): ", len(words_ts_h1)) +print(" len(words_ts_h2): ", 
len(words_ts_h2)) +print(" len(words_va): ", len(words_va)) # Get doc indices -print('getting doc indices...') +print("getting doc indices...") + def create_doc_indices(in_docs): aux = [[j for i in range(len(doc))] for j, doc in enumerate(in_docs)] return [int(x) for y in aux for x in y] + doc_indices_tr = create_doc_indices(docs_tr) doc_indices_ts = create_doc_indices(docs_ts) doc_indices_ts_h1 = create_doc_indices(docs_ts_h1) doc_indices_ts_h2 = create_doc_indices(docs_ts_h2) doc_indices_va = create_doc_indices(docs_va) -print(' len(np.unique(doc_indices_tr)): {} [this should be {}]'.format(len(np.unique(doc_indices_tr)), len(docs_tr))) -print(' len(np.unique(doc_indices_ts)): {} [this should be {}]'.format(len(np.unique(doc_indices_ts)), len(docs_ts))) -print(' len(np.unique(doc_indices_ts_h1)): {} [this should be {}]'.format(len(np.unique(doc_indices_ts_h1)), len(docs_ts_h1))) -print(' len(np.unique(doc_indices_ts_h2)): {} [this should be {}]'.format(len(np.unique(doc_indices_ts_h2)), len(docs_ts_h2))) -print(' len(np.unique(doc_indices_va)): {} [this should be {}]'.format(len(np.unique(doc_indices_va)), len(docs_va))) +print( + " len(np.unique(doc_indices_tr)): {} [this should be {}]".format( + len(np.unique(doc_indices_tr)), len(docs_tr) + ) +) +print( + " len(np.unique(doc_indices_ts)): {} [this should be {}]".format( + len(np.unique(doc_indices_ts)), len(docs_ts) + ) +) +print( + " len(np.unique(doc_indices_ts_h1)): {} [this should be {}]".format( + len(np.unique(doc_indices_ts_h1)), len(docs_ts_h1) + ) +) +print( + " len(np.unique(doc_indices_ts_h2)): {} [this should be {}]".format( + len(np.unique(doc_indices_ts_h2)), len(docs_ts_h2) + ) +) +print( + " len(np.unique(doc_indices_va)): {} [this should be {}]".format( + len(np.unique(doc_indices_va)), len(docs_va) + ) +) # Number of documents in each set n_docs_tr = len(docs_tr) @@ -146,10 +210,14 @@ def create_doc_indices(in_docs): del docs_va # Create bow representation -print('creating bow representation...') +print("creating bow representation...") + def create_bow(doc_indices, words, n_docs, vocab_size): - return sparse.coo_matrix(([1]*len(doc_indices),(doc_indices, words)), shape=(n_docs, vocab_size)).tocsr() + return sparse.coo_matrix( + ([1] * len(doc_indices), (doc_indices, words)), shape=(n_docs, vocab_size) + ).tocsr() + bow_tr = create_bow(doc_indices_tr, words_tr, n_docs_tr, len(vocab)) bow_ts = create_bow(doc_indices_ts, words_ts, n_docs_ts, len(vocab)) @@ -169,57 +237,66 @@ def create_bow(doc_indices, words, n_docs, vocab_size): del doc_indices_va # Save vocabulary to file -path_save = './min_df_' + str(min_df) + '/' +path_save = "./min_df_" + str(min_df) + "/" if not os.path.isdir(path_save): - os.system('mkdir -p ' + path_save) + os.system("mkdir -p " + path_save) -with open(path_save + 'vocab.pkl', 'wb') as f: +with open(path_save + "vocab.pkl", "wb") as f: pickle.dump(vocab, f) del vocab # Split bow intro token/value pairs -print('splitting bow intro token/value pairs and saving to disk...') +print("splitting bow intro token/value pairs and saving to disk...") + def split_bow(bow_in, n_docs): - indices = [[w for w in bow_in[doc,:].indices] for doc in range(n_docs)] - counts = [[c for c in bow_in[doc,:].data] for doc in range(n_docs)] + indices = [[w for w in bow_in[doc, :].indices] for doc in range(n_docs)] + counts = [[c for c in bow_in[doc, :].data] for doc in range(n_docs)] return indices, counts + bow_tr_tokens, bow_tr_counts = split_bow(bow_tr, n_docs_tr) -savemat(path_save + 'bow_tr_tokens', {'tokens': 
bow_tr_tokens}, do_compression=True) -savemat(path_save + 'bow_tr_counts', {'counts': bow_tr_counts}, do_compression=True) +savemat(path_save + "bow_tr_tokens", {"tokens": bow_tr_tokens}, do_compression=True) +savemat(path_save + "bow_tr_counts", {"counts": bow_tr_counts}, do_compression=True) del bow_tr del bow_tr_tokens del bow_tr_counts bow_ts_tokens, bow_ts_counts = split_bow(bow_ts, n_docs_ts) -savemat(path_save + 'bow_ts_tokens', {'tokens': bow_ts_tokens}, do_compression=True) -savemat(path_save + 'bow_ts_counts', {'counts': bow_ts_counts}, do_compression=True) +savemat(path_save + "bow_ts_tokens", {"tokens": bow_ts_tokens}, do_compression=True) +savemat(path_save + "bow_ts_counts", {"counts": bow_ts_counts}, do_compression=True) del bow_ts del bow_ts_tokens del bow_ts_counts bow_ts_h1_tokens, bow_ts_h1_counts = split_bow(bow_ts_h1, n_docs_ts_h1) -savemat(path_save + 'bow_ts_h1_tokens', {'tokens': bow_ts_h1_tokens}, do_compression=True) -savemat(path_save + 'bow_ts_h1_counts', {'counts': bow_ts_h1_counts}, do_compression=True) +savemat( + path_save + "bow_ts_h1_tokens", {"tokens": bow_ts_h1_tokens}, do_compression=True +) +savemat( + path_save + "bow_ts_h1_counts", {"counts": bow_ts_h1_counts}, do_compression=True +) del bow_ts_h1 del bow_ts_h1_tokens del bow_ts_h1_counts bow_ts_h2_tokens, bow_ts_h2_counts = split_bow(bow_ts_h2, n_docs_ts_h2) -savemat(path_save + 'bow_ts_h2_tokens', {'tokens': bow_ts_h2_tokens}, do_compression=True) -savemat(path_save + 'bow_ts_h2_counts', {'counts': bow_ts_h2_counts}, do_compression=True) +savemat( + path_save + "bow_ts_h2_tokens", {"tokens": bow_ts_h2_tokens}, do_compression=True +) +savemat( + path_save + "bow_ts_h2_counts", {"counts": bow_ts_h2_counts}, do_compression=True +) del bow_ts_h2 del bow_ts_h2_tokens del bow_ts_h2_counts bow_va_tokens, bow_va_counts = split_bow(bow_va, n_docs_va) -savemat(path_save + 'bow_va_tokens', {'tokens': bow_va_tokens}, do_compression=True) -savemat(path_save + 'bow_va_counts', {'counts': bow_va_counts}, do_compression=True) +savemat(path_save + "bow_va_tokens", {"tokens": bow_va_tokens}, do_compression=True) +savemat(path_save + "bow_va_counts", {"counts": bow_va_counts}, do_compression=True) del bow_va del bow_va_tokens del bow_va_counts -print('Data ready !!') -print('*************') - +print("Data ready !!") +print("*************") diff --git a/skipgram.py b/skipgram.py index 4ad50ee..2440c12 100644 --- a/skipgram.py +++ b/skipgram.py @@ -1,43 +1,69 @@ -import gensim -import pickle -import os -import numpy as np import argparse -parser = argparse.ArgumentParser(description='The Embedded Topic Model') +import gensim + +parser = argparse.ArgumentParser(description="The Embedded Topic Model") -### data and file related arguments -parser.add_argument('--data_file', type=str, default='', help='a .txt file containing the corpus') -parser.add_argument('--emb_file', type=str, default='embeddings.txt', help='file to save the word embeddings') -parser.add_argument('--dim_rho', type=int, default=300, help='dimensionality of the word embeddings') -parser.add_argument('--min_count', type=int, default=2, help='minimum term frequency (to define the vocabulary)') -parser.add_argument('--sg', type=int, default=1, help='whether to use skip-gram') -parser.add_argument('--workers', type=int, default=25, help='number of CPU cores') -parser.add_argument('--negative_samples', type=int, default=10, help='number of negative samples') -parser.add_argument('--window_size', type=int, default=4, help='window size to determine context') 
-parser.add_argument('--iters', type=int, default=50, help='number of iterationst') +# data and file related arguments +parser.add_argument( + "--data_file", type=str, default="", help="a .txt file containing the corpus" +) +parser.add_argument( + "--emb_file", + type=str, + default="embeddings.txt", + help="file to save the word embeddings", +) +parser.add_argument( + "--dim_rho", type=int, default=300, help="dimensionality of the word embeddings" +) +parser.add_argument( + "--min_count", + type=int, + default=2, + help="minimum term frequency (to define the vocabulary)", +) +parser.add_argument("--sg", type=int, default=1, help="whether to use skip-gram") +parser.add_argument("--workers", type=int, default=25, help="number of CPU cores") +parser.add_argument( + "--negative_samples", type=int, default=10, help="number of negative samples" +) +parser.add_argument( + "--window_size", type=int, default=4, help="window size to determine context" +) +parser.add_argument("--iters", type=int, default=50, help="number of iterationst") args = parser.parse_args() + # Class for a memory-friendly iterator over the dataset class MySentences(object): def __init__(self, filename): self.filename = filename - + def __iter__(self): for line in open(self.filename): yield line.split() + # Gensim code to obtain the embeddings -sentences = MySentences(args.data_file) # a memory-friendly iterator -model = gensim.models.Word2Vec(sentences, min_count=args.min_count, sg=args.sg, size=args.dim_rho, - iter=args.iters, workers=args.workers, negative=args.negative_samples, window=args.window_size) +sentences = MySentences(args.data_file) # a memory-friendly iterator +model = gensim.models.Word2Vec( + sentences, + min_count=args.min_count, + sg=args.sg, + size=args.dim_rho, + iter=args.iters, + workers=args.workers, + negative=args.negative_samples, + window=args.window_size, +) # Write the embeddings to a file -with open(args.emb_file, 'w') as f: +with open(args.emb_file, "w") as f: for v in list(model.wv.vocab): vec = list(model.wv.__getitem__(v)) - f.write(v + ' ') - vec_str = ['%.9f' % val for val in vec] + f.write(v + " ") + vec_str = ["%.9f" % val for val in vec] vec_str = " ".join(vec_str) - f.write(vec_str + '\n') + f.write(vec_str + "\n") diff --git a/utils.py b/utils.py index 3975544..da40a44 100644 --- a/utils.py +++ b/utils.py @@ -1,22 +1,23 @@ -import torch import numpy as np + def get_topic_diversity(beta, topk): num_topics = beta.shape[0] list_w = np.zeros((num_topics, topk)) for k in range(num_topics): - idx = beta[k,:].argsort()[-topk:][::-1] - list_w[k,:] = idx + idx = beta[k, :].argsort()[-topk:][::-1] + list_w[k, :] = idx n_unique = len(np.unique(list_w)) TD = n_unique / (topk * num_topics) - print('Topic diveristy is: {}'.format(TD)) + print("Topic diveristy is: {}".format(TD)) + def get_document_frequency(data, wi, wj=None): if wj is None: D_wi = 0 for l in range(len(data)): doc = data[l].squeeze(0) - if len(doc) == 1: + if len(doc) == 1: continue else: doc = doc.squeeze() @@ -27,7 +28,7 @@ def get_document_frequency(data, wi, wj=None): D_wi_wj = 0 for l in range(len(data)): doc = data[l].squeeze(0) - if len(doc) == 1: + if len(doc) == 1: doc = [doc.squeeze()] else: doc = doc.squeeze() @@ -35,17 +36,17 @@ def get_document_frequency(data, wi, wj=None): D_wj += 1 if wi in doc: D_wi_wj += 1 - return D_wj, D_wi_wj + return D_wj, D_wi_wj + def get_topic_coherence(beta, data, vocab): - D = len(data) ## number of docs...data is list of documents - print('D: ', D) + D = len(data) # number of docs...data 
is list of documents + print("D: ", D) TC = [] num_topics = len(beta) for k in range(num_topics): - print('k: {}/{}'.format(k, num_topics)) + print("k: {}/{}".format(k, num_topics)) top_10 = list(beta[k].argsort()[-11:][::-1]) - top_words = [vocab[a] for a in top_10] TC_k = 0 counter = 0 for i, word in enumerate(top_10): @@ -60,28 +61,31 @@ def get_topic_coherence(beta, data, vocab): if D_wi_wj == 0: f_wi_wj = -1 else: - f_wi_wj = -1 + ( np.log(D_wi) + np.log(D_wj) - 2.0 * np.log(D) ) / ( np.log(D_wi_wj) - np.log(D) ) - # update tmp: + f_wi_wj = -1 + (np.log(D_wi) + np.log(D_wj) - 2.0 * np.log(D)) / ( + np.log(D_wi_wj) - np.log(D) + ) + # update tmp: tmp += f_wi_wj j += 1 counter += 1 # update TC_k - TC_k += tmp + TC_k += tmp TC.append(TC_k) - print('counter: ', counter) - print('num topics: ', len(TC)) + print("counter: ", counter) + print("num topics: ", len(TC)) TC = np.mean(TC) / counter - print('Topic coherence is: {}'.format(TC)) + print("Topic coherence is: {}".format(TC)) + def nearest_neighbors(word, embeddings, vocab): - vectors = embeddings.data.cpu().numpy() + vectors = embeddings.data.cpu().numpy() index = vocab.index(word) - print('vectors: ', vectors.shape) + print("vectors: ", vectors.shape) query = vectors[index] - print('query: ', query.shape) + print("query: ", query.shape) ranks = vectors.dot(query).squeeze() denom = query.T.dot(query).squeeze() - denom = denom * np.sum(vectors**2, 1) + denom = denom * np.sum(vectors ** 2, 1) denom = np.sqrt(denom) ranks = ranks / denom mostSimilar = [] From 7411cfccdd045a11b2ba3dce991ef322995dd1b0 Mon Sep 17 00:00:00 2001 From: Haluk Dogan Date: Thu, 25 Feb 2021 19:49:03 -0600 Subject: [PATCH 2/2] project config for emacs --- .dir-locals.el | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) create mode 100644 .dir-locals.el diff --git a/.dir-locals.el b/.dir-locals.el new file mode 100644 index 0000000..fcc8077 --- /dev/null +++ b/.dir-locals.el @@ -0,0 +1,18 @@ +( + (python-mode . ( + (eval . + (progn + ;; set path to the python modules directory + (add-to-list 'exec-path (concat (locate-dominating-file default-directory dir-locals-file) "bin/")) + ;; configure inferior python shell. + (setq-local python-shell-interpreter "pythondocker") + (setq-local python-shell-interpreter-interactive-arg "-i") + (setq-local python-shell-completion-native-enable nil) + (setq-local lsp-pyls-plugins-mypy-enabled t) + (setq-local lsp-pyls-plugins-mypy.live_mode t) + (setq-local lsp-pyls-plugins-black-enabled t) + (setq-local lsp-pyls-plugins-isort-enabled t) + ) + ) + )) + )
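
A note on the directory handling in scripts/data_20ng.py and scripts/data_nyt.py: both shell out with os.system("mkdir -p " + path_save). A pure-Python equivalent, shown only as an optional alternative and not as part of this patch, is:

``` python
import os

path_save = "./min_df_100/"  # same value the scripts build from min_df
os.makedirs(path_save, exist_ok=True)  # no shell needed; no-op if it already exists
```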
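For reference, a minimal sketch of reading one of the saved splits back. It assumes scripts/data_nyt.py was run with the default min_df = 100 so the files live under ./min_df_100/; the dictionary keys mirror the ones passed to savemat above, and SciPy appends the .mat extension automatically.

``` python
import pickle

from scipy.io import loadmat

path_save = "./min_df_100/"  # assumed output directory (min_df = 100)

with open(path_save + "vocab.pkl", "rb") as f:
    vocab = pickle.load(f)

# Keys match the dicts written with savemat above ("tokens" / "counts").
tokens = loadmat(path_save + "bow_tr_tokens")["tokens"].squeeze()
counts = loadmat(path_save + "bow_tr_counts")["counts"].squeeze()

# tokens[d] and counts[d] hold the vocabulary ids and their counts for document d.
print(len(vocab), len(tokens), len(counts))
```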
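skipgram.py keeps the gensim 3.x keyword arguments (size, iter) in the Word2Vec call, which matches the pre-4.0 API. If the environment were ever moved to gensim >= 4.0 (the pinned version lives in requirements.txt, not shown in this hunk), those keywords were renamed; a rough sketch of the equivalent call under that assumption:

``` python
# Sketch only: the same training call under gensim >= 4.0, where `size` became
# `vector_size` and `iter` became `epochs`. With gensim 3.x, skipgram.py works as written.
import gensim


def train_embeddings(sentences, args):
    return gensim.models.Word2Vec(
        sentences,
        min_count=args.min_count,
        sg=args.sg,
        vector_size=args.dim_rho,  # gensim 3.x: size=args.dim_rho
        epochs=args.iters,         # gensim 3.x: iter=args.iters
        workers=args.workers,
        negative=args.negative_samples,
        window=args.window_size,
    )
```

Under gensim >= 4.0 the write loop would also iterate model.wv.index_to_key instead of model.wv.vocab.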
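The pairwise score accumulated in get_topic_coherence (utils.py) is the normalized PMI of a word pair, computed from document frequencies. A small self-contained sketch of the same quantity, using hypothetical counts purely for illustration:

``` python
import numpy as np


def npmi(D, D_wi, D_wj, D_wi_wj):
    """Normalized PMI from document frequencies, as in get_topic_coherence:
    -1 + (log D_wi + log D_wj - 2*log D) / (log D_wi_wj - log D),
    which equals [log P(wi, wj) - log P(wi) - log P(wj)] / (-log P(wi, wj))."""
    return -1 + (np.log(D_wi) + np.log(D_wj) - 2.0 * np.log(D)) / (
        np.log(D_wi_wj) - np.log(D)
    )


# Hypothetical counts, for illustration only.
print(npmi(D=1000, D_wi=50, D_wj=40, D_wi_wj=20))  # ~0.59
```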