From 7498e5e34aaf167386b5ca3fb7744735bd6b5bde Mon Sep 17 00:00:00 2001 From: Haluk Dogan Date: Thu, 25 Feb 2021 19:45:14 -0600 Subject: [PATCH 1/2] containerization and code clean up --- .gitignore | 209 +++++++++++++++++ Dockerfile | 31 +++ Makefile | 27 +++ README.md | 28 ++- bin/pyls | 4 + bin/pythondocker | 3 + data.py | 71 +++--- docker-compose.yml | 13 ++ etm.py | 96 ++++---- main.py | 536 ++++++++++++++++++++++++++++--------------- requirements.txt | 11 + scripts/data_20ng.py | 248 +++++++++++++------- scripts/data_nyt.py | 221 ++++++++++++------ skipgram.py | 72 ++++-- utils.py | 46 ++-- 15 files changed, 1156 insertions(+), 460 deletions(-) create mode 100644 .gitignore create mode 100644 Dockerfile create mode 100644 Makefile create mode 100755 bin/pyls create mode 100755 bin/pythondocker create mode 100644 docker-compose.yml create mode 100644 requirements.txt diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..a61809f --- /dev/null +++ b/.gitignore @@ -0,0 +1,209 @@ +# Created by https://www.toptal.com/developers/gitignore/api/linux,python,emacs +# Edit at https://www.toptal.com/developers/gitignore?templates=linux,python,emacs + +### Emacs ### +# -*- mode: gitignore; -*- +*~ +\#*\# +/.emacs.desktop +/.emacs.desktop.lock +*.elc +auto-save-list +tramp +.\#* + +# Org-mode +.org-id-locations +*_archive +ltximg/** + +# flymake-mode +*_flymake.* + +# eshell files +/eshell/history +/eshell/lastdir + +# elpa packages +/elpa/ + +# reftex files +*.rel + +# AUCTeX auto folder +/auto/ + +# cask packages +.cask/ +dist/ + +# Flycheck +flycheck_*.el + +# server auth directory +/server/ + +# projectiles files +.projectile + +# directory configuration +.dir-locals.el + +# network security +/network-security.data + + +### Linux ### + +# temporary files which can be created if a process still has a handle open of a deleted file +.fuse_hidden* + +# KDE directory preferences +.directory + +# Linux trash folder which might appear on any partition or disk +.Trash-* + +# .nfs files are created when an open file is removed but is still being accessed +.nfs* + +### Python ### +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +pip-wheel-metadata/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ +pytestdebug.log + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ +doc/_build/ + +# PyBuilder +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +.python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 
+# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# PEP 582; used by e.g. github.com/David-OConnor/pyflow +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ +pythonenv* + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# pytype static type analyzer +.pytype/ + +# profiling data +.prof + +# End of https://www.toptal.com/developers/gitignore/api/linux,python,emacs diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..8ced8ed --- /dev/null +++ b/Dockerfile @@ -0,0 +1,31 @@ +ARG TAG=3.8 + +FROM python:$TAG +ENV PYTHONUNBUFFERED 1 + +ARG USER +ARG USER_ID +ARG GROUP_ID +ARG WORKDIR + +RUN apt-get update \ + && apt-get clean \ + && apt-get update -qqq \ + && apt-get install -y -q build-essential graphviz graphviz-dev \ + && apt-get install -y -q ffmpeg libsm6 libxext6 \ + && pip install --upgrade pip \ + && pip install Cython scipy + +RUN groupadd --gid 1000 $USER +RUN useradd --create-home --uid $USER_ID --gid $GROUP_ID $USER + +USER ${USER} +ENV PATH "$PATH:/home/$USER/.local/bin" + +COPY ./requirements.txt requirements.txt +RUN pip install --user -r requirements.txt + +RUN pip install --user torch==1.7.1+cpu torchvision==0.8.2+cpu torchaudio==0.7.2 \ + -f https://download.pytorch.org/whl/torch_stable.html + +WORKDIR $WORKDIR diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..594fbd6 --- /dev/null +++ b/Makefile @@ -0,0 +1,27 @@ +## +# ETM +# +# @file +# @version 0.1 + +user := $(shell whoami) +userid := $(shell id -u) +groupid := $(shell id -g) +workdir := $(shell pwd) + +.PHONY: build +build: + docker-compose build \ + --build-arg USER=$(user) \ + --build-arg USER_ID=${userid} \ + --build-arg GROUP_ID=$(groupid) \ + --build-arg WORKDIR=$(workdir) + +.PHONY: clean-container +clean-container: + docker rmi etm_etm + +.PHONY: clean-python +clean-python: + rm -rf __pycache__ +# end diff --git a/README.md b/README.md index 6bc2ee6..529ed33 100644 --- a/README.md +++ b/README.md @@ -6,8 +6,19 @@ ETM defines words and topics in the same embedding space. The likelihood of a wo ## Dependencies -+ python 3.6.7 -+ pytorch 1.1.0 ++ python 3.8.7 ++ pytorch 1.7.1 + +## Optional + ++ docker ++ docker-compose + +### Build Docker Image + +``` sh +make build +``` ## Datasets @@ -20,7 +31,7 @@ All the datasets are pre-processed and can be found below: + https://bitbucket.org/franrruiz/data_stopwords_largev_2/src/master/ (this one contains stop words and was used to showcase robustness of ETM to stop words.) + https://bitbucket.org/franrruiz/data_20ng_largev/src/master/ -All the scripts to pre-process a given dataset for ETM can be found in the folder 'scripts'. The script for 20NewsGroup is self-contained as it uses scikit-learn. If you want to run ETM on your own dataset, follow the script for New York Times (given as example) called data_nyt.py +All the scripts to pre-process a given dataset for ETM can be found in the folder 'scripts'. The script for 20NewsGroup is self-contained as it uses scikit-learn. 
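For example, the 20NewsGroup files can be rebuilt with something along these lines (a sketch: it assumes the `stops.txt` stopword list sits next to the script, and the output lands in a `min_df_10` folder per the script's default `min_df = 10`):

``` sh
cd scripts
python data_20ng.py
```
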
If you want to run ETM on your own dataset, follow the script for New York Times (given as example) called data_nyt.py ## To Run @@ -38,14 +49,20 @@ To learn interpretable topics using ETM with pre-fitted word embeddings (called + first fit the word embeddings. For example to use simple skipgram you can run ``` -python skipgram.py --data_file PATH_TO_DATA --emb_file PATH_TO_EMBEDDINGS --dim_rho 300 --iters 50 --window_size 4 +python skipgram.py --data_file PATH_TO_DATA --emb_file PATH_TO_EMBEDDINGS --dim_rho 300 --iters 50 --window_size 4 ``` -+ then run the following ++ then run the following ``` python main.py --mode train --dataset 20ng --data_path data/20ng --emb_path PATH_TO_EMBEDDINGS --num_topics 50 --train_embeddings 0 --epochs 1000 ``` +## To Run in a Container + +``` sh +docker-compose run --rm etm python main.py --mode train --dataset 20ng --data_path data/20ng --num_topics 50 --train_embeddings 1 --epochs 1000 +``` + ## Citation ``` @@ -56,4 +73,3 @@ python main.py --mode train --dataset 20ng --data_path data/20ng --emb_path PATH year={2019} } ``` - diff --git a/bin/pyls b/bin/pyls new file mode 100755 index 0000000..bb5c5d8 --- /dev/null +++ b/bin/pyls @@ -0,0 +1,4 @@ +#!/bin/bash +set -e +# docker-compose build ml &> /dev/null +exec docker-compose run --name etm_python --rm --no-deps -T etm pyls diff --git a/bin/pythondocker b/bin/pythondocker new file mode 100755 index 0000000..cb8fe4a --- /dev/null +++ b/bin/pythondocker @@ -0,0 +1,3 @@ +#!/bin/bash +set -e +exec docker exec -it etm_python python $@ diff --git a/data.py b/data.py index 2f3afe9..277423b 100644 --- a/data.py +++ b/data.py @@ -1,56 +1,63 @@ import os -import random import pickle + import numpy as np -import torch import scipy.io +import torch + def _fetch(path, name): - if name == 'train': - token_file = os.path.join(path, 'bow_tr_tokens.mat') - count_file = os.path.join(path, 'bow_tr_counts.mat') - elif name == 'valid': - token_file = os.path.join(path, 'bow_va_tokens.mat') - count_file = os.path.join(path, 'bow_va_counts.mat') + if name == "train": + token_file = os.path.join(path, "bow_tr_tokens.mat") + count_file = os.path.join(path, "bow_tr_counts.mat") + elif name == "valid": + token_file = os.path.join(path, "bow_va_tokens.mat") + count_file = os.path.join(path, "bow_va_counts.mat") else: - token_file = os.path.join(path, 'bow_ts_tokens.mat') - count_file = os.path.join(path, 'bow_ts_counts.mat') - tokens = scipy.io.loadmat(token_file)['tokens'].squeeze() - counts = scipy.io.loadmat(count_file)['counts'].squeeze() - if name == 'test': - token_1_file = os.path.join(path, 'bow_ts_h1_tokens.mat') - count_1_file = os.path.join(path, 'bow_ts_h1_counts.mat') - token_2_file = os.path.join(path, 'bow_ts_h2_tokens.mat') - count_2_file = os.path.join(path, 'bow_ts_h2_counts.mat') - tokens_1 = scipy.io.loadmat(token_1_file)['tokens'].squeeze() - counts_1 = scipy.io.loadmat(count_1_file)['counts'].squeeze() - tokens_2 = scipy.io.loadmat(token_2_file)['tokens'].squeeze() - counts_2 = scipy.io.loadmat(count_2_file)['counts'].squeeze() - return {'tokens': tokens, 'counts': counts, - 'tokens_1': tokens_1, 'counts_1': counts_1, - 'tokens_2': tokens_2, 'counts_2': counts_2} - return {'tokens': tokens, 'counts': counts} + token_file = os.path.join(path, "bow_ts_tokens.mat") + count_file = os.path.join(path, "bow_ts_counts.mat") + tokens = scipy.io.loadmat(token_file)["tokens"].squeeze() + counts = scipy.io.loadmat(count_file)["counts"].squeeze() + if name == "test": + token_1_file = os.path.join(path, 
"bow_ts_h1_tokens.mat") + count_1_file = os.path.join(path, "bow_ts_h1_counts.mat") + token_2_file = os.path.join(path, "bow_ts_h2_tokens.mat") + count_2_file = os.path.join(path, "bow_ts_h2_counts.mat") + tokens_1 = scipy.io.loadmat(token_1_file)["tokens"].squeeze() + counts_1 = scipy.io.loadmat(count_1_file)["counts"].squeeze() + tokens_2 = scipy.io.loadmat(token_2_file)["tokens"].squeeze() + counts_2 = scipy.io.loadmat(count_2_file)["counts"].squeeze() + return { + "tokens": tokens, + "counts": counts, + "tokens_1": tokens_1, + "counts_1": counts_1, + "tokens_2": tokens_2, + "counts_2": counts_2, + } + return {"tokens": tokens, "counts": counts} + def get_data(path): - with open(os.path.join(path, 'vocab.pkl'), 'rb') as f: + with open(os.path.join(path, "vocab.pkl"), "rb") as f: vocab = pickle.load(f) - train = _fetch(path, 'train') - valid = _fetch(path, 'valid') - test = _fetch(path, 'test') + train = _fetch(path, "train") + valid = _fetch(path, "valid") + test = _fetch(path, "test") return vocab, train, valid, test + def get_batch(tokens, counts, ind, vocab_size, device, emsize=300): """fetch input data by batch.""" batch_size = len(ind) data_batch = np.zeros((batch_size, vocab_size)) - + for i, doc_id in enumerate(ind): doc = tokens[doc_id] count = counts[doc_id] - L = count.shape[1] - if len(doc) == 1: + if len(doc) == 1: doc = [doc.squeeze()] count = [count.squeeze()] else: diff --git a/docker-compose.yml b/docker-compose.yml new file mode 100644 index 0000000..08c3b61 --- /dev/null +++ b/docker-compose.yml @@ -0,0 +1,13 @@ +version: '3.8' + +services: + etm: + build: + context: . + args: + - USER + - USER_ID + - GROUP_ID + - WORKDIR + volumes: + - ./:/home/hd/git/ETM diff --git a/etm.py b/etm.py index 81c00e6..03c1846 100644 --- a/etm.py +++ b/etm.py @@ -1,18 +1,26 @@ import torch -import torch.nn.functional as F -import numpy as np -import math - +import torch.nn.functional as F from torch import nn device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + class ETM(nn.Module): - def __init__(self, num_topics, vocab_size, t_hidden_size, rho_size, emsize, - theta_act, embeddings=None, train_embeddings=True, enc_drop=0.5): + def __init__( + self, + num_topics, + vocab_size, + t_hidden_size, + rho_size, + emsize, + theta_act, + embeddings=None, + train_embeddings=True, + enc_drop=0.5, + ): super(ETM, self).__init__() - ## define hyperparameters + # define hyperparameters self.num_topics = num_topics self.vocab_size = vocab_size self.t_hidden_size = t_hidden_size @@ -22,55 +30,55 @@ def __init__(self, num_topics, vocab_size, t_hidden_size, rho_size, emsize, self.t_drop = nn.Dropout(enc_drop) self.theta_act = self.get_activation(theta_act) - - ## define the word embedding matrix \rho + + # define the word embedding matrix \rho if train_embeddings: self.rho = nn.Linear(rho_size, vocab_size, bias=False) else: num_embeddings, emsize = embeddings.size() - rho = nn.Embedding(num_embeddings, emsize) self.rho = embeddings.clone().float().to(device) - ## define the matrix containing the topic embeddings - self.alphas = nn.Linear(rho_size, num_topics, bias=False)#nn.Parameter(torch.randn(rho_size, num_topics)) - - ## define variational distribution for \theta_{1:D} via amortizartion + # define the matrix containing the topic embeddings + self.alphas = nn.Linear( + rho_size, num_topics, bias=False + ) # nn.Parameter(torch.randn(rho_size, num_topics)) + + # define variational distribution for \theta_{1:D} via amortizartion self.q_theta = nn.Sequential( - nn.Linear(vocab_size, 
t_hidden_size), - self.theta_act, - nn.Linear(t_hidden_size, t_hidden_size), - self.theta_act, - ) + nn.Linear(vocab_size, t_hidden_size), + self.theta_act, + nn.Linear(t_hidden_size, t_hidden_size), + self.theta_act, + ) self.mu_q_theta = nn.Linear(t_hidden_size, num_topics, bias=True) self.logsigma_q_theta = nn.Linear(t_hidden_size, num_topics, bias=True) def get_activation(self, act): - if act == 'tanh': + if act == "tanh": act = nn.Tanh() - elif act == 'relu': + elif act == "relu": act = nn.ReLU() - elif act == 'softplus': + elif act == "softplus": act = nn.Softplus() - elif act == 'rrelu': + elif act == "rrelu": act = nn.RReLU() - elif act == 'leakyrelu': + elif act == "leakyrelu": act = nn.LeakyReLU() - elif act == 'elu': + elif act == "elu": act = nn.ELU() - elif act == 'selu': + elif act == "selu": act = nn.SELU() - elif act == 'glu': + elif act == "glu": act = nn.GLU() else: - print('Defaulting to tanh activations...') + print("Defaulting to tanh activations...") act = nn.Tanh() - return act + return act def reparameterize(self, mu, logvar): - """Returns a sample from a Gaussian distribution via reparameterization. - """ + """Returns a sample from a Gaussian distribution via reparameterization.""" if self.training: - std = torch.exp(0.5 * logvar) + std = torch.exp(0.5 * logvar) eps = torch.randn_like(std) return eps.mul_(std).add_(mu) else: @@ -88,42 +96,46 @@ def encode(self, bows): q_theta = self.t_drop(q_theta) mu_theta = self.mu_q_theta(q_theta) logsigma_theta = self.logsigma_q_theta(q_theta) - kl_theta = -0.5 * torch.sum(1 + logsigma_theta - mu_theta.pow(2) - logsigma_theta.exp(), dim=-1).mean() + kl_theta = ( + -0.5 + * torch.sum( + 1 + logsigma_theta - mu_theta.pow(2) - logsigma_theta.exp(), dim=-1 + ).mean() + ) return mu_theta, logsigma_theta, kl_theta def get_beta(self): try: - logit = self.alphas(self.rho.weight) # torch.mm(self.rho, self.alphas) + logit = self.alphas(self.rho.weight) # torch.mm(self.rho, self.alphas) except: logit = self.alphas(self.rho) - beta = F.softmax(logit, dim=0).transpose(1, 0) ## softmax over vocab dimension + beta = F.softmax(logit, dim=0).transpose(1, 0) # softmax over vocab dimension return beta def get_theta(self, normalized_bows): mu_theta, logsigma_theta, kld_theta = self.encode(normalized_bows) z = self.reparameterize(mu_theta, logsigma_theta) - theta = F.softmax(z, dim=-1) + theta = F.softmax(z, dim=-1) return theta, kld_theta def decode(self, theta, beta): res = torch.mm(theta, beta) - preds = torch.log(res+1e-6) - return preds + preds = torch.log(res + 1e-6) + return preds def forward(self, bows, normalized_bows, theta=None, aggregate=True): - ## get \theta + # get \theta if theta is None: theta, kld_theta = self.get_theta(normalized_bows) else: kld_theta = None - ## get \beta + # get \beta beta = self.get_beta() - ## get prediction loss + # get prediction loss preds = self.decode(theta, beta) recon_loss = -(preds * bows).sum(1) if aggregate: recon_loss = recon_loss.mean() return recon_loss, kld_theta - diff --git a/main.py b/main.py index 2b710eb..9a2bad3 100644 --- a/main.py +++ b/main.py @@ -1,108 +1,171 @@ -#/usr/bin/python - -from __future__ import print_function +# /usr/bin/python import argparse -import torch -import pickle -import numpy as np -import os -import math -import random -import sys -import matplotlib.pyplot as plt -import data -import scipy.io +import math +import os -from torch import nn, optim -from torch.nn import functional as F +import numpy as np +import torch +from torch import optim +import data from etm 
import ETM -from utils import nearest_neighbors, get_topic_coherence, get_topic_diversity - -parser = argparse.ArgumentParser(description='The Embedded Topic Model') - -### data and file related arguments -parser.add_argument('--dataset', type=str, default='20ng', help='name of corpus') -parser.add_argument('--data_path', type=str, default='data/20ng', help='directory containing data') -parser.add_argument('--emb_path', type=str, default='data/20ng_embeddings.txt', help='directory containing word embeddings') -parser.add_argument('--save_path', type=str, default='./results', help='path to save results') -parser.add_argument('--batch_size', type=int, default=1000, help='input batch size for training') - -### model-related arguments -parser.add_argument('--num_topics', type=int, default=50, help='number of topics') -parser.add_argument('--rho_size', type=int, default=300, help='dimension of rho') -parser.add_argument('--emb_size', type=int, default=300, help='dimension of embeddings') -parser.add_argument('--t_hidden_size', type=int, default=800, help='dimension of hidden space of q(theta)') -parser.add_argument('--theta_act', type=str, default='relu', help='tanh, softplus, relu, rrelu, leakyrelu, elu, selu, glu)') -parser.add_argument('--train_embeddings', type=int, default=0, help='whether to fix rho or train it') - -### optimization-related arguments -parser.add_argument('--lr', type=float, default=0.005, help='learning rate') -parser.add_argument('--lr_factor', type=float, default=4.0, help='divide learning rate by this...') -parser.add_argument('--epochs', type=int, default=20, help='number of epochs to train...150 for 20ng 100 for others') -parser.add_argument('--mode', type=str, default='train', help='train or eval model') -parser.add_argument('--optimizer', type=str, default='adam', help='choice of optimizer') -parser.add_argument('--seed', type=int, default=2019, help='random seed (default: 1)') -parser.add_argument('--enc_drop', type=float, default=0.0, help='dropout rate on encoder') -parser.add_argument('--clip', type=float, default=0.0, help='gradient clipping') -parser.add_argument('--nonmono', type=int, default=10, help='number of bad hits allowed') -parser.add_argument('--wdecay', type=float, default=1.2e-6, help='some l2 regularization') -parser.add_argument('--anneal_lr', type=int, default=0, help='whether to anneal the learning rate or not') -parser.add_argument('--bow_norm', type=int, default=1, help='normalize the bows or not') - -### evaluation, visualization, and logging-related arguments -parser.add_argument('--num_words', type=int, default=10, help='number of words for topic viz') -parser.add_argument('--log_interval', type=int, default=2, help='when to log training') -parser.add_argument('--visualize_every', type=int, default=10, help='when to visualize results') -parser.add_argument('--eval_batch_size', type=int, default=1000, help='input batch size for evaluation') -parser.add_argument('--load_from', type=str, default='', help='the name of the ckpt to eval from') -parser.add_argument('--tc', type=int, default=0, help='whether to compute topic coherence or not') -parser.add_argument('--td', type=int, default=0, help='whether to compute topic diversity or not') +from utils import get_topic_coherence, get_topic_diversity, nearest_neighbors + +parser = argparse.ArgumentParser(description="The Embedded Topic Model") + +# data and file related arguments +parser.add_argument("--dataset", type=str, default="20ng", + help="name of corpus") +parser.add_argument( + 
"--data_path", type=str, default="data/20ng", + help="directory containing data") + +parser.add_argument( + "--emb_path", + type=str, + default="data/20ng_embeddings.txt", + help="directory containing word embeddings", +) +parser.add_argument( + "--save_path", type=str, default="results", help="path to save results" +) +parser.add_argument( + "--batch_size", type=int, default=1000, help="input batch size for training" +) + +# model-related arguments +parser.add_argument("--num_topics", type=int, default=50, + help="number of topics") +parser.add_argument("--rho_size", type=int, default=300, + help="dimension of rho") +parser.add_argument("--emb_size", type=int, default=300, + help="dimension of embeddings") +parser.add_argument( + "--t_hidden_size", + type=int, + default=800, + help="dimension of hidden space of q(theta)", +) +parser.add_argument( + "--theta_act", + type=str, + default="relu", + help="tanh, softplus, relu, rrelu, leakyrelu, elu, selu, glu)", +) +parser.add_argument( + "--train_embeddings", type=int, default=0, + help="whether to fix rho or train it" +) + +# optimization-related arguments +parser.add_argument("--lr", type=float, default=0.005, help="learning rate") +parser.add_argument( + "--lr_factor", type=float, default=4.0, + help="divide learning rate by this..." +) +parser.add_argument( + "--epochs", + type=int, + default=20, + help="number of epochs to train...150 for 20ng 100 for others", +) +parser.add_argument("--mode", type=str, default="train", + help="train or eval model") +parser.add_argument("--optimizer", type=str, default="adam", + help="choice of optimizer") +parser.add_argument("--seed", type=int, default=2019, + help="random seed (default: 1)") +parser.add_argument( + "--enc_drop", type=float, default=0.0, help="dropout rate on encoder" +) +parser.add_argument("--clip", type=float, default=0.0, + help="gradient clipping") +parser.add_argument( + "--nonmono", type=int, default=10, help="number of bad hits allowed" +) +parser.add_argument( + "--wdecay", type=float, default=1.2e-6, help="some l2 regularization" +) +parser.add_argument( + "--anneal_lr", + type=int, + default=0, + help="whether to anneal the learning rate or not", +) +parser.add_argument("--bow_norm", type=int, default=1, + help="normalize the bows or not") + +# evaluation, visualization, and logging-related arguments +parser.add_argument( + "--num_words", type=int, default=10, help="number of words for topic viz" +) +parser.add_argument("--log_interval", type=int, default=2, + help="when to log training") +parser.add_argument( + "--visualize_every", type=int, default=10, help="when to visualize results" +) +parser.add_argument( + "--eval_batch_size", type=int, default=1000, + help="input batch size for evaluation" +) +parser.add_argument( + "--load_from", type=str, default="", + help="the name of the ckpt to eval from" +) +parser.add_argument( + "--tc", type=int, default=0, + help="whether to compute topic coherence or not" +) +parser.add_argument( + "--td", type=int, default=0, + help="whether to compute topic diversity or not" +) args = parser.parse_args() +root = os.path.dirname(__file__) device = torch.device("cuda" if torch.cuda.is_available() else "cpu") -print('\n') +print("\n") np.random.seed(args.seed) torch.manual_seed(args.seed) if torch.cuda.is_available(): torch.cuda.manual_seed(args.seed) -## get data +# get data # 1. vocabulary vocab, train, valid, test = data.get_data(os.path.join(args.data_path)) vocab_size = len(vocab) args.vocab_size = vocab_size # 1. 
training data -train_tokens = train['tokens'] -train_counts = train['counts'] +train_tokens = train["tokens"] +train_counts = train["counts"] args.num_docs_train = len(train_tokens) # 2. dev set -valid_tokens = valid['tokens'] -valid_counts = valid['counts'] +valid_tokens = valid["tokens"] +valid_counts = valid["counts"] args.num_docs_valid = len(valid_tokens) # 3. test data -test_tokens = test['tokens'] -test_counts = test['counts'] +test_tokens = test["tokens"] +test_counts = test["counts"] args.num_docs_test = len(test_tokens) -test_1_tokens = test['tokens_1'] -test_1_counts = test['counts_1'] +test_1_tokens = test["tokens_1"] +test_1_counts = test["counts_1"] args.num_docs_test_1 = len(test_1_tokens) -test_2_tokens = test['tokens_2'] -test_2_counts = test['counts_2'] +test_2_tokens = test["tokens_2"] +test_2_counts = test["counts_2"] args.num_docs_test_2 = len(test_2_tokens) embeddings = None if not args.train_embeddings: emb_path = args.emb_path - vect_path = os.path.join(args.data_path.split('/')[0], 'embeddings.pkl') + vect_path = os.path.join(args.data_path.split("/")[0], "embeddings.pkl") vectors = {} - with open(emb_path, 'rb') as f: + with open(emb_path, "rb") as f: for l in f: line = l.decode().split() word = line[0] @@ -112,50 +175,83 @@ embeddings = np.zeros((vocab_size, args.emb_size)) words_found = 0 for i, word in enumerate(vocab): - try: + try: embeddings[i] = vectors[word] words_found += 1 except KeyError: - embeddings[i] = np.random.normal(scale=0.6, size=(args.emb_size, )) + embeddings[i] = np.random.normal(scale=0.6, size=(args.emb_size,)) embeddings = torch.from_numpy(embeddings).to(device) args.embeddings_dim = embeddings.size() -print('=*'*100) -print('Training an Embedded Topic Model on {} with the following settings: {}'.format(args.dataset.upper(), args)) -print('=*'*100) +print("=*" * 100) +print( + "Training an Embedded Topic Model on {} with the following settings: {}".format( + args.dataset.upper(), args + ) +) +print("=*" * 100) -## define checkpoint +# define checkpoint if not os.path.exists(args.save_path): - os.makedirs(args.save_path) + os.makedirs(os.path.join(root, args.save_path)) -if args.mode == 'eval': +if args.mode == "eval": ckpt = args.load_from else: - ckpt = os.path.join(args.save_path, - 'etm_{}_K_{}_Htheta_{}_Optim_{}_Clip_{}_ThetaAct_{}_Lr_{}_Bsz_{}_RhoSize_{}_trainEmbeddings_{}'.format( - args.dataset, args.num_topics, args.t_hidden_size, args.optimizer, args.clip, args.theta_act, - args.lr, args.batch_size, args.rho_size, args.train_embeddings)) - -## define model and optimizer -model = ETM(args.num_topics, vocab_size, args.t_hidden_size, args.rho_size, args.emb_size, - args.theta_act, embeddings, args.train_embeddings, args.enc_drop).to(device) - -print('model: {}'.format(model)) - -if args.optimizer == 'adam': - optimizer = optim.Adam(model.parameters(), lr=args.lr, weight_decay=args.wdecay) -elif args.optimizer == 'adagrad': - optimizer = optim.Adagrad(model.parameters(), lr=args.lr, weight_decay=args.wdecay) -elif args.optimizer == 'adadelta': - optimizer = optim.Adadelta(model.parameters(), lr=args.lr, weight_decay=args.wdecay) -elif args.optimizer == 'rmsprop': - optimizer = optim.RMSprop(model.parameters(), lr=args.lr, weight_decay=args.wdecay) -elif args.optimizer == 'asgd': - optimizer = optim.ASGD(model.parameters(), lr=args.lr, t0=0, lambd=0., weight_decay=args.wdecay) + ckpt = os.path.join( + root, + args.save_path, + "etm_{}_K_{}_Htheta_{}_Optim_{}_Clip_{}_ThetaAct_{}_Lr_{}_Bsz_{}_RhoSize_{}_trainEmbeddings_{}".format( + 
args.dataset, + args.num_topics, + args.t_hidden_size, + args.optimizer, + args.clip, + args.theta_act, + args.lr, + args.batch_size, + args.rho_size, + args.train_embeddings, + ), + ) + +# define model and optimizer +model = ETM( + args.num_topics, + vocab_size, + args.t_hidden_size, + args.rho_size, + args.emb_size, + args.theta_act, + embeddings, + args.train_embeddings, + args.enc_drop, +).to(device) + +print("model: {}".format(model)) + +if args.optimizer == "adam": + optimizer = optim.Adam(model.parameters(), lr=args.lr, + weight_decay=args.wdecay) +elif args.optimizer == "adagrad": + optimizer = optim.Adagrad(model.parameters(), lr=args.lr, + weight_decay=args.wdecay) +elif args.optimizer == "adadelta": + optimizer = optim.Adadelta(model.parameters(), lr=args.lr, + weight_decay=args.wdecay) +elif args.optimizer == "rmsprop": + optimizer = optim.RMSprop(model.parameters(), lr=args.lr, + weight_decay=args.wdecay) +elif args.optimizer == "asgd": + optimizer = optim.ASGD( + model.parameters(), lr=args.lr, t0=0, lambd=0.0, + weight_decay=args.wdecay + ) else: - print('Defaulting to vanilla SGD') + print("Defaulting to vanilla SGD") optimizer = optim.SGD(model.parameters(), lr=args.lr) + def train(epoch): model.train() acc_loss = 0 @@ -166,7 +262,9 @@ def train(epoch): for idx, ind in enumerate(indices): optimizer.zero_grad() model.zero_grad() - data_batch = data.get_batch(train_tokens, train_counts, ind, args.vocab_size, device) + data_batch = data.get_batch( + train_tokens, train_counts, ind, args.vocab_size, device + ) sums = data_batch.sum(1).unsqueeze(1) if args.bow_norm: normalized_data_batch = data_batch / sums @@ -185,81 +283,121 @@ def train(epoch): cnt += 1 if idx % args.log_interval == 0 and idx > 0: - cur_loss = round(acc_loss / cnt, 2) - cur_kl_theta = round(acc_kl_theta_loss / cnt, 2) + cur_loss = round(acc_loss / cnt, 2) + cur_kl_theta = round(acc_kl_theta_loss / cnt, 2) cur_real_loss = round(cur_loss + cur_kl_theta, 2) - print('Epoch: {} .. batch: {}/{} .. LR: {} .. KL_theta: {} .. Rec_loss: {} .. NELBO: {}'.format( - epoch, idx, len(indices), optimizer.param_groups[0]['lr'], cur_kl_theta, cur_loss, cur_real_loss)) - - cur_loss = round(acc_loss / cnt, 2) - cur_kl_theta = round(acc_kl_theta_loss / cnt, 2) + print( + "Epoch: {} .. batch: {}/{} .. LR: {} .. KL_theta: {} .. Rec_loss: {} .. NELBO: {}".format( + epoch, + idx, + len(indices), + optimizer.param_groups[0]["lr"], + cur_kl_theta, + cur_loss, + cur_real_loss, + ) + ) + + cur_loss = round(acc_loss / cnt, 2) + cur_kl_theta = round(acc_kl_theta_loss / cnt, 2) cur_real_loss = round(cur_loss + cur_kl_theta, 2) - print('*'*100) - print('Epoch----->{} .. LR: {} .. KL_theta: {} .. Rec_loss: {} .. NELBO: {}'.format( - epoch, optimizer.param_groups[0]['lr'], cur_kl_theta, cur_loss, cur_real_loss)) - print('*'*100) + print("*" * 100) + print( + "Epoch----->{} .. LR: {} .. KL_theta: {} .. Rec_loss: {} .. 
NELBO: {}".format( + epoch, + optimizer.param_groups[0]["lr"], + cur_kl_theta, + cur_loss, + cur_real_loss, + ) + ) + print("*" * 100) + def visualize(m, show_emb=True): - if not os.path.exists('./results'): - os.makedirs('./results') + if not os.path.exists(os.path.join(root, "results")): + os.makedirs(os.path.join(root, "results")) m.eval() - queries = ['andrew', 'computer', 'sports', 'religion', 'man', 'love', - 'intelligence', 'money', 'politics', 'health', 'people', 'family'] - - ## visualize topics using monte carlo + queries = [ + "andrew", + "computer", + "sports", + "religion", + "man", + "love", + "intelligence", + "money", + "politics", + "health", + "people", + "family", + ] + + # visualize topics using monte carlo with torch.no_grad(): - print('#'*100) - print('Visualize topics...') + print("#" * 100) + print("Visualize topics...") topics_words = [] gammas = m.get_beta() for k in range(args.num_topics): gamma = gammas[k] - top_words = list(gamma.cpu().numpy().argsort()[-args.num_words+1:][::-1]) + top_words = list(gamma.cpu().numpy().argsort()[-args.num_words + 1 :][::-1]) topic_words = [vocab[a] for a in top_words] - topics_words.append(' '.join(topic_words)) - print('Topic {}: {}'.format(k, topic_words)) + topics_words.append(" ".join(topic_words)) + print("Topic {}: {}".format(k, topic_words)) if show_emb: - ## visualize word embeddings by using V to get nearest neighbors - print('#'*100) - print('Visualize word embeddings by using output embedding matrix') + # visualize word embeddings by using V to get nearest neighbors + print("#" * 100) + print("Visualize word embeddings by using output embedding matrix") try: embeddings = m.rho.weight # Vocab_size x E except: - embeddings = m.rho # Vocab_size x E - neighbors = [] + embeddings = m.rho # Vocab_size x E + # neighbors = [] for word in queries: - print('word: {} .. neighbors: {}'.format( - word, nearest_neighbors(word, embeddings, vocab))) - print('#'*100) + print( + "word: {} .. neighbors: {}".format( + word, nearest_neighbors(word, embeddings, vocab) + ) + ) + print("#" * 100) + def evaluate(m, source, tc=False, td=False): - """Compute perplexity on document completion. 
- """ + """Compute perplexity on document completion.""" m.eval() with torch.no_grad(): - if source == 'val': - indices = torch.split(torch.tensor(range(args.num_docs_valid)), args.eval_batch_size) + if source == "val": + indices = torch.split( + torch.tensor(range(args.num_docs_valid)), args.eval_batch_size + ) tokens = valid_tokens counts = valid_counts - else: - indices = torch.split(torch.tensor(range(args.num_docs_test)), args.eval_batch_size) + else: + indices = torch.split( + torch.tensor(range(args.num_docs_test)), args.eval_batch_size + ) tokens = test_tokens counts = test_counts - ## get \beta here + # get \beta here beta = m.get_beta() - ### do dc and tc here + # do dc and tc here acc_loss = 0 cnt = 0 - indices_1 = torch.split(torch.tensor(range(args.num_docs_test_1)), args.eval_batch_size) + indices_1 = torch.split( + torch.tensor(range(args.num_docs_test_1)), args.eval_batch_size + ) for idx, ind in enumerate(indices_1): - ## get theta from first half of docs - data_batch_1 = data.get_batch(test_1_tokens, test_1_counts, ind, args.vocab_size, device) + # get theta from first half of docs + data_batch_1 = data.get_batch( + test_1_tokens, test_1_counts, ind, args.vocab_size, device + ) sums_1 = data_batch_1.sum(1).unsqueeze(1) if args.bow_norm: normalized_data_batch_1 = data_batch_1 / sums_1 @@ -267,79 +405,88 @@ def evaluate(m, source, tc=False, td=False): normalized_data_batch_1 = data_batch_1 theta, _ = m.get_theta(normalized_data_batch_1) - ## get prediction loss using second half - data_batch_2 = data.get_batch(test_2_tokens, test_2_counts, ind, args.vocab_size, device) + # get prediction loss using second half + data_batch_2 = data.get_batch( + test_2_tokens, test_2_counts, ind, args.vocab_size, device + ) sums_2 = data_batch_2.sum(1).unsqueeze(1) res = torch.mm(theta, beta) preds = torch.log(res) recon_loss = -(preds * data_batch_2).sum(1) - + loss = recon_loss / sums_2.squeeze() loss = loss.mean().item() acc_loss += loss cnt += 1 cur_loss = acc_loss / cnt ppl_dc = round(math.exp(cur_loss), 1) - print('*'*100) - print('{} Doc Completion PPL: {}'.format(source.upper(), ppl_dc)) - print('*'*100) + print("*" * 100) + print("{} Doc Completion PPL: {}".format(source.upper(), ppl_dc)) + print("*" * 100) if tc or td: beta = beta.data.cpu().numpy() if tc: - print('Computing topic coherence...') + print("Computing topic coherence...") get_topic_coherence(beta, train_tokens, vocab) if td: - print('Computing topic diversity...') + print("Computing topic diversity...") get_topic_diversity(beta, 25) return ppl_dc -if args.mode == 'train': - ## train model on data + +if args.mode == "train": + # train model on data best_epoch = 0 best_val_ppl = 1e9 all_val_ppls = [] - print('\n') - print('Visualizing model quality before training...') + print("\n") + print("Visualizing model quality before training...") visualize(model) - print('\n') + print("\n") for epoch in range(1, args.epochs): train(epoch) - val_ppl = evaluate(model, 'val') + val_ppl = evaluate(model, "val") if val_ppl < best_val_ppl: - with open(ckpt, 'wb') as f: + with open(ckpt, "wb") as f: torch.save(model, f) best_epoch = epoch best_val_ppl = val_ppl else: - ## check whether to anneal lr - lr = optimizer.param_groups[0]['lr'] - if args.anneal_lr and (len(all_val_ppls) > args.nonmono and val_ppl > min(all_val_ppls[:-args.nonmono]) and lr > 1e-5): - optimizer.param_groups[0]['lr'] /= args.lr_factor + # check whether to anneal lr + lr = optimizer.param_groups[0]["lr"] + if args.anneal_lr and ( + len(all_val_ppls) > args.nonmono 
+ and val_ppl > min(all_val_ppls[: -args.nonmono]) + and lr > 1e-5 + ): + optimizer.param_groups[0]["lr"] /= args.lr_factor if epoch % args.visualize_every == 0: visualize(model) all_val_ppls.append(val_ppl) - with open(ckpt, 'rb') as f: + with open(ckpt, "rb") as f: model = torch.load(f) model = model.to(device) - val_ppl = evaluate(model, 'val') -else: - with open(ckpt, 'rb') as f: + val_ppl = evaluate(model, "val") +else: + with open(ckpt, "rb") as f: model = torch.load(f) model = model.to(device) model.eval() with torch.no_grad(): - ## get document completion perplexities - test_ppl = evaluate(model, 'test', tc=args.tc, td=args.td) + # get document completion perplexities + test_ppl = evaluate(model, "test", tc=args.tc, td=args.td) - ## get most used topics + # get most used topics indices = torch.tensor(range(args.num_docs_train)) indices = torch.split(indices, args.batch_size) thetaAvg = torch.zeros(1, args.num_topics).to(device) thetaWeightedAvg = torch.zeros(1, args.num_topics).to(device) cnt = 0 for idx, ind in enumerate(indices): - data_batch = data.get_batch(train_tokens, train_counts, ind, args.vocab_size, device) + data_batch = data.get_batch( + train_tokens, train_counts, ind, args.vocab_size, device + ) sums = data_batch.sum(1).unsqueeze(1) cnt += sums.sum(0).squeeze().cpu().numpy() if args.bow_norm: @@ -351,30 +498,51 @@ def evaluate(m, source, tc=False, td=False): weighed_theta = sums * theta thetaWeightedAvg += weighed_theta.sum(0).unsqueeze(0) if idx % 100 == 0 and idx > 0: - print('batch: {}/{}'.format(idx, len(indices))) + print("batch: {}/{}".format(idx, len(indices))) thetaWeightedAvg = thetaWeightedAvg.squeeze().cpu().numpy() / cnt - print('\nThe 10 most used topics are {}'.format(thetaWeightedAvg.argsort()[::-1][:10])) + print( + "\nThe 10 most used topics are {}".format( + thetaWeightedAvg.argsort()[::-1][:10] + ) + ) - ## show topics + # show topics beta = model.get_beta() - topic_indices = list(np.random.choice(args.num_topics, 10)) # 10 random topics - print('\n') - for k in range(args.num_topics):#topic_indices: + topic_indices = list(np.random.choice(args.num_topics, 10)) # 10 random topics + print("\n") + for k in range(args.num_topics): # topic_indices: gamma = beta[k] - top_words = list(gamma.cpu().numpy().argsort()[-args.num_words+1:][::-1]) + top_words = list(gamma.cpu().numpy().argsort()[-args.num_words + 1 :][::-1]) topic_words = [vocab[a] for a in top_words] - print('Topic {}: {}'.format(k, topic_words)) + print("Topic {}: {}".format(k, topic_words)) if args.train_embeddings: - ## show etm embeddings + # show etm embeddings try: rho_etm = model.rho.weight.cpu() except: rho_etm = model.rho.cpu() - queries = ['andrew', 'woman', 'computer', 'sports', 'religion', 'man', 'love', - 'intelligence', 'money', 'politics', 'health', 'people', 'family'] - print('\n') - print('ETM embeddings...') + queries = [ + "andrew", + "woman", + "computer", + "sports", + "religion", + "man", + "love", + "intelligence", + "money", + "politics", + "health", + "people", + "family", + ] + print("\n") + print("ETM embeddings...") for word in queries: - print('word: {} .. etm neighbors: {}'.format(word, nearest_neighbors(word, rho_etm, vocab))) - print('\n') + print( + "word: {} .. 
etm neighbors: {}".format( + word, nearest_neighbors(word, rho_etm, vocab) + ) + ) + print("\n") diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..b9d8531 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,11 @@ +black +gensim +ipdb +isort +matplotlib +pyls-black +pyls-isort +pyls-mypy +python-language-server[all]==0.36.2 +scikit-learn +scipy diff --git a/scripts/data_20ng.py b/scripts/data_20ng.py index f6528af..f01af92 100644 --- a/scripts/data_20ng.py +++ b/scripts/data_20ng.py @@ -1,58 +1,73 @@ -from sklearn.feature_extraction.text import CountVectorizer -from sklearn.datasets import fetch_20newsgroups -import numpy as np +import os import pickle -import random -from scipy import sparse -import itertools -from scipy.io import savemat, loadmat import re import string +import numpy as np +from scipy import sparse +from scipy.io import savemat +from sklearn.datasets import fetch_20newsgroups +from sklearn.feature_extraction.text import CountVectorizer + # Maximum / minimum document frequency max_df = 0.7 min_df = 10 # choose desired value for min_df # Read stopwords -with open('stops.txt', 'r') as f: - stops = f.read().split('\n') +with open("stops.txt", "r") as f: + stops = f.read().split("\n") # Read data -print('reading data...') -train_data = fetch_20newsgroups(subset='train') -test_data = fetch_20newsgroups(subset='test') +print("reading data...") +train_data = fetch_20newsgroups(subset="train") +test_data = fetch_20newsgroups(subset="test") + +init_docs_tr = [ + re.findall(r"""[\w']+|[.,!?;-~{}`´_<=>:/@*()&'$%#"]""", train_data.data[doc]) + for doc in range(len(train_data.data)) +] +init_docs_ts = [ + re.findall(r"""[\w']+|[.,!?;-~{}`´_<=>:/@*()&'$%#"]""", test_data.data[doc]) + for doc in range(len(test_data.data)) +] -init_docs_tr = [re.findall(r'''[\w']+|[.,!?;-~{}`´_<=>:/@*()&'$%#"]''', train_data.data[doc]) for doc in range(len(train_data.data))] -init_docs_ts = [re.findall(r'''[\w']+|[.,!?;-~{}`´_<=>:/@*()&'$%#"]''', test_data.data[doc]) for doc in range(len(test_data.data))] def contains_punctuation(w): return any(char in string.punctuation for char in w) + def contains_numeric(w): return any(char.isdigit() for char in w) - + + init_docs = init_docs_tr + init_docs_ts -init_docs = [[w.lower() for w in init_docs[doc] if not contains_punctuation(w)] for doc in range(len(init_docs))] -init_docs = [[w for w in init_docs[doc] if not contains_numeric(w)] for doc in range(len(init_docs))] -init_docs = [[w for w in init_docs[doc] if len(w)>1] for doc in range(len(init_docs))] +init_docs = [ + [w.lower() for w in init_docs[doc] if not contains_punctuation(w)] + for doc in range(len(init_docs)) +] +init_docs = [ + [w for w in init_docs[doc] if not contains_numeric(w)] + for doc in range(len(init_docs)) +] +init_docs = [[w for w in init_docs[doc] if len(w) > 1] for doc in range(len(init_docs))] init_docs = [" ".join(init_docs[doc]) for doc in range(len(init_docs))] -# Create count vectorizer -print('counting document frequency of words...') +#  Create count vectorizer +print("counting document frequency of words...") cvectorizer = CountVectorizer(min_df=min_df, max_df=max_df, stop_words=None) cvz = cvectorizer.fit_transform(init_docs).sign() -# Get vocabulary -print('building the vocabulary...') +#  Get vocabulary +print("building the vocabulary...") sum_counts = cvz.sum(axis=0) v_size = sum_counts.shape[1] sum_counts_np = np.zeros(v_size, dtype=int) for v in range(v_size): - sum_counts_np[v] = sum_counts[0,v] + sum_counts_np[v] = sum_counts[0, v] word2id 
= dict([(w, cvectorizer.vocabulary_.get(w)) for w in cvectorizer.vocabulary_]) id2word = dict([(cvectorizer.vocabulary_.get(w), w) for w in cvectorizer.vocabulary_]) del cvectorizer -print(' initial vocabulary size: {}'.format(v_size)) +print(" initial vocabulary size: {}".format(v_size)) # Sort elements in vocabulary idx_sort = np.argsort(sum_counts_np) @@ -60,7 +75,7 @@ def contains_numeric(w): # Filter out stopwords (if any) vocab_aux = [w for w in vocab_aux if w not in stops] -print(' vocabulary size after removing stopwords from list: {}'.format(len(vocab_aux))) +print(" vocabulary size after removing stopwords from list: {}".format(len(vocab_aux))) # Create dictionary and inverse dictionary vocab = vocab_aux @@ -68,83 +83,143 @@ def contains_numeric(w): word2id = dict([(w, j) for j, w in enumerate(vocab)]) id2word = dict([(j, w) for j, w in enumerate(vocab)]) -# Split in train/test/valid -print('tokenizing documents and splitting into train/test/valid...') +#  Split in train/test/valid +print("tokenizing documents and splitting into train/test/valid...") num_docs_tr = len(init_docs_tr) -trSize = num_docs_tr-100 +trSize = num_docs_tr - 100 tsSize = len(init_docs_ts) vaSize = 100 idx_permute = np.random.permutation(num_docs_tr).astype(int) -# Remove words not in train_data -vocab = list(set([w for idx_d in range(trSize) for w in init_docs[idx_permute[idx_d]].split() if w in word2id])) +#  Remove words not in train_data +vocab = list( + set( + [ + w + for idx_d in range(trSize) + for w in init_docs[idx_permute[idx_d]].split() + if w in word2id + ] + ) +) word2id = dict([(w, j) for j, w in enumerate(vocab)]) id2word = dict([(j, w) for j, w in enumerate(vocab)]) -print(' vocabulary after removing words not in train: {}'.format(len(vocab))) +print(" vocabulary after removing words not in train: {}".format(len(vocab))) + +#  Split in train/test/valid +docs_tr = [ + [word2id[w] for w in init_docs[idx_permute[idx_d]].split() if w in word2id] + for idx_d in range(trSize) +] +docs_va = [ + [word2id[w] for w in init_docs[idx_permute[idx_d + trSize]].split() if w in word2id] + for idx_d in range(vaSize) +] +docs_ts = [ + [word2id[w] for w in init_docs[idx_d + num_docs_tr].split() if w in word2id] + for idx_d in range(tsSize) +] + +print( + " number of documents (train): {} [this should be equal to {}]".format( + len(docs_tr), trSize + ) +) +print( + " number of documents (test): {} [this should be equal to {}]".format( + len(docs_ts), tsSize + ) +) +print( + " number of documents (valid): {} [this should be equal to {}]".format( + len(docs_va), vaSize + ) +) + +#  Remove empty documents +print("removing empty documents...") -# Split in train/test/valid -docs_tr = [[word2id[w] for w in init_docs[idx_permute[idx_d]].split() if w in word2id] for idx_d in range(trSize)] -docs_va = [[word2id[w] for w in init_docs[idx_permute[idx_d+trSize]].split() if w in word2id] for idx_d in range(vaSize)] -docs_ts = [[word2id[w] for w in init_docs[idx_d+num_docs_tr].split() if w in word2id] for idx_d in range(tsSize)] - -print(' number of documents (train): {} [this should be equal to {}]'.format(len(docs_tr), trSize)) -print(' number of documents (test): {} [this should be equal to {}]'.format(len(docs_ts), tsSize)) -print(' number of documents (valid): {} [this should be equal to {}]'.format(len(docs_va), vaSize)) - -# Remove empty documents -print('removing empty documents...') def remove_empty(in_docs): - return [doc for doc in in_docs if doc!=[]] + return [doc for doc in in_docs if doc != []] + docs_tr = 
remove_empty(docs_tr) docs_ts = remove_empty(docs_ts) docs_va = remove_empty(docs_va) # Remove test documents with length=1 -docs_ts = [doc for doc in docs_ts if len(doc)>1] +docs_ts = [doc for doc in docs_ts if len(doc) > 1] # Split test set in 2 halves -print('splitting test documents in 2 halves...') -docs_ts_h1 = [[w for i,w in enumerate(doc) if i<=len(doc)/2.0-1] for doc in docs_ts] -docs_ts_h2 = [[w for i,w in enumerate(doc) if i>len(doc)/2.0-1] for doc in docs_ts] +print("splitting test documents in 2 halves...") +docs_ts_h1 = [ + [w for i, w in enumerate(doc) if i <= len(doc) / 2.0 - 1] for doc in docs_ts +] +docs_ts_h2 = [ + [w for i, w in enumerate(doc) if i > len(doc) / 2.0 - 1] for doc in docs_ts +] # Getting lists of words and doc_indices -print('creating lists of words...') +print("creating lists of words...") + def create_list_words(in_docs): return [x for y in in_docs for x in y] + words_tr = create_list_words(docs_tr) words_ts = create_list_words(docs_ts) words_ts_h1 = create_list_words(docs_ts_h1) words_ts_h2 = create_list_words(docs_ts_h2) words_va = create_list_words(docs_va) -print(' len(words_tr): ', len(words_tr)) -print(' len(words_ts): ', len(words_ts)) -print(' len(words_ts_h1): ', len(words_ts_h1)) -print(' len(words_ts_h2): ', len(words_ts_h2)) -print(' len(words_va): ', len(words_va)) +print(" len(words_tr): ", len(words_tr)) +print(" len(words_ts): ", len(words_ts)) +print(" len(words_ts_h1): ", len(words_ts_h1)) +print(" len(words_ts_h2): ", len(words_ts_h2)) +print(" len(words_va): ", len(words_va)) # Get doc indices -print('getting doc indices...') +print("getting doc indices...") + def create_doc_indices(in_docs): aux = [[j for i in range(len(doc))] for j, doc in enumerate(in_docs)] return [int(x) for y in aux for x in y] + doc_indices_tr = create_doc_indices(docs_tr) doc_indices_ts = create_doc_indices(docs_ts) doc_indices_ts_h1 = create_doc_indices(docs_ts_h1) doc_indices_ts_h2 = create_doc_indices(docs_ts_h2) doc_indices_va = create_doc_indices(docs_va) -print(' len(np.unique(doc_indices_tr)): {} [this should be {}]'.format(len(np.unique(doc_indices_tr)), len(docs_tr))) -print(' len(np.unique(doc_indices_ts)): {} [this should be {}]'.format(len(np.unique(doc_indices_ts)), len(docs_ts))) -print(' len(np.unique(doc_indices_ts_h1)): {} [this should be {}]'.format(len(np.unique(doc_indices_ts_h1)), len(docs_ts_h1))) -print(' len(np.unique(doc_indices_ts_h2)): {} [this should be {}]'.format(len(np.unique(doc_indices_ts_h2)), len(docs_ts_h2))) -print(' len(np.unique(doc_indices_va)): {} [this should be {}]'.format(len(np.unique(doc_indices_va)), len(docs_va))) +print( + " len(np.unique(doc_indices_tr)): {} [this should be {}]".format( + len(np.unique(doc_indices_tr)), len(docs_tr) + ) +) +print( + " len(np.unique(doc_indices_ts)): {} [this should be {}]".format( + len(np.unique(doc_indices_ts)), len(docs_ts) + ) +) +print( + " len(np.unique(doc_indices_ts_h1)): {} [this should be {}]".format( + len(np.unique(doc_indices_ts_h1)), len(docs_ts_h1) + ) +) +print( + " len(np.unique(doc_indices_ts_h2)): {} [this should be {}]".format( + len(np.unique(doc_indices_ts_h2)), len(docs_ts_h2) + ) +) +print( + " len(np.unique(doc_indices_va)): {} [this should be {}]".format( + len(np.unique(doc_indices_va)), len(docs_va) + ) +) # Number of documents in each set n_docs_tr = len(docs_tr) @@ -161,10 +236,14 @@ def create_doc_indices(in_docs): del docs_va # Create bow representation -print('creating bow representation...') +print("creating bow representation...") + def 
create_bow(doc_indices, words, n_docs, vocab_size): - return sparse.coo_matrix(([1]*len(doc_indices),(doc_indices, words)), shape=(n_docs, vocab_size)).tocsr() + return sparse.coo_matrix( + ([1] * len(doc_indices), (doc_indices, words)), shape=(n_docs, vocab_size) + ).tocsr() + bow_tr = create_bow(doc_indices_tr, words_tr, n_docs_tr, len(vocab)) bow_ts = create_bow(doc_indices_ts, words_ts, n_docs_ts, len(vocab)) @@ -184,57 +263,66 @@ def create_bow(doc_indices, words, n_docs, vocab_size): del doc_indices_va # Write the vocabulary to a file -path_save = './min_df_' + str(min_df) + '/' +path_save = "./min_df_" + str(min_df) + "/" if not os.path.isdir(path_save): - os.system('mkdir -p ' + path_save) + os.system("mkdir -p " + path_save) -with open(path_save + 'vocab.pkl', 'wb') as f: +with open(path_save + "vocab.pkl", "wb") as f: pickle.dump(vocab, f) del vocab # Split bow intro token/value pairs -print('splitting bow intro token/value pairs and saving to disk...') +print("splitting bow intro token/value pairs and saving to disk...") + def split_bow(bow_in, n_docs): - indices = [[w for w in bow_in[doc,:].indices] for doc in range(n_docs)] - counts = [[c for c in bow_in[doc,:].data] for doc in range(n_docs)] + indices = [[w for w in bow_in[doc, :].indices] for doc in range(n_docs)] + counts = [[c for c in bow_in[doc, :].data] for doc in range(n_docs)] return indices, counts + bow_tr_tokens, bow_tr_counts = split_bow(bow_tr, n_docs_tr) -savemat(path_save + 'bow_tr_tokens', {'tokens': bow_tr_tokens}, do_compression=True) -savemat(path_save + 'bow_tr_counts', {'counts': bow_tr_counts}, do_compression=True) +savemat(path_save + "bow_tr_tokens", {"tokens": bow_tr_tokens}, do_compression=True) +savemat(path_save + "bow_tr_counts", {"counts": bow_tr_counts}, do_compression=True) del bow_tr del bow_tr_tokens del bow_tr_counts bow_ts_tokens, bow_ts_counts = split_bow(bow_ts, n_docs_ts) -savemat(path_save + 'bow_ts_tokens', {'tokens': bow_ts_tokens}, do_compression=True) -savemat(path_save + 'bow_ts_counts', {'counts': bow_ts_counts}, do_compression=True) +savemat(path_save + "bow_ts_tokens", {"tokens": bow_ts_tokens}, do_compression=True) +savemat(path_save + "bow_ts_counts", {"counts": bow_ts_counts}, do_compression=True) del bow_ts del bow_ts_tokens del bow_ts_counts bow_ts_h1_tokens, bow_ts_h1_counts = split_bow(bow_ts_h1, n_docs_ts_h1) -savemat(path_save + 'bow_ts_h1_tokens', {'tokens': bow_ts_h1_tokens}, do_compression=True) -savemat(path_save + 'bow_ts_h1_counts', {'counts': bow_ts_h1_counts}, do_compression=True) +savemat( + path_save + "bow_ts_h1_tokens", {"tokens": bow_ts_h1_tokens}, do_compression=True +) +savemat( + path_save + "bow_ts_h1_counts", {"counts": bow_ts_h1_counts}, do_compression=True +) del bow_ts_h1 del bow_ts_h1_tokens del bow_ts_h1_counts bow_ts_h2_tokens, bow_ts_h2_counts = split_bow(bow_ts_h2, n_docs_ts_h2) -savemat(path_save + 'bow_ts_h2_tokens', {'tokens': bow_ts_h2_tokens}, do_compression=True) -savemat(path_save + 'bow_ts_h2_counts', {'counts': bow_ts_h2_counts}, do_compression=True) +savemat( + path_save + "bow_ts_h2_tokens", {"tokens": bow_ts_h2_tokens}, do_compression=True +) +savemat( + path_save + "bow_ts_h2_counts", {"counts": bow_ts_h2_counts}, do_compression=True +) del bow_ts_h2 del bow_ts_h2_tokens del bow_ts_h2_counts bow_va_tokens, bow_va_counts = split_bow(bow_va, n_docs_va) -savemat(path_save + 'bow_va_tokens', {'tokens': bow_va_tokens}, do_compression=True) -savemat(path_save + 'bow_va_counts', {'counts': bow_va_counts}, do_compression=True) 
+savemat(path_save + "bow_va_tokens", {"tokens": bow_va_tokens}, do_compression=True) +savemat(path_save + "bow_va_counts", {"counts": bow_va_counts}, do_compression=True) del bow_va del bow_va_tokens del bow_va_counts -print('Data ready !!') -print('*************') - +print("Data ready !!") +print("*************") diff --git a/scripts/data_nyt.py b/scripts/data_nyt.py index 01d657f..2af3f68 100644 --- a/scripts/data_nyt.py +++ b/scripts/data_nyt.py @@ -1,41 +1,41 @@ -from sklearn.feature_extraction.text import CountVectorizer -import numpy as np +import os import pickle -import random + +import numpy as np from scipy import sparse -import itertools -from scipy.io import savemat, loadmat +from scipy.io import savemat +from sklearn.feature_extraction.text import CountVectorizer # Maximum / minimum document frequency max_df = 0.7 min_df = 100 # choose desired value for min_df # Read stopwords -with open('stops.txt', 'r') as f: - stops = f.read().split('\n') +with open("stops.txt", "r") as f: + stops = f.read().split("\n") # Read data -print('reading text file...') -data_file = 'raw/new_york_times_text/nyt_docs.txt' -with open(data_file, 'r') as f: +print("reading text file...") +data_file = "raw/new_york_times_text/nyt_docs.txt" +with open(data_file, "r") as f: docs = f.readlines() -# Create count vectorizer -print('counting document frequency of words...') +#  Create count vectorizer +print("counting document frequency of words...") cvectorizer = CountVectorizer(min_df=min_df, max_df=max_df, stop_words=None) cvz = cvectorizer.fit_transform(docs).sign() -# Get vocabulary -print('building the vocabulary...') +#  Get vocabulary +print("building the vocabulary...") sum_counts = cvz.sum(axis=0) v_size = sum_counts.shape[1] sum_counts_np = np.zeros(v_size, dtype=int) for v in range(v_size): - sum_counts_np[v] = sum_counts[0,v] + sum_counts_np[v] = sum_counts[0, v] word2id = dict([(w, cvectorizer.vocabulary_.get(w)) for w in cvectorizer.vocabulary_]) id2word = dict([(cvectorizer.vocabulary_.get(w), w) for w in cvectorizer.vocabulary_]) del cvectorizer -print(' initial vocabulary size: {}'.format(v_size)) +print(" initial vocabulary size: {}".format(v_size)) # Sort elements in vocabulary idx_sort = np.argsort(sum_counts_np) @@ -43,8 +43,8 @@ # Filter out stopwords (if any) vocab_aux = [w for w in vocab_aux if w not in stops] -print(' vocabulary size after removing stopwords from list: {}'.format(len(vocab_aux))) -print(' vocabulary after removing stopwords: {}'.format(len(vocab_aux))) +print(" vocabulary size after removing stopwords from list: {}".format(len(vocab_aux))) +print(" vocabulary after removing stopwords: {}".format(len(vocab_aux))) # Create dictionary and inverse dictionary vocab = vocab_aux @@ -52,84 +52,148 @@ word2id = dict([(w, j) for j, w in enumerate(vocab)]) id2word = dict([(j, w) for j, w in enumerate(vocab)]) -# Split in train/test/valid -print('tokenizing documents and splitting into train/test/valid...') +#  Split in train/test/valid +print("tokenizing documents and splitting into train/test/valid...") num_docs = cvz.shape[0] -trSize = int(np.floor(0.85*num_docs)) -tsSize = int(np.floor(0.10*num_docs)) +trSize = int(np.floor(0.85 * num_docs)) +tsSize = int(np.floor(0.10 * num_docs)) vaSize = int(num_docs - trSize - tsSize) del cvz idx_permute = np.random.permutation(num_docs).astype(int) -# Remove words not in train_data -vocab = list(set([w for idx_d in range(trSize) for w in docs[idx_permute[idx_d]].split() if w in word2id])) +#  Remove words not in train_data +vocab = 
list( + set( + [ + w + for idx_d in range(trSize) + for w in docs[idx_permute[idx_d]].split() + if w in word2id + ] + ) +) word2id = dict([(w, j) for j, w in enumerate(vocab)]) id2word = dict([(j, w) for j, w in enumerate(vocab)]) -print(' vocabulary after removing words not in train: {}'.format(len(vocab))) - -docs_tr = [[word2id[w] for w in docs[idx_permute[idx_d]].split() if w in word2id] for idx_d in range(trSize)] -docs_ts = [[word2id[w] for w in docs[idx_permute[idx_d+trSize]].split() if w in word2id] for idx_d in range(tsSize)] -docs_va = [[word2id[w] for w in docs[idx_permute[idx_d+trSize+tsSize]].split() if w in word2id] for idx_d in range(vaSize)] +print(" vocabulary after removing words not in train: {}".format(len(vocab))) + +docs_tr = [ + [word2id[w] for w in docs[idx_permute[idx_d]].split() if w in word2id] + for idx_d in range(trSize) +] +docs_ts = [ + [word2id[w] for w in docs[idx_permute[idx_d + trSize]].split() if w in word2id] + for idx_d in range(tsSize) +] +docs_va = [ + [ + word2id[w] + for w in docs[idx_permute[idx_d + trSize + tsSize]].split() + if w in word2id + ] + for idx_d in range(vaSize) +] del docs -print(' number of documents (train): {} [this should be equal to {}]'.format(len(docs_tr), trSize)) -print(' number of documents (test): {} [this should be equal to {}]'.format(len(docs_ts), tsSize)) -print(' number of documents (valid): {} [this should be equal to {}]'.format(len(docs_va), vaSize)) +print( + " number of documents (train): {} [this should be equal to {}]".format( + len(docs_tr), trSize + ) +) +print( + " number of documents (test): {} [this should be equal to {}]".format( + len(docs_ts), tsSize + ) +) +print( + " number of documents (valid): {} [this should be equal to {}]".format( + len(docs_va), vaSize + ) +) + +#  Remove empty documents +print("removing empty documents...") -# Remove empty documents -print('removing empty documents...') def remove_empty(in_docs): - return [doc for doc in in_docs if doc!=[]] + return [doc for doc in in_docs if doc != []] + docs_tr = remove_empty(docs_tr) docs_ts = remove_empty(docs_ts) docs_va = remove_empty(docs_va) # Remove test documents with length=1 -docs_ts = [doc for doc in docs_ts if len(doc)>1] +docs_ts = [doc for doc in docs_ts if len(doc) > 1] # Split test set in 2 halves -print('splitting test documents in 2 halves...') -docs_ts_h1 = [[w for i,w in enumerate(doc) if i<=len(doc)/2.0-1] for doc in docs_ts] -docs_ts_h2 = [[w for i,w in enumerate(doc) if i>len(doc)/2.0-1] for doc in docs_ts] +print("splitting test documents in 2 halves...") +docs_ts_h1 = [ + [w for i, w in enumerate(doc) if i <= len(doc) / 2.0 - 1] for doc in docs_ts +] +docs_ts_h2 = [ + [w for i, w in enumerate(doc) if i > len(doc) / 2.0 - 1] for doc in docs_ts +] # Getting lists of words and doc_indices -print('creating lists of words...') +print("creating lists of words...") + def create_list_words(in_docs): return [x for y in in_docs for x in y] + words_tr = create_list_words(docs_tr) words_ts = create_list_words(docs_ts) words_ts_h1 = create_list_words(docs_ts_h1) words_ts_h2 = create_list_words(docs_ts_h2) words_va = create_list_words(docs_va) -print(' len(words_tr): ', len(words_tr)) -print(' len(words_ts): ', len(words_ts)) -print(' len(words_ts_h1): ', len(words_ts_h1)) -print(' len(words_ts_h2): ', len(words_ts_h2)) -print(' len(words_va): ', len(words_va)) +print(" len(words_tr): ", len(words_tr)) +print(" len(words_ts): ", len(words_ts)) +print(" len(words_ts_h1): ", len(words_ts_h1)) +print(" len(words_ts_h2): ", 
len(words_ts_h2)) +print(" len(words_va): ", len(words_va)) # Get doc indices -print('getting doc indices...') +print("getting doc indices...") + def create_doc_indices(in_docs): aux = [[j for i in range(len(doc))] for j, doc in enumerate(in_docs)] return [int(x) for y in aux for x in y] + doc_indices_tr = create_doc_indices(docs_tr) doc_indices_ts = create_doc_indices(docs_ts) doc_indices_ts_h1 = create_doc_indices(docs_ts_h1) doc_indices_ts_h2 = create_doc_indices(docs_ts_h2) doc_indices_va = create_doc_indices(docs_va) -print(' len(np.unique(doc_indices_tr)): {} [this should be {}]'.format(len(np.unique(doc_indices_tr)), len(docs_tr))) -print(' len(np.unique(doc_indices_ts)): {} [this should be {}]'.format(len(np.unique(doc_indices_ts)), len(docs_ts))) -print(' len(np.unique(doc_indices_ts_h1)): {} [this should be {}]'.format(len(np.unique(doc_indices_ts_h1)), len(docs_ts_h1))) -print(' len(np.unique(doc_indices_ts_h2)): {} [this should be {}]'.format(len(np.unique(doc_indices_ts_h2)), len(docs_ts_h2))) -print(' len(np.unique(doc_indices_va)): {} [this should be {}]'.format(len(np.unique(doc_indices_va)), len(docs_va))) +print( + " len(np.unique(doc_indices_tr)): {} [this should be {}]".format( + len(np.unique(doc_indices_tr)), len(docs_tr) + ) +) +print( + " len(np.unique(doc_indices_ts)): {} [this should be {}]".format( + len(np.unique(doc_indices_ts)), len(docs_ts) + ) +) +print( + " len(np.unique(doc_indices_ts_h1)): {} [this should be {}]".format( + len(np.unique(doc_indices_ts_h1)), len(docs_ts_h1) + ) +) +print( + " len(np.unique(doc_indices_ts_h2)): {} [this should be {}]".format( + len(np.unique(doc_indices_ts_h2)), len(docs_ts_h2) + ) +) +print( + " len(np.unique(doc_indices_va)): {} [this should be {}]".format( + len(np.unique(doc_indices_va)), len(docs_va) + ) +) # Number of documents in each set n_docs_tr = len(docs_tr) @@ -146,10 +210,14 @@ def create_doc_indices(in_docs): del docs_va # Create bow representation -print('creating bow representation...') +print("creating bow representation...") + def create_bow(doc_indices, words, n_docs, vocab_size): - return sparse.coo_matrix(([1]*len(doc_indices),(doc_indices, words)), shape=(n_docs, vocab_size)).tocsr() + return sparse.coo_matrix( + ([1] * len(doc_indices), (doc_indices, words)), shape=(n_docs, vocab_size) + ).tocsr() + bow_tr = create_bow(doc_indices_tr, words_tr, n_docs_tr, len(vocab)) bow_ts = create_bow(doc_indices_ts, words_ts, n_docs_ts, len(vocab)) @@ -169,57 +237,66 @@ def create_bow(doc_indices, words, n_docs, vocab_size): del doc_indices_va # Save vocabulary to file -path_save = './min_df_' + str(min_df) + '/' +path_save = "./min_df_" + str(min_df) + "/" if not os.path.isdir(path_save): - os.system('mkdir -p ' + path_save) + os.system("mkdir -p " + path_save) -with open(path_save + 'vocab.pkl', 'wb') as f: +with open(path_save + "vocab.pkl", "wb") as f: pickle.dump(vocab, f) del vocab # Split bow intro token/value pairs -print('splitting bow intro token/value pairs and saving to disk...') +print("splitting bow intro token/value pairs and saving to disk...") + def split_bow(bow_in, n_docs): - indices = [[w for w in bow_in[doc,:].indices] for doc in range(n_docs)] - counts = [[c for c in bow_in[doc,:].data] for doc in range(n_docs)] + indices = [[w for w in bow_in[doc, :].indices] for doc in range(n_docs)] + counts = [[c for c in bow_in[doc, :].data] for doc in range(n_docs)] return indices, counts + bow_tr_tokens, bow_tr_counts = split_bow(bow_tr, n_docs_tr) -savemat(path_save + 'bow_tr_tokens', {'tokens': 
bow_tr_tokens}, do_compression=True) -savemat(path_save + 'bow_tr_counts', {'counts': bow_tr_counts}, do_compression=True) +savemat(path_save + "bow_tr_tokens", {"tokens": bow_tr_tokens}, do_compression=True) +savemat(path_save + "bow_tr_counts", {"counts": bow_tr_counts}, do_compression=True) del bow_tr del bow_tr_tokens del bow_tr_counts bow_ts_tokens, bow_ts_counts = split_bow(bow_ts, n_docs_ts) -savemat(path_save + 'bow_ts_tokens', {'tokens': bow_ts_tokens}, do_compression=True) -savemat(path_save + 'bow_ts_counts', {'counts': bow_ts_counts}, do_compression=True) +savemat(path_save + "bow_ts_tokens", {"tokens": bow_ts_tokens}, do_compression=True) +savemat(path_save + "bow_ts_counts", {"counts": bow_ts_counts}, do_compression=True) del bow_ts del bow_ts_tokens del bow_ts_counts bow_ts_h1_tokens, bow_ts_h1_counts = split_bow(bow_ts_h1, n_docs_ts_h1) -savemat(path_save + 'bow_ts_h1_tokens', {'tokens': bow_ts_h1_tokens}, do_compression=True) -savemat(path_save + 'bow_ts_h1_counts', {'counts': bow_ts_h1_counts}, do_compression=True) +savemat( + path_save + "bow_ts_h1_tokens", {"tokens": bow_ts_h1_tokens}, do_compression=True +) +savemat( + path_save + "bow_ts_h1_counts", {"counts": bow_ts_h1_counts}, do_compression=True +) del bow_ts_h1 del bow_ts_h1_tokens del bow_ts_h1_counts bow_ts_h2_tokens, bow_ts_h2_counts = split_bow(bow_ts_h2, n_docs_ts_h2) -savemat(path_save + 'bow_ts_h2_tokens', {'tokens': bow_ts_h2_tokens}, do_compression=True) -savemat(path_save + 'bow_ts_h2_counts', {'counts': bow_ts_h2_counts}, do_compression=True) +savemat( + path_save + "bow_ts_h2_tokens", {"tokens": bow_ts_h2_tokens}, do_compression=True +) +savemat( + path_save + "bow_ts_h2_counts", {"counts": bow_ts_h2_counts}, do_compression=True +) del bow_ts_h2 del bow_ts_h2_tokens del bow_ts_h2_counts bow_va_tokens, bow_va_counts = split_bow(bow_va, n_docs_va) -savemat(path_save + 'bow_va_tokens', {'tokens': bow_va_tokens}, do_compression=True) -savemat(path_save + 'bow_va_counts', {'counts': bow_va_counts}, do_compression=True) +savemat(path_save + "bow_va_tokens", {"tokens": bow_va_tokens}, do_compression=True) +savemat(path_save + "bow_va_counts", {"counts": bow_va_counts}, do_compression=True) del bow_va del bow_va_tokens del bow_va_counts -print('Data ready !!') -print('*************') - +print("Data ready !!") +print("*************") diff --git a/skipgram.py b/skipgram.py index 4ad50ee..2440c12 100644 --- a/skipgram.py +++ b/skipgram.py @@ -1,43 +1,69 @@ -import gensim -import pickle -import os -import numpy as np import argparse -parser = argparse.ArgumentParser(description='The Embedded Topic Model') +import gensim + +parser = argparse.ArgumentParser(description="The Embedded Topic Model") -### data and file related arguments -parser.add_argument('--data_file', type=str, default='', help='a .txt file containing the corpus') -parser.add_argument('--emb_file', type=str, default='embeddings.txt', help='file to save the word embeddings') -parser.add_argument('--dim_rho', type=int, default=300, help='dimensionality of the word embeddings') -parser.add_argument('--min_count', type=int, default=2, help='minimum term frequency (to define the vocabulary)') -parser.add_argument('--sg', type=int, default=1, help='whether to use skip-gram') -parser.add_argument('--workers', type=int, default=25, help='number of CPU cores') -parser.add_argument('--negative_samples', type=int, default=10, help='number of negative samples') -parser.add_argument('--window_size', type=int, default=4, help='window size to determine context') 
-parser.add_argument('--iters', type=int, default=50, help='number of iterationst') +# data and file related arguments +parser.add_argument( + "--data_file", type=str, default="", help="a .txt file containing the corpus" +) +parser.add_argument( + "--emb_file", + type=str, + default="embeddings.txt", + help="file to save the word embeddings", +) +parser.add_argument( + "--dim_rho", type=int, default=300, help="dimensionality of the word embeddings" +) +parser.add_argument( + "--min_count", + type=int, + default=2, + help="minimum term frequency (to define the vocabulary)", +) +parser.add_argument("--sg", type=int, default=1, help="whether to use skip-gram") +parser.add_argument("--workers", type=int, default=25, help="number of CPU cores") +parser.add_argument( + "--negative_samples", type=int, default=10, help="number of negative samples" +) +parser.add_argument( + "--window_size", type=int, default=4, help="window size to determine context" +) +parser.add_argument("--iters", type=int, default=50, help="number of iterationst") args = parser.parse_args() + # Class for a memory-friendly iterator over the dataset class MySentences(object): def __init__(self, filename): self.filename = filename - + def __iter__(self): for line in open(self.filename): yield line.split() + # Gensim code to obtain the embeddings -sentences = MySentences(args.data_file) # a memory-friendly iterator -model = gensim.models.Word2Vec(sentences, min_count=args.min_count, sg=args.sg, size=args.dim_rho, - iter=args.iters, workers=args.workers, negative=args.negative_samples, window=args.window_size) +sentences = MySentences(args.data_file) # a memory-friendly iterator +model = gensim.models.Word2Vec( + sentences, + min_count=args.min_count, + sg=args.sg, + size=args.dim_rho, + iter=args.iters, + workers=args.workers, + negative=args.negative_samples, + window=args.window_size, +) # Write the embeddings to a file -with open(args.emb_file, 'w') as f: +with open(args.emb_file, "w") as f: for v in list(model.wv.vocab): vec = list(model.wv.__getitem__(v)) - f.write(v + ' ') - vec_str = ['%.9f' % val for val in vec] + f.write(v + " ") + vec_str = ["%.9f" % val for val in vec] vec_str = " ".join(vec_str) - f.write(vec_str + '\n') + f.write(vec_str + "\n") diff --git a/utils.py b/utils.py index 3975544..da40a44 100644 --- a/utils.py +++ b/utils.py @@ -1,22 +1,23 @@ -import torch import numpy as np + def get_topic_diversity(beta, topk): num_topics = beta.shape[0] list_w = np.zeros((num_topics, topk)) for k in range(num_topics): - idx = beta[k,:].argsort()[-topk:][::-1] - list_w[k,:] = idx + idx = beta[k, :].argsort()[-topk:][::-1] + list_w[k, :] = idx n_unique = len(np.unique(list_w)) TD = n_unique / (topk * num_topics) - print('Topic diveristy is: {}'.format(TD)) + print("Topic diveristy is: {}".format(TD)) + def get_document_frequency(data, wi, wj=None): if wj is None: D_wi = 0 for l in range(len(data)): doc = data[l].squeeze(0) - if len(doc) == 1: + if len(doc) == 1: continue else: doc = doc.squeeze() @@ -27,7 +28,7 @@ def get_document_frequency(data, wi, wj=None): D_wi_wj = 0 for l in range(len(data)): doc = data[l].squeeze(0) - if len(doc) == 1: + if len(doc) == 1: doc = [doc.squeeze()] else: doc = doc.squeeze() @@ -35,17 +36,17 @@ def get_document_frequency(data, wi, wj=None): D_wj += 1 if wi in doc: D_wi_wj += 1 - return D_wj, D_wi_wj + return D_wj, D_wi_wj + def get_topic_coherence(beta, data, vocab): - D = len(data) ## number of docs...data is list of documents - print('D: ', D) + D = len(data) # number of docs...data 
is list of documents + print("D: ", D) TC = [] num_topics = len(beta) for k in range(num_topics): - print('k: {}/{}'.format(k, num_topics)) + print("k: {}/{}".format(k, num_topics)) top_10 = list(beta[k].argsort()[-11:][::-1]) - top_words = [vocab[a] for a in top_10] TC_k = 0 counter = 0 for i, word in enumerate(top_10): @@ -60,28 +61,31 @@ def get_topic_coherence(beta, data, vocab): if D_wi_wj == 0: f_wi_wj = -1 else: - f_wi_wj = -1 + ( np.log(D_wi) + np.log(D_wj) - 2.0 * np.log(D) ) / ( np.log(D_wi_wj) - np.log(D) ) - # update tmp: + f_wi_wj = -1 + (np.log(D_wi) + np.log(D_wj) - 2.0 * np.log(D)) / ( + np.log(D_wi_wj) - np.log(D) + ) + # update tmp: tmp += f_wi_wj j += 1 counter += 1 # update TC_k - TC_k += tmp + TC_k += tmp TC.append(TC_k) - print('counter: ', counter) - print('num topics: ', len(TC)) + print("counter: ", counter) + print("num topics: ", len(TC)) TC = np.mean(TC) / counter - print('Topic coherence is: {}'.format(TC)) + print("Topic coherence is: {}".format(TC)) + def nearest_neighbors(word, embeddings, vocab): - vectors = embeddings.data.cpu().numpy() + vectors = embeddings.data.cpu().numpy() index = vocab.index(word) - print('vectors: ', vectors.shape) + print("vectors: ", vectors.shape) query = vectors[index] - print('query: ', query.shape) + print("query: ", query.shape) ranks = vectors.dot(query).squeeze() denom = query.T.dot(query).squeeze() - denom = denom * np.sum(vectors**2, 1) + denom = denom * np.sum(vectors ** 2, 1) denom = np.sqrt(denom) ranks = ranks / denom mostSimilar = [] From 7411cfccdd045a11b2ba3dce991ef322995dd1b0 Mon Sep 17 00:00:00 2001 From: Haluk Dogan Date: Thu, 25 Feb 2021 19:49:03 -0600 Subject: [PATCH 2/2] project config for emacs --- .dir-locals.el | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) create mode 100644 .dir-locals.el diff --git a/.dir-locals.el b/.dir-locals.el new file mode 100644 index 0000000..fcc8077 --- /dev/null +++ b/.dir-locals.el @@ -0,0 +1,18 @@ +( + (python-mode . ( + (eval . + (progn + ;; set path to the python modules directory + (add-to-list 'exec-path (concat (locate-dominating-file default-directory dir-locals-file) "bin/")) + ;; configure inferior python shell. + (setq-local python-shell-interpreter "pythondocker") + (setq-local python-shell-interpreter-interactive-arg "-i") + (setq-local python-shell-completion-native-enable nil) + (setq-local lsp-pyls-plugins-mypy-enabled t) + (setq-local lsp-pyls-plugins-mypy.live_mode t) + (setq-local lsp-pyls-plugins-black-enabled t) + (setq-local lsp-pyls-plugins-isort-enabled t) + ) + ) + )) + )
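
A note on the directory handling in scripts/data_20ng.py and scripts/data_nyt.py: both shell out with os.system("mkdir -p " + path_save). A pure-Python equivalent, shown only as an optional alternative and not as part of this patch, is:

``` python
import os

path_save = "./min_df_100/"  # same value the scripts build from min_df
os.makedirs(path_save, exist_ok=True)  # no shell needed; no-op if it already exists
```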
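For reference, a minimal sketch of reading one of the saved splits back. It assumes scripts/data_nyt.py was run with the default min_df = 100 so the files live under ./min_df_100/; the dictionary keys mirror the ones passed to savemat above, and SciPy appends the .mat extension automatically.

``` python
import pickle

from scipy.io import loadmat

path_save = "./min_df_100/"  # assumed output directory (min_df = 100)

with open(path_save + "vocab.pkl", "rb") as f:
    vocab = pickle.load(f)

# Keys match the dicts written with savemat above ("tokens" / "counts").
tokens = loadmat(path_save + "bow_tr_tokens")["tokens"].squeeze()
counts = loadmat(path_save + "bow_tr_counts")["counts"].squeeze()

# tokens[d] and counts[d] hold the vocabulary ids and their counts for document d.
print(len(vocab), len(tokens), len(counts))
```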
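skipgram.py keeps the gensim 3.x keyword arguments (size, iter) in the Word2Vec call, which matches the pre-4.0 API. If the environment were ever moved to gensim >= 4.0 (the pinned version lives in requirements.txt, not shown in this hunk), those keywords were renamed; a rough sketch of the equivalent call under that assumption:

``` python
# Sketch only: the same training call under gensim >= 4.0, where `size` became
# `vector_size` and `iter` became `epochs`. With gensim 3.x, skipgram.py works as written.
import gensim


def train_embeddings(sentences, args):
    return gensim.models.Word2Vec(
        sentences,
        min_count=args.min_count,
        sg=args.sg,
        vector_size=args.dim_rho,  # gensim 3.x: size=args.dim_rho
        epochs=args.iters,         # gensim 3.x: iter=args.iters
        workers=args.workers,
        negative=args.negative_samples,
        window=args.window_size,
    )
```

Under gensim >= 4.0 the write loop would also iterate model.wv.index_to_key instead of model.wv.vocab.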
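The pairwise score accumulated in get_topic_coherence (utils.py) is the normalized PMI of a word pair, computed from document frequencies. A small self-contained sketch of the same quantity, using hypothetical counts purely for illustration:

``` python
import numpy as np


def npmi(D, D_wi, D_wj, D_wi_wj):
    """Normalized PMI from document frequencies, as in get_topic_coherence:
    -1 + (log D_wi + log D_wj - 2*log D) / (log D_wi_wj - log D),
    which equals [log P(wi, wj) - log P(wi) - log P(wj)] / (-log P(wi, wj))."""
    return -1 + (np.log(D_wi) + np.log(D_wj) - 2.0 * np.log(D)) / (
        np.log(D_wi_wj) - np.log(D)
    )


# Hypothetical counts, for illustration only.
print(npmi(D=1000, D_wi=50, D_wj=40, D_wi_wj=20))  # ~0.59
```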