Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Containerization and import cleanup, code formatting #28

Open
wants to merge 3 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 18 additions & 0 deletions .dir-locals.el
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
;; Directory-local settings applied to python-mode buffers in this project.
;; They make Emacs use the wrapper scripts in bin/ so that the inferior
;; Python shell is started through `pythondocker'.
(
(python-mode . (
(eval .
(progn
;; Add <project root>/bin/ to `exec-path'.  The project root is the
;; directory in which this dir-locals file is found.
;; NOTE(review): this uses `add-to-list', not `setq-local', so the change
;; is presumably not limited to the current buffer -- confirm this is intended.
(add-to-list 'exec-path (concat (locate-dominating-file default-directory dir-locals-file) "bin/"))
;; Configure the inferior Python shell to run the bin/pythondocker wrapper.
(setq-local python-shell-interpreter "pythondocker")
(setq-local python-shell-interpreter-interactive-arg "-i")
(setq-local python-shell-completion-native-enable nil)
;; Language-server plugin toggles (mypy, black, isort).
(setq-local lsp-pyls-plugins-mypy-enabled t)
;; NOTE(review): unlike its siblings this name contains `.' and `_';
;; confirm that lsp-mode actually reads a variable with this spelling.
(setq-local lsp-pyls-plugins-mypy.live_mode t)
(setq-local lsp-pyls-plugins-black-enabled t)
(setq-local lsp-pyls-plugins-isort-enabled t)
)
)
))
)
209 changes: 209 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,209 @@
# Created by https://www.toptal.com/developers/gitignore/api/linux,python,emacs
# Edit at https://www.toptal.com/developers/gitignore?templates=linux,python,emacs

### Emacs ###
# -*- mode: gitignore; -*-
*~
\#*\#
/.emacs.desktop
/.emacs.desktop.lock
*.elc
auto-save-list
tramp
.\#*

# Org-mode
.org-id-locations
*_archive
ltximg/**

# flymake-mode
*_flymake.*

# eshell files
/eshell/history
/eshell/lastdir

# elpa packages
/elpa/

# reftex files
*.rel

# AUCTeX auto folder
/auto/

# cask packages
.cask/
dist/

# Flycheck
flycheck_*.el

# server auth directory
/server/

# projectiles files
.projectile

# directory configuration
# NOTE(review): a .dir-locals.el is added to the repository root in this same
# change, so this pattern presumably has to be bypassed (e.g. `git add -f`) to
# track it -- confirm whether the rule should be removed.
.dir-locals.el

# network security
/network-security.data


### Linux ###

# temporary files which can be created if a process still has a handle open of a deleted file
.fuse_hidden*

# KDE directory preferences
.directory

# Linux trash folder which might appear on any partition or disk
.Trash-*

# .nfs files are created when an open file is removed but is still being accessed
.nfs*

### Python ###
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
pip-wheel-metadata/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
pytestdebug.log

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/
doc/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
.python-version

# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock

# PEP 582; used by e.g. github.com/David-OConnor/pyflow
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
pythonenv*

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# pytype static type analyzer
.pytype/

# profiling data
.prof

# End of https://www.toptal.com/developers/gitignore/api/linux,python,emacs
31 changes: 31 additions & 0 deletions Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
# Tag of the official python base image; override with --build-arg TAG=...
ARG TAG=3.8

FROM python:$TAG
# Disable Python output buffering.
ENV PYTHONUNBUFFERED=1

# User name, numeric ids and working directory, supplied as build arguments
# (the Makefile `build` target passes all four).
ARG USER
ARG USER_ID
ARG GROUP_ID
ARG WORKDIR

# System libraries and build-time Python packages, installed as root.
# A single update/install pass is enough; the package lists are removed
# afterwards so they do not end up in the image layer.
RUN apt-get update -qq \
    && apt-get install -y -q \
        build-essential graphviz graphviz-dev \
        ffmpeg libsm6 libxext6 \
    && rm -rf /var/lib/apt/lists/* \
    && pip install --upgrade pip \
    && pip install Cython scipy

# Create the group with the requested GROUP_ID (previously hard-coded to 1000,
# which made the following useradd fail for any other gid). If a group with
# that id already exists in the base image it is reused.
RUN getent group "$GROUP_ID" > /dev/null || groupadd --gid "$GROUP_ID" "$USER"
RUN useradd --create-home --uid "$USER_ID" --gid "$GROUP_ID" "$USER"

# Everything below runs as the unprivileged user; `pip install --user`
# places scripts in ~/.local/bin, so add it to PATH.
USER ${USER}
ENV PATH="$PATH:/home/$USER/.local/bin"

COPY ./requirements.txt requirements.txt
RUN pip install --user -r requirements.txt

# CPU-only PyTorch wheels are served from the PyTorch wheel index.
RUN pip install --user torch==1.7.1+cpu torchvision==0.8.2+cpu torchaudio==0.7.2 \
    -f https://download.pytorch.org/whl/torch_stable.html

WORKDIR $WORKDIR
27 changes: 27 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
##
# ETM
#
# @file
# @version 0.1

# Identity of the invoking user and the current directory, forwarded to the
# image build as build arguments.
user := $(shell whoami)
userid := $(shell id -u)
groupid := $(shell id -g)
workdir := $(shell pwd)

# Build the docker-compose image(s), passing user, ids and working directory.
# Values are quoted so a path containing spaces stays a single argument.
.PHONY: build
build:
	docker-compose build \
		--build-arg USER="$(user)" \
		--build-arg USER_ID="$(userid)" \
		--build-arg GROUP_ID="$(groupid)" \
		--build-arg WORKDIR="$(workdir)"

# Remove the built image.
.PHONY: clean-container
clean-container:
	docker rmi etm_etm

# Remove the bytecode cache directory in the current directory.
.PHONY: clean-python
clean-python:
	rm -rf __pycache__
# end
28 changes: 22 additions & 6 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,19 @@ ETM defines words and topics in the same embedding space. The likelihood of a wo

## Dependencies

+ python 3.6.7
+ pytorch 1.1.0
+ python 3.8.7
+ pytorch 1.7.1

## Optional

+ docker
+ docker-compose

### Build Docker Image

``` sh
make build
```

## Datasets

Expand All @@ -20,7 +31,7 @@ All the datasets are pre-processed and can be found below:
+ https://bitbucket.org/franrruiz/data_stopwords_largev_2/src/master/ (this one contains stop words and was used to showcase robustness of ETM to stop words.)
+ https://bitbucket.org/franrruiz/data_20ng_largev/src/master/

All the scripts to pre-process a given dataset for ETM can be found in the folder 'scripts'. The script for 20NewsGroup is self-contained as it uses scikit-learn. If you want to run ETM on your own dataset, follow the script for New York Times (given as example) called data_nyt.py
All the scripts to pre-process a given dataset for ETM can be found in the folder 'scripts'. The script for 20NewsGroup is self-contained, as it uses scikit-learn. If you want to run ETM on your own dataset, follow the example script for the New York Times dataset, `data_nyt.py`.

## To Run

Expand All @@ -38,14 +49,20 @@ To learn interpretable topics using ETM with pre-fitted word embeddings (called

+ first fit the word embeddings. For example to use simple skipgram you can run
```
python skipgram.py --data_file PATH_TO_DATA --emb_file PATH_TO_EMBEDDINGS --dim_rho 300 --iters 50 --window_size 4
python skipgram.py --data_file PATH_TO_DATA --emb_file PATH_TO_EMBEDDINGS --dim_rho 300 --iters 50 --window_size 4
```

+ then run the following
+ then run the following
```
python main.py --mode train --dataset 20ng --data_path data/20ng --emb_path PATH_TO_EMBEDDINGS --num_topics 50 --train_embeddings 0 --epochs 1000
```

## To Run in a Container

``` sh
docker-compose run --rm etm python main.py --mode train --dataset 20ng --data_path data/20ng --num_topics 50 --train_embeddings 1 --epochs 1000
```

## Citation

```
Expand All @@ -56,4 +73,3 @@ python main.py --mode train --dataset 20ng --data_path data/20ng --emb_path PATH
year={2019}
}
```

4 changes: 4 additions & 0 deletions bin/pyls
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
#!/bin/bash
# Start the `pyls` language server inside the `etm` docker-compose service.
# Exit immediately if any command fails.
set -e
# NOTE(review): the disabled build step below names a service `ml`, while the
# command that actually runs uses `etm` -- confirm before re-enabling it.
# docker-compose build ml &> /dev/null
# The container is given the fixed name etm_python and is removed on exit
# (--rm); `exec` replaces this shell with the docker-compose process.
exec docker-compose run --name etm_python --rm --no-deps -T etm pyls
3 changes: 3 additions & 0 deletions bin/pythondocker
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
#!/bin/bash
# Run `python` inside the container named etm_python, forwarding every
# argument given to this script.
set -e
# "$@" must be quoted: unquoted, arguments containing spaces or glob
# characters are re-split and expanded before reaching python.
exec docker exec -it etm_python python "$@"
Loading