This repository has been archived by the owner on Jun 7, 2023. It is now read-only.

add dialoGPT-medium #65

Open
wants to merge 1 commit into base: master

40 changes: 40 additions & 0 deletions models/dialogpt-medium/Dockerfile
@@ -0,0 +1,40 @@
# This file is adapted from github.com/cpllab/lm-zoo/blob/master/models/gpt2/Dockerfile
FROM pytorch/pytorch:1.4-cuda10.1-cudnn7-devel
RUN apt-get -y update \
    && apt-get install -y curl \
    && apt-get clean \
    && rm -rf /var/lib/apt/lists/*

# Root of model directory relative to the build context.

ARG MODEL_ROOT=models/dialogpt-medium
ARG COMMIT
# Test dependencies
RUN pip install nose jsonschema

# Runtime dependencies
RUN pip install sentencepiece==0.1.91 transformers==3.0.2 h5py

# Copy in custom file for surprisal evaluation
COPY ${MODEL_ROOT}/get_surprisals.py /opt/dialogpt-medium/get_surprisals.py
COPY ${MODEL_ROOT}/tokenizer.py /opt/dialogpt-medium/tokenizer.py


# Download the model files
RUN curl https://s3.amazonaws.com/models.huggingface.co/bert/microsoft/DialoGPT-medium/config.json -so /opt/dialogpt-medium/config.json
RUN curl https://cdn.huggingface.co/microsoft/DialoGPT-medium/pytorch_model.bin -so /opt/dialogpt-medium/pytorch_model.bin
RUN curl https://cdn.huggingface.co/microsoft/DialoGPT-medium/vocab.json -so /opt/dialogpt-medium/vocab.json
RUN curl https://cdn.huggingface.co/microsoft/DialoGPT-medium/merges.txt -so /opt/dialogpt-medium/merges.txt

# Copy spec template
COPY ${MODEL_ROOT}/spec.template.json /opt/spec.json

ENV PYTHONIOENCODING utf-8:surrogateescape
# open issue with pytorch https://github.com/pytorch/pytorch/issues/37377
ENV MKL_SERVICE_FORCE_INTEL=1

# Copy external-facing scripts
COPY ${MODEL_ROOT}/bin /opt/bin
ENV PATH "/opt/bin:${PATH}"

WORKDIR /opt/bin
4 changes: 4 additions & 0 deletions models/dialogpt-medium/Readme.md
@@ -0,0 +1,4 @@
This image is adapted from github.com/cpllab/lm-zoo/tree/master/models/transformers-base.

DialoGPT-medium extends gpt-2-medium by fine-tuning on Reddit data in order to model dialogue.
The model's eos token marks a speaker change, which is represented by the `[SEP]` token in the input; handling this mapping requires some modifications to `get_surprisals.py` and `tokenizer.py`.
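
A minimal sketch of that substitution, for illustration only (the example sentence is made up; the real handling lives in get_surprisals.py and tokenizer.py in this PR):

from transformers import AutoTokenizer

# The image stores the downloaded tokenizer files under /opt/dialogpt-medium (see the Dockerfile).
tokenizer = AutoTokenizer.from_pretrained("/opt/dialogpt-medium")

# "[SEP]" in the raw input stands for a speaker change, which DialoGPT models with its eos token.
sentence = "Hello, how are you? [SEP] I'm fine, thanks."
sentence = sentence.replace("[SEP]", tokenizer.eos_token)
tokens = tokenizer.tokenize(sentence)
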
10 changes: 10 additions & 0 deletions models/dialogpt-medium/bin/get_predictions.hdf5
@@ -0,0 +1,10 @@
#!/usr/bin/env bash

#python /opt/transformers/get_surprisals.py --mode predictions \
# "$1" --outputf "$2" \
# --model_path "$TRANSFORMER_MODEL_PATH"


#python /opt/dialogpt-medium/get_surprisals.py --mode predictions \
#    "$1" --outputf "$2"

# get_predictions is not supported by this image (see bin/spec), so exit with an error code.
exit 99
3 changes: 3 additions & 0 deletions models/dialogpt-medium/bin/get_surprisals
@@ -0,0 +1,3 @@
#!/usr/bin/env bash

python /opt/dialogpt-medium/get_surprisals.py --mode surprisal "$1"
46 changes: 46 additions & 0 deletions models/dialogpt-medium/bin/spec
@@ -0,0 +1,46 @@
#!/usr/bin/env python

import json
import os
import sys

import torch

from transformers import AutoTokenizer

# Load spec template.
with open("/opt/spec.json", "r") as spec_f:
spec = json.load(spec_f)

# Load tokenizer.
#model_path = os.environ["TRANSFORMER_MODEL_PATH"]
tokenizer = AutoTokenizer.from_pretrained("/opt/dialogpt-medium")

def filter_none(xs):
    return [x for x in xs if x is not None]

# Set spec vocabulary information from tokenizer.
special_tokens = set(tokenizer.all_special_tokens) - \
    {tokenizer.bos_token, tokenizer.eos_token, tokenizer.unk_token}

spec["vocabulary"] = {
    "items": list(tokenizer.get_vocab().keys()),

    "prefix_types": filter_none([tokenizer.bos_token]),
    "suffix_types": filter_none([tokenizer.eos_token]),
    "unk_types": filter_none([tokenizer.unk_token]),
    "special_types": list(special_tokens),
}
#spec["supported_features"]["get_predictions"] = False
spec["image"]["supported_features"]["get_predictions"] = False
spec["tokenizer"]["type"] = "subword"
spec["tokenizer"]["sentinel_position"] = "initial"
spec["tokenizer"]["sentinel_pattern"] = "Ġ"


spec["image"]["gpu"]["supported"] = True
spec["image"]["maintainer"] = "[email protected]"
spec["ref_url"] = "https://huggingface.co/microsoft/DialoGPT-medium"
spec["name"] = "DialoGPT-medium"

json.dump(spec, sys.stdout)
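
For reference, a hedged sketch of what these vocabulary fields resolve to for DialoGPT-medium: the GPT-2 BPE tokenizer reuses <|endoftext|> as its bos, eos, and unk token, so prefix_types, suffix_types, and unk_types should each contain just that token and special_types should end up empty. Loading from the Hub identifier here rather than the image path, which assumes network access:

from transformers import AutoTokenizer

# Loading from the Hub instead of /opt/dialogpt-medium; requires network access.
tokenizer = AutoTokenizer.from_pretrained("microsoft/DialoGPT-medium")
print(tokenizer.bos_token, tokenizer.eos_token, tokenizer.unk_token)
# expected: <|endoftext|> <|endoftext|> <|endoftext|>
print(len(tokenizer.get_vocab()))
# expected: 50257 (the GPT-2 BPE vocabulary size)
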
8 changes: 8 additions & 0 deletions models/dialogpt-medium/bin/tokenize
@@ -0,0 +1,8 @@
#!/usr/bin/env bash

#python /opt/transformers/tokenizer.py -m tokenize \
# "$1" \
# --model_path "$TRANSFORMER_MODEL_PATH"

python /opt/dialogpt-medium/tokenizer.py -m tokenize \
    "$1"
4 changes: 4 additions & 0 deletions models/dialogpt-medium/bin/unkify
@@ -0,0 +1,4 @@
#!/usr/bin/env bash

python /opt/dialogpt-medium/tokenizer.py -m unkify \
"$1"
144 changes: 144 additions & 0 deletions models/dialogpt-medium/get_surprisals.py
@@ -0,0 +1,144 @@
"""
Adapted from github.com/cpllab/lm-zoo/blob/master/models/transformers-base/get_surprisals.py
Get surprisal estimates for the DialoGPT model.
"""

import argparse
import os
import logging
import operator
from pathlib import Path
import sys

import h5py
import torch
import numpy as np

from transformers import AutoModelForCausalLM, AutoTokenizer

logging.basicConfig(level=logging.WARNING)
logger = logging.getLogger(__name__)
logging.getLogger("transformers").setLevel(logging.ERROR)

def readlines(inputf):
    with inputf as f:
        lines = f.readlines()
        lines = [l.strip('\n') for l in lines]
    return lines

def set_seed(seed, cuda=False):
    np.random.seed(seed)
    torch.manual_seed(seed)
    if cuda:
        torch.cuda.manual_seed_all(seed)


def _get_predictions_inner(sentence, tokenizer, model, device):
    # TODO handle sentence maxlen

    sent_tokens = tokenizer.tokenize(sentence)
    indexed_tokens = tokenizer.convert_tokens_to_ids(sent_tokens)
    # create 1 * T input token tensor
    tokens_tensor = torch.tensor(indexed_tokens).unsqueeze(0)
    tokens_tensor = tokens_tensor.to(device)

    with torch.no_grad():
        log_probs = model(tokens_tensor)[0].log_softmax(dim=2).squeeze()

    return list(zip(sent_tokens, indexed_tokens, (None,) + log_probs.unbind()))


def get_predictions(sentence, tokenizer, model, device):
    for token, idx, probs in _get_predictions_inner(sentence, tokenizer, model, device):
        # Move to CPU before converting to numpy so this also works when the model runs on GPU.
        yield token, idx, probs.cpu().numpy() if probs is not None else probs


def get_surprisals(sentence, tokenizer, model, device):
    # Adapted:
    # In DialoGPT, the eos token is used to mark a speaker change.
    # This is indicated in the input by the string "[SEP]".
    sentence = sentence.replace("[SEP]", tokenizer.eos_token)

    predictions = _get_predictions_inner(sentence, tokenizer, model, device)
    surprisals = []
    for j, (word, word_idx, preds) in enumerate(predictions):
        if preds is None:
            surprisal = 0.0
        else:
            surprisal = -preds[word_idx].item() / np.log(2)

        # convert the eos token back to "[SEP]"
        if word == tokenizer.eos_token:
            word = "[SEP]"
        surprisals.append((word, word_idx, surprisal))
    return surprisals


def main(args):
    # Adapted model paths
    set_seed(args.seed, cuda=args.cuda)

    logger.info('Importing tokenizer and pre-trained model...')
    tokenizer = AutoTokenizer.from_pretrained("/opt/dialogpt-medium")
    model = AutoModelForCausalLM.from_pretrained("/opt/dialogpt-medium")

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)
    model.eval()

    logger.info('Reading sentences from %s...', args.inputf)
    sentences = readlines(args.inputf)

    if args.mode == "surprisal":
        with args.outputf as f:
            f.write("sentence_id\ttoken_id\ttoken\tsurprisal\n")

            for i, sentence in enumerate(sentences):
                surprisals = get_surprisals(sentence, tokenizer, model, device)
                # write surprisals for sentence (append to outputf)
                for j, (word, word_idx, surprisal) in enumerate(surprisals):
                    f.write("%i\t%i\t%s\t%f\n" % (i + 1, j + 1, word, surprisal))
    elif args.mode == "predictions":
        outf = h5py.File(args.outputf.name, "w")

        for i, sentence in enumerate(sentences):
            predictions = list(get_predictions(sentence, tokenizer, model, device))
            tokens, token_ids, probs = list(zip(*predictions))

            # Replace null first prediction with a uniform log-probability
            # distribution
            probs = list(probs)
            probs[0] = np.ones_like(probs[1])
            probs[0] /= probs[0].sum()
            probs[0] = np.log(probs[0])
            probs = np.array(probs)

            group = outf.create_group("/sentence/%i" % i)
            group.create_dataset("predictions", data=probs)
            group.create_dataset("tokens", data=token_ids)

        # dict: word -> idx
        vocab = tokenizer.get_vocab()
        vocab = [tok for tok, idx in sorted(vocab.items(), key=operator.itemgetter(1))]
        vocab_encoded = np.char.encode(vocab, "utf-8")
        outf.create_dataset("/vocabulary", data=vocab_encoded)

        outf.close()
    else:
        raise ValueError("Unsupported mode %s" % args.mode)

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Get token-level model surprisal estimates')
    parser.add_argument("inputf", type=argparse.FileType("r", encoding="utf-8"),
                        help="Input file")
    parser.add_argument("--input_is_tokenized", default=False, action="store_true")
    #parser.add_argument("--model_path", default=None, type=Path, required=True,
    #                    help="Path to model directory containing checkpoint, vocabulary, config, etc.")
    parser.add_argument('--cuda', default=False, action='store_true',
                        help='toggle cuda to run on GPU')
    parser.add_argument('--seed', type=int, default=42,
                        help='random seed')
    parser.add_argument('--outputf', '-o', type=argparse.FileType("w"), default=sys.stdout,
                        help='output file for generated text')
    parser.add_argument("--mode", choices=["surprisal", "predictions"])
    main(parser.parse_args())
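
The surprisal written out above is the negative conditional log-probability of each token, converted from nats to bits by the / np.log(2) factor. A self-contained sketch of that conversion with made-up logits (not real model output):

import numpy as np
import torch

# Toy next-token distribution over a 4-type vocabulary (illustrative values only).
logits = torch.tensor([2.0, 1.0, 0.5, 0.1])
log_probs = logits.log_softmax(dim=0)  # natural-log probabilities, as in the model call above
observed_idx = 1                       # index of the token that actually occurred
surprisal_bits = -log_probs[observed_idx].item() / np.log(2)
print(surprisal_bits)                  # higher = more surprising; 0 bits would mean probability 1
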
34 changes: 34 additions & 0 deletions models/dialogpt-medium/spec.template.json
@@ -0,0 +1,34 @@
{
    "name": "My Language Model",
    "ref_url": "https://me.com/my_lm",

    "image": {
        "maintainer": "[email protected]",
        "version": "0.0.1",
        "checksum": "1235151591413afabcdedfade",
        "datetime": "2020-09-25T18:39:20+00:32",
        "gpu": {
            "required": false,
            "supported": false
        },
        "supported_features": {
            "tokenize": true,
            "unkify": true,
            "get_surprisals": true,
            "get_predictions": false,
            "mount_checkpoint": true
        }
    },

    "vocabulary": {
        "unk_types": ["<unk>"],
        "prefix_types": [],
        "suffix_types": ["<eos>"],
        "special_types": []
    },

    "tokenizer": {
        "type": "word",
        "cased": true
    }
}
81 changes: 81 additions & 0 deletions models/dialogpt-medium/tokenizer.py
@@ -0,0 +1,81 @@
"""
Adapted from github.com/cpllab/lm-zoo/blob/master/models/transformers-base/tokenizer.py
Tokenization utilities for a transformers model.
"""

import argparse
import os
import logging
from pathlib import Path
import sys

import torch
import numpy as np

from transformers import AutoTokenizer
import transformers as tr


logger = logging.getLogger(__name__)


def readlines(inputf):
    with inputf as f:
        lines = f.readlines()
        lines = [l.strip('\n') for l in lines]
    return lines


def tokenize_sentence(sentence, tokenizer):
    utts = sentence.split(" [SEP] ")
    toks = []
    for utt in utts[:-1]:
        curr_toks = tokenizer.tokenize(utt)
        curr_toks.append("[SEP]")
        toks += curr_toks
    curr_toks = tokenizer.tokenize(utts[-1])
    toks += curr_toks
    return toks


def unkify_sentence(sentence, tokenizer):
    unks = []
    utts = sentence.split(" [SEP] ")
    unk_id = tokenizer.convert_tokens_to_ids(tokenizer.unk_token)

    for utt in utts[:-1]:
        token_ids = tokenizer.encode(utt)
        curr_unks = ["1" if idx == unk_id else "0" for idx in token_ids]
        curr_unks.append("0")  # for the [SEP] token, which we're defining to be known
        unks += curr_unks

    token_ids = tokenizer.encode(utts[-1])
    curr_unks = ["1" if idx == unk_id else "0" for idx in token_ids]
    unks += curr_unks

    return unks


def main(args):
    logger.info("Loading tokenizer")
    tokenizer = AutoTokenizer.from_pretrained("/opt/dialogpt-medium")

    logger.info("Reading sentences from %s", args.inputf)
    sentences = readlines(args.inputf)

    f = tokenize_sentence if args.mode == "tokenize" else unkify_sentence
    with args.outputf as of:
        for sentence in sentences:
            of.write(" ".join(f(sentence, tokenizer)) + "\n")


if __name__ == "__main__":
p = argparse.ArgumentParser()
p.add_argument("inputf", type=argparse.FileType("r", encoding="utf-8"), help="Input file")
p.add_argument("-m", "--mode", choices=["tokenize", "unkify"])
#p.add_argument("--model_path", default=None, type=Path, required=True,
# help="Path to model directory containing checkpoint, vocabulary, config, etc.")
p.add_argument('--outputf', '-o', type=argparse.FileType("w"), default=sys.stdout,
help='output file for generated text')

main(p.parse_args())
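
A hypothetical invocation of the two helpers above; the tokenized output shown in the comments is what the GPT-2 BPE would plausibly produce and is illustrative rather than verified:

from transformers import AutoTokenizer
from tokenizer import tokenize_sentence, unkify_sentence  # assumes this module is on the path

tokenizer = AutoTokenizer.from_pretrained("/opt/dialogpt-medium")

sentence = "Hello there! [SEP] Hi, how are you?"
print(tokenize_sentence(sentence, tokenizer))
# e.g. ['Hello', 'Ġthere', '!', '[SEP]', 'Hi', ',', 'Ġhow', 'Ġare', 'Ġyou', '?']
print(unkify_sentence(sentence, tokenizer))
# one "0"/"1" flag per token; all "0" here, since the BPE vocabulary covers this input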