feature/add: char2vec_standalone script
yueyp committed Nov 24, 2017
1 parent 1c76507 commit c72c06e
Showing 2 changed files with 129 additions and 0 deletions.
123 changes: 123 additions & 0 deletions chi_annotator/algo_factory/offline/embedding/char2vec_standalone.py
@@ -0,0 +1,123 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Licensed under the GNU LGPL v2.1 -


"""
USAGE: %(program)s -train CORPUS -output VECTORS -size SIZE -window WINDOW
-cbow CBOW -sample SAMPLE -hs HS -negative NEGATIVE -threads THREADS -iter ITER
-min_count MIN-COUNT -alpha ALPHA -binary BINARY -accuracy FILE
Parameters for training:
-train <file>
Use text data from <file> to train the model
-output <file>
Use <file> to save the resulting word vectors / word clusters
-size <int>
Set size of word vectors; default is 100
-window <int>
Set max skip length between words; default is 5
-sample <float>
Set threshold for occurrence of words. Those that appear with higher frequency in the training data
will be randomly down-sampled; default is 1e-3, useful range is (0, 1e-5)
-hs <int>
Use Hierarchical Softmax; default is 0 (not used)
-negative <int>
Number of negative examples; default is 5, common values are 3 - 10 (0 = not used)
-threads <int>
Use <int> threads (default 3)
-iter <int>
Run more training iterations (default 5)
-min_count <int>
This will discard words that appear less than <int> times; default is 5
-alpha <float>
Set the starting learning rate; default is 0.025 for skip-gram and 0.05 for CBOW
-binary <int>
Save the resulting vectors in binary mode; default is 0 (off)
-cbow <int>
Use the continuous bag of words model; default is 1 (use 0 for skip-gram model)
-accuracy <file>
Compute accuracy of the resulting model's analogical inference power on questions file <file>
Example: python -m chi_annotator.algo_factory.offline.embedding.char2vec_standalone -train data.txt -output vec.txt -size 200 -sample 1e-4 -binary 0 -iter 3
"""


import logging
import os.path
import sys
import argparse
from numpy import seterr

logger = logging.getLogger(__name__)

from gensim.models.word2vec import Word2Vec, LineSentence # avoid referencing __main__ in pickle


if __name__ == "__main__":
logging.basicConfig(
format='%(asctime)s : %(threadName)s : %(levelname)s : %(message)s',
level=logging.INFO)
logger.info("running %s", " ".join(sys.argv))

# check and process cmdline input
program = os.path.basename(sys.argv[0])
if len(sys.argv) < 2:
print(globals()['__doc__'] % locals())
sys.exit(1)

seterr(all='raise') # don't ignore numpy errors

parser = argparse.ArgumentParser()
parser.add_argument("-train", help="Use text data from file TRAIN to train the model", required=True)
parser.add_argument("-output", help="Use file OUTPUT to save the resulting word vectors")
parser.add_argument("-window", help="Set max skip length WINDOW between words; default is 5", type=int, default=5)
parser.add_argument("-size", help="Set size of word vectors; default is 100", type=int, default=100)
parser.add_argument("-sample", help="Set threshold for occurrence of words. Those that appear with higher frequency in the training data will be randomly down-sampled; "
"default is 1e-3, useful range is (0, 1e-5)", type=float, default=1e-3)
parser.add_argument("-hs", help="Use Hierarchical Softmax; default is 0 (not used)", type=int, default=0, choices=[0, 1])
parser.add_argument("-negative", help="Number of negative examples; default is 5, common values are 3 - 10 (0 = not used)", type=int, default=5)
parser.add_argument("-threads", help="Use THREADS threads (default 3)", type=int, default=3)
parser.add_argument("-iter", help="Run more training iterations (default 5)", type=int, default=5)
parser.add_argument("-min_count", help="This will discard words that appear less than MIN_COUNT times; default is 5", type=int, default=5)
parser.add_argument("-alpha", help="Set the starting learning rate; default is 0.025 for skip-gram and 0.05 for CBOW", type=float)
parser.add_argument("-cbow", help="Use the continuous bag of words model; default is 1 (use 0 for skip-gram model)", type=int, default=1, choices=[0, 1])
parser.add_argument("-binary", help="Save the resulting vectors in binary mode; default is 0 (off)", type=int, default=0, choices=[0, 1])
parser.add_argument("-accuracy", help="Use questions from file ACCURACY to evaluate the model")

args = parser.parse_args()

if args.cbow == 0:
skipgram = 1
if not args.alpha:
args.alpha = 0.025
else:
skipgram = 0
if not args.alpha:
args.alpha = 0.05

    # stream the corpus from disk: one sentence per line, tokens separated by whitespace
    corpus = LineSentence(args.train)

    # a single training call: sg selects skip-gram (1) or CBOW (0), hs/negative
    # pick hierarchical softmax vs. negative sampling, and cbow_mean=1 averages
    # context vectors in CBOW mode
    model = Word2Vec(
        corpus, size=args.size, min_count=args.min_count, workers=args.threads,
        window=args.window, sample=args.sample, alpha=args.alpha, sg=skipgram,
        hs=args.hs, negative=args.negative, cbow_mean=1, iter=args.iter)

if args.output:
outfile = args.output
model.wv.save_word2vec_format(outfile, binary=args.binary)
else:
        outfile = os.path.splitext(args.train)[0]  # strip only the extension, keep directories intact
model.save(outfile + '.model')
if args.binary == 1:
model.wv.save_word2vec_format(outfile + '.model.bin', binary=True)
else:
model.wv.save_word2vec_format(outfile + '.model.txt', binary=False)

if args.accuracy:
questions_file = args.accuracy
model.accuracy(questions_file)

logger.info("finished running %s", program)
6 changes: 6 additions & 0 deletions docs/feature.md
@@ -0,0 +1,6 @@
# Features

Features are the input to many NLP models. This tool provides the following types of features (a brief sketch follows the list):

- number_of_tokens
- bag_of_words
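
A rough illustration of what these two features might compute, as a hypothetical sketch rather than chi_annotator's actual API:

```python
from collections import Counter

def extract_features(tokens):
    # hypothetical helper: computes the two feature types listed above
    return {
        "number_of_tokens": len(tokens),        # token count of the input
        "bag_of_words": dict(Counter(tokens)),  # token -> frequency
    }

print(extract_features(["我", "喜欢", "自然语言处理"]))
```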
