forked from deepwel/Chinese-Annotator
Commit c72c06e: feature/add: Script char2vec_standalone script
yueyp committed Nov 24, 2017 (1 parent: 1c76507)
Showing 2 changed files with 129 additions and 0 deletions.
123 changes: 123 additions & 0 deletions
chi_annotator/algo_factory/offline/embedding/char2vec_standalone.py
@@ -0,0 +1,123 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Licensed under the GNU LGPL v2.1

"""
USAGE: %(program)s -train CORPUS -output VECTORS -size SIZE -window WINDOW
-cbow CBOW -sample SAMPLE -hs HS -negative NEGATIVE -threads THREADS -iter ITER
-min_count MIN-COUNT -alpha ALPHA -binary BINARY -accuracy FILE

Parameters for training:
    -train <file>
        Use text data from <file> to train the model
    -output <file>
        Use <file> to save the resulting word vectors / word clusters
    -size <int>
        Set size of word vectors; default is 100
    -window <int>
        Set max skip length between words; default is 5
    -sample <float>
        Set threshold for occurrence of words. Those that appear with higher
        frequency in the training data will be randomly down-sampled;
        default is 1e-3, useful range is (0, 1e-5)
    -hs <int>
        Use Hierarchical Softmax; default is 0 (not used)
    -negative <int>
        Number of negative examples; default is 5, common values are 3 - 10 (0 = not used)
    -threads <int>
        Use <int> threads (default 3)
    -iter <int>
        Run more training iterations (default 5)
    -min_count <int>
        This will discard words that appear less than <int> times; default is 5
    -alpha <float>
        Set the starting learning rate; default is 0.025 for skip-gram and 0.05 for CBOW
    -binary <int>
        Save the resulting vectors in binary mode; default is 0 (off)
    -cbow <int>
        Use the continuous bag of words model; default is 1 (use 0 for skip-gram model)
    -accuracy <file>
        Compute the accuracy of the resulting model's analogical inference power
        on questions file <file>

Example: python -m chi_annotator.algo_factory.offline.embedding.char2vec_standalone \
    -train data.txt -output vec.txt -size 200 -sample 1e-4 -binary 0 -iter 3
"""

import logging
import os.path
import sys
import argparse

from numpy import seterr
from gensim.models.word2vec import Word2Vec, LineSentence  # avoid referencing __main__ in pickle

logger = logging.getLogger(__name__)

if __name__ == "__main__":
    logging.basicConfig(
        format='%(asctime)s : %(threadName)s : %(levelname)s : %(message)s',
        level=logging.INFO)
    logger.info("running %s", " ".join(sys.argv))

    # check and process cmdline input
    program = os.path.basename(sys.argv[0])
    if len(sys.argv) < 2:
        print(globals()['__doc__'] % locals())
        sys.exit(1)

    seterr(all='raise')  # don't ignore numpy errors

    parser = argparse.ArgumentParser()
    parser.add_argument("-train", help="Use text data from file TRAIN to train the model", required=True)
    parser.add_argument("-output", help="Use file OUTPUT to save the resulting word vectors")
    parser.add_argument("-window", help="Set max skip length WINDOW between words; default is 5", type=int, default=5)
    parser.add_argument("-size", help="Set size of word vectors; default is 100", type=int, default=100)
    parser.add_argument(
        "-sample",
        help="Set threshold for occurrence of words. Those that appear with higher frequency in the training data "
             "will be randomly down-sampled; default is 1e-3, useful range is (0, 1e-5)",
        type=float, default=1e-3)
    parser.add_argument("-hs", help="Use Hierarchical Softmax; default is 0 (not used)", type=int, default=0, choices=[0, 1])
    parser.add_argument("-negative", help="Number of negative examples; default is 5, common values are 3 - 10 (0 = not used)", type=int, default=5)
    parser.add_argument("-threads", help="Use THREADS threads (default 3)", type=int, default=3)
    parser.add_argument("-iter", help="Run more training iterations (default 5)", type=int, default=5)
    parser.add_argument("-min_count", help="This will discard words that appear less than MIN_COUNT times; default is 5", type=int, default=5)
    parser.add_argument("-alpha", help="Set the starting learning rate; default is 0.025 for skip-gram and 0.05 for CBOW", type=float)
    parser.add_argument("-cbow", help="Use the continuous bag of words model; default is 1 (use 0 for skip-gram model)", type=int, default=1, choices=[0, 1])
    parser.add_argument("-binary", help="Save the resulting vectors in binary mode; default is 0 (off)", type=int, default=0, choices=[0, 1])
    parser.add_argument("-accuracy", help="Use questions from file ACCURACY to evaluate the model")

    args = parser.parse_args()

    # Pick the training algorithm and its matching default learning rate.
    if args.cbow == 0:
        skipgram = 1
        if not args.alpha:
            args.alpha = 0.025
    else:
        skipgram = 0
        if not args.alpha:
            args.alpha = 0.05

    # LineSentence streams the corpus one whitespace-tokenized line at a time.
    corpus = LineSentence(args.train)

    model = Word2Vec(
        corpus, size=args.size, min_count=args.min_count, workers=args.threads,
        window=args.window, sample=args.sample, alpha=args.alpha, sg=skipgram,
        hs=args.hs, negative=args.negative, cbow_mean=1, iter=args.iter)

    if args.output:
        outfile = args.output
        model.wv.save_word2vec_format(outfile, binary=args.binary)
    else:
        # No explicit output file: derive a name from the training file and
        # save both the full model and the vectors.
        outfile = args.train.split('.')[0]
        model.save(outfile + '.model')
        if args.binary == 1:
            model.wv.save_word2vec_format(outfile + '.model.bin', binary=True)
        else:
            model.wv.save_word2vec_format(outfile + '.model.txt', binary=False)

    if args.accuracy:
        # Evaluate on the analogy questions file and log per-section accuracy.
        questions_file = args.accuracy
        model.accuracy(questions_file)

    logger.info("finished running %s", program)
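
For context, a hedged end-to-end sketch of using this script: LineSentence splits each corpus line on whitespace, so for character-level ("char2vec") vectors every character must be space-separated before training. The helper below and all file names are illustrative, not part of this commit; loading uses gensim's standard KeyedVectors.load_word2vec_format.

    # -*- coding: utf-8 -*-
    # Step 1 (hypothetical helper): space-separate every character so
    # LineSentence yields one token per Chinese character.
    import io

    def segment_chars(in_path, out_path):
        with io.open(in_path, encoding="utf-8") as fin, \
             io.open(out_path, "w", encoding="utf-8") as fout:
            for line in fin:
                chars = [ch for ch in line.strip() if not ch.isspace()]
                fout.write(u" ".join(chars) + u"\n")

    segment_chars("raw_corpus.txt", "char_corpus.txt")

    # Step 2: train (shell command, paths illustrative):
    #   python -m chi_annotator.algo_factory.offline.embedding.char2vec_standalone \
    #       -train char_corpus.txt -output char_vec.txt -size 200 -iter 5 -binary 0

    # Step 3: load the saved vectors for downstream use.
    from gensim.models import KeyedVectors

    vectors = KeyedVectors.load_word2vec_format("char_vec.txt", binary=False)
    print(vectors.most_similar(u"中"))  # nearest characters by cosine similarity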
@@ -0,0 +1,6 @@
# Features

Features are the input to many NLP models. This tool provides the following types of features (see the sketch after this list):

- number_of_tokens
- bag_of_words
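
A minimal sketch of what these two feature types might compute on whitespace-tokenized text; the function names are hypothetical and not taken from this repository:

    # Hypothetical illustrations of the two listed feature types;
    # not code from this commit.
    from collections import Counter

    def number_of_tokens(text):
        # Count whitespace-separated tokens.
        return len(text.split())

    def bag_of_words(text):
        # Map each token to its occurrence count (order is discarded).
        return Counter(text.split())

    print(number_of_tokens("this is a test"))  # 4
    print(bag_of_words("to be or not to be"))  # Counter({'to': 2, 'be': 2, 'or': 1, 'not': 1})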