diff --git a/chi_annotator/algo_factory/offline/embedding/char2vec_standalone.py b/chi_annotator/algo_factory/offline/embedding/char2vec_standalone.py
new file mode 100644
index 0000000..fc620fa
--- /dev/null
+++ b/chi_annotator/algo_factory/offline/embedding/char2vec_standalone.py
@@ -0,0 +1,123 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+#
+# Licensed under the GNU LGPL v2.1 -
+
+
+"""
+USAGE: %(program)s -train CORPUS -output VECTORS -size SIZE -window WINDOW
+-cbow CBOW -sample SAMPLE -hs HS -negative NEGATIVE -threads THREADS -iter ITER
+-min_count MIN-COUNT -alpha ALPHA -binary BINARY -accuracy FILE
+
+
+Parameters for training:
+    -train <file>
+        Use text data from <file> to train the model
+    -output <file>
+        Use <file> to save the resulting word vectors / word clusters
+    -size <int>
+        Set size of word vectors; default is 100
+    -window <int>
+        Set max skip length between words; default is 5
+    -sample <float>
+        Set threshold for occurrence of words. Those that appear with higher frequency in the training data
+        will be randomly down-sampled; default is 1e-3, useful range is (0, 1e-5)
+    -hs <int>
+        Use Hierarchical Softmax; default is 0 (not used)
+    -negative <int>
+        Number of negative examples; default is 5, common values are 3 - 10 (0 = not used)
+    -threads <int>
+        Use <int> threads (default 3)
+    -iter <int>
+        Run more training iterations (default 5)
+    -min_count <int>
+        This will discard words that appear less than <int> times; default is 5
+    -alpha <float>
+        Set the starting learning rate; default is 0.025 for skip-gram and 0.05 for CBOW
+    -binary <int>
+        Save the resulting vectors in binary mode; default is 0 (off)
+    -cbow <int>
+        Use the continuous bag of words model; default is 1 (use 0 for skip-gram model)
+    -accuracy <file>
+        Compute accuracy of the resulting model's analogical inference power on questions file <file>
+
+Example: python -m gensim.scripts.char2vec_standalone -train data.txt -output vec.txt -size 200 -sample 1e-4 -binary 0 -iter 3
+"""
+
+
+import logging
+import os.path
+import sys
+import argparse
+from numpy import seterr
+
+logger = logging.getLogger(__name__)
+
+from gensim.models.word2vec import Word2Vec, LineSentence  # avoid referencing __main__ in pickle
+
+
+if __name__ == "__main__":
+    logging.basicConfig(
+        format='%(asctime)s : %(threadName)s : %(levelname)s : %(message)s',
+        level=logging.INFO)
+    logger.info("running %s", " ".join(sys.argv))
+
+    # check and process cmdline input
+    program = os.path.basename(sys.argv[0])
+    if len(sys.argv) < 2:
+        print(globals()['__doc__'] % locals())
+        sys.exit(1)
+
+    seterr(all='raise')  # don't ignore numpy errors
+
+    parser = argparse.ArgumentParser()
+    parser.add_argument("-train", help="Use text data from file TRAIN to train the model", required=True)
+    parser.add_argument("-output", help="Use file OUTPUT to save the resulting word vectors")
+    parser.add_argument("-window", help="Set max skip length WINDOW between words; default is 5", type=int, default=5)
+    parser.add_argument("-size", help="Set size of word vectors; default is 100", type=int, default=100)
+    parser.add_argument("-sample", help="Set threshold for occurrence of words. Those that appear with higher frequency in the training data will be randomly down-sampled; "
+                        "default is 1e-3, useful range is (0, 1e-5)", type=float, default=1e-3)
+    parser.add_argument("-hs", help="Use Hierarchical Softmax; default is 0 (not used)", type=int, default=0, choices=[0, 1])
+    parser.add_argument("-negative", help="Number of negative examples; default is 5, common values are 3 - 10 (0 = not used)", type=int, default=5)
+    parser.add_argument("-threads", help="Use THREADS threads (default 3)", type=int, default=3)
+    parser.add_argument("-iter", help="Run more training iterations (default 5)", type=int, default=5)
+    parser.add_argument("-min_count", help="This will discard words that appear less than MIN_COUNT times; default is 5", type=int, default=5)
+    parser.add_argument("-alpha", help="Set the starting learning rate; default is 0.025 for skip-gram and 0.05 for CBOW", type=float)
+    parser.add_argument("-cbow", help="Use the continuous bag of words model; default is 1 (use 0 for skip-gram model)", type=int, default=1, choices=[0, 1])
+    parser.add_argument("-binary", help="Save the resulting vectors in binary mode; default is 0 (off)", type=int, default=0, choices=[0, 1])
+    parser.add_argument("-accuracy", help="Use questions from file ACCURACY to evaluate the model")
+
+    args = parser.parse_args()
+
+    if args.cbow == 0:
+        skipgram = 1
+        if not args.alpha:
+            args.alpha = 0.025
+    else:
+        skipgram = 0
+        if not args.alpha:
+            args.alpha = 0.05
+
+    corpus = LineSentence(args.train)
+
+    model = Word2Vec(
+        corpus, size=args.size, min_count=args.min_count, workers=args.threads,
+        window=args.window, sample=args.sample, alpha=args.alpha, sg=skipgram,
+        hs=args.hs, negative=args.negative, cbow_mean=1, iter=args.iter)
+
+    if args.output:
+        outfile = args.output
+        model.wv.save_word2vec_format(outfile, binary=args.binary)
+    else:
+        outfile = args.train.split('.')[0]
+        model.save(outfile + '.model')
+        if args.binary == 1:
+            model.wv.save_word2vec_format(outfile + '.model.bin', binary=True)
+        else:
+            model.wv.save_word2vec_format(outfile + '.model.txt', binary=False)
+
+    if args.accuracy:
+        questions_file = args.accuracy
+        model.accuracy(questions_file)
+
+    logger.info("finished running %s", program)
\ No newline at end of file
diff --git a/docs/feature.md b/docs/feature.md
new file mode 100644
index 0000000..1bb018a
--- /dev/null
+++ b/docs/feature.md
@@ -0,0 +1,6 @@
+# Features
+
+Features are the input to many NLP models. This tool provides the following types of features:
+
+- number_of_tokens
+- bag_of_words
\ No newline at end of file
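For reference, the vectors written by this script can be read back with gensim's `KeyedVectors` loader. The snippet below is only a minimal sketch, assuming gensim 3.x and that the script was run with a text-format output file named `vec.txt` (the filename from the docstring example); the query token is purely illustrative and must actually occur in the training corpus.

```python
# Minimal usage sketch (assumptions: gensim 3.x is installed and the script above
# was run with `-output vec.txt -binary 0`, i.e. vectors saved in text format).
from gensim.models import KeyedVectors

# Load the character/word vectors produced by char2vec_standalone.py.
wv = KeyedVectors.load_word2vec_format("vec.txt", binary=False)

# Query the nearest neighbours of a token that appears in the training corpus
# ("好" here is only an illustrative example).
print(wv.most_similar("好", topn=5))
```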