forked from Leoxu123/DLNLP
-
Notifications
You must be signed in to change notification settings - Fork 0
/
train.py
45 lines (34 loc) · 1.35 KB
/
train.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
#!/usr/env/bin python
# -*- coding: utf-8 -*-
import numpy as np
import random
from cs224d.data_utils import *
from utils import *
from word2vec import *
from sgd import *
# Load some data and initialize word vectors
# Reset the random seed to make sure that everyone gets the same results
random.seed(314)
dataset = StanfordSentiment()
tokens = dataset.tokens()
nWords = len(tokens)
# We are going to train 10-dimensional vectors for this assignment
dimVectors = 10
# Context size
C = 5
# Train word vectors (this could take a while!)
# Reset the random seed to make sure that everyone gets the same results
random.seed(31415)
np.random.seed(9265)
wordVectors = np.concatenate(((np.random.rand(nWords, dimVectors) - .5) / dimVectors,
np.zeros((nWords, dimVectors))), axis=0)
wordVectors0 = sgd(lambda vec: word2vec_sgd_wrapper(skipgram, tokens, vec, dataset, C, negSamplingCostAndGradient),
wordVectors, 0.3, 40000, None, False, PRINT_EVERY=10)
# sanity check: cost at convergence should be around or below 10
# sum the input and output word vectors
wordVectors = (wordVectors0[:nWords,:] + wordVectors0[nWords:,:])
print "\n=== For autograder ==="
checkWords = ["the", "a", "an", "movie", "ordinary", "but", "and"]
checkIdx = [tokens[word] for word in checkWords]
checkVecs = wordVectors[checkIdx, :]
print checkVecs