forked from devmount/GermanWordEmbeddings
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathvisualize.py
executable file
·152 lines (133 loc) · 5.34 KB
/
visualize.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# script to visualize word embeddings of given model with PCA dimensionality reduction
# creates image with matplotlib
#
# @author: Andreas Mueller
# @see: Bachelor Thesis 'Analyse von Wort-Vektoren deutscher Textkorpora'
#
# Contributors:
# Michael Egger <[email protected]>
#
# @example: python visualize.py path/to/trained.model
import argparse
import gensim
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
# command line interface: one required positional argument naming the model file
parser = argparse.ArgumentParser(
    description='Script for visualizing word vector models',
)
parser.add_argument(
    'model',
    type=str,
    help='source file with trained model',
)
# parsed at import time; ``args.model`` is read further down when the model is loaded
args = parser.parse_args()
# configuration
#
# Every list below is a flat sequence of word pairs (entry 2k belongs with entry 2k+1).
# draw_words() can colour the two halves of a pair differently (``alternate``) and
# connect them with an arrow (``arrows``), so the order of the entries matters.

# country -> currency
currency = [
    'Schweiz', 'Franken',
    'Deutschland', 'Euro',
    'Grossbritannien', 'Pfund',
    'Japan', 'Yen',
    'Russland', 'Rubel',
    'USA', 'US-Dollar',
    'Kroatien', 'Kuna',
]

# alternative word lists, currently disabled
# capital = [
#     'Athen', 'Griechenland', 'Berlin', 'Deutschland', 'Ankara', 'Tuerkei', 'Bern', 'Schweiz', 'Hanoi', 'Vietnam',
#     'Lissabon', 'Portugal', 'Moskau', 'Russland', 'Stockholm', 'Schweden', 'Tokio', 'Japan', 'Washington', 'USA'
# ]
# language = [
#     'Deutschland', 'Deutsch', 'USA', 'Englisch', 'Frankreich', 'Franzoesisch', 'Griechenland', 'Griechisch',
#     'Norwegen', 'Norwegisch', 'Schweden', 'Schwedisch', 'Polen', 'Polnisch', 'Ungarn', 'Ungarisch'
# ]

# industry -> company
industry_company = [
    'Banken', 'Commerzbank',
    'Banken', 'WestLB',
    'Banken', 'Haspa',
    'Versicherungen', 'Allianz',
    'Automobilbau', 'Daimler',
    'Automobilbau', 'BMW',
    'Energiewirtschaft', 'RWE',
    'Energiewirtschaft', 'Ruhrkohle',
]

# industry -> product
industry_product = [
    'Banken', 'Kredit',
    'Banken', 'Konto',
    'Banken', 'Geld',
    'Versicherungen', 'Haftpflicht',
    'Versicherungen', 'Lebensversicherung',
    'Automobilbau', 'Auto',
    'Automobilbau', 'LKW',
    'Energiewirtschaft', 'Strom',
    'Energiewirtschaft', 'Kohle',
    'Energiewirtschaft', 'Windenergie',
    'Energiewirtschaft', 'Solarstrom',
]

# consulting industry -> consulting firm
# NOTE(review): 'SKS Unternehmensberatung' contains a space while 'Roland_Berger' uses an
# underscore -- confirm that the former really is a key in the model's vocabulary.
consulting = [
    'Unternehmensberatung', 'zeb',
    'Unternehmensberatung', 'SKS Unternehmensberatung',
    'Unternehmensberatung', 'Capgemini',
    'Unternehmensberatung', 'McKinsey',
    'Unternehmensberatung', 'KPMG',
    'Unternehmensberatung', 'PwC',
    'Unternehmensberatung', 'EY',
    'Unternehmensberatung', 'ifb',
    'Unternehmensberatung', 'Wipro',
    'Unternehmensberatung', 'Roland_Berger',
    'Unternehmensberatung', 'Adesso',
]
# matches = model.most_similar(positive=["Frau"], negative=[], topn=30)
# words = [match[0] for match in matches]
def draw_words(model, words, pca=False, alternate=True, arrows=True, x1=3, x2=3, y1=3, y2=3, title=''):
    """
    Reduces dimensionality of vectors of given words either with PCA or with t-SNE and draws the words into a diagram.

    :param model: to visualize vectors from; must support ``model[word]`` lookups
    :param words: list of word strings to visualize
    :param pca: use PCA (True) or t-SNE (False) to reduce dimensionality
    :param alternate: use different color and label align for every second word
    :param arrows: use arrows to connect related words (items that are next to each other in list)
    :param x1: x axis range (from), only applied to PCA plots
    :param x2: x axis range (to), only applied to PCA plots
    :param y1: y axis range (from), only applied to PCA plots
    :param y2: y axis range (to), only applied to PCA plots
    :param title: for diagram
    :return: None
    :raises ValueError: if ``words`` is empty
    """
    if not words:
        raise ValueError('words must contain at least one word to visualize')

    # get vectors for given words from model
    vectors = [model[word] for word in words]

    if pca:
        # keep the estimator under its own name so the boolean ``pca`` flag stays intact
        reducer = PCA(n_components=2, whiten=True)
        vectors2d = reducer.fit(vectors).transform(vectors)
    else:
        # scikit-learn requires perplexity < number of samples; its default of 30 is too
        # large for short word lists, so cap it (lists with more than 30 words keep 30)
        perplexity = min(30.0, max(len(words) - 1, 1))
        reducer = TSNE(n_components=2, random_state=0, perplexity=perplexity)
        vectors2d = reducer.fit_transform(vectors)

    # draw image
    plt.figure(figsize=(6, 6))
    # A fixed view is only applied to PCA plots. A degenerate range (such as the 3..3
    # defaults) cannot frame the points, so fall back to matplotlib's autoscaling then.
    if pca and x1 != x2 and y1 != y2:
        plt.axis([x1, x2, y1, y2])

    first = True  # color alternation to divide given groups
    for point, word in zip(vectors2d, words):
        px, py = point[0], point[1]
        # plot points
        plt.scatter(px, py, c='r' if first else 'g')
        # plot word annotations: label to the left of "first" points, to the right otherwise
        plt.annotate(
            word,
            xy=(px, py),
            xytext=(-7, -6) if first else (7, -6),
            textcoords='offset points',
            ha='right' if first else 'left',
            va='bottom',
            size="x-large"
        )
        if alternate:
            first = not first

    # draw arrows from every even-indexed word to its successor in the list
    if arrows:
        for i in range(0, len(words) - 1, 2):
            # shift both ends horizontally by 0.04 so the arrow does not start/end on the markers
            start_x = vectors2d[i][0] + 0.04
            start_y = vectors2d[i][1]
            end_x = vectors2d[i + 1][0] - 0.04
            end_y = vectors2d[i + 1][1]
            plt.arrow(
                start_x, start_y, end_x - start_x, end_y - start_y,
                shape='full',
                lw=0.1,
                edgecolor='#bbbbbb',
                facecolor='#bbbbbb',
                length_includes_head=True,
                head_width=0.08,
                width=0.01
            )

    # draw diagram title
    if title:
        plt.title(title)
    plt.tight_layout()
    plt.show()
# get trained model
print(u'Load word2vec model from file...')
# mmap='r' is passed through to gensim's loader (read-only memory mapping of stored arrays)
mwv = gensim.models.KeyedVectors.load(args.model, mmap='r')

# draw pca plots
# Each title is a mathtext expression and therefore needs both an opening and a closing
# '$'; with an unbalanced '$' matplotlib prints the raw string, backslashes included.
pca_plots = [
    (currency, r'$PCA\ Visualisierung:\ Currency$'),
    (consulting, r'$PCA\ Visualisierung:\ Consulting$'),
    (industry_company, r'$PCA\ Visualisierung:\ Industry and Company$'),
    (industry_product, r'$PCA\ Visualisierung:\ Industry and Product$'),
]
for plot_words, plot_title in pca_plots:
    draw_words(
        mwv, plot_words,
        pca=True, alternate=True, arrows=True,
        x1=-3, x2=3, y1=-2, y2=2,
        title=plot_title,
    )