-
Notifications
You must be signed in to change notification settings - Fork 0
/
mkfeature.rb
118 lines (101 loc) · 2.67 KB
/
mkfeature.rb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
# -*- coding: utf-8 -*-
require 'MeCab'
# Compatibility shim for Ruby older than 1.9, whose String is byte-oriented.
# It switches the interpreter to UTF-8 ($KCODE) and patches String so the rest
# of this script can index and measure strings in characters, and can call
# force_encoding / ord, exactly as it does on Ruby 1.9+.
# On Ruby 1.9 and later this whole block is skipped.
if RUBY_VERSION < '1.9'
  $KCODE = 'u'
  class String
    # Keep the built-in indexer reachable under another name so the override
    # below can delegate to it. The guard keeps a second load of this file from
    # re-aliasing the already-overridden method.
    unless method_defined?(:__mkfeature_native_aref)
      alias_method :__mkfeature_native_aref, :[]
    end

    # No-op stand-in: strings carry no encoding on these Ruby versions.
    def force_encoding(enc)
      self
    end

    # Code point of the first UTF-8 character.
    def ord
      unpack('U')[0]
    end

    # Length in characters rather than bytes.
    def length
      chars.count
    end

    # Character-based indexing for integer and range arguments.
    # Any other argument (String, Regexp, ...) is handed to the built-in
    # indexer; calling self[arg] here would re-enter this method forever.
    def [](arg)
      if arg.kind_of? Integer
        chars.to_a[arg]
      elsif arg.kind_of? Range
        # An out-of-range slice is nil; return nil instead of calling nil.join.
        picked = chars.to_a[arg]
        picked && picked.join
      else
        __mkfeature_native_aref(arg)
      end
    end
  end
end
# Collects the nodes that start at character index +i+ of +sen+.
#
# bos - object answering begin_node_list(byte_offset); callers in this file
#       pass the node returned by MeCab::Tagger#parseToNode.
# sen - the sentence that was parsed.
# i   - character index into +sen+.
#
# Returns an Array of nodes, obtained by following +bnext+ from the node that
# bos.begin_node_list returns until it is nil/false. The Array is empty when
# the character at +i+ is a space.
def begin_node_list(bos, sen, i)
  return [] if sen[i] == ' '

  # Text preceding the character; empty when i is 0.
  prefix = sen[0...i]

  # The lookup offset is in bytes and excludes the run of spaces directly
  # before the character.
  # NOTE(review): presumably MeCab anchors a token that follows whitespace at
  # the start of that whitespace - confirm against the MeCab docs.
  trailing = / +$/.match(prefix)
  padding = trailing ? trailing[0].length : 0
  offset = prefix.bytesize - padding

  found = []
  cursor = bos.begin_node_list(offset)
  while cursor
    found.push(cursor)
    cursor = cursor.bnext
  end
  found
end
# Lists every node that begin_node_list reports for each character index
# of +sen+.
#
# bos - passed straight through to begin_node_list.
# sen - the sentence that was parsed.
#
# Returns an Array of Hashes, ordered by character index and then by the order
# begin_node_list returned the nodes:
#   :pos  - character index at which the node starts
#   :len  - length of the node's surface in characters
#   :node - the node itself
# Each node's surface string is marked as UTF-8 in place (force_encoding).
def all_morph(bos, sen)
  per_char = sen.chars.each_with_index.map do |_char, offset|
    begin_node_list(bos, sen, offset).map do |node|
      surface = node.surface.force_encoding('utf-8')
      { :pos => offset, :len => surface.length, :node => node }
    end
  end
  per_char.flatten(1)
end
# Manual smoke test (not called by the script itself): parses a fixed sample
# sentence and prints one tab-separated row per node found by all_morph -
# start index, length in characters, surface, feature.
def test
  sentence = '井ノ上 です'
  tagger = MeCab::Tagger.new('-a -l2')
  head = tagger.parseToNode(sentence)
  all_morph(head, sentence).each do |entry|
    node = entry[:node]
    row = [entry[:pos], entry[:len],
           node.surface.force_encoding('utf-8'), node.feature.force_encoding('utf-8')]
    puts "%s\t%s\t%s\t%s" % row
  end
end
# Maps a probability ratio to the integer cost printed by main.
# 1E-10 is added before taking the log so that a ratio of 0 yields a finite
# cost (230) instead of -Infinity.
def boundary_cost(ratio)
  -1 * (Math.log(ratio + 1E-10) * 10.0).to_i
end

# Reads sentences from STDIN, one per line, and prints one tab-separated row
# per character followed by an empty line after each sentence.
#
# Row columns:
#   1. code point of the character
#   2. B score - cost derived from the share of probability mass, among the
#      nodes covering the character, held by nodes that start at it
#   3. I score - cost derived from the remaining share
#   4. first two comma-separated feature fields of the most probable node that
#      starts at the character, or 'None' when no node starts there
#   5. tag - always empty here
# Spaces get the maximum cost for both scores and the fixed feature '記号,空白'.
def main
  # The tagger does not depend on the input line, so it is built once.
  # NOTE(review): '-a -l2' presumably asks MeCab for all morphs with marginal
  # probabilities - confirm against the MeCab option reference.
  tagger = MeCab::Tagger.new('-a -l2')
  while (line = STDIN.gets)
    body = line.chomp
    bos = tagger.parseToNode(body)
    morph = all_morph(bos, body)
    body.chars.each_with_index do |c, i|
      if c == ' '
        bscore = boundary_cost(0.0)
        iscore = boundary_cost(0.0)
        feature = '記号,空白'
      else
        # Nodes whose span covers this character, and those that start at it.
        cov = morph.select { |e| e[:pos] <= i && i < e[:pos] + e[:len] }.map { |e| e[:node] }
        beg = morph.select { |e| e[:pos] == i }.map { |e| e[:node] }
        total = cov.reduce(0.0) { |sum, n| sum + n.prob }
        bprob = beg.reduce(0.0) { |sum, n| sum + n.prob }
        # 0.0 is truthy in Ruby, so the mass must be compared explicitly:
        # dividing by a zero total gives NaN, and NaN.to_i raises
        # FloatDomainError.
        if total > 0.0
          bscore = boundary_cost(bprob / total)
          iscore = boundary_cost((total - bprob) / total)
        else
          bscore = boundary_cost(0.0)
          iscore = boundary_cost(0.0)
        end
        if beg.empty?
          feature = 'None'
        else
          maxnode = beg.max_by { |n| n.prob }
          feature = maxnode.feature.force_encoding('utf-8').split(',')[0..1].join(',')
        end
      end
      tag = ''
      puts "%s\t%03d\t%03d\t%s\t%s" % [c.ord, bscore, iscore, feature, tag]
    end
    puts ""
  end
end
# Script entry point: run the STDIN-to-feature conversion as soon as this file is loaded.
main