forked from t4t5u0/lyrics-parse
-
Notifications
You must be signed in to change notification settings - Fork 0
/
main.py
64 lines (49 loc) · 1.46 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
import re
from itertools import chain
from sudachipy import tokenizer
from sudachipy import dictionary
# Part-of-speech labels (as they appear in the tuples returned by
# ``part_of_speech()``) that drive the segmentation rules below.
POS_PUNCTUATION = '補助記号'
POS_CASE_PARTICLE = '格助詞'
POS_ADJECTIVAL_NOUN = '形状詞'


def tokenize_text(text, tokenizer_obj, mode):
    """Tokenize *text* into a flat list of ``(surface, part_of_speech)`` pairs.

    The text is first cut on ideographic spaces (U+3000) and newlines; each
    piece is tokenized separately with ``tokenizer_obj.tokenize(piece, mode)``
    and the per-piece results are concatenated in order.
    """
    pieces = re.split("\u3000|\n", text)
    return list(chain.from_iterable(
        [(m.surface(), m.part_of_speech())
         for m in tokenizer_obj.tokenize(piece, mode)]
        for piece in pieces))


def find_boundaries(tokens):
    """Return the sorted list of split positions for *tokens*.

    *tokens* is a list of ``(surface, pos)`` pairs where ``pos[0]`` and
    ``pos[1]`` are the first two part-of-speech fields.  The result always
    starts with ``0`` and contains ``len(tokens)``.

    Rules:
      * split after every punctuation token (``pos[0] == '補助記号'``);
      * split after every case particle (``pos[1] == '格助詞'``) unless the
        following token is punctuation;
      * adjust the splits around every adjectival noun
        (``pos[0] == '形状詞'``), see the loop below.
    """
    last = len(tokens) - 1
    boundaries = [0]
    for i, (_, pos) in enumerate(tokens):
        if pos[0] == POS_PUNCTUATION:
            boundaries.append(i + 1)
        elif pos[1] == POS_CASE_PARTICLE:
            # For the final token the "next" token is the token itself,
            # which is never punctuation here, so a trailing case particle
            # still produces a boundary at len(tokens).
            following = tokens[min(i + 1, last)][1]
            if following[0] != POS_PUNCTUATION:
                boundaries.append(i + 1)
    boundaries.append(len(tokens))

    adjectival_positions = [i for i, (_, pos) in enumerate(tokens)
                            if pos[0] == POS_ADJECTIVAL_NOUN]
    for i in adjectival_positions:
        # NOTE(review): the indentation of this script was lost in the copy
        # this was restored from; the append is assumed to belong to the
        # "no boundary at i" branch only -- confirm against the upstream file.
        if i in boundaries:
            # Boundary 0 is never removed: the joining step relies on it,
            # and dropping it would silently discard the leading tokens.
            if i != 0:
                boundaries.remove(i)
        else:
            # Example: ('に', 格助詞) ('も', 係助詞) <形状詞>.  The case
            # particle rule splits between に and も; move that split so it
            # falls immediately before the adjectival noun instead.
            if i - 1 > 0 and (i - 1) in boundaries:
                boundaries.remove(i - 1)
            boundaries.append(i)
    boundaries.sort()
    return boundaries


def segment_tokens(tokens):
    """Join *tokens* into one string with a space at every boundary.

    Surfaces inside a segment are concatenated without a separator; segments
    are joined with a single space.  Empty segments (e.g. the one produced
    when the text ends in punctuation) are skipped so the result has no
    trailing space.
    """
    boundaries = find_boundaries(tokens)
    segments = (tokens[start:end]
                for start, end in zip(boundaries, boundaries[1:]))
    return ' '.join(''.join(surface for surface, _ in segment)
                    for segment in segments if segment)


def main():
    """Read ``./input.txt``, segment it and print the result."""
    tokenizer_obj = dictionary.Dictionary().create()
    mode = tokenizer.Tokenizer.SplitMode.C

    # Explicit encoding: the input is Japanese text and must not depend on
    # the platform's locale default.  Assumes the file is UTF-8.
    with open('./input.txt', encoding='utf-8') as f:
        txt = f.read()

    print(segment_tokens(tokenize_text(txt, tokenizer_obj, mode)))


if __name__ == "__main__":
    main()