-
Notifications
You must be signed in to change notification settings - Fork 0
/
textseg.py
49 lines (40 loc) · 1.7 KB
/
textseg.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
import argparse
from lcseg import LCseg
from log import Log
def get_options(argv=None):
    """Parse the command-line options for the LCseg text-segmentation script.

    Args:
        argv: Optional list of argument strings to parse. When ``None``
            (the default) argparse reads ``sys.argv[1:]``, which is what the
            script entry point relies on. Passing an explicit list makes the
            parser usable from other code and from tests.

    Returns:
        argparse.Namespace with the attributes ``input_file_path`` (str),
        ``dic`` (str), ``gap`` (int), ``window`` (int), ``p_limit`` (float)
        and ``alpha`` (float).
    """
    parser = argparse.ArgumentParser(
        description='This script is to segment text using LCseg.')

    # file parameter
    parser.add_argument('-i', '--input_file_path',
                        default="./dat/sample.dat",
                        help='File path of input text you want to segment.')

    # MeCab dic parameter
    parser.add_argument('-d', '--dic',
                        default="",
                        help='Dictionary path for MeCab')

    # text segmentation's parameters
    # (help texts are user-facing runtime strings and are kept verbatim;
    #  approximate English glosses are given in the comments)
    # gap: length of the gap at which a chain is split
    parser.add_argument('-g', '--gap',
                        type=int,
                        default=11,
                        help='連鎖を分割する空白の長さ(gap)')
    # window: width of the analysis window
    parser.add_argument('-w', '--window',
                        type=int,
                        default=2,
                        help='分析窓幅(window)')
    # p_limit: cut-off threshold on the boundary confidence value
    parser.add_argument('-pl', '--p_limit',
                        type=float,
                        default=0.1,
                        help='境界線信頼値の足きり閾値')
    # alpha: limit of the threshold applied to hypothesized boundaries
    parser.add_argument('-a', '--alpha',
                        type=float,
                        default=0.5,
                        help='仮定した境界線に対する閾値の限界')

    # argv=None makes argparse fall back to sys.argv[1:], preserving the
    # original no-argument behaviour.
    return parser.parse_args(argv)
def main(options):
    """Segment the input text with LCseg and print each segment to stdout.

    Args:
        options: Object exposing ``gap``, ``window``, ``p_limit``, ``alpha``
            and ``input_file_path`` attributes (e.g. the namespace returned
            by ``get_options``).
    """
    # NOTE(review): options.dic is parsed by get_options but is not consumed
    # here - confirm whether LCseg should receive the MeCab dictionary path.
    segmenter = LCseg(
        gap=options.gap,
        window=options.window,
        p_limit=options.p_limit,
        alpha=options.alpha,
    )
    loaded_sentences = segmenter.read_file(options.input_file_path)
    segments = segmenter.run(loaded_sentences, reference_border_num=3)

    # Each segment is printed one entry per line, followed by a blank line
    # that separates it from the next segment.
    for segment in segments:
        segment_text = '\n'.join(segment)
        print(segment_text, end='\n\n')
if __name__ == '__main__':
    # Script entry point: parse the command line, then run the segmentation.
    cli_options = get_options()
    main(cli_options)