# Forked from hankcs/TreebankPreprocessing
# -*- coding:utf-8 -*-
# Filename: ctb.py
# Author:hankcs
# Date: 2017-11-03 21:23
import argparse
from os import listdir
from os.path import isfile, join, isdir
import nltk
from utility import make_sure_path_exists, eprint, combine_files
def convert(ctb_root, out_root):
    """Strip the XML wrapper from every bracketed treebank file.

    Every regular file in ``<ctb_root>/bracketed`` is read as UTF-8 and a
    file of the same name is written into ``out_root`` containing only the
    lines found between a line starting with ``<S ID=`` and the next line
    starting with ``</S>``.

    :param ctb_root: treebank root folder; must contain a ``bracketed``
        sub-folder.
    :param out_root: destination folder, created through
        ``make_sure_path_exists`` before any file is written.
    """
    ctb_root = join(ctb_root, 'bracketed')
    fids = [f for f in listdir(ctb_root) if isfile(join(ctb_root, f))]
    make_sure_path_exists(out_root)
    for f in fids:
        # Write UTF-8 explicitly so the output does not depend on the
        # platform's locale encoding and mirrors how the source is read.
        with open(join(ctb_root, f), encoding='utf-8') as src, \
                open(join(out_root, f), 'w', encoding='utf-8') as out:
            in_s_tag = False
            try:
                for line in src:
                    if line.startswith('<S ID='):
                        in_s_tag = True
                    elif line.startswith('</S>'):
                        in_s_tag = False
                    elif in_s_tag:
                        out.write(line)
            except (UnicodeError, OSError) as e:
                # Stay best-effort (keep what was already written and move
                # on to the next file) but tell the user instead of hiding
                # the failure; unrelated exceptions such as
                # KeyboardInterrupt are no longer swallowed.
                eprint('Stopped converting %s early: %s' % (f, e))
def combine_fids(fids, out_path):
    """Combine the converted files of the given ids into one output file.

    Each id is mapped to ``chtb_NNNN.nw``, or to ``chtb_NNNN.mz`` when the
    id is greater than 1000. Ids whose file does not exist under
    ``ctb_in_nltk`` are skipped. The remaining file names are handed to
    ``combine_files`` together with the opened output file and the ``ctb``
    corpus object.

    Relies on the module-level names ``ctb_in_nltk`` and ``ctb`` that the
    script entry point assigns before calling this function.

    :param fids: iterable of integer file ids.
    :param out_path: path of the combined file to write (overwritten).
    """
    print('Generating ' + out_path)
    candidates = ('chtb_%04d.%s' % (fid, 'mz' if fid > 1000 else 'nw')
                  for fid in fids)
    files = [name for name in candidates if isfile(join(ctb_in_nltk, name))]
    # Explicit UTF-8 keeps the result independent of the locale encoding.
    with open(out_path, 'w', encoding='utf-8') as out:
        combine_files(files, out, ctb)
if __name__ == '__main__':
    # Script entry point: strip XML tags from the raw treebank, register the
    # result as an NLTK corpus named 'ctb', then write train/dev/test files.
    parser = argparse.ArgumentParser(description='Combine Chinese Treebank 5.1 fid files into train/dev/test set')
    parser.add_argument("--ctb", required=True,
                        help='The root path to Chinese Treebank 5.1')
    parser.add_argument("--output", required=True,
                        help='The folder where to store the output train.txt/dev.txt/test.txt')
    args = parser.parse_args()

    # NLTK searches nltk.data.path in order, so install the corpus into the
    # first existing directory; picking a later one could be shadowed by an
    # earlier entry and is more likely to be a non-writable system folder.
    ctb_in_nltk = next((root for root in nltk.data.path if isdir(root)), None)
    if ctb_in_nltk is None:
        eprint('You should run nltk.download(\'ptb\') to fetch some data first!')
        # SystemExit instead of the site-provided exit() helper, which is
        # not guaranteed to exist in every interpreter configuration.
        raise SystemExit(1)
    ctb_in_nltk = join(ctb_in_nltk, 'corpora', 'ctb')

    print('Converting CTB: removing xml tags...')
    convert(args.ctb, ctb_in_nltk)
    print('Importing to nltk...\n')
    from nltk.corpus import BracketParseCorpusReader, LazyCorpusLoader

    ctb = LazyCorpusLoader(
        'ctb', BracketParseCorpusReader, r'chtb_\d{4}\.\w{2}',
        tagset='unknown')

    # File-id ranges (inclusive) assigned to each split.
    training = list(range(1, 815 + 1)) + list(range(1001, 1136 + 1))
    development = list(range(886, 931 + 1)) + list(range(1148, 1151 + 1))
    test = list(range(816, 885 + 1)) + list(range(1137, 1147 + 1))

    root_path = args.output
    # Create the output folder up front; opening train.txt inside a missing
    # directory would otherwise fail.
    make_sure_path_exists(root_path)
    combine_fids(training, join(root_path, 'train.txt'))
    combine_fids(development, join(root_path, 'dev.txt'))
    combine_fids(test, join(root_path, 'test.txt'))