-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmulti_processing_get_df.py
93 lines (79 loc) · 2.73 KB
/
multi_processing_get_df.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
import pandas as pd
import numpy as np
import boto
import boto3
import nltk
from nltk.tag import StanfordNERTagger
import time
from multiprocessing import Pool
import sys
from tqdm import tqdm
# Locations of the Stanford NER classifier model and the Stanford NER jar on
# the host this script runs on.
_NER_MODEL_PATH = '/home/ec2-user/GKGPreprocessing/stanford-ner-2018-10-16/classifiers/english.all.3class.distsim.crf.ser.gz'
_NER_JAR_PATH = '/home/ec2-user/GKGPreprocessing/stanford-ner-2018-10-16/stanford-ner-3.9.2.jar'

# Module-level tagger shared by ner_result(); it is built at import time.
st = StanfordNERTagger(
    _NER_MODEL_PATH,
    _NER_JAR_PATH,
    encoding='utf-8',
)
def getDecision(input_list):
    """Report whether any tagged token is labelled as an organization.

    Args:
        input_list: iterable of indexable items whose element at position 1
            is the entity label (e.g. ``(token, label)`` pairs).

    Returns:
        The string "Yes" if at least one item carries the label
        'ORGANIZATION', otherwise "No" (including for an empty input).
    """
    # any() stops at the first hit, which is what the original early return
    # did; the statement that followed that return could never execute.
    if any(item[1] == 'ORGANIZATION' for item in input_list):
        return "Yes"
    return "No"
def ner_result(input_str):
    """Tag a name with the shared NER tagger and return the organization decision.

    The string is split on single spaces and every piece is passed through
    ``str.capitalize()`` (first character upper-cased, the rest lower-cased)
    before being handed to the module-level tagger ``st``.

    Args:
        input_str: the name to classify.

    Returns:
        Whatever ``getDecision`` returns for the tagger output ("Yes" or "No").
    """
    tokens = []
    for word in input_str.split(' '):
        tokens.append(word.capitalize())
    return getDecision(st.tag(tokens))
def main(start_num, end_num, file_num):
    """Tag one row range of a name CSV and append the organization hits to a local file.

    Reads ``s3://gkgpreprocessing/df_<file_num>.csv``, selects the rows whose
    index labels fall in ``start_num:end_num`` (``DataFrame.loc`` slicing, so
    both ends are included), runs ``ner_result`` on every ``cleaned_name``
    in a process pool, and appends the ``original_name, cleaned_name`` pair
    of every row that came back "Yes" to
    ``df_<file_num>_<start_num>_<end_num>.csv`` in the working directory.

    Args:
        start_num: first index label of the range to process.
        end_num: last index label of the range to process (inclusive).
        file_num: number used to build the input and output file names.

    Returns:
        None. The output file is only created/extended when at least one row
        matched.
    """
    # Function-scope import keeps this block self-contained.
    import csv

    file_path = 's3://gkgpreprocessing/'
    file_name = 'df_{}.csv'.format(file_num)
    output_file = 'df_{}_{}_{}.csv'.format(file_num, start_num, end_num)

    new_df = pd.read_csv(file_path + file_name, index_col='Unnamed: 0')
    parsed_df = new_df.loc[start_num:end_num]
    original_name = list(parsed_df['original_name'])
    cleaned_name = list(parsed_df['cleaned_name'])

    # Shut the pool down even when a worker raises, so no worker processes
    # are left behind.
    p = Pool()
    try:
        result = p.map(ner_result, cleaned_name)
    finally:
        p.close()
        p.join()

    matches = [
        (original, cleaned)
        for original, cleaned, decision in zip(original_name, cleaned_name, result)
        if decision == 'Yes'
    ]
    if not matches:
        # Nothing to write: do not create an empty output file.
        return

    # csv.writer quotes fields containing commas, quotes or newlines, which a
    # hand-formatted '{},{}' row would silently corrupt. Rows are still
    # terminated with '\n', and the file is opened once instead of once per
    # matching row.
    with open(output_file, 'a', newline='') as f:
        csv.writer(f, lineterminator='\n').writerows(matches)
if __name__ == '__main__':
    # Usage: <script> <file_num> <ranges_file>
    #   file_num    -- integer N selecting the input file df_N.csv
    #   ranges_file -- text file with one "start,end" pair per line; main()
    #                  is called once per pair
    if len(sys.argv) < 3:
        sys.exit('usage: {} <file_num> <ranges_file>'.format(sys.argv[0]))

    file_num = int(sys.argv[1])
    start_end_file = sys.argv[2]

    # Keep every non-blank line. Dropping the last element of a split on
    # '\n' would lose the final range whenever the file does not end with a
    # newline, and a blank line cannot be parsed as a range.
    with open(start_end_file, 'r') as f:
        line_list = [line.strip() for line in f if line.strip()]

    for line in tqdm(line_list):
        # Only the first two comma-separated fields are used.
        start, end = line.split(',')[:2]
        main(int(start), int(end), file_num)