#!/usr/bin/env python3
# Converts the open COVID-19 dataset's metadata CSV into a Rayyan-compatible CSV.
# Changes based on 202003207 release:
#   - Change has_full_text --> has_pdf_parse
#   - Add has_pmc_xml_parse and cord_uid to notes
USAGE = '''\
Converts open COVID-19 dataset to rayyan compatible form
usage: python transform_to_rayyan.py <input_file.csv> <output_file.csv>\
'''
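# Example invocation (file names here are hypothetical):
#   python transform_to_rayyan.py metadata.csv rayyan_import.csv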
import csv
import sys
import ast
import numpy
import multiprocessing as mp
from dateparser import parse as normalparse
from daterangeparser import parse as rangeparse
from tqdm import tqdm
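# Third-party dependencies; a likely install command (versions unpinned):
#   pip install numpy dateparser daterangeparser tqdm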
if len(sys.argv) != 3:
    print(USAGE)
    print("Incorrect number of arguments!")
    exit(1)
#DX_DOI_PREFIX = 'http://dx.doi.org/'
def transform_row_to_rayyan(irow):
    orow = {}
    orow['title'] = irow['title']
    orow['abstract'] = irow['abstract']
    #orow['url'] = DX_DOI_PREFIX + irow['doi']
    orow['url'] = irow['url']
    orow['pmc_id'] = irow['pmcid']
    orow['pubmed_id'] = irow['pubmed_id']

    publish_time = irow['publish_time'].strip()
    try:
        # First, try parsing as a daterange.
        # This should catch most date formats except
        # those in 'DD-MM-YY' and some other forms.
        start, end = rangeparse(publish_time)
    except:
        # If parsing as a daterange fails, select
        # the first word. It's usually 'YYYY' or 'DD-MM-YY',
        # which is good enough.
        publish_time = publish_time.split(' ')[0]
    if publish_time:
        try:
            # Try another parse as a daterange.
            start, end = rangeparse(publish_time)
        except:
            # If that fails, then it is 'DD-MM-YY',
            # which can be picked up by normalparse.
            start = normalparse(publish_time)
        orow['year'] = start.year
        orow['month'] = start.month
        orow['day'] = start.day
    else:
        orow['year'] = ''
        orow['month'] = ''
        orow['day'] = ''

    # Initial dataset had authors in list form.
    # Try parsing authors to see if it's a list.
    try:
        authors = ast.literal_eval(irow['authors'])
        if type(authors) == list:
            orow['authors'] = '; '.join(authors)
        else:
            raise RuntimeError
    except:
        # It's not a list; use the string as is.
        orow['authors'] = irow['authors']

    orow['journal'] = irow['journal']

    notes = []
    for col in ['cord_uid', 'sha', 'doi', 'source_x', 'license', 'mag_id',
                'who_covidence_id', 'arxiv_id', 's2_id', 'pdf_json_files',
                'pmc_json_files']:
        notes.append(col + ': ' + irow[col])
    orow['notes'] = '; '.join(notes)
    return orow
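# A minimal sanity check of the transformation above (hypothetical row;
# only the fields transform_row_to_rayyan reads are filled in):
#
#   row = dict.fromkeys(['title', 'abstract', 'url', 'pmcid', 'pubmed_id',
#                        'journal', 'cord_uid', 'sha', 'doi', 'source_x',
#                        'license', 'mag_id', 'who_covidence_id', 'arxiv_id',
#                        's2_id', 'pdf_json_files', 'pmc_json_files'], '')
#   row['publish_time'] = '2020-03-27'
#   row['authors'] = "['Doe, J.', 'Roe, R.']"
#   out = transform_row_to_rayyan(row)
#   # out['year'] == 2020 and out['authors'] == 'Doe, J.; Roe, R.'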
def batch_transform_to_rayyan(process_number, rows):
    pbar = tqdm(desc="Process # %s" % process_number, total=len(rows),
                position=process_number)
    output = []
    for r in rows:
        output.append(transform_row_to_rayyan(r))
        pbar.update()
    pbar.close()
    return output
NUM_CORES = 8
OUTPUT_FIELDS = ['title', 'abstract', 'url', 'pmc_id', 'pubmed_id', 'year', 'month', 'day', 'authors', 'journal', 'notes']
if __name__ == "__main__":
    input_csv = csv.DictReader(open(sys.argv[1], 'r', encoding='utf-8', errors='ignore'),
                               delimiter=',')
    output_csv = csv.DictWriter(open(sys.argv[2], 'w+', newline=''),
                                delimiter=',', fieldnames=OUTPUT_FIELDS)
    output_csv.writeheader()

    # Gather all rows into memory.
    all_input_rows = [input_row for input_row in input_csv]

    # Split rows into NUM_CORES chunks for parallel processing.
    input_chunks = numpy.array_split(numpy.array(all_input_rows), NUM_CORES)
    input_chunks_with_index = [(index, chunk) for index, chunk in enumerate(input_chunks)]

    # Create a pool of workers; share tqdm's lock so the per-process
    # progress bars don't clobber each other's terminal output.
    pool = mp.Pool(initializer=tqdm.set_lock, initargs=(tqdm.get_lock(),),
                   processes=NUM_CORES)

    # Apply the transformation in parallel.
    print("Starting transformation with %s workers..." % NUM_CORES)
    output_chunks = pool.starmap(batch_transform_to_rayyan, input_chunks_with_index)

    # Wrap up workers.
    pool.close()
    pool.join()

    # Write output to file.
    for chunk in output_chunks:
        for row in chunk:
            output_csv.writerow(row)
    print("Complete.")