aggregate_cleveland_survey.py
(forked from mkosmala/SnapshotSerengetiScripts)
# Break out and aggregate survey-task classifications from the Zooniverse
# project "Focus on Wildlife: Cleveland Metroparks".
import sys
import pandas as pd
import numpy as np
import ujson
from get_workflow_info import get_workflow_info
from aggregate_question_utils import breakout_anno_survey, aggregate_survey, write_class_row
# default input files and workflow (override on the command line below)
classfile = 'focus-on-wildlife-cleveland-metroparks-classifications.csv'
workflow_file = 'focus-on-wildlife-cleveland-metroparks-workflows.csv'
workflow_cfile = 'focus-on-wildlife-cleveland-metroparks-workflow_contents.csv'
workflow_id = 1432
workflow_version = "478.99"
try:
    classfile = sys.argv[1]
except IndexError:
    print("Usage:\n%s classfile\n example classifications export file: %s\n" % (sys.argv[0], classfile))
    print("Optional inputs:")
    print("  workflows=projectname-workflows.csv (export from project builder)")
    print("  workflow_contents=projectname-workflow_contents.csv (export from project builder)")
    print("  workflow_id=N")
    print("  workflow_version=N (looks like a number with format: major.minor)")
    print("  outfile_class=filename.csv\n    file to save exploded classifications with 1 annotation per row")
    print("  outfile_agg=filename.csv\n    file to save aggregated classifications")
    print(" If you don't specify an outfile_class or outfile_agg, the filenames\n will be based on the input classfile name.")
    print(" If you vary the project from the suggested one above, you'll need to specify workflow files.\n")
    exit(0)
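# Example invocation (illustrative; substitute your own export filenames):
#   python aggregate_cleveland_survey.py my-project-classifications.csv \
#       workflows=my-project-workflows.csv workflow_contents=my-project-workflow_contents.csv \
#       workflow_id=1432 workflow_version=478.99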
annofile = classfile.replace('.csv', '_annotations_1lineeach.csv')
outfile = classfile.replace('.csv', '_aggregated.csv')
outfile_huge = classfile.replace('.csv', '_aggregated_kitchensink.csv')
# check for other command-line arguments
if len(sys.argv) > 2:
    # if there are additional arguments, loop through them
    for argstr in sys.argv[2:]:
        arg = argstr.split('=')
        if arg[0] == "workflow_id":
            workflow_id = int(arg[1])
        elif arg[0] == "workflow_version":
            workflow_version = float(arg[1])
        elif arg[0] == "outfile_class":
            annofile = arg[1]
        elif arg[0] in ("outfile_agg", "outfile"):
            outfile = arg[1]
        elif arg[0] == "workflows":
            workflow_file = arg[1]
        elif arg[0] == "workflow_contents":
            workflow_cfile = arg[1]
        elif arg[0] in ("class_in", "classfile", "in_class"):
            # override the classification export and refresh the derived output names
            classfile = arg[1]
            annofile = classfile.replace('.csv', '_annotations_1lineeach.csv')
            outfile = classfile.replace('.csv', '_aggregated.csv')
            outfile_huge = classfile.replace('.csv', '_aggregated_kitchensink.csv')
        elif arg[0] == "classfile_breakout":
            # point at an already-broken-out file (parsed here but not used below)
            breakout_class = False
            classfile_breakout = arg[1]
# make sure we don't overwrite the input even if it doesn't end in .csv
if annofile == classfile:
    annofile = classfile + '_annotations_1lineeach.csv'
if outfile == classfile:
    outfile = classfile + '_aggregated.csv'
if outfile_huge == classfile:
    outfile_huge = classfile + '_aggregated_kitchensink.csv'
workflow_df = pd.read_csv(workflow_file)
workflow_cdf = pd.read_csv(workflow_cfile)
workflow_info = get_workflow_info(workflow_df, workflow_cdf, workflow_id, workflow_version)
classifications = pd.read_csv(classfile, low_memory=False)
# the annotations column is a JSON string in each row; parse it once up front
classifications['anno_json'] = [ujson.loads(q) for q in classifications.annotations]
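# For reference, a parsed survey annotation typically looks something like this
# (illustrative values only; the actual task labels and choices come from your workflow):
#   [{"task": "T0",
#     "value": [{"choice": "DEERWHITETAILED",
#                "answers": {"HOWMANY": "1"},
#                "filters": {}}]}]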
classifications.fillna(0.0, inplace=True)
# now that we have the workflow information, we need to get the mark columns we will print
# we need both the survey ID columns and the "nothing here" etc. columns
thecols = []
for task in workflow_info['tasknames']:
    if workflow_info[task]['type'] == 'survey':
        # first, a column for what species was selected
        thecols.append(task.lower() + '_choice')
        # then columns for each request for additional sub-classification
        for question in workflow_info[task]['questionsOrder']:
            thecols.append("%s_%s" % (task.lower(), workflow_info[task]['questions'][question]['label_slug']))
    elif workflow_info[task]['type'] == 'shortcut':
        # each possible shortcut "answer" is a tickmark, i.e. True or False
        # so 1 col each
        for answer in workflow_info[task]['answers']:
            thecols.append(answer['label_slug'])
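# To illustrate, thecols might end up looking something like this
# (hypothetical slugs; real names come from the workflow contents export):
#   ['t0_choice', 't0_how-many', 't0_are-there-any-young-present', 'nothing-here']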
classcols = "classification_id subject_ids created_at user_name user_id user_ip".split()
printcols = classcols + thecols
# the CSV header row is just each column labelling itself
theheader = {col: col for col in printcols}
# open the file in a context manager so it's closed cleanly when we're done
with open(annofile, "w") as fp:
    # write the CSV header
    write_class_row(fp, theheader, printcols)
    # breakout and write each mark to the file
    n_marks = classifications.apply(lambda row: breakout_anno_survey(row, workflow_info, fp, classcols, thecols), axis=1)
print("%d annotations jailbroken from %d classifications, written to %s as individual marks..." % (sum(n_marks), len(classifications), annofile))
# save the number of marks per classification, in case it ends up being useful
classifications['n_marks'] = n_marks
# now re-read the csv file with the annotations
annotations = pd.read_csv(annofile)
# give every mark a count of 1 so grouped sums produce totals
annotations['count'] = np.ones(len(annotations), dtype=int)
# we need to group by subject in order to aggregate
by_subj = annotations.groupby('subject_ids')
class_agg = by_subj.apply(aggregate_survey, workflow_info=workflow_info)
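# aggregate_survey (from aggregate_question_utils) does the real work here.
# Roughly, and only as a sketch (the column name 't0_choice' is hypothetical),
# it reduces each subject's marks to per-species vote fractions:
#   def aggregate_survey_sketch(grp, workflow_info=None):
#       n_class = float(grp['classification_id'].nunique())
#       return grp.groupby('t0_choice')['count'].sum() / n_class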
# check for empty columns
all_cols = class_agg.columns.values
use_cols = (class_agg.columns.values).tolist()
for thecol in all_cols:
    if sum(class_agg[thecol]) == 0.0:
        use_cols.remove(thecol)
# write both the kitchen sink version and the version with no totally empty columns
class_agg.to_csv(outfile_huge)
class_agg[use_cols].to_csv(outfile)
print("\nAggregated classifications written to %s \n (kitchen sink version: %s )\n" % (outfile, outfile_huge))
# to do:
# print a summary file, one line per subject, with all species idents for that subject
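# a rough sketch of that summary (column names here are hypothetical; they depend
# on how aggregate_survey labels the per-species fraction columns):
#   species_cols = [c for c in use_cols if c.startswith('t0_choice_')]
#   summary = class_agg[species_cols].apply(
#       lambda row: ';'.join(c for c in species_cols if row[c] > 0), axis=1)
#   summary.to_csv(classfile.replace('.csv', '_subject_summary.csv'))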
#end