# main.py
# Forked from kartiksbo/scoreNormalization.
from torqueclient import Torque
import pandas as pd
import numpy as np
import datetime
import seaborn as sns
import matplotlib.pyplot as plt
# Shared Torque API client used by the functions in this module.
# The three arguments are placeholders and must be replaced before running --
# presumably the server URL, the username and the API key, going by the
# placeholder text (confirm against the torqueclient documentation).
# NOTE(review): avoid committing real credentials to source control.
torque = Torque("link", "username", "apikey")
# Key of the competition to read from ``torque.competitions`` (placeholder).
COMPETITION = "competition_name"

# Scoring traits; each one is substituted into the proposal field name
# "Panel <trait> Judge Data" when judge scores are looked up.
JUDGE_DATA_TYPES = [
    "COMMUNITY-INFORMED",
    "IMPACTFUL",
    "FEASIBLE",
    "SUSTAINABLE",
]
def get_proposal_judge_data():
    """Collect the raw judge scores for every proposal in COMPETITION.

    Returns:
        A dict keyed by proposal key. Each value has the shape::

            {
                "judgedata": {
                    trait: [{"judgename": ..., "rawscore": ...}, ...],
                },
                "rawscore": proposal["Panel Score"]["Raw"],
            }

        ``judgedata`` has one entry per name in JUDGE_DATA_TYPES; the list is
        empty when the proposal's "Panel <trait> Judge Data" field is falsy.
    """
    collected = {}
    for proposal in torque.competitions[COMPETITION].proposals:
        scores_by_trait = {}
        for trait in JUDGE_DATA_TYPES:
            panel_field = proposal["Panel %s Judge Data" % trait]
            # A falsy field means no judge data was recorded for this trait.
            comments = panel_field["Comments"] if panel_field else []
            scores_by_trait[trait] = [
                {
                    "judgename": comment["Anonymous Judge Name"],
                    "rawscore": comment["Score"]["Raw"],
                }
                for comment in comments
            ]
        collected[proposal.key] = {
            "judgedata": scores_by_trait,
            "rawscore": proposal["Panel Score"]["Raw"],
        }
    return collected
def normalize_judge_data(judge_data_by_proposal):
    """Add min-max and z-score normalized scores to every judge datum, in place.

    Scores are normalized within each (proposal, trait) group, i.e. across the
    judges who scored that trait of that proposal. Each datum dict gains two
    keys: "Min-Max Normalized" and "Z-Score Normalized". The z-score uses the
    population standard deviation (``np.std`` default, ddof=0).

    Args:
        judge_data_by_proposal: Mapping of proposal key to a dict whose
            "judgedata" entry maps each trait to a list of dicts carrying a
            "rawscore" value (the shape produced by get_proposal_judge_data).

    Returns:
        None. The datum dicts inside ``judge_data_by_proposal`` are mutated.

    Edge cases:
        * A trait with no judge data (empty list) is skipped instead of
          raising from ``np.min`` on an empty array.
        * When every score in a group is identical (including a single
          judge), both normalized values are set to 0.0 instead of dividing
          by zero and producing NaN.
    """
    for proposal_data in judge_data_by_proposal.values():
        for data in proposal_data["judgedata"].values():
            if not data:
                # Nothing to normalize for this trait.
                continue

            raw_scores = np.array([datum["rawscore"] for datum in data])
            lowest = np.min(raw_scores)
            score_range = np.max(raw_scores) - lowest

            if score_range == 0:
                # No spread: the range and the standard deviation are both
                # zero, so the ratios are undefined. Use a neutral 0.0.
                min_max_scores = [0.0] * len(data)
                z_scores = [0.0] * len(data)
            else:
                min_max_scores = ((raw_scores - lowest) / score_range).tolist()
                z_scores = (
                    (raw_scores - np.mean(raw_scores)) / np.std(raw_scores)
                ).tolist()

            for datum, min_max_score, z_score in zip(data, min_max_scores, z_scores):
                datum["Min-Max Normalized"] = min_max_score
                datum["Z-Score Normalized"] = z_score
def _build_score_table(proposal_judge_data):
    """Flatten nested judge data into one row per (proposal, trait, judge)."""
    columns = ["Proposal Key", "Anonymous Judge Name", "Trait", "Raw Trait Score"]
    rows = []
    for proposal_key, proposal_data in proposal_judge_data.items():
        for trait, judge_scores in proposal_data["judgedata"].items():
            for judge_score in judge_scores:
                rows.append(
                    {
                        "Proposal Key": proposal_key,
                        "Anonymous Judge Name": judge_score["judgename"],
                        "Trait": trait,
                        "Raw Trait Score": judge_score["rawscore"],
                    }
                )
    table = pd.DataFrame(rows, columns=columns)
    # Absent or non-numeric scores become NaN so they show up in the
    # missing-data report and get filled with the median afterwards.
    table["Raw Trait Score"] = pd.to_numeric(table["Raw Trait Score"], errors="coerce")
    return table


def _min_max_normalize(scores):
    """Scale one judge's scores to [0, 1]; a judge with no spread maps to 0.0."""
    lowest = scores.min()
    spread = scores.max() - lowest
    if not spread > 0:
        return pd.Series(0.0, index=scores.index)
    return (scores - lowest) / spread


def _z_score_normalize(scores):
    """Standardize one judge's scores; undefined cases (n < 2, no spread) map to 0.0."""
    if scores.count() < 2 or not (scores.max() - scores.min()) > 0:
        return pd.Series(0.0, index=scores.index)
    # pandas' std() is the sample standard deviation (ddof=1).
    return (scores - scores.mean()) / scores.std()


def main():
    """Fetch judge scores, normalize them per judge, rank proposals, write a CSV.

    Steps:
        1. Bulk-fetch the competition's proposals and collect the judge data.
        2. Flatten the data into a table with one row per
           (proposal, trait, judge) raw score.
        3. Report missing values, fill missing scores with the median and cap
           high outliers at the upper whisker (Q3 + 1.5 * IQR).
        4. Print skewness and kurtosis of each judge's scores.
        5. Compute min-max and z-score normalized scores within each judge.
        6. Pivot to per-proposal mean scores by trait, sum the normalized
           scores across traits and rank proposals by each sum.
        7. Print the pivoted table and save the per-score table to
           ``BaWoP_Normalized_<timestamp>.csv`` in the working directory.

    NOTE(review): get_proposal_judge_data() only exposes the proposal key,
    trait, anonymous judge name and raw score, so columns such as
    "Organization", "Judge Name" or "Overall Score Rank" are not available
    here and are not part of the table.
    """
    # Bulk fetch and locally cache all the proposals for this competition to
    # make things speedier.
    torque.bulk_fetch(torque.competitions[COMPETITION].proposals)

    proposal_judge_data = get_proposal_judge_data()
    # Adds per-proposal/per-trait normalized values to the nested dict; the
    # table below is normalized per judge instead.
    normalize_judge_data(proposal_judge_data)

    table = _build_score_table(proposal_judge_data)
    if table.empty:
        print("No judge scores found; nothing to normalize.")
        return

    # Check for missing data.
    missing_data = table.isnull().sum()
    print(f"Missing data in each column:\n{missing_data}\n")

    # Fill missing trait scores with the column median. Assign the result
    # back rather than using inplace=True on a column selection, which is
    # chained assignment and may not modify the table.
    raw_scores = table["Raw Trait Score"]
    table["Raw Trait Score"] = raw_scores.fillna(raw_scores.median())

    # Cap high outliers at the upper whisker of the score distribution.
    q1 = table["Raw Trait Score"].quantile(0.25)
    q3 = table["Raw Trait Score"].quantile(0.75)
    upper_whisker = q3 + 1.5 * (q3 - q1)
    table["Raw Trait Score"] = table["Raw Trait Score"].clip(upper=upper_whisker)

    # Optional diagnostic: plot the score distribution of each judge.
    # for judge in table["Anonymous Judge Name"].unique():
    #     sns.displot(table[table["Anonymous Judge Name"] == judge]["Raw Trait Score"], kde=True)
    #     plt.title(f"Distribution of scores for {judge}")
    #     plt.show()

    # Before normalizing, print skewness and kurtosis for each judge.
    # Skewness describes the symmetry of the distribution around the mean:
    # close to 0 is fairly symmetric, negative is skewed to the left and
    # positive is skewed to the right. Kurtosis measures "tailedness": a high
    # value means heavy tails (more outliers), a low value means light tails.
    # sort=False keeps the judges in order of first appearance.
    scores_by_judge = table.groupby("Anonymous Judge Name", sort=False)["Raw Trait Score"]
    for judge, scores in scores_by_judge:
        skewness = scores.skew()
        kurtosis = scores.kurt()
        print(f"Scores for {judge}: skewness = {skewness:.2f}, kurtosis = {kurtosis:.2f}")

    # Normalize within each judge so that harsh and lenient judges become
    # comparable. The helpers guard against zero range / zero deviation.
    table["Min-Max Normalized Score"] = scores_by_judge.transform(_min_max_normalize)
    table["Z-Score Normalized Score"] = scores_by_judge.transform(_z_score_normalize)

    # Keep the raw score ahead of the two normalized columns.
    table = table[
        [
            "Proposal Key",
            "Anonymous Judge Name",
            "Trait",
            "Raw Trait Score",
            "Min-Max Normalized Score",
            "Z-Score Normalized Score",
        ]
    ]

    # Pivot to the mean score per proposal and trait.
    pivoted_table = table.pivot_table(
        index=["Proposal Key"],
        columns="Trait",
        values=["Raw Trait Score", "Min-Max Normalized Score", "Z-Score Normalized Score"],
        aggfunc="mean",
    )

    # Overall score per normalization style: the sum across traits. Both sums
    # are taken before any column is added to the pivoted table.
    min_max_overall = pivoted_table["Min-Max Normalized Score"].sum(axis=1)
    z_score_overall = pivoted_table["Z-Score Normalized Score"].sum(axis=1)
    pivoted_table["Min-Max Overall Score"] = min_max_overall
    pivoted_table["Z-Score Overall Score"] = z_score_overall

    # Rank proposals by overall score; the highest score gets rank 1.
    pivoted_table["Min-Max Rank"] = min_max_overall.rank(ascending=False)
    pivoted_table["Z-Score Rank"] = z_score_overall.rank(ascending=False)

    # Reset the index and print the output table.
    output_table = pivoted_table.reset_index()
    print(output_table)

    # Append the current local timestamp to the output file name.
    current_timestamp = datetime.datetime.now().strftime("%Y%m%d%H%M%S")
    output_file = f"BaWoP_Normalized_{current_timestamp}.csv"

    # Save the per-score table to a new CSV file.
    # NOTE(review): the ranked ``output_table`` is only printed, never saved;
    # confirm whether it should be written instead of (or alongside) ``table``.
    table.to_csv(output_file, index=False)
# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()