-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathpareto.py
42 lines (37 loc) · 1.53 KB
/
pareto.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Date : 2018-11-01
# @Author : imcake ([email protected])
# @Link : https://github.com/imcake
##############################################
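# Compute cumulative-frequency breakpoints (Pareto chart).
# Input: a CSV file containing the values to accumulate, and the name of that value column.
# Output: for each cumulative level from 10% to 90%, the raw value closest to that level.
# The result is a dict: 10:xxx, 20:xxx, ..., 90:xxx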
#############################################
import pandas as pd
def get_pareto_penct(csv_name, value_column):
    """Return the raw values closest to the 10%..90% cumulative-share marks.

    The column is sorted in descending order, each value is converted to its
    percentage of the column total, and those percentages are accumulated.
    For every mark in 10, 20, ..., 90 the raw value whose cumulative
    percentage lies closest to the mark is reported.

    Args:
        csv_name: Path (or anything ``pandas.read_csv`` accepts) of a CSV
            file whose first row is the header.
        value_column: Name of the column holding the values to accumulate.

    Returns:
        dict mapping each mark (10, 20, ..., 90) to the selected raw value.

    Raises:
        ValueError: If the column contains no rows.
        ZeroDivisionError: If the column values sum to zero, so no
            percentage can be computed.
    """
    data_df = pd.read_csv(csv_name, header=0)
    values = sorted(data_df[value_column].tolist(), reverse=True)
    if not values:
        raise ValueError(
            "column %r of %r contains no values" % (value_column, csv_name))

    total = sum(values)
    if total == 0:
        raise ZeroDivisionError(
            "values in column %r sum to zero" % (value_column,))

    # Running cumulative percentage, kept parallel to ``values``.
    cumulative = []
    running = 0
    for raw in values:
        running = running + float(raw) / total * 100
        cumulative.append(running)

    result = {}
    for mark in (10, 20, 30, 40, 50, 60, 70, 80, 90):
        # Select by index rather than through a {percentage: value} dict:
        # equal cumulative percentages (e.g. trailing zero values) would
        # collapse into one dict key and report the wrong raw value.
        # min() returns the first of several equally close entries, i.e. the
        # value that actually brought the cumulative share to that level.
        nearest = min(range(len(cumulative)),
                      key=lambda idx: abs(cumulative[idx] - mark))
        result[mark] = values[nearest]
    return result
if __name__ == '__main__':
    # Example run: report the Pareto breakpoints of the TOTAL column.
    csv_name = 'pareto.csv'  # CSV file name
    value_column = 'TOTAL'  # name of the column to accumulate
    # The print statement is a SyntaxError on Python 3; the call form with a
    # single argument prints the same text on both Python 2 and Python 3.
    print(get_pareto_penct(csv_name, value_column))