-
Notifications
You must be signed in to change notification settings - Fork 1
/
cleaneventleads.py
88 lines (73 loc) · 2.85 KB
/
cleaneventleads.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
#Capitalize Words.
#Delete row by conditions
import pandas as pd
import numpy as np
from nameparser import HumanName
import string
from titlecase import titlecase
xls = pd.ExcelFile('xxxxxxx.xls')
df = xls.parse('SurveysAnswersCrosstab', skiprows=4, index_col=None) #skip first 4 rows
df = df.drop(df.index[len(df)-1]) #drop the last row
df = df.drop(df.index[0]) #drop the first row
df = df.drop_duplicates("ID", keep='first', inplace=False) #drop duplicate records by "ID"
df = df.reset_index(drop=True)
df = df.fillna(' ') #fill all the NaN with ' '
for i in range(0,len(df['Title'])): #Clean data in column 'Title'
df['Title'][i] = df['Title'][i].replace(".", " ")
df['Title'][i] = df['Title'][i].replace(" ", " ")
list1 = []
list2 = []
#Save the rows with "Notes" in list1, else in list2
for i in range(0,int(len(df['Notes']))):
if str(df['Notes'][i]) != str(df['Notes'][0]):
list1.append(df.iloc[i])
else:
list2.append(df.iloc[i])
n1 = pd.DataFrame(list1)
n2 = pd.DataFrame(list2)
n1 = n1.reset_index(drop=True)
n2 = n2.reset_index(drop=True)
#Capitalize all the initial letter of each word in selected columns.
col = ['First Name', 'Title', 'Company', 'Address', 'State', 'City', 'Country']
def capwordallnew(dataset):
for item in col:
for i in range(0, len(dataset)):
dataset[item][i] = titlecase(dataset[item][i])
return dataset
#Capitalize last name with Mc, Mac...
def caplastname(dataset):
for i in range(0,len(dataset['Last Name'])):
n = str(dataset['Last Name'][i])
name = HumanName(n)
name.capitalize(force=True)
ln = name.first
dataset['Last Name'][i] = ln
return dataset
#Clean datasets by functions.
n1 = capwordallnew(n1)
n1 = caplastname(n1)
n2 = capwordallnew(n2)
n2 = caplastname(n2)
#Delete the suffix in the string of column 'Company'
comsuffix = [' Inc', ', Inc', ' Llp', ', Llp', ' As',', As',' Llc', ', Llc',' Ltd', ', Ltd', ',']
for item in comsuffix:
n1['Company'] = n1['Company'].str.split(item).str[0]
for item in comsuffix:
n2['Company'] = n2['Company'].str.split(item).str[0]
#Delete the rows depends on whether the selected column contains a specific parameter.
def dropbycol(col,para):
new = n2[~n2[col].str.contains(para,na=False)]
return new
#Save the rows depends on whether the selected column contains a specific parameter.
def savebycol(col,para):
new = n2[n2[col].str.contains(para,na=False)]
return new
#for example, delete all the rows depends on whether the 'e-Mail' column contains 'edu'
n2 = dropbycol('e-Mail','edu')
#for example, save all the rows depends on whether the 'Title' column contains the element in col2.
col2 = ['a','b','c','d']
for item in col2:
n2 = savebycol('Title',item)
#Save to excel
n1.to_excel('n1.xlsx',header=True, index=False)
n2.to_excel('n2.xlsx',header=True, index=False)