-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathlearnSongs.py
126 lines (112 loc) · 4.93 KB
/
learnSongs.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
'''
Takes features: danceability, energy, and valence to predict
the mood of a song out of angry, sad, relaxed, and happy. It
uses data from Last.fm where songs are tagged by users in one of
these four moods. It finds the features of these songs using the
Spotify API and then uses this data to learn how to classify songs
into one of these four moods given the features of the song. It
returns a machine learning model that has been trained on this data.
Before returning this model it will try different classifiers and print
data about how they performed as well as create a boxplot comparing them.
Based on how well they performed, I selected a specific model and used that.
Some code taken from sklearn library.
'''
# Load libraries
import pandas
from pandas.plotting import scatter_matrix
import matplotlib.pyplot as plt
from sklearn import model_selection
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
'''
Splits the dataset into 80% training data and a 20% validation dataset.
Returns a tuple of numpy arrays X_train, X_validation, Y_train, Y_validation.
The split is seeded with the random_state value passed in as seed.
'''
def splitDataset(dataset, seed, scoring, validation_size=0.20):
    """Split the song dataset into training and validation portions.

    dataset: pandas DataFrame whose columns are, in order,
        id, danceability, energy, valence, mood.
    seed: random_state passed to train_test_split for reproducibility.
    scoring: unused here; kept so the signature matches the other
        model-selection helpers in this file.
    validation_size: fraction of rows held out for validation
        (default 0.20, i.e. an 80/20 split).

    Returns the tuple (X_train, X_validation, Y_train, Y_validation)
    of numpy arrays.
    """
    array = dataset.values
    X = array[:, 1:4]  # feature columns: danceability, energy, valence
    Y = array[:, 4]    # label column: mood
    X_train, X_validation, Y_train, Y_validation = model_selection.train_test_split(
        X, Y, test_size=validation_size, random_state=seed)
    return X_train, X_validation, Y_train, Y_validation
'''
Tries different classifiers and sees which one performs the best. Tries
Linear Discriminant Analysis, K Neighbors, Decision Tree, Logistic Regression,
Support Vector Machine, and Gaussian Naive Bayes. It uses a cross validation score
to rate the classifiers. It will print out the mean and standard deviation of the
result scores. It will also create a graph that shows the difference in the algorithm
performance. This function is intended to be used to pick the best classifier to use.
'''
def tryClassifiers(X_train, Y_train, seed, scoring):
    """Cross-validate several candidate classifiers and report their scores.

    Runs 10-fold cross validation on the training data for each of six
    classifiers, prints each model's mean score and standard deviation,
    and shows a boxplot comparing the score distributions.

    X_train, Y_train: training features and labels from splitDataset.
    seed: random_state used to shuffle the k-fold splits (reproducibility).
    scoring: sklearn scoring string, e.g. 'accuracy'.
    """
    models = [
        ('Linear Discriminant Analysis', LinearDiscriminantAnalysis()),
        ('K Neighbors', KNeighborsClassifier()),
        ('Decision Tree', DecisionTreeClassifier()),
        ('Logistic Regression', LogisticRegression(solver='liblinear',
                                                   multi_class='ovr')),
        ('Support Vector Machine', SVC(gamma='auto')),
        ('Gaussian Naive Bayes', GaussianNB()),
    ]
    # Try each model and print out the results of the scores of each one.
    results = []
    names = []
    for name, model in models:
        # shuffle=True is required for random_state to have any effect;
        # recent scikit-learn versions raise a ValueError when random_state
        # is set while shuffle is False.
        kfold = model_selection.KFold(n_splits=10, shuffle=True, random_state=seed)
        cv_results = model_selection.cross_val_score(model, X_train, Y_train,
                                                     cv=kfold, scoring=scoring)
        results.append(cv_results)
        names.append(name)
        print("%s: %f (%f)" % (name, cv_results.mean(), cv_results.std()))
    # Create graph that will show differences in algorithm performance.
    figure = plt.figure()
    figure.suptitle('Differences in Algorithm Performance')
    ax = figure.add_subplot(111)
    plt.boxplot(results)
    ax.set_xticklabels(names)
    plt.show()
'''
Fits model to the 80% training data, then generates predictions for the
20% of the data that was set aside as the validation set. Printing of the
accuracy score, confusion matrix, and classification report is currently
disabled. It returns the trained model.
'''
def checkModel(model, X_train, Y_train, X_validation, Y_validation):
    """Fit *model* on the training split and return the trained model.

    model: any estimator exposing fit(X, Y) and predict(X).
    X_train, Y_train: training features and labels.
    X_validation, Y_validation: held-out split from splitDataset.

    The model is fitted in place, then asked to predict the validation
    features as a smoke check; evaluation metrics (accuracy score,
    confusion matrix, classification report) were printed here during
    development and can be re-enabled if needed.
    """
    model.fit(X_train, Y_train)
    predictions = model.predict(X_validation)
    return model
'''
Returns model that has been trained on the training dataset. The classifier
being used is logistic regression, chosen based on the tryClassifiers results.
'''
def main():
    """Load the song-mood dataset, train the chosen classifier, and return it."""
    # Load dataset from the CSV of tagged songs and their audio features.
    column_names = ['id', 'danceability', 'energy', 'valence', 'mood']
    dataset = pandas.read_csv("songMoods.csv", names=column_names)
    seed = 100
    scoring = 'accuracy'
    # Split dataset into training and validation data.
    X_train, X_validation, Y_train, Y_validation = splitDataset(dataset, seed, scoring)
    # Try multiple classifiers and print data on how well they perform.
    #tryClassifiers(X_train, Y_train, seed, scoring)
    # Model chosen based on the tryClassifiers results.
    chosenModel = LogisticRegression(solver='liblinear',
                                     multi_class='ovr')
    # Fit the chosen model and return the trained result.
    return checkModel(chosenModel, X_train, Y_train,
                      X_validation, Y_validation)