-
Notifications
You must be signed in to change notification settings - Fork 36
/
Copy pathdecision_tree.py
62 lines (46 loc) · 2.21 KB
/
decision_tree.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
from sklearn import tree
from sklearn.metrics import accuracy_score
import numpy as np
def load_data(path='dataset.csv', num_training=2000):
    """
    Load the phishing dataset from a CSV file and split it into train/test sets.

    Each row of the CSV file holds the features collected on one website,
    followed in the last column by the label saying whether that website
    was used for phishing or not.

    Args:
        path: Location of the comma-separated dataset file. Defaults to
            'dataset.csv', resolved against the current working directory.
        num_training: Number of leading rows used as the training set; all
            remaining rows form the testing set. Defaults to 2000.

    Returns:
        A tuple of four numpy arrays, in this order:
        (training_inputs, training_outputs, testing_inputs, testing_outputs).
    """
    # Parse the whole file as 32-bit integers.
    data = np.genfromtxt(path, delimiter=',', dtype=np.int32)

    # Inputs are all columns but the last one; outputs are the last column.
    inputs = data[:, :-1]
    outputs = data[:, -1]

    # The first `num_training` rows are for training, the rest for testing.
    # If the file has fewer rows than that, the testing arrays are empty.
    training_inputs = inputs[:num_training]
    training_outputs = outputs[:num_training]
    testing_inputs = inputs[num_training:]
    testing_outputs = outputs[num_training:]

    return training_inputs, training_outputs, testing_inputs, testing_outputs
if __name__ == '__main__':
    # Script entry point: train a decision tree on the phishing dataset and
    # report its accuracy on the held-out test rows.
    # print(...) is called with a single parenthesised argument so that the
    # script runs, with identical output, on both Python 2 and Python 3.
    print("Tutorial: Training a decision tree to detect phishing websites")

    # Load the training and testing data
    train_inputs, train_outputs, test_inputs, test_outputs = load_data()
    print("Training data loaded.")

    # Create a decision tree classifier model using scikit-learn
    classifier = tree.DecisionTreeClassifier()
    print("Decision tree classifier created.")

    print("Beginning model training.")
    # Train the decision tree classifier
    classifier.fit(train_inputs, train_outputs)
    print("Model training completed.")

    # Use the trained classifier to make predictions on the test data
    predictions = classifier.predict(test_inputs)
    print("Predictions on testing data computed.")

    # Print the accuracy: the percentage of test websites whose predicted
    # label matches the true label (accuracy_score returns a 0-1 fraction).
    accuracy = 100.0 * accuracy_score(test_outputs, predictions)
    print("The accuracy of your decision tree on testing data is: " + str(accuracy))