-
Notifications
You must be signed in to change notification settings - Fork 157
/
ag_news.py
111 lines (81 loc) · 3.18 KB
/
ag_news.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
import numpy as np
from sklearn.metrics import classification_report
from torchtext.datasets import AG_NEWS
from npc_gzip.compressors.base import BaseCompressor
from npc_gzip.compressors.gzip_compressor import GZipCompressor
from npc_gzip.knn_classifier import KnnClassifier
def get_data() -> tuple:
    """
    Download the AG_NEWS dataset and return it as
    ((train_text, train_labels), (test_text, test_labels)),
    where each of the four elements is a numpy array.
    """
    train_iter, test_iter = AG_NEWS(split=("train", "test"))

    # Each iterator yields (label, text) pairs; materialize once
    # so we can split the pairs into parallel arrays.
    train_pairs = list(train_iter)
    test_pairs = list(test_iter)

    train_text = np.array([text for _, text in train_pairs])
    train_labels = np.array([label for label, _ in train_pairs])
    test_text = np.array([text for _, text in test_pairs])
    test_labels = np.array([label for label, _ in test_pairs])

    return ((train_text, train_labels), (test_text, test_labels))
def fit_model(
    train_text: np.ndarray, train_labels: np.ndarray, distance_metric: str = "ncd"
) -> KnnClassifier:
    """
    Build a Knn-GZip compressor model over the training data.

    Arguments:
        train_text (np.ndarray): Training dataset as a numpy array.
        train_labels (np.ndarray): Training labels as a numpy array.
        distance_metric (str): Distance metric passed through to the
            classifier (defaults to normalized compression distance).

    Returns:
        KnnClassifier: Trained Knn-Compressor model ready to make predictions.
    """
    # The classifier stores the training set and compares against it
    # lazily at predict time, so "fitting" is just construction.
    gzip_compressor: BaseCompressor = GZipCompressor()
    return KnnClassifier(
        compressor=gzip_compressor,
        training_inputs=train_text,
        training_labels=train_labels,
        distance_metric=distance_metric,
    )
def main() -> None:
    """
    Train a Knn-GZip classifier on AG_NEWS and print a
    classification report over a random sample of the test split.
    """
    print("Fetching data...")
    ((train_text, train_labels), (test_text, test_labels)) = get_data()

    print("Fitting model...")
    model = fit_model(train_text, train_labels)

    # Fix: cap the sample at the test-split size so
    # np.random.choice(..., replace=False) cannot raise ValueError
    # when fewer than 1000 test rows are available.
    sample_size = min(1000, test_text.shape[0])
    random_indices = np.random.choice(test_text.shape[0], sample_size, replace=False)
    sample_test_text = test_text[random_indices]
    sample_test_labels = test_labels[random_indices]

    print("Generating predictions...")
    top_k = 1
    # Here we use the `sampling_percentage` to save time
    # at the expense of worse predictions. This
    # `sampling_percentage` selects a random % of training
    # data to compare `sample_test_text` against rather
    # than comparing it against the entire training dataset.
    (distances, labels, similar_samples) = model.predict(
        sample_test_text, top_k, sampling_percentage=0.01
    )
    print(classification_report(sample_test_labels, labels.reshape(-1)))
# Run the end-to-end example only when executed as a script.
if __name__ == "__main__":
    main()
# precision recall f1-score support
# 1 0.67 0.80 0.73 246
# 2 0.78 0.74 0.76 246
# 3 0.64 0.61 0.62 249
# 4 0.65 0.59 0.62 259
# accuracy 0.68 1000
# macro avg 0.68 0.68 0.68 1000
# weighted avg 0.68 0.68 0.68 1000