-
Notifications
You must be signed in to change notification settings - Fork 7
/
credulous.js
294 lines (257 loc) · 8.37 KB
/
credulous.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
(function () {
"use strict";
/**
* Constructor for the Naive Bayes learner.
* It takes a list of strings, which represent the different
* groups of strings to train on, followed by an array of possible
* classes.
*
* @param options: object of options that set up the naive bayes model.
* It *must* have a `label` property that is an array of possible labels to assign
* to things we are classifying.
* It can optionally have a `dataLength` that specifies how many groups of strings
* we are classifying. This is useful if we are classifying things that have several
* separate attributes that we want to compute probabilities for separately. For
* example, when calssifying an email, we might have three separate things to classify
* on: the text of the email, the text of the subject, and the country of origin in
* the sender's address. In this case, we would set `dataLength` to 3, and pass these
* in as three separate arguments to the `train()` method.
*/
function Credulous(options) {
var i
;
if (!this) {
return new Credulous(options);
}
options = validateOptions(options);
this.labels = options.labels;
this.dataLength = options.dataLength;
// For the label
this.trainArgumentsLength = this.dataLength + 1;
// TODO: make the data store more modular
this.dataStore = [];
setUpDataStore(this.dataStore, this.labels, this.dataLength);
this.instancesTrained = 0;
}
/**
* Set up structure of the datastore.
*/
function setUpDataStore(dataStore, labels, dataLength) {
var i
;
for (i = 0; i < dataLength; i++) {
dataStore[i] = {
words: {}
, labels: {}
};
labels.forEach(function (label) {
dataStore[i].labels[label] = 0;
});
}
}
/**
* Train the model with the given parameters.
* @parameters - a list of strings that represents the attributes of the instance to classify.
* The last parameter in the list is the label of the instance (ie 'spam', 'not spam', etc.
*/
// TODO: handle laplacian correction
Credulous.prototype.train = function () {
// TODO: fix this awfulness
var args = argsToArray(arguments)
, length = args.length
, label = popOffLabel(args)
, self = this
;
if (length != this.trainArgumentsLength) {
throw new Error('Wrong number of training arguments! Did you forget the class?')
}
if (!labelIsInPossibleLabels(label, this.labels)) {
throw new Error('Trained label not in possible labels!');
}
args.forEach(function (element, i, collection) {
var elements = processString(element);
// have a section for each word.
// have a label as well
// for this word, when the label was this, the count was this
elements.forEach(function (word) {
if (!self.dataStore[i].words[word]) {
initializeWordInDatastore(self.dataStore[i], word, self.labels);
}
self.dataStore[i].words[word][label]++
self.dataStore[i].labels[label]++;
});
});
this.instancesTrained++;
};
Credulous.prototype.classify = function () {
var args = argsToArray(arguments)
, processedDataItems
, labelScores = []
, self = this
;
if (args.length != this.dataLength) {
throw new Error('Wrong number of classifying arguments!');
}
processedDataItems = processDataItems(args);
// For each possible label:
// For each data item:
// For each word in the data item:
// Look up the probability of a class give this word appears
this.labels.forEach(function (label, i) {
labelScores[i] = getProbabilityForLabel(label, self.dataStore, processedDataItems);
});
return this.labels[argMax(labelScores)];
};
/**
* Render the model as JSON, so it can be written out and
* read back in by fromJSON.
* @return - JSON representing the parameters and data of the model.
*/
Credulous.prototype.toJSON = function () {
var json = {}
;
json.instancesTrained = this.instancesTrained;
json.labels = this.labels;
json.dataLength = this.dataLength;
json.trainArgumentsLength = this.trainArgumentsLength;
json.dataStore = this.dataStore;
return json;
}
/**
* Takes a JSON string and sets the properties of this
* model to those of the saved model.
*/
Credulous.prototype.fromJSON = function (json) {
this.instancesTrained = json.instancesTrained;
this.labels = json.labels;
this.dataLength = json.dataLength;
this.trainArgumentsLength = json.trainArgumentsLength;
this.dataStore = json.dataStore;
}
function validateOptions(opts) {
if (!opts || !opts.labels) {
throw new Error('Constructor needs options object with "labels" and "dataLength" attributes.');
}
if (!opts.dataLength) {
opts.dataLength = 1;
}
return opts;
}
function initializeWordInDatastore(datastore, word, labels) {
datastore.words[word] = {};
labels.forEach(function (label) {
datastore.words[word][label] = 0;
});
}
function labelIsInPossibleLabels(label, labels) {
return labels.indexOf(label) !== -1;
}
/**
* Return the index of the maximum value in the array.
* @param array - an array of numerical values
* @return the index of the largest value in the array.
*/
function argMax(array) {
var maxIndex = 0
, i
;
for (i = 0; i < array.length; i++) {
if (array[i] > array[maxIndex]) {
maxIndex = i;
}
}
return maxIndex;
}
function getProbabilityForLabel(label, dataStore, processedDataItems) {
var processedProbabilities = []
, n
;
dataStore.forEach(function (dataItem, dataItemIndex) {
processedDataItems[dataItemIndex].forEach(function (word, j) {
processedProbabilities.push(getProbabilityOfWordGivenLabel(word, label, dataItem));
});
});
return combineProbabilitiesIntoMAP(processedProbabilities);
}
function combineProbabilitiesIntoMAP(probabilities) {
// calculate probability in log space to avoid underflow
// see http://en.wikipedia.org/wiki/Bayesian_spam_filtering#Other_expression_of_the_formula_for_combining_individual_probabilities
var n = probabilities.reduce(function (runningSum, probability) {
return runningSum + (Math.log(1 - probability) - Math.log(probability))
}, 0.0);
return (1 / (1 + Math.exp(n)));
}
function getProbabilityOfWordGivenLabel(word, label, dataStore) {
// the probability of this word given the given class is the count of the word
// for that class / the count of all words of the given class
// TODO: where to do laplacian correction?
var count
, totalOfGivenClass = dataStore.labels[label]
;
// Handle classifying with words that we have never seen.
if (dataStore.words[word] === undefined) {
// Poor mans laplace correction
count = 1;
}
else {
count = dataStore.words[word][label];
}
if (count === 0) {
count = 1;
//increment the count of all other labels as well
var incLabel;
for (incLabel in dataStore.words[word]) {
if (dataStore.words[word].hasOwnProperty(incLabel)) {
dataStore.words[word][incLabel]++;
}
}
}
return count / totalOfGivenClass;
}
/**
* This is terribly named. Process the array of strings into an
* array of arrays of words, with them stemmed, etc.
*/
function processDataItems(items) {
var processedItems = []
;
// TODO: do stemming here
items.forEach(function (item, i) {
processedItems[i] = processString(item);
});
return processedItems;
}
/**
* Convert the arguments object into an
* actual array.
*/
function argsToArray(args) {
return Array.prototype.slice.call(args);
}
/**
* Split the string and return the array of words.
* This is a turrible name.
*/
function processString(str) {
// TODO: add stemming
var words = str.split(/\s+/)
;
return words;
}
/**
* Add to the count of classes seen. Instantiate a new object to hold the
* classes and counts if it has not been seen yet.
*/
function addLabel(label, self) {
if (!(label in self.labels)) {
self.labels[label] = 1;
}
else {
self.labels[label]++;
}
}
function popOffLabel(args) {
return args.pop();
}
module.exports = Credulous;
}());