-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathnaivebayes.pl
executable file
·319 lines (271 loc) · 9.12 KB
/
naivebayes.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
use strict;
use DB_File;
# Dr. Dobb's Journal
# May 01, 2005
# Naive Bayesian Text Classification
# Fast, accurate, and easy to implement
# John Graham-Cumming
# http://www.ddj.com/development-tools/184406064
#
# Extended by Neal Richter
# - parse CSV text and treat phrases as intact symbols
# - export the model to CSV file
# - print stats on the model
# - prune the model
#
# Neal's notes
# To Train:
# find label1/training/ -exec perl naivebayes.pl add label1 '{}' \;
# find label2/training/ -exec perl naivebayes.pl add label2 '{}' \;
#
# To Test:
# perl naivebayes.pl classify label1/testing/somedatafile
# perl naivebayes.pl classify label2/testing/somedatafile
# The label with the smallest number wins (ie the first one in the list)
#
# Paying attention to the absolute difference between the scores is important as well. See the Naive Bayes literature for details.
#
# To Prune - removes words in the model with less than X frequency:
# perl naivebayes.pl prune 10
#
# To show stats:
# perl naivebayes.pl stats
#
# To export the model to a CSV file
# perl naivebayes.pl export <file>
#
# Hash with two levels of keys: $words{category}{word} gives count of
# 'word' in 'category'. Tied to a DB_File to keep it persistent.
#
my %words;
tie %words, 'DB_File', 'words.db';
#Utils
sub trim {
my $string = shift;
for ($string) {
s/^\s+//;
s/\s+$//;
s/=/_/g;
s/-/_/g;
}
return $string;
}
# Read a file and return a hash of the word counts in that file
sub parse_file
{
my ( $file ) = @_;
my %word_counts;
open FILE, "<$file";
while ( my $line = <FILE> ) {
# print "parsing $line\n";
# split on CSV
my @values = split(/[,;:]/, $line); #replace with this for whitespace /[ \t\n\r,;:]/
#Prior pattern while ( $line =~ s/([[:alpha:#]]{3,44})[ \t\n\r,;:]// ) {
# Grab all the words with between 3 and 44 letters
foreach my $val (@values) {
my $wrd = lc(trim($val));
if ((length( $wrd) > 2) && (length( $wrd) < 45) && ($wrd =~ m/[A-Za-z0-9_#]/)) {
$word_counts{lc($wrd)}++;
}
# else { print "Rejected: $wrd \n"; }
}
}
close FILE;
return %word_counts;
}
# Add words from a hash to the word counts for a category
sub add_words
{
my ( $category, %words_in_file ) = @_;
foreach my $word (keys %words_in_file) {
$words{"$category=$word"} += $words_in_file{$word};
}
}
# Get the classification of a file from word counts
sub classify
{
my ( %words_in_file ) = @_;
# Calculate the total number of words in each category and
# the total number of words overall
my %count;
my $total = 0;
foreach my $entry (keys %words) {
$entry =~ /^(.+)=(.+)$/; #category=word
$count{$1} += $words{$entry};
$total += $words{$entry};
}
# Run through words and calculate the probability for each category
my %score;
foreach my $word (sort (keys(%words_in_file))) {
foreach my $category (keys %count) {
if ( defined( $words{"$category=$word"} ) ) {
#print "[$word] $category: ".$score{$category}." += log(". $words{"$category=$word"}." / ".$count{$category}." )\n";
$score{$category} += log( $words{"$category=$word"} /
$count{$category} );
} else {
#print "[$word] $category: ".$score{$category}." += log( 0.01 / ".$count{$category}." )\n";
$score{$category} += log( 0.01 /
$count{$category} );
}
}
}
# Add in the probability that the text is of a specific category
foreach my $category (keys %count) {
#print "(pr) $category: ".$score{$category}." += log(". $count{$category}." / ".$total." )\n";
$score{$category} += log( $count{$category} / $total );
}
#print the log likelyhood of the categories in sorted order
my @score_array;
my @class_array;
foreach my $category (sort { $score{$b} <=> $score{$a} } keys %count) {
print "$category $score{$category}\n";
$score_array[@score_array] = $score{$category};
$class_array[@class_array] = $category;
}
my $sz = scalar @score_array;
if($sz > 1)
{
for (my $count=1; $count<$sz; $count++)
{
print "Test: of ". $class_array[$count-1] . " vs " . $class_array[$count] . " = " . (2 * ($score_array[$count-1] - $score_array[$count])) . "\n";
print "Confidence = " . (1- abs((2 * ($score_array[$count-1] - $score_array[$count]))/$score_array[$count-1])) . " - ". abs((2 * ($score_array[$count-1] - $score_array[$count]))/$score_array[$count-1])."\n";
}
}
}
# Get the classification of a file from word counts
sub prune_model
{
my ($thresh) = @_;
# Calculate the total number of words in each category and
# the total number of words overall
my %word_freq;
my $total = 0;
my $category;
my $word;
foreach my $entry (keys %words) {
$entry =~ /^(.+)=(.+)$/;
$category = $1;
$word = $2;
$word_freq{$word} += $words{$entry};
$total += $words{$entry};
}
# Run through words and calculate the probability for each category
# Add to new pruned model if greater than the threshold
unlink 'words_pruned.db';
my %words_pruned;
tie %words_pruned, 'DB_File', 'words_pruned.db';
print "total words:$total\n";
foreach my $entry (keys %words) {
$entry =~ /^(.+)=(.+)$/; #category=word
$category = $1;
$word = $2;
if( $word_freq{$word} >= $thresh ) { #freq based threshold
$words_pruned{$entry} = $words{$entry};
}
else
{
#print "Skipped: [$entry],$words{$entry} ".$word_freq{$word}." ; $thresh\n";
}
}
untie %words_pruned;
}
# Get the classification of a file from word counts
sub model_stats
{
# Calculate the total number of words in each category, total categories
# the total number of words overall
my %categories;
my %category_uniques;
my $unique_words;
my $total = 0;
foreach my $entry (keys %words) {
$entry =~ /^(.+)=(.+)$/; #category=word
$categories{$1} += $words{$entry};
$category_uniques{$1}++;
$total += $words{$entry};
$unique_words++;
}
print "Total words: $total\n";
print "Unique words: $unique_words\n";
print "Categories:\n";
foreach my $entry (keys %categories) {
print " - $entry: " . $categories{$entry} . " words, ". $category_uniques{$entry} . " unique words\n";
}
}
# Export a CSV of the model
sub model_export
{
my ($export_file) = @_;
print "Opening: $export_file\n";
open(EXPCSV, ">$export_file");
# Calculate the total number of words in each category, total categories
# the total number of words overall
my %categories;
my %category_uniques;
my $unique_words;
my $total = 0;
my $category;
my $word;
my $count;
foreach my $entry (keys %words) {
$entry =~ /^(.+)=(.+)$/; #category=word
$category = $1;
$word = $2;
$categories{$1} += $words{$entry};
$category_uniques{$1}++;
$total += $words{$entry};
$unique_words++;
}
print EXPCSV "#Total words: $total\n";
print EXPCSV "#Unique words: $unique_words\n";
print EXPCSV "#Categories:\n";
foreach my $entry (keys %categories) {
print EXPCSV " #- $entry: " . $categories{$entry} . " words, ". $category_uniques{$entry} . " unique words\n";
}
print EXPCSV "#WORD\tCATEGORY\tCOUNT\n";
foreach my $entry (keys %words) {
$entry =~ /^(.+)=(.+)$/; #category=word
$category = $1;
$word = $2;
$count = $words{$entry};
print EXPCSV "$word\t$category\t$count\n";
}
close(EXPCSV);
}
# Prune and swap the model and emit stats
sub prune_and_swap
{
my ($thresh) = @_;
print "OLD MODEL:\n";
model_stats();
prune_model( $thresh );
untie %words;
unlink('words_old.db');
rename('words.db','words_old.db');
rename('words_pruned.db','words.db');
tie %words, 'DB_File', 'words.db';
print "NEW MODEL:\n";
model_stats();
}
# Supported commands are 'add' to add words to a category and
# 'classify' to get the classification of a file
if ( ( $ARGV[0] eq 'add' ) && ( $#ARGV == 2 ) ) {
add_words( $ARGV[1], parse_file( $ARGV[2] ) );
} elsif ( ( $ARGV[0] eq 'classify' ) && ( $#ARGV == 1 ) ) {
classify( parse_file( $ARGV[1] ) );
} elsif ( ( $ARGV[0] eq 'prune' ) && ( $#ARGV == 1 ) ) {
prune_and_swap( $ARGV[1] );
} elsif ( ( $ARGV[0] eq 'export' ) && ( $#ARGV == 1 ) ) {
model_export( $ARGV[1] );
} elsif ( ( $ARGV[0] eq 'stats' ) ) {
model_stats( );
} else {
print <<EOUSAGE;
Usage: add <category> <file> - Adds words from <file> to category <category>
classify <file> - Outputs classification of <file>
prune <percentage> - Prunes elements of model lower than frequency <X>
stats - Prints stats of the model
export <file> - Exports model to <file>
EOUSAGE
}
untie %words;