forked from infochimps-labs/wukong
-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathword_count.rb
executable file
·95 lines (87 loc) · 2.33 KB
/
word_count.rb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
#!/usr/bin/env ruby
require 'rubygems'
require 'wukong'
module WordCount
class Mapper < Wukong::Streamer::LineStreamer
#
# Split a string into its constituent words.
#
# This is pretty simpleminded:
# * Downcase the whole string
# * Split at any non-alphanumeric boundary, including '_' and '-'
# * However, preserve the special cases of 's or 't at the end of a
#   word.
#
#   tokenize("Jim's dawg won't hunt: dawg_hunt error #3007a4")
#   # => ["jim's", "dawg", "won't", "hunt", "dawg", "hunt", "error", "3007a4"]
#
# @param str [String, nil] text to split; nil produces no words
# @return [Array<String>] the lowercased words, in order of appearance
#
def tokenize(str)
  return [] unless str

  # A word is a run of ASCII letters/digits, optionally followed by 's or 't
  # when that suffix is not itself followed by another letter or digit.
  # Everything else (punctuation, hyphens, underscores, stray apostrophes,
  # whitespace, non-ASCII characters) only separates words, so scanning for
  # words directly needs no placeholder substitution and no blank filtering.
  str.downcase.scan(/[a-z0-9]+(?:'[st](?![a-z0-9]))?/)
end
#
# Turn one input line into a stream of [word, 1] records: the line is
# tokenized and every resulting word is yielded paired with the count 1.
#
def process(line)
  words = tokenize(line)
  words.each do |word|
    yield [word, 1]
  end
end
end
#
# Accumulate the count record-by-record, tracking the current key by hand.
#
# Correct totals require that all records for a given word arrive
# consecutively; a word that reappears later starts a new run and is
# reported again.
#
class Reducer0 < Wukong::Streamer::Base
  # Number of records seen so far for the current word; nil until the first
  # record arrives.
  attr_accessor :key_count

  # Count one record for +word+. When +word+ differs from the word of the
  # previous record, the finished word is yielded as [word, total] before
  # the new run is started.
  #
  # @param word  [String] the key of the incoming record
  # @param count [Object] unused; every record counts as one occurrence
  # @yield [Array] a [word, total] pair for each completed run
  def process(word, count)
    if key_count && @last_word == word
      self.key_count += 1
    else
      # key_count is nil only before the very first record, when there is
      # nothing to flush yet.
      yield [@last_word, key_count] if key_count
      @last_word = word
      # The record that opens a run is itself the first occurrence.
      self.key_count = 1
    end
  end

  # Run the inherited stream, then flush the final word, which no later key
  # change will ever push out.
  #
  # NOTE(review): assumes Wukong::Streamer::Base#stream is the loop that
  # feeds #process, and that #emit takes one record array like the records
  # yielded above; confirm against the Wukong version in use.
  def stream
    super
    emit [@last_word, key_count] if key_count
  end
end
#
# You can stack up all the values in a list then sum them at once:
#
require 'active_support/core_ext/enumerable'
class Reducer1 < Wukong::Streamer::ListReducer
  # Yields a single [key, total] record, where total is the integer sum of
  # the last field of every entry in +values+ (each converted with to_i).
  # NOTE(review): +key+ and +values+ come from the superclass; presumably
  # they hold the current key and the records collected for it.
  def finalize
    current_key = key
    total = values.inject(0) { |running, record| running + record.last.to_i }
    yield [current_key, total]
  end
end
#
# Gentler on memory: keep one running tally per key instead of holding on
# to every value.
#
class Reducer2 < Wukong::Streamer::AccumulatingReducer
  # Running number of records tallied since the last start!.
  attr_accessor :key_count

  # Reset the tally to zero. Any arguments are accepted and ignored.
  def start!(*_record)
    self.key_count = 0
  end

  # Add one to the tally. Any arguments are accepted and ignored, so each
  # call counts as exactly one occurrence.
  def accumulate(*_record)
    self.key_count = key_count + 1
  end

  # Yield the [key, tally] pair.
  def finalize
    yield [key, key_count]
  end
end
#
# ... easiest of all, though: this is common enough that it's already included
#
require 'wukong/streamer/count_keys'
# Adds nothing of its own: all behavior is inherited unchanged from
# Wukong::Streamer::CountKeys.
class Reducer3 < Wukong::Streamer::CountKeys; end
end
# Execute the script: build a Wukong script from the mapper and Reducer1,
# then run it. The other reducer classes in WordCount are alternatives and
# are not used by this invocation.
word_count_script = Wukong::Script.new(WordCount::Mapper, WordCount::Reducer1)
word_count_script.run