forked from infochimps-labs/wukong
-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathsample_records.rb
executable file
·43 lines (39 loc) · 1.15 KB
/
sample_records.rb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
#!/usr/bin/env ruby
$: << File.dirname(__FILE__)+'/../lib'
require 'rubygems'
require 'wukong'
#
# Probabilistically emit some fraction of the input records (lines)
#
# Set the sampling fraction at the command line using the
# --sampling_fraction=
# option: for example, to take a random 1/1000th of the lines in huge_files,
# ./examples/sample_records.rb --sampling_fraction=0.001 --run huge_files sampled_files
#
class Mapper < Wukong::Streamer::LineStreamer
  include Wukong::Streamer::Filter

  # Shown whenever --sampling_fraction is missing or cannot be read as a number.
  SAMPLING_FRACTION_USAGE =
    "Please supply a --sampling_fraction= argument, a decimal number between 0 and 1".freeze

  #
  # floating-point number between 0 and 1 giving the fraction of lines to emit:
  # at sampling_fraction=1 all records are emitted, at 0 none are.
  #
  # Takes its value from the mandatory :sampling_fraction entry of +options+
  # and memoizes the parsed Float in @sampling_fraction.
  #
  # The value is parsed strictly with Kernel#Float rather than #to_f, so a
  # non-numeric or empty value aborts with the usage message instead of
  # silently becoming 0.0 (which would make the job emit nothing).
  #
  # Numbers outside 0..1 are not rejected: because +emit?+ compares against
  # Kernel#rand, anything >= 1 keeps every line and anything <= 0 keeps none.
  #
  # Raises RuntimeError if the option is absent or is not a number.
  #
  def sampling_fraction
    @sampling_fraction ||= begin
      raw = options[:sampling_fraction]
      raise SAMPLING_FRACTION_USAGE unless raw
      Float(raw)
    rescue ArgumentError, TypeError
      # ArgumentError: unparseable string; TypeError: non-string, non-numeric value.
      raise "#{SAMPLING_FRACTION_USAGE} (got #{raw.inspect})"
    end
  end

  #
  # randomly decide to emit +sampling_fraction+ fraction of lines
  #
  # The line's content is ignored; each call draws a fresh Kernel#rand value
  # and returns true when it falls below +sampling_fraction+.
  #
  def emit?(_line)
    rand < sampling_fraction
  end
end
#
# Executes the script
#
# Build the job around Mapper, then launch it.
# NOTE(review): the nil second argument is presumably the (absent) reducer,
# matching :reduce_tasks => 0 -- confirm against Wukong::Script.
sampling_job = Wukong::Script.new(Mapper, nil, :reduce_tasks => 0, :reuse_jvms => true)
sampling_job.run