Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Serialization support #10

Open
wants to merge 2 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
34 changes: 23 additions & 11 deletions lib/bloombroom/bits/bit_field.rb
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# inspired by Peter Cooper's http://snippets.dzone.com/posts/show/4234
#
#
# create a bit field 1000 bits wide
# bf = BitField.new(1000)
#
Expand All @@ -15,14 +15,21 @@ module Bloombroom
class BitField
attr_reader :size
include Enumerable

ELEMENT_WIDTH = 32

def initialize(size)
ELEMENT_PACK = 'L'

# Rebuilds a bit field from the raw string produced by {#to_bytes}.
#
# @param size [Fixnum] filter size in bits
# @param bytes [String] the raw contents obtained using {#to_bytes}
# @return [BitField] a field of +size+ bits backed by the unpacked elements
# @raise [ArgumentError] if +bytes+ does not unpack to exactly the number of
#   elements a field of +size+ bits uses
def self.from_bytes(size, bytes)
  field = bytes.unpack("#{ELEMENT_PACK}*")
  # Same element count #initialize allocates for a fresh field. A mismatch
  # means size and bytes do not belong together (or bytes was truncated),
  # which would otherwise only surface later as errors on nil elements.
  expected = ((size - 1) / ELEMENT_WIDTH) + 1
  unless field.size == expected
    raise ArgumentError, "expected #{expected} elements for #{size} bits, got #{field.size}"
  end
  new(size, field)
end

# @param size [Fixnum] width of the field in bits
# @param field [Array<Integer>, nil] pre-populated backing elements (as
#   unpacked by {.from_bytes}); a zero-filled array is allocated when omitted
def initialize(size, field = nil)
  @size = size
  # one backing element per ELEMENT_WIDTH bits, rounded up
  @field = field || Array.new(((size - 1) / ELEMENT_WIDTH) + 1, 0)
end

# set a bit
# @param position [Fixnum] bit position
# @param value [Fixnum] bit value 0/1
Expand All @@ -33,7 +40,7 @@ def []=(position, value)
@field[position / ELEMENT_WIDTH] |= 1 << (position % ELEMENT_WIDTH)
end
end

# read a bit
# @param position [Fixnum] bit position
# @return [Fixnum] bit value 0/1
Expand Down Expand Up @@ -62,7 +69,7 @@ def unset(position)
# Tests whether the bit at +position+ is set.
# @param position [Fixnum] bit position
# @return [Boolean] true if the bit is set
def include?(position)
  element = @field[position / ELEMENT_WIDTH]
  mask = 1 << (position % ELEMENT_WIDTH)
  (element & mask) > 0
end

# check if bit is not set
# @param position [Fixnum] bit position
# @return [Boolean] true if bit is not set
Expand All @@ -75,16 +82,21 @@ def zero?(position)
# Iterates over every bit of the field, from position 0 up to size - 1.
# @yield [bit] the value returned by #[] for each position
# @return [Enumerator] when called without a block
# @return [Fixnum] the field size when a block is given
def each
  # without a block, hand back an enumerator instead of failing on yield
  return to_enum(:each) unless block_given?

  @size.times { |position| yield self[position] }
end

# returns the field as a string like "0101010100111100", etc.
# @return [String] one character per bit, in position order
def to_s
  # join once instead of growing a new string for every bit
  map(&:to_s).join
end


# Returns the field as a string containing the raw binary representation of
# its content, packed with the ELEMENT_PACK directive.
# @return [String] the packed backing elements
def to_bytes
  template = "#{ELEMENT_PACK}*"
  @field.pack(template)
end

# Returns the total number of bits that are set.
# Counts bits element by element on the backing array rather than walking
# the bit field through #each (noted by the original author as roughly six
# times faster).
# @return [Fixnum] number of set bits
def total_set
  count = 0
  @field.each do |element|
    remaining = element
    # add the lowest bit, then shift it out, until nothing is left
    until remaining == 0
      count += remaining & 1
      remaining >>= 1
    end
  end
  count
end
end
end
end
26 changes: 17 additions & 9 deletions lib/bloombroom/filter/bloom_filter.rb
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,8 @@
module Bloombroom

# BloomFilter false positive probability rule of thumb: see http://www.igvita.com/2008/12/27/scalable-datasets-bloom-filters-in-ruby/
# a Bloom filter with a 1% error rate and an optimal value for k only needs 9.6 bits per key, and each time we add 4.8 bits
# per element we decrease the error rate by ten times.
# a Bloom filter with a 1% error rate and an optimal value for k only needs 9.6 bits per key, and each time we add 4.8 bits
# per element we decrease the error rate by ten times.
#
# 10000 elements, 1% error rate: m = 10000 * 10 bits -> 12k of memory, k = 0.7 * (10000 * 10 bits / 10000) = 7 hash functions
# 10000 elements, 0.1% error rate: m = 10000 * 15 bits -> 18k of memory, k = 0.7 * (10000 * 15 bits / 10000) = 11 hash functions
Expand All @@ -18,28 +18,36 @@ class BloomFilter

# @param m [Fixnum] filter size in bits
# @param k [Fixnum] number of hashing functions
def initialize(m, k)
@bits = BitField.new(m)
# Rebuilds a filter from previously serialized bits.
# @param m [Fixnum] filter size in bits
# @param k [Fixnum] number of hashing functions
# @param bytes [String] raw bits as a string obtained using {BitField#to_bytes}
# @param size [Integer] the number of keys already added to the serialized filter
# @return [BloomFilter] a filter backed by the restored bit field
def self.from_bytes(m, k, bytes, size = 0)
  restored_bits = BitField.from_bytes(m, bytes)
  new(m, k, restored_bits, size)
end

# @param m [Fixnum] filter size in bits
# @param k [Fixnum] number of hashing functions
# @param bits [BitField, nil] existing bit field to use; a new field of m bits
#   is created when omitted
# @param size [Integer] number of keys already in the filter
def initialize(m, k, bits = nil, size = 0)
  @bits = bits || BitField.new(m)
  @m = m
  @k = k
  @size = size
end

# Adds a key by setting one bit per hash value, each reduced modulo m.
# @param key [String] the key to add in the filter
# @return [Fixnum] the total number of keys in the filter
def add(key)
  hashes = BloomHelper.multi_hash(key, @k)
  hashes.each do |hash|
    @bits.set(hash % @m)
  end
  @size += 1
end
alias_method :<<, :add

# @param key [String] the key to test for inclusion in the filter
# @return [Boolean] true if the given key is present in the filter. False positives are possible and depend on the m and k filter parameters.
def include?(key)
  BloomHelper.multi_hash(key, @k).each do |hash|
    bit_set = @bits.include?(hash % @m)
    # a single unset bit proves the key was never added
    return false unless bit_set
  end
  true
end
alias_method :[], :include?

end
end
end
25 changes: 23 additions & 2 deletions spec/bloombroom/bits/bit_field_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@
(0..99).each{|i| bf.include?(i).should be_false}
(0..31).each{|i| bf.unset(i)}
(0..99).each{|i| bf.include?(i).should be_false}

(0..31).each{|i| bf.set(i)}
(0..31).each{|i| bf.include?(i).should be_true}
(32..99).each{|i| bf.include?(i).should be_false}
Expand Down Expand Up @@ -98,11 +98,32 @@
bf.to_s.should == "0100010000"
end

it 'should produce raw bytes using #to_bytes' do
  bf = Bloombroom::BitField.new(10)
  bf[1] = 1
  bf[5] = 1
  # bits 1 and 5 set => the single element is 0b100010; pack the expectation
  # with the same native-endian 32-bit directive so the spec does not depend
  # on the byte order of the host
  bf.to_bytes.should == [0b100010].pack('L*')
end

it 'should be restored using .from_bytes' do
  # 0b100010 has bits 1 and 5 set; packed natively so the fixture matches the
  # host byte order that .from_bytes unpacks with
  bf = Bloombroom::BitField.from_bytes 10, [0b100010].pack('L*')
  expected_bits = [0, 1, 0, 0, 0, 1, 0, 0, 0, 0]
  expected_bits.each_with_index do |bit, position|
    bf[position].should == bit
  end
end

it "should report total_set" do
  bf = Bloombroom::BitField.new(10)
  # set two distinct bits and expect exactly two to be counted
  [1, 5].each { |position| bf[position] = 1 }
  bf.total_set.should == 2
end

end
end
12 changes: 12 additions & 0 deletions spec/bloombroom/filter/bloom_filter_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -40,4 +40,16 @@
bf.k.should == 10
end

it "should be (de)serializable using #bits.to_bytes and .from_bytes" do
  original = Bloombroom::BloomFilter.new(1000, 5)
  %w[abc1 abc2].each { |key| original.add(key) }

  # round-trip through the raw bytes, carrying the key count along
  restored = Bloombroom::BloomFilter.from_bytes(1000, 5, original.bits.to_bytes, original.size)

  restored.m.should == original.m
  restored.k.should == original.k
  restored.size.should == original.size
  restored.include?('abc1').should be true
  restored.include?('abc2').should be true
  restored.include?('abc3').should be false
end
end