From c3295005f1cc0c55545d9351c18c485bbe03f75e Mon Sep 17 00:00:00 2001 From: cesar colle Date: Wed, 29 Aug 2018 22:50:39 +0200 Subject: [PATCH 1/5] Add the test faillure for bloomfilter --- .../twitter/algebird/BloomFilterTest.scala | 159 ++++++++++-------- 1 file changed, 87 insertions(+), 72 deletions(-) diff --git a/algebird-test/src/test/scala/com/twitter/algebird/BloomFilterTest.scala b/algebird-test/src/test/scala/com/twitter/algebird/BloomFilterTest.scala index 618e12b5a..b704509bc 100644 --- a/algebird-test/src/test/scala/com/twitter/algebird/BloomFilterTest.scala +++ b/algebird-test/src/test/scala/com/twitter/algebird/BloomFilterTest.scala @@ -2,16 +2,16 @@ package com.twitter.algebird import java.io.{ByteArrayOutputStream, ObjectOutputStream} -import org.scalacheck.{Arbitrary, Gen, Properties} -import org.scalatest.{Matchers, WordSpec} import org.scalacheck.Prop._ +import org.scalacheck.{Arbitrary, Gen} +import org.scalatest.{Matchers, WordSpec} object BloomFilterTestUtils { def toSparse[A](bf: BF[A]): BFSparse[A] = bf match { case BFZero(hashes, width) => BFSparse(hashes, RichCBitSet(), width) case BFItem(item, hashes, width) => BFSparse(hashes, RichCBitSet.fromArray(hashes(item)), width) - case bfs @ BFSparse(_, _, _) => bfs + case bfs@BFSparse(_, _, _) => bfs case BFInstance(hashes, bitset, width) => BFSparse(hashes, RichCBitSet.fromBitSet(bitset), width) } @@ -22,15 +22,15 @@ object BloomFilterTestUtils { val bs = LongBitSet.empty(width) bs += hashes(item) BFInstance(hashes, bs.toBitSetNoCopy, width) - case bfs @ BFSparse(hashes, bitset, width) => bfs.dense - case bfi @ BFInstance(hashes, bitset, width) => bfi + case bfs@BFSparse(hashes, bitset, width) => bfs.dense + case bfi@BFInstance(hashes, bitset, width) => bfi } } class BloomFilterLaws extends CheckProperties { - import com.twitter.algebird.BaseProperties._ import BloomFilterTestUtils._ + import com.twitter.algebird.BaseProperties._ val NUM_HASHES = 6 val WIDTH = 32 @@ -117,7 +117,7 @@ class BloomFilterLaws extends CheckProperties { val next1 = a + b Equiv[BF[String]].equiv(next, next1) && - (check == a.contains(b)) + (check == a.contains(b)) } } @@ -155,8 +155,8 @@ class BFHashIndices extends CheckProperties { } /** - * This is the version of the BFHash as of before the "negative values fix" - */ + * This is the version of the BFHash as of before the "negative values fix" + */ case class NegativeBFHash(numHashes: Int, width: Int) { val size = numHashes @@ -203,7 +203,7 @@ class BFHashIndices extends CheckProperties { } } -class BloomFilterFalsePositives[T: Gen: Hash128](falsePositiveRate: Double) extends ApproximateProperty { +class BloomFilterFalsePositives[T: Gen : Hash128](falsePositiveRate: Double) extends ApproximateProperty { type Exact = Set[T] type Approx = BF[T] @@ -237,7 +237,7 @@ class BloomFilterFalsePositives[T: Gen: Hash128](falsePositiveRate: Double) exte def approximateResult(bf: BF[T], t: T) = bf.contains(t) } -class BloomFilterCardinality[T: Gen: Hash128] extends ApproximateProperty { +class BloomFilterCardinality[T: Gen : Hash128] extends ApproximateProperty { type Exact = Set[T] type Approx = BF[T] @@ -264,10 +264,12 @@ class BloomFilterCardinality[T: Gen: Hash128] extends ApproximateProperty { def inputGenerator(set: Set[T]) = Gen.const(()) def exactResult(s: Set[T], u: Unit) = s.size + def approximateResult(bf: BF[T], u: Unit) = bf.size } class BloomFilterProperties extends ApproximateProperties("BloomFilter") { + import ApproximateProperty.toProp for (falsePositiveRate <- List(0.1, 0.01, 0.001)) { @@ -304,54 +306,53 @@ class BloomFilterTest extends WordSpec with Matchers { } "identify all true positives" in { - (0 to 100).foreach { _ => - { - val bfMonoid = new BloomFilterMonoid[String](RAND.nextInt(5) + 1, RAND.nextInt(64) + 32) - val numEntries = 5 - val entries = (0 until numEntries).map(_ => RAND.nextInt.toString) - val bf = bfMonoid.create(entries: _*) - - entries.foreach { i => - assert(bf.contains(i.toString).isTrue) - } + (0 to 100).foreach { _ => { + val bfMonoid = new BloomFilterMonoid[String](RAND.nextInt(5) + 1, RAND.nextInt(64) + 32) + val numEntries = 5 + val entries = (0 until numEntries).map(_ => RAND.nextInt.toString) + val bf = bfMonoid.create(entries: _*) + + entries.foreach { i => + assert(bf.contains(i.toString).isTrue) } } + } } "have small false positive rate" in { val iter = 10000 - Seq(0.1, 0.01, 0.001).foreach { fpProb => - { - val fps = (0 until iter).par.map { _ => - { - val numEntries = RAND.nextInt(10) + 1 + Seq(0.1, 0.01, 0.001).foreach { fpProb => { + val fps = (0 until iter).par.map { _ => { + val numEntries = RAND.nextInt(10) + 1 - val bfMonoid = BloomFilter[String](numEntries, fpProb) + val bfMonoid = BloomFilter[String](numEntries, fpProb) - val entries = RAND - .shuffle((0 until 1000).toList) - .take(numEntries + 1) - .map(_.toString) - val bf = bfMonoid.create(entries.drop(1): _*) + val entries = RAND + .shuffle((0 until 1000).toList) + .take(numEntries + 1) + .map(_.toString) + val bf = bfMonoid.create(entries.drop(1): _*) - if (bf.contains(entries(0)).isTrue) 1.0 else 0.0 - } - } + if (bf.contains(entries(0)).isTrue) 1.0 else 0.0 + } + } - val observedFpProb = fps.sum / fps.size + val observedFpProb = fps.sum / fps.size - // the 2.5 is a fudge factor to make the probability of it low - // in tests - assert(observedFpProb <= 2.5 * fpProb) - } + // the 2.5 is a fudge factor to make the probability of it low + // in tests + assert(observedFpProb <= 2.5 * fpProb) + } } } "approximate cardinality" in { val bfMonoid = BloomFilterMonoid[String](10, 100000) Seq(10, 100, 1000, 10000).foreach { exactCardinality => - val items = (1 until exactCardinality).map { _.toString } + val items = (1 until exactCardinality).map { + _.toString + } val bf = bfMonoid.create(items: _*) val size = bf.size @@ -362,18 +363,17 @@ class BloomFilterTest extends WordSpec with Matchers { } "work as an Aggregator" in { - (0 to 10).foreach { _ => - { - val aggregator = BloomFilterAggregator[String](RAND.nextInt(5) + 1, RAND.nextInt(64) + 32) - val numEntries = 5 - val entries = (0 until numEntries).map(_ => RAND.nextInt.toString) - val bf = aggregator(entries) - - entries.foreach { i => - assert(bf.contains(i.toString).isTrue) - } + (0 to 10).foreach { _ => { + val aggregator = BloomFilterAggregator[String](RAND.nextInt(5) + 1, RAND.nextInt(64) + 32) + val numEntries = 5 + val entries = (0 until numEntries).map(_ => RAND.nextInt.toString) + val bf = aggregator(entries) + + entries.foreach { i => + assert(bf.contains(i.toString).isTrue) } } + } } "not serialize @transient dense BFInstance" in { @@ -397,8 +397,8 @@ class BloomFilterTest extends WordSpec with Matchers { } /** - * this test failed before the fix for https://github.com/twitter/algebird/issues/229 - */ + * this test failed before the fix for https://github.com/twitter/algebird/issues/229 + */ "not have negative hash values" in { val NUM_HASHES = 2 val WIDTH = 4752800 @@ -408,33 +408,48 @@ class BloomFilterTest extends WordSpec with Matchers { assert(index >= 0) } + "return his size event if it's saturated" in { + val bfMonoid = BloomFilterMonoid[String](5, 13) + val strings = Array(1, 2, 3, 4, 5, 6, 7, 8, 9, 10).map(_.toString) + val bf = bfMonoid.create(strings: _*) + + assert(bf.size.min > 0) + } + + "return a max approximate size if it's saturated" in { + val bfMonoid = BloomFilterMonoid[String](5, 13) + val strings = List(8, 9, 8, 10, 1, 8, 11, 7, 7, 1).map(_.toString) + val bf = bfMonoid.create(strings: _*) + // even it's seems big. This assert fail.. + assert(bf.size.max < strings.length * 100) + } + } "BloomFilter method `checkAndAdd`" should { "be identical to method `+`" in { - (0 to 100).foreach { _ => - { - val bfMonoid = new BloomFilterMonoid[String](RAND.nextInt(5) + 1, RAND.nextInt(64) + 32) - val numEntries = 5 - val entries = (0 until numEntries).map(_ => RAND.nextInt.toString) - val bf = bfMonoid.create(entries: _*) - val bfWithCheckAndAdd = entries - .map { entry => - (entry, bfMonoid.create(entry)) - } - .foldLeft((bfMonoid.zero, bfMonoid.zero)) { - case ((left, leftAlt), (entry, right)) => - val (newLeftAlt, contained) = leftAlt.checkAndAdd(entry) - left.contains(entry) shouldBe contained - (left + entry, newLeftAlt) - } - - entries.foreach { i => - assert(bf.contains(i.toString).isTrue) + (0 to 100).foreach { _ => { + val bfMonoid = new BloomFilterMonoid[String](RAND.nextInt(5) + 1, RAND.nextInt(64) + 32) + val numEntries = 5 + val entries = (0 until numEntries).map(_ => RAND.nextInt.toString) + val bf = bfMonoid.create(entries: _*) + val bfWithCheckAndAdd = entries + .map { entry => + (entry, bfMonoid.create(entry)) } + .foldLeft((bfMonoid.zero, bfMonoid.zero)) { + case ((left, leftAlt), (entry, right)) => + val (newLeftAlt, contained) = leftAlt.checkAndAdd(entry) + left.contains(entry) shouldBe contained + (left + entry, newLeftAlt) + } + + entries.foreach { i => + assert(bf.contains(i.toString).isTrue) } } + } } } From dd53848235251bb1b2f8ef4e8e279adf0be930ce Mon Sep 17 00:00:00 2001 From: cesar colle Date: Wed, 29 Aug 2018 23:05:29 +0200 Subject: [PATCH 2/5] add scalafmt's algebird --- .../twitter/algebird/BloomFilterTest.scala | 131 +++++++++--------- 1 file changed, 68 insertions(+), 63 deletions(-) diff --git a/algebird-test/src/test/scala/com/twitter/algebird/BloomFilterTest.scala b/algebird-test/src/test/scala/com/twitter/algebird/BloomFilterTest.scala index b704509bc..68e7550e0 100644 --- a/algebird-test/src/test/scala/com/twitter/algebird/BloomFilterTest.scala +++ b/algebird-test/src/test/scala/com/twitter/algebird/BloomFilterTest.scala @@ -11,7 +11,7 @@ object BloomFilterTestUtils { case BFZero(hashes, width) => BFSparse(hashes, RichCBitSet(), width) case BFItem(item, hashes, width) => BFSparse(hashes, RichCBitSet.fromArray(hashes(item)), width) - case bfs@BFSparse(_, _, _) => bfs + case bfs @ BFSparse(_, _, _) => bfs case BFInstance(hashes, bitset, width) => BFSparse(hashes, RichCBitSet.fromBitSet(bitset), width) } @@ -22,8 +22,8 @@ object BloomFilterTestUtils { val bs = LongBitSet.empty(width) bs += hashes(item) BFInstance(hashes, bs.toBitSetNoCopy, width) - case bfs@BFSparse(hashes, bitset, width) => bfs.dense - case bfi@BFInstance(hashes, bitset, width) => bfi + case bfs @ BFSparse(hashes, bitset, width) => bfs.dense + case bfi @ BFInstance(hashes, bitset, width) => bfi } } @@ -117,7 +117,7 @@ class BloomFilterLaws extends CheckProperties { val next1 = a + b Equiv[BF[String]].equiv(next, next1) && - (check == a.contains(b)) + (check == a.contains(b)) } } @@ -155,8 +155,8 @@ class BFHashIndices extends CheckProperties { } /** - * This is the version of the BFHash as of before the "negative values fix" - */ + * This is the version of the BFHash as of before the "negative values fix" + */ case class NegativeBFHash(numHashes: Int, width: Int) { val size = numHashes @@ -203,7 +203,7 @@ class BFHashIndices extends CheckProperties { } } -class BloomFilterFalsePositives[T: Gen : Hash128](falsePositiveRate: Double) extends ApproximateProperty { +class BloomFilterFalsePositives[T: Gen: Hash128](falsePositiveRate: Double) extends ApproximateProperty { type Exact = Set[T] type Approx = BF[T] @@ -237,7 +237,7 @@ class BloomFilterFalsePositives[T: Gen : Hash128](falsePositiveRate: Double) ext def approximateResult(bf: BF[T], t: T) = bf.contains(t) } -class BloomFilterCardinality[T: Gen : Hash128] extends ApproximateProperty { +class BloomFilterCardinality[T: Gen: Hash128] extends ApproximateProperty { type Exact = Set[T] type Approx = BF[T] @@ -306,44 +306,47 @@ class BloomFilterTest extends WordSpec with Matchers { } "identify all true positives" in { - (0 to 100).foreach { _ => { - val bfMonoid = new BloomFilterMonoid[String](RAND.nextInt(5) + 1, RAND.nextInt(64) + 32) - val numEntries = 5 - val entries = (0 until numEntries).map(_ => RAND.nextInt.toString) - val bf = bfMonoid.create(entries: _*) - - entries.foreach { i => - assert(bf.contains(i.toString).isTrue) + (0 to 100).foreach { _ => + { + val bfMonoid = new BloomFilterMonoid[String](RAND.nextInt(5) + 1, RAND.nextInt(64) + 32) + val numEntries = 5 + val entries = (0 until numEntries).map(_ => RAND.nextInt.toString) + val bf = bfMonoid.create(entries: _*) + + entries.foreach { i => + assert(bf.contains(i.toString).isTrue) + } } } - } } "have small false positive rate" in { val iter = 10000 - Seq(0.1, 0.01, 0.001).foreach { fpProb => { - val fps = (0 until iter).par.map { _ => { - val numEntries = RAND.nextInt(10) + 1 + Seq(0.1, 0.01, 0.001).foreach { fpProb => + { + val fps = (0 until iter).par.map { _ => + { + val numEntries = RAND.nextInt(10) + 1 - val bfMonoid = BloomFilter[String](numEntries, fpProb) + val bfMonoid = BloomFilter[String](numEntries, fpProb) - val entries = RAND - .shuffle((0 until 1000).toList) - .take(numEntries + 1) - .map(_.toString) - val bf = bfMonoid.create(entries.drop(1): _*) + val entries = RAND + .shuffle((0 until 1000).toList) + .take(numEntries + 1) + .map(_.toString) + val bf = bfMonoid.create(entries.drop(1): _*) - if (bf.contains(entries(0)).isTrue) 1.0 else 0.0 - } - } + if (bf.contains(entries(0)).isTrue) 1.0 else 0.0 + } + } - val observedFpProb = fps.sum / fps.size + val observedFpProb = fps.sum / fps.size - // the 2.5 is a fudge factor to make the probability of it low - // in tests - assert(observedFpProb <= 2.5 * fpProb) - } + // the 2.5 is a fudge factor to make the probability of it low + // in tests + assert(observedFpProb <= 2.5 * fpProb) + } } } @@ -363,17 +366,18 @@ class BloomFilterTest extends WordSpec with Matchers { } "work as an Aggregator" in { - (0 to 10).foreach { _ => { - val aggregator = BloomFilterAggregator[String](RAND.nextInt(5) + 1, RAND.nextInt(64) + 32) - val numEntries = 5 - val entries = (0 until numEntries).map(_ => RAND.nextInt.toString) - val bf = aggregator(entries) - - entries.foreach { i => - assert(bf.contains(i.toString).isTrue) + (0 to 10).foreach { _ => + { + val aggregator = BloomFilterAggregator[String](RAND.nextInt(5) + 1, RAND.nextInt(64) + 32) + val numEntries = 5 + val entries = (0 until numEntries).map(_ => RAND.nextInt.toString) + val bf = aggregator(entries) + + entries.foreach { i => + assert(bf.contains(i.toString).isTrue) + } } } - } } "not serialize @transient dense BFInstance" in { @@ -397,8 +401,8 @@ class BloomFilterTest extends WordSpec with Matchers { } /** - * this test failed before the fix for https://github.com/twitter/algebird/issues/229 - */ + * this test failed before the fix for https://github.com/twitter/algebird/issues/229 + */ "not have negative hash values" in { val NUM_HASHES = 2 val WIDTH = 4752800 @@ -429,27 +433,28 @@ class BloomFilterTest extends WordSpec with Matchers { "BloomFilter method `checkAndAdd`" should { "be identical to method `+`" in { - (0 to 100).foreach { _ => { - val bfMonoid = new BloomFilterMonoid[String](RAND.nextInt(5) + 1, RAND.nextInt(64) + 32) - val numEntries = 5 - val entries = (0 until numEntries).map(_ => RAND.nextInt.toString) - val bf = bfMonoid.create(entries: _*) - val bfWithCheckAndAdd = entries - .map { entry => - (entry, bfMonoid.create(entry)) + (0 to 100).foreach { _ => + { + val bfMonoid = new BloomFilterMonoid[String](RAND.nextInt(5) + 1, RAND.nextInt(64) + 32) + val numEntries = 5 + val entries = (0 until numEntries).map(_ => RAND.nextInt.toString) + val bf = bfMonoid.create(entries: _*) + val bfWithCheckAndAdd = entries + .map { entry => + (entry, bfMonoid.create(entry)) + } + .foldLeft((bfMonoid.zero, bfMonoid.zero)) { + case ((left, leftAlt), (entry, right)) => + val (newLeftAlt, contained) = leftAlt.checkAndAdd(entry) + left.contains(entry) shouldBe contained + (left + entry, newLeftAlt) + } + + entries.foreach { i => + assert(bf.contains(i.toString).isTrue) } - .foldLeft((bfMonoid.zero, bfMonoid.zero)) { - case ((left, leftAlt), (entry, right)) => - val (newLeftAlt, contained) = leftAlt.checkAndAdd(entry) - left.contains(entry) shouldBe contained - (left + entry, newLeftAlt) - } - - entries.foreach { i => - assert(bf.contains(i.toString).isTrue) } } - } } } From d81a4d04402f71f13443c190b7500ddc5274dd6a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9sar=20Coll=C3=A9?= Date: Thu, 30 Aug 2018 08:48:42 +0200 Subject: [PATCH 3/5] Revert "Add Apple to 'Projects using Algebird' (#663)" This reverts commit 9180862044b343b68d03d6f21ac6615aebdbf65b. --- README.md | 1 - 1 file changed, 1 deletion(-) diff --git a/README.md b/README.md index be86a132b..f413c2824 100644 --- a/README.md +++ b/README.md @@ -77,7 +77,6 @@ See [Algebird's page on the Scaladex](https://index.scala-lang.org/twitter/algeb - [Scio](https://github.com/spotify/scio) - [Packetloop](https://www.packetloop.com) (see [this tweet](https://twitter.com/cloudjunky/status/355073917720858626)) - Ebay uses Algebird for machine learning: [ScalaDays talk](http://www.slideshare.net/VitalyGordon/scalable-and-flexible-machine-learning-with-scala-linkedin) -- [Apple (FEAR Team)](https://news.ycombinator.com/item?id=16969118) Other projects built with Algebird, as compiled by the Scaladex: [![Scaladex Dependents](https://index.scala-lang.org/count.svg?q=dependencies:twitter/algebird*&subject=scaladex:&color=blue&style=flat-square)](https://index.scala-lang.org/search?q=dependencies:twitter/algebird-core) From 8043f7a597db352445c6b8fd06a05c356b8821fd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9sar=20Coll=C3=A9?= Date: Thu, 30 Aug 2018 08:57:15 +0200 Subject: [PATCH 4/5] Add faillur test --- .../com/twitter/algebird/BloomFilterTest.scala | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/algebird-test/src/test/scala/com/twitter/algebird/BloomFilterTest.scala b/algebird-test/src/test/scala/com/twitter/algebird/BloomFilterTest.scala index 68e7550e0..3c9173886 100644 --- a/algebird-test/src/test/scala/com/twitter/algebird/BloomFilterTest.scala +++ b/algebird-test/src/test/scala/com/twitter/algebird/BloomFilterTest.scala @@ -2,9 +2,9 @@ package com.twitter.algebird import java.io.{ByteArrayOutputStream, ObjectOutputStream} -import org.scalacheck.Prop._ -import org.scalacheck.{Arbitrary, Gen} +import org.scalacheck.{Arbitrary, Gen, Properties} import org.scalatest.{Matchers, WordSpec} +import org.scalacheck.Prop._ object BloomFilterTestUtils { def toSparse[A](bf: BF[A]): BFSparse[A] = bf match { @@ -29,8 +29,8 @@ object BloomFilterTestUtils { class BloomFilterLaws extends CheckProperties { - import BloomFilterTestUtils._ import com.twitter.algebird.BaseProperties._ + import BloomFilterTestUtils._ val NUM_HASHES = 6 val WIDTH = 32 @@ -155,7 +155,7 @@ class BFHashIndices extends CheckProperties { } /** - * This is the version of the BFHash as of before the "negative values fix" + * This is the version of the BFHash as of before the "negative values fix" */ case class NegativeBFHash(numHashes: Int, width: Int) { val size = numHashes @@ -264,12 +264,10 @@ class BloomFilterCardinality[T: Gen: Hash128] extends ApproximateProperty { def inputGenerator(set: Set[T]) = Gen.const(()) def exactResult(s: Set[T], u: Unit) = s.size - def approximateResult(bf: BF[T], u: Unit) = bf.size } class BloomFilterProperties extends ApproximateProperties("BloomFilter") { - import ApproximateProperty.toProp for (falsePositiveRate <- List(0.1, 0.01, 0.001)) { @@ -353,9 +351,7 @@ class BloomFilterTest extends WordSpec with Matchers { "approximate cardinality" in { val bfMonoid = BloomFilterMonoid[String](10, 100000) Seq(10, 100, 1000, 10000).foreach { exactCardinality => - val items = (1 until exactCardinality).map { - _.toString - } + val items = (1 until exactCardinality).map { _.toString } val bf = bfMonoid.create(items: _*) val size = bf.size @@ -412,6 +408,7 @@ class BloomFilterTest extends WordSpec with Matchers { assert(index >= 0) } + "return his size event if it's saturated" in { val bfMonoid = BloomFilterMonoid[String](5, 13) val strings = Array(1, 2, 3, 4, 5, 6, 7, 8, 9, 10).map(_.toString) From ee3b2db1570386122a5748e4b2987cbfdcc1c9b4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9sar=20Coll=C3=A9?= Date: Fri, 7 Sep 2018 18:43:25 +0200 Subject: [PATCH 5/5] Fix #632 : BloomFilter saturated exception From the paper linked to the size estimation of a bloomfilter, we have a inegality mixing t = number bits to 1 of the filter, m=width of filter, numHash=k nl <= S-1(t-1) nr >= S-1(t+1) S-1 is defined like : S-1(t) = ln(1 - t/m) / k.ln(1 - 1/m) problem occur when _t = m_ S-1(m) = ln(0) / k.ln(1 - 1/m) but *ln(-1/m)* is not correct. problem is : ln( 1 - 1/m) < 0 The scala maths library give : scala> val infinity = scala.math.log(0) / -42 infinity: Double = Infinity scala> infinity.round.toInt res24: Int = -1 just add special case for the S-1 function. --- .../com/twitter/algebird/BloomFilter.scala | 5 ++++- .../com/twitter/algebird/BloomFilterTest.scala | 18 +++++------------- 2 files changed, 9 insertions(+), 14 deletions(-) diff --git a/algebird-core/src/main/scala/com/twitter/algebird/BloomFilter.scala b/algebird-core/src/main/scala/com/twitter/algebird/BloomFilter.scala index f02f6c948..6d81ebe88 100644 --- a/algebird-core/src/main/scala/com/twitter/algebird/BloomFilter.scala +++ b/algebird-core/src/main/scala/com/twitter/algebird/BloomFilter.scala @@ -157,13 +157,16 @@ object BloomFilter { * This is \hat{S}^{-1}(t) in the cardinality estimation paper used above. */ def sInverse(t: Int): Double = - scala.math.log1p(-t.toDouble / width) / (numHashes * scala.math.log1p(-1.0 / width)) + if (numBits == width) 0.0 + else + scala.math.log(1 - t.toDouble / width) / (numHashes * scala.math.log1p(-1.0 / width)) // Variable names correspond to those used in the paper. val t = numBits val n = sInverse(t).round.toInt // Take the min and max because the probability formula assumes // nl <= sInverse(t - 1) and sInverse(t + 1) <= nr + val nl = scala.math.min(sInverse(t - 1).floor, (1 - approximationWidth) * n).toInt val nr = diff --git a/algebird-test/src/test/scala/com/twitter/algebird/BloomFilterTest.scala b/algebird-test/src/test/scala/com/twitter/algebird/BloomFilterTest.scala index 3c9173886..905799a79 100644 --- a/algebird-test/src/test/scala/com/twitter/algebird/BloomFilterTest.scala +++ b/algebird-test/src/test/scala/com/twitter/algebird/BloomFilterTest.scala @@ -354,7 +354,6 @@ class BloomFilterTest extends WordSpec with Matchers { val items = (1 until exactCardinality).map { _.toString } val bf = bfMonoid.create(items: _*) val size = bf.size - assert(size ~ exactCardinality) assert(size.min <= size.estimate) assert(size.max >= size.estimate) @@ -408,23 +407,16 @@ class BloomFilterTest extends WordSpec with Matchers { assert(index >= 0) } + } - "return his size event if it's saturated" in { - val bfMonoid = BloomFilterMonoid[String](5, 13) - val strings = Array(1, 2, 3, 4, 5, 6, 7, 8, 9, 10).map(_.toString) - val bf = bfMonoid.create(strings: _*) - - assert(bf.size.min > 0) - } + "BloomFilter method `size`" should { - "return a max approximate size if it's saturated" in { + "return the appropriate size when it's saturated " in { val bfMonoid = BloomFilterMonoid[String](5, 13) - val strings = List(8, 9, 8, 10, 1, 8, 11, 7, 7, 1).map(_.toString) + val strings = List(8, 9, 8, 10, 1, 8, 11, 12, 13, 14, 15, 67, 18981, 1122, 86787).map(_.toString) val bf = bfMonoid.create(strings: _*) - // even it's seems big. This assert fail.. - assert(bf.size.max < strings.length * 100) + assert(bf.size.isZero) } - } "BloomFilter method `checkAndAdd`" should {