Skip to content

Commit

Permalink
fixed zero hash intersection
Browse files Browse the repository at this point in the history
  • Loading branch information
Joe Green committed Jul 10, 2014
1 parent 691ee1a commit cdbd4f4
Show file tree
Hide file tree
Showing 2 changed files with 68 additions and 4 deletions.
64 changes: 64 additions & 0 deletions examples/intersection-improved-empty.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
<?php

/**
 * Example: MinHash intersection when one of the input sets is empty.
 *
 * Builds three word sets (the second one empty), prints their exact sizes and
 * cardinalities, then feeds each set into a HyperLogLog\MinHash sketch and asks
 * MinHashIntersector for the estimated intersection size. Because one set is
 * empty the true intersection is empty, so any non-zero estimate is reported
 * as an error.
 *
 * randomSet() and cardinality() are expected to come from randomGenerator.php.
 */

include __DIR__ . '/../vendor/autoload.php';
include __DIR__ . '/randomGenerator.php';

// adjusted so the union is almost the size of the hash k
$set1 = randomSet(110000);

// Deliberately empty: forces the true intersection of all three sets to zero.
$set2 = randomSet(0);

$set3 = randomSet(120000);

echo "Number of words in set 1: " . count($set1) . "\n";

echo "Number of words in set 2: " . count($set2) . "\n";

echo "Number of words in set 3: " . count($set3) . "\n";


echo "------\n";

echo "Cardinailiy of set 1: " . cardinality($set1) . "\n";

echo "Cardinailiy of set 2: " . cardinality($set2) . "\n";

echo "Cardinailiy of set 3: " . cardinality($set3) . "\n";

// Exact reference values computed directly on the raw arrays.
$intersection = array_intersect($set1, $set2, $set3);

$union = array_merge($set1, $set2, $set3);

$intersectionCount = cardinality($intersection);

echo "Cardinailiy of union: " . cardinality($union) . "\n";

echo "Cardinailiy of intersection: " . $intersectionCount . "\n";

echo "------\nLogLog\n";

// One MinHash sketch per input set, in the same order as the sets.
$log_logs = array();

foreach (array($set1, $set2, $set3) as $i => $set) {
    $log_log = new HyperLogLog\MinHash();

    foreach ($set as $word) {
        $log_log->add($word);
    }

    $log_logs[] = $log_log;

    echo "Added set " . ($i + 1) . "\n";
}

$count = \HyperLogLog\Utils\MinHashIntersector::count($log_logs);

echo "intersection complete: count: $count\n";

// The two outcomes are mutually exclusive: report the failure when the
// estimate is non-zero, otherwise report success. Previously the success
// line was printed unconditionally, even right after the failure line.
if ($count) {
    echo "Error: 100% - count should be zero\n";
} else {
    echo "Error: 0% - count is zero\n";
}
8 changes: 4 additions & 4 deletions src/HyperLogLog/Utils/MinHashIntersector.php
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ public static function count(array $minHashes, $strict = true)
}
}

$intersection = array_intersect($intersection, $totalHll->getMinHash()->toArray());
$intersectionCount = count(array_intersect($intersection, $totalHll->getMinHash()->toArray()));

$hllUnionCount = $totalHll->count();

Expand All @@ -37,12 +37,12 @@ public static function count(array $minHashes, $strict = true)
* the min hash data structures will be accurate until the size of the union is
* greater than the max size of the min hash data structure
*/
if($hllUnionCount < $minHashK)
if($intersectionCount === 0 || $hllUnionCount < $minHashK)
{
return count($intersection);
return $intersectionCount;
}

return floor((count($intersection) / $minHashK) * $hllUnionCount);
return floor($intersectionCount / $minHashK) * $hllUnionCount;
}

private static function getMinHashKForSet(array $minHashes, $strict)
Expand Down

0 comments on commit cdbd4f4

Please sign in to comment.