Skip to content

Commit

Permalink
fixed for empty intersections
Browse files Browse the repository at this point in the history
  • Loading branch information
Joe Green committed Jul 10, 2014
1 parent a759e86 commit 691ee1a
Show file tree
Hide file tree
Showing 3 changed files with 80 additions and 22 deletions.
File renamed without changes.
59 changes: 59 additions & 0 deletions examples/intersection-improved-medium.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
<?php

/**
 * Example: compare the exact size of a three-way set intersection with the
 * estimate produced by \HyperLogLog\Utils\MinHashIntersector::count().
 *
 * randomSet() and cardinality() are not defined in this file; presumably they
 * come from randomGenerator.php — verify there.
 */

// Both files are mandatory for the script, so fail immediately if one is missing.
require __DIR__ . '/../vendor/autoload.php';
require __DIR__ . '/randomGenerator.php';

// adjusted so the union is almost the size of the hash k
$set1 = randomSet(110000);
$set2 = randomSet(100000);
$set3 = randomSet(120000);

$sets = array($set1, $set2, $set3);

foreach ($sets as $i => $set) {
    echo "Number of words in set " . ($i + 1) . ": " . count($set) . "\n";
}

echo "------\n";

foreach ($sets as $i => $set) {
    echo "Cardinality of set " . ($i + 1) . ": " . cardinality($set) . "\n";
}

// Exact reference values the estimate is compared against.
$intersection = array_intersect($set1, $set2, $set3);
$union = array_merge($set1, $set2, $set3);

$intersectionCount = cardinality($intersection);

echo "Cardinality of union: " . cardinality($union) . "\n";
echo "Cardinality of intersection: " . $intersectionCount . "\n";

echo "------\nLogLog\n";

// One sketch per input set, filled word by word.
$minHashes = array();

foreach ($sets as $i => $set) {
    $minHash = new HyperLogLog\MinHash();

    foreach ($set as $word) {
        $minHash->add($word);
    }

    $minHashes[] = $minHash;

    echo "Added set " . ($i + 1) . "\n";
}

$count = \HyperLogLog\Utils\MinHashIntersector::count($minHashes);

echo "intersection complete\n";

// The relative error is undefined when the exact intersection is empty;
// dividing by it would raise a division-by-zero error.
if ($intersectionCount > 0) {
    $error = number_format(($count - $intersectionCount) / ($intersectionCount / 100.0), 3) . '%';
} else {
    $error = 'n/a (exact intersection is empty)';
}

echo $count . "\n" . 'error: ' . $error . PHP_EOL;
43 changes: 21 additions & 22 deletions src/HyperLogLog/Utils/MinHashIntersector.php
Original file line number Diff line number Diff line change
Expand Up @@ -5,45 +5,44 @@

class MinHashIntersector
{

/**
 * Estimates the size of the intersection of the given sketches.
 *
 * Delegates to self::jaccard() and destructures its result into the number
 * of shared min-hash values, the min-hash size K and the union sketch.
 *
 * @param array $minHashes sketches to intersect; passed unchanged to self::jaccard()
 * @param mixed $strict    forwarded to self::jaccard(); its meaning is not visible
 *                         here (presumably a boolean flag, default true)
 *
 * @return int|float the raw number of shared min-hash values when the union
 *                   count is below K, otherwise the floor() of the shared
 *                   fraction scaled by the union count
 */
public static function count(array $minHashes, $strict = true)
{
list($minHashIntersection, $minHashK, $hllUnion) = self::jaccard($minHashes, $strict);

/**
* For low numbers there is no need to estimate.
* If we assume an even spread with no hash collisions then the intersection of
* the min hash data structures will be accurate until the size of the union is
* greater than the max size of the min hash data structure.
*/
if($hllUnion->count() < $minHashK)
{
return $minHashIntersection;
}

// Scale the shared fraction (intersection / K) by the estimated union size.
return floor(($minHashIntersection / $minHashK) * $hllUnion->count());
}

/**
 * Unions the given sketches and intersects their min-hash value arrays.
 *
 * NOTE(review): this listing looks like a rendered diff with removed and added
 * lines interleaved (two consecutive assignments to $intersection, and
 * statements that follow an unconditional return). Confirm against the real
 * file before relying on the exact statement sequence.
 *
 * @param array $minHashes objects that expose getMinHash()->toArray() and can be
 *                         passed to HyperLogLogMinHash::union()
 * @param mixed $strict    forwarded to self::getMinHashKForSet(); its meaning is
 *                         not visible here (presumably a boolean flag)
 *
 * @return mixed as written: 0 as soon as the running intersection is empty,
 *               otherwise array(shared value count, K, union sketch)
 */
public static function jaccard(array $minHashes, $strict = true)
{
$minHashK = self::getMinHashKForSet($minHashes, $strict);

// Accumulates the union of every input sketch.
$totalHll = new HyperLogLogMinHash(Basic::DEFAULT_HLL, new MinHash($minHashK));

$intersection = array();
$intersection = null;

foreach($minHashes as $hll)
{
$totalHll->union($hll);

$hashK = $hll->getMinHash()->toArray();

// The first sketch seeds the running intersection; later ones narrow it.
$intersection = $intersection ? array_intersect($intersection, $hashK) : $hashK;
$intersection = isset($intersection) ? array_intersect($intersection, $hashK) : $hashK;

// NOTE(review): this returns a plain 0 rather than an array; a caller that
// destructures the result with list() would not receive an array here — confirm.
if(count($intersection) === 0)
{
return 0;
}
}

// Keep only the shared values that are also present in the union's min-hash.
$intersection = array_intersect($intersection, $totalHll->getMinHash()->toArray());

return array(count($intersection), $minHashK, $totalHll);
// NOTE(review): everything below follows an unconditional return and is
// unreachable as written.
$hllUnionCount = $totalHll->count();

/**
* For low numbers there is no need to estimate.
* If we assume an even spread with no hash collisions then the intersection of
* the min hash data structures will be accurate until the size of the union is
* greater than the max size of the min hash data structure.
*/
if($hllUnionCount < $minHashK)
{
return count($intersection);
}

return floor((count($intersection) / $minHashK) * $hllUnionCount);
}

private static function getMinHashKForSet(array $minHashes, $strict)
Expand Down

0 comments on commit 691ee1a

Please sign in to comment.