Skip to content

Commit

Permalink
add analysis of exponential distribution and some analysis of error against centroid sizes
Browse files Browse the repository at this point in the history
  • Loading branch information
joe-sfx committed Oct 6, 2020
1 parent 31c79be commit 01bdccc
Show file tree
Hide file tree
Showing 4 changed files with 183 additions and 33 deletions.
6 changes: 3 additions & 3 deletions core/src/main/java/com/tdunning/math/stats/ScaleFunction.java
Original file line number Diff line number Diff line change
Expand Up @@ -123,7 +123,7 @@ public double normalizer(double compression, double n) {
},

/**
* Generates cluster sizes proportional to sqrt(1-q) for q >= 1/2, and uniform cluster sizes for q < 1/2 by gluing
* Generates cluster sizes proportional to sqrt(1-q) for q &gt;= 1/2, and uniform cluster sizes for q &lt; 1/2 by gluing
* the graph of the K_1 function to its tangent line at q=1/2. Changing the split point is possible.
*/
K_1_GLUED {
Expand Down Expand Up @@ -327,7 +327,7 @@ private double Z(double compression, double n) {
},

/**
* Generates cluster sizes proportional to 1-q for q >= 1/2, and uniform cluster sizes for q < 1/2 by gluing
* Generates cluster sizes proportional to 1-q for q &gt;= 1/2, and uniform cluster sizes for q &lt; 1/2 by gluing
* the graph of the K_2 function to its tangent line at q=1/2. Changing the split point is possible.
*/
K_2_GLUED {
Expand Down Expand Up @@ -494,7 +494,7 @@ private double Z(double compression, double n) {
},

/**
* Generates cluster sizes proportional to 1-q for q >= 1/2, and uniform cluster sizes for q < 1/2 by gluing
* Generates cluster sizes proportional to 1-q for q &gt;= 1/2, and uniform cluster sizes for q &lt; 1/2 by gluing
* the graph of the K_3 function to its tangent line at q=1/2.
*/
K_3_GLUED {
Expand Down
72 changes: 60 additions & 12 deletions core/src/test/java/com/tdunning/math/stats/MergingDigestTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -17,19 +17,26 @@

package com.tdunning.math.stats;

import com.carrotsearch.randomizedtesting.annotations.Seed;
import java.io.FileWriter;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Random;

import org.apache.commons.math3.util.Pair;
import org.apache.mahout.common.RandomUtils;
import org.apache.mahout.math.jet.random.AbstractContinousDistribution;
import org.apache.mahout.math.jet.random.Exponential;
import org.apache.mahout.math.jet.random.Uniform;
import org.junit.Before;
import org.junit.BeforeClass;
import org.junit.Test;

import java.io.FileWriter;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.util.*;
import com.carrotsearch.randomizedtesting.annotations.Seed;

//to freeze the tests with a particular seed, put the seed on the next line
//@Seed("84527677CF03B566:A6FF596BDDB2D59D")
Expand Down Expand Up @@ -59,10 +66,25 @@ protected TDigest fromBytes(ByteBuffer bytes) {
return MergingDigest.fromBytes(bytes);
}


@Test
public void writeUniformAsymmetricScaleFunctionResults() throws Exception {
    // Writes per-quantile error CSVs for the uniform input distribution.
    // Let any failure propagate: the previous catch-and-printStackTrace made
    // the test pass silently even when writing the results failed.
    writeAsymmetricScaleFunctionResults(Distribution.UNIFORM);
}

@Test
public void writeAsymmetricScaleFunctionResults() {
public void writeExponentialAsymmetricScaleFunctionResults() throws Exception {
    // Writes per-quantile error CSVs for the exponential input distribution.
    // Let any failure propagate: swallowing the exception with
    // printStackTrace() made this test pass even when the run failed.
    writeAsymmetricScaleFunctionResults(Distribution.EXPONENTIAL);
}

private void writeAsymmetricScaleFunctionResults(Distribution distribution) throws Exception {

List<ScaleFunction> scaleFcns = Arrays.asList(ScaleFunction.K_0, ScaleFunction.K_1,
ScaleFunction.K_2, ScaleFunction.K_3, ScaleFunction.K_1_GLUED,
Expand All @@ -79,12 +101,12 @@ public void writeAsymmetricScaleFunctionResults() {
digestParams.put(fcn.toString() + "_USUAL", new Pair<>(fcn, false));
}
}
writeSeveralDigestUniformResults(digestParams, numTrials, "../docs/asymmetric/data/merging/");

writeSeveralDigestUniformResults(digestParams, numTrials, distribution,
"../docs/asymmetric/data/merging/" + distribution.name() + "/");
}

public void writeSeveralDigestUniformResults(Map<String, Pair<ScaleFunction, Boolean>> digestParams, int numTrials,
String writeLocation) {
private void writeSeveralDigestUniformResults(Map<String, Pair<ScaleFunction, Boolean>> digestParams,
int numTrials, Distribution distribution, String writeLocation) throws Exception {

int trialSize = 1_000_000;
double compression = 100;
Expand All @@ -93,8 +115,12 @@ public void writeSeveralDigestUniformResults(Map<String, Pair<ScaleFunction, Boo

Map<String, List<Integer>> centroidCounts= new HashMap<>();

Map<String, List<List<Integer>>> centroidSequences= new HashMap<>();


for (Map.Entry<String, Pair<ScaleFunction, Boolean>> entry : digestParams.entrySet()) {
centroidCounts.put(entry.getKey(), new ArrayList<Integer>());
centroidSequences.put(entry.getKey(), new ArrayList<List<Integer>>());
try {
Map<Double, List<String>> records = new HashMap<>();
for (double q : quants) {
Expand All @@ -105,7 +131,12 @@ public void writeSeveralDigestUniformResults(Map<String, Pair<ScaleFunction, Boo
digest.setScaleFunction(entry.getValue().getFirst());
digest.setUseAlternatingSort(entry.getValue().getSecond());
Random rand = new Random();
AbstractContinousDistribution gen = new Uniform(50, 51, rand);
AbstractContinousDistribution gen;
if (distribution.equals(Distribution.UNIFORM)) {
gen = new Uniform(50, 51, rand);
} else if (distribution.equals(Distribution.EXPONENTIAL)) {
gen = new Exponential(5, rand);
} else throw new Exception("distribution not specified");
double[] data = new double[trialSize];
for (int i = 0; i < trialSize; i++) {
data[i] = gen.nextDouble();
Expand All @@ -121,6 +152,12 @@ public void writeSeveralDigestUniformResults(Map<String, Pair<ScaleFunction, Boo
String.valueOf(Math.abs(q1 - q2) / Math.min(q, 1 - q)) + "\n");
}
centroidCounts.get(entry.getKey()).add(digest.centroids().size());

List<Integer> seq = new ArrayList<>();
for (Centroid c : digest.centroids()) {
seq.add(c.count());
}
centroidSequences.get(entry.getKey()).add(seq);
}
for (double q : quants) {
FileWriter csvWriter = new FileWriter(writeLocation + entry.getKey() + "_" + String.valueOf(q) + ".csv");
Expand All @@ -140,6 +177,17 @@ public void writeSeveralDigestUniformResults(Map<String, Pair<ScaleFunction, Boo
csvWriter.flush();
csvWriter.close();


FileWriter csvWriter2 = new FileWriter(writeLocation + entry.getKey() + "_centroid_sizes.csv");
for (List<Integer> ct : centroidSequences.get(entry.getKey())) {
for (Integer c : ct) {
csvWriter2.append(c.toString()).append(",");
}
csvWriter2.append("\n");
}
csvWriter2.flush();
csvWriter2.close();

} catch (IOException e) {
System.out.println(e.toString());
return;
Expand Down
54 changes: 50 additions & 4 deletions core/src/test/java/com/tdunning/math/stats/TDigestTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,10 @@
package com.tdunning.math.stats;

import com.google.common.collect.Lists;

import org.apache.mahout.common.RandomUtils;
import org.apache.mahout.math.jet.random.AbstractContinousDistribution;
import org.apache.mahout.math.jet.random.Exponential;
import org.apache.mahout.math.jet.random.Gamma;
import org.apache.mahout.math.jet.random.Normal;
import org.apache.mahout.math.jet.random.Uniform;
Expand All @@ -31,7 +33,6 @@
import java.util.concurrent.*;
import java.util.concurrent.atomic.AtomicInteger;


/**
* Base test case for TDigests, just extend this class and implement the abstract methods.
*/
Expand All @@ -44,6 +45,8 @@ public abstract class TDigestTest extends AbstractTest {

private static String digestName;

/** Input-data distributions available to the CSV-writing analysis helpers. */
protected enum Distribution { UNIFORM, EXPONENTIAL }

@BeforeClass
public static void freezeSeed() {
RandomUtils.useTestSeed();
Expand Down Expand Up @@ -129,6 +132,23 @@ public void offsetUniform() {

@Test
public void writeUniformResultsWithCompression() throws Exception {
    // Sweeps compressions over the uniform distribution and writes error CSVs.
    // Propagate failures so the test actually fails instead of printing a
    // stack trace and passing.
    writeResultsWithCompression(Distribution.UNIFORM);
}

@Test
public void writeExponentialResultsWithCompression() throws Exception {
    // Sweeps compressions over the exponential distribution and writes error CSVs.
    // Propagate failures so the test actually fails instead of printing a
    // stack trace and passing.
    writeResultsWithCompression(Distribution.EXPONENTIAL);
}

private void writeResultsWithCompression(Distribution distribution) throws Exception {

List<ScaleFunction> scaleFcns = Arrays.asList(ScaleFunction.K_0, ScaleFunction.K_1,
ScaleFunction.K_2, ScaleFunction.K_3, ScaleFunction.K_1_GLUED,
Expand All @@ -138,8 +158,12 @@ public void writeUniformResultsWithCompression() {

Map<ScaleFunction, List<Integer>> centroidCounts= new HashMap<>();

Map<ScaleFunction, List<List<Integer>>> centroidSequences= new HashMap<>();

for (ScaleFunction scaleFcn : scaleFcns) {
centroidCounts.put(scaleFcn, new ArrayList<Integer>());
centroidSequences.put(scaleFcn, new ArrayList<List<Integer>>());

try {
Map<Double, List<String>> records = new HashMap<>();
double[] quants = new double[]{0.00001, 0.0001, 0.001, 0.01, 0.1,
Expand All @@ -152,7 +176,12 @@ public void writeUniformResultsWithCompression() {
TDigest digest = factory(compression).create();
digest.setScaleFunction(scaleFcn);
Random rand = new Random();
AbstractContinousDistribution gen = new Uniform(50, 51, rand);
AbstractContinousDistribution gen;
if (distribution.equals(Distribution.UNIFORM)) {
gen = new Uniform(50, 51, rand);
} else if (distribution.equals(Distribution.EXPONENTIAL)) {
gen = new Exponential(5, rand);
} else throw new Exception("distribution not specified");
double[] data = new double[trialSize];
for (int i = 0; i < trialSize; i++) {
data[i] = gen.nextDouble();
Expand All @@ -168,6 +197,12 @@ public void writeUniformResultsWithCompression() {
String.valueOf(Math.abs(q1 - q2) / Math.min(q, 1 - q)) + "\n");
}
centroidCounts.get(scaleFcn).add(digest.centroids().size());

List<Integer> seq = new ArrayList<>();
for (Centroid c : digest.centroids()) {
seq.add(c.count());
}
centroidSequences.get(scaleFcn).add(seq);
}
}

Expand All @@ -180,7 +215,7 @@ public void writeUniformResultsWithCompression() {
}

for (double q : quants) {
FileWriter csvWriter = new FileWriter("../docs/asymmetric/data/tree/" + fcnName + "_" + String.valueOf(q) + ".csv");
FileWriter csvWriter = new FileWriter("../docs/asymmetric/data/tree/" + distribution.name() + "/" + fcnName + "_" + String.valueOf(q) + ".csv");
csvWriter.append("error_q,norm_error_q\n");
for (String obs : records.get(q)) {
csvWriter.append(obs);
Expand All @@ -189,14 +224,25 @@ public void writeUniformResultsWithCompression() {
csvWriter.close();
}

FileWriter csvWriter = new FileWriter("../docs/asymmetric/data/tree/" + fcnName + "_centroid_counts.csv");
FileWriter csvWriter = new FileWriter("../docs/asymmetric/data/tree/" + distribution.name() + "/" + fcnName + "_centroid_counts.csv");
csvWriter.append("centroid_count\n");
for (Integer ct : centroidCounts.get(scaleFcn)) {
csvWriter.append(ct.toString()).append("\n");
}
csvWriter.flush();
csvWriter.close();


FileWriter csvWriter2 = new FileWriter("../docs/asymmetric/data/tree/" + distribution.name() + "/" + fcnName + "_centroid_sizes.csv");
for (List<Integer> ct : centroidSequences.get(scaleFcn)) {
for (Integer c : ct) {
csvWriter2.append(c.toString()).append(",");
}
csvWriter2.append("\n");
}
csvWriter2.flush();
csvWriter2.close();

} catch (IOException e) {
return;
}
Expand Down
Loading

0 comments on commit 01bdccc

Please sign in to comment.