diff --git a/core/src/main/java/com/tdunning/math/stats/ScaleFunction.java b/core/src/main/java/com/tdunning/math/stats/ScaleFunction.java index 8ffc97c3..4661b41f 100644 --- a/core/src/main/java/com/tdunning/math/stats/ScaleFunction.java +++ b/core/src/main/java/com/tdunning/math/stats/ScaleFunction.java @@ -123,7 +123,7 @@ public double normalizer(double compression, double n) { }, /** - * Generates cluster sizes proportional to sqrt(1-q) for q >= 1/2, and uniform cluster sizes for q < 1/2 by gluing + * Generates cluster sizes proportional to sqrt(1-q) for q geq 1/2, and uniform cluster sizes for q lt 1/2 by gluing * the graph of the K_1 function to its tangent line at q=1/2. Changing the split point is possible. */ K_1_GLUED { @@ -327,7 +327,7 @@ private double Z(double compression, double n) { }, /** - * Generates cluster sizes proportional to 1-q for q >= 1/2, and uniform cluster sizes for q < 1/2 by gluing + * Generates cluster sizes proportional to 1-q for q geq 1/2, and uniform cluster sizes for q lt 1/2 by gluing * the graph of the K_2 function to its tangent line at q=1/2. Changing the split point is possible. */ K_2_GLUED { @@ -494,7 +494,7 @@ private double Z(double compression, double n) { }, /** - * Generates cluster sizes proportional to 1-q for q >= 1/2, and uniform cluster sizes for q < 1/2 by gluing + * Generates cluster sizes proportional to 1-q for q geq 1/2, and uniform cluster sizes for q lt 1/2 by gluing * the graph of the K_3 function to its tangent line at q=1/2. */ K_3_GLUED { diff --git a/core/src/test/java/com/tdunning/math/stats/MergingDigestTest.java b/core/src/test/java/com/tdunning/math/stats/MergingDigestTest.java index 21fe6090..1bf10651 100644 --- a/core/src/test/java/com/tdunning/math/stats/MergingDigestTest.java +++ b/core/src/test/java/com/tdunning/math/stats/MergingDigestTest.java @@ -17,19 +17,26 @@ package com.tdunning.math.stats; -import com.carrotsearch.randomizedtesting.annotations.Seed; +import java.io.FileWriter; +import java.io.IOException; +import java.nio.ByteBuffer; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.Random; + import org.apache.commons.math3.util.Pair; import org.apache.mahout.common.RandomUtils; import org.apache.mahout.math.jet.random.AbstractContinousDistribution; +import org.apache.mahout.math.jet.random.Exponential; import org.apache.mahout.math.jet.random.Uniform; import org.junit.Before; import org.junit.BeforeClass; import org.junit.Test; -import java.io.FileWriter; -import java.io.IOException; -import java.nio.ByteBuffer; -import java.util.*; +import com.carrotsearch.randomizedtesting.annotations.Seed; //to freeze the tests with a particular seed, put the seed on the next line //@Seed("84527677CF03B566:A6FF596BDDB2D59D") @@ -59,10 +66,25 @@ protected TDigest fromBytes(ByteBuffer bytes) { return MergingDigest.fromBytes(bytes); } - + @Test + public void writeUniformAsymmetricScaleFunctionResults() { + try { + writeAsymmetricScaleFunctionResults(Distribution.UNIFORM); + } catch (Exception e) { + e.printStackTrace(); + } + } @Test - public void writeAsymmetricScaleFunctionResults() { + public void writeExponentialAsymmetricScaleFunctionResults() { + try { + writeAsymmetricScaleFunctionResults(Distribution.EXPONENTIAL); + } catch (Exception e) { + e.printStackTrace(); + } + } + + private void writeAsymmetricScaleFunctionResults(Distribution distribution) throws Exception { List scaleFcns = Arrays.asList(ScaleFunction.K_0, ScaleFunction.K_1, ScaleFunction.K_2, ScaleFunction.K_3, ScaleFunction.K_1_GLUED, @@ -79,12 +101,12 @@ public void writeAsymmetricScaleFunctionResults() { digestParams.put(fcn.toString() + "_USUAL", new Pair<>(fcn, false)); } } - writeSeveralDigestUniformResults(digestParams, numTrials, "../docs/asymmetric/data/merging/"); - + writeSeveralDigestUniformResults(digestParams, numTrials, distribution, + "../docs/asymmetric/data/merging/" + distribution.name() + "/"); } - public void writeSeveralDigestUniformResults(Map> digestParams, int numTrials, - String writeLocation) { + private void writeSeveralDigestUniformResults(Map> digestParams, + int numTrials, Distribution distribution, String writeLocation) throws Exception { int trialSize = 1_000_000; double compression = 100; @@ -93,8 +115,12 @@ public void writeSeveralDigestUniformResults(Map> centroidCounts= new HashMap<>(); + Map>> centroidSequences= new HashMap<>(); + + for (Map.Entry> entry : digestParams.entrySet()) { centroidCounts.put(entry.getKey(), new ArrayList()); + centroidSequences.put(entry.getKey(), new ArrayList>()); try { Map> records = new HashMap<>(); for (double q : quants) { @@ -105,7 +131,12 @@ public void writeSeveralDigestUniformResults(Map seq = new ArrayList<>(); + for (Centroid c : digest.centroids()) { + seq.add(c.count()); + } + centroidSequences.get(entry.getKey()).add(seq); } for (double q : quants) { FileWriter csvWriter = new FileWriter(writeLocation + entry.getKey() + "_" + String.valueOf(q) + ".csv"); @@ -140,6 +177,17 @@ public void writeSeveralDigestUniformResults(Map ct : centroidSequences.get(entry.getKey())) { + for (Integer c : ct) { + csvWriter2.append(c.toString()).append(","); + } + csvWriter2.append("\n"); + } + csvWriter2.flush(); + csvWriter2.close(); + } catch (IOException e) { System.out.println(e.toString()); return; diff --git a/core/src/test/java/com/tdunning/math/stats/TDigestTest.java b/core/src/test/java/com/tdunning/math/stats/TDigestTest.java index 51066c3b..a16978c5 100644 --- a/core/src/test/java/com/tdunning/math/stats/TDigestTest.java +++ b/core/src/test/java/com/tdunning/math/stats/TDigestTest.java @@ -18,8 +18,10 @@ package com.tdunning.math.stats; import com.google.common.collect.Lists; + import org.apache.mahout.common.RandomUtils; import org.apache.mahout.math.jet.random.AbstractContinousDistribution; +import org.apache.mahout.math.jet.random.Exponential; import org.apache.mahout.math.jet.random.Gamma; import org.apache.mahout.math.jet.random.Normal; import org.apache.mahout.math.jet.random.Uniform; @@ -31,7 +33,6 @@ import java.util.concurrent.*; import java.util.concurrent.atomic.AtomicInteger; - /** * Base test case for TDigests, just extend this class and implement the abstract methods. */ @@ -44,6 +45,8 @@ public abstract class TDigestTest extends AbstractTest { private static String digestName; + protected enum Distribution {UNIFORM, EXPONENTIAL}; + @BeforeClass public static void freezeSeed() { RandomUtils.useTestSeed(); @@ -129,6 +132,23 @@ public void offsetUniform() { @Test public void writeUniformResultsWithCompression() { + try { + writeResultsWithCompression(Distribution.UNIFORM); + } catch (Exception e) { + e.printStackTrace(); + } + } + + @Test + public void writeExponentialResultsWithCompression() { + try { + writeResultsWithCompression(Distribution.EXPONENTIAL); + } catch (Exception e) { + e.printStackTrace(); + } + } + + private void writeResultsWithCompression(Distribution distribution) throws Exception { List scaleFcns = Arrays.asList(ScaleFunction.K_0, ScaleFunction.K_1, ScaleFunction.K_2, ScaleFunction.K_3, ScaleFunction.K_1_GLUED, @@ -138,8 +158,12 @@ public void writeUniformResultsWithCompression() { Map> centroidCounts= new HashMap<>(); + Map>> centroidSequences= new HashMap<>(); + for (ScaleFunction scaleFcn : scaleFcns) { centroidCounts.put(scaleFcn, new ArrayList()); + centroidSequences.put(scaleFcn, new ArrayList>()); + try { Map> records = new HashMap<>(); double[] quants = new double[]{0.00001, 0.0001, 0.001, 0.01, 0.1, @@ -152,7 +176,12 @@ public void writeUniformResultsWithCompression() { TDigest digest = factory(compression).create(); digest.setScaleFunction(scaleFcn); Random rand = new Random(); - AbstractContinousDistribution gen = new Uniform(50, 51, rand); + AbstractContinousDistribution gen; + if (distribution.equals(Distribution.UNIFORM)) { + gen = new Uniform(50, 51, rand); + } else if (distribution.equals(Distribution.EXPONENTIAL)) { + gen = new Exponential(5, rand); + } else throw new Exception("distribution not specified"); double[] data = new double[trialSize]; for (int i = 0; i < trialSize; i++) { data[i] = gen.nextDouble(); @@ -168,6 +197,12 @@ public void writeUniformResultsWithCompression() { String.valueOf(Math.abs(q1 - q2) / Math.min(q, 1 - q)) + "\n"); } centroidCounts.get(scaleFcn).add(digest.centroids().size()); + + List seq = new ArrayList<>(); + for (Centroid c : digest.centroids()) { + seq.add(c.count()); + } + centroidSequences.get(scaleFcn).add(seq); } } @@ -180,7 +215,7 @@ public void writeUniformResultsWithCompression() { } for (double q : quants) { - FileWriter csvWriter = new FileWriter("../docs/asymmetric/data/tree/" + fcnName + "_" + String.valueOf(q) + ".csv"); + FileWriter csvWriter = new FileWriter("../docs/asymmetric/data/tree/" + distribution.name() + "/" + fcnName + "_" + String.valueOf(q) + ".csv"); csvWriter.append("error_q,norm_error_q\n"); for (String obs : records.get(q)) { csvWriter.append(obs); @@ -189,7 +224,7 @@ public void writeUniformResultsWithCompression() { csvWriter.close(); } - FileWriter csvWriter = new FileWriter("../docs/asymmetric/data/tree/" + fcnName + "_centroid_counts.csv"); + FileWriter csvWriter = new FileWriter("../docs/asymmetric/data/tree/" + distribution.name() + "/" + fcnName + "_centroid_counts.csv"); csvWriter.append("centroid_count\n"); for (Integer ct : centroidCounts.get(scaleFcn)) { csvWriter.append(ct.toString()).append("\n"); @@ -197,6 +232,17 @@ public void writeUniformResultsWithCompression() { csvWriter.flush(); csvWriter.close(); + + FileWriter csvWriter2 = new FileWriter("../docs/asymmetric/data/tree/" + distribution.name() + "/" + fcnName + "_centroid_sizes.csv"); + for (List ct : centroidSequences.get(scaleFcn)) { + for (Integer c : ct) { + csvWriter2.append(c.toString()).append(","); + } + csvWriter2.append("\n"); + } + csvWriter2.flush(); + csvWriter2.close(); + } catch (IOException e) { return; } diff --git a/docs/asymmetric/generate_plots.py b/docs/asymmetric/generate_plots.py index 7fab2657..39b59e0b 100644 --- a/docs/asymmetric/generate_plots.py +++ b/docs/asymmetric/generate_plots.py @@ -2,12 +2,11 @@ import pandas as pd import matplotlib.pyplot as plt -in_tree = "data/tree/" -out_tree = "plots/tree/" - -in_merging = "data/merging/" -out_merging = "plots/merging/" +in_prefix = "data" +out_prefix = "plots" +implementations = ["tree", "merging"] +distributions = ["UNIFORM", "EXPONENTIAL"] scale_function_prefixes = ["K_{0}_{1}".format(x, y) for x in ["1", "2", "3"] for y in ["USUAL", "GLUED"]] + ["K_0_USUAL"] + ["K_QUADRATIC"] @@ -19,6 +18,7 @@ def clean_string(s): cc_suffix = "_centroid_counts.csv" +cs_suffix = "_centroid_sizes.csv" axis_labels = {'.99': 2, '0.99': 2, '1.0E-5': -5, '0.00001': -5, @@ -33,24 +33,26 @@ def clean_string(s): '.5': 0, '0.5': 0} -def generate_figures(prefixes=scale_function_prefixes, save=False, outfilename="t_digest_figs", - location=in_tree, implementation=""): +def generate_figures(prefixes=scale_function_prefixes, save=False, outfilename="", + location="", implementation=""): data = {} for prefix in prefixes: data[prefix] = {} - filenames = filter(lambda x: x.startswith(prefix) and not x.endswith(cc_suffix), - os.listdir(location)) + filenames = filter( + lambda x: x.startswith(prefix) and not x.endswith(cc_suffix) and not x.endswith( + cs_suffix), + os.listdir(location)) for filename in filenames: value = filename.replace(prefix + "_", "").replace(".csv", "") with open(location + filename, 'r') as f: data[prefix][value] = pd.read_csv(f) - centroid_data = {} + centroid_count_data = {} centroid_counts = map(lambda x: x + cc_suffix, prefixes) for cc_name in centroid_counts: with open(location + cc_name, 'r') as f: - centroid_data[cc_name.replace(cc_suffix, "")] = pd.read_csv(f) + centroid_count_data[cc_name.replace(cc_suffix, "")] = pd.read_csv(f) fig, ax = plt.subplots(len(prefixes), 3, squeeze=False) fig.set_figheight(4 * len(prefixes)) @@ -67,13 +69,15 @@ def generate_figures(prefixes=scale_function_prefixes, save=False, outfilename=" ax[prefixes.index(prefix), 0].set_title(clean_string(prefix) + implementation + " error") ax[prefixes.index(prefix), 0].boxplot(error_q_list, positions=pos, whis=[5, 95], showfliers=False) + ax[prefixes.index(prefix), 0].set_yscale('log') ax[prefixes.index(prefix), 1].set_title( clean_string(prefix) + implementation + " norm_error") ax[prefixes.index(prefix), 1].boxplot(norm_error_q_list, positions=pos, whis=[5, 95], showfliers=False) + ax[prefixes.index(prefix), 1].set_yscale('log') ax[prefixes.index(prefix), 2].set_title( clean_string(prefix) + implementation + " " + cc_suffix.replace(".csv", "").lstrip("_")) - ax[prefixes.index(prefix), 2].hist(centroid_data[prefix]["centroid_count"], range=[5, 95], + ax[prefixes.index(prefix), 2].hist(centroid_count_data[prefix]["centroid_count"], range=[5, 95], bins=30) fig.subplots_adjust(left=0.08, right=0.98, bottom=0.05, top=0.9, @@ -85,8 +89,54 @@ def generate_figures(prefixes=scale_function_prefixes, save=False, outfilename=" plt.show() -params = [(out_tree, in_tree, " (tree)"), (out_merging, in_merging, " (merging)")] +def generate_size_figures(prefix="K_0_USUAL", save=False, outfilename="", value='0.01', + location="", centroid_index=0): + data = {} + centroid_sizes_data = {} + + for impl in implementations: + data[impl] = {} + centroid_sizes_data[impl] = {} + for dist in distributions: + data[impl][dist]= {} + centroid_sizes_data[impl][dist] = {} + filename = "{0}_{1}.csv".format(prefix, value) + with open("{0}/{1}/{2}".format(location, impl, dist) + "/" + filename, 'r') as f: + data[impl][dist][value] = pd.read_csv(f) + with open("{0}/{1}/{2}".format(location, impl, dist) + "/" + prefix + cs_suffix, 'r') as f: + _d = f.readlines() + centroid_sizes_data[impl][dist][prefix] = [[int(x) for x in y.rstrip(',\n').split(',')] for y in _d] + + fig, ax = plt.subplots(len(implementations), len(distributions), squeeze=False) + fig.set_figheight(15) + fig.set_figwidth(15) + for impl in implementations: + for dist in distributions: + error_q_list, norm_error_q_list = [], [] + pos = [] + for v in data[impl][dist]: + pos.append(axis_labels[v]) + df = data[impl][dist][v] + error_q_list.append(df['error_q']) + norm_error_q_list.append(df['norm_error_q']) + title = "{0}, {1}, {2}, q={3}, index {4}".format(clean_string(prefix), impl, dist.lower(), value, str(centroid_index)) + ax[implementations.index(impl), distributions.index(dist)].set_title(title) + _a, b = centroid_sizes_data[impl][dist][prefix], df['norm_error_q'] + a = [i[centroid_index] for i in _a] + ax[implementations.index(impl), distributions.index(dist)].scatter(a, b) + + fig.subplots_adjust(left=0.08, right=0.98, bottom=0.05, top=0.9, + hspace=0.4, wspace=0.3) + + if save is True: + plt.savefig(outfilename) + elif save is False: + plt.show() + + +params = [ ("{0}/{1}/{2}/".format(out_prefix, impl, dist), "{0}/{1}/{2}/".format(in_prefix, impl, dist), + " ({0}, {1})".format(impl, dist.lower())) for impl in implementations for dist in distributions] def main(): for a, b, c in params: @@ -98,7 +148,13 @@ def main(): outfilename="{}t_digest_figs_K_2".format(a), location=b, implementation=c) generate_figures(prefixes=["K_3_{}".format(y) for y in ["USUAL", "GLUED"]], save=True, outfilename="{}t_digest_figs_K_3".format(a), location=b, implementation=c) - + for centroid_index, v in [(-1, '0.99'), (-1, '0.999'), (0, '0.01')]: + fcn = 'K_0_USUAL' + outfile = "{0}/size/{1}_{2}_{3}.png".format(out_prefix, fcn, v, str(centroid_index)) + generate_size_figures(location=in_prefix + '/', prefix=fcn, value=v, centroid_index=centroid_index, + outfilename=outfile, save=True) + generate_size_figures(location=in_prefix + '/', prefix=fcn, value=v, centroid_index=centroid_index, + outfilename=outfile, save=True) if __name__ == "__main__": main()