// simplified file operations and no error handling for clarity
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.Arrays;
import java.util.Random;
import org.apache.datasketches.memory.Memory;
import org.apache.datasketches.quantiles.DoublesSketch;
import org.apache.datasketches.quantiles.UpdateDoublesSketch;
import org.apache.datasketches.tuple.ArrayOfDoublesSetOperationBuilder;
import org.apache.datasketches.tuple.ArrayOfDoublesSketch;
import org.apache.datasketches.tuple.ArrayOfDoublesSketchIterator;
import org.apache.datasketches.tuple.ArrayOfDoublesSketches;
import org.apache.datasketches.tuple.ArrayOfDoublesUnion;
import org.apache.datasketches.tuple.ArrayOfDoublesUpdatableSketch;
import org.apache.datasketches.tuple.ArrayOfDoublesUpdatableSketchBuilder;
// This section generates two sketches whose unique keys partially overlap
// (keys 50000..99999 are fed to both), attaches one random double value drawn
// from a normal distribution to every key, and serializes each sketch into a
// file in compact (not updatable) form.
{
  Random rand = new Random();

  // First sketch: keys 0..99999, one Gaussian value per key.
  ArrayOfDoublesUpdatableSketch sketch1 = new ArrayOfDoublesUpdatableSketchBuilder().build();
  for (int key = 0; key < 100000; key++) {
    sketch1.update(key, new double[] {rand.nextGaussian()});
  }
  // try-with-resources closes the stream even if the write throws.
  try (FileOutputStream out1 = new FileOutputStream("TupleSketch1.bin")) {
    out1.write(sketch1.compact().toByteArray());
  }

  // Second sketch: keys 50000..149999, so half of them repeat keys of the first.
  ArrayOfDoublesUpdatableSketch sketch2 = new ArrayOfDoublesUpdatableSketchBuilder().build();
  for (int key = 50000; key < 150000; key++) {
    sketch2.update(key, new double[] {rand.nextGaussian()});
  }
  try (FileOutputStream out2 = new FileOutputStream("TupleSketch2.bin")) {
    out2.write(sketch2.compact().toByteArray());
  }
}
// This section deserializes the two sketches from their files, computes their
// union, prints the unique-count estimate with its bounds, and then summarizes
// the distribution of the retained values with a quantiles sketch.
{
  // Files.readAllBytes reads the entire file and closes it. Sizing a buffer
  // with InputStream.available() and filling it with a single read(byte[])
  // call is not guaranteed to return the whole file.
  byte[] bytes1 = Files.readAllBytes(Paths.get("TupleSketch1.bin"));
  ArrayOfDoublesSketch sketch1 = ArrayOfDoublesSketches.wrapSketch(Memory.wrap(bytes1));

  byte[] bytes2 = Files.readAllBytes(Paths.get("TupleSketch2.bin"));
  ArrayOfDoublesSketch sketch2 = ArrayOfDoublesSketches.wrapSketch(Memory.wrap(bytes2));

  ArrayOfDoublesUnion union = new ArrayOfDoublesSetOperationBuilder().buildUnion();
  union.union(sketch1);
  union.union(sketch2);
  ArrayOfDoublesSketch unionResult = union.getResult();

  // The bounds are requested at 2 standard deviations, reported below as 95% confidence.
  System.out.println("Union unique count estimate: " + unionResult.getEstimate());
  System.out.println("Union unique count lower bound (95% confidence): " + unionResult.getLowerBound(2));
  System.out.println("Union unique count upper bound (95% confidence): " + unionResult.getUpperBound(2));

  // Let's use Quantiles sketch to analyze the distribution of values.
  // Each retained entry carries a one-element values array; feed that element in.
  UpdateDoublesSketch quantilesSketch = DoublesSketch.builder().build();
  ArrayOfDoublesSketchIterator it = unionResult.iterator();
  while (it.next()) {
    quantilesSketch.update(it.getValues()[0]);
  }

  // Five split points produce the six bins named in the printed header.
  System.out.println("Probability Histogram of values: estimated probability mass in 6 bins:\n"
      + "(-inf, -2), [-2, -1), [-1, 0), [0, 1), [1, 2), [2, +inf)");
  System.out.println(Arrays.toString(quantilesSketch.getPMF(new double[] {-2, -1, 0, 1, 2})));
}
Output (the histogram values vary from run to run because the input values are random):
Union unique count estimate: 149586.73149344584
Union unique count lower bound (95% confidence): 145028.6046846571
Union unique count upper bound (95% confidence): 154287.5017892762
Probability Histogram of values: estimated probability mass in 6 bins:
(-inf, -2), [-2, -1), [-1, 0), [0, 1), [1, 2), [2, +inf)
[0.0390625, 0.1484375, 0.3125, 0.3046875, 0.1484375, 0.046875]