reduce memory usage when computing the cumulative distribution

Before: To compute the cumulative distribution we appended every
duration to a LongList. This requires O(n) memory, where n is the
number of values.

Now: We store each duration together with its number of occurrences in
a LongLongHashMap. This can reduce the memory requirements whenever
durations repeat, which they often do: a lot of durations are 0, 1, or
2 milliseconds. In the worst case every duration is distinct, and this
solution doubles the memory usage.
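A minimal sketch of the counting idea, using java.util.HashMap as a
boxed stand-in for the primitive LongLongHashMap (class and method
names below are illustrative, not part of this commit):

import java.util.HashMap;
import java.util.Map;

public class DurationCounter {
    // duration in milliseconds -> number of occurrences
    private final Map<Long, Long> counts = new HashMap<>();
    private long totalValues = 0;

    public void add(final long durationMillis) {
        // one map entry per distinct duration instead of one list slot per value
        counts.merge(durationMillis, 1L, Long::sum);
        totalValues++;
    }

    public long total() {
        return totalValues;
    }
}

The savings come from paying per distinct key instead of per value;
the boxed map shown here would not actually save memory, which is why
the primitive LongLongHashMap is used in the real code.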

Future: We currently store durations with millisecond precision. We
don't have to: we cannot draw 100 million distinct values on the
y-axis of an image that is only 1000px tall.
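One possible shape for that future change, sketched under the
assumption of fixed-width buckets (the 10 ms width is illustrative;
the commit does not decide on one):

public class DurationQuantizer {

    // with ~1000 vertical pixels only ~1000 distinct y-values are
    // drawable, so quantizing durations caps the number of map keys
    private static final long BUCKET_MILLIS = 10;

    public static long toBucket(final long durationMillis) {
        return (durationMillis / BUCKET_MILLIS) * BUCKET_MILLIS;
    }

    public static void main(final String[] args) {
        System.out.println(toBucket(42)); // 40
        System.out.println(toBucket(49)); // 40
    }
}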
2019-09-07 18:31:18 +02:00
parent 0e9e2cd53a
commit 162ef1626c
3 changed files with 72 additions and 18 deletions

@@ -32,7 +32,7 @@ ext {
     lib_log4j2_core = "org.apache.logging.log4j:log4j-core:${version_log4j2}"
     lib_log4j2_slf4j_impl = "org.apache.logging.log4j:log4j-slf4j-impl:${version_log4j2}"
-    lib_primitive_collections='org.lucares:primitiveCollections:0.1.20190819195450'
+    lib_primitive_collections='org.lucares:primitiveCollections:0.1.20190907180014'
     lib_spring_boot_log4j2="org.springframework.boot:spring-boot-starter-log4j2:${version_spring}"
     lib_spring_boot_mustache="org.springframework.boot:spring-boot-starter-mustache:${version_spring}"

@@ -8,14 +8,68 @@ import java.io.OutputStreamWriter;
 import java.io.Writer;
 import java.nio.charset.StandardCharsets;
 import java.nio.file.Path;
+import java.util.LinkedHashMap;
 
-import org.lucares.collections.LongList;
+import org.lucares.collections.LongLongConsumer;
+import org.lucares.collections.LongLongHashMap;
 
 public class CumulativeDistributionCustomAggregator implements CustomAggregator {
 
     private final static int POINTS = 500;
 
-    private final LongList values = new LongList();
+    private static final class ToPercentiles implements LongLongConsumer {
+
+        private long cumulativeCount = 0;
+        private long maxValue = 0;
+        private final LinkedHashMap<Double, Long> percentiles = new LinkedHashMap<>(POINTS);
+        private final double stepSize;
+        private double lastPercentile;
+        private double nextPercentile;
+        private final long totalValues;
+
+        public ToPercentiles(final long totalValues) {
+            this.totalValues = totalValues;
+            stepSize = 100.0 / POINTS;
+            nextPercentile = stepSize;
+        }
+
+        @Override
+        public void accept(final long duration, final long count) {
+            maxValue = duration;
+            cumulativeCount += count;
+
+            final double newPercentile = cumulativeCount * 100.0 / totalValues;
+            if (newPercentile >= nextPercentile) {
+                double currentPercentile = lastPercentile + stepSize;
+                while (currentPercentile <= newPercentile) {
+                    percentiles.put(currentPercentile, duration);
+                    currentPercentile += stepSize;
+                }
+                nextPercentile = currentPercentile;
+                lastPercentile = currentPercentile - stepSize;
+            }
+        }
+
+        public long getMaxValue() {
+            return maxValue;
+        }
+
+        public LinkedHashMap<Double, Long> getPercentiles() {
+            return percentiles;
+        }
+    }
+
+    // the rather large initial capacity should prevent too many grow&re-hash phases
+    private final LongLongHashMap map = new LongLongHashMap(5_000, 0.75);
+    private long totalValues = 0;
 
     private final Path tmpDir;
@@ -25,7 +79,8 @@ public class CumulativeDistributionCustomAggregator implements CustomAggregator
 
     @Override
     public void addValue(final long epochMilli, final long value) {
-        values.add((int) value);
+        map.compute(value, 0, l -> l + 1);
+        totalValues++;
     }
 
     @Override
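The compute call above belongs to the custom primitiveCollections API.
Judging from this usage alone, its second argument appears to act as
the value for absent keys; a boxed java.util.Map equivalent under that
assumption:

import java.util.HashMap;
import java.util.Map;

public class ComputeSemantics {
    public static void main(final String[] args) {
        final Map<Long, Long> counts = new HashMap<>();
        // presumed equivalent of map.compute(value, 0, l -> l + 1):
        // treat an absent key as 0, then apply the increment
        final long value = 7; // hypothetical duration
        counts.compute(value, (k, v) -> (v == null ? 0L : v) + 1);
        System.out.println(counts); // {7=1}
    }
}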
@@ -33,26 +88,25 @@ public class CumulativeDistributionCustomAggregator implements CustomAggregator
         final char separator = ',';
         final char newline = '\n';
 
-        values.parallelSort();
+        final ToPercentiles toPercentiles = new ToPercentiles(totalValues);
+        map.forEachOrdered(toPercentiles);
 
-        final LongList percentiles = new LongList(POINTS);
         final File dataFile = File.createTempFile("data", ".dat", tmpDir.toFile());
         try (final Writer output = new BufferedWriter(
                 new OutputStreamWriter(new FileOutputStream(dataFile), StandardCharsets.US_ASCII));) {
             final StringBuilder data = new StringBuilder();
-            if (values.size() > 0) {
+            if (map.size() > 0) {
                 // compute the percentiles
-                for (int i = 0; i < POINTS; i++) {
-                    data.append(i * (100 / (double) POINTS));
-                    data.append(separator);
-                    final long percentile = values.get((int) Math.floor(values.size() / ((double) POINTS) * i));
-                    data.append(percentile);
-                    data.append(newline);
-                    percentiles.add(percentile);
-                }
+                toPercentiles.getPercentiles().forEach((percentile, value) -> {
+                    data.append(percentile);
+                    data.append(separator);
+                    data.append(value);
+                    data.append(newline);
+                });
 
-                final long maxValue = values.get(values.size() - 1);
+                final long maxValue = toPercentiles.getMaxValue();
                 data.append(100);
                 data.append(separator);
                 data.append(maxValue);
@@ -62,7 +116,7 @@ public class CumulativeDistributionCustomAggregator implements CustomAggregator
         }
 
-        final String title = String.format("percentiles");
+        final String title = String.format("cumulative distribution");
         return new AggregatedData(title, dataFile);
     }

@@ -119,7 +119,7 @@ public class PlotSettings {
     public DateTimeRange dateRange() {
         final String[] startEnd = dateRangeAsString.split(Pattern.quote(" - "));
-        Preconditions.checkEqual(startEnd, 2, "invalid date range: ''{0}''", dateRangeAsString);
+        Preconditions.checkEqual(startEnd.length, 2, "invalid date range: ''{0}''", dateRangeAsString);
         final OffsetDateTime startDate = LocalDateTime.parse(startEnd[0], DATE_FORMAT).atOffset(ZoneOffset.UTC);
         final OffsetDateTime endDate = LocalDateTime.parse(startEnd[1], DATE_FORMAT).atOffset(ZoneOffset.UTC);
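This last hunk is an unrelated fix: checkEqual previously received the
array itself where its length was meant. A minimal reproduction of the
parsing path, assuming an ISO-like DATE_FORMAT (the real pattern lives
in PlotSettings and may differ):

import java.time.LocalDateTime;
import java.time.OffsetDateTime;
import java.time.ZoneOffset;
import java.time.format.DateTimeFormatter;
import java.util.regex.Pattern;

public class DateRangeParse {
    // assumption: the real DATE_FORMAT may use a different pattern
    private static final DateTimeFormatter DATE_FORMAT =
            DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss");

    public static void main(final String[] args) {
        final String range = "2019-09-01 00:00:00 - 2019-09-07 18:31:18";
        final String[] startEnd = range.split(Pattern.quote(" - "));
        if (startEnd.length != 2) { // the fixed check compares the length, not the array
            throw new IllegalArgumentException("invalid date range: '" + range + "'");
        }
        final OffsetDateTime start = LocalDateTime.parse(startEnd[0], DATE_FORMAT).atOffset(ZoneOffset.UTC);
        final OffsetDateTime end = LocalDateTime.parse(startEnd[1], DATE_FORMAT).atOffset(ZoneOffset.UTC);
        System.out.println(start + " .. " + end);
    }
}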