read csv using input stream instead of reader

We are now reading the CSV input without transforming
the data into strings. This reduces the amount of bytes
that have to be converted and copied.
We also made Tag smaller. It no longer stores pointers
to strings; instead it stores integers obtained by
compressing the strings (see StringCompressor). This
reduces memory usage and it speeds up hashcode and
equals, which speeds up access to the writer cache.

Performance gain is almost 100%:
- 330k entries/s -> 670k entries/s, top speed measured over a second
- 62s -> 32s, to ingest 16 million entries
This commit is contained in:
2019-01-01 08:31:28 +01:00
parent 0487c30582
commit 4cde10a9f2
12 changed files with 548 additions and 139 deletions

View File

@@ -1,5 +1,6 @@
package org.lucares.pdbui.date;
import java.nio.charset.StandardCharsets;
import java.time.OffsetDateTime;
import java.time.ZoneOffset;
import java.time.format.DateTimeFormatter;
@@ -8,7 +9,7 @@ import java.util.concurrent.ConcurrentHashMap;
/**
* A specialized date parser that can only handle ISO-8601 like dates
* (2011-12-03T10:15:30.123Z or 2011-12-03T10:15:30+01:00) but does this roughly
* 10 times faster than {@link DateTimeFormatter} and 5 times faster than the
* 40 times faster than {@link DateTimeFormatter} and 20 times faster than the
* FastDateParser of commons-lang3.
*/
public class FastISODateParser {
@@ -49,12 +50,6 @@ public class FastISODateParser {
public long parseAsEpochMilli(final String date) {
try {
// final long year = Integer.parseInt(date, 0, 4, 10);
// final long month = Integer.parseInt(date, 5, 7, 10);
// final long dayOfMonth = Integer.parseInt(date, 8, 10, 10);
// final long hour = Integer.parseInt(date, 11, 13, 10);
// final long minute = Integer.parseInt(date, 14, 16, 10);
// final long second = Integer.parseInt(date, 17, 19, 10);
final long year = parseLong(date, 0, 4);
final long month = parseLong(date, 5, 7);
final long dayOfMonth = parseLong(date, 8, 10);
@@ -62,13 +57,6 @@ public class FastISODateParser {
final long minute = parseLong(date, 14, 16);
final long second = parseLong(date, 17, 19);
// final long year = 2018;
// final long month = 10;
// final long dayOfMonth = 12;
// final long hour = 0;
// final long minute = 0;
// final long second = 0;
final int[] nanosAndCharsRead = parseMilliseconds(date, 19);
final long nanos = nanosAndCharsRead[0];
final int offsetTimezone = 19 + nanosAndCharsRead[1];
@@ -170,4 +158,127 @@ public class FastISODateParser {
return hours * 3_600_000 + minutes * 60_000;
}
/**
 * Parses a date whose first byte is at index 0 of the given array.
 * <p>
 * Convenience overload that delegates to
 * {@link #parseAsEpochMilli(byte[], int)} with a start index of 0.
 *
 * @param date the bytes holding the date
 * @return the value computed by {@link #parseAsEpochMilli(byte[], int)}
 */
public long parseAsEpochMilli(final byte[] date) {
    final int startOfArray = 0;
    return parseAsEpochMilli(date, startOfArray);
}
/**
 * Parses an ISO-8601 like date (e.g. 2011-12-03T10:15:30.123Z or
 * 2011-12-03T10:15:30+01:00) directly from bytes, starting at
 * {@code beginIndex}, and returns it as milliseconds since the epoch.
 * <p>
 * Layout expected from {@code beginIndex} on: a 4 digit year, then month,
 * day, hour, minute and second as 2 digit fields that are each preceded by
 * one separator byte. The separator bytes are skipped without being
 * validated. An optional fraction follows, then either {@code 'Z'} or an
 * offset that is handed to {@code parseZoneToMillis}. Bytes after the zone
 * are not inspected, so the date may be embedded in a larger buffer.
 *
 * @param date       buffer containing the date
 * @param beginIndex index of the first year digit within {@code date}
 * @return the parsed instant in epoch milliseconds
 * @throws IllegalArgumentException if the bytes cannot be parsed; the
 *                                  original failure is attached as cause
 */
public long parseAsEpochMilli(final byte[] date, final int beginIndex) {
    try {
        final int yearBegin = beginIndex + 0;
        final int yearEnd = yearBegin + 4;
        final int monthBegin = yearEnd + 1;
        final int dayBegin = monthBegin + 3;
        final int hourBegin = dayBegin + 3;
        final int minuteBegin = hourBegin + 3;
        final int secondBegin = minuteBegin + 3;
        final int secondEnd = secondBegin + 2;

        final long year = parseLong(date, yearBegin, yearEnd);
        final long month = parse2ByteLong(date, monthBegin);
        final long dayOfMonth = parse2ByteLong(date, dayBegin);
        final long hour = parse2ByteLong(date, hourBegin);
        final long minute = parse2ByteLong(date, minuteBegin);
        final long second = parse2ByteLong(date, secondBegin);

        // [0] = fraction of a second in nanoseconds, [1] = number of bytes the fraction occupied
        final int[] nanosAndCharsRead = parseMilliseconds(date, secondEnd);
        final long nanos = nanosAndCharsRead[0];
        // secondEnd == beginIndex + 19
        final int offsetTimezone = secondEnd + nanosAndCharsRead[1];

        final long zoneOffsetMillis = date[offsetTimezone] == 'Z' ? 0 : parseZoneToMillis(date, offsetTimezone);

        // The epoch offset of the first millisecond of a month is looked up by
        // (year, month). The most recently used key/value pair is kept in two
        // fields so that consecutive dates of the same month skip the map lookup.
        final int epochMilliMonthOffsetKey = (int) (year * 12 + month - 1);
        final long epochMilliMonthOffset;
        if (cached_epochMilliMonthOffsetKey == epochMilliMonthOffsetKey) {
            epochMilliMonthOffset = cached_epochMilliMonthOffset;
        } else {
            epochMilliMonthOffset = EPOCH_MILLI_MONTH_OFFSETS.computeIfAbsent(epochMilliMonthOffsetKey,
                    FastISODateParser::computeEpochMilliMonthOffset);
            cached_epochMilliMonthOffsetKey = epochMilliMonthOffsetKey;
            cached_epochMilliMonthOffset = epochMilliMonthOffset;
        }

        final long epochMilli = epochMilliMonthOffset //
                + (dayOfMonth - 1) * 86_400_000 //
                + hour * 3_600_000 //
                + minute * 60_000 //
                + second * 1_000 //
                + nanos / 1_000_000// sub-millisecond digits are truncated
                - zoneOffsetMillis;
        return epochMilli;

    } catch (final RuntimeException e) {
        // Building the message must never fail itself, otherwise the exception
        // thrown here would replace the IllegalArgumentException callers expect.
        // Therefore beginIndex is clamped into the array. The excerpt is also
        // bounded, because the date may sit inside a large input buffer; the
        // longest form read above is 19 + 10 (fraction) + 6 (zone) = 35 bytes.
        final int maxExcerptLength = 40;
        final String excerpt;
        if (date == null) {
            excerpt = "null";
        } else {
            final int from = Math.max(0, Math.min(beginIndex, date.length));
            final int remaining = date.length - from;
            final int length = Math.min(remaining, maxExcerptLength);
            excerpt = new String(date, from, length, StandardCharsets.UTF_8)
                    + (remaining > length ? "..." : "");
        }
        throw new IllegalArgumentException("'" + excerpt + "' is not an ISO-8601 that can be parsed with "
                + FastISODateParser.class.getCanonicalName(), e);
    }
}
/**
 * Reads the decimal digits in {@code bytes[start, end)} as a non-negative
 * number. An empty range yields 0.
 *
 * @param bytes buffer holding ASCII digits
 * @param start index of the first digit (inclusive)
 * @param end   index after the last digit (exclusive)
 * @return the numeric value of the digits
 * @throws NumberFormatException if a byte in the range is not '0'..'9'
 */
private long parseLong(final byte[] bytes, final int start, final int end) {
    long value = 0;
    int pos = start;
    while (pos < end) {
        final int digit = bytes[pos] - '0';
        if (digit < 0 || digit > 9) {
            // the message reports the numeric byte value, not the character
            throw new NumberFormatException(bytes[pos] + " is not a number at offset " + pos);
        }
        value = 10 * value + digit;
        pos++;
    }
    return value;
}
/**
 * Reads exactly two decimal digits, {@code bytes[start]} and
 * {@code bytes[start + 1]}, as a number in the range 0..99.
 * <p>
 * The first byte is validated before the second one is read.
 *
 * @param bytes buffer holding ASCII digits
 * @param start index of the first of the two digits
 * @return the numeric value of the two digits
 * @throws NumberFormatException if one of the two bytes is not '0'..'9'
 */
private long parse2ByteLong(final byte[] bytes, final int start) {
    final int tens = bytes[start];
    if (!(tens >= '0' && tens <= '9')) {
        throw new NumberFormatException(tens + " is not a number at offset " + start);
    }

    final int onesIndex = start + 1;
    final int ones = bytes[onesIndex];
    if (!(ones >= '0' && ones <= '9')) {
        throw new NumberFormatException(ones + " is not a number at offset " + onesIndex);
    }

    return (long) (tens - '0') * 10 + (ones - '0');
}
/**
 * Reads the optional fraction of a second that begins at {@code start}.
 * <p>
 * Bytes are consumed until the first byte that is neither a digit nor a
 * '.'; dots are skipped, digits are accumulated. The accumulated value is
 * then multiplied by ten once for every position up to and including
 * {@code start + 10} that was not consumed, which turns e.g. ".123" into
 * 123_000_000.
 *
 * @param date  buffer containing the date
 * @param start index directly after the seconds field
 * @return a two element array: [0] the scaled fraction, [1] the number of
 *         bytes that precede the terminating byte
 */
private int[] parseMilliseconds(final byte[] date, final int start) {
    int fraction = 0;
    int pos = start;
    boolean terminated = false;
    while (!terminated && pos < date.length) {
        final byte current = date[pos++];
        if (current >= '0' && current <= '9') {
            fraction = fraction * 10 + (current - '0');
        } else if (current != '.') {
            // the terminating byte has been consumed too, hence the -1 below
            terminated = true;
        }
    }
    final int consumed = pos - start - 1;

    // pad the value as if all positions up to start + 10 had been read
    for (int padPos = pos; padPos <= start + 10; padPos++) {
        fraction *= 10;
    }
    return new int[] { fraction, consumed };
}
/**
 * Converts a zone offset of the form {@code +hh:mm} or {@code -hh:mm} that
 * starts at {@code beginIndex} into milliseconds.
 * <p>
 * The offset is read directly from the bytes; no String is created and
 * bytes after the six offset bytes are not touched, so the offset may be
 * embedded in a larger buffer. The sign applies to hours and minutes
 * alike, which also covers offsets with a zero hour such as
 * {@code -00:30}. The separator byte between hours and minutes is skipped
 * without being validated.
 *
 * @param zoneBytes  buffer containing the offset
 * @param beginIndex index of the sign byte
 * @return the offset in milliseconds, negative for offsets west of UTC
 * @throws NumberFormatException          if the sign is neither '+' nor '-'
 *                                        or a digit position holds a
 *                                        non-digit
 * @throws ArrayIndexOutOfBoundsException if fewer than six bytes are
 *                                        available
 */
private long parseZoneToMillis(final byte[] zoneBytes, final int beginIndex) {
    final byte signByte = zoneBytes[beginIndex];
    final long sign;
    if (signByte == '+') {
        sign = 1;
    } else if (signByte == '-') {
        sign = -1;
    } else {
        throw new NumberFormatException(signByte + " is not a zone offset sign at offset " + beginIndex);
    }

    final long hours = zoneDigit(zoneBytes, beginIndex + 1) * 10 + zoneDigit(zoneBytes, beginIndex + 2);
    // beginIndex + 3 is the separator between hours and minutes
    final long minutes = zoneDigit(zoneBytes, beginIndex + 4) * 10 + zoneDigit(zoneBytes, beginIndex + 5);

    return sign * (hours * 3_600_000 + minutes * 60_000);
}

/** Returns the value 0..9 of the ASCII digit at {@code index}. */
private static int zoneDigit(final byte[] bytes, final int index) {
    final int c = bytes[index];
    if (c < '0' || c > '9') {
        throw new NumberFormatException(c + " is not a number at offset " + index);
    }
    return c - '0';
}
}