read csv using input stream instead of reader

We are now reading the CSV input without transforming
the data into strings. This reduces the number of bytes
that have to be converted and copied.
We also made Tag smaller. It no longer stores pointers
to strings; instead it stores integers obtained by
compressing the strings (see StringCompressor). This
reduces memory usage and speeds up hashCode and
equals, which in turn speeds up access to the writer cache.

Performance gain is almost 100%:
- 330k entries/s -> 670k entries/s, top speed measured over a second
- 62s -> 32s, to ingest 16 million entries
This commit is contained in:
2019-01-01 08:31:28 +01:00
parent 0487c30582
commit 4cde10a9f2
12 changed files with 548 additions and 139 deletions

View File

@@ -23,8 +23,18 @@ public class StringCompressor {
return usip.computeIfAbsent(string, s -> usip.getHighestInteger() + 1);
}
/**
 * Returns the integer for the byte range {@code [start, endExclusive)} of
 * {@code bytes} by delegating to {@code usip.computeIfAbsent}.
 *
 * @param bytes        buffer containing the encoded string
 * @param start        index of the first byte of the string
 * @param endExclusive index one past the last byte of the string
 * @return the integer the byte range is mapped to
 */
public int put(final byte[] bytes, final int start, final int endExclusive) {
    final Integer compressed = usip.computeIfAbsent(bytes, start, endExclusive);
    return compressed;
}
/**
 * Looks up the string that {@code integer} stands for via {@code usip.getKey}.
 *
 * @param integer a value previously handed out by this compressor
 * @return whatever {@code usip.getKey} returns for {@code integer}
 */
public String get(final int integer) {
    final String original = usip.getKey(integer);
    return original;
}
/**
 * Looks up the integer for {@code string} without registering it.
 *
 * @param string the string to look up
 * @return the mapped integer, or {@code -1} if {@code usip.get} yields {@code null}
 */
public int getIfPresent(final String string) {
    final Integer known = usip.get(string);
    if (known == null) {
        return -1;
    }
    return known;
}
}

View File

@@ -1,34 +1,47 @@
package org.lucares.pdb.api;
public class Tag {
private final String key;
private final int key;
private final String value;
private final int value;
public Tag(final String key, final String value) {
public Tag(final int key, final int value) {
this.key = key;
this.value = value;
}
public String getKey() {
public Tag(final String key, final String value) {
this.key = Tags.STRING_COMPRESSOR.put(key);
this.value = Tags.STRING_COMPRESSOR.put(value);
}
public int getKey() {
return key;
}
public String getValue() {
public String getKeyAsString() {
return Tags.STRING_COMPRESSOR.get(key);
}
public int getValue() {
return value;
}
public String getValueAsString() {
return Tags.STRING_COMPRESSOR.get(value);
}
@Override
public String toString() {
return key + "=" + value;
return Tags.STRING_COMPRESSOR.get(key) + "=" + Tags.STRING_COMPRESSOR.get(value);
}
@Override
public int hashCode() {
final int prime = 31;
int result = 1;
result = prime * result + ((key == null) ? 0 : key.hashCode());
result = prime * result + ((value == null) ? 0 : value.hashCode());
result = prime * result + key;
result = prime * result + value;
return result;
}
@@ -41,15 +54,9 @@ public class Tag {
if (getClass() != obj.getClass())
return false;
final Tag other = (Tag) obj;
if (key == null) {
if (other.key != null)
return false;
} else if (!key.equals(other.key))
if (key != other.key)
return false;
if (value == null) {
if (other.value != null)
return false;
} else if (!value.equals(other.value))
if (value != other.value)
return false;
return true;
}

View File

@@ -0,0 +1,8 @@
package org.lucares.pdb.api;
import java.util.Comparator;
/**
 * Holder for a comparator that orders {@link Tag}s by their integer key only;
 * the value of a tag is ignored.
 */
public class TagByKeyComparator {
    /**
     * Compares tags by {@link Tag#getKey()}. Uses {@code comparingInt} so the
     * primitive key is compared directly instead of being boxed on every call.
     */
    public static final Comparator<Tag> INSTANCE = Comparator.comparingInt(Tag::getKey);
}

View File

@@ -1,12 +1,9 @@
package org.lucares.pdb.api;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.List;
import java.util.Objects;
import java.util.Set;
import java.util.SortedSet;
import java.util.TreeSet;
import java.util.function.BiConsumer;
import java.util.function.Function;
@@ -20,18 +17,18 @@ public class Tags {
private static final byte[] EMPTY_BYTES = new byte[0];
public static final Tags EMPTY = new Tags();
private final SortedSet<Tag> tags;
private final List<Tag> tags;
public Tags() {
tags = new TreeSet<>(TagByKeyAndValueComparator.INSTANCE);
tags = new ArrayList<>();
}
public Tags(final Collection<Tag> tags) {
this.tags = new TreeSet<>(TagByKeyAndValueComparator.INSTANCE);
this.tags.addAll(tags);
public Tags(final List<Tag> tags) {
Collections.sort(tags, TagByKeyAndValueComparator.INSTANCE);
this.tags = tags;
}
public static Tags create(final Collection<Tag> tags) {
public static Tags create(final List<Tag> tags) {
return new Tags(tags);
}
@@ -40,6 +37,23 @@ public class Tags {
return EMPTY;
}
/**
 * Builds a {@code Tags} instance holding a single tag made of the given
 * integer key and value.
 */
public static Tags create(final int key, final int value) {
    final TagsBuilder builder = TagsBuilder.create();
    builder.add(key, value);
    return builder.build();
}
/**
 * Builds a {@code Tags} instance holding two tags, added in the order
 * (key1, value1), (key2, value2).
 */
public static Tags create(final int key1, final int value1, final int key2, final int value2) {
    return TagsBuilder.create()
            .add(key1, value1)
            .add(key2, value2)
            .build();
}
/**
 * Builds a {@code Tags} instance holding three tags, added in the order
 * (key1, value1), (key2, value2), (key3, value3).
 */
public static Tags create(final int key1, final int value1, final int key2, final int value2, final int key3,
        final int value3) {
    return TagsBuilder.create()
            .add(key1, value1)
            .add(key2, value2)
            .add(key3, value3)
            .build();
}
public static Tags create(final String key, final String value) {
return TagsBuilder.create().add(key, value).build();
@@ -58,7 +72,7 @@ public class Tags {
}
public static Tags fromBytes(final byte[] bytes) {
final SortedSet<Tag> result = new TreeSet<>(TagByKeyAndValueComparator.INSTANCE);
final List<Tag> result = new ArrayList<>();
final LongList keyValuesAsLongs = VariableByteEncoder.decode(bytes);
@@ -67,8 +81,8 @@ public class Tags {
final long keyAsLong = keyValuesAsLongs.get(i);
final long valueAsLong = keyValuesAsLongs.get(i + 1);
final String key = STRING_COMPRESSOR.get((int) keyAsLong);
final String value = STRING_COMPRESSOR.get((int) valueAsLong);
final int key = (int) keyAsLong;
final int value = (int) valueAsLong;
result.add(new Tag(key, value));
}
@@ -81,8 +95,8 @@ public class Tags {
if (tags.size() > 0) {
final LongList keyValuesAsLongs = new LongList(tags.size() * 2);
for (final Tag tag : tags) {
final long keyAsLong = STRING_COMPRESSOR.put(tag.getKey());
final long valueAsLong = STRING_COMPRESSOR.put(tag.getValue());
final long keyAsLong = tag.getKey();
final long valueAsLong = tag.getValue();
keyValuesAsLongs.add(keyAsLong);
keyValuesAsLongs.add(valueAsLong);
@@ -96,40 +110,50 @@ public class Tags {
}
public String getValue(final String key) {
final Tag needle = new Tag(STRING_COMPRESSOR.put(key), 0);
final Set<Tag> tags = toTags();
for (final Tag tag : tags) {
if (Objects.equals(tag.getKey(), key)) {
return tag.getValue();
}
final int index = Collections.binarySearch(tags, needle, TagByKeyComparator.INSTANCE);
if (index >= 0) {
final Tag tag = tags.get(index);
return STRING_COMPRESSOR.get(tag.getValue());
}
return null;
}
public SortedSet<Tag> toTags() {
return Collections.unmodifiableSortedSet(tags);
/**
 * Returns the compressed (integer) value of the tag with the given key.
 *
 * <p>
 * The key is resolved with {@code getIfPresent} rather than {@code put}, so
 * looking up an unknown key does not register a new string in the compressor.
 *
 * @param key the tag key as string
 * @return the integer value of the matching tag, or {@code -1} if the key is
 *         unknown to the compressor or no tag with that key exists
 */
public int getValueAsInt(final String key) {
    final int keyAsInt = STRING_COMPRESSOR.getIfPresent(key);
    if (keyAsInt < 0) {
        // A key the compressor has never seen cannot belong to any tag.
        return -1;
    }
    // Only the key of the needle is compared, so the value 0 is a placeholder.
    final Tag needle = new Tag(keyAsInt, 0);
    final int index = Collections.binarySearch(tags, needle, TagByKeyComparator.INSTANCE);
    if (index >= 0) {
        return tags.get(index).getValue();
    }
    return -1;
}
public Set<String> getKeys() {
final TreeSet<String> result = new TreeSet<>();
final Set<Tag> tags = toTags();
for (final Tag tag : tags) {
result.add(tag.getKey());
result.add(STRING_COMPRESSOR.get(tag.getKey()));
}
return result;
}
/**
 * Exposes the tags as a read-only view; the underlying list is wrapped, not
 * copied.
 */
public List<Tag> toTags() {
    final List<Tag> readOnlyView = Collections.unmodifiableList(tags);
    return readOnlyView;
}
public void forEach(final BiConsumer<String, String> keyValueConsumer) {
final Set<Tag> tags = toTags();
for (final Tag tag : tags) {
keyValueConsumer.accept(tag.getKey(), tag.getValue());
final String key = STRING_COMPRESSOR.get(tag.getKey());
final String value = STRING_COMPRESSOR.get(tag.getValue());
keyValueConsumer.accept(key, value);
}
}
public Tags mapTags(final Function<Tag, Tag> tagMapFuntion) {
final Set<Tag> tags = toTags();
final Collection<Tag> mappedTags = new ArrayList<>(tags.size());
final List<Tag> mappedTags = new ArrayList<>(tags.size());
for (final Tag tag : tags) {
mappedTags.add(tagMapFuntion.apply(tag));
}
@@ -138,7 +162,7 @@ public class Tags {
@Override
public String toString() {
return "Tags [tags=" + toTags() + "]";
return "Tags [tags=" + tags + "]";
}
@Override
@@ -171,10 +195,11 @@ public class Tags {
final TagsBuilder result = TagsBuilder.create();
for (final String field : groupByFields) {
final String value = getValue(field);
final int value = getValueAsInt(field);
if (value != null) {
result.add(field, value);
if (value >= 0) {
final int fieldAsInt = STRING_COMPRESSOR.getIfPresent(field);
result.add(fieldAsInt, value);
}
}
@@ -197,9 +222,9 @@ public class Tags {
result.append(", ");
}
result.append(tag.getKey());
result.append(STRING_COMPRESSOR.get(tag.getKey()));
result.append("=");
result.append(tag.getValue());
result.append(STRING_COMPRESSOR.get(tag.getValue()));
}
return result.toString();

View File

@@ -11,11 +11,17 @@ public class TagsBuilder {
return new TagsBuilder();
}
public TagsBuilder add(final String key, final String value) {
/**
 * Appends a tag made of an already compressed key and value.
 *
 * @return this builder, to allow chaining
 */
public TagsBuilder add(final int key, final int value) {
    final Tag tag = new Tag(key, value);
    tags.add(tag);
    return this;
}
/**
 * Appends a tag given as strings. Key and value are first converted to
 * integers with {@code Tags.STRING_COMPRESSOR.put} (key first, then value).
 *
 * @return this builder, to allow chaining
 */
public TagsBuilder add(final String key, final String value) {
    return add(Tags.STRING_COMPRESSOR.put(key), Tags.STRING_COMPRESSOR.put(value));
}
/**
 * Creates the {@code Tags} by handing the collected tag list to
 * {@code Tags.create}.
 */
public Tags build() {
    final Tags built = Tags.create(tags);
    return built;
}

View File

@@ -11,6 +11,7 @@ import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
@@ -34,11 +35,59 @@ public class UniqueStringIntegerPairs {
private static final boolean APPEND = true;
/**
 * A view onto the range {@code [start, endExclusive)} of a byte array, usable
 * as a map key. The backing array is referenced, not copied, so equality and
 * hash code reflect the array's content at the time they are computed.
 */
private static final class ByteArray implements Comparable<ByteArray> {
    private final byte[] array;
    private final int start;
    private final int endExclusive;

    /**
     * @param array        backing array (not copied)
     * @param start        index of the first byte of the view
     * @param endExclusive index one past the last byte of the view
     */
    public ByteArray(final byte[] array, final int start, final int endExclusive) {
        this.array = array;
        this.start = start;
        this.endExclusive = endExclusive;
    }

    /** Creates a view that covers the whole of {@code bytes}. */
    public ByteArray(final byte[] bytes) {
        this.array = bytes;
        this.start = 0;
        this.endExclusive = bytes.length;
    }

    /**
     * Hashes only the bytes inside the viewed range, so two views with equal
     * content get the same hash regardless of their backing arrays or offsets.
     */
    @Override
    public int hashCode() {
        int result = 1;
        final byte[] a = array;
        final int end = endExclusive;
        for (int i = start; i < end; i++) {
            result = 31 * result + a[i];
        }
        return result;
    }

    /**
     * Two views are equal when their ranges hold the same bytes. Returns
     * {@code false} for {@code null} and for objects of any other type.
     */
    @Override
    public boolean equals(final Object obj) {
        if (this == obj) {
            return true;
        }
        if (!(obj instanceof ByteArray)) {
            return false;
        }
        final ByteArray other = (ByteArray) obj;
        return Arrays.equals(array, start, endExclusive, other.array, other.start, other.endExclusive);
    }

    /** Orders views by comparing the bytes of their ranges via {@code Arrays.compare}. */
    @Override
    public int compareTo(final ByteArray o) {
        return Arrays.compare(array, start, endExclusive, o.array, o.start, o.endExclusive);
    }
}
/**
* Maps a string to an integer. E.g. "myLongValue" -> 123
*/
private final Map<String, Integer> stringToInt = new HashMap<>();
private final Map<ByteArray, Integer> bytesToInt = new HashMap<>();
/**
* Maps an integer to a string. E.g. 123 -> "myLongValue"
*/
@@ -74,9 +123,10 @@ public class UniqueStringIntegerPairs {
if (tokens.length == 2) {
final String string = tokens[0];
final int value = Integer.parseInt(tokens[1]);
intToStringPut(value, string);
stringToInt.put(string, value);
final int integer = Integer.parseInt(tokens[1]);
intToStringPut(integer, string);
stringToInt.put(string, integer);
bytesToInt.put(new ByteArray(string.getBytes(StandardCharsets.UTF_8)), integer);
}
}
}
@@ -95,29 +145,30 @@ public class UniqueStringIntegerPairs {
intToString.set(value, string);
}
void put(final String first, final int second) {
void put(final String string, final int integer) {
if (stringToInt.containsKey(first) || (intToString.size() > second && intToString.get(second) != null)) {
throw new IllegalArgumentException("Unique key constraint violation for (" + first + ", " + second + ")");
if (stringToInt.containsKey(string) || (intToString.size() > integer && intToString.get(integer) != null)) {
throw new IllegalArgumentException("Unique key constraint violation for (" + string + ", " + integer + ")");
}
if (file != null) {
try (final Writer writer = new OutputStreamWriter(new FileOutputStream(file.toFile(), APPEND),
StandardCharsets.UTF_8)) {
writer.write(first + SEPARATOR + second + "\n");
writer.write(string + SEPARATOR + integer + "\n");
} catch (final IOException e) {
throw new RuntimeIOException(e);
}
}
intToStringPut(second, first);
stringToInt.put(first, second);
intToStringPut(integer, string);
stringToInt.put(string, integer);
bytesToInt.put(new ByteArray(string.getBytes(StandardCharsets.UTF_8)), integer);
}
public Integer get(final String first) {
public Integer get(final String string) {
return stringToInt.get(first);
return stringToInt.get(string);
}
public String getKey(final int second) {
@@ -128,16 +179,34 @@ public class UniqueStringIntegerPairs {
return intToString.size() == 0 ? -1 : intToString.size() - 1;
}
public Integer computeIfAbsent(final String first, final Function<String, Integer> mappingFunction) {
if (!stringToInt.containsKey(first)) {
public Integer computeIfAbsent(final String string, final Function<String, Integer> mappingFunction) {
if (!stringToInt.containsKey(string)) {
synchronized (stringToInt) {
if (!stringToInt.containsKey(first)) {
final Integer second = mappingFunction.apply(first);
put(first, second);
if (!stringToInt.containsKey(string)) {
final Integer second = mappingFunction.apply(string);
put(string, second);
}
}
}
return stringToInt.get(first);
return stringToInt.get(string);
}
/**
 * Returns the integer mapped to the UTF-8 encoded string found in
 * {@code bytes[start, endExclusive)}, registering the string with the next
 * free integer if it is not known yet.
 *
 * <p>
 * The byte-keyed map is consulted first, so no string is built for known
 * input. If the bytes are unknown, they are decoded and the result is
 * resolved through the string-keyed map. This matters for byte ranges that
 * are not valid UTF-8: decoding replaces malformed sequences, so the decoded
 * string re-encodes to different bytes than the input. Without the string
 * lookup such input would first return {@code null} and then fail with a
 * unique key constraint violation on the next call.
 *
 * @param bytes        buffer containing the encoded string
 * @param start        index of the first byte of the string
 * @param endExclusive index one past the last byte of the string
 * @return the integer mapped to the decoded string, never {@code null}
 */
public Integer computeIfAbsent(final byte[] bytes, final int start, final int endExclusive) {
    final ByteArray byteArray = new ByteArray(bytes, start, endExclusive);
    Integer result = bytesToInt.get(byteArray);
    if (result == null) {
        synchronized (stringToInt) {
            // Re-check under the lock; another thread may have registered it meanwhile.
            result = bytesToInt.get(byteArray);
            if (result == null) {
                final String string = new String(bytes, start, endExclusive - start, StandardCharsets.UTF_8);
                // The decoded string may already be registered under different bytes.
                result = stringToInt.get(string);
                if (result == null) {
                    final Integer integer = intToString.size();
                    put(string, integer);
                    result = integer;
                }
            }
        }
    }
    return result;
}
}