Read CSV using InputStream instead of Reader

We are now reading the CSV input without transforming
the data into strings. This reduces the number of bytes
that have to be converted and copied.
We also made Tag smaller. It no longer stores pointers
to strings; instead it stores integers obtained by
compressing the strings (see StringCompressor). This
reduces memory usage and speeds up hashCode and
equals, which in turn speeds up access to the writer cache.

Performance gain is almost 100%:
- 330k entries/s -> 670k entries/s, top speed measured over a second
- 62s -> 32s, to ingest 16 million entries
This commit is contained in:
2019-01-01 08:31:28 +01:00
parent 0487c30582
commit 4cde10a9f2
12 changed files with 548 additions and 139 deletions

View File

@@ -97,13 +97,13 @@ public class DataStore implements AutoCloseable {
final LongList keyAndValueCompressed = new LongList(2);
final String key = tag.getKey();
final String key = tag.getKeyAsString();
final byte[] result;
if (!key.isEmpty()) {
final Integer keyAsLong = Tags.STRING_COMPRESSOR.put(key);
keyAndValueCompressed.add(keyAsLong);
final String value = tag.getValue();
final String value = tag.getValueAsString();
if (!value.isEmpty()) {
final Integer valueAsLong = Tags.STRING_COMPRESSOR.put(value);
keyAndValueCompressed.add(valueAsLong);
@@ -142,7 +142,7 @@ public class DataStore implements AutoCloseable {
return result;
}
};
public static final Tag TAG_ALL_DOCS = new Tag(ALL_DOCS_KEY, "");
public static Tag TAG_ALL_DOCS = null;
private final PersistentMap<Long, Doc> docIdToDoc;
@@ -163,6 +163,8 @@ public class DataStore implements AutoCloseable {
storageBasePath = storageDirectory(dataDirectory);
Tags.STRING_COMPRESSOR = StringCompressor.create(keyCompressionFile(storageBasePath));
TAG_ALL_DOCS = new Tag(ALL_DOCS_KEY, ""); // Tag(String, String) uses the StringCompressor internally, so it
// must be initialized after the string compressor has been created
diskStorageFilePath = storageBasePath.resolve("data.bs");
diskStorage = new DiskStorage(diskStorageFilePath);
@@ -243,7 +245,7 @@ public class DataStore implements AutoCloseable {
final Tag keyPrefix = new Tag("", ""); // will find everything
tagToDocsId.visitValues(keyPrefix, (tags, __) -> keys.add(tags.getKey()));
tagToDocsId.visitValues(keyPrefix, (tags, __) -> keys.add(tags.getKeyAsString()));
keys.remove(ALL_DOCS_KEY);
final List<String> result = new ArrayList<>(keys);
@@ -259,7 +261,7 @@ public class DataStore implements AutoCloseable {
try {
final SortedSet<String> result = new TreeSet<>();
if (query.isEmpty()) {
tagToDocsId.visitValues(new Tag(key, ""), (tag, value) -> result.add(tag.getValue()));
tagToDocsId.visitValues(new Tag(key, ""), (tag, value) -> result.add(tag.getValueAsString()));
} else {
final List<Doc> docs = search(query);
for (final Doc doc : docs) {

View File

@@ -128,7 +128,7 @@ public class ExpressionToDocIdVisitor extends ExpressionVisitor<LongList> {
keyToValueToDocId.visitValues(new Tag(propertyName, ""), (tags, blockOffsetToDocIds) -> {
try {
if (valuePattern.matcher(tags.getValue()).matches()) {
if (valuePattern.matcher(tags.getValueAsString()).matches()) {
try (final BSFile bsFile = BSFile.existingFile(blockOffsetToDocIds, diskStorage)) {
bsFile.streamOfLongLists().forEach(result::add);
}