Read CSV using InputStream instead of Reader

We are now reading the CSV input without transforming
the data into strings. This reduces the number of bytes
that have to be converted and copied.
We also made Tag smaller. It no longer stores pointers
to strings; instead it stores integers obtained by
compressing the strings (see StringCompressor). This
reduces memory usage and speeds up hashCode and
equals, which in turn speeds up access to the writer cache.

Performance gain is almost 100%:
- 330k entries/s -> 670k entries/s, top speed measured over a second
- 62s -> 32s, to ingest 16 million entries
This commit is contained in:
2019-01-01 08:31:28 +01:00
parent 0487c30582
commit 4cde10a9f2
12 changed files with 548 additions and 139 deletions

View File

@@ -97,13 +97,13 @@ public class DataStore implements AutoCloseable {
final LongList keyAndValueCompressed = new LongList(2);
final String key = tag.getKey();
final String key = tag.getKeyAsString();
final byte[] result;
if (!key.isEmpty()) {
final Integer keyAsLong = Tags.STRING_COMPRESSOR.put(key);
keyAndValueCompressed.add(keyAsLong);
final String value = tag.getValue();
final String value = tag.getValueAsString();
if (!value.isEmpty()) {
final Integer valueAsLong = Tags.STRING_COMPRESSOR.put(value);
keyAndValueCompressed.add(valueAsLong);
@@ -142,7 +142,7 @@ public class DataStore implements AutoCloseable {
return result;
}
};
public static final Tag TAG_ALL_DOCS = new Tag(ALL_DOCS_KEY, "");
public static Tag TAG_ALL_DOCS = null;
private final PersistentMap<Long, Doc> docIdToDoc;
@@ -163,6 +163,8 @@ public class DataStore implements AutoCloseable {
storageBasePath = storageDirectory(dataDirectory);
Tags.STRING_COMPRESSOR = StringCompressor.create(keyCompressionFile(storageBasePath));
TAG_ALL_DOCS = new Tag(ALL_DOCS_KEY, ""); // Tag(String, String) uses the StringCompressor internally, so it
// must be initialized after the string compressor has been created
diskStorageFilePath = storageBasePath.resolve("data.bs");
diskStorage = new DiskStorage(diskStorageFilePath);
@@ -243,7 +245,7 @@ public class DataStore implements AutoCloseable {
final Tag keyPrefix = new Tag("", ""); // will find everything
tagToDocsId.visitValues(keyPrefix, (tags, __) -> keys.add(tags.getKey()));
tagToDocsId.visitValues(keyPrefix, (tags, __) -> keys.add(tags.getKeyAsString()));
keys.remove(ALL_DOCS_KEY);
final List<String> result = new ArrayList<>(keys);
@@ -259,7 +261,7 @@ public class DataStore implements AutoCloseable {
try {
final SortedSet<String> result = new TreeSet<>();
if (query.isEmpty()) {
tagToDocsId.visitValues(new Tag(key, ""), (tag, value) -> result.add(tag.getValue()));
tagToDocsId.visitValues(new Tag(key, ""), (tag, value) -> result.add(tag.getValueAsString()));
} else {
final List<Doc> docs = search(query);
for (final Doc doc : docs) {

View File

@@ -128,7 +128,7 @@ public class ExpressionToDocIdVisitor extends ExpressionVisitor<LongList> {
keyToValueToDocId.visitValues(new Tag(propertyName, ""), (tags, blockOffsetToDocIds) -> {
try {
if (valuePattern.matcher(tags.getValue()).matches()) {
if (valuePattern.matcher(tags.getValueAsString()).matches()) {
try (final BSFile bsFile = BSFile.existingFile(blockOffsetToDocIds, diskStorage)) {
bsFile.streamOfLongLists().forEach(result::add);
}