From 135ab42cd87d8bfc2d2ade4382eeb28c5eab281c Mon Sep 17 00:00:00 2001 From: Andreas Huber Date: Sat, 17 Nov 2018 20:03:46 +0100 Subject: [PATCH] tags are now stored as variable length byte sequences of longs Replaced Tags.filenameBytes with a SortedSet. Tags are now stored as longs (variable length encoded) in the PersistentMap. Tags.filenameBytes was introduced to reduce memory consumption, when all tags were held in memory. Tags are now stored in a PersistentMap and only read when needed. Moved the VariableByteEncoder into its own project, because it was needed by pdb-api. --- block-storage/build.gradle | 1 + .../org/lucares/pdb/blockstorage/BSFile.java | 2 +- .../pdb/blockstorage/BSFileDiskBlock.java | 2 +- .../java/org/lucares/pdb/map/NodeEntry.java | 10 +- byte-utils/.gitignore | 7 + byte-utils/build.gradle | 7 + .../byteencoder}/VariableByteEncoder.java | 2 +- .../byteencoder}/VariableByteEncoderTest.java | 3 +- .../pdb/datastore/internal/DataStore.java | 12 +- pdb-api/build.gradle | 1 + .../org/lucares/pdb/api/StringCompressor.java | 2 +- .../pdb/api/TagByKeyAndValueComparator.java | 8 + .../lucares/pdb/api/TagByKeyComparator.java | 15 -- .../main/java/org/lucares/pdb/api/Tags.java | 153 ++++++------------ 14 files changed, 97 insertions(+), 128 deletions(-) create mode 100644 byte-utils/.gitignore create mode 100644 byte-utils/build.gradle rename {block-storage/src/main/java/org/lucares/pdb/blockstorage/intsequence => byte-utils/src/main/java/org/lucares/utils/byteencoder}/VariableByteEncoder.java (99%) rename {block-storage/src/test/java/org/lucares/pdb/blockstorage/intsequence => byte-utils/src/test/java/org/lucares/utils/byteencoder}/VariableByteEncoderTest.java (97%) create mode 100644 pdb-api/src/main/java/org/lucares/pdb/api/TagByKeyAndValueComparator.java delete mode 100644 pdb-api/src/main/java/org/lucares/pdb/api/TagByKeyComparator.java diff --git a/block-storage/build.gradle b/block-storage/build.gradle index 4a8022d..1bf9b18 100644 --- 
a/block-storage/build.gradle +++ b/block-storage/build.gradle @@ -1,6 +1,7 @@ apply plugin: 'antlr' dependencies { + compile project(':byte-utils') compile project(':file-utils') compile project(':pdb-utils') diff --git a/block-storage/src/main/java/org/lucares/pdb/blockstorage/BSFile.java b/block-storage/src/main/java/org/lucares/pdb/blockstorage/BSFile.java index 8994423..b090f16 100644 --- a/block-storage/src/main/java/org/lucares/pdb/blockstorage/BSFile.java +++ b/block-storage/src/main/java/org/lucares/pdb/blockstorage/BSFile.java @@ -11,9 +11,9 @@ import java.util.stream.Stream; import java.util.stream.StreamSupport; import org.lucares.collections.LongList; -import org.lucares.pdb.blockstorage.intsequence.VariableByteEncoder; import org.lucares.pdb.diskstorage.DiskBlock; import org.lucares.pdb.diskstorage.DiskStorage; +import org.lucares.utils.byteencoder.VariableByteEncoder; import org.slf4j.Logger; import org.slf4j.LoggerFactory; diff --git a/block-storage/src/main/java/org/lucares/pdb/blockstorage/BSFileDiskBlock.java b/block-storage/src/main/java/org/lucares/pdb/blockstorage/BSFileDiskBlock.java index c33830c..1563d3f 100644 --- a/block-storage/src/main/java/org/lucares/pdb/blockstorage/BSFileDiskBlock.java +++ b/block-storage/src/main/java/org/lucares/pdb/blockstorage/BSFileDiskBlock.java @@ -3,8 +3,8 @@ package org.lucares.pdb.blockstorage; import java.nio.MappedByteBuffer; import org.lucares.collections.LongList; -import org.lucares.pdb.blockstorage.intsequence.VariableByteEncoder; import org.lucares.pdb.diskstorage.DiskBlock; +import org.lucares.utils.byteencoder.VariableByteEncoder; public class BSFileDiskBlock { diff --git a/block-storage/src/main/java/org/lucares/pdb/map/NodeEntry.java b/block-storage/src/main/java/org/lucares/pdb/map/NodeEntry.java index 81becb5..cf1fbe9 100644 --- a/block-storage/src/main/java/org/lucares/pdb/map/NodeEntry.java +++ b/block-storage/src/main/java/org/lucares/pdb/map/NodeEntry.java @@ -7,7 +7,7 @@ import 
java.util.List; import java.util.function.Function; import java.util.function.Predicate; -import org.lucares.pdb.blockstorage.intsequence.VariableByteEncoder; +import org.lucares.utils.byteencoder.VariableByteEncoder; class NodeEntry { enum ValueType { @@ -87,7 +87,13 @@ class NodeEntry { public String toString(final Function keyDecoder, final Function valueDecoder) { final String valueAsString = isInnerNode() ? String.valueOf(VariableByteEncoder.decodeFirstValue(value)) : valueDecoder.apply(value); - final String keyAsString = keyDecoder.apply(key); + + final String keyAsString; + if (Arrays.equals(key, PersistentMap.MAX_KEY)) { + keyAsString = "<<>>"; + } else { + keyAsString = keyDecoder.apply(key); + } return "NodeEntry [type=" + type + ", key=" + keyAsString + ", value=" + valueAsString + "]"; } diff --git a/byte-utils/.gitignore b/byte-utils/.gitignore new file mode 100644 index 0000000..691dc42 --- /dev/null +++ b/byte-utils/.gitignore @@ -0,0 +1,7 @@ +/.settings/ +/.classpath +/.project +/bin/ +/build/ +/target/ +/test-output/ \ No newline at end of file diff --git a/byte-utils/build.gradle b/byte-utils/build.gradle new file mode 100644 index 0000000..a0821b1 --- /dev/null +++ b/byte-utils/build.gradle @@ -0,0 +1,7 @@ +dependencies { + + + compile 'org.apache.logging.log4j:log4j-core:2.10.0' + compile 'org.apache.logging.log4j:log4j-slf4j-impl:2.10.0' + compile 'org.lucares:primitiveCollections:0.1.20180908084945' +} diff --git a/block-storage/src/main/java/org/lucares/pdb/blockstorage/intsequence/VariableByteEncoder.java b/byte-utils/src/main/java/org/lucares/utils/byteencoder/VariableByteEncoder.java similarity index 99% rename from block-storage/src/main/java/org/lucares/pdb/blockstorage/intsequence/VariableByteEncoder.java rename to byte-utils/src/main/java/org/lucares/utils/byteencoder/VariableByteEncoder.java index 17591f6..b11cf88 100644 --- a/block-storage/src/main/java/org/lucares/pdb/blockstorage/intsequence/VariableByteEncoder.java +++ 
b/byte-utils/src/main/java/org/lucares/utils/byteencoder/VariableByteEncoder.java @@ -1,4 +1,4 @@ -package org.lucares.pdb.blockstorage.intsequence; +package org.lucares.utils.byteencoder; import java.util.Arrays; diff --git a/block-storage/src/test/java/org/lucares/pdb/blockstorage/intsequence/VariableByteEncoderTest.java b/byte-utils/src/test/java/org/lucares/utils/byteencoder/VariableByteEncoderTest.java similarity index 97% rename from block-storage/src/test/java/org/lucares/pdb/blockstorage/intsequence/VariableByteEncoderTest.java rename to byte-utils/src/test/java/org/lucares/utils/byteencoder/VariableByteEncoderTest.java index a95543b..b975cfc 100644 --- a/block-storage/src/test/java/org/lucares/pdb/blockstorage/intsequence/VariableByteEncoderTest.java +++ b/byte-utils/src/test/java/org/lucares/utils/byteencoder/VariableByteEncoderTest.java @@ -1,4 +1,4 @@ -package org.lucares.pdb.blockstorage.intsequence; +package org.lucares.utils.byteencoder; import static org.testng.Assert.assertEquals; @@ -6,6 +6,7 @@ import java.util.concurrent.ThreadLocalRandom; import java.util.concurrent.atomic.AtomicInteger; import org.lucares.collections.LongList; +import org.lucares.utils.byteencoder.VariableByteEncoder; import org.testng.Assert; import org.testng.annotations.DataProvider; import org.testng.annotations.Test; diff --git a/data-store/src/main/java/org/lucares/pdb/datastore/internal/DataStore.java b/data-store/src/main/java/org/lucares/pdb/datastore/internal/DataStore.java index f89dc3e..1fe5c9a 100644 --- a/data-store/src/main/java/org/lucares/pdb/datastore/internal/DataStore.java +++ b/data-store/src/main/java/org/lucares/pdb/datastore/internal/DataStore.java @@ -19,7 +19,6 @@ import org.lucares.pdb.api.StringCompressor; import org.lucares.pdb.api.Tag; import org.lucares.pdb.api.Tags; import org.lucares.pdb.blockstorage.BSFile; -import org.lucares.pdb.blockstorage.intsequence.VariableByteEncoder; import org.lucares.pdb.datastore.Doc; import 
org.lucares.pdb.datastore.Proposal; import org.lucares.pdb.datastore.lang.Expression; @@ -29,6 +28,7 @@ import org.lucares.pdb.diskstorage.DiskStorage; import org.lucares.pdb.map.PersistentMap; import org.lucares.pdb.map.PersistentMap.EncoderDecoder; import org.lucares.utils.Preconditions; +import org.lucares.utils.byteencoder.VariableByteEncoder; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -52,13 +52,13 @@ public class DataStore implements AutoCloseable { private static final EncoderDecoder ENCODER_TAGS = new EncoderDecoder<>() { @Override - public byte[] encode(final Tags object) { - return object.getFilenameBytes(); + public byte[] encode(final Tags tags) { + return tags.toBytes(); } @Override public Tags decode(final byte[] bytes) { - return new Tags(bytes); + return Tags.fromBytes(bytes); } }; @@ -68,7 +68,7 @@ public class DataStore implements AutoCloseable { public byte[] encode(final Doc doc) { final byte[] rootBlockNumber = VariableByteEncoder.encode(doc.getRootBlockNumber()); - final byte[] tags = doc.getTags().getFilenameBytes(); + final byte[] tags = doc.getTags().toBytes(); final byte[] result = new byte[rootBlockNumber.length + tags.length]; @@ -83,7 +83,7 @@ public class DataStore implements AutoCloseable { final long rootBlockNumber = VariableByteEncoder.decodeFirstValue(bytes); final int bytesRootBlockNumber = VariableByteEncoder.neededBytes(rootBlockNumber); - final Tags tags = new Tags(Arrays.copyOfRange(bytes, bytesRootBlockNumber, bytes.length)); + final Tags tags = Tags.fromBytes(Arrays.copyOfRange(bytes, bytesRootBlockNumber, bytes.length)); return new Doc(tags, rootBlockNumber); } }; diff --git a/pdb-api/build.gradle b/pdb-api/build.gradle index 16236f2..6e0851f 100644 --- a/pdb-api/build.gradle +++ b/pdb-api/build.gradle @@ -1,5 +1,6 @@ dependencies { + compile project(':byte-utils') compile project(':pdb-utils') compile project(':file-utils') compile 'org.lucares:primitiveCollections:0.1.20180908084945' diff --git 
a/pdb-api/src/main/java/org/lucares/pdb/api/StringCompressor.java b/pdb-api/src/main/java/org/lucares/pdb/api/StringCompressor.java index c7f9b1e..76b44df 100644 --- a/pdb-api/src/main/java/org/lucares/pdb/api/StringCompressor.java +++ b/pdb-api/src/main/java/org/lucares/pdb/api/StringCompressor.java @@ -18,7 +18,7 @@ public class StringCompressor { return new StringCompressor(mapsi); } - public Integer put(final String string) { + public int put(final String string) { return usip.computeIfAbsent(string, s -> usip.getHighestInteger() + 1); } diff --git a/pdb-api/src/main/java/org/lucares/pdb/api/TagByKeyAndValueComparator.java b/pdb-api/src/main/java/org/lucares/pdb/api/TagByKeyAndValueComparator.java new file mode 100644 index 0000000..b86cf83 --- /dev/null +++ b/pdb-api/src/main/java/org/lucares/pdb/api/TagByKeyAndValueComparator.java @@ -0,0 +1,8 @@ +package org.lucares.pdb.api; + +import java.util.Comparator; + +public class TagByKeyAndValueComparator { + + public static final Comparator INSTANCE = Comparator.comparing(Tag::getKey).thenComparing(Tag::getValue); +} diff --git a/pdb-api/src/main/java/org/lucares/pdb/api/TagByKeyComparator.java b/pdb-api/src/main/java/org/lucares/pdb/api/TagByKeyComparator.java deleted file mode 100644 index 6bf2e9b..0000000 --- a/pdb-api/src/main/java/org/lucares/pdb/api/TagByKeyComparator.java +++ /dev/null @@ -1,15 +0,0 @@ -package org.lucares.pdb.api; - -import java.io.Serializable; -import java.util.Comparator; - -public class TagByKeyComparator implements Comparator, Serializable { - - private static final long serialVersionUID = -6683582291996307323L; - public static final TagByKeyComparator INSTANCE = new TagByKeyComparator(); - - @Override - public int compare(final Tag a, final Tag b) { - return a.getKey().compareToIgnoreCase(b.getKey()); - } -} diff --git a/pdb-api/src/main/java/org/lucares/pdb/api/Tags.java b/pdb-api/src/main/java/org/lucares/pdb/api/Tags.java index 1971da2..3bcbf50 100644 --- 
a/pdb-api/src/main/java/org/lucares/pdb/api/Tags.java +++ b/pdb-api/src/main/java/org/lucares/pdb/api/Tags.java @@ -1,9 +1,8 @@ package org.lucares.pdb.api; -import java.nio.charset.StandardCharsets; import java.util.ArrayList; -import java.util.Arrays; import java.util.Collection; +import java.util.Collections; import java.util.List; import java.util.Objects; import java.util.Set; @@ -11,57 +10,30 @@ import java.util.SortedSet; import java.util.TreeSet; import java.util.function.BiConsumer; import java.util.function.Function; -import java.util.regex.Matcher; -import java.util.regex.Pattern; + +import org.lucares.collections.LongList; +import org.lucares.utils.byteencoder.VariableByteEncoder; public class Tags { public static StringCompressor STRING_COMPRESSOR = null; - public static final byte[] EMPTY_BYTES = new byte[0]; + private static final byte[] EMPTY_BYTES = new byte[0]; public static final Tags EMPTY = new Tags(); - public static final String KEY_VALUE_SEPARATOR = "-"; - public static final String KEY_VALUE_PAIR_SEPARATOR = "_"; - public static final String KEY_VALUE_END_SEPARATOR = "$"; - - private static final String REGEX_KEY_VALUE = "[a-zA-Z0-9]+" + Pattern.quote(KEY_VALUE_SEPARATOR) + "[a-zA-Z0-9]+"; - - private static final String REGEX_KEY_VALUE_PAIRS = REGEX_KEY_VALUE + "(" + Pattern.quote(KEY_VALUE_PAIR_SEPARATOR) - + REGEX_KEY_VALUE + ")*";; - - private static final String REGEX_STORAGE_FILE = String.format("(%1$s)", REGEX_KEY_VALUE_PAIRS); - - private static final Pattern EXTRACT_TAGS_PATTERN = Pattern.compile(REGEX_STORAGE_FILE); - - private final byte[] filenameBytes; + private final SortedSet tags; public Tags() { - filenameBytes = EMPTY_BYTES; + tags = new TreeSet<>(TagByKeyAndValueComparator.INSTANCE); } - public Tags(final byte[] filenameBytes) { - this(new String(filenameBytes, StandardCharsets.UTF_8)); - } - - public Tags(final String serializedTags) { - // serialized tags look like this: 0-1_2-1M_H-28_4-5$1.pdb - // there can be several 
files for the same set of tags, in which case the number - // after the $ is incremented - // We only take the part until the $. - final int end = serializedTags.indexOf(KEY_VALUE_END_SEPARATOR); - final String normalizedFilename; - if (end >= 0) { - normalizedFilename = serializedTags.substring(0, end); - } else { - normalizedFilename = serializedTags; - } - this.filenameBytes = normalizedFilename.getBytes(StandardCharsets.UTF_8); + public Tags(final Collection tags) { + this.tags = new TreeSet<>(TagByKeyAndValueComparator.INSTANCE); + this.tags.addAll(tags); } public static Tags create(final Collection tags) { - final String newFilename = toFilename(tags); - return new Tags(newFilename); + return new Tags(tags); } public static Tags create() { @@ -85,12 +57,42 @@ public class Tags { return result; } - public String serialize() { - return new String(this.filenameBytes, StandardCharsets.UTF_8); + public static Tags fromBytes(final byte[] bytes) { + final SortedSet result = new TreeSet<>(TagByKeyAndValueComparator.INSTANCE); + + final LongList keyValuesAsLongs = VariableByteEncoder.decode(bytes); + + for (int i = 0; i < keyValuesAsLongs.size(); i += 2) { + + final long keyAsLong = keyValuesAsLongs.get(i); + final long valueAsLong = keyValuesAsLongs.get(i + 1); + + final String key = STRING_COMPRESSOR.get((int) keyAsLong); + final String value = STRING_COMPRESSOR.get((int) valueAsLong); + result.add(new Tag(key, value)); + } + + return new Tags(result); } - public byte[] getFilenameBytes() { - return filenameBytes; + public byte[] toBytes() { + final byte[] result; + + if (tags.size() > 0) { + final LongList keyValuesAsLongs = new LongList(tags.size() * 2); + for (final Tag tag : tags) { + final long keyAsLong = STRING_COMPRESSOR.put(tag.getKey()); + final long valueAsLong = STRING_COMPRESSOR.put(tag.getValue()); + + keyValuesAsLongs.add(keyAsLong); + keyValuesAsLongs.add(valueAsLong); + } + + result = VariableByteEncoder.encode(keyValuesAsLongs); + } else { + result 
= EMPTY_BYTES; + } + return result; } public String getValue(final String key) { @@ -105,54 +107,7 @@ public class Tags { } public SortedSet toTags() { - final SortedSet result = new TreeSet<>(TagByKeyComparator.INSTANCE); - final String filename = new String(this.filenameBytes, StandardCharsets.UTF_8); - final Matcher matcher = EXTRACT_TAGS_PATTERN.matcher(filename); - - if (matcher.find()) { - final String serializedTags = matcher.group(1); - - final String[] serializedKeyValuePairs = serializedTags.split(Pattern.quote(KEY_VALUE_PAIR_SEPARATOR)); - - for (int i = 0; i < serializedKeyValuePairs.length; i++) { - final String[] keyValuePair = serializedKeyValuePairs[i].split(Pattern.quote(KEY_VALUE_SEPARATOR)); - - if (keyValuePair.length == 2) { - - final String key = STRING_COMPRESSOR.get(RadixConverter.fromString(keyValuePair[0])); - final String value = STRING_COMPRESSOR.get(RadixConverter.fromString(keyValuePair[1])); - - result.add(new Tag(key, value)); - } - } - } - return result; - } - - private static String toFilename(final Collection tags) { - final StringBuilder path = new StringBuilder(); - - final Tag[] tagsAsArray = tags.toArray(new Tag[tags.size()]); - Arrays.sort(tagsAsArray, TagByKeyComparator.INSTANCE); - - for (final Tag tag : tagsAsArray) { - final String key = tag.getKey(); - final String value = tag.getValue(); - - final int compressedKey = STRING_COMPRESSOR.put(key); - final int compressedValue = STRING_COMPRESSOR.put(value); - - if (path.length() > 0) { - path.append(Tags.KEY_VALUE_PAIR_SEPARATOR); - } - - path.append(RadixConverter.toString(compressedKey)); - path.append(Tags.KEY_VALUE_SEPARATOR); - path.append(RadixConverter.toString(compressedValue)); - } - path.append(Tags.KEY_VALUE_END_SEPARATOR); - - return path.toString(); + return Collections.unmodifiableSortedSet(tags); } public Set getKeys() { @@ -183,14 +138,14 @@ public class Tags { @Override public String toString() { - return "Tags [filename=" + serialize() + ", tags=" + 
toTags() + "]"; + return "Tags [tags=" + toTags() + "]"; } @Override public int hashCode() { final int prime = 31; int result = 1; - result = prime * result + Arrays.hashCode(filenameBytes); + result = prime * result + ((tags == null) ? 0 : tags.hashCode()); return result; } @@ -203,7 +158,10 @@ public class Tags { if (getClass() != obj.getClass()) return false; final Tags other = (Tags) obj; - if (!Arrays.equals(filenameBytes, other.filenameBytes)) + if (tags == null) { + if (other.tags != null) + return false; + } else if (!tags.equals(other.tags)) return false; return true; } @@ -224,11 +182,7 @@ public class Tags { } public boolean isEmpty() { - return filenameBytes == null || filenameBytes.length == 0; - } - - public static Tags create(final String filename) { - return new Tags(filename); + return tags.isEmpty(); } /** @@ -237,7 +191,6 @@ public class Tags { public String asString() { final StringBuilder result = new StringBuilder(); - final SortedSet tags = toTags(); for (final Tag tag : tags) { if (result.length() > 0) {