tags are now stored as variable length byte sequences of longs

Replaced Tags.filenameBytes with a SortedSet<Tag>. Tags are now
stored as longs (variable length encoded) in the PersistentMap.
Tags.filenameBytes was introduced to reduce memory consumption when
all tags were held in memory. Tags are now stored in a PersistentMap
and only read when needed.

Moved the VariableByteEncoder into its own project, because it was
needed by pdb-api.
This commit is contained in:
2018-11-17 20:03:46 +01:00
parent b2107acf4e
commit 135ab42cd8
14 changed files with 97 additions and 128 deletions

View File

@@ -1,6 +1,7 @@
apply plugin: 'antlr' apply plugin: 'antlr'
dependencies { dependencies {
compile project(':byte-utils')
compile project(':file-utils') compile project(':file-utils')
compile project(':pdb-utils') compile project(':pdb-utils')

View File

@@ -11,9 +11,9 @@ import java.util.stream.Stream;
import java.util.stream.StreamSupport; import java.util.stream.StreamSupport;
import org.lucares.collections.LongList; import org.lucares.collections.LongList;
import org.lucares.pdb.blockstorage.intsequence.VariableByteEncoder;
import org.lucares.pdb.diskstorage.DiskBlock; import org.lucares.pdb.diskstorage.DiskBlock;
import org.lucares.pdb.diskstorage.DiskStorage; import org.lucares.pdb.diskstorage.DiskStorage;
import org.lucares.utils.byteencoder.VariableByteEncoder;
import org.slf4j.Logger; import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;

View File

@@ -3,8 +3,8 @@ package org.lucares.pdb.blockstorage;
import java.nio.MappedByteBuffer; import java.nio.MappedByteBuffer;
import org.lucares.collections.LongList; import org.lucares.collections.LongList;
import org.lucares.pdb.blockstorage.intsequence.VariableByteEncoder;
import org.lucares.pdb.diskstorage.DiskBlock; import org.lucares.pdb.diskstorage.DiskBlock;
import org.lucares.utils.byteencoder.VariableByteEncoder;
public class BSFileDiskBlock { public class BSFileDiskBlock {

View File

@@ -7,7 +7,7 @@ import java.util.List;
import java.util.function.Function; import java.util.function.Function;
import java.util.function.Predicate; import java.util.function.Predicate;
import org.lucares.pdb.blockstorage.intsequence.VariableByteEncoder; import org.lucares.utils.byteencoder.VariableByteEncoder;
class NodeEntry { class NodeEntry {
enum ValueType { enum ValueType {
@@ -87,7 +87,13 @@ class NodeEntry {
public String toString(final Function<byte[], String> keyDecoder, final Function<byte[], String> valueDecoder) { public String toString(final Function<byte[], String> keyDecoder, final Function<byte[], String> valueDecoder) {
final String valueAsString = isInnerNode() ? String.valueOf(VariableByteEncoder.decodeFirstValue(value)) final String valueAsString = isInnerNode() ? String.valueOf(VariableByteEncoder.decodeFirstValue(value))
: valueDecoder.apply(value); : valueDecoder.apply(value);
final String keyAsString = keyDecoder.apply(key);
final String keyAsString;
if (Arrays.equals(key, PersistentMap.MAX_KEY)) {
keyAsString = "<<<MAX_KEY>>>";
} else {
keyAsString = keyDecoder.apply(key);
}
return "NodeEntry [type=" + type + ", key=" + keyAsString + ", value=" + valueAsString + "]"; return "NodeEntry [type=" + type + ", key=" + keyAsString + ", value=" + valueAsString + "]";
} }

7
byte-utils/.gitignore vendored Normal file
View File

@@ -0,0 +1,7 @@
/.settings/
/.classpath
/.project
/bin/
/build/
/target/
/test-output/

7
byte-utils/build.gradle Normal file
View File

@@ -0,0 +1,7 @@
dependencies {
compile 'org.apache.logging.log4j:log4j-core:2.10.0'
compile 'org.apache.logging.log4j:log4j-slf4j-impl:2.10.0'
compile 'org.lucares:primitiveCollections:0.1.20180908084945'
}

View File

@@ -1,4 +1,4 @@
package org.lucares.pdb.blockstorage.intsequence; package org.lucares.utils.byteencoder;
import java.util.Arrays; import java.util.Arrays;

View File

@@ -1,4 +1,4 @@
package org.lucares.pdb.blockstorage.intsequence; package org.lucares.utils.byteencoder;
import static org.testng.Assert.assertEquals; import static org.testng.Assert.assertEquals;
@@ -6,6 +6,7 @@ import java.util.concurrent.ThreadLocalRandom;
import java.util.concurrent.atomic.AtomicInteger; import java.util.concurrent.atomic.AtomicInteger;
import org.lucares.collections.LongList; import org.lucares.collections.LongList;
import org.lucares.utils.byteencoder.VariableByteEncoder;
import org.testng.Assert; import org.testng.Assert;
import org.testng.annotations.DataProvider; import org.testng.annotations.DataProvider;
import org.testng.annotations.Test; import org.testng.annotations.Test;

View File

@@ -19,7 +19,6 @@ import org.lucares.pdb.api.StringCompressor;
import org.lucares.pdb.api.Tag; import org.lucares.pdb.api.Tag;
import org.lucares.pdb.api.Tags; import org.lucares.pdb.api.Tags;
import org.lucares.pdb.blockstorage.BSFile; import org.lucares.pdb.blockstorage.BSFile;
import org.lucares.pdb.blockstorage.intsequence.VariableByteEncoder;
import org.lucares.pdb.datastore.Doc; import org.lucares.pdb.datastore.Doc;
import org.lucares.pdb.datastore.Proposal; import org.lucares.pdb.datastore.Proposal;
import org.lucares.pdb.datastore.lang.Expression; import org.lucares.pdb.datastore.lang.Expression;
@@ -29,6 +28,7 @@ import org.lucares.pdb.diskstorage.DiskStorage;
import org.lucares.pdb.map.PersistentMap; import org.lucares.pdb.map.PersistentMap;
import org.lucares.pdb.map.PersistentMap.EncoderDecoder; import org.lucares.pdb.map.PersistentMap.EncoderDecoder;
import org.lucares.utils.Preconditions; import org.lucares.utils.Preconditions;
import org.lucares.utils.byteencoder.VariableByteEncoder;
import org.slf4j.Logger; import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;
@@ -52,13 +52,13 @@ public class DataStore implements AutoCloseable {
private static final EncoderDecoder<Tags> ENCODER_TAGS = new EncoderDecoder<>() { private static final EncoderDecoder<Tags> ENCODER_TAGS = new EncoderDecoder<>() {
@Override @Override
public byte[] encode(final Tags object) { public byte[] encode(final Tags tags) {
return object.getFilenameBytes(); return tags.toBytes();
} }
@Override @Override
public Tags decode(final byte[] bytes) { public Tags decode(final byte[] bytes) {
return new Tags(bytes); return Tags.fromBytes(bytes);
} }
}; };
@@ -68,7 +68,7 @@ public class DataStore implements AutoCloseable {
public byte[] encode(final Doc doc) { public byte[] encode(final Doc doc) {
final byte[] rootBlockNumber = VariableByteEncoder.encode(doc.getRootBlockNumber()); final byte[] rootBlockNumber = VariableByteEncoder.encode(doc.getRootBlockNumber());
final byte[] tags = doc.getTags().getFilenameBytes(); final byte[] tags = doc.getTags().toBytes();
final byte[] result = new byte[rootBlockNumber.length + tags.length]; final byte[] result = new byte[rootBlockNumber.length + tags.length];
@@ -83,7 +83,7 @@ public class DataStore implements AutoCloseable {
final long rootBlockNumber = VariableByteEncoder.decodeFirstValue(bytes); final long rootBlockNumber = VariableByteEncoder.decodeFirstValue(bytes);
final int bytesRootBlockNumber = VariableByteEncoder.neededBytes(rootBlockNumber); final int bytesRootBlockNumber = VariableByteEncoder.neededBytes(rootBlockNumber);
final Tags tags = new Tags(Arrays.copyOfRange(bytes, bytesRootBlockNumber, bytes.length)); final Tags tags = Tags.fromBytes(Arrays.copyOfRange(bytes, bytesRootBlockNumber, bytes.length));
return new Doc(tags, rootBlockNumber); return new Doc(tags, rootBlockNumber);
} }
}; };

View File

@@ -1,5 +1,6 @@
dependencies { dependencies {
compile project(':byte-utils')
compile project(':pdb-utils') compile project(':pdb-utils')
compile project(':file-utils') compile project(':file-utils')
compile 'org.lucares:primitiveCollections:0.1.20180908084945' compile 'org.lucares:primitiveCollections:0.1.20180908084945'

View File

@@ -18,7 +18,7 @@ public class StringCompressor {
return new StringCompressor(mapsi); return new StringCompressor(mapsi);
} }
public Integer put(final String string) { public int put(final String string) {
return usip.computeIfAbsent(string, s -> usip.getHighestInteger() + 1); return usip.computeIfAbsent(string, s -> usip.getHighestInteger() + 1);
} }

View File

@@ -0,0 +1,8 @@
package org.lucares.pdb.api;
import java.util.Comparator;
/**
 * Holder for the shared {@link Comparator} that orders {@link Tag}s first by
 * key and, for equal keys, by value.
 * <p>
 * The class only exposes the {@link #INSTANCE} constant, so it is final and
 * cannot be instantiated.
 */
public final class TagByKeyAndValueComparator {
    /**
     * Compares tags by the natural ordering of {@link Tag#getKey()} and breaks
     * ties with the natural ordering of {@link Tag#getValue()}.
     */
    public static final Comparator<Tag> INSTANCE = Comparator.comparing(Tag::getKey).thenComparing(Tag::getValue);

    private TagByKeyAndValueComparator() {
        // constant holder, not meant to be instantiated
    }
}

View File

@@ -1,15 +0,0 @@
package org.lucares.pdb.api;
import java.io.Serializable;
import java.util.Comparator;
/**
 * Comparator that orders {@link Tag}s by their key only, ignoring case. The
 * tag value does not take part in the comparison.
 */
public class TagByKeyComparator implements Comparator<Tag>, Serializable {
    private static final long serialVersionUID = -6683582291996307323L;

    /** Shared instance; the comparator has no state of its own. */
    public static final TagByKeyComparator INSTANCE = new TagByKeyComparator();

    /**
     * Compares the keys of the two tags case-insensitively.
     *
     * @param a the first tag
     * @param b the second tag
     * @return a negative number, zero, or a positive number if the key of
     *         {@code a} sorts before, equal to, or after the key of {@code b}
     */
    @Override
    public int compare(final Tag a, final Tag b) {
        final String leftKey = a.getKey();
        final String rightKey = b.getKey();
        return leftKey.compareToIgnoreCase(rightKey);
    }
}

View File

@@ -1,9 +1,8 @@
package org.lucares.pdb.api; package org.lucares.pdb.api;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection; import java.util.Collection;
import java.util.Collections;
import java.util.List; import java.util.List;
import java.util.Objects; import java.util.Objects;
import java.util.Set; import java.util.Set;
@@ -11,57 +10,30 @@ import java.util.SortedSet;
import java.util.TreeSet; import java.util.TreeSet;
import java.util.function.BiConsumer; import java.util.function.BiConsumer;
import java.util.function.Function; import java.util.function.Function;
import java.util.regex.Matcher;
import java.util.regex.Pattern; import org.lucares.collections.LongList;
import org.lucares.utils.byteencoder.VariableByteEncoder;
public class Tags { public class Tags {
public static StringCompressor STRING_COMPRESSOR = null; public static StringCompressor STRING_COMPRESSOR = null;
public static final byte[] EMPTY_BYTES = new byte[0]; private static final byte[] EMPTY_BYTES = new byte[0];
public static final Tags EMPTY = new Tags(); public static final Tags EMPTY = new Tags();
public static final String KEY_VALUE_SEPARATOR = "-"; private final SortedSet<Tag> tags;
public static final String KEY_VALUE_PAIR_SEPARATOR = "_";
public static final String KEY_VALUE_END_SEPARATOR = "$";
private static final String REGEX_KEY_VALUE = "[a-zA-Z0-9]+" + Pattern.quote(KEY_VALUE_SEPARATOR) + "[a-zA-Z0-9]+";
private static final String REGEX_KEY_VALUE_PAIRS = REGEX_KEY_VALUE + "(" + Pattern.quote(KEY_VALUE_PAIR_SEPARATOR)
+ REGEX_KEY_VALUE + ")*";;
private static final String REGEX_STORAGE_FILE = String.format("(%1$s)", REGEX_KEY_VALUE_PAIRS);
private static final Pattern EXTRACT_TAGS_PATTERN = Pattern.compile(REGEX_STORAGE_FILE);
private final byte[] filenameBytes;
public Tags() { public Tags() {
filenameBytes = EMPTY_BYTES; tags = new TreeSet<>(TagByKeyAndValueComparator.INSTANCE);
} }
public Tags(final byte[] filenameBytes) { public Tags(final Collection<Tag> tags) {
this(new String(filenameBytes, StandardCharsets.UTF_8)); this.tags = new TreeSet<>(TagByKeyAndValueComparator.INSTANCE);
} this.tags.addAll(tags);
public Tags(final String serializedTags) {
// serialized tags look like this: 0-1_2-1M_H-28_4-5$1.pdb
// there can be several files for the same set of tags, in which case the number
// after the $ is incremented
// We only take the part until the $.
final int end = serializedTags.indexOf(KEY_VALUE_END_SEPARATOR);
final String normalizedFilename;
if (end >= 0) {
normalizedFilename = serializedTags.substring(0, end);
} else {
normalizedFilename = serializedTags;
}
this.filenameBytes = normalizedFilename.getBytes(StandardCharsets.UTF_8);
} }
public static Tags create(final Collection<Tag> tags) { public static Tags create(final Collection<Tag> tags) {
final String newFilename = toFilename(tags);
return new Tags(newFilename); return new Tags(tags);
} }
public static Tags create() { public static Tags create() {
@@ -85,12 +57,42 @@ public class Tags {
return result; return result;
} }
public String serialize() { public static Tags fromBytes(final byte[] bytes) {
return new String(this.filenameBytes, StandardCharsets.UTF_8); final SortedSet<Tag> result = new TreeSet<>(TagByKeyAndValueComparator.INSTANCE);
final LongList keyValuesAsLongs = VariableByteEncoder.decode(bytes);
for (int i = 0; i < keyValuesAsLongs.size(); i += 2) {
final long keyAsLong = keyValuesAsLongs.get(i);
final long valueAsLong = keyValuesAsLongs.get(i + 1);
final String key = STRING_COMPRESSOR.get((int) keyAsLong);
final String value = STRING_COMPRESSOR.get((int) valueAsLong);
result.add(new Tag(key, value));
} }
public byte[] getFilenameBytes() { return new Tags(result);
return filenameBytes; }
public byte[] toBytes() {
final byte[] result;
if (tags.size() > 0) {
final LongList keyValuesAsLongs = new LongList(tags.size() * 2);
for (final Tag tag : tags) {
final long keyAsLong = STRING_COMPRESSOR.put(tag.getKey());
final long valueAsLong = STRING_COMPRESSOR.put(tag.getValue());
keyValuesAsLongs.add(keyAsLong);
keyValuesAsLongs.add(valueAsLong);
}
result = VariableByteEncoder.encode(keyValuesAsLongs);
} else {
result = EMPTY_BYTES;
}
return result;
} }
public String getValue(final String key) { public String getValue(final String key) {
@@ -105,54 +107,7 @@ public class Tags {
} }
public SortedSet<Tag> toTags() { public SortedSet<Tag> toTags() {
final SortedSet<Tag> result = new TreeSet<>(TagByKeyComparator.INSTANCE); return Collections.unmodifiableSortedSet(tags);
final String filename = new String(this.filenameBytes, StandardCharsets.UTF_8);
final Matcher matcher = EXTRACT_TAGS_PATTERN.matcher(filename);
if (matcher.find()) {
final String serializedTags = matcher.group(1);
final String[] serializedKeyValuePairs = serializedTags.split(Pattern.quote(KEY_VALUE_PAIR_SEPARATOR));
for (int i = 0; i < serializedKeyValuePairs.length; i++) {
final String[] keyValuePair = serializedKeyValuePairs[i].split(Pattern.quote(KEY_VALUE_SEPARATOR));
if (keyValuePair.length == 2) {
final String key = STRING_COMPRESSOR.get(RadixConverter.fromString(keyValuePair[0]));
final String value = STRING_COMPRESSOR.get(RadixConverter.fromString(keyValuePair[1]));
result.add(new Tag(key, value));
}
}
}
return result;
}
private static String toFilename(final Collection<Tag> tags) {
final StringBuilder path = new StringBuilder();
final Tag[] tagsAsArray = tags.toArray(new Tag[tags.size()]);
Arrays.sort(tagsAsArray, TagByKeyComparator.INSTANCE);
for (final Tag tag : tagsAsArray) {
final String key = tag.getKey();
final String value = tag.getValue();
final int compressedKey = STRING_COMPRESSOR.put(key);
final int compressedValue = STRING_COMPRESSOR.put(value);
if (path.length() > 0) {
path.append(Tags.KEY_VALUE_PAIR_SEPARATOR);
}
path.append(RadixConverter.toString(compressedKey));
path.append(Tags.KEY_VALUE_SEPARATOR);
path.append(RadixConverter.toString(compressedValue));
}
path.append(Tags.KEY_VALUE_END_SEPARATOR);
return path.toString();
} }
public Set<String> getKeys() { public Set<String> getKeys() {
@@ -183,14 +138,14 @@ public class Tags {
@Override @Override
public String toString() { public String toString() {
return "Tags [filename=" + serialize() + ", tags=" + toTags() + "]"; return "Tags [tags=" + toTags() + "]";
} }
@Override @Override
public int hashCode() { public int hashCode() {
final int prime = 31; final int prime = 31;
int result = 1; int result = 1;
result = prime * result + Arrays.hashCode(filenameBytes); result = prime * result + ((tags == null) ? 0 : tags.hashCode());
return result; return result;
} }
@@ -203,7 +158,10 @@ public class Tags {
if (getClass() != obj.getClass()) if (getClass() != obj.getClass())
return false; return false;
final Tags other = (Tags) obj; final Tags other = (Tags) obj;
if (!Arrays.equals(filenameBytes, other.filenameBytes)) if (tags == null) {
if (other.tags != null)
return false;
} else if (!tags.equals(other.tags))
return false; return false;
return true; return true;
} }
@@ -224,11 +182,7 @@ public class Tags {
} }
public boolean isEmpty() { public boolean isEmpty() {
return filenameBytes == null || filenameBytes.length == 0; return tags.isEmpty();
}
public static Tags create(final String filename) {
return new Tags(filename);
} }
/** /**
@@ -237,7 +191,6 @@ public class Tags {
public String asString() { public String asString() {
final StringBuilder result = new StringBuilder(); final StringBuilder result = new StringBuilder();
final SortedSet<Tag> tags = toTags();
for (final Tag tag : tags) { for (final Tag tag : tags) {
if (result.length() > 0) { if (result.length() > 0) {