tags are now stored as variable length byte sequences of longs

Replaced Tags.filenameBytes with a SortedSet<Tag>. Tags are now
stored as longs (variable length encoded) in the PersistentMap.
Tags.filenameBytes was introduced to reduce memory consumption when
all tags were held in memory. Tags are now stored in a PersistentMap
and only read when needed.

Moved the VariableByteEncoder into its own project, because it was
needed by pdb-api.
This commit is contained in:
2018-11-17 20:03:46 +01:00
parent b2107acf4e
commit 135ab42cd8
14 changed files with 97 additions and 128 deletions

View File

@@ -18,7 +18,7 @@ public class StringCompressor {
return new StringCompressor(mapsi);
}
public Integer put(final String string) {
public int put(final String string) {
return usip.computeIfAbsent(string, s -> usip.getHighestInteger() + 1);
}

View File

@@ -0,0 +1,8 @@
package org.lucares.pdb.api;
import java.util.Comparator;
/**
 * Holds the shared {@link Comparator} that orders {@link Tag}s by key first
 * and, for equal keys, by value.
 */
public class TagByKeyAndValueComparator {
    /** Compares tags by {@code getKey()}, then by {@code getValue()}. */
    public static final Comparator<Tag> INSTANCE = byKeyThenValue();

    /** Builds the two-level comparator: key is the primary criterion, value the tie-breaker. */
    private static Comparator<Tag> byKeyThenValue() {
        final Comparator<Tag> byKey = Comparator.comparing(Tag::getKey);
        return byKey.thenComparing(Tag::getValue);
    }
}

View File

@@ -1,15 +0,0 @@
package org.lucares.pdb.api;
import java.io.Serializable;
import java.util.Comparator;
/**
 * Serializable comparator that orders {@link Tag}s by their key, ignoring
 * case. The tag value does not take part in the comparison.
 */
public class TagByKeyComparator implements Comparator<Tag>, Serializable {
    private static final long serialVersionUID = -6683582291996307323L;

    /** Shared instance; the comparator holds no state. */
    public static final TagByKeyComparator INSTANCE = new TagByKeyComparator();

    /**
     * Compares the keys of the two tags case-insensitively.
     *
     * @param a the first tag
     * @param b the second tag
     * @return a negative number, zero, or a positive number if the key of
     *         {@code a} sorts before, equal to, or after the key of {@code b}
     */
    @Override
    public int compare(final Tag a, final Tag b) {
        final String leftKey = a.getKey();
        final String rightKey = b.getKey();
        return String.CASE_INSENSITIVE_ORDER.compare(leftKey, rightKey);
    }
}

View File

@@ -1,9 +1,8 @@
package org.lucares.pdb.api;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.List;
import java.util.Objects;
import java.util.Set;
@@ -11,57 +10,30 @@ import java.util.SortedSet;
import java.util.TreeSet;
import java.util.function.BiConsumer;
import java.util.function.Function;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.lucares.collections.LongList;
import org.lucares.utils.byteencoder.VariableByteEncoder;
public class Tags {
public static StringCompressor STRING_COMPRESSOR = null;
public static final byte[] EMPTY_BYTES = new byte[0];
private static final byte[] EMPTY_BYTES = new byte[0];
public static final Tags EMPTY = new Tags();
public static final String KEY_VALUE_SEPARATOR = "-";
public static final String KEY_VALUE_PAIR_SEPARATOR = "_";
public static final String KEY_VALUE_END_SEPARATOR = "$";
private static final String REGEX_KEY_VALUE = "[a-zA-Z0-9]+" + Pattern.quote(KEY_VALUE_SEPARATOR) + "[a-zA-Z0-9]+";
private static final String REGEX_KEY_VALUE_PAIRS = REGEX_KEY_VALUE + "(" + Pattern.quote(KEY_VALUE_PAIR_SEPARATOR)
+ REGEX_KEY_VALUE + ")*";;
private static final String REGEX_STORAGE_FILE = String.format("(%1$s)", REGEX_KEY_VALUE_PAIRS);
private static final Pattern EXTRACT_TAGS_PATTERN = Pattern.compile(REGEX_STORAGE_FILE);
private final byte[] filenameBytes;
private final SortedSet<Tag> tags;
public Tags() {
filenameBytes = EMPTY_BYTES;
tags = new TreeSet<>(TagByKeyAndValueComparator.INSTANCE);
}
public Tags(final byte[] filenameBytes) {
this(new String(filenameBytes, StandardCharsets.UTF_8));
}
public Tags(final String serializedTags) {
// serialized tags look like this: 0-1_2-1M_H-28_4-5$1.pdb
// there can be several files for the same set of tags, in which case the number
// after the $ is incremented
// We only take the part until the $.
final int end = serializedTags.indexOf(KEY_VALUE_END_SEPARATOR);
final String normalizedFilename;
if (end >= 0) {
normalizedFilename = serializedTags.substring(0, end);
} else {
normalizedFilename = serializedTags;
}
this.filenameBytes = normalizedFilename.getBytes(StandardCharsets.UTF_8);
public Tags(final Collection<Tag> tags) {
this.tags = new TreeSet<>(TagByKeyAndValueComparator.INSTANCE);
this.tags.addAll(tags);
}
public static Tags create(final Collection<Tag> tags) {
final String newFilename = toFilename(tags);
return new Tags(newFilename);
return new Tags(tags);
}
public static Tags create() {
@@ -85,12 +57,42 @@ public class Tags {
return result;
}
public String serialize() {
return new String(this.filenameBytes, StandardCharsets.UTF_8);
/**
 * Decodes tags from their variable-length byte representation.
 * <p>
 * The bytes are decoded into a flat list of longs that holds alternating key
 * and value ids. Each id is resolved to its string via
 * {@link #STRING_COMPRESSOR}.
 *
 * @param bytes the encoded tags, in the layout written by {@link #toBytes()}
 * @return the decoded tags
 * @throws IllegalArgumentException if the decoded ids do not form complete
 *         key/value pairs
 * @throws ArithmeticException if a decoded id does not fit into an int
 */
public static Tags fromBytes(final byte[] bytes) {
    final LongList keyValuesAsLongs = VariableByteEncoder.decode(bytes);
    // Ids come in (key, value) pairs; reject a dangling key up front instead of
    // failing on the out-of-range read of its missing value.
    if (keyValuesAsLongs.size() % 2 != 0) {
        throw new IllegalArgumentException(
                "expected an even number of key/value ids, but got " + keyValuesAsLongs.size());
    }

    final SortedSet<Tag> result = new TreeSet<>(TagByKeyAndValueComparator.INSTANCE);
    for (int i = 0; i < keyValuesAsLongs.size(); i += 2) {
        // Math.toIntExact fails loudly on ids outside the int range; a plain
        // (int) cast would silently truncate and resolve to the wrong string.
        final int keyId = Math.toIntExact(keyValuesAsLongs.get(i));
        final int valueId = Math.toIntExact(keyValuesAsLongs.get(i + 1));
        final String key = STRING_COMPRESSOR.get(keyId);
        final String value = STRING_COMPRESSOR.get(valueId);
        result.add(new Tag(key, value));
    }
    return new Tags(result);
}
public byte[] getFilenameBytes() {
return filenameBytes;
/**
 * Encodes these tags as a variable-length byte sequence.
 * <p>
 * Every tag contributes two longs, the id of its key followed by the id of
 * its value, both obtained from {@link #STRING_COMPRESSOR}. The resulting list
 * is encoded with {@link VariableByteEncoder}.
 *
 * @return the encoded tags, or an empty array if there are no tags
 */
public byte[] toBytes() {
    if (tags.isEmpty()) {
        return EMPTY_BYTES;
    }

    // two ids (key + value) per tag
    final LongList ids = new LongList(tags.size() * 2);
    for (final Tag tag : tags) {
        final long keyId = STRING_COMPRESSOR.put(tag.getKey());
        final long valueId = STRING_COMPRESSOR.put(tag.getValue());
        ids.add(keyId);
        ids.add(valueId);
    }
    return VariableByteEncoder.encode(ids);
}
public String getValue(final String key) {
@@ -105,54 +107,7 @@ public class Tags {
}
public SortedSet<Tag> toTags() {
final SortedSet<Tag> result = new TreeSet<>(TagByKeyComparator.INSTANCE);
final String filename = new String(this.filenameBytes, StandardCharsets.UTF_8);
final Matcher matcher = EXTRACT_TAGS_PATTERN.matcher(filename);
if (matcher.find()) {
final String serializedTags = matcher.group(1);
final String[] serializedKeyValuePairs = serializedTags.split(Pattern.quote(KEY_VALUE_PAIR_SEPARATOR));
for (int i = 0; i < serializedKeyValuePairs.length; i++) {
final String[] keyValuePair = serializedKeyValuePairs[i].split(Pattern.quote(KEY_VALUE_SEPARATOR));
if (keyValuePair.length == 2) {
final String key = STRING_COMPRESSOR.get(RadixConverter.fromString(keyValuePair[0]));
final String value = STRING_COMPRESSOR.get(RadixConverter.fromString(keyValuePair[1]));
result.add(new Tag(key, value));
}
}
}
return result;
}
private static String toFilename(final Collection<Tag> tags) {
final StringBuilder path = new StringBuilder();
final Tag[] tagsAsArray = tags.toArray(new Tag[tags.size()]);
Arrays.sort(tagsAsArray, TagByKeyComparator.INSTANCE);
for (final Tag tag : tagsAsArray) {
final String key = tag.getKey();
final String value = tag.getValue();
final int compressedKey = STRING_COMPRESSOR.put(key);
final int compressedValue = STRING_COMPRESSOR.put(value);
if (path.length() > 0) {
path.append(Tags.KEY_VALUE_PAIR_SEPARATOR);
}
path.append(RadixConverter.toString(compressedKey));
path.append(Tags.KEY_VALUE_SEPARATOR);
path.append(RadixConverter.toString(compressedValue));
}
path.append(Tags.KEY_VALUE_END_SEPARATOR);
return path.toString();
return Collections.unmodifiableSortedSet(tags);
}
public Set<String> getKeys() {
@@ -183,14 +138,14 @@ public class Tags {
@Override
public String toString() {
return "Tags [filename=" + serialize() + ", tags=" + toTags() + "]";
return "Tags [tags=" + toTags() + "]";
}
@Override
public int hashCode() {
final int prime = 31;
int result = 1;
result = prime * result + Arrays.hashCode(filenameBytes);
result = prime * result + ((tags == null) ? 0 : tags.hashCode());
return result;
}
@@ -203,7 +158,10 @@ public class Tags {
if (getClass() != obj.getClass())
return false;
final Tags other = (Tags) obj;
if (!Arrays.equals(filenameBytes, other.filenameBytes))
if (tags == null) {
if (other.tags != null)
return false;
} else if (!tags.equals(other.tags))
return false;
return true;
}
@@ -224,11 +182,7 @@ public class Tags {
}
public boolean isEmpty() {
return filenameBytes == null || filenameBytes.length == 0;
}
public static Tags create(final String filename) {
return new Tags(filename);
return tags.isEmpty();
}
/**
@@ -237,7 +191,6 @@ public class Tags {
public String asString() {
final StringBuilder result = new StringBuilder();
final SortedSet<Tag> tags = toTags();
for (final Tag tag : tags) {
if (result.length() > 0) {