reduce memory usage
Reduce memory usage by storing the filename as string instead of individual tags.
This commit is contained in:
@@ -2,7 +2,6 @@ package org.lucares.pdb.datastore.internal;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.nio.file.Path;
|
||||
import java.nio.file.Paths;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collection;
|
||||
import java.util.Collections;
|
||||
@@ -12,11 +11,10 @@ import java.util.Set;
|
||||
import java.util.SortedSet;
|
||||
import java.util.TreeSet;
|
||||
import java.util.concurrent.ConcurrentHashMap;
|
||||
import java.util.regex.Matcher;
|
||||
import java.util.regex.Pattern;
|
||||
import java.util.stream.Stream;
|
||||
|
||||
import org.lucares.collections.IntList;
|
||||
import org.lucares.pdb.api.StringCompressor;
|
||||
import org.lucares.pdb.api.Tags;
|
||||
import org.lucares.pdb.datastore.Doc;
|
||||
import org.lucares.pdb.datastore.lang.Expression;
|
||||
@@ -27,37 +25,24 @@ import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
public class DataStore {
|
||||
private static final Logger EXECUTE_QUERY_LOGGER = LoggerFactory.getLogger("org.lucares.metrics.dataStore.executeQuery");
|
||||
private static final Logger EXECUTE_QUERY_LOGGER = LoggerFactory
|
||||
.getLogger("org.lucares.metrics.dataStore.executeQuery");
|
||||
private static final Logger LOGGER = LoggerFactory.getLogger(DataStore.class);
|
||||
|
||||
private static final String SUBDIR_STORAGE = "storage";
|
||||
private static final String PDB_EXTENSION = ".pdb";
|
||||
private static final String KEY_VALUE_SEPARATOR = "-";
|
||||
private static final String KEY_VALUE_PAIR_SEPARATOR = "_";
|
||||
private static final String KEY_VALUE_END_SEPARATOR = "$";
|
||||
|
||||
private static final String REGEX_KEY_VALUE = "[a-zA-Z0-9]+" + Pattern.quote(KEY_VALUE_SEPARATOR) + "[a-zA-Z0-9]+";
|
||||
|
||||
private static final String REGEX_KEY_VALUE_PAIRS = REGEX_KEY_VALUE + "(" + Pattern.quote(KEY_VALUE_PAIR_SEPARATOR)
|
||||
+ REGEX_KEY_VALUE + ")*";;
|
||||
|
||||
private static final String REGEX_STORAGE_FILE = String.format("(%1$s)%2$s[0-9]*%3$s", REGEX_KEY_VALUE_PAIRS,
|
||||
Pattern.quote(KEY_VALUE_END_SEPARATOR), PDB_EXTENSION);
|
||||
|
||||
private static final Pattern EXTRACT_TAGS_PATTERN = Pattern.compile(REGEX_STORAGE_FILE);
|
||||
|
||||
// to be guarded by itself
|
||||
private final List<Doc> docIdToDoc = new ArrayList<>();
|
||||
|
||||
|
||||
private final ConcurrentHashMap<Tags, List<Doc>> tagsToDocs = new ConcurrentHashMap<>();
|
||||
|
||||
private final ConcurrentHashMap<String, Map<String, IntList>> keyToValueToDocId = new ConcurrentHashMap<>();
|
||||
|
||||
private final StringCompressor stringCompressor;
|
||||
private final FolderStorage folderStorage;
|
||||
|
||||
public DataStore(final Path dataDirectory) throws IOException {
|
||||
stringCompressor = StringCompressor.create(keyCompressionFile(dataDirectory));
|
||||
Tags.STRING_COMPRESSOR = StringCompressor.create(keyCompressionFile(dataDirectory));
|
||||
|
||||
folderStorage = new FolderStorage(storageDirectory(dataDirectory), 1000);
|
||||
init(folderStorage);
|
||||
@@ -75,8 +60,8 @@ public class DataStore {
|
||||
});
|
||||
trimIntLists();
|
||||
sortIntLists();
|
||||
synchronized (docIdToDoc) {
|
||||
((ArrayList<Doc>)docIdToDoc).trimToSize();
|
||||
synchronized (docIdToDoc) {
|
||||
((ArrayList<Doc>) docIdToDoc).trimToSize();
|
||||
}
|
||||
}
|
||||
|
||||
@@ -84,24 +69,25 @@ public class DataStore {
|
||||
|
||||
final int docId;
|
||||
final Doc newDoc = new Doc(tags, path);
|
||||
synchronized (docIdToDoc) {
|
||||
synchronized (docIdToDoc) {
|
||||
docId = docIdToDoc.size();
|
||||
docIdToDoc.add(newDoc);
|
||||
}
|
||||
|
||||
|
||||
tagsToDocs.compute(tags, (t, listOfDocs) -> {
|
||||
final List<Doc> result = listOfDocs != null ? listOfDocs : new ArrayList<>(2);
|
||||
result.add(newDoc);
|
||||
return result;
|
||||
});
|
||||
|
||||
|
||||
for (final String key : tags.getKeys()) {
|
||||
final Map<String, IntList> valueToDocIds = keyToValueToDocId.computeIfAbsent(key, k -> new ConcurrentHashMap<>());
|
||||
final Map<String, IntList> valueToDocIds = keyToValueToDocId.computeIfAbsent(key,
|
||||
k -> new ConcurrentHashMap<>());
|
||||
|
||||
final String value = tags.getValue(key);
|
||||
|
||||
final IntList docIds = valueToDocIds.computeIfAbsent(value, v -> new IntList());
|
||||
synchronized (docIds) {
|
||||
synchronized (docIds) {
|
||||
docIds.add(docId);
|
||||
}
|
||||
}
|
||||
@@ -112,37 +98,30 @@ public class DataStore {
|
||||
int totalBeforeTrim = 0;
|
||||
int totalAfterTrim = 0;
|
||||
int totalValues = 0;
|
||||
for (Map<String, IntList> valueToDocIds : keyToValueToDocId.values()) {
|
||||
|
||||
for (IntList intList : valueToDocIds.values()) {
|
||||
for (final Map<String, IntList> valueToDocIds : keyToValueToDocId.values()) {
|
||||
|
||||
for (final IntList intList : valueToDocIds.values()) {
|
||||
totalBeforeTrim += intList.getCapacity();
|
||||
intList.trim();
|
||||
totalAfterTrim += intList.getCapacity();
|
||||
totalValues += intList.size();
|
||||
}
|
||||
}
|
||||
|
||||
LOGGER.info(
|
||||
"trimming IntLists of index: values {}, {} kB before, {} kB after, difference {} kB, took: {} ms",
|
||||
totalValues,
|
||||
(totalBeforeTrim * 4) / 1024,
|
||||
(totalAfterTrim * 4) / 1024,
|
||||
((totalBeforeTrim - totalAfterTrim) * 4) / 1024,
|
||||
(totalValues * 4) / 1024,
|
||||
|
||||
LOGGER.info("trimming IntLists of index: values {}, {} kB before, {} kB after, difference {} kB, took: {} ms",
|
||||
totalValues, (totalBeforeTrim * 4) / 1024, (totalAfterTrim * 4) / 1024,
|
||||
((totalBeforeTrim - totalAfterTrim) * 4) / 1024, (totalValues * 4) / 1024,
|
||||
(System.nanoTime() - start) / 1_000_000.0);
|
||||
}
|
||||
|
||||
|
||||
private void sortIntLists() {
|
||||
final long start = System.nanoTime();
|
||||
|
||||
|
||||
final Collection<Map<String, IntList>> valueToDocIds = keyToValueToDocId.values();
|
||||
|
||||
|
||||
valueToDocIds.stream().flatMap(map -> map.values().stream()).forEach(intList -> intList.sort());
|
||||
|
||||
|
||||
LOGGER.info(
|
||||
"sorting IntLists, took: {} ms",
|
||||
(System.nanoTime() - start) / 1_000_000.0);
|
||||
|
||||
LOGGER.info("sorting IntLists, took: {} ms", (System.nanoTime() - start) / 1_000_000.0);
|
||||
}
|
||||
|
||||
private Path keyCompressionFile(final Path dataDirectory) throws IOException {
|
||||
@@ -155,60 +134,16 @@ public class DataStore {
|
||||
|
||||
public Path createNewFile(final Tags tags) throws IOException {
|
||||
|
||||
final Path filename = toFilename(tags);
|
||||
final Path result = folderStorage.insert(filename.toString(), PDB_EXTENSION);
|
||||
final String filename = tags.getFilename();
|
||||
final Path result = folderStorage.insert(filename, PDB_EXTENSION);
|
||||
|
||||
cacheTagToFileMapping(tags, result);
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
private Path toFilename(final Tags tags) {
|
||||
final StringBuilder path = new StringBuilder();
|
||||
|
||||
final SortedSet<String> sortedKeys = new TreeSet<>(tags.getKeys());
|
||||
|
||||
for (final String key : sortedKeys) {
|
||||
final String value = tags.getValue(key);
|
||||
|
||||
final int compressedKey = stringCompressor.put(key);
|
||||
final int compressedValue = stringCompressor.put(value);
|
||||
|
||||
if (path.length() > 0) {
|
||||
path.append(KEY_VALUE_PAIR_SEPARATOR);
|
||||
}
|
||||
|
||||
path.append(RadixConverter.toString(compressedKey));
|
||||
path.append(KEY_VALUE_SEPARATOR);
|
||||
path.append(RadixConverter.toString(compressedValue));
|
||||
}
|
||||
path.append(KEY_VALUE_END_SEPARATOR);
|
||||
|
||||
return Paths.get(path.toString());
|
||||
}
|
||||
|
||||
private Tags toTags(final String filename) {
|
||||
Tags tags = Tags.create();
|
||||
|
||||
final Matcher matcher = EXTRACT_TAGS_PATTERN.matcher(filename);
|
||||
|
||||
if (matcher.find()) {
|
||||
final String serializedTags = matcher.group(1);
|
||||
|
||||
final String[] serializedKeyValuePairs = serializedTags.split(Pattern.quote(KEY_VALUE_PAIR_SEPARATOR));
|
||||
|
||||
for (int i = 0; i < serializedKeyValuePairs.length; i++) {
|
||||
final String[] keyValuePair = serializedKeyValuePairs[i].split(Pattern.quote(KEY_VALUE_SEPARATOR));
|
||||
|
||||
if (keyValuePair.length == 2) {
|
||||
|
||||
final String key = stringCompressor.get(RadixConverter.fromString(keyValuePair[0]));
|
||||
final String value = stringCompressor.get(RadixConverter.fromString(keyValuePair[1]));
|
||||
|
||||
tags = tags.copyAdd(key, value);
|
||||
}
|
||||
}
|
||||
}
|
||||
final Tags tags = Tags.create(filename);
|
||||
|
||||
return tags;
|
||||
}
|
||||
@@ -263,13 +198,11 @@ public class DataStore {
|
||||
final long start = System.nanoTime();
|
||||
synchronized (docIdToDoc) {
|
||||
final Expression expression = QueryLanguageParser.parse(query);
|
||||
final ExpressionToDocIdVisitor visitor = new ExpressionToDocIdVisitor(
|
||||
keyToValueToDocId, new AllDocIds(docIdToDoc));
|
||||
final ExpressionToDocIdVisitor visitor = new ExpressionToDocIdVisitor(keyToValueToDocId,
|
||||
new AllDocIds(docIdToDoc));
|
||||
final IntList docIdsList = expression.visit(visitor);
|
||||
EXECUTE_QUERY_LOGGER.debug(
|
||||
"executeQuery({}) took {}ms returned {} results ", query,
|
||||
(System.nanoTime() - start) / 1_000_000.0,
|
||||
docIdsList.size());
|
||||
EXECUTE_QUERY_LOGGER.debug("executeQuery({}) took {}ms returned {} results ", query,
|
||||
(System.nanoTime() - start) / 1_000_000.0, docIdsList.size());
|
||||
return docIdsList;
|
||||
}
|
||||
}
|
||||
@@ -277,20 +210,20 @@ public class DataStore {
|
||||
private List<Doc> mapDocIdsToDocs(final IntList docIdsList) {
|
||||
final List<Doc> result = new ArrayList<>(docIdsList.size());
|
||||
|
||||
synchronized (docIdToDoc) {
|
||||
final int[] intDocIds = docIdsList.toArray();
|
||||
synchronized (docIdToDoc) {
|
||||
final int[] intDocIds = docIdsList.toArray();
|
||||
for (int i = 0; i < intDocIds.length; i++) {
|
||||
final int docId = intDocIds[i];
|
||||
|
||||
|
||||
final Doc doc = docIdToDoc.get(docId);
|
||||
|
||||
|
||||
result.add(doc);
|
||||
}
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
public List<Doc> getByTags(Tags tags) {
|
||||
public List<Doc> getByTags(final Tags tags) {
|
||||
final List<Doc> result = tagsToDocs.getOrDefault(tags, new ArrayList<>(0));
|
||||
return result;
|
||||
}
|
||||
|
||||
@@ -1,42 +0,0 @@
|
||||
package org.lucares.pdb.datastore.internal;
|
||||
|
||||
/**
 * Converts non-negative integers to a compact textual representation and
 * back, using the characters of {@link #ALPHABET} as digits.
 */
public class RadixConverter {

    /**
     * The digits of the number system; the index of a character is its digit
     * value. Note that the alphabet has 61 symbols: it does not contain
     * {@code 'Q'} and lists {@code 'c'} before {@code 'b'}. It is kept as is,
     * because changing it would change the meaning of strings that were
     * produced earlier.
     */
    private static final String ALPHABET = "0123456789ABCDEFGHIJKLMNOPRSTUVWXYZacbdefghijklmnopqrstuvwxyz";

    /** The base of the number system, i.e. the number of digits. */
    private static final int RADIX = ALPHABET.length();

    /**
     * Encodes a non-negative integer.
     *
     * @param value the number to encode, must not be negative
     * @return the encoded number, never empty
     * @throws IllegalArgumentException if {@code value} is negative
     */
    public static String toString(final int value) {

        if (value < 0) {
            throw new IllegalArgumentException("value must not be negative");
        }

        final StringBuilder result = new StringBuilder();
        int v = value;

        // digits are produced least significant first and reversed at the end;
        // the do-while emits a single zero digit for the value 0
        do {
            result.append(ALPHABET.charAt(v % RADIX));
            v /= RADIX;
        } while (v > 0);

        return result.reverse().toString();
    }

    /**
     * Decodes a string produced by {@link #toString(int)}.
     *
     * @param string the encoded number; the empty string decodes to 0
     * @return the decoded number
     * @throws IllegalArgumentException if the string contains a character
     *                                  that is not part of the alphabet, or
     *                                  if the decoded value does not fit into
     *                                  an {@code int}
     */
    public static int fromString(final String string) {

        // accumulate in a long, so that overflow can be detected reliably
        long result = 0;

        for (int i = 0; i < string.length(); i++) {
            final char c = string.charAt(i);
            final int digit = ALPHABET.indexOf(c);
            if (digit < 0) {
                // without this check the -1 from indexOf would silently be
                // folded into the result
                throw new IllegalArgumentException(
                        "illegal character '" + c + "' at index " + i + " in \"" + string + "\"");
            }

            result = result * RADIX + digit;
            if (result > Integer.MAX_VALUE) {
                throw new IllegalArgumentException("value is out of range: \"" + string + "\"");
            }
        }

        return (int) result;
    }

}
|
||||
@@ -1,10 +0,0 @@
|
||||
package org.lucares.pdb.datastore.internal;
|
||||
|
||||
/**
 * Unchecked exception that carries another {@link Throwable} as its cause, so
 * that the cause can be propagated through code that does not declare checked
 * exceptions.
 */
public class RuntimeIOException extends RuntimeException {

    private static final long serialVersionUID = 1L;

    /**
     * Creates an exception that wraps the given cause.
     *
     * @param cause the wrapped problem, available via {@link #getCause()}
     */
    public RuntimeIOException(final Throwable cause) {
        super(cause);
    }
}
|
||||
@@ -1,32 +0,0 @@
|
||||
package org.lucares.pdb.datastore.internal;
|
||||
|
||||
import java.nio.file.Path;
|
||||
|
||||
import org.lucares.pdb.datastore.internal.map.UniqueStringIntegerPairs;
|
||||
|
||||
/**
|
||||
* Persistently maps Strings to integers.
|
||||
*/
|
||||
public class StringCompressor {
|
||||
|
||||
private final UniqueStringIntegerPairs usip;
|
||||
|
||||
public StringCompressor(final UniqueStringIntegerPairs usip) throws RuntimeIOException {
|
||||
this.usip = usip;
|
||||
}
|
||||
|
||||
public static StringCompressor create(final Path path) {
|
||||
final UniqueStringIntegerPairs mapsi = new UniqueStringIntegerPairs(path);
|
||||
return new StringCompressor(mapsi);
|
||||
}
|
||||
|
||||
public Integer put(final String string) {
|
||||
|
||||
return usip.computeIfAbsent(string, s -> usip.getHighestInteger() + 1);
|
||||
}
|
||||
|
||||
public String get(final int integer) {
|
||||
|
||||
return usip.getKey(integer);
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,5 @@
|
||||
package org.lucares.pdb.datastore.internal;
|
||||
|
||||
/**
 * Empty class; it currently declares no fields and no methods.
 */
public class TagsUtils {
}
|
||||
@@ -1,126 +0,0 @@
|
||||
package org.lucares.pdb.datastore.internal.map;
|
||||
|
||||
import java.io.BufferedReader;
|
||||
import java.io.FileInputStream;
|
||||
import java.io.FileOutputStream;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStreamReader;
|
||||
import java.io.OutputStreamWriter;
|
||||
import java.io.Writer;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
import java.util.SortedMap;
|
||||
import java.util.TreeMap;
|
||||
import java.util.function.Function;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
import org.lucares.pdb.datastore.internal.RuntimeIOException;
|
||||
|
||||
/**
|
||||
* A very simple {@link Set}-like or {@link Map}-like datastructure that stores
|
||||
* unique¹ pairs of Strings and integers persistently.
|
||||
* <p>
|
||||
* (1) Unique means, that neither the string, nor the integer may occur twice.
|
||||
* For Example, imagine the pair ("a", 1) already exists, then neither ("a", 2)
|
||||
* nor ("b", 1) may be added.
|
||||
* <p>
|
||||
* You can only add pairs. No deletion. It keeps an in memory view for fast
|
||||
* retrievals.
|
||||
*/
|
||||
public class UniqueStringIntegerPairs {
|
||||
private static final String SEPARATOR = "\t";
|
||||
|
||||
private static final boolean APPEND = true;
|
||||
|
||||
/**
|
||||
* Maps a string to an integer. E.g. "myLongValue" -> 123
|
||||
*/
|
||||
private final Map<String, Integer> stringToInt = new HashMap<>();
|
||||
|
||||
/**
|
||||
* Maps an integer to a string. E.g. 123 -> "myLongValue"
|
||||
*/
|
||||
private final SortedMap<Integer, String> intToString = new TreeMap<>();
|
||||
|
||||
private final Path file;
|
||||
|
||||
public UniqueStringIntegerPairs(final Path file) {
|
||||
super();
|
||||
this.file = file;
|
||||
init(file);
|
||||
}
|
||||
|
||||
private void init(final Path file) throws RuntimeIOException {
|
||||
|
||||
try {
|
||||
Files.createDirectories(file.getParent());
|
||||
if (!Files.exists(file)) {
|
||||
Files.createFile(file);
|
||||
}
|
||||
|
||||
try (final BufferedReader reader = new BufferedReader(
|
||||
new InputStreamReader(new FileInputStream(file.toFile()), StandardCharsets.UTF_8))) {
|
||||
String line;
|
||||
while ((line = reader.readLine()) != null) {
|
||||
|
||||
final String[] tokens = line.split(Pattern.quote(SEPARATOR));
|
||||
|
||||
if (tokens.length == 2) {
|
||||
final String string = tokens[0];
|
||||
final int value = Integer.parseInt(tokens[1]);
|
||||
intToString.put(value, string);
|
||||
stringToInt.put(string, value);
|
||||
}
|
||||
}
|
||||
}
|
||||
} catch (final IOException e) {
|
||||
throw new RuntimeIOException(e);
|
||||
}
|
||||
}
|
||||
|
||||
public void put(final String first, final int second) {
|
||||
|
||||
if (stringToInt.containsKey(first) || intToString.containsKey(second)) {
|
||||
throw new IllegalArgumentException("Unique key constraint violation for (" + first + ", " + second + ")");
|
||||
}
|
||||
|
||||
try (final Writer writer = new OutputStreamWriter(new FileOutputStream(file.toFile(), APPEND),
|
||||
StandardCharsets.UTF_8)) {
|
||||
|
||||
writer.write(first + SEPARATOR + second + "\n");
|
||||
|
||||
} catch (final IOException e) {
|
||||
throw new RuntimeIOException(e);
|
||||
}
|
||||
|
||||
intToString.put(second, first);
|
||||
stringToInt.put(first, second);
|
||||
}
|
||||
|
||||
public Integer get(final String first) {
|
||||
|
||||
return stringToInt.get(first);
|
||||
}
|
||||
|
||||
public String getKey(final Integer second) {
|
||||
return intToString.get(second);
|
||||
}
|
||||
|
||||
public Integer getHighestInteger() {
|
||||
return intToString.size() == 0 ? -1 : intToString.lastKey();
|
||||
}
|
||||
|
||||
public Integer computeIfAbsent(final String first, final Function<String, Integer> mappingFunction) {
|
||||
|
||||
if (!stringToInt.containsKey(first)) {
|
||||
final Integer second = mappingFunction.apply(first);
|
||||
put(first, second);
|
||||
}
|
||||
|
||||
return stringToInt.get(first);
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user