reduce memory usage

Reduce memory usage by storing the filename as string instead of
individual tags.
This commit is contained in:
2018-03-19 19:21:57 +01:00
parent 181fce805d
commit 5343c0d427
20 changed files with 315 additions and 454 deletions

View File

@@ -2,7 +2,6 @@ package org.lucares.pdb.datastore.internal;
import java.io.IOException;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
@@ -12,11 +11,10 @@ import java.util.Set;
import java.util.SortedSet;
import java.util.TreeSet;
import java.util.concurrent.ConcurrentHashMap;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.stream.Stream;
import org.lucares.collections.IntList;
import org.lucares.pdb.api.StringCompressor;
import org.lucares.pdb.api.Tags;
import org.lucares.pdb.datastore.Doc;
import org.lucares.pdb.datastore.lang.Expression;
@@ -27,37 +25,24 @@ import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
public class DataStore {
private static final Logger EXECUTE_QUERY_LOGGER = LoggerFactory.getLogger("org.lucares.metrics.dataStore.executeQuery");
private static final Logger EXECUTE_QUERY_LOGGER = LoggerFactory
.getLogger("org.lucares.metrics.dataStore.executeQuery");
private static final Logger LOGGER = LoggerFactory.getLogger(DataStore.class);
private static final String SUBDIR_STORAGE = "storage";
private static final String PDB_EXTENSION = ".pdb";
private static final String KEY_VALUE_SEPARATOR = "-";
private static final String KEY_VALUE_PAIR_SEPARATOR = "_";
private static final String KEY_VALUE_END_SEPARATOR = "$";
private static final String REGEX_KEY_VALUE = "[a-zA-Z0-9]+" + Pattern.quote(KEY_VALUE_SEPARATOR) + "[a-zA-Z0-9]+";
private static final String REGEX_KEY_VALUE_PAIRS = REGEX_KEY_VALUE + "(" + Pattern.quote(KEY_VALUE_PAIR_SEPARATOR)
+ REGEX_KEY_VALUE + ")*";;
private static final String REGEX_STORAGE_FILE = String.format("(%1$s)%2$s[0-9]*%3$s", REGEX_KEY_VALUE_PAIRS,
Pattern.quote(KEY_VALUE_END_SEPARATOR), PDB_EXTENSION);
private static final Pattern EXTRACT_TAGS_PATTERN = Pattern.compile(REGEX_STORAGE_FILE);
// to be guarded by itself
private final List<Doc> docIdToDoc = new ArrayList<>();
private final ConcurrentHashMap<Tags, List<Doc>> tagsToDocs = new ConcurrentHashMap<>();
private final ConcurrentHashMap<String, Map<String, IntList>> keyToValueToDocId = new ConcurrentHashMap<>();
private final StringCompressor stringCompressor;
private final FolderStorage folderStorage;
public DataStore(final Path dataDirectory) throws IOException {
stringCompressor = StringCompressor.create(keyCompressionFile(dataDirectory));
Tags.STRING_COMPRESSOR = StringCompressor.create(keyCompressionFile(dataDirectory));
folderStorage = new FolderStorage(storageDirectory(dataDirectory), 1000);
init(folderStorage);
@@ -75,8 +60,8 @@ public class DataStore {
});
trimIntLists();
sortIntLists();
synchronized (docIdToDoc) {
((ArrayList<Doc>)docIdToDoc).trimToSize();
synchronized (docIdToDoc) {
((ArrayList<Doc>) docIdToDoc).trimToSize();
}
}
@@ -84,24 +69,25 @@ public class DataStore {
final int docId;
final Doc newDoc = new Doc(tags, path);
synchronized (docIdToDoc) {
synchronized (docIdToDoc) {
docId = docIdToDoc.size();
docIdToDoc.add(newDoc);
}
tagsToDocs.compute(tags, (t, listOfDocs) -> {
final List<Doc> result = listOfDocs != null ? listOfDocs : new ArrayList<>(2);
result.add(newDoc);
return result;
});
for (final String key : tags.getKeys()) {
final Map<String, IntList> valueToDocIds = keyToValueToDocId.computeIfAbsent(key, k -> new ConcurrentHashMap<>());
final Map<String, IntList> valueToDocIds = keyToValueToDocId.computeIfAbsent(key,
k -> new ConcurrentHashMap<>());
final String value = tags.getValue(key);
final IntList docIds = valueToDocIds.computeIfAbsent(value, v -> new IntList());
synchronized (docIds) {
synchronized (docIds) {
docIds.add(docId);
}
}
@@ -112,37 +98,30 @@ public class DataStore {
int totalBeforeTrim = 0;
int totalAfterTrim = 0;
int totalValues = 0;
for (Map<String, IntList> valueToDocIds : keyToValueToDocId.values()) {
for (IntList intList : valueToDocIds.values()) {
for (final Map<String, IntList> valueToDocIds : keyToValueToDocId.values()) {
for (final IntList intList : valueToDocIds.values()) {
totalBeforeTrim += intList.getCapacity();
intList.trim();
totalAfterTrim += intList.getCapacity();
totalValues += intList.size();
}
}
LOGGER.info(
"trimming IntLists of index: values {}, {} kB before, {} kB after, difference {} kB, took: {} ms",
totalValues,
(totalBeforeTrim * 4) / 1024,
(totalAfterTrim * 4) / 1024,
((totalBeforeTrim - totalAfterTrim) * 4) / 1024,
(totalValues * 4) / 1024,
LOGGER.info("trimming IntLists of index: values {}, {} kB before, {} kB after, difference {} kB, took: {} ms",
totalValues, (totalBeforeTrim * 4) / 1024, (totalAfterTrim * 4) / 1024,
((totalBeforeTrim - totalAfterTrim) * 4) / 1024, (totalValues * 4) / 1024,
(System.nanoTime() - start) / 1_000_000.0);
}
private void sortIntLists() {
final long start = System.nanoTime();
final Collection<Map<String, IntList>> valueToDocIds = keyToValueToDocId.values();
valueToDocIds.stream().flatMap(map -> map.values().stream()).forEach(intList -> intList.sort());
LOGGER.info(
"sorting IntLists, took: {} ms",
(System.nanoTime() - start) / 1_000_000.0);
LOGGER.info("sorting IntLists, took: {} ms", (System.nanoTime() - start) / 1_000_000.0);
}
private Path keyCompressionFile(final Path dataDirectory) throws IOException {
@@ -155,60 +134,16 @@ public class DataStore {
public Path createNewFile(final Tags tags) throws IOException {
final Path filename = toFilename(tags);
final Path result = folderStorage.insert(filename.toString(), PDB_EXTENSION);
final String filename = tags.getFilename();
final Path result = folderStorage.insert(filename, PDB_EXTENSION);
cacheTagToFileMapping(tags, result);
return result;
}
/**
 * Builds the path that encodes the given tags.
 * <p>
 * The tag keys are visited in their natural sort order. For every key, the key
 * and its value are handed to the string compressor and the returned integers
 * are rendered with {@link RadixConverter#toString(int)}. Key and value are
 * joined with {@code KEY_VALUE_SEPARATOR}, consecutive pairs with
 * {@code KEY_VALUE_PAIR_SEPARATOR}, and the result is terminated with
 * {@code KEY_VALUE_END_SEPARATOR}.
 *
 * @param tags the tags to encode
 * @return a path consisting of the encoded tags
 */
private Path toFilename(final Tags tags) {
    final SortedSet<String> orderedKeys = new TreeSet<>(tags.getKeys());
    final StringBuilder name = new StringBuilder();

    // empty before the first pair, the pair separator afterwards
    String pairPrefix = "";
    for (final String tagKey : orderedKeys) {
        final String tagValue = tags.getValue(tagKey);

        // the key is registered before the value, as the compressor hands out integers on demand
        final int keyId = stringCompressor.put(tagKey);
        final int valueId = stringCompressor.put(tagValue);

        name.append(pairPrefix);
        name.append(RadixConverter.toString(keyId));
        name.append(KEY_VALUE_SEPARATOR);
        name.append(RadixConverter.toString(valueId));

        pairPrefix = KEY_VALUE_PAIR_SEPARATOR;
    }
    name.append(KEY_VALUE_END_SEPARATOR);

    return Paths.get(name.toString());
}
private Tags toTags(final String filename) {
Tags tags = Tags.create();
final Matcher matcher = EXTRACT_TAGS_PATTERN.matcher(filename);
if (matcher.find()) {
final String serializedTags = matcher.group(1);
final String[] serializedKeyValuePairs = serializedTags.split(Pattern.quote(KEY_VALUE_PAIR_SEPARATOR));
for (int i = 0; i < serializedKeyValuePairs.length; i++) {
final String[] keyValuePair = serializedKeyValuePairs[i].split(Pattern.quote(KEY_VALUE_SEPARATOR));
if (keyValuePair.length == 2) {
final String key = stringCompressor.get(RadixConverter.fromString(keyValuePair[0]));
final String value = stringCompressor.get(RadixConverter.fromString(keyValuePair[1]));
tags = tags.copyAdd(key, value);
}
}
}
final Tags tags = Tags.create(filename);
return tags;
}
@@ -263,13 +198,11 @@ public class DataStore {
final long start = System.nanoTime();
synchronized (docIdToDoc) {
final Expression expression = QueryLanguageParser.parse(query);
final ExpressionToDocIdVisitor visitor = new ExpressionToDocIdVisitor(
keyToValueToDocId, new AllDocIds(docIdToDoc));
final ExpressionToDocIdVisitor visitor = new ExpressionToDocIdVisitor(keyToValueToDocId,
new AllDocIds(docIdToDoc));
final IntList docIdsList = expression.visit(visitor);
EXECUTE_QUERY_LOGGER.debug(
"executeQuery({}) took {}ms returned {} results ", query,
(System.nanoTime() - start) / 1_000_000.0,
docIdsList.size());
EXECUTE_QUERY_LOGGER.debug("executeQuery({}) took {}ms returned {} results ", query,
(System.nanoTime() - start) / 1_000_000.0, docIdsList.size());
return docIdsList;
}
}
@@ -277,20 +210,20 @@ public class DataStore {
private List<Doc> mapDocIdsToDocs(final IntList docIdsList) {
final List<Doc> result = new ArrayList<>(docIdsList.size());
synchronized (docIdToDoc) {
final int[] intDocIds = docIdsList.toArray();
synchronized (docIdToDoc) {
final int[] intDocIds = docIdsList.toArray();
for (int i = 0; i < intDocIds.length; i++) {
final int docId = intDocIds[i];
final Doc doc = docIdToDoc.get(docId);
result.add(doc);
}
}
return result;
}
public List<Doc> getByTags(Tags tags) {
public List<Doc> getByTags(final Tags tags) {
final List<Doc> result = tagsToDocs.getOrDefault(tags, new ArrayList<>(0));
return result;
}

View File

@@ -1,42 +0,0 @@
package org.lucares.pdb.datastore.internal;
/**
 * Converts non-negative integers to a compact string representation and back,
 * using the characters of {@link #ALPHABET} as digits.
 */
public class RadixConverter {

    /**
     * The digits of the number system, in ascending order of their value.
     * <p>
     * NOTE: the alphabet has 61 symbols; 'Q' is absent and 'c' precedes 'b'. It
     * is deliberately left unchanged, because any modification would change the
     * strings produced for existing values.
     */
    private static final String ALPHABET = "0123456789ABCDEFGHIJKLMNOPRSTUVWXYZacbdefghijklmnopqrstuvwxyz";

    /** The base of the number system, i.e. the number of digits in the alphabet. */
    private static final int RADIX = ALPHABET.length();

    /**
     * Encodes a non-negative integer with the digits of the alphabet.
     *
     * @param value the value to encode, must not be negative
     * @return the encoded value, never empty; {@code 0} is encoded as the first
     *         character of the alphabet
     * @throws IllegalArgumentException if {@code value} is negative
     */
    public static String toString(final int value) {
        if (value < 0) {
            throw new IllegalArgumentException("value must not be negative");
        }

        // collect the digits least significant first, then reverse once
        final StringBuilder digits = new StringBuilder();
        int remaining = value;
        do {
            digits.append(ALPHABET.charAt(remaining % RADIX));
            remaining /= RADIX;
        } while (remaining > 0);

        return digits.reverse().toString();
    }

    /**
     * Decodes a string created by {@link #toString(int)}.
     * <p>
     * An empty string is decoded to {@code 0}.
     *
     * @param string the encoded value
     * @return the decoded integer
     * @throws IllegalArgumentException if the string contains a character that is
     *                                  not part of the alphabet, or if the decoded
     *                                  value does not fit into an {@code int}
     */
    public static int fromString(final String string) {
        int result = 0;
        for (int i = 0; i < string.length(); i++) {
            final char character = string.charAt(i);
            final int digit = ALPHABET.indexOf(character);
            if (digit < 0) {
                // indexOf returns -1 for unknown characters; using it as a digit would
                // silently produce a wrong number
                throw new IllegalArgumentException(
                        "invalid character '" + character + "' at index " + i + " in \"" + string + "\"");
            }
            try {
                result = Math.addExact(Math.multiplyExact(result, RADIX), digit);
            } catch (final ArithmeticException e) {
                throw new IllegalArgumentException("value is too large for an int: \"" + string + "\"", e);
            }
        }
        return result;
    }
}

View File

@@ -1,10 +0,0 @@
package org.lucares.pdb.datastore.internal;
/**
 * Unchecked exception that wraps another {@link Throwable} as its cause.
 */
public class RuntimeIOException extends RuntimeException {

    private static final long serialVersionUID = 1L;

    /**
     * Creates a new exception with the given cause.
     *
     * @param cause the throwable that is wrapped by this exception
     */
    public RuntimeIOException(final Throwable cause) {
        super(cause);
    }
}

View File

@@ -1,32 +0,0 @@
package org.lucares.pdb.datastore.internal;
import java.nio.file.Path;
import org.lucares.pdb.datastore.internal.map.UniqueStringIntegerPairs;
/**
 * Maps Strings to integers and back. All mappings are kept in a
 * {@link UniqueStringIntegerPairs} instance, to which every call is delegated.
 */
public class StringCompressor {

    private final UniqueStringIntegerPairs pairs;

    /**
     * Creates a compressor on top of the given pairs.
     *
     * @param usip the store holding the string/integer mappings
     */
    public StringCompressor(final UniqueStringIntegerPairs usip) throws RuntimeIOException {
        this.pairs = usip;
    }

    /**
     * Creates a compressor whose mappings are stored in a
     * {@link UniqueStringIntegerPairs} created for the given path.
     *
     * @param path the path handed to {@link UniqueStringIntegerPairs}
     * @return a new compressor
     */
    public static StringCompressor create(final Path path) {
        return new StringCompressor(new UniqueStringIntegerPairs(path));
    }

    /**
     * Returns the integer mapped to the given string. If there is no mapping
     * yet, a new one is created with the highest known integer plus one.
     *
     * @param string the string to look up or register
     * @return the integer mapped to the string
     */
    public Integer put(final String string) {
        return pairs.computeIfAbsent(string, this::nextInteger);
    }

    /**
     * Returns the string mapped to the given integer.
     *
     * @param integer the integer to look up
     * @return whatever {@link UniqueStringIntegerPairs#getKey(Integer)} returns
     *         for the integer
     */
    public String get(final int integer) {
        return pairs.getKey(integer);
    }

    /** Computes the integer for a new mapping: one above the highest integer in use. */
    private Integer nextInteger(final String newString) {
        return pairs.getHighestInteger() + 1;
    }
}

View File

@@ -0,0 +1,5 @@
package org.lucares.pdb.datastore.internal;
/**
 * Placeholder class; it currently declares no members.
 */
public class TagsUtils {
}

View File

@@ -1,126 +0,0 @@
package org.lucares.pdb.datastore.internal.map;
import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.Writer;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.HashMap;
import java.util.Map;
import java.util.Set;
import java.util.SortedMap;
import java.util.TreeMap;
import java.util.function.Function;
import java.util.regex.Pattern;
import org.lucares.pdb.datastore.internal.RuntimeIOException;
/**
 * A very simple {@link Set}-like or {@link Map}-like datastructure that stores
 * unique&sup1; pairs of Strings and integers persistently.
 * <p>
 * (1) Unique means, that neither the string, nor the integer may occur twice.
 * For Example, imagine the pair ("a", 1) already exists, then neither ("a", 2)
 * nor ("b", 1) may be added.
 * <p>
 * You can only add pairs. No deletion. It keeps an in memory view for fast
 * retrievals.
 * <p>
 * The backing file contains one pair per line, string and integer separated by
 * a tab. Strings therefore must not contain tabs or line breaks. This class
 * performs no synchronization of its own.
 */
public class UniqueStringIntegerPairs {

    private static final String SEPARATOR = "\t";

    private static final boolean APPEND = true;

    /**
     * Maps a string to an integer. E.g. "myLongValue" -> 123
     */
    private final Map<String, Integer> stringToInt = new HashMap<>();

    /**
     * Maps an integer to a string. E.g. 123 -> "myLongValue"
     */
    private final SortedMap<Integer, String> intToString = new TreeMap<>();

    private final Path file;

    /**
     * Creates the in memory view from the content of the given file. The file,
     * and its parent directories, are created if they do not exist.
     *
     * @param file the file that stores the pairs
     * @throws RuntimeIOException if the file cannot be created or read
     */
    public UniqueStringIntegerPairs(final Path file) {
        super();
        this.file = file;
        init(file);
    }

    /**
     * Ensures the file exists and loads all pairs from it. Lines that do not
     * consist of exactly two tab separated tokens are ignored.
     */
    private void init(final Path file) throws RuntimeIOException {
        try {
            // a path with a single name element has no parent; there is nothing to create then
            final Path parent = file.getParent();
            if (parent != null) {
                Files.createDirectories(parent);
            }
            if (!Files.exists(file)) {
                Files.createFile(file);
            }

            try (final BufferedReader reader = new BufferedReader(
                    new InputStreamReader(new FileInputStream(file.toFile()), StandardCharsets.UTF_8))) {
                String line;
                while ((line = reader.readLine()) != null) {
                    final String[] tokens = line.split(Pattern.quote(SEPARATOR));
                    if (tokens.length == 2) {
                        final String string = tokens[0];
                        final int value = Integer.parseInt(tokens[1]);
                        intToString.put(value, string);
                        stringToInt.put(string, value);
                    }
                }
            }
        } catch (final IOException e) {
            throw new RuntimeIOException(e);
        }
    }

    /**
     * Adds a new pair and appends it to the file.
     *
     * @param first  the string; must not be null and must not contain tabs or
     *               line breaks, because such a string cannot be read back from
     *               the file
     * @param second the integer
     * @throws IllegalArgumentException if the string cannot be stored, or if the
     *                                  string or the integer is already part of
     *                                  a pair
     * @throws RuntimeIOException       if the pair cannot be written to the file
     */
    public void put(final String first, final int second) {
        if (!isStorable(first)) {
            throw new IllegalArgumentException(
                    "String must not be null and must not contain tabs or line breaks: (" + first + ", " + second
                            + ")");
        }
        if (stringToInt.containsKey(first) || intToString.containsKey(second)) {
            throw new IllegalArgumentException("Unique key constraint violation for (" + first + ", " + second + ")");
        }

        try (final Writer writer = new OutputStreamWriter(new FileOutputStream(file.toFile(), APPEND),
                StandardCharsets.UTF_8)) {
            writer.write(first + SEPARATOR + second + "\n");
        } catch (final IOException e) {
            throw new RuntimeIOException(e);
        }

        // the in memory view is only updated after the pair has been written
        intToString.put(second, first);
        stringToInt.put(first, second);
    }

    /**
     * Returns the integer of the pair with the given string.
     *
     * @param first the string
     * @return the integer, or {@code null} if there is no such pair
     */
    public Integer get(final String first) {
        return stringToInt.get(first);
    }

    /**
     * Returns the string of the pair with the given integer.
     *
     * @param second the integer
     * @return the string, or {@code null} if there is no such pair
     */
    public String getKey(final Integer second) {
        return intToString.get(second);
    }

    /**
     * Returns the highest integer of all pairs.
     *
     * @return the highest integer, or {@code -1} if there are no pairs
     */
    public Integer getHighestInteger() {
        return intToString.isEmpty() ? -1 : intToString.lastKey();
    }

    /**
     * Returns the integer of the pair with the given string. If there is no
     * such pair, the integer is computed with the mapping function and the new
     * pair is added via {@link #put(String, int)}.
     *
     * @param first           the string
     * @param mappingFunction computes the integer for a string without a pair
     * @return the integer of the existing or newly added pair
     */
    public Integer computeIfAbsent(final String first, final Function<String, Integer> mappingFunction) {
        final Integer existing = stringToInt.get(first);
        if (existing != null) {
            return existing;
        }

        final Integer second = mappingFunction.apply(first);
        put(first, second);
        return second;
    }

    /**
     * Checks whether the string can be written as one line of the file and
     * parsed back to the same string.
     */
    private static boolean isStorable(final String string) {
        return string != null && !string.contains(SEPARATOR) && string.indexOf('\n') < 0
                && string.indexOf('\r') < 0;
    }
}