parallelize initialization of DataStore

With the files already in the OS cache, the initialization time
for 750k files went down from 35 seconds to 15 seconds.
This commit is contained in:
ahr
2017-12-23 08:58:42 +01:00
parent a6251074cf
commit e59caa0f02

View File

@@ -5,12 +5,12 @@ import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.SortedSet;
import java.util.TreeSet;
import java.util.concurrent.ConcurrentHashMap;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.stream.Stream;
@@ -47,7 +47,7 @@ public class DataStore {
private final List<Doc> docIdToDoc = new ArrayList<>();
private final Map<String, Map<String, IntList>> keyToValueToDocId = new HashMap<>();
private final Map<String, Map<String, IntList>> keyToValueToDocId = new ConcurrentHashMap<>();
private final StringCompressor stringCompressor;
private final FolderStorage folderStorage;
@@ -62,7 +62,7 @@ public class DataStore {
private void init(final FolderStorage folderStorage) throws IOException {
final Stream<Path> files = folderStorage.list();
files.forEach(path -> {
files.parallel().forEach(path -> {
final String filename = path.getFileName().toString();
final Tags tags = toTags(filename);
@@ -78,14 +78,16 @@ public class DataStore {
docIdToDoc.add(new Doc(tags, path));
for (final String key : tags.getKeys()) {
final Map<String, IntList> valueToDocIds = keyToValueToDocId.computeIfAbsent(key, k -> new HashMap<>());
final Map<String, IntList> valueToDocIds = keyToValueToDocId.computeIfAbsent(key, k -> new ConcurrentHashMap<>());
final String value = tags.getValue(key);
final IntList docIds = valueToDocIds.computeIfAbsent(value, v -> new IntList());
synchronized (docIds) {
docIds.add(docId);
}
}
}
private void trimIntLists() {
final long start = System.nanoTime();
@@ -103,9 +105,10 @@ public class DataStore {
}
LOGGER.info(
"trimming IntLists of index: values {}, {} kB before, {} kB after, difference {} kB, total size: {} kB, took: {} ms",
"trimming IntLists of index: values {}, {} kB before, {} kB after, difference {} kB, took: {} ms",
totalValues,
(totalBeforeTrim * 4) / 1024, (totalAfterTrim * 4) / 1024,
(totalBeforeTrim * 4) / 1024,
(totalAfterTrim * 4) / 1024,
((totalBeforeTrim - totalAfterTrim) * 4) / 1024,
(totalValues * 4) / 1024,
(System.nanoTime() - start) / 1_000_000.0);