add index for tags-to-documents
Finding the writer for a set of tags is now much faster, because we no longer have to execute a query for the documents that match the tags; we can simply look the documents up in the map. Speedup: 2-4 ms -> 0.002-0.01 ms.
This commit is contained in:
@@ -39,4 +39,8 @@ public class PdbDB {
|
|||||||
return proposer.propose(query, caretIndex);
|
return proposer.propose(query, caretIndex);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public List<Doc> getByTags(Tags tags) {
|
||||||
|
|
||||||
|
return dataStore.getByTags(tags);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -49,7 +49,9 @@ public class DataStore {
|
|||||||
// to be guarded by itself
|
// to be guarded by itself
|
||||||
private final List<Doc> docIdToDoc = new ArrayList<>();
|
private final List<Doc> docIdToDoc = new ArrayList<>();
|
||||||
|
|
||||||
private final Map<String, Map<String, IntList>> keyToValueToDocId = new ConcurrentHashMap<>();
|
private final ConcurrentHashMap<Tags, List<Doc>> tagsToDocs = new ConcurrentHashMap<>();
|
||||||
|
|
||||||
|
private final ConcurrentHashMap<String, Map<String, IntList>> keyToValueToDocId = new ConcurrentHashMap<>();
|
||||||
|
|
||||||
private final StringCompressor stringCompressor;
|
private final StringCompressor stringCompressor;
|
||||||
private final FolderStorage folderStorage;
|
private final FolderStorage folderStorage;
|
||||||
@@ -81,11 +83,18 @@ public class DataStore {
|
|||||||
private void cacheTagToFileMapping(final Tags tags, final Path path) {
|
private void cacheTagToFileMapping(final Tags tags, final Path path) {
|
||||||
|
|
||||||
final int docId;
|
final int docId;
|
||||||
|
final Doc newDoc = new Doc(tags, path);
|
||||||
synchronized (docIdToDoc) {
|
synchronized (docIdToDoc) {
|
||||||
docId = docIdToDoc.size();
|
docId = docIdToDoc.size();
|
||||||
docIdToDoc.add(new Doc(tags, path));
|
docIdToDoc.add(newDoc);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
tagsToDocs.compute(tags, (t, listOfDocs) -> {
|
||||||
|
final List<Doc> result = listOfDocs != null ? listOfDocs : new ArrayList<>(2);
|
||||||
|
result.add(newDoc);
|
||||||
|
return result;
|
||||||
|
});
|
||||||
|
|
||||||
for (final String key : tags.getKeys()) {
|
for (final String key : tags.getKeys()) {
|
||||||
final Map<String, IntList> valueToDocIds = keyToValueToDocId.computeIfAbsent(key, k -> new ConcurrentHashMap<>());
|
final Map<String, IntList> valueToDocIds = keyToValueToDocId.computeIfAbsent(key, k -> new ConcurrentHashMap<>());
|
||||||
|
|
||||||
@@ -280,4 +289,9 @@ public class DataStore {
|
|||||||
}
|
}
|
||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public List<Doc> getByTags(Tags tags) {
|
||||||
|
final List<Doc> result = tagsToDocs.getOrDefault(tags, new ArrayList<>(0));
|
||||||
|
return result;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -12,7 +12,6 @@ import java.util.Map.Entry;
|
|||||||
|
|
||||||
import org.lucares.pdb.api.Tags;
|
import org.lucares.pdb.api.Tags;
|
||||||
import org.lucares.pdb.datastore.Doc;
|
import org.lucares.pdb.datastore.Doc;
|
||||||
import org.lucares.pdb.datastore.internal.DataStore;
|
|
||||||
import org.lucares.utils.CollectionUtils;
|
import org.lucares.utils.CollectionUtils;
|
||||||
import org.lucares.utils.file.FileUtils;
|
import org.lucares.utils.file.FileUtils;
|
||||||
import org.testng.Assert;
|
import org.testng.Assert;
|
||||||
@@ -99,6 +98,28 @@ public class DataStoreTest {
|
|||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public void testGetByTags() throws IOException
|
||||||
|
{
|
||||||
|
final Tags eagleTim1 = Tags.create("bird", "eagle", "name", "Tim");
|
||||||
|
final Tags eagleTim2 = Tags.create("bird", "eagle", "name", "Tim");
|
||||||
|
final Tags pigeonJennifer = Tags.create("bird", "pigeon", "name", "Jennifer");
|
||||||
|
final Tags flamingoJennifer = Tags.create("bird", "flamingo", "name", "Jennifer");
|
||||||
|
|
||||||
|
dataStore = new DataStore(dataDirectory);
|
||||||
|
|
||||||
|
dataStore.createNewFile(eagleTim1);
|
||||||
|
dataStore.createNewFile(eagleTim2);
|
||||||
|
dataStore.createNewFile(pigeonJennifer);
|
||||||
|
dataStore.createNewFile(flamingoJennifer);
|
||||||
|
|
||||||
|
// eagleTim1 and eagleTim2 have the same tags, so we find both docs
|
||||||
|
final List<Doc> docsEagleTim = dataStore.getByTags(eagleTim1);
|
||||||
|
Assert.assertEquals(docsEagleTim.size(), 2, "two docs for eagleTim1 and eagleTim2");
|
||||||
|
|
||||||
|
final List<Doc> docsFlamingoJennifer = dataStore.getByTags(flamingoJennifer);
|
||||||
|
Assert.assertEquals(docsFlamingoJennifer.size(), 1, "doc for docsFlamingoJennifer");
|
||||||
|
}
|
||||||
|
|
||||||
private void assertSearch(final String query, final Tags... tags) {
|
private void assertSearch(final String query, final Tags... tags) {
|
||||||
final List<Doc> actualDocs = dataStore.search(query);
|
final List<Doc> actualDocs = dataStore.search(query);
|
||||||
final List<Path> actual = CollectionUtils.map(actualDocs, Doc::getPath);
|
final List<Path> actual = CollectionUtils.map(actualDocs, Doc::getPath);
|
||||||
|
|||||||
@@ -38,6 +38,7 @@
|
|||||||
<logger name="org.lucares.metrics.proposals" level="DEBUG" />
|
<logger name="org.lucares.metrics.proposals" level="DEBUG" />
|
||||||
<logger name="org.lucares.metrics.plotter" level="DEBUG" />
|
<logger name="org.lucares.metrics.plotter" level="DEBUG" />
|
||||||
<logger name="org.lucares.metrics.gnuplot" level="DEBUG" />
|
<logger name="org.lucares.metrics.gnuplot" level="DEBUG" />
|
||||||
|
<logger name="org.lucares.metrics.ingestion.tagsToFile.newPdbWriter" level="DEBUG" />
|
||||||
<!--
|
<!--
|
||||||
<logger name="org.lucares.metrics.dataStore" level="DEBUG" />
|
<logger name="org.lucares.metrics.dataStore" level="DEBUG" />
|
||||||
<logger name="org.lucares.pdb.datastore.lang.QueryCompletionPdbLangParser" level="TRACE" />
|
<logger name="org.lucares.pdb.datastore.lang.QueryCompletionPdbLangParser" level="TRACE" />
|
||||||
|
|||||||
@@ -12,9 +12,7 @@ import java.util.Map;
|
|||||||
import java.util.Map.Entry;
|
import java.util.Map.Entry;
|
||||||
import java.util.Objects;
|
import java.util.Objects;
|
||||||
import java.util.Optional;
|
import java.util.Optional;
|
||||||
import java.util.Set;
|
|
||||||
import java.util.function.Consumer;
|
import java.util.function.Consumer;
|
||||||
import java.util.stream.Collectors;
|
|
||||||
|
|
||||||
import org.lucares.pdb.api.Tags;
|
import org.lucares.pdb.api.Tags;
|
||||||
import org.lucares.pdb.datastore.Doc;
|
import org.lucares.pdb.datastore.Doc;
|
||||||
@@ -26,7 +24,8 @@ import org.slf4j.LoggerFactory;
|
|||||||
public class TagsToFile implements AutoCloseable {
|
public class TagsToFile implements AutoCloseable {
|
||||||
|
|
||||||
private static final Logger LOGGER = LoggerFactory.getLogger(TagsToFile.class);
|
private static final Logger LOGGER = LoggerFactory.getLogger(TagsToFile.class);
|
||||||
private final static Logger METRICS_LOGGER = LoggerFactory.getLogger("org.lucares.metrics.ingestion.tagsToFile");
|
private final static Logger METRICS_LOGGER_FIND_WRITER = LoggerFactory.getLogger("org.lucares.metrics.ingestion.tagsToFile.findWriter");
|
||||||
|
private final static Logger METRICS_LOGGER_NEW_WRITER = LoggerFactory.getLogger("org.lucares.metrics.ingestion.tagsToFile.newPdbWriter");
|
||||||
|
|
||||||
private static class WriterCache {
|
private static class WriterCache {
|
||||||
final List<PdbWriter> writers = new ArrayList<>();
|
final List<PdbWriter> writers = new ArrayList<>();
|
||||||
@@ -53,25 +52,23 @@ public class TagsToFile implements AutoCloseable {
|
|||||||
}
|
}
|
||||||
|
|
||||||
private List<PdbFile> getFilesMatchingTagsExactly(final Tags tags) {
|
private List<PdbFile> getFilesMatchingTagsExactly(final Tags tags) {
|
||||||
final List<PdbFile> files = getFilesMatchingTags(tags);
|
final List<Doc> docs = db.getByTags(tags);
|
||||||
|
return toPdbFiles(docs);
|
||||||
return CollectionUtils.filter(files, f -> f.getTags().equals(tags));
|
|
||||||
}
|
|
||||||
|
|
||||||
private List<PdbFile> getFilesMatchingTags(final Tags tags) {
|
|
||||||
final String query = Query.createQuery(tags);
|
|
||||||
|
|
||||||
return getFilesForQuery(query);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
public List<PdbFile> getFilesForQuery(final String query) {
|
public List<PdbFile> getFilesForQuery(final String query) {
|
||||||
final List<PdbFile> result = new ArrayList<>();
|
|
||||||
|
|
||||||
final List<Doc> searchResult = db.search(query);
|
final List<Doc> searchResult = db.search(query);
|
||||||
if (searchResult.size() > 500_000){
|
if (searchResult.size() > 500_000){
|
||||||
throw new IllegalStateException("Too many results.");
|
throw new IllegalStateException("Too many results.");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
final List<PdbFile> result = toPdbFiles(searchResult);
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
|
private List<PdbFile> toPdbFiles(final List<Doc> searchResult) {
|
||||||
|
final List<PdbFile> result = new ArrayList<>();
|
||||||
for (final Doc document : searchResult) {
|
for (final Doc document : searchResult) {
|
||||||
|
|
||||||
final Path path = document.getPath();
|
final Path path = document.getPath();
|
||||||
@@ -80,7 +77,6 @@ public class TagsToFile implements AutoCloseable {
|
|||||||
|
|
||||||
result.add(pdbFile);
|
result.add(pdbFile);
|
||||||
}
|
}
|
||||||
|
|
||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -97,15 +93,13 @@ public class TagsToFile implements AutoCloseable {
|
|||||||
final long start = System.nanoTime();
|
final long start = System.nanoTime();
|
||||||
final List<PdbFile> pdbFiles = getFilesMatchingTagsExactly(tags);
|
final List<PdbFile> pdbFiles = getFilesMatchingTagsExactly(tags);
|
||||||
|
|
||||||
assertAllFilesHaveSameFolder(pdbFiles);
|
|
||||||
|
|
||||||
pdbFiles.removeIf(f -> !f.exists());
|
pdbFiles.removeIf(f -> !f.exists());
|
||||||
final List<Optional<PdbWriter>> optionalWriters = CollectionUtils.map(pdbFiles, writersForTags::writer);
|
final List<Optional<PdbWriter>> optionalWriters = CollectionUtils.map(pdbFiles, writersForTags::writer);
|
||||||
final List<Optional<PdbWriter>> existingWriters = CollectionUtils.filter(optionalWriters,
|
final List<Optional<PdbWriter>> existingWriters = CollectionUtils.filter(optionalWriters,
|
||||||
Optional::isPresent);
|
Optional::isPresent);
|
||||||
final List<PdbWriter> writers = CollectionUtils.map(existingWriters, Optional::get);
|
final List<PdbWriter> writers = CollectionUtils.map(existingWriters, Optional::get);
|
||||||
|
|
||||||
METRICS_LOGGER.debug("find writers took {}ms for tags {}", (System.nanoTime() - start)
|
METRICS_LOGGER_FIND_WRITER.debug("find writers took {}ms for tags {}", (System.nanoTime() - start)
|
||||||
/ 1_000_000.0, tags);
|
/ 1_000_000.0, tags);
|
||||||
|
|
||||||
final Optional<PdbWriter> optionalFirst = chooseBestMatchingWriter(writers, date);
|
final Optional<PdbWriter> optionalFirst = chooseBestMatchingWriter(writers, date);
|
||||||
@@ -163,27 +157,20 @@ public class TagsToFile implements AutoCloseable {
|
|||||||
}
|
}
|
||||||
|
|
||||||
private PdbWriter newPdbWriter(final Tags tags) {
|
private PdbWriter newPdbWriter(final Tags tags) {
|
||||||
|
final long start = System.nanoTime();
|
||||||
try {
|
try {
|
||||||
final PdbFile pdbFile = createNewPdbFile(tags);
|
final PdbFile pdbFile = createNewPdbFile(tags);
|
||||||
final PdbWriter result = new PdbWriter(pdbFile);
|
final PdbWriter result = new PdbWriter(pdbFile);
|
||||||
|
|
||||||
getOrInit(tags).addWriter(result);
|
getOrInit(tags).addWriter(result);
|
||||||
|
|
||||||
|
METRICS_LOGGER_NEW_WRITER.debug("newPdbWriter took {}ms tags: ", (System.nanoTime() - start) / 1_000_000.0, tags);
|
||||||
|
|
||||||
return result;
|
return result;
|
||||||
} catch (final IOException e) {
|
} catch (final IOException e) {
|
||||||
throw new WriteException(e);
|
throw new WriteException(e);
|
||||||
}
|
}
|
||||||
}
|
|
||||||
|
|
||||||
private void assertAllFilesHaveSameFolder(final List<PdbFile> pdbFiles) {
|
|
||||||
final Set<Path> reducedFolder = pdbFiles.stream()//
|
|
||||||
.map(PdbFile::getPath)//
|
|
||||||
.map(p -> p.getParent().getParent().getParent())//
|
|
||||||
.collect(Collectors.toSet());
|
|
||||||
|
|
||||||
if (reducedFolder.size() > 1) {
|
|
||||||
throw new IllegalStateException(
|
|
||||||
"All storage folders for the same tag must be the same, but are not: " + reducedFolder);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
private PdbFile createNewPdbFile(final Tags tags) throws IOException {
|
private PdbFile createNewPdbFile(final Tags tags) throws IOException {
|
||||||
|
|||||||
Reference in New Issue
Block a user