prepare more efficient query completion

Add an index that answers the question:
given a query "a=b and c=", what are the possible values
for c?
This commit is contained in:
2019-01-13 10:22:17 +01:00
parent 5197063ae3
commit 72e9a9ebe3
8 changed files with 338 additions and 6 deletions

View File

@@ -24,6 +24,7 @@ import org.lucares.pdb.datastore.Doc;
import org.lucares.pdb.datastore.Proposal;
import org.lucares.pdb.datastore.lang.Expression;
import org.lucares.pdb.datastore.lang.ExpressionToDocIdVisitor;
import org.lucares.pdb.datastore.lang.NewProposerParser;
import org.lucares.pdb.datastore.lang.QueryLanguageParser;
import org.lucares.pdb.diskstorage.DiskStorage;
import org.lucares.pdb.map.PersistentMap;
@@ -150,14 +151,17 @@ public class DataStore implements AutoCloseable {
private final PersistentMap<Tag, Long> tagToDocsId;
private final QueryCompletionIndex queryCompletionIndex;
// A Doc will never be changed once it is created. Therefore we can cache them
// easily.
private final HotEntryCache<Long, Doc> docIdToDocCache = new HotEntryCache<>(Duration.ofMinutes(10),
private final HotEntryCache<Long, Doc> docIdToDocCache = new HotEntryCache<>(Duration.ofSeconds(5),
"docIdToDocCache");
private final DiskStorage diskStorage;
private final Path diskStorageFilePath;
private final Path storageBasePath;
private final Path queryCompletionIndexFile;
public DataStore(final Path dataDirectory) throws IOException {
storageBasePath = storageDirectory(dataDirectory);
@@ -178,6 +182,9 @@ public class DataStore implements AutoCloseable {
final Path docIdToDocIndexPath = storageBasePath.resolve("docIdToDocIndex.bs");
docIdToDoc = new PersistentMap<>(docIdToDocIndexPath, PersistentMap.LONG_CODER, ENCODER_DOC);
queryCompletionIndexFile = storageBasePath.resolve("queryCompletionIndex.bs");
queryCompletionIndex = new QueryCompletionIndex(queryCompletionIndexFile);
}
private Path keyCompressionFile(final Path dataDirectory) throws IOException {
@@ -199,6 +206,7 @@ public class DataStore implements AutoCloseable {
final Long oldDocId = tagsToDocId.putValue(tags, docId);
Preconditions.checkNull(oldDocId, "There must be at most one document for tags: {0}", tags);
// store mapping from tag to docId, so that we can find all docs for a given tag
final List<Tag> ts = new ArrayList<>(tags.toTags());
ts.add(TAG_ALL_DOCS);
for (final Tag tag : ts) {
@@ -215,6 +223,10 @@ public class DataStore implements AutoCloseable {
}
}
// index the tags, so that we can efficiently find all possible values for a
// field in a query
queryCompletionIndex.addTags(tags);
return newFilesRootBlockOffset;
}
@@ -295,7 +307,7 @@ public class DataStore implements AutoCloseable {
final List<Doc> result = new ArrayList<>(docIdsList.size());
synchronized (docIdToDoc) {
final long start = System.nanoTime();
for (int i = 0; i < docIdsList.size(); i++) {
final long docId = docIdsList.get(i);
@@ -304,6 +316,8 @@ public class DataStore implements AutoCloseable {
result.add(doc);
}
System.out.println(
"mapDocIdsToDocs(" + docIdsList.size() + "): " + (System.nanoTime() - start) / 1_000_000.0 + "ms");
}
return result;
}
@@ -342,6 +356,11 @@ public class DataStore implements AutoCloseable {
}
/**
 * Computes completion proposals for {@code query} at the given caret position.
 * <p>
 * The {@link NewProposerParser} is invoked first and its result is only
 * printed to stdout; the proposals handed back to the caller are still the
 * ones computed by {@link Proposer}.
 *
 * @param query      the query string to complete
 * @param caretIndex the position of the caret within {@code query}
 * @return the proposals computed by {@link Proposer}
 */
public List<Proposal> propose(final String query, final int caretIndex) {
    // NOTE(review): looks like temporary debug output while the new proposer
    // is being prepared - confirm before relying on it.
    final List<Proposal> newProposals = new NewProposerParser(this).propose(query, caretIndex);
    System.out.println(newProposals);

    final Proposer proposer = new Proposer(this);
    return proposer.propose(query, caretIndex);
}

View File

@@ -0,0 +1,158 @@
package org.lucares.pdb.datastore.internal;
import java.io.IOException;
import java.nio.file.Path;
import java.util.List;
import java.util.SortedSet;
import java.util.TreeSet;
import org.lucares.collections.LongList;
import org.lucares.pdb.api.Tag;
import org.lucares.pdb.api.Tags;
import org.lucares.pdb.map.Empty;
import org.lucares.pdb.map.PersistentMap;
import org.lucares.pdb.map.PersistentMap.EncoderDecoder;
import org.lucares.utils.byteencoder.VariableByteEncoder;
/**
* This index supports query completion.
* <p>
* E.g. Given the query "firstname=John and lastname=|" ('|' denotes the
* position of the caret). How do we find all lastnames that match this query?
* <br>
* The expensive way is to execute the query for all available lastnames and
* keep those that return at least one result.<br>
* A more efficient way uses an index that lists all lastnames that occur with
* firstname=John. If we write this as a table, then it looks like this:
*
* <pre>
*┏━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━┓
*┃ fieldA ┃ valueA ┃ fieldB ┃ valueB ┃
*┣━━━━━━━━━╋━━━━━━━━━╋━━━━━━━━━╋━━━━━━━━━┫
*┃firstname┃ John ┃lastname ┃ Connor ┃
*┣━━━━━━━━━╋━━━━━━━━━╋━━━━━━━━━╋━━━━━━━━━┫
*┃firstname┃ John ┃lastname ┃Carpenter┃
*┣━━━━━━━━━╋━━━━━━━━━╋━━━━━━━━━╋━━━━━━━━━┫
*┃firstname┃ John ┃country ┃ Germany ┃
*┣━━━━━━━━━╋━━━━━━━━━╋━━━━━━━━━╋━━━━━━━━━┫
*┃firstname┃ John ┃lastname ┃ Nash ┃
*┣━━━━━━━━━╋━━━━━━━━━╋━━━━━━━━━╋━━━━━━━━━┫
*┃firstname┃ Rick ┃lastname ┃ Meyer ┃
*┣━━━━━━━━━╋━━━━━━━━━╋━━━━━━━━━╋━━━━━━━━━┫
*┃firstname┃ Rick ┃lastname ┃ Castle ┃
*┗━━━━━━━━━┻━━━━━━━━━┻━━━━━━━━━┻━━━━━━━━━┛
* </pre>
*
* The lastnames where firstname=John are Connor, Carpenter and Nash. Given such
* a table we can just search for all rows with fieldA=firstname and
* valueA=John and fieldB=lastname.
* <p>
* The values in this index represent such a table.
* <p>
* Note: the index contains all four columns, but when searching we only use the
* first three.
*
*/
public class QueryCompletionIndex implements AutoCloseable {

    /**
     * Value of tagB that marks a key as a search prefix. Any negative value makes
     * {@link EncoderTwoTags#encode(TwoTags)} omit tagB's value from the encoded
     * key, so that the key consists of tagA.key, tagA.value and tagB.key only.
     */
    private static final int NO_VALUE = -1;

    /**
     * A pair of tags. It is one row of the table described in the class comment:
     * tagA holds fieldA/valueA and tagB holds fieldB/valueB.
     */
    private static final class TwoTags {
        private final Tag tagA;
        private final Tag tagB;

        public TwoTags(final Tag tagA, final Tag tagB) {
            this.tagA = tagA;
            this.tagB = tagB;
        }

        public Tag getTagA() {
            return tagA;
        }

        public Tag getTagB() {
            return tagB;
        }

        @Override
        public String toString() {
            return tagA + "::" + tagB;
        }
    }

    /**
     * Encodes a {@link TwoTags} as the variable-byte encoded sequence
     * tagA.key, tagA.value, tagB.key[, tagB.value].
     */
    private static final class EncoderTwoTags implements EncoderDecoder<TwoTags> {

        /**
         * @param tagAndField the pair to encode; a negative value of tagB turns the
         *                    result into a prefix of all keys that share tagA and
         *                    tagB's key
         * @return the encoded key
         */
        @Override
        public byte[] encode(final TwoTags tagAndField) {
            final Tag tagA = tagAndField.getTagA();
            final Tag tagB = tagAndField.getTagB();

            final LongList tmp = new LongList(4);
            tmp.add(tagA.getKey());
            tmp.add(tagA.getValue());
            tmp.add(tagB.getKey());

            // A query for tagA.key and tagA.value and tagB.key is done by setting
            // tagB.value to a negative value (see NO_VALUE).
            // The query is then executed as a prefix search. Thus tagB.value must not
            // be part of the byte array that is returned.
            if (tagB.getValue() >= 0) {
                tmp.add(tagB.getValue());
            }

            return VariableByteEncoder.encode(tmp);
        }

        /**
         * @param bytes a key created by {@link #encode(TwoTags)}
         * @return the decoded pair; if the key is a search prefix, i.e. it does not
         *         contain a value for tagB, then tagB's value is {@code NO_VALUE}
         */
        @Override
        public TwoTags decode(final byte[] bytes) {
            final LongList tmp = VariableByteEncoder.decode(bytes);
            final int tagAKey = (int) tmp.get(0);
            final int tagAValue = (int) tmp.get(1);
            final int tagBKey = (int) tmp.get(2);
            // encode() omits tagB.value for prefix keys, so it may be missing here
            final int tagBValue = tmp.size() > 3 ? (int) tmp.get(3) : NO_VALUE;

            final Tag tagA = new Tag(tagAKey, tagAValue);
            final Tag tagB = new Tag(tagBKey, tagBValue);

            return new TwoTags(tagA, tagB);
        }
    }

    private final PersistentMap<TwoTags, Empty> tagToTagIndex;

    /**
     * Creates the index on top of a {@link PersistentMap} stored in the given file.
     *
     * @param indexFile the file of the underlying persistent map
     * @throws IOException if the persistent map cannot be created
     */
    public QueryCompletionIndex(final Path indexFile) throws IOException {
        tagToTagIndex = new PersistentMap<>(indexFile, new EncoderTwoTags(), PersistentMap.EMPTY_ENCODER);
    }

    /**
     * Adds every ordered pair of the given tags to the index. This includes the
     * pairs where both elements are the same tag.
     *
     * @param tags the tags of a document
     * @throws IOException if writing to the index fails
     */
    public void addTags(final Tags tags) throws IOException {
        final List<Tag> allTags = tags.toTags();

        // index all combinations of tagA and tagB
        for (final Tag tagA : allTags) {
            for (final Tag tagB : allTags) {
                tagToTagIndex.putValue(new TwoTags(tagA, tagB), Empty.INSTANCE);
            }
        }
    }

    @Override
    public void close() throws IOException {
        tagToTagIndex.close();
    }

    /**
     * Finds the values of {@code field} that were indexed together with
     * {@code tag}.
     *
     * @param tag   the tag (fieldA=valueA) that is already part of the query
     * @param field the name of the field (fieldB) to find values for
     * @return the values of {@code field}, sorted by their natural order; empty if
     *         there are none
     * @throws IOException if reading the index fails
     */
    public SortedSet<String> find(final Tag tag, final String field) throws IOException {
        final SortedSet<String> result = new TreeSet<>();

        // NOTE(review): put() presumably registers the field name if it is not yet
        // known - confirm whether a read-only lookup would be preferable here.
        final int tagBKey = Tags.STRING_COMPRESSOR.put(field);

        // the value must be negative for the prefix search to work. See
        // EncoderTwoTags
        final Tag tagB = new Tag(tagBKey, NO_VALUE);
        final TwoTags keyPrefix = new TwoTags(tag, tagB);

        tagToTagIndex.visitValues(keyPrefix, (k, v) -> {
            result.add(k.getTagB().getValueAsString());
        });

        return result;
    }
}

View File

@@ -0,0 +1,27 @@
package org.lucares.pdb.datastore.lang;
import java.util.Collections;
import java.util.List;

import org.lucares.pdb.datastore.Proposal;
import org.lucares.pdb.datastore.internal.DataStore;
/**
 * Preparation for a proposer that works on the parsed query.
 * <p>
 * The query is parsed with a marker character inserted at the caret position.
 * The parsed expression is not evaluated yet, so no proposals are produced.
 */
public class NewProposerParser {

    /**
     * Marks the position of the caret within the query. U+E001 is a code point of
     * the Unicode private use area (U+E000 to U+F8FF).
     */
    private static final String CARET_MARKER = "\ue001";

    // not used yet
    private final DataStore dataStore;

    /**
     * @param dataStore the data store the proposals are meant to be computed for
     */
    public NewProposerParser(final DataStore dataStore) {
        this.dataStore = dataStore;
    }

    /**
     * Parses the query with the caret marker inserted at {@code caretIndex}.
     *
     * @param query      the query to complete
     * @param caretIndex the position of the caret within {@code query}; must be
     *                   between 0 and {@code query.length()} (inclusive)
     * @return the proposals; currently always an empty list, never {@code null}
     * @throws StringIndexOutOfBoundsException if {@code caretIndex} is not a valid
     *                                         position within {@code query}
     */
    public List<Proposal> propose(final String query, final int caretIndex) {
        final String queryString = new StringBuilder(query).insert(caretIndex, CARET_MARKER).toString();

        // The parsed expression is not evaluated yet. The call is kept so that the
        // query is still run through the parser exactly as before.
        QueryLanguageParser.parse(queryString);

        return Collections.emptyList();
    }
}