Prepare more efficient query completion

Add an index that answers the question: given a query "a=b and c=", what are the possible values for c?
2019-01-13 10:22:17 +01:00
parent 5197063ae3
commit 72e9a9ebe3
8 changed files with 338 additions and 6 deletions
@@ -24,6 +24,7 @@ import org.lucares.pdb.datastore.Doc;
 import org.lucares.pdb.datastore.Proposal;
 import org.lucares.pdb.datastore.lang.Expression;
 import org.lucares.pdb.datastore.lang.ExpressionToDocIdVisitor;
+import org.lucares.pdb.datastore.lang.NewProposerParser;
 import org.lucares.pdb.datastore.lang.QueryLanguageParser;
 import org.lucares.pdb.diskstorage.DiskStorage;
 import org.lucares.pdb.map.PersistentMap;
@@ -150,14 +151,17 @@ public class DataStore implements AutoCloseable {

 	private final PersistentMap<Tag, Long> tagToDocsId;

+	private final QueryCompletionIndex queryCompletionIndex;
+
 	// A Doc will never be changed once it is created. Therefore we can cache them
 	// easily.
-	private final HotEntryCache<Long, Doc> docIdToDocCache = new HotEntryCache<>(Duration.ofMinutes(10),
+	private final HotEntryCache<Long, Doc> docIdToDocCache = new HotEntryCache<>(Duration.ofSeconds(5),
 			"docIdToDocCache");

 	private final DiskStorage diskStorage;
 	private final Path diskStorageFilePath;
 	private final Path storageBasePath;
+	private final Path queryCompletionIndexFile;

 	public DataStore(final Path dataDirectory) throws IOException {
 		storageBasePath = storageDirectory(dataDirectory);
@@ -178,6 +182,9 @@ public class DataStore implements AutoCloseable {

 		final Path docIdToDocIndexPath = storageBasePath.resolve("docIdToDocIndex.bs");
 		docIdToDoc = new PersistentMap<>(docIdToDocIndexPath, PersistentMap.LONG_CODER, ENCODER_DOC);
+
+		queryCompletionIndexFile = storageBasePath.resolve("queryCompletionIndex.bs");
+		queryCompletionIndex = new QueryCompletionIndex(queryCompletionIndexFile);
 	}

 	private Path keyCompressionFile(final Path dataDirectory) throws IOException {
@@ -199,6 +206,7 @@ public class DataStore implements AutoCloseable {
 		final Long oldDocId = tagsToDocId.putValue(tags, docId);
 		Preconditions.checkNull(oldDocId, "There must be at most one document for tags: {0}", tags);

+		// store mapping from tag to docId, so that we can find all docs for a given tag
 		final List<Tag> ts = new ArrayList<>(tags.toTags());
 		ts.add(TAG_ALL_DOCS);
 		for (final Tag tag : ts) {
@@ -215,6 +223,10 @@ public class DataStore implements AutoCloseable {
 			}
 		}

+		// index the tags, so that we can efficiently find all possible values for a
+		// field in a query
+		queryCompletionIndex.addTags(tags);
+
 		return newFilesRootBlockOffset;
 	}

@@ -295,7 +307,7 @@ public class DataStore implements AutoCloseable {
 		final List<Doc> result = new ArrayList<>(docIdsList.size());

 		synchronized (docIdToDoc) {
-
+			final long start = System.nanoTime();
 			for (int i = 0; i < docIdsList.size(); i++) {
 				final long docId = docIdsList.get(i);

@@ -304,6 +316,8 @@ public class DataStore implements AutoCloseable {

 				result.add(doc);
 			}
+			System.out.println(
+					"mapDocIdsToDocs(" + docIdsList.size() + "): " + (System.nanoTime() - start) / 1_000_000.0 + "ms");
 		}
 		return result;
 	}
@@ -342,6 +356,11 @@ public class DataStore implements AutoCloseable {
 	}

 	public List<Proposal> propose(final String query, final int caretIndex) {
+
+		final NewProposerParser newProposerParser = new NewProposerParser(this);
+		final List<Proposal> proposals = newProposerParser.propose(query, caretIndex); // result is only printed below, it is never returned
+		System.out.println(proposals); // NOTE(review): looks like temporary debug output on stdout - confirm before release
+
 		return new Proposer(this).propose(query, caretIndex); // the caller still receives the proposals of the existing Proposer
 	}

@@ -0,0 +1,158 @@
+package org.lucares.pdb.datastore.internal;
+
+import java.io.IOException;
+import java.nio.file.Path;
+import java.util.List;
+import java.util.SortedSet;
+import java.util.TreeSet;
+
+import org.lucares.collections.LongList;
+import org.lucares.pdb.api.Tag;
+import org.lucares.pdb.api.Tags;
+import org.lucares.pdb.map.Empty;
+import org.lucares.pdb.map.PersistentMap;
+import org.lucares.pdb.map.PersistentMap.EncoderDecoder;
+import org.lucares.utils.byteencoder.VariableByteEncoder;
+
+/**
+ * This index supports query completion.
+ * <p>
+ * E.g. Given the query "firstname=John and lastname=|" ('|' denotes the
+ * position of the caret). How do we find all lastnames that match this query?
+ * <br>
+ * The expensive way is to execute the query for all available lastnames and
+ * keep those that return at least one result.<br>
+ * A more efficient way uses an index that lists all lastnames that occur with
+ * firstname=John. If we write this as a table, then it looks like this:
+ *
+ * <pre>
+ *┏━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━┓
+ *┃ fieldA  ┃ valueA  ┃ fieldB  ┃  valueB ┃
+ *┣━━━━━━━━━╋━━━━━━━━━╋━━━━━━━━━╋━━━━━━━━━┫
+ *┃firstname┃ John    ┃lastname ┃ Connor  ┃
+ *┣━━━━━━━━━╋━━━━━━━━━╋━━━━━━━━━╋━━━━━━━━━┫
+ *┃firstname┃ John    ┃lastname ┃Carpenter┃
+ *┣━━━━━━━━━╋━━━━━━━━━╋━━━━━━━━━╋━━━━━━━━━┫
+ *┃firstname┃ John    ┃country  ┃ Germany ┃
+ *┣━━━━━━━━━╋━━━━━━━━━╋━━━━━━━━━╋━━━━━━━━━┫
+ *┃firstname┃ John    ┃lastname ┃ Nash    ┃
+ *┣━━━━━━━━━╋━━━━━━━━━╋━━━━━━━━━╋━━━━━━━━━┫
+ *┃firstname┃ Rick    ┃lastname ┃ Meyer   ┃
+ *┣━━━━━━━━━╋━━━━━━━━━╋━━━━━━━━━╋━━━━━━━━━┫
+ *┃firstname┃ Rick    ┃lastname ┃ Castle  ┃
+ *┗━━━━━━━━━┻━━━━━━━━━┻━━━━━━━━━┻━━━━━━━━━┛
+ * </pre>
+ *
+ * The lastnames where firstname=John are Connor, Carpenter and Nash. Given such
+ * a table we can just query for all rows with fieldA=firstname, valueA=John and
+ * fieldB=lastname.
+ * <p>
+ * The values in this index represent such a table.
+ * <p>
+ * Note: the index contains all four columns, but when searching we only use the
+ * first three.
+ *
+ */
+public class QueryCompletionIndex implements AutoCloseable {
+	/**
+	 * Immutable pair of tags. One instance corresponds to one row of the table
+	 * described in the class comment: tagA holds fieldA/valueA and tagB holds
+	 * fieldB/valueB.
+	 */
+	private static final class TwoTags {
+		private final Tag tagA;
+		private final Tag tagB;
+
+		public TwoTags(final Tag first, final Tag second) {
+			tagA = first;
+			tagB = second;
+		}
+
+		public Tag getTagA() {
+			return tagA;
+		}
+
+		public Tag getTagB() {
+			return tagB;
+		}
+
+		/**
+		 * Renders both tags separated by "::".
+		 */
+		@Override
+		public String toString() {
+			return new StringBuilder().append(tagA).append("::").append(tagB).toString();
+		}
+
+	}
+
+	/**
+	 * Converts {@link TwoTags} to and from the byte representation that is used
+	 * as key of the index.
+	 */
+	private static final class EncoderTwoTags implements EncoderDecoder<TwoTags> {
+
+		/**
+		 * Encodes, in this order, the key and value of tagA, the key of tagB and -
+		 * unless it is negative - the value of tagB.
+		 * <p>
+		 * A lookup for "tagA.key, tagA.value, tagB.key" is executed as a prefix
+		 * search. The search key marks the unknown value of tagB with a negative
+		 * number. That number must not become part of the returned bytes, otherwise
+		 * the search key would not be a prefix of the stored keys.
+		 */
+		@Override
+		public byte[] encode(final TwoTags tagAndField) {
+			final Tag first = tagAndField.getTagA();
+			final Tag second = tagAndField.getTagB();
+
+			final LongList numbers = new LongList(4);
+			numbers.add(first.getKey());
+			numbers.add(first.getValue());
+			numbers.add(second.getKey());
+
+			final boolean isPrefixSearchKey = second.getValue() < 0;
+			if (!isPrefixSearchKey) {
+				numbers.add(second.getValue());
+			}
+
+			return VariableByteEncoder.encode(numbers);
+		}
+
+		/**
+		 * Reverses {@link #encode(TwoTags)}. Reads four numbers, so it is meant for
+		 * complete keys and not for the shortened keys of a prefix search.
+		 */
+		@Override
+		public TwoTags decode(final byte[] bytes) {
+			final LongList numbers = VariableByteEncoder.decode(bytes);
+
+			final int keyA = (int) numbers.get(0);
+			final int valueA = (int) numbers.get(1);
+			final int keyB = (int) numbers.get(2);
+			final int valueB = (int) numbers.get(3);
+
+			return new TwoTags(new Tag(keyA, valueA), new Tag(keyB, valueB));
+		}
+	}
+
+	private final PersistentMap<TwoTags, Empty> tagToTagIndex;
+
+	public QueryCompletionIndex(final Path indexFile) throws IOException { // indexFile: file of the PersistentMap that backs this index
+		tagToTagIndex = new PersistentMap<>(indexFile, new EncoderTwoTags(), PersistentMap.EMPTY_ENCODER);
+	}
+
+	/**
+	 * Adds every ordered pair (tagA, tagB) of the given tags to the index. This
+	 * includes the pairs in which both sides are the same tag.
+	 *
+	 * @param tags the tags to index
+	 * @throws IOException declared for failures while writing to the underlying
+	 *                     persistent map
+	 */
+	public void addTags(final Tags tags) throws IOException {
+		// a single list is sufficient, because the nested loops only read from it
+		final List<Tag> allTags = tags.toTags();
+
+		// index all combinations of tagA and tagB
+		for (final Tag tagA : allTags) {
+			for (final Tag tagB : allTags) {
+				tagToTagIndex.putValue(new TwoTags(tagA, tagB), Empty.INSTANCE);
+			}
+		}
+	}
+
+	@Override
+	public void close() throws IOException { // closes the underlying PersistentMap
+		tagToTagIndex.close();
+	}
+
+	/**
+	 * Collects the values of {@code field} that have been indexed together with
+	 * {@code tag}.
+	 *
+	 * @param tag   the tag (fieldA and valueA) that is already fixed
+	 * @param field the name of the field (fieldB) whose values are requested
+	 * @return the values in natural string order and without duplicates
+	 * @throws IOException declared for failures while reading the underlying
+	 *                     persistent map
+	 */
+	public SortedSet<String> find(final Tag tag, final String field) throws IOException {
+		// NOTE(review): put() presumably registers the field name when it is not yet
+		// known - confirm that this is acceptable for a pure lookup.
+		final int fieldKey = Tags.STRING_COMPRESSOR.put(field);
+
+		// The value must be negative for the prefix search to work: EncoderTwoTags
+		// leaves a negative value of tagB out of the encoded key.
+		final TwoTags keyPrefix = new TwoTags(tag, new Tag(fieldKey, -1));
+
+		final SortedSet<String> values = new TreeSet<>();
+		tagToTagIndex.visitValues(keyPrefix, (key, ignored) -> {
+			values.add(key.getTagB().getValueAsString());
+		});
+		return values;
+	}
+}
@@ -0,0 +1,27 @@
+package org.lucares.pdb.datastore.lang;
+
+import java.util.List;
+
+import org.lucares.pdb.datastore.Proposal;
+import org.lucares.pdb.datastore.internal.DataStore;
+
+/**
+ * Work-in-progress replacement for the proposer. So far it only marks the caret
+ * position in the query and parses the result; it does not compute any
+ * proposals yet.
+ */
+public class NewProposerParser {
+
+	// U+E001 is the second code point of the Unicode private use area
+	// (U+E000..U+F8FF)
+	private static final String CARET_MARKER = "\ue001";
+
+	private final DataStore dataStore;
+
+	public NewProposerParser(final DataStore dataStore) {
+		this.dataStore = dataStore;
+	}
+
+	/**
+	 * Inserts the caret marker into the query and parses the marked query.
+	 *
+	 * @param query      the query as typed so far
+	 * @param caretIndex position of the caret within {@code query}
+	 * @return the proposals; currently always an empty, unmodifiable list
+	 * @throws StringIndexOutOfBoundsException if {@code caretIndex} is negative or
+	 *                                         greater than the length of the query
+	 */
+	public List<Proposal> propose(final String query, final int caretIndex) {
+
+		final String queryString = new StringBuilder(query).insert(caretIndex, CARET_MARKER).toString();
+
+		// the parsed expression is not evaluated yet
+		QueryLanguageParser.parse(queryString);
+
+		// never hand out null for a collection; fully qualified, so that the import
+		// block does not have to change
+		return java.util.Collections.emptyList();
+	}
+
+}
@@ -76,6 +76,9 @@ public class ProposerTest {
 		assertProposals("bird", 4, //
 				new Proposal("bird", "bird=* ", true, "bird=", 5) //
 		);
+		assertProposals("bird=eagle and n", 16, //
+				new Proposal("name", "bird=eagle and name=* ", true, "bird=eagle and name=", 20) //
+		);
 	}

 	public void testPrefixOfValue() throws Exception {
@@ -86,9 +89,9 @@ public class ProposerTest {
 				new Proposal("Jennifer", "name =Jennifer", true, "name =Jennifer", 14), //
 				new Proposal("Jenny", "name =Jenny", true, "name =Jenny", 11) //
 		);
-
-		assertProposals("bird=eagle and n", 16, //
-				new Proposal("name", "bird=eagle and name=* ", true, "bird=eagle and name=", 20) //
+		assertProposals("name =Tim,Je", 12, //
+				new Proposal("Jennifer", "name =Tim,Jennifer", true, "name =Tim,Jennifer", 18), //
+				new Proposal("Jenny", "name =Tim,Jenny", true, "name =Tim,Jenny", 15) //
 		);
 		/*
 		*/
@@ -0,0 +1,60 @@
+package org.lucares.pdb.datastore.internal;
+
+import java.io.IOException;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.util.Arrays;
+import java.util.List;
+import java.util.SortedSet;
+
+import org.lucares.pdb.api.StringCompressor;
+import org.lucares.pdb.api.Tag;
+import org.lucares.pdb.api.Tags;
+import org.lucares.pdb.api.UniqueStringIntegerPairs;
+import org.lucares.utils.file.FileUtils;
+import org.testng.Assert;
+import org.testng.annotations.AfterMethod;
+import org.testng.annotations.BeforeMethod;
+import org.testng.annotations.Test;
+
+/**
+ * Tests for {@link QueryCompletionIndex}. Each test method works on an index
+ * file in its own temporary directory.
+ */
+@Test
+public class QueryCompletionIndexTest {
+
+	private Path dataDirectory;
+
+	@BeforeMethod
+	public void beforeMethod() throws IOException {
+		dataDirectory = Files.createTempDirectory("pdb");
+	}
+
+	@AfterMethod
+	public void afterMethod() throws IOException {
+		FileUtils.delete(dataDirectory);
+	}
+
+	/**
+	 * Indexes three tag sets and checks that the values found for a field are
+	 * sorted and free of duplicates.
+	 */
+	public void test() throws Exception {
+		// start with an empty string dictionary
+		Tags.STRING_COMPRESSOR = new StringCompressor(new UniqueStringIntegerPairs());
+
+		final Tags johnDoe = Tags.create("firstname", "John", "lastname", "Doe", "country", "Atlantis");
+		final Tags janeDoe = Tags.create("firstname", "Jane", "lastname", "Doe", "country", "ElDorado");
+		final Tags johnMiller = Tags.create("firstname", "John", "lastname", "Miller", "country", "Atlantis");
+
+		final Path indexFile = dataDirectory.resolve("qci.bs");
+		try (QueryCompletionIndex index = new QueryCompletionIndex(indexFile)) {
+			index.addTags(johnDoe);
+			index.addTags(janeDoe);
+			index.addTags(johnMiller);
+
+			// John Doe and Jane Doe share the lastname; the firstnames are returned in
+			// alphabetical order
+			final List<String> expectedFirstnames = Arrays.asList("Jane", "John");
+			final SortedSet<String> actualFirstnames = index.find(new Tag("lastname", "Doe"), "firstname");
+			Assert.assertEquals(actualFirstnames, expectedFirstnames);
+
+			// John Doe and John Miller both live in Atlantis; the country must be
+			// returned only once
+			final List<String> expectedCountries = Arrays.asList("Atlantis");
+			final SortedSet<String> actualCountries = index.find(new Tag("firstname", "John"), "country");
+			Assert.assertEquals(actualCountries, expectedCountries);
+		}
+	}
+}