diff --git a/block-storage/src/main/java/org/lucares/pdb/map/Empty.java b/block-storage/src/main/java/org/lucares/pdb/map/Empty.java new file mode 100644 index 0000000..7627887 --- /dev/null +++ b/block-storage/src/main/java/org/lucares/pdb/map/Empty.java @@ -0,0 +1,26 @@ +package org.lucares.pdb.map; + +import org.lucares.pdb.map.PersistentMap.EncoderDecoder; + +/** + * Used to denote empty values in {@link PersistentMap}. + *

+ * Use {@link PersistentMap#EMPTY_ENCODER} as {@link EncoderDecoder}. + *

+ * Implementation note: We cannot use {@link Void}, because {@link Void} cannot + * be instantiated. A {@link PersistentMap PersistentMap&lt;String, Void&gt;} + * would have to return {@code null} for {@link PersistentMap#getValue(Object)} + * which would make it impossible to know whether the key existed or not.
+ * {@link Empty} solves this by providing a single unmodifiable value. + */ +public final class Empty { + public static final Empty INSTANCE = new Empty(); + + private Empty() { + } + + @Override + public String toString() { + return ""; + } +} diff --git a/block-storage/src/main/java/org/lucares/pdb/map/PersistentMap.java b/block-storage/src/main/java/org/lucares/pdb/map/PersistentMap.java index 7cf4a69..69dc042 100644 --- a/block-storage/src/main/java/org/lucares/pdb/map/PersistentMap.java +++ b/block-storage/src/main/java/org/lucares/pdb/map/PersistentMap.java @@ -8,6 +8,7 @@ import java.util.Arrays; import java.util.List; import java.util.Map; import java.util.Map.Entry; +import java.util.Objects; import java.util.Stack; import java.util.UUID; @@ -87,9 +88,28 @@ public class PersistentMap implements AutoCloseable { } } + private static final class EmptyCoder implements EncoderDecoder { + + private static final byte[] EMPTY_BYTE_ARRAY = new byte[0]; + + @Override + public byte[] encode(final Empty __) { + return EMPTY_BYTE_ARRAY; + } + + @Override + public Empty decode(final byte[] bytes) { + + Preconditions.checkEqual(bytes.length, 0, ""); + + return Empty.INSTANCE; + } + } + public static final EncoderDecoder LONG_CODER = new LongCoder(); public static final EncoderDecoder UUID_ENCODER = new UUIDCoder(); public static final EncoderDecoder STRING_CODER = new StringCoder(); + public static final EncoderDecoder EMPTY_ENCODER = new EmptyCoder(); static final int BLOCK_SIZE = 4096; static final long NODE_OFFSET_TO_ROOT_NODE = 8; @@ -180,17 +200,27 @@ public class PersistentMap implements AutoCloseable { final byte[] value) throws IOException { final PersistentMapDiskNode node = getNode(nodeOffest); - final var entry = node.getNodeEntryTo(key); + final NodeEntry entry = node.getNodeEntryTo(key); if (entry == null || entry.isDataNode()) { final byte[] oldValue; if (entry == null) { oldValue = null; } else { + // found a NodeEntry that is either equal to key, or it 
is at the insertion + // point final boolean entryIsForKey = entry.equal(key); oldValue = entryIsForKey ? entry.getValue() : null; + // Early exit, if the oldValue equals the new value. + // We do not have to replace the value, because it would not change anything + // (just cause unnecessary write operations). But we return the oldValue so that + // the caller thinks we replaced the value. + if (Objects.equals(oldValue, value)) { + return oldValue; + } + if (entryIsForKey) { node.removeKey(key); } diff --git a/data-store/src/main/java/org/lucares/pdb/datastore/internal/DataStore.java b/data-store/src/main/java/org/lucares/pdb/datastore/internal/DataStore.java index 1c1eefb..a3077b9 100644 --- a/data-store/src/main/java/org/lucares/pdb/datastore/internal/DataStore.java +++ b/data-store/src/main/java/org/lucares/pdb/datastore/internal/DataStore.java @@ -24,6 +24,7 @@ import org.lucares.pdb.datastore.Doc; import org.lucares.pdb.datastore.Proposal; import org.lucares.pdb.datastore.lang.Expression; import org.lucares.pdb.datastore.lang.ExpressionToDocIdVisitor; +import org.lucares.pdb.datastore.lang.NewProposerParser; import org.lucares.pdb.datastore.lang.QueryLanguageParser; import org.lucares.pdb.diskstorage.DiskStorage; import org.lucares.pdb.map.PersistentMap; @@ -150,14 +151,17 @@ public class DataStore implements AutoCloseable { private final PersistentMap tagToDocsId; + private final QueryCompletionIndex queryCompletionIndex; + // A Doc will never be changed once it is created. Therefore we can cache them // easily. 
- private final HotEntryCache docIdToDocCache = new HotEntryCache<>(Duration.ofMinutes(10), + private final HotEntryCache docIdToDocCache = new HotEntryCache<>(Duration.ofSeconds(5), "docIdToDocCache"); private final DiskStorage diskStorage; private final Path diskStorageFilePath; private final Path storageBasePath; + private final Path queryCompletionIndexFile; public DataStore(final Path dataDirectory) throws IOException { storageBasePath = storageDirectory(dataDirectory); @@ -178,6 +182,9 @@ public class DataStore implements AutoCloseable { final Path docIdToDocIndexPath = storageBasePath.resolve("docIdToDocIndex.bs"); docIdToDoc = new PersistentMap<>(docIdToDocIndexPath, PersistentMap.LONG_CODER, ENCODER_DOC); + + queryCompletionIndexFile = storageBasePath.resolve("queryCompletionIndex.bs"); + queryCompletionIndex = new QueryCompletionIndex(queryCompletionIndexFile); } private Path keyCompressionFile(final Path dataDirectory) throws IOException { @@ -199,6 +206,7 @@ public class DataStore implements AutoCloseable { final Long oldDocId = tagsToDocId.putValue(tags, docId); Preconditions.checkNull(oldDocId, "There must be at most one document for tags: {0}", tags); + // store mapping from tag to docId, so that we can find all docs for a given tag final List ts = new ArrayList<>(tags.toTags()); ts.add(TAG_ALL_DOCS); for (final Tag tag : ts) { @@ -215,6 +223,10 @@ public class DataStore implements AutoCloseable { } } + // index the tags, so that we can efficiently find all possible values for a + // field in a query + queryCompletionIndex.addTags(tags); + return newFilesRootBlockOffset; } @@ -295,7 +307,7 @@ public class DataStore implements AutoCloseable { final List result = new ArrayList<>(docIdsList.size()); synchronized (docIdToDoc) { - + final long start = System.nanoTime(); for (int i = 0; i < docIdsList.size(); i++) { final long docId = docIdsList.get(i); @@ -304,6 +316,8 @@ public class DataStore implements AutoCloseable { result.add(doc); } + 
System.out.println( + "mapDocIdsToDocs(" + docIdsList.size() + "): " + (System.nanoTime() - start) / 1_000_000.0 + "ms"); } return result; } @@ -342,6 +356,11 @@ public class DataStore implements AutoCloseable { } public List propose(final String query, final int caretIndex) { + + final NewProposerParser newProposerParser = new NewProposerParser(this); + final List proposals = newProposerParser.propose(query, caretIndex); + System.out.println(proposals); + return new Proposer(this).propose(query, caretIndex); } diff --git a/data-store/src/main/java/org/lucares/pdb/datastore/internal/QueryCompletionIndex.java b/data-store/src/main/java/org/lucares/pdb/datastore/internal/QueryCompletionIndex.java new file mode 100644 index 0000000..dcd66ab --- /dev/null +++ b/data-store/src/main/java/org/lucares/pdb/datastore/internal/QueryCompletionIndex.java @@ -0,0 +1,158 @@ +package org.lucares.pdb.datastore.internal; + +import java.io.IOException; +import java.nio.file.Path; +import java.util.List; +import java.util.SortedSet; +import java.util.TreeSet; + +import org.lucares.collections.LongList; +import org.lucares.pdb.api.Tag; +import org.lucares.pdb.api.Tags; +import org.lucares.pdb.map.Empty; +import org.lucares.pdb.map.PersistentMap; +import org.lucares.pdb.map.PersistentMap.EncoderDecoder; +import org.lucares.utils.byteencoder.VariableByteEncoder; + +/** + * This index supports query completion. + *

+ * E.g. Given the query "firstname=John and lastname=|" ('|' denotes the + * position of the caret). How do we find all lastnames that match this query? + *
+ * The expensive way is to execute the query for all available lastnames and + * keep those that return at least one result.
+ * A more efficient way uses an index that lists all lastnames that occur with + * firstname=John. If we write this as a table, then it looks like this: + * + *

+ *┏━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━┓
+ *┃ fieldA  ┃ valueA  ┃ fieldB  ┃  valueB ┃
+ *┣━━━━━━━━━╋━━━━━━━━━╋━━━━━━━━━╋━━━━━━━━━┫
+ *┃firstname┃ John    ┃lastname ┃ Connor  ┃
+ *┣━━━━━━━━━╋━━━━━━━━━╋━━━━━━━━━╋━━━━━━━━━┫
+ *┃firstname┃ John    ┃lastname ┃Carpenter┃
+ *┣━━━━━━━━━╋━━━━━━━━━╋━━━━━━━━━╋━━━━━━━━━┫
+ *┃firstname┃ John    ┃country  ┃ Germany ┃
+ *┣━━━━━━━━━╋━━━━━━━━━╋━━━━━━━━━╋━━━━━━━━━┫
+ *┃firstname┃ John    ┃lastname ┃ Nash    ┃
+ *┣━━━━━━━━━╋━━━━━━━━━╋━━━━━━━━━╋━━━━━━━━━┫
+ *┃firstname┃ Rick    ┃lastname ┃ Meyer   ┃
+ *┣━━━━━━━━━╋━━━━━━━━━╋━━━━━━━━━╋━━━━━━━━━┫
+ *┃firstname┃ Rick    ┃lastname ┃ Castle  ┃
+ *┗━━━━━━━━━┻━━━━━━━━━┻━━━━━━━━━┻━━━━━━━━━┛
+ * 
+ * + * The lastnames where firstname=John are Connor, Carpenter and Nash. Given such + * a table we can just search for all rows with fieldA=firstname and valueA=John and + * fieldB = lastname. + *

+ * The values in this index represent such a table. + *

+ * Note: the index contains all four columns, but when searching we only use the + * first three. + * + */ +public class QueryCompletionIndex implements AutoCloseable { + private static final class TwoTags { + private final Tag tagA; + private final Tag tagB; + + public TwoTags(final Tag tagA, final Tag tagB) { + this.tagA = tagA; + this.tagB = tagB; + } + + public Tag getTagA() { + return tagA; + } + + public Tag getTagB() { + return tagB; + } + + @Override + public String toString() { + return tagA + "::" + tagB; + } + + } + + private static final class EncoderTwoTags implements EncoderDecoder { + + @Override + public byte[] encode(final TwoTags tagAndField) { + final LongList tmp = new LongList(4); + final Tag tagA = tagAndField.getTagA(); + final Tag tagB = tagAndField.getTagB(); + + tmp.add(tagA.getKey()); + tmp.add(tagA.getValue()); + + tmp.add(tagB.getKey()); + + // A query for tagA.key and tagA.value and tagB.key is done by setting + // tagB.value to a negative number (e.g. -1). + // The query is then executed as a prefix search. Thus tagB.value must not be + // part of the byte array that is returned. 
+ if (tagB.getValue() >= 0) { + tmp.add(tagB.getValue()); + } + + return VariableByteEncoder.encode(tmp); + } + + @Override + public TwoTags decode(final byte[] bytes) { + + final LongList tmp = VariableByteEncoder.decode(bytes); + final int tagAKey = (int) tmp.get(0); + final int tagAValue = (int) tmp.get(1); + final int tagBKey = (int) tmp.get(2); + final int tagBValue = (int) tmp.get(3); + + final Tag tagA = new Tag(tagAKey, tagAValue); + final Tag tagB = new Tag(tagBKey, tagBValue); + + return new TwoTags(tagA, tagB); + } + } + + private final PersistentMap tagToTagIndex; + + public QueryCompletionIndex(final Path indexFile) throws IOException { + tagToTagIndex = new PersistentMap<>(indexFile, new EncoderTwoTags(), PersistentMap.EMPTY_ENCODER); + } + + public void addTags(final Tags tags) throws IOException { + final List listOfTagsA = tags.toTags(); + final List listOfTagsB = tags.toTags(); + + // index all combinations of tagA and tagB + for (final Tag tagA : listOfTagsA) { + for (final Tag tagB : listOfTagsB) { + final TwoTags key = new TwoTags(tagA, tagB); + tagToTagIndex.putValue(key, Empty.INSTANCE); + } + } + } + + @Override + public void close() throws IOException { + tagToTagIndex.close(); + } + + public SortedSet find(final Tag tag, final String field) throws IOException { + final SortedSet result = new TreeSet<>(); + final int tagBKey = Tags.STRING_COMPRESSOR.put(field); + final Tag tagB = new Tag(tagBKey, -1); // the value must be negative for the prefix search to work. 
See + // EncoderTwoTags + final TwoTags keyPrefix = new TwoTags(tag, tagB); + tagToTagIndex.visitValues(keyPrefix, (k, v) -> { + result.add(k.getTagB().getValueAsString()); + }); + + return result; + + } +} diff --git a/data-store/src/main/java/org/lucares/pdb/datastore/lang/NewProposerParser.java b/data-store/src/main/java/org/lucares/pdb/datastore/lang/NewProposerParser.java new file mode 100644 index 0000000..4ab7624 --- /dev/null +++ b/data-store/src/main/java/org/lucares/pdb/datastore/lang/NewProposerParser.java @@ -0,0 +1,27 @@ +package org.lucares.pdb.datastore.lang; + +import java.util.List; + +import org.lucares.pdb.datastore.Proposal; +import org.lucares.pdb.datastore.internal.DataStore; + +public class NewProposerParser { + + private final static String CARET_MARKER = "\ue001"; // third character in the private use area + + private final DataStore dataStore; + + public NewProposerParser(final DataStore dataStore) { + this.dataStore = dataStore; + } + + public List propose(final String query, final int caretIndex) { + + final String queryString = new StringBuilder(query).insert(caretIndex, CARET_MARKER).toString(); + + final Expression expression = QueryLanguageParser.parse(queryString); + + return null; + } + +} diff --git a/data-store/src/test/java/org/lucares/pdb/datastore/internal/ProposerTest.java b/data-store/src/test/java/org/lucares/pdb/datastore/internal/ProposerTest.java index 0fc1a5a..aad9399 100644 --- a/data-store/src/test/java/org/lucares/pdb/datastore/internal/ProposerTest.java +++ b/data-store/src/test/java/org/lucares/pdb/datastore/internal/ProposerTest.java @@ -76,6 +76,9 @@ public class ProposerTest { assertProposals("bird", 4, // new Proposal("bird", "bird=* ", true, "bird=", 5) // ); + assertProposals("bird=eagle and n", 16, // + new Proposal("name", "bird=eagle and name=* ", true, "bird=eagle and name=", 20) // + ); } public void testPrefixOfValue() throws Exception { @@ -86,9 +89,9 @@ public class ProposerTest { new 
Proposal("Jennifer", "name =Jennifer", true, "name =Jennifer", 14), // new Proposal("Jenny", "name =Jenny", true, "name =Jenny", 11) // ); - - assertProposals("bird=eagle and n", 16, // - new Proposal("name", "bird=eagle and name=* ", true, "bird=eagle and name=", 20) // + assertProposals("name =Tim,Je", 12, // + new Proposal("Jennifer", "name =Tim,Jennifer", true, "name =Tim,Jennifer", 18), // + new Proposal("Jenny", "name =Tim,Jenny", true, "name =Tim,Jenny", 15) // ); /* */ diff --git a/data-store/src/test/java/org/lucares/pdb/datastore/internal/QueryCompletionIndexTest.java b/data-store/src/test/java/org/lucares/pdb/datastore/internal/QueryCompletionIndexTest.java new file mode 100644 index 0000000..7c74fe7 --- /dev/null +++ b/data-store/src/test/java/org/lucares/pdb/datastore/internal/QueryCompletionIndexTest.java @@ -0,0 +1,60 @@ +package org.lucares.pdb.datastore.internal; + +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Path; +import java.util.Arrays; +import java.util.List; +import java.util.SortedSet; + +import org.lucares.pdb.api.StringCompressor; +import org.lucares.pdb.api.Tag; +import org.lucares.pdb.api.Tags; +import org.lucares.pdb.api.UniqueStringIntegerPairs; +import org.lucares.utils.file.FileUtils; +import org.testng.Assert; +import org.testng.annotations.AfterMethod; +import org.testng.annotations.BeforeMethod; +import org.testng.annotations.Test; + +@Test +public class QueryCompletionIndexTest { + + private Path dataDirectory; + + @BeforeMethod + public void beforeMethod() throws IOException { + dataDirectory = Files.createTempDirectory("pdb"); + } + + @AfterMethod + public void afterMethod() throws IOException { + FileUtils.delete(dataDirectory); + } + + public void test() throws Exception { + Tags.STRING_COMPRESSOR = new StringCompressor(new UniqueStringIntegerPairs()); + + final List tags = Arrays.asList(// + Tags.create("firstname", "John", "lastname", "Doe", "country", "Atlantis"), // A + 
Tags.create("firstname", "Jane", "lastname", "Doe", "country", "ElDorado"), // B + Tags.create("firstname", "John", "lastname", "Miller", "country", "Atlantis")// C + ); + + try (QueryCompletionIndex index = new QueryCompletionIndex(dataDirectory.resolve("qci.bs"))) { + for (final Tags t : tags) { + index.addTags(t); + } + + // all firstnames where lastname=Doe are returned sorted alphabetically. + // tags A and B match + final SortedSet firstnamesWithLastnameDoe = index.find(new Tag("lastname", "Doe"), "firstname"); + Assert.assertEquals(firstnamesWithLastnameDoe, Arrays.asList("Jane", "John")); + + // no duplicates are returned: + // tags A and C match firstname=John, but both have country=Atlantis + final SortedSet countryWithFirstnameJohn = index.find(new Tag("firstname", "John"), "country"); + Assert.assertEquals(countryWithFirstnameJohn, Arrays.asList("Atlantis")); + } + } +} diff --git a/pdb-api/src/main/java/org/lucares/pdb/api/Tags.java b/pdb-api/src/main/java/org/lucares/pdb/api/Tags.java index f3d9d62..6e5c57f 100644 --- a/pdb-api/src/main/java/org/lucares/pdb/api/Tags.java +++ b/pdb-api/src/main/java/org/lucares/pdb/api/Tags.java @@ -8,6 +8,7 @@ import java.util.TreeSet; import java.util.function.BiConsumer; import java.util.function.Function; +import org.lucares.collections.IntList; import org.lucares.collections.LongList; import org.lucares.utils.byteencoder.VariableByteEncoder; @@ -156,6 +157,14 @@ public class Tags implements Comparable { return result; } + public IntList getKeysAsInt() { + final IntList result = new IntList(); + for (final Tag tag : tags) { + result.add(tag.getKey()); + } + return result; + } + public List toTags() { return Collections.unmodifiableList(tags); }