diff --git a/byte-utils/src/main/java/org/lucares/utils/byteencoder/VariableByteEncoder.java b/byte-utils/src/main/java/org/lucares/utils/byteencoder/VariableByteEncoder.java index b11cf88..bc35c30 100644 --- a/byte-utils/src/main/java/org/lucares/utils/byteencoder/VariableByteEncoder.java +++ b/byte-utils/src/main/java/org/lucares/utils/byteencoder/VariableByteEncoder.java @@ -16,12 +16,12 @@ import org.lucares.collections.LongList; *

* Please note two things: *

    - *
  1. 0 is encoded to 1; the encoded values do not contain 0 + *
  2. 0 is encoded to 1; the encoded bytes do not contain the null byte *
  3. all but the last byte have the high value bit set *
- * That means no byte will have the value 0. This is important when decoding - * bytes, because we can decode bytes until we encounter the first null byte, or - * we reach the end of the array. + * No byte will have the value 0. This is important when decoding bytes, because + * we can decode bytes until we encounter the first null byte, or we reach the + * end of the array. */ public class VariableByteEncoder { diff --git a/data-store/src/main/java/org/lucares/pdb/datastore/internal/QueryCompletionIndex.java b/data-store/src/main/java/org/lucares/pdb/datastore/internal/QueryCompletionIndex.java index daf4f3a..252b637 100644 --- a/data-store/src/main/java/org/lucares/pdb/datastore/internal/QueryCompletionIndex.java +++ b/data-store/src/main/java/org/lucares/pdb/datastore/internal/QueryCompletionIndex.java @@ -11,6 +11,7 @@ import org.lucares.collections.LongList; import org.lucares.pdb.api.DateTimeRange; import org.lucares.pdb.api.Tag; import org.lucares.pdb.api.Tags; +import org.lucares.pdb.datastore.lang.QueryCompletionExpressionOptimizer; import org.lucares.pdb.map.Empty; import org.lucares.pdb.map.PersistentMap; import org.lucares.pdb.map.PersistentMap.EncoderDecoder; @@ -20,35 +21,108 @@ import org.lucares.utils.byteencoder.VariableByteEncoder; /** * This index supports query completion. *

- * E.g. Given the query "firstname=John and lastname=|" ('|' denotes the - * position of the caret). How do we find all lastnames that match this query? + * E.g. Given the query "firstname=John and size=tall and lastname=|" ('|' + * denotes the position of the caret). How do we find all lastnames that match + * this query? + *

+ *

Alternative Solutions

The expensive way is to execute the query for + * all available lastnames and keep those that return at least one result.
+ * Another well known approach is to have an index that maps field+value to the + * list of documents that are tagged with field=value. You get all documents for + * firstname=John as well as size=tall. Then you intersect those lists and get + * the documents for "firstname=John and size=tall". Then you iterate over + * those documents and check which of those are tagged with lastname, collect + * all lastnames and return them. The disadvantage of this is that we have to + * load all matched documents, which does not scale for millions of documents. *
- * The expensive way is to execute the query for all available lastnames and - * keep those that return at least one result.
- * A more effiecient way uses an index that lists all lastnames that occur with - * firstname=John. If we write this as table, then it looks like this: + * An improvement is to add indices for all documents that are tagged to a + * field. In other words for each field we have a list of documents that are + * tagged to any value in them. This can improve things a lot, but in the worst + * case we still have to get all documents.
+ * If the number of values in a field is small we could iterate over all values + * in lastname and build the intersection with the documents matching + * "firstname=John and size=tall". + * + *

Solution

Here we chose a different solution. We are not building + * intersections of document ids to find out if an expression yields a result. + * We do it the other way around. + *

+ * The key insights are that + *

    + *
  1. for query completion we do not have to know which documents match the + * query. We only need to know that a document matches. + *
  2. We can normalize all boolean expressions, see + * {@link QueryCompletionExpressionOptimizer}. + *
  3. There is no remove operation. This simplifies things. With removal we + * would have to maintain additional counters. + *
+ *

+ * Let's start simple. If we allow only queries of the form "field=value" (no + * binary operators, no negation), then query completion becomes simple. We + * only need to know the field+value combinations that have been used. An index + * like this will be sufficient. * *

- *┏━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━┓
- *┃ fieldB  ┃ fieldA  ┃ valueA  ┃  valueB ┃
- *┣━━━━━━━━━╋━━━━━━━━━╋━━━━━━━━━╋━━━━━━━━━┫
- *┃lastname ┃firstname┃ John    ┃ Connor  ┃
- *┣━━━━━━━━━╋━━━━━━━━━╋━━━━━━━━━╋━━━━━━━━━┫
- *┃lastname ┃firstname┃ John    ┃Carpenter┃
- *┣━━━━━━━━━╋━━━━━━━━━╋━━━━━━━━━╋━━━━━━━━━┫
- *┃country  ┃firstname┃ John    ┃ Germany ┃
- *┣━━━━━━━━━╋━━━━━━━━━╋━━━━━━━━━╋━━━━━━━━━┫
- *┃lastname ┃firstname┃ John    ┃ Nash    ┃
- *┣━━━━━━━━━╋━━━━━━━━━╋━━━━━━━━━╋━━━━━━━━━┫
- *┃lastname ┃firstname┃ Rick    ┃ Meyer   ┃
- *┣━━━━━━━━━╋━━━━━━━━━╋━━━━━━━━━╋━━━━━━━━━┫
- *┃lastname ┃firstname┃ Rick    ┃ Castle  ┃
- *┗━━━━━━━━━┻━━━━━━━━━┻━━━━━━━━━┻━━━━━━━━━┛
+ *┏━━━━━━━━━┳━━━━━━━━━┓
+ *┃ field   ┃ value   ┃
+ *┣━━━━━━━━━╋━━━━━━━━━┫
+ *┃lastname ┃Connor   ┃
+ *┣━━━━━━━━━╋━━━━━━━━━┫
+ *┃lastname ┃Carpenter┃
+ *┗━━━━━━━━━┻━━━━━━━━━┛
  * 
* - * The lastnames where firstname=John are: Connor, Carpenter and Nash. Given - * such a table we can just for all rows with fieldA=firstname and valueA=John - * and fieldB = lastname. + * Note, this index can be used to find all values in field 'lastname' that + * start with a 'C', or that end in 'ter', or that contain 'nn'. + *

+ * Now we make it a little bit more complex by analyzing queries like + * "lastname=Meyer and firstname=|" ('|' denotes the position of the caret). The + * index above does not help us here, so we need another index. To do this we + * use an index that contains all pairwise combinations of tags. + *

+ * Given the following 5 documents. + * + *

+ * d1 with tags firstname=John, lastname=Carpenter
+ * d2 with tags firstname=John, lastname=Connor
+ * d3 with tags firstname=John, lastname=Meyer
+ * d4 with tags firstname=Rick, lastname=Castle
+ * d5 with tags firstname=Rick, lastname=Meyer
+ * 
+ * + * The index looks like this. We visualize it as a table. In reality the strings + * are mapped to integers and then transformed into bytes with + * {@link VariableByteEncoder}: + * + *
+ *   ┏━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━┓
+ *   ┃ fieldB  ┃ fieldA  ┃ valueA  ┃  valueB ┃
+ *   ┣━━━━━━━━━╋━━━━━━━━━╋━━━━━━━━━╋━━━━━━━━━┫
+ * 1 ┃firstname┃lastname ┃Carpenter┃ John    ┃
+ *   ┣━━━━━━━━━╋━━━━━━━━━╋━━━━━━━━━╋━━━━━━━━━┫
+ * 2 ┃firstname┃lastname ┃ Castle  ┃ Rick    ┃
+ *   ┣━━━━━━━━━╋━━━━━━━━━╋━━━━━━━━━╋━━━━━━━━━┫
+ * 3 ┃firstname┃lastname ┃ Connor  ┃ John    ┃
+ *   ┣━━━━━━━━━╋━━━━━━━━━╋━━━━━━━━━╋━━━━━━━━━┫
+ * 4 ┃firstname┃lastname ┃ Meyer   ┃ John    ┃
+ *   ┣━━━━━━━━━╋━━━━━━━━━╋━━━━━━━━━╋━━━━━━━━━┫
+ * 5 ┃firstname┃lastname ┃ Meyer   ┃ Rick    ┃
+ *   ┣━━━━━━━━━╋━━━━━━━━━╋━━━━━━━━━╋━━━━━━━━━┫
+ * 6 ┃lastname ┃firstname┃ John    ┃ Connor  ┃
+ *   ┣━━━━━━━━━╋━━━━━━━━━╋━━━━━━━━━╋━━━━━━━━━┫
+ * 7 ┃lastname ┃firstname┃ John    ┃Carpenter┃
+ *   ┣━━━━━━━━━╋━━━━━━━━━╋━━━━━━━━━╋━━━━━━━━━┫
+ * 8 ┃lastname ┃firstname┃ John    ┃ Meyer   ┃
+ *   ┣━━━━━━━━━╋━━━━━━━━━╋━━━━━━━━━╋━━━━━━━━━┫
+ * 9 ┃lastname ┃firstname┃ Rick    ┃ Castle  ┃
+ *   ┣━━━━━━━━━╋━━━━━━━━━╋━━━━━━━━━╋━━━━━━━━━┫
+ *10 ┃lastname ┃firstname┃ Rick    ┃ Meyer   ┃
+ *   ┗━━━━━━━━━┻━━━━━━━━━┻━━━━━━━━━┻━━━━━━━━━┛
+ * 
+ * + * The column order is important. It allows us to efficiently get all values for + * field B where field A has some value. E.g. all lastnames where + * firstname=John are in rows 6-8.
*

* Please note, that the columns for fieldA and fieldB come first. This is to * make this index more suitable for IN-expressions and wildcard expressions of @@ -57,15 +131,15 @@ import org.lucares.utils.byteencoder.VariableByteEncoder; * evaluation while iterating over those hits. We do not have to expand the * wildcard and the do hundreds or thousands of queries. *

- * Please note, that fieldB comes before fieldA. This is, so that we can run - * inverse searches more efficiently. E.g. finding all values for - * fieldB=lastname where fieldA=firstname has a value != Connor. This is used - * for queries like 'NOT (firstname=Connor) and lastname=|' + * FieldB comes before fieldA. This is so that we can run inverse searches more + * efficiently. E.g. finding all values for fieldB=lastname where + * fieldA=firstname has a value != Connor. This is used for queries like 'NOT + * (firstname=Connor) and lastname=|' *

- * The values in this index represent such a table. - *

- * Note: the index contains all four columns, but when searching we only use the - * first three. + * The index size grows quadratically in the number of tags each document has. + * Ten or more tags are common. That means one hundred or more entries in the + * index. That sounds a lot, but remember, this has only to be done once. The + * benefits are much faster searches. * */ public class QueryCompletionIndex implements AutoCloseable { @@ -164,10 +238,10 @@ public class QueryCompletionIndex implements AutoCloseable { return new TwoTags(tagA, tagB); } - + @Override public byte[] getEmptyValue() { - return new byte[] {0,0,0,0}; + return new byte[] { 0, 0, 0, 0 }; } } @@ -193,10 +267,9 @@ public class QueryCompletionIndex implements AutoCloseable { return new Tag(key, value); } - @Override public byte[] getEmptyValue() { - return new byte[] {0}; + return new byte[] { 0 }; } } @@ -217,10 +290,10 @@ public class QueryCompletionIndex implements AutoCloseable { final long compressedString = VariableByteEncoder.decodeFirstValue(bytes); return Tags.STRING_COMPRESSOR.get((int) compressedString); } - + @Override public byte[] getEmptyValue() { - return new byte[] {0}; + return new byte[] { 0 }; } } @@ -263,14 +336,33 @@ public class QueryCompletionIndex implements AutoCloseable { tagToTagIndex.close(); } - public SortedSet find(final DateTimeRange dateRange, final String property, final String value, - final String field) { - final Tag tag = new Tag(property, value); - Preconditions.checkGreaterOrEqual(tag.getKey(), 0, "The property '{0}' is unkown", property); - Preconditions.checkGreaterOrEqual(tag.getValue(), 0, "The value '{0}' is unkown", value); - return find(dateRange, tag, field); + /** + * Find values that are yield results when executing the query "fieldA=valueA + * and fieldB=???" 
+ * + * @param dateRange the date range + * @param fieldA the other field of the and expression + * @param valueA the value of the other field + * @param fieldB the field we are searching values for + * @return values of fieldB + */ + public SortedSet find(final DateTimeRange dateRange, final String fieldA, final String valueA, + final String fieldB) { + final Tag tag = new Tag(fieldA, valueA); + Preconditions.checkGreaterOrEqual(tag.getKey(), 0, "The field ''{0}'' is unkown", fieldA); + Preconditions.checkGreaterOrEqual(tag.getValue(), 0, "The value ''{0}'' is unkown", valueA); + return find(dateRange, tag, fieldB); } + /** + * Find values that are yield results when executing the query + * "tag.field=tag.value and fieldB=???" + * + * @param dateRange the date range + * @param tag the other tag + * @param field the field we are searching values for + * @return values for the field + */ public SortedSet find(final DateTimeRange dateRange, final Tag tag, final String field) { final SortedSet result = new TreeSet<>(); @@ -287,6 +379,13 @@ public class QueryCompletionIndex implements AutoCloseable { return result; } + /** + * Find all values for the given field. + * + * @param dateRange the date range + * @param field the field + * @return the values + */ public SortedSet findAllValuesForField(final DateTimeRange dateRange, final String field) { final SortedSet result = new TreeSet<>(); @@ -301,6 +400,16 @@ public class QueryCompletionIndex implements AutoCloseable { return result; } + /** + * Find values for {@code field} that will yield results for the query + * "tag.field=tag.value and not field=???". + *

+ * + * @param dateRange the date range + * @param tag the other tag + * @param field the field we are searching values for + * @return the values + */ public SortedSet findAllValuesNotForField(final DateTimeRange dateRange, final Tag tag, final String field) { final SortedSet result = new TreeSet<>(); diff --git a/data-store/src/main/java/org/lucares/pdb/datastore/lang/Expression.java b/data-store/src/main/java/org/lucares/pdb/datastore/lang/Expression.java index 433024a..aa320f4 100644 --- a/data-store/src/main/java/org/lucares/pdb/datastore/lang/Expression.java +++ b/data-store/src/main/java/org/lucares/pdb/datastore/lang/Expression.java @@ -348,11 +348,11 @@ abstract public class Expression { } static class Property extends Expression { - final String property; + final String field; final Terminal value; - public Property(final String property, final Terminal value) { - this.property = property; + public Property(final String field, final Terminal value) { + this.field = field; this.value = value; } @@ -364,7 +364,7 @@ abstract public class Expression { @Override public String toString() { - return property + " = " + value.getValue(); + return field + " = " + value.getValue(); } @Override @@ -372,8 +372,8 @@ abstract public class Expression { return value.containsCaret(); } - public String getProperty() { - return property; + public String getField() { + return field; } public Terminal getValue() { @@ -388,7 +388,7 @@ abstract public class Expression { public int hashCode() { final int prime = 31; int result = 1; - result = prime * result + ((property == null) ? 0 : property.hashCode()); + result = prime * result + ((field == null) ? 0 : field.hashCode()); result = prime * result + ((value == null) ? 
0 : value.hashCode()); return result; } @@ -402,10 +402,10 @@ abstract public class Expression { if (getClass() != obj.getClass()) return false; final Property other = (Property) obj; - if (property == null) { - if (other.property != null) + if (field == null) { + if (other.field != null) return false; - } else if (!property.equals(other.property)) + } else if (!field.equals(other.field)) return false; if (value == null) { if (other.value != null) @@ -508,21 +508,21 @@ abstract public class Expression { } static class InExpression extends Expression { - private final String property; + private final String field; private final List values; - public InExpression(final String property, final String value) { - this(property, Arrays.asList(value)); + public InExpression(final String field, final String value) { + this(field, Arrays.asList(value)); } - public InExpression(final String property, final List values) { - this.property = property; + public InExpression(final String field, final List values) { + this.field = field; this.values = values; } @Override public String toString() { - return property + " in (" + String.join(", ", values) + ")"; + return field + " in (" + String.join(", ", values) + ")"; } @Override @@ -531,7 +531,7 @@ abstract public class Expression { } public String getProperty() { - return property; + return field; } public List getValues() { @@ -552,7 +552,7 @@ abstract public class Expression { public int hashCode() { final int prime = 31; int result = 1; - result = prime * result + ((property == null) ? 0 : property.hashCode()); + result = prime * result + ((field == null) ? 0 : field.hashCode()); result = prime * result + ((values == null) ? 
0 : values.hashCode()); return result; } @@ -566,10 +566,10 @@ abstract public class Expression { if (getClass() != obj.getClass()) return false; final InExpression other = (InExpression) obj; - if (property == null) { - if (other.property != null) + if (field == null) { + if (other.field != null) return false; - } else if (!property.equals(other.property)) + } else if (!field.equals(other.field)) return false; if (values == null) { if (other.values != null) diff --git a/data-store/src/main/java/org/lucares/pdb/datastore/lang/FindValuesForQueryCompletion.java b/data-store/src/main/java/org/lucares/pdb/datastore/lang/FindValuesForQueryCompletion.java index be45a12..dc9bc3c 100644 --- a/data-store/src/main/java/org/lucares/pdb/datastore/lang/FindValuesForQueryCompletion.java +++ b/data-store/src/main/java/org/lucares/pdb/datastore/lang/FindValuesForQueryCompletion.java @@ -44,7 +44,7 @@ public class FindValuesForQueryCompletion extends ExpressionVisitor result = new TreeSet<>(); - final String fieldA = property.getProperty(); + final String fieldA = property.getField(); final String valueA = property.getValue().getValue(); final boolean hasField = index.hasField(dateTimeRange, fieldA); @@ -133,7 +133,7 @@ public class FindValuesForQueryCompletion extends ExpressionVisitor valuesNotForField = index.findAllValuesNotForField(dateTimeRange, tag, field); final SortedSet valuesForField = index.find(dateTimeRange, tag, field); @@ -162,7 +162,7 @@ public class FindValuesForQueryCompletion extends ExpressionVisitor visit(final Property property) { final long start = System.nanoTime(); - final String field = property.getProperty(); + final String field = property.getField(); final String value = property.getValue().getValue(); final SortedSet allValuesForField = queryCompletionIndex.findAllValuesForField(dateRange, field); @@ -179,7 +179,7 @@ public class FindValuesForQueryCompletion extends ExpressionVisitor allValuesForField = 
queryCompletionIndex.findAllValuesForField(dateRange, - caretExpression.getProperty()); + caretExpression.getField()); final SortedSet valuesForFieldMatchingCaretExpression = GloblikePattern.filterValues(allValuesForField, valuePattern, TreeSet::new); @@ -232,7 +232,7 @@ public class FindValuesForQueryCompletion extends ExpressionVisitor allValuesForField = queryCompletionIndex.findAllValuesForField(dateRange, field); final String valueWithCaretMarker = ((Property) innerExpression).getValue().getValue(); final String valuePrefix = valueWithCaretMarker.substring(0, diff --git a/data-store/src/main/java/org/lucares/pdb/datastore/lang/QueryCompletionExpressionOptimizer.java b/data-store/src/main/java/org/lucares/pdb/datastore/lang/QueryCompletionExpressionOptimizer.java index 159b134..1effe15 100644 --- a/data-store/src/main/java/org/lucares/pdb/datastore/lang/QueryCompletionExpressionOptimizer.java +++ b/data-store/src/main/java/org/lucares/pdb/datastore/lang/QueryCompletionExpressionOptimizer.java @@ -18,7 +18,7 @@ import org.slf4j.LoggerFactory; /** * Query completion utilizes an index that contains all mappings of * tags+fieldname to values. This index can be used to answer the question what - * the possible values for fields in simple and queries are. + * the possible values for fields in simple 'and' queries are. *

* E.g. Given the query "lastname=Doe and firstname=|" ('|' is the marker for * the caret position). All possible values for firstname are in the index under diff --git a/pdb-api/src/main/java/org/lucares/pdb/api/Tag.java b/pdb-api/src/main/java/org/lucares/pdb/api/Tag.java index 4a6bb4e..48bbcc8 100644 --- a/pdb-api/src/main/java/org/lucares/pdb/api/Tag.java +++ b/pdb-api/src/main/java/org/lucares/pdb/api/Tag.java @@ -1,25 +1,43 @@ package org.lucares.pdb.api; +/** + * A {@link Tag} consists of a field and a value. In a query this is written as + * field=value, e.g., name=Sam where 'name' is the field and + * 'Sam' is the value. + */ public class Tag implements Comparable { - private final int key; + private final int field; private final int value; - public Tag(final int key, final int value) { - this.key = key; + /** + * Create a new tag with field and value specified as int. See + * {@link Tags#STRING_COMPRESSOR} for the mapping between Strings and ints. + * + * @param field the field as int + * @param value the value as int + */ + public Tag(final int field, final int value) { + this.field = field; this.value = value; } - public Tag(final String key, final String value) { - this.key = key != null ? Tags.STRING_COMPRESSOR.getIfPresent(key) : -1; + /** + * Create a new {@link Tag} for the given field and value. + * + * @param field the field + * @param value the value + */ + public Tag(final String field, final String value) { + this.field = field != null ? Tags.STRING_COMPRESSOR.getIfPresent(field) : -1; this.value = value != null ? 
Tags.STRING_COMPRESSOR.getIfPresent(value) : -1; } @Override public int compareTo(final Tag o) { - if (key != o.key) { - return key - o.key; + if (field != o.field) { + return field - o.field; } else if (value != o.value) { return value - o.value; } @@ -28,11 +46,11 @@ public class Tag implements Comparable { } public int getKey() { - return key; + return field; } public String getKeyAsString() { - return Tags.STRING_COMPRESSOR.get(key); + return Tags.STRING_COMPRESSOR.get(field); } public int getValue() { @@ -45,14 +63,14 @@ public class Tag implements Comparable { @Override public String toString() { - return Tags.STRING_COMPRESSOR.get(key) + "=" + Tags.STRING_COMPRESSOR.get(value); + return Tags.STRING_COMPRESSOR.get(field) + "=" + Tags.STRING_COMPRESSOR.get(value); } @Override public int hashCode() { final int prime = 31; int result = 1; - result = prime * result + key; + result = prime * result + field; result = prime * result + value; return result; } @@ -66,7 +84,7 @@ public class Tag implements Comparable { if (getClass() != obj.getClass()) return false; final Tag other = (Tag) obj; - if (key != other.key) + if (field != other.field) return false; if (value != other.value) return false;