diff --git a/byte-utils/src/main/java/org/lucares/utils/byteencoder/VariableByteEncoder.java b/byte-utils/src/main/java/org/lucares/utils/byteencoder/VariableByteEncoder.java
index b11cf88..bc35c30 100644
--- a/byte-utils/src/main/java/org/lucares/utils/byteencoder/VariableByteEncoder.java
+++ b/byte-utils/src/main/java/org/lucares/utils/byteencoder/VariableByteEncoder.java
@@ -16,12 +16,12 @@ import org.lucares.collections.LongList;
*
* Please note two things:
*
- * - 0 is encoded to 1; the encoded values do not contain 0
+ * - 0 is encoded to 1; the encoded bytes do not contain the null byte
* - all but the last byte have the high value bit set
*
- * That means no byte will have the value 0. This is important when decoding
- * bytes, because we can decode bytes until we encounter the first null byte, or
- * we reach the end of the array.
+ * No byte will have the value 0. This is important when decoding bytes, because
+ * we can decode bytes until we encounter the first null byte, or we reach the
+ * end of the array.
*/
public class VariableByteEncoder {
diff --git a/data-store/src/main/java/org/lucares/pdb/datastore/internal/QueryCompletionIndex.java b/data-store/src/main/java/org/lucares/pdb/datastore/internal/QueryCompletionIndex.java
index daf4f3a..252b637 100644
--- a/data-store/src/main/java/org/lucares/pdb/datastore/internal/QueryCompletionIndex.java
+++ b/data-store/src/main/java/org/lucares/pdb/datastore/internal/QueryCompletionIndex.java
@@ -11,6 +11,7 @@ import org.lucares.collections.LongList;
import org.lucares.pdb.api.DateTimeRange;
import org.lucares.pdb.api.Tag;
import org.lucares.pdb.api.Tags;
+import org.lucares.pdb.datastore.lang.QueryCompletionExpressionOptimizer;
import org.lucares.pdb.map.Empty;
import org.lucares.pdb.map.PersistentMap;
import org.lucares.pdb.map.PersistentMap.EncoderDecoder;
@@ -20,35 +21,108 @@ import org.lucares.utils.byteencoder.VariableByteEncoder;
/**
* This index supports query completion.
*
- * E.g. Given the query "firstname=John and lastname=|" ('|' denotes the
- * position of the caret). How do we find all lastnames that match this query?
+ * E.g. Given the query "firstname=John and size=tall and lastname=|" ('|'
+ * denotes the position of the caret). How do we find all lastnames that match
+ * this query?
+ *
+ * Alternative Solutions
+ *
+ * The expensive way is to execute the query for
+ * all available lastnames and keep those that return at least one result.
+ * Another well known approach is to have an index that maps field+value to the
+ * list of documents that are tagged with field=value. You get all documents for
+ * firstname=John as well as size=tall. Then you intersect those lists and get
+ * the documents that match "firstname=John and size=tall". Then you iterate over
+ * those documents and check which of those are tagged with lastname, collect
+ * all lastnames and return them. The disadvantage of this is that we have to
+ * load all matched documents, which does not scale for millions of documents.
*
- * The expensive way is to execute the query for all available lastnames and
- * keep those that return at least one result.
- * A more effiecient way uses an index that lists all lastnames that occur with
- * firstname=John. If we write this as table, then it looks like this:
+ * An improvement is to add indices for all documents that are tagged to a
+ * field. In other words for each field we have a list of documents that are
+ * tagged to any value in them. This can improve things a lot, but in the worst
+ * case we still have to get all documents.
+ * If the number of values in a field is small we could iterate over all values
+ * in lastname and build the intersection with the documents matching
+ * "firstname=John and size=tall".
+ *
+ * Solution
+ *
+ * Here we chose a different solution. We are not building
+ * intersections of document ids to find out if an expression yields a result.
+ * We do it the other way around.
+ *
+ * The key insights are, that
+ *
+ * - for query completion we do not have to know which documents match the
+ * query. We only need to know that a document matches.
+ * - We can normalize all boolean expressions, see
+ * {@link QueryCompletionExpressionOptimizer}.
+ * - There is no remove operation. This simplifies things. With removal we
+ * would have to maintain additional counters.
+ *
+ *
+ * Let's start simple. If we allow only queries of the form "field=value" (no
+ * binary operators, no negation), then query completion becomes simple. We
+ * only need to know the field+value combinations that have been used. An index
+ * like this will be sufficient.
*
*
- *┏━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━┓
- *┃ fieldB ┃ fieldA ┃ valueA ┃ valueB ┃
- *┣━━━━━━━━━╋━━━━━━━━━╋━━━━━━━━━╋━━━━━━━━━┫
- *┃lastname ┃firstname┃ John ┃ Connor ┃
- *┣━━━━━━━━━╋━━━━━━━━━╋━━━━━━━━━╋━━━━━━━━━┫
- *┃lastname ┃firstname┃ John ┃Carpenter┃
- *┣━━━━━━━━━╋━━━━━━━━━╋━━━━━━━━━╋━━━━━━━━━┫
- *┃country ┃firstname┃ John ┃ Germany ┃
- *┣━━━━━━━━━╋━━━━━━━━━╋━━━━━━━━━╋━━━━━━━━━┫
- *┃lastname ┃firstname┃ John ┃ Nash ┃
- *┣━━━━━━━━━╋━━━━━━━━━╋━━━━━━━━━╋━━━━━━━━━┫
- *┃lastname ┃firstname┃ Rick ┃ Meyer ┃
- *┣━━━━━━━━━╋━━━━━━━━━╋━━━━━━━━━╋━━━━━━━━━┫
- *┃lastname ┃firstname┃ Rick ┃ Castle ┃
- *┗━━━━━━━━━┻━━━━━━━━━┻━━━━━━━━━┻━━━━━━━━━┛
+ *┏━━━━━━━━━┳━━━━━━━━━┓
+ *┃ field ┃ value ┃
+ *┣━━━━━━━━━╋━━━━━━━━━┫
+ *┃lastname ┃Connor ┃
+ *┣━━━━━━━━━╋━━━━━━━━━┫
+ *┃lastname ┃Carpenter┃
+ *┗━━━━━━━━━┻━━━━━━━━━┛
*
*
- * The lastnames where firstname=John are: Connor, Carpenter and Nash. Given
- * such a table we can just for all rows with fieldA=firstname and valueA=John
- * and fieldB = lastname.
+ * Note, this index can be used to find all values in field 'lastname' that
+ * start with a 'C', or that end in 'ter', or that contain 'nn'.
+ *
+ * Now we make it a little bit more complex by analyzing queries like
+ * "lastname=Meyer and firstname=|" ('|' denotes the position of the caret). The
+ * index above does not help us here, so we need another index. To do this we
+ * use an index that contains all pairwise combinations of tags.
+ *
+ * Given the following 5 documents.
+ *
+ *
+ * d1 with tags firstname=John, lastname=Carpenter
+ * d2 with tags firstname=John, lastname=Connor
+ * d3 with tags firstname=John, lastname=Meyer
+ * d4 with tags firstname=Rick, lastname=Castle
+ * d5 with tags firstname=Rick, lastname=Meyer
+ *
+ *
+ * The index looks like this. We visualize it as a table. In reality the strings
+ * are mapped to integers and then transformed into bytes with
+ * {@link VariableByteEncoder}:
+ *
+ *
+ * ┏━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━┓
+ * ┃ fieldB ┃ fieldA ┃ valueA ┃ valueB ┃
+ * ┣━━━━━━━━━╋━━━━━━━━━╋━━━━━━━━━╋━━━━━━━━━┫
+ * 1 ┃firstname┃lastname ┃Carpenter┃ John ┃
+ * ┣━━━━━━━━━╋━━━━━━━━━╋━━━━━━━━━╋━━━━━━━━━┫
+ * 2 ┃firstname┃lastname ┃ Castle ┃ Rick ┃
+ * ┣━━━━━━━━━╋━━━━━━━━━╋━━━━━━━━━╋━━━━━━━━━┫
+ * 3 ┃firstname┃lastname ┃ Connor ┃ John ┃
+ * ┣━━━━━━━━━╋━━━━━━━━━╋━━━━━━━━━╋━━━━━━━━━┫
+ * 4 ┃firstname┃lastname ┃ Meyer ┃ John ┃
+ * ┣━━━━━━━━━╋━━━━━━━━━╋━━━━━━━━━╋━━━━━━━━━┫
+ * 5 ┃firstname┃lastname ┃ Meyer ┃ Rick ┃
+ * ┣━━━━━━━━━╋━━━━━━━━━╋━━━━━━━━━╋━━━━━━━━━┫
+ * 6 ┃lastname ┃firstname┃ John ┃ Connor ┃
+ * ┣━━━━━━━━━╋━━━━━━━━━╋━━━━━━━━━╋━━━━━━━━━┫
+ * 7 ┃lastname ┃firstname┃ John ┃Carpenter┃
+ * ┣━━━━━━━━━╋━━━━━━━━━╋━━━━━━━━━╋━━━━━━━━━┫
+ * 8 ┃lastname ┃firstname┃ John ┃ Meyer ┃
+ * ┣━━━━━━━━━╋━━━━━━━━━╋━━━━━━━━━╋━━━━━━━━━┫
+ * 9 ┃lastname ┃firstname┃ Rick ┃ Castle ┃
+ * ┣━━━━━━━━━╋━━━━━━━━━╋━━━━━━━━━╋━━━━━━━━━┫
+ *10 ┃lastname ┃firstname┃ Rick ┃ Meyer ┃
+ * ┗━━━━━━━━━┻━━━━━━━━━┻━━━━━━━━━┻━━━━━━━━━┛
+ *
+ *
+ * The column order is important. It allows us to efficiently get all values for
+ * field B where field A has some value. E.g. all lastnames where
+ * firstname=John are in rows 6-8.
*
* Please note, that the columns for fieldA and fieldB come first. This is to
* make this index more suitable for IN-expressions and wildcard expressions of
@@ -57,15 +131,15 @@ import org.lucares.utils.byteencoder.VariableByteEncoder;
* evaluation while iterating over those hits. We do not have to expand the
* wildcard and the do hundreds or thousands of queries.
*
- * Please note, that fieldB comes before fieldA. This is, so that we can run
- * inverse searches more efficiently. E.g. finding all values for
- * fieldB=lastname where fieldA=firstname has a value != Connor. This is used
- * for queries like 'NOT (firstname=Connor) and lastname=|'
+ * FieldB comes before fieldA. This is, so that we can run inverse searches more
+ * efficiently. E.g. finding all values for fieldB=lastname where
+ * fieldA=firstname has a value != Connor. This is used for queries like 'NOT
+ * (firstname=Connor) and lastname=|'
*
- * The values in this index represent such a table.
- *
- * Note: the index contains all four columns, but when searching we only use the
- * first three.
+ * The index size grows quadratically in the number of tags each document has.
+ * Ten or more tags are common. That means one hundred or more entries in the
+ * index. That sounds a lot, but remember, this has only to be done once. The
+ * benefits are much faster searches.
*
*/
public class QueryCompletionIndex implements AutoCloseable {
@@ -164,10 +238,10 @@ public class QueryCompletionIndex implements AutoCloseable {
return new TwoTags(tagA, tagB);
}
-
+
@Override
public byte[] getEmptyValue() {
- return new byte[] {0,0,0,0};
+ return new byte[] { 0, 0, 0, 0 };
}
}
@@ -193,10 +267,9 @@ public class QueryCompletionIndex implements AutoCloseable {
return new Tag(key, value);
}
-
@Override
public byte[] getEmptyValue() {
- return new byte[] {0};
+ return new byte[] { 0 };
}
}
@@ -217,10 +290,10 @@ public class QueryCompletionIndex implements AutoCloseable {
final long compressedString = VariableByteEncoder.decodeFirstValue(bytes);
return Tags.STRING_COMPRESSOR.get((int) compressedString);
}
-
+
@Override
public byte[] getEmptyValue() {
- return new byte[] {0};
+ return new byte[] { 0 };
}
}
@@ -263,14 +336,33 @@ public class QueryCompletionIndex implements AutoCloseable {
tagToTagIndex.close();
}
- public SortedSet find(final DateTimeRange dateRange, final String property, final String value,
- final String field) {
- final Tag tag = new Tag(property, value);
- Preconditions.checkGreaterOrEqual(tag.getKey(), 0, "The property '{0}' is unkown", property);
- Preconditions.checkGreaterOrEqual(tag.getValue(), 0, "The value '{0}' is unkown", value);
- return find(dateRange, tag, field);
+ /**
+ * Find values that yield results when executing the query "fieldA=valueA
+ * and fieldB=???"
+ *
+ * @param dateRange the date range
+ * @param fieldA the other field of the and expression
+ * @param valueA the value of the other field
+ * @param fieldB the field we are searching values for
+ * @return values of fieldB
+ */
+ public SortedSet find(final DateTimeRange dateRange, final String fieldA, final String valueA,
+ final String fieldB) {
+ final Tag tag = new Tag(fieldA, valueA);
+ Preconditions.checkGreaterOrEqual(tag.getKey(), 0, "The field ''{0}'' is unkown", fieldA);
+ Preconditions.checkGreaterOrEqual(tag.getValue(), 0, "The value ''{0}'' is unkown", valueA);
+ return find(dateRange, tag, fieldB);
}
+ /**
+ * Find values that yield results when executing the query
+ * "tag.field=tag.value and field=???"
+ *
+ * @param dateRange the date range
+ * @param tag the other tag
+ * @param field the field we are searching values for
+ * @return values for the field
+ */
public SortedSet find(final DateTimeRange dateRange, final Tag tag, final String field) {
final SortedSet result = new TreeSet<>();
@@ -287,6 +379,13 @@ public class QueryCompletionIndex implements AutoCloseable {
return result;
}
+ /**
+ * Find all values for the given field.
+ *
+ * @param dateRange the date range
+ * @param field the field
+ * @return the values
+ */
public SortedSet findAllValuesForField(final DateTimeRange dateRange, final String field) {
final SortedSet result = new TreeSet<>();
@@ -301,6 +400,16 @@ public class QueryCompletionIndex implements AutoCloseable {
return result;
}
+ /**
+ * Find values for {@code field} that will yield results for the query
+ * "tag.field=tag.value and not field=???".
+ *
+ *
+ * @param dateRange the date range
+ * @param tag the other tag
+ * @param field the field we are searching values for
+ * @return the values
+ */
public SortedSet findAllValuesNotForField(final DateTimeRange dateRange, final Tag tag,
final String field) {
final SortedSet result = new TreeSet<>();
diff --git a/data-store/src/main/java/org/lucares/pdb/datastore/lang/Expression.java b/data-store/src/main/java/org/lucares/pdb/datastore/lang/Expression.java
index 433024a..aa320f4 100644
--- a/data-store/src/main/java/org/lucares/pdb/datastore/lang/Expression.java
+++ b/data-store/src/main/java/org/lucares/pdb/datastore/lang/Expression.java
@@ -348,11 +348,11 @@ abstract public class Expression {
}
static class Property extends Expression {
- final String property;
+ final String field;
final Terminal value;
- public Property(final String property, final Terminal value) {
- this.property = property;
+ public Property(final String field, final Terminal value) {
+ this.field = field;
this.value = value;
}
@@ -364,7 +364,7 @@ abstract public class Expression {
@Override
public String toString() {
- return property + " = " + value.getValue();
+ return field + " = " + value.getValue();
}
@Override
@@ -372,8 +372,8 @@ abstract public class Expression {
return value.containsCaret();
}
- public String getProperty() {
- return property;
+ public String getField() {
+ return field;
}
public Terminal getValue() {
@@ -388,7 +388,7 @@ abstract public class Expression {
public int hashCode() {
final int prime = 31;
int result = 1;
- result = prime * result + ((property == null) ? 0 : property.hashCode());
+ result = prime * result + ((field == null) ? 0 : field.hashCode());
result = prime * result + ((value == null) ? 0 : value.hashCode());
return result;
}
@@ -402,10 +402,10 @@ abstract public class Expression {
if (getClass() != obj.getClass())
return false;
final Property other = (Property) obj;
- if (property == null) {
- if (other.property != null)
+ if (field == null) {
+ if (other.field != null)
return false;
- } else if (!property.equals(other.property))
+ } else if (!field.equals(other.field))
return false;
if (value == null) {
if (other.value != null)
@@ -508,21 +508,21 @@ abstract public class Expression {
}
static class InExpression extends Expression {
- private final String property;
+ private final String field;
private final List values;
- public InExpression(final String property, final String value) {
- this(property, Arrays.asList(value));
+ public InExpression(final String field, final String value) {
+ this(field, Arrays.asList(value));
}
- public InExpression(final String property, final List values) {
- this.property = property;
+ public InExpression(final String field, final List values) {
+ this.field = field;
this.values = values;
}
@Override
public String toString() {
- return property + " in (" + String.join(", ", values) + ")";
+ return field + " in (" + String.join(", ", values) + ")";
}
@Override
@@ -531,7 +531,7 @@ abstract public class Expression {
}
public String getProperty() {
- return property;
+ return field;
}
public List getValues() {
@@ -552,7 +552,7 @@ abstract public class Expression {
public int hashCode() {
final int prime = 31;
int result = 1;
- result = prime * result + ((property == null) ? 0 : property.hashCode());
+ result = prime * result + ((field == null) ? 0 : field.hashCode());
result = prime * result + ((values == null) ? 0 : values.hashCode());
return result;
}
@@ -566,10 +566,10 @@ abstract public class Expression {
if (getClass() != obj.getClass())
return false;
final InExpression other = (InExpression) obj;
- if (property == null) {
- if (other.property != null)
+ if (field == null) {
+ if (other.field != null)
return false;
- } else if (!property.equals(other.property))
+ } else if (!field.equals(other.field))
return false;
if (values == null) {
if (other.values != null)
diff --git a/data-store/src/main/java/org/lucares/pdb/datastore/lang/FindValuesForQueryCompletion.java b/data-store/src/main/java/org/lucares/pdb/datastore/lang/FindValuesForQueryCompletion.java
index be45a12..dc9bc3c 100644
--- a/data-store/src/main/java/org/lucares/pdb/datastore/lang/FindValuesForQueryCompletion.java
+++ b/data-store/src/main/java/org/lucares/pdb/datastore/lang/FindValuesForQueryCompletion.java
@@ -44,7 +44,7 @@ public class FindValuesForQueryCompletion extends ExpressionVisitor result = new TreeSet<>();
- final String fieldA = property.getProperty();
+ final String fieldA = property.getField();
final String valueA = property.getValue().getValue();
final boolean hasField = index.hasField(dateTimeRange, fieldA);
@@ -133,7 +133,7 @@ public class FindValuesForQueryCompletion extends ExpressionVisitor valuesNotForField = index.findAllValuesNotForField(dateTimeRange, tag, field);
final SortedSet valuesForField = index.find(dateTimeRange, tag, field);
@@ -162,7 +162,7 @@ public class FindValuesForQueryCompletion extends ExpressionVisitor visit(final Property property) {
final long start = System.nanoTime();
- final String field = property.getProperty();
+ final String field = property.getField();
final String value = property.getValue().getValue();
final SortedSet allValuesForField = queryCompletionIndex.findAllValuesForField(dateRange, field);
@@ -179,7 +179,7 @@ public class FindValuesForQueryCompletion extends ExpressionVisitor allValuesForField = queryCompletionIndex.findAllValuesForField(dateRange,
- caretExpression.getProperty());
+ caretExpression.getField());
final SortedSet valuesForFieldMatchingCaretExpression = GloblikePattern.filterValues(allValuesForField,
valuePattern, TreeSet::new);
@@ -232,7 +232,7 @@ public class FindValuesForQueryCompletion extends ExpressionVisitor allValuesForField = queryCompletionIndex.findAllValuesForField(dateRange, field);
final String valueWithCaretMarker = ((Property) innerExpression).getValue().getValue();
final String valuePrefix = valueWithCaretMarker.substring(0,
diff --git a/data-store/src/main/java/org/lucares/pdb/datastore/lang/QueryCompletionExpressionOptimizer.java b/data-store/src/main/java/org/lucares/pdb/datastore/lang/QueryCompletionExpressionOptimizer.java
index 159b134..1effe15 100644
--- a/data-store/src/main/java/org/lucares/pdb/datastore/lang/QueryCompletionExpressionOptimizer.java
+++ b/data-store/src/main/java/org/lucares/pdb/datastore/lang/QueryCompletionExpressionOptimizer.java
@@ -18,7 +18,7 @@ import org.slf4j.LoggerFactory;
/**
* Query completion utilizes an index that contains all mappings of
* tags+fieldname to values. This index can be used to answer the question what
- * the possible values for fields in simple and queries are.
+ * the possible values for fields in simple 'and' queries are.
*
* E.g. Given the query "lastname=Doe and firstname=|" ('|' is the marker for
* the caret position). All possible values for firstname are in the index under
diff --git a/pdb-api/src/main/java/org/lucares/pdb/api/Tag.java b/pdb-api/src/main/java/org/lucares/pdb/api/Tag.java
index 4a6bb4e..48bbcc8 100644
--- a/pdb-api/src/main/java/org/lucares/pdb/api/Tag.java
+++ b/pdb-api/src/main/java/org/lucares/pdb/api/Tag.java
@@ -1,25 +1,43 @@
package org.lucares.pdb.api;
+/**
+ * A {@link Tag} consists of a field and a value. In a query this is written as
+ * field=value, e.g., name=Sam where 'name' is the field and
+ * 'Sam' is the value.
+ */
public class Tag implements Comparable {
- private final int key;
+ private final int field;
private final int value;
- public Tag(final int key, final int value) {
- this.key = key;
+ /**
+ * Create a new tag with field and value specified as int. See
+ * {@link Tags#STRING_COMPRESSOR} for the mapping between Strings and ints.
+ *
+ * @param field the field as int
+ * @param value the value as int
+ */
+ public Tag(final int field, final int value) {
+ this.field = field;
this.value = value;
}
- public Tag(final String key, final String value) {
- this.key = key != null ? Tags.STRING_COMPRESSOR.getIfPresent(key) : -1;
+ /**
+ * Create a new {@link Tag} for the given field and value.
+ *
+ * @param field the field
+ * @param value the value
+ */
+ public Tag(final String field, final String value) {
+ this.field = field != null ? Tags.STRING_COMPRESSOR.getIfPresent(field) : -1;
this.value = value != null ? Tags.STRING_COMPRESSOR.getIfPresent(value) : -1;
}
@Override
public int compareTo(final Tag o) {
- if (key != o.key) {
- return key - o.key;
+ if (field != o.field) {
+ return field - o.field;
} else if (value != o.value) {
return value - o.value;
}
@@ -28,11 +46,11 @@ public class Tag implements Comparable {
}
public int getKey() {
- return key;
+ return field;
}
public String getKeyAsString() {
- return Tags.STRING_COMPRESSOR.get(key);
+ return Tags.STRING_COMPRESSOR.get(field);
}
public int getValue() {
@@ -45,14 +63,14 @@ public class Tag implements Comparable {
@Override
public String toString() {
- return Tags.STRING_COMPRESSOR.get(key) + "=" + Tags.STRING_COMPRESSOR.get(value);
+ return Tags.STRING_COMPRESSOR.get(field) + "=" + Tags.STRING_COMPRESSOR.get(value);
}
@Override
public int hashCode() {
final int prime = 31;
int result = 1;
- result = prime * result + key;
+ result = prime * result + field;
result = prime * result + value;
return result;
}
@@ -66,7 +84,7 @@ public class Tag implements Comparable {
if (getClass() != obj.getClass())
return false;
final Tag other = (Tag) obj;
- if (key != other.key)
+ if (field != other.field)
return false;
if (value != other.value)
return false;