the union of many small lists is expensive

The reason seems to be the number of memory allocations. Creating
the union of 100 lists pairwise takes 99 memory allocations: the
first allocates space for the first two lists, the second allocates
space for the first three lists, and so on.

We can reduce the number of allocations drastically (in many
cases to one) by leveraging the fact that many of the lists
are already sorted, non-overlapping and increasing, so that
we can simply concatenate them.
This commit is contained in:
2019-01-05 08:52:56 +01:00
parent 3dca7483de
commit 5197063ae3
2 changed files with 59 additions and 7 deletions

View File

@@ -5,6 +5,7 @@ import java.util.ArrayList;
import java.util.Collection;
import java.util.List;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
import org.lucares.collections.LongList;
import org.lucares.pdb.api.RuntimeIOException;
@@ -17,6 +18,7 @@ import org.lucares.pdb.datastore.lang.Expression.Or;
import org.lucares.pdb.datastore.lang.Expression.Parentheses;
import org.lucares.pdb.diskstorage.DiskStorage;
import org.lucares.pdb.map.PersistentMap;
import org.lucares.utils.Preconditions;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -41,7 +43,8 @@ public class ExpressionToDocIdVisitor extends ExpressionVisitor<LongList> {
final long start = System.nanoTime();
final LongList result = LongList.intersection(leftFiles, rightFiles);
LOGGER.trace("{} took {} ms results={}", expression, (System.nanoTime() - start) / 1_000_000.0, result.size());
LOGGER.trace("and: {} took {} ms results={}", expression, (System.nanoTime() - start) / 1_000_000.0,
result.size());
assert result.isSorted();
return result;
@@ -56,7 +59,8 @@ public class ExpressionToDocIdVisitor extends ExpressionVisitor<LongList> {
final LongList rightFiles = right.visit(this);
final long start = System.nanoTime();
final LongList result = LongList.union(leftFiles, rightFiles);
LOGGER.trace("{} took {} ms results={}", expression, (System.nanoTime() - start) / 1_000_000.0, result.size());
LOGGER.trace("or: {} took {} ms results={}", expression, (System.nanoTime() - start) / 1_000_000.0,
result.size());
assert result.isSorted();
return result;
@@ -72,7 +76,8 @@ public class ExpressionToDocIdVisitor extends ExpressionVisitor<LongList> {
final LongList result = getAllDocIds().clone();
result.removeAll(docIdsToBeNegated);
LOGGER.trace("{} took {} ms results={}", expression, (System.nanoTime() - start) / 1_000_000.0, result.size());
LOGGER.trace("not: {} took {} ms results={}", expression, (System.nanoTime() - start) / 1_000_000.0,
result.size());
return result;
}
@@ -88,7 +93,8 @@ public class ExpressionToDocIdVisitor extends ExpressionVisitor<LongList> {
public LongList visit(final Expression.MatchAll expression) {
final long start = System.nanoTime();
final LongList result = getAllDocIds();
LOGGER.trace("{} took {} ms results={}", expression, (System.nanoTime() - start) / 1_000_000.0, result.size());
LOGGER.trace("matchAll: {} took {} ms results={}", expression, (System.nanoTime() - start) / 1_000_000.0,
result.size());
return result;
}
@@ -108,7 +114,8 @@ public class ExpressionToDocIdVisitor extends ExpressionVisitor<LongList> {
result = LongList.union(result, mergedDocIds);
}
LOGGER.trace("{} took {} ms results={}", expression, (System.nanoTime() - start) / 1_000_000.0, result.size());
LOGGER.trace("in: {} took {} ms results={}", expression, (System.nanoTime() - start) / 1_000_000.0,
result.size());
return result;
}
@@ -125,12 +132,25 @@ public class ExpressionToDocIdVisitor extends ExpressionVisitor<LongList> {
private List<LongList> filterByWildcard(final String propertyName, final Pattern valuePattern) {
final List<LongList> result = new ArrayList<>();
try {
final long start = System.nanoTime();
keyToValueToDocId.visitValues(new Tag(propertyName, ""), (tags, blockOffsetToDocIds) -> {
try {
if (valuePattern.matcher(tags.getValueAsString()).matches()) {
try (final BSFile bsFile = BSFile.existingFile(blockOffsetToDocIds, diskStorage)) {
bsFile.streamOfLongLists().forEach(result::add);
// All LongLists coming from a BSFile are sorted, non-overlapping and
// increasing. Therefore we can simply concatenate them and the result
// is still a sorted list.
final List<LongList> longLists = bsFile.streamOfLongLists().collect(Collectors.toList());
final LongList concatenatedLists = concatenateLists(longLists);
Preconditions.checkTrue(concatenatedLists.isSorted(),
"The LongLists containing document ids must be sorted, "
+ "non-overlapping and increasing, so that the concatenation "
+ "is sorted. This is guaranteed by the fact that document ids "
+ "are generated in monotonically increasing order.");
result.add(concatenatedLists);
}
}
} catch (final IOException e) {
@@ -138,6 +158,9 @@ public class ExpressionToDocIdVisitor extends ExpressionVisitor<LongList> {
}
});
LOGGER.trace("filterByWildcard: for key {} took {}ms", propertyName,
(System.nanoTime() - start) / 1_000_000.0);
return result;
} catch (final IOException e) {
throw new RuntimeIOException(e);
@@ -155,4 +178,17 @@ public class ExpressionToDocIdVisitor extends ExpressionVisitor<LongList> {
return result;
}
/**
 * Appends the contents of all given lists, in the collection's iteration
 * order, to a single newly created {@link LongList}.
 * <p>
 * The sizes of all lists are summed up front and passed to the
 * {@code LongList} constructor (presumably as the initial capacity, so that
 * the result does not have to grow while the elements are added).
 * <p>
 * No sorting or de-duplication is done here; the result is only sorted if
 * the input lists are sorted, non-overlapping and given in increasing order.
 *
 * @param lists the lists to concatenate
 * @return a new list holding the elements of every list in {@code lists}
 */
private static LongList concatenateLists(final Collection<LongList> lists) {
    // Determine the combined number of elements before creating the result.
    int combinedSize = 0;
    for (final LongList part : lists) {
        combinedSize += part.size();
    }

    final LongList concatenated = new LongList(combinedSize);
    for (final LongList part : lists) {
        concatenated.addAll(part);
    }
    return concatenated;
}
}

View File

@@ -30,6 +30,8 @@ public class Preconditions {
* @param b
* @param message formatted with {@link MessageFormat}
* @param args
* @throws IllegalStateException if {@code a} is not greater or equal to
* {@code b}
*/
public static void checkGreaterOrEqual(final long a, final long b, final String message, final Object... args) {
if (a < b) {
@@ -49,6 +51,8 @@ public class Preconditions {
* @param expected the expected value
* @param message formatted with {@link MessageFormat}
* @param args arguments for the message
* @throws IllegalStateException if {@code actual} is not equal to
* {@code expected}
*/
public static void checkEqual(final Object actual, final Object expected, final String message,
final Object... args) {
@@ -58,6 +62,18 @@ public class Preconditions {
}
}
/**
 * Verifies that a condition holds.
 * <p>
 * This is a convenience wrapper that delegates to
 * {@link #checkEqual(Object, Object, String, Object...)} with
 * {@link Boolean#TRUE} as the expected value.
 *
 * @param actual  the condition; must be {@code true}
 * @param message formatted with {@link MessageFormat}
 * @param args    arguments for the message
 * @throws IllegalStateException if {@code actual} is not true
 */
public static void checkTrue(final boolean actual, final String message, final Object... args) {
    // Box explicitly; checkEqual compares two objects.
    final Boolean boxedActual = Boolean.valueOf(actual);
    checkEqual(boxedActual, Boolean.TRUE, message, args);
}
public static void checkNull(final Object actual, final String message, final Object... args) {
if (actual != null) {
throw new IllegalStateException(MessageFormat.format(message, args));