the union of many small lists is expensive
The reason seems to be the number of memory allocations. In order to create the union of 100 lists we have 99 memory allocations. The first needs the space for the first two lists, the second the space for the first three lists, and so on. We can reduce the number of allocations drastically (in many cases to one) by leveraging the fact that many of the lists were already sorted, non-overlapping and increasing, so that we can simply concatenate them.
This commit is contained in:
@@ -5,6 +5,7 @@ import java.util.ArrayList;
|
||||
import java.util.Collection;
|
||||
import java.util.List;
|
||||
import java.util.regex.Pattern;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import org.lucares.collections.LongList;
|
||||
import org.lucares.pdb.api.RuntimeIOException;
|
||||
@@ -17,6 +18,7 @@ import org.lucares.pdb.datastore.lang.Expression.Or;
|
||||
import org.lucares.pdb.datastore.lang.Expression.Parentheses;
|
||||
import org.lucares.pdb.diskstorage.DiskStorage;
|
||||
import org.lucares.pdb.map.PersistentMap;
|
||||
import org.lucares.utils.Preconditions;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
@@ -41,7 +43,8 @@ public class ExpressionToDocIdVisitor extends ExpressionVisitor<LongList> {
|
||||
|
||||
final long start = System.nanoTime();
|
||||
final LongList result = LongList.intersection(leftFiles, rightFiles);
|
||||
LOGGER.trace("{} took {} ms results={}", expression, (System.nanoTime() - start) / 1_000_000.0, result.size());
|
||||
LOGGER.trace("and: {} took {} ms results={}", expression, (System.nanoTime() - start) / 1_000_000.0,
|
||||
result.size());
|
||||
assert result.isSorted();
|
||||
|
||||
return result;
|
||||
@@ -56,7 +59,8 @@ public class ExpressionToDocIdVisitor extends ExpressionVisitor<LongList> {
|
||||
final LongList rightFiles = right.visit(this);
|
||||
final long start = System.nanoTime();
|
||||
final LongList result = LongList.union(leftFiles, rightFiles);
|
||||
LOGGER.trace("{} took {} ms results={}", expression, (System.nanoTime() - start) / 1_000_000.0, result.size());
|
||||
LOGGER.trace("or: {} took {} ms results={}", expression, (System.nanoTime() - start) / 1_000_000.0,
|
||||
result.size());
|
||||
assert result.isSorted();
|
||||
|
||||
return result;
|
||||
@@ -72,7 +76,8 @@ public class ExpressionToDocIdVisitor extends ExpressionVisitor<LongList> {
|
||||
final LongList result = getAllDocIds().clone();
|
||||
result.removeAll(docIdsToBeNegated);
|
||||
|
||||
LOGGER.trace("{} took {} ms results={}", expression, (System.nanoTime() - start) / 1_000_000.0, result.size());
|
||||
LOGGER.trace("not: {} took {} ms results={}", expression, (System.nanoTime() - start) / 1_000_000.0,
|
||||
result.size());
|
||||
|
||||
return result;
|
||||
}
|
||||
@@ -88,7 +93,8 @@ public class ExpressionToDocIdVisitor extends ExpressionVisitor<LongList> {
|
||||
public LongList visit(final Expression.MatchAll expression) {
|
||||
final long start = System.nanoTime();
|
||||
final LongList result = getAllDocIds();
|
||||
LOGGER.trace("{} took {} ms results={}", expression, (System.nanoTime() - start) / 1_000_000.0, result.size());
|
||||
LOGGER.trace("matchAll: {} took {} ms results={}", expression, (System.nanoTime() - start) / 1_000_000.0,
|
||||
result.size());
|
||||
return result;
|
||||
}
|
||||
|
||||
@@ -108,7 +114,8 @@ public class ExpressionToDocIdVisitor extends ExpressionVisitor<LongList> {
|
||||
result = LongList.union(result, mergedDocIds);
|
||||
}
|
||||
|
||||
LOGGER.trace("{} took {} ms results={}", expression, (System.nanoTime() - start) / 1_000_000.0, result.size());
|
||||
LOGGER.trace("in: {} took {} ms results={}", expression, (System.nanoTime() - start) / 1_000_000.0,
|
||||
result.size());
|
||||
return result;
|
||||
}
|
||||
|
||||
@@ -125,12 +132,25 @@ public class ExpressionToDocIdVisitor extends ExpressionVisitor<LongList> {
|
||||
private List<LongList> filterByWildcard(final String propertyName, final Pattern valuePattern) {
|
||||
final List<LongList> result = new ArrayList<>();
|
||||
try {
|
||||
|
||||
final long start = System.nanoTime();
|
||||
keyToValueToDocId.visitValues(new Tag(propertyName, ""), (tags, blockOffsetToDocIds) -> {
|
||||
try {
|
||||
if (valuePattern.matcher(tags.getValueAsString()).matches()) {
|
||||
try (final BSFile bsFile = BSFile.existingFile(blockOffsetToDocIds, diskStorage)) {
|
||||
bsFile.streamOfLongLists().forEach(result::add);
|
||||
|
||||
// We know that all LongLists coming from a BSFile are sorted, non-overlapping
|
||||
// and increasing, that means we can just concatenate them and get a sorted
|
||||
// list.
|
||||
final List<LongList> longLists = bsFile.streamOfLongLists().collect(Collectors.toList());
|
||||
final LongList concatenatedLists = concatenateLists(longLists);
|
||||
|
||||
Preconditions.checkTrue(concatenatedLists.isSorted(),
|
||||
"The LongLists containing document ids must be sorted, "
|
||||
+ "non-overlapping and increasing, so that the concatenation "
|
||||
+ "is sorted. This is guaranteed by the fact that document ids "
|
||||
+ "are generated in monotonically increasing order.");
|
||||
|
||||
result.add(concatenatedLists);
|
||||
}
|
||||
}
|
||||
} catch (final IOException e) {
|
||||
@@ -138,6 +158,9 @@ public class ExpressionToDocIdVisitor extends ExpressionVisitor<LongList> {
|
||||
}
|
||||
});
|
||||
|
||||
LOGGER.trace("filterByWildcard: for key {} took {}ms", propertyName,
|
||||
(System.nanoTime() - start) / 1_000_000.0);
|
||||
|
||||
return result;
|
||||
} catch (final IOException e) {
|
||||
throw new RuntimeIOException(e);
|
||||
@@ -155,4 +178,17 @@ public class ExpressionToDocIdVisitor extends ExpressionVisitor<LongList> {
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
private static LongList concatenateLists(final Collection<LongList> lists) {
|
||||
|
||||
final int totalSize = lists.stream().mapToInt(LongList::size).sum();
|
||||
final LongList result = new LongList(totalSize);
|
||||
|
||||
for (final LongList list : lists) {
|
||||
result.addAll(list);
|
||||
}
|
||||
|
||||
return result;
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user