multiway merge of multiple sorted lists

This commit is contained in:
2020-11-06 19:22:45 +01:00
parent 5b5e948293
commit 8a5309fbe8
4 changed files with 448 additions and 13 deletions

View File

@@ -1,8 +1,10 @@
package org.lucares.collections;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Iterator;
import java.util.List;
import java.util.Random;
import java.util.Spliterator.OfLong;
@@ -41,7 +43,7 @@ public final class LongList implements Serializable, Cloneable {
* Keeps track of whether or not the list is sorted. This allows us to use
* binary search for {@link #indexOf(int)} and efficient algorithms for
* {@link #intersection(LongList, LongList)} /
* {@link #union(LongList, LongList)} / {@link #uniq()} /
* {@link #unionInternal(LongList, LongList)} / {@link #uniq()} /
* {@link #removeAll(int, int)}. An empty list is sorted.
*/
private boolean sorted = true;
@@ -121,7 +123,7 @@ public final class LongList implements Serializable, Cloneable {
*
* @param startInclusive the lower bound (inclusive)
* @param endInclusive the upper bound (inclusive)
* @return the {@link IntList}
* @return the {@link LongList}
*/
public static LongList rangeClosed(final long startInclusive, final long endInclusive) {
if (startInclusive > endInclusive) {
@@ -137,6 +139,32 @@ public final class LongList implements Serializable, Cloneable {
}
}
/**
 * Creates a new list holding the tail of this list, i.e. every value from
 * {@code startInclusive} up to the current end of the list.
 *
 * @param startInclusive index of the first value to include
 * @return a new {@code LongList} with the values from {@code startInclusive}
 *         to the end
 */
public LongList sublist(final int startInclusive) {
    // the tail ends where the list ends
    final int endExclusive = size;
    return sublist(startInclusive, endExclusive);
}
/**
 * Returns a new list with the values of the given range.
 * <p>
 * The returned list is an independent copy; modifying it does not modify this
 * list.
 *
 * @param startInclusive the start index
 * @param endExclusive   the end index (exclusive)
 * @return {@link LongList}
 * @throws IndexOutOfBoundsException if {@code startInclusive} is negative,
 *                                   {@code endExclusive} is larger than the
 *                                   size of this list, or
 *                                   {@code startInclusive > endExclusive}
 */
public LongList sublist(final int startInclusive, final int endExclusive) {
    // Validate against size, not against data.length: Arrays.copyOfRange only
    // checks the array length and zero-pads beyond it, so without this check an
    // out-of-range end index could return values that are not part of the list.
    if (startInclusive < 0 || endExclusive > size || startInclusive > endExclusive) {
        throw new IndexOutOfBoundsException("startInclusive=" + startInclusive + ", endExclusive="
                + endExclusive + ", size=" + size);
    }
    final int length = endExclusive - startInclusive;
    final LongList result = new LongList(length);
    result.data = Arrays.copyOfRange(data, startInclusive, endExclusive);
    result.size = length;
    // a range of a sorted list is sorted; so is any list with fewer than two values
    result.sorted = sorted || length < 2;
    return result;
}
/**
* Returns {@code true} if this list contains no elements.
*
@@ -1114,27 +1142,32 @@ public final class LongList implements Serializable, Cloneable {
}
/**
* Returns a list with all elements that are in list {@code a} or {@code b}
* (logical or).
* Returns a list with all elements that are in list {@code a} or {@code b} or
* ... or {@code n} (logical or).
* <p>
* The result does not contain duplicate values.
* <p>
* If both lists were sorted, then the output list will also be sorted. If at
* If all lists were sorted, then the output list will also be sorted. If at
* least one list is unsorted, then the order is undefined.
* <p>
* If both lists are sorted, then the time complexity is O(n+m), where n is the
* length of the first list and m the length of the second list. If at least one
* list is not sorted, then the time complexity is O(m*log(m)), where m is the
* length of the longer list.
* TODO check time complexity If all lists are sorted, then the time complexity
* is O(n+m), where n is the length of the first list and m the length of the
* second list. If at least one list is not sorted, then the time complexity is
* O(m*log(m)), where m is the length of the longer list.
*
* @param a the first list
* @param b the second list
* @param longLists the lists
* @return the union of all given lists
*/
public static LongList union(final LongList a, final LongList b) {
public static LongList union(final LongList... longLists) {
    // wrap the varargs array into a list and delegate to the collection overload
    final List<LongList> lists = List.of(longLists);
    return union(lists);
}
private static LongList unionInternal(final LongList a, final LongList b) {
final LongList result;
if (a.isSorted() && b.isSorted()) {
if (a.isEmpty() && b.isEmpty()) {
result = new LongList();
} else if (a.isSorted() && b.isSorted()) {
result = unionSorted(a, b);
} else {
result = unionUnsorted(a, b);
@@ -1142,6 +1175,35 @@ public final class LongList implements Serializable, Cloneable {
return result;
}
/**
 * Computes the union of all given lists.
 * <p>
 * No list yields an empty list, a single list yields a clone of that list, two
 * lists are handled by the two-list union. For three or more lists the sorted
 * lists are merged in one multiway merge; the unsorted lists (if any) are
 * sorted and merged separately and then combined with the first result.
 *
 * @param longLists the lists
 * @return the union of all given lists
 */
public static LongList union(final Collection<LongList> longLists) {
    final int count = longLists.size();

    if (count == 0) {
        return new LongList();
    }
    if (count == 1) {
        return longLists.iterator().next().clone();
    }
    if (count == 2) {
        final Iterator<LongList> lists = longLists.iterator();
        final LongList first = lists.next();
        final LongList second = lists.next();
        return unionInternal(first, second);
    }

    // three or more lists: separate the lists that can be merged directly
    // from those that have to be sorted first
    final Collection<LongList> alreadySorted = subsetOfSortedLists(longLists);
    final Collection<LongList> needSorting = subsetOfUnsortedLists(longLists);

    final LongList mergedSorted = MultiwayLongMerger.unionSorted(alreadySorted);
    if (needSorting.isEmpty()) {
        return mergedSorted;
    }
    return unionInternal(mergedSorted, unionUnsorted(needSorting));
}
private static LongList unionSorted(final LongList a, final LongList b) {
final int aSize = a.size();
@@ -1211,6 +1273,17 @@ public final class LongList implements Serializable, Cloneable {
return unionSorted(aSorted, bSorted);
}
/**
 * Computes the union of lists that are not (all) sorted. Every list is copied,
 * the copy is sorted, and the sorted copies are merged by
 * {@link MultiwayLongMerger#unionSorted(Collection)}. The given lists are not
 * modified.
 *
 * @param longLists the lists
 * @return the union of the given lists
 */
private static LongList unionUnsorted(Collection<LongList> longLists) {
    final List<LongList> sortedCopies = new ArrayList<>();
    for (final LongList original : longLists) {
        // sort a copy so that the caller's list keeps its order
        final LongList sortedCopy = new LongList(original);
        sortedCopy.sort();
        sortedCopies.add(sortedCopy);
    }
    final LongList merged = MultiwayLongMerger.unionSorted(sortedCopies);
    return merged;
}
private void checkIfSorted() {
sorted = true;
for (int i = 1; i < size && sorted; i++) {
@@ -1218,4 +1291,30 @@ public final class LongList implements Serializable, Cloneable {
}
}
/**
 * Returns a new collection with those of the given lists for which
 * {@link #isSorted()} is {@code true}, in iteration order.
 *
 * @param longLists the lists to filter
 * @return the sorted lists
 */
private static Collection<LongList> subsetOfSortedLists(Collection<LongList> longLists) {
    final List<LongList> sortedOnly = new ArrayList<>(longLists);
    // drop everything that is not sorted; the given collection is left untouched
    sortedOnly.removeIf(candidate -> !candidate.isSorted());
    return sortedOnly;
}
/**
 * Returns a new collection with those of the given lists for which
 * {@link #isSorted()} is {@code false}, in iteration order.
 *
 * @param longLists the lists to filter
 * @return the unsorted lists
 */
private static Collection<LongList> subsetOfUnsortedLists(Collection<LongList> longLists) {
    final List<LongList> unsortedOnly = new ArrayList<>(longLists);
    // drop everything that is sorted; the given collection is left untouched
    unsortedOnly.removeIf(candidate -> candidate.isSorted());
    return unsortedOnly;
}
}

View File

@@ -0,0 +1,252 @@
package org.lucares.collections;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.List;
/**
 * Merges any number of sorted {@link LongList}s into one sorted list without
 * duplicates (multiway merge).
 * <p>
 * The merge uses a binary selection tree whose leaves are fed from the input
 * lists. Removing the smallest value refills one path from the root down to a
 * leaf.
 */
class MultiwayLongMerger {

    /**
     * Marks an empty node in the selection tree. Because {@link Long#MIN_VALUE}
     * is used as the marker, occurrences of that value are removed from the
     * input before merging and re-added to the result separately.
     */
    private static final long UNSET = Long.MIN_VALUE;

    /**
     * Read-only cursor over a {@link LongList}. Consuming values only advances
     * an offset; the wrapped list is not modified.
     */
    private static class LongQueue {
        final LongList wrapped;
        int offset = 0;

        public LongQueue(LongList wrapped) {
            this.wrapped = wrapped;
        }

        boolean isEmpty() {
            return offset >= wrapped.size();
        }

        /** Returns the next value and advances the cursor. */
        long pop() {
            assert offset < wrapped.size();
            final long result = wrapped.get(offset);
            offset++;
            return result;
        }

        /** Returns the next value without advancing the cursor. */
        public long peek() {
            return wrapped.get(offset);
        }

        @Override
        public String toString() {
            return wrapped.sublist(offset).toString();
        }

        /** Returns the number of values that have not been consumed yet. */
        public int size() {
            return wrapped.size() - offset;
        }
    }

    /**
     * Returns the union of the given sorted lists. The result is sorted and
     * contains every value only once.
     *
     * @param longLists the lists to merge; all of them must be sorted
     * @return a new list with the union of all values
     * @throws IllegalArgumentException if at least one list is not sorted
     */
    static LongList unionSorted(Collection<LongList> longLists) {
        assertAllListsAreSorted(longLists);

        final List<LongQueue> queues = new ArrayList<>();
        final boolean hasValueUNSET = initQueues(longLists, queues);

        final LongList result = new LongList();
        if (hasValueUNSET) {
            // Long.MIN_VALUE was stripped from the queues; it is the smallest
            // possible value and therefore goes first
            result.add(UNSET);
        }
        if (!queues.isEmpty()) {
            mergeQueues(queues, result);
        }
        return result;
    }

    /**
     * Drains the selection tree built from {@code queues} into {@code result},
     * skipping values equal to the previously added one.
     */
    private static void mergeQueues(final List<LongQueue> queues, final LongList result) {
        final MinValuePriorityQueue selectionTree = new MinValuePriorityQueue(queues);

        // The queues no longer contain UNSET (see initQueues), so UNSET can serve
        // both as "nothing added yet" and as the end-of-data signal of pop().
        long previousValue = UNSET;
        long val;
        while ((val = selectionTree.pop()) != UNSET) {
            if (val != previousValue) {
                result.add(val);
                previousValue = val;
            }
        }
    }

    /**
     * Wraps every non-empty list into a {@link LongQueue}, skips leading
     * {@link #UNSET} values and adds the queues that still have values to
     * {@code queues}.
     *
     * @return {@code true} if at least one list contained {@link #UNSET}
     */
    private static boolean initQueues(Collection<LongList> longLists, final List<LongQueue> queues) {
        boolean hasValueUNSET = false;
        for (final LongList longList : longLists) {
            if (!longList.isEmpty()) {
                final LongQueue queue = new LongQueue(longList);
                // the lists are sorted, so Long.MIN_VALUE can only be at the front
                while (!queue.isEmpty() && queue.peek() == UNSET) {
                    queue.pop();
                    hasValueUNSET = true;
                }
                if (!queue.isEmpty()) {
                    queues.add(queue);
                }
            }
        }
        return hasValueUNSET;
    }

    private static void assertAllListsAreSorted(Collection<LongList> longLists) {
        for (final LongList longList : longLists) {
            if (!longList.isSorted()) {
                throw new IllegalArgumentException("lists must be sorted");
            }
        }
    }

    /**
     * Returns the smallest power of two that is greater than or equal to
     * {@code i}, for {@code i >= 1}.
     */
    private static int nextPowOfTwo(int i) {
        // Integer.highestOneBit(0) is 0, so the formula below would yield 0 for i == 1
        if (i <= 1) {
            return 1;
        }
        return Integer.highestOneBit(i - 1) << 1;
    }

    /**
     * Selection tree over a fixed set of queues. Every leaf is fed by one queue;
     * every inner node holds the smaller value of its two children. A node with
     * the value {@link MultiwayLongMerger#UNSET} is empty.
     */
    private static class MinValuePriorityQueue {
        private final List<LongQueue> longQueues;

        /*
         * A complete binary tree where the nodes are laid out in breadth first
         * order. First the root, then the nodes of level 1, then the nodes of
         * level 2, ...
         */
        private final long[] heap;
        private final int size;
        private final int firstLeafIndex;

        public MinValuePriorityQueue(final Collection<LongQueue> longQueues) {
            this.longQueues = new ArrayList<>(longQueues);
            size = longQueues.size();
            // At least two leaves are needed so that the root is an inner node.
            // Without the lower bound a single queue led to a negative array size.
            final int leafCount = Math.max(2, nextPowOfTwo(size));
            heap = new long[2 * leafCount - 1];
            firstLeafIndex = heap.length / 2;
            // leaves without a queue stay UNSET and are never chosen
            Arrays.fill(heap, UNSET);
            init();
        }

        /**
         * Returns the smallest value of the heap. Returns
         * {@link MultiwayLongMerger#UNSET}={@value MultiwayLongMerger#UNSET} if the
         * heap is empty.
         *
         * @return the smallest value or
         *         {@link MultiwayLongMerger#UNSET}={@value MultiwayLongMerger#UNSET} if
         *         heap is empty
         */
        public long pop() {
            final long result = heap[0];
            fillWithMinOfChildren(0);
            return result;
        }

        /**
         * Fills the tree initially. Array indices of a tree with eight leaves
         * (root on the left, leaves on the right):
         *
         * <pre>
         *          7
         *      3
         *          8
         *   1
         *          9
         *      4
         *          10
         * 0
         *          11
         *      5
         *          12
         *   2
         *          13
         *      6
         *          14
         * </pre>
         */
        private void init() {
            // fill leaf nodes
            int offset = firstLeafIndex;
            for (int j = 0; j < size; j++) {
                final LongQueue q = longQueues.get(j);
                heap[offset + j] = q.isEmpty() ? UNSET : q.pop();
            }

            // fill the non-leaf layers (from the leaves up to the root);
            // the layer that starts at index offset ends at index 2 * offset
            while (offset > 0) {
                offset /= 2;
                for (int i = offset; i <= offset * 2; i++) {
                    fillWithMinOfChildren(i);
                }
            }
        }

        private boolean isLeaf(int i) {
            return i >= firstLeafIndex;
        }

        /**
         * Moves the smaller value of the two children into the inner node
         * {@code index} and refills the child the value was taken from. If both
         * children are empty the node becomes empty as well.
         */
        private void fillWithMinOfChildren(int index) {
            final int leftChildIndex = index * 2 + 1;
            final int rightChildIndex = leftChildIndex + 1;

            final long valueOfLeftChild = heap[leftChildIndex];
            final long valueOfRightChild = heap[rightChildIndex];

            final int chosenChildIndex;
            if (valueOfLeftChild == UNSET) {
                if (valueOfRightChild == UNSET) {
                    // both children are empty
                    heap[index] = UNSET;
                    return;
                }
                // only the right child has a value
                heap[index] = valueOfRightChild;
                chosenChildIndex = rightChildIndex;
            } else if (valueOfRightChild == UNSET) {
                // only the left child has a value
                heap[index] = valueOfLeftChild;
                chosenChildIndex = leftChildIndex;
            } else if (valueOfLeftChild < valueOfRightChild) {
                // left < right
                heap[index] = valueOfLeftChild;
                chosenChildIndex = leftChildIndex;
            } else {
                // left >= right
                heap[index] = valueOfRightChild;
                chosenChildIndex = rightChildIndex;
            }
            refillValue(chosenChildIndex);
        }

        /**
         * Replaces the value of node {@code index}: a leaf takes the next value
         * of its queue, an inner node takes the smaller value of its children.
         */
        private void refillValue(int index) {
            if (isLeaf(index)) {
                // leaf number n is fed by queue number n
                final int listIndex = index - firstLeafIndex;
                final LongQueue queue = longQueues.get(listIndex);
                heap[index] = queue.isEmpty() ? UNSET : queue.pop();
                return;
            }
            fillWithMinOfChildren(index);
        }
    }

    /**
     * Prints, for 0 to 19, the value of the power-of-two formula
     * {@code Long.highestOneBit(i - 1) << 1}.
     */
    public static void main(String[] args) {
        for (int i = 0; i < 20; i++) {
            System.out.println(i + " " + (Long.highestOneBit(i - 1) << 1));
        }
    }
}

View File

@@ -1614,6 +1614,31 @@ public class LongListTest {
Assertions.assertEquals(LongList.union(a, b), LongList.union(b, a));
}
@Test
public void testUnionSortedLists_three() {
    final LongList first = LongList.of(1, 2, 3);
    final LongList second = LongList.of(2, 4, 6);
    final LongList third = LongList.of(3, 5, 7);

    // overlapping values must show up only once, in ascending order
    final LongList expected = LongList.of(1, 2, 3, 4, 5, 6, 7);
    final LongList actual = LongList.union(first, second, third);
    Assertions.assertEquals(expected, actual);

    // the order of the arguments must not change the result
    Assertions.assertEquals(LongList.union(first, second, third), LongList.union(second, third, first));
    Assertions.assertEquals(LongList.union(first, second, third), LongList.union(second, first, third));
}
@Test
public void testUnionSortedLists_four_LongMinValue() {
    final LongList first = LongList.of(Long.MIN_VALUE, 2, 3, Long.MAX_VALUE);
    final LongList second = LongList.of(2, 4, 6, Long.MAX_VALUE);
    final LongList third = LongList.of(Long.MIN_VALUE, 5, 7);
    final LongList fourth = LongList.of(Long.MIN_VALUE, Long.MIN_VALUE);

    // Long.MIN_VALUE and Long.MAX_VALUE are ordinary values and must be kept once each
    final LongList expected = LongList.of(Long.MIN_VALUE, 2, 3, 4, 5, 6, 7, Long.MAX_VALUE);
    final LongList actual = LongList.union(first, second, third, fourth);
    Assertions.assertEquals(expected, actual);

    // the order of the arguments must not change the result
    Assertions.assertEquals(LongList.union(first, second, third, fourth),
            LongList.union(second, third, first, fourth));
    Assertions.assertEquals(LongList.union(first, second, third, fourth),
            LongList.union(fourth, second, first, third));
}
@Test
public void testUnionUnsortedLists() {
final LongList a = LongList.of(1, 0, 3, 4);

View File

@@ -0,0 +1,59 @@
package org.lucares.collections;
import java.util.Arrays;
import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.Test;
/**
 * Tests for {@link MultiwayLongMerger#unionSorted(java.util.Collection)}.
 */
public class MultiwayLongMergerTest {

    /** Two overlapping lists: shared values appear once in the result. */
    @Test
    public void testMergeTwoLists() {
        final LongList first = LongList.of(1, 2, 3);
        final LongList second = LongList.of(1, 3, 5);

        final LongList merged = MultiwayLongMerger.unionSorted(Arrays.asList(first, second));

        Assertions.assertEquals(LongList.of(1, 2, 3, 5), merged);
    }

    /** Three overlapping lists. */
    @Test
    public void testMergeThreeLists() {
        final LongList first = LongList.of(1, 2, 3);
        final LongList second = LongList.of(1, 3, 5);
        final LongList third = LongList.of(2, 3, 5);

        final LongList merged = MultiwayLongMerger.unionSorted(Arrays.asList(first, second, third));

        Assertions.assertEquals(LongList.of(1, 2, 3, 5), merged);
    }

    /** {@link Long#MIN_VALUE} is a regular value and must be part of the result once. */
    @Test
    public void testMergeListsWithLongMin() {
        final LongList first = LongList.of(Long.MIN_VALUE, 2, 3);
        final LongList second = LongList.of(1, 3, 5);
        final LongList third = LongList.of(Long.MIN_VALUE, Long.MIN_VALUE);

        final LongList merged = MultiwayLongMerger.unionSorted(Arrays.asList(first, second, third));

        Assertions.assertEquals(LongList.of(Long.MIN_VALUE, 1, 2, 3, 5), merged);
    }

    /** Merging only empty lists yields an empty list. */
    @Test
    public void testMergeEmptyLists() {
        final LongList first = LongList.of();
        final LongList second = LongList.of();

        final LongList merged = MultiwayLongMerger.unionSorted(Arrays.asList(first, second));

        Assertions.assertEquals(LongList.of(), merged);
    }
}