From 8a5309fbe8257bb25751f34db7eccec3a7d3a1d9 Mon Sep 17 00:00:00 2001 From: Andreas Huber Date: Fri, 6 Nov 2020 19:22:45 +0100 Subject: [PATCH] multiway merge of multiple sorted lists --- .../org/lucares/collections/LongList.java | 125 ++++++++- .../collections/MultiwayLongMerger.java | 252 ++++++++++++++++++ .../org/lucares/collections/LongListTest.java | 25 ++ .../collections/MultiwayLongMergerTest.java | 59 ++++ 4 files changed, 448 insertions(+), 13 deletions(-) create mode 100644 primitiveCollections/src/main/java/org/lucares/collections/MultiwayLongMerger.java create mode 100644 primitiveCollections/src/test/java/org/lucares/collections/MultiwayLongMergerTest.java diff --git a/primitiveCollections/src/main/java/org/lucares/collections/LongList.java b/primitiveCollections/src/main/java/org/lucares/collections/LongList.java index f3fbdb7..e705766 100644 --- a/primitiveCollections/src/main/java/org/lucares/collections/LongList.java +++ b/primitiveCollections/src/main/java/org/lucares/collections/LongList.java @@ -1,8 +1,10 @@ package org.lucares.collections; import java.io.Serializable; +import java.util.ArrayList; import java.util.Arrays; import java.util.Collection; +import java.util.Iterator; import java.util.List; import java.util.Random; import java.util.Spliterator.OfLong; @@ -41,7 +43,7 @@ public final class LongList implements Serializable, Cloneable { * Keeps track of whether or not the list is sorted. This allows us to use * binary search for {@link #indexOf(int)} and efficient algorithms for * {@link #intersection(LongList, LongList)} / - * {@link #union(LongList, LongList)} / {@link #uniq()} / + * {@link #unionInternal(LongList, LongList)} / {@link #uniq()} / * {@link #removeAll(int, int)}. An empty list is sorted. 
*/ private boolean sorted = true; @@ -121,7 +123,7 @@ public final class LongList implements Serializable, Cloneable { * * @param startInclusive the lower bound (inclusive) * @param endInclusive the upper bound (inclusive) - * @return the {@link IntList} + * @return the {@link LongList} */ public static LongList rangeClosed(final long startInclusive, final long endInclusive) { if (startInclusive > endInclusive) { @@ -137,6 +139,32 @@ public final class LongList implements Serializable, Cloneable { } } + /** + * Returns a new list with the values from the given start index to the end of + * the list. + * + * @param startInclusive start index + * @return {@code LongList} + */ + public LongList sublist(final int startInclusive) { + return sublist(startInclusive, size); + } + + /** + * Returns a new list with the values of the given range. + * + * @param startInclusive the start index + * @param endExclusive the end index (exclusive) + * @return {@link LongList} + */ + public LongList sublist(final int startInclusive, int endExclusive) { + final LongList result = new LongList(endExclusive - startInclusive); + result.data = Arrays.copyOfRange(data, startInclusive, endExclusive); + result.size = result.data.length; + result.sorted = sorted; + return result; + } + /** * Returns {@code true} if this list contains no elements. * @@ -1114,27 +1142,32 @@ public final class LongList implements Serializable, Cloneable { } /** - * Returns a list with all elements that are in list {@code a} or {@code b} - * (logical or). + * Returns a list with all elements that are in list {@code a} or {@code b} or + * ... or {@code n} (logical or). *

* The result does not contain duplicate values. *

- * If both lists were sorted, then the output list will also be sorted. If at + * If all lists were sorted, then the output list will also be sorted. If at * least one list is unsorted, then the order is undefined. *

- * If both lists are sorted, then the time complexity is O(n+m), where n is the - * length of the first list and m the length of the second list. If at least one - * list is not sorted, then the time complexity is O(m*log(m)), where m is the - * length of the longer list. + * TODO check time complexity If all lists are sorted, then the time complexity + * is O(n+m), where n is the length of the first list and m the length of the + * second list. If at least one list is not sorted, then the time complexity is + * O(m*log(m)), where m is the length of the longer list. * - * @param a the first list - * @param b the second list + * @param longLists the lists * @return the union of both lists */ - public static LongList union(final LongList a, final LongList b) { + public static LongList union(final LongList... longLists) { + return union(List.of(longLists)); + } + + private static LongList unionInternal(final LongList a, final LongList b) { final LongList result; - if (a.isSorted() && b.isSorted()) { + if (a.isEmpty() && b.isEmpty()) { + result = new LongList(); + } else if (a.isSorted() && b.isSorted()) { result = unionSorted(a, b); } else { result = unionUnsorted(a, b); @@ -1142,6 +1175,35 @@ public final class LongList implements Serializable, Cloneable { return result; } + public static LongList union(final Collection longLists) { + switch (longLists.size()) { + case 0: + return new LongList(); + case 1: + return longLists.iterator().next().clone(); + case 2: + final Iterator it = longLists.iterator(); + final LongList a = it.next(); + final LongList b = it.next(); + return unionInternal(a, b); + default: + final Collection sortedLists = subsetOfSortedLists(longLists); + final Collection unsortedLists = subsetOfUnsortedLists(longLists); + + final LongList unionSorted = MultiwayLongMerger.unionSorted(sortedLists); + + final LongList result; + if (unsortedLists.isEmpty()) { + result = unionSorted; + } else { + final LongList unionUnsorted = 
unionUnsorted(unsortedLists); + result = unionInternal(unionSorted, unionUnsorted); + } + + return result; + } + } + private static LongList unionSorted(final LongList a, final LongList b) { final int aSize = a.size(); @@ -1211,6 +1273,17 @@ public final class LongList implements Serializable, Cloneable { return unionSorted(aSorted, bSorted); } + private static LongList unionUnsorted(Collection longLists) { + + final List sortedLists = new ArrayList<>(); + for (LongList longList : longLists) { + final LongList copy = new LongList(longList); + copy.sort(); + sortedLists.add(copy); + } + return MultiwayLongMerger.unionSorted(sortedLists); + } + private void checkIfSorted() { sorted = true; for (int i = 1; i < size && sorted; i++) { @@ -1218,4 +1291,30 @@ public final class LongList implements Serializable, Cloneable { } } + private static Collection subsetOfSortedLists(Collection longLists) { + + final List result = new ArrayList<>(); + + for (LongList longList : longLists) { + if (longList.isSorted()) { + result.add(longList); + } + } + + return result; + } + + private static Collection subsetOfUnsortedLists(Collection longLists) { + + final List result = new ArrayList<>(); + + for (LongList longList : longLists) { + if (!longList.isSorted()) { + result.add(longList); + } + } + + return result; + } + } diff --git a/primitiveCollections/src/main/java/org/lucares/collections/MultiwayLongMerger.java b/primitiveCollections/src/main/java/org/lucares/collections/MultiwayLongMerger.java new file mode 100644 index 0000000..a6deecd --- /dev/null +++ b/primitiveCollections/src/main/java/org/lucares/collections/MultiwayLongMerger.java @@ -0,0 +1,252 @@ +package org.lucares.collections; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collection; +import java.util.List; + +class MultiwayLongMerger { + private static final long UNSET = Long.MIN_VALUE; + + private static class LongQueue { + final LongList wrapped; + + int offset = 0; + + public 
LongQueue(LongList wrapped) { + this.wrapped = wrapped; + } + + boolean isEmpty() { + return offset >= wrapped.size(); + } + + long pop() { + assert offset < wrapped.size(); + final long result = wrapped.get(offset); + offset++; + return result; + } + + public long peek() { + return wrapped.get(offset); + } + + @Override + public String toString() { + return wrapped.sublist(offset).toString(); + } + + public int size() { + return wrapped.size()-offset; + } + } + + static LongList unionSorted(Collection longLists) { + + assertAllListsAreSorted(longLists); + + final List queues = new ArrayList(); + boolean hasValueUNSET = initQueues(longLists, queues); + + final LongList result = new LongList(); + if (hasValueUNSET) { + result.add(UNSET); + } + + if (!queues.isEmpty()) { + mergeQueues(queues, result); + } + + return result; + } + + private static void mergeQueues(final List queues, final LongList result) { + final MinValuePriorityQueue selectionTree = new MinValuePriorityQueue(queues); + + long previousValue = UNSET; + long val; + while ((val = selectionTree.pop()) != UNSET) { + if (val != previousValue) { + result.add(val); + previousValue = val; + } + } + } + + private static boolean initQueues(Collection longLists, final List queues) { + boolean hasValueUNSET = false; + for (LongList longList : longLists) { + if (!longList.isEmpty()) { + final LongQueue queue = new LongQueue(longList); + while (!queue.isEmpty() && queue.peek() == UNSET) { + queue.pop(); + hasValueUNSET = true; + } + if (!queue.isEmpty()) { + queues.add(queue); + } + } + } + return hasValueUNSET; + } + + private static void assertAllListsAreSorted(Collection longLists) { + for (LongList longList : longLists) { + if (!longList.isSorted()) { + throw new IllegalArgumentException("lists must be sorted"); + } + } + } + + private static int nextPowOfTwo(int i) { + return Math.max(2, Integer.highestOneBit(i - 1) << 1); /* never below 2: highestOneBit(0) is 0, and a single queue still needs a root with two leaf slots */ + } + + private static class MinValuePriorityQueue { + + private List longQueues; + + /* + * a
classic heap where the nodes are laid out in breadth-first order. First the + * root, then the nodes of level 1, then the nodes of level 2, ... + */ + private final long[] heap; + + private final int size; + + private final int firstLeafIndex; + + public MinValuePriorityQueue(final Collection longQueues) { + this.longQueues = new ArrayList<>(longQueues); + size = longQueues.size(); + heap = new long[2 * nextPowOfTwo(size) - 1]; + + firstLeafIndex = heap.length / 2; + + Arrays.fill(heap, UNSET); + init(); + } + + /** + * Returns the smallest value of the heap. Returns + * {@link MultiwayLongMerger#UNSET}={@value MultiwayLongMerger#UNSET} if the + * heap is empty. + * + * @return the smallest value or + * {@link MultiwayLongMerger#UNSET}={@value MultiwayLongMerger#UNSET} if + * heap is empty + */ + public long pop() { + long result = heap[0]; + fillWithMinOfChildren(0); + return result; + } + + /** + *

+		 *               7
+		 *           3  
+		 *               8
+		 *       1   
+		 *               9
+		 *           4
+		 *               10
+		 *   0
+		 *               11
+		 *           5
+		 *               12
+		 *       2
+		 *               13
+		 *           6 
+		 *               14
+		 * 
+ */ + private void init() { + // fill leaf nodes + int offset = firstLeafIndex; + for (int j = 0; j < size; j++) { + final LongQueue q = longQueues.get(j); + heap[offset + j] = q.isEmpty() ? UNSET : q.pop(); + } + + // fill the non-leaf layers (from the leafs up to the root) + while (offset > 0) { + offset /= 2; // + for (int i = offset; i <= offset * 2; i++) { + fillWithMinOfChildren(i); + } + } + } + + private int leftChildIndex(int i) { + return i * 2 + 1; + } + + private int rightChildIndex(int i) { + return i * 2 + 2; + } + + private boolean isLeaf(int i) { + return i >= firstLeafIndex; + } + + private int leafIndexToListIndex(int i) { + assert isLeaf(i) : "index " + i + " is not a leaf"; + return i - firstLeafIndex; + } + + private void fillWithMinOfChildren(int index) { + final int leftChildIndex = index * 2 + 1; //leftChildIndex(index); + final int rightChildIndex = leftChildIndex+1;//rightChildIndex(index); + + final long valueOfLeftChild = heap[leftChildIndex]; + final long valueOfRightChild = heap[rightChildIndex]; + + final int chosenValue; + + if (valueOfLeftChild == UNSET) { + if (valueOfRightChild == UNSET) { + heap[index] = UNSET; + return; + } else { + //left < right + heap[index] = valueOfRightChild; + chosenValue = rightChildIndex; + } + } else if (valueOfRightChild == UNSET) { + // left > right + heap[index] = valueOfLeftChild; + chosenValue = leftChildIndex; + } else { + if (valueOfLeftChild < valueOfRightChild) { + // left < right + heap[index] = valueOfLeftChild; + chosenValue = leftChildIndex; + } else { + // left >= right + heap[index] = valueOfRightChild; + chosenValue = rightChildIndex; + } + } + + refillValue(chosenValue); + } + + private void refillValue(int index) { + if (isLeaf(index)) { + final int listIndex = index - firstLeafIndex; //leafIndexToListIndex(index); + final LongQueue queue = longQueues.get(listIndex); + heap[index] = queue.isEmpty() ? 
UNSET : queue.pop(); + return; + } + fillWithMinOfChildren(index); + } + } + + public static void main(String[] args) { + for (int i = 0; i < 20; i++) { + System.out.println(i + " " + (Long.highestOneBit(i - 1) << 1)); + } + } +} diff --git a/primitiveCollections/src/test/java/org/lucares/collections/LongListTest.java b/primitiveCollections/src/test/java/org/lucares/collections/LongListTest.java index 1fc3c6f..467a59a 100644 --- a/primitiveCollections/src/test/java/org/lucares/collections/LongListTest.java +++ b/primitiveCollections/src/test/java/org/lucares/collections/LongListTest.java @@ -1613,6 +1613,31 @@ public class LongListTest { Assertions.assertEquals(LongList.of(2), actual); Assertions.assertEquals(LongList.union(a, b), LongList.union(b, a)); } + + @Test + public void testUnionSortedLists_three() { + final LongList a = LongList.of(1, 2, 3); + final LongList b = LongList.of(2, 4, 6); + final LongList c = LongList.of(3, 5, 7); + + final LongList actual = LongList.union(a, b, c); + Assertions.assertEquals(LongList.of(1,2,3,4,5,6,7), actual); + Assertions.assertEquals(LongList.union(a, b, c), LongList.union(b, c, a)); + Assertions.assertEquals(LongList.union(a, b, c), LongList.union(b, a, c)); + } + + @Test + public void testUnionSortedLists_four_LongMinValue() { + final LongList a = LongList.of(Long.MIN_VALUE, 2, 3,Long.MAX_VALUE); + final LongList b = LongList.of(2, 4, 6, Long.MAX_VALUE); + final LongList c = LongList.of(Long.MIN_VALUE, 5, 7); + final LongList d = LongList.of(Long.MIN_VALUE, Long.MIN_VALUE); + + final LongList actual = LongList.union(a, b, c, d); + Assertions.assertEquals(LongList.of(Long.MIN_VALUE,2,3,4,5,6,7, Long.MAX_VALUE), actual); + Assertions.assertEquals(LongList.union(a, b, c, d), LongList.union(b, c, a, d)); + Assertions.assertEquals(LongList.union(a, b, c, d), LongList.union(d, b, a, c)); + } @Test public void testUnionUnsortedLists() { diff --git 
a/primitiveCollections/src/test/java/org/lucares/collections/MultiwayLongMergerTest.java b/primitiveCollections/src/test/java/org/lucares/collections/MultiwayLongMergerTest.java new file mode 100644 index 0000000..d96ddca --- /dev/null +++ b/primitiveCollections/src/test/java/org/lucares/collections/MultiwayLongMergerTest.java @@ -0,0 +1,59 @@ +package org.lucares.collections; + +import java.util.Arrays; + +import org.junit.jupiter.api.Assertions; +import org.junit.jupiter.api.Test; + +public class MultiwayLongMergerTest { + + @Test + public void testMergeTwoLists() { + + LongList a = LongList.of(1,2,3); + LongList b = LongList.of(1,3,5); + LongList expected = LongList.of(1,2,3,5); + + + LongList union = MultiwayLongMerger.unionSorted(Arrays.asList(a,b)); + Assertions.assertEquals(expected, union); + } + + @Test + public void testMergeThreeLists() { + + LongList a = LongList.of(1,2,3); + LongList b = LongList.of(1,3,5); + LongList c = LongList.of(2,3,5); + LongList expected = LongList.of(1,2,3,5); + + + LongList union = MultiwayLongMerger.unionSorted(Arrays.asList(a,b,c)); + Assertions.assertEquals(expected, union); + } + + @Test + public void testMergeListsWithLongMin() { + + LongList a = LongList.of(Long.MIN_VALUE,2,3); + LongList b = LongList.of(1,3,5); + LongList c = LongList.of(Long.MIN_VALUE,Long.MIN_VALUE); + LongList expected = LongList.of(Long.MIN_VALUE,1,2,3,5); + + + LongList union = MultiwayLongMerger.unionSorted(Arrays.asList(a,b,c)); + Assertions.assertEquals(expected, union); + } + + @Test + public void testMergeEmptyLists() { + + LongList a = LongList.of(); + LongList b = LongList.of(); + LongList expected = LongList.of(); + + + LongList union = MultiwayLongMerger.unionSorted(Arrays.asList(a,b)); + Assertions.assertEquals(expected, union); + } +}