tags are now stored as variable length byte sequences of longs

Replaced Tags.filenameBytes with a SortedSet<Tag>. Tags are now
stored as longs (variable length encoded) in the PersistentMap.
Tags.filenameBytes was introduced to reduce memory consumption when
all tags were held in memory. Tags are now stored in a PersistentMap
and only read when needed.

Moved the VariableByteEncoder into its own project, because it was
needed by pdb-api.
This commit is contained in:
2018-11-17 20:03:46 +01:00
parent b2107acf4e
commit 135ab42cd8
14 changed files with 97 additions and 128 deletions

7
byte-utils/.gitignore vendored Normal file
View File

@@ -0,0 +1,7 @@
/.settings/
/.classpath
/.project
/bin/
/build/
/target/
/test-output/

7
byte-utils/build.gradle Normal file
View File

@@ -0,0 +1,7 @@
// Dependencies of the byte-utils project.
// NOTE(review): the 'compile' configuration is deprecated in newer Gradle
// versions in favour of 'implementation'/'api' — confirm which Gradle version
// this build targets before changing it.
dependencies {
// Logging: log4j core plus its SLF4J binding.
compile 'org.apache.logging.log4j:log4j-core:2.10.0'
compile 'org.apache.logging.log4j:log4j-slf4j-impl:2.10.0'
// Presumably provides org.lucares.collections.LongList, which
// VariableByteEncoder imports — TODO confirm.
compile 'org.lucares:primitiveCollections:0.1.20180908084945'
}

View File

@@ -0,0 +1,242 @@
package org.lucares.utils.byteencoder;
import java.util.Arrays;
import org.lucares.collections.LongList;
/**
 * Encodes longs into bytes using variable byte encoding. We are using a
 * transformation that encodes negative values into positive values. Even
 * numbers represent positive longs, uneven values represent negative longs, or
 * the null.
 * <p>
 * We then encode each 7 bits into one byte. The highest value bit is reserved
 * for a flag that tells us whether or not more bytes follow. This bit is set
 * for all but the last byte.
 * <p>
 * Please note two things:
 * <ol>
 * <li>0 is encoded to 1; the encoded values do not contain 0
 * <li>all but the last byte have the high value bit set
 * </ol>
 * That means no byte will have the value 0. This is important when decoding
 * bytes, because we can decode bytes until we encounter the first null byte, or
 * we reach the end of the array.
 * <p>
 * Only values between {@link #MIN_VALUE} and {@link #MAX_VALUE} (inclusive)
 * can be encoded, because the sign transformation doubles the magnitude. The
 * range is only checked with {@code assert}, i.e. when assertions are enabled.
 */
public class VariableByteEncoder {

    /** Smallest encodable value: -(2^62) + 1. */
    public static final long MIN_VALUE = Long.MIN_VALUE / 2 + 1;

    /** Largest encodable value: 2^62 - 1. */
    public static final long MAX_VALUE = Long.MAX_VALUE / 2;

    private static final int CONTINUATION_BYTE_FLAG = 1 << 7; // 10000000

    private static final long DATA_BITS = (1 << 7) - 1; // 01111111

    /** Number of payload bits stored in each encoded byte. */
    private static final int DATA_BITS_PER_BYTE = 7;

    /**
     * Encodes two values, one after the other, into the given buffer.
     * <p>
     * If the encoded values do not fit into the buffer, then 0 is returned and
     * the buffer is zeroed from {@code offsetInBuffer} to its end. The caller
     * will have to provide a new buffer with more space.
     *
     * @param value1         first value, (between -(2^62)+1 and 2^62-1)
     * @param value2         second value, (between -(2^62)+1 and 2^62-1)
     * @param buffer         the buffer to write into
     * @param offsetInBuffer index of the first byte to write
     * @return number of bytes appended to the provided buffer, or 0 if the two
     *         values did not both fit
     */
    public static int encodeInto(final long value1, final long value2, final byte[] buffer, final int offsetInBuffer) {
        final int bytesAdded1 = encodeInto(value1, buffer, offsetInBuffer);
        if (bytesAdded1 <= 0) {
            // return 0 if the encoded bytes do not fit
            // the caller will have to provide a new buffer
            return 0;
        }

        final int bytesAdded2 = encodeInto(value2, buffer, offsetInBuffer + bytesAdded1);
        if (bytesAdded2 <= 0) {
            // second value did not fit into the buffer,
            // remove the first value
            // and return 0 to indicate that the values did not fit
            Arrays.fill(buffer, offsetInBuffer, buffer.length, (byte) 0);
            return 0;
        }

        // both values fit into the buffer
        return bytesAdded1 + bytesAdded2;
    }

    /**
     * Decodes all values stored in the buffer. Decoding stops at the first null
     * byte or at the end of the buffer.
     *
     * @param buffer the encoded bytes
     * @return the decoded values in the order they were encoded
     */
    public static LongList decode(final byte[] buffer) {
        final LongList result = new LongList();
        decodeInto(buffer, result);
        return result;
    }

    /**
     * Encodes a single value into the buffer, starting at
     * {@code offsetInBuffer}.
     * <p>
     * If the value does not fit, nothing is encoded, the buffer is zeroed from
     * {@code offsetInBuffer} to its end and 0 is returned.
     *
     * @param value          the value, (between -(2^62)+1 and 2^62-1)
     * @param buffer         the buffer to write into
     * @param offsetInBuffer index of the first byte to write
     * @return number of bytes written, or 0 if the value did not fit
     * @throws ArrayIndexOutOfBoundsException if {@code offsetInBuffer} is
     *                                        negative
     */
    public static int encodeInto(final long value, final byte[] buffer, final int offsetInBuffer) {
        assertEncodable(value);
        if (offsetInBuffer < 0) {
            throw new ArrayIndexOutOfBoundsException("offsetInBuffer must not be negative: " + offsetInBuffer);
        }

        long normVal = encodeIntoPositiveValue(value);
        final int bytesToWrite = countBytes(normVal);

        // Check the available space up front instead of relying on an
        // ArrayIndexOutOfBoundsException while writing.
        if (offsetInBuffer > buffer.length || bytesToWrite > buffer.length - offsetInBuffer) {
            // We need more bytes to store the value than are available.
            if (offsetInBuffer < buffer.length) {
                Arrays.fill(buffer, offsetInBuffer, buffer.length, (byte) 0);
            }
            return 0;
        }

        int offset = offsetInBuffer;
        while (normVal > DATA_BITS) {
            buffer[offset] = (byte) ((normVal & DATA_BITS) | CONTINUATION_BYTE_FLAG);
            offset++;
            normVal = normVal >> DATA_BITS_PER_BYTE;
        }
        // the last byte carries no continuation flag
        buffer[offset] = (byte) (normVal);
        return bytesToWrite;
    }

    /**
     * Decodes values from the buffer and appends them to
     * {@code bufferedLongs} until a null byte or the end of the buffer is
     * reached.
     */
    private static void decodeInto(final byte[] buffer, final LongList bufferedLongs) {
        for (int i = 0; i < buffer.length; i++) {
            if (buffer[i] == 0) {
                // no value is encoded to 0 => there are no further values
                break;
            }

            long val = buffer[i] & DATA_BITS;
            int shift = DATA_BITS_PER_BYTE;
            // consume follow-up bytes as long as the continuation flag is set
            while (!isLastByte(buffer[i]) && i + 1 < buffer.length) {
                val = val | ((buffer[i + 1] & DATA_BITS) << shift);
                i++;
                shift += DATA_BITS_PER_BYTE;
            }
            bufferedLongs.add(decodeIntoSignedValue(val));
        }
    }

    /**
     * The input value (positive, negative or null) is encoded into a positive
     * value.
     *
     * <pre>
     *
     * input:   0  1 -1  2 -2  3 -3
     * encoded: 1  2  3  4  5  6  7
     * </pre>
     */
    private static long encodeIntoPositiveValue(final long value) {
        return value > 0 ? value * 2 : (value * -2) + 1;
    }

    /**
     * inverse of {@link #encodeIntoPositiveValue(long)}
     *
     * @param value the positive (normalized) value
     * @return the original signed value
     */
    private static long decodeIntoSignedValue(final long value) {
        return (value / 2) * (value % 2 == 0 ? 1 : -1);
    }

    /** Returns true if the continuation flag of the byte is not set. */
    private static boolean isLastByte(final byte b) {
        return (b & CONTINUATION_BYTE_FLAG) == 0;
    }

    /** Asserts that the value is within the encodable range. */
    private static void assertEncodable(final long value) {
        assert value >= MIN_VALUE : "min encodable value is -2^62+1 = " + MIN_VALUE;
        assert value <= MAX_VALUE : "max encodable value is 2^62-1 = " + MAX_VALUE;
    }

    /**
     * Counts the bytes the write loop of
     * {@link #encodeInto(long, byte[], int)} emits for an already normalized
     * value.
     */
    private static int countBytes(final long normalizedValue) {
        int count = 1;
        long remaining = normalizedValue;
        while (remaining > DATA_BITS) {
            count++;
            remaining = remaining >> DATA_BITS_PER_BYTE;
        }
        return count;
    }

    /**
     * Encodes the given values into a new array that is exactly as long as
     * needed. No values yield an empty array.
     *
     * @param longs the values, each between -(2^62)+1 and 2^62-1
     * @return the encoded bytes
     * @throws IllegalStateException if fewer bytes were written than were
     *                               reserved
     */
    public static byte[] encode(final long... longs) {
        int neededBytes = 0;
        for (final long l : longs) {
            neededBytes += VariableByteEncoder.neededBytes(l);
        }

        final byte[] result = new byte[neededBytes];
        final int bytesWritten = encodeInto(longs, result, 0);
        // compare against the reserved size, so that empty input (0 bytes
        // needed, 0 bytes written) is not mistaken for a failure
        if (bytesWritten != neededBytes) {
            throw new IllegalStateException("Did not reserve enough space to store " + Arrays.toString(longs)
                    + ". We reserved only " + neededBytes + " bytes.");
        }
        return result;
    }

    /**
     * Decodes only the first value stored in the buffer.
     *
     * @param buffer the encoded bytes
     * @return the first decoded value
     * @throws ArrayIndexOutOfBoundsException if the buffer is empty
     */
    public static long decodeFirstValue(final byte[] buffer) {
        int offset = 0;
        long val = buffer[offset] & DATA_BITS;
        int shift = DATA_BITS_PER_BYTE;
        while (!isLastByte(buffer[offset]) && offset + 1 < buffer.length) {
            val = val | ((buffer[offset + 1] & DATA_BITS) << shift);
            offset++;
            shift += DATA_BITS_PER_BYTE;
        }
        return decodeIntoSignedValue(val);
    }

    /**
     * Encodes all values of the list into the buffer, starting at
     * {@code offsetInBuffer}.
     * <p>
     * If not all values fit, the bytes written so far are zeroed again and 0
     * is returned. 0 is also returned if the list is empty.
     *
     * @param values         the values to encode
     * @param buffer         the buffer to write into
     * @param offsetInBuffer index of the first byte to write
     * @return number of bytes written, or 0 if the values did not fit
     */
    public static int encodeInto(final LongList values, final byte[] buffer, final int offsetInBuffer) {
        int offset = offsetInBuffer;
        for (int i = 0; i < values.size(); i++) {
            final int bytesAdded = encodeInto(values.get(i), buffer, offset);
            if (bytesAdded <= 0) {
                // remove the values that were already written
                if (offset > offsetInBuffer) {
                    Arrays.fill(buffer, offsetInBuffer, offset, (byte) 0);
                }
                return 0;
            }
            offset += bytesAdded;
        }
        return offset - offsetInBuffer;
    }

    /**
     * Encodes all values of the array into the buffer, starting at
     * {@code offsetInBuffer}.
     * <p>
     * If not all values fit, the bytes written so far are zeroed again and 0
     * is returned. 0 is also returned if the array is empty.
     *
     * @param values         the values to encode
     * @param buffer         the buffer to write into
     * @param offsetInBuffer index of the first byte to write
     * @return number of bytes written, or 0 if the values did not fit
     */
    public static int encodeInto(final long[] values, final byte[] buffer, final int offsetInBuffer) {
        int offset = offsetInBuffer;
        for (final long value : values) {
            final int bytesAdded = encodeInto(value, buffer, offset);
            if (bytesAdded <= 0) {
                // remove the values that were already written
                if (offset > offsetInBuffer) {
                    Arrays.fill(buffer, offsetInBuffer, offset, (byte) 0);
                }
                return 0;
            }
            offset += bytesAdded;
        }
        return offset - offsetInBuffer;
    }

    /**
     * Encodes the values of the list into a new array that is exactly as long
     * as needed. An empty list yields an empty array.
     *
     * @param longs the values, each between -(2^62)+1 and 2^62-1
     * @return the encoded bytes
     * @throws IllegalStateException if fewer bytes were written than were
     *                               reserved
     */
    public static byte[] encode(final LongList longs) {
        final int neededBytes = longs.stream().mapToInt(VariableByteEncoder::neededBytes).sum();
        final byte[] result = new byte[neededBytes];
        final int bytesWritten = encodeInto(longs, result, 0);
        // compare against the reserved size, so that an empty list (0 bytes
        // needed, 0 bytes written) is not mistaken for a failure
        if (bytesWritten != neededBytes) {
            throw new IllegalStateException(
                    "Did not reserve enough space to store " + longs + ". We reserved only " + neededBytes + " bytes.");
        }
        return result;
    }

    /**
     * Computes how many bytes the encoded form of the value occupies, without
     * writing anything.
     *
     * @param value the value, (between -(2^62)+1 and 2^62-1)
     * @return number of bytes needed to encode the value
     */
    public static int neededBytes(final long value) {
        assertEncodable(value);
        return countBytes(encodeIntoPositiveValue(value));
    }
}

View File

@@ -0,0 +1,110 @@
package org.lucares.utils.byteencoder;
import static org.testng.Assert.assertEquals;
import java.util.concurrent.ThreadLocalRandom;
import java.util.concurrent.atomic.AtomicInteger;
import org.lucares.collections.LongList;
import org.lucares.utils.byteencoder.VariableByteEncoder;
import org.testng.Assert;
import org.testng.annotations.DataProvider;
import org.testng.annotations.Test;
/**
 * Tests for {@link VariableByteEncoder}: round trips of random values, the
 * all-or-nothing behaviour when encoding two values, and the number of bytes
 * needed per value.
 */
@Test
public class VariableByteEncoderTest {

    /**
     * Rows are: number of random values, lower bound (inclusive), upper bound
     * (exclusive).
     */
    @DataProvider
    public Object[][] providerEncodeDecode() {
        return new Object[][] { //
                // encoded into 1 byte
                { 10, -5, 5 }, //
                { 10, 0, 5 }, //
                { 10, -63, 63 }, //
                // encoded into 2 bytes
                { 10, 130, 131 }, //
                // encoded into at most 2 bytes
                { 10, -8191, 8191 }, //
                // encoded into n bytes
                { 1, Long.MAX_VALUE / 2 - 4, Long.MAX_VALUE / 2 }, //
                { 1, Long.MIN_VALUE / 2, Long.MAX_VALUE / 2 }, //
                { 11, Long.MIN_VALUE / 2 + 1, Long.MIN_VALUE / 2 + 3 }, //
                { 12, Long.MAX_VALUE / 2 - 3, Long.MAX_VALUE / 2 },//
        };
    }

    /**
     * Encodes random values one after the other into a buffer and checks that
     * decoding the buffer yields the same values in the same order.
     */
    @Test(dataProvider = "providerEncodeDecode")
    public void testEncodeDecode(final long numValues, final long minValue, final long maxValue) {
        final LongList originalValues = new LongList();
        final byte[] buffer = new byte[1024];
        // mutable holder, because the lambda can only capture effectively final variables
        final AtomicInteger offsetInBuffer = new AtomicInteger(0);

        ThreadLocalRandom.current().longs(numValues, minValue, maxValue).forEachOrdered(value -> {
            originalValues.add(value);
            final int appendedBytes = VariableByteEncoder.encodeInto(value, buffer, offsetInBuffer.get());
            offsetInBuffer.addAndGet(appendedBytes);
        });

        final LongList actualValues = VariableByteEncoder.decode(buffer);
        assertEquals(actualValues.toString(), originalValues.toString());
    }

    /**
     * Rows are: first value, second value, whether both values fit, buffer
     * size.
     */
    @DataProvider
    public Object[][] providerEncodeDecodeOfTwoValues() {
        return new Object[][] { //
                { 12345, 67890, false, 1 }, // first value needs three bytes, it does not fit
                { 12345, 67890, false, 2 }, // first value needs three bytes, it does not fit
                { 12345, 67890, false, 3 }, // first value needs three bytes, second value does not fit
                { 12345, 67890, false, 4 }, // first value needs three bytes, second value does not fit
                { 12345, 67890, false, 5 }, // first value needs three bytes, second value does not fit
                { 12345, 67890, true, 6 }, // both values need three bytes
                { 12345, 67890, true, 10 }, //
        };
    }

    /**
     * Checks that two values are either both written, or that the buffer is
     * left empty when they do not both fit.
     */
    @Test(dataProvider = "providerEncodeDecodeOfTwoValues")
    public void testEncodeDecodeOfTwoValues(final long value1, final long value2, final boolean fits,
            final int bufferSize) {
        final LongList originalValues = new LongList();
        final byte[] buffer = new byte[bufferSize];

        final int bytesAdded = VariableByteEncoder.encodeInto(value1, value2, buffer, 0);
        Assert.assertEquals(bytesAdded > 0, fits);

        if (fits) {
            originalValues.addAll(value1, value2);
        } else {
            Assert.assertEquals(buffer, new byte[bufferSize],
                    "checks that buffer is resetted after it discovers the values do not fit");
        }

        final LongList decodedValues = VariableByteEncoder.decode(buffer);
        Assert.assertEquals(decodedValues, originalValues);
    }

    /** Rows are: value, expected number of bytes of the encoded value. */
    @DataProvider
    public Object[][] providerNededBytes() {
        return new Object[][] { //
                { 0, 1 }, //
                { -10, 1 }, //
                { 10, 1 }, //
                { -63, 1 }, //
                { 63, 1 }, //
                { -64, 2 }, //
                { 64, 2 }, //
                { -8191, 2 }, //
                { 8191, 2 }, //
                { -8192, 3 }, //
                { 8192, 3 }, //
        };
    }

    /**
     * Checks that the computed number of bytes matches both the expected
     * number and the length of the actually encoded value.
     */
    @Test(dataProvider = "providerNededBytes")
    public void testNeededBytes(final long value, final int expectedNeededBytes) {
        final int neededBytes = VariableByteEncoder.neededBytes(value);
        // the expected size from the data provider was previously never checked
        Assert.assertEquals(neededBytes, expectedNeededBytes);

        final byte[] encoded = VariableByteEncoder.encode(value);
        Assert.assertEquals(encoded.length, neededBytes);
    }
}