Add support for renaming and post-processing of CSV columns

This commit is contained in:
2019-12-14 18:11:59 +01:00
parent 1124dc8082
commit 00ba4d2a69
8 changed files with 250 additions and 72 deletions

View File

@@ -1,6 +1,7 @@
package org.lucares.pdb.api;
import java.nio.file.Path;
import java.util.function.Function;
/**
* Persistently maps Strings to integers.
@@ -23,8 +24,9 @@ public class StringCompressor {
return usip.computeIfAbsent(string, s -> usip.getHighestInteger() + 1);
}
public int put(final byte[] bytes, final int start, final int endExclusive) {
return usip.computeIfAbsent(bytes, start, endExclusive);
/**
 * Returns the integer that {@code usip} associates with the given byte range,
 * delegating the lookup (and any insertion) to
 * {@code usip.computeIfAbsent(bytes, start, endExclusive, postProcess)}.
 *
 * @param bytes        array containing the encoded string
 * @param start        index of the first byte of the string in {@code bytes}
 * @param endExclusive index one past the last byte of the string in {@code bytes}
 * @param postProcess  function forwarded unchanged to {@code usip.computeIfAbsent};
 *                     NOTE(review): presumably it must not be {@code null} - confirm
 *                     against callers
 * @return the integer returned by {@code usip.computeIfAbsent}
 */
public int put(final byte[] bytes, final int start, final int endExclusive,
final Function<String, String> postProcess) {
return usip.computeIfAbsent(bytes, start, endExclusive, postProcess);
}
public String get(final int integer) {

View File

@@ -193,7 +193,8 @@ public class UniqueStringIntegerPairs {
return stringToInt.get(string);
}
public Integer computeIfAbsent(final byte[] bytes, final int start, final int endExclusive) {
public Integer computeIfAbsent(final byte[] bytes, final int start, final int endExclusive,
final Function<String, String> postProcess) {
final ByteArray byteArray = new ByteArray(bytes, start, endExclusive);
Integer result = bytesToInt.get(byteArray);
@@ -201,8 +202,16 @@ public class UniqueStringIntegerPairs {
synchronized (stringToInt) {
if (!bytesToInt.containsKey(byteArray)) {
final String string = new String(bytes, start, endExclusive - start, StandardCharsets.UTF_8);
final String normalizedString = postProcess.apply(string);
result = get(normalizedString);
if (result != null) {
return result;
}
final Integer integer = intToString.size();
put(string, integer);
put(normalizedString, integer); // adds the normalized String to stringToInt and bytesToInt
bytesToInt.put(byteArray, integer); // also add the original String to bytesToInt, because it is
// used as cache
}
result = bytesToInt.get(byteArray);
}