add second parser that uses a standard CSV reader
@@ -185,11 +185,14 @@ public class DiskStorage implements AutoCloseable {
     }
 
     private Optional<FreeListNode> findFreeBlockWithSize(final long blockSize) throws IOException {
+        final long start = System.nanoTime();
         FreeListNode result = null;
         final long freeListRootNodePosition = readFreeListRootNodePosition();
+        int counter = 0;
 
         long nextFreeListNodeOffset = freeListRootNodePosition;
         while (nextFreeListNodeOffset > 0) {
+            counter++;
             final var freeListNode = readFreeListNode(nextFreeListNodeOffset);
 
             if (freeListNode.getSize() == blockSize) {
@@ -198,6 +201,10 @@ public class DiskStorage implements AutoCloseable {
             }
             nextFreeListNodeOffset = freeListNode.getNext();
         }
+        final double d = (System.nanoTime() - start) / 1_000_000.0;
+        if (d > 0.5) {
+            System.out.println("findFreeBlockWithSize took: " + d + " ms counter" + counter);
+        }
 
         return Optional.ofNullable(result);
     }
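The hunks in this commit repeatedly inline one instrumentation idiom: capture `System.nanoTime()` before the work, convert the delta to milliseconds, and print only when a threshold is crossed. A minimal standalone sketch of that idiom, with the helper name and threshold invented for illustration:

```java
import java.util.function.Supplier;

final class SlowCallLogger {

    // Runs body and prints a line only when it was slower than thresholdMillis.
    static <T> T timed(final String label, final double thresholdMillis, final Supplier<T> body) {
        final long start = System.nanoTime();
        try {
            return body.get();
        } finally {
            final double millis = (System.nanoTime() - start) / 1_000_000.0;
            if (millis > thresholdMillis) {
                System.out.println(label + " took: " + millis + " ms");
            }
        }
    }
}
```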
@@ -22,6 +22,7 @@ ext {
     lib_antlr = "org.antlr:antlr4:4.9.2"
 
     lib_commons_collections4 = 'org.apache.commons:commons-collections4:4.4'
+    lib_commons_csv = 'org.apache.commons:commons-csv:1.9.0'
     lib_commons_lang3 = 'org.apache.commons:commons-lang3:3.12.0'
     lib_jackson_databind = 'com.fasterxml.jackson.core:jackson-databind:2.12.4'
 
@@ -159,7 +159,13 @@ public class DataStore implements AutoCloseable {
     public void write(final long dateAsEpochMilli, final Tags tags, final long value) {
         final ParititionId partitionId = DateIndexExtension.toPartitionId(dateAsEpochMilli);
         final PdbWriter writer = getWriter(partitionId, tags);
+
+        final long start = System.nanoTime();
         writer.write(dateAsEpochMilli, value);
+        final double duration = (System.nanoTime() - start) / 1_000_000.0;
+        if (duration > 1) {
+            System.out.println(" write took: " + duration + " ms " + tags);
+        }
     }
 
     // visible for test
@@ -377,9 +383,14 @@ public class DataStore implements AutoCloseable {
     }
 
     private PdbWriter getWriter(final ParititionId partitionId, final Tags tags) throws ReadException, WriteException {
+        final long start = System.nanoTime();
         final PartitionedTagsCacheKey cacheKey = new PartitionedTagsCacheKey(tags, partitionId);
-        return writerCache.putIfAbsent(cacheKey, t -> getWriterInternal(partitionId, tags));
+        final PdbWriter result = writerCache.putIfAbsent(cacheKey, t -> getWriterInternal(partitionId, tags));
+        final double duration = (System.nanoTime() - start) / 1_000_000.0;
+        if (duration > 1) {
+            System.out.println(" get Writer took: " + duration + " ms " + tags);
+        }
+        return result;
     }
 
     // visible for test
@@ -392,9 +403,14 @@ public class DataStore implements AutoCloseable {
         PdbWriter writer;
         if (docsForTags.isPresent()) {
             try {
+                final long start = System.nanoTime();
                 final Doc doc = docsForTags.get();
                 final PdbFile pdbFile = new PdbFile(partitionId, doc.getRootBlockNumber(), tags);
                 writer = new PdbWriter(pdbFile, diskStorage.getExisting(partitionId));
+                final double duration = (System.nanoTime() - start) / 1_000_000.0;
+                if (duration > 1) {
+                    System.out.println(" init existing writer took: " + duration + " ms " + tags);
+                }
             } catch (final RuntimeException e) {
                 throw new ReadException(e);
             }
@@ -410,8 +426,10 @@ public class DataStore implements AutoCloseable {
             final PdbFile pdbFile = createNewPdbFile(partitionId, tags);
             final PdbWriter result = new PdbWriter(pdbFile, diskStorage.getExisting(partitionId));
 
-            METRICS_LOGGER_NEW_WRITER.debug("newPdbWriter took {}ms tags: {}",
-                    (System.nanoTime() - start) / 1_000_000.0, tags);
+            final double duration = (System.nanoTime() - start) / 1_000_000.0;
+            if (duration > 1) {
+                METRICS_LOGGER_NEW_WRITER.info("newPdbWriter took {}ms tags: {}", duration, tags);
+            }
             return result;
         } catch (final RuntimeException e) {
             throw new WriteException(e);
@@ -313,6 +313,7 @@ public class QueryCompletionIndex implements AutoCloseable {
     }
 
     public void addTags(final ParititionId partitionId, final Tags tags) throws IOException {
+        final long start = System.nanoTime();
         final List<Tag> listOfTagsA = tags.toTags();
         final List<Tag> listOfTagsB = tags.toTags();
 
@@ -329,6 +330,10 @@ public class QueryCompletionIndex implements AutoCloseable {
             fieldToValueIndex.putValue(partitionId, tag, Empty.INSTANCE);
             fieldIndex.putValue(partitionId, Tags.STRING_COMPRESSOR.getKeyAsString(tag), Empty.INSTANCE);
         }
+        final double d = (System.nanoTime() - start) / 1_000_000.0;
+        if (d > 1) {
+            System.out.println(" addTags: " + d + " ms");
+        }
     }
 
     @Override
@@ -29,6 +29,11 @@ public class StringCompressor {
         return usip.computeIfAbsent(bytes, start, endExclusive, postProcess);
     }
 
+    public int put(final String value, final Function<String, String> postProcess) {
+        final String processedValue = postProcess.apply(value);
+        return usip.computeIfAbsentWithPostprocess(processedValue, postProcess);
+    }
+
     public String get(final int integer) {
 
         return usip.getKey(integer);
@@ -40,6 +40,10 @@ public class UniqueStringIntegerPairs {
         private final int start;
         private final int endExclusive;
 
+        public ByteArray(final String string) {
+            this(string.getBytes(StandardCharsets.UTF_8));
+        }
+
         public ByteArray(final byte[] array, final int start, final int endExclusive) {
             super();
             this.array = array;
@@ -127,7 +131,7 @@ public class UniqueStringIntegerPairs {
                 final int integer = Integer.parseInt(tokens[1]);
                 intToStringPut(integer, string);
                 stringToInt.put(string, integer);
-                bytesToInt.put(new ByteArray(string.getBytes(StandardCharsets.UTF_8)), integer);
+                bytesToInt.put(new ByteArray(string), integer);
             }
         }
     }
@@ -164,7 +168,7 @@ public class UniqueStringIntegerPairs {
 
         intToStringPut(integer, string);
         stringToInt.put(string, integer);
-        bytesToInt.put(new ByteArray(string.getBytes(StandardCharsets.UTF_8)), integer);
+        bytesToInt.put(new ByteArray(string), integer);
     }
 
     public Integer get(final String string) {
@@ -198,10 +202,20 @@ public class UniqueStringIntegerPairs {
 
         final ByteArray byteArray = new ByteArray(bytes, start, endExclusive);
         Integer result = bytesToInt.get(byteArray);
+        if (result == null) {
+            final String string = new String(bytes, start, endExclusive - start, StandardCharsets.UTF_8);
+            result = computeIfAbsentWithPostprocess(string, postProcess);
+        }
+        return result;
+    }
+
+    public Integer computeIfAbsentWithPostprocess(final String string, final Function<String, String> postProcess) {
+
+        final ByteArray byteArray = new ByteArray(string);
+        Integer result = bytesToInt.get(byteArray);
         if (result == null) {
             synchronized (stringToInt) {
                 if (!bytesToInt.containsKey(byteArray)) {
-                    final String string = new String(bytes, start, endExclusive - start, StandardCharsets.UTF_8);
                     final String normalizedString = postProcess.apply(string);
                     result = get(normalizedString);
                     if (result != null) {
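The lookup above follows a check-then-lock shape: probe the map without a lock, and only on a miss re-check under `synchronized` before inserting, so the common hit path stays cheap. The same shape in isolation, using a `ConcurrentHashMap` so the unlocked read is safe (types simplified; these are not the class's actual fields):

```java
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ConcurrentMap;

final class CheckThenLock {
    private final ConcurrentMap<String, Integer> map = new ConcurrentHashMap<>();
    private int nextId = 0;

    Integer idFor(final String key) {
        Integer result = map.get(key); // fast path: lock-free read
        if (result == null) {
            synchronized (map) { // slow path: re-check under the lock, insert exactly once
                result = map.get(key);
                if (result == null) {
                    result = nextId++;
                    map.put(key, result);
                }
            }
        }
        return result;
    }
}
```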
@@ -15,6 +15,7 @@ dependencies {
     implementation project(':pdb-js')
     implementation project(':pdb-utils')
 
+    implementation lib_commons_csv
     implementation lib_commons_lang3
     implementation lib_primitive_collections
 
@@ -0,0 +1,162 @@
+package org.lucares.pdbui;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.nio.charset.StandardCharsets;
+import java.time.Instant;
+import java.time.format.DateTimeFormatter;
+import java.time.temporal.TemporalAccessor;
+import java.util.ArrayList;
+import java.util.EnumSet;
+import java.util.Iterator;
+import java.util.List;
+import java.util.concurrent.ArrayBlockingQueue;
+import java.util.concurrent.TimeUnit;
+import java.util.function.Function;
+
+import org.apache.commons.csv.CSVFormat;
+import org.apache.commons.csv.CSVParser;
+import org.apache.commons.csv.CSVRecord;
+import org.lucares.pdb.api.Tags;
+import org.lucares.pdb.api.TagsBuilder;
+import org.lucares.pdb.datastore.Entries;
+import org.lucares.pdb.datastore.Entry;
+import org.lucares.pdb.datastore.RuntimeTimeoutException;
+import org.lucares.pdbui.CsvReaderSettings.ColumnDefinitions;
+import org.lucares.pdbui.CsvReaderSettings.PostProcessors;
+import org.lucares.utils.CollectionUtils;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+public class CsvReaderCsvToEntryTransformer implements CsvToEntryTransformer {
+
+    private static final Logger LOGGER = LoggerFactory.getLogger(CsvReaderCsvToEntryTransformer.class);
+
+    private final ArrayBlockingQueue<Entries> queue;
+    private final CsvReaderSettings settings;
+    private int[] compressedHeaders;
+    private List<Function<String, String>> postProcessersForColumns;
+
+    public CsvReaderCsvToEntryTransformer(final ArrayBlockingQueue<Entries> queue, final CsvReaderSettings settings) {
+        this.queue = queue;
+        this.settings = settings;
+    }
+
+    @Override
+    public void readCSV(final InputStream in) throws IOException, InterruptedException, RuntimeTimeoutException {
+
+        final int chunksize = 1000;
+        Entries entries = new Entries(chunksize);
+
+        final int keyTimestamp = Tags.STRING_COMPRESSOR.put(settings.getTimeColumn());
+        final int keyDuration = Tags.STRING_COMPRESSOR.put(settings.getValueColumn());
+        final DateTimeFormatter dateParser = createDateParser(settings.getDateTimePattern());
+        final Tags additionalTags = initAdditionalTags(settings);
+
+        final CSVFormat csvFormat = getCsvFormat();
+        try (final InputStreamReader reader = new InputStreamReader(in, StandardCharsets.UTF_8);
+                final CSVParser parser = new CSVParser(reader, csvFormat);) {
+
+            final Iterator<CSVRecord> iterator = parser.stream().iterator();
+            final CSVRecord headers = iterator.next();
+            handleHeaders(headers);
+
+            while (iterator.hasNext()) {
+                final CSVRecord next = iterator.next();
+                final Entry entry = handleLine(next, keyTimestamp, keyDuration, dateParser, additionalTags);
+                if (entry != null) {
+                    entries.add(entry);
+                }
+                if (entries.size() >= chunksize) {
+                    queue.put(entries);
+                    entries = new Entries(chunksize);
+                }
+            }
+        }
+        entries.forceFlush();
+        queue.put(entries);
+        entries.waitUntilFlushed(5, TimeUnit.MINUTES);
+    }
+
+    private DateTimeFormatter createDateParser(final String dateTimePattern) {
+        if (dateTimePattern.equals(CsvReaderSettings.ISO_8601)) {
+            return DateTimeFormatter.ISO_OFFSET_DATE_TIME;
+        } else {
+            return DateTimeFormatter.ofPattern(dateTimePattern);
+        }
+    }
+
+    private void handleHeaders(final CSVRecord headers) {
+        compressedHeaders = new int[headers.size()];
+        postProcessersForColumns = new ArrayList<>();
+        CollectionUtils.addNCopies(postProcessersForColumns, headers.size(), Function.identity());
+
+        int i = 0;
+        for (final String columnName : headers) {
+
+            if (ignoreColum(columnName)) {
+                compressedHeaders[i] = IGNORE_COLUMN;
+            } else {
+
+                final String renameTo = settings.getColumnDefinitions().getRenameTo(columnName);
+                final String renamedColumn = renameTo != null ? renameTo : columnName;
+                compressedHeaders[i] = Tags.STRING_COMPRESSOR.put(renamedColumn);
+                final EnumSet<PostProcessors> postProcessors = settings.getColumnDefinitions()
+                        .getPostProcessors(columnName);
+                final Function<String, String> postProcessFunction = PostProcessors.toFunction(postProcessors);
+                postProcessersForColumns.set(i, postProcessFunction);
+            }
+            i++;
+        }
+
+    }
+
+    private Entry handleLine(final CSVRecord csvrecord, final int keyTimestamp, final int keyDuration,
+            final DateTimeFormatter dateParser, final Tags additionalTags) {
+
+        try {
+            final int[] columns = compressedHeaders;
+            final TagsBuilder tagsBuilder = new TagsBuilder(additionalTags);
+            final int size = columns.length;
+            long epochMilli = -1;
+            long duration = -1;
+            for (int i = 0; i < size; i++) {
+                final int key = columns[i];
+                final String val = csvrecord.get(i);
+
+                if (key == IGNORE_COLUMN) {
+                    // this column's value will not be ingested
+                } else if (key == keyTimestamp) {
+                    final TemporalAccessor time = dateParser.parse(val);
+                    epochMilli = Instant.from(time).toEpochMilli();
+                } else if (key == keyDuration) {
+                    duration = Long.parseLong(val);
+                } else if (!val.isEmpty()) {
+                    final Function<String, String> postProcess = postProcessersForColumns.get(i);
+                    final int value = Tags.STRING_COMPRESSOR.put(val, postProcess);
+
+                    tagsBuilder.add(key, value);
+                }
+            }
+            final Tags tags = tagsBuilder.build();
+            return new Entry(epochMilli, duration, tags);
+        } catch (final RuntimeException e) {
+            LOGGER.debug("ignoring invalid line '" + csvrecord + "'", e);
+        }
+        return null;
+    }
+
+    private CSVFormat getCsvFormat() {
+        final CSVFormat result = CSVFormat.Builder.create()//
+                .setDelimiter(settings.getSeparator())//
+                .setCommentMarker(settings.getComment().charAt(0))//
+                .build();
+        return result;
+    }
+
+    private boolean ignoreColum(final String columnName) {
+        final ColumnDefinitions columnDefinitions = settings.getColumnDefinitions();
+        return columnDefinitions.isIgnoredColumn(columnName) || columnName.startsWith(COLUM_IGNORE_PREFIX);
+    }
+}
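For reference, a self-contained sketch of the Commons CSV calls the new transformer builds on — comment lines are skipped by the format, and quoted fields arrive unescaped (sample input invented):

```java
import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import java.util.Iterator;

import org.apache.commons.csv.CSVFormat;
import org.apache.commons.csv.CSVParser;
import org.apache.commons.csv.CSVRecord;

public class CommonsCsvSketch {
    public static void main(final String[] args) throws IOException {
        final CSVFormat format = CSVFormat.Builder.create()
                .setDelimiter(',')
                .setCommentMarker('#')
                .build();
        try (Reader reader = new StringReader("#comment\na,b\n1,\"x,y\"\n");
                CSVParser parser = new CSVParser(reader, format)) {
            final Iterator<CSVRecord> it = parser.iterator();
            final CSVRecord header = it.next(); // first record doubles as the header row
            while (it.hasNext()) {
                final CSVRecord row = it.next();
                for (int i = 0; i < row.size(); i++) {
                    System.out.println(header.get(i) + " = " + row.get(i)); // prints a = 1, b = x,y
                }
            }
        }
    }
}
```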
@@ -18,6 +18,8 @@ import com.fasterxml.jackson.databind.ObjectMapper;
 public final class CsvReaderSettings {
 
+    public static final String ISO_8601 = "ISO-8601";
+
     private final static ObjectMapper OBJECT_MAPPER = new ObjectMapper();
 
     public static String stripPrefixDefault(final String value) {
@@ -165,6 +167,8 @@ public final class CsvReaderSettings {
 
     private String separator;
 
+    private Character quoteCharacter = null;
+
     private ColumnDefinitions columnDefinitions = new ColumnDefinitions();
 
     private Map<String, String> additionalTags = new HashMap<String, String>();
@@ -175,6 +179,8 @@ public final class CsvReaderSettings {
 
     private String comment = "#";
 
+    private String dateTimePattern = ISO_8601;
+
     private final List<TagMatcher> firstLineMatcher = new ArrayList<>();
 
     public CsvReaderSettings() {
@@ -282,6 +288,32 @@ public final class CsvReaderSettings {
         this.firstLineMatcher.add(tagMatcher);
     }
 
+    /**
+     * The quote character. If null then no quoting is allowed.
+     *
+     * @param quoteCharacter
+     */
+    public void setQuoteCharacter(final Character quoteCharacter) {
+        this.quoteCharacter = quoteCharacter;
+    }
+
+    /**
+     * The quote character. If null then no quoting is allowed.
+     *
+     * @return the quote character
+     */
+    public Character getQuoteCharacter() {
+        return quoteCharacter;
+    }
+
+    public String getDateTimePattern() {
+        return dateTimePattern;
+    }
+
+    public void setDateTimePattern(final String dateTimePattern) {
+        this.dateTimePattern = dateTimePattern;
+    }
+
     public CsvReaderSettings copy() {
         try {
             final String json = OBJECT_MAPPER.writeValueAsString(this);
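The `ISO-8601` sentinel introduced above is what the CSV-reader transformer's `createDateParser` keys on: that exact value selects `DateTimeFormatter.ISO_OFFSET_DATE_TIME`, and anything else is compiled with `DateTimeFormatter.ofPattern`. A small sketch of both branches (sample values invented):

```java
import java.time.Instant;
import java.time.OffsetDateTime;
import java.time.format.DateTimeFormatter;

public class DatePatternSketch {
    public static void main(final String[] args) {
        // Sentinel value: use the built-in ISO offset parser.
        final DateTimeFormatter iso = DateTimeFormatter.ISO_OFFSET_DATE_TIME;
        System.out.println(Instant.from(iso.parse("2021-09-01T10:15:30+02:00")));

        // Any other string: treat it as a DateTimeFormatter pattern.
        final DateTimeFormatter custom = DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss X");
        System.out.println(OffsetDateTime.parse("2021-09-01 10:15:30 +02", custom).toInstant());
    }
}
```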
@@ -299,6 +331,12 @@ public final class CsvReaderSettings {
             builder.append(separator);
             builder.append(", ");
         }
+        if (quoteCharacter != null) {
+            builder.append("\nquoteCharacter=");
+            builder.append(quoteCharacter);
+        } else {
+            builder.append("\nno quotes");
+        }
         if (columnDefinitions != null) {
             builder.append("\ncolumnDefinitions=");
             builder.append(columnDefinitions);
@@ -2,132 +2,23 @@ package org.lucares.pdbui;
 
 import java.io.IOException;
 import java.io.InputStream;
-import java.nio.charset.StandardCharsets;
-import java.util.ArrayList;
-import java.util.EnumSet;
-import java.util.List;
-import java.util.concurrent.ArrayBlockingQueue;
-import java.util.concurrent.TimeUnit;
-import java.util.function.Function;
 
-import org.lucares.collections.IntList;
 import org.lucares.pdb.api.Tags;
 import org.lucares.pdb.api.TagsBuilder;
-import org.lucares.pdb.datastore.Entries;
-import org.lucares.pdb.datastore.Entry;
 import org.lucares.pdb.datastore.RuntimeTimeoutException;
-import org.lucares.pdbui.CsvReaderSettings.ColumnDefinitions;
-import org.lucares.pdbui.CsvReaderSettings.PostProcessors;
-import org.lucares.pdbui.date.FastISODateParser;
-import org.lucares.utils.CollectionUtils;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
 
-class CsvToEntryTransformer {
-    private static final Logger LOGGER = LoggerFactory.getLogger(CsvToEntryTransformer.class);
+public interface CsvToEntryTransformer {
 
     /**
      * Column header names starting with "-" will be ignored.
      */
-    static final String COLUM_IGNORE_PREFIX = "-";
+    public static final String COLUM_IGNORE_PREFIX = "-";
 
     static final int IGNORE_COLUMN = 0;
-    private final ArrayBlockingQueue<Entries> queue;
-    private final CsvReaderSettings settings;
-    private int[] compressedHeaders;
-    private List<Function<String, String>> postProcessersForColumns;
-
-    public CsvToEntryTransformer(final ArrayBlockingQueue<Entries> queue, final CsvReaderSettings settings) {
-        this.queue = queue;
-        this.settings = settings;
-    }
-
-    void readCSV(final InputStream in) throws IOException, InterruptedException, RuntimeTimeoutException {
-        final int chunksize = 1000;
-        Entries entries = new Entries(chunksize);
-
-        final byte newline = '\n';
-        final byte separator = settings.separatorByte();
-        final byte comment = settings.commentByte();
-        final byte[] line = new byte[64 * 1024]; // max line length
-        int offsetInLine = 0;
-        int offsetInBuffer = 0;
-        final IntList separatorPositions = new IntList();
-
-        int read = 0;
-        int bytesInLine = 0;
-        int lineCounter = 0;
-
-        final byte[] buffer = new byte[4096 * 16];
-        final int keyTimestamp = Tags.STRING_COMPRESSOR.put(settings.getTimeColumn());
-        final int keyDuration = Tags.STRING_COMPRESSOR.put(settings.getValueColumn());
-        final FastISODateParser dateParser = new FastISODateParser();
-
-        Tags additionalTags = initAdditionalTags();
-
-        while ((read = in.read(buffer)) >= 0) {
-            offsetInBuffer = 0;
-
-            for (int i = 0; i < read; i++) {
-                if (buffer[i] == newline) {
-                    lineCounter++;
-                    final int length = i - offsetInBuffer;
-                    System.arraycopy(buffer, offsetInBuffer, line, offsetInLine, length);
-                    bytesInLine = offsetInLine + length;
-                    separatorPositions.add(offsetInLine + i - offsetInBuffer);
-
-                    if (line[0] == comment) {
-                        if (lineCounter == 1) {
-                            final String lineAsString = new String(line, offsetInBuffer, length,
-                                    StandardCharsets.UTF_8);
-                            final Tags firstLineTags = TagMatchExtractor.extractTags(lineAsString,
-                                    settings.getFirstLineMatcher());
-                            additionalTags = additionalTags.add(firstLineTags);
-                        } else {
-                            // ignore
-                        }
-                    } else if (compressedHeaders != null) {
-
-                        final Entry entry = handleCsvLine(line, bytesInLine, separatorPositions, keyTimestamp,
-                                keyDuration, dateParser, additionalTags);
-                        if (entry != null) {
-                            entries.add(entry);
-                        }
-                        if (entries.size() >= chunksize) {
-                            queue.put(entries);
-                            entries = new Entries(chunksize);
-                        }
-                    } else {
-                        handleCsvHeaderLine(line, bytesInLine, separatorPositions);
-                    }
-
-                    offsetInBuffer = i + 1;
-                    offsetInLine = 0;
-                    bytesInLine = 0;
-                    separatorPositions.clear();
-                } else if (buffer[i] == separator) {
-                    separatorPositions.add(offsetInLine + i - offsetInBuffer);
-                }
-            }
-            if (offsetInBuffer < read) {
-                final int length = read - offsetInBuffer;
-                System.arraycopy(buffer, offsetInBuffer, line, offsetInLine, length);
-                bytesInLine = offsetInLine + length;
-                offsetInLine += length;
-                offsetInBuffer = 0;
-
-            }
-        }
-        final Entry entry = handleCsvLine(line, bytesInLine, separatorPositions, keyTimestamp, keyDuration, dateParser,
-                additionalTags);
-        if (entry != null) {
-            entries.add(entry);
-        }
-        entries.forceFlush();
-        queue.put(entries);
-        entries.waitUntilFlushed(5, TimeUnit.MINUTES);
-    }
-
-    private Tags initAdditionalTags() {
+
+    void readCSV(InputStream in) throws IOException, InterruptedException, RuntimeTimeoutException;
+
+    default Tags initAdditionalTags(final CsvReaderSettings settings) {
         final TagsBuilder tags = new TagsBuilder();
         for (final java.util.Map.Entry<String, String> entry : settings.getAdditionalTags().entrySet()) {
             final int field = Tags.STRING_COMPRESSOR.put(entry.getKey());
@@ -136,97 +27,4 @@ class CsvToEntryTransformer {
         }
         return tags.build();
     }
-
-    private void handleCsvHeaderLine(final byte[] line, final int bytesInLine, final IntList separatorPositions) {
-
-        final int[] columns = new int[separatorPositions.size()];
-        postProcessersForColumns = new ArrayList<>();
-        CollectionUtils.addNCopies(postProcessersForColumns, separatorPositions.size(), Function.identity());
-
-        int lastSeparatorPosition = -1;
-        final int size = separatorPositions.size();
-        for (int i = 0; i < size; i++) {
-            final int separatorPosition = separatorPositions.get(i);
-
-            final String columnName = new String(line, lastSeparatorPosition + 1,
-                    separatorPosition - lastSeparatorPosition - 1, StandardCharsets.UTF_8);
-
-            if (ignoreColum(columnName)) {
-                columns[i] = IGNORE_COLUMN;
-            } else {
-
-                final String renameTo = settings.getColumnDefinitions().getRenameTo(columnName);
-                final String renamedColumn = renameTo != null ? renameTo : columnName;
-                columns[i] = Tags.STRING_COMPRESSOR.put(renamedColumn);
-                final EnumSet<PostProcessors> postProcessors = settings.getColumnDefinitions()
-                        .getPostProcessors(columnName);
-                final Function<String, String> postProcessFunction = PostProcessors.toFunction(postProcessors);
-                postProcessersForColumns.set(i, postProcessFunction);
-            }
-
-            lastSeparatorPosition = separatorPosition;
-        }
-        compressedHeaders = columns;
-    }
-
-    private boolean ignoreColum(final String columnName) {
-        final ColumnDefinitions columnDefinitions = settings.getColumnDefinitions();
-        return columnDefinitions.isIgnoredColumn(columnName) || columnName.startsWith(COLUM_IGNORE_PREFIX);
-    }
-
-    private Entry handleCsvLine(final byte[] line, final int bytesInLine, final IntList separatorPositions,
-            final int keyTimestamp, final int keyDuration, final FastISODateParser dateParser,
-            final Tags additionalTags) {
-        try {
-            final int[] columns = compressedHeaders;
-            if (separatorPositions.size() != columns.length) {
-                return null;
-            }
-            final TagsBuilder tagsBuilder = new TagsBuilder(additionalTags);
-            int lastSeparatorPosition = -1;
-            final int size = separatorPositions.size();
-            long epochMilli = -1;
-            long duration = -1;
-            for (int i = 0; i < size; i++) {
-                final int separatorPosition = separatorPositions.get(i);
-                final int key = columns[i];
-
-                if (key == IGNORE_COLUMN) {
-                    // this column's value will not be ingested
-                } else if (key == keyTimestamp) {
-                    epochMilli = dateParser.parseAsEpochMilli(line, lastSeparatorPosition + 1);
-                } else if (key == keyDuration) {
-                    duration = parseLong(line, lastSeparatorPosition + 1, separatorPosition);
-                } else if (lastSeparatorPosition + 1 < separatorPosition) { // value is not empty
-                    final Function<String, String> postProcess = postProcessersForColumns.get(i);
-                    final int value = Tags.STRING_COMPRESSOR.put(line, lastSeparatorPosition + 1, separatorPosition,
-                            postProcess);
-
-                    tagsBuilder.add(key, value);
-                }
-                lastSeparatorPosition = separatorPosition;
-            }
-            final Tags tags = tagsBuilder.build();
-            return new Entry(epochMilli, duration, tags);
-        } catch (final RuntimeException e) {
-            LOGGER.debug("ignoring invalid line '" + new String(line, 0, bytesInLine, StandardCharsets.UTF_8) + "'", e);
-        }
-        return null;
-    }
-
-    private static long parseLong(final byte[] bytes, final int start, final int endExclusive) {
-        long result = 0;
-        int i = start;
-        int c = bytes[i];
-        int sign = 1;
-        if (c == '-') {
-            sign = -1;
-            i++;
-        }
-        while (i < endExclusive && (c = bytes[i]) >= 48 && c <= 57) {
-            result = result * 10 + (c - 48);
-            i++;
-        }
-        return sign * result;
-    }
 }
@@ -0,0 +1,20 @@
+package org.lucares.pdbui;
+
+import java.util.Objects;
+import java.util.concurrent.ArrayBlockingQueue;
+
+import org.lucares.pdb.datastore.Entries;
+
+public class CsvToEntryTransformerFactory {
+
+    public static CsvToEntryTransformer createCsvToEntryTransformer(final ArrayBlockingQueue<Entries> queue,
+            final CsvReaderSettings settings) {
+
+        if (settings.getQuoteCharacter() == null
+                && Objects.equals(settings.getDateTimePattern(), CsvReaderSettings.ISO_8601)) {
+            return new NoCopyCsvToEntryTransformer(queue, settings);
+        } else {
+            return new CsvReaderCsvToEntryTransformer(queue, settings);
+        }
+    }
+}
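So the factory keeps the hand-rolled byte-level parser on the fast path and falls back to the Commons CSV based one only when quoting or a non-ISO date pattern is configured. Hypothetical wiring (queue capacity and the `in` stream are assumptions, not from the commit):

```java
// Any quote character (or a custom date pattern) rules out the no-copy parser.
final ArrayBlockingQueue<Entries> queue = new ArrayBlockingQueue<>(16);
final CsvReaderSettings settings = CsvReaderSettings.create("@timestamp", "duration", ",",
        new ColumnDefinitions());
settings.setQuoteCharacter('"');
final CsvToEntryTransformer transformer = CsvToEntryTransformerFactory.createCsvToEntryTransformer(queue, settings);
transformer.readCSV(in); // dispatches to CsvReaderCsvToEntryTransformer here
```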
@@ -48,7 +48,7 @@ public class CsvUploadHandler implements PropertyKeys, DisposableBean {
         // improved the
         // ingestion performance fom 1.1m to 1.55m values per second on average
         synchronized (this) {
-            final CsvToEntryTransformer csvToEntryTransformer = new CsvToEntryTransformer(queue, settings);
+            final NoCopyCsvToEntryTransformer csvToEntryTransformer = new NoCopyCsvToEntryTransformer(queue, settings);
             try (InputStream in = file.getInputStream()) {
                 csvToEntryTransformer.readCSV(in);
             } catch (final Exception e) {
@@ -53,7 +53,7 @@ public class FileDropZipHandler implements FileDropFileTypeHandler {
 
         final CsvReaderSettings csvReaderSettings = csvSettings.get();
 
-        final CsvToEntryTransformer csvToEntryTransformer = new CsvToEntryTransformer(queue, csvReaderSettings);
+        final NoCopyCsvToEntryTransformer csvToEntryTransformer = new NoCopyCsvToEntryTransformer(queue, csvReaderSettings);
         try (final InputStream inputStream = new BufferedInputStream(zipFile.getInputStream(entry),
                 1024 * 1024)) {
             csvToEntryTransformer.readCSV(inputStream);
@@ -64,7 +64,7 @@ public final class IngestionHandler implements Callable<Void> {
             handleInputStream(gzip);
         } else {
             in.reset();
-            final CsvToEntryTransformer csvTransformer = new CsvToEntryTransformer(queue,
+            final NoCopyCsvToEntryTransformer csvTransformer = new NoCopyCsvToEntryTransformer(queue,
                     CsvReaderSettings.create("@timestamp", "duration", ",", new ColumnDefinitions()));
             csvTransformer.readCSV(in);
         }
@@ -0,0 +1,218 @@
+package org.lucares.pdbui;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.nio.charset.StandardCharsets;
+import java.util.ArrayList;
+import java.util.EnumSet;
+import java.util.List;
+import java.util.concurrent.ArrayBlockingQueue;
+import java.util.concurrent.TimeUnit;
+import java.util.function.Function;
+
+import org.lucares.collections.IntList;
+import org.lucares.pdb.api.Tags;
+import org.lucares.pdb.api.TagsBuilder;
+import org.lucares.pdb.datastore.Entries;
+import org.lucares.pdb.datastore.Entry;
+import org.lucares.pdb.datastore.RuntimeTimeoutException;
+import org.lucares.pdbui.CsvReaderSettings.ColumnDefinitions;
+import org.lucares.pdbui.CsvReaderSettings.PostProcessors;
+import org.lucares.pdbui.date.FastISODateParser;
+import org.lucares.utils.CollectionUtils;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+class NoCopyCsvToEntryTransformer implements CsvToEntryTransformer {
+    private static final Logger LOGGER = LoggerFactory.getLogger(NoCopyCsvToEntryTransformer.class);
+
+    private final ArrayBlockingQueue<Entries> queue;
+    private final CsvReaderSettings settings;
+    private int[] compressedHeaders;
+    private List<Function<String, String>> postProcessersForColumns;
+
+    public NoCopyCsvToEntryTransformer(final ArrayBlockingQueue<Entries> queue, final CsvReaderSettings settings) {
+        this.queue = queue;
+        this.settings = settings;
+    }
+
+    @Override
+    public void readCSV(final InputStream in) throws IOException, InterruptedException, RuntimeTimeoutException {
+        final int chunksize = 1000;
+        Entries entries = new Entries(chunksize);
+
+        final byte newline = '\n';
+        final byte separator = settings.separatorByte();
+        final byte comment = settings.commentByte();
+        final byte[] line = new byte[64 * 1024]; // max line length
+        int offsetInLine = 0;
+        int offsetInBuffer = 0;
+        final IntList separatorPositions = new IntList();
+
+        int read = 0;
+        int bytesInLine = 0;
+        int lineCounter = 0;
+
+        final byte[] buffer = new byte[4096 * 16];
+        final int keyTimestamp = Tags.STRING_COMPRESSOR.put(settings.getTimeColumn());
+        final int keyDuration = Tags.STRING_COMPRESSOR.put(settings.getValueColumn());
+        final FastISODateParser dateParser = new FastISODateParser();
+
+        Tags additionalTags = initAdditionalTags(settings);
+
+        while ((read = in.read(buffer)) >= 0) {
+            offsetInBuffer = 0;
+
+            for (int i = 0; i < read; i++) {
+                if (buffer[i] == newline) {
+                    lineCounter++;
+                    final int length = i - offsetInBuffer;
+                    System.arraycopy(buffer, offsetInBuffer, line, offsetInLine, length);
+                    bytesInLine = offsetInLine + length;
+                    separatorPositions.add(offsetInLine + i - offsetInBuffer);
+
+                    if (line[0] == comment) {
+                        if (lineCounter == 1) {
+                            final String lineAsString = new String(line, offsetInBuffer, length,
+                                    StandardCharsets.UTF_8);
+                            final Tags firstLineTags = TagMatchExtractor.extractTags(lineAsString,
+                                    settings.getFirstLineMatcher());
+                            additionalTags = additionalTags.add(firstLineTags);
+                        } else {
+                            // ignore
+                        }
+                    } else if (compressedHeaders != null) {
+
+                        final Entry entry = handleCsvLine(line, bytesInLine, separatorPositions, keyTimestamp,
+                                keyDuration, dateParser, additionalTags);
+                        if (entry != null) {
+                            entries.add(entry);
+                        }
+                        if (entries.size() >= chunksize) {
+                            queue.put(entries);
+                            entries = new Entries(chunksize);
+                        }
+                    } else {
+                        handleCsvHeaderLine(line, bytesInLine, separatorPositions);
+                    }
+
+                    offsetInBuffer = i + 1;
+                    offsetInLine = 0;
+                    bytesInLine = 0;
+                    separatorPositions.clear();
+                } else if (buffer[i] == separator) {
+                    separatorPositions.add(offsetInLine + i - offsetInBuffer);
+                }
+            }
+            if (offsetInBuffer < read) {
+                final int length = read - offsetInBuffer;
+                System.arraycopy(buffer, offsetInBuffer, line, offsetInLine, length);
+                bytesInLine = offsetInLine + length;
+                offsetInLine += length;
+                offsetInBuffer = 0;
+
+            }
+        }
+        final Entry entry = handleCsvLine(line, bytesInLine, separatorPositions, keyTimestamp, keyDuration, dateParser,
+                additionalTags);
+        if (entry != null) {
+            entries.add(entry);
+        }
+        entries.forceFlush();
+        queue.put(entries);
+        entries.waitUntilFlushed(5, TimeUnit.MINUTES);
+    }
+
+    private void handleCsvHeaderLine(final byte[] line, final int bytesInLine, final IntList separatorPositions) {
+
+        final int[] columns = new int[separatorPositions.size()];
+        postProcessersForColumns = new ArrayList<>();
+        CollectionUtils.addNCopies(postProcessersForColumns, separatorPositions.size(), Function.identity());
+
+        int lastSeparatorPosition = -1;
+        final int size = separatorPositions.size();
+        for (int i = 0; i < size; i++) {
+            final int separatorPosition = separatorPositions.get(i);
+
+            final String columnName = new String(line, lastSeparatorPosition + 1,
+                    separatorPosition - lastSeparatorPosition - 1, StandardCharsets.UTF_8);
+
+            if (ignoreColum(columnName)) {
+                columns[i] = IGNORE_COLUMN;
+            } else {
+
+                final String renameTo = settings.getColumnDefinitions().getRenameTo(columnName);
+                final String renamedColumn = renameTo != null ? renameTo : columnName;
+                columns[i] = Tags.STRING_COMPRESSOR.put(renamedColumn);
+                final EnumSet<PostProcessors> postProcessors = settings.getColumnDefinitions()
+                        .getPostProcessors(columnName);
+                final Function<String, String> postProcessFunction = PostProcessors.toFunction(postProcessors);
+                postProcessersForColumns.set(i, postProcessFunction);
+            }
+
+            lastSeparatorPosition = separatorPosition;
+        }
+        compressedHeaders = columns;
+    }
+
+    private boolean ignoreColum(final String columnName) {
+        final ColumnDefinitions columnDefinitions = settings.getColumnDefinitions();
+        return columnDefinitions.isIgnoredColumn(columnName) || columnName.startsWith(COLUM_IGNORE_PREFIX);
+    }
+
+    private Entry handleCsvLine(final byte[] line, final int bytesInLine, final IntList separatorPositions,
+            final int keyTimestamp, final int keyDuration, final FastISODateParser dateParser,
+            final Tags additionalTags) {
+        try {
+            final int[] columns = compressedHeaders;
+            if (separatorPositions.size() != columns.length) {
+                return null;
+            }
+            final TagsBuilder tagsBuilder = new TagsBuilder(additionalTags);
+            int lastSeparatorPosition = -1;
+            final int size = separatorPositions.size();
+            long epochMilli = -1;
+            long duration = -1;
+            for (int i = 0; i < size; i++) {
+                final int separatorPosition = separatorPositions.get(i);
+                final int key = columns[i];
+
+                if (key == IGNORE_COLUMN) {
+                    // this column's value will not be ingested
+                } else if (key == keyTimestamp) {
+                    epochMilli = dateParser.parseAsEpochMilli(line, lastSeparatorPosition + 1);
+                } else if (key == keyDuration) {
+                    duration = parseLong(line, lastSeparatorPosition + 1, separatorPosition);
+                } else if (lastSeparatorPosition + 1 < separatorPosition) { // value is not empty
+                    final Function<String, String> postProcess = postProcessersForColumns.get(i);
+                    final int value = Tags.STRING_COMPRESSOR.put(line, lastSeparatorPosition + 1, separatorPosition,
+                            postProcess);
+
+                    tagsBuilder.add(key, value);
+                }
+                lastSeparatorPosition = separatorPosition;
+            }
+            final Tags tags = tagsBuilder.build();
+            return new Entry(epochMilli, duration, tags);
+        } catch (final RuntimeException e) {
+            LOGGER.debug("ignoring invalid line '" + new String(line, 0, bytesInLine, StandardCharsets.UTF_8) + "'", e);
+        }
+        return null;
+    }
+
+    private static long parseLong(final byte[] bytes, final int start, final int endExclusive) {
+        long result = 0;
+        int i = start;
+        int c = bytes[i];
+        int sign = 1;
+        if (c == '-') {
+            sign = -1;
+            i++;
+        }
+        while (i < endExclusive && (c = bytes[i]) >= 48 && c <= 57) {
+            result = result * 10 + (c - 48);
+            i++;
+        }
+        return sign * result;
+    }
+}
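The private `parseLong` above exists so numeric fields never allocate a `String`; on a plain ASCII slice it agrees with `Long.parseLong`. A quick standalone check (the digit loop is copied verbatim for the comparison; the sample line is invented):

```java
import java.nio.charset.StandardCharsets;

public class ParseLongCheck {
    // Same digit loop as NoCopyCsvToEntryTransformer.parseLong, duplicated here for a standalone check.
    static long parseLong(final byte[] bytes, final int start, final int endExclusive) {
        long result = 0;
        int i = start;
        int c = bytes[i];
        int sign = 1;
        if (c == '-') {
            sign = -1;
            i++;
        }
        while (i < endExclusive && (c = bytes[i]) >= 48 && c <= 57) {
            result = result * 10 + (c - 48);
            i++;
        }
        return sign * result;
    }

    public static void main(final String[] args) {
        final byte[] line = "a,-1234,b".getBytes(StandardCharsets.UTF_8);
        // Field "-1234" sits between the separators at indices 1 and 7.
        System.out.println(parseLong(line, 2, 7));                                           // -1234
        System.out.println(Long.parseLong(new String(line, 2, 5, StandardCharsets.UTF_8))); // -1234
    }
}
```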
@@ -0,0 +1,76 @@
+package org.lucares.pdbui;
+
+import java.io.ByteArrayInputStream;
+import java.io.IOException;
+import java.nio.charset.StandardCharsets;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.time.OffsetDateTime;
+import java.time.format.DateTimeFormatter;
+import java.time.temporal.ChronoUnit;
+import java.util.concurrent.ArrayBlockingQueue;
+
+import org.junit.jupiter.api.AfterEach;
+import org.junit.jupiter.api.Assertions;
+import org.junit.jupiter.api.BeforeEach;
+import org.junit.jupiter.api.Test;
+import org.lucares.collections.LongList;
+import org.lucares.pdb.api.DateTimeRange;
+import org.lucares.pdb.api.Query;
+import org.lucares.pdb.datastore.Entries;
+import org.lucares.pdbui.CsvReaderSettings.ColumnDefinitions;
+import org.lucares.performance.db.PerformanceDb;
+import org.lucares.utils.file.FileUtils;
+
+public class CsvReaderCsvToEntryTransformerTest {
+
+    private Path dataDirectory;
+
+    @BeforeEach
+    public void beforeMethod() throws IOException {
+        dataDirectory = Files.createTempDirectory("pdb");
+    }
+
+    @AfterEach
+    public void afterMethod() throws IOException {
+        FileUtils.delete(dataDirectory);
+    }
+
+    @Test
+    public void test() throws Exception {
+
+        final OffsetDateTime dateA = OffsetDateTime.now();
+        final OffsetDateTime dateB = OffsetDateTime.now();
+
+        try (final PerformanceDb db = new PerformanceDb(dataDirectory)) {
+
+            final String csv = "#comment line\n"//
+                    + "@timestamp,duration,tag,ignored\n"//
+                    + dateA.format(DateTimeFormatter.ISO_ZONED_DATE_TIME) + ",1,\"tagValue\",ignored\n"//
+                    + dateB.format(DateTimeFormatter.ISO_ZONED_DATE_TIME) + ",2,\"tagValue\",ignored\n";
+
+            final ArrayBlockingQueue<Entries> queue = db.getQueue();
+            final ColumnDefinitions columnDefinitions = new ColumnDefinitions();
+            columnDefinitions.ignoreColumn("ignored");
+
+            final CsvReaderSettings settings = CsvReaderSettings.create("@timestamp", "duration", ",",
+                    columnDefinitions);
+
+            final CsvReaderCsvToEntryTransformer transformer = new CsvReaderCsvToEntryTransformer(queue, settings);
+            transformer.readCSV(new ByteArrayInputStream(csv.getBytes(StandardCharsets.UTF_8)));
+            queue.put(Entries.POISON);
+        }
+
+        try (PerformanceDb db = new PerformanceDb(dataDirectory)) {
+            final LongList result = db.get(new Query("tag=tagValue", DateTimeRange.max())).singleGroup().flatMap();
+            Assertions.assertEquals(result.size(), 4);
+
+            Assertions.assertEquals(result.get(0), dateA.toInstant().toEpochMilli());
+            Assertions.assertEquals(result.get(1), 1);
+
+            Assertions.assertEquals(result.get(2), dateB.toInstant().truncatedTo(ChronoUnit.MILLIS).toEpochMilli());
+            Assertions.assertEquals(result.get(3), 2);
+        }
+
+    }
+}
@@ -24,7 +24,7 @@ import org.lucares.pdbui.CsvReaderSettings.ColumnDefinitions;
 import org.lucares.performance.db.PerformanceDb;
 import org.lucares.utils.file.FileUtils;
 
-public class CsvToEntryTransformerTest {
+public class NoCopyCsvToEntryTransformerTest {
 
     private Path dataDirectory;
 
@@ -52,7 +52,7 @@ public class CsvToEntryTransformerTest {
         final ArrayBlockingQueue<Entries> queue = db.getQueue();
         final CsvReaderSettings settings = CsvReaderSettings.create("@timestamp", "duration", ",",
                 new ColumnDefinitions());
-        final CsvToEntryTransformer csvToEntryTransformer = new CsvToEntryTransformer(queue, settings);
+        final NoCopyCsvToEntryTransformer csvToEntryTransformer = new NoCopyCsvToEntryTransformer(queue, settings);
         csvToEntryTransformer.readCSV(new ByteArrayInputStream(csv.getBytes(StandardCharsets.UTF_8)));
         queue.put(Entries.POISON);
     }
@@ -94,7 +94,7 @@ public class CsvToEntryTransformerTest {
         columnDefinitions.ignoreColumn("ignoredColumn");
         final CsvReaderSettings settings = CsvReaderSettings.create("@timestamp", "duration", ",",
                 columnDefinitions);
-        final CsvToEntryTransformer csvToEntryTransformer = new CsvToEntryTransformer(queue, settings);
+        final NoCopyCsvToEntryTransformer csvToEntryTransformer = new NoCopyCsvToEntryTransformer(queue, settings);
         csvToEntryTransformer.readCSV(new ByteArrayInputStream(csv.getBytes(StandardCharsets.UTF_8)));
         queue.put(Entries.POISON);
     }
@@ -200,7 +200,7 @@ public class TcpIngestorTest {
                 Instant.ofEpochMilli(1).atOffset(ZoneOffset.UTC).format(DateTimeFormatter.ISO_ZONED_DATE_TIME));
         entry.put("duration", 1);
         entry.put("host", "someHost");
-        entry.put(CsvToEntryTransformer.COLUM_IGNORE_PREFIX + "ignored", "ignoredValue");
+        entry.put(NoCopyCsvToEntryTransformer.COLUM_IGNORE_PREFIX + "ignored", "ignoredValue");
 
         PdbTestUtil.sendAsCsv(ingestor.getPort(), entry);
     } catch (final Exception e) {