From ffe5ae86524d907713a1262ecb345f67c891f01b Mon Sep 17 00:00:00 2001
From: Andreas Huber
Date: Sat, 30 Nov 2019 18:32:34 +0100
Subject: [PATCH] add CsvReaderSettings

Preparation to add more complex CSV parsing rules.
---
Review notes (commentary only; this text sits between the three-dash line
and the first "diff --git" header and is not part of the commit message):

* CsvReaderSettings is a new class holding one separator byte in a final
  field. Its char constructor calls Preconditions.checkTrue with the
  condition separator == (byte) separator before narrowing the value.
* CsvToEntryTransformer now takes the queue and the settings through a
  constructor; readCSV(InputStream) compares each buffer byte against
  settings.getSeparator() instead of the literal ','.
* IGNORE_COLUMN loses its private modifier (becomes package-private).
* IngestionHandler passes new CsvReaderSettings(','), so the separator it
  uses is still a comma.
* UniqueStringIntegerPairs only gains a TODO comment.
* NOTE(review): this copy was restored from a whitespace-collapsed version.
  Indentation width and the position of blank context lines were inferred
  from the @@ hunk line counts; verify against the repository before
  applying.
* NOTE(review): generic type arguments and the author's e-mail address
  appear to be missing from this copy (e.g. "ArrayBlockingQueue queue",
  "implements Callable"); presumably stripped during extraction, confirm
  against the repository.

 .../pdb/api/UniqueStringIntegerPairs.java     |  1 +
 .../org/lucares/pdbui/CsvReaderSettings.java  | 23 +++++++++++++++++++
 .../lucares/pdbui/CsvToEntryTransformer.java  | 15 ++++++++----
 .../org/lucares/pdbui/IngestionHandler.java   |  4 ++--
 4 files changed, 37 insertions(+), 6 deletions(-)
 create mode 100644 pdb-ui/src/main/java/org/lucares/pdbui/CsvReaderSettings.java

diff --git a/pdb-api/src/main/java/org/lucares/pdb/api/UniqueStringIntegerPairs.java b/pdb-api/src/main/java/org/lucares/pdb/api/UniqueStringIntegerPairs.java
index b6572b6..3d97f55 100644
--- a/pdb-api/src/main/java/org/lucares/pdb/api/UniqueStringIntegerPairs.java
+++ b/pdb-api/src/main/java/org/lucares/pdb/api/UniqueStringIntegerPairs.java
@@ -119,6 +119,7 @@ public class UniqueStringIntegerPairs {
 
             String line;
             while ((line = reader.readLine()) != null) {
+                // TODO use more efficient code to read the CSV -> improves startup time
                 final String[] tokens = line.split(Pattern.quote(SEPARATOR));
 
                 if (tokens.length == 2) {
diff --git a/pdb-ui/src/main/java/org/lucares/pdbui/CsvReaderSettings.java b/pdb-ui/src/main/java/org/lucares/pdbui/CsvReaderSettings.java
new file mode 100644
index 0000000..89238bd
--- /dev/null
+++ b/pdb-ui/src/main/java/org/lucares/pdbui/CsvReaderSettings.java
@@ -0,0 +1,23 @@
+package org.lucares.pdbui;
+
+import org.lucares.utils.Preconditions;
+
+public class CsvReaderSettings {
+    private final byte separator;
+
+    public CsvReaderSettings(final byte separator) {
+        this.separator = separator;
+    }
+
+    public CsvReaderSettings(final char separator) {
+        Preconditions.checkTrue(separator == (byte) separator,
+                "Only separators that fulfill separator == (byte)separator are supported. "
+                        + "This restriction is because the parsing algorithm skips the overhead of "
+                        + "translating bytes to characters.");
+        this.separator = (byte) separator;
+    }
+
+    public byte getSeparator() {
+        return separator;
+    }
+}
diff --git a/pdb-ui/src/main/java/org/lucares/pdbui/CsvToEntryTransformer.java b/pdb-ui/src/main/java/org/lucares/pdbui/CsvToEntryTransformer.java
index 751331b..45f06bb 100644
--- a/pdb-ui/src/main/java/org/lucares/pdbui/CsvToEntryTransformer.java
+++ b/pdb-ui/src/main/java/org/lucares/pdbui/CsvToEntryTransformer.java
@@ -17,14 +17,21 @@ class CsvToEntryTransformer {
      * Column header names starting with "-" will be ignored.
      */
     static final String COLUM_IGNORE_PREFIX = "-";
-    private static final int IGNORE_COLUMN = 0;
+    static final int IGNORE_COLUMN = 0;
+    private final ArrayBlockingQueue queue;
+    private final CsvReaderSettings settings;
 
-    void readCSV(final InputStream in, final ArrayBlockingQueue queue)
-            throws IOException, InterruptedException {
+    public CsvToEntryTransformer(final ArrayBlockingQueue queue, final CsvReaderSettings settings) {
+        this.queue = queue;
+        this.settings = settings;
+    }
+
+    void readCSV(final InputStream in) throws IOException, InterruptedException {
         final int chunksize = 1000;
         Entries entries = new Entries(chunksize);
 
         final byte newline = '\n';
+        final byte separator = settings.getSeparator();
         final byte[] line = new byte[64 * 1024]; // max line length
         int offsetInLine = 0;
         int offsetInBuffer = 0;
@@ -68,7 +75,7 @@ class CsvToEntryTransformer {
                     offsetInLine = 0;
                     bytesInLine = 0;
                     separatorPositions.clear();
-                } else if (buffer[i] == ',') {
+                } else if (buffer[i] == separator) {
                     separatorPositions.add(offsetInLine + i - offsetInBuffer);
                 }
             }
diff --git a/pdb-ui/src/main/java/org/lucares/pdbui/IngestionHandler.java b/pdb-ui/src/main/java/org/lucares/pdbui/IngestionHandler.java
index e53d8b8..d03cda9 100644
--- a/pdb-ui/src/main/java/org/lucares/pdbui/IngestionHandler.java
+++ b/pdb-ui/src/main/java/org/lucares/pdbui/IngestionHandler.java
@@ -65,8 +65,8 @@ public final class IngestionHandler implements Callable {
             handleInputStream(gzip);
         } else {
             in.reset();
-            final CsvToEntryTransformer csvTransformer = new CsvToEntryTransformer();
-            csvTransformer.readCSV(in, queue);
+            final CsvToEntryTransformer csvTransformer = new CsvToEntryTransformer(queue, new CsvReaderSettings(','));
+            csvTransformer.readCSV(in);
         }
     }
 