add CsvReaderSettings

Preparation for adding more complex CSV parsing rules.
This commit is contained in:
2019-11-30 18:32:34 +01:00
parent 08b1be5334
commit ffe5ae8652
4 changed files with 37 additions and 6 deletions

View File

@@ -119,6 +119,7 @@ public class UniqueStringIntegerPairs {
String line; String line;
while ((line = reader.readLine()) != null) { while ((line = reader.readLine()) != null) {
// TODO use more efficient code to read the CSV -> improves startup time
final String[] tokens = line.split(Pattern.quote(SEPARATOR)); final String[] tokens = line.split(Pattern.quote(SEPARATOR));
if (tokens.length == 2) { if (tokens.length == 2) {

View File

@@ -0,0 +1,23 @@
package org.lucares.pdbui;
import org.lucares.utils.Preconditions;
/**
 * Settings that control how CSV input is read. At present the only setting is
 * the column separator, which is stored as a single byte.
 */
public class CsvReaderSettings {

    /** Message passed to the precondition check when a char separator does not fit into a byte. */
    private static final String UNSUPPORTED_SEPARATOR_MESSAGE = "Only separators that fulfill separator == (byte)separator are supported. "
            + "This restriction is because the parsing algorithm skips the overhead of "
            + "translating bytes to characters.";

    private final byte separator;

    /**
     * Creates settings with the given separator byte. No validation is applied.
     *
     * @param separator the byte that separates columns
     */
    public CsvReaderSettings(final byte separator) {
        this.separator = separator;
    }

    /**
     * Creates settings from a separator character. The character must be
     * unchanged by a cast to {@code byte}, which holds for the char values
     * 0 to 127; any other character fails the precondition check.
     *
     * @param separator the character that separates columns
     */
    public CsvReaderSettings(final char separator) {
        this(narrowToByte(separator));
    }

    /**
     * Converts the character to a byte after checking that the cast is lossless.
     */
    private static byte narrowToByte(final char candidate) {
        final byte narrowed = (byte) candidate;
        // Both operands are promoted to int, so this only holds when the cast kept the value.
        Preconditions.checkTrue(candidate == narrowed, UNSUPPORTED_SEPARATOR_MESSAGE);
        return narrowed;
    }

    /**
     * @return the separator as a byte
     */
    public byte getSeparator() {
        return separator;
    }
}

View File

@@ -17,14 +17,21 @@ class CsvToEntryTransformer {
* Column header names starting with "-" will be ignored. * Column header names starting with "-" will be ignored.
*/ */
static final String COLUM_IGNORE_PREFIX = "-"; static final String COLUM_IGNORE_PREFIX = "-";
private static final int IGNORE_COLUMN = 0; static final int IGNORE_COLUMN = 0;
private final ArrayBlockingQueue<Entries> queue;
private final CsvReaderSettings settings;
void readCSV(final InputStream in, final ArrayBlockingQueue<Entries> queue) public CsvToEntryTransformer(final ArrayBlockingQueue<Entries> queue, final CsvReaderSettings settings) {
throws IOException, InterruptedException { this.queue = queue;
this.settings = settings;
}
void readCSV(final InputStream in) throws IOException, InterruptedException {
final int chunksize = 1000; final int chunksize = 1000;
Entries entries = new Entries(chunksize); Entries entries = new Entries(chunksize);
final byte newline = '\n'; final byte newline = '\n';
final byte separator = settings.getSeparator();
final byte[] line = new byte[64 * 1024]; // max line length final byte[] line = new byte[64 * 1024]; // max line length
int offsetInLine = 0; int offsetInLine = 0;
int offsetInBuffer = 0; int offsetInBuffer = 0;
@@ -68,7 +75,7 @@ class CsvToEntryTransformer {
offsetInLine = 0; offsetInLine = 0;
bytesInLine = 0; bytesInLine = 0;
separatorPositions.clear(); separatorPositions.clear();
} else if (buffer[i] == ',') { } else if (buffer[i] == separator) {
separatorPositions.add(offsetInLine + i - offsetInBuffer); separatorPositions.add(offsetInLine + i - offsetInBuffer);
} }
} }

View File

@@ -65,8 +65,8 @@ public final class IngestionHandler implements Callable<Void> {
handleInputStream(gzip); handleInputStream(gzip);
} else { } else {
in.reset(); in.reset();
final CsvToEntryTransformer csvTransformer = new CsvToEntryTransformer(); final CsvToEntryTransformer csvTransformer = new CsvToEntryTransformer(queue, new CsvReaderSettings(','));
csvTransformer.readCSV(in, queue); csvTransformer.readCSV(in);
} }
} }