Add CsvReaderSettings
Preparation for adding more complex CSV parsing rules.
This commit is contained in:
@@ -119,6 +119,7 @@ public class UniqueStringIntegerPairs {
|
|||||||
String line;
|
String line;
|
||||||
while ((line = reader.readLine()) != null) {
|
while ((line = reader.readLine()) != null) {
|
||||||
|
|
||||||
|
// TODO use more efficient code to read the CSV -> improves startup time
|
||||||
final String[] tokens = line.split(Pattern.quote(SEPARATOR));
|
final String[] tokens = line.split(Pattern.quote(SEPARATOR));
|
||||||
|
|
||||||
if (tokens.length == 2) {
|
if (tokens.length == 2) {
|
||||||
|
|||||||
@@ -0,0 +1,23 @@
|
|||||||
|
package org.lucares.pdbui;
|
||||||
|
|
||||||
|
import org.lucares.utils.Preconditions;
|
||||||
|
|
||||||
|
public class CsvReaderSettings {
|
||||||
|
private final byte separator;
|
||||||
|
|
||||||
|
public CsvReaderSettings(final byte separator) {
|
||||||
|
this.separator = separator;
|
||||||
|
}
|
||||||
|
|
||||||
|
public CsvReaderSettings(final char separator) {
|
||||||
|
Preconditions.checkTrue(separator == (byte) separator,
|
||||||
|
"Only separators that fulfill separator == (byte)separator are supported. "
|
||||||
|
+ "This restriction is because the parsing algorithm skips the overhead of "
|
||||||
|
+ "translating bytes to characters.");
|
||||||
|
this.separator = (byte) separator;
|
||||||
|
}
|
||||||
|
|
||||||
|
public byte getSeparator() {
|
||||||
|
return separator;
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -17,14 +17,21 @@ class CsvToEntryTransformer {
|
|||||||
* Column header names starting with "-" will be ignored.
|
* Column header names starting with "-" will be ignored.
|
||||||
*/
|
*/
|
||||||
static final String COLUM_IGNORE_PREFIX = "-";
|
static final String COLUM_IGNORE_PREFIX = "-";
|
||||||
private static final int IGNORE_COLUMN = 0;
|
static final int IGNORE_COLUMN = 0;
|
||||||
|
private final ArrayBlockingQueue<Entries> queue;
|
||||||
|
private final CsvReaderSettings settings;
|
||||||
|
|
||||||
void readCSV(final InputStream in, final ArrayBlockingQueue<Entries> queue)
|
public CsvToEntryTransformer(final ArrayBlockingQueue<Entries> queue, final CsvReaderSettings settings) {
|
||||||
throws IOException, InterruptedException {
|
this.queue = queue;
|
||||||
|
this.settings = settings;
|
||||||
|
}
|
||||||
|
|
||||||
|
void readCSV(final InputStream in) throws IOException, InterruptedException {
|
||||||
final int chunksize = 1000;
|
final int chunksize = 1000;
|
||||||
Entries entries = new Entries(chunksize);
|
Entries entries = new Entries(chunksize);
|
||||||
|
|
||||||
final byte newline = '\n';
|
final byte newline = '\n';
|
||||||
|
final byte separator = settings.getSeparator();
|
||||||
final byte[] line = new byte[64 * 1024]; // max line length
|
final byte[] line = new byte[64 * 1024]; // max line length
|
||||||
int offsetInLine = 0;
|
int offsetInLine = 0;
|
||||||
int offsetInBuffer = 0;
|
int offsetInBuffer = 0;
|
||||||
@@ -68,7 +75,7 @@ class CsvToEntryTransformer {
|
|||||||
offsetInLine = 0;
|
offsetInLine = 0;
|
||||||
bytesInLine = 0;
|
bytesInLine = 0;
|
||||||
separatorPositions.clear();
|
separatorPositions.clear();
|
||||||
} else if (buffer[i] == ',') {
|
} else if (buffer[i] == separator) {
|
||||||
separatorPositions.add(offsetInLine + i - offsetInBuffer);
|
separatorPositions.add(offsetInLine + i - offsetInBuffer);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -65,8 +65,8 @@ public final class IngestionHandler implements Callable<Void> {
|
|||||||
handleInputStream(gzip);
|
handleInputStream(gzip);
|
||||||
} else {
|
} else {
|
||||||
in.reset();
|
in.reset();
|
||||||
final CsvToEntryTransformer csvTransformer = new CsvToEntryTransformer();
|
final CsvToEntryTransformer csvTransformer = new CsvToEntryTransformer(queue, new CsvReaderSettings(','));
|
||||||
csvTransformer.readCSV(in, queue);
|
csvTransformer.readCSV(in);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user