performance improvement: ingest data directly from tmp file

This commit is contained in:
2020-11-24 10:04:21 +01:00
parent 11beda5432
commit 08111e0d69
3 changed files with 14 additions and 45 deletions

View File

@@ -1,18 +1,14 @@
package org.lucares.pdbui;
import java.io.FileInputStream;
import java.io.IOException;
import java.nio.file.Files;
import java.io.InputStream;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.List;
import java.util.UUID;
import java.util.concurrent.ArrayBlockingQueue;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.TimeoutException;
@@ -22,7 +18,6 @@ import org.lucares.utils.file.FileUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.beans.factory.DisposableBean;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.stereotype.Component;
import org.springframework.web.multipart.MultipartFile;
@@ -31,56 +26,32 @@ public class CsvUploadHandler implements PropertyKeys, DisposableBean {
private static final Logger LOGGER = LoggerFactory.getLogger(CsvUploadHandler.class);
private final Path tmpDir;
private final ExecutorService threadPool = Executors.newFixedThreadPool(2);
private final PerformanceDb performanceDb;
public CsvUploadHandler(@Value("${" + TMP_DIR + "}") final String tmpDir, final PerformanceDb performanceDb)
throws IOException {
this.tmpDir = Paths.get(tmpDir).resolve("uploads");
Files.createDirectories(this.tmpDir);
public CsvUploadHandler(final PerformanceDb performanceDb) throws IOException {
this.performanceDb = performanceDb;
}
public void ingest(final List<MultipartFile> files, final CsvReaderSettings settings,
final boolean waitUntilFinished)
public void ingest(final List<MultipartFile> files, final CsvReaderSettings settings)
throws IllegalStateException, IOException, InterruptedException, ExecutionException, TimeoutException {
final List<Path> tmpFiles = new ArrayList<Path>();
try {
for (final MultipartFile file : files) {
final Path tmpFile = tmpDir.resolve(UUID.randomUUID().toString());
tmpFiles.add(tmpFile);
LOGGER.debug("writing uploaded file to {}", tmpFile);
file.transferTo(tmpFile);
}
} catch (RuntimeException | IOException e) {
FileUtils.deleteSilently(tmpFiles);
throw e;
}
final Future<?> future = threadPool.submit(() -> {
final ArrayBlockingQueue<Entries> queue = performanceDb.getQueue();
for (final Path tmpFile : tmpFiles) {
try {
for (final MultipartFile file : files) {
final CsvToEntryTransformer csvToEntryTransformer = new CsvToEntryTransformer(queue, settings);
try (FileInputStream in = new FileInputStream(tmpFile.toFile())) {
try (InputStream in = file.getInputStream()) {
csvToEntryTransformer.readCSV(in);
}
LOGGER.debug("delete uploaded file {}", tmpFile);
Files.delete(tmpFile);
} catch (final Exception e) {
LOGGER.error("csv ingestion failed", e);
}
}
});
if (waitUntilFinished) {
future.get(1, TimeUnit.HOURS);
} catch (final RuntimeException e) {
FileUtils.deleteSilently(tmpFiles);
throw e;
}
}

View File

@@ -298,11 +298,10 @@ public class PdbController implements HardcodedValues, PropertyKeys {
@ResponseBody
@ResponseStatus(code = HttpStatus.CREATED)
public String handleCsvFileUpload(@RequestParam("file") final MultipartFile[] files,
@RequestPart("settings") final CsvReaderSettings csvReaderSettings,
@RequestParam(name = "waitUntilFinished", defaultValue = "false") final boolean waitUntilFinished)
@RequestPart("settings") final CsvReaderSettings csvReaderSettings)
throws IllegalStateException, IOException, InterruptedException, ExecutionException, TimeoutException {
csvUploadHandler.ingest(List.of(files), csvReaderSettings, waitUntilFinished);
csvUploadHandler.ingest(List.of(files), csvReaderSettings);
return ""; // return value might become a job id that can be used to cancel, or observe
// status
}

View File

@@ -216,13 +216,12 @@ def send_csv(zip, file_in_zip, csvSettings):
'settings': ('csvReaderSettings.json', csvSettings , 'application/json')
})
r = requests.post('http://localhost:17333/api/data?waitUntilFinished=true', data=m,
r = requests.post('http://localhost:17333/api/data', data=m,
headers={
'Accept': 'text/plain, application/json, application/*+json, */*',
'Content-Type': m.content_type.replace("form-data", "mixed;charset=UTF-8")
})
print("response:")
print(r)
print("response: %s" % r)
finally:
shutil.rmtree(tmp_folder)