Introduced a new custom file format used for backup and ingestion.
The new file format reduces repetition, is easy to parse, is easy to generate in any language, and is human-readable.
This commit is contained in:
@@ -0,0 +1,171 @@
|
||||
package org.lucares.performance.db;
|
||||
|
||||
import java.io.FileOutputStream;
|
||||
import java.io.IOException;
|
||||
import java.io.OutputStreamWriter;
|
||||
import java.io.Writer;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.nio.file.Paths;
|
||||
import java.time.Duration;
|
||||
import java.time.OffsetDateTime;
|
||||
import java.time.format.DateTimeFormatter;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.Iterator;
|
||||
import java.util.List;
|
||||
import java.util.concurrent.atomic.AtomicLong;
|
||||
import java.util.stream.Stream;
|
||||
import java.util.zip.GZIPOutputStream;
|
||||
|
||||
import org.apache.logging.log4j.Level;
|
||||
import org.apache.logging.log4j.core.config.Configurator;
|
||||
import org.lucares.collections.LongList;
|
||||
import org.lucares.pdb.api.Tags;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
public class PdbExport {
|
||||
|
||||
private static final int KB = 1024;
|
||||
private static final int MB = KB * 1024;
|
||||
private static final int GB = MB * 1024;
|
||||
|
||||
public static final char MAGIC_BYTE = '#';
|
||||
public static final char MARKER_DICT_ENTRY_CHAR = '$';
|
||||
public static final String MARKER_DICT_ENTRY = String.valueOf(MARKER_DICT_ENTRY_CHAR);
|
||||
public static final char SEPARATOR_TAG_ID_CHAR = ':';
|
||||
public static final String SEPARATOR_TAG_ID = String.valueOf(SEPARATOR_TAG_ID_CHAR);
|
||||
|
||||
private static final Logger LOGGER = LoggerFactory.getLogger(PdbExport.class);
|
||||
|
||||
public static void main(final String[] args) throws Exception {
|
||||
|
||||
initLogging();
|
||||
|
||||
final Path dataDirectory = Paths.get(args[0]);
|
||||
final Path backupDir = Paths.get(args[1])
|
||||
.resolve(OffsetDateTime.now().format(DateTimeFormatter.ofPattern("yyyy-MM-dd_HH-mm-ss")));
|
||||
|
||||
export(dataDirectory, backupDir);
|
||||
}
|
||||
|
||||
public static List<Path> export(final Path dataDirectory, final Path backupDir) throws Exception {
|
||||
final List<Path> exportFiles = new ArrayList<>();
|
||||
Files.createDirectories(backupDir);
|
||||
|
||||
Runtime.getRuntime().addShutdownHook(new Thread() {
|
||||
@Override
|
||||
public void run() {
|
||||
LOGGER.info("shutdown hook");
|
||||
}
|
||||
|
||||
});
|
||||
|
||||
final OffsetDateTime start = OffsetDateTime.now();
|
||||
final String datePrefix = start.format(DateTimeFormatter.ofPattern("yyyy-MM-dd_HH-mm-ss"));
|
||||
final AtomicLong tagsIdCounter = new AtomicLong(0);
|
||||
long exportFileCounter = 0;
|
||||
|
||||
Path exportFile = null;
|
||||
Writer writer = null;
|
||||
|
||||
try (final PerformanceDb db = new PerformanceDb(dataDirectory);) {
|
||||
|
||||
LOGGER.info("Searching for all files. This may take a while ...");
|
||||
final List<PdbFile> pdbFiles = db.getFilesForQuery("");
|
||||
|
||||
long count = 0;
|
||||
long lastEpochMilli = 0;
|
||||
|
||||
for (final PdbFile pdbFile : pdbFiles) {
|
||||
|
||||
if (writer == null || Files.size(exportFile) > GB) {
|
||||
if (writer != null) {
|
||||
writer.flush();
|
||||
writer.close();
|
||||
}
|
||||
exportFile = backupDir.resolve(String.format("%s.%05d.pdb.gz", datePrefix, exportFileCounter++));
|
||||
exportFiles.add(exportFile);
|
||||
writer = createWriter(exportFile);
|
||||
LOGGER.info("new export file: {}", exportFile);
|
||||
}
|
||||
|
||||
final Stream<LongList> timeValueStream = PdbFile.toStream(Arrays.asList(pdbFile), db.getDataStore());
|
||||
|
||||
final Tags tags = pdbFile.getTags();
|
||||
final long tagsId = addNewTagsToDictionary(writer, tags, tagsIdCounter);
|
||||
|
||||
final Iterator<LongList> it = timeValueStream.iterator();
|
||||
while (it.hasNext()) {
|
||||
final LongList entry = it.next();
|
||||
|
||||
for (int i = 0; i < entry.size(); i += 2) {
|
||||
|
||||
final long epochMilli = entry.get(i);
|
||||
final long value = entry.get(i + 1);
|
||||
|
||||
final long epochMilliDiff = epochMilli - lastEpochMilli;
|
||||
lastEpochMilli = epochMilli;
|
||||
|
||||
writer.write(Long.toString(epochMilliDiff));
|
||||
writer.write(',');
|
||||
writer.write(Long.toString(value));
|
||||
writer.write(',');
|
||||
writer.write(Long.toString(tagsId));
|
||||
writer.write('\n');
|
||||
|
||||
count++;
|
||||
if (count % 100000 == 0) {
|
||||
LOGGER.info("progress: " + count);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
LOGGER.info("total: " + count);
|
||||
|
||||
} finally {
|
||||
if (writer != null) {
|
||||
writer.close();
|
||||
}
|
||||
}
|
||||
|
||||
final OffsetDateTime end = OffsetDateTime.now();
|
||||
|
||||
LOGGER.info("duration: " + Duration.between(start, end));
|
||||
return exportFiles;
|
||||
}
|
||||
|
||||
private static void initLogging() {
|
||||
Configurator.setRootLevel(Level.INFO);
|
||||
}
|
||||
|
||||
private static long addNewTagsToDictionary(final Writer writer, final Tags tags, final AtomicLong tagsIdCounter)
|
||||
throws IOException {
|
||||
final long tagsId = tagsIdCounter.getAndIncrement();
|
||||
|
||||
writer.write(MARKER_DICT_ENTRY);
|
||||
writer.write(Long.toString(tagsId));
|
||||
writer.write(SEPARATOR_TAG_ID);
|
||||
writer.write(tags.toCsv());
|
||||
writer.write('\n');
|
||||
|
||||
return tagsId;
|
||||
}
|
||||
|
||||
private static Writer createWriter(final Path file) {
|
||||
|
||||
try {
|
||||
final OutputStreamWriter writer = new OutputStreamWriter(
|
||||
new GZIPOutputStream(new FileOutputStream(file.toFile()), 4096 * 4), StandardCharsets.UTF_8);
|
||||
// initialize file header
|
||||
writer.write(MAGIC_BYTE);
|
||||
return writer;
|
||||
|
||||
} catch (final IOException e) {
|
||||
throw new IllegalStateException(e);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -20,6 +20,7 @@ import org.lucares.pdb.api.Tags;
|
||||
import org.lucares.pdb.datastore.Proposal;
|
||||
import org.lucares.pdb.datastore.internal.DataStore;
|
||||
import org.lucares.pdb.datastore.lang.SyntaxException;
|
||||
import org.lucares.pdb.diskstorage.DiskStorage;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
@@ -122,6 +123,10 @@ public class PerformanceDb implements AutoCloseable {
|
||||
return get(query, Grouping.NO_GROUPING);
|
||||
}
|
||||
|
||||
public List<PdbFile> getFilesForQuery(final String query) {
|
||||
return tagsToFile.getFilesForQuery(query);
|
||||
}
|
||||
|
||||
/**
|
||||
* Return the entries as an unbound, ordered and non-parallel stream.
|
||||
*
|
||||
@@ -177,4 +182,8 @@ public class PerformanceDb implements AutoCloseable {
|
||||
public SortedSet<String> getFieldsValues(final String query, final String fieldName) {
|
||||
return dataStore.getAvailableValuesForKey(query, fieldName);
|
||||
}
|
||||
|
||||
public DiskStorage getDataStore() {
|
||||
return dataStore.getDiskStorage();
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user