diff --git a/data-store/build.gradle b/data-store/build.gradle
index 1ea1b4e..88074b1 100644
--- a/data-store/build.gradle
+++ b/data-store/build.gradle
@@ -8,6 +8,7 @@ dependencies {
compile 'org.lucares:primitiveCollections:0.1.20171228131833'
compile 'org.apache.commons:commons-lang3:3.7'
+ compile 'com.google.guava:guava:24.1-jre'
compile 'org.apache.logging.log4j:log4j-core:2.10.0'
compile 'org.apache.logging.log4j:log4j-slf4j-impl:2.10.0'
diff --git a/data-store/src/main/java/org/lucares/pdb/datastore/Doc.java b/data-store/src/main/java/org/lucares/pdb/datastore/Doc.java
index 6eee55f..1536d69 100644
--- a/data-store/src/main/java/org/lucares/pdb/datastore/Doc.java
+++ b/data-store/src/main/java/org/lucares/pdb/datastore/Doc.java
@@ -5,27 +5,77 @@ import java.nio.file.Path;
import java.nio.file.Paths;
import org.lucares.pdb.api.Tags;
+import org.lucares.pdb.datastore.internal.DataStore;
public class Doc {
private final Tags tags;
- private final byte[] path;
+ private final long offsetInListingFile;
+ private byte[] path;
- public Doc(final Tags tags, final Path path) {
+ /**
+ * Initializes a new document.
+ *
+ * The path can be {@code null}. If the path is {@code null}, then
+ * {@code offsetInListingFile} must be set, because the path is then resolved
+ * lazily from that offset when it is needed.
+ *
+ * Omitting the path is used to reduce the memory footprint.
+ *
+ * @param tags the tags of this document
+ * @param offsetInListingFile
+ * must be set if {@code path} is {@code null}
+ * @param path
+ * optional, can be {@code null}
+ */
+ public Doc(final Tags tags, final long offsetInListingFile, final Path path) {
super();
this.tags = tags;
- this.path = path.toString().getBytes(StandardCharsets.UTF_8);
+ this.offsetInListingFile = offsetInListingFile;
+ setPath(path);
}
public Tags getTags() {
return tags;
}
- public Path getPath() {
- return Paths.get(new String(path, StandardCharsets.UTF_8));
+ public void setPath(final Path path) {
+ if (path != null) {
+ this.path = path.toString().getBytes(StandardCharsets.UTF_8);
+ } else {
+ this.path = null;
+ }
+ }
+
+ /**
+  * The path to the storage file.
+  *
+  * This value is lazily initialized: if no path is set yet, it is resolved via
+  * {@code resolver} from {@code offsetInListingFile} and remembered. See
+  * {@link DataStore#getFolderStoragePathResolver()}.
+  *
+  * @param resolver used to look up the path if it is not set yet
+  * @return the path, or {@code null} if the resolver cannot provide one
+  */
+ public Path getPath(final FolderStoragePathResolver resolver) {
+     if (path == null) {
+         setPath(resolver.getPath(offsetInListingFile));
+     }
+     // still null e.g. for FolderStoragePathResolver.NULL, which toString() uses
+     return path == null ? null : Paths.get(new String(path, StandardCharsets.UTF_8));
+ }
+
+ private Path getPathNullable() {
+ return getPath(FolderStoragePathResolver.NULL);
+ }
+
+ public long getOffsetInListingFile() {
+ return offsetInListingFile;
}
@Override
public String toString() {
- return "Doc [tags=" + tags + ", path=" + getPath() + "]";
+ return "Doc [tags=" + tags + ", offsetInListingFile=" + offsetInListingFile + ", path=" + getPathNullable()
+ + "]";
}
+
}
diff --git a/data-store/src/main/java/org/lucares/pdb/datastore/FolderStoragePathResolver.java b/data-store/src/main/java/org/lucares/pdb/datastore/FolderStoragePathResolver.java
new file mode 100644
index 0000000..332d700
--- /dev/null
+++ b/data-store/src/main/java/org/lucares/pdb/datastore/FolderStoragePathResolver.java
@@ -0,0 +1,9 @@
+package org.lucares.pdb.datastore;
+
+import java.nio.file.Path;
+
+public interface FolderStoragePathResolver {
+ FolderStoragePathResolver NULL = offset -> null; // resolver that returns null for every offset
+ /** Returns the path for the given offset in the listing file; may be {@code null} (see {@link #NULL}). */
+ public Path getPath(long offsetInListingFile);
+}
diff --git a/data-store/src/main/java/org/lucares/pdb/datastore/PdbDB.java b/data-store/src/main/java/org/lucares/pdb/datastore/PdbDB.java
index 7c787f3..93c2f3a 100644
--- a/data-store/src/main/java/org/lucares/pdb/datastore/PdbDB.java
+++ b/data-store/src/main/java/org/lucares/pdb/datastore/PdbDB.java
@@ -39,8 +39,12 @@ public class PdbDB {
return proposer.propose(query, caretIndex);
}
- public List getByTags(Tags tags) {
-
+ public List getByTags(final Tags tags) {
+
return dataStore.getByTags(tags);
}
+
+ public FolderStoragePathResolver getFolderStoragePathResolver() {
+ return dataStore.getFolderStoragePathResolver();
+ }
}
diff --git a/data-store/src/main/java/org/lucares/pdb/datastore/internal/DataStore.java b/data-store/src/main/java/org/lucares/pdb/datastore/internal/DataStore.java
index fe82cd6..d1afa04 100644
--- a/data-store/src/main/java/org/lucares/pdb/datastore/internal/DataStore.java
+++ b/data-store/src/main/java/org/lucares/pdb/datastore/internal/DataStore.java
@@ -17,6 +17,7 @@ import org.lucares.collections.IntList;
import org.lucares.pdb.api.StringCompressor;
import org.lucares.pdb.api.Tags;
import org.lucares.pdb.datastore.Doc;
+import org.lucares.pdb.datastore.FolderStoragePathResolver;
import org.lucares.pdb.datastore.lang.Expression;
import org.lucares.pdb.datastore.lang.ExpressionToDocIdVisitor;
import org.lucares.pdb.datastore.lang.ExpressionToDocIdVisitor.AllDocIds;
@@ -41,25 +42,33 @@ public class DataStore {
private final ConcurrentHashMap> keyToValueToDocId = new ConcurrentHashMap<>();
private final FolderStorage folderStorage;
+ private final FolderStoragePathResolver folderStoragePathResolver;
public DataStore(final Path dataDirectory) throws IOException {
Tags.STRING_COMPRESSOR = StringCompressor.create(keyCompressionFile(dataDirectory));
folderStorage = new FolderStorage(storageDirectory(dataDirectory), 1000);
init(folderStorage);
+
+ folderStoragePathResolver = folderStorage::getPathByOffset;
}
private void init(final FolderStorage folderStorage) throws IOException {
final long start = System.nanoTime();
- final Stream files = folderStorage.list();
- files.parallel().forEach(path -> {
+ final Stream files = folderStorage.list();
+ files// .parallel()
+ .forEach(listingFileEntry -> {
- final String filename = path.getFileName().toString();
- final Tags tags = toTags(filename);
- cacheTagToFileMapping(tags, path);
+ listingFileEntry.unsetPath(); // unset the path, so that we don't store it for every document (will
+ // be
+ // initialized lazily if needed)
- });
+ final String filename = listingFileEntry.getFilename();
+ final Tags tags = toTags(filename);
+ cacheTagToFileMapping(tags, listingFileEntry);
+
+ });
trimIntLists();
sortIntLists();
synchronized (docIdToDoc) {
@@ -68,10 +77,10 @@ public class DataStore {
INITIALIZE.info(((System.nanoTime() - start) / 1_000_000.0) + "ms");
}
- private void cacheTagToFileMapping(final Tags tags, final Path path) {
+ private void cacheTagToFileMapping(final Tags tags, final ListingFileEntry listingFileEntry) {
final int docId;
- final Doc newDoc = new Doc(tags, path);
+ final Doc newDoc = new Doc(tags, listingFileEntry.getOffsetInListingFile(), listingFileEntry.getPath());
synchronized (docIdToDoc) {
docId = docIdToDoc.size();
docIdToDoc.add(newDoc);
@@ -140,11 +149,11 @@ public class DataStore {
public Path createNewFile(final Tags tags) throws IOException {
final String filename = tags.getFilename();
- final Path result = folderStorage.insert(filename, PDB_EXTENSION);
+ final ListingFileEntry listingFileEntry = folderStorage.insert(filename, PDB_EXTENSION);
- cacheTagToFileMapping(tags, result);
+ cacheTagToFileMapping(tags, listingFileEntry);
- return result;
+ return listingFileEntry.getPath();
}
private Tags toTags(final String filename) {
@@ -232,4 +241,8 @@ public class DataStore {
final List result = tagsToDocs.getOrDefault(tags, new ArrayList<>(0));
return result;
}
+
+ public FolderStoragePathResolver getFolderStoragePathResolver() {
+ return folderStoragePathResolver;
+ }
}
diff --git a/data-store/src/main/java/org/lucares/pdb/datastore/internal/FolderStorage.java b/data-store/src/main/java/org/lucares/pdb/datastore/internal/FolderStorage.java
index 4da8295..773d404 100644
--- a/data-store/src/main/java/org/lucares/pdb/datastore/internal/FolderStorage.java
+++ b/data-store/src/main/java/org/lucares/pdb/datastore/internal/FolderStorage.java
@@ -1,6 +1,9 @@
package org.lucares.pdb.datastore.internal;
+import java.io.BufferedReader;
+import java.io.FileNotFoundException;
import java.io.IOException;
+import java.io.RandomAccessFile;
import java.io.Writer;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
@@ -9,9 +12,13 @@ import java.nio.file.Paths;
import java.nio.file.StandardOpenOption;
import java.nio.file.attribute.BasicFileAttributes;
import java.util.Iterator;
+import java.util.Spliterator;
+import java.util.Spliterators;
import java.util.function.BiPredicate;
import java.util.stream.Stream;
+import java.util.stream.StreamSupport;
+import org.lucares.pdb.api.RuntimeIOException;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -39,6 +46,7 @@ public class FolderStorage {
this.listingFile = storageBaseDirectory.resolve(LISTING_FILE_NAME);
this.maxFilesPerFolder = maxFilesPerFolder;
init();
+ initListingFileIfNotExists();
}
private void init() throws IOException {
@@ -57,7 +65,7 @@ public class FolderStorage {
filesInSecondLevel = (int) Files.list(currentDirectory).count();
}
- public Path insert(final String filenamePrefix, final String filenameSuffix) throws IOException {
+ public ListingFileEntry insert(final String filenamePrefix, final String filenameSuffix) throws IOException {
ensureCapacity();
@@ -71,17 +79,29 @@ public class FolderStorage {
Files.createFile(newFile);
filesInSecondLevel++;
- updateListingFile(newFile);
+ final ListingFileEntry result = updateListingFile(newFile);
- return newFile;
+ return result;
}
- private synchronized void updateListingFile(final Path newFile) throws IOException {
+ private synchronized ListingFileEntry updateListingFile(final Path newFile) throws IOException {
+ final long offsetInListingFile = getFilePointer();
try (Writer out = Files.newBufferedWriter(listingFile, StandardCharsets.UTF_8, StandardOpenOption.CREATE,
StandardOpenOption.APPEND)) {
out.write(newFile.toString());
out.write("\n");
}
+ final String filename = newFile.getFileName().toString();
+ return new ListingFileEntry(filename, offsetInListingFile, newFile);
+ }
+
+ private long getFilePointer() throws FileNotFoundException, IOException {
+     // Returns the offset at which the next appended entry will start. Entries are
+     // appended, so that is the current length of the listing file; the file pointer
+     // of a freshly opened RandomAccessFile is always 0.
+     try (RandomAccessFile randomAccessFile = new RandomAccessFile(listingFile.toFile(), "r")) {
+         return randomAccessFile.length();
+     }
}
private void ensureCapacity() throws IOException {
@@ -103,15 +123,28 @@ public class FolderStorage {
Files.createDirectories(currentDirectory);
}
- public Stream list() throws IOException {
+ public Stream list() throws IOException {
+ return readListingFile();
+ }
+
+ private Stream readListingFile() throws IOException {
+     // The stream is consumed lazily, i.e. after this method has returned. The iterator must
+     // therefore stay open; it is closed when the returned stream is closed (onClose).
+     final ListingFileIterator iterator = new ListingFileIterator(listingFile);
+     final Stream<ListingFileEntry> stream = StreamSupport.stream(Spliterators.spliteratorUnknownSize(iterator, Spliterator.ORDERED), false);
+     return stream.onClose(() -> {
+         try { iterator.close(); } catch (final IOException e) { throw new RuntimeIOException(e); }
+     });
+ }
+
+ private void initListingFileIfNotExists() throws IOException {
if (!Files.exists(listingFile)) {
final long start = System.nanoTime();
LOGGER.info("listing file not found -> creating a new one");
createNewListingFile();
METRICS_CREATE_LISTING_FILE.info(((System.nanoTime() - start) / 1_000_000.0) + "ms");
}
- return Files.lines(listingFile, StandardCharsets.UTF_8).map(Paths::get);
}
private void createNewListingFile() throws IOException {
@@ -125,9 +158,23 @@ public class FolderStorage {
final Iterator iterator = stream.iterator();
while (iterator.hasNext()) {
final Path path = iterator.next();
- out.write(path.toString());
- out.write("\n");
+ if (!path.getFileName().toString().equals(LISTING_FILE_NAME)) {
+ out.write(path.toString());
+ out.write("\n");
+ }
}
}
}
+
+ public Path getPathByOffset(final long offsetInListingFile) throws RuntimeIOException {
+     // Offsets are byte positions, but Reader.skip counts chars: seek on bytes, decode UTF-8 afterwards.
+     try (RandomAccessFile file = new RandomAccessFile(listingFile.toFile(), "r")) {
+         file.seek(offsetInListingFile);
+         final String rawLine = file.readLine(); // maps every byte to exactly one char
+         if (rawLine == null) { throw new java.io.EOFException("no listing entry at offset " + offsetInListingFile); }
+         return Paths.get(new String(rawLine.getBytes(StandardCharsets.ISO_8859_1), StandardCharsets.UTF_8));
+     } catch (final IOException e) {
+         throw new RuntimeIOException(e);
+     }
+ }
}
diff --git a/data-store/src/main/java/org/lucares/pdb/datastore/internal/ListingFileEntry.java b/data-store/src/main/java/org/lucares/pdb/datastore/internal/ListingFileEntry.java
new file mode 100644
index 0000000..b0a40a3
--- /dev/null
+++ b/data-store/src/main/java/org/lucares/pdb/datastore/internal/ListingFileEntry.java
@@ -0,0 +1,74 @@
+package org.lucares.pdb.datastore.internal;
+
+import java.nio.file.Path;
+
+import javax.annotation.Nullable;
+
+public class ListingFileEntry {
+ private final String filename;
+ private final long offsetInListingFile;
+ private Path path;
+
+ /**
+  * Creates an entry of the listing file.
+  *
+  * @param filename stored as is, returned by {@link #getFilename()}
+  * @param offsetInListingFile stored as is, returned by {@link #getOffsetInListingFile()}
+  * @param path can be {@code null}; can be dropped later with {@link #unsetPath()}
+  */
+ public ListingFileEntry(final String filename, final long offsetInListingFile, final Path path) {
+     this.path = path;
+     this.offsetInListingFile = offsetInListingFile;
+     this.filename = filename;
+ }
+
+ public String getFilename() {
+     return filename;
+ }
+
+ public long getOffsetInListingFile() {
+     return offsetInListingFile;
+ }
+
+ /** Forgets the path; {@link #getPath()} returns {@code null} afterwards. */
+ public void unsetPath() {
+     path = null;
+ }
+
+ /** Returns the path, or {@code null} if none was given or it has been unset. */
+ @Nullable
+ public Path getPath() {
+     return path;
+ }
+
+ @Override
+ public String toString() {
+     final StringBuilder text = new StringBuilder("ListingFileEntry [filename=").append(filename);
+     text.append(", offsetInListingFile=").append(offsetInListingFile).append(", path=").append(path);
+     return text.append("]").toString();
+ }
+
+ /** Hash over filename, offset and path, consistent with {@link #equals(Object)}. */
+ @Override
+ public int hashCode() {
+     int hash = 31 + (filename == null ? 0 : filename.hashCode());
+     hash = 31 * hash + Long.hashCode(offsetInListingFile);
+     hash = 31 * hash + (path == null ? 0 : path.hashCode());
+     return hash;
+ }
+
+ /** Two entries are equal if they are of the same class and agree in filename, offset and path. */
+ @Override
+ public boolean equals(final Object obj) {
+     if (this == obj) {
+         return true;
+     }
+     if (obj == null || getClass() != obj.getClass()) {
+         return false;
+     }
+     final ListingFileEntry that = (ListingFileEntry) obj;
+     return offsetInListingFile == that.offsetInListingFile //
+             && java.util.Objects.equals(filename, that.filename) //
+             && java.util.Objects.equals(path, that.path);
+ }
+}
diff --git a/data-store/src/main/java/org/lucares/pdb/datastore/internal/ListingFileIterator.java b/data-store/src/main/java/org/lucares/pdb/datastore/internal/ListingFileIterator.java
new file mode 100644
index 0000000..d8ccd20
--- /dev/null
+++ b/data-store/src/main/java/org/lucares/pdb/datastore/internal/ListingFileIterator.java
@@ -0,0 +1,78 @@
+package org.lucares.pdb.datastore.internal;
+
+import java.io.BufferedInputStream;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.FileNotFoundException;
+import java.io.IOException;
+import java.nio.file.Path;
+import java.util.Iterator;
+import java.util.NoSuchElementException;
+import java.util.Optional;
+
+import org.lucares.pdb.api.RuntimeIOException;
+
+import com.google.common.io.CountingInputStream;
+
+public class ListingFileIterator implements Iterator, AutoCloseable {
+
+ private final CountingInputStream is;
+ // Look-ahead: the entry the following next() call returns; null until fetched by hasNext().
+ private ListingFileEntry next;
+ // True once the end of the file was reached, so that it is not read past again.
+ private boolean exhausted;
+
+ public ListingFileIterator(final Path listingFile) throws FileNotFoundException {
+     is = new CountingInputStream(new BufferedInputStream(new FileInputStream(listingFile.toFile())));
+ }
+
+ @Override
+ public boolean hasNext() {
+     if (next == null && !exhausted) {
+         next = getNext();
+         exhausted = next == null;
+     }
+     return next != null;
+ }
+
+ @Override
+ public ListingFileEntry next() {
+     // Must also work without a preceding hasNext() call (Iterator contract).
+     if (!hasNext()) {
+         throw new NoSuchElementException();
+     }
+     final ListingFileEntry result = next;
+     next = null;
+     return result;
+ }
+
+ /**
+  * Reads the next line of the listing file, bypassing the look-ahead of hasNext()/next().
+  * @return an entry with the line's filename and byte offset, or {@code null} at the end of the file
+  * @throws RuntimeIOException if reading the listing file fails
+  */
+ public ListingFileEntry getNext() {
+     // Collect raw bytes and decode once: single bytes are not code points in UTF-8.
+     final java.io.ByteArrayOutputStream lineBytes = new java.io.ByteArrayOutputStream();
+     try {
+         final long offsetInListingFile = is.getCount();
+         int b;
+         while ((b = is.read()) >= 0 && b != '\n') {
+             lineBytes.write(b);
+         }
+         if (b < 0) {
+             return null; // end of file; an unterminated trailing line is ignored, as before
+         }
+         final String line = new String(lineBytes.toByteArray(), java.nio.charset.StandardCharsets.UTF_8);
+         final String filename = line.substring(line.lastIndexOf(File.separator) + 1);
+         return new ListingFileEntry(filename, offsetInListingFile, null);
+     } catch (final IOException e) {
+         throw new RuntimeIOException(e);
+     }
+ }
+
+ @Override
+ public void close() throws IOException {
+     is.close();
+ }
+}
diff --git a/data-store/src/test/java/org/lucares/pdb/datastore/internal/DataStoreTest.java b/data-store/src/test/java/org/lucares/pdb/datastore/internal/DataStoreTest.java
index c33d531..81b23e2 100644
--- a/data-store/src/test/java/org/lucares/pdb/datastore/internal/DataStoreTest.java
+++ b/data-store/src/test/java/org/lucares/pdb/datastore/internal/DataStoreTest.java
@@ -123,7 +123,8 @@ public class DataStoreTest {
private void assertSearch(final String query, final Tags... tags) {
final List actualDocs = dataStore.search(query);
- final List actual = CollectionUtils.map(actualDocs, Doc::getPath);
+ final List actual = CollectionUtils.map(actualDocs,
+ doc -> doc.getPath(dataStore.getFolderStoragePathResolver()));
final List expectedPaths = CollectionUtils.map(tags, tagsToPath::get);
@@ -152,7 +153,8 @@ public class DataStoreTest {
private void assertSearch(final DataStore dataStore, final String query, final Path... paths) {
final List actualDocs = dataStore.search(query);
- final List actual = CollectionUtils.map(actualDocs, Doc::getPath);
+ final List actual = CollectionUtils.map(actualDocs,
+ doc -> doc.getPath(dataStore.getFolderStoragePathResolver()));
Assert.assertEquals(actual, Arrays.asList(paths));
}
diff --git a/data-store/src/test/java/org/lucares/pdb/datastore/internal/FolderStorageTest.java b/data-store/src/test/java/org/lucares/pdb/datastore/internal/FolderStorageTest.java
index c4c4dff..b9e79d4 100644
--- a/data-store/src/test/java/org/lucares/pdb/datastore/internal/FolderStorageTest.java
+++ b/data-store/src/test/java/org/lucares/pdb/datastore/internal/FolderStorageTest.java
@@ -80,15 +80,17 @@ public class FolderStorageTest {
public void testCreateAndUpdateFileListing() throws Exception {
final int maxFilesPerFolder = 10;
final Path storageLeafFolder = dataDirectory.resolve("0").resolve("0");
+ final int storageLeafFolderLength = storageLeafFolder.toString().length();
// initial creation
{
final FolderStorage storage = new FolderStorage(dataDirectory, maxFilesPerFolder);
storage.insert("abc", ".txt");
storage.insert("def", ".txt");
- final List initialListing = storage.list().collect(Collectors.toList());
- Assert.assertEquals(initialListing,
- Arrays.asList(storageLeafFolder.resolve("abc$.txt"), storageLeafFolder.resolve("def$.txt")));
+ final List initialListing = storage.list().collect(Collectors.toList());
+ Assert.assertEquals(initialListing, Arrays.asList(//
+ new ListingFileEntry("abc$.txt", 0, null), //
+ new ListingFileEntry("def$.txt", storageLeafFolderLength + 10, null)));
}
// load existing storage
@@ -96,18 +98,21 @@ public class FolderStorageTest {
final FolderStorage storage = new FolderStorage(dataDirectory, maxFilesPerFolder);
// files inserted previously are still there
- final List initialListing = storage.list().collect(Collectors.toList());
+ final List initialListing = storage.list().collect(Collectors.toList());
- Assert.assertEquals(initialListing,
- Arrays.asList(storageLeafFolder.resolve("abc$.txt"), storageLeafFolder.resolve("def$.txt")));
+ Assert.assertEquals(initialListing, Arrays.asList(//
+ new ListingFileEntry("abc$.txt", 0, null), //
+ new ListingFileEntry("def$.txt", storageLeafFolderLength + 10, null)));
// add new file
storage.insert("ghi", ".txt");
// listing is updated
- final List updatedListing = storage.list().collect(Collectors.toList());
- Assert.assertEquals(updatedListing, Arrays.asList(storageLeafFolder.resolve("abc$.txt"),
- storageLeafFolder.resolve("def$.txt"), storageLeafFolder.resolve("ghi$.txt")));
+ final List updatedListing = storage.list().collect(Collectors.toList());
+ Assert.assertEquals(updatedListing, Arrays.asList(//
+ new ListingFileEntry("abc$.txt", 0, null), //
+ new ListingFileEntry("def$.txt", storageLeafFolderLength + 10, null), //
+ new ListingFileEntry("ghi$.txt", 2 * storageLeafFolderLength + 20, null)));
}
}
diff --git a/performanceDb/src/main/java/org/lucares/performance/db/TagsToFile.java b/performanceDb/src/main/java/org/lucares/performance/db/TagsToFile.java
index 35ad310..77f1749 100644
--- a/performanceDb/src/main/java/org/lucares/performance/db/TagsToFile.java
+++ b/performanceDb/src/main/java/org/lucares/performance/db/TagsToFile.java
@@ -15,6 +15,7 @@ import java.util.function.Consumer;
import org.lucares.pdb.api.Tags;
import org.lucares.pdb.datastore.Doc;
+import org.lucares.pdb.datastore.FolderStoragePathResolver;
import org.lucares.pdb.datastore.PdbDB;
import org.lucares.utils.CollectionUtils;
import org.slf4j.Logger;
@@ -70,7 +71,8 @@ public class TagsToFile implements AutoCloseable {
final List result = new ArrayList<>();
for (final Doc document : searchResult) {
- final Path path = document.getPath();
+ final FolderStoragePathResolver resolver = db.getFolderStoragePathResolver();
+ final Path path = document.getPath(resolver);
final Tags tags = document.getTags();
final PdbFile pdbFile = new PdbFile(path, tags);