From 82b8a8a932d33d0b7a5fb513e186dccfc1215ab0 Mon Sep 17 00:00:00 2001 From: Andreas Huber Date: Sun, 6 May 2018 12:58:10 +0200 Subject: [PATCH] reduce memory footprint by lazily initializing the path in Doc The path in Doc is now optional. This reduces memory consumption, because we only have to store a long (the offset in the listing file). This assumes that only a small percentage of Docs is requested. --- data-store/build.gradle | 1 + .../java/org/lucares/pdb/datastore/Doc.java | 62 +++++++++++++-- .../datastore/FolderStoragePathResolver.java | 9 +++ .../java/org/lucares/pdb/datastore/PdbDB.java | 8 +- .../pdb/datastore/internal/DataStore.java | 35 ++++++--- .../pdb/datastore/internal/FolderStorage.java | 63 +++++++++++++-- .../datastore/internal/ListingFileEntry.java | 74 ++++++++++++++++++ .../internal/ListingFileIterator.java | 78 +++++++++++++++++++ .../pdb/datastore/internal/DataStoreTest.java | 6 +- .../datastore/internal/FolderStorageTest.java | 23 +++--- .../lucares/performance/db/TagsToFile.java | 4 +- 11 files changed, 324 insertions(+), 39 deletions(-) create mode 100644 data-store/src/main/java/org/lucares/pdb/datastore/FolderStoragePathResolver.java create mode 100644 data-store/src/main/java/org/lucares/pdb/datastore/internal/ListingFileEntry.java create mode 100644 data-store/src/main/java/org/lucares/pdb/datastore/internal/ListingFileIterator.java diff --git a/data-store/build.gradle b/data-store/build.gradle index 1ea1b4e..88074b1 100644 --- a/data-store/build.gradle +++ b/data-store/build.gradle @@ -8,6 +8,7 @@ dependencies { compile 'org.lucares:primitiveCollections:0.1.20171228131833' compile 'org.apache.commons:commons-lang3:3.7' + compile 'com.google.guava:guava:24.1-jre' compile 'org.apache.logging.log4j:log4j-core:2.10.0' compile 'org.apache.logging.log4j:log4j-slf4j-impl:2.10.0' diff --git a/data-store/src/main/java/org/lucares/pdb/datastore/Doc.java b/data-store/src/main/java/org/lucares/pdb/datastore/Doc.java index 6eee55f..1536d69 
100644 --- a/data-store/src/main/java/org/lucares/pdb/datastore/Doc.java +++ b/data-store/src/main/java/org/lucares/pdb/datastore/Doc.java @@ -5,27 +5,77 @@ import java.nio.file.Path; import java.nio.file.Paths; import org.lucares.pdb.api.Tags; +import org.lucares.pdb.datastore.internal.DataStore; public class Doc { private final Tags tags; - private final byte[] path; + private final long offsetInListingFile; + private byte[] path; - public Doc(final Tags tags, final Path path) { + /** + * Initializes a new document. + *

+ * The path can be {@code null}. If path is {@code null}, then + * {@code offsetInListingFile} must be set. The path will be initialized lazily + * when needed. + *

+ * This is used to reduce the memory footprint. + * + * @param tags + * @param offsetInListingFile + * must be set if {@code path} is {@code null} + * @param path + * optional, can be {@code null} + */ + public Doc(final Tags tags, final long offsetInListingFile, final Path path) { super(); this.tags = tags; - this.path = path.toString().getBytes(StandardCharsets.UTF_8); + this.offsetInListingFile = offsetInListingFile; + setPath(path); } public Tags getTags() { return tags; } - public Path getPath() { - return Paths.get(new String(path, StandardCharsets.UTF_8)); + public void setPath(final Path path) { + if (path != null) { + this.path = path.toString().getBytes(StandardCharsets.UTF_8); + } else { + this.path = null; + } + } + + /** + * The path to the storage file. + *

+ * This value is lazily initialized. Callers have to provide a resolver. See + * {@link DataStore#getFolderStoragePathResolver()}. + * + * @return the path + */ + public Path getPath(final FolderStoragePathResolver resolver) { + + if (path == null) { + final Path resolvedPath = resolver.getPath(offsetInListingFile); + setPath(resolvedPath); + } + final Path result = Paths.get(new String(path, StandardCharsets.UTF_8)); + return result; + } + + private Path getPathNullable() { + return getPath(FolderStoragePathResolver.NULL); + } + + public long getOffsetInListingFile() { + return offsetInListingFile; } @Override public String toString() { - return "Doc [tags=" + tags + ", path=" + getPath() + "]"; + return "Doc [tags=" + tags + ", offsetInListingFile=" + offsetInListingFile + ", path=" + getPathNullable() + + "]"; } + } diff --git a/data-store/src/main/java/org/lucares/pdb/datastore/FolderStoragePathResolver.java b/data-store/src/main/java/org/lucares/pdb/datastore/FolderStoragePathResolver.java new file mode 100644 index 0000000..332d700 --- /dev/null +++ b/data-store/src/main/java/org/lucares/pdb/datastore/FolderStoragePathResolver.java @@ -0,0 +1,9 @@ +package org.lucares.pdb.datastore; + +import java.nio.file.Path; + +public interface FolderStoragePathResolver { + FolderStoragePathResolver NULL = offset -> null; + + public Path getPath(long offsetInListingFile); +} diff --git a/data-store/src/main/java/org/lucares/pdb/datastore/PdbDB.java b/data-store/src/main/java/org/lucares/pdb/datastore/PdbDB.java index 7c787f3..93c2f3a 100644 --- a/data-store/src/main/java/org/lucares/pdb/datastore/PdbDB.java +++ b/data-store/src/main/java/org/lucares/pdb/datastore/PdbDB.java @@ -39,8 +39,12 @@ public class PdbDB { return proposer.propose(query, caretIndex); } - public List getByTags(Tags tags) { - + public List getByTags(final Tags tags) { + return dataStore.getByTags(tags); } + + public FolderStoragePathResolver getFolderStoragePathResolver() { + return 
dataStore.getFolderStoragePathResolver(); + } } diff --git a/data-store/src/main/java/org/lucares/pdb/datastore/internal/DataStore.java b/data-store/src/main/java/org/lucares/pdb/datastore/internal/DataStore.java index fe82cd6..d1afa04 100644 --- a/data-store/src/main/java/org/lucares/pdb/datastore/internal/DataStore.java +++ b/data-store/src/main/java/org/lucares/pdb/datastore/internal/DataStore.java @@ -17,6 +17,7 @@ import org.lucares.collections.IntList; import org.lucares.pdb.api.StringCompressor; import org.lucares.pdb.api.Tags; import org.lucares.pdb.datastore.Doc; +import org.lucares.pdb.datastore.FolderStoragePathResolver; import org.lucares.pdb.datastore.lang.Expression; import org.lucares.pdb.datastore.lang.ExpressionToDocIdVisitor; import org.lucares.pdb.datastore.lang.ExpressionToDocIdVisitor.AllDocIds; @@ -41,25 +42,33 @@ public class DataStore { private final ConcurrentHashMap> keyToValueToDocId = new ConcurrentHashMap<>(); private final FolderStorage folderStorage; + private final FolderStoragePathResolver folderStoragePathResolver; public DataStore(final Path dataDirectory) throws IOException { Tags.STRING_COMPRESSOR = StringCompressor.create(keyCompressionFile(dataDirectory)); folderStorage = new FolderStorage(storageDirectory(dataDirectory), 1000); init(folderStorage); + + folderStoragePathResolver = folderStorage::getPathByOffset; } private void init(final FolderStorage folderStorage) throws IOException { final long start = System.nanoTime(); - final Stream files = folderStorage.list(); - files.parallel().forEach(path -> { + final Stream files = folderStorage.list(); + files// .parallel() + .forEach(listingFileEntry -> { - final String filename = path.getFileName().toString(); - final Tags tags = toTags(filename); - cacheTagToFileMapping(tags, path); + listingFileEntry.unsetPath(); // unset the path, so that we don't store it for every document (will + // be + // initialized lazily if needed) - }); + final String filename = 
listingFileEntry.getFilename(); + final Tags tags = toTags(filename); + cacheTagToFileMapping(tags, listingFileEntry); + + }); trimIntLists(); sortIntLists(); synchronized (docIdToDoc) { @@ -68,10 +77,10 @@ public class DataStore { INITIALIZE.info(((System.nanoTime() - start) / 1_000_000.0) + "ms"); } - private void cacheTagToFileMapping(final Tags tags, final Path path) { + private void cacheTagToFileMapping(final Tags tags, final ListingFileEntry listingFileEntry) { final int docId; - final Doc newDoc = new Doc(tags, path); + final Doc newDoc = new Doc(tags, listingFileEntry.getOffsetInListingFile(), listingFileEntry.getPath()); synchronized (docIdToDoc) { docId = docIdToDoc.size(); docIdToDoc.add(newDoc); @@ -140,11 +149,11 @@ public class DataStore { public Path createNewFile(final Tags tags) throws IOException { final String filename = tags.getFilename(); - final Path result = folderStorage.insert(filename, PDB_EXTENSION); + final ListingFileEntry listingFileEntry = folderStorage.insert(filename, PDB_EXTENSION); - cacheTagToFileMapping(tags, result); + cacheTagToFileMapping(tags, listingFileEntry); - return result; + return listingFileEntry.getPath(); } private Tags toTags(final String filename) { @@ -232,4 +241,8 @@ public class DataStore { final List result = tagsToDocs.getOrDefault(tags, new ArrayList<>(0)); return result; } + + public FolderStoragePathResolver getFolderStoragePathResolver() { + return folderStoragePathResolver; + } } diff --git a/data-store/src/main/java/org/lucares/pdb/datastore/internal/FolderStorage.java b/data-store/src/main/java/org/lucares/pdb/datastore/internal/FolderStorage.java index 4da8295..773d404 100644 --- a/data-store/src/main/java/org/lucares/pdb/datastore/internal/FolderStorage.java +++ b/data-store/src/main/java/org/lucares/pdb/datastore/internal/FolderStorage.java @@ -1,6 +1,9 @@ package org.lucares.pdb.datastore.internal; +import java.io.BufferedReader; +import java.io.FileNotFoundException; import java.io.IOException; 
+import java.io.RandomAccessFile; import java.io.Writer; import java.nio.charset.StandardCharsets; import java.nio.file.Files; @@ -9,9 +12,13 @@ import java.nio.file.Paths; import java.nio.file.StandardOpenOption; import java.nio.file.attribute.BasicFileAttributes; import java.util.Iterator; +import java.util.Spliterator; +import java.util.Spliterators; import java.util.function.BiPredicate; import java.util.stream.Stream; +import java.util.stream.StreamSupport; +import org.lucares.pdb.api.RuntimeIOException; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -39,6 +46,7 @@ public class FolderStorage { this.listingFile = storageBaseDirectory.resolve(LISTING_FILE_NAME); this.maxFilesPerFolder = maxFilesPerFolder; init(); + initListingFileIfNotExists(); } private void init() throws IOException { @@ -57,7 +65,7 @@ public class FolderStorage { filesInSecondLevel = (int) Files.list(currentDirectory).count(); } - public Path insert(final String filenamePrefix, final String filenameSuffix) throws IOException { + public ListingFileEntry insert(final String filenamePrefix, final String filenameSuffix) throws IOException { ensureCapacity(); @@ -71,17 +79,29 @@ public class FolderStorage { Files.createFile(newFile); filesInSecondLevel++; - updateListingFile(newFile); + final ListingFileEntry result = updateListingFile(newFile); - return newFile; + return result; } - private synchronized void updateListingFile(final Path newFile) throws IOException { + private synchronized ListingFileEntry updateListingFile(final Path newFile) throws IOException { + final long offsetInListingFile = getFilePointer(); try (Writer out = Files.newBufferedWriter(listingFile, StandardCharsets.UTF_8, StandardOpenOption.CREATE, StandardOpenOption.APPEND)) { out.write(newFile.toString()); out.write("\n"); } + final String filename = newFile.getFileName().toString(); + return new ListingFileEntry(filename, offsetInListingFile, newFile); + } + + private long getFilePointer() throws 
FileNotFoundException, IOException { + final RandomAccessFile randomAccessFile = new RandomAccessFile(listingFile.toFile(), "r"); + try { + return randomAccessFile.getFilePointer(); + } finally { + randomAccessFile.close(); + } } private void ensureCapacity() throws IOException { @@ -103,15 +123,28 @@ public class FolderStorage { Files.createDirectories(currentDirectory); } - public Stream list() throws IOException { + public Stream list() throws IOException { + return readListingFile(); + } + + private Stream readListingFile() throws IOException { + + try (final ListingFileIterator iterator = new ListingFileIterator(listingFile)) { + final Spliterator spliterator = Spliterators.spliteratorUnknownSize(iterator, + Spliterator.ORDERED); + final Stream stream = StreamSupport.stream(spliterator, false); + return stream; + } + } + + private void initListingFileIfNotExists() throws IOException { if (!Files.exists(listingFile)) { final long start = System.nanoTime(); LOGGER.info("listing file not found -> creating a new one"); createNewListingFile(); METRICS_CREATE_LISTING_FILE.info(((System.nanoTime() - start) / 1_000_000.0) + "ms"); } - return Files.lines(listingFile, StandardCharsets.UTF_8).map(Paths::get); } private void createNewListingFile() throws IOException { @@ -125,9 +158,23 @@ public class FolderStorage { final Iterator iterator = stream.iterator(); while (iterator.hasNext()) { final Path path = iterator.next(); - out.write(path.toString()); - out.write("\n"); + if (!path.getFileName().toString().equals(LISTING_FILE_NAME)) { + out.write(path.toString()); + out.write("\n"); + } } } } + + public Path getPathByOffset(final long offsetInListingFile) throws RuntimeIOException { + + try (BufferedReader reader = Files.newBufferedReader(listingFile, StandardCharsets.UTF_8)) { + reader.skip(offsetInListingFile); + final String line = reader.readLine(); + return Paths.get(line); + } catch (final IOException e) { + throw new RuntimeIOException(e); + } + + } } diff --git 
a/data-store/src/main/java/org/lucares/pdb/datastore/internal/ListingFileEntry.java b/data-store/src/main/java/org/lucares/pdb/datastore/internal/ListingFileEntry.java new file mode 100644 index 0000000..b0a40a3 --- /dev/null +++ b/data-store/src/main/java/org/lucares/pdb/datastore/internal/ListingFileEntry.java @@ -0,0 +1,74 @@ +package org.lucares.pdb.datastore.internal; + +import java.nio.file.Path; + +import javax.annotation.Nullable; + +public class ListingFileEntry { + private final String filename; + private final long offsetInListingFile; + private Path path; + + public ListingFileEntry(final String filename, final long offsetInListingFile, final Path path) { + this.filename = filename; + this.offsetInListingFile = offsetInListingFile; + this.path = path; + } + + public String getFilename() { + return filename; + } + + public long getOffsetInListingFile() { + return offsetInListingFile; + } + + public void unsetPath() { + path = null; + } + + @Nullable + public Path getPath() { + return path; + } + + @Override + public String toString() { + return "ListingFileEntry [filename=" + filename + ", offsetInListingFile=" + offsetInListingFile + ", path=" + + path + "]"; + } + + @Override + public int hashCode() { + final int prime = 31; + int result = 1; + result = prime * result + ((filename == null) ? 0 : filename.hashCode()); + result = prime * result + (int) (offsetInListingFile ^ (offsetInListingFile >>> 32)); + result = prime * result + ((path == null) ? 
0 : path.hashCode()); + return result; + } + + @Override + public boolean equals(final Object obj) { + if (this == obj) + return true; + if (obj == null) + return false; + if (getClass() != obj.getClass()) + return false; + final ListingFileEntry other = (ListingFileEntry) obj; + if (filename == null) { + if (other.filename != null) + return false; + } else if (!filename.equals(other.filename)) + return false; + if (offsetInListingFile != other.offsetInListingFile) + return false; + if (path == null) { + if (other.path != null) + return false; + } else if (!path.equals(other.path)) + return false; + return true; + } +} diff --git a/data-store/src/main/java/org/lucares/pdb/datastore/internal/ListingFileIterator.java b/data-store/src/main/java/org/lucares/pdb/datastore/internal/ListingFileIterator.java new file mode 100644 index 0000000..d8ccd20 --- /dev/null +++ b/data-store/src/main/java/org/lucares/pdb/datastore/internal/ListingFileIterator.java @@ -0,0 +1,78 @@ +package org.lucares.pdb.datastore.internal; + +import java.io.BufferedInputStream; +import java.io.File; +import java.io.FileInputStream; +import java.io.FileNotFoundException; +import java.io.IOException; +import java.nio.file.Path; +import java.util.Iterator; +import java.util.NoSuchElementException; +import java.util.Optional; + +import org.lucares.pdb.api.RuntimeIOException; + +import com.google.common.io.CountingInputStream; + +public class ListingFileIterator implements Iterator, AutoCloseable { + + private final CountingInputStream is; + + private Optional next = null; + + public ListingFileIterator(final Path listingFile) throws FileNotFoundException { + is = new CountingInputStream(new BufferedInputStream(new FileInputStream(listingFile.toFile()))); + } + + @Override + public boolean hasNext() { + + if (next == null) { + next = Optional.ofNullable(getNext()); + } + + return next.isPresent(); + } + + @Override + public ListingFileEntry next() { + + final ListingFileEntry result = next.orElseGet(() 
-> getNext()); + if (result == null) { + throw new NoSuchElementException(); + } + next = Optional.ofNullable(getNext()); + return result; + } + + public ListingFileEntry getNext() { + final StringBuilder line = new StringBuilder(); + try { + final long offsetInListingFile = is.getCount(); + + int codePoint; + while ((codePoint = is.read()) >= 0) { + if (codePoint == '\n') { + break; + } + line.appendCodePoint(codePoint); + } + + if (codePoint < 0) { + return null; + } + + final int lastSeparatorPosition = line.lastIndexOf(File.separator); + final String filename = line.substring(lastSeparatorPosition + 1); + return new ListingFileEntry(filename, offsetInListingFile, null); + + } catch (final IOException e) { + throw new RuntimeIOException(e); + } + } + + @Override + public void close() throws IOException { + is.close(); + } +} diff --git a/data-store/src/test/java/org/lucares/pdb/datastore/internal/DataStoreTest.java b/data-store/src/test/java/org/lucares/pdb/datastore/internal/DataStoreTest.java index c33d531..81b23e2 100644 --- a/data-store/src/test/java/org/lucares/pdb/datastore/internal/DataStoreTest.java +++ b/data-store/src/test/java/org/lucares/pdb/datastore/internal/DataStoreTest.java @@ -123,7 +123,8 @@ public class DataStoreTest { private void assertSearch(final String query, final Tags... tags) { final List actualDocs = dataStore.search(query); - final List actual = CollectionUtils.map(actualDocs, Doc::getPath); + final List actual = CollectionUtils.map(actualDocs, + doc -> doc.getPath(dataStore.getFolderStoragePathResolver())); final List expectedPaths = CollectionUtils.map(tags, tagsToPath::get); @@ -152,7 +153,8 @@ public class DataStoreTest { private void assertSearch(final DataStore dataStore, final String query, final Path... 
paths) { final List actualDocs = dataStore.search(query); - final List actual = CollectionUtils.map(actualDocs, Doc::getPath); + final List actual = CollectionUtils.map(actualDocs, + doc -> doc.getPath(dataStore.getFolderStoragePathResolver())); Assert.assertEquals(actual, Arrays.asList(paths)); } diff --git a/data-store/src/test/java/org/lucares/pdb/datastore/internal/FolderStorageTest.java b/data-store/src/test/java/org/lucares/pdb/datastore/internal/FolderStorageTest.java index c4c4dff..b9e79d4 100644 --- a/data-store/src/test/java/org/lucares/pdb/datastore/internal/FolderStorageTest.java +++ b/data-store/src/test/java/org/lucares/pdb/datastore/internal/FolderStorageTest.java @@ -80,15 +80,17 @@ public class FolderStorageTest { public void testCreateAndUpdateFileListing() throws Exception { final int maxFilesPerFolder = 10; final Path storageLeafFolder = dataDirectory.resolve("0").resolve("0"); + final int storageLeafFolderLength = storageLeafFolder.toString().length(); // initial creation { final FolderStorage storage = new FolderStorage(dataDirectory, maxFilesPerFolder); storage.insert("abc", ".txt"); storage.insert("def", ".txt"); - final List initialListing = storage.list().collect(Collectors.toList()); - Assert.assertEquals(initialListing, - Arrays.asList(storageLeafFolder.resolve("abc$.txt"), storageLeafFolder.resolve("def$.txt"))); + final List initialListing = storage.list().collect(Collectors.toList()); + Assert.assertEquals(initialListing, Arrays.asList(// + new ListingFileEntry("abc$.txt", 0, null), // + new ListingFileEntry("def$.txt", storageLeafFolderLength + 10, null))); } // load existing storage @@ -96,18 +98,21 @@ public class FolderStorageTest { final FolderStorage storage = new FolderStorage(dataDirectory, maxFilesPerFolder); // files inserted previously are still there - final List initialListing = storage.list().collect(Collectors.toList()); + final List initialListing = storage.list().collect(Collectors.toList()); - 
Assert.assertEquals(initialListing, - Arrays.asList(storageLeafFolder.resolve("abc$.txt"), storageLeafFolder.resolve("def$.txt"))); + Assert.assertEquals(initialListing, Arrays.asList(// + new ListingFileEntry("abc$.txt", 0, null), // + new ListingFileEntry("def$.txt", storageLeafFolderLength + 10, null))); // add new file storage.insert("ghi", ".txt"); // listing is updated - final List updatedListing = storage.list().collect(Collectors.toList()); - Assert.assertEquals(updatedListing, Arrays.asList(storageLeafFolder.resolve("abc$.txt"), - storageLeafFolder.resolve("def$.txt"), storageLeafFolder.resolve("ghi$.txt"))); + final List updatedListing = storage.list().collect(Collectors.toList()); + Assert.assertEquals(updatedListing, Arrays.asList(// + new ListingFileEntry("abc$.txt", 0, null), // + new ListingFileEntry("def$.txt", storageLeafFolderLength + 10, null), // + new ListingFileEntry("ghi$.txt", 2 * storageLeafFolderLength + 20, null))); } } diff --git a/performanceDb/src/main/java/org/lucares/performance/db/TagsToFile.java b/performanceDb/src/main/java/org/lucares/performance/db/TagsToFile.java index 35ad310..77f1749 100644 --- a/performanceDb/src/main/java/org/lucares/performance/db/TagsToFile.java +++ b/performanceDb/src/main/java/org/lucares/performance/db/TagsToFile.java @@ -15,6 +15,7 @@ import java.util.function.Consumer; import org.lucares.pdb.api.Tags; import org.lucares.pdb.datastore.Doc; +import org.lucares.pdb.datastore.FolderStoragePathResolver; import org.lucares.pdb.datastore.PdbDB; import org.lucares.utils.CollectionUtils; import org.slf4j.Logger; @@ -70,7 +71,8 @@ public class TagsToFile implements AutoCloseable { final List result = new ArrayList<>(); for (final Doc document : searchResult) { - final Path path = document.getPath(); + final FolderStoragePathResolver resolver = db.getFolderStoragePathResolver(); + final Path path = document.getPath(resolver); final Tags tags = document.getTags(); final PdbFile pdbFile = new PdbFile(path, tags);