reduce memory footprint by lazily initializing the path in Doc

The path in Doc is now optional. This reduces memory consumption,
because we only have to store a long (the offset in the listing file).
This assumes that only a small percentage of Docs is requested.
This commit is contained in:
2018-05-06 12:58:10 +02:00
parent e3102c01d4
commit 82b8a8a932
11 changed files with 324 additions and 39 deletions

View File

@@ -8,6 +8,7 @@ dependencies {
compile 'org.lucares:primitiveCollections:0.1.20171228131833'
compile 'org.apache.commons:commons-lang3:3.7'
compile 'com.google.guava:guava:24.1-jre'
compile 'org.apache.logging.log4j:log4j-core:2.10.0'
compile 'org.apache.logging.log4j:log4j-slf4j-impl:2.10.0'

View File

@@ -5,27 +5,77 @@ import java.nio.file.Path;
import java.nio.file.Paths;
import org.lucares.pdb.api.Tags;
import org.lucares.pdb.datastore.internal.DataStore;
public class Doc {
private final Tags tags;
private final byte[] path;
private final long offsetInListingFile;
private byte[] path;
public Doc(final Tags tags, final Path path) {
/**
* Initializes a new document.
* <p>
* The path can be {@code null}. If path is {@code null}, then
* {@code offsetInListingFile} must be set. The path will be initialized lazily
* when needed.
* <p>
* This is used to reduce the memory footprint.
*
* @param tags
* @param offsetInListingFile
* must be set if {@code path} is {@code null}
* @param path
* optional, can be {@code null}
*/
public Doc(final Tags tags, final long offsetInListingFile, final Path path) {
super();
this.tags = tags;
this.path = path.toString().getBytes(StandardCharsets.UTF_8);
this.offsetInListingFile = offsetInListingFile;
setPath(path);
}
/**
 * Returns the tags this document was created with.
 *
 * @return the tags passed to the constructor
 */
public Tags getTags() {
    return this.tags;
}
public Path getPath() {
return Paths.get(new String(path, StandardCharsets.UTF_8));
/**
 * Stores the given path as UTF-8 encoded bytes, or clears the stored path.
 *
 * @param path
 *            the path to remember; {@code null} clears the stored path
 */
public void setPath(final Path path) {
    this.path = (path == null) ? null : path.toString().getBytes(StandardCharsets.UTF_8);
}
/**
 * The path to the storage file.
 * <p>
 * This value is lazily initialized. Callers have to provide a resolver. See
 * {@link DataStore#getFolderStoragePathResolver()}.
 * <p>
 * If the path has not been set yet, the resolver is asked for the path that
 * belongs to {@code offsetInListingFile} and the answer is remembered.
 *
 * @param resolver
 *            used to look up the path when it is not yet known
 * @return the path, or {@code null} if the path is unknown and the resolver
 *         returned {@code null} (e.g. {@link FolderStoragePathResolver#NULL})
 */
public Path getPath(final FolderStoragePathResolver resolver) {
    if (path == null) {
        setPath(resolver.getPath(offsetInListingFile));
    }
    if (path == null) {
        // The resolver had no answer; decoding a null byte[] would throw a
        // NullPointerException (this is the case for toString(), which
        // resolves with FolderStoragePathResolver.NULL).
        return null;
    }
    return Paths.get(new String(path, StandardCharsets.UTF_8));
}
/**
 * Asks {@code getPath} for the path using {@link FolderStoragePathResolver#NULL},
 * a resolver that answers {@code null} for every offset.
 */
private Path getPathNullable() {
    final FolderStoragePathResolver noLookup = FolderStoragePathResolver.NULL;
    return getPath(noLookup);
}
/**
 * Returns the offset in the listing file that was passed to the constructor.
 *
 * @return the offset in the listing file
 */
public long getOffsetInListingFile() {
    return this.offsetInListingFile;
}
@Override
public String toString() {
return "Doc [tags=" + tags + ", path=" + getPath() + "]";
return "Doc [tags=" + tags + ", offsetInListingFile=" + offsetInListingFile + ", path=" + getPathNullable()
+ "]";
}
}

View File

@@ -0,0 +1,9 @@
package org.lucares.pdb.datastore;
import java.nio.file.Path;
/**
 * Resolves the path of a storage file from its offset in the listing file.
 * <p>
 * The interface has a single abstract method and is meant to be implemented
 * with a lambda or a method reference.
 */
@FunctionalInterface
public interface FolderStoragePathResolver {

    /** A resolver that does not know any path: it returns {@code null} for every offset. */
    FolderStoragePathResolver NULL = offset -> null;

    /**
     * Returns the path that belongs to the given offset.
     *
     * @param offsetInListingFile
     *            the offset of the entry in the listing file
     * @return the path; implementations such as {@link #NULL} may return
     *         {@code null}
     */
    Path getPath(long offsetInListingFile);
}

View File

@@ -39,8 +39,12 @@ public class PdbDB {
return proposer.propose(query, caretIndex);
}
public List<Doc> getByTags(Tags tags) {
public List<Doc> getByTags(final Tags tags) {
return dataStore.getByTags(tags);
}
/**
 * Returns the path resolver of the underlying data store.
 *
 * @return the resolver obtained from the data store
 */
public FolderStoragePathResolver getFolderStoragePathResolver() {
    final FolderStoragePathResolver resolver = dataStore.getFolderStoragePathResolver();
    return resolver;
}
}

View File

@@ -17,6 +17,7 @@ import org.lucares.collections.IntList;
import org.lucares.pdb.api.StringCompressor;
import org.lucares.pdb.api.Tags;
import org.lucares.pdb.datastore.Doc;
import org.lucares.pdb.datastore.FolderStoragePathResolver;
import org.lucares.pdb.datastore.lang.Expression;
import org.lucares.pdb.datastore.lang.ExpressionToDocIdVisitor;
import org.lucares.pdb.datastore.lang.ExpressionToDocIdVisitor.AllDocIds;
@@ -41,25 +42,33 @@ public class DataStore {
private final ConcurrentHashMap<String, Map<String, IntList>> keyToValueToDocId = new ConcurrentHashMap<>();
private final FolderStorage folderStorage;
private final FolderStoragePathResolver folderStoragePathResolver;
/**
 * Creates a data store for the given data directory.
 * <p>
 * Note that this assigns the static field {@code Tags.STRING_COMPRESSOR}.
 *
 * @param dataDirectory
 *            the directory from which the key compression file and the
 *            storage directory are derived
 * @throws IOException
 *             if the folder storage cannot be created or initialized
 */
public DataStore(final Path dataDirectory) throws IOException {
Tags.STRING_COMPRESSOR = StringCompressor.create(keyCompressionFile(dataDirectory));
// second argument is the maximum number of files per folder
folderStorage = new FolderStorage(storageDirectory(dataDirectory), 1000);
init(folderStorage);
// paths are looked up by their offset in the listing file of the folder storage
folderStoragePathResolver = folderStorage::getPathByOffset;
}
private void init(final FolderStorage folderStorage) throws IOException {
final long start = System.nanoTime();
final Stream<Path> files = folderStorage.list();
files.parallel().forEach(path -> {
final Stream<ListingFileEntry> files = folderStorage.list();
files// .parallel()
.forEach(listingFileEntry -> {
final String filename = path.getFileName().toString();
final Tags tags = toTags(filename);
cacheTagToFileMapping(tags, path);
listingFileEntry.unsetPath(); // unset the path, so that we don't store it for every document (will
// be
// initialized lazily if needed)
});
final String filename = listingFileEntry.getFilename();
final Tags tags = toTags(filename);
cacheTagToFileMapping(tags, listingFileEntry);
});
trimIntLists();
sortIntLists();
synchronized (docIdToDoc) {
@@ -68,10 +77,10 @@ public class DataStore {
INITIALIZE.info(((System.nanoTime() - start) / 1_000_000.0) + "ms");
}
private void cacheTagToFileMapping(final Tags tags, final Path path) {
private void cacheTagToFileMapping(final Tags tags, final ListingFileEntry listingFileEntry) {
final int docId;
final Doc newDoc = new Doc(tags, path);
final Doc newDoc = new Doc(tags, listingFileEntry.getOffsetInListingFile(), listingFileEntry.getPath());
synchronized (docIdToDoc) {
docId = docIdToDoc.size();
docIdToDoc.add(newDoc);
@@ -140,11 +149,11 @@ public class DataStore {
public Path createNewFile(final Tags tags) throws IOException {
final String filename = tags.getFilename();
final Path result = folderStorage.insert(filename, PDB_EXTENSION);
final ListingFileEntry listingFileEntry = folderStorage.insert(filename, PDB_EXTENSION);
cacheTagToFileMapping(tags, result);
cacheTagToFileMapping(tags, listingFileEntry);
return result;
return listingFileEntry.getPath();
}
private Tags toTags(final String filename) {
@@ -232,4 +241,8 @@ public class DataStore {
final List<Doc> result = tagsToDocs.getOrDefault(tags, new ArrayList<>(0));
return result;
}
/**
 * Returns the resolver that maps offsets in the listing file to paths.
 *
 * @return the resolver held by this data store
 */
public FolderStoragePathResolver getFolderStoragePathResolver() {
    return this.folderStoragePathResolver;
}
}

View File

@@ -1,6 +1,9 @@
package org.lucares.pdb.datastore.internal;
import java.io.BufferedReader;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.RandomAccessFile;
import java.io.Writer;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
@@ -9,9 +12,13 @@ import java.nio.file.Paths;
import java.nio.file.StandardOpenOption;
import java.nio.file.attribute.BasicFileAttributes;
import java.util.Iterator;
import java.util.Spliterator;
import java.util.Spliterators;
import java.util.function.BiPredicate;
import java.util.stream.Stream;
import java.util.stream.StreamSupport;
import org.lucares.pdb.api.RuntimeIOException;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -39,6 +46,7 @@ public class FolderStorage {
this.listingFile = storageBaseDirectory.resolve(LISTING_FILE_NAME);
this.maxFilesPerFolder = maxFilesPerFolder;
init();
initListingFileIfNotExists();
}
private void init() throws IOException {
@@ -57,7 +65,7 @@ public class FolderStorage {
filesInSecondLevel = (int) Files.list(currentDirectory).count();
}
public Path insert(final String filenamePrefix, final String filenameSuffix) throws IOException {
public ListingFileEntry insert(final String filenamePrefix, final String filenameSuffix) throws IOException {
ensureCapacity();
@@ -71,17 +79,29 @@ public class FolderStorage {
Files.createFile(newFile);
filesInSecondLevel++;
updateListingFile(newFile);
final ListingFileEntry result = updateListingFile(newFile);
return newFile;
return result;
}
private synchronized void updateListingFile(final Path newFile) throws IOException {
private synchronized ListingFileEntry updateListingFile(final Path newFile) throws IOException {
final long offsetInListingFile = getFilePointer();
try (Writer out = Files.newBufferedWriter(listingFile, StandardCharsets.UTF_8, StandardOpenOption.CREATE,
StandardOpenOption.APPEND)) {
out.write(newFile.toString());
out.write("\n");
}
final String filename = newFile.getFileName().toString();
return new ListingFileEntry(filename, offsetInListingFile, newFile);
}
/**
 * Returns the position at which the next appended entry will start, i.e. the
 * current length of the listing file in bytes.
 *
 * @return the length of the listing file in bytes
 * @throws FileNotFoundException
 *             if the listing file does not exist
 * @throws IOException
 *             if the length cannot be determined
 */
private long getFilePointer() throws FileNotFoundException, IOException {
    try (RandomAccessFile randomAccessFile = new RandomAccessFile(listingFile.toFile(), "r")) {
        // The file pointer of a freshly opened RandomAccessFile is always 0.
        // Entries are appended, so the offset of the next entry is the
        // current length of the file.
        return randomAccessFile.length();
    }
}
private void ensureCapacity() throws IOException {
@@ -103,15 +123,28 @@ public class FolderStorage {
Files.createDirectories(currentDirectory);
}
public Stream<Path> list() throws IOException {
public Stream<ListingFileEntry> list() throws IOException {
return readListingFile();
}
/**
 * Opens the listing file and returns its entries as a lazily evaluated,
 * sequential stream.
 * <p>
 * The listing file stays open while the stream is consumed. Closing the
 * returned stream closes the file, so callers should close the stream when
 * they are done with it.
 *
 * @return the entries of the listing file in file order
 * @throws IOException
 *             if the listing file cannot be opened
 */
private Stream<ListingFileEntry> readListingFile() throws IOException {
    // Must not be a try-with-resources: the stream is evaluated lazily, so the
    // iterator has to outlive this method. It is closed via onClose instead.
    final ListingFileIterator iterator = new ListingFileIterator(listingFile);
    final Spliterator<ListingFileEntry> spliterator = Spliterators.spliteratorUnknownSize(iterator,
            Spliterator.ORDERED);
    return StreamSupport.stream(spliterator, false).onClose(() -> {
        try {
            iterator.close();
        } catch (final IOException e) {
            throw new RuntimeIOException(e);
        }
    });
}
private void initListingFileIfNotExists() throws IOException {
if (!Files.exists(listingFile)) {
final long start = System.nanoTime();
LOGGER.info("listing file not found -> creating a new one");
createNewListingFile();
METRICS_CREATE_LISTING_FILE.info(((System.nanoTime() - start) / 1_000_000.0) + "ms");
}
return Files.lines(listingFile, StandardCharsets.UTF_8).map(Paths::get);
}
private void createNewListingFile() throws IOException {
@@ -125,9 +158,23 @@ public class FolderStorage {
final Iterator<Path> iterator = stream.iterator();
while (iterator.hasNext()) {
final Path path = iterator.next();
out.write(path.toString());
out.write("\n");
if (!path.getFileName().toString().equals(LISTING_FILE_NAME)) {
out.write(path.toString());
out.write("\n");
}
}
}
}
/**
 * Reads the path stored at the given position of the listing file.
 *
 * @param offsetInListingFile
 *            the byte offset at which the line of the entry starts
 * @return the path stored in the line that starts at the offset
 * @throws RuntimeIOException
 *             if the listing file cannot be read or there is no entry at the
 *             given offset
 */
public Path getPathByOffset(final long offsetInListingFile) throws RuntimeIOException {
    try (RandomAccessFile file = new RandomAccessFile(listingFile.toFile(), "r")) {
        // The offset is a byte offset, so position by bytes. Skipping
        // characters of a decoded reader would land on the wrong position as
        // soon as the file contains multi-byte UTF-8 sequences.
        file.seek(offsetInListingFile);
        final String rawLine = file.readLine();
        if (rawLine == null) {
            throw new IOException(
                    "no entry at offset " + offsetInListingFile + " in listing file " + listingFile);
        }
        // RandomAccessFile.readLine maps every byte to one char (ISO-8859-1);
        // recover the bytes and decode them as UTF-8, the encoding the
        // listing file is written in.
        final String line = new String(rawLine.getBytes(StandardCharsets.ISO_8859_1), StandardCharsets.UTF_8);
        return Paths.get(line);
    } catch (final IOException e) {
        throw new RuntimeIOException(e);
    }
}
}

View File

@@ -0,0 +1,74 @@
package org.lucares.pdb.datastore.internal;
import java.nio.file.Path;
import javax.annotation.Nullable;
/**
 * An entry of the listing file: the name of a file, the offset of its line in
 * the listing file and, optionally, its path.
 * <p>
 * The path can be dropped with {@link #unsetPath()}; it takes part in
 * {@link #equals(Object)} and {@link #hashCode()}.
 */
public class ListingFileEntry {

    private final String filename;
    private final long offsetInListingFile;
    private Path path;

    /**
     * @param filename
     *            the name of the file
     * @param offsetInListingFile
     *            the offset of the entry in the listing file
     * @param path
     *            the path of the file, can be {@code null}
     */
    public ListingFileEntry(final String filename, final long offsetInListingFile, final Path path) {
        this.filename = filename;
        this.offsetInListingFile = offsetInListingFile;
        this.path = path;
    }

    public String getFilename() {
        return filename;
    }

    public long getOffsetInListingFile() {
        return offsetInListingFile;
    }

    /** Drops the path; afterwards {@link #getPath()} returns {@code null}. */
    public void unsetPath() {
        this.path = null;
    }

    @Nullable
    public Path getPath() {
        return path;
    }

    @Override
    public String toString() {
        final StringBuilder text = new StringBuilder("ListingFileEntry [filename=");
        text.append(filename);
        text.append(", offsetInListingFile=").append(offsetInListingFile);
        text.append(", path=").append(path);
        text.append("]");
        return text.toString();
    }

    @Override
    public int hashCode() {
        int hash = 31 + hashOf(filename);
        hash = 31 * hash + Long.hashCode(offsetInListingFile);
        hash = 31 * hash + hashOf(path);
        return hash;
    }

    @Override
    public boolean equals(final Object obj) {
        if (this == obj) {
            return true;
        }
        if (obj == null || getClass() != obj.getClass()) {
            return false;
        }
        final ListingFileEntry that = (ListingFileEntry) obj;
        return offsetInListingFile == that.offsetInListingFile //
                && same(filename, that.filename) //
                && same(path, that.path);
    }

    /** Hash code of the value, 0 for {@code null}. */
    private static int hashOf(final Object value) {
        return value == null ? 0 : value.hashCode();
    }

    /** Null-safe equality of two values. */
    private static boolean same(final Object left, final Object right) {
        return left == null ? right == null : left.equals(right);
    }
}

View File

@@ -0,0 +1,78 @@
package org.lucares.pdb.datastore.internal;
import java.io.BufferedInputStream;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.nio.file.Path;
import java.util.Iterator;
import java.util.NoSuchElementException;
import java.util.Optional;
import org.lucares.pdb.api.RuntimeIOException;
import com.google.common.io.CountingInputStream;
/**
 * Iterates over the entries of a listing file.
 * <p>
 * Every line of the listing file is turned into a {@link ListingFileEntry}
 * that carries the filename (the part of the line after the last
 * {@link File#separator}) and the byte offset at which the line starts. The
 * path of the returned entries is {@code null}.
 * <p>
 * The iterator holds an open stream on the listing file and must be closed
 * after use.
 */
public class ListingFileIterator implements Iterator<ListingFileEntry>, AutoCloseable {

    private final CountingInputStream is;

    /** Entry read ahead by {@link #hasNext()} that was not handed out yet. */
    private ListingFileEntry lookAhead;

    /** {@code true} once a read attempt found no further entry. */
    private boolean exhausted;

    /**
     * @param listingFile
     *            the listing file to read
     * @throws FileNotFoundException
     *             if the listing file cannot be opened
     */
    public ListingFileIterator(final Path listingFile) throws FileNotFoundException {
        is = new CountingInputStream(new BufferedInputStream(new FileInputStream(listingFile.toFile())));
    }

    @Override
    public boolean hasNext() {
        if (lookAhead == null && !exhausted) {
            lookAhead = getNext();
            exhausted = lookAhead == null;
        }
        return lookAhead != null;
    }

    /**
     * Returns the next entry. Works with or without a preceding call to
     * {@link #hasNext()}.
     *
     * @throws NoSuchElementException
     *             if there is no further entry
     */
    @Override
    public ListingFileEntry next() {
        if (!hasNext()) {
            throw new NoSuchElementException();
        }
        final ListingFileEntry result = lookAhead;
        lookAhead = null;
        return result;
    }

    /**
     * Reads the next line directly from the underlying stream, bypassing any
     * entry already buffered by {@link #hasNext()}.
     *
     * @return the entry for the next line, or {@code null} if the end of the
     *         file is reached before a line terminator; a trailing line
     *         without {@code '\n'} is therefore not returned
     * @throws RuntimeIOException
     *             if reading fails
     */
    public ListingFileEntry getNext() {
        // Collect raw bytes and decode the complete line afterwards. Appending
        // single bytes as code points would garble multi-byte UTF-8 sequences.
        final ByteArrayOutputStream lineBytes = new ByteArrayOutputStream();
        try {
            final long offsetInListingFile = is.getCount();

            int b;
            while ((b = is.read()) >= 0 && b != '\n') {
                lineBytes.write(b);
            }
            if (b < 0) {
                return null;
            }

            final String line = new String(lineBytes.toByteArray(), StandardCharsets.UTF_8);
            final int lastSeparatorPosition = line.lastIndexOf(File.separator);
            final String filename = line.substring(lastSeparatorPosition + 1);

            return new ListingFileEntry(filename, offsetInListingFile, null);
        } catch (final IOException e) {
            throw new RuntimeIOException(e);
        }
    }

    @Override
    public void close() throws IOException {
        is.close();
    }
}

View File

@@ -123,7 +123,8 @@ public class DataStoreTest {
private void assertSearch(final String query, final Tags... tags) {
final List<Doc> actualDocs = dataStore.search(query);
final List<Path> actual = CollectionUtils.map(actualDocs, Doc::getPath);
final List<Path> actual = CollectionUtils.map(actualDocs,
doc -> doc.getPath(dataStore.getFolderStoragePathResolver()));
final List<Path> expectedPaths = CollectionUtils.map(tags, tagsToPath::get);
@@ -152,7 +153,8 @@ public class DataStoreTest {
private void assertSearch(final DataStore dataStore, final String query, final Path... paths) {
final List<Doc> actualDocs = dataStore.search(query);
final List<Path> actual = CollectionUtils.map(actualDocs, Doc::getPath);
final List<Path> actual = CollectionUtils.map(actualDocs,
doc -> doc.getPath(dataStore.getFolderStoragePathResolver()));
Assert.assertEquals(actual, Arrays.asList(paths));
}

View File

@@ -80,15 +80,17 @@ public class FolderStorageTest {
public void testCreateAndUpdateFileListing() throws Exception {
final int maxFilesPerFolder = 10;
final Path storageLeafFolder = dataDirectory.resolve("0").resolve("0");
final int storageLeafFolderLength = storageLeafFolder.toString().length();
// initial creation
{
final FolderStorage storage = new FolderStorage(dataDirectory, maxFilesPerFolder);
storage.insert("abc", ".txt");
storage.insert("def", ".txt");
final List<Path> initialListing = storage.list().collect(Collectors.toList());
Assert.assertEquals(initialListing,
Arrays.asList(storageLeafFolder.resolve("abc$.txt"), storageLeafFolder.resolve("def$.txt")));
final List<ListingFileEntry> initialListing = storage.list().collect(Collectors.toList());
Assert.assertEquals(initialListing, Arrays.asList(//
new ListingFileEntry("abc$.txt", 0, null), //
new ListingFileEntry("def$.txt", storageLeafFolderLength + 10, null)));
}
// load existing storage
@@ -96,18 +98,21 @@ public class FolderStorageTest {
final FolderStorage storage = new FolderStorage(dataDirectory, maxFilesPerFolder);
// files inserted previously are still there
final List<Path> initialListing = storage.list().collect(Collectors.toList());
final List<ListingFileEntry> initialListing = storage.list().collect(Collectors.toList());
Assert.assertEquals(initialListing,
Arrays.asList(storageLeafFolder.resolve("abc$.txt"), storageLeafFolder.resolve("def$.txt")));
Assert.assertEquals(initialListing, Arrays.asList(//
new ListingFileEntry("abc$.txt", 0, null), //
new ListingFileEntry("def$.txt", storageLeafFolderLength + 10, null)));
// add new file
storage.insert("ghi", ".txt");
// listing is updated
final List<Path> updatedListing = storage.list().collect(Collectors.toList());
Assert.assertEquals(updatedListing, Arrays.asList(storageLeafFolder.resolve("abc$.txt"),
storageLeafFolder.resolve("def$.txt"), storageLeafFolder.resolve("ghi$.txt")));
final List<ListingFileEntry> updatedListing = storage.list().collect(Collectors.toList());
Assert.assertEquals(updatedListing, Arrays.asList(//
new ListingFileEntry("abc$.txt", 0, null), //
new ListingFileEntry("def$.txt", storageLeafFolderLength + 10, null), //
new ListingFileEntry("ghi$.txt", 2 * storageLeafFolderLength + 20, null)));
}
}