From e3102c01d472b6980a2a40da5d1806971ab6bdea Mon Sep 17 00:00:00 2001 From: Andreas Huber Date: Sat, 5 May 2018 10:46:16 +0200 Subject: [PATCH] use listing.csv instead of iterating through all folders The hope is that reading a single file is faster than listing hundreds of folders. --- .../pdb/datastore/internal/DataStore.java | 3 ++ .../pdb/datastore/internal/FolderStorage.java | 53 +++++++++++++++++-- .../datastore/internal/FolderStorageTest.java | 42 ++++++++++++++- .../performance/db/PerformanceDbTest.java | 2 +- 4 files changed, 95 insertions(+), 5 deletions(-) diff --git a/data-store/src/main/java/org/lucares/pdb/datastore/internal/DataStore.java b/data-store/src/main/java/org/lucares/pdb/datastore/internal/DataStore.java index d77cf26..fe82cd6 100644 --- a/data-store/src/main/java/org/lucares/pdb/datastore/internal/DataStore.java +++ b/data-store/src/main/java/org/lucares/pdb/datastore/internal/DataStore.java @@ -27,6 +27,7 @@ import org.slf4j.LoggerFactory; public class DataStore { private static final Logger EXECUTE_QUERY_LOGGER = LoggerFactory .getLogger("org.lucares.metrics.dataStore.executeQuery"); + private static final Logger INITIALIZE = LoggerFactory.getLogger("org.lucares.metrics.dataStore.init"); private static final Logger LOGGER = LoggerFactory.getLogger(DataStore.class); private static final String SUBDIR_STORAGE = "storage"; @@ -50,6 +51,7 @@ public class DataStore { private void init(final FolderStorage folderStorage) throws IOException { + final long start = System.nanoTime(); final Stream files = folderStorage.list(); files.parallel().forEach(path -> { @@ -63,6 +65,7 @@ public class DataStore { synchronized (docIdToDoc) { ((ArrayList) docIdToDoc).trimToSize(); } + INITIALIZE.info(((System.nanoTime() - start) / 1_000_000.0) + "ms"); } private void cacheTagToFileMapping(final Tags tags, final Path path) { diff --git a/data-store/src/main/java/org/lucares/pdb/datastore/internal/FolderStorage.java 
b/data-store/src/main/java/org/lucares/pdb/datastore/internal/FolderStorage.java index 34a0899..4da8295 100644 --- a/data-store/src/main/java/org/lucares/pdb/datastore/internal/FolderStorage.java +++ b/data-store/src/main/java/org/lucares/pdb/datastore/internal/FolderStorage.java @@ -1,14 +1,27 @@ package org.lucares.pdb.datastore.internal; import java.io.IOException; +import java.io.Writer; +import java.nio.charset.StandardCharsets; import java.nio.file.Files; import java.nio.file.Path; +import java.nio.file.Paths; +import java.nio.file.StandardOpenOption; import java.nio.file.attribute.BasicFileAttributes; +import java.util.Iterator; import java.util.function.BiPredicate; import java.util.stream.Stream; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + public class FolderStorage { + static final String LISTING_FILE_NAME = "listing.csv"; + private final static Logger LOGGER = LoggerFactory.getLogger(FolderStorage.class); + private final static Logger METRICS_CREATE_LISTING_FILE = LoggerFactory + .getLogger("org.lucares.metrics.fodlerStorage.createListingFile"); + private final Path storageBaseDirectory; private int firstLevel = 0; @@ -19,8 +32,11 @@ public class FolderStorage { private final int maxFilesPerFolder; + private final Path listingFile; + public FolderStorage(final Path storageBaseDirectory, final int maxFilesPerFolder) throws IOException { this.storageBaseDirectory = storageBaseDirectory; + this.listingFile = storageBaseDirectory.resolve(LISTING_FILE_NAME); this.maxFilesPerFolder = maxFilesPerFolder; init(); } @@ -29,12 +45,12 @@ public class FolderStorage { Files.createDirectories(storageBaseDirectory); - firstLevel = Math.max((int) Files.list(storageBaseDirectory).count() - 1, 0); + firstLevel = Math.max((int) Files.list(storageBaseDirectory).filter(Files::isDirectory).count() - 1, 0); final Path firstLevelDirectory = storageBaseDirectory.resolve(String.valueOf(firstLevel)); Files.createDirectories(firstLevelDirectory); - secondLevel = 
Math.max((int) Files.list(firstLevelDirectory).count() - 1, 0); + secondLevel = Math.max((int) Files.list(firstLevelDirectory).filter(Files::isDirectory).count() - 1, 0); currentDirectory = firstLevelDirectory.resolve(String.valueOf(secondLevel)); Files.createDirectories(currentDirectory); @@ -55,9 +71,19 @@ public class FolderStorage { Files.createFile(newFile); filesInSecondLevel++; + updateListingFile(newFile); + return newFile; } + private synchronized void updateListingFile(final Path newFile) throws IOException { + try (Writer out = Files.newBufferedWriter(listingFile, StandardCharsets.UTF_8, StandardOpenOption.CREATE, + StandardOpenOption.APPEND)) { + out.write(newFile.toString()); + out.write("\n"); + } + } + private void ensureCapacity() throws IOException { if (filesInSecondLevel >= maxFilesPerFolder) { secondLevel++; @@ -78,9 +104,30 @@ public class FolderStorage { } public Stream list() throws IOException { + + if (!Files.exists(listingFile)) { + final long start = System.nanoTime(); + LOGGER.info("listing file not found -> creating a new one"); + createNewListingFile(); + METRICS_CREATE_LISTING_FILE.info(((System.nanoTime() - start) / 1_000_000.0) + "ms"); + } + return Files.lines(listingFile, StandardCharsets.UTF_8).map(Paths::get); + } + + private void createNewListingFile() throws IOException { final int maxDepth = Integer.MAX_VALUE; final BiPredicate matchRegularFiles = (path, attr) -> Files.isRegularFile(path); - return Files.find(storageBaseDirectory, maxDepth, matchRegularFiles); + try (final Writer out = Files.newBufferedWriter(listingFile, StandardCharsets.UTF_8, StandardOpenOption.CREATE, + StandardOpenOption.APPEND); + final Stream stream = Files.find(storageBaseDirectory, maxDepth, matchRegularFiles)) { + + final Iterator iterator = stream.iterator(); + while (iterator.hasNext()) { + final Path path = iterator.next(); + out.write(path.toString()); + out.write("\n"); + } + } } } diff --git 
a/data-store/src/test/java/org/lucares/pdb/datastore/internal/FolderStorageTest.java b/data-store/src/test/java/org/lucares/pdb/datastore/internal/FolderStorageTest.java index 5d35c6b..c4c4dff 100644 --- a/data-store/src/test/java/org/lucares/pdb/datastore/internal/FolderStorageTest.java +++ b/data-store/src/test/java/org/lucares/pdb/datastore/internal/FolderStorageTest.java @@ -7,6 +7,7 @@ import java.nio.file.Paths; import java.util.Arrays; import java.util.Collections; import java.util.List; +import java.util.stream.Collectors; import org.lucares.utils.CollectionUtils; import org.lucares.utils.file.FileUtils; @@ -15,6 +16,7 @@ import org.testng.annotations.AfterMethod; import org.testng.annotations.BeforeMethod; import org.testng.annotations.Test; +@Test public class FolderStorageTest { private static final String SUFFIX = ".txt"; private Path dataDirectory; @@ -74,8 +76,46 @@ public class FolderStorageTest { Assert.assertEquals(actualFiles, expectedFiles); } + @Test + public void testCreateAndUpdateFileListing() throws Exception { + final int maxFilesPerFolder = 10; + final Path storageLeafFolder = dataDirectory.resolve("0").resolve("0"); + // initial creation + { + final FolderStorage storage = new FolderStorage(dataDirectory, maxFilesPerFolder); + storage.insert("abc", ".txt"); + storage.insert("def", ".txt"); + + final List initialListing = storage.list().collect(Collectors.toList()); + Assert.assertEquals(initialListing, + Arrays.asList(storageLeafFolder.resolve("abc$.txt"), storageLeafFolder.resolve("def$.txt"))); + } + + // load existing storage + { + final FolderStorage storage = new FolderStorage(dataDirectory, maxFilesPerFolder); + + // files inserted previously are still there + final List initialListing = storage.list().collect(Collectors.toList()); + + Assert.assertEquals(initialListing, + Arrays.asList(storageLeafFolder.resolve("abc$.txt"), storageLeafFolder.resolve("def$.txt"))); + + // add new file + storage.insert("ghi", ".txt"); + + // listing 
is updated + final List updatedListing = storage.list().collect(Collectors.toList()); + Assert.assertEquals(updatedListing, Arrays.asList(storageLeafFolder.resolve("abc$.txt"), + storageLeafFolder.resolve("def$.txt"), storageLeafFolder.resolve("ghi$.txt"))); + } + + } + private List getPathsRelativeToDataDirectory() throws IOException { - final List actualFiles = FileUtils.listRecursively(dataDirectory); + List actualFiles = FileUtils.listRecursively(dataDirectory); + actualFiles = CollectionUtils.filter(actualFiles, + p -> !p.getFileName().toString().equals(FolderStorage.LISTING_FILE_NAME)); CollectionUtils.mapInPlace(actualFiles, p -> dataDirectory.relativize(p)); Collections.sort(actualFiles); return actualFiles; diff --git a/performanceDb/src/test/java/org/lucares/performance/db/PerformanceDbTest.java b/performanceDb/src/test/java/org/lucares/performance/db/PerformanceDbTest.java index 7155b6c..6ce7902 100644 --- a/performanceDb/src/test/java/org/lucares/performance/db/PerformanceDbTest.java +++ b/performanceDb/src/test/java/org/lucares/performance/db/PerformanceDbTest.java @@ -113,7 +113,7 @@ public class PerformanceDbTest { final List filesInStorage = FileUtils.listRecursively(DataStore.storageDirectory(dataDirectory)); - Assert.assertEquals(filesInStorage.size(), 1); + Assert.assertEquals(filesInStorage.size(), 2, "the created file and the listing.csv"); final Path tagSpecificFile = filesInStorage.get(0);