do string compression in StringCompressor instead of Tag
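The diff below replaces direct new Tag(key, value) construction with Tags.STRING_COMPRESSOR.createTag(key, value), so the string-to-integer compression happens inside StringCompressor rather than inside the Tag(String, String) constructor. A minimal sketch of what such a factory could look like follows; the put(String)-returns-id behaviour, the map-backed storage and the id-based tag type are illustrative assumptions, not code from this commit:

    // Sketch only: illustrates the shape of the change, not the committed code.
    // Assumptions (not in this commit): put(String) returns the compressed int id,
    // and the tag type can be built directly from those ids.
    import java.util.HashMap;
    import java.util.Map;

    final class StringCompressorSketch {

        private final Map<String, Integer> ids = new HashMap<>();

        // stand-in for StringCompressor.put(String)
        int put(final String s) {
            return ids.computeIfAbsent(s, key -> ids.size());
        }

        // stand-in for the new StringCompressor.createTag(String, String):
        // the compression now happens here instead of in the Tag(String, String)
        // constructor, so call sites change from new Tag(k, v)
        // to Tags.STRING_COMPRESSOR.createTag(k, v).
        TagSketch createTag(final String key, final String value) {
            return new TagSketch(put(key), put(value));
        }

        record TagSketch(int keyId, int valueId) {
        }
    }

Keeping the compression behind createTag makes the dependency on the compressor explicit at the call sites, which is what every hunk below changes.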
@@ -127,8 +127,9 @@ public class DataStore implements AutoCloseable {
Tags.STRING_COMPRESSOR = StringCompressor.create(keyCompressionFile(storageBasePath));
Tags.STRING_COMPRESSOR.put(ALL_DOCS_KEY);
Tags.STRING_COMPRESSOR.put("");
TAG_ALL_DOCS = new Tag(ALL_DOCS_KEY, ""); // Tag(String, String) uses the StringCompressor internally, so it
// must be initialized after the string compressor has been created
TAG_ALL_DOCS = Tags.STRING_COMPRESSOR.createTag(ALL_DOCS_KEY, ""); // Tag(String, String) uses the
// StringCompressor internally, so it
// must be initialized after the string compressor has been created

diskStorage = new PartitionDiskStore(storageBasePath, "data.bs");

@@ -263,7 +264,7 @@ public class DataStore implements AutoCloseable {

final Set<String> keys = new HashSet<>();

final Tag keyPrefix = new Tag("", ""); // will find everything
final Tag keyPrefix = Tags.STRING_COMPRESSOR.createTag("", ""); // will find everything

final PartitionIdSource partitionIdSource = new DatePartitioner(dateRange);
tagToDocsId.visitValues(partitionIdSource, keyPrefix, (tags, __) -> keys.add(tags.getKeyAsString()));

@@ -154,8 +154,8 @@ public class QueryCompletionIndex implements AutoCloseable {

public TwoTags(final String fieldB, final String fieldA, final String valueA, final String valueB) {

tagA = new Tag(fieldA, valueA);
tagB = new Tag(fieldB, valueB);
tagA = Tags.STRING_COMPRESSOR.createTag(fieldA, valueA);
tagB = Tags.STRING_COMPRESSOR.createTag(fieldB, valueB);
}

public Tag getTagA() {

@@ -38,17 +38,17 @@ class TagEncoderDecoder implements EncoderDecoder<Tag> {
switch (compressedStrings.size()) {
case 0:

result = new Tag("", "");
result = Tags.STRING_COMPRESSOR.createTag("", "");
break;
case 1:
final String k = Tags.STRING_COMPRESSOR.get((int) compressedStrings.get(0));
result = new Tag(k, "");
result = Tags.STRING_COMPRESSOR.createTag(k, "");

break;
case 2:
final String key = Tags.STRING_COMPRESSOR.get((int) compressedStrings.get(0));
final String value = Tags.STRING_COMPRESSOR.get((int) compressedStrings.get(1));
result = new Tag(key, value);
result = Tags.STRING_COMPRESSOR.createTag(key, value);
break;
default:
throw new IllegalStateException("too many values: " + compressedStrings);

@@ -10,6 +10,7 @@ import java.util.stream.Collectors;
import org.lucares.collections.LongList;
import org.lucares.pdb.api.DateTimeRange;
import org.lucares.pdb.api.Tag;
import org.lucares.pdb.api.Tags;
import org.lucares.pdb.blockstorage.LongStreamFile;
import org.lucares.pdb.datastore.internal.DataStore;
import org.lucares.pdb.datastore.internal.DatePartitioner;
@@ -148,27 +149,29 @@ public class ExpressionToDocIdVisitor extends ExpressionVisitor<PartitionLongLis
final Set<ParititionId> availablePartitionIds = keyToValueToDocId.getAvailablePartitionIds(datePartitioner);
for (final ParititionId partitionId : availablePartitionIds) {
final List<LongList> docIdsForPartition = new ArrayList<>();
keyToValueToDocId.visitValues(partitionId, new Tag(propertyName, ""), (tags, blockOffsetToDocIds) -> {
if (valuePattern.matcher(tags.getValueAsString()).matches()) {
try (final LongStreamFile bsFile = diskStorage.streamExistingFile(blockOffsetToDocIds,
partitionId)) {
keyToValueToDocId.visitValues(partitionId, Tags.STRING_COMPRESSOR.createTag(propertyName, ""),
(tags, blockOffsetToDocIds) -> {
if (valuePattern.matcher(tags.getValueAsString()).matches()) {
try (final LongStreamFile bsFile = diskStorage.streamExistingFile(blockOffsetToDocIds,
partitionId)) {

// We know that all LongLists coming from a BSFile are sorted, non-overlapping
// and increasing, that means we can just concatenate them and get a sorted
// list.
final List<LongList> longLists = bsFile.streamOfLongLists().collect(Collectors.toList());
final LongList concatenatedLists = concatenateLists(longLists);
// We know that all LongLists coming from a BSFile are sorted, non-overlapping
// and increasing, that means we can just concatenate them and get a sorted
// list.
final List<LongList> longLists = bsFile.streamOfLongLists()
.collect(Collectors.toList());
final LongList concatenatedLists = concatenateLists(longLists);

Preconditions.checkTrue(concatenatedLists.isSorted(),
"The LongLists containing document ids must be sorted, "
+ "non-overlapping and increasing, so that the concatenation "
+ "is sorted. This is guaranteed by the fact that document ids "
+ "are generated in monotonically increasing order.");
Preconditions.checkTrue(concatenatedLists.isSorted(),
"The LongLists containing document ids must be sorted, "
+ "non-overlapping and increasing, so that the concatenation "
+ "is sorted. This is guaranteed by the fact that document ids "
+ "are generated in monotonically increasing order.");

docIdsForPartition.add(concatenatedLists);
}
}
});
docIdsForPartition.add(concatenatedLists);
}
}
});

final LongList mergedDocsIdsForPartition = LongList.union(docIdsForPartition);
result.put(partitionId, mergedDocsIdsForPartition);

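The comment kept in the hunk above states the invariant that justifies plain concatenation: the LongLists streamed from a BSFile are sorted, non-overlapping and increasing, so appending them in order already yields a sorted list (which Preconditions.checkTrue then asserts). A small self-contained illustration with plain long arrays; it does not use the project's LongList or Preconditions classes:

    // Illustration only: uses plain long[] blocks instead of the project's LongList.
    import java.util.Arrays;
    import java.util.stream.LongStream;

    final class ConcatenationStaysSortedDemo {
        public static void main(final String[] args) {
            // sorted, non-overlapping, increasing blocks of document ids ...
            final long[][] blocks = { { 1, 3, 5 }, { 7, 8 }, { 9, 12, 20 } };

            // ... so concatenating them in order already yields a sorted list
            final long[] concatenated = Arrays.stream(blocks).flatMapToLong(LongStream::of).toArray();

            final long[] sorted = concatenated.clone();
            Arrays.sort(sorted);
            System.out.println(Arrays.equals(concatenated, sorted)); // prints: true
        }
    }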
@@ -7,6 +7,7 @@ import java.util.TreeSet;

import org.lucares.pdb.api.DateTimeRange;
import org.lucares.pdb.api.Tag;
import org.lucares.pdb.api.Tags;
import org.lucares.pdb.datastore.internal.GlobMatcher;
import org.lucares.pdb.datastore.internal.QueryCompletionIndex;
import org.lucares.pdb.datastore.lang.Expression.And;
@@ -62,7 +63,7 @@ public class FindValuesForQueryCompletion extends ExpressionVisitor<SortedSet<St
result = new TreeSet<>();

for (final String v : valuesA) {
final Tag tagA = new Tag(fieldA, v);
final Tag tagA = Tags.STRING_COMPRESSOR.createTag(fieldA, v);
final SortedSet<String> tmp = index.find(dateTimeRange, tagA, field);
result.addAll(tmp);
}
@@ -150,7 +151,7 @@ public class FindValuesForQueryCompletion extends ExpressionVisitor<SortedSet<St
}

final Property property = (Property) expression.getExpression();
final Tag tag = new Tag(property.getField(), property.getValueAsString());
final Tag tag = Tags.STRING_COMPRESSOR.createTag(property.getField(), property.getValueAsString());

final SortedSet<String> valuesNotForField = index.findAllValuesNotForField(dateTimeRange, tag, field);
final SortedSet<String> valuesForField = index.find(dateTimeRange, tag, field);

@@ -10,14 +10,13 @@ import java.util.SortedSet;
import java.util.TreeSet;

import org.junit.jupiter.api.AfterEach;
import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
import org.lucares.pdb.api.DateTimeRange;
import org.lucares.pdb.api.StringCompressor;
import org.lucares.pdb.api.Tag;
import org.lucares.pdb.api.Tags;
import org.lucares.pdb.api.UniqueStringIntegerPairs;
import org.junit.jupiter.api.Assertions;
import org.lucares.utils.file.FileUtils;

public class QueryCompletionIndexTest {
@@ -54,14 +53,14 @@ public class QueryCompletionIndexTest {

// all firstnames where lastname=Doe are returned sorted alphabetically.
// tags A and B match
final SortedSet<String> firstnamesWithLastnameDoe = index.find(dateRange, new Tag("lastname", "Doe"),
"firstname");
final SortedSet<String> firstnamesWithLastnameDoe = index.find(dateRange,
Tags.STRING_COMPRESSOR.createTag("lastname", "Doe"), "firstname");
Assertions.assertEquals(new TreeSet<>(Set.of("Jane", "John")), firstnamesWithLastnameDoe);

// no duplicates are returned:
// tags A and C match firstname=John, but both have country=Atlantis
final SortedSet<String> countryWithFirstnameJohn = index.find(dateRange, new Tag("firstname", "John"),
"country");
final SortedSet<String> countryWithFirstnameJohn = index.find(dateRange,
Tags.STRING_COMPRESSOR.createTag("firstname", "John"), "country");
Assertions.assertEquals(new TreeSet<>(Arrays.asList("Atlantis")), countryWithFirstnameJohn);

// findAllValuesForField sorts alphabetically