do string compression in StringCompressor instead of Tag
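The diff below replaces direct new Tag(key, value) construction with Tags.STRING_COMPRESSOR.createTag(key, value), so the string-to-integer compression happens inside StringCompressor rather than inside the Tag(String, String) constructor. A minimal sketch of what such a factory could look like follows; the put(String)-returns-id behaviour, the map-backed storage and the id-based tag type are illustrative assumptions, not code from this commit:

    // Sketch only: illustrates the shape of the change, not the committed code.
    // Assumptions (not in this commit): put(String) returns the compressed int id,
    // and the tag type can be built directly from those ids.
    import java.util.HashMap;
    import java.util.Map;

    final class StringCompressorSketch {

        private final Map<String, Integer> ids = new HashMap<>();

        // stand-in for StringCompressor.put(String)
        int put(final String s) {
            return ids.computeIfAbsent(s, key -> ids.size());
        }

        // stand-in for the new StringCompressor.createTag(String, String):
        // the compression now happens here instead of in the Tag(String, String)
        // constructor, so call sites change from new Tag(k, v)
        // to Tags.STRING_COMPRESSOR.createTag(k, v).
        TagSketch createTag(final String key, final String value) {
            return new TagSketch(put(key), put(value));
        }

        record TagSketch(int keyId, int valueId) {
        }
    }

Keeping the compression behind createTag makes the dependency on the compressor explicit at the call sites, which is what every hunk below changes.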
@@ -127,8 +127,9 @@ public class DataStore implements AutoCloseable {
Tags.STRING_COMPRESSOR = StringCompressor.create(keyCompressionFile(storageBasePath));
Tags.STRING_COMPRESSOR.put(ALL_DOCS_KEY);
Tags.STRING_COMPRESSOR.put("");
TAG_ALL_DOCS = new Tag(ALL_DOCS_KEY, ""); // Tag(String, String) uses the StringCompressor internally, so it
// must be initialized after the string compressor has been created
TAG_ALL_DOCS = Tags.STRING_COMPRESSOR.createTag(ALL_DOCS_KEY, ""); // Tag(String, String) uses the
// StringCompressor internally, so it
// must be initialized after the string compressor has been created

diskStorage = new PartitionDiskStore(storageBasePath, "data.bs");

@@ -263,7 +264,7 @@ public class DataStore implements AutoCloseable {

final Set<String> keys = new HashSet<>();

final Tag keyPrefix = new Tag("", ""); // will find everything
final Tag keyPrefix = Tags.STRING_COMPRESSOR.createTag("", ""); // will find everything

final PartitionIdSource partitionIdSource = new DatePartitioner(dateRange);
tagToDocsId.visitValues(partitionIdSource, keyPrefix, (tags, __) -> keys.add(tags.getKeyAsString()));

@@ -154,8 +154,8 @@ public class QueryCompletionIndex implements AutoCloseable {

public TwoTags(final String fieldB, final String fieldA, final String valueA, final String valueB) {

tagA = new Tag(fieldA, valueA);
tagB = new Tag(fieldB, valueB);
tagA = Tags.STRING_COMPRESSOR.createTag(fieldA, valueA);
tagB = Tags.STRING_COMPRESSOR.createTag(fieldB, valueB);
}

public Tag getTagA() {

@@ -38,17 +38,17 @@ class TagEncoderDecoder implements EncoderDecoder<Tag> {
switch (compressedStrings.size()) {
case 0:

result = new Tag("", "");
result = Tags.STRING_COMPRESSOR.createTag("", "");
break;
case 1:
final String k = Tags.STRING_COMPRESSOR.get((int) compressedStrings.get(0));
result = new Tag(k, "");
result = Tags.STRING_COMPRESSOR.createTag(k, "");

break;
case 2:
final String key = Tags.STRING_COMPRESSOR.get((int) compressedStrings.get(0));
final String value = Tags.STRING_COMPRESSOR.get((int) compressedStrings.get(1));
result = new Tag(key, value);
result = Tags.STRING_COMPRESSOR.createTag(key, value);
break;
default:
throw new IllegalStateException("too many values: " + compressedStrings);

@@ -10,6 +10,7 @@ import java.util.stream.Collectors;
import org.lucares.collections.LongList;
import org.lucares.pdb.api.DateTimeRange;
import org.lucares.pdb.api.Tag;
import org.lucares.pdb.api.Tags;
import org.lucares.pdb.blockstorage.LongStreamFile;
import org.lucares.pdb.datastore.internal.DataStore;
import org.lucares.pdb.datastore.internal.DatePartitioner;
@@ -148,27 +149,29 @@ public class ExpressionToDocIdVisitor extends ExpressionVisitor<PartitionLongLis
final Set<ParititionId> availablePartitionIds = keyToValueToDocId.getAvailablePartitionIds(datePartitioner);
for (final ParititionId partitionId : availablePartitionIds) {
final List<LongList> docIdsForPartition = new ArrayList<>();
keyToValueToDocId.visitValues(partitionId, new Tag(propertyName, ""), (tags, blockOffsetToDocIds) -> {
if (valuePattern.matcher(tags.getValueAsString()).matches()) {
try (final LongStreamFile bsFile = diskStorage.streamExistingFile(blockOffsetToDocIds,
partitionId)) {
keyToValueToDocId.visitValues(partitionId, Tags.STRING_COMPRESSOR.createTag(propertyName, ""),
(tags, blockOffsetToDocIds) -> {
if (valuePattern.matcher(tags.getValueAsString()).matches()) {
try (final LongStreamFile bsFile = diskStorage.streamExistingFile(blockOffsetToDocIds,
partitionId)) {

// We know that all LongLists coming from a BSFile are sorted, non-overlapping
// and increasing, that means we can just concatenate them and get a sorted
// list.
final List<LongList> longLists = bsFile.streamOfLongLists().collect(Collectors.toList());
final LongList concatenatedLists = concatenateLists(longLists);
// We know that all LongLists coming from a BSFile are sorted, non-overlapping
// and increasing, that means we can just concatenate them and get a sorted
// list.
final List<LongList> longLists = bsFile.streamOfLongLists()
.collect(Collectors.toList());
final LongList concatenatedLists = concatenateLists(longLists);

Preconditions.checkTrue(concatenatedLists.isSorted(),
"The LongLists containing document ids must be sorted, "
+ "non-overlapping and increasing, so that the concatenation "
+ "is sorted. This is guaranteed by the fact that document ids "
+ "are generated in monotonically increasing order.");
Preconditions.checkTrue(concatenatedLists.isSorted(),
"The LongLists containing document ids must be sorted, "
+ "non-overlapping and increasing, so that the concatenation "
+ "is sorted. This is guaranteed by the fact that document ids "
+ "are generated in monotonically increasing order.");

docIdsForPartition.add(concatenatedLists);
}
}
});
docIdsForPartition.add(concatenatedLists);
}
}
});

final LongList mergedDocsIdsForPartition = LongList.union(docIdsForPartition);
result.put(partitionId, mergedDocsIdsForPartition);

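The comment kept in the hunk above states the invariant that justifies plain concatenation: the LongLists streamed from a BSFile are sorted, non-overlapping and increasing, so appending them in order already yields a sorted list (which Preconditions.checkTrue then asserts). A small self-contained illustration with plain long arrays; it does not use the project's LongList or Preconditions classes:

    // Illustration only: uses plain long[] blocks instead of the project's LongList.
    import java.util.Arrays;
    import java.util.stream.LongStream;

    final class ConcatenationStaysSortedDemo {
        public static void main(final String[] args) {
            // sorted, non-overlapping, increasing blocks of document ids ...
            final long[][] blocks = { { 1, 3, 5 }, { 7, 8 }, { 9, 12, 20 } };

            // ... so concatenating them in order already yields a sorted list
            final long[] concatenated = Arrays.stream(blocks).flatMapToLong(LongStream::of).toArray();

            final long[] sorted = concatenated.clone();
            Arrays.sort(sorted);
            System.out.println(Arrays.equals(concatenated, sorted)); // prints: true
        }
    }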
@@ -7,6 +7,7 @@ import java.util.TreeSet;

import org.lucares.pdb.api.DateTimeRange;
import org.lucares.pdb.api.Tag;
import org.lucares.pdb.api.Tags;
import org.lucares.pdb.datastore.internal.GlobMatcher;
import org.lucares.pdb.datastore.internal.QueryCompletionIndex;
import org.lucares.pdb.datastore.lang.Expression.And;
@@ -62,7 +63,7 @@ public class FindValuesForQueryCompletion extends ExpressionVisitor<SortedSet<St
result = new TreeSet<>();

for (final String v : valuesA) {
final Tag tagA = new Tag(fieldA, v);
final Tag tagA = Tags.STRING_COMPRESSOR.createTag(fieldA, v);
final SortedSet<String> tmp = index.find(dateTimeRange, tagA, field);
result.addAll(tmp);
}
@@ -150,7 +151,7 @@ public class FindValuesForQueryCompletion extends ExpressionVisitor<SortedSet<St
}

final Property property = (Property) expression.getExpression();
final Tag tag = new Tag(property.getField(), property.getValueAsString());
final Tag tag = Tags.STRING_COMPRESSOR.createTag(property.getField(), property.getValueAsString());

final SortedSet<String> valuesNotForField = index.findAllValuesNotForField(dateTimeRange, tag, field);
final SortedSet<String> valuesForField = index.find(dateTimeRange, tag, field);

@@ -10,14 +10,13 @@ import java.util.SortedSet;
import java.util.TreeSet;

import org.junit.jupiter.api.AfterEach;
import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
import org.lucares.pdb.api.DateTimeRange;
import org.lucares.pdb.api.StringCompressor;
import org.lucares.pdb.api.Tag;
import org.lucares.pdb.api.Tags;
import org.lucares.pdb.api.UniqueStringIntegerPairs;
import org.junit.jupiter.api.Assertions;
import org.lucares.utils.file.FileUtils;

public class QueryCompletionIndexTest {
@@ -54,14 +53,14 @@ public class QueryCompletionIndexTest {

// all firstnames where lastname=Doe are returned sorted alphabetically.
// tags A and B match
final SortedSet<String> firstnamesWithLastnameDoe = index.find(dateRange, new Tag("lastname", "Doe"),
"firstname");
final SortedSet<String> firstnamesWithLastnameDoe = index.find(dateRange,
Tags.STRING_COMPRESSOR.createTag("lastname", "Doe"), "firstname");
Assertions.assertEquals(new TreeSet<>(Set.of("Jane", "John")), firstnamesWithLastnameDoe);

// no duplicates are returned:
// tags A and C match firstname=John, but both have country=Atlantis
final SortedSet<String> countryWithFirstnameJohn = index.find(dateRange, new Tag("firstname", "John"),
"country");
final SortedSet<String> countryWithFirstnameJohn = index.find(dateRange,
Tags.STRING_COMPRESSOR.createTag("firstname", "John"), "country");
Assertions.assertEquals(new TreeSet<>(Arrays.asList("Atlantis")), countryWithFirstnameJohn);

// findAllValuesForField sorts alphabetically