do string compression in StringCompressor instead of Tag

This commit is contained in:
2021-05-09 10:37:35 +02:00
parent 36ccc57db6
commit 6dc335600e
9 changed files with 51 additions and 47 deletions

View File

@@ -127,8 +127,9 @@ public class DataStore implements AutoCloseable {
Tags.STRING_COMPRESSOR = StringCompressor.create(keyCompressionFile(storageBasePath));
Tags.STRING_COMPRESSOR.put(ALL_DOCS_KEY);
Tags.STRING_COMPRESSOR.put("");
TAG_ALL_DOCS = new Tag(ALL_DOCS_KEY, ""); // Tag(String, String) uses the StringCompressor internally, so it
// must be initialized after the string compressor has been created
TAG_ALL_DOCS = Tags.STRING_COMPRESSOR.createTag(ALL_DOCS_KEY, ""); // createTag resolves both strings via the
// StringCompressor, so it must be called after the string compressor
// has been created and ALL_DOCS_KEY and "" have been put into it
diskStorage = new PartitionDiskStore(storageBasePath, "data.bs");
@@ -263,7 +264,7 @@ public class DataStore implements AutoCloseable {
final Set<String> keys = new HashSet<>();
final Tag keyPrefix = new Tag("", ""); // will find everything
final Tag keyPrefix = Tags.STRING_COMPRESSOR.createTag("", ""); // will find everything
final PartitionIdSource partitionIdSource = new DatePartitioner(dateRange);
tagToDocsId.visitValues(partitionIdSource, keyPrefix, (tags, __) -> keys.add(tags.getKeyAsString()));

View File

@@ -154,8 +154,8 @@ public class QueryCompletionIndex implements AutoCloseable {
public TwoTags(final String fieldB, final String fieldA, final String valueA, final String valueB) {
tagA = new Tag(fieldA, valueA);
tagB = new Tag(fieldB, valueB);
tagA = Tags.STRING_COMPRESSOR.createTag(fieldA, valueA);
tagB = Tags.STRING_COMPRESSOR.createTag(fieldB, valueB);
}
public Tag getTagA() {

View File

@@ -38,17 +38,17 @@ class TagEncoderDecoder implements EncoderDecoder<Tag> {
switch (compressedStrings.size()) {
case 0:
result = new Tag("", "");
result = Tags.STRING_COMPRESSOR.createTag("", "");
break;
case 1:
final String k = Tags.STRING_COMPRESSOR.get((int) compressedStrings.get(0));
result = new Tag(k, "");
result = Tags.STRING_COMPRESSOR.createTag(k, "");
break;
case 2:
final String key = Tags.STRING_COMPRESSOR.get((int) compressedStrings.get(0));
final String value = Tags.STRING_COMPRESSOR.get((int) compressedStrings.get(1));
result = new Tag(key, value);
result = Tags.STRING_COMPRESSOR.createTag(key, value);
break;
default:
throw new IllegalStateException("too many values: " + compressedStrings);

View File

@@ -10,6 +10,7 @@ import java.util.stream.Collectors;
import org.lucares.collections.LongList;
import org.lucares.pdb.api.DateTimeRange;
import org.lucares.pdb.api.Tag;
import org.lucares.pdb.api.Tags;
import org.lucares.pdb.blockstorage.LongStreamFile;
import org.lucares.pdb.datastore.internal.DataStore;
import org.lucares.pdb.datastore.internal.DatePartitioner;
@@ -148,27 +149,29 @@ public class ExpressionToDocIdVisitor extends ExpressionVisitor<PartitionLongLis
final Set<ParititionId> availablePartitionIds = keyToValueToDocId.getAvailablePartitionIds(datePartitioner);
for (final ParititionId partitionId : availablePartitionIds) {
final List<LongList> docIdsForPartition = new ArrayList<>();
keyToValueToDocId.visitValues(partitionId, new Tag(propertyName, ""), (tags, blockOffsetToDocIds) -> {
if (valuePattern.matcher(tags.getValueAsString()).matches()) {
try (final LongStreamFile bsFile = diskStorage.streamExistingFile(blockOffsetToDocIds,
partitionId)) {
keyToValueToDocId.visitValues(partitionId, Tags.STRING_COMPRESSOR.createTag(propertyName, ""),
(tags, blockOffsetToDocIds) -> {
if (valuePattern.matcher(tags.getValueAsString()).matches()) {
try (final LongStreamFile bsFile = diskStorage.streamExistingFile(blockOffsetToDocIds,
partitionId)) {
// We know that all LongLists coming from a BSFile are sorted, non-overlapping
// and increasing, that means we can just concatenate them and get a sorted
// list.
final List<LongList> longLists = bsFile.streamOfLongLists().collect(Collectors.toList());
final LongList concatenatedLists = concatenateLists(longLists);
// We know that all LongLists coming from a BSFile are sorted, non-overlapping
// and increasing, that means we can just concatenate them and get a sorted
// list.
final List<LongList> longLists = bsFile.streamOfLongLists()
.collect(Collectors.toList());
final LongList concatenatedLists = concatenateLists(longLists);
Preconditions.checkTrue(concatenatedLists.isSorted(),
"The LongLists containing document ids must be sorted, "
+ "non-overlapping and increasing, so that the concatenation "
+ "is sorted. This is guaranteed by the fact that document ids "
+ "are generated in monotonically increasing order.");
Preconditions.checkTrue(concatenatedLists.isSorted(),
"The LongLists containing document ids must be sorted, "
+ "non-overlapping and increasing, so that the concatenation "
+ "is sorted. This is guaranteed by the fact that document ids "
+ "are generated in monotonically increasing order.");
docIdsForPartition.add(concatenatedLists);
}
}
});
docIdsForPartition.add(concatenatedLists);
}
}
});
final LongList mergedDocsIdsForPartition = LongList.union(docIdsForPartition);
result.put(partitionId, mergedDocsIdsForPartition);

View File

@@ -7,6 +7,7 @@ import java.util.TreeSet;
import org.lucares.pdb.api.DateTimeRange;
import org.lucares.pdb.api.Tag;
import org.lucares.pdb.api.Tags;
import org.lucares.pdb.datastore.internal.GlobMatcher;
import org.lucares.pdb.datastore.internal.QueryCompletionIndex;
import org.lucares.pdb.datastore.lang.Expression.And;
@@ -62,7 +63,7 @@ public class FindValuesForQueryCompletion extends ExpressionVisitor<SortedSet<St
result = new TreeSet<>();
for (final String v : valuesA) {
final Tag tagA = new Tag(fieldA, v);
final Tag tagA = Tags.STRING_COMPRESSOR.createTag(fieldA, v);
final SortedSet<String> tmp = index.find(dateTimeRange, tagA, field);
result.addAll(tmp);
}
@@ -150,7 +151,7 @@ public class FindValuesForQueryCompletion extends ExpressionVisitor<SortedSet<St
}
final Property property = (Property) expression.getExpression();
final Tag tag = new Tag(property.getField(), property.getValueAsString());
final Tag tag = Tags.STRING_COMPRESSOR.createTag(property.getField(), property.getValueAsString());
final SortedSet<String> valuesNotForField = index.findAllValuesNotForField(dateTimeRange, tag, field);
final SortedSet<String> valuesForField = index.find(dateTimeRange, tag, field);

View File

@@ -10,14 +10,13 @@ import java.util.SortedSet;
import java.util.TreeSet;
import org.junit.jupiter.api.AfterEach;
import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
import org.lucares.pdb.api.DateTimeRange;
import org.lucares.pdb.api.StringCompressor;
import org.lucares.pdb.api.Tag;
import org.lucares.pdb.api.Tags;
import org.lucares.pdb.api.UniqueStringIntegerPairs;
import org.junit.jupiter.api.Assertions;
import org.lucares.utils.file.FileUtils;
public class QueryCompletionIndexTest {
@@ -54,14 +53,14 @@ public class QueryCompletionIndexTest {
// all firstnames where lastname=Doe are returned sorted alphabetically.
// tags A and B match
final SortedSet<String> firstnamesWithLastnameDoe = index.find(dateRange, new Tag("lastname", "Doe"),
"firstname");
final SortedSet<String> firstnamesWithLastnameDoe = index.find(dateRange,
Tags.STRING_COMPRESSOR.createTag("lastname", "Doe"), "firstname");
Assertions.assertEquals(new TreeSet<>(Set.of("Jane", "John")), firstnamesWithLastnameDoe);
// no duplicates are returned:
// tags A and C match firstname=John, but both have country=Atlantis
final SortedSet<String> countryWithFirstnameJohn = index.find(dateRange, new Tag("firstname", "John"),
"country");
final SortedSet<String> countryWithFirstnameJohn = index.find(dateRange,
Tags.STRING_COMPRESSOR.createTag("firstname", "John"), "country");
Assertions.assertEquals(new TreeSet<>(Arrays.asList("Atlantis")), countryWithFirstnameJohn);
// findAllValuesForField sorts alphabetically

View File

@@ -39,4 +39,16 @@ public class StringCompressor {
return integer != null ? integer : -1;
}
/**
 * Create a new {@link Tag} for the given field and value.
 * <p>
 * Both strings are translated into their integer representation by looking
 * them up in <em>this</em> compressor via {@code getIfPresent}. A
 * {@code null} string is encoded as {@code -1}.
 *
 * @param field the field, may be {@code null}
 * @param value the value, may be {@code null}
 * @return a tag built from the compressed field and value
 */
public Tag createTag(final String field, final String value) {
    // Resolve against the receiver instead of the global Tags.STRING_COMPRESSOR:
    // an instance method must not silently depend on which compressor happens
    // to be installed in the static field.
    final int f = field != null ? getIfPresent(field) : -1;
    final int v = value != null ? getIfPresent(value) : -1;
    return new Tag(f, v);
}
}

View File

@@ -22,17 +22,6 @@ public class Tag implements Comparable<Tag> {
this.value = value;
}
/**
* Create a new {@link Tag} for the given field and value.
*
* @param field the field
* @param value the value
*/
public Tag(final String field, final String value) {
this.field = field != null ? Tags.STRING_COMPRESSOR.getIfPresent(field) : -1;
this.value = value != null ? Tags.STRING_COMPRESSOR.getIfPresent(value) : -1;
}
@Override
public int compareTo(final Tag o) {

View File

@@ -7,7 +7,6 @@ import java.util.LinkedHashMap;
import java.util.Map;
import org.lucares.pdb.api.StringCompressor;
import org.lucares.pdb.api.Tag;
import org.lucares.pdb.api.Tags;
import org.lucares.pdb.api.TagsBuilder;
import org.lucares.pdb.api.UniqueStringIntegerPairs;
@@ -74,7 +73,7 @@ public class MemoryScale {
}
private static Object createTag() {
return new Tag("", "");
return Tags.STRING_COMPRESSOR.createTag("", "");
}
private static Object createTags0() {