do string compression in StringCompressor instead of Tag

2021-05-09 10:37:35 +02:00
parent 36ccc57db6
commit 6dc335600e
9 changed files with 51 additions and 47 deletions

View File

@@ -127,8 +127,9 @@ public class DataStore implements AutoCloseable {
         Tags.STRING_COMPRESSOR = StringCompressor.create(keyCompressionFile(storageBasePath));
         Tags.STRING_COMPRESSOR.put(ALL_DOCS_KEY);
         Tags.STRING_COMPRESSOR.put("");
-        TAG_ALL_DOCS = new Tag(ALL_DOCS_KEY, ""); // Tag(String, String) uses the StringCompressor internally, so it
-                                                  // must be initialized after the string compressor has been created
+        TAG_ALL_DOCS = Tags.STRING_COMPRESSOR.createTag(ALL_DOCS_KEY, ""); // Tag(String, String) uses the
+                                                                           // StringCompressor internally, so it
+                                                                           // must be initialized after the string compressor has been created
         diskStorage = new PartitionDiskStore(storageBasePath, "data.bs");
@@ -263,7 +264,7 @@ public class DataStore implements AutoCloseable {
         final Set<String> keys = new HashSet<>();
-        final Tag keyPrefix = new Tag("", ""); // will find everything
+        final Tag keyPrefix = Tags.STRING_COMPRESSOR.createTag("", ""); // will find everything
         final PartitionIdSource partitionIdSource = new DatePartitioner(dateRange);
         tagToDocsId.visitValues(partitionIdSource, keyPrefix, (tags, __) -> keys.add(tags.getKeyAsString()));

View File

@@ -154,8 +154,8 @@ public class QueryCompletionIndex implements AutoCloseable {
         public TwoTags(final String fieldB, final String fieldA, final String valueA, final String valueB) {
-            tagA = new Tag(fieldA, valueA);
-            tagB = new Tag(fieldB, valueB);
+            tagA = Tags.STRING_COMPRESSOR.createTag(fieldA, valueA);
+            tagB = Tags.STRING_COMPRESSOR.createTag(fieldB, valueB);
         }
 
         public Tag getTagA() {

View File

@@ -38,17 +38,17 @@ class TagEncoderDecoder implements EncoderDecoder<Tag> {
         switch (compressedStrings.size()) {
         case 0:
-            result = new Tag("", "");
+            result = Tags.STRING_COMPRESSOR.createTag("", "");
             break;
         case 1:
             final String k = Tags.STRING_COMPRESSOR.get((int) compressedStrings.get(0));
-            result = new Tag(k, "");
+            result = Tags.STRING_COMPRESSOR.createTag(k, "");
             break;
         case 2:
             final String key = Tags.STRING_COMPRESSOR.get((int) compressedStrings.get(0));
             final String value = Tags.STRING_COMPRESSOR.get((int) compressedStrings.get(1));
-            result = new Tag(key, value);
+            result = Tags.STRING_COMPRESSOR.createTag(key, value);
             break;
         default:
             throw new IllegalStateException("too many values: " + compressedStrings);

View File

@@ -10,6 +10,7 @@ import java.util.stream.Collectors;
 import org.lucares.collections.LongList;
 import org.lucares.pdb.api.DateTimeRange;
 import org.lucares.pdb.api.Tag;
+import org.lucares.pdb.api.Tags;
 import org.lucares.pdb.blockstorage.LongStreamFile;
 import org.lucares.pdb.datastore.internal.DataStore;
 import org.lucares.pdb.datastore.internal.DatePartitioner;
@@ -148,27 +149,29 @@ public class ExpressionToDocIdVisitor extends ExpressionVisitor<PartitionLongLis
         final Set<ParititionId> availablePartitionIds = keyToValueToDocId.getAvailablePartitionIds(datePartitioner);
         for (final ParititionId partitionId : availablePartitionIds) {
             final List<LongList> docIdsForPartition = new ArrayList<>();
-            keyToValueToDocId.visitValues(partitionId, new Tag(propertyName, ""), (tags, blockOffsetToDocIds) -> {
-                if (valuePattern.matcher(tags.getValueAsString()).matches()) {
-                    try (final LongStreamFile bsFile = diskStorage.streamExistingFile(blockOffsetToDocIds,
-                            partitionId)) {
-                        // We know that all LongLists coming from a BSFile are sorted, non-overlapping
-                        // and increasing, that means we can just concatenate them and get a sorted
-                        // list.
-                        final List<LongList> longLists = bsFile.streamOfLongLists().collect(Collectors.toList());
-                        final LongList concatenatedLists = concatenateLists(longLists);
-                        Preconditions.checkTrue(concatenatedLists.isSorted(),
-                                "The LongLists containing document ids must be sorted, "
-                                        + "non-overlapping and increasing, so that the concatenation "
-                                        + "is sorted. This is guaranteed by the fact that document ids "
-                                        + "are generated in monotonically increasing order.");
-                        docIdsForPartition.add(concatenatedLists);
-                    }
-                }
-            });
+            keyToValueToDocId.visitValues(partitionId, Tags.STRING_COMPRESSOR.createTag(propertyName, ""),
+                    (tags, blockOffsetToDocIds) -> {
+                        if (valuePattern.matcher(tags.getValueAsString()).matches()) {
+                            try (final LongStreamFile bsFile = diskStorage.streamExistingFile(blockOffsetToDocIds,
+                                    partitionId)) {
+                                // We know that all LongLists coming from a BSFile are sorted, non-overlapping
+                                // and increasing, that means we can just concatenate them and get a sorted
+                                // list.
+                                final List<LongList> longLists = bsFile.streamOfLongLists()
+                                        .collect(Collectors.toList());
+                                final LongList concatenatedLists = concatenateLists(longLists);
+                                Preconditions.checkTrue(concatenatedLists.isSorted(),
+                                        "The LongLists containing document ids must be sorted, "
+                                                + "non-overlapping and increasing, so that the concatenation "
+                                                + "is sorted. This is guaranteed by the fact that document ids "
+                                                + "are generated in monotonically increasing order.");
+                                docIdsForPartition.add(concatenatedLists);
+                            }
+                        }
+                    });
             final LongList mergedDocsIdsForPartition = LongList.union(docIdsForPartition);
             result.put(partitionId, mergedDocsIdsForPartition);
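
Aside on the invariant asserted by the Preconditions check above: a toy sketch of why concatenating per-block doc-id lists that are sorted, non-overlapping and increasing needs no merge step. It uses plain java.util lists rather than the project's LongList, and the block contents are invented for illustration.

    import java.util.ArrayList;
    import java.util.List;

    // Toy illustration only: block0/block1 and their values are made up.
    // Each block's doc-id list is sorted, and every id in a later block is
    // larger than every id in an earlier block, so plain concatenation is
    // already sorted and no merge or re-sort is needed.
    public class ConcatStaysSorted {
        public static void main(final String[] args) {
            final List<Long> block0 = List.of(10L, 11L, 15L);
            final List<Long> block1 = List.of(20L, 27L);

            final List<Long> concatenated = new ArrayList<>(block0);
            concatenated.addAll(block1);

            System.out.println(concatenated); // [10, 11, 15, 20, 27]
        }
    }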

View File

@@ -7,6 +7,7 @@ import java.util.TreeSet;
 import org.lucares.pdb.api.DateTimeRange;
 import org.lucares.pdb.api.Tag;
+import org.lucares.pdb.api.Tags;
 import org.lucares.pdb.datastore.internal.GlobMatcher;
 import org.lucares.pdb.datastore.internal.QueryCompletionIndex;
 import org.lucares.pdb.datastore.lang.Expression.And;
@@ -62,7 +63,7 @@ public class FindValuesForQueryCompletion extends ExpressionVisitor<SortedSet<St
             result = new TreeSet<>();
             for (final String v : valuesA) {
-                final Tag tagA = new Tag(fieldA, v);
+                final Tag tagA = Tags.STRING_COMPRESSOR.createTag(fieldA, v);
                 final SortedSet<String> tmp = index.find(dateTimeRange, tagA, field);
                 result.addAll(tmp);
             }
@@ -150,7 +151,7 @@ public class FindValuesForQueryCompletion extends ExpressionVisitor<SortedSet<St
         }
         final Property property = (Property) expression.getExpression();
-        final Tag tag = new Tag(property.getField(), property.getValueAsString());
+        final Tag tag = Tags.STRING_COMPRESSOR.createTag(property.getField(), property.getValueAsString());
         final SortedSet<String> valuesNotForField = index.findAllValuesNotForField(dateTimeRange, tag, field);
         final SortedSet<String> valuesForField = index.find(dateTimeRange, tag, field);

View File

@@ -10,14 +10,13 @@ import java.util.SortedSet;
 import java.util.TreeSet;
 
 import org.junit.jupiter.api.AfterEach;
+import org.junit.jupiter.api.Assertions;
 import org.junit.jupiter.api.BeforeEach;
 import org.junit.jupiter.api.Test;
 import org.lucares.pdb.api.DateTimeRange;
 import org.lucares.pdb.api.StringCompressor;
-import org.lucares.pdb.api.Tag;
 import org.lucares.pdb.api.Tags;
 import org.lucares.pdb.api.UniqueStringIntegerPairs;
-import org.junit.jupiter.api.Assertions;
 import org.lucares.utils.file.FileUtils;
 
 public class QueryCompletionIndexTest {
@@ -54,14 +53,14 @@ public class QueryCompletionIndexTest {
         // all firstnames where lastname=Doe are returned sorted alphabetically.
         // tags A and B match
-        final SortedSet<String> firstnamesWithLastnameDoe = index.find(dateRange, new Tag("lastname", "Doe"),
-                "firstname");
+        final SortedSet<String> firstnamesWithLastnameDoe = index.find(dateRange,
+                Tags.STRING_COMPRESSOR.createTag("lastname", "Doe"), "firstname");
         Assertions.assertEquals(new TreeSet<>(Set.of("Jane", "John")), firstnamesWithLastnameDoe);
 
         // no duplicates are returned:
         // tags A and C match firstname=John, but both have country=Atlantis
-        final SortedSet<String> countryWithFirstnameJohn = index.find(dateRange, new Tag("firstname", "John"),
-                "country");
+        final SortedSet<String> countryWithFirstnameJohn = index.find(dateRange,
+                Tags.STRING_COMPRESSOR.createTag("firstname", "John"), "country");
         Assertions.assertEquals(new TreeSet<>(Arrays.asList("Atlantis")), countryWithFirstnameJohn);
 
         // findAllValuesForField sorts alphabetically

View File

@@ -39,4 +39,16 @@ public class StringCompressor {
         return integer != null ? integer : -1;
     }
 
+    /**
+     * Create a new {@link Tag} for the given field and value.
+     *
+     * @param field the field
+     * @param value the value
+     */
+    public Tag createTag(final String field, final String value) {
+        final int f = field != null ? Tags.STRING_COMPRESSOR.getIfPresent(field) : -1;
+        final int v = value != null ? Tags.STRING_COMPRESSOR.getIfPresent(value) : -1;
+        return new Tag(f, v);
+    }
 }
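
For orientation, a minimal before/after sketch of a call site affected by this commit. The "lastname"/"Doe" strings are invented for illustration; both call shapes are taken from the hunks above, and the fragment assumes the project's org.lucares.pdb.api.Tag and Tags classes are available.

    // Before this commit a call site compressed strings via the Tag constructor:
    //     final Tag tag = new Tag("lastname", "Doe");
    //
    // After this commit the StringCompressor creates the Tag, so all string
    // compression lives in StringCompressor:
    final Tag tag = Tags.STRING_COMPRESSOR.createTag("lastname", "Doe");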

View File

@@ -22,17 +22,6 @@ public class Tag implements Comparable<Tag> {
         this.value = value;
     }
 
-    /**
-     * Create a new {@link Tag} for the given field and value.
-     *
-     * @param field the field
-     * @param value the value
-     */
-    public Tag(final String field, final String value) {
-        this.field = field != null ? Tags.STRING_COMPRESSOR.getIfPresent(field) : -1;
-        this.value = value != null ? Tags.STRING_COMPRESSOR.getIfPresent(value) : -1;
-    }
-
     @Override
     public int compareTo(final Tag o) {

View File

@@ -7,7 +7,6 @@ import java.util.LinkedHashMap;
 import java.util.Map;
 
 import org.lucares.pdb.api.StringCompressor;
-import org.lucares.pdb.api.Tag;
 import org.lucares.pdb.api.Tags;
 import org.lucares.pdb.api.TagsBuilder;
 import org.lucares.pdb.api.UniqueStringIntegerPairs;
@@ -74,7 +73,7 @@ public class MemoryScale {
     }
 
     private static Object createTag() {
-        return new Tag("", "");
+        return Tags.STRING_COMPRESSOR.createTag("", "");
     }
 
     private static Object createTags0() {