Do string compression in StringCompressor.createTag instead of the Tag(String, String) constructor
This commit is contained in:
@@ -127,8 +127,9 @@ public class DataStore implements AutoCloseable {
|
|||||||
Tags.STRING_COMPRESSOR = StringCompressor.create(keyCompressionFile(storageBasePath));
|
Tags.STRING_COMPRESSOR = StringCompressor.create(keyCompressionFile(storageBasePath));
|
||||||
Tags.STRING_COMPRESSOR.put(ALL_DOCS_KEY);
|
Tags.STRING_COMPRESSOR.put(ALL_DOCS_KEY);
|
||||||
Tags.STRING_COMPRESSOR.put("");
|
Tags.STRING_COMPRESSOR.put("");
|
||||||
TAG_ALL_DOCS = new Tag(ALL_DOCS_KEY, ""); // Tag(String, String) uses the StringCompressor internally, so it
|
TAG_ALL_DOCS = Tags.STRING_COMPRESSOR.createTag(ALL_DOCS_KEY, ""); // createTag(String, String) uses the
|
||||||
// must be initialized after the string compressor has been created
|
// StringCompressor internally, so it
|
||||||
|
// must be initialized after the string compressor has been created
|
||||||
|
|
||||||
diskStorage = new PartitionDiskStore(storageBasePath, "data.bs");
|
diskStorage = new PartitionDiskStore(storageBasePath, "data.bs");
|
||||||
|
|
||||||
@@ -263,7 +264,7 @@ public class DataStore implements AutoCloseable {
|
|||||||
|
|
||||||
final Set<String> keys = new HashSet<>();
|
final Set<String> keys = new HashSet<>();
|
||||||
|
|
||||||
final Tag keyPrefix = new Tag("", ""); // will find everything
|
final Tag keyPrefix = Tags.STRING_COMPRESSOR.createTag("", ""); // will find everything
|
||||||
|
|
||||||
final PartitionIdSource partitionIdSource = new DatePartitioner(dateRange);
|
final PartitionIdSource partitionIdSource = new DatePartitioner(dateRange);
|
||||||
tagToDocsId.visitValues(partitionIdSource, keyPrefix, (tags, __) -> keys.add(tags.getKeyAsString()));
|
tagToDocsId.visitValues(partitionIdSource, keyPrefix, (tags, __) -> keys.add(tags.getKeyAsString()));
|
||||||
|
|||||||
@@ -154,8 +154,8 @@ public class QueryCompletionIndex implements AutoCloseable {
|
|||||||
|
|
||||||
public TwoTags(final String fieldB, final String fieldA, final String valueA, final String valueB) {
|
public TwoTags(final String fieldB, final String fieldA, final String valueA, final String valueB) {
|
||||||
|
|
||||||
tagA = new Tag(fieldA, valueA);
|
tagA = Tags.STRING_COMPRESSOR.createTag(fieldA, valueA);
|
||||||
tagB = new Tag(fieldB, valueB);
|
tagB = Tags.STRING_COMPRESSOR.createTag(fieldB, valueB);
|
||||||
}
|
}
|
||||||
|
|
||||||
public Tag getTagA() {
|
public Tag getTagA() {
|
||||||
|
|||||||
@@ -38,17 +38,17 @@ class TagEncoderDecoder implements EncoderDecoder<Tag> {
|
|||||||
switch (compressedStrings.size()) {
|
switch (compressedStrings.size()) {
|
||||||
case 0:
|
case 0:
|
||||||
|
|
||||||
result = new Tag("", "");
|
result = Tags.STRING_COMPRESSOR.createTag("", "");
|
||||||
break;
|
break;
|
||||||
case 1:
|
case 1:
|
||||||
final String k = Tags.STRING_COMPRESSOR.get((int) compressedStrings.get(0));
|
final String k = Tags.STRING_COMPRESSOR.get((int) compressedStrings.get(0));
|
||||||
result = new Tag(k, "");
|
result = Tags.STRING_COMPRESSOR.createTag(k, "");
|
||||||
|
|
||||||
break;
|
break;
|
||||||
case 2:
|
case 2:
|
||||||
final String key = Tags.STRING_COMPRESSOR.get((int) compressedStrings.get(0));
|
final String key = Tags.STRING_COMPRESSOR.get((int) compressedStrings.get(0));
|
||||||
final String value = Tags.STRING_COMPRESSOR.get((int) compressedStrings.get(1));
|
final String value = Tags.STRING_COMPRESSOR.get((int) compressedStrings.get(1));
|
||||||
result = new Tag(key, value);
|
result = Tags.STRING_COMPRESSOR.createTag(key, value);
|
||||||
break;
|
break;
|
||||||
default:
|
default:
|
||||||
throw new IllegalStateException("too many values: " + compressedStrings);
|
throw new IllegalStateException("too many values: " + compressedStrings);
|
||||||
|
|||||||
@@ -10,6 +10,7 @@ import java.util.stream.Collectors;
|
|||||||
import org.lucares.collections.LongList;
|
import org.lucares.collections.LongList;
|
||||||
import org.lucares.pdb.api.DateTimeRange;
|
import org.lucares.pdb.api.DateTimeRange;
|
||||||
import org.lucares.pdb.api.Tag;
|
import org.lucares.pdb.api.Tag;
|
||||||
|
import org.lucares.pdb.api.Tags;
|
||||||
import org.lucares.pdb.blockstorage.LongStreamFile;
|
import org.lucares.pdb.blockstorage.LongStreamFile;
|
||||||
import org.lucares.pdb.datastore.internal.DataStore;
|
import org.lucares.pdb.datastore.internal.DataStore;
|
||||||
import org.lucares.pdb.datastore.internal.DatePartitioner;
|
import org.lucares.pdb.datastore.internal.DatePartitioner;
|
||||||
@@ -148,27 +149,29 @@ public class ExpressionToDocIdVisitor extends ExpressionVisitor<PartitionLongLis
|
|||||||
final Set<ParititionId> availablePartitionIds = keyToValueToDocId.getAvailablePartitionIds(datePartitioner);
|
final Set<ParititionId> availablePartitionIds = keyToValueToDocId.getAvailablePartitionIds(datePartitioner);
|
||||||
for (final ParititionId partitionId : availablePartitionIds) {
|
for (final ParititionId partitionId : availablePartitionIds) {
|
||||||
final List<LongList> docIdsForPartition = new ArrayList<>();
|
final List<LongList> docIdsForPartition = new ArrayList<>();
|
||||||
keyToValueToDocId.visitValues(partitionId, new Tag(propertyName, ""), (tags, blockOffsetToDocIds) -> {
|
keyToValueToDocId.visitValues(partitionId, Tags.STRING_COMPRESSOR.createTag(propertyName, ""),
|
||||||
if (valuePattern.matcher(tags.getValueAsString()).matches()) {
|
(tags, blockOffsetToDocIds) -> {
|
||||||
try (final LongStreamFile bsFile = diskStorage.streamExistingFile(blockOffsetToDocIds,
|
if (valuePattern.matcher(tags.getValueAsString()).matches()) {
|
||||||
partitionId)) {
|
try (final LongStreamFile bsFile = diskStorage.streamExistingFile(blockOffsetToDocIds,
|
||||||
|
partitionId)) {
|
||||||
|
|
||||||
// We know that all LongLists coming from a BSFile are sorted, non-overlapping
|
// We know that all LongLists coming from a BSFile are sorted, non-overlapping
|
||||||
// and increasing, that means we can just concatenate them and get a sorted
|
// and increasing, that means we can just concatenate them and get a sorted
|
||||||
// list.
|
// list.
|
||||||
final List<LongList> longLists = bsFile.streamOfLongLists().collect(Collectors.toList());
|
final List<LongList> longLists = bsFile.streamOfLongLists()
|
||||||
final LongList concatenatedLists = concatenateLists(longLists);
|
.collect(Collectors.toList());
|
||||||
|
final LongList concatenatedLists = concatenateLists(longLists);
|
||||||
|
|
||||||
Preconditions.checkTrue(concatenatedLists.isSorted(),
|
Preconditions.checkTrue(concatenatedLists.isSorted(),
|
||||||
"The LongLists containing document ids must be sorted, "
|
"The LongLists containing document ids must be sorted, "
|
||||||
+ "non-overlapping and increasing, so that the concatenation "
|
+ "non-overlapping and increasing, so that the concatenation "
|
||||||
+ "is sorted. This is guaranteed by the fact that document ids "
|
+ "is sorted. This is guaranteed by the fact that document ids "
|
||||||
+ "are generated in monotonically increasing order.");
|
+ "are generated in monotonically increasing order.");
|
||||||
|
|
||||||
docIdsForPartition.add(concatenatedLists);
|
docIdsForPartition.add(concatenatedLists);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
|
|
||||||
final LongList mergedDocsIdsForPartition = LongList.union(docIdsForPartition);
|
final LongList mergedDocsIdsForPartition = LongList.union(docIdsForPartition);
|
||||||
result.put(partitionId, mergedDocsIdsForPartition);
|
result.put(partitionId, mergedDocsIdsForPartition);
|
||||||
|
|||||||
@@ -7,6 +7,7 @@ import java.util.TreeSet;
|
|||||||
|
|
||||||
import org.lucares.pdb.api.DateTimeRange;
|
import org.lucares.pdb.api.DateTimeRange;
|
||||||
import org.lucares.pdb.api.Tag;
|
import org.lucares.pdb.api.Tag;
|
||||||
|
import org.lucares.pdb.api.Tags;
|
||||||
import org.lucares.pdb.datastore.internal.GlobMatcher;
|
import org.lucares.pdb.datastore.internal.GlobMatcher;
|
||||||
import org.lucares.pdb.datastore.internal.QueryCompletionIndex;
|
import org.lucares.pdb.datastore.internal.QueryCompletionIndex;
|
||||||
import org.lucares.pdb.datastore.lang.Expression.And;
|
import org.lucares.pdb.datastore.lang.Expression.And;
|
||||||
@@ -62,7 +63,7 @@ public class FindValuesForQueryCompletion extends ExpressionVisitor<SortedSet<St
|
|||||||
result = new TreeSet<>();
|
result = new TreeSet<>();
|
||||||
|
|
||||||
for (final String v : valuesA) {
|
for (final String v : valuesA) {
|
||||||
final Tag tagA = new Tag(fieldA, v);
|
final Tag tagA = Tags.STRING_COMPRESSOR.createTag(fieldA, v);
|
||||||
final SortedSet<String> tmp = index.find(dateTimeRange, tagA, field);
|
final SortedSet<String> tmp = index.find(dateTimeRange, tagA, field);
|
||||||
result.addAll(tmp);
|
result.addAll(tmp);
|
||||||
}
|
}
|
||||||
@@ -150,7 +151,7 @@ public class FindValuesForQueryCompletion extends ExpressionVisitor<SortedSet<St
|
|||||||
}
|
}
|
||||||
|
|
||||||
final Property property = (Property) expression.getExpression();
|
final Property property = (Property) expression.getExpression();
|
||||||
final Tag tag = new Tag(property.getField(), property.getValueAsString());
|
final Tag tag = Tags.STRING_COMPRESSOR.createTag(property.getField(), property.getValueAsString());
|
||||||
|
|
||||||
final SortedSet<String> valuesNotForField = index.findAllValuesNotForField(dateTimeRange, tag, field);
|
final SortedSet<String> valuesNotForField = index.findAllValuesNotForField(dateTimeRange, tag, field);
|
||||||
final SortedSet<String> valuesForField = index.find(dateTimeRange, tag, field);
|
final SortedSet<String> valuesForField = index.find(dateTimeRange, tag, field);
|
||||||
|
|||||||
@@ -10,14 +10,13 @@ import java.util.SortedSet;
|
|||||||
import java.util.TreeSet;
|
import java.util.TreeSet;
|
||||||
|
|
||||||
import org.junit.jupiter.api.AfterEach;
|
import org.junit.jupiter.api.AfterEach;
|
||||||
|
import org.junit.jupiter.api.Assertions;
|
||||||
import org.junit.jupiter.api.BeforeEach;
|
import org.junit.jupiter.api.BeforeEach;
|
||||||
import org.junit.jupiter.api.Test;
|
import org.junit.jupiter.api.Test;
|
||||||
import org.lucares.pdb.api.DateTimeRange;
|
import org.lucares.pdb.api.DateTimeRange;
|
||||||
import org.lucares.pdb.api.StringCompressor;
|
import org.lucares.pdb.api.StringCompressor;
|
||||||
import org.lucares.pdb.api.Tag;
|
|
||||||
import org.lucares.pdb.api.Tags;
|
import org.lucares.pdb.api.Tags;
|
||||||
import org.lucares.pdb.api.UniqueStringIntegerPairs;
|
import org.lucares.pdb.api.UniqueStringIntegerPairs;
|
||||||
import org.junit.jupiter.api.Assertions;
|
|
||||||
import org.lucares.utils.file.FileUtils;
|
import org.lucares.utils.file.FileUtils;
|
||||||
|
|
||||||
public class QueryCompletionIndexTest {
|
public class QueryCompletionIndexTest {
|
||||||
@@ -54,14 +53,14 @@ public class QueryCompletionIndexTest {
|
|||||||
|
|
||||||
// all firstnames where lastname=Doe are returned sorted alphabetically.
|
// all firstnames where lastname=Doe are returned sorted alphabetically.
|
||||||
// tags A and B match
|
// tags A and B match
|
||||||
final SortedSet<String> firstnamesWithLastnameDoe = index.find(dateRange, new Tag("lastname", "Doe"),
|
final SortedSet<String> firstnamesWithLastnameDoe = index.find(dateRange,
|
||||||
"firstname");
|
Tags.STRING_COMPRESSOR.createTag("lastname", "Doe"), "firstname");
|
||||||
Assertions.assertEquals(new TreeSet<>(Set.of("Jane", "John")), firstnamesWithLastnameDoe);
|
Assertions.assertEquals(new TreeSet<>(Set.of("Jane", "John")), firstnamesWithLastnameDoe);
|
||||||
|
|
||||||
// no duplicates are returned:
|
// no duplicates are returned:
|
||||||
// tags A and C match firstname=John, but both have country=Atlantis
|
// tags A and C match firstname=John, but both have country=Atlantis
|
||||||
final SortedSet<String> countryWithFirstnameJohn = index.find(dateRange, new Tag("firstname", "John"),
|
final SortedSet<String> countryWithFirstnameJohn = index.find(dateRange,
|
||||||
"country");
|
Tags.STRING_COMPRESSOR.createTag("firstname", "John"), "country");
|
||||||
Assertions.assertEquals(new TreeSet<>(Arrays.asList("Atlantis")), countryWithFirstnameJohn);
|
Assertions.assertEquals(new TreeSet<>(Arrays.asList("Atlantis")), countryWithFirstnameJohn);
|
||||||
|
|
||||||
// findAllValuesForField sorts alphabetically
|
// findAllValuesForField sorts alphabetically
|
||||||
|
|||||||
@@ -39,4 +39,16 @@ public class StringCompressor {
|
|||||||
return integer != null ? integer : -1;
|
return integer != null ? integer : -1;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Create a new {@link Tag} for the given field and value.
|
||||||
|
*
|
||||||
|
* @param field the field
|
||||||
|
* @param value the value
|
||||||
|
*/
|
||||||
|
public Tag createTag(final String field, final String value) {
|
||||||
|
final int f = field != null ? Tags.STRING_COMPRESSOR.getIfPresent(field) : -1;
|
||||||
|
final int v = value != null ? Tags.STRING_COMPRESSOR.getIfPresent(value) : -1;
|
||||||
|
return new Tag(f, v);
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -22,17 +22,6 @@ public class Tag implements Comparable<Tag> {
|
|||||||
this.value = value;
|
this.value = value;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
|
||||||
* Create a new {@link Tag} for the given field and value.
|
|
||||||
*
|
|
||||||
* @param field the field
|
|
||||||
* @param value the value
|
|
||||||
*/
|
|
||||||
public Tag(final String field, final String value) {
|
|
||||||
this.field = field != null ? Tags.STRING_COMPRESSOR.getIfPresent(field) : -1;
|
|
||||||
this.value = value != null ? Tags.STRING_COMPRESSOR.getIfPresent(value) : -1;
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public int compareTo(final Tag o) {
|
public int compareTo(final Tag o) {
|
||||||
|
|
||||||
|
|||||||
@@ -7,7 +7,6 @@ import java.util.LinkedHashMap;
|
|||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
|
|
||||||
import org.lucares.pdb.api.StringCompressor;
|
import org.lucares.pdb.api.StringCompressor;
|
||||||
import org.lucares.pdb.api.Tag;
|
|
||||||
import org.lucares.pdb.api.Tags;
|
import org.lucares.pdb.api.Tags;
|
||||||
import org.lucares.pdb.api.TagsBuilder;
|
import org.lucares.pdb.api.TagsBuilder;
|
||||||
import org.lucares.pdb.api.UniqueStringIntegerPairs;
|
import org.lucares.pdb.api.UniqueStringIntegerPairs;
|
||||||
@@ -74,7 +73,7 @@ public class MemoryScale {
|
|||||||
}
|
}
|
||||||
|
|
||||||
private static Object createTag() {
|
private static Object createTag() {
|
||||||
return new Tag("", "");
|
return Tags.STRING_COMPRESSOR.createTag("", "");
|
||||||
}
|
}
|
||||||
|
|
||||||
private static Object createTags0() {
|
private static Object createTags0() {
|
||||||
|
|||||||
Reference in New Issue
Block a user