replace ludb with data-store

LuDB has a few disadvantages. 
  1. Most notably disk space. H2 wastes a lot of valuable disk space.
     For my test data set with 44 million entries it is 14 MB 
     (sometimes a lot more; depends on H2 internal cleanup). With 
     data-store it is 15 KB.
     Overall I could reduce the disk space from 231 MB to 200 MB (13.4 %
     in this example). That is an average of 4.6 bytes per entry.
  2. Speed:
     a) Liquibase is slow. The first time it takes approx. three seconds.
     b) Query and insertion. With data-store we can insert entries
        up to 1.6 times faster.

Data-store uses a few tricks to save disk space:
  1. We encode the tags into the file names.
  2. To keep them short we translate the key/value of the tag into
     shorter numbers. For example "foo" -> 12 and "bar" -> 47. So the
     tag "foo"/"bar" would be 12/47.
     We then translate this number into a numeral system of base 62
     (a-zA-Z0-9), so it can be used for file names and it is shorter.
     That way we only have to store the mapping of string to int.
  3. We do that in a simple tab separated file.
This commit is contained in:
2017-04-16 09:07:28 +02:00
parent 85e45f74b7
commit ac1ee20046
56 changed files with 2243 additions and 677 deletions

View File

@@ -1,99 +0,0 @@
package org.lucares.pdb.api;
import java.time.Instant;
import java.time.OffsetDateTime;
import java.time.ZoneOffset;
import java.time.format.DateTimeFormatter;
/**
 * An immutable triple of a timestamp (milliseconds since the epoch), a numeric
 * value and the {@link Tags} attached to it.
 */
public class Entry {

    /**
     * A special {@link Entry} that can be used as poison object for
     * {@link BlockingQueueIterator}.
     */
    public static final Entry POISON = new Entry(0, -1);

    /** Largest value accepted by {@link #Entry(long, long, Tags)}. */
    public static final long MAX_VALUE = 0xFF_FF_FF_FFL;

    private final long epochMilli;
    private final long value;
    private final Tags tags;

    /**
     * Creates an entry whose timestamp is taken from the given date.
     * <p>
     * NOTE(review): unlike {@link #Entry(long, long, Tags)} this constructor
     * does not range-check {@code value} - confirm whether that is intended.
     *
     * @param date  the point in time; it is stored as epoch milliseconds
     * @param value the value of the entry
     * @param tags  the tags of the entry
     */
    public Entry(final OffsetDateTime date, final long value, final Tags tags) {
        this.epochMilli = date.toInstant().toEpochMilli();
        this.value = value;
        this.tags = tags;
    }

    /**
     * Creates an entry from a timestamp given in epoch milliseconds.
     *
     * @param epochMilli the timestamp in milliseconds since the epoch
     * @param value      the value, must be in the range 0 to {@link #MAX_VALUE}
     * @param tags       the tags of the entry
     * @throws IllegalArgumentException if {@code value} is outside the allowed
     *                                  range
     */
    public Entry(final long epochMilli, final long value, final Tags tags) {
        final boolean inRange = 0 <= value && value <= MAX_VALUE;
        if (!inRange) {
            throw new IllegalArgumentException("value must be between 0 and " + MAX_VALUE + ", but was " + value);
        }
        this.tags = tags;
        this.value = value;
        this.epochMilli = epochMilli;
    }

    /**
     * Unchecked constructor without tags; only used for {@link #POISON}.
     */
    private Entry(final long epochMilli, final long value) {
        this.tags = null;
        this.value = value;
        this.epochMilli = epochMilli;
    }

    /**
     * @return the timestamp of this entry as a date with offset UTC
     */
    public OffsetDateTime getDate() {
        return Instant.ofEpochMilli(epochMilli).atOffset(ZoneOffset.UTC);
    }

    /**
     * @return the value of this entry
     */
    public long getValue() {
        return value;
    }

    /**
     * @return the timestamp of this entry in milliseconds since the epoch
     */
    public long getEpochMilli() {
        return epochMilli;
    }

    /**
     * @return the tags of this entry; {@code null} for {@link #POISON}
     */
    public Tags getTags() {
        return tags;
    }

    @Override
    public String toString() {
        final String formattedDate = DateTimeFormatter.ISO_ZONED_DATE_TIME.format(getDate());
        return formattedDate + " = " + value + " (" + tags + ")";
    }

    @Override
    public int hashCode() {
        // same 31-based combination as before: epochMilli, then tags, then value
        int hash = 31 + Long.hashCode(epochMilli);
        hash = 31 * hash + (tags != null ? tags.hashCode() : 0);
        return 31 * hash + Long.hashCode(value);
    }

    @Override
    public boolean equals(final Object obj) {
        if (obj == this) {
            return true;
        }
        if (obj == null || obj.getClass() != getClass()) {
            return false;
        }
        final Entry that = (Entry) obj;
        if (epochMilli != that.epochMilli) {
            return false;
        }
        final boolean sameTags = tags == null ? that.tags == null : tags.equals(that.tags);
        return sameTags && value == that.value;
    }
}

View File

@@ -1,32 +0,0 @@
package org.lucares.pdb.api;
import java.util.List;
import java.util.stream.Collectors;
import java.util.stream.Stream;
/**
 * Pairs a {@link Stream} of {@link Entry entries} with the {@link Tags} that
 * were passed in as {@code groupedBy}.
 */
public class GroupResult {

    private final Stream<Entry> entries;
    private final Tags groupedBy;

    /**
     * @param entries   the entries of this group
     * @param groupedBy the tags this group is associated with
     */
    public GroupResult(final Stream<Entry> entries, final Tags groupedBy) {
        this.groupedBy = groupedBy;
        this.entries = entries;
    }

    /**
     * @return the tags handed to the constructor
     */
    public Tags getGroupedBy() {
        return groupedBy;
    }

    /**
     * @return {@link Stream} unbound, unordered and non-parallel
     */
    public Stream<Entry> asStream() {
        return entries;
    }

    /**
     * Collects the wrapped stream into a list. This runs a terminal operation
     * on the stream handed to the constructor.
     *
     * @return all entries of the stream as a list
     */
    public List<Entry> asList() {
        final List<Entry> collected = entries.collect(Collectors.toList());
        return collected;
    }
}

View File

@@ -1,30 +0,0 @@
package org.lucares.pdb.api;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.List;
/**
 * Holds the {@link GroupResult}s of a result. The groups are copied on the way
 * in and on the way out, so the internal list is never shared with callers.
 */
public class Result {

    private final List<GroupResult> groupResults;

    /**
     * @param groupResults the groups of this result
     */
    public Result(final GroupResult... groupResults) {
        this(Arrays.asList(groupResults));
    }

    /**
     * @param groupResults the groups of this result; the collection is copied
     */
    public Result(final Collection<GroupResult> groupResults) {
        this.groupResults = new ArrayList<>(groupResults);
    }

    /**
     * @return a new list containing all groups
     */
    public List<GroupResult> getGroups() {
        return new ArrayList<>(groupResults);
    }

    /**
     * Returns the only group of this result.
     *
     * @return the single group
     * @throws IllegalStateException if this result has no group or more than
     *                               one group
     */
    public GroupResult singleGroup() {
        if (groupResults.size() == 1) {
            return groupResults.get(0);
        }
        throw new IllegalStateException("the result does not contain exactly one group");
    }
}

View File

@@ -1,57 +0,0 @@
package org.lucares.pdb.api;
/**
 * An immutable key/value pair of strings. Neither part is checked on
 * construction, so both may be {@code null}.
 */
public class Tag {

    private final String key;
    private final String value;

    /**
     * @param key   the key of the tag
     * @param value the value of the tag
     */
    public Tag(final String key, final String value) {
        this.value = value;
        this.key = key;
    }

    /**
     * @return the key of this tag
     */
    public String getKey() {
        return key;
    }

    /**
     * @return the value of this tag
     */
    public String getValue() {
        return value;
    }

    /**
     * @return the tag in the form {@code key=value}
     */
    @Override
    public String toString() {
        return key + "=" + value;
    }

    @Override
    public int hashCode() {
        final int keyHash = key != null ? key.hashCode() : 0;
        final int valueHash = value != null ? value.hashCode() : 0;
        // same 31-based combination as before: key first, then value
        return 31 * (31 + keyHash) + valueHash;
    }

    @Override
    public boolean equals(final Object obj) {
        if (obj == this) {
            return true;
        }
        if (obj == null || obj.getClass() != getClass()) {
            return false;
        }
        final Tag that = (Tag) obj;
        return sameString(key, that.key) && sameString(value, that.value);
    }

    /** Null-tolerant string comparison: two nulls count as equal. */
    private static boolean sameString(final String left, final String right) {
        return left == null ? right == null : left.equals(right);
    }
}

View File

@@ -1,136 +0,0 @@
package org.lucares.pdb.api;
import java.util.Collections;
import java.util.HashMap;
import java.util.Map;
import java.util.Objects;
import java.util.Set;
import java.util.SortedSet;
import java.util.TreeSet;
import java.util.function.BiConsumer;
/**
 * A set of {@link Tag}s indexed by their key. Instances are created via the
 * {@code create} factories and extended with the {@code copyAdd} methods,
 * which return a new instance instead of modifying the existing one.
 */
public class Tags {

    /** The shared instance without any tags. */
    static final Tags EMPTY = new Tags();

    private final Map<String, Tag> tags;

    private Tags() {
        this(Collections.<String, Tag>emptyMap());
    }

    private Tags(final Map<String, Tag> tags) {
        this.tags = tags;
    }

    /**
     * @return the shared empty instance
     */
    public static Tags create() {
        return EMPTY;
    }

    /**
     * Creates an instance with a single tag.
     *
     * @param key   the key of the tag
     * @param value the value of the tag
     * @return a new instance
     */
    public static Tags create(final String key, final String value) {
        final Map<String, Tag> single = new HashMap<>(1);
        single.put(key, new Tag(key, value));
        return new Tags(single);
    }

    /**
     * Creates an instance with two tags. If both keys are equal, the second
     * tag replaces the first.
     *
     * @param key1   the key of the first tag
     * @param value1 the value of the first tag
     * @param key2   the key of the second tag
     * @param value2 the value of the second tag
     * @return a new instance
     */
    public static Tags create(final String key1, final String value1, final String key2, final String value2) {
        final Map<String, Tag> pair = new HashMap<>(2);
        pair.put(key1, new Tag(key1, value1));
        pair.put(key2, new Tag(key2, value2));
        return new Tags(pair);
    }

    /**
     * Returns a copy of this instance that additionally contains the given
     * tag. An existing tag with the same key is replaced in the copy.
     *
     * @param key   the key of the tag, must not be {@code null}
     * @param value the value of the tag, must not be {@code null}
     * @return a new instance
     * @throws NullPointerException if {@code key} or {@code value} is
     *                              {@code null}
     */
    public Tags copyAdd(final String key, final String value) {
        Objects.requireNonNull(key, "key must not be null");
        Objects.requireNonNull(value, "value must not be null");

        final Map<String, Tag> extended = new HashMap<>(tags);
        extended.put(key, new Tag(key, value));
        return new Tags(extended);
    }

    /**
     * Like {@link #copyAdd(String, String)}, but a {@code null} value leaves
     * the tags unchanged.
     *
     * @param key   the key of the tag
     * @param value the value of the tag, may be {@code null}
     * @return a new instance with the tag added, or this instance if
     *         {@code value} is {@code null}
     */
    public Tags copyAddIfNotNull(final String key, final String value) {
        return value == null ? this : copyAdd(key, value);
    }

    /**
     * @param key the key to look up
     * @return the value stored for the key, or {@code null} if there is no tag
     *         with that key
     */
    public String getValue(final String key) {
        final Tag match = tags.get(key);
        return match == null ? null : match.getValue();
    }

    /**
     * @return a new sorted set with the keys of all tags
     */
    public Set<String> getKeys() {
        return new TreeSet<>(tags.keySet());
    }

    /**
     * Calls the consumer once per tag with the tag's key and value.
     *
     * @param keyValueConsumer receives key and value of each tag
     */
    public void forEach(final BiConsumer<String, String> keyValueConsumer) {
        tags.forEach((key, tag) -> keyValueConsumer.accept(key, tag.getValue()));
    }

    @Override
    public String toString() {
        return String.valueOf(tags);
    }

    @Override
    public int hashCode() {
        return 31 + (tags != null ? tags.hashCode() : 0);
    }

    @Override
    public boolean equals(final Object obj) {
        if (obj == this) {
            return true;
        }
        if (obj == null || obj.getClass() != getClass()) {
            return false;
        }
        final Tags that = (Tags) obj;
        return tags == null ? that.tags == null : tags.equals(that.tags);
    }

    /**
     * Builds a string of the form {@code key-value_key-value_} with the tags
     * sorted by key. Every key and value is cut to
     * {@code 200 / (2 * numberOfTags + 2)} characters and the whole result is
     * cut to 200 characters.
     * <p>
     * NOTE(review): a tag with a {@code null} value (possible via the
     * {@code create} factories) would make this method throw a
     * {@link NullPointerException} - confirm such tags never occur.
     *
     * @return the abbreviated representation, at most 200 characters long
     */
    public String abbreviatedRepresentation() {
        final int maxLength = 200;
        final SortedSet<String> sortedKeys = new TreeSet<>(tags.keySet());
        final int partLength = maxLength / (sortedKeys.size() * 2 + 2);

        final StringBuilder text = new StringBuilder();
        for (final String key : sortedKeys) {
            final String value = tags.get(key).getValue();
            text.append(substr(key, partLength)).append("-");
            text.append(substr(value, partLength)).append("_");
        }
        return substr(text.toString(), maxLength);
    }

    /** Returns at most the first {@code maxLength} characters of {@code s}. */
    private static String substr(final String s, final int maxLength) {
        final int end = s.length() < maxLength ? s.length() : maxLength;
        return s.substring(0, end);
    }
}

View File

@@ -0,0 +1,52 @@
package org.lucares.utils;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.function.Function;
import java.util.function.Predicate;
import java.util.stream.Collectors;
import java.util.stream.Stream;
/**
 * Static helpers for mapping, indexing and filtering collections.
 * <p>
 * All functional parameters use bounded wildcards, so a function or predicate
 * declared for a super type of the elements (for example a
 * {@code Function<Object, String>} applied to a {@code List<Integer>}) is
 * accepted.
 */
public class CollectionUtils {

    /**
     * Replaces every element of the list with the result of applying the
     * mapper to it. The list is modified via {@link List#set(int, Object)}, so
     * it must support that operation unless it is empty.
     *
     * @param list   the list to modify
     * @param mapper computes the replacement for an element
     * @param <T>    the element type of the list
     * @param <R>    the type produced by the mapper, a subtype of {@code T}
     */
    public static <T, R extends T> void mapInPlace(final List<T> list,
            final Function<? super T, ? extends R> mapper) {
        for (int i = 0; i < list.size(); i++) {
            final T value = list.get(i);
            final T newValue = mapper.apply(value);
            list.set(i, newValue);
        }
    }

    /**
     * Applies the mapper to every element, in iteration order.
     *
     * @param list   the elements to map
     * @param mapper the mapping function
     * @param <T>    the input element type
     * @param <R>    the result element type
     * @return a new list with the mapped elements
     */
    public static <T, R> List<R> map(final Collection<? extends T> list,
            final Function<? super T, ? extends R> mapper) {
        final List<R> result = new ArrayList<>(list.size());
        for (final T t : list) {
            result.add(mapper.apply(t));
        }
        return result;
    }

    /**
     * Applies the mapper to every element of the array, in array order.
     *
     * @param input  the elements to map
     * @param mapper the mapping function
     * @param <T>    the input element type
     * @param <R>    the result element type
     * @return a new list with the mapped elements
     */
    public static <T, R> List<R> map(final T[] input, final Function<? super T, ? extends R> mapper) {
        // explicit type witness: keeps the stream typed as Stream<R> despite the wildcard
        return Stream.of(input).<R>map(mapper).collect(Collectors.toList());
    }

    /**
     * Indexes the values by the key computed for each of them. If several
     * values produce the same key, the one that comes last in iteration order
     * wins.
     *
     * @param iterable  the values to index
     * @param keyMapper computes the key of a value
     * @param <T>       the key type
     * @param <V>       the value type
     * @return a new map from key to value
     */
    public static <T, V> Map<T, V> toMap(final Iterable<? extends V> iterable,
            final Function<? super V, ? extends T> keyMapper) {
        final Map<T, V> result = new HashMap<>();
        for (final V value : iterable) {
            final T key = keyMapper.apply(value);
            result.put(key, value);
        }
        return result;
    }

    /**
     * Returns the elements that match the predicate, in iteration order.
     *
     * @param collection the elements to filter
     * @param predicate  decides which elements are kept
     * @param <T>        the element type
     * @return a new list with the matching elements
     */
    public static <T> List<T> filter(final Collection<T> collection, final Predicate<? super T> predicate) {
        return collection.stream().filter(predicate).collect(Collectors.toList());
    }
}