replace ludb with data-store

LuDB has a few disadvantages. 
  1. Most notably disk space. H2 wastes a lot of valuable disk space.
     For my test data set with 44 million entries it is 14 MB 
     (sometimes a lot more; depends on H2 internal cleanup). With 
     data-store it is 15 KB.
     Overall I could reduce the disk space from 231 MB to 200 MB (13.4 %
     in this example). That is an average of 4.6 bytes per entry.
  2. Speed:
     a) Liquibase is slow. The first time, it takes approx. three seconds.
     b) Query and insertion. With data-store we can insert entries
        up to 1.6 times faster.

Data-store uses a few tricks to save disk space:
  1. We encode the tags into the file names.
  2. To keep them short we translate the key and the value of each tag
     into small numbers. For example "foo" -> 12 and "bar" -> 47, so the
     tag "foo"/"bar" becomes 12/47.
     We then convert these numbers into a numeral system of base 62
     (a-zA-Z0-9), which is shorter and can be used in file names.
     That way we only have to store the mapping from string to int.
  3. We do that in a simple tab separated file.
This commit is contained in:
2017-04-16 09:07:28 +02:00
parent 85e45f74b7
commit ac1ee20046
56 changed files with 2243 additions and 677 deletions

View File

@@ -0,0 +1,65 @@
// Grammar of the tag query language, e.g. `key=value and !(other=val*)`.
grammar PdbLang;
// ANTLR injects this action into the generated sources so that the generated
// classes end up in the package of the hand-written parser code.
@header {
package org.lucares.pdb.datastore.lang;
}
// Entry rule: a single expression that must be followed by end of input.
start : expression EOF ;
// Boolean expression over property comparisons. The '#name' labels give every
// alternative its own parse tree context class.
// NOTE(review): per ANTLR 4's handling of left-recursive rules, alternatives
// listed earlier bind tighter, so '!' > 'and' > 'or' - confirm against the
// ANTLR documentation if the order is ever changed.
expression
: LPAREN expression RPAREN #parenExpression
| NOT expression #notExpression
| prop=identifier eq=equal value=propValue #propertyExpression
| left=expression AND right=expression #binaryAndExpression
| left=expression OR right=expression #binaryOrExpression
;
// A property name (also reused for property values, see propValue).
identifier
: IDENTIFIER #identifierExpression
;
// The right-hand side of a comparison; syntactically the same as an identifier.
propValue
: identifier
;
// Wrapper rule around the '=' token so it can be referenced via the 'eq' label.
equal : EQUAL ;
// Keywords and punctuation. These are declared before IDENTIFIER.
AND : 'and' ;
OR : 'or' ;
NOT : '!';
EQUAL : '=' ;
LPAREN : '(' ;
RPAREN : ')' ;
// Whitespace (space, CR, tab, form feed, LF) separates tokens and is dropped.
WS : [ \r\t\u000C\n]+ -> skip;
// One start character followed by any number of part characters.
IDENTIFIER
: JavaLetter JavaLetterOrDigit*
;
// Characters allowed at the start of an IDENTIFIER.
fragment
JavaLetter
: [a-zA-Z0-9$_] // ASCII letters, digits, '$' and '_' (unlike Java, a leading digit is allowed)
| [\u002a] // asterisk, used for wildcards
| // covers all characters above 0x7F which are not a surrogate
~[\u0000-\u007F\uD800-\uDBFF]
{Character.isJavaIdentifierStart(_input.LA(-1))}?
| // covers UTF-16 surrogate pairs encodings for U+10000 to U+10FFFF
[\uD800-\uDBFF] [\uDC00-\uDFFF]
{Character.isJavaIdentifierStart(Character.toCodePoint((char)_input.LA(-2), (char)_input.LA(-1)))}?
;
// Characters allowed after the first character of an IDENTIFIER. In addition
// to the start characters this admits '.', '/' and '-'.
fragment
JavaLetterOrDigit
: [a-zA-Z0-9$_] // these are the "java letters or digits" below 0x7F
| [\u002a] // asterisk, used for wildcards
| '.'
| '/'
| '-'
| // covers all characters above 0x7F which are not a surrogate
~[\u0000-\u007F\uD800-\uDBFF]
{Character.isJavaIdentifierPart(_input.LA(-1))}?
| // covers UTF-16 surrogate pairs encodings for U+10000 to U+10FFFF
[\uD800-\uDBFF] [\uDC00-\uDFFF]
{Character.isJavaIdentifierPart(Character.toCodePoint((char)_input.LA(-2), (char)_input.LA(-1)))}?
;

View File

@@ -0,0 +1,199 @@
package org.lucares.pdb.datastore;
import java.io.IOException;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.SortedSet;
import java.util.TreeSet;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.stream.Stream;
import org.lucares.collections.IntList;
import org.lucares.pdb.api.Tags;
import org.lucares.pdb.datastore.internal.FolderStorage;
import org.lucares.pdb.datastore.internal.RadixConverter;
import org.lucares.pdb.datastore.internal.StringCompressor;
import org.lucares.pdb.datastore.lang.Expression;
import org.lucares.pdb.datastore.lang.ExpressionToDocIdVisitor;
import org.lucares.pdb.datastore.lang.ExpressionToDocIdVisitor.AllDocIds;
import org.lucares.pdb.datastore.lang.QueryLanguageParser;
/**
 * In-memory index over the storage files below a data directory.
 * <p>
 * The tags of a file are encoded in its name: every key and value is replaced
 * by the integer assigned by the {@link StringCompressor}, rendered with
 * {@link RadixConverter}, and joined as
 * {@code <key>-<value>(_<key>-<value>)*$<optional number>.pdb}. On start-up
 * all existing files are listed and their names are decoded again, so the
 * only additional persistent state is the string/integer mapping.
 */
public class DataStore {

    private static final String SUBDIR_STORAGE = "storage";

    private static final String PDB_EXTENSION = ".pdb";

    private static final String KEY_VALUE_SEPARATOR = "-";

    private static final String KEY_VALUE_PAIR_SEPARATOR = "_";

    private static final String KEY_VALUE_END_SEPARATOR = "$";

    private static final String REGEX_KEY_VALUE = "[a-zA-Z0-9]+" + Pattern.quote(KEY_VALUE_SEPARATOR) + "[a-zA-Z0-9]+";

    private static final String REGEX_KEY_VALUE_PAIRS = REGEX_KEY_VALUE + "(" + Pattern.quote(KEY_VALUE_PAIR_SEPARATOR)
            + REGEX_KEY_VALUE + ")*";

    // The extension is quoted as well; otherwise its '.' would match any
    // character and e.g. "1-2$xpdb" would be accepted as a storage file.
    private static final String REGEX_STORAGE_FILE = String.format("(%1$s)%2$s[0-9]*%3$s", REGEX_KEY_VALUE_PAIRS,
            Pattern.quote(KEY_VALUE_END_SEPARATOR), Pattern.quote(PDB_EXTENSION));

    private static final Pattern EXTRACT_TAGS_PATTERN = Pattern.compile(REGEX_STORAGE_FILE);

    /** All known documents; the position in the list is the document id. */
    private final List<Doc> docIdToDoc = new ArrayList<>();

    /** Inverted index: tag key -> tag value -> ids of the documents having that tag. */
    private final Map<String, Map<String, IntList>> keyToValueToDocId = new HashMap<>();

    private final StringCompressor stringCompressor;

    private final FolderStorage folderStorage;

    /**
     * Opens the data store in the given directory and indexes all files that
     * already exist in its storage sub directory.
     *
     * @param dataDirectory
     *            directory containing the key mapping file and the storage
     *            folder
     * @throws IOException
     *             if the storage folder cannot be created or listed
     */
    public DataStore(final Path dataDirectory) throws IOException {
        stringCompressor = StringCompressor.create(keyCompressionFile(dataDirectory));
        folderStorage = new FolderStorage(storageDirectory(dataDirectory), 1000);
        init(folderStorage);
    }

    /**
     * Registers every file returned by the folder storage in the in-memory
     * index.
     */
    private void init(final FolderStorage folderStorage) throws IOException {
        // The stream is backed by an open directory handle and must be closed.
        try (final Stream<Path> files = folderStorage.list()) {
            files.forEach(path -> {
                final String filename = path.getFileName().toString();
                final Tags tags = toTags(filename);
                cacheTagToFileMapping(tags, path);
            });
        }
    }

    /**
     * Appends a new document for the given file and adds its id to the
     * inverted index of every tag.
     */
    private void cacheTagToFileMapping(final Tags tags, final Path path) {
        // ids are assigned in insertion order, so every IntList stays sorted
        final int docId = docIdToDoc.size();
        docIdToDoc.add(new Doc(tags, path));

        for (final String key : tags.getKeys()) {
            final Map<String, IntList> valueToDocIds = keyToValueToDocId.computeIfAbsent(key, k -> new HashMap<>());
            final String value = tags.getValue(key);
            final IntList docIds = valueToDocIds.computeIfAbsent(value, v -> new IntList());
            docIds.add(docId);
        }
    }

    private Path keyCompressionFile(final Path dataDirectory) throws IOException {
        return dataDirectory.resolve("keys.csv");
    }

    /**
     * Returns the sub directory of the data directory that contains the
     * storage files.
     *
     * @param dataDirectory
     *            the data directory
     * @return {@code dataDirectory/storage}
     * @throws IOException
     *             declared for callers; not thrown by the current
     *             implementation
     */
    public static Path storageDirectory(final Path dataDirectory) throws IOException {
        return dataDirectory.resolve(SUBDIR_STORAGE);
    }

    /**
     * Creates a new, empty storage file for the given tags and adds it to the
     * index.
     *
     * @param tags
     *            the tags of the new file
     * @return the path of the created file
     * @throws IOException
     *             if the file cannot be created
     */
    public Path createNewFile(final Tags tags) throws IOException {
        final Path filename = toFilename(tags);
        final Path result = folderStorage.insert(filename.toString(), PDB_EXTENSION);
        cacheTagToFileMapping(tags, result);
        return result;
    }

    /**
     * Encodes the tags into a file name prefix. Keys are processed in sorted
     * order; unknown keys and values are added to the string compressor.
     */
    private Path toFilename(final Tags tags) {
        final StringBuilder path = new StringBuilder();

        final SortedSet<String> sortedKeys = new TreeSet<>(tags.getKeys());
        for (final String key : sortedKeys) {
            final String value = tags.getValue(key);

            final int compressedKey = stringCompressor.put(key);
            final int compressedValue = stringCompressor.put(value);

            if (path.length() > 0) {
                path.append(KEY_VALUE_PAIR_SEPARATOR);
            }
            path.append(RadixConverter.toString(compressedKey));
            path.append(KEY_VALUE_SEPARATOR);
            path.append(RadixConverter.toString(compressedValue));
        }
        path.append(KEY_VALUE_END_SEPARATOR);

        return Paths.get(path.toString());
    }

    /**
     * Decodes the tags from a file name created by
     * {@link #toFilename(Tags)}. If the name does not contain the expected
     * pattern, the tags returned by {@code Tags.create()} are returned
     * unchanged. Pairs that do not consist of exactly two parts are skipped.
     */
    private Tags toTags(final String filename) {
        Tags tags = Tags.create();

        final Matcher matcher = EXTRACT_TAGS_PATTERN.matcher(filename);
        if (matcher.find()) {
            final String serializedTags = matcher.group(1);

            final String[] serializedKeyValuePairs = serializedTags.split(Pattern.quote(KEY_VALUE_PAIR_SEPARATOR));
            for (final String serializedKeyValuePair : serializedKeyValuePairs) {
                final String[] keyValuePair = serializedKeyValuePair.split(Pattern.quote(KEY_VALUE_SEPARATOR));
                if (keyValuePair.length == 2) {
                    final String key = stringCompressor.get(RadixConverter.fromString(keyValuePair[0]));
                    final String value = stringCompressor.get(RadixConverter.fromString(keyValuePair[1]));
                    tags = tags.copyAdd(key, value);
                }
            }
        }
        return tags;
    }

    /**
     * Returns all documents matching the query.
     *
     * @param query
     *            a query in the syntax understood by
     *            {@link QueryLanguageParser}
     * @return the matching documents in the order of the document ids
     *         produced by the query evaluation
     */
    public List<Doc> search(final String query) {
        final Expression expression = QueryLanguageParser.parse(query);
        final ExpressionToDocIdVisitor visitor = new ExpressionToDocIdVisitor(keyToValueToDocId,
                new AllDocIds(docIdToDoc));
        final IntList docIdsList = expression.visit(visitor);

        final List<Doc> result = new ArrayList<>(docIdsList.size());
        final int[] intDocIds = docIdsList.toArray();
        for (final int docId : intDocIds) {
            result.add(docIdToDoc.get(docId));
        }
        return result;
    }

    /**
     * Returns the keys of all tags known to the index.
     *
     * @return a new, sorted list of tag keys
     */
    public List<String> getAvailableFields() {
        final List<String> result = new ArrayList<>(keyToValueToDocId.keySet());
        Collections.sort(result);
        return result;
    }

    /**
     * Returns the values that the given key has in the documents matching the
     * query. Documents without a value for the key are ignored.
     *
     * @param query
     *            the query selecting the documents
     * @param key
     *            the tag key
     * @return the sorted set of values
     */
    public SortedSet<String> getAvailableValuesForKey(final String query, final String key) {
        final SortedSet<String> result = new TreeSet<>();

        final List<Doc> docs = search(query);
        for (final Doc doc : docs) {
            final String valueForKey = doc.getTags().getValue(key);
            if (valueForKey != null) {
                result.add(valueForKey);
            }
        }
        return result;
    }
}

View File

@@ -0,0 +1,30 @@
package org.lucares.pdb.datastore;
import java.nio.file.Path;
import org.lucares.pdb.api.Tags;
/**
 * Immutable pairing of a storage file with the tags describing it.
 */
public class Doc {

    private final Path path;

    private final Tags tags;

    /**
     * @param tags
     *            the tags of the file
     * @param path
     *            the location of the file
     */
    public Doc(final Tags tags, final Path path) {
        this.path = path;
        this.tags = tags;
    }

    /**
     * @return the location of the file
     */
    public Path getPath() {
        return path;
    }

    /**
     * @return the tags of the file
     */
    public Tags getTags() {
        return tags;
    }

    @Override
    public String toString() {
        final StringBuilder text = new StringBuilder();
        text.append("Doc [tags=").append(tags);
        text.append(", path=").append(path);
        text.append("]");
        return text.toString();
    }
}

View File

@@ -0,0 +1,20 @@
package org.lucares.pdb.datastore.internal;
import java.util.function.Function;
/**
 * Mapping function that ignores its argument and always answers with the
 * decimal representation of the index it was created with.
 */
public class CreateNewKey implements Function<String, String> {

    private final int index;

    /**
     * @param index
     *            the number returned (as string) for every key
     */
    public CreateNewKey(final int index) {
        this.index = index;
    }

    /**
     * @param key
     *            not used
     * @return the index as decimal string
     */
    @Override
    public String apply(final String key) {
        return Integer.toString(index);
    }
}

View File

@@ -0,0 +1,86 @@
package org.lucares.pdb.datastore.internal;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.attribute.BasicFileAttributes;
import java.util.function.BiPredicate;
import java.util.stream.Stream;
/**
 * Distributes files over a two level folder hierarchy
 * ({@code base/<firstLevel>/<secondLevel>/<file>}) so that a folder on the
 * second level receives at most {@code maxFilesPerFolder} files from this
 * class and a folder on the first level at most {@code maxFilesPerFolder} sub
 * folders.
 * <p>
 * Instances keep mutable counters and perform no synchronization.
 */
public class FolderStorage {

    private final Path storageBaseDirectory;

    private int firstLevel = 0;

    private int secondLevel = 0;

    private int filesInSecondLevel = 0;

    private Path currentDirectory;

    private final int maxFilesPerFolder;

    /**
     * Opens (and if necessary creates) the folder hierarchy.
     *
     * @param storageBaseDirectory
     *            root of the hierarchy
     * @param maxFilesPerFolder
     *            maximum number of entries per folder
     * @throws IOException
     *             if a folder cannot be created or listed
     */
    public FolderStorage(final Path storageBaseDirectory, final int maxFilesPerFolder) throws IOException {
        this.storageBaseDirectory = storageBaseDirectory;
        this.maxFilesPerFolder = maxFilesPerFolder;
        init();
    }

    /**
     * Determines the folder that is currently being filled. Folders are named
     * 0, 1, 2, ... so the number of entries minus one is the name of the
     * newest folder on each level.
     */
    private void init() throws IOException {
        Files.createDirectories(storageBaseDirectory);

        firstLevel = Math.max(countEntries(storageBaseDirectory) - 1, 0);
        final Path firstLevelDirectory = storageBaseDirectory.resolve(String.valueOf(firstLevel));
        Files.createDirectories(firstLevelDirectory);

        secondLevel = Math.max(countEntries(firstLevelDirectory) - 1, 0);
        currentDirectory = firstLevelDirectory.resolve(String.valueOf(secondLevel));
        Files.createDirectories(currentDirectory);

        filesInSecondLevel = countEntries(currentDirectory);
    }

    /**
     * Counts the direct entries of a directory. The stream returned by
     * {@link Files#list(Path)} holds an open directory handle, therefore it
     * is closed right after counting.
     */
    private static int countEntries(final Path directory) throws IOException {
        try (final Stream<Path> entries = Files.list(directory)) {
            return (int) entries.count();
        }
    }

    /**
     * Creates a new empty file in the folder that is currently being filled.
     * If {@code filenamePrefix + filenameSuffix} already exists there, a
     * counter starting at 1 is inserted between prefix and suffix until an
     * unused name is found.
     *
     * @param filenamePrefix
     *            first part of the file name
     * @param filenameSuffix
     *            last part of the file name, e.g. the extension
     * @return the path of the created file
     * @throws IOException
     *             if the file or its folder cannot be created
     */
    public Path insert(final String filenamePrefix, final String filenameSuffix) throws IOException {
        ensureCapacity();

        String filename = filenamePrefix + filenameSuffix;
        int index = 1;
        Path newFile = currentDirectory.resolve(filename);
        while (Files.exists(newFile)) {
            filename = filenamePrefix + index++ + filenameSuffix;
            newFile = currentDirectory.resolve(filename);
        }
        Files.createFile(newFile);
        filesInSecondLevel++;
        return newFile;
    }

    /**
     * Switches to the next folder when the current one is full; when the
     * first level folder is full as well, a new first level folder is started.
     */
    private void ensureCapacity() throws IOException {
        if (filesInSecondLevel >= maxFilesPerFolder) {
            secondLevel++;
            if (secondLevel >= maxFilesPerFolder) {
                firstLevel++;
                secondLevel = 0;
            }
            filesInSecondLevel = 0;

            updateCurrentDirectory();
        }
    }

    private void updateCurrentDirectory() throws IOException {
        currentDirectory = storageBaseDirectory.resolve(String.valueOf(firstLevel))
                .resolve(String.valueOf(secondLevel));
        Files.createDirectories(currentDirectory);
    }

    /**
     * Lists all regular files below the base directory, at any depth.
     *
     * @return a stream of file paths; it is created by
     *         {@link Files#find(Path, int, BiPredicate, java.nio.file.FileVisitOption...)}
     *         and should be closed by the caller
     * @throws IOException
     *             if the base directory cannot be accessed
     */
    public Stream<Path> list() throws IOException {
        final int maxDepth = Integer.MAX_VALUE;
        final BiPredicate<Path, BasicFileAttributes> matchRegularFiles = (path, attr) -> Files.isRegularFile(path);

        return Files.find(storageBaseDirectory, maxDepth, matchRegularFiles);
    }
}

View File

@@ -0,0 +1,42 @@
package org.lucares.pdb.datastore.internal;
/**
 * Converts non-negative integers to short alphanumeric strings and back,
 * using a positional numeral system whose digits are the characters of
 * {@link #ALPHABET}.
 */
public class RadixConverter {

    // NOTE(review): the alphabet has 61 symbols ('Q' is missing) and 'c' is
    // listed before 'b'. The conversion is consistent with itself, but the
    // alphabet defines the encoding of existing file names and therefore must
    // not be changed without migrating stored data.
    private static final String ALPHABET = "0123456789ABCDEFGHIJKLMNOPRSTUVWXYZacbdefghijklmnopqrstuvwxyz";

    /**
     * Encodes a number.
     *
     * @param value
     *            the number to encode, must not be negative
     * @return the encoded number, most significant digit first; never empty
     * @throws IllegalArgumentException
     *             if {@code value} is negative
     */
    public static String toString(final int value) {
        if (value < 0) {
            throw new IllegalArgumentException("value must not be negative");
        }

        final StringBuilder result = new StringBuilder();
        int v = value;
        if (v == 0) {
            result.append(ALPHABET.charAt(0));
        } else {
            while (v > 0) {
                final int remainder = v % ALPHABET.length();
                v = v / ALPHABET.length();
                // digits are produced least significant first
                result.insert(0, ALPHABET.charAt(remainder));
            }
        }
        return result.toString();
    }

    /**
     * Decodes a string created by {@link #toString(int)}. The empty string
     * decodes to 0.
     *
     * @param string
     *            the encoded number
     * @return the decoded number
     * @throws IllegalArgumentException
     *             if the string contains a character that is not a digit of
     *             this numeral system
     */
    public static int fromString(final String string) {
        int result = 0;
        for (int i = 0; i < string.length(); i++) {
            final char c = string.charAt(i);
            final int value = ALPHABET.indexOf(c);
            if (value < 0) {
                // without this check the -1 would silently be folded into the result
                throw new IllegalArgumentException("invalid character '" + c + "' in \"" + string + "\"");
            }
            result = result * ALPHABET.length() + value;
        }
        return result;
    }
}

View File

@@ -0,0 +1,10 @@
package org.lucares.pdb.datastore.internal;
/**
 * Unchecked wrapper used to pass a failure (in this code base an
 * {@link java.io.IOException}) through code that cannot declare checked
 * exceptions.
 */
public class RuntimeIOException extends RuntimeException {

    private static final long serialVersionUID = 1L;

    /**
     * @param cause
     *            the wrapped failure; available via {@link #getCause()}
     */
    public RuntimeIOException(final Throwable cause) {
        super(cause);
    }
}

View File

@@ -0,0 +1,32 @@
package org.lucares.pdb.datastore.internal;
import java.nio.file.Path;
import org.lucares.pdb.datastore.internal.map.UniqueStringIntegerPairs;
/**
 * Replaces strings by integers and vice versa. The assignments are kept in a
 * {@link UniqueStringIntegerPairs} instance.
 */
public class StringCompressor {

    private final UniqueStringIntegerPairs pairs;

    /**
     * @param usip
     *            the store holding the string/integer assignments
     */
    public StringCompressor(final UniqueStringIntegerPairs usip) throws RuntimeIOException {
        this.pairs = usip;
    }

    /**
     * Creates a compressor whose assignments are stored in the given file.
     *
     * @param path
     *            the file holding the assignments
     * @return a new compressor
     */
    public static StringCompressor create(final Path path) {
        return new StringCompressor(new UniqueStringIntegerPairs(path));
    }

    /**
     * Returns the integer assigned to the string. A string without an
     * assignment receives the highest integer in use plus one.
     *
     * @param string
     *            the string to look up or register
     * @return the integer assigned to the string
     */
    public Integer put(final String string) {
        return pairs.computeIfAbsent(string, unused -> nextFreeInteger());
    }

    /**
     * Returns the string the integer is assigned to.
     *
     * @param integer
     *            the integer to look up
     * @return the result of the lookup in the underlying store
     */
    public String get(final int integer) {
        return pairs.getKey(integer);
    }

    private Integer nextFreeInteger() {
        return pairs.getHighestInteger() + 1;
    }
}

View File

@@ -0,0 +1,126 @@
package org.lucares.pdb.datastore.internal.map;
import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.Writer;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.HashMap;
import java.util.Map;
import java.util.Set;
import java.util.SortedMap;
import java.util.TreeMap;
import java.util.function.Function;
import java.util.regex.Pattern;
import org.lucares.pdb.datastore.internal.RuntimeIOException;
/**
 * A very simple {@link Set}-like or {@link Map}-like datastructure that stores
 * unique&sup1; pairs of Strings and integers persistently.
 * <p>
 * (1) Unique means, that neither the string, nor the integer may occur twice.
 * For Example, imagine the pair ("a", 1) already exists, then neither ("a", 2)
 * nor ("b", 1) may be added.
 * <p>
 * You can only add pairs. No deletion. It keeps an in memory view for fast
 * retrievals.
 * <p>
 * The pairs are stored one per line as {@code string<TAB>integer} in a UTF-8
 * encoded file. The class performs no synchronization.
 */
public class UniqueStringIntegerPairs {

    private static final String SEPARATOR = "\t";

    private static final boolean APPEND = true;

    /**
     * Maps a string to an integer. E.g. "myLongValue" -> 123
     */
    private final Map<String, Integer> stringToInt = new HashMap<>();

    /**
     * Maps an integer to a string. E.g. 123 -> "myLongValue"
     */
    private final SortedMap<Integer, String> intToString = new TreeMap<>();

    private final Path file;

    /**
     * Loads the pairs from the file; the file and its parent directories are
     * created if they do not exist.
     *
     * @param file
     *            the file holding the pairs
     * @throws RuntimeIOException
     *             if the file cannot be created or read
     */
    public UniqueStringIntegerPairs(final Path file) {
        this.file = file;
        init(file);
    }

    /**
     * Reads all pairs of the file into the in-memory maps. Lines that do not
     * consist of exactly two tab separated tokens are ignored.
     */
    private void init(final Path file) throws RuntimeIOException {
        try {
            // A path like "keys.csv" has no parent; the file then lives in
            // the working directory and there is nothing to create.
            final Path parent = file.getParent();
            if (parent != null) {
                Files.createDirectories(parent);
            }
            if (!Files.exists(file)) {
                Files.createFile(file);
            }

            try (final BufferedReader reader = new BufferedReader(
                    new InputStreamReader(new FileInputStream(file.toFile()), StandardCharsets.UTF_8))) {
                String line;
                while ((line = reader.readLine()) != null) {
                    final String[] tokens = line.split(Pattern.quote(SEPARATOR));

                    if (tokens.length == 2) {
                        final String string = tokens[0];
                        final int value = Integer.parseInt(tokens[1]);
                        intToString.put(value, string);
                        stringToInt.put(string, value);
                    }
                }
            }
        } catch (final IOException e) {
            throw new RuntimeIOException(e);
        }
    }

    /**
     * Adds a new pair, first to the file and then to the in-memory view.
     *
     * @param first
     *            the string
     * @param second
     *            the integer
     * @throws IllegalArgumentException
     *             if the string or the integer is already part of a pair
     * @throws RuntimeIOException
     *             if the pair cannot be written
     */
    public void put(final String first, final int second) {

        if (stringToInt.containsKey(first) || intToString.containsKey(second)) {
            throw new IllegalArgumentException("Unique key constraint violation for (" + first + ", " + second + ")");
        }

        // NOTE(review): a string containing a tab or a line break is written
        // as-is; such a line is not recognized as a pair when the file is read
        // again. Confirm that callers never pass such strings.
        try (final Writer writer = new OutputStreamWriter(new FileOutputStream(file.toFile(), APPEND),
                StandardCharsets.UTF_8)) {
            writer.write(first + SEPARATOR + second + "\n");
        } catch (final IOException e) {
            throw new RuntimeIOException(e);
        }

        intToString.put(second, first);
        stringToInt.put(first, second);
    }

    /**
     * @param first
     *            the string to look up
     * @return the integer paired with the string, or {@code null} if there is
     *         no such pair
     */
    public Integer get(final String first) {
        return stringToInt.get(first);
    }

    /**
     * @param second
     *            the integer to look up
     * @return the string paired with the integer, or {@code null} if there is
     *         no such pair
     */
    public String getKey(final Integer second) {
        return intToString.get(second);
    }

    /**
     * @return the highest integer that is part of a pair, or -1 if there are
     *         no pairs
     */
    public Integer getHighestInteger() {
        return intToString.isEmpty() ? -1 : intToString.lastKey();
    }

    /**
     * Returns the integer paired with the string. If there is no such pair,
     * the mapping function is asked for an integer and the new pair is
     * stored.
     *
     * @param first
     *            the string to look up
     * @param mappingFunction
     *            computes the integer for a string without pair
     * @return the integer paired with the string
     */
    public Integer computeIfAbsent(final String first, final Function<String, Integer> mappingFunction) {
        if (!stringToInt.containsKey(first)) {
            final Integer second = mappingFunction.apply(first);
            put(first, second);
        }

        return stringToInt.get(first);
    }
}

View File

@@ -0,0 +1,19 @@
package org.lucares.pdb.datastore.lang;
import org.antlr.v4.runtime.BaseErrorListener;
import org.antlr.v4.runtime.RecognitionException;
import org.antlr.v4.runtime.Recognizer;
/**
 * Error listener that turns every reported syntax error into a
 * {@link SyntaxException}.
 */
public class ErrorListener extends BaseErrorListener {

    /**
     * Throws a {@link SyntaxException} for the reported error. Only a single
     * position is reported, so it is used as both start and end of the
     * faulty range.
     */
    @Override
    public void syntaxError(final Recognizer<?, ?> recognizer, final Object offendingSymbol, final int line,
            final int charPositionInLine, final String msg, final RecognitionException e) {
        throw new SyntaxException(msg, line, charPositionInLine, line, charPositionInLine);
    }
}

View File

@@ -0,0 +1,437 @@
package org.lucares.pdb.datastore.lang;
/**
 * Base class of the nodes of a parsed query. Concrete node types are nested
 * classes; each of them dispatches to the matching {@code visit} overload of
 * an {@link ExpressionVisitor}.
 */
public abstract class Expression {

    /**
     * Accepts a visitor. Node types that do not override this method cannot
     * be visited.
     *
     * @param visitor
     *            the visitor
     * @return the result computed by the visitor
     * @throws UnsupportedOperationException
     *             if the node type does not support visitors
     */
    public <T> T visit(final ExpressionVisitor<T> visitor) {
        throw new UnsupportedOperationException();
    }

    /**
     * @return the shared {@link MatchAll} instance
     */
    public static MatchAll matchAll() {
        return MatchAll.INSTANCE;
    }

    /**
     * Base class for nodes that remember their position in the query text.
     */
    abstract static class UnaryExpression extends Expression {

        private final int line;

        private final int startIndex;

        private final int stopIndex;

        public UnaryExpression(final int line, final int startIndex, final int stopIndex) {
            this.line = line;
            this.startIndex = startIndex;
            this.stopIndex = stopIndex;
        }

        int getLine() {
            return line;
        }

        int getStartIndex() {
            return startIndex;
        }

        int getStopIndex() {
            return stopIndex;
        }
    }

    /**
     * Placeholder for a binary operator whose operands are not known yet.
     */
    abstract static class TemporaryExpression extends Expression {

        /** Creates the final node once both operands are available. */
        abstract Expression toExpression(Expression left, Expression right);
    }

    /** Placeholder that becomes an {@link Or}. */
    static class OrTemporary extends TemporaryExpression {

        @Override
        Expression toExpression(final Expression left, final Expression right) {
            return new Or(left, right);
        }

        @Override
        public String toString() {
            return "OrTemporary";
        }
    }

    /** Placeholder that becomes an {@link And}. */
    static class AndTemporary extends TemporaryExpression {

        @Override
        Expression toExpression(final Expression left, final Expression right) {
            return new And(left, right);
        }

        @Override
        public String toString() {
            return "AndTemporary";
        }
    }

    /** Negation of an expression. */
    static class Not extends Expression {

        private final Expression expression;

        Not(final Expression expression) {
            this.expression = expression;
        }

        Expression getExpression() {
            return expression;
        }

        @Override
        public <T> T visit(final ExpressionVisitor<T> visitor) {
            return visitor.visit(this);
        }

        @Override
        public String toString() {
            return "!" + expression;
        }

        @Override
        public int hashCode() {
            return 31 + (expression == null ? 0 : expression.hashCode());
        }

        @Override
        public boolean equals(final Object obj) {
            if (this == obj) {
                return true;
            }
            if (obj == null || getClass() != obj.getClass()) {
                return false;
            }
            final Not that = (Not) obj;
            return expression == null ? that.expression == null : expression.equals(that.expression);
        }
    }

    /** Disjunction of two expressions. */
    static class Or extends Expression {

        private final Expression left;

        private final Expression right;

        Or(final Expression left, final Expression right) {
            this.left = left;
            this.right = right;
        }

        Expression getLeft() {
            return left;
        }

        Expression getRight() {
            return right;
        }

        @Override
        public <T> T visit(final ExpressionVisitor<T> visitor) {
            return visitor.visit(this);
        }

        @Override
        public String toString() {
            return " (" + left + " or " + right + ") ";
        }

        @Override
        public int hashCode() {
            final int leftHash = left == null ? 0 : left.hashCode();
            final int rightHash = right == null ? 0 : right.hashCode();
            return 31 * (31 + leftHash) + rightHash;
        }

        @Override
        public boolean equals(final Object obj) {
            if (this == obj) {
                return true;
            }
            if (obj == null || getClass() != obj.getClass()) {
                return false;
            }
            final Or that = (Or) obj;
            final boolean sameLeft = left == null ? that.left == null : left.equals(that.left);
            if (!sameLeft) {
                return false;
            }
            return right == null ? that.right == null : right.equals(that.right);
        }
    }

    /** Conjunction of two expressions. */
    static class And extends Expression {

        private final Expression left;

        private final Expression right;

        And(final Expression left, final Expression right) {
            this.left = left;
            this.right = right;
        }

        Expression getLeft() {
            return left;
        }

        Expression getRight() {
            return right;
        }

        @Override
        public <T> T visit(final ExpressionVisitor<T> visitor) {
            return visitor.visit(this);
        }

        @Override
        public String toString() {
            return " (" + left + " and " + right + ") ";
        }

        @Override
        public int hashCode() {
            final int leftHash = left == null ? 0 : left.hashCode();
            final int rightHash = right == null ? 0 : right.hashCode();
            return 31 * (31 + leftHash) + rightHash;
        }

        @Override
        public boolean equals(final Object obj) {
            if (this == obj) {
                return true;
            }
            if (obj == null || getClass() != obj.getClass()) {
                return false;
            }
            final And that = (And) obj;
            final boolean sameLeft = left == null ? that.left == null : left.equals(that.left);
            if (!sameLeft) {
                return false;
            }
            return right == null ? that.right == null : right.equals(that.right);
        }
    }

    /** Singleton node, obtained via {@link Expression#matchAll()}. */
    static class MatchAll extends Expression {

        public static final MatchAll INSTANCE = new MatchAll();

        private MatchAll() {
            // singleton
        }

        @Override
        public <T> T visit(final ExpressionVisitor<T> visitor) {
            return visitor.visit(this);
        }

        @Override
        public String toString() {
            return "true";
        }
    }

    /**
     * A literal of the query together with its position. Equality and hash
     * code are based on the literal only.
     */
    static class Terminal extends UnaryExpression {

        private final String value;

        Terminal(final String value, final int line, final int startIndex, final int stopIndex) {
            super(line, startIndex, stopIndex);
            this.value = value;
        }

        public String getValue() {
            return value;
        }

        @Override
        public <T> T visit(final ExpressionVisitor<T> visitor) {
            return visitor.visit(this);
        }

        @Override
        public String toString() {
            return value;
        }

        @Override
        public int hashCode() {
            return 31 + (value == null ? 0 : value.hashCode());
        }

        @Override
        public boolean equals(final Object obj) {
            if (this == obj) {
                return true;
            }
            if (obj == null || getClass() != obj.getClass()) {
                return false;
            }
            final Terminal that = (Terminal) obj;
            return value == null ? that.value == null : value.equals(that.value);
        }
    }

    /** Comparison of a property with a value ({@code property = value}). */
    static class Property extends Expression {

        final String property;

        final Terminal value;

        public Property(final String property, final Terminal value) {
            this.property = property;
            this.value = value;
        }

        @Override
        public <T> T visit(final ExpressionVisitor<T> visitor) {
            return visitor.visit(this);
        }

        @Override
        public String toString() {
            return " " + property + " = " + value.getValue() + " ";
        }

        @Override
        public int hashCode() {
            final int propertyHash = property == null ? 0 : property.hashCode();
            final int valueHash = value == null ? 0 : value.hashCode();
            return 31 * (31 + propertyHash) + valueHash;
        }

        @Override
        public boolean equals(final Object obj) {
            if (this == obj) {
                return true;
            }
            if (obj == null || getClass() != obj.getClass()) {
                return false;
            }
            final Property that = (Property) obj;
            final boolean sameProperty = property == null ? that.property == null : property.equals(that.property);
            if (!sameProperty) {
                return false;
            }
            return value == null ? that.value == null : value.equals(that.value);
        }
    }

    /** An expression that was enclosed in parentheses. */
    static class Parentheses extends Expression {

        private final Expression expression;

        Parentheses(final Expression expression) {
            this.expression = expression;
        }

        public Expression getExpression() {
            return expression;
        }

        @Override
        public <T> T visit(final ExpressionVisitor<T> visitor) {
            return visitor.visit(this);
        }

        @Override
        public String toString() {
            return " [ " + expression + " ] ";
        }

        @Override
        public int hashCode() {
            return 31 + (expression == null ? 0 : expression.hashCode());
        }

        @Override
        public boolean equals(final Object obj) {
            if (this == obj) {
                return true;
            }
            if (obj == null || getClass() != obj.getClass()) {
                return false;
            }
            final Parentheses that = (Parentheses) obj;
            return expression == null ? that.expression == null : expression.equals(that.expression);
        }
    }
}

View File

@@ -0,0 +1,248 @@
package org.lucares.pdb.datastore.lang;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Objects;
import java.util.regex.Pattern;
import org.apache.commons.lang3.StringUtils;
import org.lucares.collections.IntList;
import org.lucares.pdb.datastore.Doc;
import org.lucares.pdb.datastore.lang.Expression.And;
import org.lucares.pdb.datastore.lang.Expression.Not;
import org.lucares.pdb.datastore.lang.Expression.Or;
import org.lucares.pdb.datastore.lang.Expression.Parentheses;
import org.lucares.pdb.datastore.lang.Expression.Property;
import org.lucares.pdb.datastore.lang.Expression.Terminal;
import org.lucares.utils.CollectionUtils;
/**
 * Evaluates an {@link Expression} against the inverted index
 * {@code key -> value -> document ids} and returns the ids of the matching
 * documents.
 * <p>
 * The set operations below walk the id lists in parallel and therefore expect
 * every list to be sorted in ascending order.
 */
public class ExpressionToDocIdVisitor extends ExpressionVisitor<IntList> {

    /**
     * Provides the list of all document ids (0 .. number of documents - 1).
     * The list is cached and rebuilt when the number of documents changed.
     */
    public static final class AllDocIds {

        private final List<Doc> docIdToPath;

        private IntList cachedPathIds = new IntList();

        public AllDocIds(final List<Doc> docIdToPath) {
            this.docIdToPath = docIdToPath;
        }

        public IntList getAllDocIds() {
            final int count = docIdToPath.size();
            if (cachedPathIds.size() == count) {
                return cachedPathIds;
            }

            final IntList fresh = new IntList(count);
            for (int id = 0; id < count; id++) {
                fresh.add(id);
            }
            cachedPathIds = fresh;
            return fresh;
        }
    }

    private static final Map<String, IntList> EMPTY_VALUES = Collections.emptyMap();

    private static final IntList EMPTY_DOC_IDS = new IntList();

    private final Map<String, Map<String, IntList>> keyToValueToDocId;

    private final AllDocIds allDocIds;

    /**
     * @param keyToValueToDocId
     *            the inverted index
     * @param allDocIds
     *            source for the ids of all documents
     */
    public ExpressionToDocIdVisitor(final Map<String, Map<String, IntList>> keyToValueToDocId,
            final AllDocIds allDocIds) {
        this.keyToValueToDocId = keyToValueToDocId;
        this.allDocIds = allDocIds;
    }

    /** Intersection of the ids matching the left and the right operand. */
    @Override
    public IntList visit(final And expression) {
        final IntList lhs = expression.getLeft().visit(this);
        final IntList rhs = expression.getRight().visit(this);

        final IntList common = new IntList(Math.min(lhs.size(), rhs.size()));
        int li = 0;
        int ri = 0;
        while (li < lhs.size() && ri < rhs.size()) {
            final int leftId = lhs.get(li);
            final int rightId = rhs.get(ri);

            if (leftId == rightId) {
                common.add(leftId);
                li++;
                ri++;
            } else if (leftId < rightId) {
                li++;
            } else {
                ri++;
            }
        }
        return common;
    }

    /** Union of the ids matching the left and the right operand. */
    @Override
    public IntList visit(final Or expression) {
        final IntList lhs = expression.getLeft().visit(this);
        final IntList rhs = expression.getRight().visit(this);
        return merge(lhs, rhs);
    }

    /** All ids that do not match the negated expression. */
    @Override
    public IntList visit(final Not expression) {
        final int[] excluded = expression.getExpression().visit(this).toArray();
        final IntList everything = getAllDocIds();

        final IntList remaining = new IntList(everything.size());
        for (int i = 0; i < everything.size(); i++) {
            final int candidate = everything.get(i);
            final boolean isExcluded = Arrays.binarySearch(excluded, candidate) >= 0;
            if (!isExcluded) {
                remaining.add(candidate);
            }
        }
        return remaining;
    }

    @Override
    public IntList visit(final Parentheses parentheses) {
        throw new UnsupportedOperationException(
                "Parenthesis not supported. The correct order should come from the parser.");
    }

    @Override
    public IntList visit(final Expression.MatchAll expression) {
        return getAllDocIds();
    }

    private IntList getAllDocIds() {
        return allDocIds.getAllDocIds();
    }

    /**
     * Ids of the documents whose property has the given value. The value may
     * be "*" (any value) or contain "*" as wildcard.
     */
    @Override
    public IntList visit(final Property expression) {
        final String key = expression.property;
        final String wanted = expression.value.getValue();

        if (isMatchAll(wanted)) {
            return merge(keyToValueToDocId.getOrDefault(key, EMPTY_VALUES).values());
        }
        if (containsWildcard(wanted)) {
            final Collection<IntList> matches = filterByWildcard(key, globToRegex(wanted));
            return merge(matches);
        }
        final Map<String, IntList> valueToDocId = keyToValueToDocId.getOrDefault(key, EMPTY_VALUES);
        return valueToDocId.getOrDefault(wanted, EMPTY_DOC_IDS);
    }

    /**
     * Translates a pattern with "*" wildcards into a regular expression in
     * which everything but the wildcards is quoted.
     */
    private Pattern globToRegex(final String globPattern) {
        final String[] literalParts = StringUtils.splitPreserveAllTokens(globPattern, "*");
        final List<String> quotedParts = CollectionUtils.map(literalParts, Pattern::quote);
        return Pattern.compile(String.join(".*", quotedParts));
    }

    /** Collects the id lists of all values of the property that match the pattern. */
    private List<IntList> filterByWildcard(final String propertyName, final Pattern valuePattern) {
        final Map<String, IntList> valueToDocId = keyToValueToDocId.getOrDefault(propertyName, EMPTY_VALUES);

        final List<IntList> matching = new ArrayList<>();
        for (final Entry<String, IntList> candidate : valueToDocId.entrySet()) {
            final String value = candidate.getKey();
            if (valuePattern.matcher(value).matches()) {
                matching.add(candidate.getValue());
            }
        }
        return matching;
    }

    private boolean containsWildcard(final String stringValue) {
        return stringValue.contains("*");
    }

    /** Union of all given id lists. */
    private IntList merge(final Collection<IntList> lists) {
        IntList union = new IntList();
        for (final IntList next : lists) {
            union = merge(union, next);
        }
        return union;
    }

    private boolean isMatchAll(final String stringValue) {
        return "*".equals(stringValue);
    }

    /** Union of two sorted id lists; ids contained in both are added once. */
    private IntList merge(final IntList leftFiles, final IntList rightFiles) {
        final IntList union = new IntList(leftFiles.size() + rightFiles.size());

        int li = 0;
        int ri = 0;
        while (li < leftFiles.size() && ri < rightFiles.size()) {
            final int leftId = leftFiles.get(li);
            final int rightId = rightFiles.get(ri);

            if (leftId == rightId) {
                union.add(leftId);
                li++;
                ri++;
            } else if (leftId < rightId) {
                union.add(leftId);
                li++;
            } else {
                union.add(rightId);
                ri++;
            }
        }

        // at most one of the lists has elements left
        if (li < leftFiles.size()) {
            union.addAll(leftFiles.get(li, leftFiles.size() - li));
        } else if (ri < rightFiles.size()) {
            union.addAll(rightFiles.get(ri, rightFiles.size() - ri));
        }
        return union;
    }
}

View File

@@ -0,0 +1,31 @@
package org.lucares.pdb.datastore.lang;
/**
 * Visitor over the node types of {@link Expression}. Every method throws an
 * {@link UnsupportedOperationException}; subclasses override the methods for
 * the node types they support.
 *
 * @param <T>
 *            the type of the result computed for a node
 */
public abstract class ExpressionVisitor<T> {

    public T visit(final Expression.And expression) {
        throw notSupported();
    }

    public T visit(final Expression.Or expression) {
        throw notSupported();
    }

    public T visit(final Expression.Not expression) {
        throw notSupported();
    }

    public T visit(final Expression.Property expression) {
        throw notSupported();
    }

    public T visit(final Expression.Terminal expression) {
        throw notSupported();
    }

    public T visit(final Expression.MatchAll expression) {
        throw notSupported();
    }

    public T visit(final Expression.Parentheses parentheses) {
        throw notSupported();
    }

    /** Creates the exception thrown for node types without an override. */
    private static UnsupportedOperationException notSupported() {
        return new UnsupportedOperationException();
    }
}

View File

@@ -0,0 +1,109 @@
package org.lucares.pdb.datastore.lang;
import java.util.Stack;
import org.antlr.v4.runtime.CharStream;
import org.antlr.v4.runtime.CharStreams;
import org.antlr.v4.runtime.CommonTokenStream;
import org.antlr.v4.runtime.tree.ParseTree;
import org.antlr.v4.runtime.tree.ParseTreeListener;
import org.antlr.v4.runtime.tree.ParseTreeWalker;
import org.lucares.pdb.datastore.lang.Expression.AndTemporary;
import org.lucares.pdb.datastore.lang.Expression.Not;
import org.lucares.pdb.datastore.lang.Expression.OrTemporary;
import org.lucares.pdb.datastore.lang.Expression.Property;
import org.lucares.pdb.datastore.lang.Expression.TemporaryExpression;
import org.lucares.pdb.datastore.lang.Expression.Terminal;
import org.lucares.pdb.datastore.lang.PdbLangParser.BinaryAndExpressionContext;
import org.lucares.pdb.datastore.lang.PdbLangParser.BinaryOrExpressionContext;
import org.lucares.pdb.datastore.lang.PdbLangParser.IdentifierExpressionContext;
import org.lucares.pdb.datastore.lang.PdbLangParser.NotExpressionContext;
import org.lucares.pdb.datastore.lang.PdbLangParser.PropertyExpressionContext;
/**
 * Turns a query string into an {@link Expression} tree using the generated
 * PdbLang lexer and parser.
 *
 * The tree is assembled bottom-up: a parse-tree listener pushes a node for
 * every identifier and, on leaving a property/not/and/or rule, pops the
 * operands of that rule and pushes the combined node.
 */
public class QueryLanguage {

    /** Longest identifier text, in characters, that is accepted. */
    private static final int MAX_TOKEN_LENGTH = 255;

    /**
     * Parses the given query.
     *
     * @param input the query text
     * @return the root of the resulting expression tree
     * @throws SyntaxException  if an identifier is longer than 255 characters
     * @throws RuntimeException if the walk does not leave exactly one
     *                          expression on the stack
     */
    public Expression parse(final String input) {
        final CharStream characters = CharStreams.fromString(input);

        // Lexer and parser both report through the project's ErrorListener.
        final PdbLangLexer lexer = new PdbLangLexer(characters);
        lexer.addErrorListener(new ErrorListener());
        final PdbLangParser parser = new PdbLangParser(new CommonTokenStream(lexer));
        parser.addErrorListener(new ErrorListener());

        final Stack<Expression> operands = new Stack<>();

        // Parenthesised expressions have no handler: their inner expression
        // simply stays on the stack.
        final ParseTreeListener treeBuilder = new PdbLangBaseListener() {

            @Override
            public void exitIdentifierExpression(final IdentifierExpressionContext ctx) {
                final String text = ctx.getText();
                if (text.length() > MAX_TOKEN_LENGTH) {
                    throw new SyntaxException(ctx, "token too long");
                }
                operands.push(new Terminal(text, ctx.getStart().getLine(), ctx.getStart().getStartIndex(),
                        ctx.getStart().getStopIndex()));
            }

            @Override
            public void exitPropertyExpression(final PropertyExpressionContext ctx) {
                // The value was pushed last, so it comes off the stack first.
                final Expression value = operands.pop();
                final Expression name = operands.pop();
                operands.push(new Property(((Terminal) name).getValue(), (Terminal) value));
            }

            @Override
            public void exitNotExpression(final NotExpressionContext ctx) {
                operands.push(new Not(operands.pop()));
            }

            @Override
            public void exitBinaryAndExpression(final BinaryAndExpressionContext ctx) {
                reduceBinary(operands, new AndTemporary());
            }

            @Override
            public void exitBinaryOrExpression(final BinaryOrExpressionContext ctx) {
                reduceBinary(operands, new OrTemporary());
            }
        };

        // "start" is the grammar's entry rule.
        final ParseTree tree = parser.start();
        new ParseTreeWalker().walk(treeBuilder, tree);

        if (operands.size() != 1) {
            throw new RuntimeException("stack should have exactly one element " + operands);
        }
        return operands.pop();
    }

    /** Replaces the two topmost expressions (right on top of left) with the operator's combined expression. */
    private static void reduceBinary(final Stack<Expression> operands, final TemporaryExpression operator) {
        final Expression right = operands.pop();
        final Expression left = operands.pop();
        operands.push(operator.toExpression(left, right));
    }
}

View File

@@ -0,0 +1,17 @@
package org.lucares.pdb.datastore.lang;
import org.apache.commons.lang3.StringUtils;
/**
 * Static entry point for turning a query string into an {@link Expression}.
 */
public class QueryLanguageParser {

    /**
     * Parses the query, treating an empty query as "match everything".
     *
     * @param query the query text; {@code null} or {@code ""} yields
     *              {@link Expression#matchAll()}
     * @return the parsed expression
     */
    public static Expression parse(final String query) {
        if (StringUtils.isEmpty(query)) {
            return Expression.matchAll();
        }
        return new QueryLanguage().parse(query);
    }
}

View File

@@ -0,0 +1,64 @@
package org.lucares.pdb.datastore.lang;
import org.antlr.v4.runtime.ParserRuleContext;
/**
 * Unchecked exception that carries the source range (start line/index and
 * stop line/index) to which a query problem refers. The range is appended
 * to the exception message and is also available through the accessors.
 */
public class SyntaxException extends RuntimeException {

    private static final long serialVersionUID = 1L;

    private int lineStart;
    private int startIndex;
    private int lineStop;
    private int stopIndex;

    /**
     * Creates the exception for the range covered by a parser rule context,
     * taken from the context's start and stop tokens.
     *
     * @param context the rule context the problem refers to
     * @param message description of the problem
     */
    public SyntaxException(final ParserRuleContext context, final String message) {
        this(message,
                context.getStart().getLine(),
                context.getStart().getStartIndex(),
                context.getStop().getLine(),
                context.getStop().getStopIndex());
    }

    /**
     * Creates the exception for an explicit range.
     *
     * @param message    description of the problem; the range is appended to it
     * @param lineStart  line of the first token
     * @param startIndex start index of the first token
     * @param lineStop   line of the last token
     * @param stopIndex  stop index of the last token
     */
    public SyntaxException(final String message, final int lineStart, final int startIndex, final int lineStop,
            final int stopIndex) {
        super(message + ": " + describeRange(lineStart, startIndex, lineStop, stopIndex));
        this.lineStart = lineStart;
        this.startIndex = startIndex;
        this.lineStop = lineStop;
        this.stopIndex = stopIndex;
    }

    /** Renders the range as the suffix used in the exception message. */
    private static String describeRange(final int fromLine, final int fromIndex, final int toLine,
            final int toIndex) {
        return String.format("line=%d, start=%d, to line=%d stop=%d", fromLine, fromIndex, toLine, toIndex);
    }

    // --- accessors ---

    public int getLineStart() {
        return lineStart;
    }

    public int getStartIndex() {
        return startIndex;
    }

    public int getLineStop() {
        return lineStop;
    }

    public int getStopIndex() {
        return stopIndex;
    }

    // --- mutators (do not change the message built at construction time) ---

    public void setLineStart(final int lineStart) {
        this.lineStart = lineStart;
    }

    public void setStartIndex(final int startIndex) {
        this.startIndex = startIndex;
    }

    public void setLineStop(final int lineStop) {
        this.lineStop = lineStop;
    }

    public void setStopIndex(final int stopIndex) {
        this.stopIndex = stopIndex;
    }
}