replace ludb with data-store
LuDB has a few disadvantages.
1. Most notably disk space. H2 wastes a lot of valuable disk space.
For my test data set with 44 million entries it is 14 MB
(sometimes a lot more; depends on H2 internal cleanup). With
data-store it is 15 KB.
Overall I could reduce the disk space from 231 MB to 200 MB (13.4 %
in this example). That is an average of 4.6 bytes per entry.
2. Speed:
a) Liquibase is slow. The first run takes approx. three seconds.
b) Query and insertion: with data-store we can insert entries
up to 1.6 times faster.
Data-store uses a few tricks to save disk space:
1. We encode the tags into the file names.
2. To keep them short we translate the key/value of the tag into
shorter numbers. For example "foo" -> 12 and "bar" -> 47. So the
tag "foo"/"bar" would be 12/47.
We then translate this number into a numeral system of base 62
(a-zA-Z0-9), so it can be used for file names and it is shorter.
That way we only have to store the mapping of string to int.
3. We do that in a simple tab separated file.
This commit is contained in:
@@ -0,0 +1,65 @@
|
||||
grammar PdbLang;

@header {
package org.lucares.pdb.datastore.lang;
}

// ---------------------------------------------------------------------------
// Parser rules
// ---------------------------------------------------------------------------

// Entry point: exactly one expression followed by the end of the input.
start
    : expression EOF
    ;

// Boolean query expression. The alternatives are listed in the order
// parentheses, negation, property comparison, 'and', 'or'.
expression
    : LPAREN expression RPAREN                   #parenExpression
    | NOT expression                             #notExpression
    | prop=identifier eq=equal value=propValue   #propertyExpression
    | left=expression AND right=expression       #binaryAndExpression
    | left=expression OR right=expression        #binaryOrExpression
    ;

identifier
    : IDENTIFIER    #identifierExpression
    ;

// The right-hand side of a comparison uses the same token as the property name.
propValue
    : identifier
    ;

equal
    : EQUAL
    ;

// ---------------------------------------------------------------------------
// Lexer rules
// ---------------------------------------------------------------------------

// The keyword and operator tokens are declared before IDENTIFIER.
AND    : 'and' ;
OR     : 'or' ;
NOT    : '!' ;
EQUAL  : '=' ;
LPAREN : '(' ;
RPAREN : ')' ;

// Whitespace (space, CR, tab, form feed, LF) is dropped.
WS : [ \r\t\u000C\n]+ -> skip ;

IDENTIFIER
    : JavaLetter JavaLetterOrDigit*
    ;

// First character of an IDENTIFIER.
fragment
JavaLetter
    : // ASCII letters, digits, '$', '_' and the asterisk (\u002a) used for wildcards
      [a-zA-Z0-9$_\u002a]
    | // any character above 0x7F that is not a high surrogate and may start a Java identifier
      ~[\u0000-\u007F\uD800-\uDBFF]
      {Character.isJavaIdentifierStart(_input.LA(-1))}?
    | // UTF-16 surrogate pairs, i.e. code points U+10000 to U+10FFFF
      [\uD800-\uDBFF] [\uDC00-\uDFFF]
      {Character.isJavaIdentifierStart(Character.toCodePoint((char)_input.LA(-2), (char)_input.LA(-1)))}?
    ;

// Any character of an IDENTIFIER after the first one.
fragment
JavaLetterOrDigit
    : // ASCII letters, digits, '$', '_' and the asterisk (\u002a) used for wildcards
      [a-zA-Z0-9$_\u002a]
    | '.'
    | '/'
    | '-'
    | // any character above 0x7F that is not a high surrogate and may be part of a Java identifier
      ~[\u0000-\u007F\uD800-\uDBFF]
      {Character.isJavaIdentifierPart(_input.LA(-1))}?
    | // UTF-16 surrogate pairs, i.e. code points U+10000 to U+10FFFF
      [\uD800-\uDBFF] [\uDC00-\uDFFF]
      {Character.isJavaIdentifierPart(Character.toCodePoint((char)_input.LA(-2), (char)_input.LA(-1)))}?
    ;
|
||||
@@ -0,0 +1,199 @@
|
||||
package org.lucares.pdb.datastore;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.nio.file.Path;
|
||||
import java.nio.file.Paths;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collections;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.SortedSet;
|
||||
import java.util.TreeSet;
|
||||
import java.util.regex.Matcher;
|
||||
import java.util.regex.Pattern;
|
||||
import java.util.stream.Stream;
|
||||
|
||||
import org.lucares.collections.IntList;
|
||||
import org.lucares.pdb.api.Tags;
|
||||
import org.lucares.pdb.datastore.internal.FolderStorage;
|
||||
import org.lucares.pdb.datastore.internal.RadixConverter;
|
||||
import org.lucares.pdb.datastore.internal.StringCompressor;
|
||||
import org.lucares.pdb.datastore.lang.Expression;
|
||||
import org.lucares.pdb.datastore.lang.ExpressionToDocIdVisitor;
|
||||
import org.lucares.pdb.datastore.lang.ExpressionToDocIdVisitor.AllDocIds;
|
||||
import org.lucares.pdb.datastore.lang.QueryLanguageParser;
|
||||
|
||||
public class DataStore {
|
||||
|
||||
private static final String SUBDIR_STORAGE = "storage";
|
||||
private static final String PDB_EXTENSION = ".pdb";
|
||||
private static final String KEY_VALUE_SEPARATOR = "-";
|
||||
private static final String KEY_VALUE_PAIR_SEPARATOR = "_";
|
||||
private static final String KEY_VALUE_END_SEPARATOR = "$";
|
||||
|
||||
private static final String REGEX_KEY_VALUE = "[a-zA-Z0-9]+" + Pattern.quote(KEY_VALUE_SEPARATOR) + "[a-zA-Z0-9]+";
|
||||
|
||||
private static final String REGEX_KEY_VALUE_PAIRS = REGEX_KEY_VALUE + "(" + Pattern.quote(KEY_VALUE_PAIR_SEPARATOR)
|
||||
+ REGEX_KEY_VALUE + ")*";;
|
||||
|
||||
private static final String REGEX_STORAGE_FILE = String.format("(%1$s)%2$s[0-9]*%3$s", REGEX_KEY_VALUE_PAIRS,
|
||||
Pattern.quote(KEY_VALUE_END_SEPARATOR), PDB_EXTENSION);
|
||||
|
||||
private static final Pattern EXTRACT_TAGS_PATTERN = Pattern.compile(REGEX_STORAGE_FILE);
|
||||
|
||||
private final List<Doc> docIdToDoc = new ArrayList<>();
|
||||
|
||||
private final Map<String, Map<String, IntList>> keyToValueToDocId = new HashMap<>();
|
||||
|
||||
private final StringCompressor stringCompressor;
|
||||
private final FolderStorage folderStorage;
|
||||
|
||||
public DataStore(final Path dataDirectory) throws IOException {
|
||||
stringCompressor = StringCompressor.create(keyCompressionFile(dataDirectory));
|
||||
|
||||
folderStorage = new FolderStorage(storageDirectory(dataDirectory), 1000);
|
||||
init(folderStorage);
|
||||
}
|
||||
|
||||
private void init(final FolderStorage folderStorage) throws IOException {
|
||||
|
||||
final Stream<Path> files = folderStorage.list();
|
||||
files.forEach(path -> {
|
||||
|
||||
final String filename = path.getFileName().toString();
|
||||
final Tags tags = toTags(filename);
|
||||
cacheTagToFileMapping(tags, path);
|
||||
|
||||
});
|
||||
}
|
||||
|
||||
private void cacheTagToFileMapping(final Tags tags, final Path path) {
|
||||
|
||||
final int docId = docIdToDoc.size();
|
||||
docIdToDoc.add(new Doc(tags, path));
|
||||
|
||||
for (final String key : tags.getKeys()) {
|
||||
final Map<String, IntList> valueToDocIds = keyToValueToDocId.computeIfAbsent(key, k -> new HashMap<>());
|
||||
|
||||
final String value = tags.getValue(key);
|
||||
|
||||
final IntList docIds = valueToDocIds.computeIfAbsent(value, v -> new IntList());
|
||||
docIds.add(docId);
|
||||
}
|
||||
}
|
||||
|
||||
private Path keyCompressionFile(final Path dataDirectory) throws IOException {
|
||||
return dataDirectory.resolve("keys.csv");
|
||||
}
|
||||
|
||||
public static Path storageDirectory(final Path dataDirectory) throws IOException {
|
||||
return dataDirectory.resolve(SUBDIR_STORAGE);
|
||||
}
|
||||
|
||||
public Path createNewFile(final Tags tags) throws IOException {
|
||||
|
||||
final Path filename = toFilename(tags);
|
||||
final Path result = folderStorage.insert(filename.toString(), PDB_EXTENSION);
|
||||
|
||||
cacheTagToFileMapping(tags, result);
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
private Path toFilename(final Tags tags) {
|
||||
final StringBuilder path = new StringBuilder();
|
||||
|
||||
final SortedSet<String> sortedKeys = new TreeSet<>(tags.getKeys());
|
||||
|
||||
for (final String key : sortedKeys) {
|
||||
final String value = tags.getValue(key);
|
||||
|
||||
final int compressedKey = stringCompressor.put(key);
|
||||
final int compressedValue = stringCompressor.put(value);
|
||||
|
||||
if (path.length() > 0) {
|
||||
path.append(KEY_VALUE_PAIR_SEPARATOR);
|
||||
}
|
||||
|
||||
path.append(RadixConverter.toString(compressedKey));
|
||||
path.append(KEY_VALUE_SEPARATOR);
|
||||
path.append(RadixConverter.toString(compressedValue));
|
||||
}
|
||||
path.append(KEY_VALUE_END_SEPARATOR);
|
||||
|
||||
return Paths.get(path.toString());
|
||||
}
|
||||
|
||||
private Tags toTags(final String filename) {
|
||||
Tags tags = Tags.create();
|
||||
|
||||
final Matcher matcher = EXTRACT_TAGS_PATTERN.matcher(filename);
|
||||
|
||||
if (matcher.find()) {
|
||||
final String serializedTags = matcher.group(1);
|
||||
|
||||
final String[] serializedKeyValuePairs = serializedTags.split(Pattern.quote(KEY_VALUE_PAIR_SEPARATOR));
|
||||
|
||||
for (int i = 0; i < serializedKeyValuePairs.length; i++) {
|
||||
final String[] keyValuePair = serializedKeyValuePairs[i].split(Pattern.quote(KEY_VALUE_SEPARATOR));
|
||||
|
||||
if (keyValuePair.length == 2) {
|
||||
|
||||
final String key = stringCompressor.get(RadixConverter.fromString(keyValuePair[0]));
|
||||
final String value = stringCompressor.get(RadixConverter.fromString(keyValuePair[1]));
|
||||
|
||||
tags = tags.copyAdd(key, value);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return tags;
|
||||
}
|
||||
|
||||
public List<Doc> search(final String query) {
|
||||
|
||||
final Expression expression = QueryLanguageParser.parse(query);
|
||||
final ExpressionToDocIdVisitor visitor = new ExpressionToDocIdVisitor(keyToValueToDocId,
|
||||
new AllDocIds(docIdToDoc));
|
||||
final IntList docIdsList = expression.visit(visitor);
|
||||
|
||||
final List<Doc> result = new ArrayList<>(docIdsList.size());
|
||||
|
||||
final int[] intDocIds = docIdsList.toArray();
|
||||
for (int i = 0; i < intDocIds.length; i++) {
|
||||
final int docId = intDocIds[i];
|
||||
|
||||
final Doc doc = docIdToDoc.get(docId);
|
||||
result.add(doc);
|
||||
}
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
public List<String> getAvailableFields() {
|
||||
|
||||
final List<String> result = new ArrayList<>();
|
||||
result.addAll(keyToValueToDocId.keySet());
|
||||
|
||||
Collections.sort(result);
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
public SortedSet<String> getAvailableValuesForKey(final String query, final String key) {
|
||||
|
||||
final SortedSet<String> result = new TreeSet<>();
|
||||
final List<Doc> docs = search(query);
|
||||
for (final Doc doc : docs) {
|
||||
final String valueForKey = doc.getTags().getValue(key);
|
||||
|
||||
if (valueForKey != null) {
|
||||
result.add(valueForKey);
|
||||
}
|
||||
}
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
}
|
||||
30
data-store/src/main/java/org/lucares/pdb/datastore/Doc.java
Normal file
30
data-store/src/main/java/org/lucares/pdb/datastore/Doc.java
Normal file
@@ -0,0 +1,30 @@
|
||||
package org.lucares.pdb.datastore;
|
||||
|
||||
import java.nio.file.Path;
|
||||
|
||||
import org.lucares.pdb.api.Tags;
|
||||
|
||||
public class Doc {
|
||||
private final Tags tags;
|
||||
private final Path path;
|
||||
|
||||
public Doc(final Tags tags, final Path path) {
|
||||
super();
|
||||
this.tags = tags;
|
||||
this.path = path;
|
||||
}
|
||||
|
||||
public Tags getTags() {
|
||||
return tags;
|
||||
}
|
||||
|
||||
public Path getPath() {
|
||||
return path;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return "Doc [tags=" + tags + ", path=" + path + "]";
|
||||
}
|
||||
|
||||
}
|
||||
@@ -0,0 +1,20 @@
|
||||
package org.lucares.pdb.datastore.internal;
|
||||
|
||||
import java.util.function.Function;
|
||||
|
||||
/**
 * Function that ignores its argument and always yields the decimal
 * representation of the index it was created with.
 */
public class CreateNewKey implements Function<String, String> {

    private final int index;

    /**
     * @param index the number that {@link #apply(String)} returns as string
     */
    public CreateNewKey(final int index) {
        this.index = index;
    }

    /**
     * @param key not used
     * @return the index as decimal string
     */
    @Override
    public String apply(final String key) {
        return Integer.toString(index);
    }
}
|
||||
@@ -0,0 +1,86 @@
|
||||
package org.lucares.pdb.datastore.internal;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.nio.file.attribute.BasicFileAttributes;
|
||||
import java.util.function.BiPredicate;
|
||||
import java.util.stream.Stream;
|
||||
|
||||
/**
 * Distributes files over a two level directory hierarchy
 * {@code <base>/<firstLevel>/<secondLevel>/<file>}, so that no directory
 * receives more than {@code maxFilesPerFolder} entries from this class.
 * Directories are named with consecutive numbers starting at 0.
 */
public class FolderStorage {

    private final Path storageBaseDirectory;

    private int firstLevel = 0;
    private int secondLevel = 0;
    private int filesInSecondLevel = 0;

    private Path currentDirectory;

    private final int maxFilesPerFolder;

    /**
     * Opens the storage and continues in the directory with the highest numbers.
     *
     * @param storageBaseDirectory root of the hierarchy; created if missing
     * @param maxFilesPerFolder    maximum number of entries per directory
     * @throws IOException if directories cannot be created or listed
     */
    public FolderStorage(final Path storageBaseDirectory, final int maxFilesPerFolder) throws IOException {
        this.storageBaseDirectory = storageBaseDirectory;
        this.maxFilesPerFolder = maxFilesPerFolder;
        init();
    }

    /**
     * Determines the directory to continue with. The index of the last directory
     * on each level is derived from the number of entries on that level, which
     * relies on the directories being numbered consecutively from 0.
     */
    private void init() throws IOException {
        Files.createDirectories(storageBaseDirectory);

        firstLevel = Math.max(countEntries(storageBaseDirectory) - 1, 0);

        final Path firstLevelDirectory = storageBaseDirectory.resolve(String.valueOf(firstLevel));
        Files.createDirectories(firstLevelDirectory);

        secondLevel = Math.max(countEntries(firstLevelDirectory) - 1, 0);
        currentDirectory = firstLevelDirectory.resolve(String.valueOf(secondLevel));
        Files.createDirectories(currentDirectory);

        filesInSecondLevel = countEntries(currentDirectory);
    }

    /**
     * Counts the entries of a directory and closes the directory stream again.
     */
    private static int countEntries(final Path directory) throws IOException {
        try (final Stream<Path> entries = Files.list(directory)) {
            return (int) entries.count();
        }
    }

    /**
     * Creates a new empty file in the current directory. If a file with the name
     * {@code filenamePrefix + filenameSuffix} already exists, a counter starting
     * at 1 is inserted between prefix and suffix until the name is unused.
     *
     * @param filenamePrefix first part of the file name
     * @param filenameSuffix last part of the file name, e.g. the extension
     * @return the path of the created file
     * @throws IOException if the file or its directory cannot be created
     */
    public Path insert(final String filenamePrefix, final String filenameSuffix) throws IOException {
        ensureCapacity();

        String filename = filenamePrefix + filenameSuffix;
        int index = 1;
        Path newFile = currentDirectory.resolve(filename);
        while (Files.exists(newFile)) {
            filename = filenamePrefix + index++ + filenameSuffix;
            newFile = currentDirectory.resolve(filename);
        }
        Files.createFile(newFile);
        filesInSecondLevel++;

        return newFile;
    }

    /**
     * Switches to the next directory when the current one is full.
     */
    private void ensureCapacity() throws IOException {
        if (filesInSecondLevel >= maxFilesPerFolder) {
            secondLevel++;
            if (secondLevel >= maxFilesPerFolder) {
                firstLevel++;
                secondLevel = 0;
            }
            filesInSecondLevel = 0;

            updateCurrentDirectory();
        }
    }

    private void updateCurrentDirectory() throws IOException {
        currentDirectory = storageBaseDirectory.resolve(String.valueOf(firstLevel))
                .resolve(String.valueOf(secondLevel));
        Files.createDirectories(currentDirectory);
    }

    /**
     * Lists all regular files below the base directory.
     * <p>
     * The returned stream wraps open directories; callers must close it, for
     * example with try-with-resources.
     *
     * @return a stream of all regular files in the hierarchy
     * @throws IOException if the base directory cannot be traversed
     */
    public Stream<Path> list() throws IOException {
        final int maxDepth = Integer.MAX_VALUE;
        final BiPredicate<Path, BasicFileAttributes> matchRegularFiles = (path, attr) -> Files.isRegularFile(path);

        return Files.find(storageBaseDirectory, maxDepth, matchRegularFiles);
    }
}
|
||||
@@ -0,0 +1,42 @@
|
||||
package org.lucares.pdb.datastore.internal;
|
||||
|
||||
/**
 * Converts non-negative integers to short strings and back, using the
 * characters of {@link #ALPHABET} as digits.
 */
public class RadixConverter {

    /**
     * The digits, in ascending order of their value. The radix is the length of
     * this string.
     * <p>
     * NOTE(review): as written the alphabet has 61 symbols; there is no 'Q' and
     * 'c' comes before 'b'. It is left untouched here, because any change would
     * alter the string representation of existing numbers.
     */
    private static final String ALPHABET = "0123456789ABCDEFGHIJKLMNOPRSTUVWXYZacbdefghijklmnopqrstuvwxyz";

    /**
     * Encodes the value with the digits of the alphabet.
     *
     * @param value the number to encode; must not be negative
     * @return the encoded number, at least one character long
     * @throws IllegalArgumentException if the value is negative
     */
    public static String toString(final int value) {
        if (value < 0) {
            throw new IllegalArgumentException("value must not be negative");
        }

        final int radix = ALPHABET.length();
        final StringBuilder digits = new StringBuilder();
        int remaining = value;

        // Digits are produced least significant first and reversed at the end.
        do {
            digits.append(ALPHABET.charAt(remaining % radix));
            remaining /= radix;
        } while (remaining > 0);

        return digits.reverse().toString();
    }

    /**
     * Decodes a string produced by {@link #toString(int)}. The empty string
     * decodes to 0.
     *
     * @param string the encoded number
     * @return the decoded number
     * @throws IllegalArgumentException if the string contains a character that
     *                                  is not part of the alphabet, or if the
     *                                  number does not fit into an int
     */
    public static int fromString(final String string) {
        final int radix = ALPHABET.length();
        // Accumulate in a long, so that an overflow of the int range is detectable.
        long result = 0;

        for (int i = 0; i < string.length(); i++) {
            final char digit = string.charAt(i);
            final int value = ALPHABET.indexOf(digit);
            if (value < 0) {
                throw new IllegalArgumentException(
                        "invalid character '" + digit + "' at index " + i + " in \"" + string + "\"");
            }

            result = result * radix + value;
            if (result > Integer.MAX_VALUE) {
                throw new IllegalArgumentException("value does not fit into an int: \"" + string + "\"");
            }
        }

        return (int) result;
    }
}
|
||||
@@ -0,0 +1,10 @@
|
||||
package org.lucares.pdb.datastore.internal;
|
||||
|
||||
/**
 * Unchecked exception that carries another throwable as its cause. In this
 * code base it is used to propagate I/O failures without a checked exception.
 */
public class RuntimeIOException extends RuntimeException {

    private static final long serialVersionUID = 1L;

    /**
     * @param cause the failure to wrap
     */
    public RuntimeIOException(final Throwable cause) {
        super(cause);
    }
}
|
||||
@@ -0,0 +1,32 @@
|
||||
package org.lucares.pdb.datastore.internal;
|
||||
|
||||
import java.nio.file.Path;
|
||||
|
||||
import org.lucares.pdb.datastore.internal.map.UniqueStringIntegerPairs;
|
||||
|
||||
/**
|
||||
* Persistently maps Strings to integers.
|
||||
*/
|
||||
public class StringCompressor {
|
||||
|
||||
private final UniqueStringIntegerPairs usip;
|
||||
|
||||
public StringCompressor(final UniqueStringIntegerPairs usip) throws RuntimeIOException {
|
||||
this.usip = usip;
|
||||
}
|
||||
|
||||
public static StringCompressor create(final Path path) {
|
||||
final UniqueStringIntegerPairs mapsi = new UniqueStringIntegerPairs(path);
|
||||
return new StringCompressor(mapsi);
|
||||
}
|
||||
|
||||
public Integer put(final String string) {
|
||||
|
||||
return usip.computeIfAbsent(string, s -> usip.getHighestInteger() + 1);
|
||||
}
|
||||
|
||||
public String get(final int integer) {
|
||||
|
||||
return usip.getKey(integer);
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,126 @@
|
||||
package org.lucares.pdb.datastore.internal.map;
|
||||
|
||||
import java.io.BufferedReader;
|
||||
import java.io.FileInputStream;
|
||||
import java.io.FileOutputStream;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStreamReader;
|
||||
import java.io.OutputStreamWriter;
|
||||
import java.io.Writer;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
import java.util.SortedMap;
|
||||
import java.util.TreeMap;
|
||||
import java.util.function.Function;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
import org.lucares.pdb.datastore.internal.RuntimeIOException;
|
||||
|
||||
/**
|
||||
* A very simple {@link Set}-like or {@link Map}-like datastructure that stores
|
||||
* unique¹ pairs of Strings and integers persistently.
|
||||
* <p>
|
||||
* (1) Unique means, that neither the string, nor the integer may occur twice.
|
||||
* For Example, imagine the pair ("a", 1) already exists, then neither ("a", 2)
|
||||
* nor ("b", 1) may be added.
|
||||
* <p>
|
||||
* You can only add pairs. No deletion. It keeps an in memory view for fast
|
||||
* retrievals.
|
||||
*/
|
||||
public class UniqueStringIntegerPairs {
|
||||
private static final String SEPARATOR = "\t";
|
||||
|
||||
private static final boolean APPEND = true;
|
||||
|
||||
/**
|
||||
* Maps a string to an integer. E.g. "myLongValue" -> 123
|
||||
*/
|
||||
private final Map<String, Integer> stringToInt = new HashMap<>();
|
||||
|
||||
/**
|
||||
* Maps an integer to a string. E.g. 123 -> "myLongValue"
|
||||
*/
|
||||
private final SortedMap<Integer, String> intToString = new TreeMap<>();
|
||||
|
||||
private final Path file;
|
||||
|
||||
public UniqueStringIntegerPairs(final Path file) {
|
||||
super();
|
||||
this.file = file;
|
||||
init(file);
|
||||
}
|
||||
|
||||
private void init(final Path file) throws RuntimeIOException {
|
||||
|
||||
try {
|
||||
Files.createDirectories(file.getParent());
|
||||
if (!Files.exists(file)) {
|
||||
Files.createFile(file);
|
||||
}
|
||||
|
||||
try (final BufferedReader reader = new BufferedReader(
|
||||
new InputStreamReader(new FileInputStream(file.toFile()), StandardCharsets.UTF_8))) {
|
||||
String line;
|
||||
while ((line = reader.readLine()) != null) {
|
||||
|
||||
final String[] tokens = line.split(Pattern.quote(SEPARATOR));
|
||||
|
||||
if (tokens.length == 2) {
|
||||
final String string = tokens[0];
|
||||
final int value = Integer.parseInt(tokens[1]);
|
||||
intToString.put(value, string);
|
||||
stringToInt.put(string, value);
|
||||
}
|
||||
}
|
||||
}
|
||||
} catch (final IOException e) {
|
||||
throw new RuntimeIOException(e);
|
||||
}
|
||||
}
|
||||
|
||||
public void put(final String first, final int second) {
|
||||
|
||||
if (stringToInt.containsKey(first) || intToString.containsKey(second)) {
|
||||
throw new IllegalArgumentException("Unique key constraint violation for (" + first + ", " + second + ")");
|
||||
}
|
||||
|
||||
try (final Writer writer = new OutputStreamWriter(new FileOutputStream(file.toFile(), APPEND),
|
||||
StandardCharsets.UTF_8)) {
|
||||
|
||||
writer.write(first + SEPARATOR + second + "\n");
|
||||
|
||||
} catch (final IOException e) {
|
||||
throw new RuntimeIOException(e);
|
||||
}
|
||||
|
||||
intToString.put(second, first);
|
||||
stringToInt.put(first, second);
|
||||
}
|
||||
|
||||
public Integer get(final String first) {
|
||||
|
||||
return stringToInt.get(first);
|
||||
}
|
||||
|
||||
public String getKey(final Integer second) {
|
||||
return intToString.get(second);
|
||||
}
|
||||
|
||||
public Integer getHighestInteger() {
|
||||
return intToString.size() == 0 ? -1 : intToString.lastKey();
|
||||
}
|
||||
|
||||
public Integer computeIfAbsent(final String first, final Function<String, Integer> mappingFunction) {
|
||||
|
||||
if (!stringToInt.containsKey(first)) {
|
||||
final Integer second = mappingFunction.apply(first);
|
||||
put(first, second);
|
||||
}
|
||||
|
||||
return stringToInt.get(first);
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,19 @@
|
||||
package org.lucares.pdb.datastore.lang;
|
||||
|
||||
import org.antlr.v4.runtime.BaseErrorListener;
|
||||
import org.antlr.v4.runtime.RecognitionException;
|
||||
import org.antlr.v4.runtime.Recognizer;
|
||||
|
||||
public class ErrorListener extends BaseErrorListener {
|
||||
|
||||
@Override
|
||||
public void syntaxError(final Recognizer<?, ?> recognizer, final Object offendingSymbol, final int line,
|
||||
final int charPositionInLine, final String msg, final RecognitionException e) {
|
||||
|
||||
final int lineStart = line;
|
||||
final int startIndex = charPositionInLine;
|
||||
final int lineStop = line;
|
||||
final int stopIndex = charPositionInLine;
|
||||
throw new SyntaxException(msg, lineStart, startIndex, lineStop, stopIndex);
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,437 @@
|
||||
package org.lucares.pdb.datastore.lang;
|
||||
|
||||
abstract public class Expression {
|
||||
|
||||
public <T> T visit(final ExpressionVisitor<T> visitor) {
|
||||
throw new UnsupportedOperationException();
|
||||
}
|
||||
|
||||
abstract static class UnaryExpression extends Expression {
|
||||
|
||||
private final int line;
|
||||
private final int startIndex;
|
||||
private final int stopIndex;
|
||||
|
||||
public UnaryExpression(final int line, final int startIndex, final int stopIndex) {
|
||||
super();
|
||||
this.line = line;
|
||||
this.startIndex = startIndex;
|
||||
this.stopIndex = stopIndex;
|
||||
}
|
||||
|
||||
int getLine() {
|
||||
return line;
|
||||
}
|
||||
|
||||
int getStartIndex() {
|
||||
return startIndex;
|
||||
}
|
||||
|
||||
int getStopIndex() {
|
||||
return stopIndex;
|
||||
}
|
||||
}
|
||||
|
||||
abstract static class TemporaryExpression extends Expression {
|
||||
|
||||
abstract Expression toExpression(Expression left, Expression right);
|
||||
}
|
||||
|
||||
public static MatchAll matchAll() {
|
||||
return MatchAll.INSTANCE;
|
||||
}
|
||||
|
||||
static class OrTemporary extends TemporaryExpression {
|
||||
|
||||
@Override
|
||||
Expression toExpression(final Expression left, final Expression right) {
|
||||
return new Or(left, right);
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return "OrTemporary";
|
||||
}
|
||||
}
|
||||
|
||||
static class AndTemporary extends TemporaryExpression {
|
||||
@Override
|
||||
Expression toExpression(final Expression left, final Expression right) {
|
||||
return new And(left, right);
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return "AndTemporary";
|
||||
}
|
||||
}
|
||||
|
||||
static class Not extends Expression {
|
||||
private final Expression expression;
|
||||
|
||||
Not(final Expression expression) {
|
||||
this.expression = expression;
|
||||
}
|
||||
|
||||
@Override
|
||||
public <T> T visit(final ExpressionVisitor<T> visitor) {
|
||||
return visitor.visit(this);
|
||||
}
|
||||
|
||||
Expression getExpression() {
|
||||
return expression;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return "!" + expression;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int hashCode() {
|
||||
final int prime = 31;
|
||||
int result = 1;
|
||||
result = prime * result + ((expression == null) ? 0 : expression.hashCode());
|
||||
return result;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean equals(final Object obj) {
|
||||
if (this == obj) {
|
||||
return true;
|
||||
}
|
||||
if (obj == null) {
|
||||
return false;
|
||||
}
|
||||
if (getClass() != obj.getClass()) {
|
||||
return false;
|
||||
}
|
||||
final Not other = (Not) obj;
|
||||
if (expression == null) {
|
||||
if (other.expression != null) {
|
||||
return false;
|
||||
}
|
||||
} else if (!expression.equals(other.expression)) {
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
static class Or extends Expression {
|
||||
private final Expression left;
|
||||
private final Expression right;
|
||||
|
||||
Or(final Expression left, final Expression right) {
|
||||
this.left = left;
|
||||
this.right = right;
|
||||
}
|
||||
|
||||
@Override
|
||||
public <T> T visit(final ExpressionVisitor<T> visitor) {
|
||||
return visitor.visit(this);
|
||||
}
|
||||
|
||||
Expression getLeft() {
|
||||
return left;
|
||||
}
|
||||
|
||||
Expression getRight() {
|
||||
return right;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
|
||||
return " (" + left + " or " + right + ") ";
|
||||
}
|
||||
|
||||
@Override
|
||||
public int hashCode() {
|
||||
final int prime = 31;
|
||||
int result = 1;
|
||||
result = prime * result + ((left == null) ? 0 : left.hashCode());
|
||||
result = prime * result + ((right == null) ? 0 : right.hashCode());
|
||||
return result;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean equals(final Object obj) {
|
||||
if (this == obj) {
|
||||
return true;
|
||||
}
|
||||
if (obj == null) {
|
||||
return false;
|
||||
}
|
||||
if (getClass() != obj.getClass()) {
|
||||
return false;
|
||||
}
|
||||
final Or other = (Or) obj;
|
||||
if (left == null) {
|
||||
if (other.left != null) {
|
||||
return false;
|
||||
}
|
||||
} else if (!left.equals(other.left)) {
|
||||
return false;
|
||||
}
|
||||
if (right == null) {
|
||||
if (other.right != null) {
|
||||
return false;
|
||||
}
|
||||
} else if (!right.equals(other.right)) {
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
static class And extends Expression {
|
||||
private final Expression left;
|
||||
private final Expression right;
|
||||
|
||||
And(final Expression left, final Expression right) {
|
||||
this.left = left;
|
||||
this.right = right;
|
||||
}
|
||||
|
||||
@Override
|
||||
public <T> T visit(final ExpressionVisitor<T> visitor) {
|
||||
return visitor.visit(this);
|
||||
}
|
||||
|
||||
Expression getLeft() {
|
||||
return left;
|
||||
}
|
||||
|
||||
Expression getRight() {
|
||||
return right;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
|
||||
return " (" + left + " and " + right + ") ";
|
||||
}
|
||||
|
||||
@Override
|
||||
public int hashCode() {
|
||||
final int prime = 31;
|
||||
int result = 1;
|
||||
result = prime * result + ((left == null) ? 0 : left.hashCode());
|
||||
result = prime * result + ((right == null) ? 0 : right.hashCode());
|
||||
return result;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean equals(final Object obj) {
|
||||
if (this == obj) {
|
||||
return true;
|
||||
}
|
||||
if (obj == null) {
|
||||
return false;
|
||||
}
|
||||
if (getClass() != obj.getClass()) {
|
||||
return false;
|
||||
}
|
||||
final And other = (And) obj;
|
||||
if (left == null) {
|
||||
if (other.left != null) {
|
||||
return false;
|
||||
}
|
||||
} else if (!left.equals(other.left)) {
|
||||
return false;
|
||||
}
|
||||
if (right == null) {
|
||||
if (other.right != null) {
|
||||
return false;
|
||||
}
|
||||
} else if (!right.equals(other.right)) {
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
static class MatchAll extends Expression {
|
||||
|
||||
public static final MatchAll INSTANCE = new MatchAll();
|
||||
|
||||
private MatchAll() {
|
||||
//
|
||||
}
|
||||
|
||||
@Override
|
||||
public <T> T visit(final ExpressionVisitor<T> visitor) {
|
||||
return visitor.visit(this);
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
|
||||
return "true";
|
||||
}
|
||||
}
|
||||
|
||||
static class Terminal extends UnaryExpression {
|
||||
private final String value;
|
||||
|
||||
Terminal(final String value, final int line, final int startIndex, final int stopIndex) {
|
||||
super(line, startIndex, stopIndex);
|
||||
this.value = value;
|
||||
}
|
||||
|
||||
@Override
|
||||
public <T> T visit(final ExpressionVisitor<T> visitor) {
|
||||
return visitor.visit(this);
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
|
||||
return value;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int hashCode() {
|
||||
final int prime = 31;
|
||||
int result = 1;
|
||||
result = prime * result + ((value == null) ? 0 : value.hashCode());
|
||||
return result;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean equals(final Object obj) {
|
||||
if (this == obj) {
|
||||
return true;
|
||||
}
|
||||
if (obj == null) {
|
||||
return false;
|
||||
}
|
||||
if (getClass() != obj.getClass()) {
|
||||
return false;
|
||||
}
|
||||
final Terminal other = (Terminal) obj;
|
||||
if (value == null) {
|
||||
if (other.value != null) {
|
||||
return false;
|
||||
}
|
||||
} else if (!value.equals(other.value)) {
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
public String getValue() {
|
||||
return value;
|
||||
}
|
||||
}
|
||||
|
||||
static class Property extends Expression {
|
||||
final String property;
|
||||
final Terminal value;
|
||||
|
||||
public Property(final String property, final Terminal value) {
|
||||
this.property = property;
|
||||
this.value = value;
|
||||
}
|
||||
|
||||
@Override
|
||||
public <T> T visit(final ExpressionVisitor<T> visitor) {
|
||||
return visitor.visit(this);
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
|
||||
return " " + property + " = " + value.getValue() + " ";
|
||||
}
|
||||
|
||||
@Override
|
||||
public int hashCode() {
|
||||
final int prime = 31;
|
||||
int result = 1;
|
||||
result = prime * result + ((property == null) ? 0 : property.hashCode());
|
||||
result = prime * result + ((value == null) ? 0 : value.hashCode());
|
||||
return result;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean equals(final Object obj) {
|
||||
if (this == obj)
|
||||
return true;
|
||||
if (obj == null)
|
||||
return false;
|
||||
if (getClass() != obj.getClass())
|
||||
return false;
|
||||
final Property other = (Property) obj;
|
||||
if (property == null) {
|
||||
if (other.property != null)
|
||||
return false;
|
||||
} else if (!property.equals(other.property))
|
||||
return false;
|
||||
if (value == null) {
|
||||
if (other.value != null)
|
||||
return false;
|
||||
} else if (!value.equals(other.value))
|
||||
return false;
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
static class Parentheses extends Expression {
|
||||
private final Expression expression;
|
||||
|
||||
Parentheses(final Expression expression) {
|
||||
this.expression = expression;
|
||||
}
|
||||
|
||||
@Override
|
||||
public <T> T visit(final ExpressionVisitor<T> visitor) {
|
||||
return visitor.visit(this);
|
||||
}
|
||||
|
||||
public Expression getExpression() {
|
||||
return expression;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
|
||||
return " [ " + expression + " ] ";
|
||||
}
|
||||
|
||||
@Override
|
||||
public int hashCode() {
|
||||
final int prime = 31;
|
||||
int result = 1;
|
||||
result = prime * result + ((expression == null) ? 0 : expression.hashCode());
|
||||
return result;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean equals(final Object obj) {
|
||||
if (this == obj) {
|
||||
return true;
|
||||
}
|
||||
if (obj == null) {
|
||||
return false;
|
||||
}
|
||||
if (getClass() != obj.getClass()) {
|
||||
return false;
|
||||
}
|
||||
final Parentheses other = (Parentheses) obj;
|
||||
if (expression == null) {
|
||||
if (other.expression != null) {
|
||||
return false;
|
||||
}
|
||||
} else if (!expression.equals(other.expression)) {
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
@@ -0,0 +1,248 @@
|
||||
package org.lucares.pdb.datastore.lang;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.Collection;
|
||||
import java.util.Collections;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Map.Entry;
|
||||
import java.util.Objects;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
import org.lucares.collections.IntList;
|
||||
import org.lucares.pdb.datastore.Doc;
|
||||
import org.lucares.pdb.datastore.lang.Expression.And;
|
||||
import org.lucares.pdb.datastore.lang.Expression.Not;
|
||||
import org.lucares.pdb.datastore.lang.Expression.Or;
|
||||
import org.lucares.pdb.datastore.lang.Expression.Parentheses;
|
||||
import org.lucares.pdb.datastore.lang.Expression.Property;
|
||||
import org.lucares.pdb.datastore.lang.Expression.Terminal;
|
||||
import org.lucares.utils.CollectionUtils;
|
||||
|
||||
public class ExpressionToDocIdVisitor extends ExpressionVisitor<IntList> {
|
||||
|
||||
public static final class AllDocIds {
|
||||
|
||||
private final List<Doc> docIdToPath;
|
||||
|
||||
private IntList cachedPathIds = new IntList();
|
||||
|
||||
public AllDocIds(final List<Doc> docIdToPath) {
|
||||
this.docIdToPath = docIdToPath;
|
||||
}
|
||||
|
||||
public IntList getAllDocIds() {
|
||||
|
||||
final int pathIds = docIdToPath.size();
|
||||
|
||||
if (cachedPathIds.size() != pathIds) {
|
||||
final IntList result = new IntList(pathIds);
|
||||
for (int i = 0; i < pathIds; i++) {
|
||||
result.add(i);
|
||||
}
|
||||
cachedPathIds = result;
|
||||
}
|
||||
|
||||
return cachedPathIds;
|
||||
}
|
||||
}
|
||||
|
||||
private static final Map<String, IntList> EMPTY_VALUES = Collections.emptyMap();
|
||||
private static final IntList EMPTY_DOC_IDS = new IntList();
|
||||
private final Map<String, Map<String, IntList>> keyToValueToDocId;
|
||||
private final AllDocIds allDocIds;
|
||||
|
||||
public ExpressionToDocIdVisitor(final Map<String, Map<String, IntList>> keyToValueToDocId,
|
||||
final AllDocIds allDocIds) {
|
||||
this.keyToValueToDocId = keyToValueToDocId;
|
||||
this.allDocIds = allDocIds;
|
||||
}
|
||||
|
||||
@Override
|
||||
public IntList visit(final And expression) {
|
||||
|
||||
final Expression left = expression.getLeft();
|
||||
final Expression right = expression.getRight();
|
||||
|
||||
final IntList leftFiles = left.visit(this);
|
||||
final IntList rightFiles = right.visit(this);
|
||||
|
||||
final IntList result = new IntList(Math.min(leftFiles.size(), rightFiles.size()));
|
||||
|
||||
int l = 0;
|
||||
int r = 0;
|
||||
|
||||
while (l < leftFiles.size() && r < rightFiles.size()) {
|
||||
|
||||
final int lv = leftFiles.get(l);
|
||||
final int rv = rightFiles.get(r);
|
||||
|
||||
if (lv < rv) {
|
||||
l++;
|
||||
} else if (lv > rv) {
|
||||
r++;
|
||||
} else {
|
||||
result.add(lv);
|
||||
l++;
|
||||
r++;
|
||||
}
|
||||
}
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
@Override
|
||||
public IntList visit(final Or expression) {
|
||||
final Expression left = expression.getLeft();
|
||||
final Expression right = expression.getRight();
|
||||
|
||||
final IntList leftFiles = left.visit(this);
|
||||
final IntList rightFiles = right.visit(this);
|
||||
|
||||
final IntList result = merge(leftFiles, rightFiles);
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
@Override
|
||||
public IntList visit(final Not expression) {
|
||||
|
||||
final Expression negatedExpression = expression.getExpression();
|
||||
final IntList expr = negatedExpression.visit(this);
|
||||
final IntList allDocIds = getAllDocIds();
|
||||
|
||||
final IntList result = new IntList(allDocIds.size());
|
||||
|
||||
final int[] docIdsToBeNegated = expr.toArray();
|
||||
for (int i = 0; i < allDocIds.size(); i++) {
|
||||
final int docId = allDocIds.get(i);
|
||||
if (Arrays.binarySearch(docIdsToBeNegated, docId) < 0) {
|
||||
result.add(docId);
|
||||
}
|
||||
}
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
@Override
|
||||
public IntList visit(final Parentheses parentheses) {
|
||||
|
||||
throw new UnsupportedOperationException(
|
||||
"Parenthesis not supported. The correct order should come from the parser.");
|
||||
}
|
||||
|
||||
@Override
|
||||
public IntList visit(final Expression.MatchAll expression) {
|
||||
|
||||
return getAllDocIds();
|
||||
}
|
||||
|
||||
private IntList getAllDocIds() {
|
||||
return allDocIds.getAllDocIds();
|
||||
}
|
||||
|
||||
@Override
|
||||
public IntList visit(final Property expression) {
|
||||
|
||||
final String propertyName = expression.property;
|
||||
final Terminal propertyValue = expression.value;
|
||||
final String stringValue = propertyValue.getValue();
|
||||
|
||||
final IntList result;
|
||||
if (isMatchAll(stringValue)) {
|
||||
|
||||
final Map<String, IntList> allValuesForKey = keyToValueToDocId.getOrDefault(propertyName, EMPTY_VALUES);
|
||||
|
||||
result = merge(allValuesForKey.values());
|
||||
} else if (containsWildcard(stringValue)) {
|
||||
|
||||
final Collection<IntList> docIds = filterByWildcard(propertyName, globToRegex(stringValue));
|
||||
|
||||
result = merge(docIds);
|
||||
} else {
|
||||
result = keyToValueToDocId.getOrDefault(propertyName, EMPTY_VALUES).getOrDefault(stringValue,
|
||||
EMPTY_DOC_IDS);
|
||||
}
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
private Pattern globToRegex(final String globPattern) {
|
||||
|
||||
final String[] tokens = StringUtils.splitPreserveAllTokens(globPattern, "*");
|
||||
|
||||
final List<String> quotedTokens = CollectionUtils.map(tokens, Pattern::quote);
|
||||
final String regex = String.join(".*", quotedTokens);
|
||||
|
||||
return Pattern.compile(regex);
|
||||
}
|
||||
|
||||
private List<IntList> filterByWildcard(final String propertyName, final Pattern valuePattern) {
|
||||
|
||||
final List<IntList> result = new ArrayList<>();
|
||||
|
||||
final Map<String, IntList> valueToDocId = keyToValueToDocId.getOrDefault(propertyName, EMPTY_VALUES);
|
||||
for (final Entry<String, IntList> entry : valueToDocId.entrySet()) {
|
||||
if (valuePattern.matcher(entry.getKey()).matches()) {
|
||||
result.add(entry.getValue());
|
||||
}
|
||||
}
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
private boolean containsWildcard(final String stringValue) {
|
||||
return stringValue.contains("*");
|
||||
}
|
||||
|
||||
private IntList merge(final Collection<IntList> lists) {
|
||||
|
||||
IntList result = new IntList();
|
||||
|
||||
for (final IntList intList : lists) {
|
||||
result = merge(result, intList);
|
||||
}
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
private boolean isMatchAll(final String stringValue) {
|
||||
return Objects.equals("*", stringValue);
|
||||
}
|
||||
|
||||
private IntList merge(final IntList leftFiles, final IntList rightFiles) {
|
||||
final IntList result = new IntList(leftFiles.size() + rightFiles.size());
|
||||
|
||||
int l = 0;
|
||||
int r = 0;
|
||||
|
||||
while (l < leftFiles.size() && r < rightFiles.size()) {
|
||||
|
||||
final int lv = leftFiles.get(l);
|
||||
final int rv = rightFiles.get(r);
|
||||
|
||||
if (lv < rv) {
|
||||
result.add(lv);
|
||||
l++;
|
||||
} else if (lv > rv) {
|
||||
result.add(rv);
|
||||
r++;
|
||||
} else {
|
||||
result.add(lv);
|
||||
l++;
|
||||
r++;
|
||||
}
|
||||
}
|
||||
|
||||
if (l < leftFiles.size()) {
|
||||
final int length = leftFiles.size() - l;
|
||||
result.addAll(leftFiles.get(l, length));
|
||||
} else if (r < rightFiles.size()) {
|
||||
final int length = rightFiles.size() - r;
|
||||
result.addAll(rightFiles.get(r, length));
|
||||
}
|
||||
return result;
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,31 @@
|
||||
package org.lucares.pdb.datastore.lang;
|
||||
|
||||
public abstract class ExpressionVisitor<T> {
|
||||
public T visit(final Expression.And expression) {
|
||||
throw new UnsupportedOperationException();
|
||||
}
|
||||
|
||||
public T visit(final Expression.Or expression) {
|
||||
throw new UnsupportedOperationException();
|
||||
}
|
||||
|
||||
public T visit(final Expression.Not expression) {
|
||||
throw new UnsupportedOperationException();
|
||||
}
|
||||
|
||||
public T visit(final Expression.Property expression) {
|
||||
throw new UnsupportedOperationException();
|
||||
}
|
||||
|
||||
public T visit(final Expression.Terminal expression) {
|
||||
throw new UnsupportedOperationException();
|
||||
}
|
||||
|
||||
public T visit(final Expression.MatchAll expression) {
|
||||
throw new UnsupportedOperationException();
|
||||
}
|
||||
|
||||
public T visit(final Expression.Parentheses parentheses) {
|
||||
throw new UnsupportedOperationException();
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,109 @@
|
||||
package org.lucares.pdb.datastore.lang;
|
||||
|
||||
import java.util.Stack;
|
||||
|
||||
import org.antlr.v4.runtime.CharStream;
|
||||
import org.antlr.v4.runtime.CharStreams;
|
||||
import org.antlr.v4.runtime.CommonTokenStream;
|
||||
import org.antlr.v4.runtime.tree.ParseTree;
|
||||
import org.antlr.v4.runtime.tree.ParseTreeListener;
|
||||
import org.antlr.v4.runtime.tree.ParseTreeWalker;
|
||||
import org.lucares.pdb.datastore.lang.Expression.AndTemporary;
|
||||
import org.lucares.pdb.datastore.lang.Expression.Not;
|
||||
import org.lucares.pdb.datastore.lang.Expression.OrTemporary;
|
||||
import org.lucares.pdb.datastore.lang.Expression.Property;
|
||||
import org.lucares.pdb.datastore.lang.Expression.TemporaryExpression;
|
||||
import org.lucares.pdb.datastore.lang.Expression.Terminal;
|
||||
import org.lucares.pdb.datastore.lang.PdbLangParser.BinaryAndExpressionContext;
|
||||
import org.lucares.pdb.datastore.lang.PdbLangParser.BinaryOrExpressionContext;
|
||||
import org.lucares.pdb.datastore.lang.PdbLangParser.IdentifierExpressionContext;
|
||||
import org.lucares.pdb.datastore.lang.PdbLangParser.NotExpressionContext;
|
||||
import org.lucares.pdb.datastore.lang.PdbLangParser.PropertyExpressionContext;
|
||||
|
||||
public class QueryLanguage {
|
||||
|
||||
public Expression parse(final String input) {
|
||||
// define the input
|
||||
final CharStream in = CharStreams.fromString(input);
|
||||
|
||||
// create lexer and parser
|
||||
final PdbLangLexer lexer = new PdbLangLexer(in);
|
||||
lexer.addErrorListener(new ErrorListener());
|
||||
|
||||
final CommonTokenStream tokens = new CommonTokenStream(lexer);
|
||||
final PdbLangParser parser = new PdbLangParser(tokens);
|
||||
parser.addErrorListener(new ErrorListener());
|
||||
|
||||
final Stack<Expression> stack = new Stack<>();
|
||||
|
||||
// define a listener that is called for every terminals and
|
||||
// non-terminals
|
||||
final ParseTreeListener listener = new PdbLangBaseListener() {
|
||||
|
||||
@Override
|
||||
public void exitIdentifierExpression(final IdentifierExpressionContext ctx) {
|
||||
// System.out.println("push identifier " + ctx.getText());
|
||||
|
||||
if (ctx.getText().length() > 255) {
|
||||
throw new SyntaxException(ctx, "token too long");
|
||||
}
|
||||
|
||||
final int line = ctx.getStart().getLine();
|
||||
final int startIndex = ctx.getStart().getStartIndex();
|
||||
final int stopIndex = ctx.getStart().getStopIndex();
|
||||
|
||||
stack.push(new Terminal(ctx.getText(), line, startIndex, stopIndex));
|
||||
}
|
||||
|
||||
@Override
|
||||
public void exitPropertyExpression(final PropertyExpressionContext ctx) {
|
||||
// System.out.println("property expression");
|
||||
|
||||
final Expression value = stack.pop();
|
||||
final Terminal property = (Terminal) stack.pop();
|
||||
|
||||
stack.push(new Property(property.getValue(), (Terminal) value));
|
||||
}
|
||||
|
||||
@Override
|
||||
public void exitNotExpression(final NotExpressionContext ctx) {
|
||||
|
||||
final Expression expression = stack.pop();
|
||||
|
||||
final Expression notExpression = new Not(expression);
|
||||
stack.push(notExpression);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void exitBinaryAndExpression(final BinaryAndExpressionContext ctx) {
|
||||
final Expression right = stack.pop();
|
||||
final TemporaryExpression operation = new AndTemporary();
|
||||
final Expression left = stack.pop();
|
||||
|
||||
stack.push(operation.toExpression(left, right));
|
||||
}
|
||||
|
||||
@Override
|
||||
public void exitBinaryOrExpression(final BinaryOrExpressionContext ctx) {
|
||||
final Expression right = stack.pop();
|
||||
final TemporaryExpression operation = new OrTemporary();
|
||||
final Expression left = stack.pop();
|
||||
|
||||
stack.push(operation.toExpression(left, right));
|
||||
}
|
||||
};
|
||||
|
||||
// Specify our entry point
|
||||
final ParseTree parseTree = parser.start();
|
||||
|
||||
// Walk it and attach our listener
|
||||
final ParseTreeWalker walker = new ParseTreeWalker();
|
||||
walker.walk(listener, parseTree);
|
||||
|
||||
if (stack.size() != 1) {
|
||||
throw new RuntimeException("stack should have exactly one element " + stack);
|
||||
}
|
||||
|
||||
return stack.pop();
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,17 @@
|
||||
package org.lucares.pdb.datastore.lang;
|
||||
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
|
||||
public class QueryLanguageParser {
|
||||
public static Expression parse(final String query) {
|
||||
|
||||
final Expression result;
|
||||
if (StringUtils.isEmpty(query)) {
|
||||
result = Expression.matchAll();
|
||||
} else {
|
||||
final QueryLanguage lang = new QueryLanguage();
|
||||
result = lang.parse(query);
|
||||
}
|
||||
return result;
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,64 @@
|
||||
package org.lucares.pdb.datastore.lang;
|
||||
|
||||
import org.antlr.v4.runtime.ParserRuleContext;
|
||||
|
||||
public class SyntaxException extends RuntimeException {
|
||||
|
||||
private static final long serialVersionUID = 1L;
|
||||
private int lineStart;
|
||||
private int startIndex;
|
||||
private int lineStop;
|
||||
private int stopIndex;
|
||||
|
||||
public SyntaxException(final ParserRuleContext context, final String message) {
|
||||
this(message, context.getStart().getLine(), context.getStart().getStartIndex(), context.getStop().getLine(),
|
||||
context.getStop().getStopIndex());
|
||||
}
|
||||
|
||||
public SyntaxException(final String message, final int lineStart, final int startIndex, final int lineStop,
|
||||
final int stopIndex) {
|
||||
super(message + ": " + generateMessage(lineStart, startIndex, lineStop, stopIndex));
|
||||
this.lineStart = lineStart;
|
||||
this.startIndex = startIndex;
|
||||
this.lineStop = lineStop;
|
||||
this.stopIndex = stopIndex;
|
||||
}
|
||||
|
||||
private static String generateMessage(final int lineStart, final int startIndex, final int lineStop,
|
||||
final int stopIndex) {
|
||||
|
||||
return String.format("line=%d, start=%d, to line=%d stop=%d", lineStart, startIndex, lineStop, stopIndex);
|
||||
}
|
||||
|
||||
public int getLineStart() {
|
||||
return lineStart;
|
||||
}
|
||||
|
||||
public void setLineStart(final int lineStart) {
|
||||
this.lineStart = lineStart;
|
||||
}
|
||||
|
||||
public int getStartIndex() {
|
||||
return startIndex;
|
||||
}
|
||||
|
||||
public void setStartIndex(final int startIndex) {
|
||||
this.startIndex = startIndex;
|
||||
}
|
||||
|
||||
public int getLineStop() {
|
||||
return lineStop;
|
||||
}
|
||||
|
||||
public void setLineStop(final int lineStop) {
|
||||
this.lineStop = lineStop;
|
||||
}
|
||||
|
||||
public int getStopIndex() {
|
||||
return stopIndex;
|
||||
}
|
||||
|
||||
public void setStopIndex(final int stopIndex) {
|
||||
this.stopIndex = stopIndex;
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,135 @@
|
||||
package org.lucares.pdb.datastore;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.LinkedHashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Map.Entry;
|
||||
|
||||
import org.lucares.pdb.api.Tags;
|
||||
import org.lucares.utils.CollectionUtils;
|
||||
import org.lucares.utils.file.FileUtils;
|
||||
import org.testng.Assert;
|
||||
import org.testng.annotations.AfterMethod;
|
||||
import org.testng.annotations.BeforeMethod;
|
||||
import org.testng.annotations.Test;
|
||||
|
||||
@Test
|
||||
public class DataStoreTest {
|
||||
private Path dataDirectory;
|
||||
private DataStore dataStore;
|
||||
private Map<Tags, Path> tagsToPath;
|
||||
|
||||
@BeforeMethod
|
||||
public void beforeMethod() throws IOException {
|
||||
dataDirectory = Files.createTempDirectory("pdb");
|
||||
}
|
||||
|
||||
@AfterMethod
|
||||
public void afterMethod() throws IOException {
|
||||
FileUtils.delete(dataDirectory);
|
||||
dataStore = null;
|
||||
tagsToPath = null;
|
||||
}
|
||||
|
||||
public void testInsertSingleTag() throws Exception {
|
||||
final Tags tags = Tags.create("key1", "value1", "key2", "value2");
|
||||
final Path path;
|
||||
{
|
||||
final DataStore dataStore = new DataStore(dataDirectory);
|
||||
|
||||
path = dataStore.createNewFile(tags);
|
||||
assertSearch(dataStore, "key1=value1", path);
|
||||
}
|
||||
{
|
||||
final DataStore dataStore = new DataStore(dataDirectory);
|
||||
assertSearch(dataStore, "key1=value1", path);
|
||||
}
|
||||
}
|
||||
|
||||
public void testQuery() throws Exception {
|
||||
|
||||
tagsToPath = new LinkedHashMap<>();
|
||||
final Tags eagleTim = Tags.create("bird", "eagle", "name", "Tim");
|
||||
final Tags pigeonJennifer = Tags.create("bird", "pigeon", "name", "Jennifer");
|
||||
final Tags flamingoJennifer = Tags.create("bird", "flamingo", "name", "Jennifer");
|
||||
final Tags labradorJenny = Tags.create("dog", "labrador", "name", "Jenny");
|
||||
final Tags labradorTim = Tags.create("dog", "labrador", "name", "Tim");
|
||||
|
||||
tagsToPath.put(eagleTim, null);
|
||||
tagsToPath.put(pigeonJennifer, null);
|
||||
tagsToPath.put(flamingoJennifer, null);
|
||||
tagsToPath.put(labradorJenny, null);
|
||||
tagsToPath.put(labradorTim, null);
|
||||
|
||||
dataStore = new DataStore(dataDirectory);
|
||||
|
||||
for (final Tags tags : tagsToPath.keySet()) {
|
||||
final Path newFile = dataStore.createNewFile(tags);
|
||||
tagsToPath.put(tags, newFile);
|
||||
}
|
||||
|
||||
assertSearch("bird=eagle", eagleTim);
|
||||
assertSearch("dog=labrador", labradorJenny, labradorTim);
|
||||
assertSearch("name=Tim", eagleTim, labradorTim);
|
||||
assertSearch("dog=labrador and name=Tim", labradorTim);
|
||||
assertSearch("dog=labrador and !name=Tim", labradorJenny);
|
||||
assertSearch("name=Jennifer or name=Jenny", pigeonJennifer, flamingoJennifer, labradorJenny);
|
||||
|
||||
// a͟n͟d binds stronger than o͟r
|
||||
assertSearch("name=Tim and dog=labrador or bird=pigeon", pigeonJennifer, labradorTim);
|
||||
assertSearch("bird=pigeon or name=Tim and dog=labrador", pigeonJennifer, labradorTim);
|
||||
|
||||
// parenthesis override priority of a͟n͟d
|
||||
assertSearch("name=Tim and (dog=labrador or bird=pigeon)", labradorTim);
|
||||
assertSearch("(dog=labrador or bird=pigeon) and name=Tim", labradorTim);
|
||||
|
||||
// wildcards
|
||||
assertSearch("bird=*", eagleTim, pigeonJennifer, flamingoJennifer);
|
||||
assertSearch("name=Jen*", pigeonJennifer, flamingoJennifer, labradorJenny);
|
||||
assertSearch("dog=*dor", labradorJenny, labradorTim);
|
||||
assertSearch("dog=lab*dor", labradorJenny, labradorTim);
|
||||
assertSearch("dog=*lab*dor*", labradorJenny, labradorTim);
|
||||
|
||||
}
|
||||
|
||||
private void assertSearch(final String query, final Tags... tags) {
|
||||
final List<Doc> actualDocs = dataStore.search(query);
|
||||
final List<Path> actual = CollectionUtils.map(actualDocs, Doc::getPath);
|
||||
|
||||
final List<Path> expectedPaths = CollectionUtils.map(tags, tagsToPath::get);
|
||||
|
||||
Assert.assertEquals(actual, expectedPaths, "Query: " + query + " Found: " + getTagsForPaths(actual));
|
||||
}
|
||||
|
||||
private List<Tags> getTagsForPaths(final List<Path> paths) {
|
||||
|
||||
final List<Tags> result = new ArrayList<>();
|
||||
|
||||
for (final Path path : paths) {
|
||||
result.add(getTagForPath(path));
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
private Tags getTagForPath(final Path path) {
|
||||
for (final Entry<Tags, Path> e : tagsToPath.entrySet()) {
|
||||
|
||||
if (e.getValue().equals(path)) {
|
||||
return e.getKey();
|
||||
}
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
private void assertSearch(final DataStore dataStore, final String query, final Path... paths) {
|
||||
final List<Doc> actualDocs = dataStore.search(query);
|
||||
final List<Path> actual = CollectionUtils.map(actualDocs, Doc::getPath);
|
||||
|
||||
Assert.assertEquals(actual, Arrays.asList(paths));
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,91 @@
|
||||
package org.lucares.pdb.datastore.internal;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.nio.file.Paths;
|
||||
import java.util.Arrays;
|
||||
import java.util.Collections;
|
||||
import java.util.List;
|
||||
|
||||
import org.lucares.utils.CollectionUtils;
|
||||
import org.lucares.utils.file.FileUtils;
|
||||
import org.testng.Assert;
|
||||
import org.testng.annotations.AfterMethod;
|
||||
import org.testng.annotations.BeforeMethod;
|
||||
import org.testng.annotations.Test;
|
||||
|
||||
public class FolderStorageTest {
|
||||
private static final String SUFFIX = ".txt";
|
||||
private Path dataDirectory;
|
||||
|
||||
@BeforeMethod
|
||||
public void beforeMethod() throws IOException {
|
||||
dataDirectory = Files.createTempDirectory("pdb");
|
||||
}
|
||||
|
||||
@AfterMethod
|
||||
public void afterMethod() throws IOException {
|
||||
FileUtils.delete(dataDirectory);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testFolderStructureRespectingToMaxFilesPerFolder() throws Exception {
|
||||
final int maxFilesPerFolder = 2;
|
||||
|
||||
storeFiles(maxFilesPerFolder);
|
||||
storeFiles(maxFilesPerFolder, "a", "b", "c", "d", "e");
|
||||
storeFiles(maxFilesPerFolder, "f");
|
||||
storeFiles(maxFilesPerFolder, "g", "h", "i");
|
||||
|
||||
final List<Path> actualFiles = getPathsRelativeToDataDirectory();
|
||||
|
||||
final List<Path> expectedFiles = Arrays.asList(//
|
||||
Paths.get("0", "0", "a" + SUFFIX), //
|
||||
Paths.get("0", "0", "b" + SUFFIX), //
|
||||
Paths.get("0", "1", "c" + SUFFIX), //
|
||||
Paths.get("0", "1", "d" + SUFFIX), //
|
||||
Paths.get("1", "0", "e" + SUFFIX), //
|
||||
Paths.get("1", "0", "f" + SUFFIX), //
|
||||
Paths.get("1", "1", "g" + SUFFIX), //
|
||||
Paths.get("1", "1", "h" + SUFFIX), //
|
||||
Paths.get("2", "0", "i" + SUFFIX)// The first level might
|
||||
// overflow
|
||||
);
|
||||
|
||||
Assert.assertEquals(actualFiles, expectedFiles);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testDuplicateNames() throws Exception {
|
||||
final int maxFilesPerFolder = 3;
|
||||
|
||||
storeFiles(maxFilesPerFolder, "a", "a", "a", "a");
|
||||
|
||||
final List<Path> actualFiles = getPathsRelativeToDataDirectory();
|
||||
|
||||
final List<Path> expectedFiles = Arrays.asList(//
|
||||
Paths.get("0", "0", "a" + SUFFIX), //
|
||||
Paths.get("0", "0", "a1" + SUFFIX), //
|
||||
Paths.get("0", "0", "a2" + SUFFIX), //
|
||||
Paths.get("0", "1", "a" + SUFFIX)//
|
||||
);
|
||||
|
||||
Assert.assertEquals(actualFiles, expectedFiles);
|
||||
}
|
||||
|
||||
private List<Path> getPathsRelativeToDataDirectory() throws IOException {
|
||||
final List<Path> actualFiles = FileUtils.listRecursively(dataDirectory);
|
||||
CollectionUtils.mapInPlace(actualFiles, p -> dataDirectory.relativize(p));
|
||||
Collections.sort(actualFiles);
|
||||
return actualFiles;
|
||||
}
|
||||
|
||||
private void storeFiles(final int maxFilesPerFolder, final String... filenames) throws IOException {
|
||||
final FolderStorage storage = new FolderStorage(dataDirectory, maxFilesPerFolder);
|
||||
|
||||
for (final String filename : filenames) {
|
||||
storage.insert(filename, SUFFIX);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,24 @@
|
||||
package org.lucares.pdb.datastore.internal;
|
||||
|
||||
import org.testng.Assert;
|
||||
import org.testng.annotations.Test;
|
||||
|
||||
@Test
|
||||
public class RadixConverterTest {
|
||||
|
||||
public void testConvertRoundtrip() {
|
||||
|
||||
for (int i = 0; i < 1000; i++) {
|
||||
|
||||
final String string = RadixConverter.toString(i);
|
||||
final int actual = RadixConverter.fromString(string);
|
||||
|
||||
Assert.assertEquals(actual, i, "string representation: " + string);
|
||||
}
|
||||
}
|
||||
|
||||
@Test(expectedExceptions = IllegalArgumentException.class)
|
||||
public void testNoNegativeValues() {
|
||||
RadixConverter.toString(-1);
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,52 @@
|
||||
package org.lucares.pdb.datastore.internal;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
|
||||
import org.lucares.utils.file.FileUtils;
|
||||
import org.testng.Assert;
|
||||
import org.testng.annotations.AfterMethod;
|
||||
import org.testng.annotations.BeforeMethod;
|
||||
import org.testng.annotations.Test;
|
||||
|
||||
@Test
|
||||
public class StringCompressorTest {
|
||||
private Path dataDirectory;
|
||||
|
||||
@BeforeMethod
|
||||
public void beforeMethod() throws IOException {
|
||||
dataDirectory = Files.createTempDirectory("pdb");
|
||||
}
|
||||
|
||||
@AfterMethod
|
||||
public void afterMethod() throws IOException {
|
||||
FileUtils.delete(dataDirectory);
|
||||
}
|
||||
|
||||
public void testKeyCompressorRoundtrip() throws Exception {
|
||||
final StringCompressor keyValueCompressor = StringCompressor.create(dataDirectory.resolve("key.csv"));
|
||||
|
||||
final String value = "foo";
|
||||
final Integer intFoo = keyValueCompressor.put(value);
|
||||
final String actual = keyValueCompressor.get(intFoo);
|
||||
|
||||
Assert.assertEquals(actual, value);
|
||||
}
|
||||
|
||||
public void testKeyCompressorInitialization() throws Exception {
|
||||
final Path database = dataDirectory.resolve("key.csv");
|
||||
final String value = "foo";
|
||||
{
|
||||
final StringCompressor keyValueCompressor = StringCompressor.create(database);
|
||||
|
||||
keyValueCompressor.put(value);
|
||||
}
|
||||
{
|
||||
final StringCompressor keyValueCompressor = StringCompressor.create(database);
|
||||
|
||||
keyValueCompressor.get(0);
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,74 @@
|
||||
package org.lucares.pdb.datastore.internal.map;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
|
||||
import org.lucares.utils.file.FileUtils;
|
||||
import org.testng.Assert;
|
||||
import org.testng.annotations.AfterMethod;
|
||||
import org.testng.annotations.BeforeMethod;
|
||||
import org.testng.annotations.Test;
|
||||
|
||||
@Test
|
||||
public class UniqueStringIntegerPairsTest {
|
||||
|
||||
private Path dataDirectory;
|
||||
|
||||
@BeforeMethod
|
||||
public void beforeMethod() throws IOException {
|
||||
dataDirectory = Files.createTempDirectory("pdb");
|
||||
}
|
||||
|
||||
@AfterMethod
|
||||
public void afterMethod() throws IOException {
|
||||
FileUtils.delete(dataDirectory);
|
||||
}
|
||||
|
||||
public void testPutGet() throws Exception {
|
||||
final Path database = dataDirectory.resolve("key.csv");
|
||||
final String first = "key1";
|
||||
final Integer second = 1;
|
||||
|
||||
{
|
||||
final UniqueStringIntegerPairs usip = new UniqueStringIntegerPairs(database);
|
||||
|
||||
usip.put(first, second);
|
||||
Assert.assertEquals(usip.get(first), second);
|
||||
Assert.assertEquals(usip.getKey(second), first);
|
||||
}
|
||||
|
||||
{
|
||||
final UniqueStringIntegerPairs usip = new UniqueStringIntegerPairs(database);
|
||||
|
||||
Assert.assertEquals(usip.get(first), second);
|
||||
Assert.assertEquals(usip.getKey(second), first);
|
||||
}
|
||||
}
|
||||
|
||||
public void testUniqueKeyContstraint() throws Exception {
|
||||
final Path database = dataDirectory.resolve("key.csv");
|
||||
final String first = "key1";
|
||||
final Integer second = 1;
|
||||
|
||||
final UniqueStringIntegerPairs usip = new UniqueStringIntegerPairs(database);
|
||||
usip.put(first, second);
|
||||
try {
|
||||
// cannot add another pair with the first key
|
||||
final int another = second + 1;
|
||||
usip.put(first, another);
|
||||
Assert.fail("expected an IllegalArgumentException");
|
||||
} catch (final IllegalArgumentException e) {
|
||||
// expected
|
||||
}
|
||||
|
||||
try {
|
||||
// cannot add another pair with the same second value
|
||||
final String another = first + 1;
|
||||
usip.put(another, second);
|
||||
Assert.fail("expected an IllegalArgumentException");
|
||||
} catch (final IllegalArgumentException e) {
|
||||
// expected
|
||||
}
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user