From 61f131571aed9d5b0a9efc6df231b92446bc68fe Mon Sep 17 00:00:00 2001 From: Andreas Huber Date: Wed, 12 Sep 2018 13:42:23 +0200 Subject: [PATCH] add CamelCase matching to the query language --- .../lang/ExpressionToDocIdVisitor.java | 58 +------------ .../pdb/datastore/lang/GloblikePattern.java | 28 +++++++ .../lang/QueryCompletionPdbLangParser.java | 37 +++++---- .../pdb/datastore/internal/ProposerTest.java | 7 ++ .../QueryCompletionPdbLangParserTest.java | 83 +++++++++++++++++++ 5 files changed, 141 insertions(+), 72 deletions(-) create mode 100644 data-store/src/main/java/org/lucares/pdb/datastore/lang/GloblikePattern.java create mode 100644 data-store/src/test/java/org/lucares/pdb/datastore/lang/QueryCompletionPdbLangParserTest.java diff --git a/data-store/src/main/java/org/lucares/pdb/datastore/lang/ExpressionToDocIdVisitor.java b/data-store/src/main/java/org/lucares/pdb/datastore/lang/ExpressionToDocIdVisitor.java index f57248c..a1a7710 100644 --- a/data-store/src/main/java/org/lucares/pdb/datastore/lang/ExpressionToDocIdVisitor.java +++ b/data-store/src/main/java/org/lucares/pdb/datastore/lang/ExpressionToDocIdVisitor.java @@ -10,16 +10,12 @@ import java.util.Map.Entry; import java.util.Objects; import java.util.regex.Pattern; -import org.apache.commons.lang3.StringUtils; import org.lucares.collections.IntList; import org.lucares.pdb.datastore.Doc; import org.lucares.pdb.datastore.lang.Expression.And; import org.lucares.pdb.datastore.lang.Expression.Not; import org.lucares.pdb.datastore.lang.Expression.Or; import org.lucares.pdb.datastore.lang.Expression.Parentheses; -import org.lucares.pdb.datastore.lang.Expression.Property; -import org.lucares.pdb.datastore.lang.Expression.Terminal; -import org.lucares.utils.CollectionUtils; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -147,16 +143,11 @@ public class ExpressionToDocIdVisitor extends ExpressionVisitor { result = merge(allValuesForKey.values()); break; - } else if (containsWildcard(value)) { - - final Collection docIds = filterByWildcard(propertyName, globToRegex(value)); + } else { + final Collection docIds = filterByWildcard(propertyName, + GloblikePattern.globlikeToRegex(value)); final IntList mergedDocIds = merge(docIds); result = IntList.union(result, mergedDocIds); - } else { - final IntList docIds = keyToValueToDocId.// - getOrDefault(propertyName, EMPTY_VALUES).// - getOrDefault(value, EMPTY_DOC_IDS); - result = IntList.union(result, docIds); } } @@ -168,50 +159,13 @@ public class ExpressionToDocIdVisitor extends ExpressionVisitor { return allDocIds.getAllDocIds(); } - @Override - public IntList visit(final Property expression) { - - final String propertyName = expression.property; - final Terminal propertyValue = expression.value; - final String stringValue = propertyValue.getValue(); - final long start = System.nanoTime(); - final IntList result; - if (isMatchAll(stringValue)) { - - final Map allValuesForKey = keyToValueToDocId.getOrDefault(propertyName, EMPTY_VALUES); - - result = merge(allValuesForKey.values()); - } else if (containsWildcard(stringValue)) { - - final Collection docIds = filterByWildcard(propertyName, globToRegex(stringValue)); - - result = merge(docIds); - } else { - result = keyToValueToDocId.getOrDefault(propertyName, EMPTY_VALUES).getOrDefault(stringValue, - EMPTY_DOC_IDS); - } - LOGGER.trace("{} took {} ms results={}", expression, (System.nanoTime() - start) / 1_000_000.0, result.size()); - - return result; - } - - private Pattern globToRegex(final String globPattern) { - - final String[] tokens = StringUtils.splitPreserveAllTokens(globPattern, "*"); - - final List quotedTokens = CollectionUtils.map(tokens, Pattern::quote); - final String regex = String.join(".*", quotedTokens); - - return Pattern.compile(regex); - } - private List filterByWildcard(final String propertyName, final Pattern valuePattern) { final List result = new ArrayList<>(); final Map valueToDocId = keyToValueToDocId.getOrDefault(propertyName, EMPTY_VALUES); for (final Entry entry : valueToDocId.entrySet()) { - if (valuePattern.matcher(entry.getKey()).matches()) { + if (valuePattern.matcher(entry.getKey()).find()) { result.add(entry.getValue()); } } @@ -219,10 +173,6 @@ public class ExpressionToDocIdVisitor extends ExpressionVisitor { return result; } - private boolean containsWildcard(final String stringValue) { - return stringValue.contains("*"); - } - private IntList merge(final Collection lists) { IntList result = new IntList(); diff --git a/data-store/src/main/java/org/lucares/pdb/datastore/lang/GloblikePattern.java b/data-store/src/main/java/org/lucares/pdb/datastore/lang/GloblikePattern.java new file mode 100644 index 0000000..3a1688f --- /dev/null +++ b/data-store/src/main/java/org/lucares/pdb/datastore/lang/GloblikePattern.java @@ -0,0 +1,28 @@ +package org.lucares.pdb.datastore.lang; + +import java.util.regex.Pattern; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +class GloblikePattern { + + private static final Logger LOGGER = LoggerFactory.getLogger(GloblikePattern.class); + + static Pattern globlikeToRegex(final String globPattern) { + // a character that cannot be in the globPattern + final String dotPlaceholder = "\ue003"; // fourth character in the private use area + + final String valueRegex = "^" + // + globPattern// + .replace("-", Pattern.quote("-"))// + .replace(".", dotPlaceholder)// + .replace("*", ".*")// + .replace(dotPlaceholder, ".*\\.")// + .replaceAll("([A-Z])", "[a-z]*$1"); + + LOGGER.trace(">{}< -> >{}<", globPattern, valueRegex); + + return Pattern.compile(valueRegex); + } +} diff --git a/data-store/src/main/java/org/lucares/pdb/datastore/lang/QueryCompletionPdbLangParser.java b/data-store/src/main/java/org/lucares/pdb/datastore/lang/QueryCompletionPdbLangParser.java index 9f3cbfe..352fce4 100644 --- a/data-store/src/main/java/org/lucares/pdb/datastore/lang/QueryCompletionPdbLangParser.java +++ b/data-store/src/main/java/org/lucares/pdb/datastore/lang/QueryCompletionPdbLangParser.java @@ -1,10 +1,13 @@ package org.lucares.pdb.datastore.lang; import java.util.BitSet; +import java.util.Collection; import java.util.Collections; import java.util.List; import java.util.SortedSet; import java.util.TreeSet; +import java.util.regex.Matcher; +import java.util.regex.Pattern; import org.antlr.v4.runtime.ANTLRErrorListener; import org.antlr.v4.runtime.CommonToken; @@ -243,30 +246,13 @@ public class QueryCompletionPdbLangParser extends PdbLangParser { @Override public void exitPropertyTerminalExpression(final PropertyTerminalExpressionContext ctx) { - // if (containsCaret(ctx)) { - // final int start = ctx.getStart().getStartIndex(); - // final int end = ctx.getStop().getStopIndex(); - // final int ruleIndex = _ctx.getRuleIndex(); - // - // final String prefix = ctx.getText().substring(0, caretPosition - - // start); - // ctx.getParent().children.get(0).getText(); - // - // proposals.addAll(getPropertyValuesByPrefix(prefix)); - // } } private SortedSet getPropertyValuesByPrefix(final String propertyKey, final String propertyValuePrefix) { final SortedSet availableValuesForKey = dataStore.getAvailableValuesForKey("", propertyKey); - final SortedSet result = new TreeSet<>(); - - for (final String value : availableValuesForKey) { - if (value.startsWith(propertyValuePrefix) && !value.equals(propertyValuePrefix)) { - result.add(value); - } - } + final SortedSet result = filterValues(availableValuesForKey, propertyValuePrefix); return result; } @@ -325,4 +311,19 @@ public class QueryCompletionPdbLangParser extends PdbLangParser { super(input); } + static SortedSet filterValues(final Collection availableValues, final String valuePattern) { + final SortedSet result = new TreeSet<>(); + + final Pattern pattern = GloblikePattern.globlikeToRegex(valuePattern); + + for (final String value : availableValues) { + final Matcher matcher = pattern.matcher(value); + if (matcher.find() && !value.equals(valuePattern)) { + result.add(value); + } + } + + return result; + } + } diff --git a/data-store/src/test/java/org/lucares/pdb/datastore/internal/ProposerTest.java b/data-store/src/test/java/org/lucares/pdb/datastore/internal/ProposerTest.java index d3eb14e..f63ad7c 100644 --- a/data-store/src/test/java/org/lucares/pdb/datastore/internal/ProposerTest.java +++ b/data-store/src/test/java/org/lucares/pdb/datastore/internal/ProposerTest.java @@ -132,6 +132,13 @@ public class ProposerTest { ); } + public void testProposalWithWildcards() throws Exception { + assertProposals("name=*im", 8, // + new Proposal("Tim", "name=Tim", true, "name=Tim", 8), // + new Proposal("Timothy", "name=Timothy", true, "name=Timothy", 12)// + ); + } + private void assertProposals(final String query, final int caretIndex, final Proposal... expected) throws InterruptedException { diff --git a/data-store/src/test/java/org/lucares/pdb/datastore/lang/QueryCompletionPdbLangParserTest.java b/data-store/src/test/java/org/lucares/pdb/datastore/lang/QueryCompletionPdbLangParserTest.java new file mode 100644 index 0000000..3d39023 --- /dev/null +++ b/data-store/src/test/java/org/lucares/pdb/datastore/lang/QueryCompletionPdbLangParserTest.java @@ -0,0 +1,83 @@ +package org.lucares.pdb.datastore.lang; + +import java.util.ArrayList; +import java.util.List; +import java.util.SortedSet; +import java.util.TreeSet; + +import org.testng.Assert; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +@Test +public class QueryCompletionPdbLangParserTest { + @DataProvider + public Object[][] providerPatterns() { + + final List result = new ArrayList<>(); + + // opinion-size-age-shape-colour-origin-material-purpose Noun + final List availableValues = new ArrayList<>(); + availableValues.add("Tim"); + availableValues.add("Timothy"); + availableValues.add("Tanja"); + availableValues.add("Danja"); + availableValues.add("Wanja"); + availableValues.add("BigOldGrey.Jennifer"); + availableValues.add("BigYoungGreen.Jennifer"); + availableValues.add("BigYoungBlue.Jenny"); + availableValues.add("SmallRoundBlue.Peter"); + + { + // infix does not match + final SortedSet expected = new TreeSet<>(); + result.add(new Object[] { availableValues, "nj", expected }); + } + { + final SortedSet expected = new TreeSet<>(); + expected.add("Danja"); + expected.add("Tanja"); + expected.add("Wanja"); + + result.add(new Object[] { availableValues, "*nj", expected }); + } + { + final SortedSet expected = new TreeSet<>(); + expected.add("BigYoungBlue.Jenny"); + + result.add(new Object[] { availableValues, "BYB", expected }); + } + { + final SortedSet expected = new TreeSet<>(); + expected.add("BigOldGrey.Jennifer"); + expected.add("BigYoungGreen.Jennifer"); + + result.add(new Object[] { availableValues, "B*Gr", expected }); + } + { + final SortedSet expected = new TreeSet<>(); + expected.add("BigOldGrey.Jennifer"); + expected.add("BigYoungGreen.Jennifer"); + expected.add("BigYoungBlue.Jenny"); + + result.add(new Object[] { availableValues, ".Jen", expected }); + } + { + final SortedSet expected = new TreeSet<>(); + expected.add("BigYoungBlue.Jenny"); + expected.add("BigYoungGreen.Jennifer"); + + result.add(new Object[] { availableValues, "BY.Jen", expected }); + } + + return result.toArray(new Object[0][]); + } + + @Test(dataProvider = "providerPatterns") + public void testPatterns(final List availableValues, final String valuePattern, + final SortedSet expectedValues) { + + final SortedSet actual = QueryCompletionPdbLangParser.filterValues(availableValues, valuePattern); + Assert.assertEquals(actual, expectedValues); + } +}