Added files for implementing index_prefix

manav113 committed Feb 9, 2025
1 parent 72b3d43 commit 47aaded
Showing 13 changed files with 2,537 additions and 1,644 deletions.
11 changes: 11 additions & 0 deletions clientlib/src/main/proto/yelp/nrtsearch/luceneserver.proto
@@ -586,8 +586,19 @@ message Field {
// For arrays of strings, ignoreAbove will be applied for each array element separately and string elements longer than ignore_above will not be indexed or stored.
// This option is also useful for protecting against Lucene’s term byte-length limit of 32766
optional int32 ignoreAbove = 36;
// Enables indexing of term prefixes to speed up prefix searches
IndexPrefixes indexPrefixes = 37;
}

// Options for indexing term prefixes for a field
message IndexPrefixes {
// The minimum prefix length to index. Must be greater than 0, and defaults to 2.
optional int32 min_chars = 1;
// The maximum prefix length to index. Must be less than 20, and defaults to 5.
optional int32 max_chars = 2;
}


/* Input to registerFields */
message FieldDefRequest {
string indexName = 1; // name of the index against which the field is to be created
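For reference, a client would enable the new option when registering a field through FieldDefRequest. The sketch below uses the generated protobuf builders from this proto; the field name, the search flag, and addField on FieldDefRequest are illustrative assumptions rather than part of this diff.

```java
import com.yelp.nrtsearch.server.grpc.Field;
import com.yelp.nrtsearch.server.grpc.FieldDefRequest;
import com.yelp.nrtsearch.server.grpc.IndexPrefixes;

public class IndexPrefixesExample {
  // Minimal sketch: build a registerFields request for a searchable field with
  // prefix indexing enabled. Only the indexPrefixes option comes from this commit;
  // the other setters are assumed from the existing Field message.
  public static FieldDefRequest buildRequest() {
    Field titleField =
        Field.newBuilder()
            .setName("title")
            .setSearch(true)
            .setIndexPrefixes(
                IndexPrefixes.newBuilder()
                    .setMinChars(2) // index term prefixes of length 2..5
                    .setMaxChars(5)
                    .build())
            .build();
    return FieldDefRequest.newBuilder()
        .setIndexName("test_index")
        .addField(titleField)
        .build();
  }
}
```

Leaving minChars and maxChars unset falls back to the documented defaults of 2 and 5.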
3,337 changes: 1,715 additions & 1,622 deletions grpc-gateway/luceneserver.pb.go

Large diffs are not rendered by default.

20 changes: 20 additions & 0 deletions grpc-gateway/luceneserver.swagger.json
@@ -3257,6 +3257,10 @@
"type": "integer",
"format": "int32",
"title": "For arrays of strings, ignoreAbove will be applied for each array element separately and string elements longer than ignore_above will not be indexed or stored.\nThis option is also useful for protecting against Lucene’s term byte-length limit of 32766"
},
"indexPrefixes": {
"$ref": "#/definitions/luceneserverIndexPrefixes",
"title": "Parameter enables the indexing of term prefixes to speed up prefix searches"
}
}
},
@@ -3849,6 +3853,22 @@
"default": "DOCS_FREQS_POSITIONS",
"description": "How the tokens should be indexed."
},
"luceneserverIndexPrefixes": {
"type": "object",
"properties": {
"minChars": {
"type": "integer",
"format": "int32",
"description": "The minimum prefix length to index. Must be greater than 0, and defaults to 2."
},
"maxChars": {
"type": "integer",
"format": "int32",
"description": "The maximum prefix length to index. Must be less than 20, and defaults to 5."
}
},
"title": "Options for including IndexPrefixes for field"
},
"luceneserverIndexSettings": {
"type": "object",
"properties": {
@@ -27,6 +27,7 @@
import com.yelp.nrtsearch.server.luceneserver.field.TextBaseFieldDef;
import com.yelp.nrtsearch.server.luceneserver.field.properties.GeoQueryable;
import com.yelp.nrtsearch.server.luceneserver.field.properties.PolygonQueryable;
+import com.yelp.nrtsearch.server.luceneserver.field.properties.PrefixQueryable;
import com.yelp.nrtsearch.server.luceneserver.field.properties.RangeQueryable;
import com.yelp.nrtsearch.server.luceneserver.field.properties.TermQueryable;
import com.yelp.nrtsearch.server.luceneserver.script.ScoreScript;
@@ -44,7 +45,6 @@
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.stream.Collectors;
import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.index.Term;
import org.apache.lucene.queries.function.FunctionMatchQuery;
import org.apache.lucene.queries.function.FunctionScoreQuery;
@@ -173,7 +173,7 @@ private Query getQueryNode(
case MATCHPHRASEPREFIXQUERY:
return MatchPhrasePrefixQuery.build(query.getMatchPhrasePrefixQuery(), state);
case PREFIXQUERY:
-        return getPrefixQuery(query.getPrefixQuery(), state);
+        return getPrefixQuery(query.getPrefixQuery(), state, false);
case CONSTANTSCOREQUERY:
return getConstantScoreQuery(query.getConstantScoreQuery(), state, docLookup);
case SPANQUERY:
@@ -597,24 +597,16 @@ private Query getExistsQuery(ExistsQuery existsQuery, IndexState state) {
return new ConstantScoreQuery(new TermQuery(new Term(IndexState.FIELD_NAMES, fieldName)));
}

-  private static Query getPrefixQuery(PrefixQuery prefixQuery, IndexState state) {
+  private static Query getPrefixQuery(
+      PrefixQuery prefixQuery, IndexState state, boolean spanQuery) {
     FieldDef fieldDef = state.getField(prefixQuery.getField());
-    if (!(fieldDef instanceof IndexableFieldDef)) {
+    if (!(fieldDef instanceof PrefixQueryable)) {
       throw new IllegalArgumentException(
-          "Field \"" + prefixQuery.getPrefix() + "\" is not indexable");
+          "Field " + fieldDef.getName() + " does not support PrefixQuery");
     }
-    IndexOptions indexOptions = ((IndexableFieldDef) fieldDef).getFieldType().indexOptions();
-    if (indexOptions == IndexOptions.NONE) {
-      throw new IllegalArgumentException(
-          "Field \"" + prefixQuery.getField() + "\" is not indexed with terms");
-    }
-
-    org.apache.lucene.search.PrefixQuery query =
-        new org.apache.lucene.search.PrefixQuery(
-            new Term(prefixQuery.getField(), prefixQuery.getPrefix()));
-    query.setRewriteMethod(
-        getRewriteMethod(prefixQuery.getRewrite(), prefixQuery.getRewriteTopTermsSize()));
-    return query;
+    MultiTermQuery.RewriteMethod rewriteMethod =
+        getRewriteMethod(prefixQuery.getRewrite(), prefixQuery.getRewriteTopTermsSize());
+    return ((PrefixQueryable) fieldDef).getPrefixQuery(prefixQuery, rewriteMethod, spanQuery);
   }

private static MultiTermQuery.RewriteMethod getRewriteMethod(
@@ -699,7 +691,7 @@ private SpanMultiTermQueryWrapper getSpanMultiTermQueryWrapper(
FuzzyQuery fuzzyQuery = getFuzzyQuery(protoSpanMultiTermQuery);
return new SpanMultiTermQueryWrapper<>(fuzzyQuery);
case PREFIXQUERY:
-        Query prefixQuery = getPrefixQuery(protoSpanMultiTermQuery.getPrefixQuery(), state);
+        Query prefixQuery = getPrefixQuery(protoSpanMultiTermQuery.getPrefixQuery(), state, true);
return new SpanMultiTermQueryWrapper<>((MultiTermQuery) prefixQuery);
case REGEXPQUERY:
RegexpQuery regexpQuery = getRegexpQuery(protoSpanMultiTermQuery);
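The dispatch above goes through a new PrefixQueryable field property. Its source file is part of this commit but not rendered in this excerpt; based on the call site here and the implementation in AtomFieldDef below, the interface presumably looks roughly like the following sketch (not the committed file).

```java
package com.yelp.nrtsearch.server.luceneserver.field.properties;

import com.yelp.nrtsearch.server.grpc.PrefixQuery;
import org.apache.lucene.search.MultiTermQuery;
import org.apache.lucene.search.Query;

/**
 * Sketch of the property interface for field types that can build a Lucene query from the gRPC
 * PrefixQuery message; the signature is inferred from its usage in getPrefixQuery above.
 */
public interface PrefixQueryable {
  Query getPrefixQuery(
      PrefixQuery prefixQuery, MultiTermQuery.RewriteMethod rewriteMethod, boolean spanQuery);
}
```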
@@ -0,0 +1,64 @@
/*
* Copyright 2025 Yelp Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.yelp.nrtsearch.server.luceneserver.analysis;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.AnalyzerWrapper;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.ngram.EdgeNGramTokenFilter;

/**
* An {@link AnalyzerWrapper} that wraps another analyzer and applies an Edge N-Gram token filter to
* the token stream.
*/
public class PrefixWrappedAnalyzer extends AnalyzerWrapper {
private final int minChars;
private final int maxChars;
private final Analyzer delegate;

/**
* Create a new {@link PrefixWrappedAnalyzer} that wraps the given {@link Analyzer} and applies an
* Edge N-Gram token filter to the token stream.
*
* @param delegate the analyzer to wrap
* @param minChars the minimum number of characters for the edge n-grams
* @param maxChars the maximum number of characters for the edge n-grams
*/
public PrefixWrappedAnalyzer(Analyzer delegate, int minChars, int maxChars) {
super(delegate.getReuseStrategy());
this.delegate = delegate;
this.minChars = minChars;
this.maxChars = maxChars;
}

@Override
protected Analyzer getWrappedAnalyzer(String fieldName) {
return delegate;
}

@Override
protected TokenStreamComponents wrapComponents(
String fieldName, TokenStreamComponents components) {
TokenFilter filter =
new EdgeNGramTokenFilter(components.getTokenStream(), minChars, maxChars, false);
return new TokenStreamComponents(components.getSource(), filter);
}

@Override
public String toString() {
return "PrefixWrappedAnalyzer(" + delegate.toString() + ")";
}
}
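To see what the wrapper emits, here is a small usage sketch. The StandardAnalyzer delegate, the field name, and the input string are assumptions; the only class from this commit is PrefixWrappedAnalyzer.

```java
import com.yelp.nrtsearch.server.luceneserver.analysis.PrefixWrappedAnalyzer;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class PrefixWrappedAnalyzerDemo {
  public static void main(String[] args) throws Exception {
    // Wrap a StandardAnalyzer with 2..5 character edge n-grams.
    Analyzer analyzer = new PrefixWrappedAnalyzer(new StandardAnalyzer(), 2, 5);
    try (TokenStream stream = analyzer.tokenStream("title._index_prefix", "search")) {
      CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
      stream.reset();
      while (stream.incrementToken()) {
        System.out.println(term.toString()); // prints: se, sea, sear, searc
      }
      stream.end();
    }
  }
}
```

Because the filter is built with preserveOriginal set to false, the full term itself is not emitted by the prefix analyzer, and tokens shorter than minChars produce no output at all.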
@@ -18,19 +18,24 @@
import static com.yelp.nrtsearch.server.luceneserver.analysis.AnalyzerCreator.hasAnalyzer;

import com.yelp.nrtsearch.server.grpc.Field;
+import com.yelp.nrtsearch.server.grpc.PrefixQuery;
import com.yelp.nrtsearch.server.grpc.SortType;
+import com.yelp.nrtsearch.server.luceneserver.field.properties.PrefixQueryable;
import com.yelp.nrtsearch.server.luceneserver.field.properties.Sortable;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.core.KeywordAnalyzer;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.index.DocValuesType;
import org.apache.lucene.index.IndexOptions;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.search.MultiTermQuery;
+import org.apache.lucene.search.Query;
import org.apache.lucene.search.SortField;
import org.apache.lucene.search.SortedSetSortField;
import org.apache.lucene.util.BytesRef;

/** Field class for 'ATOM' field type. Uses {@link KeywordAnalyzer} for text analysis. */
-public class AtomFieldDef extends TextBaseFieldDef implements Sortable {
+public class AtomFieldDef extends TextBaseFieldDef implements Sortable, PrefixQueryable {
private static final Analyzer keywordAnalyzer = new KeywordAnalyzer();

public AtomFieldDef(String name, Field requestField) {
@@ -145,4 +150,22 @@ public SortField getSortField(SortType type) {
}
return sortField;
}

@Override
public Query getPrefixQuery(
PrefixQuery prefixQuery, MultiTermQuery.RewriteMethod rewriteMethod, boolean spanQuery) {
verifySearchable("Prefix query");
org.apache.lucene.search.PrefixQuery query =
new org.apache.lucene.search.PrefixQuery(
new Term(prefixQuery.getField(), prefixQuery.getPrefix()));
query.setRewriteMethod(rewriteMethod);
return query;
}

protected void verifySearchable(String featureName) {
if (!isSearchable()) {
throw new IllegalStateException(
featureName + " requires field to be searchable: " + getName());
}
}
}
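For completeness, the gRPC query handled by the new override could be built roughly as follows. The field name and prefix are illustrative; setPrefixQuery on the Query builder is assumed from the getPrefixQuery accessor used in the mapper above.

```java
import com.yelp.nrtsearch.server.grpc.PrefixQuery;
import com.yelp.nrtsearch.server.grpc.Query;

public class PrefixQueryRequestExample {
  // Sketch: a prefix query against a searchable ATOM field. The server routes it through
  // getPrefixQuery in the query mapper, which now delegates to the field's PrefixQueryable
  // implementation (AtomFieldDef above).
  public static Query buildQuery() {
    return Query.newBuilder()
        .setPrefixQuery(
            PrefixQuery.newBuilder()
                .setField("status") // assumed ATOM field name
                .setPrefix("act") // matches terms such as "active"
                .build())
        .build();
  }
}
```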
@@ -0,0 +1,105 @@
/*
* Copyright 2025 Yelp Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.yelp.nrtsearch.server.luceneserver.field;

import com.yelp.nrtsearch.server.grpc.Field;
import com.yelp.nrtsearch.server.grpc.PrefixQuery;
import com.yelp.nrtsearch.server.luceneserver.analysis.PrefixWrappedAnalyzer;
import java.util.ArrayList;
import java.util.List;
import java.util.Objects;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.*;
import org.apache.lucene.util.automaton.Automata;
import org.apache.lucene.util.automaton.Automaton;
import org.apache.lucene.util.automaton.Operations;

public class PrefixFieldDef extends TextBaseFieldDef {
private final int minChars;
private final int maxChars;
private final String parentField;
private static final String INDEX_PREFIX = "._index_prefix";

public PrefixFieldDef(String parentName, Field requestField) {
super(parentName + INDEX_PREFIX, requestField);
this.minChars = requestField.getIndexPrefixes().getMinChars();
this.maxChars = requestField.getIndexPrefixes().getMaxChars();
this.parentField = parentName;
}

@Override
protected void setSearchProperties(FieldType fieldType, Field requestField) {
fieldType.setOmitNorms(true);
fieldType.setTokenized(true);
fieldType.setIndexOptions(IndexOptions.DOCS);
}

@Override
protected Analyzer parseIndexAnalyzer(Field requestField) {
Analyzer baseAnalyzer = super.parseIndexAnalyzer(requestField);
if (baseAnalyzer == null) {
throw new IllegalArgumentException("Could not determine analyzer");
}
return new PrefixWrappedAnalyzer(
baseAnalyzer,
requestField.getIndexPrefixes().getMinChars(),
requestField.getIndexPrefixes().getMaxChars());
}

boolean accept(int length) {
return length >= minChars - 1 && length <= maxChars;
}

public Query getPrefixQuery(PrefixQuery prefixQuery, MultiTermQuery.RewriteMethod rewriteMethod) {
String textValue = prefixQuery.getPrefix();
if (textValue.length() >= minChars) {
return super.getTermQueryFromTextValue(textValue);
}
List<Automaton> automata = new ArrayList<>();
automata.add(Automata.makeString(textValue));
for (int i = textValue.length(); i < minChars; i++) {
automata.add(Automata.makeAnyChar());
}
Automaton automaton = Operations.concatenate(automata);
AutomatonQuery query = new AutomatonQuery(new Term(getName(), textValue + "*"), automaton);
query.setRewriteMethod(rewriteMethod);
return new BooleanQuery.Builder()
.add(query, BooleanClause.Occur.SHOULD)
.add(new TermQuery(new Term(parentField, textValue)), BooleanClause.Occur.SHOULD)
.build();
}

@Override
public String getType() {
return "PREFIX";
}

public int getMinChars() {
return minChars;
}

public int getMaxChars() {
return maxChars;
}

@Override
public int hashCode() {
return Objects.hash(super.hashCode(), minChars, maxChars, parentField);
}
}
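A note on the short-prefix path in getPrefixQuery above, shown as a hedged sketch: the parent field name "title" and its index_prefixes settings (min_chars 2, max_chars 5) are assumptions, and the PrefixFieldDef instance is taken as given.

```java
import com.yelp.nrtsearch.server.grpc.PrefixQuery;
import com.yelp.nrtsearch.server.luceneserver.field.PrefixFieldDef;
import org.apache.lucene.search.MultiTermQuery;
import org.apache.lucene.search.Query;

public class ShortPrefixWalkthrough {
  // Hedged walkthrough: a prefix shorter than min_chars cannot be answered by the prefix
  // subfield alone, so getPrefixQuery returns a BooleanQuery with two SHOULD clauses:
  //   1) an automaton over title._index_prefix matching "a" plus exactly one more character
  //   2) an exact term query title:a on the parent field
  static Query build(PrefixFieldDef titlePrefixField, MultiTermQuery.RewriteMethod rewrite) {
    PrefixQuery shortPrefix =
        PrefixQuery.newBuilder().setField("title").setPrefix("a").build();
    return titlePrefixField.getPrefixQuery(shortPrefix, rewrite);
  }
}
```

Prefixes at least min_chars long take the fast path instead and become a plain term query on the prefix subfield; the accept() helper above suggests callers only route prefixes up to max_chars here.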