Added files for implementing index_prefix

manav113 committed Feb 9, 2025
1 parent 72b3d43 commit 47aaded
Showing 13 changed files with 2,537 additions and 1,644 deletions.
11 changes: 11 additions & 0 deletions clientlib/src/main/proto/yelp/nrtsearch/luceneserver.proto
@@ -586,8 +586,19 @@ message Field {
// For arrays of strings, ignoreAbove will be applied for each array element separately and string elements longer than ignore_above will not be indexed or stored.
// This option is also useful for protecting against Lucene’s term byte-length limit of 32766
optional int32 ignoreAbove = 36;
// Enables indexing of term prefixes to speed up prefix searches
IndexPrefixes indexPrefixes = 37;
}

// Options for indexing term prefixes for a field
message IndexPrefixes {
// The minimum prefix length to index. Must be greater than 0, and defaults to 2.
optional int32 min_chars = 1;
// The maximum prefix length to index. Must be less than 20, and defaults to 5.
optional int32 max_chars = 2;
}


/* Input to registerFields */
message FieldDefRequest {
string indexName = 1; // name of the index against which the field is to be created
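For reference, a client would enable the new option when registering a field through FieldDefRequest. The sketch below uses the generated protobuf builders from this proto; the field name, the search flag, and addField on FieldDefRequest are illustrative assumptions rather than part of this diff.

```java
import com.yelp.nrtsearch.server.grpc.Field;
import com.yelp.nrtsearch.server.grpc.FieldDefRequest;
import com.yelp.nrtsearch.server.grpc.IndexPrefixes;

public class IndexPrefixesExample {
  // Minimal sketch: build a registerFields request for a searchable field with
  // prefix indexing enabled. Only the indexPrefixes option comes from this commit;
  // the other setters are assumed from the existing Field message.
  public static FieldDefRequest buildRequest() {
    Field titleField =
        Field.newBuilder()
            .setName("title")
            .setSearch(true)
            .setIndexPrefixes(
                IndexPrefixes.newBuilder()
                    .setMinChars(2) // index term prefixes of length 2..5
                    .setMaxChars(5)
                    .build())
            .build();
    return FieldDefRequest.newBuilder()
        .setIndexName("test_index")
        .addField(titleField)
        .build();
  }
}
```

Leaving minChars and maxChars unset falls back to the documented defaults of 2 and 5.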
3,337 changes: 1,715 additions & 1,622 deletions grpc-gateway/luceneserver.pb.go

Large diffs are not rendered by default.

20 changes: 20 additions & 0 deletions grpc-gateway/luceneserver.swagger.json
@@ -3257,6 +3257,10 @@
"type": "integer",
"format": "int32",
"title": "For arrays of strings, ignoreAbove will be applied for each array element separately and string elements longer than ignore_above will not be indexed or stored.\nThis option is also useful for protecting against Lucene’s term byte-length limit of 32766"
},
"indexPrefixes": {
"$ref": "#/definitions/luceneserverIndexPrefixes",
"title": "Parameter enables the indexing of term prefixes to speed up prefix searches"
}
}
},
@@ -3849,6 +3853,22 @@
"default": "DOCS_FREQS_POSITIONS",
"description": "How the tokens should be indexed."
},
"luceneserverIndexPrefixes": {
"type": "object",
"properties": {
"minChars": {
"type": "integer",
"format": "int32",
"description": "The minimum prefix length to index. Must be greater than 0, and defaults to 2."
},
"maxChars": {
"type": "integer",
"format": "int32",
"description": "The maximum prefix length to index. Must be less than 20, and defaults to 5."
}
},
"title": "Options for including IndexPrefixes for field"
},
"luceneserverIndexSettings": {
"type": "object",
"properties": {
@@ -27,6 +27,7 @@
import com.yelp.nrtsearch.server.luceneserver.field.TextBaseFieldDef;
import com.yelp.nrtsearch.server.luceneserver.field.properties.GeoQueryable;
import com.yelp.nrtsearch.server.luceneserver.field.properties.PolygonQueryable;
+import com.yelp.nrtsearch.server.luceneserver.field.properties.PrefixQueryable;
import com.yelp.nrtsearch.server.luceneserver.field.properties.RangeQueryable;
import com.yelp.nrtsearch.server.luceneserver.field.properties.TermQueryable;
import com.yelp.nrtsearch.server.luceneserver.script.ScoreScript;
@@ -44,7 +45,6 @@
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.stream.Collectors;
import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.index.Term;
import org.apache.lucene.queries.function.FunctionMatchQuery;
import org.apache.lucene.queries.function.FunctionScoreQuery;
@@ -173,7 +173,7 @@ private Query getQueryNode(
case MATCHPHRASEPREFIXQUERY:
return MatchPhrasePrefixQuery.build(query.getMatchPhrasePrefixQuery(), state);
case PREFIXQUERY:
-        return getPrefixQuery(query.getPrefixQuery(), state);
+        return getPrefixQuery(query.getPrefixQuery(), state, false);
case CONSTANTSCOREQUERY:
return getConstantScoreQuery(query.getConstantScoreQuery(), state, docLookup);
case SPANQUERY:
@@ -597,24 +597,16 @@ private Query getExistsQuery(ExistsQuery existsQuery, IndexState state) {
return new ConstantScoreQuery(new TermQuery(new Term(IndexState.FIELD_NAMES, fieldName)));
}

-  private static Query getPrefixQuery(PrefixQuery prefixQuery, IndexState state) {
+  private static Query getPrefixQuery(
+      PrefixQuery prefixQuery, IndexState state, boolean spanQuery) {
     FieldDef fieldDef = state.getField(prefixQuery.getField());
-    if (!(fieldDef instanceof IndexableFieldDef)) {
+    if (!(fieldDef instanceof PrefixQueryable)) {
       throw new IllegalArgumentException(
-          "Field \"" + prefixQuery.getPrefix() + "\" is not indexable");
+          "Field " + fieldDef.getName() + " does not support PrefixQuery");
     }
-    IndexOptions indexOptions = ((IndexableFieldDef) fieldDef).getFieldType().indexOptions();
-    if (indexOptions == IndexOptions.NONE) {
-      throw new IllegalArgumentException(
-          "Field \"" + prefixQuery.getField() + "\" is not indexed with terms");
-    }
-
-    org.apache.lucene.search.PrefixQuery query =
-        new org.apache.lucene.search.PrefixQuery(
-            new Term(prefixQuery.getField(), prefixQuery.getPrefix()));
-    query.setRewriteMethod(
-        getRewriteMethod(prefixQuery.getRewrite(), prefixQuery.getRewriteTopTermsSize()));
-    return query;
+    MultiTermQuery.RewriteMethod rewriteMethod =
+        getRewriteMethod(prefixQuery.getRewrite(), prefixQuery.getRewriteTopTermsSize());
+    return ((PrefixQueryable) fieldDef).getPrefixQuery(prefixQuery, rewriteMethod, spanQuery);
   }

private static MultiTermQuery.RewriteMethod getRewriteMethod(
@@ -699,7 +691,7 @@ private SpanMultiTermQueryWrapper getSpanMultiTermQueryWrapper(
FuzzyQuery fuzzyQuery = getFuzzyQuery(protoSpanMultiTermQuery);
return new SpanMultiTermQueryWrapper<>(fuzzyQuery);
case PREFIXQUERY:
-        Query prefixQuery = getPrefixQuery(protoSpanMultiTermQuery.getPrefixQuery(), state);
+        Query prefixQuery = getPrefixQuery(protoSpanMultiTermQuery.getPrefixQuery(), state, true);
return new SpanMultiTermQueryWrapper<>((MultiTermQuery) prefixQuery);
case REGEXPQUERY:
RegexpQuery regexpQuery = getRegexpQuery(protoSpanMultiTermQuery);
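The dispatch above goes through a new PrefixQueryable field property. Its source file is part of this commit but not rendered in this excerpt; based on the call site here and the implementation in AtomFieldDef below, the interface presumably looks roughly like the following sketch (not the committed file).

```java
package com.yelp.nrtsearch.server.luceneserver.field.properties;

import com.yelp.nrtsearch.server.grpc.PrefixQuery;
import org.apache.lucene.search.MultiTermQuery;
import org.apache.lucene.search.Query;

/**
 * Sketch of the property interface for field types that can build a Lucene query from the gRPC
 * PrefixQuery message; the signature is inferred from its usage in getPrefixQuery above.
 */
public interface PrefixQueryable {
  Query getPrefixQuery(
      PrefixQuery prefixQuery, MultiTermQuery.RewriteMethod rewriteMethod, boolean spanQuery);
}
```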
@@ -0,0 +1,64 @@
/*
* Copyright 2025 Yelp Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.yelp.nrtsearch.server.luceneserver.analysis;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.AnalyzerWrapper;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.ngram.EdgeNGramTokenFilter;

/**
* An {@link AnalyzerWrapper} that wraps another analyzer and applies an Edge N-Gram token filter to
* the token stream.
*/
public class PrefixWrappedAnalyzer extends AnalyzerWrapper {
private final int minChars;
private final int maxChars;
private final Analyzer delegate;

/**
* Create a new {@link PrefixWrappedAnalyzer} that wraps the given {@link Analyzer} and applies an
* Edge N-Gram token filter to the token stream.
*
* @param delegate the analyzer to wrap
* @param minChars the minimum number of characters for the edge n-grams
* @param maxChars the maximum number of characters for the edge n-grams
*/
public PrefixWrappedAnalyzer(Analyzer delegate, int minChars, int maxChars) {
super(delegate.getReuseStrategy());
this.delegate = delegate;
this.minChars = minChars;
this.maxChars = maxChars;
}

@Override
protected Analyzer getWrappedAnalyzer(String fieldName) {
return delegate;
}

@Override
protected TokenStreamComponents wrapComponents(
String fieldName, TokenStreamComponents components) {
TokenFilter filter =
new EdgeNGramTokenFilter(components.getTokenStream(), minChars, maxChars, false);
return new TokenStreamComponents(components.getSource(), filter);
}

@Override
public String toString() {
return "PrefixWrappedAnalyzer(" + delegate.toString() + ")";
}
}
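To see what the wrapper emits, here is a small usage sketch. The StandardAnalyzer delegate, the field name, and the input string are assumptions; the only class from this commit is PrefixWrappedAnalyzer.

```java
import com.yelp.nrtsearch.server.luceneserver.analysis.PrefixWrappedAnalyzer;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class PrefixWrappedAnalyzerDemo {
  public static void main(String[] args) throws Exception {
    // Wrap a StandardAnalyzer with 2..5 character edge n-grams.
    Analyzer analyzer = new PrefixWrappedAnalyzer(new StandardAnalyzer(), 2, 5);
    try (TokenStream stream = analyzer.tokenStream("title._index_prefix", "search")) {
      CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
      stream.reset();
      while (stream.incrementToken()) {
        System.out.println(term.toString()); // prints: se, sea, sear, searc
      }
      stream.end();
    }
  }
}
```

Because the filter is built with preserveOriginal set to false, the full term itself is not emitted by the prefix analyzer, and tokens shorter than minChars produce no output at all.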
@@ -18,19 +18,24 @@
import static com.yelp.nrtsearch.server.luceneserver.analysis.AnalyzerCreator.hasAnalyzer;

import com.yelp.nrtsearch.server.grpc.Field;
+import com.yelp.nrtsearch.server.grpc.PrefixQuery;
import com.yelp.nrtsearch.server.grpc.SortType;
+import com.yelp.nrtsearch.server.luceneserver.field.properties.PrefixQueryable;
import com.yelp.nrtsearch.server.luceneserver.field.properties.Sortable;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.core.KeywordAnalyzer;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.index.DocValuesType;
import org.apache.lucene.index.IndexOptions;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.search.MultiTermQuery;
+import org.apache.lucene.search.Query;
import org.apache.lucene.search.SortField;
import org.apache.lucene.search.SortedSetSortField;
import org.apache.lucene.util.BytesRef;

/** Field class for 'ATOM' field type. Uses {@link KeywordAnalyzer} for text analysis. */
-public class AtomFieldDef extends TextBaseFieldDef implements Sortable {
+public class AtomFieldDef extends TextBaseFieldDef implements Sortable, PrefixQueryable {
private static final Analyzer keywordAnalyzer = new KeywordAnalyzer();

public AtomFieldDef(String name, Field requestField) {
@@ -145,4 +150,22 @@ public SortField getSortField(SortType type) {
}
return sortField;
}

@Override
public Query getPrefixQuery(
PrefixQuery prefixQuery, MultiTermQuery.RewriteMethod rewriteMethod, boolean spanQuery) {
verifySearchable("Prefix query");
org.apache.lucene.search.PrefixQuery query =
new org.apache.lucene.search.PrefixQuery(
new Term(prefixQuery.getField(), prefixQuery.getPrefix()));
query.setRewriteMethod(rewriteMethod);
return query;
}

protected void verifySearchable(String featureName) {
if (!isSearchable()) {
throw new IllegalStateException(
featureName + " requires field to be searchable: " + getName());
}
}
}
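For completeness, the gRPC query handled by the new override could be built roughly as follows. The field name and prefix are illustrative; setPrefixQuery on the Query builder is assumed from the getPrefixQuery accessor used in the mapper above.

```java
import com.yelp.nrtsearch.server.grpc.PrefixQuery;
import com.yelp.nrtsearch.server.grpc.Query;

public class PrefixQueryRequestExample {
  // Sketch: a prefix query against a searchable ATOM field. The server routes it through
  // getPrefixQuery in the query mapper, which now delegates to the field's PrefixQueryable
  // implementation (AtomFieldDef above).
  public static Query buildQuery() {
    return Query.newBuilder()
        .setPrefixQuery(
            PrefixQuery.newBuilder()
                .setField("status") // assumed ATOM field name
                .setPrefix("act") // matches terms such as "active"
                .build())
        .build();
  }
}
```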
@@ -0,0 +1,105 @@
/*
* Copyright 2025 Yelp Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.yelp.nrtsearch.server.luceneserver.field;

import com.yelp.nrtsearch.server.grpc.Field;
import com.yelp.nrtsearch.server.grpc.PrefixQuery;
import com.yelp.nrtsearch.server.luceneserver.analysis.PrefixWrappedAnalyzer;
import java.util.ArrayList;
import java.util.List;
import java.util.Objects;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.*;
import org.apache.lucene.util.automaton.Automata;
import org.apache.lucene.util.automaton.Automaton;
import org.apache.lucene.util.automaton.Operations;

public class PrefixFieldDef extends TextBaseFieldDef {
private final int minChars;
private final int maxChars;
private final String parentField;
private static final String INDEX_PREFIX = "._index_prefix";

public PrefixFieldDef(String parentName, Field requestField) {
super(parentName + INDEX_PREFIX, requestField);
this.minChars = requestField.getIndexPrefixes().getMinChars();
this.maxChars = requestField.getIndexPrefixes().getMaxChars();
this.parentField = parentName;
}

@Override
protected void setSearchProperties(FieldType fieldType, Field requestField) {
fieldType.setOmitNorms(true);
fieldType.setTokenized(true);
fieldType.setIndexOptions(IndexOptions.DOCS);
}

@Override
protected Analyzer parseIndexAnalyzer(Field requestField) {
Analyzer baseAnalyzer = super.parseIndexAnalyzer(requestField);
if (baseAnalyzer == null) {
throw new IllegalArgumentException("Could not determine analyzer");
}
return new PrefixWrappedAnalyzer(
baseAnalyzer,
requestField.getIndexPrefixes().getMinChars(),
requestField.getIndexPrefixes().getMaxChars());
}

boolean accept(int length) {
return length >= minChars - 1 && length <= maxChars;
}

public Query getPrefixQuery(PrefixQuery prefixQuery, MultiTermQuery.RewriteMethod rewriteMethod) {
String textValue = prefixQuery.getPrefix();
if (textValue.length() >= minChars) {
return super.getTermQueryFromTextValue(textValue);
}
List<Automaton> automata = new ArrayList<>();
automata.add(Automata.makeString(textValue));
for (int i = textValue.length(); i < minChars; i++) {
automata.add(Automata.makeAnyChar());
}
Automaton automaton = Operations.concatenate(automata);
AutomatonQuery query = new AutomatonQuery(new Term(getName(), textValue + "*"), automaton);
query.setRewriteMethod(rewriteMethod);
return new BooleanQuery.Builder()
.add(query, BooleanClause.Occur.SHOULD)
.add(new TermQuery(new Term(parentField, textValue)), BooleanClause.Occur.SHOULD)
.build();
}

@Override
public String getType() {
return "PREFIX";
}

public int getMinChars() {
return minChars;
}

public int getMaxChars() {
return maxChars;
}

@Override
public int hashCode() {
return Objects.hash(super.hashCode(), minChars, maxChars, parentField);
}
}
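A note on the short-prefix path in getPrefixQuery above, shown as a hedged sketch: the parent field name "title" and its index_prefixes settings (min_chars 2, max_chars 5) are assumptions, and the PrefixFieldDef instance is taken as given.

```java
import com.yelp.nrtsearch.server.grpc.PrefixQuery;
import com.yelp.nrtsearch.server.luceneserver.field.PrefixFieldDef;
import org.apache.lucene.search.MultiTermQuery;
import org.apache.lucene.search.Query;

public class ShortPrefixWalkthrough {
  // Hedged walkthrough: a prefix shorter than min_chars cannot be answered by the prefix
  // subfield alone, so getPrefixQuery returns a BooleanQuery with two SHOULD clauses:
  //   1) an automaton over title._index_prefix matching "a" plus exactly one more character
  //   2) an exact term query title:a on the parent field
  static Query build(PrefixFieldDef titlePrefixField, MultiTermQuery.RewriteMethod rewrite) {
    PrefixQuery shortPrefix =
        PrefixQuery.newBuilder().setField("title").setPrefix("a").build();
    return titlePrefixField.getPrefixQuery(shortPrefix, rewrite);
  }
}
```

Prefixes at least min_chars long take the fast path instead and become a plain term query on the prefix subfield; the accept() helper above suggests callers only route prefixes up to max_chars here.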