Skip to content

Commit

Permalink
SynonymV2GraphFilterFactory and NrtsearchSynonymParser (#632)
Browse files Browse the repository at this point in the history
Added SynonymV2GraphFilterFactory and NrtsearchSynonymParser to parse inline synonyms
  • Loading branch information
swethakann authored Mar 20, 2024
1 parent 4dd5d2c commit 92a8b5b
Show file tree
Hide file tree
Showing 8 changed files with 798 additions and 0 deletions.
2 changes: 2 additions & 0 deletions docs/analysis.rst
Original file line number Diff line number Diff line change
Expand Up @@ -388,6 +388,8 @@ Available token filters:

* synonym

* synonymV2 - Similar to the ``synonymGraph`` filter, except rules are specified directly in the parameters. See `SynonymV2GraphFilterFactory <https://github.com/Yelp/nrtsearch/blob/master/src/main/java/com/yelp/nrtsearch/server/luceneserver/analysis/SynonymV2GraphFilterFactory.java>`_.

* synonymGraph

* flattenGraph
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -143,6 +143,9 @@ public static void initialize(LuceneServerConfiguration configuration, Iterable<
.collect(Collectors.toSet());
instance.registerCharFilter(
MappingV2CharFilterFactory.NAME, MappingV2CharFilterFactory.class, builtInCharFilters);
instance.registerTokenFilter(
SynonymV2GraphFilterFactory.NAME, SynonymV2GraphFilterFactory.class, builtInTokenFilters);

for (Plugin plugin : plugins) {
if (plugin instanceof AnalysisPlugin) {
AnalysisPlugin analysisPlugin = (AnalysisPlugin) plugin;
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,147 @@
/*
* Copyright 2024 Yelp Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.yelp.nrtsearch.server.luceneserver.analysis;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.Reader;
import java.text.ParseException;
import java.util.ArrayList;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.synonym.SynonymMap;
import org.apache.lucene.util.CharsRef;
import org.apache.lucene.util.CharsRefBuilder;

/**
 * Synonym parser that reads synonym mappings supplied inline as a string instead of from a file.
 *
 * <p>Every line of the input is split with the configured separator regular expression into
 * individual mappings. Each mapping is then split on commas and must yield exactly two non-empty
 * terms. A backslash escapes the character that follows it, so a comma can be part of a term when
 * written as {@code \,}.
 */
class NrtsearchSynonymParser extends SynonymMap.Parser {
  private static final String SYNONYM_MAPPING_SEPARATOR = ",";

  private final boolean expand;
  private final String synonymsSeparator;

  /**
   * This is a nrtsearch parser that extends SynonymMap.Parser to parse synonyms provided inline in
   * a string instead of a file
   *
   * @param synonymsSeparator pattern used to split the synonym mappings
   * @param dedup set to true to dedup duplicate synonym mappings
   * @param expand set to true to map synonyms both ways
   * @param analyzer analyzer for the synonyms
   */
  public NrtsearchSynonymParser(
      String synonymsSeparator, boolean dedup, boolean expand, Analyzer analyzer) {
    super(dedup, analyzer);
    this.expand = expand;
    this.synonymsSeparator = synonymsSeparator;
  }

  /**
   * Reads all lines from the given reader and registers the synonym mappings found on each line.
   * The reader is closed once it has been consumed (or when parsing fails).
   *
   * @param mappings source of the inline synonym mappings
   * @throws IOException if reading or analyzing the mappings fails
   * @throws IllegalArgumentException if a mapping does not consist of exactly two terms
   */
  @Override
  public void parse(Reader mappings) throws IOException, ParseException {
    // try-with-resources so the wrapped reader is released even when a mapping is rejected.
    try (BufferedReader bufferedReader = new BufferedReader(mappings)) {
      String line;
      while ((line = bufferedReader.readLine()) != null) {
        // The separator is interpreted as a regular expression by String.split.
        addInternal(line.split(synonymsSeparator));
      }
    }
  }

  /**
   * Registers each of the given mappings with the underlying synonym map builder.
   *
   * <p>When {@code expand} is set, the two terms of a mapping are added in both directions with
   * the include-original flag enabled. Otherwise every term of the mapping (including the first
   * one itself) is added as an input for the first term with the flag disabled.
   *
   * @param synonyms mappings of the form {@code from,to}
   * @throws IOException if analyzing a term fails
   * @throws IllegalArgumentException if a mapping does not split into exactly two terms
   */
  public void addInternal(String[] synonyms) throws IOException {
    for (String synonym : synonyms) {
      String[] inputStrings = split(synonym, SYNONYM_MAPPING_SEPARATOR);
      if (inputStrings.length != 2) {
        throw new IllegalArgumentException("synonym mapping is invalid for " + synonym);
      }

      // Escapes are resolved and surrounding whitespace removed before the term is analyzed.
      CharsRef[] inputs = new CharsRef[inputStrings.length];
      for (int i = 0; i < inputs.length; ++i) {
        inputs[i] = analyze(unescape(inputStrings[i]).trim(), new CharsRefBuilder());
      }

      if (expand) {
        for (int i = 0; i < inputs.length; ++i) {
          for (int j = 0; j < inputs.length; ++j) {
            if (i != j) {
              add(inputs[i], inputs[j], true);
            }
          }
        }
      } else {
        for (int i = 0; i < inputs.length; ++i) {
          add(inputs[i], inputs[0], false);
        }
      }
    }
  }

  /**
   * Splits {@code s} on every unescaped occurrence of {@code separator}. Empty pieces are dropped
   * and backslash escapes are kept verbatim in the returned pieces so they can be resolved later.
   */
  private static String[] split(String s, String separator) {
    ArrayList<String> list = new ArrayList<>(2);
    StringBuilder sb = new StringBuilder();
    int pos = 0;
    int end = s.length();

    while (pos < end) {
      if (s.startsWith(separator, pos)) {
        if (sb.length() > 0) {
          list.add(sb.toString());
          sb = new StringBuilder();
        }
        pos += separator.length();
      } else {
        char ch = s.charAt(pos++);
        if (ch == '\\') {
          // Keep the backslash and copy the escaped character without testing it as a separator.
          sb.append(ch);
          if (pos >= end) {
            break;
          }
          ch = s.charAt(pos++);
        }
        sb.append(ch);
      }
    }

    if (sb.length() > 0) {
      list.add(sb.toString());
    }

    return list.toArray(new String[0]);
  }

  /**
   * Removes escaping backslashes: each backslash that is followed by another character is dropped
   * and that character is kept. A trailing lone backslash is preserved.
   */
  private static String unescape(String s) {
    if (s.indexOf('\\') < 0) {
      return s;
    }

    StringBuilder sb = new StringBuilder();
    for (int i = 0; i < s.length(); ++i) {
      char ch = s.charAt(i);
      if (ch == '\\' && i < s.length() - 1) {
        ++i;
        sb.append(s.charAt(i));
      } else {
        sb.append(ch);
      }
    }
    return sb.toString();
  }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,108 @@
/*
* Copyright 2024 Yelp Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.yelp.nrtsearch.server.luceneserver.analysis;

import java.io.IOException;
import java.io.StringReader;
import java.text.ParseException;
import java.util.Map;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.synonym.SynonymGraphFilter;
import org.apache.lucene.analysis.synonym.SynonymMap;
import org.apache.lucene.analysis.util.TokenFilterFactory;

/**
 * Implementation of a {@link TokenFilterFactory} that allows for loading synonym mappings. Unlike
 * the lucene provided {@link org.apache.lucene.analysis.synonym.SynonymGraphFilterFactory}, this
 * one lets you specify the synonyms inline as a parameter (instead of within a file).
 *
 * <p>Synonyms must be specified in the 'synonyms' parameter string. This value is split into
 * individual synonym mappings using a separator pattern, which defaults to '|' surrounded by
 * optional whitespace. The pattern may be changed by giving a 'separator_pattern' param.
 *
 * <p>Each synonym mapping must be a comma separated pair of the form synonym_from,synonym_to, for
 * example:
 *
 * <p>a,b
 *
 * <p>a,b|c,d
 *
 * <p>Other supported parameters:
 *
 * <ul>
 *   <li>'ignoreCase' (default false): passed to the created {@link SynonymGraphFilter}, and makes
 *       the default analyzer lower case the synonym terms
 *   <li>'expand' (default true) and 'dedup' (default true): passed to the synonym parser
 *   <li>'parserFormat' (default 'nrtsearch'): only 'nrtsearch' is accepted
 *   <li>'analyzerName': name of a predefined analyzer used to analyze the synonym terms; when not
 *       given, terms are tokenized on whitespace
 * </ul>
 */
public class SynonymV2GraphFilterFactory extends TokenFilterFactory {
  /** SPI name */
  public static final String NAME = "synonymV2";

  public static final String SYNONYMS = "synonyms";
  public static final String SYNONYM_SEPARATOR_PATTERN = "separator_pattern";
  public static final String DEFAULT_SYNONYM_SEPARATOR_PATTERN = "\\s*\\|\\s*";
  public final boolean ignoreCase;
  protected SynonymMap synonymMap;

  /**
   * Builds the synonym map from the inline mappings given in the factory parameters.
   *
   * @param args factory parameters, see the class documentation
   * @throws IOException if parsing or analyzing the synonym mappings fails
   * @throws ParseException declared by the synonym parser
   * @throws IllegalArgumentException if the 'synonyms' parameter is missing or the parser format
   *     is not 'nrtsearch'
   */
  public SynonymV2GraphFilterFactory(Map<String, String> args) throws IOException, ParseException {
    super(args);
    String synonymMappings = args.get(SYNONYMS);
    String separatorPattern =
        args.getOrDefault(SYNONYM_SEPARATOR_PATTERN, DEFAULT_SYNONYM_SEPARATOR_PATTERN);

    this.ignoreCase = getBoolean(args, "ignoreCase", false);
    boolean expand = getBoolean(args, "expand", true);
    boolean dedup = getBoolean(args, "dedup", true);
    String parserFormat = args.getOrDefault("parserFormat", "nrtsearch");
    String analyzerName = args.get("analyzerName");

    if (synonymMappings == null) {
      throw new IllegalArgumentException("Synonym mappings must be specified");
    }

    if (!parserFormat.equals("nrtsearch")) {
      throw new IllegalArgumentException(
          "The parser format: " + parserFormat + " is not valid. It should be nrtsearch");
    }

    if (analyzerName == null) {
      // The default analyzer is created here and only needed while parsing, so release it
      // as soon as the synonym map has been built.
      try (Analyzer analyzer = newDefaultAnalyzer(ignoreCase)) {
        synonymMap = buildSynonymMap(synonymMappings, separatorPattern, dedup, expand, analyzer);
      }
    } else {
      // NOTE(review): ownership of analyzers returned by AnalyzerCreator is not visible from this
      // class, so this analyzer is intentionally not closed here - confirm whether it should be.
      Analyzer analyzer =
          AnalyzerCreator.getInstance().getAnalyzer(getPredefinedAnalyzer(analyzerName));
      synonymMap = buildSynonymMap(synonymMappings, separatorPattern, dedup, expand, analyzer);
    }
  }

  /**
   * Wraps the input in a {@link SynonymGraphFilter}, or returns the input unchanged when the
   * synonym map has no FST.
   */
  @Override
  public TokenStream create(TokenStream input) {
    if (synonymMap.fst == null) {
      return input;
    }
    return new SynonymGraphFilter(input, synonymMap, ignoreCase);
  }

  /** Parses the inline mappings with a {@link NrtsearchSynonymParser} and builds the map. */
  private static SynonymMap buildSynonymMap(
      String synonymMappings,
      String separatorPattern,
      boolean dedup,
      boolean expand,
      Analyzer analyzer)
      throws IOException, ParseException {
    SynonymMap.Parser parser =
        new NrtsearchSynonymParser(separatorPattern, dedup, expand, analyzer);
    parser.parse(new StringReader(synonymMappings));
    return parser.build();
  }

  /** Creates an analyzer that tokenizes on whitespace and optionally lower cases the tokens. */
  private static Analyzer newDefaultAnalyzer(boolean lowerCase) {
    return new Analyzer() {
      @Override
      protected TokenStreamComponents createComponents(String fieldName) {
        Tokenizer tokenizer = new WhitespaceTokenizer();
        TokenStream stream = lowerCase ? new LowerCaseFilter(tokenizer) : tokenizer;
        return new TokenStreamComponents(tokenizer, stream);
      }
    };
  }

  /** Builds the grpc analyzer definition referring to a predefined analyzer by name. */
  private com.yelp.nrtsearch.server.grpc.Analyzer getPredefinedAnalyzer(String analyzerName) {
    return com.yelp.nrtsearch.server.grpc.Analyzer.newBuilder().setPredefined(analyzerName).build();
  }
}
Loading

0 comments on commit 92a8b5b

Please sign in to comment.