From 35bf0b4c265c73629b14166ccfce54589959518f Mon Sep 17 00:00:00 2001 From: swethakann Date: Fri, 15 Mar 2024 09:21:12 -0700 Subject: [PATCH 1/3] SynonymV2GraphFilterFactory POC code --- .../analysis/NrtsearchSynonymParser.java | 137 ++++++++++++++++++ .../analysis/SynonymV2GraphFilterFactory.java | 76 ++++++++++ .../analysis/NrtsearchSynonymParserTest.java | 80 ++++++++++ .../SynonymV2GraphFilterFactoryTest.java | 82 +++++++++++ 4 files changed, 375 insertions(+) create mode 100644 src/main/java/com/yelp/nrtsearch/server/luceneserver/analysis/NrtsearchSynonymParser.java create mode 100644 src/main/java/com/yelp/nrtsearch/server/luceneserver/analysis/SynonymV2GraphFilterFactory.java create mode 100644 src/test/java/com/yelp/nrtsearch/server/luceneserver/analysis/NrtsearchSynonymParserTest.java create mode 100644 src/test/java/com/yelp/nrtsearch/server/luceneserver/analysis/SynonymV2GraphFilterFactoryTest.java diff --git a/src/main/java/com/yelp/nrtsearch/server/luceneserver/analysis/NrtsearchSynonymParser.java b/src/main/java/com/yelp/nrtsearch/server/luceneserver/analysis/NrtsearchSynonymParser.java new file mode 100644 index 000000000..dc3f7ec22 --- /dev/null +++ b/src/main/java/com/yelp/nrtsearch/server/luceneserver/analysis/NrtsearchSynonymParser.java @@ -0,0 +1,137 @@ +/* + * Copyright 2024 Yelp Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package com.yelp.nrtsearch.server.luceneserver.analysis; + +import java.io.BufferedReader; +import java.io.IOException; +import java.io.Reader; +import java.text.ParseException; +import java.util.ArrayList; +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.synonym.SynonymMap; +import org.apache.lucene.util.CharsRef; +import org.apache.lucene.util.CharsRefBuilder; + +class NrtsearchSynonymParser extends SynonymMap.Parser { + private final boolean expand; + private static final String SYNONYMS_SEPARATOR = "\\|"; + private static final String SYNONYM_MAPPING_SEPARATOR = ","; + + public NrtsearchSynonymParser(boolean dedup, boolean expand, Analyzer analyzer) { + super(dedup, analyzer); + this.expand = expand; + } + + @Override + public void parse(Reader mappings) throws IOException, ParseException { + BufferedReader bufferedReader = new BufferedReader(mappings); + String line; + while ((line = bufferedReader.readLine()) != null) { + // Splitting the mappings based on "|" + String[] synonyms = line.split(SYNONYMS_SEPARATOR); + this.addInternal(synonyms); + } + } + + public void addInternal(String[] synonyms) throws IOException { + String[] inputStrings; + CharsRef[] inputs; + int i; + + for (String synonym : synonyms) { + inputStrings = split(synonym, SYNONYM_MAPPING_SEPARATOR); + + if (inputStrings.length != 2) { + throw new IllegalArgumentException("synonym mapping is invalid for " + synonym); + } + inputs = new CharsRef[inputStrings.length]; + + for (i = 0; i < inputs.length; ++i) { + inputs[i] = this.analyze(this.unescape(inputStrings[i]).trim(), new CharsRefBuilder()); + } + + if (!this.expand) { + for (i = 0; i < inputs.length; ++i) { + this.add(inputs[i], inputs[0], false); + } + } else { + for (i = 0; i < inputs.length; ++i) { + for (int j = 0; j < inputs.length; ++j) { + if (i != j) { + this.add(inputs[i], inputs[j], true); + } + } + } + } + } + } + + private static String[] split(String s, String separator) { + ArrayList 
list = new ArrayList(2); + StringBuilder sb = new StringBuilder(); + int pos = 0; + int end = s.length(); + + while (pos < end) { + if (s.startsWith(separator, pos)) { + if (sb.length() > 0) { + list.add(sb.toString()); + sb = new StringBuilder(); + } + + pos += separator.length(); + } else { + char ch = s.charAt(pos++); + if (ch == '\\') { + sb.append(ch); + if (pos >= end) { + break; + } + + ch = s.charAt(pos++); + } + + sb.append(ch); + } + } + + if (sb.length() > 0) { + list.add(sb.toString()); + } + + return list.toArray(new String[list.size()]); + } + + private String unescape(String s) { + if (s.indexOf("\\") < 0) { + return s; + } else { + StringBuilder sb = new StringBuilder(); + + for (int i = 0; i < s.length(); ++i) { + char ch = s.charAt(i); + if (ch == '\\' && i < s.length() - 1) { + ++i; + sb.append(s.charAt(i)); + } else { + sb.append(ch); + } + } + + return sb.toString(); + } + } +} diff --git a/src/main/java/com/yelp/nrtsearch/server/luceneserver/analysis/SynonymV2GraphFilterFactory.java b/src/main/java/com/yelp/nrtsearch/server/luceneserver/analysis/SynonymV2GraphFilterFactory.java new file mode 100644 index 000000000..91eebf0fe --- /dev/null +++ b/src/main/java/com/yelp/nrtsearch/server/luceneserver/analysis/SynonymV2GraphFilterFactory.java @@ -0,0 +1,76 @@ +/* + * Copyright 2024 Yelp Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package com.yelp.nrtsearch.server.luceneserver.analysis; + +import java.io.IOException; +import java.io.StringReader; +import java.text.ParseException; +import java.util.Map; +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.synonym.SolrSynonymParser; +import org.apache.lucene.analysis.synonym.SynonymGraphFilter; +import org.apache.lucene.analysis.synonym.SynonymMap; +import org.apache.lucene.analysis.synonym.WordnetSynonymParser; +import org.apache.lucene.analysis.util.TokenFilterFactory; + +public class SynonymV2GraphFilterFactory extends TokenFilterFactory { + public static final String MAPPINGS = "mappings"; + public final boolean ignoreCase; + protected SynonymMap synonymMap; + + public SynonymV2GraphFilterFactory(Map args, Analyzer analyzer) + throws IOException, ParseException { + super(args); + this.ignoreCase = getBoolean(args, "ignoreCase", false); + boolean expand = getBoolean(args, "expand", true); + String parserFormat = args.get("parserFormat"); + String synonymMappings = args.get(MAPPINGS); + if (synonymMappings == null) { + throw new IllegalArgumentException("Synonym mappings must be specified"); + } + if (parserFormat == null) { + throw new IllegalArgumentException("Parser format must be specified"); + } + synonymMap = loadSynonymsFromString(synonymMappings, parserFormat, expand, analyzer); + } + + @Override + public TokenStream create(TokenStream input) { + return new SynonymGraphFilter(input, synonymMap, ignoreCase); + } + + public SynonymMap loadSynonymsFromString( + String synonymMappings, String parserFormat, boolean expand, Analyzer analyzer) + throws IOException, ParseException { + SynonymMap.Parser parser; + + if (parserFormat.equals("solr")) { + parser = new SolrSynonymParser(true, expand, analyzer); + } else if (parserFormat.equals("wordnet")) { + parser = new WordnetSynonymParser(true, expand, analyzer); + } else if (parserFormat.equals("nrtsearch")) { + 
parser = new NrtsearchSynonymParser(true, expand, analyzer); + } else { + throw new IllegalArgumentException( + "The parser format: " + + parserFormat + + " is not valid. It should be solr, wordnet or nrtsearch"); + } + parser.parse(new StringReader(synonymMappings)); + return parser.build(); + } +} diff --git a/src/test/java/com/yelp/nrtsearch/server/luceneserver/analysis/NrtsearchSynonymParserTest.java b/src/test/java/com/yelp/nrtsearch/server/luceneserver/analysis/NrtsearchSynonymParserTest.java new file mode 100644 index 000000000..0636f13a0 --- /dev/null +++ b/src/test/java/com/yelp/nrtsearch/server/luceneserver/analysis/NrtsearchSynonymParserTest.java @@ -0,0 +1,80 @@ +/* + * Copyright 2024 Yelp Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package com.yelp.nrtsearch.server.luceneserver.analysis; + +import static org.apache.lucene.analysis.BaseTokenStreamTestCase.assertAnalyzesTo; + +import com.carrotsearch.randomizedtesting.RandomizedRunner; +import java.io.IOException; +import java.io.StringReader; +import java.text.ParseException; +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.MockAnalyzer; +import org.apache.lucene.analysis.MockTokenizer; +import org.apache.lucene.analysis.Tokenizer; +import org.apache.lucene.analysis.synonym.SynonymGraphFilter; +import org.apache.lucene.analysis.synonym.SynonymMap; +import org.apache.lucene.util.LuceneTestCase; +import org.junit.runner.RunWith; + +@RunWith(RandomizedRunner.class) +public class NrtsearchSynonymParserTest extends LuceneTestCase { + + public void testParse() throws IOException, ParseException { + Analyzer analyzer = new MockAnalyzer(random()); + NrtsearchSynonymParser parser = + new NrtsearchSynonymParser(Boolean.TRUE, Boolean.TRUE, analyzer); + String synonyms = + "a, b|ix, pie-ix|plaza, pla\\xE7a|plaza, plz|str, strada|str, strasse|str, stra\\xDFe|village, vlg"; + parser.parse(new StringReader(synonyms)); + final SynonymMap map = parser.build(); + analyzer.close(); + + analyzer = + new Analyzer() { + @Override + protected TokenStreamComponents createComponents(String fieldName) { + Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, true); + return new TokenStreamComponents( + tokenizer, new SynonymGraphFilter(tokenizer, map, true)); + } + }; + + assertAnalyzesTo(analyzer, "a", new String[] {"b", "a"}, new int[] {1, 0}); + assertAnalyzesTo( + analyzer, "plaza", new String[] {"plaxe7a", "plz", "plaza"}, new int[] {1, 0, 0}); + assertAnalyzesTo( + analyzer, + "str", + new String[] {"strada", "strasse", "straxdfe", "str"}, + new int[] {1, 0, 0, 0}); + + analyzer.close(); + } + + public void testInvalidMappings() { + Analyzer analyzer = new MockAnalyzer(random()); + NrtsearchSynonymParser parser = + 
new NrtsearchSynonymParser(Boolean.TRUE, Boolean.TRUE, analyzer); + String synonyms = "a, b, c, d, e"; + expectThrows( + IllegalArgumentException.class, + () -> { + parser.parse(new StringReader(synonyms)); + }); + analyzer.close(); + } +} diff --git a/src/test/java/com/yelp/nrtsearch/server/luceneserver/analysis/SynonymV2GraphFilterFactoryTest.java b/src/test/java/com/yelp/nrtsearch/server/luceneserver/analysis/SynonymV2GraphFilterFactoryTest.java new file mode 100644 index 000000000..acdbb9a4a --- /dev/null +++ b/src/test/java/com/yelp/nrtsearch/server/luceneserver/analysis/SynonymV2GraphFilterFactoryTest.java @@ -0,0 +1,82 @@ +/* + * Copyright 2024 Yelp Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package com.yelp.nrtsearch.server.luceneserver.analysis; + +import static org.apache.lucene.util.LuceneTestCase.random; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.fail; + +import com.carrotsearch.randomizedtesting.RandomizedRunner; +import java.io.IOException; +import java.io.StringReader; +import java.text.ParseException; +import java.util.HashMap; +import java.util.Map; +import org.apache.lucene.analysis.*; +import org.apache.lucene.analysis.standard.StandardAnalyzer; +import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; +import org.apache.lucene.util.LuceneTestCase; +import org.junit.runner.RunWith; + +@RunWith(RandomizedRunner.class) +public class SynonymV2GraphFilterFactoryTest extends LuceneTestCase { + public void testNoMappings() throws IOException, ParseException { + Analyzer analyzer = new MockAnalyzer(random()); + try { + new SynonymV2GraphFilterFactory(new HashMap<>(), analyzer); + fail(); + } catch (IllegalArgumentException e) { + assertEquals("Synonym mappings must be specified", e.getMessage()); + } + } + + public void testSingleMapping() throws IOException, ParseException { + SynonymV2GraphFilterFactory synonymV2GraphFilterFactory = getFactory("a, b"); + TokenStream tokenStream = + new StandardAnalyzer().tokenStream("field", new StringReader("this is a test string")); + String[] expectedTokens = {"this", "is", "b", "a", "test", "string"}; + assertTokenStream(synonymV2GraphFilterFactory, tokenStream, expectedTokens); + } + + private static void assertTokenStream( + SynonymV2GraphFilterFactory synonymV2GraphFilterFactory, + TokenStream tokenStream, + String[] expectedTokens) { + try { + TokenStream output = synonymV2GraphFilterFactory.create(tokenStream); + CharTermAttribute charTermAtt = output.addAttribute(CharTermAttribute.class); + int i = 0; + output.reset(); + while (output.incrementToken()) { + assertEquals(expectedTokens[i], charTermAtt.toString()); + i += 1; + } + output.end(); + 
output.close(); + } catch (IOException e) { + throw new RuntimeException(e); + } + } + + private SynonymV2GraphFilterFactory getFactory(String mappings) + throws IOException, ParseException { + Analyzer analyzer = new MockAnalyzer(random()); + Map params = new HashMap<>(); + params.put(SynonymV2GraphFilterFactory.MAPPINGS, mappings); + params.put("parserFormat", "nrtsearch"); + return new SynonymV2GraphFilterFactory(params, analyzer); + } +} From 5ec98eff113ec901dd0b64861f9e59dd58c3bd31 Mon Sep 17 00:00:00 2001 From: swethakann Date: Tue, 19 Mar 2024 08:58:19 -0700 Subject: [PATCH 2/3] Adding SynonymV2GraphFilterFactory and NrtsearchSynonymParser --- .../analysis/AnalyzerCreator.java | 3 + .../analysis/NrtsearchSynonymParser.java | 9 +- .../analysis/SynonymV2GraphFilterFactory.java | 84 ++++++++--- .../analysis/NrtsearchSynonymParserTest.java | 95 ++++++++++--- .../SynonymV2GraphFilterFactoryITest.java | 105 ++++++++++++++ .../SynonymV2GraphFilterFactoryTest.java | 133 ++++++++++++++++-- .../registerFieldsSynonymTokenFilter.json | 38 +++++ 7 files changed, 417 insertions(+), 50 deletions(-) create mode 100644 src/test/java/com/yelp/nrtsearch/server/luceneserver/analysis/SynonymV2GraphFilterFactoryITest.java create mode 100644 src/test/resources/analysis/registerFieldsSynonymTokenFilter.json diff --git a/src/main/java/com/yelp/nrtsearch/server/luceneserver/analysis/AnalyzerCreator.java b/src/main/java/com/yelp/nrtsearch/server/luceneserver/analysis/AnalyzerCreator.java index f18eead39..5281c7cf3 100644 --- a/src/main/java/com/yelp/nrtsearch/server/luceneserver/analysis/AnalyzerCreator.java +++ b/src/main/java/com/yelp/nrtsearch/server/luceneserver/analysis/AnalyzerCreator.java @@ -143,6 +143,9 @@ public static void initialize(LuceneServerConfiguration configuration, Iterable< .collect(Collectors.toSet()); instance.registerCharFilter( MappingV2CharFilterFactory.NAME, MappingV2CharFilterFactory.class, builtInCharFilters); + instance.registerTokenFilter( + 
SynonymV2GraphFilterFactory.NAME, SynonymV2GraphFilterFactory.class, builtInTokenFilters); + for (Plugin plugin : plugins) { if (plugin instanceof AnalysisPlugin) { AnalysisPlugin analysisPlugin = (AnalysisPlugin) plugin; diff --git a/src/main/java/com/yelp/nrtsearch/server/luceneserver/analysis/NrtsearchSynonymParser.java b/src/main/java/com/yelp/nrtsearch/server/luceneserver/analysis/NrtsearchSynonymParser.java index dc3f7ec22..05a9322d0 100644 --- a/src/main/java/com/yelp/nrtsearch/server/luceneserver/analysis/NrtsearchSynonymParser.java +++ b/src/main/java/com/yelp/nrtsearch/server/luceneserver/analysis/NrtsearchSynonymParser.java @@ -27,12 +27,14 @@ class NrtsearchSynonymParser extends SynonymMap.Parser { private final boolean expand; - private static final String SYNONYMS_SEPARATOR = "\\|"; + private final String synonymsSeparator; private static final String SYNONYM_MAPPING_SEPARATOR = ","; - public NrtsearchSynonymParser(boolean dedup, boolean expand, Analyzer analyzer) { + public NrtsearchSynonymParser( + String synonymsSeparator, boolean dedup, boolean expand, Analyzer analyzer) { super(dedup, analyzer); this.expand = expand; + this.synonymsSeparator = synonymsSeparator; } @Override @@ -40,8 +42,7 @@ public void parse(Reader mappings) throws IOException, ParseException { BufferedReader bufferedReader = new BufferedReader(mappings); String line; while ((line = bufferedReader.readLine()) != null) { - // Splitting the mappings based on "|" - String[] synonyms = line.split(SYNONYMS_SEPARATOR); + String[] synonyms = line.split(synonymsSeparator); this.addInternal(synonyms); } } diff --git a/src/main/java/com/yelp/nrtsearch/server/luceneserver/analysis/SynonymV2GraphFilterFactory.java b/src/main/java/com/yelp/nrtsearch/server/luceneserver/analysis/SynonymV2GraphFilterFactory.java index 91eebf0fe..3c49ecb1e 100644 --- a/src/main/java/com/yelp/nrtsearch/server/luceneserver/analysis/SynonymV2GraphFilterFactory.java +++ 
b/src/main/java/com/yelp/nrtsearch/server/luceneserver/analysis/SynonymV2GraphFilterFactory.java @@ -17,35 +17,62 @@ import java.io.IOException; import java.io.StringReader; +import java.lang.reflect.InvocationTargetException; +import java.text.MessageFormat; import java.text.ParseException; import java.util.Map; import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.LowerCaseFilter; import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.analysis.synonym.SolrSynonymParser; +import org.apache.lucene.analysis.Tokenizer; +import org.apache.lucene.analysis.core.WhitespaceTokenizer; import org.apache.lucene.analysis.synonym.SynonymGraphFilter; import org.apache.lucene.analysis.synonym.SynonymMap; -import org.apache.lucene.analysis.synonym.WordnetSynonymParser; import org.apache.lucene.analysis.util.TokenFilterFactory; public class SynonymV2GraphFilterFactory extends TokenFilterFactory { + /** SPI name */ + public static final String NAME = "synonymV2"; + public static final String MAPPINGS = "mappings"; + private static final String LUCENE_ANALYZER_PATH = + "org.apache.lucene.analysis.standard.{0}Analyzer"; + public static final String SYNONYM_SEPARATOR_PATTERN = "separator_pattern"; + public static final String DEFAULT_SYNONYM_SEPARATOR_PATTERN = "\\s*\\|\\s*"; public final boolean ignoreCase; protected SynonymMap synonymMap; - public SynonymV2GraphFilterFactory(Map args, Analyzer analyzer) - throws IOException, ParseException { + public SynonymV2GraphFilterFactory(Map args) throws IOException, ParseException { super(args); + String synonymMappings = args.get(MAPPINGS); + String separatorPattern = + args.getOrDefault(SYNONYM_SEPARATOR_PATTERN, DEFAULT_SYNONYM_SEPARATOR_PATTERN); + this.ignoreCase = getBoolean(args, "ignoreCase", false); boolean expand = getBoolean(args, "expand", true); - String parserFormat = args.get("parserFormat"); - String synonymMappings = args.get(MAPPINGS); + String parserFormat = 
args.getOrDefault("parserFormat", "nrtsearch"); + String analyzerName = args.get("analyzerName"); + if (synonymMappings == null) { throw new IllegalArgumentException("Synonym mappings must be specified"); } - if (parserFormat == null) { - throw new IllegalArgumentException("Parser format must be specified"); + + Analyzer analyzer; + if (analyzerName == null) { + analyzer = + new Analyzer() { + @Override + protected TokenStreamComponents createComponents(String fieldName) { + Tokenizer tokenizer = new WhitespaceTokenizer(); + TokenStream stream = ignoreCase ? new LowerCaseFilter(tokenizer) : tokenizer; + return new TokenStreamComponents(tokenizer, stream); + } + }; + } else { + analyzer = loadAnalyzer(analyzerName); } - synonymMap = loadSynonymsFromString(synonymMappings, parserFormat, expand, analyzer); + synonymMap = + loadSynonymsFromString(separatorPattern, synonymMappings, parserFormat, expand, analyzer); } @Override @@ -54,23 +81,42 @@ public TokenStream create(TokenStream input) { } public SynonymMap loadSynonymsFromString( - String synonymMappings, String parserFormat, boolean expand, Analyzer analyzer) + String separatorPattern, + String synonymMappings, + String parserFormat, + boolean expand, + Analyzer analyzer) throws IOException, ParseException { SynonymMap.Parser parser; - if (parserFormat.equals("solr")) { - parser = new SolrSynonymParser(true, expand, analyzer); - } else if (parserFormat.equals("wordnet")) { - parser = new WordnetSynonymParser(true, expand, analyzer); - } else if (parserFormat.equals("nrtsearch")) { - parser = new NrtsearchSynonymParser(true, expand, analyzer); + if (parserFormat.equals("nrtsearch")) { + parser = new NrtsearchSynonymParser(separatorPattern, true, expand, analyzer); } else { throw new IllegalArgumentException( - "The parser format: " - + parserFormat - + " is not valid. It should be solr, wordnet or nrtsearch"); + "The parser format: " + parserFormat + " is not valid. 
It should be nrtsearch"); } parser.parse(new StringReader(synonymMappings)); return parser.build(); } + + private Analyzer loadAnalyzer(String analyzerName) { + Analyzer analyzer; + String analyzerClassName = MessageFormat.format(LUCENE_ANALYZER_PATH, analyzerName); + try { + analyzer = + (Analyzer) + Analyzer.class + .getClassLoader() + .loadClass(analyzerClassName) + .getDeclaredConstructor() + .newInstance(); + } catch (InstantiationException + | IllegalAccessException + | NoSuchMethodException + | ClassNotFoundException + | InvocationTargetException e) { + throw new RuntimeException(e); + } + return analyzer; + } } diff --git a/src/test/java/com/yelp/nrtsearch/server/luceneserver/analysis/NrtsearchSynonymParserTest.java b/src/test/java/com/yelp/nrtsearch/server/luceneserver/analysis/NrtsearchSynonymParserTest.java index 0636f13a0..d978f6a18 100644 --- a/src/test/java/com/yelp/nrtsearch/server/luceneserver/analysis/NrtsearchSynonymParserTest.java +++ b/src/test/java/com/yelp/nrtsearch/server/luceneserver/analysis/NrtsearchSynonymParserTest.java @@ -32,43 +32,65 @@ @RunWith(RandomizedRunner.class) public class NrtsearchSynonymParserTest extends LuceneTestCase { + public final String DEFAULT_SEPARATOR_PATTERN = "\\s*\\|\\s*"; public void testParse() throws IOException, ParseException { Analyzer analyzer = new MockAnalyzer(random()); NrtsearchSynonymParser parser = - new NrtsearchSynonymParser(Boolean.TRUE, Boolean.TRUE, analyzer); + new NrtsearchSynonymParser(DEFAULT_SEPARATOR_PATTERN, Boolean.TRUE, Boolean.TRUE, analyzer); String synonyms = - "a, b|ix, pie-ix|plaza, pla\\xE7a|plaza, plz|str, strada|str, strasse|str, stra\\xDFe|village, vlg"; + "a , b|ix,pie-ix|plaza, pla|plaza, plz|str, strada |str, strasse|str, straße|village ,vlg"; parser.parse(new StringReader(synonyms)); final SynonymMap map = parser.build(); analyzer.close(); - - analyzer = - new Analyzer() { - @Override - protected TokenStreamComponents createComponents(String fieldName) { - Tokenizer 
tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, true); - return new TokenStreamComponents( - tokenizer, new SynonymGraphFilter(tokenizer, map, true)); - } - }; + analyzer = getAnalyzer(map); assertAnalyzesTo(analyzer, "a", new String[] {"b", "a"}, new int[] {1, 0}); - assertAnalyzesTo( - analyzer, "plaza", new String[] {"plaxe7a", "plz", "plaza"}, new int[] {1, 0, 0}); + assertAnalyzesTo(analyzer, "pie-ix", new String[] {"ix", "pie-ix"}, new int[] {1, 0, 1}); + assertAnalyzesTo(analyzer, "plaza", new String[] {"pla", "plz", "plaza"}, new int[] {1, 0, 0}); assertAnalyzesTo( analyzer, "str", - new String[] {"strada", "strasse", "straxdfe", "str"}, + new String[] {"strada", "strasse", "straße", "str"}, new int[] {1, 0, 0, 0}); + assertAnalyzesTo(analyzer, "vlg", new String[] {"village", "vlg"}, new int[] {1, 0}); + analyzer.close(); + } + + public void testParseDedupFalse() throws IOException, ParseException { + Analyzer analyzer = new MockAnalyzer(random()); + NrtsearchSynonymParser parser = + new NrtsearchSynonymParser( + DEFAULT_SEPARATOR_PATTERN, Boolean.FALSE, Boolean.TRUE, analyzer); + String synonyms = "a , b|a,b"; + parser.parse(new StringReader(synonyms)); + final SynonymMap map = parser.build(); + analyzer.close(); + analyzer = getAnalyzer(map); + assertAnalyzesTo(analyzer, "a", new String[] {"b", "b", "a"}, new int[] {1, 0, 0}); + analyzer.close(); + } + + public void testParseExpandFalse() throws IOException, ParseException { + Analyzer analyzer = new MockAnalyzer(random()); + NrtsearchSynonymParser parser = + new NrtsearchSynonymParser( + DEFAULT_SEPARATOR_PATTERN, Boolean.TRUE, Boolean.FALSE, analyzer); + String synonyms = "a , b"; + parser.parse(new StringReader(synonyms)); + final SynonymMap map = parser.build(); + analyzer.close(); + analyzer = getAnalyzer(map); + assertAnalyzesTo(analyzer, "a", new String[] {"a"}, new int[] {1}); + assertAnalyzesTo(analyzer, "b", new String[] {"a"}, new int[] {1}); analyzer.close(); } public void 
testInvalidMappings() { Analyzer analyzer = new MockAnalyzer(random()); NrtsearchSynonymParser parser = - new NrtsearchSynonymParser(Boolean.TRUE, Boolean.TRUE, analyzer); + new NrtsearchSynonymParser(DEFAULT_SEPARATOR_PATTERN, Boolean.TRUE, Boolean.TRUE, analyzer); String synonyms = "a, b, c, d, e"; expectThrows( IllegalArgumentException.class, @@ -77,4 +99,45 @@ public void testInvalidMappings() { }); analyzer.close(); } + + public void testParseCustomSeparator() throws IOException, ParseException { + Analyzer analyzer = new MockAnalyzer(random()); + NrtsearchSynonymParser parser = + new NrtsearchSynonymParser("\\s*\\$\\s*", Boolean.TRUE, Boolean.TRUE, analyzer); + String synonyms = "a , b$ix,pie-ix"; + parser.parse(new StringReader(synonyms)); + final SynonymMap map = parser.build(); + analyzer.close(); + analyzer = getAnalyzer(map); + + assertAnalyzesTo(analyzer, "a", new String[] {"b", "a"}, new int[] {1, 0}); + assertAnalyzesTo(analyzer, "pie-ix", new String[] {"ix", "pie-ix"}, new int[] {1, 0, 1}); + analyzer.close(); + } + + public void testParseUnescape() throws IOException, ParseException { + Analyzer analyzer = new MockAnalyzer(random()); + NrtsearchSynonymParser parser = + new NrtsearchSynonymParser(DEFAULT_SEPARATOR_PATTERN, Boolean.TRUE, Boolean.TRUE, analyzer); + String synonyms = "a , \\b"; + parser.parse(new StringReader(synonyms)); + final SynonymMap map = parser.build(); + analyzer.close(); + analyzer = getAnalyzer(map); + assertAnalyzesTo(analyzer, "a", new String[] {"b", "a"}, new int[] {1, 0}); + analyzer.close(); + } + + private Analyzer getAnalyzer(SynonymMap map) { + Analyzer analyzer = + new Analyzer() { + @Override + protected TokenStreamComponents createComponents(String fieldName) { + Tokenizer tokenizer = new MockTokenizer(); + return new TokenStreamComponents( + tokenizer, new SynonymGraphFilter(tokenizer, map, true)); + } + }; + return analyzer; + } } diff --git 
a/src/test/java/com/yelp/nrtsearch/server/luceneserver/analysis/SynonymV2GraphFilterFactoryITest.java b/src/test/java/com/yelp/nrtsearch/server/luceneserver/analysis/SynonymV2GraphFilterFactoryITest.java new file mode 100644 index 000000000..e39a45043 --- /dev/null +++ b/src/test/java/com/yelp/nrtsearch/server/luceneserver/analysis/SynonymV2GraphFilterFactoryITest.java @@ -0,0 +1,105 @@ +/* + * Copyright 2023 Yelp Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package com.yelp.nrtsearch.server.luceneserver.analysis; + +import static org.junit.Assert.assertEquals; + +import com.yelp.nrtsearch.server.grpc.*; +import com.yelp.nrtsearch.server.grpc.AddDocumentRequest.MultiValuedField; +import com.yelp.nrtsearch.server.luceneserver.ServerTestCase; +import io.grpc.testing.GrpcCleanupRule; +import java.io.IOException; +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; +import org.junit.ClassRule; +import org.junit.Test; + +public class SynonymV2GraphFilterFactoryITest extends ServerTestCase { + @ClassRule public static final GrpcCleanupRule grpcCleanup = new GrpcCleanupRule(); + + protected List getIndices() { + return Collections.singletonList(DEFAULT_TEST_INDEX); + } + + protected FieldDefRequest getIndexDef(String name) throws IOException { + return getFieldsFromResourceFile("/analysis/registerFieldsSynonymTokenFilter.json"); + } + + protected void initIndex(String name) throws Exception { + List docs = new 
ArrayList<>(); + AddDocumentRequest request = + AddDocumentRequest.newBuilder() + .setIndexName(name) + .putFields("doc_id", MultiValuedField.newBuilder().addValue("1").build()) + .putFields("text_field", MultiValuedField.newBuilder().addValue("plaza").build()) + .build(); + docs.add(request); + request = + AddDocumentRequest.newBuilder() + .setIndexName(name) + .putFields("doc_id", MultiValuedField.newBuilder().addValue("2").build()) + .putFields("text_field", MultiValuedField.newBuilder().addValue("str").build()) + .build(); + docs.add(request); + addDocuments(docs.stream()); + } + + @Test + public void testSynonymV2GraphFilter() { + SearchResponse response = + getGrpcServer() + .getBlockingStub() + .search( + SearchRequest.newBuilder() + .setIndexName(DEFAULT_TEST_INDEX) + .setTopHits(10) + .addRetrieveFields("doc_id") + .setQuery( + Query.newBuilder() + .setTermQuery( + TermQuery.newBuilder() + .setField("text_field") + .setTextValue("plaça") + .build()) + .build()) + .build()); + assertEquals(1, response.getHitsCount()); + assertEquals( + "1", response.getHits(0).getFieldsOrThrow("doc_id").getFieldValue(0).getTextValue()); + + response = + getGrpcServer() + .getBlockingStub() + .search( + SearchRequest.newBuilder() + .setIndexName(DEFAULT_TEST_INDEX) + .setTopHits(10) + .addRetrieveFields("doc_id") + .setQuery( + Query.newBuilder() + .setTermQuery( + TermQuery.newBuilder() + .setField("text_field") + .setTextValue("straße") + .build()) + .build()) + .build()); + assertEquals(1, response.getHitsCount()); + assertEquals( + "2", response.getHits(0).getFieldsOrThrow("doc_id").getFieldValue(0).getTextValue()); + } +} diff --git a/src/test/java/com/yelp/nrtsearch/server/luceneserver/analysis/SynonymV2GraphFilterFactoryTest.java b/src/test/java/com/yelp/nrtsearch/server/luceneserver/analysis/SynonymV2GraphFilterFactoryTest.java index acdbb9a4a..c62d81528 100644 --- a/src/test/java/com/yelp/nrtsearch/server/luceneserver/analysis/SynonymV2GraphFilterFactoryTest.java 
+++ b/src/test/java/com/yelp/nrtsearch/server/luceneserver/analysis/SynonymV2GraphFilterFactoryTest.java @@ -15,10 +15,6 @@ */ package com.yelp.nrtsearch.server.luceneserver.analysis; -import static org.apache.lucene.util.LuceneTestCase.random; -import static org.junit.Assert.assertEquals; -import static org.junit.Assert.fail; - import com.carrotsearch.randomizedtesting.RandomizedRunner; import java.io.IOException; import java.io.StringReader; @@ -33,17 +29,16 @@ @RunWith(RandomizedRunner.class) public class SynonymV2GraphFilterFactoryTest extends LuceneTestCase { - public void testNoMappings() throws IOException, ParseException { - Analyzer analyzer = new MockAnalyzer(random()); + public void testNoSynonymMappings() throws IOException, ParseException { try { - new SynonymV2GraphFilterFactory(new HashMap<>(), analyzer); + new SynonymV2GraphFilterFactory(new HashMap<>()); fail(); } catch (IllegalArgumentException e) { assertEquals("Synonym mappings must be specified", e.getMessage()); } } - public void testSingleMapping() throws IOException, ParseException { + public void testSingleMappingWithDefaultAnalyzer() throws IOException, ParseException { SynonymV2GraphFilterFactory synonymV2GraphFilterFactory = getFactory("a, b"); TokenStream tokenStream = new StandardAnalyzer().tokenStream("field", new StringReader("this is a test string")); @@ -51,6 +46,106 @@ public void testSingleMapping() throws IOException, ParseException { assertTokenStream(synonymV2GraphFilterFactory, tokenStream, expectedTokens); } + public void testMultipleMappingsWithDefaultAnalyzer() throws IOException, ParseException { + SynonymV2GraphFilterFactory synonymV2GraphFilterFactory = + getFactory("#, ste|a, b|c/, calle|plaza, plaça|p.o, po| v, väg"); + TokenStream tokenStream = + new StandardAnalyzer() + .tokenStream("field", new StringReader("# a b calle plaça p.o väg v")); + String[] expectedTokens = { + "b", "a", "a", "b", "c/", "calle", "plaza", "plaça", "po", "p.o", "v", "väg", "väg", "v" + }; + 
assertTokenStream(synonymV2GraphFilterFactory, tokenStream, expectedTokens); + } + + public void testMultipleMappingsWithStandardAnalyzer() throws IOException, ParseException { + SynonymV2GraphFilterFactory synonymV2GraphFilterFactory = + getFactory("a, b|c/, calle|plaza, plaça|p.o, po| v, väg", "Standard"); + TokenStream tokenStream = + new StandardAnalyzer().tokenStream("field", new StringReader("a b calle plaça p.o väg v")); + String[] expectedTokens = { + "b", "a", "a", "b", "c", "calle", "plaza", "plaça", "po", "p.o", "v", "väg", "väg", "v" + }; + assertTokenStream(synonymV2GraphFilterFactory, tokenStream, expectedTokens); + } + + public void testInvalidAnalyzer() throws IOException, ParseException { + try { + Map params = new HashMap<>(); + params.put(SynonymV2GraphFilterFactory.MAPPINGS, "a, b"); + params.put("analyzerName", "invalid"); + new SynonymV2GraphFilterFactory(params); + fail(); + } catch (RuntimeException e) { + assertEquals( + "java.lang.ClassNotFoundException: org.apache.lucene.analysis.standard.invalidAnalyzer", + e.getMessage()); + } + } + + public void testNrtsearchParserFormat() throws IOException, ParseException { + SynonymV2GraphFilterFactory synonymV2GraphFilterFactory = + getFactory("a, b", "Standard", "nrtsearch"); + TokenStream tokenStream = + new StandardAnalyzer().tokenStream("field", new StringReader("this is a test string")); + String[] expectedTokens = {"this", "is", "b", "a", "test", "string"}; + assertTokenStream(synonymV2GraphFilterFactory, tokenStream, expectedTokens); + } + + public void testInvalidParserFormat() throws IOException, ParseException { + try { + Map params = new HashMap<>(); + params.put(SynonymV2GraphFilterFactory.MAPPINGS, "a, b"); + params.put("parserFormat", "invalid"); + new SynonymV2GraphFilterFactory(params); + fail(); + } catch (RuntimeException e) { + assertEquals( + "The parser format: invalid is not valid. 
It should be nrtsearch", e.getMessage()); + } + } + + public void testSingleMappingWithExpandFalse() throws IOException, ParseException { + Map params = new HashMap<>(); + params.put(SynonymV2GraphFilterFactory.MAPPINGS, "a, b"); + params.put("expand", "false"); + SynonymV2GraphFilterFactory synonymV2GraphFilterFactory = + new SynonymV2GraphFilterFactory(params); + TokenStream tokenStream = + new StandardAnalyzer().tokenStream("field", new StringReader("this is a test string")); + String[] expectedTokens = {"this", "is", "a", "test", "string"}; + assertTokenStream(synonymV2GraphFilterFactory, tokenStream, expectedTokens); + } + + public void testMultipleMappingsWithCustomSeparator() throws IOException, ParseException { + Map params = new HashMap<>(); + params.put( + SynonymV2GraphFilterFactory.MAPPINGS, + "#, ste=>a, b=>c/, calle=>plaza, plaça=>p.o, po=> v, väg"); + params.put("separator_pattern", "\\s*\\=>\\s*"); + SynonymV2GraphFilterFactory synonymV2GraphFilterFactory = + new SynonymV2GraphFilterFactory(params); + TokenStream tokenStream = + new StandardAnalyzer() + .tokenStream("field", new StringReader("# a b calle plaça p.o väg v")); + String[] expectedTokens = { + "b", "a", "a", "b", "c/", "calle", "plaza", "plaça", "po", "p.o", "v", "väg", "väg", "v" + }; + assertTokenStream(synonymV2GraphFilterFactory, tokenStream, expectedTokens); + } + + public void testSingleMappingIgnoreCase() throws IOException, ParseException { + Map params = new HashMap<>(); + params.put(SynonymV2GraphFilterFactory.MAPPINGS, "A, B"); + params.put("ignoreCase", "true"); + SynonymV2GraphFilterFactory synonymV2GraphFilterFactory = + new SynonymV2GraphFilterFactory(params); + TokenStream tokenStream = + new StandardAnalyzer().tokenStream("field", new StringReader("this is a test string")); + String[] expectedTokens = {"this", "is", "b", "a", "test", "string"}; + assertTokenStream(synonymV2GraphFilterFactory, tokenStream, expectedTokens); + } + private static void assertTokenStream( 
SynonymV2GraphFilterFactory synonymV2GraphFilterFactory, TokenStream tokenStream, @@ -73,10 +168,26 @@ private static void assertTokenStream( private SynonymV2GraphFilterFactory getFactory(String mappings) throws IOException, ParseException { - Analyzer analyzer = new MockAnalyzer(random()); Map params = new HashMap<>(); params.put(SynonymV2GraphFilterFactory.MAPPINGS, mappings); - params.put("parserFormat", "nrtsearch"); - return new SynonymV2GraphFilterFactory(params, analyzer); + return new SynonymV2GraphFilterFactory(params); + } + + private SynonymV2GraphFilterFactory getFactory(String mappings, String analyzerName) + throws IOException, ParseException { + Map params = new HashMap<>(); + params.put(SynonymV2GraphFilterFactory.MAPPINGS, mappings); + params.put("analyzerName", analyzerName); + return new SynonymV2GraphFilterFactory(params); + } + + private SynonymV2GraphFilterFactory getFactory( + String mappings, String analyzerName, String parserFormat) + throws IOException, ParseException { + Map params = new HashMap<>(); + params.put(SynonymV2GraphFilterFactory.MAPPINGS, mappings); + params.put("analyzerName", analyzerName); + params.put("parserFormat", parserFormat); + return new SynonymV2GraphFilterFactory(params); } } diff --git a/src/test/resources/analysis/registerFieldsSynonymTokenFilter.json b/src/test/resources/analysis/registerFieldsSynonymTokenFilter.json new file mode 100644 index 000000000..d9c8d57e0 --- /dev/null +++ b/src/test/resources/analysis/registerFieldsSynonymTokenFilter.json @@ -0,0 +1,38 @@ +{ + "indexName": "test_index", + "field": [ + { + "name": "doc_id", + "type": "_ID", + "search": true, + "storeDocValues": true + }, + { + "name": "text_field", + "type": "TEXT", + "search": true, + "tokenize": true, + "multiValued": true, + "storeDocValues": true, + "analyzer": { + "custom": { + "tokenizer": { + "name": "keyword" + }, + "tokenFilters": [ + { + "name": "lowercase" + }, + { + "name": "synonymV2", + "params": { + "mappings": 
"#,ste|a,b|blvd, boulevard|u.s,us|plaza, plaça|ix,pie-ix|str, straße|v, väg", + "parserFormat": "nrtsearch" + } + } + ] + } + } + } + ] +} \ No newline at end of file From 395c4ae57a732a1a3df031a96d6821a09560f1c5 Mon Sep 17 00:00:00 2001 From: swethakann Date: Wed, 20 Mar 2024 11:26:36 -0700 Subject: [PATCH 3/3] Use AnalyzerCreator and address other PR comments --- docs/analysis.rst | 2 + .../analysis/NrtsearchSynonymParser.java | 9 ++ .../analysis/SynonymV2GraphFilterFactory.java | 82 ++++++++----------- .../SynonymV2GraphFilterFactoryITest.java | 18 ++++ .../SynonymV2GraphFilterFactoryTest.java | 72 ++++++++++++---- .../registerFieldsSynonymTokenFilter.json | 5 +- 6 files changed, 122 insertions(+), 66 deletions(-) diff --git a/docs/analysis.rst b/docs/analysis.rst index 3dd5a0e29..ffe2e44c6 100644 --- a/docs/analysis.rst +++ b/docs/analysis.rst @@ -388,6 +388,8 @@ Available token filters: * synonym + * synonymV2 - Similar to the ``synonymGraph`` filter, except rules are specified directly in the parameters. See `SynonymV2GraphFilterFactory `_. 
+ * synonymGraph * flattenGraph diff --git a/src/main/java/com/yelp/nrtsearch/server/luceneserver/analysis/NrtsearchSynonymParser.java b/src/main/java/com/yelp/nrtsearch/server/luceneserver/analysis/NrtsearchSynonymParser.java index 05a9322d0..f07d9e136 100644 --- a/src/main/java/com/yelp/nrtsearch/server/luceneserver/analysis/NrtsearchSynonymParser.java +++ b/src/main/java/com/yelp/nrtsearch/server/luceneserver/analysis/NrtsearchSynonymParser.java @@ -30,6 +30,15 @@ class NrtsearchSynonymParser extends SynonymMap.Parser { private final String synonymsSeparator; private static final String SYNONYM_MAPPING_SEPARATOR = ","; + /** + * This is a nrtsearch parser that extends SynonymMap.Parser to parse synonyms provided inline in + * a string instead of a file + * + * @param synonymsSeparator pattern used to split the synonym mappings + * @param dedup set to true to dedup duplicate synonym mappings + * @param expand set to true to map synonyms both ways + * @param analyzer analyzer for the synonyms + */ public NrtsearchSynonymParser( String synonymsSeparator, boolean dedup, boolean expand, Analyzer analyzer) { super(dedup, analyzer); diff --git a/src/main/java/com/yelp/nrtsearch/server/luceneserver/analysis/SynonymV2GraphFilterFactory.java b/src/main/java/com/yelp/nrtsearch/server/luceneserver/analysis/SynonymV2GraphFilterFactory.java index 3c49ecb1e..5ae447181 100644 --- a/src/main/java/com/yelp/nrtsearch/server/luceneserver/analysis/SynonymV2GraphFilterFactory.java +++ b/src/main/java/com/yelp/nrtsearch/server/luceneserver/analysis/SynonymV2GraphFilterFactory.java @@ -17,8 +17,6 @@ import java.io.IOException; import java.io.StringReader; -import java.lang.reflect.InvocationTargetException; -import java.text.MessageFormat; import java.text.ParseException; import java.util.Map; import org.apache.lucene.analysis.Analyzer; @@ -30,13 +28,26 @@ import org.apache.lucene.analysis.synonym.SynonymMap; import org.apache.lucene.analysis.util.TokenFilterFactory; +/** + * 
Implementation of a {@link TokenFilterFactory} that allows for loading synonym mappings. Unlike + * the Lucene-provided {@link org.apache.lucene.analysis.synonym.SynonymGraphFilterFactory}, this + * one lets you specify the synonyms inline as a parameter (instead of within a file). + * + *

Synonyms must be specified in the 'synonyms' parameter string. This value is split into + * multiple synonym mappings on a separator pattern, which defaults to '|'; each mapping is a + * comma-separated pair. The pattern may be changed with the 'separator_pattern' param. + * + *

Each synonym mapping must be of the form synonym_from,synonym_to, for example: + * + *

a,b + * + *

a,b|c,d + */ public class SynonymV2GraphFilterFactory extends TokenFilterFactory { /** SPI name */ public static final String NAME = "synonymV2"; - public static final String MAPPINGS = "mappings"; - private static final String LUCENE_ANALYZER_PATH = - "org.apache.lucene.analysis.standard.{0}Analyzer"; + public static final String SYNONYMS = "synonyms"; public static final String SYNONYM_SEPARATOR_PATTERN = "separator_pattern"; public static final String DEFAULT_SYNONYM_SEPARATOR_PATTERN = "\\s*\\|\\s*"; public final boolean ignoreCase; @@ -44,12 +55,13 @@ public class SynonymV2GraphFilterFactory extends TokenFilterFactory { public SynonymV2GraphFilterFactory(Map args) throws IOException, ParseException { super(args); - String synonymMappings = args.get(MAPPINGS); + String synonymMappings = args.get(SYNONYMS); String separatorPattern = args.getOrDefault(SYNONYM_SEPARATOR_PATTERN, DEFAULT_SYNONYM_SEPARATOR_PATTERN); this.ignoreCase = getBoolean(args, "ignoreCase", false); boolean expand = getBoolean(args, "expand", true); + boolean dedup = getBoolean(args, "dedup", true); String parserFormat = args.getOrDefault("parserFormat", "nrtsearch"); String analyzerName = args.get("analyzerName"); @@ -57,6 +69,11 @@ public SynonymV2GraphFilterFactory(Map args) throws IOException, throw new IllegalArgumentException("Synonym mappings must be specified"); } + if (!parserFormat.equals("nrtsearch")) { + throw new IllegalArgumentException( + "The parser format: " + parserFormat + " is not valid. 
It should be nrtsearch"); + } + Analyzer analyzer; if (analyzerName == null) { analyzer = @@ -69,54 +86,23 @@ protected TokenStreamComponents createComponents(String fieldName) { } }; } else { - analyzer = loadAnalyzer(analyzerName); + analyzer = AnalyzerCreator.getInstance().getAnalyzer(getPredefinedAnalyzer(analyzerName)); } - synonymMap = - loadSynonymsFromString(separatorPattern, synonymMappings, parserFormat, expand, analyzer); + + SynonymMap.Parser parser = + new NrtsearchSynonymParser(separatorPattern, dedup, expand, analyzer); + parser.parse(new StringReader(synonymMappings)); + synonymMap = parser.build(); } @Override public TokenStream create(TokenStream input) { - return new SynonymGraphFilter(input, synonymMap, ignoreCase); + return (this.synonymMap.fst == null + ? input + : new SynonymGraphFilter(input, synonymMap, ignoreCase)); } - public SynonymMap loadSynonymsFromString( - String separatorPattern, - String synonymMappings, - String parserFormat, - boolean expand, - Analyzer analyzer) - throws IOException, ParseException { - SynonymMap.Parser parser; - - if (parserFormat.equals("nrtsearch")) { - parser = new NrtsearchSynonymParser(separatorPattern, true, expand, analyzer); - } else { - throw new IllegalArgumentException( - "The parser format: " + parserFormat + " is not valid. 
It should be nrtsearch"); - } - parser.parse(new StringReader(synonymMappings)); - return parser.build(); - } - - private Analyzer loadAnalyzer(String analyzerName) { - Analyzer analyzer; - String analyzerClassName = MessageFormat.format(LUCENE_ANALYZER_PATH, analyzerName); - try { - analyzer = - (Analyzer) - Analyzer.class - .getClassLoader() - .loadClass(analyzerClassName) - .getDeclaredConstructor() - .newInstance(); - } catch (InstantiationException - | IllegalAccessException - | NoSuchMethodException - | ClassNotFoundException - | InvocationTargetException e) { - throw new RuntimeException(e); - } - return analyzer; + private com.yelp.nrtsearch.server.grpc.Analyzer getPredefinedAnalyzer(String analyzerName) { + return com.yelp.nrtsearch.server.grpc.Analyzer.newBuilder().setPredefined(analyzerName).build(); } } diff --git a/src/test/java/com/yelp/nrtsearch/server/luceneserver/analysis/SynonymV2GraphFilterFactoryITest.java b/src/test/java/com/yelp/nrtsearch/server/luceneserver/analysis/SynonymV2GraphFilterFactoryITest.java index e39a45043..fa3fbe5d0 100644 --- a/src/test/java/com/yelp/nrtsearch/server/luceneserver/analysis/SynonymV2GraphFilterFactoryITest.java +++ b/src/test/java/com/yelp/nrtsearch/server/luceneserver/analysis/SynonymV2GraphFilterFactoryITest.java @@ -17,20 +17,38 @@ import static org.junit.Assert.assertEquals; +import com.yelp.nrtsearch.server.config.LuceneServerConfiguration; import com.yelp.nrtsearch.server.grpc.*; import com.yelp.nrtsearch.server.grpc.AddDocumentRequest.MultiValuedField; import com.yelp.nrtsearch.server.luceneserver.ServerTestCase; +import com.yelp.nrtsearch.server.plugins.Plugin; import io.grpc.testing.GrpcCleanupRule; +import java.io.ByteArrayInputStream; import java.io.IOException; import java.util.ArrayList; import java.util.Collections; import java.util.List; +import org.junit.Before; import org.junit.ClassRule; import org.junit.Test; public class SynonymV2GraphFilterFactoryITest extends ServerTestCase { @ClassRule 
public static final GrpcCleanupRule grpcCleanup = new GrpcCleanupRule(); + @Before + public void init() { + init(Collections.emptyList()); + } + + private void init(List plugins) { + AnalyzerCreator.initialize(getEmptyConfig(), plugins); + } + + private LuceneServerConfiguration getEmptyConfig() { + String config = "nodeName: \"lucene_server_foo\""; + return new LuceneServerConfiguration(new ByteArrayInputStream(config.getBytes())); + } + protected List getIndices() { return Collections.singletonList(DEFAULT_TEST_INDEX); } diff --git a/src/test/java/com/yelp/nrtsearch/server/luceneserver/analysis/SynonymV2GraphFilterFactoryTest.java b/src/test/java/com/yelp/nrtsearch/server/luceneserver/analysis/SynonymV2GraphFilterFactoryTest.java index c62d81528..79d2f6549 100644 --- a/src/test/java/com/yelp/nrtsearch/server/luceneserver/analysis/SynonymV2GraphFilterFactoryTest.java +++ b/src/test/java/com/yelp/nrtsearch/server/luceneserver/analysis/SynonymV2GraphFilterFactoryTest.java @@ -16,19 +16,43 @@ package com.yelp.nrtsearch.server.luceneserver.analysis; import com.carrotsearch.randomizedtesting.RandomizedRunner; +import com.yelp.nrtsearch.server.config.LuceneServerConfiguration; +import com.yelp.nrtsearch.server.plugins.Plugin; +import java.io.ByteArrayInputStream; import java.io.IOException; import java.io.StringReader; import java.text.ParseException; +import java.util.Collections; import java.util.HashMap; +import java.util.List; import java.util.Map; import org.apache.lucene.analysis.*; +import org.apache.lucene.analysis.standard.ClassicAnalyzer; import org.apache.lucene.analysis.standard.StandardAnalyzer; import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; import org.apache.lucene.util.LuceneTestCase; +import org.junit.Before; import org.junit.runner.RunWith; @RunWith(RandomizedRunner.class) public class SynonymV2GraphFilterFactoryTest extends LuceneTestCase { + + private static final String STANDARD_ANALYZER = "standard"; + + @Before + public void 
init() { + init(Collections.emptyList()); + } + + private void init(List plugins) { + AnalyzerCreator.initialize(getEmptyConfig(), plugins); + } + + private LuceneServerConfiguration getEmptyConfig() { + String config = "nodeName: \"lucene_server_foo\""; + return new LuceneServerConfiguration(new ByteArrayInputStream(config.getBytes())); + } + public void testNoSynonymMappings() throws IOException, ParseException { try { new SynonymV2GraphFilterFactory(new HashMap<>()); @@ -38,6 +62,14 @@ public void testNoSynonymMappings() throws IOException, ParseException { } } + public void testNoSynonymsReturnsInputTokenStream() throws IOException, ParseException { + SynonymV2GraphFilterFactory synonymV2GraphFilterFactory = getFactory("a,b"); + TokenStream tokenStream = + new StandardAnalyzer().tokenStream("field", new StringReader("this is test string")); + String[] expectedTokens = {"this", "is", "test", "string"}; + assertTokenStream(synonymV2GraphFilterFactory, tokenStream, expectedTokens); + } + public void testSingleMappingWithDefaultAnalyzer() throws IOException, ParseException { SynonymV2GraphFilterFactory synonymV2GraphFilterFactory = getFactory("a, b"); TokenStream tokenStream = @@ -60,7 +92,7 @@ public void testMultipleMappingsWithDefaultAnalyzer() throws IOException, ParseE public void testMultipleMappingsWithStandardAnalyzer() throws IOException, ParseException { SynonymV2GraphFilterFactory synonymV2GraphFilterFactory = - getFactory("a, b|c/, calle|plaza, plaça|p.o, po| v, väg", "Standard"); + getFactory("a, b|c/, calle|plaza, plaça|p.o, po| v, väg", STANDARD_ANALYZER); TokenStream tokenStream = new StandardAnalyzer().tokenStream("field", new StringReader("a b calle plaça p.o väg v")); String[] expectedTokens = { @@ -69,23 +101,31 @@ public void testMultipleMappingsWithStandardAnalyzer() throws IOException, Parse assertTokenStream(synonymV2GraphFilterFactory, tokenStream, expectedTokens); } + public void testMultipleMappingsWithClassicAnalyzer() throws 
IOException, ParseException { + SynonymV2GraphFilterFactory synonymV2GraphFilterFactory = + getFactory("c/, calle|plaza, plaça", "classic"); + TokenStream tokenStream = + new ClassicAnalyzer() + .tokenStream("field", new StringReader("this is a test for calle and plaça")); + String[] expectedTokens = {"test", "c", "calle", "plaza", "plaça"}; + assertTokenStream(synonymV2GraphFilterFactory, tokenStream, expectedTokens); + } + public void testInvalidAnalyzer() throws IOException, ParseException { try { Map params = new HashMap<>(); - params.put(SynonymV2GraphFilterFactory.MAPPINGS, "a, b"); + params.put(SynonymV2GraphFilterFactory.SYNONYMS, "a, b"); params.put("analyzerName", "invalid"); new SynonymV2GraphFilterFactory(params); fail(); } catch (RuntimeException e) { - assertEquals( - "java.lang.ClassNotFoundException: org.apache.lucene.analysis.standard.invalidAnalyzer", - e.getMessage()); + assertEquals("Unable to find predefined analyzer: invalid", e.getMessage()); } } public void testNrtsearchParserFormat() throws IOException, ParseException { SynonymV2GraphFilterFactory synonymV2GraphFilterFactory = - getFactory("a, b", "Standard", "nrtsearch"); + getFactory("a, b", STANDARD_ANALYZER, "nrtsearch"); TokenStream tokenStream = new StandardAnalyzer().tokenStream("field", new StringReader("this is a test string")); String[] expectedTokens = {"this", "is", "b", "a", "test", "string"}; @@ -95,7 +135,7 @@ public void testNrtsearchParserFormat() throws IOException, ParseException { public void testInvalidParserFormat() throws IOException, ParseException { try { Map params = new HashMap<>(); - params.put(SynonymV2GraphFilterFactory.MAPPINGS, "a, b"); + params.put(SynonymV2GraphFilterFactory.SYNONYMS, "a, b"); params.put("parserFormat", "invalid"); new SynonymV2GraphFilterFactory(params); fail(); @@ -107,7 +147,7 @@ public void testInvalidParserFormat() throws IOException, ParseException { public void testSingleMappingWithExpandFalse() throws IOException, ParseException { 
Map params = new HashMap<>(); - params.put(SynonymV2GraphFilterFactory.MAPPINGS, "a, b"); + params.put(SynonymV2GraphFilterFactory.SYNONYMS, "a, b"); params.put("expand", "false"); SynonymV2GraphFilterFactory synonymV2GraphFilterFactory = new SynonymV2GraphFilterFactory(params); @@ -120,7 +160,7 @@ public void testSingleMappingWithExpandFalse() throws IOException, ParseExceptio public void testMultipleMappingsWithCustomSeparator() throws IOException, ParseException { Map params = new HashMap<>(); params.put( - SynonymV2GraphFilterFactory.MAPPINGS, + SynonymV2GraphFilterFactory.SYNONYMS, "#, ste=>a, b=>c/, calle=>plaza, plaça=>p.o, po=> v, väg"); params.put("separator_pattern", "\\s*\\=>\\s*"); SynonymV2GraphFilterFactory synonymV2GraphFilterFactory = @@ -136,7 +176,7 @@ public void testMultipleMappingsWithCustomSeparator() throws IOException, ParseE public void testSingleMappingIgnoreCase() throws IOException, ParseException { Map params = new HashMap<>(); - params.put(SynonymV2GraphFilterFactory.MAPPINGS, "A, B"); + params.put(SynonymV2GraphFilterFactory.SYNONYMS, "A, B"); params.put("ignoreCase", "true"); SynonymV2GraphFilterFactory synonymV2GraphFilterFactory = new SynonymV2GraphFilterFactory(params); @@ -166,26 +206,26 @@ private static void assertTokenStream( } } - private SynonymV2GraphFilterFactory getFactory(String mappings) + private SynonymV2GraphFilterFactory getFactory(String synonyms) throws IOException, ParseException { Map params = new HashMap<>(); - params.put(SynonymV2GraphFilterFactory.MAPPINGS, mappings); + params.put(SynonymV2GraphFilterFactory.SYNONYMS, synonyms); return new SynonymV2GraphFilterFactory(params); } - private SynonymV2GraphFilterFactory getFactory(String mappings, String analyzerName) + private SynonymV2GraphFilterFactory getFactory(String synonyms, String analyzerName) throws IOException, ParseException { Map params = new HashMap<>(); - params.put(SynonymV2GraphFilterFactory.MAPPINGS, mappings); + 
params.put(SynonymV2GraphFilterFactory.SYNONYMS, synonyms); params.put("analyzerName", analyzerName); return new SynonymV2GraphFilterFactory(params); } private SynonymV2GraphFilterFactory getFactory( - String mappings, String analyzerName, String parserFormat) + String synonyms, String analyzerName, String parserFormat) throws IOException, ParseException { Map params = new HashMap<>(); - params.put(SynonymV2GraphFilterFactory.MAPPINGS, mappings); + params.put(SynonymV2GraphFilterFactory.SYNONYMS, synonyms); params.put("analyzerName", analyzerName); params.put("parserFormat", parserFormat); return new SynonymV2GraphFilterFactory(params); diff --git a/src/test/resources/analysis/registerFieldsSynonymTokenFilter.json b/src/test/resources/analysis/registerFieldsSynonymTokenFilter.json index d9c8d57e0..8a654c39e 100644 --- a/src/test/resources/analysis/registerFieldsSynonymTokenFilter.json +++ b/src/test/resources/analysis/registerFieldsSynonymTokenFilter.json @@ -26,8 +26,9 @@ { "name": "synonymV2", "params": { - "mappings": "#,ste|a,b|blvd, boulevard|u.s,us|plaza, plaça|ix,pie-ix|str, straße|v, väg", - "parserFormat": "nrtsearch" + "synonyms": "blvd, boulevard|u.s,us|plaza, plaça|ix,pie-ix|str, straße|v, väg", + "parserFormat": "nrtsearch", + "analyzerName": "standard" } } ]