diff --git a/docs/analysis.rst b/docs/analysis.rst index 3dd5a0e29..ffe2e44c6 100644 --- a/docs/analysis.rst +++ b/docs/analysis.rst @@ -388,6 +388,8 @@ Available token filters: * synonym + * synonymV2 - Similar to the ``synonymGraph`` filter, except rules are specified directly in the parameters. See `SynonymV2GraphFilterFactory <https://github.com/Yelp/nrtsearch/blob/master/src/main/java/com/yelp/nrtsearch/server/luceneserver/analysis/SynonymV2GraphFilterFactory.java>`_. + * synonymGraph * flattenGraph diff --git a/src/main/java/com/yelp/nrtsearch/server/luceneserver/analysis/AnalyzerCreator.java b/src/main/java/com/yelp/nrtsearch/server/luceneserver/analysis/AnalyzerCreator.java index f18eead39..5281c7cf3 100644 --- a/src/main/java/com/yelp/nrtsearch/server/luceneserver/analysis/AnalyzerCreator.java +++ b/src/main/java/com/yelp/nrtsearch/server/luceneserver/analysis/AnalyzerCreator.java @@ -143,6 +143,9 @@ public static void initialize(LuceneServerConfiguration configuration, Iterable< .collect(Collectors.toSet()); instance.registerCharFilter( MappingV2CharFilterFactory.NAME, MappingV2CharFilterFactory.class, builtInCharFilters); + instance.registerTokenFilter( + SynonymV2GraphFilterFactory.NAME, SynonymV2GraphFilterFactory.class, builtInTokenFilters); + + for (Plugin plugin : plugins) { if (plugin instanceof AnalysisPlugin) { AnalysisPlugin analysisPlugin = (AnalysisPlugin) plugin; diff --git a/src/main/java/com/yelp/nrtsearch/server/luceneserver/analysis/NrtsearchSynonymParser.java b/src/main/java/com/yelp/nrtsearch/server/luceneserver/analysis/NrtsearchSynonymParser.java new file mode 100644 index 000000000..f07d9e136 --- /dev/null +++ b/src/main/java/com/yelp/nrtsearch/server/luceneserver/analysis/NrtsearchSynonymParser.java @@ -0,0 +1,147 @@ +/* + * Copyright 2024 Yelp Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package com.yelp.nrtsearch.server.luceneserver.analysis;
+
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.io.Reader;
+import java.text.ParseException;
+import java.util.ArrayList;
+import java.util.List;
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.synonym.SynonymMap;
+import org.apache.lucene.util.CharsRef;
+import org.apache.lucene.util.CharsRefBuilder;
+
+/**
+ * Parses synonym mappings that are provided inline as a string instead of in a file.
+ *
+ * <p>Each line is split into mappings with the separator pattern. A mapping is exactly two comma
+ * separated terms such as {@code a,b}; a backslash escapes the character that follows it.
+ */
+class NrtsearchSynonymParser extends SynonymMap.Parser {
+  private static final String SYNONYM_MAPPING_SEPARATOR = ",";
+  private final boolean expand;
+  private final String synonymsSeparator;
+
+  /**
+   * Creates a parser for inline synonym mappings.
+   *
+   * @param synonymsSeparator pattern used to split the synonym mappings
+   * @param dedup set to true to dedup duplicate synonym mappings
+   * @param expand set to true to map synonyms both ways
+   * @param analyzer analyzer for the synonyms
+   */
+  public NrtsearchSynonymParser(
+      String synonymsSeparator, boolean dedup, boolean expand, Analyzer analyzer) {
+    super(dedup, analyzer);
+    this.expand = expand;
+    this.synonymsSeparator = synonymsSeparator;
+  }
+
+  /** Adds the mappings of every non-blank line to the synonym map, then closes the reader. */
+  @Override
+  public void parse(Reader mappings) throws IOException, ParseException {
+    try (BufferedReader bufferedReader = new BufferedReader(mappings)) {
+      String line;
+      while ((line = bufferedReader.readLine()) != null) {
+        if (!line.trim().isEmpty()) {
+          addInternal(line.split(synonymsSeparator));
+        }
+      }
+    }
+  }
+
+  /**
+   * Adds the given mappings, each of the form {@code from,to}, to the synonym map.
+   *
+   * @throws IllegalArgumentException if a mapping does not consist of exactly two terms
+   */
+  public void addInternal(String[] synonyms) throws IOException {
+    for (String synonym : synonyms) {
+      String[] inputStrings = split(synonym, SYNONYM_MAPPING_SEPARATOR);
+      if (inputStrings.length != 2) {
+        throw new IllegalArgumentException("synonym mapping is invalid for " + synonym);
+      }
+      CharsRef[] inputs = new CharsRef[inputStrings.length];
+      for (int i = 0; i < inputs.length; ++i) {
+        inputs[i] = analyze(unescape(inputStrings[i]).trim(), new CharsRefBuilder());
+      }
+      if (expand) {
+        // every term becomes a synonym of every other term and the original token is kept
+        for (int i = 0; i < inputs.length; ++i) {
+          for (int j = 0; j < inputs.length; ++j) {
+            if (i != j) {
+              add(inputs[i], inputs[j], true);
+            }
+          }
+        }
+      } else {
+        // every term, including the first one, is replaced by the first term
+        for (CharsRef input : inputs) {
+          add(input, inputs[0], false);
+        }
+      }
+    }
+  }
+
+  /** Splits on the separator, dropping empty parts and keeping escape sequences intact. */
+  private static String[] split(String s, String separator) {
+    List<String> parts = new ArrayList<>(2);
+    StringBuilder sb = new StringBuilder();
+    int pos = 0;
+    int end = s.length();
+    while (pos < end) {
+      if (s.startsWith(separator, pos)) {
+        if (sb.length() > 0) {
+          parts.add(sb.toString());
+          sb = new StringBuilder();
+        }
+        pos += separator.length();
+      } else {
+        char ch = s.charAt(pos++);
+        if (ch == '\\') {
+          sb.append(ch); // keep the backslash, unescape() resolves the escape later
+          if (pos >= end) {
+            break;
+          }
+          ch = s.charAt(pos++);
+        }
+        sb.append(ch);
+      }
+    }
+    if (sb.length() > 0) {
+      parts.add(sb.toString());
+    }
+    return parts.toArray(new String[0]);
+  }
+
+  /** Drops every escaping backslash; a backslash that ends the string is kept. */
+  private static String unescape(String s) {
+    if (s.indexOf('\\') < 0) {
+      return s;
+    }
+    StringBuilder sb = new StringBuilder(s.length());
+    for (int i = 0; i < s.length(); ++i) {
+      char ch = s.charAt(i);
+      if (ch == '\\' && i < s.length() - 1) {
+        ch = s.charAt(++i); // the escaped character is copied verbatim
+      }
+      sb.append(ch);
+    }
+    return sb.toString();
+  }
+}
diff --git a/src/main/java/com/yelp/nrtsearch/server/luceneserver/analysis/SynonymV2GraphFilterFactory.java b/src/main/java/com/yelp/nrtsearch/server/luceneserver/analysis/SynonymV2GraphFilterFactory.java
new file mode 100644
index 000000000..5ae447181
--- /dev/null
+++ b/src/main/java/com/yelp/nrtsearch/server/luceneserver/analysis/SynonymV2GraphFilterFactory.java
@@ -0,0 +1,134 @@
+/*
+ * Copyright 2024 Yelp Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package com.yelp.nrtsearch.server.luceneserver.analysis;
+
+import java.io.IOException;
+import java.io.StringReader;
+import java.text.ParseException;
+import java.util.Map;
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.LowerCaseFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.core.WhitespaceTokenizer;
+import org.apache.lucene.analysis.synonym.SynonymGraphFilter;
+import org.apache.lucene.analysis.synonym.SynonymMap;
+import org.apache.lucene.analysis.util.TokenFilterFactory;
+
+/**
+ * Implementation of a {@link TokenFilterFactory} that allows for loading synonym mappings. Unlike
+ * the lucene provided {@link org.apache.lucene.analysis.synonym.SynonymGraphFilterFactory}, this
+ * one lets you specify the synonyms inline as a parameter (instead of within a file).
+ *
+ * <p>Synonyms must be specified in the 'synonyms' parameter string. The value is split into
+ * individual synonym mappings with a separator pattern, which defaults to '|' (surrounding
+ * whitespace is ignored) and may be changed with the 'separator_pattern' parameter.
+ *
+ * <p>Each mapping must be of the form synonym_from,synonym_to, for example:
+ *
+ * <p>{@code a,b}
+ *
+ * <p>{@code a,b|c,d}
+ */
+public class SynonymV2GraphFilterFactory extends TokenFilterFactory {
+  /** SPI name */
+  public static final String NAME = "synonymV2";
+
+  public static final String SYNONYMS = "synonyms";
+  public static final String SYNONYM_SEPARATOR_PATTERN = "separator_pattern";
+  public static final String DEFAULT_SYNONYM_SEPARATOR_PATTERN = "\\s*\\|\\s*";
+  private static final String NRTSEARCH_PARSER_FORMAT = "nrtsearch";
+  public final boolean ignoreCase;
+  protected SynonymMap synonymMap;
+
+  /**
+   * Builds the synonym map from the factory parameters.
+   *
+   * @param args factory parameters; 'synonyms' is required
+   * @throws IOException if reading or analyzing the mappings fails
+   * @throws ParseException declared by {@link SynonymMap.Parser#parse(java.io.Reader)}
+   * @throws IllegalArgumentException if 'synonyms' is missing, 'parserFormat' is not supported or
+   *     a mapping is malformed
+   */
+  public SynonymV2GraphFilterFactory(Map<String, String> args) throws IOException, ParseException {
+    super(args);
+    String synonymMappings = args.get(SYNONYMS);
+    String separatorPattern =
+        args.getOrDefault(SYNONYM_SEPARATOR_PATTERN, DEFAULT_SYNONYM_SEPARATOR_PATTERN);
+
+    this.ignoreCase = getBoolean(args, "ignoreCase", false);
+    boolean expand = getBoolean(args, "expand", true);
+    boolean dedup = getBoolean(args, "dedup", true);
+    String parserFormat = args.getOrDefault("parserFormat", NRTSEARCH_PARSER_FORMAT);
+    String analyzerName = args.get("analyzerName");
+
+    if (synonymMappings == null) {
+      throw new IllegalArgumentException("Synonym mappings must be specified");
+    }
+
+    if (!NRTSEARCH_PARSER_FORMAT.equals(parserFormat)) {
+      throw new IllegalArgumentException(
+          "The parser format: " + parserFormat + " is not valid. It should be nrtsearch");
+    }
+
+    // Only the analyzer created here is closed here. NOTE(review): this class cannot tell whether
+    // AnalyzerCreator hands out shared instances, so a predefined analyzer is left open.
+    boolean closeAnalyzer = analyzerName == null;
+    Analyzer analyzer =
+        closeAnalyzer
+            ? createDefaultAnalyzer()
+            : AnalyzerCreator.getInstance().getAnalyzer(getPredefinedAnalyzer(analyzerName));
+    try {
+      SynonymMap.Parser parser =
+          new NrtsearchSynonymParser(separatorPattern, dedup, expand, analyzer);
+      parser.parse(new StringReader(synonymMappings));
+      synonymMap = parser.build();
+    } finally {
+      if (closeAnalyzer) {
+        analyzer.close();
+      }
+    }
+  }
+
+  /**
+   * Wraps the input with a {@link SynonymGraphFilter}, or returns the input unchanged when the
+   * synonym map has no FST.
+   */
+  @Override
+  public TokenStream create(TokenStream input) {
+    // presumably the FST is null when the map holds no entries - confirm against SynonymMap
+    return (this.synonymMap.fst == null
+        ? input
+        : new SynonymGraphFilter(input, synonymMap, ignoreCase));
+  }
+
+  /** Analyzer used when no 'analyzerName' is given: whitespace tokens, lower cased on demand. */
+  private Analyzer createDefaultAnalyzer() {
+    return new Analyzer() {
+      @Override
+      protected TokenStreamComponents createComponents(String fieldName) {
+        Tokenizer tokenizer = new WhitespaceTokenizer();
+        TokenStream stream = ignoreCase ? new LowerCaseFilter(tokenizer) : tokenizer;
+        return new TokenStreamComponents(tokenizer, stream);
+      }
+    };
+  }
+
+  /** Builds the grpc analyzer definition that refers to a predefined analyzer by name. */
+  private com.yelp.nrtsearch.server.grpc.Analyzer getPredefinedAnalyzer(String analyzerName) {
+    return com.yelp.nrtsearch.server.grpc.Analyzer.newBuilder().setPredefined(analyzerName).build();
+  }
+}
diff --git a/src/test/java/com/yelp/nrtsearch/server/luceneserver/analysis/NrtsearchSynonymParserTest.java b/src/test/java/com/yelp/nrtsearch/server/luceneserver/analysis/NrtsearchSynonymParserTest.java
new file mode 100644
index 000000000..d978f6a18
--- /dev/null
+++ b/src/test/java/com/yelp/nrtsearch/server/luceneserver/analysis/NrtsearchSynonymParserTest.java
@@ -0,0 +1,143 @@
+/*
+ * Copyright 2024 Yelp Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package com.yelp.nrtsearch.server.luceneserver.analysis;
+
+import static org.apache.lucene.analysis.BaseTokenStreamTestCase.assertAnalyzesTo;
+
+import com.carrotsearch.randomizedtesting.RandomizedRunner;
+import java.io.IOException;
+import java.io.StringReader;
+import java.text.ParseException;
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.MockAnalyzer;
+import org.apache.lucene.analysis.MockTokenizer;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.synonym.SynonymGraphFilter;
+import org.apache.lucene.analysis.synonym.SynonymMap;
+import org.apache.lucene.util.LuceneTestCase;
+import org.junit.runner.RunWith;
+
+/** Tests for {@link NrtsearchSynonymParser}. */
+@RunWith(RandomizedRunner.class)
+public class NrtsearchSynonymParserTest extends LuceneTestCase {
+  private static final String DEFAULT_SEPARATOR_PATTERN = "\\s*\\|\\s*";
+
+  /** With expand enabled a term yields its synonyms followed by the term itself. */
+  public void testParse() throws IOException, ParseException {
+    Analyzer analyzer = new MockAnalyzer(random());
+    NrtsearchSynonymParser parser =
+        new NrtsearchSynonymParser(DEFAULT_SEPARATOR_PATTERN, Boolean.TRUE, Boolean.TRUE, analyzer);
+    String synonyms =
+        "a , b|ix,pie-ix|plaza, pla|plaza, plz|str, strada |str, strasse|str, straße|village ,vlg";
+    parser.parse(new StringReader(synonyms));
+    final SynonymMap map = parser.build();
+    analyzer.close();
+    analyzer = getAnalyzer(map);
+
+    assertAnalyzesTo(analyzer, "a", new String[] {"b", "a"}, new int[] {1, 0});
+    assertAnalyzesTo(analyzer, "pie-ix", new String[] {"ix", "pie-ix"}, new int[] {1, 0});
+    assertAnalyzesTo(analyzer, "plaza", new String[] {"pla", "plz", "plaza"}, new int[] {1, 0, 0});
+    assertAnalyzesTo(
+        analyzer,
+        "str",
+        new String[] {"strada", "strasse", "straße", "str"},
+        new int[] {1, 0, 0, 0});
+    assertAnalyzesTo(analyzer, "vlg", new String[] {"village", "vlg"}, new int[] {1, 0});
+    analyzer.close();
+  }
+
+  /** Without dedup a mapping that is listed twice yields its synonym twice. */
+  public void testParseDedupFalse() throws IOException, ParseException {
+    Analyzer analyzer = new MockAnalyzer(random());
+    NrtsearchSynonymParser parser =
+        new NrtsearchSynonymParser(
+            DEFAULT_SEPARATOR_PATTERN, Boolean.FALSE, Boolean.TRUE, analyzer);
+    String synonyms = "a , b|a,b";
+    parser.parse(new StringReader(synonyms));
+    final SynonymMap map = parser.build();
+    analyzer.close();
+    analyzer = getAnalyzer(map);
+    assertAnalyzesTo(analyzer, "a", new String[] {"b", "b", "a"}, new int[] {1, 0, 0});
+    analyzer.close();
+  }
+
+  /** With expand disabled every term of a mapping is rewritten to the first term. */
+  public void testParseExpandFalse() throws IOException, ParseException {
+    Analyzer analyzer = new MockAnalyzer(random());
+    NrtsearchSynonymParser parser =
+        new NrtsearchSynonymParser(
+            DEFAULT_SEPARATOR_PATTERN, Boolean.TRUE, Boolean.FALSE, analyzer);
+    String synonyms = "a , b";
+    parser.parse(new StringReader(synonyms));
+    final SynonymMap map = parser.build();
+    analyzer.close();
+    analyzer = getAnalyzer(map);
+    assertAnalyzesTo(analyzer, "a", new String[] {"a"}, new int[] {1});
+    assertAnalyzesTo(analyzer, "b", new String[] {"a"}, new int[] {1});
+    analyzer.close();
+  }
+
+  /** A mapping with more than two terms is rejected. */
+  public void testInvalidMappings() {
+    Analyzer analyzer = new MockAnalyzer(random());
+    NrtsearchSynonymParser parser =
+        new NrtsearchSynonymParser(DEFAULT_SEPARATOR_PATTERN, Boolean.TRUE, Boolean.TRUE, analyzer);
+    String synonyms = "a, b, c, d, e";
+    expectThrows(IllegalArgumentException.class, () -> parser.parse(new StringReader(synonyms)));
+    analyzer.close();
+  }
+
+  /** Mappings may be separated by a custom pattern, here a dollar sign. */
+  public void testParseCustomSeparator() throws IOException, ParseException {
+    Analyzer analyzer = new MockAnalyzer(random());
+    NrtsearchSynonymParser parser =
+        new NrtsearchSynonymParser("\\s*\\$\\s*", Boolean.TRUE, Boolean.TRUE, analyzer);
+    String synonyms = "a , b$ix,pie-ix";
+    parser.parse(new StringReader(synonyms));
+    final SynonymMap map = parser.build();
+    analyzer.close();
+    analyzer = getAnalyzer(map);
+
+    assertAnalyzesTo(analyzer, "a", new String[] {"b", "a"}, new int[] {1, 0});
+    assertAnalyzesTo(analyzer, "pie-ix", new String[] {"ix", "pie-ix"}, new int[] {1, 0});
+    analyzer.close();
+  }
+
+  /** A backslash escapes the next character and is removed from the term. */
+  public void testParseUnescape() throws IOException, ParseException {
+    Analyzer analyzer = new MockAnalyzer(random());
+    NrtsearchSynonymParser parser =
+        new NrtsearchSynonymParser(DEFAULT_SEPARATOR_PATTERN, Boolean.TRUE, Boolean.TRUE, analyzer);
+    String synonyms = "a , \\b";
+    parser.parse(new StringReader(synonyms));
+    final SynonymMap map = parser.build();
+    analyzer.close();
+    analyzer = getAnalyzer(map);
+    assertAnalyzesTo(analyzer, "a", new String[] {"b", "a"}, new int[] {1, 0});
+    analyzer.close();
+  }
+
+  /** Analyzer that applies the synonym map on top of a {@link MockTokenizer}. */
+  private Analyzer getAnalyzer(SynonymMap map) {
+    return new Analyzer() {
+      @Override
+      protected TokenStreamComponents createComponents(String fieldName) {
+        Tokenizer tokenizer = new MockTokenizer();
+        return new TokenStreamComponents(tokenizer, new SynonymGraphFilter(tokenizer, map, true));
+      }
+    };
+  }
+}
diff --git a/src/test/java/com/yelp/nrtsearch/server/luceneserver/analysis/SynonymV2GraphFilterFactoryITest.java b/src/test/java/com/yelp/nrtsearch/server/luceneserver/analysis/SynonymV2GraphFilterFactoryITest.java
new file mode 100644
index 000000000..fa3fbe5d0
--- /dev/null
+++ b/src/test/java/com/yelp/nrtsearch/server/luceneserver/analysis/SynonymV2GraphFilterFactoryITest.java
@@ -0,0 +1,123 @@
+/*
+ * Copyright 2023 Yelp Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package com.yelp.nrtsearch.server.luceneserver.analysis;
+
+import static org.junit.Assert.assertEquals;
+
+import com.yelp.nrtsearch.server.config.LuceneServerConfiguration;
+import com.yelp.nrtsearch.server.grpc.*;
+import com.yelp.nrtsearch.server.grpc.AddDocumentRequest.MultiValuedField;
+import com.yelp.nrtsearch.server.luceneserver.ServerTestCase;
+import com.yelp.nrtsearch.server.plugins.Plugin;
+import io.grpc.testing.GrpcCleanupRule;
+import java.io.ByteArrayInputStream;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.List;
+import org.junit.Before;
+import org.junit.ClassRule;
+import org.junit.Test;
+
+/**
+ * Integration test for the synonymV2 token filter: indexes documents into a field whose analyzer
+ * is defined in registerFieldsSynonymTokenFilter.json and searches them by their synonyms.
+ */
+public class SynonymV2GraphFilterFactoryITest extends ServerTestCase {
+  @ClassRule public static final GrpcCleanupRule grpcCleanup = new GrpcCleanupRule();
+
+  /** Initializes the {@link AnalyzerCreator} without plugins before each test. */
+  @Before
+  public void init() {
+    init(Collections.emptyList());
+  }
+
+  /** Initializes the {@link AnalyzerCreator} with a minimal configuration and the given plugins. */
+  private void init(List<Plugin> plugins) {
+    AnalyzerCreator.initialize(getEmptyConfig(), plugins);
+  }
+
+  /** Minimal server configuration that only sets the node name. */
+  private LuceneServerConfiguration getEmptyConfig() {
+    String config = "nodeName: \"lucene_server_foo\"";
+    return new LuceneServerConfiguration(new ByteArrayInputStream(config.getBytes()));
+  }
+
+  /** Only the default test index is used. */
+  protected List<String> getIndices() {
+    return Collections.singletonList(DEFAULT_TEST_INDEX);
+  }
+
+  /** Loads the field definitions, including the synonymV2 analyzer, from the test resources. */
+  protected FieldDefRequest getIndexDef(String name) throws IOException {
+    return getFieldsFromResourceFile("/analysis/registerFieldsSynonymTokenFilter.json");
+  }
+
+  /** Adds document 1 with the text "plaza" and document 2 with the text "str". */
+  protected void initIndex(String name) throws Exception {
+    List<AddDocumentRequest> docs = new ArrayList<>();
+    AddDocumentRequest request =
+        AddDocumentRequest.newBuilder()
+            .setIndexName(name)
+            .putFields("doc_id", MultiValuedField.newBuilder().addValue("1").build())
+            .putFields("text_field", MultiValuedField.newBuilder().addValue("plaza").build())
+            .build();
+    docs.add(request);
+    request =
+        AddDocumentRequest.newBuilder()
+            .setIndexName(name)
+            .putFields("doc_id", MultiValuedField.newBuilder().addValue("2").build())
+            .putFields("text_field", MultiValuedField.newBuilder().addValue("str").build())
+            .build();
+    docs.add(request);
+    addDocuments(docs.stream());
+  }
+
+  @Test
+  public void testSynonymV2GraphFilter() {
+    // document 1 holds "plaza", which the synonym rules of text_field pair with "plaça"
+    SearchResponse response = searchTextField("plaça");
+    assertEquals(1, response.getHitsCount());
+    assertEquals(
+        "1", response.getHits(0).getFieldsOrThrow("doc_id").getFieldValue(0).getTextValue());
+
+    // document 2 holds "str", which is paired with "straße"
+    response = searchTextField("straße");
+    assertEquals(1, response.getHitsCount());
+    assertEquals(
+        "2", response.getHits(0).getFieldsOrThrow("doc_id").getFieldValue(0).getTextValue());
+  }
+
+  /** Runs a term query for the given text against text_field and retrieves the doc_id field. */
+  private SearchResponse searchTextField(String text) {
+    return getGrpcServer()
+        .getBlockingStub()
+        .search(
+            SearchRequest.newBuilder()
+                .setIndexName(DEFAULT_TEST_INDEX)
+                .setTopHits(10)
+                .addRetrieveFields("doc_id")
+                .setQuery(
+                    Query.newBuilder()
+                        .setTermQuery(
+                            TermQuery.newBuilder()
+                                .setField("text_field")
+                                .setTextValue(text)
+                                .build())
+                        .build())
+                .build());
+  }
+}
diff --git a/src/test/java/com/yelp/nrtsearch/server/luceneserver/analysis/SynonymV2GraphFilterFactoryTest.java b/src/test/java/com/yelp/nrtsearch/server/luceneserver/analysis/SynonymV2GraphFilterFactoryTest.java
new file mode 100644
index 000000000..79d2f6549
--- /dev/null
+++ b/src/test/java/com/yelp/nrtsearch/server/luceneserver/analysis/SynonymV2GraphFilterFactoryTest.java
@@ -0,0 +1,233 @@
+/*
+ * Copyright 2024 Yelp Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package com.yelp.nrtsearch.server.luceneserver.analysis;
+
+import com.carrotsearch.randomizedtesting.RandomizedRunner;
+import com.yelp.nrtsearch.server.config.LuceneServerConfiguration;
+import com.yelp.nrtsearch.server.plugins.Plugin;
+import java.io.ByteArrayInputStream;
+import java.io.IOException;
+import java.io.StringReader;
+import java.text.ParseException;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import org.apache.lucene.analysis.*;
+import org.apache.lucene.analysis.standard.ClassicAnalyzer;
+import org.apache.lucene.analysis.standard.StandardAnalyzer;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.util.LuceneTestCase;
+import org.junit.Before;
+import org.junit.runner.RunWith;
+
+/** Tests for the parameters and the token output of {@link SynonymV2GraphFilterFactory}. */
+@RunWith(RandomizedRunner.class)
+public class SynonymV2GraphFilterFactoryTest extends LuceneTestCase {
+  private static final String STANDARD_ANALYZER = "standard";
+
+  @Before
+  public void init() {
+    init(Collections.emptyList());
+  }
+
+  private void init(List<Plugin> plugins) {
+    AnalyzerCreator.initialize(getEmptyConfig(), plugins);
+  }
+
+  private LuceneServerConfiguration getEmptyConfig() {
+    String config = "nodeName: \"lucene_server_foo\"";
+    return new LuceneServerConfiguration(new ByteArrayInputStream(config.getBytes()));
+  }
+
+  public void testNoSynonymMappings() throws IOException, ParseException {
+    try {
+      new SynonymV2GraphFilterFactory(new HashMap<>());
+      fail();
+    } catch (IllegalArgumentException e) {
+      assertEquals("Synonym mappings must be specified", e.getMessage());
+    }
+  }
+
+  public void testNoSynonymsReturnsInputTokenStream() throws IOException, ParseException {
+    SynonymV2GraphFilterFactory synonymV2GraphFilterFactory = getFactory("a,b");
+    // the input contains neither "a" nor "b", so every token must pass through unchanged
+    TokenStream tokenStream =
+        new StandardAnalyzer().tokenStream("field", new StringReader("this is test string"));
+    String[] expectedTokens = {"this", "is", "test", "string"};
+    assertTokenStream(synonymV2GraphFilterFactory, tokenStream, expectedTokens);
+  }
+
+  public void testSingleMappingWithDefaultAnalyzer() throws IOException, ParseException {
+    SynonymV2GraphFilterFactory synonymV2GraphFilterFactory = getFactory("a, b");
+    TokenStream tokenStream =
+        new StandardAnalyzer().tokenStream("field", new StringReader("this is a test string"));
+    String[] expectedTokens = {"this", "is", "b", "a", "test", "string"};
+    assertTokenStream(synonymV2GraphFilterFactory, tokenStream, expectedTokens);
+  }
+
+  public void testMultipleMappingsWithDefaultAnalyzer() throws IOException, ParseException {
+    SynonymV2GraphFilterFactory synonymV2GraphFilterFactory =
+        getFactory("#, ste|a, b|c/, calle|plaza, plaça|p.o, po| v, väg");
+    TokenStream tokenStream =
+        new StandardAnalyzer()
+            .tokenStream("field", new StringReader("# a b calle plaça p.o väg v"));
+    String[] expectedTokens = {
+      "b", "a", "a", "b", "c/", "calle", "plaza", "plaça", "po", "p.o", "v", "väg", "väg", "v"
+    };
+    assertTokenStream(synonymV2GraphFilterFactory, tokenStream, expectedTokens);
+  }
+
+  public void testMultipleMappingsWithStandardAnalyzer() throws IOException, ParseException {
+    SynonymV2GraphFilterFactory synonymV2GraphFilterFactory =
+        getFactory("a, b|c/, calle|plaza, plaça|p.o, po| v, väg", STANDARD_ANALYZER);
+    TokenStream tokenStream =
+        new StandardAnalyzer().tokenStream("field", new StringReader("a b calle plaça p.o väg v"));
+    String[] expectedTokens = {
+      "b", "a", "a", "b", "c", "calle", "plaza", "plaça", "po", "p.o", "v", "väg", "väg", "v"
+    };
+    assertTokenStream(synonymV2GraphFilterFactory, tokenStream, expectedTokens);
+  }
+
+  public void testMultipleMappingsWithClassicAnalyzer() throws IOException, ParseException {
+    SynonymV2GraphFilterFactory synonymV2GraphFilterFactory =
+        getFactory("c/, calle|plaza, plaça", "classic");
+    TokenStream tokenStream =
+        new ClassicAnalyzer()
+            .tokenStream("field", new StringReader("this is a test for calle and plaça"));
+    String[] expectedTokens = {"test", "c", "calle", "plaza", "plaça"};
+    assertTokenStream(synonymV2GraphFilterFactory, tokenStream, expectedTokens);
+  }
+
+  public void testInvalidAnalyzer() throws IOException, ParseException {
+    try {
+      Map<String, String> params = new HashMap<>();
+      params.put(SynonymV2GraphFilterFactory.SYNONYMS, "a, b");
+      params.put("analyzerName", "invalid");
+      new SynonymV2GraphFilterFactory(params);
+      fail();
+    } catch (RuntimeException e) {
+      assertEquals("Unable to find predefined analyzer: invalid", e.getMessage());
+    }
+  }
+
+  public void testNrtsearchParserFormat() throws IOException, ParseException {
+    SynonymV2GraphFilterFactory synonymV2GraphFilterFactory =
+        getFactory("a, b", STANDARD_ANALYZER, "nrtsearch");
+    TokenStream tokenStream =
+        new StandardAnalyzer().tokenStream("field", new StringReader("this is a test string"));
+    String[] expectedTokens = {"this", "is", "b", "a", "test", "string"};
+    assertTokenStream(synonymV2GraphFilterFactory, tokenStream, expectedTokens);
+  }
+
+  public void testInvalidParserFormat() throws IOException, ParseException {
+    try {
+      Map<String, String> params = new HashMap<>();
+      params.put(SynonymV2GraphFilterFactory.SYNONYMS, "a, b");
+      params.put("parserFormat", "invalid");
+      new SynonymV2GraphFilterFactory(params);
+      fail();
+    } catch (RuntimeException e) {
+      assertEquals(
+          "The parser format: invalid is not valid. It should be nrtsearch", e.getMessage());
+    }
+  }
+
+  public void testSingleMappingWithExpandFalse() throws IOException, ParseException {
+    Map<String, String> params = new HashMap<>();
+    params.put(SynonymV2GraphFilterFactory.SYNONYMS, "a, b");
+    params.put("expand", "false");
+    SynonymV2GraphFilterFactory synonymV2GraphFilterFactory =
+        new SynonymV2GraphFilterFactory(params);
+    TokenStream tokenStream =
+        new StandardAnalyzer().tokenStream("field", new StringReader("this is a test string"));
+    String[] expectedTokens = {"this", "is", "a", "test", "string"};
+    assertTokenStream(synonymV2GraphFilterFactory, tokenStream, expectedTokens);
+  }
+
+  public void testMultipleMappingsWithCustomSeparator() throws IOException, ParseException {
+    Map<String, String> params = new HashMap<>();
+    params.put(
+        SynonymV2GraphFilterFactory.SYNONYMS,
+        "#, ste=>a, b=>c/, calle=>plaza, plaça=>p.o, po=> v, väg");
+    params.put("separator_pattern", "\\s*\\=>\\s*");
+    SynonymV2GraphFilterFactory synonymV2GraphFilterFactory =
+        new SynonymV2GraphFilterFactory(params);
+    TokenStream tokenStream =
+        new StandardAnalyzer()
+            .tokenStream("field", new StringReader("# a b calle plaça p.o väg v"));
+    String[] expectedTokens = {
+      "b", "a", "a", "b", "c/", "calle", "plaza", "plaça", "po", "p.o", "v", "väg", "väg", "v"
+    };
+    assertTokenStream(synonymV2GraphFilterFactory, tokenStream, expectedTokens);
+  }
+
+  public void testSingleMappingIgnoreCase() throws IOException, ParseException {
+    Map<String, String> params = new HashMap<>();
+    params.put(SynonymV2GraphFilterFactory.SYNONYMS, "A, B");
+    params.put("ignoreCase", "true");
+    SynonymV2GraphFilterFactory synonymV2GraphFilterFactory =
+        new SynonymV2GraphFilterFactory(params);
+    TokenStream tokenStream =
+        new StandardAnalyzer().tokenStream("field", new StringReader("this is a test string"));
+    String[] expectedTokens = {"this", "is", "b", "a", "test", "string"};
+    assertTokenStream(synonymV2GraphFilterFactory, tokenStream, expectedTokens);
+  }
+
+  /** Asserts that the filter built by the factory emits exactly the expected terms, in order. */
+  private static void assertTokenStream(
+      SynonymV2GraphFilterFactory synonymV2GraphFilterFactory,
+      TokenStream tokenStream,
+      String[] expectedTokens)
+      throws IOException {
+    int i = 0;
+    try (TokenStream output = synonymV2GraphFilterFactory.create(tokenStream)) {
+      CharTermAttribute charTermAtt = output.addAttribute(CharTermAttribute.class);
+      output.reset();
+      while (output.incrementToken()) {
+        assertTrue("unexpected extra token: " + charTermAtt, i < expectedTokens.length);
+        assertEquals(expectedTokens[i++], charTermAtt.toString());
+      }
+      output.end();
+    }
+    assertEquals("number of emitted tokens", expectedTokens.length, i);
+  }
+
+  private SynonymV2GraphFilterFactory getFactory(String synonyms)
+      throws IOException, ParseException {
+    Map<String, String> params = new HashMap<>();
+    params.put(SynonymV2GraphFilterFactory.SYNONYMS, synonyms);
+    return new SynonymV2GraphFilterFactory(params);
+  }
+
+  private SynonymV2GraphFilterFactory getFactory(String synonyms, String analyzerName)
+      throws IOException, ParseException {
+    Map<String, String> params = new HashMap<>();
+    params.put(SynonymV2GraphFilterFactory.SYNONYMS, synonyms);
+    params.put("analyzerName", analyzerName);
+    return new SynonymV2GraphFilterFactory(params);
+  }
+
+  private SynonymV2GraphFilterFactory getFactory(
+      String synonyms, String analyzerName, String parserFormat)
+      throws IOException, ParseException {
+    Map<String, String> params = new HashMap<>();
+    params.put(SynonymV2GraphFilterFactory.SYNONYMS, synonyms);
+    params.put("analyzerName", analyzerName);
+    params.put("parserFormat", parserFormat);
+    return new SynonymV2GraphFilterFactory(params);
+  }
+}
diff --git a/src/test/resources/analysis/registerFieldsSynonymTokenFilter.json b/src/test/resources/analysis/registerFieldsSynonymTokenFilter.json
new file mode 100644
index 000000000..8a654c39e
--- /dev/null
+++ b/src/test/resources/analysis/registerFieldsSynonymTokenFilter.json
@@ -0,0 +1,39 @@
+{
+  "indexName": "test_index",
+  "field": [
+    {
+      "name": "doc_id",
+      "type": "_ID",
+      "search": true,
+      "storeDocValues": true
+    },
+    {
+      "name": "text_field",
+      "type": "TEXT",
+      "search": true,
+      "tokenize": true,
+      "multiValued": true,
+      "storeDocValues": true,
+      "analyzer": {
+        "custom": {
+          "tokenizer": {
+            "name": "keyword"
+          },
+          "tokenFilters": [
+            {
+              "name": "lowercase"
+            },
+            {
+              "name": "synonymV2",
+              "params": {
+                "synonyms": "blvd, boulevard|u.s,us|plaza, plaça|ix,pie-ix|str, straße|v, väg",
+                "parserFormat": "nrtsearch",
+                "analyzerName": "standard"
+              }
+            }
+          ]
+        }
+      }
+    }
+  ]
+}
\ No newline at end of file