Add analyzers #40

Merged · 15 commits · Feb 6, 2020

Changes from 9 commits

AnalyzerCreator.java (new file)
@@ -0,0 +1,126 @@
/*
 * Copyright 2020 Yelp Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
 * either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 */

package org.apache.platypus.server.luceneserver;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.custom.CustomAnalyzer;
import org.apache.lucene.analysis.standard.ClassicAnalyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.util.Version;
import org.apache.platypus.server.grpc.ConditionalTokenFilter;
import org.apache.platypus.server.grpc.Field;
import org.apache.platypus.server.grpc.NameAndParams;

import java.io.IOException;
import java.lang.reflect.InvocationTargetException;
import java.text.MessageFormat;
import java.text.ParseException;
import java.util.HashMap;

public class AnalyzerCreator {

    private static final String LUCENE_ANALYZER_PATH = "org.apache.lucene.analysis.{0}Analyzer";

    static Analyzer getAnalyzer(org.apache.platypus.server.grpc.Analyzer analyzer) {
        if (!analyzer.getPredefined().isEmpty()) {
            String predefinedAnalyzer = analyzer.getPredefined();

            if ("standard".equals(predefinedAnalyzer)) {
                return new StandardAnalyzer();
            } else if ("classic".equals(predefinedAnalyzer)) {
                return new ClassicAnalyzer();
            } else {
                // Try to dynamically load the analyzer class
                try {
                    String className = MessageFormat.format(LUCENE_ANALYZER_PATH, predefinedAnalyzer);
                    return (Analyzer) AnalyzerCreator.class.getClassLoader().loadClass(className).getDeclaredConstructor().newInstance();
                } catch (InstantiationException | IllegalAccessException | NoSuchMethodException | ClassNotFoundException | InvocationTargetException e) {
                    throw new AnalyzerCreationException("Unable to find predefined analyzer: " + predefinedAnalyzer, e);
                }
            }
        } else if (analyzer.hasCustom()) {
            return getCustomAnalyzer(analyzer.getCustom());
        } else {
            return null;
        }
    }

    /**
     * Create an {@link Analyzer} from user parameters. Note that we create new maps with the param maps because
     * the Protobuf one may be unmodifiable and Lucene may modify the maps.
     */
    private static Analyzer getCustomAnalyzer(org.apache.platypus.server.grpc.CustomAnalyzer analyzer) {
        CustomAnalyzer.Builder builder = CustomAnalyzer.builder();

        if (analyzer.hasPositionIncrementGap()) {
            builder.withPositionIncrementGap(analyzer.getPositionIncrementGap().getInt());
        }
        if (analyzer.hasOffsetGap()) {
            builder.withOffsetGap(analyzer.getOffsetGap().getInt());
        }

        try {
            if (!analyzer.getDefaultMatchVersion().isEmpty()) {
                builder.withDefaultMatchVersion(Version.parseLeniently(analyzer.getDefaultMatchVersion()));
            }

            for (NameAndParams charFilter : analyzer.getCharFiltersList()) {
                builder.addCharFilter(charFilter.getName(), new HashMap<>(charFilter.getParamsMap()));
            }

            builder.withTokenizer(analyzer.getTokenizer().getName(), new HashMap<>(analyzer.getTokenizer().getParamsMap()));

            for (NameAndParams tokenFilter : analyzer.getTokenFiltersList()) {
                builder.addTokenFilter(tokenFilter.getName(), new HashMap<>(tokenFilter.getParamsMap()));
            }

            // TODO: The only impl of ConditionalTokenFilter is ProtectedTermFilter (https://lucene.apache.org/core/8_2_0/analyzers-common/org/apache/lucene/analysis/miscellaneous/ProtectedTermFilterFactory.html)
            // It needs a protected terms file as input which is not supported yet.
            for (ConditionalTokenFilter conditionalTokenFilter : analyzer.getConditionalTokenFiltersList()) {
                NameAndParams condition = conditionalTokenFilter.getCondition();
                CustomAnalyzer.ConditionBuilder when = builder.when(condition.getName(), condition.getParamsMap());

                for (NameAndParams tokenFilter : conditionalTokenFilter.getTokenFiltersList()) {
                    when.addTokenFilter(tokenFilter.getName(), tokenFilter.getParamsMap());
                }

                when.endwhen();
            }

            return builder.build();
        } catch (ParseException | IOException e) {
            throw new AnalyzerCreationException("Unable to create custom analyzer: " + analyzer, e);
        }
    }

    // TODO: replace all usages of this method with getAnalyzer
    static Analyzer getStandardAnalyzer() {
        return new StandardAnalyzer();
    }

    static boolean hasAnalyzer(Field field) {
        return !field.getAnalyzer().getPredefined().isEmpty() || field.getAnalyzer().hasCustom();
    }

    static class AnalyzerCreationException extends RuntimeException {

        AnalyzerCreationException(String message, Throwable cause) {
            super(message, cause);
        }
    }
}
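For context, here is a small usage sketch of the new API. It is not part of this PR's diff: the client class and values are hypothetical, and the builder calls assume the standard protobuf Java codegen for the messages in analysis.proto (newBuilder(), setPredefined(), setCustom(), addTokenFilters(), and so on). Component names for the custom analyzer are Lucene analysis factory names (htmlstrip, standard, lowercase, stop), as listed in the proto comments.

// Usage sketch (illustrative only, not part of this change).
// Lives in the same package so it can call the package-private getAnalyzer().
package org.apache.platypus.server.luceneserver;

import org.apache.lucene.analysis.Analyzer;
import org.apache.platypus.server.grpc.CustomAnalyzer;
import org.apache.platypus.server.grpc.NameAndParams;

public class AnalyzerCreatorUsageSketch {

    public static void main(String[] args) {
        // Predefined analyzer: "en.English" is expanded via LUCENE_ANALYZER_PATH to
        // org.apache.lucene.analysis.en.EnglishAnalyzer and loaded reflectively.
        Analyzer english = AnalyzerCreator.getAnalyzer(
                org.apache.platypus.server.grpc.Analyzer.newBuilder()
                        .setPredefined("en.English")
                        .build());

        // Custom analyzer: char filter, tokenizer and token filters are referenced by
        // their Lucene factory names, with optional per-component params.
        Analyzer custom = AnalyzerCreator.getAnalyzer(
                org.apache.platypus.server.grpc.Analyzer.newBuilder()
                        .setCustom(CustomAnalyzer.newBuilder()
                                .addCharFilters(NameAndParams.newBuilder().setName("htmlstrip"))
                                .setTokenizer(NameAndParams.newBuilder().setName("standard"))
                                .addTokenFilters(NameAndParams.newBuilder().setName("lowercase"))
                                .addTokenFilters(NameAndParams.newBuilder()
                                        .setName("stop")
                                        .putParams("ignoreCase", "true"))
                                .build())
                        .build());

        System.out.println(english.getClass().getSimpleName() + " / " + custom.getClass().getSimpleName());
    }
}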

@@ -117,10 +117,10 @@ private Lookup getSuggester(IndexState indexState, BuildSuggestRequest buildSugg
int options = 0;
FuzzySuggester fuzzySuggester = buildSuggestRequest.getFuzzySuggester();
if (fuzzySuggester.getAnalyzer() != null) {
- indexAnalyzer = queryAnalyzer = RegisterFieldsHandler.getAnalyzer(indexState, null, "analyzer");
+ indexAnalyzer = queryAnalyzer = AnalyzerCreator.getStandardAnalyzer();
} else {
- indexAnalyzer = RegisterFieldsHandler.getAnalyzer(indexState, null, "indexAnalyzer");
- queryAnalyzer = RegisterFieldsHandler.getAnalyzer(indexState, null, "queryAnalyzer");
+ indexAnalyzer = AnalyzerCreator.getStandardAnalyzer();
+ queryAnalyzer = AnalyzerCreator.getStandardAnalyzer();
}
if (indexAnalyzer == null) {
throw new RuntimeException("analyzer analyzer or indexAnalyzer must be specified");
@@ -157,10 +157,10 @@ private Lookup getSuggester(IndexState indexState, BuildSuggestRequest buildSugg
int options = 0;
org.apache.platypus.server.grpc.AnalyzingSuggester analyzingSuggester = buildSuggestRequest.getAnalyzingSuggester();
if (analyzingSuggester.getAnalyzer() != null) {
- indexAnalyzer = queryAnalyzer = RegisterFieldsHandler.getAnalyzer(indexState, null, "analyzer");
+ indexAnalyzer = queryAnalyzer = AnalyzerCreator.getStandardAnalyzer();
} else {
- indexAnalyzer = RegisterFieldsHandler.getAnalyzer(indexState, null, "indexAnalyzer");
- queryAnalyzer = RegisterFieldsHandler.getAnalyzer(indexState, null, "queryAnalyzer");
+ indexAnalyzer = AnalyzerCreator.getStandardAnalyzer();
+ queryAnalyzer = AnalyzerCreator.getStandardAnalyzer();
}
if (indexAnalyzer == null) {
throw new RuntimeException("analyzer analyzer or indexAnalyzer must be specified");
@@ -184,10 +184,10 @@ private Lookup getSuggester(IndexState indexState, BuildSuggestRequest buildSugg
maxGraphExpansions, true);
} else if (buildSuggestRequest.hasInfixSuggester()) {
if (buildSuggestRequest.getInfixSuggester().getAnalyzer() != null) {
- indexAnalyzer = queryAnalyzer = RegisterFieldsHandler.getAnalyzer(indexState, null, "analyzer");
+ indexAnalyzer = queryAnalyzer = AnalyzerCreator.getStandardAnalyzer();
} else {
- indexAnalyzer = RegisterFieldsHandler.getAnalyzer(indexState, null, "indexAnalyzer");
- queryAnalyzer = RegisterFieldsHandler.getAnalyzer(indexState, null, "queryAnalyzer");
+ indexAnalyzer = AnalyzerCreator.getStandardAnalyzer();
+ queryAnalyzer = AnalyzerCreator.getStandardAnalyzer();
}
if (indexAnalyzer == null) {
throw new RuntimeException("analyzer analyzer or indexAnalyzer must be specified");

RegisterFieldsHandler.java
@@ -43,6 +43,8 @@
import java.text.SimpleDateFormat;
import java.util.*;

+ import static org.apache.platypus.server.luceneserver.AnalyzerCreator.hasAnalyzer;

public class RegisterFieldsHandler implements Handler<FieldDefRequest, FieldDefResponse> {

Logger logger = LoggerFactory.getLogger(RegisterFieldsHandler.class);
@@ -229,7 +231,7 @@ private FieldDef parseOneFieldType(IndexState indexState, Map<String, FieldDef>
break;

case ATOM:
- if (!currentField.getAnalyzer().isEmpty()) {
+ if (hasAnalyzer(currentField)) {
throw new RegisterFieldsException("no analyzer allowed with atom (it's hardwired to KeywordAnalyzer internally)");
}
if (highlighted) {
@@ -341,7 +343,7 @@ private FieldDef parseOneFieldType(IndexState indexState, Map<String, FieldDef>
throw new RegisterFieldsException("search must be true when highlight is true");
}

- if (!currentField.getAnalyzer().isEmpty() && ft.indexOptions() == IndexOptions.NONE) {
+ if (hasAnalyzer(currentField) && ft.indexOptions() == IndexOptions.NONE) {
throw new RegisterFieldsException("no analyzer allowed when search=false");
}

@@ -361,7 +363,7 @@ private FieldDef parseOneFieldType(IndexState indexState, Map<String, FieldDef>
ft.setStoreTermVectors(true);
ft.setStoreTermVectorPositions(true);
ft.setStoreTermVectorOffsets(true);
- } else if (currentField.getTermVectors().equals(TermVectors.TERMS_POSITIONS_OFFSETS)) {
+ } else if (currentField.getTermVectors().equals(TermVectors.TERMS_POSITIONS_OFFSETS_PAYLOADS)) {
ft.setStoreTermVectors(true);
ft.setStoreTermVectorPositions(true);
ft.setStoreTermVectorOffsets(true);
@@ -371,13 +373,13 @@ private FieldDef parseOneFieldType(IndexState indexState, Map<String, FieldDef>
}
}

- if (currentField.getIndexOptions().equals(IndexOptions.DOCS)) {
+ if (currentField.getIndexOptions().equals(org.apache.platypus.server.grpc.IndexOptions.DOCS)) {
ft.setIndexOptions(IndexOptions.DOCS);
- } else if (currentField.getIndexOptions().equals(IndexOptions.DOCS_AND_FREQS)) {
+ } else if (currentField.getIndexOptions().equals(org.apache.platypus.server.grpc.IndexOptions.DOCS_FREQS)) {
ft.setIndexOptions(IndexOptions.DOCS_AND_FREQS);
- } else if (currentField.getIndexOptions().equals(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)) {
+ } else if (currentField.getIndexOptions().equals(org.apache.platypus.server.grpc.IndexOptions.DOCS_FREQS_POSITIONS)) {
ft.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS);
- } else if (currentField.getIndexOptions().equals(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS)) {
+ } else if (currentField.getIndexOptions().equals(org.apache.platypus.server.grpc.IndexOptions.DOCS_FREQS_POSITIONS_OFFSETS)) {
ft.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
ft.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
} else { //default option
ft.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS);
@@ -420,12 +422,12 @@ private FieldDef parseOneFieldType(IndexState indexState, Map<String, FieldDef>

Analyzer indexAnalyzer;
Analyzer searchAnalyzer;
- Analyzer analyzer = getAnalyzer(indexState, currentField, currentField.getAnalyzer());
+ Analyzer analyzer = AnalyzerCreator.getAnalyzer(currentField.getAnalyzer());
if (analyzer != null) {
indexAnalyzer = searchAnalyzer = analyzer;
} else {
- indexAnalyzer = getAnalyzer(indexState, currentField, currentField.getIndexAnalyzer());
- searchAnalyzer = getAnalyzer(indexState, currentField, currentField.getSearchAnalyzer());
+ indexAnalyzer = AnalyzerCreator.getAnalyzer(currentField.getIndexAnalyzer());
+ searchAnalyzer = AnalyzerCreator.getAnalyzer(currentField.getSearchAnalyzer());
}

if (type == FieldDef.FieldValueType.TEXT && ft.indexOptions() != IndexOptions.NONE) {
@@ -530,19 +532,6 @@ private FieldDef parseOneVirtualFieldType(IndexState indexState, Map<String, Fie

}

- //TODO: Always return StandardAnalyzer for now, eventually we want to support all analyzers from lucene-analysis. Also support building custom
- //analyzers
- static Analyzer getAnalyzer(IndexState state, Field currentField, String name) {
-     Analyzer analyzer;
-     if (!name.isEmpty()) {
-         //TODO: support all analyzers from lucene-analysis, CJK, and CustomAnalyzers
-         analyzer = new StandardAnalyzer();
-     } else {
-         analyzer = null;
-     }
-     return analyzer;
- }

public static class RegisterFieldsException extends Handler.HandlerException {
public RegisterFieldsException(String errorMessage) {
super(errorMessage);

src/main/proto/analysis.proto (new file, 41 additions, 0 deletions)
@@ -0,0 +1,41 @@
/* Description of analyzers, predefined and custom */
syntax = "proto3";

option java_multiple_files = true;
option java_package = "org.apache.platypus.server.grpc";
option java_outer_classname = "AnalysisProto";
option objc_class_prefix = "HLW";

package luceneserver;

message NameAndParams {
    string name = 1;
    map<string, string> params = 2;
}

message ConditionalTokenFilter {
    NameAndParams condition = 1;
    repeated NameAndParams tokenFilters = 2;
}

// Used to be able to check if a value was set
message IntObject {
    int32 int = 1;
}

message CustomAnalyzer {
    repeated NameAndParams charFilters = 1; // Available char filters as of Lucene 8.2.0: htmlstrip, mapping, persian, patternreplace
    NameAndParams tokenizer = 2;
    repeated NameAndParams tokenFilters = 3;
    repeated ConditionalTokenFilter conditionalTokenFilters = 4; // TODO: this is not properly supported yet, the only impl requires a protected terms file. Can support this properly later if needed
    string defaultMatchVersion = 5; // Lucene version as LUCENE_X_Y_Z or X.Y.Z, LATEST by default
    IntObject positionIncrementGap = 6;
    IntObject offsetGap = 7;
}

message Analyzer {
    oneof AnalyzerType {
        string predefined = 1; // Analyzers predefined in Lucene, apart from standard and classic there are en.English, bn.Bengali, eu.Basque, etc. (names derived from Lucene's analyzer class names)
        CustomAnalyzer custom = 2;
    }
}
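A note on the IntObject wrapper: a bare proto3 int32 cannot distinguish "not set" from an explicit 0, while a message-typed field can, and AnalyzerCreator relies on that via hasPositionIncrementGap()/hasOffsetGap(). A small sketch, assuming the standard protobuf-generated builders and a hypothetical client class:

// Presence-check sketch (illustrative only, not part of this change).
import org.apache.platypus.server.grpc.CustomAnalyzer;
import org.apache.platypus.server.grpc.IntObject;
import org.apache.platypus.server.grpc.NameAndParams;

public class IntObjectPresenceSketch {
    public static void main(String[] args) {
        CustomAnalyzer unset = CustomAnalyzer.newBuilder()
                .setTokenizer(NameAndParams.newBuilder().setName("standard"))
                .build();
        // Gap not set: AnalyzerCreator leaves Lucene's default position increment gap in place.
        System.out.println(unset.hasPositionIncrementGap());            // false

        CustomAnalyzer zeroGap = unset.toBuilder()
                .setPositionIncrementGap(IntObject.newBuilder().setInt(0))
                .build();
        // Explicitly set to 0: distinguishable from "not set", unlike a bare int32 field.
        System.out.println(zeroGap.hasPositionIncrementGap());          // true
        System.out.println(zeroGap.getPositionIncrementGap().getInt()); // 0
    }
}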

src/main/proto/luceneserver.proto (4 additions, 3 deletions)
@@ -2,6 +2,7 @@
syntax = "proto3";

import "search.proto";
import "analysis.proto";

option java_multiple_files = true;
option java_package = "org.apache.platypus.server.grpc";
@@ -220,9 +221,9 @@ message Field {
IndexOptions indexOptions = 15; //How the tokens should be indexed.
string expression = 16; // The JavaScript expression defining a virtual field's value (only used with type=virtual).
//TODO make analyzers message types i.e. StandardAnalyzer, EnglishAnalyzer, CustomAnalyzer etc
- string analyzer = 17; // Analyzer to use for this field during indexing and searching.
- string indexAnalyzer = 18; // Analyzer to use for this field during indexing.
- string searchAnalyzer = 19; //Analyzer to use for this field during searching.
+ Analyzer analyzer = 17; // Analyzer to use for this field during indexing and searching.
+ Analyzer indexAnalyzer = 18; // Analyzer to use for this field during indexing.
+ Analyzer searchAnalyzer = 19; //Analyzer to use for this field during searching.
TermVectors termVectors = 20; // Whether/how term vectors should be indexed.
//TODO make similarity message types i.d. DefaultSimilarity, CustomSimilarity, BM25Similarity;
string similarity = 21; // Which Similarity implementation to use for this field.
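The practical effect of the Field change is that clients now pass an Analyzer message rather than an analyzer name string. A before/after sketch from the client side, assuming the standard protobuf-generated builders (the field contents are hypothetical):

// Field registration sketch (illustrative only, not part of this change).
import org.apache.platypus.server.grpc.Analyzer;
import org.apache.platypus.server.grpc.CustomAnalyzer;
import org.apache.platypus.server.grpc.Field;
import org.apache.platypus.server.grpc.NameAndParams;

public class FieldAnalyzerSketch {
    public static void main(String[] args) {
        // Before this PR the equivalent call was .setAnalyzer("standard") with a plain string.
        Field simple = Field.newBuilder()
                .setAnalyzer(Analyzer.newBuilder().setPredefined("standard"))
                .build();

        // Separate index and search analyzers, one of them custom.
        Field html = Field.newBuilder()
                .setIndexAnalyzer(Analyzer.newBuilder()
                        .setCustom(CustomAnalyzer.newBuilder()
                                .addCharFilters(NameAndParams.newBuilder().setName("htmlstrip"))
                                .setTokenizer(NameAndParams.newBuilder().setName("standard"))
                                .addTokenFilters(NameAndParams.newBuilder().setName("lowercase"))))
                .setSearchAnalyzer(Analyzer.newBuilder().setPredefined("en.English"))
                .build();

        System.out.println(simple.hasAnalyzer() + " " + html.hasIndexAnalyzer());
    }
}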