Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add regexp interval source #1917

Merged
merged 2 commits into from
Feb 7, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@
import org.apache.lucene.queries.intervals.IntervalsSource;
import org.apache.lucene.search.FuzzyQuery;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.automaton.CompiledAutomaton;
import org.opensearch.LegacyESVersion;
import org.opensearch.common.ParseField;
import org.opensearch.common.ParsingException;
Expand Down Expand Up @@ -100,12 +101,14 @@ public static IntervalsSourceProvider fromXContent(XContentParser parser) throws
return Prefix.fromXContent(parser);
case "wildcard":
return Wildcard.fromXContent(parser);
case "regexp":
return Regexp.fromXContent(parser);
case "fuzzy":
return Fuzzy.fromXContent(parser);
}
throw new ParsingException(
parser.getTokenLocation(),
"Unknown interval type [" + parser.currentName() + "], expecting one of [match, any_of, all_of, prefix, wildcard]"
"Unknown interval type [" + parser.currentName() + "], expecting one of [match, any_of, all_of, prefix, wildcard, regexp]"
);
}

Expand Down Expand Up @@ -630,6 +633,155 @@ String getUseField() {
}
}

public static class Regexp extends IntervalsSourceProvider {

public static final String NAME = "regexp";
public static final int DEFAULT_FLAGS_VALUE = RegexpFlag.ALL.value();

private final String pattern;
private final int flags;
private final String useField;
private final Integer maxExpansions;

public Regexp(String pattern, int flags, String useField, Integer maxExpansions) {
this.pattern = pattern;
this.flags = flags;
this.useField = useField;
this.maxExpansions = (maxExpansions != null && maxExpansions > 0) ? maxExpansions : null;
}

public Regexp(StreamInput in) throws IOException {
mattweber marked this conversation as resolved.
Show resolved Hide resolved
this.pattern = in.readString();
this.flags = in.readVInt();
this.useField = in.readOptionalString();
this.maxExpansions = in.readOptionalVInt();
}

@Override
public IntervalsSource getSource(QueryShardContext context, MappedFieldType fieldType) {
final org.apache.lucene.util.automaton.RegExp regexp = new org.apache.lucene.util.automaton.RegExp(pattern, flags);
final CompiledAutomaton automaton = new CompiledAutomaton(regexp.toAutomaton());

if (useField != null) {
fieldType = context.fieldMapper(useField);
assert fieldType != null;
checkPositions(fieldType);

IntervalsSource regexpSource = maxExpansions == null
? Intervals.multiterm(automaton, regexp.toString())
: Intervals.multiterm(automaton, maxExpansions, regexp.toString());
return Intervals.fixField(useField, regexpSource);
} else {
checkPositions(fieldType);
return maxExpansions == null
? Intervals.multiterm(automaton, regexp.toString())
: Intervals.multiterm(automaton, maxExpansions, regexp.toString());
}
}

private void checkPositions(MappedFieldType type) {
if (type.getTextSearchInfo().hasPositions() == false) {
throw new IllegalArgumentException("Cannot create intervals over field [" + type.name() + "] with no positions indexed");
}
}

@Override
public void extractFields(Set<String> fields) {
if (useField != null) {
fields.add(useField);
}
}

@Override
public boolean equals(Object o) {
if (this == o) return true;
if (o == null || getClass() != o.getClass()) return false;
Regexp regexp = (Regexp) o;
return Objects.equals(pattern, regexp.pattern)
&& Objects.equals(flags, regexp.flags)
&& Objects.equals(useField, regexp.useField)
&& Objects.equals(maxExpansions, regexp.maxExpansions);
}

@Override
public int hashCode() {
return Objects.hash(pattern, flags, useField, maxExpansions);
}

@Override
public String getWriteableName() {
return NAME;
}

@Override
public void writeTo(StreamOutput out) throws IOException {
out.writeString(pattern);
out.writeVInt(flags);
out.writeOptionalString(useField);
out.writeOptionalVInt(maxExpansions);
}

@Override
public XContentBuilder toXContent(XContentBuilder builder, Params params) throws IOException {
builder.startObject(NAME);
builder.field("pattern", pattern);
if (flags != DEFAULT_FLAGS_VALUE) {
builder.field("flags_value", flags);
}
if (useField != null) {
builder.field("use_field", useField);
}
if (maxExpansions != null) {
builder.field("max_expansions", maxExpansions);
}
builder.endObject();
return builder;
}

private static final ConstructingObjectParser<Regexp, Void> PARSER = new ConstructingObjectParser<>(NAME, args -> {
String pattern = (String) args[0];
String flags = (String) args[1];
Integer flagsValue = (Integer) args[2];
String useField = (String) args[3];
Integer maxExpansions = (Integer) args[4];

if (flagsValue != null) {
return new Regexp(pattern, flagsValue, useField, maxExpansions);
} else if (flags != null) {
return new Regexp(pattern, RegexpFlag.resolveValue(flags), useField, maxExpansions);
} else {
return new Regexp(pattern, DEFAULT_FLAGS_VALUE, useField, maxExpansions);
}
});
static {
PARSER.declareString(constructorArg(), new ParseField("pattern"));
PARSER.declareString(optionalConstructorArg(), new ParseField("flags"));
PARSER.declareInt(optionalConstructorArg(), new ParseField("flags_value"));
PARSER.declareString(optionalConstructorArg(), new ParseField("use_field"));
PARSER.declareInt(optionalConstructorArg(), new ParseField("max_expansions"));
}

public static Regexp fromXContent(XContentParser parser) throws IOException {
return PARSER.parse(parser, null);
}

String getPattern() {
return pattern;
}

int getFlags() {
return flags;
}

String getUseField() {
return useField;
}

Integer getMaxExpansions() {
return maxExpansions;
}
}

public static class Wildcard extends IntervalsSourceProvider {

public static final String NAME = "wildcard";
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1254,6 +1254,11 @@ public static List<NamedWriteableRegistry.Entry> getIntervalsSourceProviderNamed
IntervalsSourceProvider.Wildcard.NAME,
IntervalsSourceProvider.Wildcard::new
),
new NamedWriteableRegistry.Entry(
IntervalsSourceProvider.class,
IntervalsSourceProvider.Regexp.NAME,
IntervalsSourceProvider.Regexp::new
),
new NamedWriteableRegistry.Entry(
IntervalsSourceProvider.class,
IntervalsSourceProvider.Fuzzy.NAME,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -36,11 +36,14 @@
import org.apache.lucene.queries.intervals.IntervalQuery;
import org.apache.lucene.queries.intervals.Intervals;
import org.apache.lucene.queries.intervals.IntervalsSource;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.BoostQuery;
import org.apache.lucene.search.FuzzyQuery;
import org.apache.lucene.search.MatchNoDocsQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.automaton.CompiledAutomaton;
import org.apache.lucene.util.automaton.RegExp;
import org.opensearch.common.ParsingException;
import org.opensearch.common.Strings;
import org.opensearch.common.compress.CompressedXContent;
Expand Down Expand Up @@ -656,6 +659,114 @@ public void testWildcard() throws IOException {
assertEquals(expected, builder.toQuery(createShardContext()));
}

/**
 * Builds the intervals source the tests expect for a regexp: compiles {@code pattern} with
 * {@code flags} and wraps the automaton in a multi-term source, passing {@code maxExpansions}
 * through only when it is non-null.
 */
private static IntervalsSource buildRegexpSource(String pattern, int flags, Integer maxExpansions) {
    final RegExp compiledPattern = new RegExp(pattern, flags);
    final CompiledAutomaton compiled = new CompiledAutomaton(compiledPattern.toAutomaton());

    return maxExpansions == null
        ? Intervals.multiterm(compiled, compiledPattern.toString())
        : Intervals.multiterm(compiled, maxExpansions, compiledPattern.toString());
}

/**
 * Exercises parsing and query building for the {@code regexp} interval source: the plain
 * pattern, {@code use_field}, {@code flags}, {@code flags_value} and {@code max_expansions},
 * plus the cases that must fail with an {@link IllegalArgumentException}.
 */
public void testRegexp() throws IOException {
    final int defaultFlags = RegexpFlag.ALL.value();

    // Pattern only: default flags, no expansion limit.
    assertRegexpQuery(
        regexpIntervalsJson(TEXT_FIELD_NAME, "\"pattern\" : \"te.m\""),
        buildRegexpSource("te.m", defaultFlags, null)
    );

    // Queried field has no positions.
    assertRegexpRejected(regexpIntervalsJson(NO_POSITIONS_FIELD, "\"pattern\" : \"[Tt]erm\""));

    // use_field redirects term lookup to another field.
    assertRegexpQuery(
        regexpIntervalsJson(TEXT_FIELD_NAME, "\"pattern\" : \"te.m\", \"use_field\" : \"masked_field\""),
        Intervals.fixField(MASKED_FIELD, buildRegexpSource("te.m", defaultFlags, null))
    );

    // use_field points at a field with no positions.
    assertRegexpRejected(
        regexpIntervalsJson(TEXT_FIELD_NAME, "\"pattern\" : \"te.m\", \"use_field\" : \"" + NO_POSITIONS_FIELD + "\"")
    );

    // Symbolic flags.
    assertRegexpQuery(
        regexpIntervalsJson(TEXT_FIELD_NAME, "\"pattern\" : \"te.m\", \"flags\" : \"NONE\""),
        buildRegexpSource("te.m", RegexpFlag.NONE.value(), null)
    );

    // Numeric flags_value (sent as a quoted string here).
    assertRegexpQuery(
        regexpIntervalsJson(
            TEXT_FIELD_NAME,
            "\"pattern\" : \"te.m\", \"flags_value\" : \"" + RegexpFlag.ANYSTRING.value() + "\""
        ),
        buildRegexpSource("te.m", RegexpFlag.ANYSTRING.value(), null)
    );

    // Explicit max_expansions.
    assertRegexpQuery(
        regexpIntervalsJson(TEXT_FIELD_NAME, "\"pattern\" : \"te.m\", \"max_expansions\" : 500"),
        buildRegexpSource("te.m", defaultFlags, 500)
    );

    // Negative max_expansions: max expansions use default.
    assertRegexpQuery(
        regexpIntervalsJson(TEXT_FIELD_NAME, "\"pattern\" : \"te.m\", \"max_expansions\" : -20"),
        buildRegexpSource("te.m", defaultFlags, null)
    );

    // max_expansions above the boolean clause limit is rejected.
    assertRegexpRejected(
        regexpIntervalsJson(
            TEXT_FIELD_NAME,
            "\"pattern\" : \"te.m\", \"max_expansions\" : " + (BooleanQuery.getMaxClauseCount() + 1)
        )
    );

    // flags combined with max_expansions.
    assertRegexpQuery(
        regexpIntervalsJson(TEXT_FIELD_NAME, "\"pattern\" : \"te.m\", \"flags\": \"NONE\", \"max_expansions\" : 500"),
        buildRegexpSource("te.m", RegexpFlag.NONE.value(), 500)
    );
}

/** Wraps {@code regexpBody} in the intervals/regexp JSON envelope for {@code field}. */
private static String regexpIntervalsJson(String field, String regexpBody) {
    return "{ \"intervals\" : { \"" + field + "\": { " + "\"regexp\" : { " + regexpBody + " } } } }";
}

/** Parses {@code json} and asserts it builds an interval query on the text field over {@code expectedSource}. */
private void assertRegexpQuery(String json, IntervalsSource expectedSource) throws IOException {
    IntervalQueryBuilder parsed = (IntervalQueryBuilder) parseQuery(json);
    Query expectedQuery = new IntervalQuery(TEXT_FIELD_NAME, expectedSource);
    assertEquals(expectedQuery, parsed.toQuery(createShardContext()));
}

/** Asserts that parsing {@code json} and building its query throws {@link IllegalArgumentException}. */
private void assertRegexpRejected(String json) {
    expectThrows(IllegalArgumentException.class, () -> {
        IntervalQueryBuilder parsed = (IntervalQueryBuilder) parseQuery(json);
        parsed.toQuery(createShardContext());
    });
}

private static IntervalsSource buildFuzzySource(String term, String label, int prefixLength, boolean transpositions, int editDistance) {
FuzzyQuery fq = new FuzzyQuery(new Term("field", term), editDistance, prefixLength, 128, transpositions);
return Intervals.multiterm(fq.getAutomata(), label);
Expand Down
Loading