diff --git a/data/esci/ubi_queries_events.ndjson.bz2 b/data/esci/ubi_queries_events.ndjson.bz2 deleted file mode 100644 index d728d94..0000000 --- a/data/esci/ubi_queries_events.ndjson.bz2 +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:6811cd6c99311f7b08a549e7783eefdc84bf3bc40e3bfe3abef65efa91548fe9 -size 36696778 diff --git a/opensearch-search-quality-evaluation-plugin/docker-compose.yaml b/opensearch-search-quality-evaluation-plugin/docker-compose.yaml index 5e8d7cf..8938320 100644 --- a/opensearch-search-quality-evaluation-plugin/docker-compose.yaml +++ b/opensearch-search-quality-evaluation-plugin/docker-compose.yaml @@ -10,7 +10,7 @@ services: logger.level: info OPENSEARCH_INITIAL_ADMIN_PASSWORD: SuperSecretPassword_123 http.max_content_length: 500mb - OPENSEARCH_JAVA_OPTS: "-Xms8192m -Xmx8192m" + OPENSEARCH_JAVA_OPTS: "-Xms8g -Xmx8g" ulimits: memlock: soft: -1 diff --git a/opensearch-search-quality-evaluation-plugin/scripts/create-query-set-no-sampling.sh b/opensearch-search-quality-evaluation-plugin/scripts/create-query-set-no-sampling.sh index c04886a..fc053d2 100755 --- a/opensearch-search-quality-evaluation-plugin/scripts/create-query-set-no-sampling.sh +++ b/opensearch-search-quality-evaluation-plugin/scripts/create-query-set-no-sampling.sh @@ -1,7 +1,7 @@ #!/bin/bash -e #QUERY_SET=`curl -s -X POST "http://localhost:9200/_plugins/search_quality_eval/queryset?name=test&description=fake&sampling=pptss" | jq .query_set | tr -d '"'` -curl -s -X POST "http://localhost:9200/_plugins/search_quality_eval/queryset?name=test&description=fake&sampling=none&max_queries=500" +curl -s -X POST "http://localhost:9200/_plugins/search_quality_eval/queryset?name=test&description=fake&sampling=none&query_set_size=500" #echo ${QUERY_SET} diff --git a/opensearch-search-quality-evaluation-plugin/scripts/create-query-set-using-pptss-sampling.sh b/opensearch-search-quality-evaluation-plugin/scripts/create-query-set-using-pptss-sampling.sh 
index 5f9f928..283afef 100755 --- a/opensearch-search-quality-evaluation-plugin/scripts/create-query-set-using-pptss-sampling.sh +++ b/opensearch-search-quality-evaluation-plugin/scripts/create-query-set-using-pptss-sampling.sh @@ -1,11 +1,11 @@ #!/bin/bash -e #QUERY_SET=`curl -s -X POST "http://localhost:9200/_plugins/search_quality_eval/queryset?name=test&description=fake&sampling=pptss" | jq .query_set | tr -d '"'` -curl -s -X POST "http://localhost:9200/_plugins/search_quality_eval/queryset?name=test&description=fake&sampling=pptss&max_queries=500" +curl -s -X POST "http://localhost:9200/_plugins/search_quality_eval/queryset?name=test&description=fake&sampling=pptss&query_set_size=5000" #echo ${QUERY_SET} -#curl -s http://localhost:9200/search_quality_eval_query_sets/_search | jq +#curl -s -X GET http://localhost:9200/search_quality_eval_query_sets/_doc/${QUERY_SET} | jq # Run the query set now. #curl -s -X POST "http://localhost:9200/_plugins/search_quality_eval/run?id=${QUERY_SET}" | jq diff --git a/opensearch-search-quality-evaluation-plugin/src/main/java/org/opensearch/eval/SearchQualityEvaluationRestHandler.java b/opensearch-search-quality-evaluation-plugin/src/main/java/org/opensearch/eval/SearchQualityEvaluationRestHandler.java index 83129ad..8743e0b 100644 --- a/opensearch-search-quality-evaluation-plugin/src/main/java/org/opensearch/eval/SearchQualityEvaluationRestHandler.java +++ b/opensearch-search-quality-evaluation-plugin/src/main/java/org/opensearch/eval/SearchQualityEvaluationRestHandler.java @@ -8,7 +8,6 @@ */ package org.opensearch.eval; -import org.apache.commons.lang3.StringUtils; import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; import org.opensearch.action.delete.DeleteRequest; @@ -24,13 +23,16 @@ import org.opensearch.core.rest.RestStatus; import org.opensearch.eval.judgments.clickmodel.coec.CoecClickModel; import org.opensearch.eval.judgments.clickmodel.coec.CoecClickModelParameters; +import 
org.opensearch.eval.samplers.AllQueriesQuerySampler; +import org.opensearch.eval.samplers.AllQueriesQuerySamplerParameters; +import org.opensearch.eval.samplers.ProbabilityProportionalToSizeAbstractQuerySampler; +import org.opensearch.eval.samplers.ProbabilityProportionalToSizeParameters; import org.opensearch.index.query.QueryBuilders; import org.opensearch.jobscheduler.spi.schedule.IntervalSchedule; import org.opensearch.rest.BaseRestHandler; import org.opensearch.rest.BytesRestResponse; import org.opensearch.rest.RestRequest; import org.opensearch.rest.RestResponse; -import org.opensearch.search.SearchHit; import org.opensearch.search.builder.SearchSourceBuilder; import java.io.IOException; @@ -38,10 +40,8 @@ import java.time.temporal.ChronoUnit; import java.util.Collection; import java.util.HashMap; -import java.util.HashSet; import java.util.List; import java.util.Map; -import java.util.Set; import java.util.UUID; public class SearchQualityEvaluationRestHandler extends BaseRestHandler { @@ -87,7 +87,7 @@ public List routes() { protected RestChannelConsumer prepareRequest(RestRequest request, NodeClient client) throws IOException { // Handle managing query sets. - if(StringUtils.equalsIgnoreCase(request.path(), QUERYSET_MANAGEMENT_URL)) { + if(QUERYSET_MANAGEMENT_URL.equalsIgnoreCase(request.path())) { // Creating a new query set by sampling the UBI queries. if (request.method().equals(RestRequest.Method.POST)) { @@ -95,39 +95,22 @@ protected RestChannelConsumer prepareRequest(RestRequest request, NodeClient cli final String name = request.param("name"); final String description = request.param("description"); final String sampling = request.param("sampling", "pptss"); - final int maxQueries = Integer.parseInt(request.param("max_queries", "1000")); + final int querySetSize = Integer.parseInt(request.param("query_set_size", "1000")); // Create a query set by finding all the unique user_query terms. 
- if (StringUtils.equalsIgnoreCase(sampling, "none")) { + if ("none".equalsIgnoreCase(sampling)) { // If we are not sampling queries, the query sets should just be directly // indexed into OpenSearch using the `ubu_queries` index directly. try { - // Get queries from the UBI queries index. - final SearchSourceBuilder searchSourceBuilder = new SearchSourceBuilder(); - searchSourceBuilder.query(QueryBuilders.matchAllQuery()); - searchSourceBuilder.from(0); - searchSourceBuilder.size(maxQueries); + final AllQueriesQuerySamplerParameters parameters = new AllQueriesQuerySamplerParameters(name, description, sampling, querySetSize); + final AllQueriesQuerySampler sampler = new AllQueriesQuerySampler(client, parameters); - final SearchRequest searchRequest = new SearchRequest(SearchQualityEvaluationPlugin.UBI_QUERIES_INDEX_NAME); - searchRequest.source(searchSourceBuilder); + // Sample and index the queries. + final String querySetId = sampler.sample(); - final SearchResponse searchResponse = client.search(searchRequest).get(); - - LOGGER.info("Found {} user queries from the ubi_queries index.", searchResponse.getHits().getTotalHits().toString()); - - final Set queries = new HashSet<>(); - for(final SearchHit hit : searchResponse.getHits().getHits()) { - final Map fields = hit.getSourceAsMap(); - queries.add(fields.get("user_query").toString()); - } - - LOGGER.info("Found {} user queries from the ubi_queries index.", queries.size()); - - // Create the query set and return its ID. - final String querySetId = indexQuerySet(client, name, description, sampling, queries); return restChannel -> restChannel.sendResponse(new BytesRestResponse(RestStatus.OK, "{\"query_set\": \"" + querySetId + "\"}")); } catch(Exception ex) { @@ -136,15 +119,18 @@ protected RestChannelConsumer prepareRequest(RestRequest request, NodeClient cli // Create a query set by using PPTSS sampling. 
- } else if (StringUtils.equalsIgnoreCase(sampling, "pptss")) { + } else if ("pptss".equalsIgnoreCase(sampling)) { - // TODO: Use the PPTSS sampling method - https://opensourceconnections.com/blog/2022/10/13/how-to-succeed-with-explicit-relevance-evaluation-using-probability-proportional-to-size-sampling/ - final Collection queries = List.of("computer", "desk", "table", "battery"); + LOGGER.info("Creating query set using PPTSS"); + + final ProbabilityProportionalToSizeParameters parameters = new ProbabilityProportionalToSizeParameters(name, description, sampling, querySetSize); + final ProbabilityProportionalToSizeAbstractQuerySampler sampler = new ProbabilityProportionalToSizeAbstractQuerySampler(client, parameters); try { - // Create the query set and return its ID. - final String querySetId = indexQuerySet(client, name, description, sampling, queries); + // Sample and index the queries. + final String querySetId = sampler.sample(); + return restChannel -> restChannel.sendResponse(new BytesRestResponse(RestStatus.OK, "{\"query_set\": \"" + querySetId + "\"}")); } catch(Exception ex) { @@ -162,7 +148,7 @@ protected RestChannelConsumer prepareRequest(RestRequest request, NodeClient cli } // Handle running query sets. - } else if(StringUtils.equalsIgnoreCase(request.path(), QUERYSET_RUN_URL)) { + } else if(QUERYSET_RUN_URL.equalsIgnoreCase(request.path())) { final String id = request.param("id"); @@ -197,7 +183,7 @@ protected RestChannelConsumer prepareRequest(RestRequest request, NodeClient cli return restChannel -> restChannel.sendResponse(new BytesRestResponse(RestStatus.OK, "{\"message\": \"Query set " + id + " run initiated.\"}")); // Handle the on-demand creation of implicit judgments. 
- } else if(StringUtils.equalsIgnoreCase(request.path(), IMPLICIT_JUDGMENTS_URL)) { + } else if(IMPLICIT_JUDGMENTS_URL.equalsIgnoreCase(request.path())) { if (request.method().equals(RestRequest.Method.POST)) { @@ -206,7 +192,7 @@ protected RestChannelConsumer prepareRequest(RestRequest request, NodeClient cli final int maxRank = Integer.parseInt(request.param("max_rank", "20")); final long judgments; - if (StringUtils.equalsIgnoreCase(clickModel, "coec")) { + if ("coec".equalsIgnoreCase(clickModel)) { final CoecClickModelParameters coecClickModelParameters = new CoecClickModelParameters(true, maxRank); final CoecClickModel coecClickModel = new CoecClickModel(client, coecClickModelParameters); @@ -249,7 +235,7 @@ protected RestChannelConsumer prepareRequest(RestRequest request, NodeClient cli } // Handle the scheduling of creating implicit judgments. - } else if(StringUtils.equalsIgnoreCase(request.path(), SCHEDULING_URL)) { + } else if(SCHEDULING_URL.equalsIgnoreCase(request.path())) { if (request.method().equals(RestRequest.Method.POST)) { @@ -270,7 +256,7 @@ protected RestChannelConsumer prepareRequest(RestRequest request, NodeClient cli // Read the start_time. final Instant startTime; - if (StringUtils.isEmpty(request.param("start_time"))) { + if (request.param("start_time") == null) { startTime = Instant.now(); } else { startTime = Instant.ofEpochMilli(Long.parseLong(request.param("start_time"))); @@ -278,7 +264,7 @@ protected RestChannelConsumer prepareRequest(RestRequest request, NodeClient cli // Read the interval. final int interval; - if (StringUtils.isEmpty(request.param("interval"))) { + if (request.param("interval") == null) { // Default to every 24 hours. interval = 1440; } else { @@ -355,29 +341,4 @@ public void onFailure(Exception e) { } - /** - * Index the query set. 
- */ - private String indexQuerySet(final NodeClient client, final String name, final String description, final String sampling, Collection queries) throws Exception { - - final Map querySet = new HashMap<>(); - querySet.put("name", name); - querySet.put("description", description); - querySet.put("sampling", sampling); - querySet.put("queries", queries); - querySet.put("created_at", Instant.now().toEpochMilli()); - - final String querySetId = UUID.randomUUID().toString(); - - final IndexRequest indexRequest = new IndexRequest().index(SearchQualityEvaluationPlugin.QUERY_SETS_INDEX_NAME) - .id(querySetId) - .source(querySet) - .setRefreshPolicy(WriteRequest.RefreshPolicy.IMMEDIATE); - - client.index(indexRequest).get(); - - return querySetId; - - } - } diff --git a/opensearch-search-quality-evaluation-plugin/src/main/java/org/opensearch/eval/judgments/model/ubi/event/UbiEvent.java b/opensearch-search-quality-evaluation-plugin/src/main/java/org/opensearch/eval/judgments/model/ubi/event/UbiEvent.java index fad79d6..ca9070d 100644 --- a/opensearch-search-quality-evaluation-plugin/src/main/java/org/opensearch/eval/judgments/model/ubi/event/UbiEvent.java +++ b/opensearch-search-quality-evaluation-plugin/src/main/java/org/opensearch/eval/judgments/model/ubi/event/UbiEvent.java @@ -11,7 +11,7 @@ import com.google.gson.annotations.SerializedName; /** - * A UBI event. + * Creates a representation of a UBI event. */ public class UbiEvent { @@ -27,6 +27,13 @@ public class UbiEvent { @SerializedName("event_attributes") private EventAttributes eventAttributes; + /** + * Creates a new representation of an UBI event. 
+     */
+    public UbiEvent() {
+
+    }
+
     @Override
     public String toString() {
         return actionName + ", " + clientId + ", " + queryId + ", " + eventAttributes.getObject().toString() + ", " + eventAttributes.getPosition().getIndex();
diff --git a/opensearch-search-quality-evaluation-plugin/src/main/java/org/opensearch/eval/samplers/AbstractQuerySampler.java b/opensearch-search-quality-evaluation-plugin/src/main/java/org/opensearch/eval/samplers/AbstractQuerySampler.java
new file mode 100644
index 0000000..f31ec89
--- /dev/null
+++ b/opensearch-search-quality-evaluation-plugin/src/main/java/org/opensearch/eval/samplers/AbstractQuerySampler.java
@@ -0,0 +1,75 @@
+/*
+ * Copyright OpenSearch Contributors
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * The OpenSearch Contributors require contributions made to
+ * this file be licensed under the Apache-2.0 license or a
+ * compatible open source license.
+ */
+package org.opensearch.eval.samplers;
+
+import org.opensearch.action.index.IndexRequest;
+import org.opensearch.action.support.WriteRequest;
+import org.opensearch.client.node.NodeClient;
+import org.opensearch.eval.SearchQualityEvaluationPlugin;
+
+import java.time.Instant;
+import java.util.Collection;
+import java.util.HashMap;
+import java.util.Map;
+import java.util.UUID;
+
+/**
+ * Base class for samplers that build a query set from UBI queries
+ * and store it in the query sets index.
+ */
+public abstract class AbstractQuerySampler {
+
+    /**
+     * Gets the name of the sampler.
+     * @return The name of the sampler.
+     */
+    abstract String getName();
+
+    /**
+     * Samples the queries and inserts the query set into an index.
+     * @return A query set ID.
+     * @throws Exception Thrown if the queries cannot be sampled or indexed.
+     */
+    abstract String sample() throws Exception;
+
+    /**
+     * Indexes a query set under a newly generated random UUID and waits
+     * for the index request to complete.
+     * @param client The OpenSearch {@link NodeClient client}.
+     * @param name The name of the query set.
+     * @param description The description of the query set.
+     * @param sampling The name of the sampling method that produced the queries.
+     * @param queries The queries that make up the query set.
+     * @return The ID of the indexed query set.
+     * @throws Exception Thrown if the query set cannot be indexed.
+     */
+    protected String indexQuerySet(final NodeClient client, final String name, final String description, final String sampling, Collection<String> queries) throws Exception {
+
+        final Map<String, Object> querySet = new HashMap<>();
+        querySet.put("name", name);
+        querySet.put("description", description);
+        querySet.put("sampling", sampling);
+        querySet.put("queries", queries);
+        querySet.put("created_at", Instant.now().toEpochMilli());
+
+        final String querySetId = UUID.randomUUID().toString();
+
+        // Refresh immediately so the new query set is searchable as soon as this method returns.
+        final IndexRequest indexRequest = new IndexRequest().index(SearchQualityEvaluationPlugin.QUERY_SETS_INDEX_NAME)
+                .id(querySetId)
+                .source(querySet)
+                .setRefreshPolicy(WriteRequest.RefreshPolicy.IMMEDIATE);
+
+        client.index(indexRequest).get();
+
+        return querySetId;
+
+    }
+
+}
diff --git a/opensearch-search-quality-evaluation-plugin/src/main/java/org/opensearch/eval/samplers/AbstractSamplerParameters.java b/opensearch-search-quality-evaluation-plugin/src/main/java/org/opensearch/eval/samplers/AbstractSamplerParameters.java
new file mode 100644
index 0000000..c8d731a
--- /dev/null
+++ b/opensearch-search-quality-evaluation-plugin/src/main/java/org/opensearch/eval/samplers/AbstractSamplerParameters.java
@@ -0,0 +1,41 @@
+/*
+ * Copyright OpenSearch Contributors
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * The OpenSearch Contributors require contributions made to
+ * this file be licensed under the Apache-2.0 license or a
+ * compatible open source license.
+ */
+package org.opensearch.eval.samplers;
+
+public class AbstractSamplerParameters {
+
+    private final String name;
+    private final String description;
+    private final String sampling;
+    private final int querySetSize;
+
+    public AbstractSamplerParameters(final String name, final String description, final String sampling, final int querySetSize) {
+        this.name = name;
+        this.description = description;
+        this.sampling = sampling;
+        this.querySetSize = querySetSize;
+    }
+
+    public String getName() {
+        return name;
+    }
+
+    public String getDescription() {
+        return description;
+    }
+
+    public String getSampling() {
+        return sampling;
+    }
+
+    public int getQuerySetSize() {
+        return querySetSize;
+    }
+
+}
diff --git a/opensearch-search-quality-evaluation-plugin/src/main/java/org/opensearch/eval/samplers/AllQueriesQuerySampler.java b/opensearch-search-quality-evaluation-plugin/src/main/java/org/opensearch/eval/samplers/AllQueriesQuerySampler.java
new file mode 100644
index 0000000..9a584f1
--- /dev/null
+++ b/opensearch-search-quality-evaluation-plugin/src/main/java/org/opensearch/eval/samplers/AllQueriesQuerySampler.java
@@ -0,0 +1,82 @@
+/*
+ * Copyright OpenSearch Contributors
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * The OpenSearch Contributors require contributions made to
+ * this file be licensed under the Apache-2.0 license or a
+ * compatible open source license.
+ */
+package org.opensearch.eval.samplers;
+
+import org.opensearch.action.search.SearchRequest;
+import org.opensearch.action.search.SearchResponse;
+import org.opensearch.client.node.NodeClient;
+import org.opensearch.eval.SearchQualityEvaluationPlugin;
+import org.opensearch.index.query.QueryBuilders;
+import org.opensearch.search.SearchHit;
+import org.opensearch.search.builder.SearchSourceBuilder;
+
+import java.util.HashSet;
+import java.util.Map;
+import java.util.Set;
+
+/**
+ * An implementation of {@link AbstractQuerySampler} that uses all UBI queries without any sampling.
+ */
+public class AllQueriesQuerySampler extends AbstractQuerySampler {
+
+    private final NodeClient client;
+    private final AllQueriesQuerySamplerParameters parameters;
+
+    /**
+     * Creates a new sampler.
+     * @param client The OpenSearch {@link NodeClient client}.
+     * @param parameters The {@link AllQueriesQuerySamplerParameters parameters} for the query set.
+     */
+    public AllQueriesQuerySampler(final NodeClient client, final AllQueriesQuerySamplerParameters parameters) {
+        this.client = client;
+        this.parameters = parameters;
+    }
+
+    @Override
+    public String getName() {
+        return "none";
+    }
+
+    /**
+     * Reads up to the configured query set size of documents from the UBI queries
+     * index and indexes their distinct <code>user_query</code> values as a query set.
+     * @return The ID of the indexed query set.
+     * @throws Exception Thrown if the search or the indexing of the query set fails.
+     */
+    @Override
+    public String sample() throws Exception {
+
+        // Get queries from the UBI queries index.
+        // TODO: This needs to use scroll or something else.
+        final SearchSourceBuilder searchSourceBuilder = new SearchSourceBuilder();
+        searchSourceBuilder.query(QueryBuilders.matchAllQuery());
+        searchSourceBuilder.from(0);
+        searchSourceBuilder.size(parameters.getQuerySetSize());
+
+        final SearchRequest searchRequest = new SearchRequest(SearchQualityEvaluationPlugin.UBI_QUERIES_INDEX_NAME);
+        searchRequest.source(searchSourceBuilder);
+
+        final SearchResponse searchResponse = client.search(searchRequest).get();
+
+        final Set<String> queries = new HashSet<>();
+        for(final SearchHit hit : searchResponse.getHits().getHits()) {
+            final Map<String, Object> fields = hit.getSourceAsMap();
+            final Object userQuery = fields.get("user_query");
+
+            // Skip documents without a user_query instead of failing with a NullPointerException.
+            if(userQuery != null) {
+                queries.add(userQuery.toString());
+            }
+        }
+
+        return indexQuerySet(client, parameters.getName(), parameters.getDescription(), parameters.getSampling(), queries);
+
+    }
+
+}
diff --git a/opensearch-search-quality-evaluation-plugin/src/main/java/org/opensearch/eval/samplers/AllQueriesQuerySamplerParameters.java b/opensearch-search-quality-evaluation-plugin/src/main/java/org/opensearch/eval/samplers/AllQueriesQuerySamplerParameters.java
new file mode 100644
index 0000000..3149668
--- /dev/null
+++ b/opensearch-search-quality-evaluation-plugin/src/main/java/org/opensearch/eval/samplers/AllQueriesQuerySamplerParameters.java
@@ -0,0 +1,17 @@
+/*
+ * Copyright OpenSearch Contributors
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * The OpenSearch Contributors require contributions made to
+ * this file be licensed under the Apache-2.0 license or a
+ * compatible open source license.
+ */
+package org.opensearch.eval.samplers;
+
+public class AllQueriesQuerySamplerParameters extends AbstractSamplerParameters {
+
+    public AllQueriesQuerySamplerParameters(final String name, final String description, final String sampling, final int querySetSize) {
+        super(name, description, sampling, querySetSize);
+    }
+
+}
diff --git a/opensearch-search-quality-evaluation-plugin/src/main/java/org/opensearch/eval/samplers/ProbabilityProportionalToSizeAbstractQuerySampler.java b/opensearch-search-quality-evaluation-plugin/src/main/java/org/opensearch/eval/samplers/ProbabilityProportionalToSizeAbstractQuerySampler.java
new file mode 100644
index 0000000..0e83d32
--- /dev/null
+++ b/opensearch-search-quality-evaluation-plugin/src/main/java/org/opensearch/eval/samplers/ProbabilityProportionalToSizeAbstractQuerySampler.java
@@ -0,0 +1,215 @@
+/*
+ * Copyright OpenSearch Contributors
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * The OpenSearch Contributors require contributions made to
+ * this file be licensed under the Apache-2.0 license or a
+ * compatible open source license.
+ */
+package org.opensearch.eval.samplers;
+
+import org.apache.logging.log4j.LogManager;
+import org.apache.logging.log4j.Logger;
+import org.opensearch.action.search.ClearScrollRequest;
+import org.opensearch.action.search.SearchRequest;
+import org.opensearch.action.search.SearchResponse;
+import org.opensearch.action.search.SearchScrollRequest;
+import org.opensearch.client.node.NodeClient;
+import org.opensearch.common.unit.TimeValue;
+import org.opensearch.eval.SearchQualityEvaluationPlugin;
+import org.opensearch.index.query.QueryBuilders;
+import org.opensearch.search.Scroll;
+import org.opensearch.search.SearchHit;
+import org.opensearch.search.builder.SearchSourceBuilder;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.Map;
+import java.util.Set;
+
+/**
+ * An implementation of {@link AbstractQuerySampler} that uses PPTSS sampling.
+ * See https://opensourceconnections.com/blog/2022/10/13/how-to-succeed-with-explicit-relevance-evaluation-using-probability-proportional-to-size-sampling/
+ * for more information on PPTSS.
+ */
+public class ProbabilityProportionalToSizeAbstractQuerySampler extends AbstractQuerySampler {
+
+    private static final Logger LOGGER = LogManager.getLogger(ProbabilityProportionalToSizeAbstractQuerySampler.class);
+
+    private final NodeClient client;
+    private final ProbabilityProportionalToSizeParameters parameters;
+
+    /**
+     * Creates a new PPTSS sampler.
+     * @param client The OpenSearch {@link NodeClient client}.
+     * @param parameters The {@link ProbabilityProportionalToSizeParameters parameters} for the sampling.
+     */
+    public ProbabilityProportionalToSizeAbstractQuerySampler(final NodeClient client, final ProbabilityProportionalToSizeParameters parameters) {
+        this.client = client;
+        this.parameters = parameters;
+    }
+
+    @Override
+    public String getName() {
+        return "pptss";
+    }
+
+    /**
+     * Samples user queries from the UBI queries index with a probability proportional
+     * to how often each query occurs and indexes the distinct selected queries as a query set.
+     * @return The ID of the indexed query set.
+     * @throws Exception Thrown if the queries cannot be read, sampled, or indexed.
+     */
+    @Override
+    public String sample() throws Exception {
+
+        // TODO: Can this be changed to an aggregation?
+        // An aggregation is limited (?) to 10,000 which could miss some queries.
+        final Collection<String> userQueries = getUserQueries();
+
+        if(userQueries.isEmpty()) {
+            throw new IllegalStateException("No user queries were found in the " + SearchQualityEvaluationPlugin.UBI_QUERIES_INDEX_NAME + " index.");
+        }
+
+        final Map<String, Long> weights = new HashMap<>();
+
+        // Increment the weight for each user query.
+        for(final String userQuery : userQueries) {
+            weights.merge(userQuery, 1L, Long::sum);
+        }
+
+        // The total number of queries will be used to normalize the weights.
+        final long countOfQueries = userQueries.size();
+
+        // Calculate the normalized weights by dividing by the total number of queries.
+        final Map<String, Double> normalizedWeights = new HashMap<>();
+        for(final Map.Entry<String, Long> weight : weights.entrySet()) {
+            normalizedWeights.put(weight.getKey(), weight.getValue() / (double) countOfQueries);
+        }
+
+        // Ensure all normalized weights sum to 1.
+        final double sumOfNormalizedWeights = normalizedWeights.values().stream().reduce(0.0, Double::sum);
+        if(!compare(1.0, sumOfNormalizedWeights)) {
+            throw new RuntimeException("Summed normalized weights do not equal 1.0: Actual value: " + sumOfNormalizedWeights);
+        } else {
+            LOGGER.info("Summed normalized weights sum to {}", sumOfNormalizedWeights);
+        }
+
+        // Lay the queries out on the interval [0, total) so that each query owns a
+        // sub-interval whose length equals its normalized weight.
+        final String[] queries = new String[normalizedWeights.size()];
+        final double[] cumulativeWeights = new double[normalizedWeights.size()];
+        double total = 0.0;
+        int position = 0;
+        for(final Map.Entry<String, Double> normalizedWeight : normalizedWeights.entrySet()) {
+            total += normalizedWeight.getValue();
+            queries[position] = normalizedWeight.getKey();
+            cumulativeWeights[position] = total;
+            position++;
+        }
+
+        // Duplicate picks collapse in the set, so keep drawing until the requested number of
+        // distinct queries is reached. The number of draws is capped so sampling always terminates.
+        final Set<String> querySet = new HashSet<>();
+        final int targetSize = Math.min(parameters.getQuerySetSize(), queries.length);
+        final long maxDraws = Math.max(parameters.getQuerySetSize(), countOfQueries);
+
+        for(long draw = 0; draw < maxDraws && querySet.size() < targetSize; draw++) {
+
+            // A random point in [0, total) falls into a query's sub-interval with a
+            // probability proportional to that query's weight.
+            final double random = Math.random() * total;
+
+            // Find the first cumulative weight that is greater than the random point.
+            int index = Arrays.binarySearch(cumulativeWeights, random);
+            index = index >= 0 ? index + 1 : -(index + 1);
+
+            // Guard against floating point rounding placing the point at the very end.
+            querySet.add(queries[Math.min(index, queries.length - 1)]);
+
+        }
+
+        return indexQuerySet(client, parameters.getName(), parameters.getDescription(), parameters.getSampling(), querySet);
+
+    }
+
+    /**
+     * Compares two doubles for equality within a fixed tolerance.
+     * @param a The first value.
+     * @param b The second value.
+     * @return <code>true</code> if the values differ by less than 0.00001.
+     */
+    public static boolean compare(double a, double b) {
+        return Math.abs(a - b) < 0.00001;
+    }
+
+    /**
+     * Reads the <code>user_query</code> of every document in the UBI queries index using the scroll API.
+     * @return The user queries, including duplicates.
+     * @throws Exception Thrown if the index cannot be searched.
+     */
+    private Collection<String> getUserQueries() throws Exception {
+
+        final SearchSourceBuilder searchSourceBuilder = new SearchSourceBuilder();
+        searchSourceBuilder.query(QueryBuilders.matchAllQuery());
+        searchSourceBuilder.size(1000);
+        final Scroll scroll = new Scroll(TimeValue.timeValueMinutes(10L));
+
+        final SearchRequest searchRequest = new SearchRequest(SearchQualityEvaluationPlugin.UBI_QUERIES_INDEX_NAME);
+        searchRequest.scroll(scroll);
+        searchRequest.source(searchSourceBuilder);
+
+        final Collection<String> userQueries = new ArrayList<>();
+        String scrollId = null;
+
+        try {
+
+            SearchResponse searchResponse = client.search(searchRequest).get();
+
+            scrollId = searchResponse.getScrollId();
+            SearchHit[] searchHits = searchResponse.getHits().getHits();
+
+            while (searchHits != null && searchHits.length > 0) {
+
+                LOGGER.info("search hits size = {}", searchHits.length);
+
+                for(final SearchHit hit : searchHits) {
+                    final Map<String, Object> fields = hit.getSourceAsMap();
+                    final Object userQuery = fields.get("user_query");
+
+                    // Skip documents without a user_query instead of failing with a NullPointerException.
+                    if(userQuery != null) {
+                        userQueries.add(userQuery.toString());
+                    }
+                }
+
+                final SearchScrollRequest scrollRequest = new SearchScrollRequest(scrollId);
+                scrollRequest.scroll(scroll);
+
+                searchResponse = client.searchScroll(scrollRequest).get();
+
+                scrollId = searchResponse.getScrollId();
+                searchHits = searchResponse.getHits().getHits();
+
+            }
+
+        } finally {
+
+            // Best effort: release the scroll context instead of holding it open until the keep-alive
+            // expires. The request is not awaited so a failure here cannot mask a search failure.
+            if(scrollId != null) {
+                final ClearScrollRequest clearScrollRequest = new ClearScrollRequest();
+                clearScrollRequest.addScrollId(scrollId);
+                client.clearScroll(clearScrollRequest);
+            }
+
+        }
+
+        return userQueries;
+
+    }
+
+}
diff --git a/opensearch-search-quality-evaluation-plugin/src/main/java/org/opensearch/eval/samplers/ProbabilityProportionalToSizeParameters.java b/opensearch-search-quality-evaluation-plugin/src/main/java/org/opensearch/eval/samplers/ProbabilityProportionalToSizeParameters.java
new file mode 100644
index 0000000..d5e4311
--- /dev/null
+++ b/opensearch-search-quality-evaluation-plugin/src/main/java/org/opensearch/eval/samplers/ProbabilityProportionalToSizeParameters.java
@@ -0,0 +1,17 @@
+/*
+ * Copyright OpenSearch Contributors
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * The OpenSearch Contributors require contributions made to
+ * this file be licensed under the Apache-2.0 license or a
+ * compatible open source license.
+ */
+package org.opensearch.eval.samplers;
+
+public class ProbabilityProportionalToSizeParameters extends AbstractSamplerParameters {
+
+    public ProbabilityProportionalToSizeParameters(final String name, final String description, final String sampling, final int querySetSize) {
+        super(name, description, sampling, querySetSize);
+    }
+
+}