From 0fcfeb7682fd6bf066f69b8e284e93529eb41735 Mon Sep 17 00:00:00 2001 From: jzonthemtn Date: Thu, 21 Nov 2024 11:27:59 -0500 Subject: [PATCH 1/4] #44 Adding PPTSS sampling. --- .../SearchQualityEvaluationRestHandler.java | 18 ++-- ...obabilityProportionalToSizeParameters.java | 23 ++++ ...abilityProportionalToSizeQuerySampler.java | 102 ++++++++++++++++++ .../eval/samplers/QuerySampler.java | 33 ++++++ 4 files changed, 170 insertions(+), 6 deletions(-) create mode 100644 opensearch-search-quality-evaluation-plugin/src/main/java/org/opensearch/eval/samplers/ProbabilityProportionalToSizeParameters.java create mode 100644 opensearch-search-quality-evaluation-plugin/src/main/java/org/opensearch/eval/samplers/ProbabilityProportionalToSizeQuerySampler.java create mode 100644 opensearch-search-quality-evaluation-plugin/src/main/java/org/opensearch/eval/samplers/QuerySampler.java diff --git a/opensearch-search-quality-evaluation-plugin/src/main/java/org/opensearch/eval/SearchQualityEvaluationRestHandler.java b/opensearch-search-quality-evaluation-plugin/src/main/java/org/opensearch/eval/SearchQualityEvaluationRestHandler.java index 83129ad..966c8c2 100644 --- a/opensearch-search-quality-evaluation-plugin/src/main/java/org/opensearch/eval/SearchQualityEvaluationRestHandler.java +++ b/opensearch-search-quality-evaluation-plugin/src/main/java/org/opensearch/eval/SearchQualityEvaluationRestHandler.java @@ -24,6 +24,8 @@ import org.opensearch.core.rest.RestStatus; import org.opensearch.eval.judgments.clickmodel.coec.CoecClickModel; import org.opensearch.eval.judgments.clickmodel.coec.CoecClickModelParameters; +import org.opensearch.eval.samplers.ProbabilityProportionalToSizeParameters; +import org.opensearch.eval.samplers.ProbabilityProportionalToSizeQuerySampler; import org.opensearch.index.query.QueryBuilders; import org.opensearch.jobscheduler.spi.schedule.IntervalSchedule; import org.opensearch.rest.BaseRestHandler; @@ -95,7 +97,7 @@ protected RestChannelConsumer 
prepareRequest(RestRequest request, NodeClient cli final String name = request.param("name"); final String description = request.param("description"); final String sampling = request.param("sampling", "pptss"); - final int maxQueries = Integer.parseInt(request.param("max_queries", "1000")); + final int querySetSize = Integer.parseInt(request.param("query_set_size", "1000")); // Create a query set by finding all the unique user_query terms. if (StringUtils.equalsIgnoreCase(sampling, "none")) { @@ -109,14 +111,14 @@ protected RestChannelConsumer prepareRequest(RestRequest request, NodeClient cli final SearchSourceBuilder searchSourceBuilder = new SearchSourceBuilder(); searchSourceBuilder.query(QueryBuilders.matchAllQuery()); searchSourceBuilder.from(0); - searchSourceBuilder.size(maxQueries); + searchSourceBuilder.size(querySetSize); final SearchRequest searchRequest = new SearchRequest(SearchQualityEvaluationPlugin.UBI_QUERIES_INDEX_NAME); searchRequest.source(searchSourceBuilder); final SearchResponse searchResponse = client.search(searchRequest).get(); - LOGGER.info("Found {} user queries from the ubi_queries index.", searchResponse.getHits().getTotalHits().toString()); + // LOGGER.info("Found {} user queries from the ubi_queries index.", searchResponse.getHits().getTotalHits().toString()); final Set queries = new HashSet<>(); for(final SearchHit hit : searchResponse.getHits().getHits()) { @@ -124,7 +126,7 @@ protected RestChannelConsumer prepareRequest(RestRequest request, NodeClient cli queries.add(fields.get("user_query").toString()); } - LOGGER.info("Found {} user queries from the ubi_queries index.", queries.size()); + // LOGGER.info("Found {} user queries from the ubi_queries index.", queries.size()); // Create the query set and return its ID. 
final String querySetId = indexQuerySet(client, name, description, sampling, queries); @@ -138,8 +140,12 @@ protected RestChannelConsumer prepareRequest(RestRequest request, NodeClient cli // Create a query set by using PPTSS sampling. } else if (StringUtils.equalsIgnoreCase(sampling, "pptss")) { - // TODO: Use the PPTSS sampling method - https://opensourceconnections.com/blog/2022/10/13/how-to-succeed-with-explicit-relevance-evaluation-using-probability-proportional-to-size-sampling/ - final Collection queries = List.of("computer", "desk", "table", "battery"); + final ProbabilityProportionalToSizeParameters parameters = new ProbabilityProportionalToSizeParameters(querySetSize); + final ProbabilityProportionalToSizeQuerySampler sampler = new ProbabilityProportionalToSizeQuerySampler(parameters); + + // TODO: Get all queries from the ubi_queries index. + + final Collection queries = sampler.sample(); try { diff --git a/opensearch-search-quality-evaluation-plugin/src/main/java/org/opensearch/eval/samplers/ProbabilityProportionalToSizeParameters.java b/opensearch-search-quality-evaluation-plugin/src/main/java/org/opensearch/eval/samplers/ProbabilityProportionalToSizeParameters.java new file mode 100644 index 0000000..0001821 --- /dev/null +++ b/opensearch-search-quality-evaluation-plugin/src/main/java/org/opensearch/eval/samplers/ProbabilityProportionalToSizeParameters.java @@ -0,0 +1,23 @@ +/* + * Copyright OpenSearch Contributors + * SPDX-License-Identifier: Apache-2.0 + * + * The OpenSearch Contributors require contributions made to + * this file be licensed under the Apache-2.0 license or a + * compatible open source license. 
+ */ +package org.opensearch.eval.samplers; + +public class ProbabilityProportionalToSizeParameters { + + private final int querySetSize; + + public ProbabilityProportionalToSizeParameters(int querySetSize) { + this.querySetSize = querySetSize; + } + + public int getQuerySetSize() { + return querySetSize; + } + +} diff --git a/opensearch-search-quality-evaluation-plugin/src/main/java/org/opensearch/eval/samplers/ProbabilityProportionalToSizeQuerySampler.java b/opensearch-search-quality-evaluation-plugin/src/main/java/org/opensearch/eval/samplers/ProbabilityProportionalToSizeQuerySampler.java new file mode 100644 index 0000000..1307e9f --- /dev/null +++ b/opensearch-search-quality-evaluation-plugin/src/main/java/org/opensearch/eval/samplers/ProbabilityProportionalToSizeQuerySampler.java @@ -0,0 +1,102 @@ +/* + * Copyright OpenSearch Contributors + * SPDX-License-Identifier: Apache-2.0 + * + * The OpenSearch Contributors require contributions made to + * this file be licensed under the Apache-2.0 license or a + * compatible open source license. + */ +package org.opensearch.eval.samplers; + +import org.opensearch.eval.judgments.model.ubi.query.UbiQuery; + +import java.util.ArrayList; +import java.util.Collection; +import java.util.Comparator; +import java.util.HashMap; +import java.util.HashSet; +import java.util.Map; +import java.util.NoSuchElementException; +import java.util.Set; + +/** + * An implementation of {@link QuerySampler} that uses PPTSS sampling. + * See https://opensourceconnections.com/blog/2022/10/13/how-to-succeed-with-explicit-relevance-evaluation-using-probability-proportional-to-size-sampling/ + * for more information on PPTSS. + */ +public class ProbabilityProportionalToSizeQuerySampler implements QuerySampler { + + private final ProbabilityProportionalToSizeParameters parameters; + + /** + * Creates a new PPTSS sampler. + * @param parameters The {@link ProbabilityProportionalToSizeParameters parameters} for the sampling. 
+ */ + public ProbabilityProportionalToSizeQuerySampler(final ProbabilityProportionalToSizeParameters parameters) { + this.parameters = parameters; + } + + @Override + public String getName() { + return "pptss"; + } + + @Override + public Collection sample(final Collection userQueries) { + + final Map weights = new HashMap<>(); + + // Increment the weight for each user query. + for(final String userQuery : userQueries) { + weights.merge(userQuery, 1L, Long::sum); + } + + // The total number of queries will be used to normalize the weights. + final long countOfQueries = userQueries.size(); + + // Calculate the normalized weights by dividing by the total number of queries. + final Map normalizedWeights = new HashMap<>(); + for(final String userQuery : weights.keySet()) { + normalizedWeights.put(userQuery, weights.get(userQuery) / (double) countOfQueries); + } + + // Ensure all normalized weights sum to 1. + final double sumOfNormalizedWeights = normalizedWeights.values().stream().reduce(0.0, Double::sum); + if(sumOfNormalizedWeights != 1.0) { + throw new RuntimeException("Summed normalized weights do not equal 1.0"); + } + + final Collection querySet = new ArrayList<>(); + final Set randomNumbers = new HashSet<>(); + + // Generate a random number between 0 and 1 for the size of the query set. + for(int count = 0; count < parameters.getQuerySetSize(); count++) { + + // Make a random number not yet used. + double random; + do { + random = Math.random(); + } while (randomNumbers.contains(random)); + randomNumbers.add(random); + + // Find the weight closest to the random weight. + double finalRandom = random; + double nearestWeight = normalizedWeights.values().stream() + .min(Comparator.comparingDouble(i -> Math.abs(i - finalRandom))) + .orElseThrow(() -> new NoSuchElementException("No value present")); + + // Find the query having the weight closest to this random number. 
+ for(Map.Entry entry : normalizedWeights.entrySet()) { + if(entry.getValue() == nearestWeight) { + querySet.add(entry.getKey()); + break; + } + } + + } + + return querySet; + + } + +} diff --git a/opensearch-search-quality-evaluation-plugin/src/main/java/org/opensearch/eval/samplers/QuerySampler.java b/opensearch-search-quality-evaluation-plugin/src/main/java/org/opensearch/eval/samplers/QuerySampler.java new file mode 100644 index 0000000..e943599 --- /dev/null +++ b/opensearch-search-quality-evaluation-plugin/src/main/java/org/opensearch/eval/samplers/QuerySampler.java @@ -0,0 +1,33 @@ +/* + * Copyright OpenSearch Contributors + * SPDX-License-Identifier: Apache-2.0 + * + * The OpenSearch Contributors require contributions made to + * this file be licensed under the Apache-2.0 license or a + * compatible open source license. + */ +package org.opensearch.eval.samplers; + +import org.opensearch.eval.judgments.model.ubi.query.UbiQuery; + +import java.util.Collection; + +/** + * An interface for sampling UBI queries. + */ +public interface QuerySampler { + + /** + * Gets the name of the sampler. + * @return The name of the sampler. + */ + String getName(); + + /** + * Samples the queries. + * @param userQueries A collection of user queries from UBI queries. + * @return A collection of sampled user queries. + */ + Collection sample(Collection userQueries); + +} From b17024243a57c6dfd31e4ad2d8fac01203fb3b52 Mon Sep 17 00:00:00 2001 From: jzonthemtn Date: Fri, 22 Nov 2024 09:53:06 -0500 Subject: [PATCH 2/4] #44 Working on wiring up PPTSS. 
--- data/esci/ubi_queries_events.ndjson.bz2 | 3 - .../scripts/create-query-set-no-sampling.sh | 2 +- .../create-query-set-using-pptss-sampling.sh | 4 +- .../SearchQualityEvaluationRestHandler.java | 89 +++++-------------- .../eval/samplers/AbstractQuerySampler.java | 64 +++++++++++++ .../samplers/AbstractSamplerParameters.java | 41 +++++++++ .../eval/samplers/AllQueriesQuerySampler.java | 73 +++++++++++++++ .../AllQueriesQuerySamplerParameters.java | 17 ++++ ...oportionalToSizeAbstractQuerySampler.java} | 56 ++++++++++-- ...obabilityProportionalToSizeParameters.java | 12 +-- .../eval/samplers/QuerySampler.java | 33 ------- 11 files changed, 271 insertions(+), 123 deletions(-) delete mode 100644 data/esci/ubi_queries_events.ndjson.bz2 create mode 100644 opensearch-search-quality-evaluation-plugin/src/main/java/org/opensearch/eval/samplers/AbstractQuerySampler.java create mode 100644 opensearch-search-quality-evaluation-plugin/src/main/java/org/opensearch/eval/samplers/AbstractSamplerParameters.java create mode 100644 opensearch-search-quality-evaluation-plugin/src/main/java/org/opensearch/eval/samplers/AllQueriesQuerySampler.java create mode 100644 opensearch-search-quality-evaluation-plugin/src/main/java/org/opensearch/eval/samplers/AllQueriesQuerySamplerParameters.java rename opensearch-search-quality-evaluation-plugin/src/main/java/org/opensearch/eval/samplers/{ProbabilityProportionalToSizeQuerySampler.java => ProbabilityProportionalToSizeAbstractQuerySampler.java} (56%) delete mode 100644 opensearch-search-quality-evaluation-plugin/src/main/java/org/opensearch/eval/samplers/QuerySampler.java diff --git a/data/esci/ubi_queries_events.ndjson.bz2 b/data/esci/ubi_queries_events.ndjson.bz2 deleted file mode 100644 index d728d94..0000000 --- a/data/esci/ubi_queries_events.ndjson.bz2 +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:6811cd6c99311f7b08a549e7783eefdc84bf3bc40e3bfe3abef65efa91548fe9 -size 36696778 diff --git 
a/opensearch-search-quality-evaluation-plugin/scripts/create-query-set-no-sampling.sh b/opensearch-search-quality-evaluation-plugin/scripts/create-query-set-no-sampling.sh index c04886a..fc053d2 100755 --- a/opensearch-search-quality-evaluation-plugin/scripts/create-query-set-no-sampling.sh +++ b/opensearch-search-quality-evaluation-plugin/scripts/create-query-set-no-sampling.sh @@ -1,7 +1,7 @@ #!/bin/bash -e #QUERY_SET=`curl -s -X POST "http://localhost:9200/_plugins/search_quality_eval/queryset?name=test&description=fake&sampling=pptss" | jq .query_set | tr -d '"'` -curl -s -X POST "http://localhost:9200/_plugins/search_quality_eval/queryset?name=test&description=fake&sampling=none&max_queries=500" +curl -s -X POST "http://localhost:9200/_plugins/search_quality_eval/queryset?name=test&description=fake&sampling=none&query_set_size=500" #echo ${QUERY_SET} diff --git a/opensearch-search-quality-evaluation-plugin/scripts/create-query-set-using-pptss-sampling.sh b/opensearch-search-quality-evaluation-plugin/scripts/create-query-set-using-pptss-sampling.sh index 5f9f928..96c822a 100755 --- a/opensearch-search-quality-evaluation-plugin/scripts/create-query-set-using-pptss-sampling.sh +++ b/opensearch-search-quality-evaluation-plugin/scripts/create-query-set-using-pptss-sampling.sh @@ -1,11 +1,11 @@ #!/bin/bash -e #QUERY_SET=`curl -s -X POST "http://localhost:9200/_plugins/search_quality_eval/queryset?name=test&description=fake&sampling=pptss" | jq .query_set | tr -d '"'` -curl -s -X POST "http://localhost:9200/_plugins/search_quality_eval/queryset?name=test&description=fake&sampling=pptss&max_queries=500" +curl -s -X POST "http://localhost:9200/_plugins/search_quality_eval/queryset?name=test&description=fake&sampling=pptss&query_set_size=500" #echo ${QUERY_SET} -#curl -s http://localhost:9200/search_quality_eval_query_sets/_search | jq +#curl -s -X GET http://localhost:9200/search_quality_eval_query_sets/_doc/${QUERY_SET} | jq # Run the query set now. 
#curl -s -X POST "http://localhost:9200/_plugins/search_quality_eval/run?id=${QUERY_SET}" | jq diff --git a/opensearch-search-quality-evaluation-plugin/src/main/java/org/opensearch/eval/SearchQualityEvaluationRestHandler.java b/opensearch-search-quality-evaluation-plugin/src/main/java/org/opensearch/eval/SearchQualityEvaluationRestHandler.java index 966c8c2..8743e0b 100644 --- a/opensearch-search-quality-evaluation-plugin/src/main/java/org/opensearch/eval/SearchQualityEvaluationRestHandler.java +++ b/opensearch-search-quality-evaluation-plugin/src/main/java/org/opensearch/eval/SearchQualityEvaluationRestHandler.java @@ -8,7 +8,6 @@ */ package org.opensearch.eval; -import org.apache.commons.lang3.StringUtils; import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; import org.opensearch.action.delete.DeleteRequest; @@ -24,15 +23,16 @@ import org.opensearch.core.rest.RestStatus; import org.opensearch.eval.judgments.clickmodel.coec.CoecClickModel; import org.opensearch.eval.judgments.clickmodel.coec.CoecClickModelParameters; +import org.opensearch.eval.samplers.AllQueriesQuerySampler; +import org.opensearch.eval.samplers.AllQueriesQuerySamplerParameters; +import org.opensearch.eval.samplers.ProbabilityProportionalToSizeAbstractQuerySampler; import org.opensearch.eval.samplers.ProbabilityProportionalToSizeParameters; -import org.opensearch.eval.samplers.ProbabilityProportionalToSizeQuerySampler; import org.opensearch.index.query.QueryBuilders; import org.opensearch.jobscheduler.spi.schedule.IntervalSchedule; import org.opensearch.rest.BaseRestHandler; import org.opensearch.rest.BytesRestResponse; import org.opensearch.rest.RestRequest; import org.opensearch.rest.RestResponse; -import org.opensearch.search.SearchHit; import org.opensearch.search.builder.SearchSourceBuilder; import java.io.IOException; @@ -40,10 +40,8 @@ import java.time.temporal.ChronoUnit; import java.util.Collection; import java.util.HashMap; -import java.util.HashSet; 
import java.util.List; import java.util.Map; -import java.util.Set; import java.util.UUID; public class SearchQualityEvaluationRestHandler extends BaseRestHandler { @@ -89,7 +87,7 @@ public List routes() { protected RestChannelConsumer prepareRequest(RestRequest request, NodeClient client) throws IOException { // Handle managing query sets. - if(StringUtils.equalsIgnoreCase(request.path(), QUERYSET_MANAGEMENT_URL)) { + if(QUERYSET_MANAGEMENT_URL.equalsIgnoreCase(request.path())) { // Creating a new query set by sampling the UBI queries. if (request.method().equals(RestRequest.Method.POST)) { @@ -100,36 +98,19 @@ protected RestChannelConsumer prepareRequest(RestRequest request, NodeClient cli final int querySetSize = Integer.parseInt(request.param("query_set_size", "1000")); // Create a query set by finding all the unique user_query terms. - if (StringUtils.equalsIgnoreCase(sampling, "none")) { + if ("none".equalsIgnoreCase(sampling)) { // If we are not sampling queries, the query sets should just be directly // indexed into OpenSearch using the `ubu_queries` index directly. try { - // Get queries from the UBI queries index. - final SearchSourceBuilder searchSourceBuilder = new SearchSourceBuilder(); - searchSourceBuilder.query(QueryBuilders.matchAllQuery()); - searchSourceBuilder.from(0); - searchSourceBuilder.size(querySetSize); + final AllQueriesQuerySamplerParameters parameters = new AllQueriesQuerySamplerParameters(name, description, sampling, querySetSize); + final AllQueriesQuerySampler sampler = new AllQueriesQuerySampler(client, parameters); - final SearchRequest searchRequest = new SearchRequest(SearchQualityEvaluationPlugin.UBI_QUERIES_INDEX_NAME); - searchRequest.source(searchSourceBuilder); + // Sample and index the queries. 
+ final String querySetId = sampler.sample(); - final SearchResponse searchResponse = client.search(searchRequest).get(); - - // LOGGER.info("Found {} user queries from the ubi_queries index.", searchResponse.getHits().getTotalHits().toString()); - - final Set queries = new HashSet<>(); - for(final SearchHit hit : searchResponse.getHits().getHits()) { - final Map fields = hit.getSourceAsMap(); - queries.add(fields.get("user_query").toString()); - } - - // LOGGER.info("Found {} user queries from the ubi_queries index.", queries.size()); - - // Create the query set and return its ID. - final String querySetId = indexQuerySet(client, name, description, sampling, queries); return restChannel -> restChannel.sendResponse(new BytesRestResponse(RestStatus.OK, "{\"query_set\": \"" + querySetId + "\"}")); } catch(Exception ex) { @@ -138,19 +119,18 @@ protected RestChannelConsumer prepareRequest(RestRequest request, NodeClient cli // Create a query set by using PPTSS sampling. - } else if (StringUtils.equalsIgnoreCase(sampling, "pptss")) { + } else if ("pptss".equalsIgnoreCase(sampling)) { - final ProbabilityProportionalToSizeParameters parameters = new ProbabilityProportionalToSizeParameters(querySetSize); - final ProbabilityProportionalToSizeQuerySampler sampler = new ProbabilityProportionalToSizeQuerySampler(parameters); + LOGGER.info("Creating query set using PPTSS"); - // TODO: Get all queries from the ubi_queries index. - - final Collection queries = sampler.sample(); + final ProbabilityProportionalToSizeParameters parameters = new ProbabilityProportionalToSizeParameters(name, description, sampling, querySetSize); + final ProbabilityProportionalToSizeAbstractQuerySampler sampler = new ProbabilityProportionalToSizeAbstractQuerySampler(client, parameters); try { - // Create the query set and return its ID. - final String querySetId = indexQuerySet(client, name, description, sampling, queries); + // Sample and index the queries. 
+ final String querySetId = sampler.sample(); + return restChannel -> restChannel.sendResponse(new BytesRestResponse(RestStatus.OK, "{\"query_set\": \"" + querySetId + "\"}")); } catch(Exception ex) { @@ -168,7 +148,7 @@ protected RestChannelConsumer prepareRequest(RestRequest request, NodeClient cli } // Handle running query sets. - } else if(StringUtils.equalsIgnoreCase(request.path(), QUERYSET_RUN_URL)) { + } else if(QUERYSET_RUN_URL.equalsIgnoreCase(request.path())) { final String id = request.param("id"); @@ -203,7 +183,7 @@ protected RestChannelConsumer prepareRequest(RestRequest request, NodeClient cli return restChannel -> restChannel.sendResponse(new BytesRestResponse(RestStatus.OK, "{\"message\": \"Query set " + id + " run initiated.\"}")); // Handle the on-demand creation of implicit judgments. - } else if(StringUtils.equalsIgnoreCase(request.path(), IMPLICIT_JUDGMENTS_URL)) { + } else if(IMPLICIT_JUDGMENTS_URL.equalsIgnoreCase(request.path())) { if (request.method().equals(RestRequest.Method.POST)) { @@ -212,7 +192,7 @@ protected RestChannelConsumer prepareRequest(RestRequest request, NodeClient cli final int maxRank = Integer.parseInt(request.param("max_rank", "20")); final long judgments; - if (StringUtils.equalsIgnoreCase(clickModel, "coec")) { + if ("coec".equalsIgnoreCase(clickModel)) { final CoecClickModelParameters coecClickModelParameters = new CoecClickModelParameters(true, maxRank); final CoecClickModel coecClickModel = new CoecClickModel(client, coecClickModelParameters); @@ -255,7 +235,7 @@ protected RestChannelConsumer prepareRequest(RestRequest request, NodeClient cli } // Handle the scheduling of creating implicit judgments. - } else if(StringUtils.equalsIgnoreCase(request.path(), SCHEDULING_URL)) { + } else if(SCHEDULING_URL.equalsIgnoreCase(request.path())) { if (request.method().equals(RestRequest.Method.POST)) { @@ -276,7 +256,7 @@ protected RestChannelConsumer prepareRequest(RestRequest request, NodeClient cli // Read the start_time. 
final Instant startTime; - if (StringUtils.isEmpty(request.param("start_time"))) { + if (request.param("start_time") == null) { startTime = Instant.now(); } else { startTime = Instant.ofEpochMilli(Long.parseLong(request.param("start_time"))); @@ -284,7 +264,7 @@ protected RestChannelConsumer prepareRequest(RestRequest request, NodeClient cli // Read the interval. final int interval; - if (StringUtils.isEmpty(request.param("interval"))) { + if (request.param("interval") == null) { // Default to every 24 hours. interval = 1440; } else { @@ -361,29 +341,4 @@ public void onFailure(Exception e) { } - /** - * Index the query set. - */ - private String indexQuerySet(final NodeClient client, final String name, final String description, final String sampling, Collection queries) throws Exception { - - final Map querySet = new HashMap<>(); - querySet.put("name", name); - querySet.put("description", description); - querySet.put("sampling", sampling); - querySet.put("queries", queries); - querySet.put("created_at", Instant.now().toEpochMilli()); - - final String querySetId = UUID.randomUUID().toString(); - - final IndexRequest indexRequest = new IndexRequest().index(SearchQualityEvaluationPlugin.QUERY_SETS_INDEX_NAME) - .id(querySetId) - .source(querySet) - .setRefreshPolicy(WriteRequest.RefreshPolicy.IMMEDIATE); - - client.index(indexRequest).get(); - - return querySetId; - - } - } diff --git a/opensearch-search-quality-evaluation-plugin/src/main/java/org/opensearch/eval/samplers/AbstractQuerySampler.java b/opensearch-search-quality-evaluation-plugin/src/main/java/org/opensearch/eval/samplers/AbstractQuerySampler.java new file mode 100644 index 0000000..f31ec89 --- /dev/null +++ b/opensearch-search-quality-evaluation-plugin/src/main/java/org/opensearch/eval/samplers/AbstractQuerySampler.java @@ -0,0 +1,64 @@ +/* + * Copyright OpenSearch Contributors + * SPDX-License-Identifier: Apache-2.0 + * + * The OpenSearch Contributors require contributions made to + * this file be 
licensed under the Apache-2.0 license or a + * compatible open source license. + */ +package org.opensearch.eval.samplers; + +import org.opensearch.action.index.IndexRequest; +import org.opensearch.action.support.WriteRequest; +import org.opensearch.client.node.NodeClient; +import org.opensearch.eval.SearchQualityEvaluationPlugin; + +import java.time.Instant; +import java.util.Collection; +import java.util.HashMap; +import java.util.Map; +import java.util.UUID; + +/** + * An interface for sampling UBI queries. + */ +public abstract class AbstractQuerySampler { + + /** + * Gets the name of the sampler. + * @return The name of the sampler. + */ + abstract String getName(); + + /** + * Samples the queries and inserts the query set into an index. + * @return A query set ID. + */ + abstract String sample() throws Exception; + + /** + * Index the query set. + */ + protected String indexQuerySet(final NodeClient client, final String name, final String description, final String sampling, Collection queries) throws Exception { + + final Map querySet = new HashMap<>(); + querySet.put("name", name); + querySet.put("description", description); + querySet.put("sampling", sampling); + querySet.put("queries", queries); + querySet.put("created_at", Instant.now().toEpochMilli()); + + final String querySetId = UUID.randomUUID().toString(); + + final IndexRequest indexRequest = new IndexRequest().index(SearchQualityEvaluationPlugin.QUERY_SETS_INDEX_NAME) + .id(querySetId) + .source(querySet) + .setRefreshPolicy(WriteRequest.RefreshPolicy.IMMEDIATE); + + client.index(indexRequest).get(); + + return querySetId; + + } + +} diff --git a/opensearch-search-quality-evaluation-plugin/src/main/java/org/opensearch/eval/samplers/AbstractSamplerParameters.java b/opensearch-search-quality-evaluation-plugin/src/main/java/org/opensearch/eval/samplers/AbstractSamplerParameters.java new file mode 100644 index 0000000..c8d731a --- /dev/null +++ 
b/opensearch-search-quality-evaluation-plugin/src/main/java/org/opensearch/eval/samplers/AbstractSamplerParameters.java @@ -0,0 +1,41 @@ +/* + * Copyright OpenSearch Contributors + * SPDX-License-Identifier: Apache-2.0 + * + * The OpenSearch Contributors require contributions made to + * this file be licensed under the Apache-2.0 license or a + * compatible open source license. + */ +package org.opensearch.eval.samplers; + +public class AbstractSamplerParameters { + + private final String name; + private final String description; + private final String sampling; + private final int querySetSize; + + public AbstractSamplerParameters(final String name, final String description, final String sampling, final int querySetSize) { + this.name = name; + this.description = description; + this.sampling = sampling; + this.querySetSize = querySetSize; + } + + public String getName() { + return name; + } + + public String getDescription() { + return description; + } + + public String getSampling() { + return sampling; + } + + public int getQuerySetSize() { + return querySetSize; + } + +} diff --git a/opensearch-search-quality-evaluation-plugin/src/main/java/org/opensearch/eval/samplers/AllQueriesQuerySampler.java b/opensearch-search-quality-evaluation-plugin/src/main/java/org/opensearch/eval/samplers/AllQueriesQuerySampler.java new file mode 100644 index 0000000..29fb0b7 --- /dev/null +++ b/opensearch-search-quality-evaluation-plugin/src/main/java/org/opensearch/eval/samplers/AllQueriesQuerySampler.java @@ -0,0 +1,73 @@ +/* + * Copyright OpenSearch Contributors + * SPDX-License-Identifier: Apache-2.0 + * + * The OpenSearch Contributors require contributions made to + * this file be licensed under the Apache-2.0 license or a + * compatible open source license. 
+ */ +package org.opensearch.eval.samplers; + +import org.opensearch.action.search.SearchRequest; +import org.opensearch.action.search.SearchResponse; +import org.opensearch.client.node.NodeClient; +import org.opensearch.eval.SearchQualityEvaluationPlugin; +import org.opensearch.index.query.QueryBuilders; +import org.opensearch.search.SearchHit; +import org.opensearch.search.builder.SearchSourceBuilder; + +import java.util.HashSet; +import java.util.Map; +import java.util.Set; + +/** + * An implementation of {@link AbstractQuerySampler} that uses all UBI queries without any sampling. + */ +public class AllQueriesQuerySampler extends AbstractQuerySampler { + + private final NodeClient client; + private final AllQueriesQuerySamplerParameters parameters; + + /** + * Creates a new sampler. + * @param client The OpenSearch {@link NodeClient client}. + */ + public AllQueriesQuerySampler(final NodeClient client, final AllQueriesQuerySamplerParameters parameters) { + this.client = client; + this.parameters = parameters; + } + + @Override + public String getName() { + return "none"; + } + + @Override + public String sample() throws Exception { + + // Get queries from the UBI queries index. 
+ final SearchSourceBuilder searchSourceBuilder = new SearchSourceBuilder(); + searchSourceBuilder.query(QueryBuilders.matchAllQuery()); + searchSourceBuilder.from(0); + searchSourceBuilder.size(parameters.getQuerySetSize()); + + final SearchRequest searchRequest = new SearchRequest(SearchQualityEvaluationPlugin.UBI_QUERIES_INDEX_NAME); + searchRequest.source(searchSourceBuilder); + + final SearchResponse searchResponse = client.search(searchRequest).get(); + + // LOGGER.info("Found {} user queries from the ubi_queries index.", searchResponse.getHits().getTotalHits().toString()); + + final Set queries = new HashSet<>(); + for(final SearchHit hit : searchResponse.getHits().getHits()) { + final Map fields = hit.getSourceAsMap(); + queries.add(fields.get("user_query").toString()); + } + + // LOGGER.info("Found {} user queries from the ubi_queries index.", queries.size()); + + return indexQuerySet(client, parameters.getName(), parameters.getDescription(), parameters.getSampling(), queries); + + } + +} diff --git a/opensearch-search-quality-evaluation-plugin/src/main/java/org/opensearch/eval/samplers/AllQueriesQuerySamplerParameters.java b/opensearch-search-quality-evaluation-plugin/src/main/java/org/opensearch/eval/samplers/AllQueriesQuerySamplerParameters.java new file mode 100644 index 0000000..3149668 --- /dev/null +++ b/opensearch-search-quality-evaluation-plugin/src/main/java/org/opensearch/eval/samplers/AllQueriesQuerySamplerParameters.java @@ -0,0 +1,17 @@ +/* + * Copyright OpenSearch Contributors + * SPDX-License-Identifier: Apache-2.0 + * + * The OpenSearch Contributors require contributions made to + * this file be licensed under the Apache-2.0 license or a + * compatible open source license. 
+ */ +package org.opensearch.eval.samplers; + +public class AllQueriesQuerySamplerParameters extends AbstractSamplerParameters { + + public AllQueriesQuerySamplerParameters(final String name, final String description, final String sampling, final int querySetSize) { + super(name, description, sampling, querySetSize); + } + +} diff --git a/opensearch-search-quality-evaluation-plugin/src/main/java/org/opensearch/eval/samplers/ProbabilityProportionalToSizeQuerySampler.java b/opensearch-search-quality-evaluation-plugin/src/main/java/org/opensearch/eval/samplers/ProbabilityProportionalToSizeAbstractQuerySampler.java similarity index 56% rename from opensearch-search-quality-evaluation-plugin/src/main/java/org/opensearch/eval/samplers/ProbabilityProportionalToSizeQuerySampler.java rename to opensearch-search-quality-evaluation-plugin/src/main/java/org/opensearch/eval/samplers/ProbabilityProportionalToSizeAbstractQuerySampler.java index 1307e9f..d348dc0 100644 --- a/opensearch-search-quality-evaluation-plugin/src/main/java/org/opensearch/eval/samplers/ProbabilityProportionalToSizeQuerySampler.java +++ b/opensearch-search-quality-evaluation-plugin/src/main/java/org/opensearch/eval/samplers/ProbabilityProportionalToSizeAbstractQuerySampler.java @@ -8,7 +8,15 @@ */ package org.opensearch.eval.samplers; -import org.opensearch.eval.judgments.model.ubi.query.UbiQuery; +import org.apache.logging.log4j.LogManager; +import org.apache.logging.log4j.Logger; +import org.opensearch.action.search.SearchRequest; +import org.opensearch.action.search.SearchResponse; +import org.opensearch.client.node.NodeClient; +import org.opensearch.eval.SearchQualityEvaluationPlugin; +import org.opensearch.index.query.QueryBuilders; +import org.opensearch.search.SearchHit; +import org.opensearch.search.builder.SearchSourceBuilder; import java.util.ArrayList; import java.util.Collection; @@ -20,19 +28,24 @@ import java.util.Set; /** - * An implementation of {@link QuerySampler} that uses PPTSS sampling. 
+ * An implementation of {@link AbstractQuerySampler} that uses PPTSS sampling. * See https://opensourceconnections.com/blog/2022/10/13/how-to-succeed-with-explicit-relevance-evaluation-using-probability-proportional-to-size-sampling/ * for more information on PPTSS. */ -public class ProbabilityProportionalToSizeQuerySampler implements QuerySampler { +public class ProbabilityProportionalToSizeAbstractQuerySampler extends AbstractQuerySampler { + private static final Logger LOGGER = LogManager.getLogger(ProbabilityProportionalToSizeAbstractQuerySampler.class); + + private final NodeClient client; private final ProbabilityProportionalToSizeParameters parameters; /** * Creates a new PPTSS sampler. + * @param client The OpenSearch {@link NodeClient client}. * @param parameters The {@link ProbabilityProportionalToSizeParameters parameters} for the sampling. */ - public ProbabilityProportionalToSizeQuerySampler(final ProbabilityProportionalToSizeParameters parameters) { + public ProbabilityProportionalToSizeAbstractQuerySampler(final NodeClient client, final ProbabilityProportionalToSizeParameters parameters) { + this.client = client; this.parameters = parameters; } @@ -42,7 +55,29 @@ public String getName() { } @Override - public Collection sample(final Collection userQueries) { + public String sample() throws Exception { + + // TODO: Can this be changed to an aggregation? + // An aggregation is limited (?) to 10,000 which could miss some queries. + + // Get queries from the UBI queries index. + final SearchSourceBuilder searchSourceBuilder = new SearchSourceBuilder(); + searchSourceBuilder.query(QueryBuilders.matchAllQuery()); + searchSourceBuilder.from(0); + // TODO: Need to get all queries. 
+ searchSourceBuilder.size(10000); + + final SearchRequest searchRequest = new SearchRequest(SearchQualityEvaluationPlugin.UBI_QUERIES_INDEX_NAME); + searchRequest.source(searchSourceBuilder); + + final SearchResponse searchResponse = client.search(searchRequest).get(); + + final Collection userQueries = new ArrayList<>(); + + for(final SearchHit hit : searchResponse.getHits().getHits()) { + final Map fields = hit.getSourceAsMap(); + userQueries.add(fields.get("user_query").toString()); + } final Map weights = new HashMap<>(); @@ -62,8 +97,8 @@ public Collection sample(final Collection userQueries) { // Ensure all normalized weights sum to 1. final double sumOfNormalizedWeights = normalizedWeights.values().stream().reduce(0.0, Double::sum); - if(sumOfNormalizedWeights != 1.0) { - throw new RuntimeException("Summed normalized weights do not equal 1.0"); + if(!compare(1.0, sumOfNormalizedWeights)) { + throw new RuntimeException("Summed normalized weights do not equal 1.0: Actual value: " + sumOfNormalizedWeights); } final Collection querySet = new ArrayList<>(); @@ -89,14 +124,19 @@ public Collection sample(final Collection userQueries) { for(Map.Entry entry : normalizedWeights.entrySet()) { if(entry.getValue() == nearestWeight) { querySet.add(entry.getKey()); + LOGGER.info("Generated random value: {}; Closest value = {}", random, entry.getKey()); break; } } } - return querySet; + return indexQuerySet(client, parameters.getName(), parameters.getDescription(), parameters.getSampling(), querySet); + + } + public static boolean compare(double a, double b) { + return Math.abs(a - b) < 0.00001; } } diff --git a/opensearch-search-quality-evaluation-plugin/src/main/java/org/opensearch/eval/samplers/ProbabilityProportionalToSizeParameters.java b/opensearch-search-quality-evaluation-plugin/src/main/java/org/opensearch/eval/samplers/ProbabilityProportionalToSizeParameters.java index 0001821..d5e4311 100644 --- 
a/opensearch-search-quality-evaluation-plugin/src/main/java/org/opensearch/eval/samplers/ProbabilityProportionalToSizeParameters.java +++ b/opensearch-search-quality-evaluation-plugin/src/main/java/org/opensearch/eval/samplers/ProbabilityProportionalToSizeParameters.java @@ -8,16 +8,10 @@ */ package org.opensearch.eval.samplers; -public class ProbabilityProportionalToSizeParameters { +public class ProbabilityProportionalToSizeParameters extends AbstractSamplerParameters { - private final int querySetSize; - - public ProbabilityProportionalToSizeParameters(int querySetSize) { - this.querySetSize = querySetSize; - } - - public int getQuerySetSize() { - return querySetSize; + public ProbabilityProportionalToSizeParameters(final String name, final String description, final String sampling, final int querySetSize) { + super(name, description, sampling, querySetSize); } } diff --git a/opensearch-search-quality-evaluation-plugin/src/main/java/org/opensearch/eval/samplers/QuerySampler.java b/opensearch-search-quality-evaluation-plugin/src/main/java/org/opensearch/eval/samplers/QuerySampler.java deleted file mode 100644 index e943599..0000000 --- a/opensearch-search-quality-evaluation-plugin/src/main/java/org/opensearch/eval/samplers/QuerySampler.java +++ /dev/null @@ -1,33 +0,0 @@ -/* - * Copyright OpenSearch Contributors - * SPDX-License-Identifier: Apache-2.0 - * - * The OpenSearch Contributors require contributions made to - * this file be licensed under the Apache-2.0 license or a - * compatible open source license. - */ -package org.opensearch.eval.samplers; - -import org.opensearch.eval.judgments.model.ubi.query.UbiQuery; - -import java.util.Collection; - -/** - * An interface for sampling UBI queries. - */ -public interface QuerySampler { - - /** - * Gets the name of the sampler. - * @return The name of the sampler. - */ - String getName(); - - /** - * Samples the queries. - * @param userQueries A collection of user queries from UBI queries. 
- * @return A collection of sampled user queries. - */ - Collection sample(Collection userQueries); - -} From 175f3236a9dcf37bd251d28ee44be4800664dd2b Mon Sep 17 00:00:00 2001 From: jzonthemtn Date: Fri, 22 Nov 2024 10:11:37 -0500 Subject: [PATCH 3/4] Adding scroll for getting queries. --- .../docker-compose.yaml | 2 +- ...roportionalToSizeAbstractQuerySampler.java | 56 ++++++++++++------- 2 files changed, 37 insertions(+), 21 deletions(-) diff --git a/opensearch-search-quality-evaluation-plugin/docker-compose.yaml b/opensearch-search-quality-evaluation-plugin/docker-compose.yaml index 5e8d7cf..c2ab3b7 100644 --- a/opensearch-search-quality-evaluation-plugin/docker-compose.yaml +++ b/opensearch-search-quality-evaluation-plugin/docker-compose.yaml @@ -10,7 +10,7 @@ services: logger.level: info OPENSEARCH_INITIAL_ADMIN_PASSWORD: SuperSecretPassword_123 http.max_content_length: 500mb - OPENSEARCH_JAVA_OPTS: "-Xms8192m -Xmx8192m" + OPENSEARCH_JAVA_OPTS: "-Xms16g -Xmx16g" ulimits: memlock: soft: -1 diff --git a/opensearch-search-quality-evaluation-plugin/src/main/java/org/opensearch/eval/samplers/ProbabilityProportionalToSizeAbstractQuerySampler.java b/opensearch-search-quality-evaluation-plugin/src/main/java/org/opensearch/eval/samplers/ProbabilityProportionalToSizeAbstractQuerySampler.java index d348dc0..9eaea47 100644 --- a/opensearch-search-quality-evaluation-plugin/src/main/java/org/opensearch/eval/samplers/ProbabilityProportionalToSizeAbstractQuerySampler.java +++ b/opensearch-search-quality-evaluation-plugin/src/main/java/org/opensearch/eval/samplers/ProbabilityProportionalToSizeAbstractQuerySampler.java @@ -12,19 +12,20 @@ import org.apache.logging.log4j.Logger; import org.opensearch.action.search.SearchRequest; import org.opensearch.action.search.SearchResponse; +import org.opensearch.action.search.SearchScrollRequest; import org.opensearch.client.node.NodeClient; +import org.opensearch.common.unit.TimeValue; import 
org.opensearch.eval.SearchQualityEvaluationPlugin; import org.opensearch.index.query.QueryBuilders; +import org.opensearch.search.Scroll; import org.opensearch.search.SearchHit; import org.opensearch.search.builder.SearchSourceBuilder; import java.util.ArrayList; import java.util.Collection; -import java.util.Comparator; import java.util.HashMap; import java.util.HashSet; import java.util.Map; -import java.util.NoSuchElementException; import java.util.Set; /** @@ -64,21 +65,36 @@ public String sample() throws Exception { final SearchSourceBuilder searchSourceBuilder = new SearchSourceBuilder(); searchSourceBuilder.query(QueryBuilders.matchAllQuery()); searchSourceBuilder.from(0); - // TODO: Need to get all queries. - searchSourceBuilder.size(10000); + searchSourceBuilder.size(1000); + final Scroll scroll = new Scroll(TimeValue.timeValueMinutes(10L)); - final SearchRequest searchRequest = new SearchRequest(SearchQualityEvaluationPlugin.UBI_QUERIES_INDEX_NAME); + final SearchRequest searchRequest = new SearchRequest(SearchQualityEvaluationPlugin.UBI_QUERIES_INDEX_NAME).scroll(scroll); searchRequest.source(searchSourceBuilder); final SearchResponse searchResponse = client.search(searchRequest).get(); + String scrollId = searchResponse.getScrollId(); + SearchHit[] searchHits = searchResponse.getHits().getHits(); + final Collection userQueries = new ArrayList<>(); - for(final SearchHit hit : searchResponse.getHits().getHits()) { - final Map fields = hit.getSourceAsMap(); - userQueries.add(fields.get("user_query").toString()); + while (searchHits != null && searchHits.length > 0) { + + for(final SearchHit hit : searchResponse.getHits().getHits()) { + final Map fields = hit.getSourceAsMap(); + userQueries.add(fields.get("user_query").toString()); + } + + final SearchScrollRequest scrollRequest = new SearchScrollRequest(scrollId); + scrollRequest.scroll(scroll); + + scrollId = searchResponse.getScrollId(); + searchHits = searchResponse.getHits().getHits(); + } + 
+ LOGGER.info("User queries found: {}", userQueries); + final Map weights = new HashMap<>(); // Increment the weight for each user query. @@ -93,6 +109,7 @@ public String sample() throws Exception { final Map normalizedWeights = new HashMap<>(); for(final String userQuery : weights.keySet()) { normalizedWeights.put(userQuery, weights.get(userQuery) / (double) countOfQueries); + LOGGER.info("{}: {}/{} = {}", userQuery, weights.get(userQuery), countOfQueries, normalizedWeights.get(userQuery)); } // Ensure all normalized weights sum to 1. @@ -114,21 +131,20 @@ public String sample() throws Exception { } while (randomNumbers.contains(random)); randomNumbers.add(random); - // Find the weight closest to the random weight. - double finalRandom = random; - double nearestWeight = normalizedWeights.values().stream() - .min(Comparator.comparingDouble(i -> Math.abs(i - finalRandom))) - .orElseThrow(() -> new NoSuchElementException("No value present")); - - // Find the query having the weight closest to this random number. - for(Map.Entry entry : normalizedWeights.entrySet()) { - if(entry.getValue() == nearestWeight) { - querySet.add(entry.getKey()); - LOGGER.info("Generated random value: {}; Closest value = {}", random, entry.getKey()); - break; + // Find the query whose normalized weight is closest to the random number.
+ double smallestDelta = Integer.MAX_VALUE; + String closestQuery = null; + for(final String query : normalizedWeights.keySet()) { + final double delta = Math.abs(normalizedWeights.get(query) - random); + if(delta < smallestDelta) { + smallestDelta = delta; + closestQuery = query; } + } + LOGGER.info("Generated random value: {}; Smallest delta = {}; Closest query = {}", random, smallestDelta, closestQuery); + } return indexQuerySet(client, parameters.getName(), parameters.getDescription(), parameters.getSampling(), querySet); From 09ebd5a551a931d105bc3f9f0bf1e0704a35ce71 Mon Sep 17 00:00:00 2001 From: jzonthemtn Date: Fri, 22 Nov 2024 10:45:45 -0500 Subject: [PATCH 4/4] #44 Working on wiring up PPTSS. --- .../docker-compose.yaml | 2 +- .../create-query-set-using-pptss-sampling.sh | 2 +- .../judgments/model/ubi/event/UbiEvent.java | 9 +++++++- .../eval/samplers/AllQueriesQuerySampler.java | 1 + ...roportionalToSizeAbstractQuerySampler.java | 23 ++++++++++++------- 5 files changed, 26 insertions(+), 11 deletions(-) diff --git a/opensearch-search-quality-evaluation-plugin/docker-compose.yaml b/opensearch-search-quality-evaluation-plugin/docker-compose.yaml index c2ab3b7..8938320 100644 --- a/opensearch-search-quality-evaluation-plugin/docker-compose.yaml +++ b/opensearch-search-quality-evaluation-plugin/docker-compose.yaml @@ -10,7 +10,7 @@ services: logger.level: info OPENSEARCH_INITIAL_ADMIN_PASSWORD: SuperSecretPassword_123 http.max_content_length: 500mb - OPENSEARCH_JAVA_OPTS: "-Xms16g -Xmx16g" + OPENSEARCH_JAVA_OPTS: "-Xms8g -Xmx8g" ulimits: memlock: soft: -1 diff --git a/opensearch-search-quality-evaluation-plugin/scripts/create-query-set-using-pptss-sampling.sh b/opensearch-search-quality-evaluation-plugin/scripts/create-query-set-using-pptss-sampling.sh index 96c822a..283afef 100755 --- a/opensearch-search-quality-evaluation-plugin/scripts/create-query-set-using-pptss-sampling.sh +++ 
b/opensearch-search-quality-evaluation-plugin/scripts/create-query-set-using-pptss-sampling.sh @@ -1,7 +1,7 @@ #!/bin/bash -e #QUERY_SET=`curl -s -X POST "http://localhost:9200/_plugins/search_quality_eval/queryset?name=test&description=fake&sampling=pptss" | jq .query_set | tr -d '"'` -curl -s -X POST "http://localhost:9200/_plugins/search_quality_eval/queryset?name=test&description=fake&sampling=pptss&query_set_size=500" +curl -s -X POST "http://localhost:9200/_plugins/search_quality_eval/queryset?name=test&description=fake&sampling=pptss&query_set_size=5000" #echo ${QUERY_SET} diff --git a/opensearch-search-quality-evaluation-plugin/src/main/java/org/opensearch/eval/judgments/model/ubi/event/UbiEvent.java b/opensearch-search-quality-evaluation-plugin/src/main/java/org/opensearch/eval/judgments/model/ubi/event/UbiEvent.java index fad79d6..ca9070d 100644 --- a/opensearch-search-quality-evaluation-plugin/src/main/java/org/opensearch/eval/judgments/model/ubi/event/UbiEvent.java +++ b/opensearch-search-quality-evaluation-plugin/src/main/java/org/opensearch/eval/judgments/model/ubi/event/UbiEvent.java @@ -11,7 +11,7 @@ import com.google.gson.annotations.SerializedName; /** - * A UBI event. + * Creates a representation of a UBI event. */ public class UbiEvent { @@ -27,6 +27,13 @@ public class UbiEvent { @SerializedName("event_attributes") private EventAttributes eventAttributes; + /** + * Creates a new representation of a UBI event.
+ */ + public UbiEvent() { + + } + @Override public String toString() { return actionName + ", " + clientId + ", " + queryId + ", " + eventAttributes.getObject().toString() + ", " + eventAttributes.getPosition().getIndex(); diff --git a/opensearch-search-quality-evaluation-plugin/src/main/java/org/opensearch/eval/samplers/AllQueriesQuerySampler.java b/opensearch-search-quality-evaluation-plugin/src/main/java/org/opensearch/eval/samplers/AllQueriesQuerySampler.java index 29fb0b7..9a584f1 100644 --- a/opensearch-search-quality-evaluation-plugin/src/main/java/org/opensearch/eval/samplers/AllQueriesQuerySampler.java +++ b/opensearch-search-quality-evaluation-plugin/src/main/java/org/opensearch/eval/samplers/AllQueriesQuerySampler.java @@ -46,6 +46,7 @@ public String getName() { public String sample() throws Exception { // Get queries from the UBI queries index. + // TODO: This needs to use scroll or something else. final SearchSourceBuilder searchSourceBuilder = new SearchSourceBuilder(); searchSourceBuilder.query(QueryBuilders.matchAllQuery()); searchSourceBuilder.from(0); diff --git a/opensearch-search-quality-evaluation-plugin/src/main/java/org/opensearch/eval/samplers/ProbabilityProportionalToSizeAbstractQuerySampler.java b/opensearch-search-quality-evaluation-plugin/src/main/java/org/opensearch/eval/samplers/ProbabilityProportionalToSizeAbstractQuerySampler.java index 9eaea47..0e83d32 100644 --- a/opensearch-search-quality-evaluation-plugin/src/main/java/org/opensearch/eval/samplers/ProbabilityProportionalToSizeAbstractQuerySampler.java +++ b/opensearch-search-quality-evaluation-plugin/src/main/java/org/opensearch/eval/samplers/ProbabilityProportionalToSizeAbstractQuerySampler.java @@ -64,14 +64,14 @@ public String sample() throws Exception { // Get queries from the UBI queries index. 
final SearchSourceBuilder searchSourceBuilder = new SearchSourceBuilder(); searchSourceBuilder.query(QueryBuilders.matchAllQuery()); - searchSourceBuilder.from(0); searchSourceBuilder.size(1000); final Scroll scroll = new Scroll(TimeValue.timeValueMinutes(10L)); - final SearchRequest searchRequest = new SearchRequest(SearchQualityEvaluationPlugin.UBI_QUERIES_INDEX_NAME).scroll(scroll); + final SearchRequest searchRequest = new SearchRequest(SearchQualityEvaluationPlugin.UBI_QUERIES_INDEX_NAME); + searchRequest.scroll(scroll); searchRequest.source(searchSourceBuilder); - final SearchResponse searchResponse = client.search(searchRequest).get(); + SearchResponse searchResponse = client.search(searchRequest).get(); String scrollId = searchResponse.getScrollId(); SearchHit[] searchHits = searchResponse.getHits().getHits(); @@ -80,20 +80,25 @@ public String sample() throws Exception { while (searchHits != null && searchHits.length > 0) { - for(final SearchHit hit : searchResponse.getHits().getHits()) { + LOGGER.info("search hits size = " + searchHits.length); + + for(final SearchHit hit : searchHits) { final Map fields = hit.getSourceAsMap(); userQueries.add(fields.get("user_query").toString()); + // LOGGER.info("user queries count: {} user query: {}", userQueries.size(), fields.get("user_query").toString()); } final SearchScrollRequest scrollRequest = new SearchScrollRequest(scrollId); scrollRequest.scroll(scroll); + searchResponse = client.searchScroll(scrollRequest).get(); + scrollId = searchResponse.getScrollId(); searchHits = searchResponse.getHits().getHits(); } - LOGGER.info("User queries found: {}", userQueries); + // LOGGER.info("User queries found: {}", userQueries); final Map weights = new HashMap<>(); @@ -109,16 +114,18 @@ public String sample() throws Exception { final Map normalizedWeights = new HashMap<>(); for(final String userQuery : weights.keySet()) { normalizedWeights.put(userQuery, weights.get(userQuery) / (double) countOfQueries); - LOGGER.info("{}: 
{}/{} = {}", userQuery, weights.get(userQuery), countOfQueries, normalizedWeights.get(userQuery)); + //LOGGER.info("{}: {}/{} = {}", userQuery, weights.get(userQuery), countOfQueries, normalizedWeights.get(userQuery)); } // Ensure all normalized weights sum to 1. final double sumOfNormalizedWeights = normalizedWeights.values().stream().reduce(0.0, Double::sum); if(!compare(1.0, sumOfNormalizedWeights)) { throw new RuntimeException("Summed normalized weights do not equal 1.0: Actual value: " + sumOfNormalizedWeights); + } else { + LOGGER.info("Summed normalized weights sum to {}", sumOfNormalizedWeights); } - final Collection querySet = new ArrayList<>(); + final Set querySet = new HashSet<>(); final Set randomNumbers = new HashSet<>(); // Generate a random number between 0 and 1 for the size of the query set. @@ -143,7 +150,7 @@ public String sample() throws Exception { } - LOGGER.info("Generated random value: {}; Smallest delta = {}; Closest query = {}", random, smallestDelta, closestQuery); + // LOGGER.info("Generated random value: {}; Smallest delta = {}; Closest query = {}", random, smallestDelta, closestQuery); }