From fd43adc78aff661b9a0b2435587f6d4b03009aa8 Mon Sep 17 00:00:00 2001 From: Ioana Tagirta Date: Mon, 21 Oct 2024 15:43:47 +0200 Subject: [PATCH] ES|QL Add initial support for semantic_text field type (#113920) * Add initial support for semantic_text field type * Update docs/changelog/113920.yaml * More tests and fixes * Use mock inference service * Fix tests * Spotless * Fix mixed-cluster and multi-clusters tests * sort * Attempt another fix for bwc tests * Spotless * Fix merge * Attempt another fix * Don't load the inference-service-test plugin for mixed versions/clusters * Add more tests, address review comments * trivial * revert * post-merge fix block loader * post-merge fix compile * add mixed version testing * whitespace * fix MultiClusterSpecIT * add more fields to mapping * Revert mixed version testing * whitespace --------- Co-authored-by: ChrisHegarty Co-authored-by: Elastic Machine --- docs/changelog/113920.yaml | 5 + .../esql/core/plugin/EsqlCorePlugin.java | 1 + .../xpack/esql/core/type/DataType.java | 12 +- .../esql/qa/mixed/MixedClusterEsqlSpecIT.java | 5 + .../xpack/esql/ccq/MultiClusterSpecIT.java | 5 + .../esql/qa/server/multi-node/build.gradle | 1 + .../xpack/esql/qa/multi_node/EsqlSpecIT.java | 2 +- .../esql/qa/server/single-node/build.gradle | 1 + .../xpack/esql/qa/single_node/EsqlSpecIT.java | 2 +- .../xpack/esql/qa/rest/EsqlSpecTestCase.java | 20 +- .../elasticsearch/xpack/esql/CsvAssert.java | 6 +- .../xpack/esql/CsvTestUtils.java | 1 + .../xpack/esql/CsvTestsDataLoader.java | 132 +++++++++++-- .../xpack/esql/EsqlTestUtils.java | 2 +- .../main/resources/mapping-semantic_text.json | 73 ++++++++ .../src/main/resources/semantic_text.csv | 4 + .../src/main/resources/semantic_text.csv-spec | 175 ++++++++++++++++++ .../xpack/esql/action/EsqlCapabilities.java | 6 +- .../xpack/esql/action/PositionToXContent.java | 2 +- .../xpack/esql/action/ResponseValueUtils.java | 2 +- .../esql/planner/LocalExecutionPlanner.java | 2 +- 
.../xpack/esql/planner/PlannerUtils.java | 2 +- .../esql/action/EsqlQueryResponseTests.java | 4 +- .../scalar/conditional/CaseTests.java | 2 +- .../mapper/SemanticTextFieldMapper.java | 9 + .../test/esql/40_unsupported_types.yml | 49 +++++ 26 files changed, 490 insertions(+), 35 deletions(-) create mode 100644 docs/changelog/113920.yaml create mode 100644 x-pack/plugin/esql/qa/testFixtures/src/main/resources/mapping-semantic_text.json create mode 100644 x-pack/plugin/esql/qa/testFixtures/src/main/resources/semantic_text.csv create mode 100644 x-pack/plugin/esql/qa/testFixtures/src/main/resources/semantic_text.csv-spec diff --git a/docs/changelog/113920.yaml b/docs/changelog/113920.yaml new file mode 100644 index 0000000000000..4699ae6d7dd65 --- /dev/null +++ b/docs/changelog/113920.yaml @@ -0,0 +1,5 @@ +pr: 113920 +summary: Add initial support for `semantic_text` field type +area: Search +type: enhancement +issues: [] diff --git a/x-pack/plugin/esql-core/src/main/java/org/elasticsearch/xpack/esql/core/plugin/EsqlCorePlugin.java b/x-pack/plugin/esql-core/src/main/java/org/elasticsearch/xpack/esql/core/plugin/EsqlCorePlugin.java index 639d8ed68d0a3..d84a471815a9a 100644 --- a/x-pack/plugin/esql-core/src/main/java/org/elasticsearch/xpack/esql/core/plugin/EsqlCorePlugin.java +++ b/x-pack/plugin/esql-core/src/main/java/org/elasticsearch/xpack/esql/core/plugin/EsqlCorePlugin.java @@ -14,4 +14,5 @@ public class EsqlCorePlugin extends Plugin implements ExtensiblePlugin { public static final FeatureFlag DATE_NANOS_FEATURE_FLAG = new FeatureFlag("esql_date_nanos"); + public static final FeatureFlag SEMANTIC_TEXT_FEATURE_FLAG = new FeatureFlag("esql_semantic_text"); } diff --git a/x-pack/plugin/esql-core/src/main/java/org/elasticsearch/xpack/esql/core/type/DataType.java b/x-pack/plugin/esql-core/src/main/java/org/elasticsearch/xpack/esql/core/type/DataType.java index 12699ca3ee720..5041c96128a1e 100644 --- 
a/x-pack/plugin/esql-core/src/main/java/org/elasticsearch/xpack/esql/core/type/DataType.java +++ b/x-pack/plugin/esql-core/src/main/java/org/elasticsearch/xpack/esql/core/type/DataType.java @@ -194,7 +194,14 @@ public enum DataType { * inside alongside time-series aggregations. These fields are not parsable from the * mapping and should be hidden from users. */ - PARTIAL_AGG(builder().esType("partial_agg").unknownSize()); + PARTIAL_AGG(builder().esType("partial_agg").unknownSize()), + /** + * String fields that are split into chunks, where each chunk has attached embeddings + * used for semantic search. Generally ESQL only sees {@code semantic_text} fields when + * loaded from the index and ESQL will load these fields as strings without their attached + * chunks or embeddings. + */ + SEMANTIC_TEXT(builder().esType("semantic_text").unknownSize()); /** * Types that are actively being built. These types are not returned @@ -203,7 +210,8 @@ public enum DataType { * check that sending them to a function produces a sane error message. 
*/ public static final Map UNDER_CONSTRUCTION = Map.ofEntries( - Map.entry(DATE_NANOS, EsqlCorePlugin.DATE_NANOS_FEATURE_FLAG) + Map.entry(DATE_NANOS, EsqlCorePlugin.DATE_NANOS_FEATURE_FLAG), + Map.entry(SEMANTIC_TEXT, EsqlCorePlugin.SEMANTIC_TEXT_FEATURE_FLAG) ); private final String typeName; diff --git a/x-pack/plugin/esql/qa/server/mixed-cluster/src/javaRestTest/java/org/elasticsearch/xpack/esql/qa/mixed/MixedClusterEsqlSpecIT.java b/x-pack/plugin/esql/qa/server/mixed-cluster/src/javaRestTest/java/org/elasticsearch/xpack/esql/qa/mixed/MixedClusterEsqlSpecIT.java index d0d6d5fa49c42..0e23b29172c32 100644 --- a/x-pack/plugin/esql/qa/server/mixed-cluster/src/javaRestTest/java/org/elasticsearch/xpack/esql/qa/mixed/MixedClusterEsqlSpecIT.java +++ b/x-pack/plugin/esql/qa/server/mixed-cluster/src/javaRestTest/java/org/elasticsearch/xpack/esql/qa/mixed/MixedClusterEsqlSpecIT.java @@ -86,4 +86,9 @@ protected boolean supportsAsync() { protected boolean enableRoundingDoubleValuesOnAsserting() { return true; } + + @Override + protected boolean supportsInferenceTestService() { + return false; + } } diff --git a/x-pack/plugin/esql/qa/server/multi-clusters/src/javaRestTest/java/org/elasticsearch/xpack/esql/ccq/MultiClusterSpecIT.java b/x-pack/plugin/esql/qa/server/multi-clusters/src/javaRestTest/java/org/elasticsearch/xpack/esql/ccq/MultiClusterSpecIT.java index 3e799730f7269..8446ac63f43a1 100644 --- a/x-pack/plugin/esql/qa/server/multi-clusters/src/javaRestTest/java/org/elasticsearch/xpack/esql/ccq/MultiClusterSpecIT.java +++ b/x-pack/plugin/esql/qa/server/multi-clusters/src/javaRestTest/java/org/elasticsearch/xpack/esql/ccq/MultiClusterSpecIT.java @@ -261,4 +261,9 @@ static boolean hasIndexMetadata(String query) { protected boolean enableRoundingDoubleValuesOnAsserting() { return true; } + + @Override + protected boolean supportsInferenceTestService() { + return false; + } } diff --git a/x-pack/plugin/esql/qa/server/multi-node/build.gradle 
b/x-pack/plugin/esql/qa/server/multi-node/build.gradle index 9f8ca78aba81e..2dcc001c4e159 100644 --- a/x-pack/plugin/esql/qa/server/multi-node/build.gradle +++ b/x-pack/plugin/esql/qa/server/multi-node/build.gradle @@ -11,6 +11,7 @@ dependencies { clusterPlugins project(':plugins:mapper-size') clusterPlugins project(':plugins:mapper-murmur3') + clusterPlugins project(':x-pack:plugin:inference:qa:test-service-plugin') } GradleUtils.extendSourceSet(project, "javaRestTest", "yamlRestTest") diff --git a/x-pack/plugin/esql/qa/server/multi-node/src/javaRestTest/java/org/elasticsearch/xpack/esql/qa/multi_node/EsqlSpecIT.java b/x-pack/plugin/esql/qa/server/multi-node/src/javaRestTest/java/org/elasticsearch/xpack/esql/qa/multi_node/EsqlSpecIT.java index bda10709ed947..64c113345bd53 100644 --- a/x-pack/plugin/esql/qa/server/multi-node/src/javaRestTest/java/org/elasticsearch/xpack/esql/qa/multi_node/EsqlSpecIT.java +++ b/x-pack/plugin/esql/qa/server/multi-node/src/javaRestTest/java/org/elasticsearch/xpack/esql/qa/multi_node/EsqlSpecIT.java @@ -14,7 +14,7 @@ public class EsqlSpecIT extends EsqlSpecTestCase { @ClassRule - public static ElasticsearchCluster cluster = Clusters.testCluster(spec -> {}); + public static ElasticsearchCluster cluster = Clusters.testCluster(spec -> spec.plugin("inference-service-test")); @Override protected String getTestRestCluster() { diff --git a/x-pack/plugin/esql/qa/server/single-node/build.gradle b/x-pack/plugin/esql/qa/server/single-node/build.gradle index ab8e3d4b32d9a..a37db5dc245e0 100644 --- a/x-pack/plugin/esql/qa/server/single-node/build.gradle +++ b/x-pack/plugin/esql/qa/server/single-node/build.gradle @@ -22,6 +22,7 @@ dependencies { clusterPlugins project(':plugins:mapper-size') clusterPlugins project(':plugins:mapper-murmur3') + clusterPlugins project(':x-pack:plugin:inference:qa:test-service-plugin') } restResources { diff --git 
a/x-pack/plugin/esql/qa/server/single-node/src/javaRestTest/java/org/elasticsearch/xpack/esql/qa/single_node/EsqlSpecIT.java b/x-pack/plugin/esql/qa/server/single-node/src/javaRestTest/java/org/elasticsearch/xpack/esql/qa/single_node/EsqlSpecIT.java index 676fffd553ca8..368eebe808eee 100644 --- a/x-pack/plugin/esql/qa/server/single-node/src/javaRestTest/java/org/elasticsearch/xpack/esql/qa/single_node/EsqlSpecIT.java +++ b/x-pack/plugin/esql/qa/server/single-node/src/javaRestTest/java/org/elasticsearch/xpack/esql/qa/single_node/EsqlSpecIT.java @@ -18,7 +18,7 @@ @ThreadLeakFilters(filters = TestClustersThreadFilter.class) public class EsqlSpecIT extends EsqlSpecTestCase { @ClassRule - public static ElasticsearchCluster cluster = Clusters.testCluster(); + public static ElasticsearchCluster cluster = Clusters.testCluster(spec -> spec.plugin("inference-service-test")); @Override protected String getTestRestCluster() { diff --git a/x-pack/plugin/esql/qa/server/src/main/java/org/elasticsearch/xpack/esql/qa/rest/EsqlSpecTestCase.java b/x-pack/plugin/esql/qa/server/src/main/java/org/elasticsearch/xpack/esql/qa/rest/EsqlSpecTestCase.java index 319e67512c7ac..57f58fc448822 100644 --- a/x-pack/plugin/esql/qa/server/src/main/java/org/elasticsearch/xpack/esql/qa/rest/EsqlSpecTestCase.java +++ b/x-pack/plugin/esql/qa/server/src/main/java/org/elasticsearch/xpack/esql/qa/rest/EsqlSpecTestCase.java @@ -65,7 +65,10 @@ import static org.elasticsearch.xpack.esql.CsvTestUtils.ExpectedResults; import static org.elasticsearch.xpack.esql.CsvTestUtils.isEnabled; import static org.elasticsearch.xpack.esql.CsvTestUtils.loadCsvSpecValues; -import static org.elasticsearch.xpack.esql.CsvTestsDataLoader.CSV_DATASET_MAP; +import static org.elasticsearch.xpack.esql.CsvTestsDataLoader.availableDatasetsForEs; +import static org.elasticsearch.xpack.esql.CsvTestsDataLoader.clusterHasInferenceEndpoint; +import static org.elasticsearch.xpack.esql.CsvTestsDataLoader.createInferenceEndpoint; +import 
static org.elasticsearch.xpack.esql.CsvTestsDataLoader.deleteInferenceEndpoint; import static org.elasticsearch.xpack.esql.CsvTestsDataLoader.loadDataSetIntoEs; import static org.elasticsearch.xpack.esql.EsqlTestUtils.classpathResources; @@ -129,7 +132,11 @@ protected EsqlSpecTestCase( @Before public void setup() throws IOException { - if (indexExists(CSV_DATASET_MAP.keySet().iterator().next()) == false) { + if (supportsInferenceTestService() && clusterHasInferenceEndpoint(client()) == false) { + createInferenceEndpoint(client()); + } + + if (indexExists(availableDatasetsForEs(client()).iterator().next().indexName()) == false) { loadDataSetIntoEs(client()); } } @@ -148,6 +155,8 @@ public static void wipeTestData() throws IOException { throw e; } } + + deleteInferenceEndpoint(client()); } public boolean logResults() { @@ -164,6 +173,9 @@ public final void test() throws Throwable { } protected void shouldSkipTest(String testName) throws IOException { + if (testCase.requiredCapabilities.contains("semantic_text_type")) { + assumeTrue("Inference test service needs to be supported for semantic_text", supportsInferenceTestService()); + } checkCapabilities(adminClient(), testFeatureService, testName, testCase); assumeTrue("Test " + testName + " is not enabled", isEnabled(testName, instructions, Version.CURRENT)); } @@ -207,6 +219,10 @@ protected static void checkCapabilities(RestClient client, TestFeatureService te } } + protected boolean supportsInferenceTestService() { + return true; + } + protected final void doTest() throws Throwable { RequestObjectBuilder builder = new RequestObjectBuilder(randomFrom(XContentType.values())); diff --git a/x-pack/plugin/esql/qa/testFixtures/src/main/java/org/elasticsearch/xpack/esql/CsvAssert.java b/x-pack/plugin/esql/qa/testFixtures/src/main/java/org/elasticsearch/xpack/esql/CsvAssert.java index 36d785c24ab23..1a2aa122c85ca 100644 --- a/x-pack/plugin/esql/qa/testFixtures/src/main/java/org/elasticsearch/xpack/esql/CsvAssert.java +++ 
b/x-pack/plugin/esql/qa/testFixtures/src/main/java/org/elasticsearch/xpack/esql/CsvAssert.java @@ -134,7 +134,11 @@ private static void assertMetadata( || expectedType == UNSIGNED_LONG)) { continue; } - if (blockType == Type.KEYWORD && (expectedType == Type.IP || expectedType == Type.VERSION || expectedType == Type.TEXT)) { + if (blockType == Type.KEYWORD + && (expectedType == Type.IP + || expectedType == Type.VERSION + || expectedType == Type.TEXT + || expectedType == Type.SEMANTIC_TEXT)) { // Type.asType translates all bytes references into keywords continue; } diff --git a/x-pack/plugin/esql/qa/testFixtures/src/main/java/org/elasticsearch/xpack/esql/CsvTestUtils.java b/x-pack/plugin/esql/qa/testFixtures/src/main/java/org/elasticsearch/xpack/esql/CsvTestUtils.java index eaec6811fbc24..bd8bd0f688837 100644 --- a/x-pack/plugin/esql/qa/testFixtures/src/main/java/org/elasticsearch/xpack/esql/CsvTestUtils.java +++ b/x-pack/plugin/esql/qa/testFixtures/src/main/java/org/elasticsearch/xpack/esql/CsvTestUtils.java @@ -447,6 +447,7 @@ public enum Type { SCALED_FLOAT(s -> s == null ? 
null : scaledFloat(s, "100"), Double.class), KEYWORD(Object::toString, BytesRef.class), TEXT(Object::toString, BytesRef.class), + SEMANTIC_TEXT(Object::toString, BytesRef.class), IP( StringUtils::parseIP, (l, r) -> l instanceof String maybeIP diff --git a/x-pack/plugin/esql/qa/testFixtures/src/main/java/org/elasticsearch/xpack/esql/CsvTestsDataLoader.java b/x-pack/plugin/esql/qa/testFixtures/src/main/java/org/elasticsearch/xpack/esql/CsvTestsDataLoader.java index d63585086f1cd..cf9d66727a900 100644 --- a/x-pack/plugin/esql/qa/testFixtures/src/main/java/org/elasticsearch/xpack/esql/CsvTestsDataLoader.java +++ b/x-pack/plugin/esql/qa/testFixtures/src/main/java/org/elasticsearch/xpack/esql/CsvTestsDataLoader.java @@ -19,6 +19,7 @@ import org.apache.logging.log4j.core.config.plugins.util.PluginManager; import org.elasticsearch.client.Request; import org.elasticsearch.client.Response; +import org.elasticsearch.client.ResponseException; import org.elasticsearch.client.RestClient; import org.elasticsearch.client.RestClientBuilder; import org.elasticsearch.common.Strings; @@ -36,9 +37,11 @@ import java.net.URI; import java.net.URL; import java.util.ArrayList; +import java.util.HashSet; import java.util.List; import java.util.Map; import java.util.Set; +import java.util.stream.Collectors; import static org.elasticsearch.common.logging.LoggerMessageFormat.format; import static org.elasticsearch.xpack.esql.CsvTestUtils.COMMA_ESCAPING_REGEX; @@ -81,6 +84,7 @@ public class CsvTestsDataLoader { private static final TestsDataset K8S = new TestsDataset("k8s", "k8s-mappings.json", "k8s.csv").withSetting("k8s-settings.json"); private static final TestsDataset ADDRESSES = new TestsDataset("addresses"); private static final TestsDataset BOOKS = new TestsDataset("books"); + private static final TestsDataset SEMANTIC_TEXT = new TestsDataset("semantic_text").withInferenceEndpoint(true); public static final Map CSV_DATASET_MAP = Map.ofEntries( Map.entry(EMPLOYEES.indexName, EMPLOYEES), @@ 
-112,7 +116,8 @@ public class CsvTestsDataLoader { Map.entry(K8S.indexName, K8S), Map.entry(DISTANCES.indexName, DISTANCES), Map.entry(ADDRESSES.indexName, ADDRESSES), - Map.entry(BOOKS.indexName, BOOKS) + Map.entry(BOOKS.indexName, BOOKS), + Map.entry(SEMANTIC_TEXT.indexName, SEMANTIC_TEXT) ); private static final EnrichConfig LANGUAGES_ENRICH = new EnrichConfig("languages_policy", "enrich-policy-languages.json"); @@ -219,8 +224,13 @@ public static void main(String[] args) throws IOException { } } - private static void loadDataSetIntoEs(RestClient client, IndexCreator indexCreator) throws IOException { - loadDataSetIntoEs(client, LogManager.getLogger(CsvTestsDataLoader.class), indexCreator); + public static Set availableDatasetsForEs(RestClient client) throws IOException { + boolean inferenceEnabled = clusterHasInferenceEndpoint(client); + + return CSV_DATASET_MAP.values() + .stream() + .filter(d -> d.requiresInferenceEndpoint == false || inferenceEnabled) + .collect(Collectors.toCollection(HashSet::new)); } public static void loadDataSetIntoEs(RestClient client) throws IOException { @@ -229,22 +239,61 @@ public static void loadDataSetIntoEs(RestClient client) throws IOException { }); } - public static void loadDataSetIntoEs(RestClient client, Logger logger) throws IOException { - loadDataSetIntoEs(client, logger, (restClient, indexName, indexMapping, indexSettings) -> { - ESRestTestCase.createIndex(restClient, indexName, indexSettings, indexMapping, null); - }); - } + private static void loadDataSetIntoEs(RestClient client, IndexCreator indexCreator) throws IOException { + Logger logger = LogManager.getLogger(CsvTestsDataLoader.class); - private static void loadDataSetIntoEs(RestClient client, Logger logger, IndexCreator indexCreator) throws IOException { - for (var dataset : CSV_DATASET_MAP.values()) { + Set loadedDatasets = new HashSet<>(); + for (var dataset : availableDatasetsForEs(client)) { load(client, dataset, logger, indexCreator); + 
loadedDatasets.add(dataset.indexName); } - forceMerge(client, CSV_DATASET_MAP.keySet(), logger); + forceMerge(client, loadedDatasets, logger); for (var policy : ENRICH_POLICIES) { loadEnrichPolicy(client, policy.policyName, policy.policyFileName, logger); } } + /** The semantic_text mapping type requires an inference endpoint that needs to be set up before creating the index. */ + public static void createInferenceEndpoint(RestClient client) throws IOException { + Request request = new Request("PUT", "_inference/sparse_embedding/test_sparse_inference"); + request.setJsonEntity(""" + { + "service": "test_service", + "service_settings": { + "model": "my_model", + "api_key": "abc64" + }, + "task_settings": { + } + } + """); + client.performRequest(request); + } + + public static void deleteInferenceEndpoint(RestClient client) throws IOException { + try { + client.performRequest(new Request("DELETE", "_inference/test_sparse_inference")); + } catch (ResponseException e) { + // 404 here means the endpoint was not created + if (e.getResponse().getStatusLine().getStatusCode() != 404) { + throw e; + } + } + } + + public static boolean clusterHasInferenceEndpoint(RestClient client) throws IOException { + Request request = new Request("GET", "_inference/sparse_embedding/test_sparse_inference"); + try { + client.performRequest(request); + } catch (ResponseException e) { + if (e.getResponse().getStatusLine().getStatusCode() == 404) { + return false; + } + throw e; + } + return true; + } + private static void loadEnrichPolicy(RestClient client, String policyName, String policyFileName, Logger logger) throws IOException { URL policyMapping = CsvTestsDataLoader.class.getResource("/" + policyFileName); if (policyMapping == null) { @@ -511,34 +560,79 @@ public record TestsDataset( String dataFileName, String settingFileName, boolean allowSubFields, - Map typeMapping + Map typeMapping, + boolean requiresInferenceEndpoint ) { public TestsDataset(String indexName, String mappingFileName, 
String dataFileName) { - this(indexName, mappingFileName, dataFileName, null, true, null); + this(indexName, mappingFileName, dataFileName, null, true, null, false); } public TestsDataset(String indexName) { - this(indexName, "mapping-" + indexName + ".json", indexName + ".csv", null, true, null); + this(indexName, "mapping-" + indexName + ".json", indexName + ".csv", null, true, null, false); } public TestsDataset withIndex(String indexName) { - return new TestsDataset(indexName, mappingFileName, dataFileName, settingFileName, allowSubFields, typeMapping); + return new TestsDataset( + indexName, + mappingFileName, + dataFileName, + settingFileName, + allowSubFields, + typeMapping, + requiresInferenceEndpoint + ); } public TestsDataset withData(String dataFileName) { - return new TestsDataset(indexName, mappingFileName, dataFileName, settingFileName, allowSubFields, typeMapping); + return new TestsDataset( + indexName, + mappingFileName, + dataFileName, + settingFileName, + allowSubFields, + typeMapping, + requiresInferenceEndpoint + ); } public TestsDataset withSetting(String settingFileName) { - return new TestsDataset(indexName, mappingFileName, dataFileName, settingFileName, allowSubFields, typeMapping); + return new TestsDataset( + indexName, + mappingFileName, + dataFileName, + settingFileName, + allowSubFields, + typeMapping, + requiresInferenceEndpoint + ); } public TestsDataset noSubfields() { - return new TestsDataset(indexName, mappingFileName, dataFileName, settingFileName, false, typeMapping); + return new TestsDataset( + indexName, + mappingFileName, + dataFileName, + settingFileName, + false, + typeMapping, + requiresInferenceEndpoint + ); } public TestsDataset withTypeMapping(Map typeMapping) { - return new TestsDataset(indexName, mappingFileName, dataFileName, settingFileName, allowSubFields, typeMapping); + return new TestsDataset( + indexName, + mappingFileName, + dataFileName, + settingFileName, + allowSubFields, + typeMapping, + 
requiresInferenceEndpoint + ); + } + + public TestsDataset withInferenceEndpoint(boolean needsInference) { + return new TestsDataset(indexName, mappingFileName, dataFileName, settingFileName, allowSubFields, typeMapping, needsInference); } } diff --git a/x-pack/plugin/esql/qa/testFixtures/src/main/java/org/elasticsearch/xpack/esql/EsqlTestUtils.java b/x-pack/plugin/esql/qa/testFixtures/src/main/java/org/elasticsearch/xpack/esql/EsqlTestUtils.java index f5bcb37c63e84..d71c66b4c467f 100644 --- a/x-pack/plugin/esql/qa/testFixtures/src/main/java/org/elasticsearch/xpack/esql/EsqlTestUtils.java +++ b/x-pack/plugin/esql/qa/testFixtures/src/main/java/org/elasticsearch/xpack/esql/EsqlTestUtils.java @@ -648,7 +648,7 @@ public static Literal randomLiteral(DataType type) { case KEYWORD -> new BytesRef(randomAlphaOfLength(5)); case IP -> new BytesRef(InetAddressPoint.encode(randomIp(randomBoolean()))); case TIME_DURATION -> Duration.ofMillis(randomLongBetween(-604800000L, 604800000L)); // plus/minus 7 days - case TEXT -> new BytesRef(randomAlphaOfLength(50)); + case TEXT, SEMANTIC_TEXT -> new BytesRef(randomAlphaOfLength(50)); case VERSION -> randomVersion().toBytesRef(); case GEO_POINT -> GEO.asWkb(GeometryTestUtils.randomPoint()); case CARTESIAN_POINT -> CARTESIAN.asWkb(ShapeTestUtils.randomPoint()); diff --git a/x-pack/plugin/esql/qa/testFixtures/src/main/resources/mapping-semantic_text.json b/x-pack/plugin/esql/qa/testFixtures/src/main/resources/mapping-semantic_text.json new file mode 100644 index 0000000000000..b110d6fd4cdd5 --- /dev/null +++ b/x-pack/plugin/esql/qa/testFixtures/src/main/resources/mapping-semantic_text.json @@ -0,0 +1,73 @@ +{ + "properties": { + "semantic_text_field": { + "type": "semantic_text", + "inference_id": "test_sparse_inference" + }, + "st_bool": { + "type": "semantic_text", + "inference_id": "test_sparse_inference" + }, + "st_cartesian_point": { + "type": "semantic_text", + "inference_id": "test_sparse_inference" + }, + "st_cartesian_shape": { 
+ "type": "semantic_text", + "inference_id": "test_sparse_inference" + }, + "st_datetime": { + "type": "semantic_text", + "inference_id": "test_sparse_inference" + }, + "st_double": { + "type": "semantic_text", + "inference_id": "test_sparse_inference" + }, + "st_geopoint": { + "type": "semantic_text", + "inference_id": "test_sparse_inference" + }, + "st_geoshape": { + "type": "semantic_text", + "inference_id": "test_sparse_inference" + }, + "st_integer": { + "type": "semantic_text", + "inference_id": "test_sparse_inference" + }, + "st_ip": { + "type": "semantic_text", + "inference_id": "test_sparse_inference" + }, + "st_long": { + "type": "semantic_text", + "inference_id": "test_sparse_inference" + }, + "st_unsigned_long": { + "type": "semantic_text", + "inference_id": "test_sparse_inference" + }, + "st_version": { + "type": "semantic_text", + "inference_id": "test_sparse_inference" + }, + "st_multi_value": { + "type": "semantic_text", + "inference_id": "test_sparse_inference" + }, + "st_unicode": { + "type": "semantic_text", + "inference_id": "test_sparse_inference" + }, + "host" : { + "type" : "keyword" + }, + "description" : { + "type" : "text" + }, + "value": { + "type": "long" + } + } +} diff --git a/x-pack/plugin/esql/qa/testFixtures/src/main/resources/semantic_text.csv b/x-pack/plugin/esql/qa/testFixtures/src/main/resources/semantic_text.csv new file mode 100644 index 0000000000000..c6de9a208e9a7 --- /dev/null +++ b/x-pack/plugin/esql/qa/testFixtures/src/main/resources/semantic_text.csv @@ -0,0 +1,4 @@ +_id:keyword,semantic_text_field:semantic_text,st_bool:semantic_text,st_cartesian_point:semantic_text,st_cartesian_shape:semantic_text,st_datetime:semantic_text,st_double:semantic_text,st_geopoint:semantic_text,st_geoshape:semantic_text,st_integer:semantic_text,st_ip:semantic_text,st_long:semantic_text,st_unsigned_long:semantic_text,st_version:semantic_text,st_multi_value:semantic_text,st_unicode:semantic_text,host:keyword,description:text,value:long +1,live 
long and prosper,false,"POINT(4297.11 -1475.53)",,1953-09-02T00:00:00.000Z,5.20128E11,"POINT(42.97109630194 14.7552534413725)","POLYGON ((30 10\, 40 40\, 20 40\, 10 20\, 30 10))",23,1.1.1.1,2147483648,2147483648,1.2.3,["Hello there!", "This is a random value", "for testing purposes"],你吃饭了吗,"host1","some description1",1001 +2,all we have to decide is what to do with the time that is given to us,true,"POINT(7580.93 2272.77)",,2023-09-24T15:57:00.000Z,4541.11,"POINT(37.97109630194 21.7552534413725)","POLYGON ((30 10\, 40 40\, 20 40\, 10 20\, 30 10))",122,1.1.2.1,123,2147483648.2,9.0.0,["nice to meet you", "bye bye!"],["谢谢", "对不起我的中文不好"],"host2","some description2",1002 +3,be excellent to each other,,,,,,,,,,,,,,,"host3","some description3",1003 diff --git a/x-pack/plugin/esql/qa/testFixtures/src/main/resources/semantic_text.csv-spec b/x-pack/plugin/esql/qa/testFixtures/src/main/resources/semantic_text.csv-spec new file mode 100644 index 0000000000000..683bcdc3f7490 --- /dev/null +++ b/x-pack/plugin/esql/qa/testFixtures/src/main/resources/semantic_text.csv-spec @@ -0,0 +1,175 @@ +simple +required_capability: semantic_text_type + +FROM semantic_text +| KEEP semantic_text_field +| sort semantic_text_field asc; + +semantic_text_field:semantic_text +all we have to decide is what to do with the time that is given to us +be excellent to each other +live long and prosper +; + +simpleWithUnicode +required_capability: semantic_text_type + +FROM semantic_text +| KEEP st_unicode +| SORT st_unicode +; + +st_unicode:semantic_text +你吃饭了吗 +["谢谢", "对不起我的中文不好"] +null +; + +mvExpand +required_capability: semantic_text_type + +FROM semantic_text METADATA _id +| KEEP _id, st_multi_value +| MV_EXPAND st_multi_value +| SORT st_multi_value +; + +_id:keyword | st_multi_value:semantic_text +1 | Hello there! +1 | This is a random value +2 | bye bye! 
+1 | for testing purposes +2 | nice to meet you +3 | null +; + +withDropAndKeep +required_capability: semantic_text_type + +FROM semantic_text METADATA _id +| KEEP _id, semantic_text_field, st_double +| DROP st_double +| SORT _id +; + +_id:keyword | semantic_text_field:semantic_text +1 | live long and prosper +2 | all we have to decide is what to do with the time that is given to us +3 | be excellent to each other +; + +rename +required_capability: semantic_text_type + +FROM semantic_text METADATA _id +| RENAME semantic_text_field AS my_field +| KEEP _id, my_field +| SORT _id +; + +_id:keyword | my_field:semantic_text +1 | live long and prosper +2 | all we have to decide is what to do with the time that is given to us +3 | be excellent to each other +; + +eval +required_capability: semantic_text_type + +FROM semantic_text METADATA _id +| EVAL my_field = semantic_text_field +| KEEP _id, my_field +| SORT _id +; + +_id:keyword | my_field:semantic_text +1 | live long and prosper +2 | all we have to decide is what to do with the time that is given to us +3 | be excellent to each other +; + +simpleStats +required_capability: semantic_text_type + +FROM semantic_text METADATA _id +| STATS COUNT(*) +; + +COUNT(*):long +3 +; + +statsWithGrouping +required_capability: semantic_text_type + +FROM semantic_text METADATA _id +| STATS COUNT(*) BY st_version +| SORT st_version +; + +COUNT(*):long | st_version:semantic_text +1 | 1.2.3 +1 | 9.0.0 +1 | null +; + +withDropKeepStatsMvExpandRenameSortLimit +required_capability: semantic_text_type + +FROM semantic_text METADATA _id +| KEEP _id, semantic_text_field, st_multi_value +| DROP semantic_text_field +| RENAME st_multi_value AS my_field +| MV_EXPAND my_field +| STATS COUNT(*) BY my_field +| SORT my_field +| LIMIT 3 +; + +COUNT(*):long | my_field:semantic_text +1 | Hello there! +1 | This is a random value +1 | bye bye! 
+; + +simpleWithLongValue +required_capability: semantic_text_type + +FROM semantic_text +| KEEP value, semantic_text_field +| SORT value +; + +value:long | semantic_text_field:semantic_text +1001 | live long and prosper +1002 | all we have to decide is what to do with the time that is given to us +1003 | be excellent to each other +; + +simpleWithText +required_capability: semantic_text_type + +FROM semantic_text METADATA _id +| KEEP description, semantic_text_field +| SORT description +; + +description:text | semantic_text_field:semantic_text +"some description1" | live long and prosper +"some description2" | all we have to decide is what to do with the time that is given to us +"some description3" | be excellent to each other +; + +simpleWithKeyword +required_capability: semantic_text_type + +FROM semantic_text METADATA _id +| KEEP host, semantic_text_field +| SORT host +; + +host:keyword | semantic_text_field:semantic_text +"host1" | live long and prosper +"host2" | all we have to decide is what to do with the time that is given to us +"host3" | be excellent to each other +; diff --git a/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/action/EsqlCapabilities.java b/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/action/EsqlCapabilities.java index adfba4c487618..3c39406198da3 100644 --- a/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/action/EsqlCapabilities.java +++ b/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/action/EsqlCapabilities.java @@ -395,7 +395,11 @@ public enum Cap { /** * Adding stats for functions (stack telemetry) */ - FUNCTION_STATS; + FUNCTION_STATS, + /** + * Support for semantic_text field mapping + */ + SEMANTIC_TEXT_TYPE(EsqlCorePlugin.SEMANTIC_TEXT_FEATURE_FLAG); private final boolean snapshotOnly; private final FeatureFlag featureFlag; diff --git a/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/action/PositionToXContent.java 
b/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/action/PositionToXContent.java index 0b1bafdab1a99..0def56c70dc35 100644 --- a/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/action/PositionToXContent.java +++ b/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/action/PositionToXContent.java @@ -89,7 +89,7 @@ protected XContentBuilder valueToXContent(XContentBuilder builder, ToXContent.Pa return builder.value(unsignedLongAsNumber(l)); } }; - case KEYWORD, TEXT -> new PositionToXContent(block) { + case KEYWORD, SEMANTIC_TEXT, TEXT -> new PositionToXContent(block) { @Override protected XContentBuilder valueToXContent(XContentBuilder builder, ToXContent.Params params, int valueIndex) throws IOException { diff --git a/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/action/ResponseValueUtils.java b/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/action/ResponseValueUtils.java index 3b18bda120e2e..49fcc167dce0f 100644 --- a/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/action/ResponseValueUtils.java +++ b/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/action/ResponseValueUtils.java @@ -114,7 +114,7 @@ private static Object valueAt(DataType dataType, Block block, int offset, BytesR case LONG, COUNTER_LONG -> ((LongBlock) block).getLong(offset); case INTEGER, COUNTER_INTEGER -> ((IntBlock) block).getInt(offset); case DOUBLE, COUNTER_DOUBLE -> ((DoubleBlock) block).getDouble(offset); - case KEYWORD, TEXT -> ((BytesRefBlock) block).getBytesRef(offset, scratch).utf8ToString(); + case KEYWORD, SEMANTIC_TEXT, TEXT -> ((BytesRefBlock) block).getBytesRef(offset, scratch).utf8ToString(); case IP -> { BytesRef val = ((BytesRefBlock) block).getBytesRef(offset, scratch); yield ipToString(val); diff --git a/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/planner/LocalExecutionPlanner.java 
b/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/planner/LocalExecutionPlanner.java index b28c80211c649..dc732258d9fa5 100644 --- a/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/planner/LocalExecutionPlanner.java +++ b/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/planner/LocalExecutionPlanner.java @@ -349,7 +349,7 @@ private PhysicalOperation planTopN(TopNExec topNExec, LocalExecutionPlannerConte elementTypes[channel] = PlannerUtils.toElementType(inverse.get(channel).type()); encoders[channel] = switch (inverse.get(channel).type()) { case IP -> TopNEncoder.IP; - case TEXT, KEYWORD -> TopNEncoder.UTF8; + case TEXT, KEYWORD, SEMANTIC_TEXT -> TopNEncoder.UTF8; case VERSION -> TopNEncoder.VERSION; case BOOLEAN, NULL, BYTE, SHORT, INTEGER, LONG, DOUBLE, FLOAT, HALF_FLOAT, DATETIME, DATE_NANOS, DATE_PERIOD, TIME_DURATION, OBJECT, SCALED_FLOAT, UNSIGNED_LONG, DOC_DATA_TYPE, TSID_DATA_TYPE -> TopNEncoder.DEFAULT_SORTABLE; diff --git a/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/planner/PlannerUtils.java b/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/planner/PlannerUtils.java index 7beed64dda8cb..7868984d6b6e2 100644 --- a/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/planner/PlannerUtils.java +++ b/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/planner/PlannerUtils.java @@ -247,7 +247,7 @@ public static ElementType toElementType(DataType dataType, MappedFieldType.Field case INTEGER, COUNTER_INTEGER -> ElementType.INT; case DOUBLE, COUNTER_DOUBLE -> ElementType.DOUBLE; // unsupported fields are passed through as a BytesRef - case KEYWORD, TEXT, IP, SOURCE, VERSION, UNSUPPORTED -> ElementType.BYTES_REF; + case KEYWORD, TEXT, IP, SOURCE, VERSION, SEMANTIC_TEXT, UNSUPPORTED -> ElementType.BYTES_REF; case NULL -> ElementType.NULL; case BOOLEAN -> ElementType.BOOLEAN; case DOC_DATA_TYPE -> ElementType.DOC; diff --git 
a/x-pack/plugin/esql/src/test/java/org/elasticsearch/xpack/esql/action/EsqlQueryResponseTests.java b/x-pack/plugin/esql/src/test/java/org/elasticsearch/xpack/esql/action/EsqlQueryResponseTests.java index b147cfde21721..27343bf7ce205 100644 --- a/x-pack/plugin/esql/src/test/java/org/elasticsearch/xpack/esql/action/EsqlQueryResponseTests.java +++ b/x-pack/plugin/esql/src/test/java/org/elasticsearch/xpack/esql/action/EsqlQueryResponseTests.java @@ -193,7 +193,7 @@ private Page randomPage(List columns) { case INTEGER, COUNTER_INTEGER -> ((IntBlock.Builder) builder).appendInt(randomInt()); case DOUBLE, COUNTER_DOUBLE -> ((DoubleBlock.Builder) builder).appendDouble(randomDouble()); case KEYWORD -> ((BytesRefBlock.Builder) builder).appendBytesRef(new BytesRef(randomAlphaOfLength(10))); - case TEXT -> ((BytesRefBlock.Builder) builder).appendBytesRef(new BytesRef(randomAlphaOfLength(10000))); + case TEXT, SEMANTIC_TEXT -> ((BytesRefBlock.Builder) builder).appendBytesRef(new BytesRef(randomAlphaOfLength(10000))); case IP -> ((BytesRefBlock.Builder) builder).appendBytesRef( new BytesRef(InetAddressPoint.encode(randomIp(randomBoolean()))) ); @@ -866,7 +866,7 @@ static Page valuesToPage(BlockFactory blockFactory, List columns case LONG, COUNTER_LONG -> ((LongBlock.Builder) builder).appendLong(((Number) value).longValue()); case INTEGER, COUNTER_INTEGER -> ((IntBlock.Builder) builder).appendInt(((Number) value).intValue()); case DOUBLE, COUNTER_DOUBLE -> ((DoubleBlock.Builder) builder).appendDouble(((Number) value).doubleValue()); - case KEYWORD, TEXT -> ((BytesRefBlock.Builder) builder).appendBytesRef(new BytesRef(value.toString())); + case KEYWORD, TEXT, SEMANTIC_TEXT -> ((BytesRefBlock.Builder) builder).appendBytesRef(new BytesRef(value.toString())); case UNSUPPORTED -> ((BytesRefBlock.Builder) builder).appendNull(); case IP -> ((BytesRefBlock.Builder) builder).appendBytesRef(stringToIP(value.toString())); case DATETIME -> { diff --git 
a/x-pack/plugin/esql/src/test/java/org/elasticsearch/xpack/esql/expression/function/scalar/conditional/CaseTests.java b/x-pack/plugin/esql/src/test/java/org/elasticsearch/xpack/esql/expression/function/scalar/conditional/CaseTests.java index 9d0d9c3da30a8..db3fce244c9a8 100644 --- a/x-pack/plugin/esql/src/test/java/org/elasticsearch/xpack/esql/expression/function/scalar/conditional/CaseTests.java +++ b/x-pack/plugin/esql/src/test/java/org/elasticsearch/xpack/esql/expression/function/scalar/conditional/CaseTests.java @@ -59,7 +59,7 @@ public class CaseTests extends AbstractScalarFunctionTestCase { DataType.NULL ).collect(Collectors.toList()); if (Build.current().isSnapshot()) { - t.addAll(DataType.UNDER_CONSTRUCTION.keySet()); + t.add(DataType.DATE_NANOS); } TYPES = unmodifiableList(t); } diff --git a/x-pack/plugin/inference/src/main/java/org/elasticsearch/xpack/inference/mapper/SemanticTextFieldMapper.java b/x-pack/plugin/inference/src/main/java/org/elasticsearch/xpack/inference/mapper/SemanticTextFieldMapper.java index ce0b3a099d472..fb18cfb4959c7 100644 --- a/x-pack/plugin/inference/src/main/java/org/elasticsearch/xpack/inference/mapper/SemanticTextFieldMapper.java +++ b/x-pack/plugin/inference/src/main/java/org/elasticsearch/xpack/inference/mapper/SemanticTextFieldMapper.java @@ -23,6 +23,8 @@ import org.elasticsearch.index.IndexVersion; import org.elasticsearch.index.fielddata.FieldDataContext; import org.elasticsearch.index.fielddata.IndexFieldData; +import org.elasticsearch.index.mapper.BlockLoader; +import org.elasticsearch.index.mapper.BlockSourceReader; import org.elasticsearch.index.mapper.DocumentParserContext; import org.elasticsearch.index.mapper.DocumentParsingException; import org.elasticsearch.index.mapper.FieldMapper; @@ -611,6 +613,13 @@ private String generateInvalidQueryInferenceResultsMessage(StringBuilder baseMes return baseMessageBuilder.toString(); } + + @Override + public BlockLoader blockLoader(MappedFieldType.BlockLoaderContext blContext) 
{ + SourceValueFetcher fetcher = SourceValueFetcher.toString(blContext.sourcePaths(name().concat(".text"))); + var sourceMode = blContext.indexSettings().getIndexMappingSourceMode(); + return new BlockSourceReader.BytesRefsBlockLoader(fetcher, BlockSourceReader.lookupMatchingAll(), sourceMode); + } } /** diff --git a/x-pack/plugin/src/yamlRestTest/resources/rest-api-spec/test/esql/40_unsupported_types.yml b/x-pack/plugin/src/yamlRestTest/resources/rest-api-spec/test/esql/40_unsupported_types.yml index e100f30717aef..049895bc9f31a 100644 --- a/x-pack/plugin/src/yamlRestTest/resources/rest-api-spec/test/esql/40_unsupported_types.yml +++ b/x-pack/plugin/src/yamlRestTest/resources/rest-api-spec/test/esql/40_unsupported_types.yml @@ -504,3 +504,52 @@ double nested declared in mapping: # The `nested` field is not visible, nor are any of it's subfields. - match: { columns: [{name: name, type: keyword}] } + +--- +semantic_text declared in mapping: + - requires: + test_runner_features: [ capabilities ] + capabilities: + - method: POST + path: /_query + parameters: [ ] + capabilities: [ semantic_text_type ] + reason: "support for semantic_text type" + - do: + indices.create: + index: test_semantic_text + body: + settings: + number_of_shards: 5 + mappings: + properties: + semantic_text_field: + type: semantic_text + inference_id: my_inference_id + - do: + bulk: + index: test_semantic_text + refresh: true + body: + - { "index": { } } + - { + "semantic_text_field": { + "text": "be excellent to each other", + "inference": { + "inference_id": "my_inference_id", + "model_settings": { + "task_type": "sparse_embedding" + }, + "chunks": [{ "text": "be excellent to each other", "embeddings": { "a": 1,"b": 2 } }] + } + } + } + - do: + allowed_warnings_regex: + - "No limit defined, adding default limit of \\[.*\\]" + esql.query: + body: + query: 'FROM test_semantic_text' + - match: { columns: [{name: semantic_text_field, type: semantic_text}] } + - length: { values: 1 } + - match: { 
values.0: ["be excellent to each other"] }