ES|QL categorize with multiple groupings #118173

Open · wants to merge 18 commits into main
5 changes: 5 additions & 0 deletions docs/changelog/118173.yaml
@@ -0,0 +1,5 @@
+pr: 118173
+summary: ES|QL categorize with multiple groupings
+area: Machine Learning
+type: feature
+issues: []
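For context, the query shape this PR enables is CATEGORIZE combined with further grouping expressions. A minimal hypothetical example (the index and field names are made up, not taken from the PR):

    FROM logs
    | STATS COUNT(*) BY CATEGORIZE(message), host.name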
BlockHash.java
@@ -180,13 +180,16 @@ public static BlockHash buildCategorizeBlockHash(
         List<GroupSpec> groups,
         AggregatorMode aggregatorMode,
         BlockFactory blockFactory,
-        AnalysisRegistry analysisRegistry
+        AnalysisRegistry analysisRegistry,
+        int emitBatchSize
     ) {
-        if (groups.size() != 1) {
-            throw new IllegalArgumentException("only a single CATEGORIZE group can used");
+        if (groups.size() == 1) {
+            return new CategorizeBlockHash(blockFactory, groups.get(0).channel, aggregatorMode, analysisRegistry);
+        } else {
+            assert groups.get(0).isCategorize();
+            assert groups.subList(1, groups.size()).stream().noneMatch(GroupSpec::isCategorize);
+            return new CategorizePackedValuesBlockHash(groups, blockFactory, aggregatorMode, analysisRegistry, emitBatchSize);
         }
-
-        return new CategorizeBlockHash(blockFactory, groups.get(0).channel, aggregatorMode, analysisRegistry);
     }

Comment on lines +189 to +190 (the two asserts):

Contributor: nit/bikeshed: throwing IllegalArgumentException would be friendlier toward tests; when assertions trigger, they bring down a whole node, because an assertion failure is an Error, not an Exception. It's probably fine, though.

Contributor Author: This shouldn't happen, right? If this assertion fails, other code is broken (the verifier). I'll leave it as is unless you object. BTW, do you know if we run assertions in production?

Contributor: Assertions are disabled in prod, and this indeed shouldn't happen. Occasionally a bug slips through, though, and when it triggers an assertion, it kills the whole IT suite run because it kills a node. It's fine to leave as-is, though!
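For illustration, a minimal sketch of the IllegalArgumentException alternative the reviewer suggests (the message text is invented); it would fail only the request instead of the node:

    if (groups.get(0).isCategorize() == false
        || groups.subList(1, groups.size()).stream().anyMatch(GroupSpec::isCategorize)) {
        throw new IllegalArgumentException("CATEGORIZE must be the first and only CATEGORIZE grouping");
    }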

CategorizeBlockHash.java
@@ -44,7 +44,7 @@
 import java.util.Objects;
 
 /**
- * Base BlockHash implementation for {@code Categorize} grouping function.
+ * BlockHash implementation for {@code Categorize} grouping function.
  */
 public class CategorizeBlockHash extends BlockHash {

@@ -95,12 +95,14 @@ public class CategorizeBlockHash extends BlockHash {
         }
     }
 
+    boolean seenNull() {
+        return seenNull;
+    }
+
     @Override
     public void add(Page page, GroupingAggregatorFunction.AddInput addInput) {
-        if (aggregatorMode.isInputPartial() == false) {
-            addInitial(page, addInput);
-        } else {
-            addIntermediate(page, addInput);
+        try (IntBlock block = add(page)) {
+            addInput.add(0, block);
         }
     }

@@ -129,50 +131,39 @@ public void close() {
         Releasables.close(evaluator, categorizer);
     }
 
+    private IntBlock add(Page page) {
+        return aggregatorMode.isInputPartial() == false ? addInitial(page) : addIntermediate(page);
+    }
+
     /**
      * Adds initial (raw) input to the state.
      */
-    private void addInitial(Page page, GroupingAggregatorFunction.AddInput addInput) {
-        try (IntBlock result = (IntBlock) evaluator.eval(page.getBlock(channel))) {
-            addInput.add(0, result);
-        }
+    IntBlock addInitial(Page page) {
+        return (IntBlock) evaluator.eval(page.getBlock(channel));
     }
 
     /**
      * Adds intermediate state to the state.
      */
-    private void addIntermediate(Page page, GroupingAggregatorFunction.AddInput addInput) {
+    private IntBlock addIntermediate(Page page) {
         if (page.getPositionCount() == 0) {
-            return;
+            return null;
         }
         BytesRefBlock categorizerState = page.getBlock(channel);
         if (categorizerState.areAllValuesNull()) {
             seenNull = true;
-            try (var newIds = blockFactory.newConstantIntVector(NULL_ORD, 1)) {
-                addInput.add(0, newIds);
-            }
-            return;
-        }
-
-        Map<Integer, Integer> idMap = readIntermediate(categorizerState.getBytesRef(0, new BytesRef()));
-        try (IntBlock.Builder newIdsBuilder = blockFactory.newIntBlockBuilder(idMap.size())) {
-            int fromId = idMap.containsKey(0) ? 0 : 1;
-            int toId = fromId + idMap.size();
-            for (int i = fromId; i < toId; i++) {
-                newIdsBuilder.appendInt(idMap.get(i));
-            }
-            try (IntBlock newIds = newIdsBuilder.build()) {
-                addInput.add(0, newIds);
-            }
+            return blockFactory.newConstantIntBlockWith(NULL_ORD, 1);
         }
+        int[] ids = recategorize(categorizerState.getBytesRef(0, new BytesRef()), null);
+        return blockFactory.newIntArrayVector(ids, ids.length).asBlock();
     }
 
     /**
-     * Read intermediate state from a block.
-     *
-     * @return a map from the old category id to the new one. The old ids go from 0 to {@code size - 1}.
+     * Reads the intermediate state from a block and recategorizes the provided IDs.
+     * If no IDs are provided, the IDs are the IDs in the categorizer's state in order.
+     * (So 0...N-1 or 1...N, depending on whether null is present.)
      */
-    private Map<Integer, Integer> readIntermediate(BytesRef bytes) {
+    int[] recategorize(BytesRef bytes, int[] ids) {
         Map<Integer, Integer> idMap = new HashMap<>();
         try (StreamInput in = new BytesArray(bytes).streamInput()) {
             if (in.readBoolean()) {
@@ -185,10 +176,20 @@ private Map<Integer, Integer> readIntermediate(BytesRef bytes) {
                 // +1 because the 0 ordinal is reserved for null
                 idMap.put(oldCategoryId + 1, newCategoryId + 1);
             }
-            return idMap;
         } catch (IOException e) {
             throw new RuntimeException(e);
         }
+        if (ids == null) {
+            ids = new int[idMap.size()];
+            int idOffset = idMap.containsKey(0) ? 0 : 1;
+            for (int i = 0; i < idMap.size(); i++) {
+                ids[i] = i + idOffset;
+            }
+        }
+        for (int i = 0; i < ids.length; i++) {
+            ids[i] = idMap.get(ids[i]);
+        }
+        return ids;
     }

@@ -198,15 +199,19 @@ private Block buildIntermediateBlock() {
         if (categorizer.getCategoryCount() == 0) {
             return blockFactory.newConstantNullBlock(seenNull ? 1 : 0);
         }
+        int positionCount = categorizer.getCategoryCount() + (seenNull ? 1 : 0);
+        // We're returning a block with N positions just because the Page must have all blocks with the same position count!
+        return blockFactory.newConstantBytesRefBlockWith(serializeCategorizer(), positionCount);
+    }
+
+    BytesRef serializeCategorizer() {
         try (BytesStreamOutput out = new BytesStreamOutput()) {
             out.writeBoolean(seenNull);
             out.writeVInt(categorizer.getCategoryCount());
             for (SerializableTokenListCategory category : categorizer.toCategoriesById()) {
                 category.writeTo(out);
             }
-            // We're returning a block with N positions just because the Page must have all blocks with the same position count!
-            int positionCount = categorizer.getCategoryCount() + (seenNull ? 1 : 0);
-            return blockFactory.newConstantBytesRefBlockWith(out.bytes().toBytesRef(), positionCount);
+            return out.bytes().toBytesRef();
         } catch (IOException e) {
             throw new RuntimeException(e);
         }
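To make the remapping in recategorize() concrete, a small worked example with hypothetical category IDs (this is illustration, not code from the PR):

    import java.util.Map;

    // The sender's state holds two categories whose local IDs 1 and 2 map to
    // IDs 3 and 1 in the receiving categorizer (ordinal 0 stays reserved for
    // null, hence the +1 shift when idMap is built).
    Map<Integer, Integer> idMap = Map.of(1, 3, 2, 1);
    // No IDs were provided, so they default to the state's own IDs in order;
    // idMap has no key 0 (null was never seen), so they start at 1.
    int[] ids = { 1, 2 };
    for (int i = 0; i < ids.length; i++) {
        ids[i] = idMap.get(ids[i]); // ids becomes [3, 1]
    }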
CategorizePackedValuesBlockHash.java (new file)
@@ -0,0 +1,177 @@
/*
 * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
 * or more contributor license agreements. Licensed under the Elastic License
 * 2.0; you may not use this file except in compliance with the Elastic License
 * 2.0.
 */

package org.elasticsearch.compute.aggregation.blockhash;

import org.apache.lucene.util.BytesRef;
import org.elasticsearch.common.bytes.BytesArray;
import org.elasticsearch.common.io.stream.BytesStreamOutput;
import org.elasticsearch.common.io.stream.StreamInput;
import org.elasticsearch.common.unit.ByteSizeValue;
import org.elasticsearch.common.util.BigArrays;
import org.elasticsearch.common.util.BitArray;
import org.elasticsearch.compute.aggregation.AggregatorMode;
import org.elasticsearch.compute.aggregation.GroupingAggregatorFunction;
import org.elasticsearch.compute.data.Block;
import org.elasticsearch.compute.data.BlockFactory;
import org.elasticsearch.compute.data.BytesRefBlock;
import org.elasticsearch.compute.data.ElementType;
import org.elasticsearch.compute.data.IntBlock;
import org.elasticsearch.compute.data.IntVector;
import org.elasticsearch.compute.data.Page;
import org.elasticsearch.core.ReleasableIterator;
import org.elasticsearch.core.Releasables;
import org.elasticsearch.index.analysis.AnalysisRegistry;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

/**
 * BlockHash implementation for {@code Categorize} grouping function as the
 * first grouping expression, followed by one or more other grouping expressions.
 * <p>
 * For the first grouping (the {@code Categorize} grouping function), a
 * {@code CategorizeBlockHash} is used, which outputs integers (category IDs).
 * Next, a {@code PackedValuesBlockHash} is used on the category IDs and the
 * other groupings (which are not {@code Categorize}s).
 */
public class CategorizePackedValuesBlockHash extends BlockHash {

    private final List<GroupSpec> specs;
    private final AggregatorMode aggregatorMode;
    private final Block[] blocks;
    private final CategorizeBlockHash categorizeBlockHash;
    private final PackedValuesBlockHash packedValuesBlockHash;

    CategorizePackedValuesBlockHash(
        List<GroupSpec> specs,
        BlockFactory blockFactory,
        AggregatorMode aggregatorMode,
        AnalysisRegistry analysisRegistry,
        int emitBatchSize
    ) {
        super(blockFactory);
        this.specs = specs;
        this.aggregatorMode = aggregatorMode;
        blocks = new Block[specs.size()];

        List<GroupSpec> delegateSpecs = new ArrayList<>();
        delegateSpecs.add(new GroupSpec(0, ElementType.INT));
        for (int i = 1; i < specs.size(); i++) {
            delegateSpecs.add(new GroupSpec(i, specs.get(i).elementType()));
        }

        boolean success = false;
        try {
            categorizeBlockHash = new CategorizeBlockHash(blockFactory, specs.get(0).channel(), aggregatorMode, analysisRegistry);
            packedValuesBlockHash = new PackedValuesBlockHash(delegateSpecs, blockFactory, emitBatchSize);
            success = true;
        } finally {
            if (success == false) {
                close();
            }
        }
    }

    @Override
    public void add(Page page, GroupingAggregatorFunction.AddInput addInput) {
        try (IntBlock categories = getCategories(page)) {
            blocks[0] = categories;
            for (int i = 1; i < specs.size(); i++) {
                blocks[i] = page.getBlock(specs.get(i).channel());
            }
            packedValuesBlockHash.add(new Page(blocks), addInput);
        }
    }

    private IntBlock getCategories(Page page) {
        if (aggregatorMode.isInputPartial() == false) {
            return categorizeBlockHash.addInitial(page);
        } else {
            BytesRefBlock stateBlock = page.getBlock(0);
            BytesRef stateBytes = stateBlock.getBytesRef(0, new BytesRef());
            try (StreamInput in = new BytesArray(stateBytes).streamInput()) {
                BytesRef categorizerState = in.readBytesRef();
                int[] ids = in.readIntArray();
                ids = categorizeBlockHash.recategorize(categorizerState, ids);
Contributor (on the recategorize call above): ++ definitely easier to grasp, thanks!
                return blockFactory.newIntArrayVector(ids, ids.length).asBlock();
            } catch (IOException e) {
                throw new RuntimeException(e);
            }
        }
    }
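For orientation, a rough sketch (with made-up ID values, not PR code) of the intermediate key layout that getCategories() reads back here; getKeys() below is what writes it:

    try (BytesStreamOutput out = new BytesStreamOutput()) {
        // Serialized categorizer state first...
        out.writeBytesRef(categorizeBlockHash.serializeCategorizer());
        // ...then one category ID per key position (hypothetical values).
        out.writeIntArray(new int[] { 1, 2, 1, 1 });
        BytesRef state = out.bytes().toBytesRef();
    }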

    @Override
    public Block[] getKeys() {
        Block[] keys = packedValuesBlockHash.getKeys();
        if (aggregatorMode.isOutputPartial() == false) {
            // For final output, the keys are the category regexes.
            try (
                BytesRefBlock regexes = (BytesRefBlock) categorizeBlockHash.getKeys()[0];
                BytesRefBlock.Builder builder = blockFactory.newBytesRefBlockBuilder(keys[0].getPositionCount())
            ) {
                IntVector idsVector = (IntVector) keys[0].asVector();
                int idsOffset = categorizeBlockHash.seenNull() ? 0 : -1;
                BytesRef scratch = new BytesRef();
                for (int i = 0; i < idsVector.getPositionCount(); i++) {
                    int id = idsVector.getInt(i);
                    if (id == 0) {
                        builder.appendNull();
                    } else {
                        builder.appendBytesRef(regexes.getBytesRef(id + idsOffset, scratch));
Comment on the appendBytesRef call above:

Contributor: Not for now: we're potentially repeating a lot of BytesRef values here. I wonder whether there is, or it would make sense to have, a BytesRefBlock that stores every distinct value just once, plus a reference per position:

    AAAAAA
    BBBBBBB
    AAAAAA
    AAAAAA

    ->

    // 1: AAAAAA
    // 2: BBBBBBB
    1
    2
    1
    1

@nik9000 Something to consider for later? Maybe it's too specific for this. And anyway, the next EVAL or whatever will duplicate the values again.

Contributor Author: That sounds like a nice thing to have, but definitely out of scope for this PR. However, the next EVAL should not duplicate the values again. If you have:

    // 1: AAAAAA
    // 2: BBBBBBB
    1
    2
    1
    1

then an efficient EVAL x=SUBSTRING(x, 1, 2) should give

    // 1: AA
    // 2: BB
    1
    2
    1
    1

without ever duplicating.

Contributor: For that SUBSTRING not to duplicate, we would need to add that "hashtable" strategy in the BytesRefBlockBuilder. It looks good (?), but I wonder whether using it by default could perform negatively in some scenarios. Something to try eventually, probably.

Contributor Author: Sounds worth trying in the future. Are you making a note (issue) of this, so that the idea doesn't get lost?

Contributor: Sure! I'll raise it with Nik, just in case it was considered and discarded already, and then I'll document it in an issue somewhere.
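A rough sketch of the dictionary-encoded block idea from this thread (a hypothetical class, not an existing Block implementation in the codebase):

    import org.apache.lucene.util.BytesRef;

    // Hypothetical dictionary-encoded container: each distinct BytesRef is
    // stored once, and every position stores only an index into the dictionary.
    final class DictionaryBytesRefSketch {
        private final BytesRef[] dictionary; // e.g. { "AAAAAA", "BBBBBBB" }
        private final int[] positions;       // e.g. { 0, 1, 0, 0 }

        DictionaryBytesRefSketch(BytesRef[] dictionary, int[] positions) {
            this.dictionary = dictionary;
            this.positions = positions;
        }

        BytesRef get(int position) {
            return dictionary[positions[position]]; // no per-position byte copies
        }
    }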

                    }
                }
                keys[0].close();
                keys[0] = builder.build();
            }
        } else {
            // For intermediate output, the keys are the delegate PackedValuesBlockHash's
            // keys, with the category IDs replaced by the categorizer's internal state
            // together with the list of category IDs.
            BytesRef state;
            try (BytesStreamOutput out = new BytesStreamOutput()) {
                out.writeBytesRef(categorizeBlockHash.serializeCategorizer());
                // It's a bit inefficient to copy the IntVector's values into an int[]
                // and discard the array soon after. IntVector should maybe expose the
                // underlying array instead. TODO: investigate whether that's worth it
                IntVector idsVector = (IntVector) keys[0].asVector();
                int[] idsArray = new int[idsVector.getPositionCount()];
Comment on the idsArray allocation above:

Contributor: I'm a little afraid that we potentially allocate quite a bit of memory here without asking the breaker first. I believe this will lead to tricky-to-debug situations when memory pressure is already high and this causes an OOM. Not sure how likely, but still.

The blockFactory has convenience methods preAdjustBreakerForInt and adjustBreaker that we had better use here. That needs to be done carefully re. try/catching, so as not to leak circuit-breaker bytes.

@nik9000 wdyt? Should we play it safe here?

Contributor: Instead of manually handling the memory here, maybe we should just do an idsVector.writeTo(...), so we remove a chunk of code from here and avoid allocating anything else?

Member: It would be nice to track these. I'm not sure it has to be a blocker, though. Until a few months ago, aggs didn't track a few things similar to this. OTOH, it could cause problems...

Member: Ah, it looks like we just writeIntArray with this. In that case, yeah, I'd write the ids manually.

Contributor (alex-spies, Dec 11, 2024): OK, I checked, and we also don't really track memory in CategorizeBlockHash.getKeys; neither do we track the memory for the categorizer itself. Update: actually, we probably do, so that should be covered.

The problem here is that, due to combinatorial explosion, the untracked memory when writing the idsVector can be a lot larger than the actual categorizer state.

E.g. STATS ... BY CATEGORIZE(message), field1, field2.

If there are n categories of messages, m distinct field1 values, and o distinct field2 values, then the number of rows (and thus ids) will be n*m*o. And we're copying this twice: once into an int[] and another time when writing into out.

Contributor Author: I'll leave this up to you all to decide...
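To put hypothetical numbers on that: with n = 1,000 categories, m = 100 distinct field1 values, and o = 100 distinct field2 values, there are n*m*o = 10,000,000 ids; at 4 bytes each that is about 40 MB, and because the ids are copied twice (into idsArray and again when writing into out), roughly 80 MB can be live at once without the breaker ever being consulted.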

                for (int i = 0; i < idsVector.getPositionCount(); i++) {
                    idsArray[i] = idsVector.getInt(i);
                }
                out.writeIntArray(idsArray);
                state = out.bytes().toBytesRef();
            } catch (IOException e) {
                throw new RuntimeException(e);
            }
            keys[0].close();
            keys[0] = blockFactory.newConstantBytesRefBlockWith(state, keys[0].getPositionCount());
        }
        return keys;
    }

    @Override
    public IntVector nonEmpty() {
        return packedValuesBlockHash.nonEmpty();
    }

    @Override
    public BitArray seenGroupIds(BigArrays bigArrays) {
        return packedValuesBlockHash.seenGroupIds(bigArrays);
    }

    @Override
    public final ReleasableIterator<IntBlock> lookup(Page page, ByteSizeValue targetBlockSize) {
        throw new UnsupportedOperationException();
    }

    @Override
    public void close() {
        Releasables.close(categorizeBlockHash, packedValuesBlockHash);
    }
}
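Putting the pieces together, a conceptual sketch (not the PR's actual call sites; the channel number is hypothetical) of how a page flows through the two-stage hash on the initial pass:

    // Stage 1: CategorizeBlockHash turns the message column into category IDs.
    IntBlock categoryIds = categorizeBlockHash.addInitial(page);
    // Stage 2: PackedValuesBlockHash groups on (categoryId, otherField, ...) tuples.
    Block[] packed = new Block[] { categoryIds, page.getBlock(1) };
    packedValuesBlockHash.add(new Page(packed), addInput);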
HashAggregationOperator.java
@@ -51,7 +51,13 @@ public Operator get(DriverContext driverContext) {
         if (groups.stream().anyMatch(BlockHash.GroupSpec::isCategorize)) {
             return new HashAggregationOperator(
                 aggregators,
-                () -> BlockHash.buildCategorizeBlockHash(groups, aggregatorMode, driverContext.blockFactory(), analysisRegistry),
+                () -> BlockHash.buildCategorizeBlockHash(
+                    groups,
+                    aggregatorMode,
+                    driverContext.blockFactory(),
+                    analysisRegistry,
+                    maxPageSize
+                ),
                 driverContext
             );
         }