Skip to content

Commit

Permalink
Modify the launchers to support returning results.
Browse files Browse the repository at this point in the history
The launchers ceased really supporting dynamic kernel results
at some point. This PR adds that functionality back into the
mix. This support is added pervasively across the runtime
library calls.

Some notes:

  - Return values of static size continue to be supported as
    they were before. Specifically, these values are stored
    into the data buffer by the thunk adaptor so they can be
    returned, ultimately, to the original caller.

  - Return values of dynamic size follow exactly 1 of 2
    possible calling conventions. The convention must be
    selected by the runtime layers.

    1. Everything is running within a single process; i.e.,
       this is a simulation. In this case, the kernel will
       create a span of data and that span will be returned
       to the original caller which will use it to construct
       the std::vector result object.

    2. There are multiple processes and/or memory spaces
       involved. The result span will be appended to the
       original data packet and the new data packet will be
       returned as a new span by the runtime. The calling
       code will follow a similar process, but the data will
       be passed in the runtime in a pointer-free encoding.

Make the cast more robust to sneak it past -Werror.

Update another launchKernel override.

Add some doxygen goop to try and evade CI issues.

Fix the python builder errors.

Signed-off-by: Eric Schweitz <[email protected]>
  • Loading branch information
schweitzpgi committed Oct 17, 2024
1 parent 490930d commit fe11537
Show file tree
Hide file tree
Showing 25 changed files with 627 additions and 349 deletions.
4 changes: 4 additions & 0 deletions docs/sphinx/api/languages/cpp_api.rst
Original file line number Diff line number Diff line change
Expand Up @@ -190,6 +190,10 @@ Platform

.. doxygentypedef:: cudaq::KernelExecutionTask

.. doxygentypedef:: cudaq::KernelThunkResultType

.. doxygentypedef:: cudaq::KernelThunkType

Utilities
=========

Expand Down
14 changes: 11 additions & 3 deletions lib/Optimizer/Builder/Intrinsics.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -232,9 +232,14 @@ static constexpr IntrinsicCode intrinsicTable[] = {
})#"},

{"__nvqpp_createDynamicResult",
/* arguments:
arg0: original buffer ptr
arg1: original buffer size
arg2: ptr to span of the return data: {ptr, bytes}
arg3: offset to result slot in buffer */
{cudaq::llvmMemCopyIntrinsic, "malloc"},
R"#(
func.func private @__nvqpp_createDynamicResult(%arg0: !cc.ptr<i8>, %arg1: i64, %arg2: !cc.ptr<!cc.struct<{!cc.ptr<i8>, i64}>>) -> !cc.struct<{!cc.ptr<i8>, i64}> {
func.func private @__nvqpp_createDynamicResult(%arg0: !cc.ptr<i8>, %arg1: i64, %arg2: !cc.ptr<!cc.struct<{!cc.ptr<i8>, i64}>>, %arg3: i64) -> !cc.struct<{!cc.ptr<i8>, i64}> {
%0 = cc.compute_ptr %arg2[1] : (!cc.ptr<!cc.struct<{!cc.ptr<i8>, i64}>>) -> !cc.ptr<i64>
%1 = cc.load %0 : !cc.ptr<i64>
%2 = arith.addi %arg1, %1 : i64
Expand All @@ -249,6 +254,9 @@ static constexpr IntrinsicCode intrinsicTable[] = {
%7 = cc.undef !cc.struct<{!cc.ptr<i8>, i64}>
%8 = cc.insert_value %3, %7[0] : (!cc.struct<{!cc.ptr<i8>, i64}>, !cc.ptr<i8>) -> !cc.struct<{!cc.ptr<i8>, i64}>
%9 = cc.insert_value %2, %8[1] : (!cc.struct<{!cc.ptr<i8>, i64}>, i64) -> !cc.struct<{!cc.ptr<i8>, i64}>
%11 = cc.compute_ptr %10[%arg3] : (!cc.ptr<!cc.array<i8 x ?>>, i64) -> !cc.ptr<i8>
%12 = cc.cast %11 : (!cc.ptr<i8>) -> !cc.ptr<!cc.ptr<i8>>
cc.store %6, %12 : !cc.ptr<!cc.ptr<i8>>
return %9 : !cc.struct<{!cc.ptr<i8>, i64}>
})#"},

Expand Down Expand Up @@ -319,7 +327,7 @@ static constexpr IntrinsicCode intrinsicTable[] = {
{cudaq::runtime::launchKernelFuncName,
{},
R"#(
func.func private @altLaunchKernel(!cc.ptr<i8>, !cc.ptr<i8>, !cc.ptr<i8>, i64, i64) -> ())#"},
func.func private @altLaunchKernel(!cc.ptr<i8>, !cc.ptr<i8>, !cc.ptr<i8>, i64, i64) -> !cc.struct<{!cc.ptr<i8>, i64}>)#"},

{cudaq::runtime::CudaqRegisterArgsCreator,
{},
Expand All @@ -346,7 +354,7 @@ static constexpr IntrinsicCode intrinsicTable[] = {
{cudaq::runtime::launchKernelHybridFuncName,
{},
R"#(
func.func private @hybridLaunchKernel(!cc.ptr<i8>, !cc.ptr<i8>, !cc.ptr<i8>, i64, i64, !cc.ptr<i8>) -> ())#"},
func.func private @hybridLaunchKernel(!cc.ptr<i8>, !cc.ptr<i8>, !cc.ptr<i8>, i64, i64, !cc.ptr<i8>) -> !cc.struct<{!cc.ptr<i8>, i64}>)#"},

{cudaq::llvmMemCopyIntrinsic, // llvm.memcpy.p0i8.p0i8.i64
{},
Expand Down
146 changes: 106 additions & 40 deletions lib/Optimizer/Transforms/GenKernelExecution.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,12 @@ static bool isCodegenArgumentGather(std::size_t kind) {
return kind == 0 || kind == 2;
}

/// This pass adds a `<kernel name>.thunk` function and a rewritten C++ host
/// side (mangled) stub to the code for every entry-point kernel in the module.
/// It may also generate a `<kernel name>.argsCreator` function. Finally, it
/// creates registration hooks for the CUDA-Q runtime to be able to find the
/// kernel by name and, as appropriate, the `<kernel name>.argsCreator`
/// function.
namespace {
class GenerateKernelExecution
: public cudaq::opt::impl::GenerateKernelExecutionBase<
Expand All @@ -57,6 +63,19 @@ class GenerateKernelExecution

/// Creates the function signature for a thunk function. The signature is
/// always the same for all thunk functions.
///
/// Every thunk function has an identical signature, making it callable from a
/// generic "kernel launcher" in the CUDA-Q runtime.
///
/// This signature is defined as: `(ptr, bool) -> {ptr, i64}`.
///
/// The first argument is a pointer to a data buffer that encodes all the
/// argument values to (and static return values from) the kernel in the
/// pointer-free encoding. The second argument indicates if this call is to a
/// remote process (if true). The result is a pointer and size (span) if the
/// kernel returns a dynamically sized result, otherwise it will be
/// `{nullptr, 0}`. It is the responsibility of calling code to free any
/// dynamic result buffer(s) and convert those to `std::vector` objects.
FunctionType getThunkType(MLIRContext *ctx) {
auto ptrTy = cudaq::cc::PointerType::get(IntegerType::get(ctx, 8));
return FunctionType::get(ctx, {ptrTy, IntegerType::get(ctx, 1)},
Expand Down Expand Up @@ -769,31 +788,32 @@ class GenerateKernelExecution
auto *thenBlock = builder.createBlock(reg);
auto *elseBlock = builder.createBlock(reg);
builder.setInsertionPointToEnd(currentBlock);
auto eleTy = structTy.getMember(offset);
auto memTy = cudaq::cc::PointerType::get(eleTy);
auto mem = builder.create<cudaq::cc::ComputePtrOp>(
loc, memTy, castOp, SmallVector<cudaq::cc::ComputePtrArg>{offset});
auto resPtrTy = cudaq::cc::PointerType::get(call.getResult(0).getType());
auto castMem = builder.create<cudaq::cc::CastOp>(loc, resPtrTy, mem);
builder.create<cudaq::cc::StoreOp>(loc, call.getResult(0), castMem);
builder.create<cf::CondBranchOp>(loc, isClientServer, thenBlock,
elseBlock);
builder.setInsertionPointToEnd(thenBlock);
auto gepRes = builder.create<cudaq::cc::ComputePtrOp>(
loc, cudaq::cc::PointerType::get(structTy.getMember(offset)), castOp,
ArrayRef<cudaq::cc::ComputePtrArg>{offset});
auto resAsVec = builder.create<cudaq::cc::CastOp>(
loc, cudaq::cc::PointerType::get(funcTy.getResult(0)), gepRes);
builder.create<cudaq::cc::StoreOp>(loc, call.getResult(0), resAsVec);
auto resAsArg = builder.create<cudaq::cc::CastOp>(
loc, cudaq::cc::PointerType::get(thunkTy.getResults()[0]), gepRes);
// createDynamicResult packs the input values and the dynamic results
// into a single buffer to pass back as a message.
loc, cudaq::cc::PointerType::get(thunkTy.getResults()[0]), mem);
auto retOffset = genComputeReturnOffset(loc, builder, funcTy, structTy);
// createDynamicResult allocates a new buffer and packs the input values
// and the dynamic results into this single new buffer to pass back as a
// message.
// NB: This code only handles one dimensional vectors of static types. It
// will have to be changed if there is a need to return recursively
// dynamic structures, i.e., vectors of vectors.
auto res = builder.create<func::CallOp>(
loc, thunkTy.getResults()[0], "__nvqpp_createDynamicResult",
ValueRange{thunkEntry->getArgument(0), structSize, resAsArg});
ValueRange{thunkEntry->getArgument(0), structSize, resAsArg,
retOffset});
builder.create<func::ReturnOp>(loc, res.getResult(0));
builder.setInsertionPointToEnd(elseBlock);
auto eleTy = structTy.getMember(offset);
auto memTy = cudaq::cc::PointerType::get(eleTy);
auto mem = builder.create<cudaq::cc::ComputePtrOp>(
loc, memTy, castOp, SmallVector<cudaq::cc::ComputePtrArg>{offset});
auto resPtrTy = cudaq::cc::PointerType::get(call.getResult(0).getType());
auto castMem = builder.create<cudaq::cc::CastOp>(loc, resPtrTy, mem);
builder.create<cudaq::cc::StoreOp>(loc, call.getResult(0), castMem);
// For the else case, the span was already copied to the block.
} else {
// FIXME: Should check for recursive vector case.
// If the kernel returns non-dynamic results (no spans), then take those
Expand Down Expand Up @@ -854,8 +874,6 @@ class GenerateKernelExecution
auto ptrPtrTy = cudaq::cc::PointerType::get(ptrTy);
auto sret0 = builder.create<cudaq::cc::ComputePtrOp>(
loc, ptrPtrTy, castSret, SmallVector<cudaq::cc::ComputePtrArg>{0});
Value vecPtr = builder.create<cudaq::cc::LoadOp>(loc, ptrTy, sret0);
builder.create<func::CallOp>(loc, std::nullopt, "free", ValueRange{vecPtr});
auto arrI8Ty = cudaq::cc::ArrayType::get(i8Ty);
auto ptrArrTy = cudaq::cc::PointerType::get(arrI8Ty);
auto buffPtr0 = builder.create<cudaq::cc::CastOp>(loc, ptrTy, data);
Expand Down Expand Up @@ -1338,21 +1356,72 @@ class GenerateKernelExecution
auto castLoadKernName =
builder.create<cudaq::cc::CastOp>(loc, ptrI8Ty, loadKernName);

auto hostFuncTy = hostFunc.getFunctionType();
assert((hostFuncTy.getResults().empty() ||
(hostFuncTy.getNumResults() == 1)) &&
"C++ function expected to have 0 or 1 return value");
const bool resultVal = !hostFuncTy.getResults().empty();
const bool kernelReturnsValue =
resultVal || cudaq::opt::factory::hasSRet(hostFunc);
Value launchResult;
Value launchResultToFree;
// Decode the span value returned by a launch call and locate the kernel's
// result slot.
//
// `spanReturned` is the value produced by the launch call; element 0 is
// extracted from it as an i8 pointer. On return (when the kernel returns a
// value):
//   - `launchResultToFree` holds that extracted pointer, and
//   - `launchResult` holds a pointer (of type pointer-to-`res0Ty`) to member
//     `offset` of either the returned buffer (pointer was non-null) or the
//     original `temp` buffer (pointer was null).
// The builder's insertion point is left at the end of the new merge block, so
// any code generated after this call is emitted there.
//
// If the kernel does not return a value this is a no-op: no IR is generated
// and neither `launchResult` nor `launchResultToFree` is assigned.
auto decodeLaunchResults = [&](Value spanReturned) {
if (!kernelReturnsValue)
return;
Type res0Ty = structTy.getMember(offset);
auto ptrResTy = cudaq::cc::PointerType::get(res0Ty);
// Pull the pointer field (element 0) out of the returned span.
auto rptr = builder.create<cudaq::cc::ExtractValueOp>(loc, ptrI8Ty,
spanReturned, 0);
launchResultToFree = rptr;
// Generate a runtime test: is the returned pointer non-null?
auto rIntPtr = builder.create<cudaq::cc::CastOp>(loc, i64Ty, rptr);
auto zero = builder.create<arith::ConstantIntOp>(loc, 0, 64);
auto cmp = builder.create<arith::CmpIOp>(loc, arith::CmpIPredicate::ne,
rIntPtr, zero);
// Build an if-then-else diamond. The merge block takes one block argument
// of type `ptrResTy` that carries the selected result pointer.
auto *currentBlock = builder.getBlock();
auto *reg = currentBlock->getParent();
auto *thenBlock = builder.createBlock(reg);
auto *elseBlock = builder.createBlock(reg);
auto *endifBlock = builder.createBlock(
reg, reg->end(), TypeRange{ptrResTy}, SmallVector<Location>(1, loc));
// createBlock moved the insertion point; go back to the original block to
// emit the conditional branch.
builder.setInsertionPointToEnd(currentBlock);
builder.create<cf::CondBranchOp>(loc, cmp, thenBlock, elseBlock);
builder.setInsertionPointToEnd(thenBlock);
// Non-null pointer: a dynamic result buffer was returned. View it as the
// argument struct and address the result member at `offset`.
// We need to free() this buffer before the end of this function.
auto rStructPtr =
builder.create<cudaq::cc::CastOp>(loc, structPtrTy, rptr);
Value lRes = builder.create<cudaq::cc::ComputePtrOp>(
loc, ptrResTy, rStructPtr,
ArrayRef<cudaq::cc::ComputePtrArg>{offset});
builder.create<cf::BranchOp>(loc, endifBlock, ArrayRef<Value>{lRes});
builder.setInsertionPointToEnd(elseBlock);
// Null pointer: the span was returned in the original buffer, so address the
// result member inside `temp` instead.
Value mRes = builder.create<cudaq::cc::ComputePtrOp>(
loc, ptrResTy, temp, ArrayRef<cudaq::cc::ComputePtrArg>{0, offset});
builder.create<cf::BranchOp>(loc, endifBlock, ArrayRef<Value>{mRes});
// Continue code generation in the merge block; its block argument is the
// result pointer chosen by whichever branch executed.
builder.setInsertionPointToEnd(endifBlock);
launchResult = endifBlock->getArgument(0);
};

// Generate the call to `launchKernel`.
switch (codegenKind) {
case 0: {
assert(vecArgPtrs && castLoadThunk);
builder.create<func::CallOp>(
loc, std::nullopt, cudaq::runtime::launchKernelHybridFuncName,
auto launch = builder.create<func::CallOp>(
loc, cudaq::opt::factory::getDynamicBufferType(ctx),
cudaq::runtime::launchKernelHybridFuncName,
ArrayRef<Value>{castLoadKernName, castLoadThunk, castTemp,
extendedStructSize, resultOffset, vecArgPtrs});
decodeLaunchResults(launch.getResult(0));
} break;
case 1: {
assert(!vecArgPtrs && castLoadThunk);
builder.create<func::CallOp>(
loc, std::nullopt, cudaq::runtime::launchKernelFuncName,
auto launch = builder.create<func::CallOp>(
loc, cudaq::opt::factory::getDynamicBufferType(ctx),
cudaq::runtime::launchKernelFuncName,
ArrayRef<Value>{castLoadKernName, castLoadThunk, castTemp,
extendedStructSize, resultOffset});
decodeLaunchResults(launch.getResult(0));
} break;
case 2: {
assert(vecArgPtrs && !castLoadThunk);
Expand All @@ -1377,17 +1446,13 @@ class GenerateKernelExecution
// result value(s) from the struct returned by `launchKernel` and return
// them to our caller.
SmallVector<Value> results;
auto hostFuncTy = hostFunc.getFunctionType();
assert((hostFuncTy.getResults().empty() ||
(hostFuncTy.getNumResults() == 1)) &&
"C++ function expected to have 0 or 1 return value");
const bool resultVal = !hostFuncTy.getResults().empty();
if (resultVal || cudaq::opt::factory::hasSRet(hostFunc)) {
if (kernelReturnsValue) {
Type res0Ty = structTy.getMember(offset);
auto ptrResTy = cudaq::cc::PointerType::get(res0Ty);
// Host function returns a value. Either returning by value or via an sret
// reference.
if (resultVal) {
Type res0Ty = structTy.getMember(offset);
auto ptrResTy = cudaq::cc::PointerType::get(res0Ty);
// Static values. std::vector are necessarily sret, see below.
auto resPtr = builder.create<cudaq::cc::ComputePtrOp>(
loc, ptrResTy, temp, ArrayRef<cudaq::cc::ComputePtrArg>{0, offset});
Type castToTy = cudaq::cc::PointerType::get(hostFuncTy.getResult(0));
Expand All @@ -1398,22 +1463,22 @@ class GenerateKernelExecution
}();
results.push_back(builder.create<cudaq::cc::LoadOp>(loc, castResPtr));
} else {
// Check if device is returning a span. If it is, then we will need to
// convert it to a std::vector here. The vector is constructed in-place
// on the sret memory block.
// This is an sret return. Check if device is returning a span. If it
// is, then we will need to convert it to a std::vector here. The vector
// is constructed in-place on the sret memory block.
Value arg0 = hostFuncEntryBlock->getArguments().front();
if (auto spanTy =
dyn_cast<cudaq::cc::SpanLikeType>(devFuncTy.getResult(0))) {
auto eleTy = spanTy.getElementType();
auto ptrTy = cudaq::cc::PointerType::get(eleTy);
auto gep0 = builder.create<cudaq::cc::ComputePtrOp>(
loc, cudaq::cc::PointerType::get(ptrTy), temp,
SmallVector<cudaq::cc::ComputePtrArg>{0, offset, 0});
loc, cudaq::cc::PointerType::get(ptrTy), launchResult,
SmallVector<cudaq::cc::ComputePtrArg>{0});
auto dataPtr = builder.create<cudaq::cc::LoadOp>(loc, gep0);
auto lenPtrTy = cudaq::cc::PointerType::get(i64Ty);
auto gep1 = builder.create<cudaq::cc::ComputePtrOp>(
loc, lenPtrTy, temp,
SmallVector<cudaq::cc::ComputePtrArg>{0, offset, 1});
loc, lenPtrTy, launchResult,
SmallVector<cudaq::cc::ComputePtrArg>{1});
auto vecLen = builder.create<cudaq::cc::LoadOp>(loc, gep1);
if (spanTy.getElementType() == builder.getI1Type()) {
genStdvecBoolFromInitList(loc, builder, arg0, dataPtr, vecLen);
Expand All @@ -1422,13 +1487,14 @@ class GenerateKernelExecution
builder.create<cudaq::cc::SizeOfOp>(loc, i64Ty, eleTy);
genStdvecTFromInitList(loc, builder, arg0, dataPtr, tSize, vecLen);
}
// free(nullptr) is defined to be a nop in the standard.
builder.create<func::CallOp>(loc, std::nullopt, "free",
ArrayRef<Value>{launchResultToFree});
} else {
// Otherwise, we can just copy the aggregate into the sret memory
// block. Uses the size of the host function's sret pointer element
// type for the memcpy, so the device should return an (aggregate)
// value of suitable size.
Type res0Ty = structTy.getMember(offset);
auto ptrResTy = cudaq::cc::PointerType::get(res0Ty);
auto resPtr = builder.create<cudaq::cc::ComputePtrOp>(
loc, ptrResTy, temp,
ArrayRef<cudaq::cc::ComputePtrArg>{0, offset});
Expand Down
Loading

0 comments on commit fe11537

Please sign in to comment.