Adds better Affine support for GPUs when using CUDA 11. Introduces a new bias addition kernel for CUDA < 11 #778

Merged: 23 commits, Apr 9, 2021
Changes from all commits
Commits (23)
66006ec
Added new inference operator that performs bias addition and optionall…
rhenry-nv Oct 9, 2020
23a3223
Fix compilation for CPU only. Missed DISPATCH10 call when CUDA was un…
rhenry-nv Oct 10, 2020
dbb1653
Places fused affine under one node
rhenry-nv Dec 9, 2020
3e94764
Adds basic affine support for cublaslt in cuda 11
rhenry-nv Dec 9, 2020
4b41b86
Works around some bugs in CUDA 11
rhenry-nv Dec 9, 2020
cc30a32
Fixes API and removes comment
rhenry-nv Dec 15, 2020
792669f
Removes SPDX identifier in operator tests
rhenry-nv Dec 15, 2020
0c0b8ac
Adds SPDX identifier in operator tests - will remove in a different PR
rhenry-nv Dec 15, 2020
6258fd5
Removes NVIDIA notices
rhenry-nv Dec 15, 2020
764d7b8
Fixes windows compile errors
rhenry-nv Dec 15, 2020
95fb427
Format changes
rhenry-nv Dec 15, 2020
4bbf17f
updates changelog
rhenry-nv Dec 15, 2020
256e397
Adds bigobj flag to windows builds
rhenry-nv Oct 20, 2020
493c213
Merge remote-tracking branch 'public/master' into cuda_11
rhenry-nv Mar 6, 2021
e5b549d
Merge branch 'master' of https://github.com/marian-nmt/marian-dev int…
rhenry-nv Mar 23, 2021
a484893
refactor
emjotde Mar 25, 2021
9de84d4
remove previous code
emjotde Mar 25, 2021
43c54ce
add unit tests
emjotde Mar 25, 2021
04b0f95
fix incorrect function signature for CUDA 10 and smaller
emjotde Mar 25, 2021
5c7b6ee
only use AffineWithReluNodeOp for float types
emjotde Mar 25, 2021
c027c6d
switch back to using affineWithRelu on gpu only for now
emjotde Mar 25, 2021
7ea2d09
Merge pull request #2 from marian-nmt/mjd/refactor_cuda11
rhenry-nv Apr 2, 2021
36d8f0c
Merge branch 'master' into cuda_11
emjotde Apr 9, 2021
2 changes: 2 additions & 0 deletions CHANGELOG.md
@@ -9,6 +9,8 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
## [Unreleased]

### Added
- Adds a custom bias epilogue kernel.
- Adds support for fusing ReLU and bias addition into GEMMs when using CUDA 11.
- Better suppression of unwanted output symbols, specifically "\n" from SentencePiece with byte-fallback. Can be deactivated with --allow-special
- Display decoder time statistics with marian-decoder --stat-freq 10 ...
- Support for MS-internal binary shortlist
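The two changelog entries above describe one feature with two code paths: on CUDA 11 the bias addition (and optionally the ReLU) is fused directly into the GEMM through cuBLASLt's matmul epilogue, while on older CUDA versions a custom epilogue kernel runs after the GEMM. As a rough sketch of the CUDA 11 path, using the standard cuBLASLt attribute names (Marian's actual integration lives in src/tensors/gpu/prod.cpp and prod.cu and may differ in detail):

```cpp
#include <cublasLt.h>

// Attach a fused bias(+ReLU) epilogue to an existing cuBLASLt matmul descriptor.
// `bias` points to device memory holding one value per output column.
void setBiasEpilogue(cublasLtMatmulDesc_t desc, const float* bias, bool doRelu) {
  cublasLtEpilogue_t epilogue = doRelu ? CUBLASLT_EPILOGUE_RELU_BIAS
                                       : CUBLASLT_EPILOGUE_BIAS;
  cublasLtMatmulDescSetAttribute(
      desc, CUBLASLT_MATMUL_DESC_EPILOGUE, &epilogue, sizeof(epilogue));
  cublasLtMatmulDescSetAttribute(
      desc, CUBLASLT_MATMUL_DESC_BIAS_POINTER, &bias, sizeof(bias));
}
```

The subsequent cublasLtMatmul call then produces the biased (and, if requested, rectified) output in a single fused kernel, which is the inference speedup this PR targets.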
14 changes: 13 additions & 1 deletion CMakeLists.txt
@@ -347,8 +347,20 @@ if(CUDA_FOUND)
endif()
message(STATUS "Found CUDA libraries: ${CUDA_LIBS}")
else(USE_STATIC_LIBS)
set(CUDA_LIBS ${CUDA_curand_LIBRARY} ${CUDA_cusparse_LIBRARY} ${CUDA_CUBLAS_LIBRARIES})
# We only need cublasLt here for CUDA 11 and later; Marian works fine without it before CUDA 11. We want to force CMake to use the cuBLAS
# version that ships with CUDA 11, so we restrict the search to the CUDA toolkit directory.
set(CUDA_LIBS ${CUDA_curand_LIBRARY} ${CUDA_cusparse_LIBRARY} ${CUDA_CUBLAS_LIBRARIES})
if ((CUDA_VERSION VERSION_EQUAL "11.0" OR CUDA_VERSION VERSION_GREATER "11.0"))
find_library(CUDA_cublasLt_LIBRARY NAMES cublasLt PATHS ${CUDA_TOOLKIT_ROOT_DIR}/lib64 ${CUDA_TOOLKIT_ROOT_DIR}/lib/x64 NO_DEFAULT_PATH)
if(NOT CUDA_cublasLt_LIBRARY)
message(FATAL_ERROR "cuBLASLt library not found")
endif()
set(EXT_LIBS ${EXT_LIBS} ${CUDA_cublasLt_LIBRARY})
set(CUDA_LIBS ${CUDA_LIBS} ${CUDA_cublasLt_LIBRARY})
endif()
set(EXT_LIBS ${EXT_LIBS} ${CUDA_curand_LIBRARY} ${CUDA_cusparse_LIBRARY} ${CUDA_CUBLAS_LIBRARIES})
message(STATUS "Found CUDA libraries: ${CUDA_curand_LIBRARY} ${CUDA_cusparse_LIBRARY} ${CUDA_CUBLAS_LIBRARIES}")
message(STATUS "Found CUDA libraries: ${CUDA_LIBS}")
endif(USE_STATIC_LIBS)

if(USE_CUDNN)
1 change: 1 addition & 0 deletions src/CMakeLists.txt
@@ -175,6 +175,7 @@ if(CUDA_FOUND)
tensors/gpu/device.cu
tensors/gpu/algorithm.cu
tensors/gpu/prod.cpp
tensors/gpu/prod.cu
tensors/gpu/prod_sparse.cpp
tensors/gpu/topk.cu
tensors/gpu/element.cu
12 changes: 11 additions & 1 deletion src/graph/expression_operators.cpp
@@ -1,4 +1,5 @@
#include "graph/expression_operators.h"
#include "common/definitions.h"
#include "layers/constructors.h"

#include "graph/node_operators.h"
@@ -518,7 +519,7 @@ Expr bdot(Expr a, Expr b, bool transA, bool transB, float scale) {
return Expression<DotBatchedNodeOp>(a, b, transA, transB, scale);
}

static Expr affineDefault(Expr a, Expr b, Expr bias, bool transA, bool transB, float scale) {
Expr affineDefault(Expr a, Expr b, Expr bias, bool transA, bool transB, float scale) {
// general version, MKL, CBlas or CUDA

int rows = a->shape().elements() / a->shape()[-1];
@@ -577,6 +578,15 @@ Expr affine(Expr a, Expr b, Expr bias, bool transA, bool transB, float scale) {
}
}

Expr affineWithRelu(Expr a, Expr b, Expr bias, bool transA, bool transB, float scale) {
auto graph = a->graph();

if(graph->isInference() && graph->getDeviceId().type == DeviceType::gpu)
return Expression<AffineWithReluNodeOp>(a, b, bias, transA, transB, scale);
else
return relu(affine(a, b, bias, transA, transB, scale));
}

// @TODO: Not a great place to check this
#if CUDA_VERSION < 11000
// multiply a CSR matrix A with a matrix B
12 changes: 11 additions & 1 deletion src/graph/expression_operators.h
@@ -488,11 +488,21 @@ Expr bdot(Expr a,
*/
Expr affine(Expr a,
Expr b,
Expr c,
Expr bias,
bool transA = false,
bool transB = false,
float scalar = 1.f);

/**
* As above, but efficiently applies a ReLU transformation to the output. For inference only.
*/
Expr affineWithRelu(Expr a,
Expr b,
Expr bias,
bool transA = false,
bool transB = false,
float scalar = 1.f);

/**
* Computes the dot product of CSR-tensor @p A with @p B.
*/
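A hedged usage sketch for the affineWithRelu declaration above: on a GPU inference graph it lowers to the fused AffineWithReluNodeOp, and in all other cases it falls back to relu(affine(...)), so the two expressions below compute the same values (x, W and b are assumed to be existing nodes in the expression graph):

```cpp
// Feed-forward hidden layer, fused where supported.
Expr hidden      = affineWithRelu(x, W, b);  // bias + ReLU handled inside/right after the GEMM
Expr hiddenNaive = relu(affine(x, W, b));    // reference path: same result, one extra elementwise pass
```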
124 changes: 106 additions & 18 deletions src/graph/node_operators_binary.h
@@ -266,17 +266,18 @@ class AffineNodeOp : public NaryNodeOp {

NodeOps forwardOps() override {
using namespace functional;

return {
NodeOp(
Prod(val_,
child(0)->val(),
child(1)->val(),
transA_,
transB_,
0.f,
scalar_);
Prod(val_, child(3)->val(), child(2)->val(), false, false, 1.f, 1.f))
NodeOp(Affine(val_,
graph()->allocator(),
child(0)->val(),
child(1)->val(),
child(2)->val(),
transA_,
transB_,
0.f,
scalar_,
/*doRelu=*/false))
};
}

@@ -323,8 +324,7 @@
false,
1.0,
scalar_, computeTypeB)),
NodeOp(Prod(
child(2)->grad(), child(3)->val(), adj_, true, false, 0.f, 1.f, computeTypeC))
NodeOp(Prod(child(2)->grad(), child(3)->val(), adj_, true, false, 0.f, 1.f, computeTypeC))
};

if(transA_ && !transB_)
@@ -343,8 +343,7 @@
false,
1.0,
scalar_, computeTypeB)),
NodeOp(Prod(
child(2)->grad(), child(3)->val(), adj_, true, false, 0.f, 1.f, computeTypeC))
NodeOp(Prod(child(2)->grad(), child(3)->val(), adj_, true, false, 0.f, 1.f, computeTypeC))
};

if(transA_ && transB_)
@@ -363,8 +362,7 @@
true,
1.0,
scalar_, computeTypeB)),
NodeOp(Prod(
child(2)->grad(), child(3)->val(), adj_, true, false, 0.f, 1.f, computeTypeC))
NodeOp(Prod(child(2)->grad(), child(3)->val(), adj_, true, false, 0.f, 1.f, computeTypeC))
};

return {
@@ -382,8 +380,7 @@
false,
1.0,
scalar_, computeTypeB)),
NodeOp(Prod(
child(2)->grad(), child(3)->val(), adj_, true, false, 0.f, 1.f, computeTypeC))
NodeOp(Prod(child(2)->grad(), child(3)->val(), adj_, true, false, 0.f, 1.f, computeTypeC))
};
}

@@ -414,6 +411,97 @@

};

class AffineWithReluNodeOp : public NaryNodeOp {
private:
friend class SerializationHelpers;
bool transA_;
bool transB_;
float scalar_;

public:
AffineWithReluNodeOp(Expr a,
Expr b,
Expr bias,
bool transA,
bool transB,
float scalar)
: NaryNodeOp({a, b, bias}, newShape(a, b, transA, transB)),
transA_(transA),
transB_(transB),
scalar_(scalar) {
ABORT_IF(!graph()->isInference() || graph()->getDeviceId().type != DeviceType::gpu,
"AffineWithReluNodeOp currently only supported for inference on GPU");
}

Shape newShape(Expr a, Expr b, bool transA, bool transB) {
auto shapeA = a->shape();
if(transA) {
shapeA.set(shapeA.size() - 2, a->shape()[shapeA.size() - 1]);
shapeA.set(shapeA.size() - 1, a->shape()[shapeA.size() - 2]);
}

auto shapeB = b->shape();
if(transB) {
shapeB.set(shapeB.size() - 2, b->shape()[shapeB.size() - 1]);
shapeB.set(shapeB.size() - 1, b->shape()[shapeB.size() - 2]);
}

Shape outShape = shapeA;
outShape.set(outShape.size() - 1, shapeB[shapeB.size() - 1]);
ABORT_IF(shapeA[shapeA.size() - 1] != shapeB[shapeB.size() - 2],
"Matrix product requires inner dimensions to match in {}{} * {}{}", std::string(shapeA), transA, std::string(shapeB), transB);
return outShape;
}

NodeOps forwardOps() override {
ABORT_IF(!graph()->isInference() || graph()->getDeviceId().type != DeviceType::gpu,
"AffineWithReluNodeOp currently only supported for inference on GPU");

return {
NodeOp(Affine(val_,
graph()->allocator(),
child(0)->val(),
child(1)->val(),
child(2)->val(),
transA_,
transB_,
0.f,
scalar_,
/*doRelu=*/true))
};
}

NodeOps backwardOps() override {
ABORT("AffineWithReluNodeOp cannot be used for training??");
return {};
}

const std::string type() override { return "affineWithRelu"; }

virtual size_t hash() override {
size_t seed = NaryNodeOp::hash();
util::hash_combine(seed, transA_);
util::hash_combine(seed, transB_);
util::hash_combine(seed, scalar_);
return seed;
}

virtual bool equal(Expr node) override {
if(!NaryNodeOp::equal(node))
return false;
auto cnode = std::dynamic_pointer_cast<AffineWithReluNodeOp>(node);
if(!cnode)
return false;
if(transA_ != cnode->transA_)
return false;
if(transB_ != cnode->transB_)
return false;
if(scalar_ != cnode->scalar_)
return false;
return true;
}
};

class DotBatchedNodeOp : public NaryNodeOp {
private:
friend class SerializationHelpers;
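For context, the forward pass above ends in Affine(..., /*doRelu=*/true). On CUDA versions before 11, where the cuBLASLt fused epilogue is not available, the PR's custom bias-addition kernel applies the same epilogue after a plain GEMM. A minimal illustrative sketch of such a kernel (the real implementation sits in src/tensors/gpu/prod.cu and is likely organized differently):

```cuda
// C is the m x n row-major GEMM output; bias holds n values, one per output column.
__global__ void biasAddReluKernel(float* C, const float* bias, int m, int n, bool doRelu) {
  int idx = blockIdx.x * blockDim.x + threadIdx.x;  // one thread per output element
  if(idx < m * n) {
    float v = C[idx] + bias[idx % n];               // broadcast the bias across rows
    C[idx] = doRelu ? fmaxf(v, 0.f) : v;
  }
}

// Example launch: biasAddReluKernel<<<(m * n + 255) / 256, 256>>>(C, bias, m, n, true);
```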
25 changes: 21 additions & 4 deletions src/layers/generic.h
@@ -1,5 +1,7 @@
#pragma once

#include "common/definitions.h"
#include "graph/expression_operators.h"
#include "marian.h"

#include "data/shortlist.h"
@@ -168,22 +170,37 @@ class Dense : public LayerBase, public IUnaryLayer {
// --- a few layers with built-in parameters created on the fly, without proper object
// @TODO: change to a proper layer object

static inline std::function<Expr(Expr)> activationByName(const std::string& actName) {
if (actName == "relu")
return (ActivationFunction*)relu;
else if (actName == "swish")
return (ActivationFunction*)swish;
else if (actName == "gelu")
return (ActivationFunction*)gelu;
else if (actName == "") // return identity function if activation name is empty
return [](Expr x) { return x; };
ABORT("Invalid activation name '{}'", actName);
}

// like affine() but with built-in parameters, activation, and dropout
static inline Expr denseInline(Expr x,
std::string prefix,
std::string suffix,
int outDim,
Ptr<inits::NodeInitializer> initFn = inits::glorotUniform(),
const std::function<Expr(Expr)>& actFn = nullptr,
std::string actName = "",
float dropProb = 0.0f) {
auto graph = x->graph();

auto W = graph->param(prefix + "_W" + suffix, {x->shape()[-1], outDim}, inits::glorotUniform());
auto b = graph->param(prefix + "_b" + suffix, {1, outDim}, inits::zeros());

x = affine(x, W, b);
if(actFn)
x = actFn(x);
if(actName == "relu") {
x = affineWithRelu(x, W, b); // speed optimization for inference, @TODO: handle better in future layer framework
} else {
x = affine(x, W, b);
x = activationByName(actName)(x);
}
x = dropout(x, dropProb); // @TODO: check for inference?
return x;
}
2 changes: 1 addition & 1 deletion src/layers/output.cpp
@@ -170,7 +170,7 @@ Logits Output::applyAsLogits(Expr input) /*override final*/ {
/*suffix=*/"1",
ffnDim,
inits::glorotUniform(),
(ActivationFunction*)relu,
"relu",
ffnDropProb);
f = denseInline(f, name + "_ffn", /*suffix=*/"2", inputDim);
// add & norm
27 changes: 7 additions & 20 deletions src/models/transformer.h
@@ -396,28 +396,16 @@ class Transformer : public EncoderOrDecoderBase {
opt<int>("transformer-heads"), /*cache=*/false);
}

static inline
std::function<Expr(Expr)> activationByName(const std::string& actName)
{
if (actName == "relu")
return (ActivationFunction*)relu;
else if (actName == "swish")
return (ActivationFunction*)swish;
else if (actName == "gelu")
return (ActivationFunction*)gelu;
ABORT("Invalid activation name '{}'", actName);
}

Expr LayerFFN(std::string prefix, Expr input) const {
int dimModel = input->shape()[-1];

float dropProb = inference_ ? 0 : opt<float>("transformer-dropout");
auto opsPre = opt<std::string>("transformer-preprocess");
auto output = preProcess(prefix + "_ffn", opsPre, input, dropProb);

auto actName = opt<std::string>("transformer-ffn-activation");
int dimFfn = opt<int>("transformer-dim-ffn");
int depthFfn = opt<int>("transformer-ffn-depth");
auto actFn = activationByName(opt<std::string>("transformer-ffn-activation"));
float ffnDropProb
= inference_ ? 0 : opt<float>("transformer-dropout-ffn");

@@ -427,12 +415,11 @@ class Transformer : public EncoderOrDecoderBase {

// the stack of FF layers
for(int i = 1; i < depthFfn; ++i)
output = denseInline(output, prefix, /*suffix=*/std::to_string(i), dimFfn, initFn, actFn, ffnDropProb);
output = denseInline(output, prefix, /*suffix=*/std::to_string(i), dimFfn, initFn, actName, ffnDropProb);
output = denseInline(output, prefix, /*suffix=*/std::to_string(depthFfn), dimModel, initFn);

auto opsPost = opt<std::string>("transformer-postprocess");
output
= postProcess(prefix + "_ffn", opsPost, output, input, dropProb);
output = postProcess(prefix + "_ffn", opsPost, output, input, dropProb);

return output;
}
@@ -450,21 +437,21 @@
// FFN
int dimAan = opt<int>("transformer-dim-aan");
int depthAan = opt<int>("transformer-aan-depth");
auto actFn = activationByName(opt<std::string>("transformer-aan-activation"));
auto actName = opt<std::string>("transformer-aan-activation");
float aanDropProb = inference_ ? 0 : opt<float>("transformer-dropout-ffn");

auto initFn = inits::glorotUniform(true, true, depthScaling_ ? 1.f / sqrtf((float)depth_) : 1.f);

// the stack of AAN layers
for(int i = 1; i < depthAan; ++i)
y = denseInline(y, prefix, /*suffix=*/std::to_string(i), dimAan, initFn, actFn, aanDropProb);
y = denseInline(y, prefix, /*suffix=*/std::to_string(i), dimAan, initFn, actName, aanDropProb);
if(y->shape()[-1] != dimModel) // bring it back to the desired dimension if needed
y = denseInline(y, prefix, std::to_string(depthAan), dimModel, initFn);

bool noGate = opt<bool>("transformer-aan-nogate");
if(!noGate) {
auto gi = denseInline(x, prefix, /*suffix=*/"i", dimModel, initFn, (ActivationFunction*)sigmoid);
auto gf = denseInline(y, prefix, /*suffix=*/"f", dimModel, initFn, (ActivationFunction*)sigmoid);
auto gi = denseInline(x, prefix, /*suffix=*/"i", dimModel, initFn, "sigmoid");
auto gf = denseInline(y, prefix, /*suffix=*/"f", dimModel, initFn, "sigmoid");
y = gi * x + gf * y;
}
