Adds better Affine support for GPUs when using CUDA 11. Introduces a new bias addition kernel for CUDA < 11 #778

Merged: 23 commits, Apr 9, 2021
Changes from all commits
Commits (23)
66006ec
Added new inference operator that performs bias addition and optionall…
rhenry-nv Oct 9, 2020
23a3223
Fix compilation for CPU only. Missed DISPATCH10 call when CUDA was un…
rhenry-nv Oct 10, 2020
dbb1653
Places fused affine under one node
rhenry-nv Dec 9, 2020
3e94764
Adds basic affine support for cublaslt in cuda 11
rhenry-nv Dec 9, 2020
4b41b86
Works around some bugs in CUDA 11
rhenry-nv Dec 9, 2020
cc30a32
Fixes API and removes comment
rhenry-nv Dec 15, 2020
792669f
Removes SPDX identifier in operator tests
rhenry-nv Dec 15, 2020
0c0b8ac
Adds SPDX identifier in operator tests - will remove in a different PR
rhenry-nv Dec 15, 2020
6258fd5
Removes NVIDIA notices
rhenry-nv Dec 15, 2020
764d7b8
Fixes windows compile errors
rhenry-nv Dec 15, 2020
95fb427
Format changes
rhenry-nv Dec 15, 2020
4bbf17f
updates changelog
rhenry-nv Dec 15, 2020
256e397
Adds bigobj flag to windows builds
rhenry-nv Oct 20, 2020
493c213
Merge remote-tracking branch 'public/master' into cuda_11
rhenry-nv Mar 6, 2021
e5b549d
Merge branch 'master' of https://github.com/marian-nmt/marian-dev int…
rhenry-nv Mar 23, 2021
a484893
refactor
emjotde Mar 25, 2021
9de84d4
remove previous code
emjotde Mar 25, 2021
43c54ce
add unit tests
emjotde Mar 25, 2021
04b0f95
fix incorrect function signature for CUDA 10 and smaller
emjotde Mar 25, 2021
5c7b6ee
only use AffineWithReluNodeOp for float types
emjotde Mar 25, 2021
c027c6d
switch back to using affineWithRelu on gpu only for now
emjotde Mar 25, 2021
7ea2d09
Merge pull request #2 from marian-nmt/mjd/refactor_cuda11
rhenry-nv Apr 2, 2021
36d8f0c
Merge branch 'master' into cuda_11
emjotde Apr 9, 2021
2 changes: 2 additions & 0 deletions CHANGELOG.md
@@ -9,6 +9,8 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
## [Unreleased]

### Added
- Adds a custom bias epilogue kernel.
- Adds support for fusing ReLU and bias addition into GEMMs when using CUDA 11.
- Better suppression of unwanted output symbols, specifically "\n" from SentencePiece with byte-fallback. Can be deactivated with --allow-special
- Display decoder time statistics with marian-decoder --stat-freq 10 ...
- Support for MS-internal binary shortlist
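The two changelog entries above describe one feature with two code paths: on CUDA 11 the bias addition (and optionally the ReLU) is fused directly into the GEMM through cuBLASLt's matmul epilogue, while on older CUDA versions a custom epilogue kernel runs after the GEMM. As a rough sketch of the CUDA 11 path, using the standard cuBLASLt attribute names (Marian's actual integration lives in src/tensors/gpu/prod.cpp and prod.cu and may differ in detail):

```cpp
#include <cublasLt.h>

// Attach a fused bias(+ReLU) epilogue to an existing cuBLASLt matmul descriptor.
// `bias` points to device memory holding one value per output column.
void setBiasEpilogue(cublasLtMatmulDesc_t desc, const float* bias, bool doRelu) {
  cublasLtEpilogue_t epilogue = doRelu ? CUBLASLT_EPILOGUE_RELU_BIAS
                                       : CUBLASLT_EPILOGUE_BIAS;
  cublasLtMatmulDescSetAttribute(
      desc, CUBLASLT_MATMUL_DESC_EPILOGUE, &epilogue, sizeof(epilogue));
  cublasLtMatmulDescSetAttribute(
      desc, CUBLASLT_MATMUL_DESC_BIAS_POINTER, &bias, sizeof(bias));
}
```

The subsequent cublasLtMatmul call then produces the biased (and, if requested, rectified) output in a single fused kernel, which is the inference speedup this PR targets.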
14 changes: 13 additions & 1 deletion CMakeLists.txt
@@ -347,8 +347,20 @@ if(CUDA_FOUND)
endif()
message(STATUS "Found CUDA libraries: ${CUDA_LIBS}")
else(USE_STATIC_LIBS)
set(CUDA_LIBS ${CUDA_curand_LIBRARY} ${CUDA_cusparse_LIBRARY} ${CUDA_CUBLAS_LIBRARIES})
# We only need cublasLt here for CUDA 11 and later; Marian works fine without it before CUDA 11. We want to force CMake to use the cuBLAS
# version that ships with CUDA 11, so we restrict the search to the CUDA toolkit directory.
set(CUDA_LIBS ${CUDA_curand_LIBRARY} ${CUDA_cusparse_LIBRARY} ${CUDA_CUBLAS_LIBRARIES})
if ((CUDA_VERSION VERSION_EQUAL "11.0" OR CUDA_VERSION VERSION_GREATER "11.0"))
find_library(CUDA_cublasLt_LIBRARY NAMES cublasLt PATHS ${CUDA_TOOLKIT_ROOT_DIR}/lib64 ${CUDA_TOOLKIT_ROOT_DIR}/lib/x64 NO_DEFAULT_PATH)
if(NOT CUDA_cublasLt_LIBRARY)
message(FATAL_ERROR "cuBLASLt library not found")
endif()
set(EXT_LIBS ${EXT_LIBS} ${CUDA_cublasLt_LIBRARY})
set(CUDA_LIBS ${CUDA_LIBS} ${CUDA_cublasLt_LIBRARY})
endif()
set(EXT_LIBS ${EXT_LIBS} ${CUDA_curand_LIBRARY} ${CUDA_cusparse_LIBRARY} ${CUDA_CUBLAS_LIBRARIES})
message(STATUS "Found CUDA libraries: ${CUDA_curand_LIBRARY} ${CUDA_cusparse_LIBRARY} ${CUDA_CUBLAS_LIBRARIES}")
message(STATUS "Found CUDA libraries: ${CUDA_LIBS}")
endif(USE_STATIC_LIBS)

if(USE_CUDNN)
1 change: 1 addition & 0 deletions src/CMakeLists.txt
@@ -175,6 +175,7 @@ if(CUDA_FOUND)
tensors/gpu/device.cu
tensors/gpu/algorithm.cu
tensors/gpu/prod.cpp
tensors/gpu/prod.cu
tensors/gpu/prod_sparse.cpp
tensors/gpu/topk.cu
tensors/gpu/element.cu
12 changes: 11 additions & 1 deletion src/graph/expression_operators.cpp
@@ -1,4 +1,5 @@
#include "graph/expression_operators.h"
#include "common/definitions.h"
#include "layers/constructors.h"

#include "graph/node_operators.h"
@@ -518,7 +519,7 @@ Expr bdot(Expr a, Expr b, bool transA, bool transB, float scale) {
return Expression<DotBatchedNodeOp>(a, b, transA, transB, scale);
}

static Expr affineDefault(Expr a, Expr b, Expr bias, bool transA, bool transB, float scale) {
Expr affineDefault(Expr a, Expr b, Expr bias, bool transA, bool transB, float scale) {
// general version, MKL, CBlas or CUDA

int rows = a->shape().elements() / a->shape()[-1];
@@ -577,6 +578,15 @@ Expr affine(Expr a, Expr b, Expr bias, bool transA, bool transB, float scale) {
}
}

Expr affineWithRelu(Expr a, Expr b, Expr bias, bool transA, bool transB, float scale) {
auto graph = a->graph();

if(graph->isInference() && graph->getDeviceId().type == DeviceType::gpu)
return Expression<AffineWithReluNodeOp>(a, b, bias, transA, transB, scale);
else
return relu(affine(a, b, bias, transA, transB, scale));
}

// @TODO: Not a great place to check this
#if CUDA_VERSION < 11000
// multiply a CSR matrix A with a matrix B
12 changes: 11 additions & 1 deletion src/graph/expression_operators.h
@@ -488,11 +488,21 @@ Expr bdot(Expr a,
*/
Expr affine(Expr a,
Expr b,
Expr c,
Expr bias,
bool transA = false,
bool transB = false,
float scalar = 1.f);

/**
* As above, but efficiently applies a ReLU transformation to the output. For inference only.
*/
Expr affineWithRelu(Expr a,
Expr b,
Expr bias,
bool transA = false,
bool transB = false,
float scalar = 1.f);

/**
* Computes the dot product of CSR-tensor @p A with @p B.
*/
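A hedged usage sketch for the affineWithRelu declaration above: on a GPU inference graph it lowers to the fused AffineWithReluNodeOp, and in all other cases it falls back to relu(affine(...)), so the two expressions below compute the same values (x, W and b are assumed to be existing nodes in the expression graph):

```cpp
// Feed-forward hidden layer, fused where supported.
Expr hidden      = affineWithRelu(x, W, b);  // bias + ReLU handled inside/right after the GEMM
Expr hiddenNaive = relu(affine(x, W, b));    // reference path: same result, one extra elementwise pass
```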
124 changes: 106 additions & 18 deletions src/graph/node_operators_binary.h
@@ -266,17 +266,18 @@ class AffineNodeOp : public NaryNodeOp {

NodeOps forwardOps() override {
using namespace functional;

return {
NodeOp(
Prod(val_,
child(0)->val(),
child(1)->val(),
transA_,
transB_,
0.f,
scalar_);
Prod(val_, child(3)->val(), child(2)->val(), false, false, 1.f, 1.f))
NodeOp(Affine(val_,
graph()->allocator(),
child(0)->val(),
child(1)->val(),
child(2)->val(),
transA_,
transB_,
0.f,
scalar_,
/*doRelu=*/false))
};
}

@@ -323,8 +324,7 @@
false,
1.0,
scalar_, computeTypeB)),
NodeOp(Prod(
child(2)->grad(), child(3)->val(), adj_, true, false, 0.f, 1.f, computeTypeC))
NodeOp(Prod(child(2)->grad(), child(3)->val(), adj_, true, false, 0.f, 1.f, computeTypeC))
};

if(transA_ && !transB_)
@@ -343,8 +343,7 @@
false,
1.0,
scalar_, computeTypeB)),
NodeOp(Prod(
child(2)->grad(), child(3)->val(), adj_, true, false, 0.f, 1.f, computeTypeC))
NodeOp(Prod(child(2)->grad(), child(3)->val(), adj_, true, false, 0.f, 1.f, computeTypeC))
};

if(transA_ && transB_)
@@ -363,8 +362,7 @@
true,
1.0,
scalar_, computeTypeB)),
NodeOp(Prod(
child(2)->grad(), child(3)->val(), adj_, true, false, 0.f, 1.f, computeTypeC))
NodeOp(Prod(child(2)->grad(), child(3)->val(), adj_, true, false, 0.f, 1.f, computeTypeC))
};

return {
@@ -382,8 +380,7 @@
false,
1.0,
scalar_, computeTypeB)),
NodeOp(Prod(
child(2)->grad(), child(3)->val(), adj_, true, false, 0.f, 1.f, computeTypeC))
NodeOp(Prod(child(2)->grad(), child(3)->val(), adj_, true, false, 0.f, 1.f, computeTypeC))
};
}

@@ -414,6 +411,97 @@

};

class AffineWithReluNodeOp : public NaryNodeOp {
private:
friend class SerializationHelpers;
bool transA_;
bool transB_;
float scalar_;

public:
AffineWithReluNodeOp(Expr a,
Expr b,
Expr bias,
bool transA,
bool transB,
float scalar)
: NaryNodeOp({a, b, bias}, newShape(a, b, transA, transB)),
transA_(transA),
transB_(transB),
scalar_(scalar) {
ABORT_IF(!graph()->isInference() || graph()->getDeviceId().type != DeviceType::gpu,
"AffineWithReluNodeOp currently only supported for inference on GPU");
}

Shape newShape(Expr a, Expr b, bool transA, bool transB) {
auto shapeA = a->shape();
if(transA) {
shapeA.set(shapeA.size() - 2, a->shape()[shapeA.size() - 1]);
shapeA.set(shapeA.size() - 1, a->shape()[shapeA.size() - 2]);
}

auto shapeB = b->shape();
if(transB) {
shapeB.set(shapeB.size() - 2, b->shape()[shapeB.size() - 1]);
shapeB.set(shapeB.size() - 1, b->shape()[shapeB.size() - 2]);
}

Shape outShape = shapeA;
outShape.set(outShape.size() - 1, shapeB[shapeB.size() - 1]);
ABORT_IF(shapeA[shapeA.size() - 1] != shapeB[shapeB.size() - 2],
"Matrix product requires inner dimensions to match in {}{} * {}{}", std::string(shapeA), transA, std::string(shapeB), transB);
return outShape;
}

NodeOps forwardOps() override {
ABORT_IF(!graph()->isInference() || graph()->getDeviceId().type != DeviceType::gpu,
"AffineWithReluNodeOp currently only supported for inference on GPU");

return {
NodeOp(Affine(val_,
graph()->allocator(),
child(0)->val(),
child(1)->val(),
child(2)->val(),
transA_,
transB_,
0.f,
scalar_,
/*doRelu=*/true))
};
}

NodeOps backwardOps() override {
ABORT("AffineWithReluNodeOp cannot be used for training??");
return {};
}

const std::string type() override { return "affineWithRelu"; }

virtual size_t hash() override {
size_t seed = NaryNodeOp::hash();
util::hash_combine(seed, transA_);
util::hash_combine(seed, transB_);
util::hash_combine(seed, scalar_);
return seed;
}

virtual bool equal(Expr node) override {
if(!NaryNodeOp::equal(node))
return false;
auto cnode = std::dynamic_pointer_cast<AffineWithReluNodeOp>(node);
if(!cnode)
return false;
if(transA_ != cnode->transA_)
return false;
if(transB_ != cnode->transB_)
return false;
if(scalar_ != cnode->scalar_)
return false;
return true;
}
};

class DotBatchedNodeOp : public NaryNodeOp {
private:
friend class SerializationHelpers;
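For context, the forward pass above ends in Affine(..., /*doRelu=*/true). On CUDA versions before 11, where the cuBLASLt fused epilogue is not available, the PR's custom bias-addition kernel applies the same epilogue after a plain GEMM. A minimal illustrative sketch of such a kernel (the real implementation sits in src/tensors/gpu/prod.cu and is likely organized differently):

```cuda
// C is the m x n row-major GEMM output; bias holds n values, one per output column.
__global__ void biasAddReluKernel(float* C, const float* bias, int m, int n, bool doRelu) {
  int idx = blockIdx.x * blockDim.x + threadIdx.x;  // one thread per output element
  if(idx < m * n) {
    float v = C[idx] + bias[idx % n];               // broadcast the bias across rows
    C[idx] = doRelu ? fmaxf(v, 0.f) : v;
  }
}

// Example launch: biasAddReluKernel<<<(m * n + 255) / 256, 256>>>(C, bias, m, n, true);
```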
25 changes: 21 additions & 4 deletions src/layers/generic.h
@@ -1,5 +1,7 @@
#pragma once

#include "common/definitions.h"
#include "graph/expression_operators.h"
#include "marian.h"

#include "data/shortlist.h"
@@ -168,22 +170,37 @@ class Dense : public LayerBase, public IUnaryLayer {
// --- a few layers with built-in parameters created on the fly, without proper object
// @TODO: change to a proper layer object

static inline std::function<Expr(Expr)> activationByName(const std::string& actName) {
if (actName == "relu")
return (ActivationFunction*)relu;
else if (actName == "swish")
return (ActivationFunction*)swish;
else if (actName == "gelu")
return (ActivationFunction*)gelu;
else if (actName == "") // return identity function if activation name is empty
return [](Expr x) { return x; };
ABORT("Invalid activation name '{}'", actName);
}

// like affine() but with built-in parameters, activation, and dropout
static inline Expr denseInline(Expr x,
std::string prefix,
std::string suffix,
int outDim,
Ptr<inits::NodeInitializer> initFn = inits::glorotUniform(),
const std::function<Expr(Expr)>& actFn = nullptr,
std::string actName = "",
float dropProb = 0.0f) {
auto graph = x->graph();

auto W = graph->param(prefix + "_W" + suffix, {x->shape()[-1], outDim}, inits::glorotUniform());
auto b = graph->param(prefix + "_b" + suffix, {1, outDim}, inits::zeros());

x = affine(x, W, b);
if(actFn)
x = actFn(x);
if(actName == "relu") {
x = affineWithRelu(x, W, b); // speed optimization for inference, @TODO: handle better in future layer framework
} else {
x = affine(x, W, b);
x = activationByName(actName)(x);
}
x = dropout(x, dropProb); // @TODO: check for inference?
return x;
}
2 changes: 1 addition & 1 deletion src/layers/output.cpp
@@ -170,7 +170,7 @@ Logits Output::applyAsLogits(Expr input) /*override final*/ {
/*suffix=*/"1",
ffnDim,
inits::glorotUniform(),
(ActivationFunction*)relu,
"relu",
ffnDropProb);
f = denseInline(f, name + "_ffn", /*suffix=*/"2", inputDim);
// add & norm
27 changes: 7 additions & 20 deletions src/models/transformer.h
@@ -396,28 +396,16 @@ class Transformer : public EncoderOrDecoderBase {
opt<int>("transformer-heads"), /*cache=*/false);
}

static inline
std::function<Expr(Expr)> activationByName(const std::string& actName)
{
if (actName == "relu")
return (ActivationFunction*)relu;
else if (actName == "swish")
return (ActivationFunction*)swish;
else if (actName == "gelu")
return (ActivationFunction*)gelu;
ABORT("Invalid activation name '{}'", actName);
}

Expr LayerFFN(std::string prefix, Expr input) const {
int dimModel = input->shape()[-1];

float dropProb = inference_ ? 0 : opt<float>("transformer-dropout");
auto opsPre = opt<std::string>("transformer-preprocess");
auto output = preProcess(prefix + "_ffn", opsPre, input, dropProb);

auto actName = opt<std::string>("transformer-ffn-activation");
int dimFfn = opt<int>("transformer-dim-ffn");
int depthFfn = opt<int>("transformer-ffn-depth");
auto actFn = activationByName(opt<std::string>("transformer-ffn-activation"));
float ffnDropProb
= inference_ ? 0 : opt<float>("transformer-dropout-ffn");

@@ -427,12 +415,11 @@ class Transformer : public EncoderOrDecoderBase {

// the stack of FF layers
for(int i = 1; i < depthFfn; ++i)
output = denseInline(output, prefix, /*suffix=*/std::to_string(i), dimFfn, initFn, actFn, ffnDropProb);
output = denseInline(output, prefix, /*suffix=*/std::to_string(i), dimFfn, initFn, actName, ffnDropProb);
output = denseInline(output, prefix, /*suffix=*/std::to_string(depthFfn), dimModel, initFn);

auto opsPost = opt<std::string>("transformer-postprocess");
output
= postProcess(prefix + "_ffn", opsPost, output, input, dropProb);
output = postProcess(prefix + "_ffn", opsPost, output, input, dropProb);

return output;
}
@@ -450,21 +437,21 @@
// FFN
int dimAan = opt<int>("transformer-dim-aan");
int depthAan = opt<int>("transformer-aan-depth");
auto actFn = activationByName(opt<std::string>("transformer-aan-activation"));
auto actName = opt<std::string>("transformer-aan-activation");
float aanDropProb = inference_ ? 0 : opt<float>("transformer-dropout-ffn");

auto initFn = inits::glorotUniform(true, true, depthScaling_ ? 1.f / sqrtf((float)depth_) : 1.f);

// the stack of AAN layers
for(int i = 1; i < depthAan; ++i)
y = denseInline(y, prefix, /*suffix=*/std::to_string(i), dimAan, initFn, actFn, aanDropProb);
y = denseInline(y, prefix, /*suffix=*/std::to_string(i), dimAan, initFn, actName, aanDropProb);
if(y->shape()[-1] != dimModel) // bring it back to the desired dimension if needed
y = denseInline(y, prefix, std::to_string(depthAan), dimModel, initFn);

bool noGate = opt<bool>("transformer-aan-nogate");
if(!noGate) {
auto gi = denseInline(x, prefix, /*suffix=*/"i", dimModel, initFn, (ActivationFunction*)sigmoid);
auto gf = denseInline(y, prefix, /*suffix=*/"f", dimModel, initFn, (ActivationFunction*)sigmoid);
auto gi = denseInline(x, prefix, /*suffix=*/"i", dimModel, initFn, "sigmoid");
auto gf = denseInline(y, prefix, /*suffix=*/"f", dimModel, initFn, "sigmoid");
y = gi * x + gf * y;
}
