Merge sycl plugin support to xgboost v2.1.0 (#53)
* add tests for prediction cache

* add the rest of plugin functionality

* linting

* fix compilation failure

* fix dispatching

* add pruner initialisation

* fix fp64 support determination

* make sycl updater launchable from python

* apply PredictRow optimisation

* fix x86 build

---------

Co-authored-by: Dmitry Razdoburdin <>
razdoburdin authored Jun 11, 2024
1 parent 7e94cbf commit 5ecb394
Showing 20 changed files with 1,188 additions and 141 deletions.
2 changes: 1 addition & 1 deletion include/xgboost/predictor.h
@@ -107,7 +107,7 @@ class Predictor {
*/
virtual void PredictBatch(DMatrix* dmat, PredictionCacheEntry* out_preds,
const gbm::GBTreeModel& model, uint32_t tree_begin,
- uint32_t tree_end = 0) const = 0;
+ uint32_t tree_end = 0, bool training = false) const = 0;

/**
* \brief Inplace prediction.
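Note: the new `training` flag threads the caller's intent down to the backend: during training the same DMatrix arrives on every boosting round, so device-side state can be cached between calls. A hypothetical call site (a sketch; `predictor`, `dmat`, and `cache_entry` are illustrative names, not from this diff):

    // Sketch: training == true signals that the same DMatrix will be passed
    // repeatedly, so the backend may reuse device buffers between rounds.
    predictor->PredictBatch(dmat, &cache_entry, model,
                            /*tree_begin=*/0, /*tree_end=*/0,
                            /*training=*/true);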
27 changes: 27 additions & 0 deletions plugin/sycl/common/hist_util.cc
@@ -31,6 +31,33 @@ template void InitHist(::sycl::queue qu,
GHistRow<double, MemoryType::on_device>* hist,
size_t size, ::sycl::event* event);

/*!
* \brief Copy histogram from src to dst
*/
template<typename GradientSumT>
void CopyHist(::sycl::queue qu,
GHistRow<GradientSumT, MemoryType::on_device>* dst,
const GHistRow<GradientSumT, MemoryType::on_device>& src,
size_t size) {
GradientSumT* pdst = reinterpret_cast<GradientSumT*>(dst->Data());
const GradientSumT* psrc = reinterpret_cast<const GradientSumT*>(src.DataConst());

qu.submit([&](::sycl::handler& cgh) {
cgh.parallel_for<>(::sycl::range<1>(2 * size), [=](::sycl::item<1> pid) {
const size_t i = pid.get_id(0);
pdst[i] = psrc[i];
});
}).wait();
}
template void CopyHist(::sycl::queue qu,
GHistRow<float, MemoryType::on_device>* dst,
const GHistRow<float, MemoryType::on_device>& src,
size_t size);
template void CopyHist(::sycl::queue qu,
GHistRow<double, MemoryType::on_device>* dst,
const GHistRow<double, MemoryType::on_device>& src,
size_t size);

/*!
* \brief Compute Subtraction: dst = src1 - src2
*/
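Each histogram bin stores a gradient pair (gradient and hessian), so copying `size` bins moves `2 * size` scalars; the `reinterpret_cast` above flattens the pair array for the element-wise kernel. A self-contained sketch of the same flat-copy pattern (standalone buffers and names, assumed for illustration only):

    #include <sycl/sycl.hpp>
    #include <cstddef>

    int main() {
      ::sycl::queue qu;
      constexpr size_t size = 4;  // number of histogram bins
      float* src = ::sycl::malloc_device<float>(2 * size, qu);
      float* dst = ::sycl::malloc_device<float>(2 * size, qu);
      qu.fill(src, 1.0f, 2 * size).wait();

      // One work-item per scalar: (grad, hess) pairs are copied as a flat array.
      qu.submit([&](::sycl::handler& cgh) {
        cgh.parallel_for(::sycl::range<1>(2 * size), [=](::sycl::item<1> pid) {
          dst[pid.get_id(0)] = src[pid.get_id(0)];
        });
      }).wait();

      ::sycl::free(src, qu);
      ::sycl::free(dst, qu);
      return 0;
    }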
9 changes: 9 additions & 0 deletions plugin/sycl/common/hist_util.h
@@ -36,6 +36,15 @@ void InitHist(::sycl::queue qu,
GHistRow<GradientSumT, MemoryType::on_device>* hist,
size_t size, ::sycl::event* event);

/*!
* \brief Copy histogram from src to dst
*/
template<typename GradientSumT>
void CopyHist(::sycl::queue qu,
GHistRow<GradientSumT, MemoryType::on_device>* dst,
const GHistRow<GradientSumT, MemoryType::on_device>& src,
size_t size);

/*!
* \brief Compute subtraction: dst = src1 - src2
*/
45 changes: 36 additions & 9 deletions plugin/sycl/data.h
@@ -96,12 +96,9 @@ class USMVector {
~USMVector() {
}

- USMVector<T>& operator=(const USMVector<T>& other) {
-   size_ = other.size_;
-   capacity_ = other.capacity_;
-   data_ = other.data_;
-   return *this;
- }
+ USMVector(const USMVector&) = delete;
+
+ USMVector<T>& operator=(const USMVector<T>& other) = delete;

T* Data() { return data_.get(); }
const T* DataConst() const { return data_.get(); }
@@ -139,6 +136,17 @@ class USMVector {
}
}

/* Resize without keeping the data */
void ResizeNoCopy(::sycl::queue* qu, size_t size_new) {
if (size_new <= capacity_) {
size_ = size_new;
} else {
size_ = size_new;
capacity_ = size_new;
data_ = allocate_memory_(qu, size_);
}
}

void Resize(::sycl::queue* qu, size_t size_new, T v) {
if (size_new <= size_) {
size_ = size_new;
@@ -162,7 +170,7 @@
if (size_new <= size_) {
size_ = size_new;
} else if (size_new <= capacity_) {
- auto event = qu->fill(data_.get() + size_, v, size_new - size_);
+ *event = qu->fill(data_.get() + size_, v, size_new - size_, *event);
size_ = size_new;
} else {
size_t size_old = size_;
@@ -215,16 +223,35 @@ class USMVector {

/* Wrapper for DMatrix which stores all batches in a single USM buffer */
struct DeviceMatrix {
- DMatrix* p_mat;            // Pointer to the original matrix on the host
+ DMatrix* p_mat = nullptr;  // Pointer to the original matrix on the host
::sycl::queue qu_;
USMVector<size_t, MemoryType::on_device> row_ptr;
USMVector<Entry, MemoryType::on_device> data;
size_t total_offset;
bool is_from_cache = false;

DeviceMatrix() = default;

- void Init(::sycl::queue qu, DMatrix* dmat) {
+ DeviceMatrix(const DeviceMatrix& other) = delete;
+
+ DeviceMatrix& operator= (const DeviceMatrix& other) = delete;
+
+ // During training the same DMatrix is used, so we don't need to reload it on the device
+ bool ReinitializationRequired(DMatrix* dmat, bool training) {
+   if (!training) return true;
+   if (p_mat != dmat) return true;
+   return false;
+ }
+
+ void Init(::sycl::queue qu, DMatrix* dmat, bool training = false) {
qu_ = qu;
+ if (!ReinitializationRequired(dmat, training)) {
+   is_from_cache = true;
+   return;
+ }
+
+ is_from_cache = false;

p_mat = dmat;

size_t num_row = 0;
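Taken together, the deleted copy operations and `ReinitializationRequired` turn `DeviceMatrix` into a single-owner buffer that can be reused across boosting rounds. Expected behavior of the cache check, as a usage sketch (`qu`, `dmat`, and `other_dmat` are illustrative names):

    sycl::DeviceMatrix dm;
    dm.Init(qu, dmat, /*training=*/true);        // first round: uploads, is_from_cache == false
    dm.Init(qu, dmat, /*training=*/true);        // same DMatrix again: skips upload, is_from_cache == true
    dm.Init(qu, other_dmat, /*training=*/true);  // different matrix: reinitializes
    dm.Init(qu, dmat, /*training=*/false);       // inference path: always reinitializes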
67 changes: 43 additions & 24 deletions plugin/sycl/predictor/predictor.cc
@@ -1,10 +1,11 @@
/*!
- * Copyright by Contributors 2017-2023
+ * Copyright by Contributors 2017-2024
*/
- #pragma GCC diagnostic push
- #pragma GCC diagnostic ignored "-Wtautological-constant-compare"
- #pragma GCC diagnostic ignored "-W#pragma-messages"
- #pragma GCC diagnostic pop
+ #include <dmlc/timer.h>
+ // #pragma GCC diagnostic push
+ // #pragma GCC diagnostic ignored "-Wtautological-constant-compare"
+ // #pragma GCC diagnostic ignored "-W#pragma-messages"
+ // #pragma GCC diagnostic pop

#include <cstddef>
#include <limits>
@@ -158,6 +159,8 @@ float GetLeafWeight(const Node* nodes, const float* fval_buff) {

template <bool any_missing>
void DevicePredictInternal(::sycl::queue* qu,
USMVector<float, MemoryType::on_device>* fval_buff,
USMVector<uint8_t, MemoryType::on_device>* miss_buff,
const sycl::DeviceMatrix& dmat,
HostDeviceVector<float>* out_preds,
const gbm::GBTreeModel& model,
@@ -178,15 +181,17 @@
int num_rows = dmat.row_ptr.Size() - 1;
int num_group = model.learner_model_param->num_output_group;

- USMVector<float, MemoryType::on_device> fval_buff(qu, num_features * num_rows);
- USMVector<uint8_t, MemoryType::on_device> miss_buff;
- auto* fval_buff_ptr = fval_buff.Data();
+ bool update_buffs = !dmat.is_from_cache;

std::vector<::sycl::event> events(1);
- if constexpr (any_missing) {
-   miss_buff.Resize(qu, num_features * num_rows, 1, &events[0]);
+ if (update_buffs) {
+   fval_buff->Resize(qu, num_features * num_rows);
+   if constexpr (any_missing) {
+     miss_buff->Resize(qu, num_features * num_rows, 1, &events[0]);
+   }
}
- auto* miss_buff_ptr = miss_buff.Data();
+ auto* fval_buff_ptr = fval_buff->Data();
+ auto* miss_buff_ptr = miss_buff->Data();

auto& out_preds_vec = out_preds->HostVector();
::sycl::buffer<float, 1> out_preds_buf(out_preds_vec.data(), out_preds_vec.size());
@@ -198,12 +203,14 @@
auto* fval_buff_row_ptr = fval_buff_ptr + num_features * row_idx;
auto* miss_buff_row_ptr = miss_buff_ptr + num_features * row_idx;

- const Entry* first_entry = data + row_ptr[row_idx];
- const Entry* last_entry = data + row_ptr[row_idx + 1];
- for (const Entry* entry = first_entry; entry < last_entry; entry += 1) {
-   fval_buff_row_ptr[entry->index] = entry->fvalue;
-   if constexpr (any_missing) {
-     miss_buff_row_ptr[entry->index] = 0;
-   }
- }
+ if (update_buffs) {
+   const Entry* first_entry = data + row_ptr[row_idx];
+   const Entry* last_entry = data + row_ptr[row_idx + 1];
+   for (const Entry* entry = first_entry; entry < last_entry; entry += 1) {
+     fval_buff_row_ptr[entry->index] = entry->fvalue;
+     if constexpr (any_missing) {
+       miss_buff_row_ptr[entry->index] = 0;
+     }
+   }
+ }

@@ -241,6 +248,7 @@ class Predictor : public xgboost::Predictor {
void InitOutPredictions(const MetaInfo& info,
HostDeviceVector<bst_float>* out_preds,
const gbm::GBTreeModel& model) const override {
predictor_monitor_.Start("InitOutPredictions");
CHECK_NE(model.learner_model_param->num_output_group, 0);
size_t n = model.learner_model_param->num_output_group * info.num_row_;
const auto& base_margin = info.base_margin_.Data()->HostVector();
@@ -268,33 +276,40 @@
}
std::fill(out_preds_h.begin(), out_preds_h.end(), base_score);
}
predictor_monitor_.Stop("InitOutPredictions");
}

explicit Predictor(Context const* context) :
xgboost::Predictor::Predictor{context},
- cpu_predictor(xgboost::Predictor::Create("cpu_predictor", context)) {}
+ cpu_predictor(xgboost::Predictor::Create("cpu_predictor", context)) {
+   predictor_monitor_.Init("SyclPredictor");
+ }

void PredictBatch(DMatrix *dmat, PredictionCacheEntry *predts,
const gbm::GBTreeModel &model, uint32_t tree_begin,
- uint32_t tree_end = 0) const override {
+ uint32_t tree_end = 0, bool training = false) const override {
::sycl::queue qu = device_manager.GetQueue(ctx_->Device());
- // TODO(razdoburdin): remove temporary workaround after cache fix
- sycl::DeviceMatrix device_matrix;
- device_matrix.Init(qu, dmat);
+ predictor_monitor_.Start("InitDeviceMatrix");
+ device_matrix.Init(qu, dmat, training);
+ predictor_monitor_.Stop("InitDeviceMatrix");

auto* out_preds = &predts->predictions;
if (tree_end == 0) {
tree_end = model.trees.size();
}

predictor_monitor_.Start("DevicePredictInternal");
if (tree_begin < tree_end) {
const bool any_missing = !(dmat->IsDense());
if (any_missing) {
- DevicePredictInternal<true>(&qu, device_matrix, out_preds, model, tree_begin, tree_end);
+ DevicePredictInternal<true>(&qu, &fval_buff, &miss_buff, device_matrix,
+                             out_preds, model, tree_begin, tree_end);
} else {
- DevicePredictInternal<false>(&qu, device_matrix, out_preds, model, tree_begin, tree_end);
+ DevicePredictInternal<false>(&qu, &fval_buff, &miss_buff, device_matrix,
+                              out_preds, model, tree_begin, tree_end);
}
}
predictor_monitor_.Stop("DevicePredictInternal");
}

bool InplacePredict(std::shared_ptr<DMatrix> p_m,
@@ -341,7 +356,11 @@

private:
DeviceManager device_manager;
mutable sycl::DeviceMatrix device_matrix;
mutable USMVector<float, MemoryType::on_device> fval_buff;
mutable USMVector<uint8_t, MemoryType::on_device> miss_buff;

mutable xgboost::common::Monitor predictor_monitor_;
std::unique_ptr<xgboost::Predictor> cpu_predictor;
};

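With `fval_buff` and `miss_buff` promoted to mutable members, the per-row gather into dense buffers runs only when the device matrix was not served from the cache (`update_buffs` above). A host-side analogue of that gather, with a simplified stand-in for xgboost's `Entry` (a sketch, not the plugin's code):

    #include <cstdint>

    struct Entry { int index; float fvalue; };  // simplified stand-in for xgboost::Entry

    // Scatter one sparse row into a dense feature buffer and clear the miss
    // mask for every feature that is actually present in the row.
    void GatherRow(const Entry* first, const Entry* last,
                   float* fval_row, uint8_t* miss_row) {
      for (const Entry* entry = first; entry < last; ++entry) {
        fval_row[entry->index] = entry->fvalue;
        miss_row[entry->index] = 0;  // buffers start as 1 (missing); present entries reset it
      }
    }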
35 changes: 35 additions & 0 deletions plugin/sycl/tree/hist_row_adder.h
@@ -39,6 +39,41 @@ class BatchHistRowsAdder: public HistRowsAdder<GradientSumT> {
}
};

template <typename GradientSumT>
class DistributedHistRowsAdder: public HistRowsAdder<GradientSumT> {
public:
void AddHistRows(HistUpdater<GradientSumT>* builder,
std::vector<int>* sync_ids, RegTree *p_tree) override {
builder->builder_monitor_.Start("AddHistRows");
const size_t explicit_size = builder->nodes_for_explicit_hist_build_.size();
const size_t subtraction_size = builder->nodes_for_subtraction_trick_.size();
std::vector<int> merged_node_ids(explicit_size + subtraction_size);
for (size_t i = 0; i < explicit_size; ++i) {
merged_node_ids[i] = builder->nodes_for_explicit_hist_build_[i].nid;
}
for (size_t i = 0; i < subtraction_size; ++i) {
merged_node_ids[explicit_size + i] =
builder->nodes_for_subtraction_trick_[i].nid;
}
std::sort(merged_node_ids.begin(), merged_node_ids.end());
sync_ids->clear();
for (auto const& nid : merged_node_ids) {
if ((*p_tree)[nid].IsLeftChild()) {
builder->hist_.AddHistRow(nid);
builder->hist_local_worker_.AddHistRow(nid);
sync_ids->push_back(nid);
}
}
for (auto const& nid : merged_node_ids) {
if (!((*p_tree)[nid].IsLeftChild())) {
builder->hist_.AddHistRow(nid);
builder->hist_local_worker_.AddHistRow(nid);
}
}
builder->builder_monitor_.Stop("AddHistRows");
}
};

} // namespace tree
} // namespace sycl
} // namespace xgboost
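`AddHistRows` merges the node ids from both work lists, sorts them, and registers left children first; only left children enter `sync_ids`, since (as the synchronizer below suggests) those histograms are reduced across workers while their siblings are recovered locally by subtraction. The merge step on toy data (a sketch; names and values are illustrative):

    #include <algorithm>
    #include <vector>

    // Toy version of the node-id merge performed in AddHistRows.
    std::vector<int> MergeNodeIds(const std::vector<int>& explicit_ids,
                                  const std::vector<int>& subtraction_ids) {
      std::vector<int> merged(explicit_ids);
      merged.insert(merged.end(), subtraction_ids.begin(), subtraction_ids.end());
      std::sort(merged.begin(), merged.end());
      return merged;  // e.g. {3, 1} and {4, 2} -> {1, 2, 3, 4}
    }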
56 changes: 56 additions & 0 deletions plugin/sycl/tree/hist_synchronizer.h
@@ -61,6 +61,62 @@ class BatchHistSynchronizer: public HistSynchronizer<GradientSumT> {
std::vector<::sycl::event> hist_sync_events_;
};

template <typename GradientSumT>
class DistributedHistSynchronizer: public HistSynchronizer<GradientSumT> {
public:
void SyncHistograms(HistUpdater<GradientSumT>* builder,
const std::vector<int>& sync_ids,
RegTree *p_tree) override {
builder->builder_monitor_.Start("SyncHistograms");
const size_t nbins = builder->hist_builder_.GetNumBins();
for (int node = 0; node < builder->nodes_for_explicit_hist_build_.size(); node++) {
const auto entry = builder->nodes_for_explicit_hist_build_[node];
auto& this_hist = builder->hist_[entry.nid];
// Store possible parent node
auto& this_local = builder->hist_local_worker_[entry.nid];
common::CopyHist(builder->qu_, &this_local, this_hist, nbins);

if (!(*p_tree)[entry.nid].IsRoot()) {
const size_t parent_id = (*p_tree)[entry.nid].Parent();
auto sibling_nid = entry.GetSiblingId(p_tree, parent_id);
auto& parent_hist = builder->hist_local_worker_[parent_id];
auto& sibling_hist = builder->hist_[sibling_nid];
common::SubtractionHist(builder->qu_, &sibling_hist, parent_hist,
this_hist, nbins, ::sycl::event());
// Store possible parent node
auto& sibling_local = builder->hist_local_worker_[sibling_nid];
common::CopyHist(builder->qu_, &sibling_local, sibling_hist, nbins);
}
}
builder->ReduceHists(sync_ids, nbins);

ParallelSubtractionHist(builder, builder->nodes_for_explicit_hist_build_, p_tree);
ParallelSubtractionHist(builder, builder->nodes_for_subtraction_trick_, p_tree);

builder->builder_monitor_.Stop("SyncHistograms");
}

void ParallelSubtractionHist(HistUpdater<GradientSumT>* builder,
const std::vector<ExpandEntry>& nodes,
const RegTree * p_tree) {
const size_t nbins = builder->hist_builder_.GetNumBins();
for (int node = 0; node < nodes.size(); node++) {
const auto entry = nodes[node];
if (!((*p_tree)[entry.nid].IsLeftChild())) {
auto& this_hist = builder->hist_[entry.nid];

if (!(*p_tree)[entry.nid].IsRoot()) {
const size_t parent_id = (*p_tree)[entry.nid].Parent();
auto& parent_hist = builder->hist_[parent_id];
auto& sibling_hist = builder->hist_[entry.GetSiblingId(p_tree, parent_id)];
common::SubtractionHist(builder->qu_, &this_hist, parent_hist,
sibling_hist, nbins, ::sycl::event());
}
}
}
}
};

} // namespace tree
} // namespace sycl
} // namespace xgboost
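The subtraction trick at the core of `SyncHistograms` and `ParallelSubtractionHist`: once the parent's histogram and one explicitly built child are known, the sibling follows bin by bin without another pass over the data. In scalar form (a sketch of the semantics; the plugin performs this on-device via `common::SubtractionHist`):

    #include <cstddef>

    // Recover the sibling histogram from the parent and the explicitly built
    // child; each bin contributes two scalars (grad and hess).
    void SubtractionTrick(const double* parent, const double* built_child,
                          double* sibling, std::size_t nbins) {
      for (std::size_t i = 0; i < 2 * nbins; ++i) {
        sibling[i] = parent[i] - built_child[i];
      }
    }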
