From ccc50d1edb2dcffd3653b640c9da5869fba79e42 Mon Sep 17 00:00:00 2001
From: qijun
Date: Thu, 29 Oct 2020 15:11:17 +0800
Subject: [PATCH 01/15] init

---
 cgotorch/Makefile   |  4 ++--
 cgotorch/gloo.cc    | 25 +++++++++++++++++++++++++
 cgotorch/gloo.h     | 16 ++++++++++++++++
 cgotorch/torchdef.h |  8 +++++++-
 4 files changed, 50 insertions(+), 3 deletions(-)
 create mode 100644 cgotorch/gloo.cc
 create mode 100644 cgotorch/gloo.h

diff --git a/cgotorch/Makefile b/cgotorch/Makefile
index 31fb1a1e..4f60b7a4 100644
--- a/cgotorch/Makefile
+++ b/cgotorch/Makefile
@@ -13,13 +13,13 @@ objs := $(srcs:%.cc=%.o)
 	-O -c $< -o $@

 libcgotorch.so: $(objs) ${LIBTORCH_DIR}
-	${CXX} -L libtorch/lib \
+	${CXX} -v -L libtorch/lib \
 	$(objs) \
 	-shared \
 	-o $@ ${INSTALL_NAME} \
 	-Wl,-rpath,libtorch/lib \
 	-Wl,-${LOAD} libtorch/lib/libc10.${LIB_SUFFIX} \
-	-lc10 -ltorch -ltorch_cpu
+	-lc10 -lgloo -lc10d -ltorch -ltorch_cpu

 clean:
 	rm -rf *.so *.o
diff --git a/cgotorch/gloo.cc b/cgotorch/gloo.cc
new file mode 100644
index 00000000..12b1080e
--- /dev/null
+++ b/cgotorch/gloo.cc
@@ -0,0 +1,25 @@
+// Copyright 2020, GoTorch Authors
+
+#include "cgotorch/gloo.h"
+
+const char *Gloo_NewFileStore(const char *path, int64_t num_workers,
+                              FileStore *store) {
+  try {
+    *store = new c10d::FileStore(std::string(path), num_workers);
+    return nullptr;
+  } catch (const std::exception &e) {
+    return exception_str(e.what());
+  }
+}
+
+const char *Gloo_NewProcessGroupGloo(FileStore *store, int64_t rank,
+                                     int64_t size, ProcessGroupGloo *pg) {
+  try {
+    *pg = new c10d::ProcessGroupGloo(
+        std::shared_ptr<c10d::Store>(static_cast<c10d::Store *>(*store)),
+        rank, size);
+    return nullptr;
+  } catch (const std::exception &e) {
+    return exception_str(e.what());
+  }
+}
diff --git a/cgotorch/gloo.h b/cgotorch/gloo.h
new file mode 100644
index 00000000..b7444670
--- /dev/null
+++ b/cgotorch/gloo.h
@@ -0,0 +1,16 @@
+/* Copyright 2020, GoTorch Authors */
+
+#pragma once
+
+#include "cgotorch/torchdef.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+const char *Gloo_NewFileStore(const char *path, int64_t num_workers,
+                              FileStore *store);
+const char *Gloo_NewProcessGroupGloo(FileStore *store, int64_t rank,
+                                     int64_t size, ProcessGroupGloo *pg);
+#ifdef __cplusplus
+}
+#endif
\ No newline at end of file
diff --git a/cgotorch/torchdef.h b/cgotorch/torchdef.h
index 55b2b037..ef0885ca 100644
--- a/cgotorch/torchdef.h
+++ b/cgotorch/torchdef.h
@@ -6,8 +6,10 @@
 #ifdef __cplusplus
 #include
-
+#include
+#include
 #include
+
 extern "C" {
 typedef at::Tensor *Tensor;
 typedef torch::optim::Optimizer *Optimizer;
@@ -15,6 +17,8 @@ typedef torch::data::datasets::MNIST *MNIST;
 typedef torch::data::transforms::Normalize<> *Normalize;
 typedef torch::Device *Device;
 typedef std::vector *ByteBuffer;  // NOLINT
+typedef c10d::FileStore *FileStore;
+typedef c10d::ProcessGroupGloo *ProcessGroupGloo;
 #else
 typedef void *Tensor;
@@ -22,6 +26,8 @@ typedef void *MNIST;
 typedef void *Normalize;
 typedef void *Device;
 typedef void *ByteBuffer;
+typedef void *FileStore;
+typedef void *ProcessGroupGloo;
 #endif
 typedef void *CUDAStream;

From e99c26fc5cfa8f74049ca71e16fe6f898c7b9d77 Mon Sep 17 00:00:00 2001
From: qijun
Date: Thu, 29 Oct 2020 16:12:42 +0800
Subject: [PATCH 02/15] expose C++ method to Go

---
 cgotorch/Makefile |  2 +-
 cgotorch/gloo.cc  | 19 +++++++++++++++++--
 cgotorch/gloo.h   |  5 ++++-
 gloo.go           | 42 ++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 64 insertions(+), 4 deletions(-)
 create mode 100644 gloo.go

diff --git a/cgotorch/Makefile b/cgotorch/Makefile
index 4f60b7a4..7fc8822d 100644
--- a/cgotorch/Makefile
+++ b/cgotorch/Makefile
@@ -13,7 +13,7 @@ objs := $(srcs:%.cc=%.o)
 	-O -c $< -o $@

 libcgotorch.so: $(objs) ${LIBTORCH_DIR}
-	${CXX} -v -L libtorch/lib \
+	${CXX} -L libtorch/lib \
 	$(objs) \
 	-shared \
 	-o $@ ${INSTALL_NAME} \
diff --git a/cgotorch/gloo.cc b/cgotorch/gloo.cc
index 12b1080e..f341601d 100644
--- a/cgotorch/gloo.cc
+++ b/cgotorch/gloo.cc
@@ -12,14 +12,29 @@ const char *Gloo_NewFileStore(const char *path, int64_t num_workers,
   }
 }

-const char *Gloo_NewProcessGroupGloo(FileStore *store, int64_t rank,
+const char *Gloo_NewProcessGroupGloo(FileStore store, int64_t rank,
                                      int64_t size, ProcessGroupGloo *pg) {
   try {
     *pg = new c10d::ProcessGroupGloo(
-        std::shared_ptr<c10d::Store>(static_cast<c10d::Store *>(*store)),
+        std::shared_ptr<c10d::Store>(static_cast<c10d::Store *>(store)),
         rank, size);
     return nullptr;
   } catch (const std::exception &e) {
     return exception_str(e.what());
   }
 }
+
+const char *Gloo_allreduce(ProcessGroupGloo pg, Tensor *tensors,
+                           int64_t length) {
+  try {
+    std::vector<at::Tensor> ts;
+    while (ts.size() < length) {
+      ts.push_back(**tensors++);
+    }
+    auto work = static_cast<c10d::ProcessGroupGloo *>(pg)->allreduce(ts);
+    work->wait();
+    return nullptr;
+  } catch (const std::exception &e) {
+    return exception_str(e.what());
+  }
+}
diff --git a/cgotorch/gloo.h b/cgotorch/gloo.h
index b7444670..c294af44 100644
--- a/cgotorch/gloo.h
+++ b/cgotorch/gloo.h
@@ -9,8 +9,11 @@ extern "C" {
 #endif
 const char *Gloo_NewFileStore(const char *path, int64_t num_workers,
                               FileStore *store);
-const char *Gloo_NewProcessGroupGloo(FileStore *store, int64_t rank,
+const char *Gloo_NewProcessGroupGloo(FileStore store, int64_t rank,
                                      int64_t size, ProcessGroupGloo *pg);
+
+const char *Gloo_allreduce(ProcessGroupGloo pg, Tensor *tensors,
+                           int64_t length);
 #ifdef __cplusplus
 }
 #endif
\ No newline at end of file
diff --git a/gloo.go b/gloo.go
new file mode 100644
index 00000000..cf5d4556
--- /dev/null
+++ b/gloo.go
@@ -0,0 +1,42 @@
+package gotorch
+
+// #cgo CFLAGS: -I ${SRCDIR}
+// #cgo LDFLAGS: -L ${SRCDIR}/cgotorch -Wl,-rpath ${SRCDIR}/cgotorch -lcgotorch
+// #cgo LDFLAGS: -L ${SRCDIR}/cgotorch/libtorch/lib -Wl,-rpath ${SRCDIR}/cgotorch/libtorch/lib -lc10 -lgloo -lc10d -ltorch -ltorch_cpu
+// #include "cgotorch/cgotorch.h"
+import "C"
+import "unsafe"
+
+// FileStore struct
+type FileStore struct {
+	FS *C.FileStore
+}
+
+// NewFileStore creates a FileStore instance
+func NewFileStore(path string, size int64) FileStore {
+	var t C.FileStore
+	MustNil(unsafe.Pointer(C.Gloo_NewFileStore(C.CString(path), C.int64_t(size), &t)))
+	return FileStore{&t}
+}
+
+// ProcessGroupGloo struct
+type ProcessGroupGloo struct {
+	PGG *C.ProcessGroupGloo
+}
+
+// NewProcessGroupGloo creates a ProcessGroupGloo instance
+func NewProcessGroupGloo(fs FileStore, rank, size int64) ProcessGroupGloo {
+	var t C.ProcessGroupGloo
+	MustNil(unsafe.Pointer(C.Gloo_NewProcessGroupGloo(*fs.FS, C.int64_t(rank), C.int64_t(size), &t)))
+	return ProcessGroupGloo{&t}
+}
+
+// AllReduce sum tensors
+func (ProcessGroupGloo pg) AllReduce(tensors []Tensor) {
+	CT := []C.Tensor{}
+	for _, t := range tensors {
+		CT = append(CT, C.Tensor(*t.T))
+	}
+	p := (*C.Tensor)(unsafe.Pointer(&CT[0]))
+	MustNil(unsafe.Pointer(C.Gloo_allreduce(*pg.PGG, p, C.int64_t(len(CT)))))
+}

From e4f38aae43f4e6f01ad20220c4e85a8243032a86 Mon Sep 17 00:00:00 2001
From: qijun
Date: Thu, 29 Oct 2020 16:24:20 +0800
Subject: [PATCH 03/15] fix build

---
 cgotorch/cgotorch.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/cgotorch/cgotorch.h b/cgotorch/cgotorch.h
index e1fe6e1f..0d3adaa3 100644
--- a/cgotorch/cgotorch.h
+++ b/cgotorch/cgotorch.h
@@ -3,6 +3,7 @@
 #include "cgotorch/cuda.h"
 #include "cgotorch/device.h"
 #include "cgotorch/functional.h"
+#include "cgotorch/gloo.h"
 #include "cgotorch/init.h"
 #include "cgotorch/memory.h"
 #include "cgotorch/optim.h"

From fa63e8b83bb9c6ea3cd78f3aac546c9e0728a3cd Mon Sep 17 00:00:00 2001
From: qijun
Date: Mon, 2 Nov 2020 09:21:48 +0800
Subject: [PATCH 04/15] add unit test for gloo allreduce

---
 cgotorch/Makefile |  6 ++++--
 cgotorch/gloo.cc  |  5 ++++-
 gloo.go           |  2 +-
 gloo_test.go      | 36 ++++++++++++++++++++++++++++++++++++
 4 files changed, 45 insertions(+), 4 deletions(-)
 create mode 100644 gloo_test.go

diff --git a/cgotorch/Makefile b/cgotorch/Makefile
index 7fc8822d..447a0331 100644
--- a/cgotorch/Makefile
+++ b/cgotorch/Makefile
@@ -13,13 +13,15 @@ objs := $(srcs:%.cc=%.o)
 	-O -c $< -o $@

 libcgotorch.so: $(objs) ${LIBTORCH_DIR}
-	${CXX} -L libtorch/lib \
+	${CXX} -v -L libtorch/lib \
 	$(objs) \
 	-shared \
 	-o $@ ${INSTALL_NAME} \
+	-lc10d -lgloo \
 	-Wl,-rpath,libtorch/lib \
 	-Wl,-${LOAD} libtorch/lib/libc10.${LIB_SUFFIX} \
-	-lc10 -lgloo -lc10d -ltorch -ltorch_cpu
+	-lc10 -ltorch -ltorch_cpu \
+	`pkg-config --cflags --libs libuv`

 clean:
 	rm -rf *.so *.o
diff --git a/cgotorch/gloo.cc b/cgotorch/gloo.cc
index f341601d..69c29efe 100644
--- a/cgotorch/gloo.cc
+++ b/cgotorch/gloo.cc
@@ -15,9 +15,12 @@ const char *Gloo_NewFileStore(const char *path, int64_t num_workers,
 const char *Gloo_NewProcessGroupGloo(FileStore store, int64_t rank,
                                      int64_t size, ProcessGroupGloo *pg) {
   try {
+    auto d = c10d::ProcessGroupGloo::createDefaultDevice();
+    auto opt = c10d::ProcessGroupGloo::Options();
+    opt.devices.push_back(d);
     *pg = new c10d::ProcessGroupGloo(
         std::shared_ptr<c10d::Store>(static_cast<c10d::Store *>(store)),
-        rank, size);
+        rank, size, opt);
     return nullptr;
   } catch (const std::exception &e) {
     return exception_str(e.what());
diff --git a/gloo.go b/gloo.go
index cf5d4556..ee7d5607 100644
--- a/gloo.go
+++ b/gloo.go
@@ -32,7 +32,7 @@ func NewProcessGroupGloo(fs FileStore, rank, size int64) ProcessGroupGloo {
 }

 // AllReduce sum tensors
-func (ProcessGroupGloo pg) AllReduce(tensors []Tensor) {
+func (pg ProcessGroupGloo) AllReduce(tensors []Tensor) {
 	CT := []C.Tensor{}
 	for _, t := range tensors {
 		CT = append(CT, C.Tensor(*t.T))
diff --git a/gloo_test.go b/gloo_test.go
new file mode 100644
index 00000000..75037777
--- /dev/null
+++ b/gloo_test.go
@@ -0,0 +1,36 @@
+package gotorch
+
+import (
+	"io/ioutil"
+	"os"
+	"sync"
+	"testing"
+
+	"github.com/stretchr/testify/assert"
+)
+
+func allreduce(rank, size int64, a Tensor, f *os.File, wg *sync.WaitGroup) {
+	defer wg.Done()
+
+	fs := NewFileStore(f.Name(), size)
+	pg := NewProcessGroupGloo(fs, rank, size)
+	pg.AllReduce([]Tensor{a})
+}
+
+func TestGlooAllReduce(t *testing.T) {
+	f, _ := ioutil.TempFile("", "sample")
+	defer os.Remove(f.Name())
+
+	a := NewTensor([][]float32{{1, 2}, {3, 4}})
+	b := NewTensor([][]float32{{4, 3}, {2, 1}})
+	wg := sync.WaitGroup{}
+	wg.Add(2)
+
+	go allreduce(0, 2, a, f, &wg)
+	go allreduce(1, 2, b, f, &wg)
+
+	wg.Wait()
+
+	assert.Equal(t, " 5 5\n 5 5\n[ CPUFloatType{2,2} ]", a.String())
+	assert.Equal(t, " 5 5\n 5 5\n[ CPUFloatType{2,2} ]", b.String())
+}

From 7fa3ed0319a421aef304c251b89d249ec54abb61 Mon Sep 17 00:00:00 2001
From: qijun
Date: Mon, 2 Nov 2020 09:23:17 +0800
Subject: [PATCH 05/15] update travis

---
 .travis.yml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.travis.yml b/.travis.yml
index add9ec93..83486413 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -22,6 +22,7 @@ addons:
     packages:
       - opencv
      - pkgconfig
+      - libuv # to use gloo in macOS, we install libuv
     update: true
 branches:

From 5c56ac89e550132a04d87323135dd333519430f0 Mon Sep 17 00:00:00 2001
From: qijun
Date: Mon, 2 Nov 2020 09:24:59 +0800
Subject: [PATCH 06/15] clean code

---
 cgotorch/Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cgotorch/Makefile b/cgotorch/Makefile
index 447a0331..e3a3f5c9 100644
--- a/cgotorch/Makefile
+++ b/cgotorch/Makefile
@@ -13,7 +13,7 @@ objs := $(srcs:%.cc=%.o)
 	-O -c $< -o $@

 libcgotorch.so: $(objs) ${LIBTORCH_DIR}
-	${CXX} -v -L libtorch/lib \
+	${CXX} -L libtorch/lib \
 	$(objs) \
 	-shared \
 	-o $@ ${INSTALL_NAME} \

From d94e949574f24a572adfa8b8eb18c3fff41f82ea Mon Sep 17 00:00:00 2001
From: qijun
Date: Mon, 2 Nov 2020 10:31:59 +0800
Subject: [PATCH 07/15] install gloo

---
 .circleci/Dockerfile      | 5 +++++
 .circleci/install_gloo.sh | 6 ++++++
 cgotorch/Makefile         | 2 +-
 cgotorch/build.sh         | 5 ++++-
 4 files changed, 16 insertions(+), 2 deletions(-)
 create mode 100755 .circleci/install_gloo.sh

diff --git a/.circleci/Dockerfile b/.circleci/Dockerfile
index 2eb6021f..41ac2a25 100644
--- a/.circleci/Dockerfile
+++ b/.circleci/Dockerfile
@@ -10,6 +10,11 @@ RUN sudo python3 -m pip install -qq pre-commit cpplint
 RUN sudo gem install mdl
 RUN go get golang.org/x/lint/golint
 RUN sudo cp $GOPATH/bin/* /usr/local/bin/
+
 # install gocv
 RUN go get -u -d gocv.io/x/gocv
 RUN cd $GOPATH/src/gocv.io/x/gocv && make install
+
+# install gloo
+COPY install_gloo.sh /
+RUN sudo /install_gloo.sh && sudo rm /install_gloo.sh
diff --git a/.circleci/install_gloo.sh b/.circleci/install_gloo.sh
new file mode 100755
index 00000000..c1d28fb7
--- /dev/null
+++ b/.circleci/install_gloo.sh
@@ -0,0 +1,6 @@
+git clone https://github.com/facebookincubator/gloo.git
+cd gloo
+mkdir -p build
+cd build
+cmake ..
+make && make install
diff --git a/cgotorch/Makefile b/cgotorch/Makefile
index e3a3f5c9..a518d160 100644
--- a/cgotorch/Makefile
+++ b/cgotorch/Makefile
@@ -21,7 +21,7 @@ libcgotorch.so: $(objs) ${LIBTORCH_DIR}
 	-Wl,-rpath,libtorch/lib \
 	-Wl,-${LOAD} libtorch/lib/libc10.${LIB_SUFFIX} \
 	-lc10 -ltorch -ltorch_cpu \
-	`pkg-config --cflags --libs libuv`
+	${DEPS}

 clean:
 	rm -rf *.so *.o
diff --git a/cgotorch/build.sh b/cgotorch/build.sh
index e74671f0..9680105b 100755
--- a/cgotorch/build.sh
+++ b/cgotorch/build.sh
@@ -13,6 +13,7 @@ LOAD="force_load"
 LIB_SUFFIX="so"
 INSTALL_NAME=""
 CUDA_FLAGS=""
+DEPS=""

 function build_linux_no_cuda() {
   CXX="clang++"
@@ -67,6 +68,7 @@ elif [[ "$OS" == "darwin" ]]; then
   LIB_SUFFIX="dylib"
   INSTALL_NAME="-install_name @rpath/\$@"
   LOAD="all_load"
+  DEPS=`pkg-config --cflags --libs libuv`
-d "$DIR/$LIBTORCH_DIR" ]]; then curl -LsO https://download.pytorch.org/libtorch/cpu/libtorch-macos-1.6.0.zip unzip -qq -o libtorch-macos-1.6.0.zip -d macos @@ -84,5 +86,6 @@ make CXX="$CXX" \ GLIBCXX_USE_CXX11_ABI="$GLIBCXX_USE_CXX11_ABI" \ LOAD="$LOAD" \ CUDA_FLAGS="$CUDA_FLAGS" \ - -f Makefile -j + DEPS="$DEPS" \ + -f Makefile -j `nproc` popd From 07ab3cc325a539289af43a3de94e9fe005f7aa27 Mon Sep 17 00:00:00 2001 From: qijun Date: Mon, 2 Nov 2020 10:40:42 +0800 Subject: [PATCH 08/15] format code --- cgotorch/gloo.cc | 3 +++ cgotorch/gloo.h | 2 +- cgotorch/torchdef.h | 2 +- 3 files changed, 5 insertions(+), 2 deletions(-) diff --git a/cgotorch/gloo.cc b/cgotorch/gloo.cc index 69c29efe..f8b30ecb 100644 --- a/cgotorch/gloo.cc +++ b/cgotorch/gloo.cc @@ -1,6 +1,9 @@ // Copyright 2020, GoTorch Authors #include "cgotorch/gloo.h" +#include +#include +#include const char *Gloo_NewFileStore(const char *path, int64_t num_workers, FileStore *store) { diff --git a/cgotorch/gloo.h b/cgotorch/gloo.h index c294af44..18fc2403 100644 --- a/cgotorch/gloo.h +++ b/cgotorch/gloo.h @@ -16,4 +16,4 @@ const char *Gloo_allreduce(ProcessGroupGloo pg, Tensor *tensors, int64_t length); #ifdef __cplusplus } -#endif \ No newline at end of file +#endif diff --git a/cgotorch/torchdef.h b/cgotorch/torchdef.h index ef0885ca..06b380cb 100644 --- a/cgotorch/torchdef.h +++ b/cgotorch/torchdef.h @@ -8,8 +8,8 @@ #include #include #include -#include +#include // NOLINT extern "C" { typedef at::Tensor *Tensor; typedef torch::optim::Optimizer *Optimizer; From 2be8a1061fda2af0ad473db8597d2ee7698f710e Mon Sep 17 00:00:00 2001 From: qijun Date: Mon, 2 Nov 2020 11:12:11 +0800 Subject: [PATCH 09/15] port allreduce_coalesced and broadcast --- cgotorch/build.sh | 2 +- cgotorch/gloo.cc | 31 ++++++++++++++++++ cgotorch/gloo.h | 7 +++++ gloo.go | 24 +++++++++++++- gloo_test.go | 80 +++++++++++++++++++++++++++++++++++++++++------ 5 files changed, 132 insertions(+), 12 deletions(-) diff --git a/cgotorch/build.sh b/cgotorch/build.sh index 9680105b..470dfa3d 100755 --- a/cgotorch/build.sh +++ b/cgotorch/build.sh @@ -87,5 +87,5 @@ make CXX="$CXX" \ LOAD="$LOAD" \ CUDA_FLAGS="$CUDA_FLAGS" \ DEPS="$DEPS" \ - -f Makefile -j `nproc` + -f Makefile -j `getconf _NPROCESSORS_ONLN` popd diff --git a/cgotorch/gloo.cc b/cgotorch/gloo.cc index f8b30ecb..ccffdfb3 100644 --- a/cgotorch/gloo.cc +++ b/cgotorch/gloo.cc @@ -44,3 +44,34 @@ const char *Gloo_allreduce(ProcessGroupGloo pg, Tensor *tensors, return exception_str(e.what()); } } + +const char *Gloo_allreduce_coalesced(ProcessGroupGloo pg, Tensor *tensors, + int64_t length) { + try { + std::vector ts; + while (ts.size() < length) { + ts.push_back(**tensors++); + } + auto work = + static_cast(pg)->allreduce_coalesced(ts); + work->wait(); + return nullptr; + } catch (const std::exception &e) { + return exception_str(e.what()); + } +} + +const char *Gloo_broadcast(ProcessGroupGloo pg, Tensor *tensors, + int64_t length) { + try { + std::vector ts; + while (ts.size() < length) { + ts.push_back(**tensors++); + } + auto work = static_cast(pg)->broadcast(ts); + work->wait(); + return nullptr; + } catch (const std::exception &e) { + return exception_str(e.what()); + } +} diff --git a/cgotorch/gloo.h b/cgotorch/gloo.h index 18fc2403..e2b84f6d 100644 --- a/cgotorch/gloo.h +++ b/cgotorch/gloo.h @@ -14,6 +14,13 @@ const char *Gloo_NewProcessGroupGloo(FileStore store, int64_t rank, const char *Gloo_allreduce(ProcessGroupGloo pg, Tensor *tensors, int64_t length); + +const char *Gloo_allreduce_coalesced(ProcessGroupGloo 
+                                     int64_t length);
+
+const char *Gloo_broadcast(ProcessGroupGloo pg, Tensor *tensors,
+                           int64_t length);
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/gloo.go b/gloo.go
index ee7d5607..d3aef5d0 100644
--- a/gloo.go
+++ b/gloo.go
@@ -31,7 +31,7 @@ func NewProcessGroupGloo(fs FileStore, rank, size int64) ProcessGroupGloo {
 	return ProcessGroupGloo{&t}
 }

-// AllReduce sum tensors
+// AllReduce method: todo(qijun) only support sum
 func (pg ProcessGroupGloo) AllReduce(tensors []Tensor) {
 	CT := []C.Tensor{}
 	for _, t := range tensors {
@@ -40,3 +40,25 @@ func (pg ProcessGroupGloo) AllReduce(tensors []Tensor) {
 	p := (*C.Tensor)(unsafe.Pointer(&CT[0]))
 	MustNil(unsafe.Pointer(C.Gloo_allreduce(*pg.PGG, p, C.int64_t(len(CT)))))
 }
+
+// AllReduceCoalesced method: tensors will be flattened and
+// concatenated (coalesced). This means that input tensors
+// must have the same device, layout and type.
+func (pg ProcessGroupGloo) AllReduceCoalesced(tensors []Tensor) {
+	CT := []C.Tensor{}
+	for _, t := range tensors {
+		CT = append(CT, C.Tensor(*t.T))
+	}
+	p := (*C.Tensor)(unsafe.Pointer(&CT[0]))
+	MustNil(unsafe.Pointer(C.Gloo_allreduce_coalesced(*pg.PGG, p, C.int64_t(len(CT)))))
+}
+
+// Broadcast method
+func (pg ProcessGroupGloo) Broadcast(tensors []Tensor) {
+	CT := []C.Tensor{}
+	for _, t := range tensors {
+		CT = append(CT, C.Tensor(*t.T))
+	}
+	p := (*C.Tensor)(unsafe.Pointer(&CT[0]))
+	MustNil(unsafe.Pointer(C.Gloo_broadcast(*pg.PGG, p, C.int64_t(len(CT)))))
+}
diff --git a/gloo_test.go b/gloo_test.go
index 75037777..eac1fc16 100644
--- a/gloo_test.go
+++ b/gloo_test.go
@@ -9,28 +9,88 @@ import (
 	"github.com/stretchr/testify/assert"
 )

-func allreduce(rank, size int64, a Tensor, f *os.File, wg *sync.WaitGroup) {
-	defer wg.Done()
-
-	fs := NewFileStore(f.Name(), size)
-	pg := NewProcessGroupGloo(fs, rank, size)
-	pg.AllReduce([]Tensor{a})
-}
-
 func TestGlooAllReduce(t *testing.T) {
 	f, _ := ioutil.TempFile("", "sample")
 	defer os.Remove(f.Name())

 	a := NewTensor([][]float32{{1, 2}, {3, 4}})
 	b := NewTensor([][]float32{{4, 3}, {2, 1}})
+
+	ts := []Tensor{a, b}
 	wg := sync.WaitGroup{}
 	wg.Add(2)

-	go allreduce(0, 2, a, f, &wg)
-	go allreduce(1, 2, b, f, &wg)
+	for i := 0; i < 2; i++ {
+		go func(rank int64, a Tensor) {
+			defer wg.Done()
+			fs := NewFileStore(f.Name(), 2)
+			pg := NewProcessGroupGloo(fs, rank, 2)
+			pg.AllReduce([]Tensor{a})
+		}(int64(i), ts[i])
+	}

 	wg.Wait()

 	assert.Equal(t, " 5 5\n 5 5\n[ CPUFloatType{2,2} ]", a.String())
 	assert.Equal(t, " 5 5\n 5 5\n[ CPUFloatType{2,2} ]", b.String())
 }
+
+func TestGlooAllReduceCoalesced(t *testing.T) {
+	f, _ := ioutil.TempFile("", "sample")
+	defer os.Remove(f.Name())
+
+	a1 := NewTensor([][]float32{{1, 2}})
+	a2 := NewTensor([][]float32{{3, 4}})
+	a := []Tensor{a1, a2}
+
+	b1 := NewTensor([][]float32{{4, 3}})
+	b2 := NewTensor([][]float32{{2, 1}})
+	b := []Tensor{b1, b2}
+
+	ts := [][]Tensor{a, b}
+
+	wg := sync.WaitGroup{}
+	wg.Add(2)
+
+	for i := 0; i < 2; i++ {
+		go func(rank int64, a []Tensor) {
+			defer wg.Done()
+			fs := NewFileStore(f.Name(), 2)
+			pg := NewProcessGroupGloo(fs, rank, 2)
+			pg.AllReduceCoalesced(a)
+		}(int64(i), ts[i])
+	}
+
+	wg.Wait()
+
+	assert.Equal(t, " 5 5\n[ CPUFloatType{1,2} ]", a1.String())
+	assert.Equal(t, " 5 5\n[ CPUFloatType{1,2} ]", b1.String())
+	assert.Equal(t, " 5 5\n[ CPUFloatType{1,2} ]", a2.String())
+	assert.Equal(t, " 5 5\n[ CPUFloatType{1,2} ]", b2.String())
+}
+
+func TestGlooBroadcast(t *testing.T) {
+	f, _ := ioutil.TempFile("", "sample")
+	defer os.Remove(f.Name())
+
+	a := NewTensor([][]float32{{1, 2}, {3, 4}})
+	b := NewTensor([][]float32{{4, 3}, {2, 1}})
+
+	ts := []Tensor{a, b}
+	wg := sync.WaitGroup{}
+	wg.Add(2)
+
+	for i := 0; i < 2; i++ {
+		go func(rank int64, a Tensor) {
+			defer wg.Done()
+			fs := NewFileStore(f.Name(), 2)
+			pg := NewProcessGroupGloo(fs, rank, 2)
+			pg.Broadcast([]Tensor{a})
+		}(int64(i), ts[i])
+	}
+
+	wg.Wait()
+
+	assert.Equal(t, " 1 2\n 3 4\n[ CPUFloatType{2,2} ]", a.String())
+	assert.Equal(t, " 1 2\n 3 4\n[ CPUFloatType{2,2} ]", b.String())
+}

From 26db4340d978acdbe8d50a18ae93ccd0cc58ad0f Mon Sep 17 00:00:00 2001
From: qijun
Date: Mon, 2 Nov 2020 14:02:25 +0800
Subject: [PATCH 10/15] port TCPStore

---
 cgotorch/gloo.cc    | 22 +++++++++++++++++-----
 cgotorch/gloo.h     | 11 ++++++++---
 cgotorch/torchdef.h |  5 +++++
 gloo.go             | 27 +++++++++++++++++++--------
 gloo_test.go        | 23 +++++++++++++++++++++++
 5 files changed, 72 insertions(+), 16 deletions(-)

diff --git a/cgotorch/gloo.cc b/cgotorch/gloo.cc
index ccffdfb3..d8406807 100644
--- a/cgotorch/gloo.cc
+++ b/cgotorch/gloo.cc
@@ -6,7 +6,7 @@
 #include

 const char *Gloo_NewFileStore(const char *path, int64_t num_workers,
-                              FileStore *store) {
+                              Store *store) {
   try {
     *store = new c10d::FileStore(std::string(path), num_workers);
     return nullptr;
@@ -15,15 +15,27 @@ const char *Gloo_NewFileStore(const char *path, int64_t num_workers,
   }
 }

-const char *Gloo_NewProcessGroupGloo(FileStore store, int64_t rank,
-                                     int64_t size, ProcessGroupGloo *pg) {
+const char *Gloo_NewTCPStore(const char *addr, int64_t port,
+                             int64_t num_workers, int64_t is_server,
+                             Store *store) {
+  try {
+    *store =
+        new c10d::TCPStore(std::string(addr), port, num_workers, is_server);
+    return nullptr;
+  } catch (const std::exception &e) {
+    return exception_str(e.what());
+  }
+}
+
+const char *Gloo_NewProcessGroupGloo(Store store, int64_t rank, int64_t size,
+                                     ProcessGroupGloo *pg) {
   try {
     auto d = c10d::ProcessGroupGloo::createDefaultDevice();
     auto opt = c10d::ProcessGroupGloo::Options();
     opt.devices.push_back(d);
     *pg = new c10d::ProcessGroupGloo(
-        std::shared_ptr<c10d::Store>(static_cast<c10d::Store *>(store)),
-        rank, size, opt);
+        std::shared_ptr<c10d::Store>(static_cast<c10d::Store *>(store)), rank,
+        size, opt);
     return nullptr;
   } catch (const std::exception &e) {
     return exception_str(e.what());
diff --git a/cgotorch/gloo.h b/cgotorch/gloo.h
index e2b84f6d..e6acae86 100644
--- a/cgotorch/gloo.h
+++ b/cgotorch/gloo.h
@@ -8,9 +8,14 @@ extern "C" {
 #endif
 const char *Gloo_NewFileStore(const char *path, int64_t num_workers,
-                              FileStore *store);
-const char *Gloo_NewProcessGroupGloo(FileStore store, int64_t rank,
-                                     int64_t size, ProcessGroupGloo *pg);
+                              Store *store);
+
+const char *Gloo_NewTCPStore(const char *addr, int64_t port,
+                             int64_t num_workers, int64_t is_server,
+                             Store *store);
+
+const char *Gloo_NewProcessGroupGloo(Store store, int64_t rank, int64_t size,
+                                     ProcessGroupGloo *pg);

 const char *Gloo_allreduce(ProcessGroupGloo pg, Tensor *tensors,
                            int64_t length);
diff --git a/cgotorch/torchdef.h b/cgotorch/torchdef.h
index 06b380cb..d17dd7b3 100644
--- a/cgotorch/torchdef.h
+++ b/cgotorch/torchdef.h
@@ -8,6 +8,7 @@
 #include
 #include
 #include
+#include
 #include  // NOLINT
 extern "C" {
 typedef at::Tensor *Tensor;
 typedef torch::optim::Optimizer *Optimizer;
 typedef torch::data::datasets::MNIST *MNIST;
 typedef torch::data::transforms::Normalize<> *Normalize;
 typedef torch::Device *Device;
 typedef std::vector *ByteBuffer;  // NOLINT
+typedef c10d::Store *Store;
 typedef c10d::FileStore *FileStore;
+typedef c10d::TCPStore *TCPStore;
 typedef c10d::ProcessGroupGloo *ProcessGroupGloo;
 #else
 typedef void *MNIST;
 typedef void *Normalize;
 typedef void *Device;
 typedef void *ByteBuffer;
+typedef void *Store;
 typedef void *FileStore;
+typedef void *TCPStore;
 typedef void *ProcessGroupGloo;
 #endif
 typedef void *CUDAStream;
diff --git a/gloo.go b/gloo.go
index d3aef5d0..9377f133 100644
--- a/gloo.go
+++ b/gloo.go
@@ -7,16 +7,27 @@ package gotorch
 import "C"
 import "unsafe"

-// FileStore struct
-type FileStore struct {
-	FS *C.FileStore
+// Store struct
+type Store struct {
+	Store *C.Store
 }

 // NewFileStore creates a FileStore instance
-func NewFileStore(path string, size int64) FileStore {
-	var t C.FileStore
+func NewFileStore(path string, size int64) Store {
+	var t C.Store
 	MustNil(unsafe.Pointer(C.Gloo_NewFileStore(C.CString(path), C.int64_t(size), &t)))
-	return FileStore{&t}
+	return Store{&t}
+}
+
+// NewTCPStore creates a TCPStore instance
+func NewTCPStore(addr string, port, size int64, isServer bool) Store {
+	is := 0
+	if isServer {
+		is = 1
+	}
+	var t C.Store
+	MustNil(unsafe.Pointer(C.Gloo_NewTCPStore(C.CString(addr), C.int64_t(port), C.int64_t(size), C.int64_t(is), &t)))
+	return Store{&t}
 }

 // ProcessGroupGloo struct
@@ -25,9 +36,9 @@ type ProcessGroupGloo struct {
 }

 // NewProcessGroupGloo creates a ProcessGroupGloo instance
-func NewProcessGroupGloo(fs FileStore, rank, size int64) ProcessGroupGloo {
+func NewProcessGroupGloo(s Store, rank, size int64) ProcessGroupGloo {
 	var t C.ProcessGroupGloo
-	MustNil(unsafe.Pointer(C.Gloo_NewProcessGroupGloo(*fs.FS, C.int64_t(rank), C.int64_t(size), &t)))
+	MustNil(unsafe.Pointer(C.Gloo_NewProcessGroupGloo(*s.Store, C.int64_t(rank), C.int64_t(size), &t)))
 	return ProcessGroupGloo{&t}
 }
diff --git a/gloo_test.go b/gloo_test.go
index eac1fc16..669468c6 100644
--- a/gloo_test.go
+++ b/gloo_test.go
@@ -35,6 +35,29 @@ func TestGlooAllReduce(t *testing.T) {
 	assert.Equal(t, " 5 5\n 5 5\n[ CPUFloatType{2,2} ]", b.String())
 }

+func TestGlooAllReduceWithTCPStore(t *testing.T) {
+	a := NewTensor([][]float32{{1, 2}, {3, 4}})
+	b := NewTensor([][]float32{{4, 3}, {2, 1}})
+
+	ts := []Tensor{a, b}
+	wg := sync.WaitGroup{}
+	wg.Add(2)
+
+	for i := 0; i < 2; i++ {
+		go func(rank int64, a Tensor) {
+			defer wg.Done()
+			ts := NewTCPStore("127.0.0.1", 11111, 2, rank == 0)
+			pg := NewProcessGroupGloo(ts, rank, 2)
+			pg.AllReduce([]Tensor{a})
+		}(int64(i), ts[i])
+	}
+
+	wg.Wait()
+
+	assert.Equal(t, " 5 5\n 5 5\n[ CPUFloatType{2,2} ]", a.String())
+	assert.Equal(t, " 5 5\n 5 5\n[ CPUFloatType{2,2} ]", b.String())
+}
+
 func TestGlooAllReduceCoalesced(t *testing.T) {
 	f, _ := ioutil.TempFile("", "sample")
 	defer os.Remove(f.Name())

From 43b822f43b052dc3c65d32dbdeb87655aa26985a Mon Sep 17 00:00:00 2001
From: qijun
Date: Tue, 3 Nov 2020 13:53:31 +0800
Subject: [PATCH 11/15] add Close method

---
 cgotorch/gloo.cc    | 46 ++++++++++++++++++++++++++++++++------------
 cgotorch/gloo.h     |  4 ++++
 cgotorch/torchdef.h | 12 ++++--------
 gloo.go             | 12 ++++++++++++
 gloo_test.go        |  8 ++++++++
 5 files changed, 61 insertions(+), 21 deletions(-)

diff --git a/cgotorch/gloo.cc b/cgotorch/gloo.cc
index d8406807..b3c839b1 100644
--- a/cgotorch/gloo.cc
+++ b/cgotorch/gloo.cc
@@ -1,14 +1,19 @@
 // Copyright 2020, GoTorch Authors

 #include "cgotorch/gloo.h"
+
+#include
+#include
+
+#include  // NOLINT
+#include  // NOLINT
 #include  // NOLINT

 const char *Gloo_NewFileStore(const char *path, int64_t num_workers,
                               Store *store) {
   try {
-    *store = new c10d::FileStore(std::string(path), num_workers);
+    *store = new std::shared_ptr<c10d::Store>(
+        new c10d::FileStore(std::string(path), num_workers));
     return nullptr;
   } catch (const std::exception &e) {
     return exception_str(e.what());
@@ -24,8 +29,17 @@ const char *Gloo_NewTCPStore(const char *addr, int64_t port,
                              int64_t num_workers, int64_t is_server,
                              Store *store) {
   try {
-    *store =
-        new c10d::TCPStore(std::string(addr), port, num_workers, is_server);
+    *store = new std::shared_ptr<c10d::Store>(
+        new c10d::TCPStore(std::string(addr), port, num_workers, is_server));
+    return nullptr;
+  } catch (const std::exception &e) {
+    return exception_str(e.what());
+  }
+}
+
+const char *Gloo_DeleteStore(Store store) {
+  try {
+    store->reset();
     return nullptr;
   } catch (const std::exception &e) {
     return exception_str(e.what());
@@ -33,9 +47,16 @@ const char *Gloo_NewProcessGroupGloo(Store store, int64_t rank, int64_t size,
     auto d = c10d::ProcessGroupGloo::createDefaultDevice();
     auto opt = c10d::ProcessGroupGloo::Options();
     opt.devices.push_back(d);
-    *pg = new c10d::ProcessGroupGloo(
-        std::shared_ptr<c10d::Store>(static_cast<c10d::Store *>(store)), rank,
-        size, opt);
+    *pg = new c10d::ProcessGroupGloo(*store, rank, size, opt);
+    return nullptr;
+  } catch (const std::exception &e) {
+    return exception_str(e.what());
+  }
+}
+
+const char *Gloo_DeleteProcessGroupGloo(ProcessGroupGloo pg) {
+  try {
+    delete pg;
     return nullptr;
   } catch (const std::exception &e) {
     return exception_str(e.what());
@@ -49,7 +70,7 @@ const char *Gloo_allreduce(ProcessGroupGloo pg, Tensor *tensors,
     while (ts.size() < length) {
       ts.push_back(**tensors++);
     }
-    auto work = static_cast<c10d::ProcessGroupGloo *>(pg)->allreduce(ts);
+    auto work = pg->allreduce(ts);
     work->wait();
     return nullptr;
   } catch (const std::exception &e) {
@@ -64,8 +85,7 @@ const char *Gloo_allreduce_coalesced(ProcessGroupGloo pg, Tensor *tensors,
     while (ts.size() < length) {
       ts.push_back(**tensors++);
     }
-    auto work =
-        static_cast<c10d::ProcessGroupGloo *>(pg)->allreduce_coalesced(ts);
+    auto work = pg->allreduce_coalesced(ts);
     work->wait();
     return nullptr;
   } catch (const std::exception &e) {
@@ -80,7 +100,7 @@ const char *Gloo_broadcast(ProcessGroupGloo pg, Tensor *tensors,
     while (ts.size() < length) {
       ts.push_back(**tensors++);
     }
-    auto work = static_cast<c10d::ProcessGroupGloo *>(pg)->broadcast(ts);
+    auto work = pg->broadcast(ts);
     work->wait();
     return nullptr;
   } catch (const std::exception &e) {
diff --git a/cgotorch/gloo.h b/cgotorch/gloo.h
index e6acae86..63ac56f9 100644
--- a/cgotorch/gloo.h
+++ b/cgotorch/gloo.h
@@ -14,9 +14,13 @@ const char *Gloo_NewTCPStore(const char *addr, int64_t port,
                              int64_t num_workers, int64_t is_server,
                              Store *store);

+const char *Gloo_DeleteStore(Store store);
+
 const char *Gloo_NewProcessGroupGloo(Store store, int64_t rank, int64_t size,
                                      ProcessGroupGloo *pg);

+const char *Gloo_DeleteProcessGroupGloo(ProcessGroupGloo pg);
+
 const char *Gloo_allreduce(ProcessGroupGloo pg, Tensor *tensors,
                            int64_t length);
diff --git a/cgotorch/torchdef.h b/cgotorch/torchdef.h
index d17dd7b3..f8685da6 100644
--- a/cgotorch/torchdef.h
+++ b/cgotorch/torchdef.h
@@ -6,10 +6,10 @@
 #ifdef __cplusplus
 #include
-#include
 #include
-#include
+#include
+#include  // NOLINT
 #include  // NOLINT
 extern "C" {
 typedef at::Tensor *Tensor;
 typedef torch::optim::Optimizer *Optimizer;
 typedef torch::data::datasets::MNIST *MNIST;
 typedef torch::data::transforms::Normalize<> *Normalize;
 typedef torch::Device *Device;
-typedef std::vector *ByteBuffer;  // NOLINT
-typedef c10d::Store *Store;
-typedef c10d::FileStore *FileStore;
-typedef c10d::TCPStore *TCPStore;
+typedef std::vector *ByteBuffer;         // NOLINT
+typedef std::shared_ptr<c10d::Store> *Store;  // NOLINT
 typedef c10d::ProcessGroupGloo *ProcessGroupGloo;
 #else
 typedef void *Tensor;
 typedef void *MNIST;
 typedef void *Normalize;
 typedef void *Device;
 typedef void *ByteBuffer;
 typedef void *Store;
-typedef void *FileStore;
-typedef void *TCPStore;
 typedef void *ProcessGroupGloo;
 #endif
 typedef void *CUDAStream;
diff --git a/gloo.go b/gloo.go
index 9377f133..d8a084a9 100644
--- a/gloo.go
+++ b/gloo.go
@@ -30,6 +30,12 @@ func NewTCPStore(addr string, port, size int64, isServer bool) Store {
 	return Store{&t}
 }

+// Close a store
+func (s Store) Close() {
+	MustNil(unsafe.Pointer(C.Gloo_DeleteStore(*s.Store)))
+	s.Store = nil
+}
+
 // ProcessGroupGloo struct
 type ProcessGroupGloo struct {
 	PGG *C.ProcessGroupGloo
@@ -42,6 +48,12 @@ func NewProcessGroupGloo(s Store, rank, size int64) ProcessGroupGloo {
 	return ProcessGroupGloo{&t}
 }

+// Close a group
+func (pg ProcessGroupGloo) Close() {
+	MustNil(unsafe.Pointer(C.Gloo_DeleteProcessGroupGloo(*pg.PGG)))
+	pg.PGG = nil
+}
+
 // AllReduce method: todo(qijun) only support sum
 func (pg ProcessGroupGloo) AllReduce(tensors []Tensor) {
 	CT := []C.Tensor{}
diff --git a/gloo_test.go b/gloo_test.go
index 669468c6..f36472f7 100644
--- a/gloo_test.go
+++ b/gloo_test.go
@@ -24,7 +24,9 @@ func TestGlooAllReduce(t *testing.T) {
 		go func(rank int64, a Tensor) {
 			defer wg.Done()
 			fs := NewFileStore(f.Name(), 2)
+			defer fs.Close()
 			pg := NewProcessGroupGloo(fs, rank, 2)
+			defer pg.Close()
 			pg.AllReduce([]Tensor{a})
 		}(int64(i), ts[i])
 	}
@@ -47,7 +49,9 @@ func TestGlooAllReduceWithTCPStore(t *testing.T) {
 		go func(rank int64, a Tensor) {
 			defer wg.Done()
 			ts := NewTCPStore("127.0.0.1", 11111, 2, rank == 0)
+			defer ts.Close()
 			pg := NewProcessGroupGloo(ts, rank, 2)
+			defer pg.Close()
 			pg.AllReduce([]Tensor{a})
 		}(int64(i), ts[i])
 	}
@@ -79,7 +83,9 @@ func TestGlooAllReduceCoalesced(t *testing.T) {
 		go func(rank int64, a []Tensor) {
 			defer wg.Done()
 			fs := NewFileStore(f.Name(), 2)
+			defer fs.Close()
 			pg := NewProcessGroupGloo(fs, rank, 2)
+			defer pg.Close()
 			pg.AllReduceCoalesced(a)
 		}(int64(i), ts[i])
 	}
@@ -107,7 +113,9 @@ func TestGlooBroadcast(t *testing.T) {
 		go func(rank int64, a Tensor) {
 			defer wg.Done()
 			fs := NewFileStore(f.Name(), 2)
+			defer fs.Close()
 			pg := NewProcessGroupGloo(fs, rank, 2)
+			defer pg.Close()
 			pg.Broadcast([]Tensor{a})
 		}(int64(i), ts[i])
 	}

From 11152937fe7c7eff50a5d71cdf6cb3c68bfa86e8 Mon Sep 17 00:00:00 2001
From: qijun
Date: Tue, 3 Nov 2020 15:43:22 +0800
Subject: [PATCH 12/15] refine test

---
 gloo_test.go | 62 ++++++++++++++++++++++++++++++++++------------------
 1 file changed, 41 insertions(+), 21 deletions(-)

diff --git a/gloo_test.go b/gloo_test.go
index f36472f7..379a1eab 100644
--- a/gloo_test.go
+++ b/gloo_test.go
@@ -38,28 +38,40 @@ func TestGlooAllReduce(t *testing.T) {
 }

 func TestGlooAllReduceWithTCPStore(t *testing.T) {
-	a := NewTensor([][]float32{{1, 2}, {3, 4}})
-	b := NewTensor([][]float32{{4, 3}, {2, 1}})
+	f, _ := ioutil.TempFile("", "sample")
+	defer os.Remove(f.Name())
+
+	a1 := NewTensor([][]float32{{1, 2}})
+	a2 := NewTensor([][]float32{{1, 3}})
+	a := []Tensor{a1, a2}
+
+	b1 := NewTensor([][]float32{{4, 3}})
+	b2 := NewTensor([][]float32{{1, 1}})
+	b := []Tensor{b1, b2}
+
+	ts := [][]Tensor{a, b}

-	ts := []Tensor{a, b}
 	wg := sync.WaitGroup{}
 	wg.Add(2)

 	for i := 0; i < 2; i++ {
-		go func(rank int64, a Tensor) {
+		go func(rank int64, a []Tensor) {
 			defer wg.Done()
-			ts := NewTCPStore("127.0.0.1", 11111, 2, rank == 0)
-			defer ts.Close()
-			pg := NewProcessGroupGloo(ts, rank, 2)
+			fs := NewFileStore(f.Name(), 2)
+			defer fs.Close()
+			pg := NewProcessGroupGloo(fs, rank, 2)
 			defer pg.Close()
-			pg.AllReduce([]Tensor{a})
+			pg.AllReduce(a)
 		}(int64(i), ts[i])
 	}

 	wg.Wait()

-	assert.Equal(t, " 5 5\n 5 5\n[ CPUFloatType{2,2} ]", a.String())
-	assert.Equal(t, " 5 5\n 5 5\n[ CPUFloatType{2,2} ]", b.String())
+	assert.Equal(t, " 7 9\n[ CPUFloatType{1,2} ]", a1.String())
+	assert.Equal(t, " 7 9\n[ CPUFloatType{1,2} ]", b1.String())
+	assert.Equal(t, " 7 9\n[ CPUFloatType{1,2} ]", a2.String())
+	assert.Equal(t, " 7 9\n[ CPUFloatType{1,2} ]", b2.String())
+
 }

 func TestGlooAllReduceCoalesced(t *testing.T) {
@@ -79,11 +91,11 @@ func TestGlooAllReduceCoalesced(t *testing.T) {
 	defer os.Remove(f.Name())

 	a1 := NewTensor([][]float32{{1, 2}})
-	a2 := NewTensor([][]float32{{3, 4}})
+	a2 := NewTensor([][]float32{{1, 3}})
 	a := []Tensor{a1, a2}

 	b1 := NewTensor([][]float32{{4, 3}})
-	b2 := NewTensor([][]float32{{2, 1}})
+	b2 := NewTensor([][]float32{{1, 1}})
 	b := []Tensor{b1, b2}

 	ts := [][]Tensor{a, b}
@@ -104,26 +116,34 @@ func TestGlooAllReduceCoalesced(t *testing.T) {
 	assert.Equal(t, " 5 5\n[ CPUFloatType{1,2} ]", a1.String())
 	assert.Equal(t, " 5 5\n[ CPUFloatType{1,2} ]", b1.String())
-	assert.Equal(t, " 5 5\n[ CPUFloatType{1,2} ]", a2.String())
-	assert.Equal(t, " 5 5\n[ CPUFloatType{1,2} ]", b2.String())
+	assert.Equal(t, " 2 4\n[ CPUFloatType{1,2} ]", a2.String())
+	assert.Equal(t, " 2 4\n[ CPUFloatType{1,2} ]", b2.String())
 }

 func TestGlooBroadcast(t *testing.T) {
 	f, _ := ioutil.TempFile("", "sample")
 	defer os.Remove(f.Name())

-	a := NewTensor([][]float32{{1, 2}, {3, 4}})
-	b := NewTensor([][]float32{{4, 3}, {2, 1}})
+	a1 := NewTensor([][]float32{{1, 2}})
+	a2 := NewTensor([][]float32{{1, 3}})
+	a := []Tensor{a1, a2}
+
+	b1 := NewTensor([][]float32{{4, 3}})
+	b2 := NewTensor([][]float32{{1, 1}})
+	b := []Tensor{b1, b2}
+
+	ts := [][]Tensor{a, b}

-	ts := []Tensor{a, b}
 	wg := sync.WaitGroup{}
 	wg.Add(2)

 	for i := 0; i < 2; i++ {
-		go func(rank int64, a Tensor) {
+		go func(rank int64, a []Tensor) {
 			defer wg.Done()
 			fs := NewFileStore(f.Name(), 2)
 			defer fs.Close()
 			pg := NewProcessGroupGloo(fs, rank, 2)
 			defer pg.Close()
-			pg.Broadcast([]Tensor{a})
+			pg.Broadcast(a)
 		}(int64(i), ts[i])
 	}

 	wg.Wait()

-	assert.Equal(t, " 1 2\n 3 4\n[ CPUFloatType{2,2} ]", a.String())
-	assert.Equal(t, " 1 2\n 3 4\n[ CPUFloatType{2,2} ]", b.String())
+	assert.Equal(t, " 1 2\n[ CPUFloatType{1,2} ]", a1.String())
+	assert.Equal(t, " 1 2\n[ CPUFloatType{1,2} ]", b1.String())
+	assert.Equal(t, " 1 2\n[ CPUFloatType{1,2} ]", a2.String())
+	assert.Equal(t, " 1 2\n[ CPUFloatType{1,2} ]", b2.String())
 }

From 45f42b446264b8349b9414dcafcde63aab7edd85 Mon Sep 17 00:00:00 2001
From: qijun
Date: Tue, 3 Nov 2020 16:16:23 +0800
Subject: [PATCH 13/15] debug ci

---
 .travis.yml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.travis.yml b/.travis.yml
index 83486413..565f05f9 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -34,6 +34,7 @@ install:
 script:
   - export LD_LIBRARY_PATH=$TRAVIS_BUILD_DIR/cgotorch/libtorch/lib
+  - pushd $HOME && bash .circle/install_gloo.sh && popd
   - go generate ./...
   - go install ./...
   - go test -coverprofile=coverage.txt -covermode=atomic -v -race ./...
From f347eae2d89199a982b295ff48ae88d47957f56c Mon Sep 17 00:00:00 2001
From: qijun
Date: Tue, 3 Nov 2020 16:29:52 +0800
Subject: [PATCH 14/15] debug ci

---
 .travis.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.travis.yml b/.travis.yml
index 565f05f9..06d6d154 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -34,7 +34,7 @@ install:
 script:
   - export LD_LIBRARY_PATH=$TRAVIS_BUILD_DIR/cgotorch/libtorch/lib
-  - pushd $HOME && bash .circle/install_gloo.sh && popd
+  - pushd $HOME && bash $TRAVIS_BUILD_DIR/.circle/install_gloo.sh && popd
   - go generate ./...
   - go install ./...
   - go test -coverprofile=coverage.txt -covermode=atomic -v -race ./...

From 26e3d056dcf77f579e2344ac20c746dfea069d00 Mon Sep 17 00:00:00 2001
From: qijun
Date: Tue, 3 Nov 2020 16:42:03 +0800
Subject: [PATCH 15/15] debug ci

---
 .travis.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.travis.yml b/.travis.yml
index 06d6d154..4e65a3ef 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -34,7 +34,7 @@ install:
 script:
   - export LD_LIBRARY_PATH=$TRAVIS_BUILD_DIR/cgotorch/libtorch/lib
-  - pushd $HOME && bash $TRAVIS_BUILD_DIR/.circle/install_gloo.sh && popd
+  - pushd $HOME && bash $TRAVIS_BUILD_DIR/.circleci/install_gloo.sh && popd
  - go generate ./...
   - go install ./...
   - go test -coverprofile=coverage.txt -covermode=atomic -v -race ./...
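
The tests in this series drive both ranks from goroutines inside a single test process. As a rough sketch of how the same API might be used across separate worker processes, one per rank, the program below wires the TCPStore from PATCH 10 and the process group, AllReduce, and Close calls from PATCH 02 and PATCH 11 together. The import path and the environment-variable names are assumptions for illustration only, not something introduced by these patches.

// worker.go -- minimal sketch of one distributed worker process.
// RANK, WORLD_SIZE, and MASTER_ADDR are assumed environment variables;
// the import path is assumed as well.
package main

import (
	"fmt"
	"os"
	"strconv"

	torch "github.com/wangkuiyi/gotorch" // assumed import path
)

func main() {
	rank, _ := strconv.ParseInt(os.Getenv("RANK"), 10, 64)       // assumed env var
	size, _ := strconv.ParseInt(os.Getenv("WORLD_SIZE"), 10, 64) // assumed env var
	addr := os.Getenv("MASTER_ADDR")                             // assumed env var

	// Rank 0 hosts the TCPStore used for rendezvous; the other ranks connect to it.
	store := torch.NewTCPStore(addr, 11111, size, rank == 0)
	defer store.Close()

	pg := torch.NewProcessGroupGloo(store, rank, size)
	defer pg.Close()

	// Every rank contributes one tensor; AllReduce leaves the element-wise
	// sum of all ranks' tensors in place on every rank.
	t := torch.NewTensor([][]float32{{float32(rank), float32(rank)}})
	pg.AllReduce([]torch.Tensor{t})
	fmt.Println(t.String())
}

In the unit tests a FileStore on a shared temporary file plays the same rendezvous role, which is convenient when all ranks run on one machine; the TCPStore variant is the one that generalizes to workers on different hosts.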