Skip to content

Commit

Permalink
Add performance check for mscclpp-test (#110)
Browse files Browse the repository at this point in the history
- Add ndmv4 perf baseline
- change mscclpp-test to output perf number into a json file
- add python script to check the perf result with the baseline
  • Loading branch information
Binyang2014 authored Jun 21, 2023
1 parent cd7797f commit 2640578
Show file tree
Hide file tree
Showing 16 changed files with 295 additions and 130 deletions.
30 changes: 20 additions & 10 deletions .azure-pipelines/integration-test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -35,10 +35,10 @@ steps:
script: |
set -e
export PATH=/usr/local/mpi/bin:$PATH
mpirun -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2
mpirun -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 1
mpirun -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 2
mpirun -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 3
mpirun -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -o output.jsonl
mpirun -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 1 -o output.jsonl
mpirun -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 2 -o output.jsonl
mpirun -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 3 -o output.jsonl
workingDirectory: '$(System.DefaultWorkingDirectory)'

- task: Bash@3
Expand All @@ -49,7 +49,7 @@ steps:
script: |
set -e
export PATH=/usr/local/mpi/bin:$PATH
mpirun -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/sendrecv_test_perf -b 1K -e 1G -f 2
mpirun -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/sendrecv_test_perf -b 1K -e 1G -f 2 -o output.jsonl
workingDirectory: '$(System.DefaultWorkingDirectory)'

- task: Bash@3
Expand All @@ -60,9 +60,9 @@ steps:
script: |
set -e
export PATH=/usr/local/mpi/bin:$PATH
mpirun -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2
mpirun -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 1
mpirun -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 2
mpirun -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -o output.jsonl
mpirun -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 1 -o output.jsonl
mpirun -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 2 -o output.jsonl
workingDirectory: '$(System.DefaultWorkingDirectory)'

- task: Bash@3
Expand All @@ -73,6 +73,16 @@ steps:
script: |
set -e
export PATH=/usr/local/mpi/bin:$PATH
mpirun -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/alltoall_test_perf -b 1K -e 1G -f 2
mpirun -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/alltoall_test_perf -b 1K -e 1G -f 2 -k 1
mpirun -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/alltoall_test_perf -b 1K -e 1G -f 2 -o output.jsonl
mpirun -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/alltoall_test_perf -b 1K -e 1G -f 2 -k 1 -o output.jsonl
workingDirectory: '$(System.DefaultWorkingDirectory)'

- task: Bash@3
name: CheckPerfNumber
displayName: Check collective primitives performance
inputs:
targetType: 'inline'
script: |
set -e
python3 test/mscclpp-test/check_perf_result.py --perf-file output.jsonl --baseline-file test/deploy/perf_ndmv4.jsonl
workingDirectory: '$(System.DefaultWorkingDirectory)'
2 changes: 2 additions & 0 deletions .azure-pipelines/multi-nodes-test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,7 @@ steps:
HOSTFILE=$(System.DefaultWorkingDirectory)/test/mscclpp-test/deploy/hostfile
SSH_OPTION="StrictHostKeyChecking=no"
KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
rm -rf output/*
mkdir -p output
touch output/mscclpp-it-000000
tail -f output/mscclpp-it-000000 &
Expand All @@ -85,6 +86,7 @@ steps:
HOSTFILE=$(System.DefaultWorkingDirectory)/test/mscclpp-test/deploy/hostfile
SSH_OPTION="StrictHostKeyChecking=no"
KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
rm -rf output/*
mkdir -p output
touch output/mscclpp-it-000000
tail -f output/mscclpp-it-000000 &
Expand Down
5 changes: 5 additions & 0 deletions .black
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
[tool.black]
line-length = 120
target-version = ['py38']
include = '\.pyi?$'
extend-exclude = 'python/'
32 changes: 22 additions & 10 deletions .github/workflows/lint.yml
Original file line number Diff line number Diff line change
Expand Up @@ -10,11 +10,6 @@ jobs:
runs-on: ubuntu-20.04

steps:
- name: Check git version
env:
min-ver: "2.18"
run: dpkg --compare-versions $(git version | awk '{print $3}') ge ${{ env.min-ver }}

- name: Check out Git repository
uses: actions/checkout@v3

Expand All @@ -28,15 +23,32 @@ jobs:
clang-format -style=file --verbose --Werror --dry-run ${CPPSOURCES}
clang-format --dry-run ${PYTHONCPPSOURCES}
spelling:
pylint:
runs-on: ubuntu-20.04

steps:
- name: Check git version
env:
min-ver: "2.18"
run: dpkg --compare-versions $(git version | awk '{print $3}') ge ${{ env.min-ver }}
- name: Check out Git repository
uses: actions/checkout@v3

- name: Set up Python
uses: actions/setup-python@v4
with:
python-version: 3.8

- name: Install Python dependencies
run: python3.8 -m pip install black

- name: Run linters
uses: wearerequired/lint-action@v2
with:
black: true
black_auto_fix: false
black_args: "--config .black --check"

spelling:
runs-on: ubuntu-20.04

steps:
- name: Check out Git repository
uses: actions/checkout@v3

Expand Down
2 changes: 2 additions & 0 deletions test/deploy/deploy.sh
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ DST_DIR="/tmp/mscclpp"
HOSTFILE="${SYSTEM_DEFAULTWORKINGDIRECTORY}/test/deploy/hostfile"
DEPLOY_DIR="${SYSTEM_DEFAULTWORKINGDIRECTORY}/test/deploy"
SSH_OPTION="StrictHostKeyChecking=no"
MSCCLPP_TEST_DIR="${SYSTEM_DEFAULTWORKINGDIRECTORY}/test/mscclpp-test"

chmod 400 ${KeyFilePath}
ssh-keygen -t rsa -f sshkey -P ""
Expand All @@ -28,6 +29,7 @@ parallel-scp -t 0 -r -h ${HOSTFILE} -x "-i ${KeyFilePath}" -O $SSH_OPTION ${SRC_
parallel-scp -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -O $SSH_OPTION sshkey ${DST_DIR}
parallel-scp -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -O $SSH_OPTION sshkey.pub ${DST_DIR}
parallel-scp -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -O $SSH_OPTION ${DEPLOY_DIR}/* ${DST_DIR}
parallel-scp -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -O $SSH_OPTION ${MSCCLPP_TEST_DIR}/check_perf_result.py ${DST_DIR}

# force to pull the latest image
parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -O $SSH_OPTION \
Expand Down
7 changes: 7 additions & 0 deletions test/deploy/perf_ndmv4.jsonl
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
{"name":"allgather", "kernel":1, "ranks":8, "ranksPerNode":8, "algBw":271.83, "busBw":237.85, "size":1073741824, "time":3949.94, "target":"throughput"}
{"name":"allgather", "kernel":2, "ranks":16,"ranksPerNode":8, "algBw":243.86, "busBw":228.62, "size":3221225472, "time":13209.19,"target":"throughput"}
{"name":"allgather", "kernel":3, "ranks":8, "ranksPerNode":8, "algBw":0.1133, "busBw":0.1016, "size":8192, "time":72.88, "target":"latency"}
{"name":"allreduce", "kernel":1, "ranks":8, "ranksPerNode":8, "algBw":139.04, "busBw":243.32, "size":1073741824, "time":7722.32, "target":"throughput"}
{"name":"allreduce", "kernel":2, "ranks":8, "ranksPerNode":8, "algBw":1.40, "busBw":2.45, "size":8192, "time":5.86, "target":"latency"}
{"name":"alltoall", "kernel":0, "ranks":16,"ranksPerNode":8, "algBw":46.49, "busBw":43.5928,"size":1073741824, "time":23091.7, "target":"throughput"}
{"name":"alltoall", "kernel":1, "ranks":8, "ranksPerNode":8, "algBw":275.54, "busBw":241.10, "size":1073741824, "time":3896.75, "target":"throughput"}
16 changes: 10 additions & 6 deletions test/deploy/run_tests.sh
Original file line number Diff line number Diff line change
Expand Up @@ -5,30 +5,34 @@ function run_mscclpp_test()
echo "=================Run allgather_test_perf on 2 nodes========================="
/usr/local/mpi/bin/mpirun --allow-run-as-root -np 16 --bind-to numa -hostfile /root/mscclpp/hostfile_mpi \
-x MSCCLPP_DEBUG=WARN -x LD_LIBRARY_PATH=/root/mscclpp/build:$LD_LIBRARY_PATH \
-npernode 8 /root/mscclpp/build/test/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 0
-npernode 8 /root/mscclpp/build/test/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 0 -o /root/mscclpp/output.jsonl

# For kernel 2, the message size must be divisible by 3
/usr/local/mpi/bin/mpirun --allow-run-as-root -np 16 --bind-to numa -hostfile /root/mscclpp/hostfile_mpi \
-x MSCCLPP_DEBUG=WARN -x LD_LIBRARY_PATH=/root/mscclpp/build:$LD_LIBRARY_PATH \
-npernode 8 /root/mscclpp/build/test/mscclpp-test/allgather_test_perf -b 3K -e 3G -f 2 -k 2
-npernode 8 /root/mscclpp/build/test/mscclpp-test/allgather_test_perf -b 3K -e 3G -f 2 -k 2 -o /root/mscclpp/output.jsonl

/usr/local/mpi/bin/mpirun --allow-run-as-root -np 16 --bind-to numa -hostfile /root/mscclpp/hostfile_mpi \
-x MSCCLPP_DEBUG=WARN -x LD_LIBRARY_PATH=/root/mscclpp/build:$LD_LIBRARY_PATH \
-npernode 8 /root/mscclpp/build/test/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 3
-npernode 8 /root/mscclpp/build/test/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 3 -o /root/mscclpp/output.jsonl

echo "==================Run allreduce_test_perf on 2 nodes========================="
/usr/local/mpi/bin/mpirun --allow-run-as-root -np 16 --bind-to numa -hostfile /root/mscclpp/hostfile_mpi \
-x MSCCLPP_DEBUG=WARN -x LD_LIBRARY_PATH=/root/mscclpp/build:$LD_LIBRARY_PATH \
-npernode 8 /root/mscclpp/build/test/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 0
-npernode 8 /root/mscclpp/build/test/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 0 -o /root/mscclpp/output.jsonl

/usr/local/mpi/bin/mpirun --allow-run-as-root -np 16 --bind-to numa -hostfile /root/mscclpp/hostfile_mpi \
-x MSCCLPP_DEBUG=WARN -x LD_LIBRARY_PATH=/root/mscclpp/build:$LD_LIBRARY_PATH \
-npernode 8 /root/mscclpp/build/test/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 1
-npernode 8 /root/mscclpp/build/test/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 1 -o /root/mscclpp/output.jsonl

echo "==================Run alltoall_test_perf on 2 nodes========================="
/usr/local/mpi/bin/mpirun --allow-run-as-root -np 16 --bind-to numa -hostfile /root/mscclpp/hostfile_mpi \
-x MSCCLPP_DEBUG=WARN -x LD_LIBRARY_PATH=/root/mscclpp/build:$LD_LIBRARY_PATH \
-npernode 8 /root/mscclpp/build/test/mscclpp-test/alltoall_test_perf -b 1K -e 1G -f 2 -k 0
-npernode 8 /root/mscclpp/build/test/mscclpp-test/alltoall_test_perf -b 1K -e 1G -f 2 -k 0 -o /root/mscclpp/output.jsonl

echo "========================Run performance check==============================="
python3 /root/mscclpp/check_perf_result.py --perf-file /root/mscclpp/output.jsonl \
--baseline-file /root/mscclpp/perf_ndmv4.jsonl
}

function run_mp_ut()
Expand Down
5 changes: 4 additions & 1 deletion test/mscclpp-test/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,6 +1,9 @@
FetchContent_Declare(json URL https://github.com/nlohmann/json/releases/download/v3.11.2/json.tar.xz)
FetchContent_MakeAvailable(json)

function(add_mscclpp_test_executable name sources)
add_executable(${name} ${sources} common.cc)
target_link_libraries(${name} mscclpp MPI::MPI_CXX CUDA::cudart CUDA::cuda_driver MSCCLPP::numa)
target_link_libraries(${name} mscclpp MPI::MPI_CXX CUDA::cudart CUDA::cuda_driver MSCCLPP::numa nlohmann_json::nlohmann_json)
endfunction()

add_mscclpp_test_executable(sendrecv_test_perf sendrecv_test.cu)
Expand Down
2 changes: 1 addition & 1 deletion test/mscclpp-test/allgather_test.cu
Original file line number Diff line number Diff line change
Expand Up @@ -324,7 +324,7 @@ class AllGatherTestEngine : public BaseTestEngine {
std::shared_ptr<int[]> expectedBuff_;
};

AllGatherTestEngine::AllGatherTestEngine(const TestArgs& args) : BaseTestEngine(args) {}
AllGatherTestEngine::AllGatherTestEngine(const TestArgs& args) : BaseTestEngine(args, "allgather") {}

void AllGatherTestEngine::allocateBuffer() {
sendBuff_ = mscclpp::allocSharedCuda<int>(args_.maxBytes / sizeof(int));
Expand Down
5 changes: 4 additions & 1 deletion test/mscclpp-test/allreduce_test.cu
Original file line number Diff line number Diff line change
Expand Up @@ -329,6 +329,7 @@ void AllReduceTestColl::setupCollTest(size_t size) {
recvCount_ = base;
paramCount_ = base;
recvCount_ = base;
expectedCount_ = base;

mscclpp::DeviceSyncer syncer = {};
CUDATHROW(cudaMemcpyToSymbol(deviceSyncer, &syncer, sizeof(mscclpp::DeviceSyncer)));
Expand Down Expand Up @@ -358,7 +359,9 @@ class AllReduceTestEngine : public BaseTestEngine {
std::shared_ptr<int[]> expectedBuff_;
};

AllReduceTestEngine::AllReduceTestEngine(const TestArgs& args) : BaseTestEngine(args) { inPlace_ = isInPlace(); }
AllReduceTestEngine::AllReduceTestEngine(const TestArgs& args) : BaseTestEngine(args, "allreduce") {
inPlace_ = isInPlace();
}

bool AllReduceTestEngine::isUsePacket() const { return (args_.kernelNum == 2); }

Expand Down
2 changes: 1 addition & 1 deletion test/mscclpp-test/alltoall_test.cu
Original file line number Diff line number Diff line change
Expand Up @@ -130,7 +130,7 @@ class AllToAllTestEngine : public BaseTestEngine {
std::shared_ptr<int[]> expectedBuff_;
};

AllToAllTestEngine::AllToAllTestEngine(const TestArgs& args) : BaseTestEngine(args) { inPlace_ = false; }
AllToAllTestEngine::AllToAllTestEngine(const TestArgs& args) : BaseTestEngine(args, "alltoall") { inPlace_ = false; }

void AllToAllTestEngine::allocateBuffer() {
sendBuff_ = mscclpp::allocSharedCuda<int>(args_.maxBytes / sizeof(int));
Expand Down
80 changes: 80 additions & 0 deletions test/mscclpp-test/check_perf_result.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.

import json
import logging


def load_perf_file(perf_fine: str) -> dict:
    """Load a JSON-lines performance file into a dict keyed by test configuration.

    Every non-blank line must be a JSON object with the fields ``name``, ``kernel``,
    ``ranks``, ``ranksPerNode``, ``size``, ``algBw``, ``busBw`` and ``time``. The
    ``target`` field is optional and is copied through only when present.

    Args:
        perf_fine: Path of the JSON-lines file to read. (The parameter name is kept
            unchanged so existing keyword callers keep working.)

    Returns:
        A dict mapping ``(name, kernel, ranks, ranksPerNode, size)`` to a dict holding
        ``algBw``, ``busBw``, ``time`` and, if the record had one, ``target``. When
        several lines share the same key, the last one wins.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record lacks one of the required fields.
    """
    res = {}
    with open(perf_fine, "r") as f:
        for line in f:
            # Blank lines (e.g. a stray empty line in a hand-edited baseline) carry no
            # record; feeding them to json.loads would abort the whole check.
            if not line.strip():
                continue
            data = json.loads(line)
            key = (data["name"], data["kernel"], data["ranks"], data["ranksPerNode"], data["size"])
            entry = {
                "algBw": data["algBw"],
                "busBw": data["busBw"],
                "time": data["time"],
            }
            if "target" in data:
                entry["target"] = data["target"]
            res[key] = entry
    return res


def check_perf_result(perf_result: dict, baseline: dict, time_threshold: float, bandwidth_threshold: float) -> bool:
    """Compare measured performance against a baseline.

    Only keys present in both dicts are compared. The baseline entry's ``target``
    selects which metrics are checked:

    * ``"latency"``: ``time`` must be within ``time_threshold`` of the baseline.
    * ``"throughput"``: ``algBw`` and ``busBw`` must each be within
      ``bandwidth_threshold`` of the baseline.

    The deviation is relative and two-sided (``abs(measured - base) / base``), so a
    result that is better than the baseline by more than the threshold is reported too.

    Args:
        perf_result: Measured results, keyed as produced by ``load_perf_file``.
        baseline: Baseline results with the same key layout.
        time_threshold: Maximum allowed relative deviation of ``time``.
        bandwidth_threshold: Maximum allowed relative deviation of ``algBw``/``busBw``.

    Returns:
        True if every compared metric is within its threshold, False otherwise.
        Each violation is reported through ``logging.error``.
    """
    res = True
    for key, value in perf_result.items():
        if key not in baseline:
            continue
        expected = baseline[key]
        # "target" is optional in the file format, so a baseline record may lack it;
        # indexing it directly would raise KeyError and abort the whole check.
        target = expected.get("target")
        if target == "latency":
            checks = [("time", time_threshold)]
        elif target == "throughput":
            checks = [("algBw", bandwidth_threshold), ("busBw", bandwidth_threshold)]
        else:
            logging.warning("%s: baseline has no usable target (%r), skipping check", str(key), target)
            continue
        for metric, threshold in checks:
            if abs(value[metric] - expected[metric]) / expected[metric] > threshold:
                logging.error(
                    "%s: %s %f not match baseline %f with threshold %f",
                    str(key),
                    metric,
                    value[metric],
                    expected[metric],
                    threshold,
                )
                res = False
    return res


if __name__ == "__main__":
    # Command-line entry point: load the measured results and the baseline, compare
    # them, print PASS/FAIL and exit with status 1 on failure so CI marks the step failed.
    import argparse
    import sys

    parser = argparse.ArgumentParser()
    parser.add_argument("--perf-file", type=str, required=True)
    parser.add_argument("--baseline-file", type=str, required=True)
    # We use different threshold for latency and bandwidth. For latency,
    # small data size is used which introduces more variance. For bandwidth, the performance is more stable.
    parser.add_argument("--time-threshold", type=float, default=0.15)
    parser.add_argument("--bandwidth-threshold", type=float, default=0.05)
    args = parser.parse_args()

    perf_result = load_perf_file(args.perf_file)
    baseline = load_perf_file(args.baseline_file)
    if check_perf_result(perf_result, baseline, args.time_threshold, args.bandwidth_threshold):
        print("PASS")
    else:
        print("FAIL")
        # sys.exit instead of the site-provided exit(): the latter is meant for the
        # interactive shell and is not available when the site module is not loaded.
        sys.exit(1)
24 changes: 22 additions & 2 deletions test/mscclpp-test/common.cc
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
#include <iomanip>
#include <iostream>
#include <mscclpp/utils.hpp>
#include <nlohmann/json.hpp>
#include <sstream>
#include <string>
#include <type_traits>
Expand Down Expand Up @@ -46,6 +47,7 @@ int iters = 20;
int average = 1;
int kernel_num = 0;
int cudaGraphLaunches = 15;
std::string output_file;

double parseSize(const char* value) {
std::string valueStr(value);
Expand Down Expand Up @@ -148,7 +150,8 @@ void numaBind(int node) {
numa_bind_compat(&mask);
}

BaseTestEngine::BaseTestEngine(const TestArgs& args) : args_(args), inPlace_(true), error_(0) {
BaseTestEngine::BaseTestEngine(const TestArgs& args, const std::string& name)
: args_(args), name_(name), inPlace_(true), error_(0) {
this->coll_ = getTestColl();
CUDATHROW(cudaStreamCreateWithFlags(&this->stream_, cudaStreamNonBlocking));
}
Expand Down Expand Up @@ -254,6 +257,18 @@ void BaseTestEngine::runTest() {
}
double algBw, busBw;
this->coll_->getBw(deltaSec, algBw, busBw);
if (!output_file.empty()) {
nlohmann::json perfOutput = {{"name", name_},
{"kernel", args_.kernelNum},
{"ranks", args_.totalRanks},
{"ranksPerNode", args_.nRanksPerNode},
{"size", size},
{"time", timeUsec},
{"algBw", algBw},
{"busBw", busBw}};
std::ofstream out(output_file, std::ios_base::app);
if (isMainProc) out << perfOutput << std::endl;
}
if (!this->inPlace_) {
ss << " ";
}
Expand Down Expand Up @@ -417,12 +432,13 @@ int main(int argc, char* argv[]) {
{"cudagraph", required_argument, 0, 'G'},
{"average", required_argument, 0, 'a'},
{"kernel_num", required_argument, 0, 'k'},
{"output_file", required_argument, 0, 'o'},
{"help", no_argument, 0, 'h'},
{}};

while (1) {
int c;
c = getopt_long(argc, argv, "b:e:i:f:n:w:c:G:a:k:h:", longopts, &longindex);
c = getopt_long(argc, argv, "b:e:i:f:n:w:c:G:a:k:o:h:", longopts, &longindex);

if (c == -1) break;

Expand Down Expand Up @@ -471,6 +487,9 @@ int main(int argc, char* argv[]) {
case 'k':
kernel_num = (int)strtol(optarg, NULL, 0);
break;
case 'o':
output_file = optarg;
break;
case 'h':
default:
if (c != 'h') printf("invalid option '%c'\n", c);
Expand All @@ -488,6 +507,7 @@ int main(int argc, char* argv[]) {
"[-C,--report_cputime <0/1>] \n\t"
"[-a,--average <0/1/2/3> report average iteration time <0=RANK0/1=AVG/2=MIN/3=MAX>] \n\t"
"[-k,--kernel_num <kernel number of commnication primitive>] \n\t"
"[-o, --output_file <output file name>] \n\t"
"[-h,--help]\n",
basename(argv[0]));
return 0;
Expand Down
Loading

0 comments on commit 2640578

Please sign in to comment.