Skip to content

Commit

Permalink
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Redo #3326.
Browse files Browse the repository at this point in the history
The change was accidentally reverted in
#3349.
wujingyue committed Nov 7, 2024
1 parent 267b7e0 commit 77a8db7
Showing 1 changed file with 61 additions and 83 deletions.
144 changes: 61 additions & 83 deletions tests/cpp/test_allocation_domain.cpp
Original file line number Diff line number Diff line change
@@ -29,8 +29,7 @@ using ::testing::ElementsAre;
// A global->shared->global copy kernel, shared memory allocated transposed to
// avoid bank conflict.
TEST_F(AllocationDomainTest, TransposedIntermediate) {
auto fusion_ptr = std::make_unique<Fusion>();
Fusion& fusion = *fusion_ptr.get();
Fusion fusion;
FusionGuard fg(&fusion);

auto tv0 = makeContigConcreteTensor({32, 32});
@@ -59,16 +58,15 @@ TEST_F(AllocationDomainTest, TransposedIntermediate) {
at::Tensor t0 = at::randn({32, 32}, options);

KernelExecutor ke;
ke.compile(fusion_ptr.get(), {t0});
auto cg_outputs = ke.run({t0});
ke.compileFusion(&fusion, {t0});
auto cg_outputs = ke.runFusion({t0});
testValidate(&fusion, cg_outputs, {t0}, __LINE__, __FILE__);
}

// A global->global copy kernel converting NCHW memory format into NHWC, with a
// 4d allocation domain in output.
TEST_F(AllocationDomainTest, NCHW4d_To_NHWC4d) {
auto fusion_ptr = std::make_unique<Fusion>();
Fusion& fusion = *fusion_ptr.get();
Fusion fusion;
FusionGuard fg(&fusion);

auto tv0 = makeContigTensor(4);
@@ -97,7 +95,7 @@ TEST_F(AllocationDomainTest, NCHW4d_To_NHWC4d) {
at::Tensor t0 = at::randn({n, c, h, w}, options);

KernelExecutor ke;
ke.compile(fusion_ptr.get(), {t0});
ke.compileFusion(&fusion, {t0});

auto cg_outputs = ke.run({t0});

@@ -109,8 +107,7 @@ TEST_F(AllocationDomainTest, NCHW4d_To_NHWC4d) {
// A global->global copy kernel converting NCHW memory format into NHWC, with a
// 1d allocation domain in output.
TEST_F(AllocationDomainTest, NCHW4d_To_NHWC1d) {
auto fusion_ptr = std::make_unique<Fusion>();
Fusion& fusion = *fusion_ptr.get();
Fusion fusion;
FusionGuard fg(&fusion);

auto tv0 = makeContigTensor(4);
@@ -136,7 +133,7 @@ TEST_F(AllocationDomainTest, NCHW4d_To_NHWC1d) {
at::Tensor t0 = at::randn({n, c, h, w}, options);

KernelExecutor ke;
ke.compile(fusion_ptr.get(), {t0});
ke.compileFusion(&fusion, {t0});

auto cg_outputs = ke.run({t0});

@@ -148,8 +145,7 @@ TEST_F(AllocationDomainTest, NCHW4d_To_NHWC1d) {
// A global->global copy kernel converting NCHW memory format into NHWC, with a
// 2d allocation domain in output.
TEST_F(AllocationDomainTest, NCHW4d_To_NHWC2d) {
auto fusion_ptr = std::make_unique<Fusion>();
Fusion& fusion = *fusion_ptr.get();
Fusion fusion;
FusionGuard fg(&fusion);

auto tv0 = makeContigTensor(4);
@@ -176,7 +172,7 @@ TEST_F(AllocationDomainTest, NCHW4d_To_NHWC2d) {
at::Tensor t0 = at::randn({n, c, h, w}, options);

KernelExecutor ke;
ke.compile(fusion_ptr.get(), {t0});
ke.compileFusion(&fusion, {t0});

auto cg_outputs = ke.run({t0});

@@ -188,8 +184,7 @@ TEST_F(AllocationDomainTest, NCHW4d_To_NHWC2d) {
// Reshape and transpose a 3d tensor into an NHWC tensor with a 3d allocation
// domain in fusion output.
TEST_F(AllocationDomainTest, Tensor3d_To_NHWC3d) {
auto fusion_ptr = std::make_unique<Fusion>();
Fusion& fusion = *fusion_ptr.get();
Fusion fusion;
FusionGuard fg(&fusion);

int n1 = 31, n2 = 29, h = 64, w = 104, c = 21;
@@ -223,7 +218,7 @@ TEST_F(AllocationDomainTest, Tensor3d_To_NHWC3d) {
at::Tensor t0 = at::randn({n1, n2, h * w * c}, options);

KernelExecutor ke;
ke.compile(fusion_ptr.get(), {t0});
ke.compileFusion(&fusion, {t0});

auto cg_outputs = ke.run({t0});

@@ -242,8 +237,7 @@ TEST_F(AllocationDomainTest, Tensor3d_To_NHWC3d) {
// output. The allocation domain is on both the producer and the consumer side
// of the rFactor domain.
TEST_F(AllocationDomainTest, Tensor3d_To_NHWC4d_FwdBwd) {
auto fusion_ptr = std::make_unique<Fusion>();
Fusion& fusion = *fusion_ptr.get();
Fusion fusion;
FusionGuard fg(&fusion);

int n1 = 31, n2 = 29, h = 64, w = 104, c = 21;
@@ -283,7 +277,7 @@ TEST_F(AllocationDomainTest, Tensor3d_To_NHWC4d_FwdBwd) {
at::Tensor t0 = at::randn({n1, n2, c * h * w}, options);

KernelExecutor ke;
ke.compile(fusion_ptr.get(), {t0});
ke.compileFusion(&fusion, {t0});

auto cg_outputs = ke.run({t0});

@@ -301,8 +295,7 @@ TEST_F(AllocationDomainTest, Tensor3d_To_NHWC4d_FwdBwd) {
// A global->global copy kernel where both inputs and outputs are NHWC memory
// format
TEST_F(AllocationDomainTest, NHWC4d_To_NHWC4d) {
auto fusion_ptr = std::make_unique<Fusion>();
Fusion& fusion = *fusion_ptr.get();
Fusion fusion;
FusionGuard fg(&fusion);

auto tv0 = makeContigTensor(4);
@@ -339,7 +332,7 @@ TEST_F(AllocationDomainTest, NHWC4d_To_NHWC4d) {
t0_wrong_format.as_strided({n, c, h, w}, {h * w * c, 1, w * c, c});

KernelExecutor ke;
ke.compile(fusion_ptr.get(), {t0});
ke.compileFusion(&fusion, {t0});

EXPECT_THAT(
[&]() { ke.run({t0_wrong_format}); },
@@ -356,8 +349,7 @@ TEST_F(AllocationDomainTest, NHWC4d_To_NHWC4d) {
// A global->global copy kernel where both inputs are NHWC memory format. The
// allocation domain views the input as a 1d tensor.
TEST_F(AllocationDomainTest, NHWC1d_To_NHWC4d) {
auto fusion_ptr = std::make_unique<Fusion>();
Fusion& fusion = *fusion_ptr.get();
Fusion fusion;
FusionGuard fg(&fusion);

int n = 31, h = 64, w = 103, c = 21;
@@ -398,7 +390,7 @@ TEST_F(AllocationDomainTest, NHWC1d_To_NHWC4d) {
t0_wrong_format.as_strided({n, c, h, w}, {h * w * c, 1, w * c, c});

KernelExecutor ke;
ke.compile(fusion_ptr.get(), {t0});
ke.compileFusion(&fusion, {t0});

EXPECT_THAT(
[&]() { ke.run({t0_wrong_format}); },
@@ -415,8 +407,7 @@ TEST_F(AllocationDomainTest, NHWC1d_To_NHWC4d) {
// A global->global copy kernel where both inputs are NHWC memory format. The
// allocation domain of the output views the output as a 1d tensor.
TEST_F(AllocationDomainTest, NHWC4d_To_NHWC1d) {
auto fusion_ptr = std::make_unique<Fusion>();
Fusion& fusion = *fusion_ptr.get();
Fusion fusion;
FusionGuard fg(&fusion);

int n = 31, h = 64, w = 103, c = 21;
@@ -454,7 +445,7 @@ TEST_F(AllocationDomainTest, NHWC4d_To_NHWC1d) {
t0_wrong_format.as_strided({n, c, h, w}, {h * w * c, 1, w * c, c});

KernelExecutor ke;
ke.compile(fusion_ptr.get(), {t0});
ke.compileFusion(&fusion, {t0});

EXPECT_THAT(
[&]() { ke.run({t0_wrong_format}); },
@@ -471,8 +462,7 @@ TEST_F(AllocationDomainTest, NHWC4d_To_NHWC1d) {
// A global->global copy kernel where both inputs are NHWC memory format. The
// allocation domain views both the input and the output as 1d tensors.
TEST_F(AllocationDomainTest, NHWC1d_To_NHWC1d) {
auto fusion_ptr = std::make_unique<Fusion>();
Fusion& fusion = *fusion_ptr.get();
Fusion fusion;
FusionGuard fg(&fusion);

int n = 31, h = 64, w = 103, c = 21;
@@ -515,7 +505,7 @@ TEST_F(AllocationDomainTest, NHWC1d_To_NHWC1d) {
t0_wrong_format.as_strided({n, c, h, w}, {h * w * c, 1, w * c, c});

KernelExecutor ke;
ke.compile(fusion_ptr.get(), {t0});
ke.compileFusion(&fusion, {t0});

EXPECT_THAT(
[&]() { ke.run({t0_wrong_format}); },
@@ -533,8 +523,7 @@ TEST_F(AllocationDomainTest, NHWC1d_To_NHWC1d) {
// allocation domain views the input as a 2d tensor of shape [N*H/8, 8*W*C],
// and views the output as a 2d tensor of shape [N*H*W*C/4, 4]
TEST_F(AllocationDomainTest, NHWC2d_To_NHWC2d) {
auto fusion_ptr = std::make_unique<Fusion>();
Fusion& fusion = *fusion_ptr.get();
Fusion fusion;
FusionGuard fg(&fusion);

int n = 31, h = 64, w = 103, c = 21;
@@ -583,7 +572,7 @@ TEST_F(AllocationDomainTest, NHWC2d_To_NHWC2d) {
t0_wrong_format.as_strided({n, c, h, w}, {h * w * c, 1, w * c, c});

KernelExecutor ke;
ke.compile(fusion_ptr.get(), {t0});
ke.compileFusion(&fusion, {t0});

EXPECT_THAT(
[&]() { ke.run({t0_wrong_format}); },
@@ -599,8 +588,7 @@ TEST_F(AllocationDomainTest, NHWC2d_To_NHWC2d) {

// Similar to NHWC4d_To_NHWC4d, but does a cacheBefore
TEST_F(AllocationDomainTest, NHWC4d_To_NHWC4d_cacheBefore) {
auto fusion_ptr = std::make_unique<Fusion>();
Fusion& fusion = *fusion_ptr.get();
Fusion fusion;
FusionGuard fg(&fusion);

auto tv0 = makeContigTensor(4);
@@ -648,7 +636,7 @@ TEST_F(AllocationDomainTest, NHWC4d_To_NHWC4d_cacheBefore) {
t0_wrong_format.as_strided({n, c, h, w}, {h * w * c, 1, w * c, c});

KernelExecutor ke;
ke.compile(fusion_ptr.get(), {t0});
ke.compileFusion(&fusion, {t0});

EXPECT_THAT(
[&]() { ke.run({t0_wrong_format}); },
@@ -664,8 +652,7 @@ TEST_F(AllocationDomainTest, NHWC4d_To_NHWC4d_cacheBefore) {

// Similar to NHWC2d_To_NHWC2d, but does a cacheBefore
TEST_F(AllocationDomainTest, NHWC2d_To_NHWC2d_cacheBefore) {
auto fusion_ptr = std::make_unique<Fusion>();
Fusion& fusion = *fusion_ptr.get();
Fusion fusion;
FusionGuard fg(&fusion);

int n = 31, h = 64, w = 103, c = 21;
@@ -725,7 +712,7 @@ TEST_F(AllocationDomainTest, NHWC2d_To_NHWC2d_cacheBefore) {
t0_wrong_format.as_strided({n, c, h, w}, {h * w * c, 1, w * c, c});

KernelExecutor ke;
ke.compile(fusion_ptr.get(), {t0});
ke.compileFusion(&fusion, {t0});

EXPECT_THAT(
[&]() { ke.run({t0_wrong_format}); },
@@ -741,8 +728,7 @@ TEST_F(AllocationDomainTest, NHWC2d_To_NHWC2d_cacheBefore) {

// Similar to NHWC4d_To_NHWC4d, but does a cacheAfter
TEST_F(AllocationDomainTest, NHWC4d_To_NHWC4d_cacheAfter) {
auto fusion_ptr = std::make_unique<Fusion>();
Fusion& fusion = *fusion_ptr.get();
Fusion fusion;
FusionGuard fg(&fusion);

auto tv0 = makeContigTensor(4);
@@ -790,7 +776,7 @@ TEST_F(AllocationDomainTest, NHWC4d_To_NHWC4d_cacheAfter) {
t0_wrong_format.as_strided({n, c, h, w}, {h * w * c, 1, w * c, c});

KernelExecutor ke;
ke.compile(fusion_ptr.get(), {t0});
ke.compileFusion(&fusion, {t0});

EXPECT_THAT(
[&]() { ke.run({t0_wrong_format}); },
@@ -808,8 +794,7 @@ TEST_F(AllocationDomainTest, NHWC4d_To_NHWC4d_cacheAfter) {
// allocation tensor to be between rFactor domain and loop domain, which is not
// the case for NHWC2d_To_NHWC2d
TEST_F(AllocationDomainTest, NHWC2d_To_NHWC2d_cacheAfter) {
auto fusion_ptr = std::make_unique<Fusion>();
Fusion& fusion = *fusion_ptr.get();
Fusion fusion;
FusionGuard fg(&fusion);

int n = 31, h = 64, w = 103, c = 21;
@@ -861,7 +846,7 @@ TEST_F(AllocationDomainTest, NHWC2d_To_NHWC2d_cacheAfter) {
t0_wrong_format.as_strided({n, c, h, w}, {h * w * c, 1, w * c, c});

KernelExecutor ke;
ke.compile(fusion_ptr.get(), {t0});
ke.compileFusion(&fusion, {t0});

EXPECT_THAT(
[&]() { ke.run({t0_wrong_format}); },
@@ -877,8 +862,7 @@ TEST_F(AllocationDomainTest, NHWC2d_To_NHWC2d_cacheAfter) {

// Similar to NHWC4d_To_NHWC4d, but does a cacheFork
TEST_F(AllocationDomainTest, NHWC4d_To_NHWC4d_cacheFork) {
auto fusion_ptr = std::make_unique<Fusion>();
Fusion& fusion = *fusion_ptr.get();
Fusion fusion;
FusionGuard fg(&fusion);

auto tv0 = makeContigTensor(4);
@@ -933,7 +917,7 @@ TEST_F(AllocationDomainTest, NHWC4d_To_NHWC4d_cacheFork) {
t0_wrong_format.as_strided({n, c, h, w}, {h * w * c, 1, w * c, c});

KernelExecutor ke;
ke.compile(fusion_ptr.get(), {t0});
ke.compileFusion(&fusion, {t0});

EXPECT_THAT(
[&]() { ke.run({t0_wrong_format}); },
@@ -949,8 +933,7 @@ TEST_F(AllocationDomainTest, NHWC4d_To_NHWC4d_cacheFork) {

// Similar to NHWC2d_To_NHWC2d, but does a cacheFork
TEST_F(AllocationDomainTest, NHWC2d_To_NHWC2d_cacheFork) {
auto fusion_ptr = std::make_unique<Fusion>();
Fusion& fusion = *fusion_ptr.get();
Fusion fusion;
FusionGuard fg(&fusion);

int n = 31, h = 64, w = 103, c = 21;
@@ -1023,7 +1006,7 @@ TEST_F(AllocationDomainTest, NHWC2d_To_NHWC2d_cacheFork) {
t0_wrong_format.as_strided({n, c, h, w}, {h * w * c, 1, w * c, c});

KernelExecutor ke;
ke.compile(fusion_ptr.get(), {t0});
ke.compileFusion(&fusion, {t0});

EXPECT_THAT(
[&]() { ke.run({t0_wrong_format}); },
@@ -1038,30 +1021,29 @@ TEST_F(AllocationDomainTest, NHWC2d_To_NHWC2d_cacheFork) {
}

TEST_F(AllocationDomainTest, VectorizationIssue902) {
auto fusion_ptr = std::make_unique<Fusion>();
auto& fusion = *fusion_ptr;
FusionGuard fg(fusion_ptr.get());
auto fusion = std::make_unique<Fusion>();
FusionGuard fg(fusion.get());

const std::vector<int64_t> shape({16, 16, 512, 64});

auto tv0 = makeContigTensor(4);
fusion.addInput(tv0);
fusion->addInput(tv0);

auto tv1 = set(tv0);
fusion.addOutput(tv1);
fusion->addOutput(tv1);

std::vector<nvfuser::IterDomain*> aloc_domain;
aloc_domain.push_back(tv1->axis(0));
aloc_domain.push_back(tv1->axis(2));
aloc_domain.push_back(tv1->axis(3));
aloc_domain.push_back(tv1->axis(1));
tv1->setAllocationDomain(aloc_domain, true);
std::vector<nvfuser::IterDomain*> alloc_domain;
alloc_domain.push_back(tv1->axis(0));
alloc_domain.push_back(tv1->axis(2));
alloc_domain.push_back(tv1->axis(3));
alloc_domain.push_back(tv1->axis(1));
tv1->setAllocationDomain(alloc_domain, true);

auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
auto t0 = at::randn(shape, options);
std::vector<c10::IValue> aten_inputs({t0});

FusionExecutorCache executor_cache(std::move(fusion_ptr));
FusionExecutorCache executor_cache(std::move(fusion));
auto cg_outputs = executor_cache.runFusionWithInputs(aten_inputs);

ASSERT_TRUE(cg_outputs[0].equal(t0));
@@ -1101,9 +1083,8 @@ TEST_F(AllocationDomainTest, TransposeMatrix) {
}

TEST_F(AllocationDomainTest, ContiguityIssue1021) {
std::unique_ptr<Fusion> fusion_ptr = std::make_unique<Fusion>();
Fusion* fusion = fusion_ptr.get();
FusionGuard fg(fusion);
auto fusion = std::make_unique<Fusion>();
FusionGuard fg(fusion.get());

auto tv0 = TensorViewBuilder()
.ndims(2)
@@ -1119,17 +1100,16 @@ TEST_F(AllocationDomainTest, ContiguityIssue1021) {

auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
at::Tensor t0 = at::randn({8, 8}, options).as_strided({4, 8}, {1, 8});
FusionExecutorCache executor_cache(std::move(fusion_ptr));
FusionExecutorCache executor_cache(std::move(fusion));
auto outputs = executor_cache.runFusionWithInputs({t0});

auto t1 = t0.add(5.0);
testValidate(fusion, outputs, {t0}, __LINE__, __FILE__);
testValidate(executor_cache.fusion(), outputs, {t0}, __LINE__, __FILE__);
}

TEST_F(AllocationDomainTest, ContiguityForBroadcast) {
std::unique_ptr<Fusion> fusion_ptr = std::make_unique<Fusion>();
Fusion* fusion = fusion_ptr.get();
FusionGuard fg(fusion);
auto fusion = std::make_unique<Fusion>();
FusionGuard fg(fusion.get());

auto tv0 = TensorViewBuilder()
.ndims(2)
@@ -1145,17 +1125,16 @@ TEST_F(AllocationDomainTest, ContiguityForBroadcast) {

auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
at::Tensor t0 = at::randn({1, 1}, options).as_strided({1, 1}, {0, 3});
FusionExecutorCache executor_cache(std::move(fusion_ptr));
FusionExecutorCache executor_cache(std::move(fusion));
auto outputs = executor_cache.runFusionWithInputs({t0});

auto t1 = t0.add(5.0);
testValidate(fusion, outputs, {t0}, __LINE__, __FILE__);
testValidate(executor_cache.fusion(), outputs, {t0}, __LINE__, __FILE__);
}

TEST_F(AllocationDomainTest, ContiguityForExplicitBroadcast) {
std::unique_ptr<Fusion> fusion_ptr = std::make_unique<Fusion>();
Fusion* fusion = fusion_ptr.get();
FusionGuard fg(fusion);
auto fusion = std::make_unique<Fusion>();
FusionGuard fg(fusion.get());

auto tv0 = TensorViewBuilder()
.ndims(3)
@@ -1172,11 +1151,11 @@ TEST_F(AllocationDomainTest, ContiguityForExplicitBroadcast) {

auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
at::Tensor t0 = at::randn({4, 8}, options).as_strided({3, 8, 4}, {0, 1, 8});
FusionExecutorCache executor_cache(std::move(fusion_ptr));
FusionExecutorCache executor_cache(std::move(fusion));
auto outputs = executor_cache.runFusionWithInputs({t0});

auto t1 = t0.add(5.0);
testValidate(fusion, outputs, {t0}, __LINE__, __FILE__);
testValidate(executor_cache.fusion(), outputs, {t0}, __LINE__, __FILE__);
}

// Test that allocation domain can be used to vectorize overlapping tensors,
@@ -1189,8 +1168,7 @@ TEST_F(AllocationDomainTest, ContiguityForExplicitBroadcast) {
// automatically supports all kinds of use cases, even those that we don't have
// an active plan to support).
TEST_F(AllocationDomainTest, VectorizeOverlappingTensor) {
auto fusion_ptr = std::make_unique<Fusion>();
Fusion& fusion = *fusion_ptr.get();
Fusion fusion;
FusionGuard fg(&fusion);

auto tv0 = makeContigTensor(3);
@@ -1226,8 +1204,8 @@ TEST_F(AllocationDomainTest, VectorizeOverlappingTensor) {
at::randn({4 * 5 * 7}).cuda().as_strided({4, 5, 7}, {7, 4, 1});

KernelExecutor ke;
ke.compile(fusion_ptr.get(), {t0});
auto cg_outputs = ke.run({t0});
ke.compileFusion(&fusion, {t0});
auto cg_outputs = ke.runFusion({t0});

testValidate(&fusion, cg_outputs, {t0}, __LINE__, __FILE__);
}

0 comments on commit 77a8db7

Please sign in to comment.