diff --git a/csrc/scheduler/reduction_utils.cpp b/csrc/scheduler/reduction_utils.cpp
index eb545b1a7dc..3a01d89790d 100644
--- a/csrc/scheduler/reduction_utils.cpp
+++ b/csrc/scheduler/reduction_utils.cpp
@@ -740,15 +740,9 @@ class PersistentBufferProjector {
         persistent_buffers.begin(), persistent_buffers.end());
     for (auto buffer_i : c10::irange(persistent_buffers.size())) {
       auto buffer = persistent_buffers[buffer_i];
-      // skip reduction buffers
-      if (buffer->hasReduction()) {
-        continue;
-      }
       const auto& producers = ir_utils::producerTvsOf(buffer);
-      if (!producers.empty() &&
-          std::all_of(producers.begin(), producers.end(), [&](auto producer) {
-            return persistent_buffer_set.count(producer) > 0;
-          })) {
+      if (scheduler_utils::canProjectToPersistentProducer(
+              buffer, producers, persistent_buffer_set)) {
         projectToInputOrImmediatePersistentProducer(
             (int)buffer_i,
             std::vector<TensorView*>(producers.begin(), producers.end()));
diff --git a/csrc/scheduler/utils.cpp b/csrc/scheduler/utils.cpp
index 0dc7338c533..70775a55939 100644
--- a/csrc/scheduler/utils.cpp
+++ b/csrc/scheduler/utils.cpp
@@ -781,6 +781,23 @@ getScopePersistenceFactors(
 } // namespace
 
+// Returns true if a persistent tv can be projected to its persistent producers.
+bool canProjectToPersistentProducer(
+    TensorView* buffer,
+    const std::vector<TensorView*>& producers,
+    const std::unordered_set<TensorView*>& persistent_buffer_set) {
+  if (buffer->hasReduction() || producers.empty()) {
+    return false;
+  }
+  if (std::all_of(producers.begin(), producers.end(), [&](auto producer) {
+        return persistent_buffer_set.count(producer) > 0;
+      })) {
+    return true;
+  } else {
+    return false;
+  }
+}
+
 PersistentBufferSizeReturn persistentBufferSize(
     Fusion* fusion,
     SchedulerRuntimeInfo& runtime_info,
@@ -847,14 +864,19 @@ PersistentBufferSizeReturn persistentBufferSize(
 
   // Buffers involved in normal persistence
   std::vector<bool> persistent_mask(all_buffers.size(), false);
-
+  std::unordered_set<TensorView*> persistent_buffer_set(
+      persistent_buffers.begin(), persistent_buffers.end());
   for (auto buffer_i : c10::irange(persistent_buffers.size())) {
-    persistent_mask[buffer_i] = true;
+    auto buffer = persistent_buffers[buffer_i];
+    const auto& producers = ir_utils::producerTvsOf(buffer);
+    if (!canProjectToPersistentProducer(
+            buffer, producers, persistent_buffer_set)) {
+      persistent_mask[buffer_i] = true;
+    }
   }
 
   // Buffers involved in projected to inputs
   std::vector<bool> projected_mask(all_buffers.size(), true);
-
   for (auto buffer_i : c10::irange(persistent_buffers.size())) {
     auto buffer = persistent_buffers[buffer_i];
     // Not a projectable buffer, or an input of a projectable buffer
diff --git a/csrc/scheduler/utils.h b/csrc/scheduler/utils.h
index a85c1029d1c..879157c6d65 100644
--- a/csrc/scheduler/utils.h
+++ b/csrc/scheduler/utils.h
@@ -190,6 +190,13 @@ struct PersistentBufferInfo {
 // can simply be read multiple times from GMEM in the same kernel.
 PersistentBufferInfo persistentBuffers(Fusion* fusion);
 
+// A persistent tv can be projected to its producers when all the producers are
+// persistent tvs and there is no reduction op.
+bool canProjectToPersistentProducer(
+    TensorView* buffer,
+    const std::vector<TensorView*>& producers,
+    const std::unordered_set<TensorView*>& persistent_buffer_set);
+
 struct ReductionTvProperties {
   // How many elements in tensor view are there to reduce.
   int64_t total_reduction_numel = 1;
diff --git a/test/test_gpu2.cpp b/test/test_gpu2.cpp
index 678b7dc1bc4..b1988e88198 100644
--- a/test/test_gpu2.cpp
+++ b/test/test_gpu2.cpp
@@ -9152,10 +9152,11 @@ TEST_F(NVFuserTest, FusionPersistentBufferCalculation4_CUDA) {
   auto persistent_buffer_size =
       persistentBufferSize(&fusion, runtime_info, persistent_buffer_info);
 
+  // T1 and T2 are persistent buffers, but T2 can be projected to T1.
+  // So, the actual buffer size is just the size needed to save T1.
   NVF_ERROR(
       persistent_buffer_size.persistent_buffer_size ==
-      static_cast<int64_t>(
-          aten_t0.size(1) * dataTypeSize(DataType::Float) * 2));
+      static_cast<int64_t>(aten_t0.size(1) * dataTypeSize(DataType::Float)));
 
   NVF_ERROR(
       persistent_buffer_size.projected_persistent_buffer_size ==
diff --git a/test/test_gpu3.cpp b/test/test_gpu3.cpp
index 10d1d86fad4..4af232da3ec 100644
--- a/test/test_gpu3.cpp
+++ b/test/test_gpu3.cpp
@@ -8123,8 +8123,8 @@ TEST_F(NVFuserTest, FusionLayerNormFusedOpsRedundantCast_CUDA) {
   const float kEps = 1e-5;
   const int batch_size = 2048 * 8;
   const int hidden_size = 20480;
+  DataType dtype = DataType::Half;
   {
-    DataType dtype = DataType::Half;
     auto tv0 = makeContigTensor(1, dtype);
     auto tv1 = makeContigTensor(2, dtype);
     auto tv2 = makeContigTensor(1, dtype);
@@ -8206,16 +8206,20 @@ TEST_F(NVFuserTest, FusionLayerNormFusedOpsRedundantCast_CUDA) {
     outputs.emplace_back(t33);
   }
 
-  auto persistent_buffer_info1 = scheduler_utils::persistentBuffers(fusion);
+  auto persistent_buffer_info = scheduler_utils::persistentBuffers(fusion);
   NVF_CHECK(
-      persistent_buffer_info1.persistent_buffers.size() == 2,
+      persistent_buffer_info.persistent_buffers.size() == 2,
       "Before project to other buffers, should have two persistent buffers!");
 
-  reduction_scheduler_utils::projectPersistentBuffers(fusion, false);
-  auto persistent_buffer_info2 = scheduler_utils::persistentBuffers(fusion);
+  // The buffer size should only count one buffer because the other one is
+  // projected to its producer.
+  SchedulerRuntimeInfo runtime_info(fusion, inputs);
+  auto persistent_buffer_size =
+      persistentBufferSize(fusion, runtime_info, persistent_buffer_info);
   NVF_CHECK(
-      persistent_buffer_info2.persistent_buffers.size() == 1,
-      "After project to other buffers, should have one persistent buffer!");
+      persistent_buffer_size.persistent_buffer_size ==
+          hidden_size * dataTypeSize(dtype),
+      "Persistent buffer size is not correct!");
 
   FusionExecutorCache fec(std::move(fusion_ptr));
   auto cg_outputs = fec.runFusionWithInputs(inputs);
@@ -9408,6 +9412,91 @@ TEST_F(NVFuserTest, LoweringHook) {
   EXPECT_TRUE(executed);
 }
 
+TEST_F(NVFuserTest, ProjectPersistentBufferMultiScopes) {
+  std::unique_ptr<Fusion> fusion_ptr = std::make_unique<Fusion>();
+  auto fusion = fusion_ptr.get();
+  FusionGuard fg(fusion);
+
+  const int batch_size = 2048;
+  const int hidden_size = 10240;
+  DataType input_dtype = DataType::Float;
+  auto tv0 = makeContigTensor(2, input_dtype);
+  auto tv1 = makeContigTensor(2, input_dtype);
+  auto tv2 = makeContigTensor(2, input_dtype);
+
+  fusion->addInput(tv0);
+  fusion->addInput(tv1);
+  fusion->addInput(tv2);
+
+  auto tv3 = add(tv0, tv0);
+  auto tv4 = sum(tv3, {1});
+  auto tv5 = broadcast(tv4, {false, true});
+  auto tv6 = add(tv3, tv5);
+
+  auto tv7 = add(tv3, tv3);
+  auto tv8 = sum(tv7, {1});
+  auto tv9 = broadcast(tv8, {false, true});
+  auto tv10 = add(tv7, tv9);
+
+  auto tv11 = add(tv0, tv1);
+  auto tv12 = mul(tv11, tv11);
+  auto tv13 = sum(tv12, {1});
+  auto tv14 = broadcast(tv13, {false, true});
+  auto tv15 = add(tv12, tv14);
+
+  auto tv16 = add(tv12, tv2);
+  auto tv17 = mul(tv16, tv16);
+  auto tv18 = sum(tv17, {1});
+  auto tv19 = broadcast(tv18, {false, true});
+  auto tv20 = add(tv17, tv19);
+
+  fusion->addOutput(tv6);
+  fusion->addOutput(tv10);
+  fusion->addOutput(tv15);
+  fusion->addOutput(tv20);
+
+  auto options = at::TensorOptions()
+                     .dtype(data_type_to_aten(input_dtype))
+                     .device(at::kCUDA, 0);
+  auto t0 = at::randn({batch_size, hidden_size}, options);
+  auto t1 = at::randn({batch_size, hidden_size}, options);
+  auto t2 = at::randn({batch_size, hidden_size}, options);
+  std::vector<c10::IValue> inputs{t0, t1, t2};
+
+  // The persistent buffers in this fusion are: tv3, tv7, tv12, and tv17.
+  // Note that tv7 can be projected back to its producer, tv3. When
+  // calculating the total size of persistent buffers
+  // ([persistent_buffer_size]), it's important to consider the active scopes
+  // of these buffers. Simply subtracting the buffer size of tv7 from the max
+  // buffer size may lead to an underestimation, because there are two
+  // distinct scopes in this computation: (1) during the calculation of tv10,
+  // the active persistent buffers are tv3 and tv7; (2) during the
+  // calculation of tv20, the active persistent buffers are tv12 and tv17.
+  // The max buffer size is based on tv12 and tv17, and there is no
+  // projectable buffer that needs to be deducted in this scope.
+  auto persistent_info = scheduler_utils::persistentBuffers(fusion);
+  SchedulerRuntimeInfo runtime_info(fusion, inputs);
+  auto persistent_buffer_size =
+      persistentBufferSize(fusion, runtime_info, persistent_info);
+  auto calculated_size = persistent_buffer_size.persistent_buffer_size;
+  auto expected_size =
+      static_cast<int64_t>(hidden_size * 2 * dataTypeSize(input_dtype));
+  NVF_CHECK(
+      calculated_size == expected_size,
+      "Buffer size calculation failure. Expected size: ",
+      expected_size,
+      ". Actual: ",
Actual: ", + calculated_size); + auto persistent_params = getInnerPersistentHeuristics(fusion, inputs); + NVF_CHECK(persistent_params, "Reduction schedule was not generated!"); + NVF_CHECK( + !persistent_params->project_persistent_buffers, + "Shouldn't project persistent buffers to inputs!"); + + scheduleInnerPersistentKernel(fusion, *persistent_params); + FusionExecutor fe; + fe.compileFusion(fusion, inputs); + auto cg_outputs = fe.runFusion(inputs); +} // Test file size should be up to 10K LoC. Create a new file for more tests. } // namespace nvfuser