From 0b7dcdc85577f85de298fb6b7d5dc47d2d5a88e2 Mon Sep 17 00:00:00 2001 From: Ryan Spring Date: Sun, 27 Oct 2024 10:09:51 -0700 Subject: [PATCH 1/3] Create map value to fusion id * Track inputs, outputs, and extents --- csrc/python_frontend/fusion_cache.cpp | 12 ++- csrc/python_frontend/fusion_cache.h | 8 ++ csrc/python_frontend/fusion_definition.cpp | 22 ++++++ csrc/python_frontend/fusion_record.h | 8 +- csrc/python_frontend/fusion_state.cpp | 86 +++++++++++++++++++++- csrc/python_frontend/fusion_state.h | 28 ++++++- csrc/python_frontend/python_bindings.cpp | 3 + 7 files changed, 156 insertions(+), 11 deletions(-) diff --git a/csrc/python_frontend/fusion_cache.cpp b/csrc/python_frontend/fusion_cache.cpp index 53dc43bdbe8..8116e62d1f8 100644 --- a/csrc/python_frontend/fusion_cache.cpp +++ b/csrc/python_frontend/fusion_cache.cpp @@ -780,8 +780,8 @@ void FusionCache::deserialize(std::string filename) { NVF_CHECK( trie_ptr->fusion_id == fb_trie_node->fusion_id(), "The fusion id for this TrieNode should already be set.") - Fusion* fusion = - queryFusionSchedules(fb_trie_node->fusion_id())->preschedFusion(); + FusionSchedules* fs = queryFusionSchedules(fb_trie_node->fusion_id()); + Fusion* fusion = fs->preschedFusion(); try { // There could be bad fusion in the serialization. state->buildFusionIr(fusion); @@ -789,6 +789,14 @@ void FusionCache::deserialize(std::string filename) { // catch exception and setException for the terminal node trie_ptr->setException(e.what()); } + // The FusionState creates a mapping from CPP Fusion to its State objects. + // Since the CPP Fusion is cached in FusionCache and the FusionState is + // temporary, the information linking CPP Fusion and Python + // FusionDefinition is stored in FusionCache. + fs->inputs_fid_ = state->inputs(); + fs->outputs_fid_ = state->outputs(); + fs->extents_fid_ = state->extents(); + fs->map_value_to_fid_ = state->getValueMap(); } // Table TrieNode => Field: children: [ulong] diff --git a/csrc/python_frontend/fusion_cache.h b/csrc/python_frontend/fusion_cache.h index b4283b7bdaf..76c4f68af6c 100644 --- a/csrc/python_frontend/fusion_cache.h +++ b/csrc/python_frontend/fusion_cache.h @@ -103,6 +103,14 @@ struct FusionSchedules { std::mutex scheds_lock; //! ID of fusion in python frontend fusion cache int64_t fusion_id_ = -1; + //! Input arguments for FusionState + std::vector inputs_fid_; + //! Extents for TensorView input arguments for FusionState + std::vector extents_fid_; + //! Output arguments for FusionState + std::vector outputs_fid_; + //! Map Fusion Val to its corresponding FusionDefinition index + std::unordered_map map_value_to_fid_; }; //! \struct TrieNode diff --git a/csrc/python_frontend/fusion_definition.cpp b/csrc/python_frontend/fusion_definition.cpp index b512d9d761b..9c9aea15ee8 100644 --- a/csrc/python_frontend/fusion_definition.cpp +++ b/csrc/python_frontend/fusion_definition.cpp @@ -107,6 +107,17 @@ void FusionDefinition::finalizeDefinition() { throw; } + // The FusionState creates a mapping from CPP Fusion to its State objects. + // Since the CPP Fusion is cached in FusionCache and the FusionState is + // temporary, the information linking CPP Fusion and Python + // FusionDefinition is stored in FusionCache. + FusionSchedules* fs = + fusionCache()->queryFusionSchedules(fusion_id_.value()); + fs->inputs_fid_ = inputs(); + fs->outputs_fid_ = outputs(); + fs->extents_fid_ = extents(); + fs->map_value_to_fid_ = getValueMap(); + if (isDebugDumpEnabled(DebugDumpOption::FusionIrOriginal)) { printIr(); } @@ -120,6 +131,17 @@ void FusionDefinition::finalizeDefinition() { // build a proper fusion earlier. NVF_CHECK(!opt_e.has_value(), opt_e.value()); fusion_id_ = std::optional(trie_node_->fusion_id); + + // A CPP fusion already exists in the FusionCache for this FusionDefinition. + // In this case, a new CPP Fusion is not created, so the mapping from CPP + // fusion to Python FusionDefinition is not initialized. This state is + // stored within FusionSchedules and is retrieved for this FusionDefinition. + FusionSchedules* fs = + fusionCache()->queryFusionSchedules(fusion_id_.value()); + inputs_fid_ = fs->inputs_fid_; + outputs_fid_ = fs->outputs_fid_; + extents_fid_ = fs->extents_fid_; + map_value_to_fid_ = fs->map_value_to_fid_; } NVF_ERROR( diff --git a/csrc/python_frontend/fusion_record.h b/csrc/python_frontend/fusion_record.h index 82879912509..154f8d28805 100644 --- a/csrc/python_frontend/fusion_record.h +++ b/csrc/python_frontend/fusion_record.h @@ -1368,7 +1368,7 @@ struct TensorRecord : RecordFunctor { } fd.setFusionState(outputs_.at(0).index, tv); - fd.addInput(tv); + fd.addInput(tv, outputs_.at(0).index); } void print(std::ostream& os, bool close_function = true) const final { @@ -1545,12 +1545,12 @@ struct OutputRecord : RecordFunctor { } tv_output->setAllocationDomain(allocation_domain, true); } - fd.addOutput(tv_output); + fd.addOutput(tv_output, args_.at(0).index); } else { NVF_CHECK( stride_order_.empty(), "stride_order can't be dictated for scalar outputs."); - fd.addOutput(output); + fd.addOutput(output, args_.at(0).index); } } } @@ -2015,7 +2015,7 @@ struct ScalarRecord : RecordFunctor { void operator()(FusionState& fd) final { Val* output = IrBuilder::create(value_, dtype_); if (!value_.hasValue()) { - fd.addInput(output); + fd.addInput(output, outputs_.at(0).index); } fd.setFusionState(outputs_.at(0).index, output); } diff --git a/csrc/python_frontend/fusion_state.cpp b/csrc/python_frontend/fusion_state.cpp index 99868f14b21..c3970c6edf9 100644 --- a/csrc/python_frontend/fusion_state.cpp +++ b/csrc/python_frontend/fusion_state.cpp @@ -85,6 +85,22 @@ std::unique_ptr FusionState::clone() { state->fusion_state_.insert( state->fusion_state_.end(), fusion_state_.begin(), fusion_state_.end()); state->num_recording_states_ = num_recording_states_; + std::copy( + inputs_fid_.begin(), + inputs_fid_.end(), + std::back_inserter(state->inputs_fid_)); + std::copy( + outputs_fid_.begin(), + outputs_fid_.end(), + std::back_inserter(state->outputs_fid_)); + std::copy( + extents_fid_.begin(), + extents_fid_.end(), + std::back_inserter(state->extents_fid_)); + std::copy( + map_value_to_fid_.begin(), + map_value_to_fid_.end(), + std::inserter(state->map_value_to_fid_, state->map_value_to_fid_.end())); return state; } @@ -108,6 +124,7 @@ void FusionState::buildFusionIr(Fusion* fusion) { e.what()); } } + addExtents(); } void FusionState::addRecord(RecordFunctor* record) { @@ -147,6 +164,10 @@ void FusionState::resetFusionState(Fusion* fusion, size_t size) { fusion_ = fusion; fusion_state_.clear(); fusion_state_.resize(size, {}); + inputs_fid_.clear(); + outputs_fid_.clear(); + extents_fid_.clear(); + map_value_to_fid_.clear(); } void FusionState::addFusionState(Val* val) { @@ -178,6 +199,7 @@ size_t FusionState::numFusionStates() const { void FusionState::setFusionState(size_t index, Val* val) { fusion_state_.at(index) = {val}; + map_value_to_fid_.emplace(val, (int64_t)index); } void FusionState::setFusionStateVector(size_t index, std::vector val) { @@ -189,14 +211,18 @@ void FusionState::setFusionStateVector(size_t index, std::vector val) { fusion_state_.at(index) = {val}; } -void FusionState::addInput(Val* input) { +void FusionState::addInput(Val* input, size_t index) { NVF_CHECK(fusion_ != nullptr, "Fusion is undefined."); fusion_->addInput(input); + map_value_to_fid_.emplace(input, (int64_t)index); + inputs_fid_.push_back((int64_t)index); } -void FusionState::addOutput(Val* output) { +void FusionState::addOutput(Val* output, size_t index) { NVF_CHECK(fusion_ != nullptr, "Fusion is undefined."); fusion_->addOutput(output); + map_value_to_fid_.emplace(output, (int64_t)index); + outputs_fid_.push_back((int64_t)index); } void FusionState::aliasOutputToInput(Val* output, Val* input) { @@ -206,4 +232,60 @@ void FusionState::aliasOutputToInput(Val* output, Val* input) { fusion_->aliasOutputToInput(output, input, AllocationType::ReuseBuffer); } +const std::unordered_map& FusionState::getValueMap() + const { + return map_value_to_fid_; +} + +const std::vector& FusionState::inputs() const { + return inputs_fid_; +} + +const std::vector& FusionState::outputs() const { + return outputs_fid_; +} + +const std::vector& FusionState::extents() const { + return extents_fid_; +} + +std::vector FusionState::getExtents(Fusion* fusion) { + NVF_CHECK(fusion != nullptr, "Fusion is undefined."); + + std::vector extents; + for (Val* v : fusion->inputs()) { + // short-circuit: skip if not TensorView + if (!v->isA()) { + continue; + } + TensorView* tv = v->as(); + std::vector logical_dom = + TensorDomain::noReductions(tv->getLogicalDomain()); + std::transform( + logical_dom.begin(), + logical_dom.end(), + std::back_inserter(extents), + [](IterDomain* id) { return id->getMaybeExpandedExtent(); }); + } + return extents; +} + +void FusionState::addExtents() { + NVF_CHECK(fusion_ != nullptr, "Fusion is undefined."); + + // The size of the tensor dimensions can be used as an input of the + // segments. NvFuser does not support returning scalar values. Segmentation + // must pass those sizes as segment arguments manually. + std::vector extents = getExtents(fusion_); + for (Val* extent : extents) { + int64_t num_extents = (int64_t)extents_fid_.size(); + int64_t extent_fid = -num_extents - 1; + extents_fid_.push_back(extent_fid); + // The extent can already exist in the fusion. However, since scalars cannot + // be passed between segments, always overwrited existing fids. The original + // fusion definition will provide scalar extents. + map_value_to_fid_[extent] = extent_fid; + } +} + } // namespace nvfuser::python_frontend diff --git a/csrc/python_frontend/fusion_state.h b/csrc/python_frontend/fusion_state.h index bd75f7af5d6..b7adaf82a00 100644 --- a/csrc/python_frontend/fusion_state.h +++ b/csrc/python_frontend/fusion_state.h @@ -79,12 +79,24 @@ class FusionState { NVF_API void setFusionStateVector(size_t index, std::vector val); //! Adds a Tensor/Scalar input to the Fusion object - NVF_API void addInput(Val* input); + NVF_API void addInput(Val* input, size_t index); //! Adds a Tensor/Scalar output to the Fusion object - NVF_API void addOutput(Val* output); + NVF_API void addOutput(Val* output, size_t index); //! Alias an Output to Input in the Fusion object NVF_API void aliasOutputToInput(Val* output, Val* input); + //! Get map between CPP Fusion and Python FusionDefinition + NVF_API const std::unordered_map& getValueMap() const; + //! Get indicies for the inputs of FusionState + NVF_API const std::vector& inputs() const; + //! Get indicies for the outputs of FusionState + NVF_API const std::vector& outputs() const; + //! Get indicies for the extents of TensorView inputs of FusionState + NVF_API const std::vector& extents() const; + + //! Add extents of TensorView inputs to FusionState + NVF_API void addExtents(); + //! Add a Record void addRecord(RecordFunctor* record); //! Builds an nvFuser Fusion IR object @@ -94,6 +106,8 @@ class FusionState { std::unique_ptr clone(); private: + //! Get extents for TensorView inputs in Fusion + std::vector getExtents(Fusion* fusion); //! Change the fusion ptr and reset its state void resetFusionState(Fusion* fusion, size_t size); @@ -104,10 +118,18 @@ class FusionState { std::vector> recording_; //! A vector of state that represents Tensors/Vectors/Scalars std::vector recording_state_; + //! Input arguments for FusionState + std::vector inputs_fid_; + //! Output arguments for FusionState + std::vector outputs_fid_; + //! Extents for TensorView input arguments for FusionState + std::vector extents_fid_; + //! Map Fusion Val to its corresponding FusionDefinition index + std::unordered_map map_value_to_fid_; private: //! A ptr to the container used when building the Fusion IR from a definition - Fusion* fusion_; + Fusion* fusion_ = nullptr; //! A vector of nvFuser Fusion IR TensorViews/Vectors/Scalars for building the //! Fusion IR graph. //! NOTE: Vectors are represented by a vector. This could diff --git a/csrc/python_frontend/python_bindings.cpp b/csrc/python_frontend/python_bindings.cpp index d7b9e8d1c34..eccbb867c7b 100644 --- a/csrc/python_frontend/python_bindings.cpp +++ b/csrc/python_frontend/python_bindings.cpp @@ -959,6 +959,9 @@ void initNvFuserPythonBindings(PyObject* module) { // Mark the end of a schedule inst::Trace::instance()->endEvent(nullptr); }) + .def("inputs", [](FusionDefinition& self) { return self.inputs(); }) + .def("outputs", [](FusionDefinition& self) { return self.outputs(); }) + .def("extents", [](FusionDefinition& self) { return self.extents(); }) .def( "__repr__", [](FusionDefinition& self) { From 8310ace10ad3a9c4983d95d376118c4ecc040433 Mon Sep 17 00:00:00 2001 From: Ryan Spring Date: Wed, 30 Oct 2024 14:10:24 -0700 Subject: [PATCH 2/3] create test_fusion_information --- csrc/python_frontend/fusion_cache.h | 6 +++--- csrc/python_frontend/fusion_state.cpp | 3 +++ nvfuser/__init__.py | 1 - tests/python/test_python_frontend.py | 31 +++++++++++++++++++++++++++ 4 files changed, 37 insertions(+), 4 deletions(-) diff --git a/csrc/python_frontend/fusion_cache.h b/csrc/python_frontend/fusion_cache.h index 76c4f68af6c..ae64b588c89 100644 --- a/csrc/python_frontend/fusion_cache.h +++ b/csrc/python_frontend/fusion_cache.h @@ -103,11 +103,11 @@ struct FusionSchedules { std::mutex scheds_lock; //! ID of fusion in python frontend fusion cache int64_t fusion_id_ = -1; - //! Input arguments for FusionState + //! Fusion IDs of input arguments for FusionState std::vector inputs_fid_; - //! Extents for TensorView input arguments for FusionState + //! IDs for Extents for TensorView input arguments for FusionState std::vector extents_fid_; - //! Output arguments for FusionState + //! Fusion IDs of output arguments for FusionState std::vector outputs_fid_; //! Map Fusion Val to its corresponding FusionDefinition index std::unordered_map map_value_to_fid_; diff --git a/csrc/python_frontend/fusion_state.cpp b/csrc/python_frontend/fusion_state.cpp index c3970c6edf9..be8d8d0c514 100644 --- a/csrc/python_frontend/fusion_state.cpp +++ b/csrc/python_frontend/fusion_state.cpp @@ -279,6 +279,9 @@ void FusionState::addExtents() { std::vector extents = getExtents(fusion_); for (Val* extent : extents) { int64_t num_extents = (int64_t)extents_fid_.size(); + // Use negative numbers to represent extent of iterDomains to avoid conflict + // with non-negative numbers used for scalars, vectors, and tensors. + // The extents are ordered based on the order of the fusion's inputs. int64_t extent_fid = -num_extents - 1; extents_fid_.push_back(extent_fid); // The extent can already exist in the fusion. However, since scalars cannot diff --git a/nvfuser/__init__.py b/nvfuser/__init__.py index 4b4f25b9d66..7d9048e7bf6 100644 --- a/nvfuser/__init__.py +++ b/nvfuser/__init__.py @@ -53,7 +53,6 @@ class FusionDefinition(_C._FusionDefinition): def __init__(self, id=None, max_length=1024): super(FusionDefinition, self).__init__(id, max_length) self.profiled = False - self.inputs = None def __enter__(self): return self._setup_definition() diff --git a/tests/python/test_python_frontend.py b/tests/python/test_python_frontend.py index 8080c48278c..b0b4af0236d 100644 --- a/tests/python/test_python_frontend.py +++ b/tests/python/test_python_frontend.py @@ -4600,3 +4600,34 @@ def fusion_func(fd: FusionDefinition) -> None: nvf_out, _ = self.exec_nvfuser(fusion_func, inputs) for out in nvf_out: self.assertTrue(out.allclose(x[:, 1:, 2:])) + + def test_fusion_information(self): + inputs = [ + torch.ones(2, 4, 8, device="cuda"), + torch.ones(2, 4, 8, device="cuda"), + ] + + def fusion_func(fd: FusionDefinition) -> None: + t0 = fd.from_pytorch(inputs[0]) + t1 = fd.from_pytorch(inputs[1]) + c0 = fd.define_scalar(3.0) + + t2 = fd.ops.add(t0, t1) + t3 = fd.ops.mul(t2, c0) + t4 = fd.ops.sum(t3, [-1], False, DataType.Float) + + fd.add_output(t4) + + nvf_out, _ = self.exec_nvfuser(fusion_func, inputs) + eager_out = torch.sum((inputs[0] + inputs[1]) * 3.0, dim=-1) + self.assertEqual(eager_out, nvf_out[0]) + + with FusionDefinition() as fd: + fusion_func(fd) + + nvf_out1 = fd.execute(inputs) + self.assertEqual(eager_out, nvf_out1[0]) + + self.assertEqual(fd.inputs(), [0, 1]) + self.assertEqual(fd.outputs(), [5]) + self.assertEqual(fd.extents(), [idx for idx in range(-1, -7, -1)]) From 09af663299a41218b4ffc74f74a75d0b750c9633 Mon Sep 17 00:00:00 2001 From: Ryan Spring Date: Wed, 30 Oct 2024 17:29:15 -0700 Subject: [PATCH 3/3] comments --- csrc/python_frontend/fusion_state.h | 5 ++--- tests/python/test_python_frontend.py | 18 +++++++++++------- 2 files changed, 13 insertions(+), 10 deletions(-) diff --git a/csrc/python_frontend/fusion_state.h b/csrc/python_frontend/fusion_state.h index b7adaf82a00..7a83886514a 100644 --- a/csrc/python_frontend/fusion_state.h +++ b/csrc/python_frontend/fusion_state.h @@ -94,9 +94,6 @@ class FusionState { //! Get indicies for the extents of TensorView inputs of FusionState NVF_API const std::vector& extents() const; - //! Add extents of TensorView inputs to FusionState - NVF_API void addExtents(); - //! Add a Record void addRecord(RecordFunctor* record); //! Builds an nvFuser Fusion IR object @@ -108,6 +105,8 @@ class FusionState { private: //! Get extents for TensorView inputs in Fusion std::vector getExtents(Fusion* fusion); + //! Add extents of TensorView inputs to FusionState + void addExtents(); //! Change the fusion ptr and reset its state void resetFusionState(Fusion* fusion, size_t size); diff --git a/tests/python/test_python_frontend.py b/tests/python/test_python_frontend.py index f0198bdc7b8..e0597757a9c 100644 --- a/tests/python/test_python_frontend.py +++ b/tests/python/test_python_frontend.py @@ -4610,13 +4610,13 @@ def test_fusion_information(self): def fusion_func(fd: FusionDefinition) -> None: t0 = fd.from_pytorch(inputs[0]) t1 = fd.from_pytorch(inputs[1]) - c0 = fd.define_scalar(3.0) + c2 = fd.define_scalar(3.0) - t2 = fd.ops.add(t0, t1) - t3 = fd.ops.mul(t2, c0) - t4 = fd.ops.sum(t3, [-1], False, DataType.Float) + t3 = fd.ops.add(t0, t1) + t4 = fd.ops.mul(t3, c2) + t5 = fd.ops.sum(t4, [-1], False, DataType.Float) - fd.add_output(t4) + fd.add_output(t5) nvf_out, _ = self.exec_nvfuser(fusion_func, inputs) eager_out = torch.sum((inputs[0] + inputs[1]) * 3.0, dim=-1) @@ -4628,11 +4628,15 @@ def fusion_func(fd: FusionDefinition) -> None: nvf_out1 = fd.execute(inputs) self.assertEqual(eager_out, nvf_out1[0]) + # The input tensors are t0 and t1. self.assertEqual(fd.inputs(), [0, 1]) + # The output tensors is t5. self.assertEqual(fd.outputs(), [5]) + # The extents correspond with the dimensions for each input tensor. + # There are two input tensors with three dimensions each, so the + # extents range from [-1, -6]. self.assertEqual(fd.extents(), [idx for idx in range(-1, -7, -1)]) - def test_issue_3292(self): inputs = [ torch.testing.make_tensor( @@ -4705,4 +4709,4 @@ def fusion_func(fd: FusionDefinition) -> None: fd.add_output(T223) # is_clonable=False is because translation fails with missing ceilDiv - nvf_out, _ = self.exec_nvfuser(fusion_func, inputs, is_clonable=False) \ No newline at end of file + nvf_out, _ = self.exec_nvfuser(fusion_func, inputs, is_clonable=False)