diff --git a/csrc/fusion_segmenter.cpp b/csrc/fusion_segmenter.cpp index dc7d3d92b9f..2606e8a4592 100644 --- a/csrc/fusion_segmenter.cpp +++ b/csrc/fusion_segmenter.cpp @@ -1889,7 +1889,7 @@ void eraseInputDistinctRootDomains(Fusion* fusion) { /*root_domain=*/std::vector(), new_logical_domain, new_alloc, - /*loop_domain=*/new_logical_domain, + /*loop_domain=*/new_alloc, tv->domain()->contiguity()); } else { new_td = IrBuilder::create( diff --git a/csrc/transform_replay.cpp b/csrc/transform_replay.cpp index bd1932d8f50..06e15929aa9 100644 --- a/csrc/transform_replay.cpp +++ b/csrc/transform_replay.cpp @@ -757,19 +757,19 @@ std::pair TransformReplay::replayCasP( consumer->domain()->contiguity()); if (producer->hasAllocation()) { - std::vector producer_allocation = - TensorDomain::noReductions(producer->getAllocationDomain()); - ReplayTransformations replay_CasP( - producer_allocation, + auto replay_CasP = BestEffortReplay( + new_IDs, + producer->getLoopDomain(), logical_map.mapProducerToConsumer(producer->domain(), replayed)); const auto& p2c_map = replay_CasP.getReplay(); + auto producer_rank = producer->getAllocationDomain().size(); std::vector new_allocation_domain; - new_allocation_domain.reserve(producer_allocation.size()); + new_allocation_domain.reserve(producer_rank); std::vector> new_contiguity; - new_contiguity.reserve(producer_allocation.size()); + new_contiguity.reserve(producer_rank); - for (auto i : c10::irange(producer->getAllocationDomain().size())) { + for (auto i : c10::irange(producer_rank)) { IterDomain* id = producer->getAllocationDomain()[i]; // We won't find reduction IterDomains in the map. See // AllocationDomainTest.CacheBefore.