diff --git a/library/src/amd_detail/hipblaslt.cpp b/library/src/amd_detail/hipblaslt.cpp index e216f4dba..cf90057d0 100644 --- a/library/src/amd_detail/hipblaslt.cpp +++ b/library/src/amd_detail/hipblaslt.cpp @@ -168,8 +168,8 @@ try // TODO: Synchronizer size pass into predicate SynchronizerSizeCheck // 1K just for small size now, need to cal corner case if support all situations void* d_Synchronizer = nullptr; - CHECK_HIP_ERROR(hipMalloc(&d_Synchronizer, 16 * 40960 * sizeof(int))); - CHECK_HIP_ERROR(hipMemset(d_Synchronizer, 0, sizeof(int) * 16 * 40960)); + CHECK_HIP_ERROR(hipMalloc(&d_Synchronizer, 16 * 409600 * sizeof(int))); + CHECK_HIP_ERROR(hipMemset(d_Synchronizer, 0, sizeof(int) * 16 * 409600)); err = hipGetDevice(&deviceId); if(err == hipSuccess) diff --git a/tensilelite/Tensile/KernelWriterAssembly.py b/tensilelite/Tensile/KernelWriterAssembly.py index 7e005ee2e..cfe90a9e6 100644 --- a/tensilelite/Tensile/KernelWriterAssembly.py +++ b/tensilelite/Tensile/KernelWriterAssembly.py @@ -1733,7 +1733,7 @@ def calculateWG(): module.add(SLShiftLeftB32(dst=sgpr(tmpSgpr0), src=sgpr(tmpSgpr0), shiftHex=(2))) module.add(SAddU32(dst=sgpr("AddressTD"), src0=sgpr("AddressTD"), src1=sgpr(tmpSgpr0))) module.add(SAddCU32(dst=sgpr("AddressTD+1"), src0=sgpr("AddressTD+1"), src1=hex(0))) - module.add(SAddU32(dst=sgpr("Synchronizer"), src0=sgpr("Synchronizer"), src1=hex(163840))) + module.add(SAddU32(dst=sgpr("Synchronizer"), src0=sgpr("Synchronizer"), src1=hex(1638400))) module.add(SAddCU32(dst=sgpr("Synchronizer+1"), src0=sgpr("Synchronizer+1"), src1=hex(0))) module.add(extReadEpilogueLabeltmp) module.add(SAddU32(dst=sgpr(tmpSgprAddrM), src0=sgpr(tmpSgprAddrM), src1=sgpr(tmpSgprArgOffsett))) diff --git a/tensilelite/Tensile/Source/client/source/ClientProblemFactory.cpp b/tensilelite/Tensile/Source/client/source/ClientProblemFactory.cpp index 8dd7a8c28..88ae8a6f8 100644 --- a/tensilelite/Tensile/Source/client/source/ClientProblemFactory.cpp +++ b/tensilelite/Tensile/Source/client/source/ClientProblemFactory.cpp @@ -372,7 +372,7 @@ namespace TensileLite else { rv.back().setSynchronizer( - m_constantTypes[ContractionProblemGemm::CONST::ALPHA], 40960); + m_constantTypes[ContractionProblemGemm::CONST::ALPHA], 409600); } if(j < m_activationEnumArg.size()) { diff --git a/tensilelite/Tensile/Source/lib/include/Tensile/ContractionProblemPredicates.hpp b/tensilelite/Tensile/Source/lib/include/Tensile/ContractionProblemPredicates.hpp index 3af730eb7..4b31d05ef 100644 --- a/tensilelite/Tensile/Source/lib/include/Tensile/ContractionProblemPredicates.hpp +++ b/tensilelite/Tensile/Source/lib/include/Tensile/ContractionProblemPredicates.hpp @@ -234,7 +234,7 @@ namespace TensileLite bool ret = (std::ceil(static_cast(problem.freeSizeA(0)) / value[0]) * std::ceil(static_cast(problem.freeSizeB(0)) / value[1])) * (value[2]) * (value[4] / 64) * value[3] - <= 40960; + <= 409600; if(problem.groupedGemm()) ret = ret && (problem.groupedGemmCount() <= 16);