Avoid reciprocal round-trip error in FDIV emulation

If x == y, then x/y == 1 (assuming x and y are normal values, i.e. neither is +/-0, +/-NaN, +/-Inf, or subnormal). In that case, skip the FDIV expansion computation to avoid reciprocal round-trip error.
intel · Sep 25, 2023 · 549f098 · 549f098
1 parent 0d6a129
commit 549f098
Show file tree

Hide file tree

Showing 3 changed files with 69 additions and 22 deletions.
diff --git a/IGC/Compiler/LegalizationPass.cpp b/IGC/Compiler/LegalizationPass.cpp
@@ -2710,8 +2710,10 @@ static bool needsNoScaling(Value* Val)
 bool IGC::expandFDIVInstructions(llvm::Function& F)
 {
     bool Changed = false;
-    for (auto& BB : F.getBasicBlockList()) {
-        for (auto Iter = BB.begin(); Iter != BB.end();) {
+    for (auto BBIter = F.begin(); BBIter != F.end();) {
+        BasicBlock *BB = &*BBIter++;
+
+        for (auto Iter = BB->begin(); Iter != BB->end();) {
             Instruction* Inst = &*Iter++;
             if (!isCandidateFDiv(Inst))
                 continue;
@@ -2747,28 +2749,63 @@ bool IGC::expandFDIVInstructions(llvm::Function& F)
                 V = Builder.CreateFMul(Y, X);
             }
             else {
+                BasicBlock *PreFDIVExpBB = BB;
+                BasicBlock *PostFDIVExpBB = BB->splitBasicBlock(Inst->getNextNode());
+                BasicBlock *FDIVExpBB = BB->splitBasicBlock(Inst);
+
+                Builder.SetInsertPoint(
+                    FDIVExpBB->getPrevNode()->getTerminator());
+
+                // If x == y then x/y == 1 (assuming x and y are normal values,
+                // i.e. neither is +/-0, +/-NaN, +/-Inf, or subnormal), skip
+                // FDIV expansion basic block to avoid reciprocal
+                // round-trip error, break to post-FDIV-expansion basic block.
+                Value *CmpXY = Builder.CreateFCmp(CmpInst::FCMP_OEQ, X, Y);
+                Value *YAsInt32 = Builder.CreateBitCast(Y, Builder.getInt32Ty());
+                Value *YExp = Builder.CreateAnd(YAsInt32,
+                    Builder.getInt32(0x7f800000));
+                Value *YMantissa = Builder.CreateAnd(YAsInt32,
+                    Builder.getInt32(0x007fffff));
+                Value *CmpYExpZero =
+                    Builder.CreateICmpNE(YExp, Builder.getInt32(0));
+                Value *CmpYMantissaZero =
+                    Builder.CreateICmpNE(YMantissa, Builder.getInt32(0));
+                Value *CmpXYandYZero =
+                    Builder.CreateAnd({CmpXY, CmpYExpZero, CmpYMantissaZero});
+                Builder.CreateCondBr(CmpXYandYZero, PostFDIVExpBB, FDIVExpBB)
+                    ->getNextNode()
+                    ->eraseFromParent();
+
+                // Update iterators after creating BBs.
+                BBIter = PostFDIVExpBB->getIterator();
+                BB = FDIVExpBB;
+                Iter = ++FDIVExpBB->begin();
+                Builder.SetInsertPoint(Inst);
+
                 float S32 = uint64_t(1) << 32;
                 ConstantFP* C0 = ConstantFP::get(Ctx, APFloat(S32));
                 ConstantFP* C1 = ConstantFP::get(Ctx, APFloat(1.0f));
                 ConstantFP* C2 = ConstantFP::get(Ctx, APFloat(1.0f / S32));
 
-                Value* Exp = Builder.CreateAnd(
-                    Builder.CreateBitCast(Y, Builder.getInt32Ty()),
-                    Builder.getInt32(0x7f800000));
-
-                // Check if B's exponent is 0, scale up.
-                Value* P1 = Builder.CreateICmpEQ(Exp, Builder.getInt32(0));
+                // Check if y's exponent is 0, scale up.
+                Value* P1 = Builder.CreateICmpEQ(YExp, Builder.getInt32(0));
                 Value* Scale = Builder.CreateSelect(P1, C0, C1);
 
-                // Check if B's exponent >= 200, scale down.
-                Value* P2 = Builder.CreateICmpUGE(Exp, Builder.getInt32(200 << 23));
+                // Check if y's exponent >= 200, scale down.
+                Value* P2 = Builder.CreateICmpUGE(YExp, Builder.getInt32(200 << 23));
                 Scale = Builder.CreateSelect(P2, C2, Scale);
 
                 // Compute rcp(y * S) * x * S
                 V = Builder.CreateFMul(Y, Scale);
                 V = Builder.CreateFDiv(C1, V);
                 V = Builder.CreateFMul(V, X);
                 V = Builder.CreateFMul(V, Scale);
+
+               Builder.SetInsertPoint(&*PostFDIVExpBB->begin());
+               PHINode *Phi = Builder.CreatePHI(V->getType(), 2);
+               Phi->addIncoming(ConstantFP::get(Ctx, APFloat(1.0f)), PreFDIVExpBB);
+               Phi->addIncoming(V, FDIVExpBB);
+               V = Phi;
             }
 
             Inst->replaceAllUsesWith(V);

diff --git a/IGC/Compiler/Optimizer/SynchronizationObjectCoalescing.cpp b/IGC/Compiler/Optimizer/SynchronizationObjectCoalescing.cpp
@@ -67,7 +67,7 @@ enum InstructionMask : uint32_t
     EndOfThreadOperation       = (1 << 9),
 };
 constexpr InstructionMask AllNoAtomicMask =
-    InstructionMask{ ((EndOfThreadOperation << 1) - 1) & ~InstructionMask::AtomicOperation };
+    InstructionMask{ ((1 << 9) - 1) & ~InstructionMask::AtomicOperation };
 
 inline constexpr InstructionMask operator|(InstructionMask a, InstructionMask b)
 {

diff --git a/IGC/Compiler/tests/GenFDIVEmulation/basic.ll b/IGC/Compiler/tests/GenFDIVEmulation/basic.ll
@@ -17,17 +17,27 @@
 
 define void @test_fdiv(float %a, float %b) {
 ; CHECK-LABEL: @test_fdiv(
-; CHECK:    [[TMP1:%[A-z0-9]*]] = bitcast float [[B:%[A-z0-9]*]] to i32
-; CHECK:    [[TMP2:%[A-z0-9]*]] = and i32 [[TMP1]], 2139095040
-; CHECK:    [[TMP3:%[A-z0-9]*]] = icmp eq i32 [[TMP2]], 0
-; CHECK:    [[TMP4:%[A-z0-9]*]] = select i1 [[TMP3]], float 0x41F0000000000000, float 1.000000e+00
-; CHECK:    [[TMP5:%[A-z0-9]*]] = icmp uge i32 [[TMP2]], 1677721600
-; CHECK:    [[TMP6:%[A-z0-9]*]] = select i1 [[TMP5]], float 0x3DF0000000000000, float [[TMP4]]
-; CHECK:    [[TMP7:%[A-z0-9]*]] = fmul float [[B]], [[TMP6]]
-; CHECK:    [[TMP8:%[A-z0-9]*]] = fdiv float 1.000000e+00, [[TMP7]]
-; CHECK:    [[TMP9:%[A-z0-9]*]] = fmul float [[TMP8]], [[A:%[A-z0-9]*]]
-; CHECK:    [[TMP10:%[A-z0-9]*]] = fmul float [[TMP9]], [[TMP6]]
-; CHECK:    call void @use.f32(float [[TMP10]])
+; CHECK:    [[TMP1:%[A-z0-9]*]] = fcmp oeq float [[A:%[A-z0-9]*]], [[B:%[A-z0-9]*]]
+; CHECK:    [[TMP2:%[A-z0-9]*]] = bitcast float [[B]] to i32
+; CHECK:    [[TMP3:%[A-z0-9]*]] = and i32 [[TMP2]], 2139095040
+; CHECK:    [[TMP4:%[A-z0-9]*]] = and i32 [[TMP2]], 8388607
+; CHECK:    [[TMP5:%[A-z0-9]*]] = icmp ne i32 [[TMP3]], 0
+; CHECK:    [[TMP6:%[A-z0-9]*]] = icmp ne i32 [[TMP4]], 0
+; CHECK:    [[TMP7:%[A-z0-9]*]] = and i1 [[TMP1]], [[TMP5]], !dbg !11
+; CHECK:    [[TMP8:%[A-z0-9]*]] = and i1 [[TMP7]], [[TMP6]], !dbg !11
+; CHECK:    br i1 [[TMP8]], label %[[BB3:[A-z0-9]*]], label %[[BB2:[A-z0-9]*]]
+; CHECK:  [[BB2]]:
+; CHECK:    [[TMP9:%[A-z0-9]*]] = icmp eq i32 [[TMP3]], 0
+; CHECK:    [[TMP10:%[A-z0-9]*]] = select i1 [[TMP9]], float 0x41F0000000000000, float 1.000000e+00
+; CHECK:    [[TMP11:%[A-z0-9]*]] = icmp uge i32 [[TMP3]], 1677721600
+; CHECK:    [[TMP12:%[A-z0-9]*]] = select i1 [[TMP11]], float 0x3DF0000000000000, float [[TMP10]]
+; CHECK:    [[TMP13:%[A-z0-9]*]] = fmul float [[B]], [[TMP12]]
+; CHECK:    [[TMP14:%[A-z0-9]*]] = fdiv float 1.000000e+00, [[TMP13]]
+; CHECK:    [[TMP15:%[A-z0-9]*]] = fmul float [[TMP14]], [[A]]
+; CHECK:    [[TMP16:%[A-z0-9]*]] = fmul float [[TMP15]], [[TMP12]]
+; CHECK:  [[BB3]]:
+; CHECK:    [[TMP14:%[A-z0-9]*]] = phi float [ 1.000000e+00, %[[BB1:[A-z0-9]*]] ], [ [[TMP16]], %[[BB2]] ]
+; CHECK:    call void @use.f32(float [[TMP14]])
 ; CHECK:    ret void
 ;
   %1 = fdiv float %a, %b