diff --git a/IGC/Compiler/CISACodeGen/EmitVISAPass.cpp b/IGC/Compiler/CISACodeGen/EmitVISAPass.cpp index b0b3eb4d0c7c..d5772738cd97 100644 --- a/IGC/Compiler/CISACodeGen/EmitVISAPass.cpp +++ b/IGC/Compiler/CISACodeGen/EmitVISAPass.cpp @@ -4330,6 +4330,7 @@ void EmitPass::EmitGenericPointersCmp(llvm::Instruction* inst, void EmitPass::BinaryUnary(llvm::Instruction* inst, const SSource source[2], const DstModifier& modifier) { + switch (inst->getOpcode()) { case Instruction::FCmp: @@ -4361,6 +4362,9 @@ void EmitPass::BinaryUnary(llvm::Instruction* inst, const SSource source[2], con case Instruction::Mul: Mul(source, modifier); break; + case Instruction::FMul: + Mul(source, modifier); + break; case Instruction::Call: EmitAluIntrinsic(cast(inst), source, modifier); break; @@ -4572,6 +4576,15 @@ void EmitPass::Mul64(CVariable* dst, CVariable* src[2], SIMDMode simdMode, bool m_encoder->Push(); } +static unsigned int getVectorSize(Instruction *I) { + IGCLLVM::FixedVectorType *VecType = + llvm::dyn_cast(I->getType()); + if (!VecType) + return 0; + unsigned int NumElements = VecType->getNumElements(); + return NumElements; +} + void EmitPass::Mul(const SSource sources[2], const DstModifier& modifier) { CVariable* src[2]; @@ -4580,6 +4593,28 @@ void EmitPass::Mul(const SSource sources[2], const DstModifier& modifier) src[i] = GetSrcVariable(sources[i]); } + if (IGC_IS_FLAG_ENABLED(EnableVectorEmitter) && sources[0].value->getType()->isVectorTy() && sources[1].value->getType()->isVectorTy()) { + + unsigned int VectorSize = 0; + if (llvm::isa(sources[0].value)) + VectorSize = getVectorSize(llvm::cast(sources[0].value)); + + for (unsigned int i = 0; i < VectorSize; ++i) { + SetSourceModifiers(0, sources[0]); + SetSourceModifiers(1, sources[1]); + + if (src[0]->IsUniform()) { m_encoder->SetSrcSubReg(0, i); } + else m_encoder->SetSrcSubVar(0, i); + if (src[1]->IsUniform()) { m_encoder->SetSrcSubReg(1, i); } + else m_encoder->SetSrcSubVar(1, i); + + m_encoder->SetDstSubVar(i); + m_encoder->Mul(m_destination, src[0], src[1]); + m_encoder->Push(); + } + return; + } + // Only i64 muls need special handling, otherwise go back to standard flow VISA_Type srcType = src[0]->GetType(); if (srcType != ISA_TYPE_Q && srcType != ISA_TYPE_UQ) diff --git a/IGC/Compiler/CISACodeGen/IGCVectorizer.cpp b/IGC/Compiler/CISACodeGen/IGCVectorizer.cpp index 3517f8443ea8..4119be950ac5 100644 --- a/IGC/Compiler/CISACodeGen/IGCVectorizer.cpp +++ b/IGC/Compiler/CISACodeGen/IGCVectorizer.cpp @@ -82,7 +82,7 @@ SPDX-License-Identifier: MIT char IGCVectorizer::ID = 0; #define PASS_FLAG2 "igc-vectorizer" -#define PASS_DESCRIPTION2 "prints register pressure estimation" +#define PASS_DESCRIPTION2 "Vectorizes scalar path around igc vector intrinsics like dpas" #define PASS_CFG_ONLY2 false #define PASS_ANALYSIS2 false IGC_INITIALIZE_PASS_BEGIN(IGCVectorizer, PASS_FLAG2, PASS_DESCRIPTION2, @@ -91,11 +91,12 @@ IGC_INITIALIZE_PASS_DEPENDENCY(CodeGenContextWrapper) IGC_INITIALIZE_PASS_END(IGCVectorizer, PASS_FLAG2, PASS_DESCRIPTION2, PASS_CFG_ONLY2, PASS_ANALYSIS2) +#define OutputLogStreamM OutputLogStream #define DEBUG IGC_IS_FLAG_ENABLED(VectorizerLog) -#define PRINT_LOG(Str) if (DEBUG) OutputLogStream << Str; -#define PRINT_LOG_NL(Str) if (DEBUG) OutputLogStream << Str << "\n"; -#define PRINT_INST(I) if (DEBUG) { I->print(OutputLogStream, false); } -#define PRINT_INST_NL(I) if (DEBUG) { I->print(OutputLogStream, false); OutputLogStream << "\n"; } +#define PRINT_LOG(Str) if (DEBUG) { OutputLogStreamM << Str; writeLog(); } +#define PRINT_LOG_NL(Str) if (DEBUG) { OutputLogStreamM << Str << "\n"; writeLog(); } +#define PRINT_INST(I) if (DEBUG) { I->print(OutputLogStreamM, false); } +#define PRINT_INST_NL(I) if (DEBUG) { if (I) { I->print(OutputLogStreamM, false); } else { PRINT_LOG("NULL"); } OutputLogStreamM << "\n"; } #define PRINT_DS(Str, DS) if (DEBUG) { for (auto DS_EL : DS) { { PRINT_LOG(Str); } { PRINT_INST_NL(DS_EL); } } } IGCVectorizer::IGCVectorizer() : FunctionPass(ID) { @@ -177,35 +178,27 @@ unsigned int getVectorSize(Instruction *I) { return NumElements; } -bool isSafeToVectorize(Instruction *I) { - // this is a very limited approach for vectorizing - // but it's safe - bool Result = llvm::isa(I) || llvm::isa(I) || - llvm::isa(I); +bool isBinarySafe(Instruction *I) { + + bool Result = false; + auto* Binary = llvm::dyn_cast(I); + if (Binary) { + auto OpCode = Binary->getOpcode(); + Result |= OpCode == Instruction::FMul; + } return Result; } -bool IGCVectorizer::compareOperands(Value *A, Value *B) { - Constant *ConstA = llvm::dyn_cast(A); - Constant *ConstB = llvm::dyn_cast(B); - - Instruction *InstA = llvm::dyn_cast(A); - Instruction *InstB = llvm::dyn_cast(B); +bool isSafeToVectorize(Instruction *I) { + // this is a very limited approach for vectorizing but it's safe + bool Result = + llvm::isa(I) || + llvm::isa(I) || + llvm::isa(I) || + isBinarySafe(I); - if (ConstA && ConstB) { - bool BothZero = ConstA->isZeroValue() && ConstB->isZeroValue(); - BothZero &= !(ConstA->isNegativeZeroValue() || ConstB->isNegativeZeroValue()); - return BothZero; - } else if (InstA && InstB) { - if (!ScalarToVector.count(InstA)) { - PRINT_LOG_NL("some elements weren't even vectorized"); - return false; - } - bool Same = ScalarToVector[InstA] == ScalarToVector[InstB]; - return Same; - } - return false; + return Result; } bool IGCVectorizer::handlePHI(VecArr &Slice, Type *VectorType) { @@ -214,30 +207,72 @@ bool IGCVectorizer::handlePHI(VecArr &Slice, Type *VectorType) { if (!checkPHI(ScalarPhi, Slice)) return false; - llvm::Constant *zeroInitializer = - llvm::ConstantAggregateZero::get(VectorType); PHINode *Phi = PHINode::Create(VectorType, 2); + CreatedVectorInstructions.push_back(Phi); Phi->setName("vectorized_phi"); + VecVal Operands; for (auto& BB : ScalarPhi->blocks()) { - Value *Val = ScalarPhi->getIncomingValueForBlock(BB); - Constant *Const = llvm::dyn_cast(Val); - Instruction *Inst = llvm::dyn_cast(Val); - - if (Const && Const->isZeroValue()) - Phi->addIncoming(zeroInitializer, BB); - else if (Inst) - Phi->addIncoming(ScalarToVector[Inst], BB); + + std::vector Elements; + VecArr ForVector; + bool IsConstOperand = true; + bool IsInstOperand = true; + bool IsVectorized = true; + for (auto& El : Slice) { + + PHINode *Phi = static_cast(El); + Value *Val = Phi->getIncomingValueForBlock(BB); + Value *ValCmp = ScalarPhi->getIncomingValueForBlock(BB); + + PRINT_INST(Val); PRINT_LOG(" & "); PRINT_INST_NL(ValCmp); + + Constant *Const = llvm::dyn_cast(Val); + Constant *ConstCmp = llvm::dyn_cast(ValCmp); + IsConstOperand &= Const && ConstCmp; + if (IsConstOperand) { Elements.push_back(Const); } + + Instruction* Inst = llvm::dyn_cast(Val); + Instruction* InstCmp = llvm::dyn_cast(ValCmp); + IsInstOperand &= Inst && InstCmp; + if (IsInstOperand) { + ForVector.push_back(Inst); + IsVectorized &= ScalarToVector.count(Inst) && (ScalarToVector[Inst] == ScalarToVector[InstCmp]); + } + + } + + if (IsConstOperand) { + PRINT_LOG_NL("ConstOperand"); + auto ConstVec = ConstantVector::get(Elements); + Operands.push_back(ConstVec); + } + else if (IsVectorized) { + PRINT_LOG_NL("Vectorized: "); + auto Vectorized = ScalarToVector[ScalarPhi->getIncomingValueForBlock(BB)]; + PRINT_INST_NL(Vectorized); + Operands.push_back(Vectorized); + } + else if (IsInstOperand) { + PRINT_LOG_NL("Created Vector: "); + auto CreatedVec = createVector(ForVector, ForVector.back()->getNextNonDebugInstruction()); + PRINT_INST_NL(CreatedVec); + Operands.push_back(CreatedVec); + } else { - PRINT_LOG_NL("malformed PHI, no vectorization"); + PRINT_LOG_NL("Couldn't create operand array"); return false; } + } - auto BB = ScalarPhi->getParent(); + for (unsigned int i = 0; i < Operands.size(); ++i) { + auto BB = ScalarPhi->getIncomingBlock(i); + Phi->addIncoming(Operands[i], BB); + } + + Phi->insertBefore(ScalarPhi); Phi->setDebugLoc(ScalarPhi->getDebugLoc()); - IGCLLVM::pushFrontInstruction(BB, Phi); - CreatedVectorInstructions.push_back(Phi); for (auto &El : Slice) ScalarToVector[El] = Phi; @@ -262,6 +297,115 @@ bool IGCVectorizer::handleInsertElement(VecArr &Slice, Instruction* Final) { return true; } + +InsertElementInst* IGCVectorizer::createVector(VecArr& Slice, Instruction* InsertPoint) { + + llvm::Type* elementType = Slice[0]->getType(); + llvm::VectorType* vectorType = llvm::FixedVectorType::get(elementType, Slice.size()); + llvm::Value* UndefVector = llvm::UndefValue::get(vectorType); + InsertElementInst* CreatedInsert = nullptr; + + for (size_t i = 0; i < Slice.size(); i++) { + llvm::Value* index = llvm::ConstantInt::get(llvm::Type::getInt32Ty(M->getContext()), i); + // we start insert element with under value + if (CreatedInsert) CreatedInsert = InsertElementInst::Create(CreatedInsert, Slice[i], index); + else CreatedInsert = InsertElementInst::Create(UndefVector, Slice[i], index); + CreatedInsert->setName("vector"); + CreatedInsert->setDebugLoc(Slice[i]->getDebugLoc()); + CreatedInsert->insertBefore(InsertPoint); + CreatedVectorInstructions.push_back(CreatedInsert); + } + + for (auto &El : Slice) + ScalarToVector[El] = CreatedInsert; + return CreatedInsert; +} + +bool IGCVectorizer::handleBinaryInstruction(VecArr &Slice) { + + Value *PrevVectorization = nullptr; + Instruction *First = Slice.front(); + if (ScalarToVector.count(First)) { + auto Vectorized = ScalarToVector[First]; + if (llvm::isa(Vectorized)) { + PRINT_LOG_NL("Was sourced by other vector instruction, but wasn't vectorized"); + PrevVectorization = Vectorized; + } + else { + PRINT_LOG_NL("Already was vectorized by other slice"); + return true; + } + } + VecArr Operands; + for (unsigned int OperNum = 0; OperNum < First->getNumOperands(); ++OperNum) { + Value* Vectorized = checkOperandsToBeVectorized(First, OperNum, Slice); + if (Vectorized) + Operands.push_back(llvm::cast(Vectorized)); + else { + Value* VectorizedOperand = vectorizeSlice(Slice, OperNum); + if (!VectorizedOperand) { PRINT_LOG_NL("Couldn't vectorize Slice"); return false; } + Operands.push_back(llvm::cast(VectorizedOperand)); + } + } + + PRINT_DS("Operands: ", Operands); + + auto BinaryOpcode = llvm::cast(First)->getOpcode(); + + auto* CreatedInst = BinaryOperator::Create(BinaryOpcode, Operands[0], Operands[1]); + CreatedInst->setName("vectorized_binary"); + + CreatedInst->setDebugLoc(First->getDebugLoc()); + CreatedInst->insertAfter(Slice.back()); + CreatedVectorInstructions.push_back(CreatedInst); + + PRINT_LOG("Binary instruction created: "); + PRINT_INST_NL(CreatedInst); + + for (auto &el : Slice) { + if (ScalarToVector.count(el)) { + PRINT_LOG_NL("Vectorized version already present"); + PRINT_INST(el); PRINT_LOG(" --> "); PRINT_INST_NL(ScalarToVector[el]); + } + ScalarToVector[el] = CreatedInst; + } + + if (PrevVectorization) { + PRINT_LOG_NL("Replaced with proper vector version"); + PrevVectorization->replaceAllUsesWith(CreatedInst); + } + + return true; +} + +bool IGCVectorizer::handleCastInstruction(VecArr &Slice) { + + Instruction *First = Slice.front(); + + unsigned int OperNum = 0; + Value* Vectorized = checkOperandsToBeVectorized(First, OperNum, Slice); + if (!Vectorized) Vectorized = vectorizeSlice(Slice, OperNum); + + auto VectorSize = getVectorSize((Instruction* )Vectorized); + auto Type = IGCLLVM::FixedVectorType::get(First->getType(), VectorSize); + auto CastOpcode = llvm::cast(First)->getOpcode(); + + CastInst* CreatedCast = CastInst::Create(CastOpcode, Vectorized, Type); + CreatedCast->setName("vectorized_cast"); + + CreatedCast->setDebugLoc(First->getDebugLoc()); + CreatedCast->insertBefore(First); + CreatedVectorInstructions.push_back(CreatedCast); + + PRINT_LOG("Cast instruction created: "); + PRINT_INST_NL(CreatedCast); + + for (auto &el : Slice) + ScalarToVector[el] = CreatedCast; + + return true; +} + // this basicaly seeds the chain bool IGCVectorizer::handleExtractElement(VecArr &Slice) { Instruction *First = Slice.front(); @@ -275,10 +419,12 @@ bool IGCVectorizer::handleExtractElement(VecArr &Slice) { } bool IGCVectorizer::processChain(InsertStruct &InSt) { - std::reverse(InSt.Chain.begin(), InSt.Chain.end()); + std::reverse(InSt.SlChain.begin(), InSt.SlChain.end()); - for (auto &Slice : InSt.Chain) { + for (auto &SliceSt : InSt.SlChain) { + PRINT_LOG_NL(""); PRINT_LOG_NL("Process slice: "); + VecArr& Slice = SliceSt.Vector; PRINT_DS("Slice: ", Slice); // this contains common checks for any slice @@ -288,6 +434,10 @@ bool IGCVectorizer::processChain(InsertStruct &InSt) { Instruction *First = Slice[0]; if (llvm::isa(First)) { if (!handlePHI(Slice, InSt.Final->getType())) return false; + } else if (llvm::isa(First)) { + if (!handleCastInstruction(Slice)) return false; + } else if (llvm::isa(First)) { + if (!handleBinaryInstruction(Slice)) return false; } else if (llvm::isa(First)) { if (!handleExtractElement(Slice)) return false; } else if (llvm::isa(First)) { @@ -323,86 +473,118 @@ void IGCVectorizer::clusterInsertElement( PRINT_LOG_NL("--------------------------"); } -void IGCVectorizer::collectScalarPath(VecArr &V, VectorSliceChain &Chain) { - typedef std::pair Pair; - std::queue BFSQ; +void IGCVectorizer::printSlice(Slice* S) { + + PRINT_LOG_NL("Slice: [ " << S << " ]"); + PRINT_LOG_NL("OpNum: " << S->OpNum); + PRINT_LOG_NL("Parent: " << S->Parent); + PRINT_DS("Slice: ", S->Vector); +} + +void IGCVectorizer::buildTree(VecArr &V, VecOfSlices& Chain) { - Chain.push_back({}); - for (auto &Insert : V) { - BFSQ.push({Insert, 0}); - Chain[0].push_back(Insert); - } std::unordered_set Explored; + std::queue BFSQ; + + Chain.push_back({ 0, V, nullptr}); + Slice* Root = &Chain.back(); + BFSQ.push(Root); while (!BFSQ.empty()) { - llvm::Instruction *CurrI = BFSQ.front().first; - unsigned int Level = BFSQ.front().second; + Slice *CurSlice = BFSQ.front(); + auto First = CurSlice->Vector.front(); BFSQ.pop(); - for (unsigned int i = 0; i < CurrI->getNumOperands(); ++i) { - Instruction *Op = llvm::dyn_cast(CurrI->getOperand(i)); - if (!Op) - continue; + PRINT_LOG_NL(""); PRINT_LOG("Start: "); PRINT_INST_NL(First); + for (unsigned int OpNum = 0; OpNum < First->getNumOperands(); ++OpNum) { - bool IsConstant = llvm::isa(Op); - bool IsExplored = Explored.count(Op); - bool IsVector = Op->getType()->isVectorTy(); - bool IsNotSafeToVectorize = !isSafeToVectorize(Op); - bool IsExtractEl = llvm::isa(Op); - - if (IsExtractEl) { - if (Chain.size() <= (Level + 1)) - Chain.push_back({}); - Chain[Level + 1].push_back(Op); - InstructionToSlice[Op] = &Chain[Level + 1]; - } + PRINT_LOG("Operand [" << OpNum << "]: "); + Instruction *Cmp = llvm::dyn_cast(First->getOperand(OpNum)); + bool IsSame = true; + if (!Cmp) { IsSame = false; PRINT_LOG_NL("Not an instruction"); continue; } + PRINT_LOG("First: "); PRINT_INST_NL(Cmp); + if (!isSafeToVectorize(Cmp)) { PRINT_LOG_NL(" Not safe to vectorize "); IsSame = false; continue; } - bool Skip = IsConstant || IsExplored || IsVector || IsExtractEl || IsNotSafeToVectorize; - if (Skip) - continue; + VecArr LocalVector; - if (Chain.size() <= (Level + 1)) - Chain.push_back({}); - Chain[Level + 1].push_back(Op); - InstructionToSlice[Op] = &Chain[Level + 1]; + for (auto &El : CurSlice->Vector) { + auto Operand = llvm::dyn_cast(El->getOperand(OpNum)); - Explored.insert(Op); - BFSQ.push({Op, Level + 1}); + if (!Operand) { IsSame = false; break; } + + bool IsExplored = Explored.count(Operand); + if (IsExplored) { IsSame = false; break; } + Explored.insert(Operand); + + IsSame &= Cmp->isSameOperationAs(Operand, false); + if (!IsSame) break; + LocalVector.push_back(Operand); + } + + PRINT_DS(" check: ", LocalVector); + if (IsSame) { + PRINT_LOG_NL("Pushed"); + Chain.push_back({ OpNum, LocalVector, CurSlice}); + BFSQ.push(&Chain.back()); + } } } } - bool IGCVectorizer::checkPHI(Instruction *Compare, VecArr &Slice) { PHINode *ComparePHI = static_cast(Slice[0]); if (ComparePHI->getNumIncomingValues() != 2) { PRINT_LOG_NL("Only 2-way phi supported"); return false; } + return true; +} - for (auto BB : ComparePHI->blocks()) { - for (auto &El : Slice) { - PHINode *ElPHI = static_cast(El); - Value *Val = ElPHI->getIncomingValueForBlock(BB); - Value *CompareVal = ComparePHI->getIncomingValueForBlock(BB); - if (!compareOperands(CompareVal, Val)) { - PRINT_LOG_NL("couldn't vectorize PHI, operands do not converge"); - return false; - } - } +Value* IGCVectorizer::vectorizeSlice(VecArr& Slice, unsigned int OperNum) { + + VecArr NotVectorizedInstruction; + VecConst ConstNotVectorized; + Value* NewVector = nullptr; + + for (auto &El : Slice) { + Value *Val = El->getOperand(OperNum); + PRINT_INST(El); PRINT_LOG(" --> " ); PRINT_INST_NL(Val); + auto Inst = llvm::dyn_cast(Val); + if (Inst) { NotVectorizedInstruction.push_back(Inst); continue; } + auto Const = llvm::dyn_cast(Val); + if (Const) { ConstNotVectorized.push_back(Const); continue; } } - for (auto &BB : ComparePHI->blocks()) { - Value *Val = ComparePHI->getIncomingValueForBlock(BB); - Instruction *Inst = llvm::dyn_cast(Val); - if (Inst) { - if (!ScalarToVector.count(Inst)) { - PRINT_LOG_NL("can't vectorize, operand hasn't been vectorized"); - return false; - } + if (ConstNotVectorized.size() == Slice.size()) { + NewVector = ConstantVector::get(ConstNotVectorized); + PRINT_LOG("New vector created: "); PRINT_INST_NL(NewVector); + } + + if (NotVectorizedInstruction.size() == Slice.size()) { + NewVector = createVector(NotVectorizedInstruction, NotVectorizedInstruction.back()->getNextNonDebugInstruction()); + PRINT_LOG("New vector created: "); PRINT_INST_NL(NewVector); + } + return NewVector; +} + +Value* IGCVectorizer::checkOperandsToBeVectorized(Instruction *First, unsigned int OperNum, VecArr &Slice) { + + Value *Compare = ScalarToVector[First->getOperand(OperNum)]; + if (!Compare) { + PRINT_LOG_NL(" Operand num: " << OperNum << " is not vectorized"); + return nullptr; + } + for (auto &El : Slice) { + Value *Val = El->getOperand(OperNum); + Value *ValCompare = ScalarToVector[Val]; + if (ValCompare != Compare) { + PRINT_LOG("Compare: "); PRINT_INST_NL(Compare); + PRINT_LOG("ValCompare: "); PRINT_INST_NL(ValCompare); + PRINT_LOG_NL("Operands in slice do not converge"); + return nullptr; } } - return true; + return Compare; } bool IGCVectorizer::checkInsertElement(Instruction *First, VecArr &Slice) { @@ -423,17 +605,8 @@ bool IGCVectorizer::checkInsertElement(Instruction *First, VecArr &Slice) { PRINT_LOG_NL("some elements weren't even vectorized"); return false; } - Value *Compare = ScalarToVector[First->getOperand(1)]; - for (auto &El : Slice) { - Value *Val = El->getOperand(1); - Value *ValCompare = ScalarToVector[Val]; - if (ValCompare != Compare) { - PRINT_LOG("InsertCompare: "); PRINT_INST_NL(Compare); - PRINT_LOG("InsertVal: "); PRINT_INST_NL(ValCompare); - PRINT_LOG_NL("Insert Element, operands do not converge"); - return false; - } - } + if (!checkOperandsToBeVectorized(First, 1, Slice)) + return false; return true; } @@ -529,6 +702,7 @@ void IGCVectorizer::collectInstructionToProcess(VecArr &ToProcess, } bool IGCVectorizer::runOnFunction(llvm::Function &F) { + M = F.getParent(); CGCtx = getAnalysis().getCodeGenContext(); initializeLogFile(F); @@ -540,7 +714,13 @@ bool IGCVectorizer::runOnFunction(llvm::Function &F) { writeLog(); - for (auto &El : ToProcess) { + for (unsigned int Ind = 0; Ind < ToProcess.size(); ++Ind) { + + unsigned int Index = IGC_GET_FLAG_VALUE(VectorizerList); + PRINT_LOG_NL(" Index: " << Index << " Ind: " << Ind); + if (Index != Ind && Index != -1) continue; + + auto& El = ToProcess[Ind]; PRINT_LOG("Candidate: "); PRINT_INST_NL(El); @@ -579,20 +759,22 @@ bool IGCVectorizer::runOnFunction(llvm::Function &F) { } writeLog(); - collectScalarPath(InSt.Vec, InSt.Chain); - for (auto &Slice : InSt.Chain) { - PRINT_LOG_NL("Slice:"); - PRINT_DS("--> ", Slice); - } - writeLog(); + buildTree(InSt.Vec, InSt.SlChain); + PRINT_LOG_NL("Print slices"); + for (auto& Slice : InSt.SlChain) { printSlice(&Slice); writeLog(); } CreatedVectorInstructions.clear(); if (!processChain(InSt)) { + writeLog(); + std::reverse(CreatedVectorInstructions.begin(), CreatedVectorInstructions.end()); for (auto& el : CreatedVectorInstructions) { - PRINT_LOG("Cleaned: "); PRINT_INST_NL(el); + PRINT_LOG("Cleaned: "); PRINT_INST_NL(el); writeLog(); el->eraseFromParent(); } } + else { + for (auto& el : CreatedVectorInstructions) { PRINT_LOG("Created: "); PRINT_INST_NL(el); writeLog(); } + } writeLog(); } diff --git a/IGC/Compiler/CISACodeGen/IGCVectorizer.h b/IGC/Compiler/CISACodeGen/IGCVectorizer.h index 6a923b37264e..c74dbb7cefcc 100644 --- a/IGC/Compiler/CISACodeGen/IGCVectorizer.h +++ b/IGC/Compiler/CISACodeGen/IGCVectorizer.h @@ -27,15 +27,28 @@ class IGCVectorizer : public llvm::FunctionPass { typedef llvm::SmallPtrSet ValueSet; typedef llvm::SmallVector VecArr; + typedef llvm::SmallVector VecConst; + typedef llvm::SmallVector VecVal; typedef llvm::SmallVector VectorSliceChain; + + struct Slice { + unsigned int OpNum; + VecArr Vector; + Slice* Parent; + }; + + typedef llvm::SmallVector VecOfSlices; + typedef llvm::SmallVector Tree; typedef std::unordered_map InstructionToSliceMap; + struct InsertStruct { Instruction* Final = nullptr; // contains insert elements VecArr Vec; // contains slices of vector tree VectorSliceChain Chain; + VecOfSlices SlChain; }; CodeGenContext *CGCtx = nullptr; @@ -54,15 +67,20 @@ class IGCVectorizer : public llvm::FunctionPass { std::unique_ptr OutputLogFile; std::string LogStr; llvm::raw_string_ostream OutputLogStream = raw_string_ostream(LogStr); + Module* M; void initializeLogFile(Function& F); void writeLog(); void findInsertElementsInDataFlow(llvm::Instruction* I, VecArr& Chain); void collectScalarPath(VecArr& V, VectorSliceChain& Chain); + void canonicalizeSlices(VectorSliceChain& Chain); bool checkSlice(VecArr& Slice, InsertStruct& InSt); bool processChain(InsertStruct& InSt); void clusterInsertElement(InsertElementInst* VecOfInsert, InsertStruct& InSt); void collectInstructionToProcess(VecArr& ToProcess, Function& F); + void buildTree(VecArr &V, VecOfSlices& Chain); + void printSlice(Slice* S); + bool checkPHI(Instruction* Compare, VecArr& Slice); bool handlePHI(VecArr& Slice, Type* VectorType); @@ -70,8 +88,16 @@ class IGCVectorizer : public llvm::FunctionPass { bool handleInsertElement(VecArr& Slice, Instruction* Final); bool checkExtractElement(Instruction* Compare, VecArr& Slice); bool handleExtractElement(VecArr& Slice); + bool handleCastInstruction(VecArr& Slice); + bool handleBinaryInstruction(VecArr& Slice); + bool checkBinaryOperator(VecArr& Slice); + bool handleIntrinsicInstruction(VecArr& Slice); + + Value* checkOperandsToBeVectorized(Instruction *First, unsigned int OperNum, VecArr &Slice); + Value* vectorizeSlice(VecArr& Slice, unsigned int OperNum); bool compareOperands(Value* A, Value* B); + InsertElementInst* createVector(VecArr& Slice, Instruction* InsertPoint); public: llvm::StringRef getPassName() const override { return "IGCVectorizer"; } diff --git a/IGC/Compiler/CISACodeGen/ShaderCodeGen.cpp b/IGC/Compiler/CISACodeGen/ShaderCodeGen.cpp index 85e18b32471e..44f8d8e87664 100644 --- a/IGC/Compiler/CISACodeGen/ShaderCodeGen.cpp +++ b/IGC/Compiler/CISACodeGen/ShaderCodeGen.cpp @@ -1930,6 +1930,9 @@ void OptimizeIR(CodeGenContext* const pContext) if (IGC_IS_FLAG_ENABLED(EnableVectorizer)) { mpm.add(new IGCVectorizer()); + mpm.add(llvm::createAggressiveDCEPass()); + if (IGC_IS_FLAG_ENABLED(VectorizerCheckScalarizer)) + mpm.add(createScalarizerPass(SelectiveScalarizer::Auto)); } mpm.run(*pContext->getModule()); diff --git a/IGC/Compiler/tests/EmitVISAPass/vectorizer-vector-emission-fmul.ll b/IGC/Compiler/tests/EmitVISAPass/vectorizer-vector-emission-fmul.ll new file mode 100644 index 000000000000..13e5398a1177 --- /dev/null +++ b/IGC/Compiler/tests/EmitVISAPass/vectorizer-vector-emission-fmul.ll @@ -0,0 +1,500 @@ +; UNSUPPORTED: system-windows +; REQUIRES: pvc-supported, regkeys + +; RUN: igc_opt -S -dce -platformpvc -rev-id B -has-emulated-64-bit-insts -igc-emit-visa --regkey=DumpVISAASMToConsole=1 -simd-mode 16 < %s | FileCheck %s + +; CHECK: .decl vectorized_phi v_type=G type=f num_elts=128 align=wordx32 +; CHECK: .decl vector v_type=G type=f num_elts=8 align=dword + +; CHECK: mul (M1, 16) vectorized_phi(0,0)<1> vector(0,0)<0;1,0> vectorized_phi(0,0)<1;1,0> +; CHECK: mul (M1, 16) vectorized_phi(1,0)<1> vector(0,1)<0;1,0> vectorized_phi(1,0)<1;1,0> +; CHECK: mul (M1, 16) vectorized_phi(2,0)<1> vector(0,2)<0;1,0> vectorized_phi(2,0)<1;1,0> +; CHECK: mul (M1, 16) vectorized_phi(3,0)<1> vector(0,3)<0;1,0> vectorized_phi(3,0)<1;1,0> +; CHECK: mul (M1, 16) vectorized_phi(4,0)<1> vector(0,4)<0;1,0> vectorized_phi(4,0)<1;1,0> +; CHECK: mul (M1, 16) vectorized_phi(5,0)<1> vector(0,5)<0;1,0> vectorized_phi(5,0)<1;1,0> +; CHECK: mul (M1, 16) vectorized_phi(6,0)<1> vector(0,6)<0;1,0> vectorized_phi(6,0)<1;1,0> +; CHECK: mul (M1, 16) vectorized_phi(7,0)<1> vector(0,7)<0;1,0> vectorized_phi(7,0)<1;1,0> + + +define spir_kernel void @_attn_fwd(half addrspace(1)* %0, half addrspace(1)* %1, half addrspace(1)* %2, float %3, i8 addrspace(1)* %4, float addrspace(1)* %5, <8 x i32> %r0, <8 x i32> %payloadHeader, i8* %privateBase, i32 %bufferOffset, i32 %bufferOffset1, i32 %bufferOffset2, i32 %bufferOffset3, i32 %bufferOffset4) { + br label %._crit_edge + +._crit_edge: ; preds = %._crit_edge.._crit_edge_crit_edge, %6 + %7 = phi float [ 0.000000e+00, %6 ], [ %7, %._crit_edge.._crit_edge_crit_edge ] + %vectorized_phi = phi <8 x float> [ zeroinitializer, %6 ], [ %8, %._crit_edge.._crit_edge_crit_edge ] + %vector = insertelement <8 x float> zeroinitializer, float 0.000000e+00, i64 0 + %vectorized_binary = fmul <8 x float> %vector, %vectorized_phi + %8 = call <8 x float> @llvm.genx.GenISA.sub.group.dpas.v8f32.v8f32.v8i16.v8i32(<8 x float> %vectorized_binary, <8 x i16> zeroinitializer, <8 x i32> zeroinitializer, i32 0, i32 0, i32 0, i32 0, i1 false) + br label %._crit_edge.._crit_edge_crit_edge + +._crit_edge.._crit_edge_crit_edge: ; preds = %._crit_edge + br label %._crit_edge +} + +declare <8 x float> @llvm.genx.GenISA.sub.group.dpas.v8f32.v8f32.v8i16.v8i32(<8 x float>, <8 x i16>, <8 x i32>, i32, i32, i32, i32, i1) + +!igc.functions = !{!0} +!IGCMetadata = !{!19} + +!0 = !{void (half addrspace(1)*, half addrspace(1)*, half addrspace(1)*, float, i8 addrspace(1)*, float addrspace(1)*, <8 x i32>, <8 x i32>, i8*, i32, i32, i32, i32, i32)* @_attn_fwd, !1} +!1 = !{!2, !3, !17, !18} +!2 = !{!"function_type", i32 0} +!3 = !{!"implicit_arg_desc", !4, !5, !6, !7, !9, !11, !13, !15} +!4 = !{i32 0} +!5 = !{i32 1} +!6 = !{i32 12} +!7 = !{i32 14, !8} +!8 = !{!"explicit_arg_num", i32 0} +!9 = !{i32 14, !10} +!10 = !{!"explicit_arg_num", i32 1} +!11 = !{i32 14, !12} +!12 = !{!"explicit_arg_num", i32 2} +!13 = !{i32 14, !14} +!14 = !{!"explicit_arg_num", i32 4} +!15 = !{i32 14, !16} +!16 = !{!"explicit_arg_num", i32 5} +!17 = !{!"sub_group_size", i32 16} +!18 = !{!"max_reg_pressure", i32 185} +!19 = !{!"ModuleMD", !20, !21, !126, !247, !278, !281, !282, !286, !289, !290, !291, !327, !353, !366, !367, !368, !384, !385, !386, !387, !388, !389, !390, !391, !392, !393, !397, !398, !405, !406, !407, !408, !409, !410, !411, !412, !413, !414, !415, !416, !418, !422, !423, !424, !425, !426, !427, !428, !429, !430, !431, !432, !433, !434, !435, !436, !437, !438, !439, !440, !191, !441, !444, !445, !447, !450, !451, !452, !454, !455, !456, !461, !462} +!20 = !{!"isPrecise", i1 false} +!21 = !{!"compOpt", !22, !23, !24, !25, !26, !27, !28, !29, !30, !31, !32, !33, !34, !35, !36, !37, !38, !39, !40, !41, !42, !43, !44, !45, !46, !47, !48, !49, !50, !51, !52, !53, !54, !55, !56, !57, !58, !59, !60, !61, !62, !63, !64, !65, !66, !67, !68, !69, !70, !71, !72, !73, !74, !75, !76, !77, !78, !79, !80, !81, !82, !83, !84, !85, !86, !87, !88, !89, !90, !91, !92, !93, !94, !95, !96, !97, !98, !99, !100, !101, !103, !104, !105, !106, !107, !108, !109, !110, !111, !112, !113, !114, !115, !116, !117, !118, !119, !120, !121, !122, !123, !124, !125} +!22 = !{!"DenormsAreZero", i1 false} +!23 = !{!"BFTFDenormsAreZero", i1 false} +!24 = !{!"CorrectlyRoundedDivSqrt", i1 false} +!25 = !{!"OptDisable", i1 false} +!26 = !{!"MadEnable", i1 false} +!27 = !{!"NoSignedZeros", i1 false} +!28 = !{!"NoNaNs", i1 false} +!29 = !{!"FloatRoundingMode", i32 0} +!30 = !{!"FloatCvtIntRoundingMode", i32 3} +!31 = !{!"LoadCacheDefault", i32 4} +!32 = !{!"StoreCacheDefault", i32 2} +!33 = !{!"VISAPreSchedRPThreshold", i32 0} +!34 = !{!"SetLoopUnrollThreshold", i32 0} +!35 = !{!"UnsafeMathOptimizations", i1 false} +!36 = !{!"disableCustomUnsafeOpts", i1 false} +!37 = !{!"disableReducePow", i1 false} +!38 = !{!"disableSqrtOpt", i1 false} +!39 = !{!"FiniteMathOnly", i1 false} +!40 = !{!"FastRelaxedMath", i1 false} +!41 = !{!"DashGSpecified", i1 false} +!42 = !{!"FastCompilation", i1 false} +!43 = !{!"UseScratchSpacePrivateMemory", i1 true} +!44 = !{!"RelaxedBuiltins", i1 false} +!45 = !{!"SubgroupIndependentForwardProgressRequired", i1 true} +!46 = !{!"GreaterThan2GBBufferRequired", i1 true} +!47 = !{!"GreaterThan4GBBufferRequired", i1 true} +!48 = !{!"DisableA64WA", i1 false} +!49 = !{!"ForceEnableA64WA", i1 false} +!50 = !{!"PushConstantsEnable", i1 true} +!51 = !{!"HasPositivePointerOffset", i1 false} +!52 = !{!"HasBufferOffsetArg", i1 true} +!53 = !{!"BufferOffsetArgOptional", i1 true} +!54 = !{!"replaceGlobalOffsetsByZero", i1 false} +!55 = !{!"forcePixelShaderSIMDMode", i32 0} +!56 = !{!"ForceGeomFFShaderSIMDMode", i32 0} +!57 = !{!"pixelShaderDoNotAbortOnSpill", i1 false} +!58 = !{!"UniformWGS", i1 false} +!59 = !{!"disableVertexComponentPacking", i1 false} +!60 = !{!"disablePartialVertexComponentPacking", i1 false} +!61 = !{!"PreferBindlessImages", i1 false} +!62 = !{!"UseBindlessMode", i1 false} +!63 = !{!"UseLegacyBindlessMode", i1 true} +!64 = !{!"disableMathRefactoring", i1 false} +!65 = !{!"atomicBranch", i1 false} +!66 = !{!"spillCompression", i1 false} +!67 = !{!"DisableEarlyOut", i1 false} +!68 = !{!"ForceInt32DivRemEmu", i1 false} +!69 = !{!"ForceInt32DivRemEmuSP", i1 false} +!70 = !{!"WaveIntrinsicUsed", i1 false} +!71 = !{!"DisableMultiPolyPS", i1 false} +!72 = !{!"NeedTexture3DLODWA", i1 false} +!73 = !{!"DisableFastestSingleCSSIMD", i1 false} +!74 = !{!"DisableFastestLinearScan", i1 false} +!75 = !{!"UseStatelessforPrivateMemory", i1 false} +!76 = !{!"EnableTakeGlobalAddress", i1 false} +!77 = !{!"IsLibraryCompilation", i1 false} +!78 = !{!"LibraryCompileSIMDSize", i32 0} +!79 = !{!"FastVISACompile", i1 false} +!80 = !{!"MatchSinCosPi", i1 false} +!81 = !{!"ExcludeIRFromZEBinary", i1 false} +!82 = !{!"EmitZeBinVISASections", i1 false} +!83 = !{!"FP64GenEmulationEnabled", i1 false} +!84 = !{!"FP64GenConvEmulationEnabled", i1 false} +!85 = !{!"allowDisableRematforCS", i1 false} +!86 = !{!"DisableIncSpillCostAllAddrTaken", i1 false} +!87 = !{!"DisableCPSOmaskWA", i1 false} +!88 = !{!"DisableFastestGopt", i1 false} +!89 = !{!"WaForceHalfPromotionComputeShader", i1 false} +!90 = !{!"WaForceHalfPromotionPixelVertexShader", i1 false} +!91 = !{!"DisableConstantCoalescing", i1 false} +!92 = !{!"EnableUndefAlphaOutputAsRed", i1 true} +!93 = !{!"WaEnableALTModeVisaWA", i1 false} +!94 = !{!"EnableLdStCombineforLoad", i1 false} +!95 = !{!"EnableLdStCombinewithDummyLoad", i1 false} +!96 = !{!"WaEnableAtomicWaveFusion", i1 false} +!97 = !{!"WaEnableAtomicWaveFusionNonNullResource", i1 false} +!98 = !{!"WaEnableAtomicWaveFusionStateless", i1 false} +!99 = !{!"WaEnableAtomicWaveFusionTyped", i1 false} +!100 = !{!"WaEnableAtomicWaveFusionPartial", i1 false} +!101 = !{!"WaEnableAtomicWaveFusionMoreDimensions", i1 false} +!103 = !{!"WaStoreRawVectorToTypedWrite", i1 false} +!104 = !{!"WaLoadRawVectorToTypedRead", i1 false} +!105 = !{!"WaZeroSLMBeforeUse", i1 false} +!106 = !{!"WaFlagGroupTypedUAVGloballyCoherent", i1 false} +!107 = !{!"EnableFastSampleD", i1 false} +!108 = !{!"NewSpillCostFunction", i1 false} +!109 = !{!"EnableVRT", i1 false} +!110 = !{!"ForceLargeGRFNum4RQ", i1 false} +!111 = !{!"Enable2xGRFRetry", i1 false} +!112 = !{!"Detect2xGRFCandidate", i1 false} +!113 = !{!"EnableURBWritesMerging", i1 true} +!114 = !{!"DisableEUFusion", i1 false} +!115 = !{!"DisableFDivToFMulInvOpt", i1 false} +!116 = !{!"initializePhiSampleSourceWA", i1 false} +!117 = !{!"WaDisableSubspanUseNoMaskForCB", i1 false} +!118 = !{!"DisableLoosenSimd32Occu", i1 false} +!119 = !{!"FastestS1Options", i32 0} +!120 = !{!"DisableFastestForWaveIntrinsicsCS", i1 false} +!121 = !{!"ForceLinearWalkOnLinearUAV", i1 false} +!122 = !{!"DisableLscSamplerRouting", i1 false} +!123 = !{!"UseBarrierControlFlowOptimization", i1 false} +!124 = !{!"disableDynamicRQManagement", i1 false} +!125 = !{!"Quad8InputThreshold", i32 0} +!126 = !{!"FuncMD", !127, !128} +!127 = !{!"FuncMDMap[0]", void (half addrspace(1)*, half addrspace(1)*, half addrspace(1)*, float, i8 addrspace(1)*, float addrspace(1)*, <8 x i32>, <8 x i32>, i8*, i32, i32, i32, i32, i32)* @_attn_fwd} +!128 = !{!"FuncMDValue[0]", !129, !130, !134, !135, !136, !137, !138, !139, !140, !160, !183, !184, !185, !186, !187, !188, !189, !190, !191, !192, !193, !194, !195, !196, !197, !198, !199, !206, !213, !220, !227, !234, !241, !242, !246} +!129 = !{!"localOffsets"} +!130 = !{!"workGroupWalkOrder", !131, !132, !133} +!131 = !{!"dim0", i32 0} +!132 = !{!"dim1", i32 1} +!133 = !{!"dim2", i32 2} +!134 = !{!"funcArgs"} +!135 = !{!"functionType", !"KernelFunction"} +!136 = !{!"inlineDynConstants"} +!137 = !{!"inlineDynRootConstant"} +!138 = !{!"inlineDynConstantDescTable"} +!139 = !{!"m_pInterestingConstants"} +!140 = !{!"rtInfo", !141, !142, !143, !144, !145, !146, !147, !148, !149, !150, !151, !152, !153, !154, !155, !156, !158, !159, !109} +!141 = !{!"callableShaderType", !"NumberOfCallableShaderTypes"} +!142 = !{!"isContinuation", i1 false} +!143 = !{!"hasTraceRayPayload", i1 false} +!144 = !{!"hasHitAttributes", i1 false} +!145 = !{!"hasCallableData", i1 false} +!146 = !{!"ShaderStackSize", i32 0} +!147 = !{!"ShaderHash", i64 0} +!148 = !{!"ShaderName", !""} +!149 = !{!"ParentName", !""} +!150 = !{!"SlotNum", i1* null} +!151 = !{!"NOSSize", i32 0} +!152 = !{!"globalRootSignatureSize", i32 0} +!153 = !{!"Entries"} +!154 = !{!"SpillUnions"} +!155 = !{!"CustomHitAttrSizeInBytes", i32 0} +!156 = !{!"Types", !157} +!157 = !{!"FullFrameTys"} +!158 = !{!"Aliases"} +!159 = !{!"NumGRF", i32 0} +!160 = !{!"resAllocMD", !161, !162, !163, !164, !182} +!161 = !{!"uavsNumType", i32 0} +!162 = !{!"srvsNumType", i32 0} +!163 = !{!"samplersNumType", i32 0} +!164 = !{!"argAllocMDList", !165, !169, !170, !171, !172, !173, !174, !175, !176, !177, !178, !179, !180, !181} +!165 = !{!"argAllocMDListVec[0]", !166, !167, !168} +!166 = !{!"type", i32 0} +!167 = !{!"extensionType", i32 -1} +!168 = !{!"indexType", i32 -1} +!169 = !{!"argAllocMDListVec[1]", !166, !167, !168} +!170 = !{!"argAllocMDListVec[2]", !166, !167, !168} +!171 = !{!"argAllocMDListVec[3]", !166, !167, !168} +!172 = !{!"argAllocMDListVec[4]", !166, !167, !168} +!173 = !{!"argAllocMDListVec[5]", !166, !167, !168} +!174 = !{!"argAllocMDListVec[6]", !166, !167, !168} +!175 = !{!"argAllocMDListVec[7]", !166, !167, !168} +!176 = !{!"argAllocMDListVec[8]", !166, !167, !168} +!177 = !{!"argAllocMDListVec[9]", !166, !167, !168} +!178 = !{!"argAllocMDListVec[10]", !166, !167, !168} +!179 = !{!"argAllocMDListVec[11]", !166, !167, !168} +!180 = !{!"argAllocMDListVec[12]", !166, !167, !168} +!181 = !{!"argAllocMDListVec[13]", !166, !167, !168} +!182 = !{!"inlineSamplersMD"} +!183 = !{!"maxByteOffsets"} +!184 = !{!"IsInitializer", i1 false} +!185 = !{!"IsFinalizer", i1 false} +!186 = !{!"CompiledSubGroupsNumber", i32 0} +!187 = !{!"hasInlineVmeSamplers", i1 false} +!188 = !{!"localSize", i32 0} +!189 = !{!"localIDPresent", i1 false} +!190 = !{!"groupIDPresent", i1 false} +!191 = !{!"privateMemoryPerWI", i32 0} +!192 = !{!"prevFPOffset", i32 0} +!193 = !{!"globalIDPresent", i1 false} +!194 = !{!"hasSyncRTCalls", i1 false} +!195 = !{!"hasNonKernelArgLoad", i1 false} +!196 = !{!"hasNonKernelArgStore", i1 false} +!197 = !{!"hasNonKernelArgAtomic", i1 false} +!198 = !{!"UserAnnotations"} +!199 = !{!"m_OpenCLArgAddressSpaces", !200, !201, !202, !203, !204, !205} +!200 = !{!"m_OpenCLArgAddressSpacesVec[0]", i32 1} +!201 = !{!"m_OpenCLArgAddressSpacesVec[1]", i32 1} +!202 = !{!"m_OpenCLArgAddressSpacesVec[2]", i32 1} +!203 = !{!"m_OpenCLArgAddressSpacesVec[3]", i32 0} +!204 = !{!"m_OpenCLArgAddressSpacesVec[4]", i32 1} +!205 = !{!"m_OpenCLArgAddressSpacesVec[5]", i32 1} +!206 = !{!"m_OpenCLArgAccessQualifiers", !207, !208, !209, !210, !211, !212} +!207 = !{!"m_OpenCLArgAccessQualifiersVec[0]", !"none"} +!208 = !{!"m_OpenCLArgAccessQualifiersVec[1]", !"none"} +!209 = !{!"m_OpenCLArgAccessQualifiersVec[2]", !"none"} +!210 = !{!"m_OpenCLArgAccessQualifiersVec[3]", !"none"} +!211 = !{!"m_OpenCLArgAccessQualifiersVec[4]", !"none"} +!212 = !{!"m_OpenCLArgAccessQualifiersVec[5]", !"none"} +!213 = !{!"m_OpenCLArgTypes", !214, !215, !216, !217, !218, !219} +!214 = !{!"m_OpenCLArgTypesVec[0]", !"half*"} +!215 = !{!"m_OpenCLArgTypesVec[1]", !"half*"} +!216 = !{!"m_OpenCLArgTypesVec[2]", !"half*"} +!217 = !{!"m_OpenCLArgTypesVec[3]", !"float"} +!218 = !{!"m_OpenCLArgTypesVec[4]", !"char*"} +!219 = !{!"m_OpenCLArgTypesVec[5]", !"float*"} +!220 = !{!"m_OpenCLArgBaseTypes", !221, !222, !223, !224, !225, !226} +!221 = !{!"m_OpenCLArgBaseTypesVec[0]", !"half*"} +!222 = !{!"m_OpenCLArgBaseTypesVec[1]", !"half*"} +!223 = !{!"m_OpenCLArgBaseTypesVec[2]", !"half*"} +!224 = !{!"m_OpenCLArgBaseTypesVec[3]", !"float"} +!225 = !{!"m_OpenCLArgBaseTypesVec[4]", !"char*"} +!226 = !{!"m_OpenCLArgBaseTypesVec[5]", !"float*"} +!227 = !{!"m_OpenCLArgTypeQualifiers", !228, !229, !230, !231, !232, !233} +!228 = !{!"m_OpenCLArgTypeQualifiersVec[0]", !""} +!229 = !{!"m_OpenCLArgTypeQualifiersVec[1]", !""} +!230 = !{!"m_OpenCLArgTypeQualifiersVec[2]", !""} +!231 = !{!"m_OpenCLArgTypeQualifiersVec[3]", !""} +!232 = !{!"m_OpenCLArgTypeQualifiersVec[4]", !""} +!233 = !{!"m_OpenCLArgTypeQualifiersVec[5]", !""} +!234 = !{!"m_OpenCLArgNames", !235, !236, !237, !238, !239, !240} +!235 = !{!"m_OpenCLArgNamesVec[0]", !""} +!236 = !{!"m_OpenCLArgNamesVec[1]", !""} +!237 = !{!"m_OpenCLArgNamesVec[2]", !""} +!238 = !{!"m_OpenCLArgNamesVec[3]", !""} +!239 = !{!"m_OpenCLArgNamesVec[4]", !""} +!240 = !{!"m_OpenCLArgNamesVec[5]", !""} +!241 = !{!"m_OpenCLArgScalarAsPointers"} +!242 = !{!"m_OptsToDisablePerFunc", !243, !244, !245} +!243 = !{!"m_OptsToDisablePerFuncSet[0]", !"IGC-AddressArithmeticSinking"} +!244 = !{!"m_OptsToDisablePerFuncSet[1]", !"IGC-AllowSimd32Slicing"} +!245 = !{!"m_OptsToDisablePerFuncSet[2]", !"IGC-SinkLoadOpt"} +!246 = !{!"KABPointerLoc", i1* null} +!247 = !{!"pushInfo", !248, !249, !250, !254, !255, !256, !257, !258, !259, !260, !261, !274, !275, !276, !277} +!248 = !{!"pushableAddresses"} +!249 = !{!"bindlessPushInfo"} +!250 = !{!"dynamicBufferInfo", !251, !252, !253} +!251 = !{!"firstIndex", i32 0} +!252 = !{!"numOffsets", i32 0} +!253 = !{!"forceDisabled", i1 false} +!254 = !{!"MaxNumberOfPushedBuffers", i32 0} +!255 = !{!"inlineConstantBufferSlot", i32 -1} +!256 = !{!"inlineConstantBufferOffset", i32 -1} +!257 = !{!"inlineConstantBufferGRFOffset", i32 -1} +!258 = !{!"constants"} +!259 = !{!"inputs"} +!260 = !{!"constantReg"} +!261 = !{!"simplePushInfoArr", !262, !271, !272, !273} +!262 = !{!"simplePushInfoArrVec[0]", !263, !264, !265, !266, !267, !268, !269, !270} +!263 = !{!"cbIdx", i32 0} +!264 = !{!"pushableAddressGrfOffset", i32 -1} +!265 = !{!"pushableOffsetGrfOffset", i32 -1} +!266 = !{!"offset", i32 0} +!267 = !{!"size", i32 0} +!268 = !{!"isStateless", i1 false} +!269 = !{!"isBindless", i1 false} +!270 = !{!"simplePushLoads"} +!271 = !{!"simplePushInfoArrVec[1]", !263, !264, !265, !266, !267, !268, !269, !270} +!272 = !{!"simplePushInfoArrVec[2]", !263, !264, !265, !266, !267, !268, !269, !270} +!273 = !{!"simplePushInfoArrVec[3]", !263, !264, !265, !266, !267, !268, !269, !270} +!274 = !{!"simplePushBufferUsed", i32 0} +!275 = !{!"pushAnalysisWIInfos"} +!276 = !{!"inlineRTGlobalPtrOffset", i32 0} +!277 = !{!"rtSyncSurfPtrOffset", i32 0} +!278 = !{!"pISAInfo", !279, !280} +!279 = !{!"shaderType", !"UNKNOWN"} +!280 = !{!"URBOutputLength", i32 0} +!281 = !{!"WaEnableICBPromotion", i1 false} +!282 = !{!"vsInfo", !283, !284, !285} +!283 = !{!"DrawIndirectBufferIndex", i32 -1} +!284 = !{!"vertexReordering", i32 -1} +!285 = !{!"MaxNumOfOutputs", i32 0} +!286 = !{!"hsInfo", !287, !288} +!287 = !{!"numPatchAttributesPatchBaseName", !""} +!288 = !{!"numVertexAttributesPatchBaseName", !""} +!289 = !{!"dsInfo", !285} +!290 = !{!"gsInfo", !285} +!291 = !{!"psInfo", !292, !293, !294, !295, !296, !297, !298, !299, !300, !301, !302, !303, !304, !305, !306, !307, !308, !309, !310, !311, !312, !313, !314, !315, !316, !317, !318, !319, !320, !321, !322, !323, !324, !325, !326} +!292 = !{!"BlendStateDisabledMask", i8 0} +!293 = !{!"SkipSrc0Alpha", i1 false} +!294 = !{!"DualSourceBlendingDisabled", i1 false} +!295 = !{!"ForceEnableSimd32", i1 false} +!296 = !{!"DisableSimd32WithDiscard", i1 false} +!297 = !{!"outputDepth", i1 false} +!298 = !{!"outputStencil", i1 false} +!299 = !{!"outputMask", i1 false} +!300 = !{!"blendToFillEnabled", i1 false} +!301 = !{!"forceEarlyZ", i1 false} +!302 = !{!"hasVersionedLoop", i1 false} +!303 = !{!"forceSingleSourceRTWAfterDualSourceRTW", i1 false} +!304 = !{!"requestCPSizeRelevant", i1 false} +!305 = !{!"requestCPSize", i1 false} +!306 = !{!"texelMaskFastClearMode", !"Disabled"} +!307 = !{!"NumSamples", i8 0} +!308 = !{!"blendOptimizationMode"} +!309 = !{!"colorOutputMask"} +!310 = !{!"ProvokingVertexModeNosIndex", i32 0} +!311 = !{!"ProvokingVertexModeNosPatch", !""} +!312 = !{!"ProvokingVertexModeLast", !"Negative"} +!313 = !{!"VertexAttributesBypass", i1 false} +!314 = !{!"LegacyBaryAssignmentDisableLinear", i1 false} +!315 = !{!"LegacyBaryAssignmentDisableLinearNoPerspective", i1 false} +!316 = !{!"LegacyBaryAssignmentDisableLinearCentroid", i1 false} +!317 = !{!"LegacyBaryAssignmentDisableLinearNoPerspectiveCentroid", i1 false} +!318 = !{!"LegacyBaryAssignmentDisableLinearSample", i1 false} +!319 = !{!"LegacyBaryAssignmentDisableLinearNoPerspectiveSample", i1 false} +!320 = !{!"MeshShaderWAPerPrimitiveUserDataEnable", !"Negative"} +!321 = !{!"meshShaderWAPerPrimitiveUserDataEnablePatchName", !""} +!322 = !{!"generatePatchesForRTWriteSends", i1 false} +!323 = !{!"forceVMask", i1 false} +!324 = !{!"WaDisableVRS", i1 false} +!325 = !{!"RelaxMemoryVisibilityFromPSOrdering", i1 false} +!326 = !{!"WaEnableVMaskUnderNonUnifromCF", i1 false} +!327 = !{!"csInfo", !328, !329, !330, !331, !332, !33, !34, !333, !334, !335, !336, !337, !338, !339, !340, !341, !342, !343, !344, !345, !66, !346, !347, !348, !349, !350, !351, !352} +!328 = !{!"maxWorkGroupSize", i32 0} +!329 = !{!"waveSize", i32 0} +!330 = !{!"ComputeShaderSecondCompile"} +!331 = !{!"forcedSIMDSize", i8 0} +!332 = !{!"forceTotalGRFNum", i32 0} +!333 = !{!"forceSpillCompression", i1 false} +!334 = !{!"allowLowerSimd", i1 false} +!335 = !{!"disableSimd32Slicing", i1 false} +!336 = !{!"disableSplitOnSpill", i1 false} +!337 = !{!"enableNewSpillCostFunction", i1 false} +!338 = !{!"forceVISAPreSched", i1 false} +!339 = !{!"forceUniformBuffer", i1 false} +!340 = !{!"forceUniformSurfaceSampler", i1 false} +!341 = !{!"disableLocalIdOrderOptimizations", i1 false} +!342 = !{!"disableDispatchAlongY", i1 false} +!343 = !{!"neededThreadIdLayout", i1* null} +!344 = !{!"forceTileYWalk", i1 false} +!345 = !{!"atomicBranch", i32 0} +!346 = !{!"disableEarlyOut", i1 false} +!347 = !{!"walkOrderEnabled", i1 false} +!348 = !{!"walkOrderOverride", i32 0} +!349 = !{!"ResForHfPacking"} +!350 = !{!"hasWaveMatrix", i1 false} +!351 = !{!"constantFoldSimdSize", i1 false} +!352 = !{!"isNodeShader", i1 false} +!353 = !{!"msInfo", !354, !355, !356, !357, !358, !359, !360, !361, !362, !363, !364, !312, !310, !365} +!354 = !{!"PrimitiveTopology", i32 3} +!355 = !{!"MaxNumOfPrimitives", i32 0} +!356 = !{!"MaxNumOfVertices", i32 0} +!357 = !{!"MaxNumOfPerPrimitiveOutputs", i32 0} +!358 = !{!"MaxNumOfPerVertexOutputs", i32 0} +!359 = !{!"WorkGroupSize", i32 0} +!360 = !{!"WorkGroupMemorySizeInBytes", i32 0} +!361 = !{!"IndexFormat", i32 6} +!362 = !{!"SubgroupSize", i32 0} +!363 = !{!"VPandRTAIndexAutostripEnable", i1 false} +!364 = !{!"MeshShaderWAPerPrimitiveUserDataEnable", i1 false} +!365 = !{!"numPrimitiveAttributesPatchBaseName", !""} +!366 = !{!"taskInfo", !285, !359, !360, !362} +!367 = !{!"NBarrierCnt", i32 0} +!368 = !{!"rtInfo", !369, !370, !371, !372, !373, !374, !375, !376, !377, !378, !379, !380, !381, !382, !383} +!369 = !{!"RayQueryAllocSizeInBytes", i32 0} +!370 = !{!"NumContinuations", i32 0} +!371 = !{!"RTAsyncStackAddrspace", i32 -1} +!372 = !{!"RTAsyncStackSurfaceStateOffset", i1* null} +!373 = !{!"SWHotZoneAddrspace", i32 -1} +!374 = !{!"SWHotZoneSurfaceStateOffset", i1* null} +!375 = !{!"SWStackAddrspace", i32 -1} +!376 = !{!"SWStackSurfaceStateOffset", i1* null} +!377 = !{!"RTSyncStackAddrspace", i32 -1} +!378 = !{!"RTSyncStackSurfaceStateOffset", i1* null} +!379 = !{!"doSyncDispatchRays", i1 false} +!380 = !{!"MemStyle", !"Xe"} +!381 = !{!"GlobalDataStyle", !"Xe"} +!382 = !{!"NeedsBTD", i1 true} +!383 = !{!"uberTileDimensions", i1* null} +!384 = !{!"EnableTextureIndirection", i1 false} +!385 = !{!"EnableSamplerIndirection", i1 false} +!386 = !{!"samplerStateStride", i32 0} +!387 = !{!"samplerStateOffset", i32 0} +!388 = !{!"textureStateStride", i32 0} +!389 = !{!"textureStateOffset", i32 0} +!390 = !{!"CurUniqueIndirectIdx", i32 0} +!391 = !{!"inlineDynTextures"} +!392 = !{!"inlineResInfoData"} +!393 = !{!"immConstant", !394, !395, !396} +!394 = !{!"data"} +!395 = !{!"sizes"} +!396 = !{!"zeroIdxs"} +!397 = !{!"stringConstants"} +!398 = !{!"inlineBuffers", !399, !403, !404} +!399 = !{!"inlineBuffersVec[0]", !400, !401, !402} +!400 = !{!"alignment", i32 0} +!401 = !{!"allocSize", i64 0} +!402 = !{!"Buffer"} +!403 = !{!"inlineBuffersVec[1]", !400, !401, !402} +!404 = !{!"inlineBuffersVec[2]", !400, !401, !402} +!405 = !{!"GlobalPointerProgramBinaryInfos"} +!406 = !{!"ConstantPointerProgramBinaryInfos"} +!407 = !{!"GlobalBufferAddressRelocInfo"} +!408 = !{!"ConstantBufferAddressRelocInfo"} +!409 = !{!"forceLscCacheList"} +!410 = !{!"SrvMap"} +!411 = !{!"RootConstantBufferOffsetInBytes"} +!412 = !{!"RasterizerOrderedByteAddressBuffer"} +!413 = !{!"RasterizerOrderedViews"} +!414 = !{!"MinNOSPushConstantSize", i32 0} +!415 = !{!"inlineProgramScopeOffsets"} +!416 = !{!"shaderData", !417} +!417 = !{!"numReplicas", i32 0} +!418 = !{!"URBInfo", !419, !420, !421} +!419 = !{!"has64BVertexHeaderInput", i1 false} +!420 = !{!"has64BVertexHeaderOutput", i1 false} +!421 = !{!"hasVertexHeader", i1 true} +!422 = !{!"m_ForcePullModel", i1 false} +!423 = !{!"UseBindlessImage", i1 false} +!424 = !{!"enableRangeReduce", i1 false} +!425 = !{!"disableNewTrigFuncRangeReduction", i1 false} +!426 = !{!"enableFRemToSRemOpt", i1 false} +!427 = !{!"enableSampleptrToLdmsptrSample0", i1 false} +!428 = !{!"enableSampleLptrToLdmsptrSample0", i1 false} +!429 = !{!"WaForceSIMD32MicropolyRasterize", i1 false} +!430 = !{!"allowMatchMadOptimizationforVS", i1 false} +!431 = !{!"disableMatchMadOptimizationForCS", i1 false} +!432 = !{!"disableMemOptforNegativeOffsetLoads", i1 false} +!433 = !{!"enableThreeWayLoadSpiltOpt", i1 false} +!434 = !{!"statefulResourcesNotAliased", i1 false} +!435 = !{!"disableMixMode", i1 false} +!436 = !{!"genericAccessesResolved", i1 false} +!437 = !{!"disableSeparateSpillPvtScratchSpace", i1 false} +!438 = !{!"enableSeparateSpillPvtScratchSpace", i1 false} +!439 = !{!"disableSeparateScratchWA", i1 false} +!440 = !{!"enableRemoveUnusedTGMFence", i1 false} +!441 = !{!"PrivateMemoryPerFG", !442, !443} +!442 = !{!"PrivateMemoryPerFGMap[0]", void (half addrspace(1)*, half addrspace(1)*, half addrspace(1)*, float, i8 addrspace(1)*, float addrspace(1)*, <8 x i32>, <8 x i32>, i8*, i32, i32, i32, i32, i32)* @_attn_fwd} +!443 = !{!"PrivateMemoryPerFGValue[0]", i32 0} +!444 = !{!"m_OptsToDisable"} +!445 = !{!"capabilities", !446} +!446 = !{!"globalVariableDecorationsINTEL", i1 false} +!447 = !{!"m_ShaderResourceViewMcsMask", !448, !449} +!448 = !{!"m_ShaderResourceViewMcsMaskVec[0]", i64 0} +!449 = !{!"m_ShaderResourceViewMcsMaskVec[1]", i64 0} +!450 = !{!"computedDepthMode", i32 0} +!451 = !{!"isHDCFastClearShader", i1 false} +!452 = !{!"argRegisterReservations", !453} +!453 = !{!"argRegisterReservationsVec[0]", i32 0} +!454 = !{!"SIMD16_SpillThreshold", i8 0} +!455 = !{!"SIMD32_SpillThreshold", i8 0} +!456 = !{!"m_CacheControlOption", !457, !458, !459, !460} +!457 = !{!"LscLoadCacheControlOverride", i8 0} +!458 = !{!"LscStoreCacheControlOverride", i8 0} +!459 = !{!"TgmLoadCacheControlOverride", i8 0} +!460 = !{!"TgmStoreCacheControlOverride", i8 0} +!461 = !{!"ModuleUsesBindless", i1 false} +!462 = !{!"predicationMap"} diff --git a/IGC/Compiler/tests/IGCVectorizer/vectorizer-test-binary-fmul.ll b/IGC/Compiler/tests/IGCVectorizer/vectorizer-test-binary-fmul.ll new file mode 100644 index 000000000000..281f046da76d --- /dev/null +++ b/IGC/Compiler/tests/IGCVectorizer/vectorizer-test-binary-fmul.ll @@ -0,0 +1,85 @@ +; UNSUPPORTED: system-windows +; REQUIRES: pvc-supported, regkeys + +; RUN: igc_opt -S --igc-vectorizer -dce < %s 2>&1 | FileCheck %s + +; CHECK: %vectorized_phi +; CHECK: %vector = insertelement <8 x float> undef +; CHECK: %vector1 = insertelement <8 x float> %vector +; CHECK: %vector2 = insertelement <8 x float> %vector1 +; CHECK: %vector3 = insertelement <8 x float> %vector2 +; CHECK: %vector4 = insertelement <8 x float> %vector3 +; CHECK: %vector5 = insertelement <8 x float> %vector4 +; CHECK: %vector6 = insertelement <8 x float> %vector5 +; CHECK: %vector7 = insertelement <8 x float> %vector6 +; CHECK: %vectorized_binary = fmul <8 x float> %vector7, %vectorized_phi +; CHECK: call <8 x float> @llvm.genx.GenISA.sub.group.dpas.v8f32.v8f32.v8i16.v8i32(<8 x float> %vectorized_binary + +; ModuleID = 'reduced.ll' +source_filename = "initial_test.ll" +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64-v96:128:128-v128:128:128-v192:256:256-v256:256:256-v512:512:512-v1024:1024:1024-n8:16:32" +target triple = "spir64-unknown-unknown" + +; Function Attrs: convergent nounwind +define spir_kernel void @_attn_fwd() #0 { + br label %._crit_edge + +._crit_edge: ; preds = %._crit_edge, %0 + %1 = phi float [ 0.000000e+00, %0 ], [ %35, %._crit_edge ] + %2 = phi float [ 0.000000e+00, %0 ], [ %36, %._crit_edge ] + %3 = phi float [ 0.000000e+00, %0 ], [ %37, %._crit_edge ] + %4 = phi float [ 0.000000e+00, %0 ], [ %38, %._crit_edge ] + %5 = phi float [ 0.000000e+00, %0 ], [ %39, %._crit_edge ] + %6 = phi float [ 0.000000e+00, %0 ], [ %40, %._crit_edge ] + %7 = phi float [ 0.000000e+00, %0 ], [ %41, %._crit_edge ] + %8 = phi float [ 0.000000e+00, %0 ], [ %42, %._crit_edge ] + %9 = call float @llvm.exp2.f32(float 0.000000e+00) + %10 = call float @llvm.exp2.f32(float 0.000000e+00) + %11 = call float @llvm.exp2.f32(float 0.000000e+00) + %12 = call float @llvm.exp2.f32(float 0.000000e+00) + %13 = call float @llvm.exp2.f32(float 0.000000e+00) + %14 = call float @llvm.exp2.f32(float 0.000000e+00) + %15 = call float @llvm.exp2.f32(float 0.000000e+00) + %16 = call float @llvm.exp2.f32(float 0.000000e+00) + %17 = fmul fast float %9, %1 + %18 = fmul fast float %10, %2 + %19 = fmul fast float %11, %3 + %20 = fmul fast float %12, %4 + %21 = fmul fast float %13, %5 + %22 = fmul fast float %14, %6 + %23 = fmul fast float %15, %7 + %24 = fmul fast float %16, %8 + %25 = insertelement <8 x float> zeroinitializer, float %17, i64 0 + %26 = insertelement <8 x float> %25, float %18, i64 1 + %27 = insertelement <8 x float> %26, float %19, i64 2 + %28 = insertelement <8 x float> %27, float %20, i64 3 + %29 = insertelement <8 x float> %28, float %21, i64 4 + %30 = insertelement <8 x float> %29, float %22, i64 5 + %31 = insertelement <8 x float> %30, float %23, i64 6 + %32 = insertelement <8 x float> %31, float %24, i64 7 + %33 = call <8 x float> @llvm.genx.GenISA.sub.group.dpas.v8f32.v8f32.v8i16.v8i32(<8 x float> %32, <8 x i16> zeroinitializer, <8 x i32> zeroinitializer, i32 0, i32 0, i32 0, i32 0, i1 false) + %34 = call <8 x float> @llvm.genx.GenISA.sub.group.dpas.v8f32.v8f32.v8i16.v8i32(<8 x float> %33, <8 x i16> zeroinitializer, <8 x i32> zeroinitializer, i32 0, i32 0, i32 0, i32 0, i1 false) + %35 = extractelement <8 x float> %34, i64 0 + %36 = extractelement <8 x float> %34, i64 1 + %37 = extractelement <8 x float> %34, i64 2 + %38 = extractelement <8 x float> %34, i64 3 + %39 = extractelement <8 x float> %34, i64 4 + %40 = extractelement <8 x float> %34, i64 5 + %41 = extractelement <8 x float> %34, i64 6 + %42 = extractelement <8 x float> %34, i64 7 + br label %._crit_edge +} + +; Function Attrs: convergent nounwind readnone willreturn +declare <8 x float> @llvm.genx.GenISA.sub.group.dpas.v8f32.v8f32.v8i16.v8i32(<8 x float>, <8 x i16>, <8 x i32>, i32, i32, i32, i32, i1) #1 + +; Function Attrs: nofree nosync nounwind readnone speculatable willreturn +declare float @llvm.exp2.f32(float) #2 + +; uselistorder directives +uselistorder <8 x float> (<8 x float>, <8 x i16>, <8 x i32>, i32, i32, i32, i32, i1)* @llvm.genx.GenISA.sub.group.dpas.v8f32.v8f32.v8i16.v8i32, { 1, 0 } +uselistorder float (float)* @llvm.exp2.f32, { 7, 6, 5, 4, 3, 2, 1, 0 } + +attributes #0 = { convergent nounwind } +attributes #1 = { convergent nounwind readnone willreturn } +attributes #2 = { nofree nosync nounwind readnone speculatable willreturn } diff --git a/IGC/Compiler/tests/IGCVectorizer/vectorizer-test-dpas-phi-negative-zero.ll b/IGC/Compiler/tests/IGCVectorizer/vectorizer-test-dpas-phi-negative-zero.ll deleted file mode 100644 index b3ef5cc7695f..000000000000 --- a/IGC/Compiler/tests/IGCVectorizer/vectorizer-test-dpas-phi-negative-zero.ll +++ /dev/null @@ -1,108 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: igc_opt --igc-vectorizer -S -dce < %s 2>&1 | FileCheck %s - -define spir_kernel void @quux() { -; CHECK-LABEL: @quux( -; CHECK-NEXT: bb43: -; CHECK-NEXT: br label [[BB123:%.*]] -; CHECK: bb60: -; CHECK-NEXT: br label [[BB88:%.*]] -; CHECK: bb88: -; CHECK-NEXT: [[TMP90:%.*]] = phi float [ 0.000000e+00, [[BB60:%.*]] ], [ [[TMP114:%.*]], [[BB88]] ] -; CHECK-NEXT: [[TMP91:%.*]] = phi float [ -0.000000e+00, [[BB60]] ], [ [[TMP115:%.*]], [[BB88]] ] -; CHECK-NEXT: [[TMP92:%.*]] = phi float [ 0.000000e+00, [[BB60]] ], [ [[TMP116:%.*]], [[BB88]] ] -; CHECK-NEXT: [[TMP93:%.*]] = phi float [ 0.000000e+00, [[BB60]] ], [ [[TMP117:%.*]], [[BB88]] ] -; CHECK-NEXT: [[TMP94:%.*]] = phi float [ 0.000000e+00, [[BB60]] ], [ [[TMP118:%.*]], [[BB88]] ] -; CHECK-NEXT: [[TMP95:%.*]] = phi float [ 0.000000e+00, [[BB60]] ], [ [[TMP119:%.*]], [[BB88]] ] -; CHECK-NEXT: [[TMP96:%.*]] = phi float [ 0.000000e+00, [[BB60]] ], [ [[TMP120:%.*]], [[BB88]] ] -; CHECK-NEXT: [[TMP97:%.*]] = phi float [ 0.000000e+00, [[BB60]] ], [ [[TMP121:%.*]], [[BB88]] ] -; CHECK-NEXT: [[TMP104:%.*]] = insertelement <8 x float> zeroinitializer, float [[TMP90]], i64 0 -; CHECK-NEXT: [[TMP105:%.*]] = insertelement <8 x float> [[TMP104]], float [[TMP91]], i64 1 -; CHECK-NEXT: [[TMP106:%.*]] = insertelement <8 x float> [[TMP105]], float [[TMP92]], i64 2 -; CHECK-NEXT: [[TMP107:%.*]] = insertelement <8 x float> [[TMP106]], float [[TMP93]], i64 3 -; CHECK-NEXT: [[TMP108:%.*]] = insertelement <8 x float> [[TMP107]], float [[TMP94]], i64 4 -; CHECK-NEXT: [[TMP109:%.*]] = insertelement <8 x float> [[TMP108]], float [[TMP95]], i64 5 -; CHECK-NEXT: [[TMP110:%.*]] = insertelement <8 x float> [[TMP109]], float [[TMP96]], i64 6 -; CHECK-NEXT: [[TMP111:%.*]] = insertelement <8 x float> [[TMP110]], float [[TMP97]], i64 7 -; CHECK-NEXT: [[TMP112:%.*]] = call <8 x float> @llvm.genx.GenISA.sub.group.dpas.v8f32.v8f32.v8i16.v8i32(<8 x float> [[TMP111]], <8 x i16> zeroinitializer, <8 x i32> zeroinitializer, i32 0, i32 0, i32 0, i32 0, i1 false) -; CHECK-NEXT: [[TMP113:%.*]] = call <8 x float> @llvm.genx.GenISA.sub.group.dpas.v8f32.v8f32.v8i16.v8i32(<8 x float> zeroinitializer, <8 x i16> zeroinitializer, <8 x i32> zeroinitializer, i32 0, i32 0, i32 0, i32 0, i1 false) -; CHECK-NEXT: [[TMP114]] = extractelement <8 x float> [[TMP113]], i64 0 -; CHECK-NEXT: [[TMP115]] = extractelement <8 x float> [[TMP113]], i64 1 -; CHECK-NEXT: [[TMP116]] = extractelement <8 x float> [[TMP113]], i64 2 -; CHECK-NEXT: [[TMP117]] = extractelement <8 x float> [[TMP113]], i64 3 -; CHECK-NEXT: [[TMP118]] = extractelement <8 x float> [[TMP113]], i64 4 -; CHECK-NEXT: [[TMP119]] = extractelement <8 x float> [[TMP113]], i64 5 -; CHECK-NEXT: [[TMP120]] = extractelement <8 x float> [[TMP113]], i64 6 -; CHECK-NEXT: [[TMP121]] = extractelement <8 x float> [[TMP113]], i64 7 -; CHECK-NEXT: br i1 false, label [[BB88]], label [[BB123]] -; CHECK: bb123: -; CHECK-NEXT: [[VECTORIZED_PHI:%.*]] = phi <8 x float> [ zeroinitializer, [[BB43:%.*]] ], [ [[TMP113]], [[BB88]] ] -; CHECK-NEXT: [[TMP151:%.*]] = bitcast <8 x float> [[VECTORIZED_PHI]] to <8 x i32> -; CHECK-NEXT: call void @llvm.genx.GenISA.LSC2DBlockWrite.v8i32(i64 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i1 false, i1 false, i32 0, <8 x i32> [[TMP151]]) -; CHECK-NEXT: ret void -; -bb43: - br label %bb123 - -bb60: ; No predecessors! - br label %bb88 - -bb88: ; preds = %bb88, %bb60 - %tmp90 = phi float [ 0.000000e+00, %bb60 ], [ %tmp114, %bb88 ] - %tmp91 = phi float [ -0.000000e+00, %bb60 ], [ %tmp115, %bb88 ] - %tmp92 = phi float [ 0.000000e+00, %bb60 ], [ %tmp116, %bb88 ] - %tmp93 = phi float [ 0.000000e+00, %bb60 ], [ %tmp117, %bb88 ] - %tmp94 = phi float [ 0.000000e+00, %bb60 ], [ %tmp118, %bb88 ] - %tmp95 = phi float [ 0.000000e+00, %bb60 ], [ %tmp119, %bb88 ] - %tmp96 = phi float [ 0.000000e+00, %bb60 ], [ %tmp120, %bb88 ] - %tmp97 = phi float [ 0.000000e+00, %bb60 ], [ %tmp121, %bb88 ] - %tmp104 = insertelement <8 x float> zeroinitializer, float %tmp90, i64 0 - %tmp105 = insertelement <8 x float> %tmp104, float %tmp91, i64 1 - %tmp106 = insertelement <8 x float> %tmp105, float %tmp92, i64 2 - %tmp107 = insertelement <8 x float> %tmp106, float %tmp93, i64 3 - %tmp108 = insertelement <8 x float> %tmp107, float %tmp94, i64 4 - %tmp109 = insertelement <8 x float> %tmp108, float %tmp95, i64 5 - %tmp110 = insertelement <8 x float> %tmp109, float %tmp96, i64 6 - %tmp111 = insertelement <8 x float> %tmp110, float %tmp97, i64 7 - %tmp112 = call <8 x float> @llvm.genx.GenISA.sub.group.dpas.v8f32.v8f32.v8i16.v8i32(<8 x float> %tmp111, <8 x i16> zeroinitializer, <8 x i32> zeroinitializer, i32 0, i32 0, i32 0, i32 0, i1 false) - %tmp113 = call <8 x float> @llvm.genx.GenISA.sub.group.dpas.v8f32.v8f32.v8i16.v8i32(<8 x float> zeroinitializer, <8 x i16> zeroinitializer, <8 x i32> zeroinitializer, i32 0, i32 0, i32 0, i32 0, i1 false) - %tmp114 = extractelement <8 x float> %tmp113, i64 0 - %tmp115 = extractelement <8 x float> %tmp113, i64 1 - %tmp116 = extractelement <8 x float> %tmp113, i64 2 - %tmp117 = extractelement <8 x float> %tmp113, i64 3 - %tmp118 = extractelement <8 x float> %tmp113, i64 4 - %tmp119 = extractelement <8 x float> %tmp113, i64 5 - %tmp120 = extractelement <8 x float> %tmp113, i64 6 - %tmp121 = extractelement <8 x float> %tmp113, i64 7 - br i1 false, label %bb88, label %bb123 - -bb123: ; preds = %bb88, %bb43 - %tmp133 = phi float [ 0.000000e+00, %bb43 ], [ %tmp114, %bb88 ] - %tmp134 = phi float [ 0.000000e+00, %bb43 ], [ %tmp115, %bb88 ] - %tmp135 = phi float [ 0.000000e+00, %bb43 ], [ %tmp116, %bb88 ] - %tmp136 = phi float [ 0.000000e+00, %bb43 ], [ %tmp117, %bb88 ] - %tmp137 = phi float [ 0.000000e+00, %bb43 ], [ %tmp118, %bb88 ] - %tmp138 = phi float [ 0.000000e+00, %bb43 ], [ %tmp119, %bb88 ] - %tmp139 = phi float [ 0.000000e+00, %bb43 ], [ %tmp120, %bb88 ] - %tmp140 = phi float [ 0.000000e+00, %bb43 ], [ %tmp121, %bb88 ] - %tmp143 = insertelement <8 x float> zeroinitializer, float %tmp133, i64 0 - %tmp144 = insertelement <8 x float> %tmp143, float %tmp134, i64 1 - %tmp145 = insertelement <8 x float> %tmp144, float %tmp135, i64 2 - %tmp146 = insertelement <8 x float> %tmp145, float %tmp136, i64 3 - %tmp147 = insertelement <8 x float> %tmp146, float %tmp137, i64 4 - %tmp148 = insertelement <8 x float> %tmp147, float %tmp138, i64 5 - %tmp149 = insertelement <8 x float> %tmp148, float %tmp139, i64 6 - %tmp150 = insertelement <8 x float> %tmp149, float %tmp140, i64 7 - %tmp151 = bitcast <8 x float> %tmp150 to <8 x i32> - call void @llvm.genx.GenISA.LSC2DBlockWrite.v8i32(i64 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i1 false, i1 false, i32 0, <8 x i32> %tmp151) - ret void -} - -declare <8 x float> @llvm.genx.GenISA.sub.group.dpas.v8f32.v8f32.v8i16.v8i32(<8 x float>, <8 x i16>, <8 x i32>, i32, i32, i32, i32, i1) - -declare <8 x i16> @llvm.genx.GenISA.LSC2DBlockRead.v8i16(i64, i32, i32, i32, i32, i32, i32, i32, i32, i32, i1, i1, i32) - -declare <8 x i32> @llvm.genx.GenISA.LSC2DBlockRead.v8i32(i64, i32, i32, i32, i32, i32, i32, i32, i32, i32, i1, i1, i32) - -declare void @llvm.genx.GenISA.LSC2DBlockWrite.v8i32(i64, i32, i32, i32, i32, i32, i32, i32, i32, i32, i1, i1, i32, <8 x i32>) - diff --git a/IGC/common/igc_flags.h b/IGC/common/igc_flags.h index 72bddcc985fc..ce0f2712d706 100644 --- a/IGC/common/igc_flags.h +++ b/IGC/common/igc_flags.h @@ -497,6 +497,10 @@ DECLARE_IGC_REGKEY(DWORD,MaxLiveOutThreshold, 0, "Max LiveOut Thre DECLARE_IGC_REGKEY(bool, DisableScalarAtomics, false, "Disable the Scalar Atomics optimization", false) DECLARE_IGC_REGKEY(bool, EnableScalarTypedAtomics, true, "Enable the Scalar Typed Atomics optimization", false) DECLARE_IGC_REGKEY(bool, EnableVectorizer, true, "Enable IGCVectorizer pass", false) +DECLARE_IGC_REGKEY(bool, VectorizerCheckScalarizer, false, "Add scalariser after vectorizer to check performance", true) +DECLARE_IGC_REGKEY(DWORD, VectorizerList, -1, "Vectorize only one seed instruction with the provided number", true) +DECLARE_IGC_REGKEY(bool, EnableVectorEmitter, true, "Enable Vector Emission for a vectorizer", true) +DECLARE_IGC_REGKEY(bool, VectorizerAllowFDIV, true, "Allow FDIV instructions inside vectorizer", true) DECLARE_IGC_REGKEY(bool, DisableOCLScalarizer, false, "Disable ScalarizeFunction pass in OCL pipeline", true) DECLARE_IGC_REGKEY(bool, DisablePHIScalarization, false, "Disable scalarization of PHINode instructions", true) DECLARE_IGC_REGKEY(bool, EnableSelectiveScalarizer, false, "enable selective scalarizer on GPGPU path", true)