//===-- AMDGPUCodeGenPrepare.cpp ------------------------------------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This pass does misc. AMDGPU optimizations on IR before instruction
/// selection.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "AMDGPUTargetMachine.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Analysis/DivergenceAnalysis.h"
#include "llvm/Analysis/Loads.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstVisitor.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Operator.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Value.h"
#include "llvm/Pass.h"
#include "llvm/Support/Casting.h"
#include <cassert>
#include <iterator>

#define DEBUG_TYPE "amdgpu-codegenprepare"

using namespace llvm;

namespace {

class AMDGPUCodeGenPrepare : public FunctionPass,
                             public InstVisitor<AMDGPUCodeGenPrepare, bool> {
  const SISubtarget *ST = nullptr;
  DivergenceAnalysis *DA = nullptr;
  Module *Mod = nullptr;
  bool HasUnsafeFPMath = false;
  AMDGPUAS AMDGPUASI;

  /// \returns \p T's base element bit width.
  unsigned getBaseElementBitWidth(const Type *T) const;

  /// \returns Equivalent 32 bit integer type for given type \p T. For example,
  /// if \p T is i7, then i32 is returned; if \p T is <3 x i12>, then <3 x i32>
  /// is returned.
  Type *getI32Ty(IRBuilder<> &B, const Type *T) const;

  /// \returns True if binary operation \p I is a signed binary operation,
  /// false otherwise.
  bool isSigned(const BinaryOperator &I) const;

  /// \returns True if the condition of 'select' operation \p I comes from a
  /// signed 'icmp' operation, false otherwise.
  bool isSigned(const SelectInst &I) const;

  /// \returns True if type \p T needs to be promoted to 32 bit integer type,
  /// false otherwise.
  bool needsPromotionToI32(const Type *T) const;

  /// \brief Promotes uniform binary operation \p I to equivalent 32 bit binary
  /// operation.
  ///
  /// \details \p I's base element bit width must be greater than 1 and less
  /// than or equal to 16. Promotion is done by sign or zero extending operands
  /// to 32 bits, replacing \p I with an equivalent 32 bit binary operation, and
  /// truncating the result of the 32 bit binary operation back to \p I's
  /// original type. Division operations are not promoted.
  ///
  /// \returns True if \p I is promoted to an equivalent 32 bit binary
  /// operation, false otherwise.
  bool promoteUniformOpToI32(BinaryOperator &I) const;
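  //
  // Illustrative sketch only (value names are hypothetical): on a subtarget
  // with 16 bit instructions, a uniform
  //   %r = add i16 %a, %b
  // is rewritten to
  //   %ext0 = zext i16 %a to i32
  //   %ext1 = zext i16 %b to i32
  //   %res = add nuw nsw i32 %ext0, %ext1
  //   %r = trunc i32 %res to i16
  // where 'add' takes zext per isSigned() and receives both nsw and nuw per
  // promotedOpIsNSW() and promotedOpIsNUW(), since the zero extended operands
  // cannot wrap at 32 bits.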

  /// \brief Promotes uniform 'icmp' operation \p I to 32 bit 'icmp' operation.
  ///
  /// \details \p I's base element bit width must be greater than 1 and less
  /// than or equal to 16. Promotion is done by sign or zero extending operands
  /// to 32 bits, and replacing \p I with a 32 bit 'icmp' operation.
  ///
  /// \returns True.
  bool promoteUniformOpToI32(ICmpInst &I) const;

  /// \brief Promotes uniform 'select' operation \p I to 32 bit 'select'
  /// operation.
  ///
  /// \details \p I's base element bit width must be greater than 1 and less
  /// than or equal to 16. Promotion is done by sign or zero extending operands
  /// to 32 bits, replacing \p I with a 32 bit 'select' operation, and
  /// truncating the result of the 32 bit 'select' operation back to \p I's
  /// original type.
  ///
  /// \returns True.
  bool promoteUniformOpToI32(SelectInst &I) const;

  /// \brief Promotes uniform 'bitreverse' intrinsic \p I to 32 bit 'bitreverse'
  /// intrinsic.
  ///
  /// \details \p I's base element bit width must be greater than 1 and less
  /// than or equal to 16. Promotion is done by zero extending the operand to 32
  /// bits, replacing \p I with a 32 bit 'bitreverse' intrinsic, shifting the
  /// result of the 32 bit 'bitreverse' intrinsic to the right with zero fill
  /// (the shift amount is 32 minus \p I's base element bit width), and
  /// truncating the result of the shift operation back to \p I's original type.
  ///
  /// \returns True.
  bool promoteUniformBitreverseToI32(IntrinsicInst &I) const;
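  //
  // Illustrative sketch only (value names are hypothetical): a uniform
  //   %r = call i16 @llvm.bitreverse.i16(i16 %a)
  // is rewritten to
  //   %ext = zext i16 %a to i32
  //   %rev = call i32 @llvm.bitreverse.i32(i32 %ext)
  //   %shr = lshr i32 %rev, 16
  //   %r = trunc i32 %shr to i16
  // where the lshr by 32 - 16 = 16 drops the reversed zero-fill bits.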

  /// \brief Check whether a scalar load can be widened.
  ///
  /// \details A uniform, small-type load from constant memory may be widened
  /// to a full 32-bit scalar load, with the result truncated back to the
  /// original type, to allow a scalar load instead of a vector load.
  ///
  /// \returns True if load \p I can be widened.
  bool canWidenScalarExtLoad(LoadInst &I) const;
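  //
  // Illustrative sketch only (value names are hypothetical, and addrspace(2)
  // is assumed to be this target's constant address space): a uniform
  //   %v = load i8, i8 addrspace(2)* %p, align 4
  // is widened by visitLoadInst to
  //   %cast = bitcast i8 addrspace(2)* %p to i32 addrspace(2)*
  //   %wide = load i32, i32 addrspace(2)* %cast
  //   %v = trunc i32 %wide to i8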

public:
  static char ID;

  AMDGPUCodeGenPrepare() : FunctionPass(ID) {}

  bool visitFDiv(BinaryOperator &I);

  bool visitInstruction(Instruction &I) { return false; }
  bool visitBinaryOperator(BinaryOperator &I);
  bool visitLoadInst(LoadInst &I);
  bool visitICmpInst(ICmpInst &I);
  bool visitSelectInst(SelectInst &I);

  bool visitIntrinsicInst(IntrinsicInst &I);
  bool visitBitreverseIntrinsicInst(IntrinsicInst &I);

  bool doInitialization(Module &M) override;
  bool runOnFunction(Function &F) override;

  StringRef getPassName() const override { return "AMDGPU IR optimizations"; }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<DivergenceAnalysis>();
    AU.setPreservesAll();
  }
};

} // end anonymous namespace

unsigned AMDGPUCodeGenPrepare::getBaseElementBitWidth(const Type *T) const {
  assert(needsPromotionToI32(T) && "T does not need promotion to i32");

  if (T->isIntegerTy())
    return T->getIntegerBitWidth();
  return cast<VectorType>(T)->getElementType()->getIntegerBitWidth();
}

Type *AMDGPUCodeGenPrepare::getI32Ty(IRBuilder<> &B, const Type *T) const {
  assert(needsPromotionToI32(T) && "T does not need promotion to i32");

  if (T->isIntegerTy())
    return B.getInt32Ty();
  return VectorType::get(B.getInt32Ty(), cast<VectorType>(T)->getNumElements());
}

bool AMDGPUCodeGenPrepare::isSigned(const BinaryOperator &I) const {
  return I.getOpcode() == Instruction::AShr ||
      I.getOpcode() == Instruction::SDiv || I.getOpcode() == Instruction::SRem;
}

bool AMDGPUCodeGenPrepare::isSigned(const SelectInst &I) const {
  return isa<ICmpInst>(I.getOperand(0)) ?
      cast<ICmpInst>(I.getOperand(0))->isSigned() : false;
}

bool AMDGPUCodeGenPrepare::needsPromotionToI32(const Type *T) const {
  const IntegerType *IntTy = dyn_cast<IntegerType>(T);
  if (IntTy && IntTy->getBitWidth() > 1 && IntTy->getBitWidth() <= 16)
    return true;

  if (const VectorType *VT = dyn_cast<VectorType>(T)) {
    // TODO: The set of packed operations is more limited, so may want to
    // promote some anyway.
    if (ST->hasVOP3PInsts())
      return false;

    return needsPromotionToI32(VT->getElementType());
  }

  return false;
}

// Return true if the op promoted to i32 should have nsw set.
static bool promotedOpIsNSW(const Instruction &I) {
  switch (I.getOpcode()) {
  case Instruction::Shl:
  case Instruction::Add:
  case Instruction::Sub:
    return true;
  case Instruction::Mul:
    return I.hasNoUnsignedWrap();
  default:
    return false;
  }
}

// Return true if the op promoted to i32 should have nuw set.
static bool promotedOpIsNUW(const Instruction &I) {
  switch (I.getOpcode()) {
  case Instruction::Shl:
  case Instruction::Add:
  case Instruction::Mul:
    return true;
  case Instruction::Sub:
    return I.hasNoUnsignedWrap();
  default:
    return false;
  }
}
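
// Worked example for the two predicates above (a sketch, not from this file):
// a promoted 'sub i16' zero extends both operands (isSigned() treats Sub as
// unsigned), so the 32 bit result always fits in a signed i32 and
// promotedOpIsNSW() returns true for Sub unconditionally; the unsigned
// subtraction can still borrow (e.g. 0 - 1 wraps), so promotedOpIsNUW() keeps
// nuw only when the original operation already had it.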

bool AMDGPUCodeGenPrepare::canWidenScalarExtLoad(LoadInst &I) const {
  Type *Ty = I.getType();
  const DataLayout &DL = Mod->getDataLayout();
  int TySize = DL.getTypeSizeInBits(Ty);
  unsigned Align = I.getAlignment() ?
                   I.getAlignment() : DL.getABITypeAlignment(Ty);

  return I.isSimple() && TySize < 32 && Align >= 4 && DA->isUniform(&I);
}

bool AMDGPUCodeGenPrepare::promoteUniformOpToI32(BinaryOperator &I) const {
  assert(needsPromotionToI32(I.getType()) &&
         "I does not need promotion to i32");

  if (I.getOpcode() == Instruction::SDiv ||
      I.getOpcode() == Instruction::UDiv)
    return false;

  IRBuilder<> Builder(&I);
  Builder.SetCurrentDebugLocation(I.getDebugLoc());

  Type *I32Ty = getI32Ty(Builder, I.getType());
  Value *ExtOp0 = nullptr;
  Value *ExtOp1 = nullptr;
  Value *ExtRes = nullptr;
  Value *TruncRes = nullptr;

  if (isSigned(I)) {
    ExtOp0 = Builder.CreateSExt(I.getOperand(0), I32Ty);
    ExtOp1 = Builder.CreateSExt(I.getOperand(1), I32Ty);
  } else {
    ExtOp0 = Builder.CreateZExt(I.getOperand(0), I32Ty);
    ExtOp1 = Builder.CreateZExt(I.getOperand(1), I32Ty);
  }

  ExtRes = Builder.CreateBinOp(I.getOpcode(), ExtOp0, ExtOp1);
  if (Instruction *Inst = dyn_cast<Instruction>(ExtRes)) {
    if (promotedOpIsNSW(cast<Instruction>(I)))
      Inst->setHasNoSignedWrap();

    if (promotedOpIsNUW(cast<Instruction>(I)))
      Inst->setHasNoUnsignedWrap();

    if (const auto *ExactOp = dyn_cast<PossiblyExactOperator>(&I))
      Inst->setIsExact(ExactOp->isExact());
  }

  TruncRes = Builder.CreateTrunc(ExtRes, I.getType());

  I.replaceAllUsesWith(TruncRes);
  I.eraseFromParent();

  return true;
}

bool AMDGPUCodeGenPrepare::promoteUniformOpToI32(ICmpInst &I) const {
  assert(needsPromotionToI32(I.getOperand(0)->getType()) &&
         "I does not need promotion to i32");

  IRBuilder<> Builder(&I);
  Builder.SetCurrentDebugLocation(I.getDebugLoc());

  Type *I32Ty = getI32Ty(Builder, I.getOperand(0)->getType());
  Value *ExtOp0 = nullptr;
  Value *ExtOp1 = nullptr;
  Value *NewICmp = nullptr;

  if (I.isSigned()) {
    ExtOp0 = Builder.CreateSExt(I.getOperand(0), I32Ty);
    ExtOp1 = Builder.CreateSExt(I.getOperand(1), I32Ty);
  } else {
    ExtOp0 = Builder.CreateZExt(I.getOperand(0), I32Ty);
    ExtOp1 = Builder.CreateZExt(I.getOperand(1), I32Ty);
  }
  NewICmp = Builder.CreateICmp(I.getPredicate(), ExtOp0, ExtOp1);

  I.replaceAllUsesWith(NewICmp);
  I.eraseFromParent();

  return true;
}

bool AMDGPUCodeGenPrepare::promoteUniformOpToI32(SelectInst &I) const {
  assert(needsPromotionToI32(I.getType()) &&
         "I does not need promotion to i32");

  IRBuilder<> Builder(&I);
  Builder.SetCurrentDebugLocation(I.getDebugLoc());

  Type *I32Ty = getI32Ty(Builder, I.getType());
  Value *ExtOp1 = nullptr;
  Value *ExtOp2 = nullptr;
  Value *ExtRes = nullptr;
  Value *TruncRes = nullptr;

  if (isSigned(I)) {
    ExtOp1 = Builder.CreateSExt(I.getOperand(1), I32Ty);
    ExtOp2 = Builder.CreateSExt(I.getOperand(2), I32Ty);
  } else {
    ExtOp1 = Builder.CreateZExt(I.getOperand(1), I32Ty);
    ExtOp2 = Builder.CreateZExt(I.getOperand(2), I32Ty);
  }
  ExtRes = Builder.CreateSelect(I.getOperand(0), ExtOp1, ExtOp2);
  TruncRes = Builder.CreateTrunc(ExtRes, I.getType());

  I.replaceAllUsesWith(TruncRes);
  I.eraseFromParent();

  return true;
}

bool AMDGPUCodeGenPrepare::promoteUniformBitreverseToI32(
    IntrinsicInst &I) const {
  assert(I.getIntrinsicID() == Intrinsic::bitreverse &&
         "I must be bitreverse intrinsic");
  assert(needsPromotionToI32(I.getType()) &&
         "I does not need promotion to i32");

  IRBuilder<> Builder(&I);
  Builder.SetCurrentDebugLocation(I.getDebugLoc());

  Type *I32Ty = getI32Ty(Builder, I.getType());
  Function *I32 =
      Intrinsic::getDeclaration(Mod, Intrinsic::bitreverse, { I32Ty });
  Value *ExtOp = Builder.CreateZExt(I.getOperand(0), I32Ty);
  Value *ExtRes = Builder.CreateCall(I32, { ExtOp });
  Value *LShrOp =
      Builder.CreateLShr(ExtRes, 32 - getBaseElementBitWidth(I.getType()));
  Value *TruncRes =
      Builder.CreateTrunc(LShrOp, I.getType());

  I.replaceAllUsesWith(TruncRes);
  I.eraseFromParent();

  return true;
}

static bool shouldKeepFDivF32(Value *Num, bool UnsafeDiv) {
  const ConstantFP *CNum = dyn_cast<ConstantFP>(Num);
  if (!CNum)
    return false;

  // Reciprocal f32 is handled separately without denormals.
  return UnsafeDiv || CNum->isExactlyValue(+1.0);
}

// Insert an intrinsic for fast fdiv for safe math situations where we can
// reduce precision. Leave fdiv for situations where the generic node is
// expected to be optimized.
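//
// Illustrative sketch only (value names and the metadata id are hypothetical):
//   %r = fdiv float %x, %y, !fpmath !0    ; !0 = !{float 2.500000e+00}
// becomes, when f32 denormals are disabled and no unsafe-math flags apply,
//   %r = call float @llvm.amdgcn.fdiv.fast(float %x, float %y)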
bool AMDGPUCodeGenPrepare::visitFDiv(BinaryOperator &FDiv) {
  Type *Ty = FDiv.getType();

  if (!Ty->getScalarType()->isFloatTy())
    return false;

  MDNode *FPMath = FDiv.getMetadata(LLVMContext::MD_fpmath);
  if (!FPMath)
    return false;

  const FPMathOperator *FPOp = cast<const FPMathOperator>(&FDiv);
  float ULP = FPOp->getFPAccuracy();
  if (ULP < 2.5f)
    return false;

  FastMathFlags FMF = FPOp->getFastMathFlags();
  bool UnsafeDiv = HasUnsafeFPMath || FMF.unsafeAlgebra() ||
                   FMF.allowReciprocal();

  // With UnsafeDiv, the node will be optimized to just rcp and mul.
  if (ST->hasFP32Denormals() || UnsafeDiv)
    return false;

  IRBuilder<> Builder(FDiv.getParent(), std::next(FDiv.getIterator()), FPMath);
  Builder.setFastMathFlags(FMF);
  Builder.SetCurrentDebugLocation(FDiv.getDebugLoc());

  Function *Decl = Intrinsic::getDeclaration(Mod, Intrinsic::amdgcn_fdiv_fast);

  Value *Num = FDiv.getOperand(0);
  Value *Den = FDiv.getOperand(1);

  Value *NewFDiv = nullptr;

  if (VectorType *VT = dyn_cast<VectorType>(Ty)) {
    NewFDiv = UndefValue::get(VT);

    // FIXME: Doesn't do the right thing for cases where the vector is
    // partially constant. This works when the scalarizer pass is run first.
    for (unsigned I = 0, E = VT->getNumElements(); I != E; ++I) {
      Value *NumEltI = Builder.CreateExtractElement(Num, I);
      Value *DenEltI = Builder.CreateExtractElement(Den, I);
      Value *NewElt;

      if (shouldKeepFDivF32(NumEltI, UnsafeDiv)) {
        NewElt = Builder.CreateFDiv(NumEltI, DenEltI);
      } else {
        NewElt = Builder.CreateCall(Decl, { NumEltI, DenEltI });
      }

      NewFDiv = Builder.CreateInsertElement(NewFDiv, NewElt, I);
    }
  } else {
    if (!shouldKeepFDivF32(Num, UnsafeDiv))
      NewFDiv = Builder.CreateCall(Decl, { Num, Den });
  }

  if (NewFDiv) {
    FDiv.replaceAllUsesWith(NewFDiv);
    NewFDiv->takeName(&FDiv);
    FDiv.eraseFromParent();
  }

  // Report a change only when the fdiv was actually replaced; NewFDiv stays
  // null when the scalar fdiv is kept by shouldKeepFDivF32.
  return !!NewFDiv;
}

static bool hasUnsafeFPMath(const Function &F) {
  Attribute Attr = F.getFnAttribute("unsafe-fp-math");
  return Attr.getValueAsString() == "true";
}

bool AMDGPUCodeGenPrepare::visitBinaryOperator(BinaryOperator &I) {
  bool Changed = false;

  if (ST->has16BitInsts() && needsPromotionToI32(I.getType()) &&
      DA->isUniform(&I))
    Changed |= promoteUniformOpToI32(I);

  return Changed;
}

bool AMDGPUCodeGenPrepare::visitLoadInst(LoadInst &I) {
  if (I.getPointerAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS &&
      canWidenScalarExtLoad(I)) {
    IRBuilder<> Builder(&I);
    Builder.SetCurrentDebugLocation(I.getDebugLoc());

    Type *I32Ty = Builder.getInt32Ty();
    Type *PT = PointerType::get(I32Ty, I.getPointerAddressSpace());
    Value *BitCast = Builder.CreateBitCast(I.getPointerOperand(), PT);
    Value *WidenLoad = Builder.CreateLoad(BitCast);

    int TySize = Mod->getDataLayout().getTypeSizeInBits(I.getType());
    Type *IntNTy = Builder.getIntNTy(TySize);
    Value *ValTrunc = Builder.CreateTrunc(WidenLoad, IntNTy);
    Value *ValOrig = Builder.CreateBitCast(ValTrunc, I.getType());
    I.replaceAllUsesWith(ValOrig);
    I.eraseFromParent();
    return true;
  }

  return false;
}

bool AMDGPUCodeGenPrepare::visitICmpInst(ICmpInst &I) {
  bool Changed = false;

  if (ST->has16BitInsts() && needsPromotionToI32(I.getOperand(0)->getType()) &&
      DA->isUniform(&I))
    Changed |= promoteUniformOpToI32(I);

  return Changed;
}

bool AMDGPUCodeGenPrepare::visitSelectInst(SelectInst &I) {
  bool Changed = false;

  if (ST->has16BitInsts() && needsPromotionToI32(I.getType()) &&
      DA->isUniform(&I))
    Changed |= promoteUniformOpToI32(I);

  return Changed;
}

bool AMDGPUCodeGenPrepare::visitIntrinsicInst(IntrinsicInst &I) {
  switch (I.getIntrinsicID()) {
  case Intrinsic::bitreverse:
    return visitBitreverseIntrinsicInst(I);
  default:
    return false;
  }
}

bool AMDGPUCodeGenPrepare::visitBitreverseIntrinsicInst(IntrinsicInst &I) {
  bool Changed = false;

  if (ST->has16BitInsts() && needsPromotionToI32(I.getType()) &&
      DA->isUniform(&I))
    Changed |= promoteUniformBitreverseToI32(I);

  return Changed;
}

bool AMDGPUCodeGenPrepare::doInitialization(Module &M) {
  Mod = &M;
  return false;
}

bool AMDGPUCodeGenPrepare::runOnFunction(Function &F) {
  if (skipFunction(F))
    return false;

  auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
  if (!TPC)
    return false;

  const TargetMachine &TM = TPC->getTM<TargetMachine>();
  ST = &TM.getSubtarget<SISubtarget>(F);
  DA = &getAnalysis<DivergenceAnalysis>();
  HasUnsafeFPMath = hasUnsafeFPMath(F);

  bool MadeChange = false;

  for (BasicBlock &BB : F) {
    BasicBlock::iterator Next;
    for (BasicBlock::iterator I = BB.begin(), E = BB.end(); I != E; I = Next) {
      Next = std::next(I);
      MadeChange |= visit(*I);
    }
  }

  return MadeChange;
}

INITIALIZE_PASS_BEGIN(AMDGPUCodeGenPrepare, DEBUG_TYPE,
                      "AMDGPU IR optimizations", false, false)
INITIALIZE_PASS_DEPENDENCY(DivergenceAnalysis)
INITIALIZE_PASS_END(AMDGPUCodeGenPrepare, DEBUG_TYPE, "AMDGPU IR optimizations",
                    false, false)

char AMDGPUCodeGenPrepare::ID = 0;

FunctionPass *llvm::createAMDGPUCodeGenPreparePass() {
  return new AMDGPUCodeGenPrepare();
}
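
// A minimal usage sketch, not part of this file: running the pass over a
// module with the legacy pass manager. Note that runOnFunction() does nothing
// unless TargetPassConfig (i.e. a codegen pipeline) and DivergenceAnalysis
// are available, so in-tree the pass is added by the AMDGPU target's pass
// pipeline rather than run standalone.
//
//   legacy::FunctionPassManager FPM(&M);
//   FPM.add(createAMDGPUCodeGenPreparePass());
//   FPM.doInitialization();
//   for (Function &F : M)
//     FPM.run(F);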