/Users/buildslave/jenkins/sharedspace/clang-stage2-coverage-R@2/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
Line | Count | Source |
1 | | //===- AMDGPUTargetTransformInfo.cpp - AMDGPU specific TTI pass -----------===// |
2 | | // |
3 | | // The LLVM Compiler Infrastructure |
4 | | // |
5 | | // This file is distributed under the University of Illinois Open Source |
6 | | // License. See LICENSE.TXT for details. |
7 | | // |
8 | | //===----------------------------------------------------------------------===// |
9 | | // |
10 | | // \file |
11 | | // This file implements a TargetTransformInfo analysis pass specific to the |
12 | | // AMDGPU target machine. It uses the target's detailed information to provide |
13 | | // more precise answers to certain TTI queries, while letting the target |
14 | | // independent and default TTI implementations handle the rest. |
15 | | // |
16 | | //===----------------------------------------------------------------------===// |
17 | | |
18 | | #include "AMDGPUTargetTransformInfo.h" |
19 | | #include "AMDGPUSubtarget.h" |
20 | | #include "llvm/ADT/STLExtras.h" |
21 | | #include "llvm/Analysis/LoopInfo.h" |
22 | | #include "llvm/Analysis/TargetTransformInfo.h" |
23 | | #include "llvm/Analysis/ValueTracking.h" |
24 | | #include "llvm/CodeGen/ISDOpcodes.h" |
25 | | #include "llvm/CodeGen/MachineValueType.h" |
26 | | #include "llvm/CodeGen/ValueTypes.h" |
27 | | #include "llvm/IR/Argument.h" |
28 | | #include "llvm/IR/Attributes.h" |
29 | | #include "llvm/IR/BasicBlock.h" |
30 | | #include "llvm/IR/CallingConv.h" |
31 | | #include "llvm/IR/DataLayout.h" |
32 | | #include "llvm/IR/DerivedTypes.h" |
33 | | #include "llvm/IR/Function.h" |
34 | | #include "llvm/IR/Instruction.h" |
35 | | #include "llvm/IR/Instructions.h" |
36 | | #include "llvm/IR/IntrinsicInst.h" |
37 | | #include "llvm/IR/Module.h" |
38 | | #include "llvm/IR/PatternMatch.h" |
39 | | #include "llvm/IR/Type.h" |
40 | | #include "llvm/IR/Value.h" |
41 | | #include "llvm/MC/SubtargetFeature.h" |
42 | | #include "llvm/Support/Casting.h" |
43 | | #include "llvm/Support/CommandLine.h" |
44 | | #include "llvm/Support/Debug.h" |
45 | | #include "llvm/Support/ErrorHandling.h" |
46 | | #include "llvm/Support/raw_ostream.h" |
47 | | #include "llvm/Target/TargetMachine.h" |
48 | | #include <algorithm> |
49 | | #include <cassert> |
50 | | #include <limits> |
51 | | #include <utility> |
52 | | |
53 | | using namespace llvm; |
54 | | |
55 | | #define DEBUG_TYPE "AMDGPUtti" |
56 | | |
57 | | static cl::opt<unsigned> UnrollThresholdPrivate( |
58 | | "amdgpu-unroll-threshold-private", |
59 | | cl::desc("Unroll threshold for AMDGPU if private memory used in a loop"), |
60 | | cl::init(2500), cl::Hidden); |
61 | | |
62 | | static cl::opt<unsigned> UnrollThresholdLocal( |
63 | | "amdgpu-unroll-threshold-local", |
64 | | cl::desc("Unroll threshold for AMDGPU if local memory used in a loop"), |
65 | | cl::init(1000), cl::Hidden); |
66 | | |
67 | | static cl::opt<unsigned> UnrollThresholdIf( |
68 | | "amdgpu-unroll-threshold-if", |
69 | | cl::desc("Unroll threshold increment for AMDGPU for each if statement inside loop"), |
70 | | cl::init(150), cl::Hidden); |
71 | | |
72 | | static bool dependsOnLocalPhi(const Loop *L, const Value *Cond, |
73 | 6 | unsigned Depth = 0) { |
74 | 6 | const Instruction *I = dyn_cast<Instruction>(Cond); |
75 | 6 | if (!I) |
76 | 0 | return false; |
77 | 6 | |
78 | 6 | for (const Value *V : I->operand_values()) { |
79 | 6 | if (!L->contains(I)) |
80 | 0 | continue; |
81 | 6 | if (const PHINode *PHI = dyn_cast<PHINode>(V)) { |
82 | 4 | if (llvm::none_of(L->getSubLoops(), [PHI](const Loop* SubLoop) { |
83 | 0 | return SubLoop->contains(PHI); })) |
84 | 4 | return true; |
85 | 2 | } else if (Depth < 10 && dependsOnLocalPhi(L, V, Depth+1)) |
86 | 2 | return true; |
87 | 0 | } |
88 | 0 | return false; |
89 | 0 | } |
90 | | |
91 | | void AMDGPUTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE, |
92 | 17 | TTI::UnrollingPreferences &UP) { |
93 | 17 | UP.Threshold = 300; // Twice the default. |
94 | 17 | UP.MaxCount = std::numeric_limits<unsigned>::max(); |
95 | 17 | UP.Partial = true; |
96 | 17 | |
97 | 17 | // TODO: Do we want runtime unrolling? |
98 | 17 | |
99 | 17 | // Maximum alloca size that can fit registers. Reserve 16 registers. |
100 | 17 | const unsigned MaxAlloca = (256 - 16) * 4; |
101 | 17 | unsigned ThresholdPrivate = UnrollThresholdPrivate; |
102 | 17 | unsigned ThresholdLocal = UnrollThresholdLocal; |
103 | 17 | unsigned MaxBoost = std::max(ThresholdPrivate, ThresholdLocal); |
104 | 17 | AMDGPUAS ASST = ST->getAMDGPUAS(); |
105 | 32 | for (const BasicBlock *BB : L->getBlocks()) { |
106 | 32 | const DataLayout &DL = BB->getModule()->getDataLayout(); |
107 | 32 | unsigned LocalGEPsSeen = 0; |
108 | 32 | |
109 | 32 | if (llvm::any_of(L->getSubLoops(), [BB](const Loop* SubLoop) { |
110 | 0 | return SubLoop->contains(BB); })) |
111 | 0 | continue; // Block belongs to an inner loop. |
112 | 32 | |
113 | 32 | for (const Instruction &I : *BB) { |
114 | 145 | // Unroll a loop which contains an "if" statement whose condition |
115 | 145 | // is defined by a PHI belonging to the loop. This may help to eliminate |
116 | 145 | // if region and potentially even PHI itself, saving on both divergence |
117 | 145 | // and registers used for the PHI. |
118 | 145 | // Add a small bonus for each such "if" statement. |
119 | 145 | if (const BranchInst *Br = dyn_cast<BranchInst>(&I)) { |
120 | 27 | if (UP.Threshold < MaxBoost && Br->isConditional()) { |
121 | 16 | if (L->isLoopExiting(Br->getSuccessor(0)) || |
122 | 9 | L->isLoopExiting(Br->getSuccessor(1))) |
123 | 12 | continue; |
124 | 4 | if (dependsOnLocalPhi(L, Br->getCondition())) { |
125 | 4 | UP.Threshold += UnrollThresholdIf; |
126 | 4 | DEBUG(dbgs() << "Set unroll threshold " << UP.Threshold |
127 | 4 | << " for loop:\n" << *L << " due to " << *Br << '\n'); |
128 | 4 | if (UP.Threshold >= MaxBoost) |
129 | 0 | return; |
130 | 15 | } |
131 | 16 | } |
132 | 15 | continue; |
133 | 15 | } |
134 | 118 | |
135 | 118 | const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(&I); |
136 | 118 | if (!GEP) |
137 | 94 | continue; |
138 | 24 | |
139 | 24 | unsigned AS = GEP->getAddressSpace(); |
140 | 24 | unsigned Threshold = 0; |
141 | 24 | if (AS == ASST.PRIVATE_ADDRESS) |
142 | 13 | Threshold = ThresholdPrivate; |
143 | 11 | else if (AS == ASST.LOCAL_ADDRESS) |
144 | 2 | Threshold = ThresholdLocal; |
145 | 11 | else |
146 | 9 | continue; |
147 | 15 | |
148 | 15 | if (UP.Threshold >= Threshold) |
149 | 0 | continue; |
150 | 15 | |
151 | 15 | if (AS == ASST.PRIVATE_ADDRESS) { |
152 | 13 | const Value *Ptr = GEP->getPointerOperand(); |
153 | 13 | const AllocaInst *Alloca = |
154 | 13 | dyn_cast<AllocaInst>(GetUnderlyingObject(Ptr, DL)); |
155 | 13 | if (!Alloca || !Alloca->isStaticAlloca()) |
156 | 7 | continue; |
157 | 6 | Type *Ty = Alloca->getAllocatedType(); |
158 | 6 | unsigned AllocaSize = Ty->isSized() ? DL.getTypeAllocSize(Ty) : 0; |
159 | 6 | if (AllocaSize > MaxAlloca) |
160 | 1 | continue; |
161 | 2 | } else if (AS == ASST.LOCAL_ADDRESS) { |
162 | 2 | LocalGEPsSeen++; |
163 | 2 | // Inhibit unroll for local memory if we have seen addressing not to |
164 | 2 | // a variable; most likely we will be unable to combine it. |
165 | 2 | // Do not unroll too deep inner loops for local memory to give a chance |
166 | 2 | // to unroll an outer loop for a more important reason. |
167 | 2 | if (LocalGEPsSeen > 1 || L->getLoopDepth() > 2 || |
168 | 2 | (!isa<GlobalVariable>(GEP->getPointerOperand()) && |
169 | 2 | !isa<Argument>(GEP->getPointerOperand()))) |
170 | 0 | continue; |
171 | 7 | } |
172 | 7 | |
173 | 7 | // Check if GEP depends on a value defined by this loop itself. |
174 | 7 | bool HasLoopDef = false; |
175 | 19 | for (const Value *Op : GEP->operands()) { |
176 | 19 | const Instruction *Inst = dyn_cast<Instruction>(Op); |
177 | 19 | if (!Inst || L->isLoopInvariant(Op)) |
178 | 12 | continue; |
179 | 7 | |
180 | 7 | if (llvm::any_of(L->getSubLoops(), [Inst](const Loop* SubLoop) { |
181 | 0 | return SubLoop->contains(Inst); })) |
182 | 0 | continue; |
183 | 7 | HasLoopDef = true; |
184 | 7 | break; |
185 | 7 | } |
186 | 7 | if (!HasLoopDef) |
187 | 0 | continue; |
188 | 7 | |
189 | 7 | // We want to do whatever we can to limit the number of alloca |
190 | 7 | // instructions that make it through to the code generator. allocas |
191 | 7 | // require us to use indirect addressing, which is slow and prone to |
192 | 7 | // compiler bugs. If this loop does an address calculation on an |
193 | 7 | // alloca ptr, then we want to use a higher than normal loop unroll |
194 | 7 | // threshold. This will give SROA a better chance to eliminate these |
195 | 7 | // allocas. |
196 | 7 | // |
197 | 7 | // We also want to have more unrolling for local memory to let ds |
198 | 7 | // instructions with different offsets combine. |
199 | 7 | // |
200 | 7 | // Don't use the maximum allowed value here as it will make some |
201 | 7 | // programs way too big. |
202 | 7 | UP.Threshold = Threshold; |
203 | 7 | DEBUG(dbgs() << "Set unroll threshold " << Threshold << " for loop:\n" |
204 | 7 | << *L << " due to " << *GEP << '\n'); |
205 | 7 | if (UP.Threshold >= MaxBoost) |
206 | 5 | return; |
207 | 12 | } |
208 | 32 | } |
209 | 17 | } |
210 | | |
211 | 2.15k | unsigned AMDGPUTTIImpl::getHardwareNumberOfRegisters(bool Vec) const { |
212 | 2.15k | // The concept of vector registers doesn't really exist. Some packed vector |
213 | 2.15k | // operations operate on the normal 32-bit registers. |
214 | 2.15k | |
215 | 2.15k | // Number of VGPRs on SI. |
216 | 2.15k | if (ST->getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) |
217 | 1.95k | return 256; |
218 | 196 | |
219 | 196 | return 4 * 128; // XXX - 4 channels. Should these count as vector instead? |
220 | 196 | } |
221 | | |
222 | 2.15k | unsigned AMDGPUTTIImpl::getNumberOfRegisters(bool Vec) const { |
223 | 2.15k | // This is really the number of registers to fill when vectorizing / |
224 | 2.15k | // interleaving loops, so we lie to avoid trying to use all registers. |
225 | 2.15k | return getHardwareNumberOfRegisters(Vec) >> 3; |
226 | 2.15k | } |
227 | | |
228 | 50 | unsigned AMDGPUTTIImpl::getRegisterBitWidth(bool Vector) const { |
229 | 50 | return 32; |
230 | 50 | } |
231 | | |
232 | 38 | unsigned AMDGPUTTIImpl::getMinVectorRegisterBitWidth() const { |
233 | 38 | return 32; |
234 | 38 | } |
235 | | |
236 | 24.7k | unsigned AMDGPUTTIImpl::getLoadStoreVecRegBitWidth(unsigned AddrSpace) const { |
237 | 24.7k | AMDGPUAS AS = ST->getAMDGPUAS(); |
238 | 24.7k | if (AddrSpace == AS.GLOBAL_ADDRESS || |
239 | 6.47k | AddrSpace == AS.CONSTANT_ADDRESS || |
240 | 5.32k | AddrSpace == AS.FLAT_ADDRESS) |
241 | 19.8k | return 128; |
242 | 4.88k | if (AddrSpace == AS.LOCAL_ADDRESS || |
243 | 1.24k | AddrSpace == AS.REGION_ADDRESS) |
244 | 3.63k | return 64; |
245 | 1.24k | if (AddrSpace == AS.PRIVATE_ADDRESS) |
246 | 962 | return 8 * ST->getMaxPrivateElementSize(); |
247 | 285 | |
248 | 285 | if (ST->getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS && |
249 | 285 | (AddrSpace == AS.PARAM_D_ADDRESS || |
250 | 285 | AddrSpace == AS.PARAM_I_ADDRESS || |
251 | 280 | (AddrSpace >= AS.CONSTANT_BUFFER_0 && |
252 | 280 | AddrSpace <= AS.CONSTANT_BUFFER_15))) |
253 | 285 | return 128; |
254 | 0 | llvm_unreachable("unhandled address space"); |
255 | 0 | } |
256 | | |
257 | | bool AMDGPUTTIImpl::isLegalToVectorizeMemChain(unsigned ChainSizeInBytes, |
258 | | unsigned Alignment, |
259 | 1.07k | unsigned AddrSpace) const { |
260 | 1.07k | // We allow vectorization of flat stores, even though we may need to decompose |
261 | 1.07k | // them later if they may access private memory. We don't have enough context |
262 | 1.07k | // here, and legalization can handle it. |
263 | 1.07k | if (AddrSpace == ST->getAMDGPUAS().PRIVATE_ADDRESS) { |
264 | 75 | return (Alignment >= 4 || ST->hasUnalignedScratchAccess()) && |
265 | 64 | ChainSizeInBytes <= ST->getMaxPrivateElementSize(); |
266 | 109 | } |
267 | 968 | return true; |
268 | 968 | } |
269 | | |
270 | | bool AMDGPUTTIImpl::isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes, |
271 | | unsigned Alignment, |
272 | 716 | unsigned AddrSpace) const { |
273 | 716 | return isLegalToVectorizeMemChain(ChainSizeInBytes, Alignment, AddrSpace); |
274 | 716 | } |
275 | | |
276 | | bool AMDGPUTTIImpl::isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes, |
277 | | unsigned Alignment, |
278 | 361 | unsigned AddrSpace) const { |
279 | 361 | return isLegalToVectorizeMemChain(ChainSizeInBytes, Alignment, AddrSpace); |
280 | 361 | } |
281 | | |
282 | 7 | unsigned AMDGPUTTIImpl::getMaxInterleaveFactor(unsigned VF) { |
283 | 7 | // Disable unrolling if the loop is not vectorized. |
284 | 7 | // TODO: Enable this again. |
285 | 7 | if (VF == 1) |
286 | 6 | return 1; |
287 | 1 | |
288 | 1 | return 8; |
289 | 1 | } |
290 | | |
291 | | int AMDGPUTTIImpl::getArithmeticInstrCost( |
292 | | unsigned Opcode, Type *Ty, TTI::OperandValueKind Opd1Info, |
293 | | TTI::OperandValueKind Opd2Info, TTI::OperandValueProperties Opd1PropInfo, |
294 | 267 | TTI::OperandValueProperties Opd2PropInfo, ArrayRef<const Value *> Args ) { |
295 | 267 | EVT OrigTy = TLI->getValueType(DL, Ty); |
296 | 267 | if (!OrigTy.isSimple()) { |
297 | 31 | return BaseT::getArithmeticInstrCost(Opcode, Ty, Opd1Info, Opd2Info, |
298 | 31 | Opd1PropInfo, Opd2PropInfo); |
299 | 31 | } |
300 | 236 | |
301 | 236 | // Legalize the type. |
302 | 236 | std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty); |
303 | 236 | int ISD = TLI->InstructionOpcodeToISD(Opcode); |
304 | 236 | |
305 | 236 | // Because we don't have any legal vector operations, but the legal types, we |
306 | 236 | // need to account for split vectors. |
307 | 236 | unsigned NElts = LT.second.isVector() ? |
308 | 236 | LT.second.getVectorNumElements() : 1; |
309 | 236 | |
310 | 236 | MVT::SimpleValueType SLT = LT.second.getScalarType().SimpleTy; |
311 | 236 | |
312 | 236 | switch (ISD) { |
313 | 12 | case ISD::SHL: |
314 | 12 | case ISD::SRL: |
315 | 12 | case ISD::SRA: |
316 | 12 | if (SLT == MVT::i64) |
317 | 6 | return get64BitInstrCost() * LT.first * NElts; |
318 | 6 | |
319 | 6 | // i32 |
320 | 6 | return getFullRateInstrCost() * LT.first * NElts; |
321 | 53 | case ISD::ADD: |
322 | 53 | case ISD::SUB: |
323 | 53 | case ISD::AND: |
324 | 53 | case ISD::OR: |
325 | 53 | case ISD::XOR: |
326 | 53 | if (SLT == MVT::i64) { |
327 | 26 | // and, or and xor are typically split into 2 VALU instructions. |
328 | 26 | return 2 * getFullRateInstrCost() * LT.first * NElts; |
329 | 26 | } |
330 | 27 | |
331 | 27 | return LT.first * NElts * getFullRateInstrCost(); |
332 | 9 | case ISD::MUL: { |
333 | 9 | const int QuarterRateCost = getQuarterRateInstrCost(); |
334 | 9 | if (SLT == MVT::i64) { |
335 | 5 | const int FullRateCost = getFullRateInstrCost(); |
336 | 5 | return (4 * QuarterRateCost + (2 * 2) * FullRateCost) * LT.first * NElts; |
337 | 5 | } |
338 | 4 | |
339 | 4 | // i32 |
340 | 4 | return QuarterRateCost * NElts * LT.first; |
341 | 4 | } |
342 | 78 | case ISD::FADD: |
343 | 78 | case ISD::FSUB: |
344 | 78 | case ISD::FMUL: |
345 | 78 | if (SLT == MVT::f64) |
346 | 18 | return LT.first * NElts * get64BitInstrCost(); |
347 | 60 | |
348 | 60 | if (SLT == MVT::f32 || SLT == MVT::f16) |
349 | 60 | return LT.first * NElts * getFullRateInstrCost(); |
350 | 0 | break; |
351 | 84 | case ISD::FDIV: |
352 | 84 | case ISD::FREM: |
353 | 84 | // FIXME: frem should be handled separately. The fdiv in it is most of it, |
354 | 84 | // but the current lowering is also not entirely correct. |
355 | 84 | if (SLT == MVT::f64) { |
356 | 24 | int Cost = 4 * get64BitInstrCost() + 7 * getQuarterRateInstrCost(); |
357 | 24 | // Add cost of workaround. |
358 | 24 | if (ST->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS) |
359 | 8 | Cost += 3 * getFullRateInstrCost(); |
360 | 24 | |
361 | 24 | return LT.first * Cost * NElts; |
362 | 24 | } |
363 | 60 | |
364 | 60 | if (!Args.empty() && match(Args[0], PatternMatch::m_FPOne())) { |
365 | 24 | // TODO: This is more complicated, unsafe flags etc. |
366 | 24 | if ((SLT == MVT::f32 && !ST->hasFP32Denormals()) || |
367 | 24 | (SLT == MVT::f16 && ST->has16BitInsts())) { |
368 | 18 | return LT.first * getQuarterRateInstrCost() * NElts; |
369 | 18 | } |
370 | 42 | } |
371 | 42 | |
372 | 42 | if (SLT == MVT::f16 && ST->has16BitInsts()) { |
373 | 3 | // 2 x v_cvt_f32_f16 |
374 | 3 | // f32 rcp |
375 | 3 | // f32 fmul |
376 | 3 | // v_cvt_f16_f32 |
377 | 3 | // f16 div_fixup |
378 | 3 | int Cost = 4 * getFullRateInstrCost() + 2 * getQuarterRateInstrCost(); |
379 | 3 | return LT.first * Cost * NElts; |
380 | 3 | } |
381 | 39 | |
382 | 39 | if (SLT == MVT::f32 || SLT == MVT::f16) { |
383 | 39 | int Cost = 7 * getFullRateInstrCost() + 1 * getQuarterRateInstrCost(); |
384 | 39 | |
385 | 39 | if (!ST->hasFP32Denormals()) { |
386 | 24 | // FP mode switches. |
387 | 24 | Cost += 2 * getFullRateInstrCost(); |
388 | 24 | } |
389 | 39 | |
390 | 39 | return LT.first * NElts * Cost; |
391 | 39 | } |
392 | 0 | break; |
393 | 0 | default: |
394 | 0 | break; |
395 | 0 | } |
396 | 0 | |
397 | 0 | return BaseT::getArithmeticInstrCost(Opcode, Ty, Opd1Info, Opd2Info, |
398 | 0 | Opd1PropInfo, Opd2PropInfo); |
399 | 0 | } |
400 | | |
401 | 310 | unsigned AMDGPUTTIImpl::getCFInstrCost(unsigned Opcode) { |
402 | 310 | // XXX - For some reason this isn't called for switch. |
403 | 310 | switch (Opcode) { |
404 | 294 | case Instruction::Br: |
405 | 294 | case Instruction::Ret: |
406 | 294 | return 10; |
407 | 16 | default: |
408 | 16 | return BaseT::getCFInstrCost(Opcode); |
409 | 0 | } |
410 | 0 | } |
411 | | |
412 | | int AMDGPUTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy, |
413 | 464 | unsigned Index) { |
414 | 464 | switch (Opcode) { |
415 | 464 | case Instruction::ExtractElement: |
416 | 464 | case Instruction::InsertElement: { |
417 | 464 | unsigned EltSize |
418 | 464 | = DL.getTypeSizeInBits(cast<VectorType>(ValTy)->getElementType()); |
419 | 464 | if (EltSize < 32) { |
420 | 115 | if (EltSize == 16 && Index == 0 && ST->has16BitInsts()) |
421 | 46 | return 0; |
422 | 69 | return BaseT::getVectorInstrCost(Opcode, ValTy, Index); |
423 | 69 | } |
424 | 349 | |
425 | 349 | // Extracts are just reads of a subregister, so are free. Inserts are |
426 | 349 | // considered free because we don't want to have any cost for scalarizing |
427 | 349 | // operations, and we don't have to copy into a different register class. |
428 | 349 | |
429 | 349 | // Dynamic indexing isn't free and is best avoided. |
430 | 349 | return Index == ~0u ? 2 : 0; |
431 | 349 | } |
432 | 0 | default: |
433 | 0 | return BaseT::getVectorInstrCost(Opcode, ValTy, Index); |
434 | 0 | } |
435 | 0 | } |
436 | | |
437 | 42.3k | static bool isIntrinsicSourceOfDivergence(const IntrinsicInst *I) { |
438 | 42.3k | switch (I->getIntrinsicID()) { |
439 | 14.0k | case Intrinsic::amdgcn_workitem_id_x: |
440 | 14.0k | case Intrinsic::amdgcn_workitem_id_y: |
441 | 14.0k | case Intrinsic::amdgcn_workitem_id_z: |
442 | 14.0k | case Intrinsic::amdgcn_interp_mov: |
443 | 14.0k | case Intrinsic::amdgcn_interp_p1: |
444 | 14.0k | case Intrinsic::amdgcn_interp_p2: |
445 | 14.0k | case Intrinsic::amdgcn_mbcnt_hi: |
446 | 14.0k | case Intrinsic::amdgcn_mbcnt_lo: |
447 | 14.0k | case Intrinsic::r600_read_tidig_x: |
448 | 14.0k | case Intrinsic::r600_read_tidig_y: |
449 | 14.0k | case Intrinsic::r600_read_tidig_z: |
450 | 14.0k | case Intrinsic::amdgcn_atomic_inc: |
451 | 14.0k | case Intrinsic::amdgcn_atomic_dec: |
452 | 14.0k | case Intrinsic::amdgcn_image_atomic_swap: |
453 | 14.0k | case Intrinsic::amdgcn_image_atomic_add: |
454 | 14.0k | case Intrinsic::amdgcn_image_atomic_sub: |
455 | 14.0k | case Intrinsic::amdgcn_image_atomic_smin: |
456 | 14.0k | case Intrinsic::amdgcn_image_atomic_umin: |
457 | 14.0k | case Intrinsic::amdgcn_image_atomic_smax: |
458 | 14.0k | case Intrinsic::amdgcn_image_atomic_umax: |
459 | 14.0k | case Intrinsic::amdgcn_image_atomic_and: |
460 | 14.0k | case Intrinsic::amdgcn_image_atomic_or: |
461 | 14.0k | case Intrinsic::amdgcn_image_atomic_xor: |
462 | 14.0k | case Intrinsic::amdgcn_image_atomic_inc: |
463 | 14.0k | case Intrinsic::amdgcn_image_atomic_dec: |
464 | 14.0k | case Intrinsic::amdgcn_image_atomic_cmpswap: |
465 | 14.0k | case Intrinsic::amdgcn_buffer_atomic_swap: |
466 | 14.0k | case Intrinsic::amdgcn_buffer_atomic_add: |
467 | 14.0k | case Intrinsic::amdgcn_buffer_atomic_sub: |
468 | 14.0k | case Intrinsic::amdgcn_buffer_atomic_smin: |
469 | 14.0k | case Intrinsic::amdgcn_buffer_atomic_umin: |
470 | 14.0k | case Intrinsic::amdgcn_buffer_atomic_smax: |
471 | 14.0k | case Intrinsic::amdgcn_buffer_atomic_umax: |
472 | 14.0k | case Intrinsic::amdgcn_buffer_atomic_and: |
473 | 14.0k | case Intrinsic::amdgcn_buffer_atomic_or: |
474 | 14.0k | case Intrinsic::amdgcn_buffer_atomic_xor: |
475 | 14.0k | case Intrinsic::amdgcn_buffer_atomic_cmpswap: |
476 | 14.0k | case Intrinsic::amdgcn_ps_live: |
477 | 14.0k | case Intrinsic::amdgcn_ds_swizzle: |
478 | 14.0k | return true; |
479 | 28.3k | default: |
480 | 28.3k | return false; |
481 | 0 | } |
482 | 0 | } |
483 | | |
484 | 131k | static bool isArgPassedInSGPR(const Argument *A) { |
485 | 131k | const Function *F = A->getParent(); |
486 | 131k | |
487 | 131k | // Arguments to compute shaders are never a source of divergence. |
488 | 131k | CallingConv::ID CC = F->getCallingConv(); |
489 | 131k | switch (CC) { |
490 | 115k | case CallingConv::AMDGPU_KERNEL: |
491 | 115k | case CallingConv::SPIR_KERNEL: |
492 | 115k | return true; |
493 | 9.78k | case CallingConv::AMDGPU_VS: |
494 | 9.78k | case CallingConv::AMDGPU_HS: |
495 | 9.78k | case CallingConv::AMDGPU_GS: |
496 | 9.78k | case CallingConv::AMDGPU_PS: |
497 | 9.78k | case CallingConv::AMDGPU_CS: |
498 | 9.78k | // For non-compute shaders, SGPR inputs are marked with either inreg or byval. |
499 | 9.78k | // Everything else is in VGPRs. |
500 | 9.78k | return F->getAttributes().hasParamAttribute(A->getArgNo(), Attribute::InReg) || |
501 | 7.13k | F->getAttributes().hasParamAttribute(A->getArgNo(), Attribute::ByVal); |
502 | 6.16k | default: |
503 | 6.16k | // TODO: Should calls support inreg for SGPR inputs? |
504 | 6.16k | return false; |
505 | 0 | } |
506 | 0 | } |
507 | | |
508 | | /// \returns true if the result of the value could potentially be |
509 | | /// different across workitems in a wavefront. |
510 | 517k | bool AMDGPUTTIImpl::isSourceOfDivergence(const Value *V) const { |
511 | 517k | if (const Argument *A = dyn_cast<Argument>(V)) |
512 | 131k | return !isArgPassedInSGPR(A); |
513 | 385k | |
514 | 385k | // Loads from the private address space are divergent, because threads |
515 | 385k | // can execute the load instruction with the same inputs and get different |
516 | 385k | // results. |
517 | 385k | // |
518 | 385k | // All other loads are not divergent, because if threads issue loads with the |
519 | 385k | // same arguments, they will always get the same result. |
520 | 385k | if (const LoadInst *Load = dyn_cast<LoadInst>(V)) |
521 | 45.9k | return Load->getPointerAddressSpace() == ST->getAMDGPUAS().PRIVATE_ADDRESS; |
522 | 339k | |
523 | 339k | // Atomics are divergent because they are executed sequentially: when an |
524 | 339k | // atomic operation refers to the same address in each thread, then each |
525 | 339k | // thread after the first sees the value written by the previous thread as |
526 | 339k | // the original value. |
527 | 339k | if (isa<AtomicRMWInst>(V) || isa<AtomicCmpXchgInst>(V)) |
528 | 5.86k | return true; |
529 | 333k | |
530 | 333k | if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(V)) |
531 | 42.3k | return isIntrinsicSourceOfDivergence(Intrinsic); |
532 | 291k | |
533 | 291k | // Assume all function calls are a source of divergence. |
534 | 291k | if (isa<CallInst>(V) || isa<InvokeInst>(V)) |
535 | 7.09k | return true; |
536 | 284k | |
537 | 284k | return false; |
538 | 284k | } |
539 | | |
540 | 146k | bool AMDGPUTTIImpl::isAlwaysUniform(const Value *V) const { |
541 | 146k | if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(V)) { |
542 | 20.8k | switch (Intrinsic->getIntrinsicID()) { |
543 | 20.8k | default: |
544 | 20.8k | return false; |
545 | 26 | case Intrinsic::amdgcn_readfirstlane: |
546 | 26 | case Intrinsic::amdgcn_readlane: |
547 | 26 | return true; |
548 | 126k | } |
549 | 126k | } |
550 | 126k | return false; |
551 | 126k | } |
552 | | |
553 | | unsigned AMDGPUTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, |
554 | 12 | Type *SubTp) { |
555 | 12 | if (ST->hasVOP3PInsts()) { |
556 | 6 | VectorType *VT = cast<VectorType>(Tp); |
557 | 6 | if (VT->getNumElements() == 2 && |
558 | 6 | DL.getTypeSizeInBits(VT->getElementType()) == 16) { |
559 | 6 | // With op_sel VOP3P instructions freely can access the low half or high |
560 | 6 | // half of a register, so any swizzle is free. |
561 | 6 | |
562 | 6 | switch (Kind) { |
563 | 5 | case TTI::SK_Broadcast: |
564 | 5 | case TTI::SK_Reverse: |
565 | 5 | case TTI::SK_PermuteSingleSrc: |
566 | 5 | return 0; |
567 | 1 | default: |
568 | 1 | break; |
569 | 7 | } |
570 | 7 | } |
571 | 6 | } |
572 | 7 | |
573 | 7 | return BaseT::getShuffleCost(Kind, Tp, Index, SubTp); |
574 | 7 | } |
575 | | |
576 | | bool AMDGPUTTIImpl::areInlineCompatible(const Function *Caller, |
577 | 39 | const Function *Callee) const { |
578 | 39 | const TargetMachine &TM = getTLI()->getTargetMachine(); |
579 | 39 | const FeatureBitset &CallerBits = |
580 | 39 | TM.getSubtargetImpl(*Caller)->getFeatureBits(); |
581 | 39 | const FeatureBitset &CalleeBits = |
582 | 39 | TM.getSubtargetImpl(*Callee)->getFeatureBits(); |
583 | 39 | |
584 | 39 | FeatureBitset RealCallerBits = CallerBits & ~InlineFeatureIgnoreList; |
585 | 39 | FeatureBitset RealCalleeBits = CalleeBits & ~InlineFeatureIgnoreList; |
586 | 39 | return ((RealCallerBits & RealCalleeBits) == RealCalleeBits); |
587 | 39 | } |