/Users/buildslave/jenkins/workspace/clang-stage2-coverage-R/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
Line | Count | Source
1 | | //===- AMDGPUTargetTransformInfo.cpp - AMDGPU specific TTI pass -----------===//
2 | | //
3 | | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 | | // See https://llvm.org/LICENSE.txt for license information.
5 | | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 | | //
7 | | //===----------------------------------------------------------------------===//
8 | | //
9 | | // \file
10 | | // This file implements a TargetTransformInfo analysis pass specific to the
11 | | // AMDGPU target machine. It uses the target's detailed information to provide
12 | | // more precise answers to certain TTI queries, while letting the
13 | | // target-independent and default TTI implementations handle the rest.
14 | | //
15 | | //===----------------------------------------------------------------------===//
16 | | |
17 | | #include "AMDGPUTargetTransformInfo.h" |
18 | | #include "AMDGPUSubtarget.h" |
19 | | #include "Utils/AMDGPUBaseInfo.h" |
20 | | #include "llvm/ADT/STLExtras.h" |
21 | | #include "llvm/Analysis/LoopInfo.h" |
22 | | #include "llvm/Analysis/TargetTransformInfo.h" |
23 | | #include "llvm/Analysis/ValueTracking.h" |
24 | | #include "llvm/CodeGen/ISDOpcodes.h" |
25 | | #include "llvm/CodeGen/ValueTypes.h" |
26 | | #include "llvm/IR/Argument.h" |
27 | | #include "llvm/IR/Attributes.h" |
28 | | #include "llvm/IR/BasicBlock.h" |
29 | | #include "llvm/IR/CallingConv.h" |
30 | | #include "llvm/IR/DataLayout.h" |
31 | | #include "llvm/IR/DerivedTypes.h" |
32 | | #include "llvm/IR/Function.h" |
33 | | #include "llvm/IR/Instruction.h" |
34 | | #include "llvm/IR/Instructions.h" |
35 | | #include "llvm/IR/IntrinsicInst.h" |
36 | | #include "llvm/IR/Module.h" |
37 | | #include "llvm/IR/PatternMatch.h" |
38 | | #include "llvm/IR/Type.h" |
39 | | #include "llvm/IR/Value.h" |
40 | | #include "llvm/MC/SubtargetFeature.h" |
41 | | #include "llvm/Support/Casting.h" |
42 | | #include "llvm/Support/CommandLine.h" |
43 | | #include "llvm/Support/Debug.h" |
44 | | #include "llvm/Support/ErrorHandling.h" |
45 | | #include "llvm/Support/MachineValueType.h" |
46 | | #include "llvm/Support/raw_ostream.h" |
47 | | #include "llvm/Target/TargetMachine.h" |
48 | | #include <algorithm> |
49 | | #include <cassert> |
50 | | #include <limits> |
51 | | #include <utility> |
52 | | |
53 | | using namespace llvm; |
54 | | |
55 | | #define DEBUG_TYPE "AMDGPUtti" |
56 | | |
57 | | static cl::opt<unsigned> UnrollThresholdPrivate( |
58 | | "amdgpu-unroll-threshold-private", |
59 | | cl::desc("Unroll threshold for AMDGPU if private memory used in a loop"), |
60 | | cl::init(2500), cl::Hidden); |
61 | | |
62 | | static cl::opt<unsigned> UnrollThresholdLocal( |
63 | | "amdgpu-unroll-threshold-local", |
64 | | cl::desc("Unroll threshold for AMDGPU if local memory used in a loop"), |
65 | | cl::init(1000), cl::Hidden); |
66 | | |
67 | | static cl::opt<unsigned> UnrollThresholdIf( |
68 | | "amdgpu-unroll-threshold-if", |
69 | | cl::desc("Unroll threshold increment for AMDGPU for each if statement inside loop"), |
70 | | cl::init(150), cl::Hidden); |
71 | | |
72 | | static bool dependsOnLocalPhi(const Loop *L, const Value *Cond, |
73 | 10 | unsigned Depth = 0) { |
74 | 10 | const Instruction *I = dyn_cast<Instruction>(Cond); |
75 | 10 | if (!I) |
76 | 0 | return false; |
77 | 10 | |
78 | 10 | for (const Value *V : I->operand_values()) { |
79 | 10 | if (!L->contains(I)) |
80 | 0 | continue; |
81 | 10 | if (const PHINode *PHI = dyn_cast<PHINode>(V)) { |
82 | 6 | if (llvm::none_of(L->getSubLoops(), [PHI](const Loop* SubLoop) { |
83 | 0 | return SubLoop->contains(PHI); })) |
84 | 6 | return true; |
85 | 4 | } else if (Depth < 10 && dependsOnLocalPhi(L, V, Depth+1)) |
86 | 4 | return true; |
87 | 10 | } |
88 | 10 | return false;
89 | 10 | } |
90 | | |
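// A source-level sketch (hypothetical example, not from any test case) of the
// shape dependsOnLocalPhi() matches: the branch condition is computed, within
// 10 operand-walk steps, from a PHI that belongs to this loop but to none of
// its subloops.
//
//   for (int i = 0; i < n; ++i) { // i becomes a PHI node of this loop
//     if (i & 1)                  // condition derived from that PHI
//       sum += a[i];
//   }
//
// Unrolling such a loop turns the compare into a per-iteration constant,
// which can remove both the if-region and the PHI itself.
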
91 | | void AMDGPUTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE, |
92 | 27 | TTI::UnrollingPreferences &UP) { |
93 | 27 | UP.Threshold = 300; // Twice the default. |
94 | 27 | UP.MaxCount = std::numeric_limits<unsigned>::max(); |
95 | 27 | UP.Partial = true; |
96 | 27 | |
97 | 27 | // TODO: Do we want runtime unrolling? |
98 | 27 | |
99 | 27 | // Maximum alloca size that can fit in registers. Reserve 16 registers.
100 | 27 | const unsigned MaxAlloca = (256 - 16) * 4; |
101 | 27 | unsigned ThresholdPrivate = UnrollThresholdPrivate; |
102 | 27 | unsigned ThresholdLocal = UnrollThresholdLocal; |
103 | 27 | unsigned MaxBoost = std::max(ThresholdPrivate, ThresholdLocal); |
104 | 42 | for (const BasicBlock *BB : L->getBlocks()) { |
105 | 42 | const DataLayout &DL = BB->getModule()->getDataLayout(); |
106 | 42 | unsigned LocalGEPsSeen = 0; |
107 | 42 | |
108 | 42 | if (llvm::any_of(L->getSubLoops(), [BB](const Loop* SubLoop) { |
109 | 0 | return SubLoop->contains(BB); })) |
110 | 0 | continue; // Block belongs to an inner loop. |
111 | 42 | |
112 | 235 | for (const Instruction &I : *BB)42 { |
113 | 235 | // Unroll a loop which contains an "if" statement whose condition is
114 | 235 | // defined by a PHI belonging to the loop. This may help to eliminate
115 | 235 | // the if region and potentially even the PHI itself, saving on both
116 | 235 | // divergence and registers used for the PHI.
117 | 235 | // Add a small bonus for each such "if" statement.
118 | 235 | if (const BranchInst *Br = dyn_cast<BranchInst>(&I)) { |
119 | 37 | if (UP.Threshold < MaxBoost && Br->isConditional()) { |
120 | 22 | BasicBlock *Succ0 = Br->getSuccessor(0); |
121 | 22 | BasicBlock *Succ1 = Br->getSuccessor(1); |
122 | 22 | if ((L->contains(Succ0) && L->isLoopExiting(Succ0)) ||
123 | 22 | (L->contains(Succ1) && L->isLoopExiting(Succ1)))
124 | 16 | continue; |
125 | 6 | if (dependsOnLocalPhi(L, Br->getCondition())) { |
126 | 6 | UP.Threshold += UnrollThresholdIf; |
127 | 6 | LLVM_DEBUG(dbgs() << "Set unroll threshold " << UP.Threshold |
128 | 6 | << " for loop:\n" |
129 | 6 | << *L << " due to " << *Br << '\n'); |
130 | 6 | if (UP.Threshold >= MaxBoost) |
131 | 0 | return; |
132 | 21 | } |
133 | 6 | } |
134 | 21 | continue; |
135 | 21 | } |
136 | 198 | |
137 | 198 | const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(&I); |
138 | 198 | if (!GEP) |
139 | 162 | continue; |
140 | 36 | |
141 | 36 | unsigned AS = GEP->getAddressSpace(); |
142 | 36 | unsigned Threshold = 0; |
143 | 36 | if (AS == AMDGPUAS::PRIVATE_ADDRESS) |
144 | 9 | Threshold = ThresholdPrivate; |
145 | 27 | else if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS)
146 | 4 | Threshold = ThresholdLocal; |
147 | 23 | else |
148 | 23 | continue; |
149 | 13 | |
150 | 13 | if (UP.Threshold >= Threshold) |
151 | 1 | continue; |
152 | 12 | |
153 | 12 | if (AS == AMDGPUAS::PRIVATE_ADDRESS) { |
154 | 9 | const Value *Ptr = GEP->getPointerOperand(); |
155 | 9 | const AllocaInst *Alloca = |
156 | 9 | dyn_cast<AllocaInst>(GetUnderlyingObject(Ptr, DL)); |
157 | 9 | if (!Alloca || !Alloca->isStaticAlloca())
158 | 3 | continue; |
159 | 6 | Type *Ty = Alloca->getAllocatedType(); |
160 | 6 | unsigned AllocaSize = Ty->isSized() ? DL.getTypeAllocSize(Ty) : 0;
161 | 6 | if (AllocaSize > MaxAlloca) |
162 | 1 | continue; |
163 | 3 | } else if (AS == AMDGPUAS::LOCAL_ADDRESS || |
164 | 3 | AS == AMDGPUAS::REGION_ADDRESS) {
165 | 3 | LocalGEPsSeen++; |
166 | 3 | // Inhibit unrolling for local memory if we have seen addressing that is
167 | 3 | // not to a variable; most likely we will be unable to combine it.
168 | 3 | // Do not unroll inner loops that are too deep for local memory, to give
169 | 3 | // an outer loop a chance to be unrolled for a more important reason.
170 | 3 | if (LocalGEPsSeen > 1 || L->getLoopDepth() > 2 || |
171 | 3 | (!isa<GlobalVariable>(GEP->getPointerOperand()) && |
172 | 3 | !isa<Argument>(GEP->getPointerOperand()))) |
173 | 0 | continue; |
174 | 8 | } |
175 | 8 | |
176 | 8 | // Check if GEP depends on a value defined by this loop itself. |
177 | 8 | bool HasLoopDef = false; |
178 | 21 | for (const Value *Op : GEP->operands()) { |
179 | 21 | const Instruction *Inst = dyn_cast<Instruction>(Op); |
180 | 21 | if (!Inst || L->isLoopInvariant(Op))
181 | 13 | continue; |
182 | 8 | |
183 | 8 | if (llvm::any_of(L->getSubLoops(), [Inst](const Loop* SubLoop) { |
184 | 0 | return SubLoop->contains(Inst); })) |
185 | 0 | continue; |
186 | 8 | HasLoopDef = true; |
187 | 8 | break; |
188 | 8 | } |
189 | 8 | if (!HasLoopDef) |
190 | 0 | continue; |
191 | 8 | |
192 | 8 | // We want to do whatever we can to limit the number of alloca |
193 | 8 | // instructions that make it through to the code generator. Allocas
194 | 8 | // require us to use indirect addressing, which is slow and prone to |
195 | 8 | // compiler bugs. If this loop does an address calculation on an |
196 | 8 | // alloca ptr, then we want to use a higher than normal loop unroll |
197 | 8 | // threshold. This will give SROA a better chance to eliminate these |
198 | 8 | // allocas. |
199 | 8 | // |
200 | 8 | // We also want to have more unrolling for local memory to let ds |
201 | 8 | // instructions with different offsets combine. |
202 | 8 | // |
203 | 8 | // Don't use the maximum allowed value here as it will make some |
204 | 8 | // programs way too big. |
205 | 8 | UP.Threshold = Threshold; |
206 | 8 | LLVM_DEBUG(dbgs() << "Set unroll threshold " << Threshold |
207 | 8 | << " for loop:\n" |
208 | 8 | << *L << " due to " << *GEP << '\n'); |
209 | 8 | if (UP.Threshold >= MaxBoost) |
210 | 5 | return; |
211 | 8 | } |
212 | 42 | } |
213 | 27 | } |
214 | | |
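// A sketch of the kernel shape the private-address boost above is aimed at
// (hypothetical example; names are illustrative):
//
//   void kernel(...) {
//     float tmp[8];              // static alloca in private memory, well
//                                // under MaxAlloca ((256 - 16) * 4 = 960)
//     for (int i = 0; i != 8; ++i)
//       tmp[i] = f(i);           // GEP index depends on the loop's PHI
//     ...
//   }
//
// Raising UP.Threshold to UnrollThresholdPrivate (2500 by default) lets the
// loop unroll fully, the GEP indices become constants, and SROA can then
// promote tmp[] into registers instead of scratch memory.
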
215 | 4.65k | unsigned GCNTTIImpl::getHardwareNumberOfRegisters(bool Vec) const { |
216 | 4.65k | // The concept of vector registers doesn't really exist. Some packed vector |
217 | 4.65k | // operations operate on the normal 32-bit registers. |
218 | 4.65k | return 256; |
219 | 4.65k | } |
220 | | |
221 | 4.65k | unsigned GCNTTIImpl::getNumberOfRegisters(bool Vec) const { |
222 | 4.65k | // This is really the number of registers to fill when vectorizing / |
223 | 4.65k | // interleaving loops, so we lie to avoid trying to use all registers. |
224 | 4.65k | return getHardwareNumberOfRegisters(Vec) >> 3; |
225 | 4.65k | } |
226 | | |
227 | 112 | unsigned GCNTTIImpl::getRegisterBitWidth(bool Vector) const { |
228 | 112 | return 32; |
229 | 112 | } |
230 | | |
231 | 100 | unsigned GCNTTIImpl::getMinVectorRegisterBitWidth() const { |
232 | 100 | return 32; |
233 | 100 | } |
234 | | |
235 | | unsigned GCNTTIImpl::getLoadVectorFactor(unsigned VF, unsigned LoadSize, |
236 | | unsigned ChainSizeInBytes, |
237 | 16.0k | VectorType *VecTy) const { |
238 | 16.0k | unsigned VecRegBitWidth = VF * LoadSize; |
239 | 16.0k | if (VecRegBitWidth > 128 && VecTy->getScalarSizeInBits() < 32)
240 | 3.19k | // TODO: Support element-size less than 32bit? |
241 | 3.19k | return 128 / LoadSize; |
242 | 12.8k | |
243 | 12.8k | return VF; |
244 | 12.8k | } |
245 | | |
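// Worked examples of the clamp in getLoadVectorFactor() (assumed operand
// shapes, bit sizes as stated):
//
//   VF = 8,  LoadSize = 16 (v8i16):  8 * 16 = 128, not > 128      -> VF stays 8
//   VF = 16, LoadSize = 16 (v16i16): 256 > 128 and scalar 16 < 32 -> 128/16 = 8
//   VF = 8,  LoadSize = 32 (v8i32):  256 > 128 but scalar is 32, so the
//                                    sub-dword clamp does not fire -> VF stays 8
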
246 | | unsigned GCNTTIImpl::getStoreVectorFactor(unsigned VF, unsigned StoreSize, |
247 | | unsigned ChainSizeInBytes, |
248 | 4.09k | VectorType *VecTy) const { |
249 | 4.09k | unsigned VecRegBitWidth = VF * StoreSize; |
250 | 4.09k | if (VecRegBitWidth > 128) |
251 | 3.69k | return 128 / StoreSize; |
252 | 401 | |
253 | 401 | return VF; |
254 | 401 | } |
255 | | |
256 | 77.9k | unsigned GCNTTIImpl::getLoadStoreVecRegBitWidth(unsigned AddrSpace) const { |
257 | 77.9k | if (AddrSpace == AMDGPUAS::GLOBAL_ADDRESS || |
258 | 77.9k | AddrSpace == AMDGPUAS::CONSTANT_ADDRESS ||
259 | 77.9k | AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
260 | 77.9k | AddrSpace == AMDGPUAS::BUFFER_FAT_POINTER) {
261 | 70.4k | return 512; |
262 | 70.4k | } |
263 | 7.45k | |
264 | 7.45k | if (AddrSpace == AMDGPUAS::FLAT_ADDRESS || |
265 | 7.45k | AddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
266 | 7.45k | AddrSpace == AMDGPUAS::REGION_ADDRESS)
267 | 6.37k | return 128; |
268 | 1.08k | |
269 | 1.08k | if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS) |
270 | 1.08k | return 8 * ST->getMaxPrivateElementSize(); |
271 | 0 | |
272 | 0 | llvm_unreachable("unhandled address space"); |
273 | 0 | } |
274 | | |
275 | | bool GCNTTIImpl::isLegalToVectorizeMemChain(unsigned ChainSizeInBytes, |
276 | | unsigned Alignment, |
277 | 10.6k | unsigned AddrSpace) const { |
278 | 10.6k | // We allow vectorization of flat stores, even though we may need to decompose |
279 | 10.6k | // them later if they may access private memory. We don't have enough context |
280 | 10.6k | // here, and legalization can handle it. |
281 | 10.6k | if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS) { |
282 | 129 | return (Alignment >= 4 || ST->hasUnalignedScratchAccess()) &&
283 | 129 | ChainSizeInBytes <= ST->getMaxPrivateElementSize();
284 | 129 | } |
285 | 10.5k | return true; |
286 | 10.5k | } |
287 | | |
288 | | bool GCNTTIImpl::isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes, |
289 | | unsigned Alignment, |
290 | 10.2k | unsigned AddrSpace) const { |
291 | 10.2k | return isLegalToVectorizeMemChain(ChainSizeInBytes, Alignment, AddrSpace); |
292 | 10.2k | } |
293 | | |
294 | | bool GCNTTIImpl::isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes, |
295 | | unsigned Alignment, |
296 | 352 | unsigned AddrSpace) const { |
297 | 352 | return isLegalToVectorizeMemChain(ChainSizeInBytes, Alignment, AddrSpace); |
298 | 352 | } |
299 | | |
300 | 6 | unsigned GCNTTIImpl::getMaxInterleaveFactor(unsigned VF) { |
301 | 6 | // Disable unrolling if the loop is not vectorized. |
302 | 6 | // TODO: Enable this again. |
303 | 6 | if (VF == 1) |
304 | 4 | return 1; |
305 | 2 | |
306 | 2 | return 8; |
307 | 2 | } |
308 | | |
309 | | bool GCNTTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst, |
310 | 13.6k | MemIntrinsicInfo &Info) const { |
311 | 13.6k | switch (Inst->getIntrinsicID()) { |
312 | 13.6k | case Intrinsic::amdgcn_atomic_inc: |
313 | 974 | case Intrinsic::amdgcn_atomic_dec: |
314 | 974 | case Intrinsic::amdgcn_ds_ordered_add: |
315 | 974 | case Intrinsic::amdgcn_ds_ordered_swap: |
316 | 974 | case Intrinsic::amdgcn_ds_fadd: |
317 | 974 | case Intrinsic::amdgcn_ds_fmin: |
318 | 974 | case Intrinsic::amdgcn_ds_fmax: { |
319 | 974 | auto *Ordering = dyn_cast<ConstantInt>(Inst->getArgOperand(2)); |
320 | 974 | auto *Volatile = dyn_cast<ConstantInt>(Inst->getArgOperand(4)); |
321 | 974 | if (!Ordering || !Volatile) |
322 | 0 | return false; // Invalid. |
323 | 974 | |
324 | 974 | unsigned OrderingVal = Ordering->getZExtValue(); |
325 | 974 | if (OrderingVal > static_cast<unsigned>(AtomicOrdering::SequentiallyConsistent)) |
326 | 0 | return false; |
327 | 974 | |
328 | 974 | Info.PtrVal = Inst->getArgOperand(0); |
329 | 974 | Info.Ordering = static_cast<AtomicOrdering>(OrderingVal); |
330 | 974 | Info.ReadMem = true; |
331 | 974 | Info.WriteMem = true; |
332 | 974 | Info.IsVolatile = !Volatile->isNullValue(); |
333 | 974 | return true; |
334 | 974 | } |
335 | 12.6k | default: |
336 | 12.6k | return false; |
337 | 13.6k | } |
338 | 13.6k | } |
339 | | |
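// For the ds intrinsics handled above, operand 0 is the pointer, operand 2
// the atomic ordering, and operand 4 the volatile flag. A call of roughly
// this shape (illustrative IR, not from a test) is summarized with
// ReadMem = WriteMem = true and the decoded ordering:
//
//   %r = call i32 @llvm.amdgcn.atomic.inc.i32.p3i32(
//            i32 addrspace(3)* %ptr, i32 %val, i32 4, i32 0, i1 false)
//
// Here the third operand (i32 4) encodes the AtomicOrdering and the final
// i1 false marks the operation as non-volatile.
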
340 | | int GCNTTIImpl::getArithmeticInstrCost( |
341 | | unsigned Opcode, Type *Ty, TTI::OperandValueKind Opd1Info, |
342 | | TTI::OperandValueKind Opd2Info, TTI::OperandValueProperties Opd1PropInfo, |
343 | 307 | TTI::OperandValueProperties Opd2PropInfo, ArrayRef<const Value *> Args) {
344 | 307 | EVT OrigTy = TLI->getValueType(DL, Ty); |
345 | 307 | if (!OrigTy.isSimple()) { |
346 | 16 | return BaseT::getArithmeticInstrCost(Opcode, Ty, Opd1Info, Opd2Info, |
347 | 16 | Opd1PropInfo, Opd2PropInfo); |
348 | 16 | } |
349 | 291 | |
350 | 291 | // Legalize the type. |
351 | 291 | std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty); |
352 | 291 | int ISD = TLI->InstructionOpcodeToISD(Opcode); |
353 | 291 | |
354 | 291 | // Because we don't have any legal vector operations, only legal vector
355 | 291 | // types, we need to account for vectors that will be split.
356 | 291 | unsigned NElts = LT.second.isVector() ? |
357 | 172 | LT.second.getVectorNumElements() : 1;
358 | 291 | |
359 | 291 | MVT::SimpleValueType SLT = LT.second.getScalarType().SimpleTy; |
360 | 291 | |
361 | 291 | switch (ISD) { |
362 | 291 | case ISD::SHL: |
363 | 12 | case ISD::SRL: |
364 | 12 | case ISD::SRA: |
365 | 12 | if (SLT == MVT::i64) |
366 | 6 | return get64BitInstrCost() * LT.first * NElts; |
367 | 6 | |
368 | 6 | // i32 |
369 | 6 | return getFullRateInstrCost() * LT.first * NElts; |
370 | 67 | case ISD::ADD: |
371 | 67 | case ISD::SUB: |
372 | 67 | case ISD::AND: |
373 | 67 | case ISD::OR: |
374 | 67 | case ISD::XOR: |
375 | 67 | if (SLT == MVT::i64){ |
376 | 24 | // and, or, and xor are typically split into 2 VALU instructions.
377 | 24 | return 2 * getFullRateInstrCost() * LT.first * NElts; |
378 | 24 | } |
379 | 43 | |
380 | 43 | return LT.first * NElts * getFullRateInstrCost(); |
381 | 43 | case ISD::MUL: { |
382 | 10 | const int QuarterRateCost = getQuarterRateInstrCost(); |
383 | 10 | if (SLT == MVT::i64) { |
384 | 5 | const int FullRateCost = getFullRateInstrCost(); |
385 | 5 | return (4 * QuarterRateCost + (2 * 2) * FullRateCost) * LT.first * NElts; |
386 | 5 | } |
387 | 5 | |
388 | 5 | // i32 |
389 | 5 | return QuarterRateCost * NElts * LT.first; |
390 | 5 | } |
391 | 112 | case ISD::FADD: |
392 | 112 | case ISD::FSUB: |
393 | 112 | case ISD::FMUL: |
394 | 112 | if (SLT == MVT::f64) |
395 | 18 | return LT.first * NElts * get64BitInstrCost(); |
396 | 94 | |
397 | 94 | if (SLT == MVT::f32 || SLT == MVT::f16)
398 | 94 | return LT.first * NElts * getFullRateInstrCost(); |
399 | 0 | break; |
400 | 90 | case ISD::FDIV: |
401 | 90 | case ISD::FREM: |
402 | 90 | // FIXME: frem should be handled separately. The fdiv in it is most of it, |
403 | 90 | // but the current lowering is also not entirely correct. |
404 | 90 | if (SLT == MVT::f64) { |
405 | 24 | int Cost = 4 * get64BitInstrCost() + 7 * getQuarterRateInstrCost(); |
406 | 24 | // Add cost of workaround. |
407 | 24 | if (!ST->hasUsableDivScaleConditionOutput()) |
408 | 8 | Cost += 3 * getFullRateInstrCost(); |
409 | 24 | |
410 | 24 | return LT.first * Cost * NElts; |
411 | 24 | } |
412 | 66 | |
413 | 66 | if (!Args.empty() && match(Args[0], PatternMatch::m_FPOne())) { |
414 | 24 | // TODO: This is more complicated, unsafe flags etc. |
415 | 24 | if ((SLT == MVT::f32 && !ST->hasFP32Denormals()) ||
416 | 24 | (SLT == MVT::f16 && ST->has16BitInsts())) {
417 | 18 | return LT.first * getQuarterRateInstrCost() * NElts; |
418 | 18 | } |
419 | 48 | } |
420 | 48 | |
421 | 48 | if (SLT == MVT::f16 && ST->has16BitInsts()) {
422 | 3 | // 2 x v_cvt_f32_f16 |
423 | 3 | // f32 rcp |
424 | 3 | // f32 fmul |
425 | 3 | // v_cvt_f16_f32 |
426 | 3 | // f16 div_fixup |
427 | 3 | int Cost = 4 * getFullRateInstrCost() + 2 * getQuarterRateInstrCost(); |
428 | 3 | return LT.first * Cost * NElts; |
429 | 3 | } |
430 | 45 | |
431 | 45 | if (SLT == MVT::f32 || SLT == MVT::f16) {
432 | 45 | int Cost = 7 * getFullRateInstrCost() + 1 * getQuarterRateInstrCost(); |
433 | 45 | |
434 | 45 | if (!ST->hasFP32Denormals()) { |
435 | 28 | // FP mode switches. |
436 | 28 | Cost += 2 * getFullRateInstrCost(); |
437 | 28 | } |
438 | 45 | |
439 | 45 | return LT.first * NElts * Cost; |
440 | 45 | } |
441 | 0 | break; |
442 | 0 | default: |
443 | 0 | break; |
444 | 0 | } |
445 | 0 | |
446 | 0 | return BaseT::getArithmeticInstrCost(Opcode, Ty, Opd1Info, Opd2Info, |
447 | 0 | Opd1PropInfo, Opd2PropInfo); |
448 | 0 | } |
449 | | |
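// A worked example of the f32 fdiv path above, assuming the rate helpers
// expand to TCC_Basic multiples (full rate = 1, quarter rate = 3; see the
// definitions in AMDGPUTargetTransformInfo.h):
//
//   fdiv float %a, %b     ; SLT == MVT::f32, LT.first == 1, NElts == 1
//   Cost = 7 * 1 + 1 * 3 = 10
//   + 2 * 1 for the FP-mode switches when FP32 denormals are disabled
//   => returned cost = LT.first * NElts * 12 = 12
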
450 | 332 | unsigned GCNTTIImpl::getCFInstrCost(unsigned Opcode) { |
451 | 332 | // XXX - For some reason this isn't called for switch. |
452 | 332 | switch (Opcode) { |
453 | 332 | case Instruction::Br: |
454 | 317 | case Instruction::Ret: |
455 | 317 | return 10; |
456 | 317 | default: |
457 | 15 | return BaseT::getCFInstrCost(Opcode); |
458 | 332 | } |
459 | 332 | } |
460 | | |
461 | | int GCNTTIImpl::getArithmeticReductionCost(unsigned Opcode, Type *Ty, |
462 | 24 | bool IsPairwise) { |
463 | 24 | EVT OrigTy = TLI->getValueType(DL, Ty); |
464 | 24 | |
465 | 24 | // Computes the cost on targets that have packed math instructions (which
466 | 24 | // support 16-bit types only).
467 | 24 | if (IsPairwise || |
468 | 24 | !ST->hasVOP3PInsts() ||
469 | 24 | OrigTy.getScalarSizeInBits() != 16)
470 | 19 | return BaseT::getArithmeticReductionCost(Opcode, Ty, IsPairwise); |
471 | 5 | |
472 | 5 | std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty); |
473 | 5 | return LT.first * getFullRateInstrCost(); |
474 | 5 | } |
475 | | |
476 | | int GCNTTIImpl::getMinMaxReductionCost(Type *Ty, Type *CondTy, |
477 | | bool IsPairwise, |
478 | 38 | bool IsUnsigned) { |
479 | 38 | EVT OrigTy = TLI->getValueType(DL, Ty); |
480 | 38 | |
481 | 38 | // Computes cost on targets that have packed math instructions(which support |
482 | 38 | // 16-bit types only). |
483 | 38 | if (IsPairwise || |
484 | 38 | !ST->hasVOP3PInsts()19 || |
485 | 38 | OrigTy.getScalarSizeInBits() != 1612 ) |
486 | 31 | return BaseT::getMinMaxReductionCost(Ty, CondTy, IsPairwise, IsUnsigned); |
487 | 7 | |
488 | 7 | std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty); |
489 | 7 | return LT.first * getHalfRateInstrCost(); |
490 | 7 | } |
491 | | |
492 | | int GCNTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy, |
493 | 1.50k | unsigned Index) { |
494 | 1.50k | switch (Opcode) { |
495 | 1.50k | case Instruction::ExtractElement: |
496 | 1.50k | case Instruction::InsertElement: { |
497 | 1.50k | unsigned EltSize |
498 | 1.50k | = DL.getTypeSizeInBits(cast<VectorType>(ValTy)->getElementType()); |
499 | 1.50k | if (EltSize < 32) { |
500 | 1.02k | if (EltSize == 16 && Index == 01.01k && ST->has16BitInsts()265 ) |
501 | 259 | return 0; |
502 | 763 | return BaseT::getVectorInstrCost(Opcode, ValTy, Index); |
503 | 763 | } |
504 | 485 | |
505 | 485 | // Extracts are just reads of a subregister, so are free. Inserts are |
506 | 485 | // considered free because we don't want to have any cost for scalarizing |
507 | 485 | // operations, and we don't have to copy into a different register class. |
508 | 485 | |
509 | 485 | // Dynamic indexing isn't free and is best avoided. |
510 | 485 | return Index == ~0u ? 2 : 0;
511 | 485 | } |
512 | 485 | default: |
513 | 0 | return BaseT::getVectorInstrCost(Opcode, ValTy, Index); |
514 | 1.50k | } |
515 | 1.50k | } |
516 | | |
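// Illustrative applications of the rules above (hypothetical operands):
//
//   extractelement <4 x i32> %v, i32 2          -> 0 (subregister read)
//   insertelement <2 x half> %v, half %x, i32 0 -> 0 with 16-bit instructions
//   extractelement <4 x i32> %v, i32 %idx       -> dynamic index, passed in
//                                                  here as ~0u, costs 2
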
517 | | |
518 | | |
519 | 289k | static bool isArgPassedInSGPR(const Argument *A) { |
520 | 289k | const Function *F = A->getParent(); |
521 | 289k | |
522 | 289k | // Arguments to compute shaders are never a source of divergence. |
523 | 289k | CallingConv::ID CC = F->getCallingConv(); |
524 | 289k | switch (CC) { |
525 | 289k | case CallingConv::AMDGPU_KERNEL: |
526 | 211k | case CallingConv::SPIR_KERNEL: |
527 | 211k | return true; |
528 | 211k | case CallingConv::AMDGPU_VS: |
529 | 58.1k | case CallingConv::AMDGPU_LS: |
530 | 58.1k | case CallingConv::AMDGPU_HS: |
531 | 58.1k | case CallingConv::AMDGPU_ES: |
532 | 58.1k | case CallingConv::AMDGPU_GS: |
533 | 58.1k | case CallingConv::AMDGPU_PS: |
534 | 58.1k | case CallingConv::AMDGPU_CS: |
535 | 58.1k | // For non-compute shaders, SGPR inputs are marked with either inreg or byval. |
536 | 58.1k | // Everything else is in VGPRs. |
537 | 58.1k | return F->getAttributes().hasParamAttribute(A->getArgNo(), Attribute::InReg) || |
538 | 58.1k | F->getAttributes().hasParamAttribute(A->getArgNo(), Attribute::ByVal);
539 | 58.1k | default: |
540 | 20.0k | // TODO: Should calls support inreg for SGPR inputs? |
541 | 20.0k | return false; |
542 | 289k | } |
543 | 289k | } |
544 | | |
545 | | /// \returns true if the result of the value could potentially be |
546 | | /// different across workitems in a wavefront. |
547 | 1.79M | bool GCNTTIImpl::isSourceOfDivergence(const Value *V) const { |
548 | 1.79M | if (const Argument *A = dyn_cast<Argument>(V)) |
549 | 289k | return !isArgPassedInSGPR(A); |
550 | 1.50M | |
551 | 1.50M | // Loads from the private and flat address spaces are divergent, because |
552 | 1.50M | // threads can execute the load instruction with the same inputs and get |
553 | 1.50M | // different results. |
554 | 1.50M | // |
555 | 1.50M | // All other loads are not divergent, because if threads issue loads with the |
556 | 1.50M | // same arguments, they will always get the same result. |
557 | 1.50M | if (const LoadInst *Load = dyn_cast<LoadInst>(V)) |
558 | 182k | return Load->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS || |
559 | 182k | Load->getPointerAddressSpace() == AMDGPUAS::FLAT_ADDRESS;
560 | 1.32M | |
561 | 1.32M | // Atomics are divergent because they are executed sequentially: when an |
562 | 1.32M | // atomic operation refers to the same address in each thread, then each |
563 | 1.32M | // thread after the first sees the value written by the previous thread as |
564 | 1.32M | // its original value.
565 | 1.32M | if (isa<AtomicRMWInst>(V) || isa<AtomicCmpXchgInst>(V))
566 | 12.7k | return true; |
567 | 1.30M | |
568 | 1.30M | if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(V)) |
569 | 161k | return AMDGPU::isIntrinsicSourceOfDivergence(Intrinsic->getIntrinsicID()); |
570 | 1.14M | |
571 | 1.14M | // Assume all function calls are a source of divergence. |
572 | 1.14M | if (isa<CallInst>(V) || isa<InvokeInst>(V))
573 | 11.4k | return true; |
574 | 1.13M | |
575 | 1.13M | return false; |
576 | 1.13M | } |
577 | | |
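// Illustrative classification under the rules above (hypothetical values):
//
//   %tid = call i32 @llvm.amdgcn.workitem.id.x()  ; divergent (intrinsic is a
//                                                 ; known divergence source)
//   %v = load i32, i32* %flat.ptr                 ; divergent (flat load)
//   %k = load i32, i32 addrspace(4)* %const.ptr   ; uniform (constant space)
//   a kernel argument                             ; uniform (passed in SGPRs)
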
578 | 341k | bool GCNTTIImpl::isAlwaysUniform(const Value *V) const { |
579 | 341k | if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(V)) { |
580 | 59.8k | switch (Intrinsic->getIntrinsicID()) { |
581 | 59.8k | default: |
582 | 58.7k | return false; |
583 | 59.8k | case Intrinsic::amdgcn_readfirstlane: |
584 | 1.00k | case Intrinsic::amdgcn_readlane: |
585 | 1.00k | case Intrinsic::amdgcn_icmp: |
586 | 1.00k | case Intrinsic::amdgcn_fcmp: |
587 | 1.00k | return true; |
588 | 281k | } |
589 | 281k | } |
590 | 281k | return false; |
591 | 281k | } |
592 | | |
593 | | unsigned GCNTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, |
594 | 85 | Type *SubTp) { |
595 | 85 | if (ST->hasVOP3PInsts()) { |
596 | 40 | VectorType *VT = cast<VectorType>(Tp); |
597 | 40 | if (VT->getNumElements() == 2 && |
598 | 40 | DL.getTypeSizeInBits(VT->getElementType()) == 16) {
599 | 5 | // With op_sel, VOP3P instructions can freely access the low half or high
600 | 5 | // half of a register, so any swizzle is free.
601 | 5 | |
602 | 5 | switch (Kind) { |
603 | 5 | case TTI::SK_Broadcast: |
604 | 4 | case TTI::SK_Reverse: |
605 | 4 | case TTI::SK_PermuteSingleSrc: |
606 | 4 | return 0; |
607 | 4 | default: |
608 | 1 | break; |
609 | 81 | } |
610 | 81 | } |
611 | 40 | } |
612 | 81 | |
613 | 81 | return BaseT::getShuffleCost(Kind, Tp, Index, SubTp); |
614 | 81 | } |
615 | | |
616 | | bool GCNTTIImpl::areInlineCompatible(const Function *Caller, |
617 | 238 | const Function *Callee) const { |
618 | 238 | const TargetMachine &TM = getTLI()->getTargetMachine(); |
619 | 238 | const FeatureBitset &CallerBits = |
620 | 238 | TM.getSubtargetImpl(*Caller)->getFeatureBits(); |
621 | 238 | const FeatureBitset &CalleeBits = |
622 | 238 | TM.getSubtargetImpl(*Callee)->getFeatureBits(); |
623 | 238 | |
624 | 238 | FeatureBitset RealCallerBits = CallerBits & ~InlineFeatureIgnoreList; |
625 | 238 | FeatureBitset RealCalleeBits = CalleeBits & ~InlineFeatureIgnoreList; |
626 | 238 | if ((RealCallerBits & RealCalleeBits) != RealCalleeBits) |
627 | 2 | return false; |
628 | 236 | |
629 | 236 | // FIXME: dx10_clamp can just take the caller setting, but there seems to be
630 | 236 | // no way to support merging for backend-defined attributes.
631 | 236 | AMDGPU::SIModeRegisterDefaults CallerMode(*Caller); |
632 | 236 | AMDGPU::SIModeRegisterDefaults CalleeMode(*Callee); |
633 | 236 | return CallerMode.isInlineCompatible(CalleeMode); |
634 | 236 | } |
635 | | |
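// A minimal sketch of the feature-subset test above with made-up 4-bit
// feature sets:
//
//   RealCallerBits = 0b1010, RealCalleeBits = 0b0010:
//     0b1010 & 0b0010 == 0b0010 -> subset holds, inlining may proceed.
//   RealCallerBits = 0b1010, RealCalleeBits = 0b0110:
//     0b1010 & 0b0110 == 0b0010 != 0b0110 -> the callee needs a feature the
//     caller lacks, so inlining is refused.
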
636 | | void GCNTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE, |
637 | 24 | TTI::UnrollingPreferences &UP) { |
638 | 24 | CommonTTI.getUnrollingPreferences(L, SE, UP); |
639 | 24 | } |
640 | | |
641 | 244 | unsigned R600TTIImpl::getHardwareNumberOfRegisters(bool Vec) const { |
642 | 244 | return 4 * 128; // XXX - 4 channels. Should these count as vector instead? |
643 | 244 | } |
644 | | |
645 | 244 | unsigned R600TTIImpl::getNumberOfRegisters(bool Vec) const { |
646 | 244 | return getHardwareNumberOfRegisters(Vec); |
647 | 244 | } |
648 | | |
649 | 2 | unsigned R600TTIImpl::getRegisterBitWidth(bool Vector) const { |
650 | 2 | return 32; |
651 | 2 | } |
652 | | |
653 | 2 | unsigned R600TTIImpl::getMinVectorRegisterBitWidth() const { |
654 | 2 | return 32; |
655 | 2 | } |
656 | | |
657 | 3.88k | unsigned R600TTIImpl::getLoadStoreVecRegBitWidth(unsigned AddrSpace) const { |
658 | 3.88k | if (AddrSpace == AMDGPUAS::GLOBAL_ADDRESS || |
659 | 3.88k | AddrSpace == AMDGPUAS::CONSTANT_ADDRESS1.11k ) |
660 | 2.94k | return 128; |
661 | 939 | if (AddrSpace == AMDGPUAS::LOCAL_ADDRESS || |
662 | 939 | AddrSpace == AMDGPUAS::REGION_ADDRESS)
663 | 510 | return 64; |
664 | 429 | if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS) |
665 | 144 | return 32; |
666 | 285 | |
667 | 285 | if ((AddrSpace == AMDGPUAS::PARAM_D_ADDRESS || |
668 | 285 | AddrSpace == AMDGPUAS::PARAM_I_ADDRESS || |
669 | 285 | (AddrSpace >= AMDGPUAS::CONSTANT_BUFFER_0 &&
670 | 280 | AddrSpace <= AMDGPUAS::CONSTANT_BUFFER_15))) |
671 | 285 | return 128; |
672 | 0 | llvm_unreachable("unhandled address space"); |
673 | 0 | } |
674 | | |
675 | | bool R600TTIImpl::isLegalToVectorizeMemChain(unsigned ChainSizeInBytes, |
676 | | unsigned Alignment, |
677 | 97 | unsigned AddrSpace) const { |
678 | 97 | // We allow vectorization of flat stores, even though we may need to decompose |
679 | 97 | // them later if they may access private memory. We don't have enough context |
680 | 97 | // here, and legalization can handle it. |
681 | 97 | return (AddrSpace != AMDGPUAS::PRIVATE_ADDRESS); |
682 | 97 | } |
683 | | |
684 | | bool R600TTIImpl::isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes, |
685 | | unsigned Alignment, |
686 | 67 | unsigned AddrSpace) const { |
687 | 67 | return isLegalToVectorizeMemChain(ChainSizeInBytes, Alignment, AddrSpace); |
688 | 67 | } |
689 | | |
690 | | bool R600TTIImpl::isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes, |
691 | | unsigned Alignment, |
692 | 30 | unsigned AddrSpace) const { |
693 | 30 | return isLegalToVectorizeMemChain(ChainSizeInBytes, Alignment, AddrSpace); |
694 | 30 | } |
695 | | |
696 | 0 | unsigned R600TTIImpl::getMaxInterleaveFactor(unsigned VF) { |
697 | 0 | // Disable unrolling if the loop is not vectorized. |
698 | 0 | // TODO: Enable this again. |
699 | 0 | if (VF == 1) |
700 | 0 | return 1; |
701 | 0 | |
702 | 0 | return 8; |
703 | 0 | } |
704 | | |
705 | | unsigned R600TTIImpl::getCFInstrCost(unsigned Opcode) { |
706 | | // XXX - For some reason this isn't called for switch. |
707 | | switch (Opcode) { |
708 | | case Instruction::Br: |
709 | | case Instruction::Ret: |
710 | | return 10; |
711 | | default: |
712 | | return BaseT::getCFInstrCost(Opcode); |
713 | | } |
714 | | } |
715 | | |
716 | | int R600TTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy, |
717 | 0 | unsigned Index) { |
718 | 0 | switch (Opcode) { |
719 | 0 | case Instruction::ExtractElement: |
720 | 0 | case Instruction::InsertElement: { |
721 | 0 | unsigned EltSize |
722 | 0 | = DL.getTypeSizeInBits(cast<VectorType>(ValTy)->getElementType()); |
723 | 0 | if (EltSize < 32) { |
724 | 0 | return BaseT::getVectorInstrCost(Opcode, ValTy, Index); |
725 | 0 | } |
726 | 0 | |
727 | 0 | // Extracts are just reads of a subregister, so are free. Inserts are |
728 | 0 | // considered free because we don't want to have any cost for scalarizing |
729 | 0 | // operations, and we don't have to copy into a different register class. |
730 | 0 | |
731 | 0 | // Dynamic indexing isn't free and is best avoided. |
732 | 0 | return Index == ~0u ? 2 : 0; |
733 | 0 | } |
734 | 0 | default: |
735 | 0 | return BaseT::getVectorInstrCost(Opcode, ValTy, Index); |
736 | 0 | } |
737 | 0 | } |
738 | | |
739 | | void R600TTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE, |
740 | 3 | TTI::UnrollingPreferences &UP) { |
741 | 3 | CommonTTI.getUnrollingPreferences(L, SE, UP); |
742 | 3 | } |