//===-- PPCTargetTransformInfo.cpp - PPC specific TTI ---------------------===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//

#include "PPCTargetTransformInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/CodeGen/BasicTTIImpl.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Target/CostTable.h"
#include "llvm/Target/TargetLowering.h"
using namespace llvm;

#define DEBUG_TYPE "ppctti"

static cl::opt<bool> DisablePPCConstHoist("disable-ppc-constant-hoisting",
cl::desc("disable constant hoisting on PPC"), cl::init(false), cl::Hidden);

// This is currently only used for the data prefetch pass which is only enabled
// for BG/Q by default.
static cl::opt<unsigned>
CacheLineSize("ppc-loop-prefetch-cache-line", cl::Hidden, cl::init(64),
              cl::desc("The loop prefetch cache line size"));

//===----------------------------------------------------------------------===//
//
// PPC cost model.
//
//===----------------------------------------------------------------------===//

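// Note on the popcount query below: hardware population count on PPC is
// provided by the popcntw/popcntd GPR instructions, so only types of 64 bits
// or narrower can use it directly. On subtargets reporting POPCNTD_Slow the
// instruction exists but performs poorly, which is surfaced as
// PSK_SlowHardware so clients can weigh it against a software sequence.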
TargetTransformInfo::PopcntSupportKind
PPCTTIImpl::getPopcntSupport(unsigned TyWidth) {
  assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
  if (ST->hasPOPCNTD() != PPCSubtarget::POPCNTD_Unavailable && TyWidth <= 64)
    return ST->hasPOPCNTD() == PPCSubtarget::POPCNTD_Slow ?
             TTI::PSK_SlowHardware : TTI::PSK_FastHardware;
  return TTI::PSK_Software;
}

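// A rough guide to the cost buckets below, in terms of the instructions
// needed to materialize an integer immediate (illustrative sequences; the
// actual selection is up to the backend):
//   signed 16-bit (e.g. 42)        -> addi              (TCC_Basic)
//   32-bit, low halfword zero      -> lis               (TCC_Basic)
//   other 32-bit (e.g. 0x12345678) -> lis + ori         (2 * TCC_Basic)
//   wider 64-bit constants         -> lis/ori/rldicr/oris/ori-style
//                                     sequences         (4 * TCC_Basic)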
int PPCTTIImpl::getIntImmCost(const APInt &Imm, Type *Ty) {
  if (DisablePPCConstHoist)
    return BaseT::getIntImmCost(Imm, Ty);

  assert(Ty->isIntegerTy());

  unsigned BitSize = Ty->getPrimitiveSizeInBits();
  if (BitSize == 0)
    return ~0U;

  if (Imm == 0)
    return TTI::TCC_Free;

  if (Imm.getBitWidth() <= 64) {
    if (isInt<16>(Imm.getSExtValue()))
      return TTI::TCC_Basic;

    if (isInt<32>(Imm.getSExtValue())) {
      // A constant that can be materialized using lis.
      if ((Imm.getZExtValue() & 0xFFFF) == 0)
        return TTI::TCC_Basic;

      return 2 * TTI::TCC_Basic;
    }
  }

  return 4 * TTI::TCC_Basic;
}

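// The leading operands of the stackmap and patchpoint intrinsics (the ID and
// shadow-byte count, plus the call target and argument count for patchpoints)
// are never materialized as run-time immediates, so they are treated as free
// below regardless of their width.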
int PPCTTIImpl::getIntImmCost(Intrinsic::ID IID, unsigned Idx, const APInt &Imm,
                              Type *Ty) {
  if (DisablePPCConstHoist)
    return BaseT::getIntImmCost(IID, Idx, Imm, Ty);

  assert(Ty->isIntegerTy());

  unsigned BitSize = Ty->getPrimitiveSizeInBits();
  if (BitSize == 0)
    return ~0U;

  switch (IID) {
  default:
    return TTI::TCC_Free;
  case Intrinsic::sadd_with_overflow:
  case Intrinsic::uadd_with_overflow:
  case Intrinsic::ssub_with_overflow:
  case Intrinsic::usub_with_overflow:
    if ((Idx == 1) && Imm.getBitWidth() <= 64 && isInt<16>(Imm.getSExtValue()))
      return TTI::TCC_Free;
    break;
  case Intrinsic::experimental_stackmap:
    if ((Idx < 2) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
      return TTI::TCC_Free;
    break;
  case Intrinsic::experimental_patchpoint_void:
  case Intrinsic::experimental_patchpoint_i64:
    if ((Idx < 4) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
      return TTI::TCC_Free;
    break;
  }
  return PPCTTIImpl::getIntImmCost(Imm, Ty);
}

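// The "free" immediate classes used below correspond to fields the PPC
// instruction set can encode directly (illustrative mnemonics):
//   isInt<16>    -> addi/cmpwi-style signed 16-bit immediate fields
//   ShiftedFree  -> addis/oris/xoris, which take a 16-bit value shifted
//                   left by 16
//   RunFree      -> rlwinm/rldicl-style rotate-and-mask forms, covering
//                   shifted-mask constants and their complements
//   UnsignedFree -> cmplwi/cmpldi-style unsigned 16-bit comparisons
//   ZeroFree     -> record-form (dot-suffixed) instructions that compare
//                   against zero as a side effect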
int PPCTTIImpl::getIntImmCost(unsigned Opcode, unsigned Idx, const APInt &Imm,
                              Type *Ty) {
  if (DisablePPCConstHoist)
    return BaseT::getIntImmCost(Opcode, Idx, Imm, Ty);

  assert(Ty->isIntegerTy());

  unsigned BitSize = Ty->getPrimitiveSizeInBits();
  if (BitSize == 0)
    return ~0U;

  unsigned ImmIdx = ~0U;
  bool ShiftedFree = false, RunFree = false, UnsignedFree = false,
       ZeroFree = false;
  switch (Opcode) {
  default:
    return TTI::TCC_Free;
  case Instruction::GetElementPtr:
    // Always hoist the base address of a GetElementPtr. This prevents the
    // creation of new constants for every base constant that gets constant
    // folded with the offset.
    if (Idx == 0)
      return 2 * TTI::TCC_Basic;
    return TTI::TCC_Free;
  case Instruction::And:
    RunFree = true; // (for the rotate-and-mask instructions)
    LLVM_FALLTHROUGH;
  case Instruction::Add:
  case Instruction::Or:
  case Instruction::Xor:
    ShiftedFree = true;
    LLVM_FALLTHROUGH;
  case Instruction::Sub:
  case Instruction::Mul:
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
    ImmIdx = 1;
    break;
  case Instruction::ICmp:
    UnsignedFree = true;
    ImmIdx = 1;
    // Zero comparisons can use record-form instructions.
    LLVM_FALLTHROUGH;
  case Instruction::Select:
    ZeroFree = true;
    break;
  case Instruction::PHI:
  case Instruction::Call:
  case Instruction::Ret:
  case Instruction::Load:
  case Instruction::Store:
    break;
  }

  if (ZeroFree && Imm == 0)
    return TTI::TCC_Free;

  if (Idx == ImmIdx && Imm.getBitWidth() <= 64) {
    if (isInt<16>(Imm.getSExtValue()))
      return TTI::TCC_Free;

    if (RunFree) {
      if (Imm.getBitWidth() <= 32 &&
          (isShiftedMask_32(Imm.getZExtValue()) ||
           isShiftedMask_32(~Imm.getZExtValue())))
        return TTI::TCC_Free;

      if (ST->isPPC64() &&
          (isShiftedMask_64(Imm.getZExtValue()) ||
           isShiftedMask_64(~Imm.getZExtValue())))
        return TTI::TCC_Free;
    }

    if (UnsignedFree && isUInt<16>(Imm.getZExtValue()))
      return TTI::TCC_Free;

    if (ShiftedFree && (Imm.getZExtValue() & 0xFFFF) == 0)
      return TTI::TCC_Free;
  }

  return PPCTTIImpl::getIntImmCost(Imm, Ty);
}

void PPCTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                                         TTI::UnrollingPreferences &UP) {
  if (ST->getDarwinDirective() == PPC::DIR_A2) {
    // The A2 is in-order with a deep pipeline, and concatenation unrolling
    // helps expose latency-hiding opportunities to the instruction scheduler.
    UP.Partial = UP.Runtime = true;

    // We unroll a lot on the A2 (hundreds of instructions), and the benefits
    // often outweigh the cost of a division to compute the trip count.
    UP.AllowExpensiveTripCount = true;
  }

  BaseT::getUnrollingPreferences(L, SE, UP);
}

bool PPCTTIImpl::enableAggressiveInterleaving(bool LoopHasReductions) {
  // On the A2, always unroll aggressively. For QPX unaligned loads, we depend
  // on combining the loads generated for consecutive accesses, and failure to
  // do so is particularly expensive. Aggressive interleaving makes such
  // combining much more likely than concatenation unrolling alone.
  if (ST->getDarwinDirective() == PPC::DIR_A2)
    return true;

  return LoopHasReductions;
}

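// Allow inline expansion of memcmp(), using loads of up to 8 bytes (i.e.
// doubleword GPR loads) to compare chunks of the two buffers.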
bool PPCTTIImpl::enableMemCmpExpansion(unsigned &MaxLoadSize) {
  MaxLoadSize = 8;
  return true;
}

bool PPCTTIImpl::enableInterleavedAccessVectorization() {
  return true;
}

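// VSX unifies the 32 scalar floating-point registers and the 32 Altivec
// vector registers into a single 64-entry vector-scalar register file, hence
// the 64-vs-32 split below.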
unsigned PPCTTIImpl::getNumberOfRegisters(bool Vector) {
  if (Vector && !ST->hasAltivec() && !ST->hasQPX())
    return 0;
  return ST->hasVSX() ? 64 : 32;
}

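// Register widths: QPX registers hold four doubles (256 bits), Altivec/VSX
// registers are 128 bits wide, and GPRs are 64 or 32 bits depending on the
// subtarget.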
unsigned PPCTTIImpl::getRegisterBitWidth(bool Vector) const {
  if (Vector) {
    if (ST->hasQPX()) return 256;
    if (ST->hasAltivec()) return 128;
    return 0;
  }

  if (ST->isPPC64())
    return 64;
  return 32;
}

unsigned PPCTTIImpl::getCacheLineSize() {
  // Check first if the user specified a custom line size.
  if (CacheLineSize.getNumOccurrences() > 0)
    return CacheLineSize;

  // On P7, P8 or P9 we have a cache line size of 128.
  unsigned Directive = ST->getDarwinDirective();
  if (Directive == PPC::DIR_PWR7 || Directive == PPC::DIR_PWR8 ||
      Directive == PPC::DIR_PWR9)
    return 128;

  // On other processors return a default of 64 bytes.
  return 64;
}

unsigned PPCTTIImpl::getPrefetchDistance() {
  // This seems like a reasonable default for the BG/Q (this pass is enabled,
  // by default, only on the BG/Q).
  return 300;
}

unsigned PPCTTIImpl::getMaxInterleaveFactor(unsigned VF) {
  unsigned Directive = ST->getDarwinDirective();
  // The 440 has no SIMD support, but floating-point instructions
  // have a 5-cycle latency, so unroll by 5x for latency hiding.
  if (Directive == PPC::DIR_440)
    return 5;

  // The A2 has no SIMD support, but floating-point instructions
  // have a 6-cycle latency, so unroll by 6x for latency hiding.
  if (Directive == PPC::DIR_A2)
    return 6;

  // FIXME: For lack of any better information, do no harm...
  if (Directive == PPC::DIR_E500mc || Directive == PPC::DIR_E5500)
    return 1;

  // For P7 and P8, floating-point instructions have a 6-cycle latency and
  // there are two execution units, so unroll by 12x for latency hiding.
  // FIXME: the same for P9 as previous gen until POWER9 scheduling is ready
  if (Directive == PPC::DIR_PWR7 || Directive == PPC::DIR_PWR8 ||
      Directive == PPC::DIR_PWR9)
    return 12;

  // For most things, modern systems have two execution units (and
  // out-of-order execution).
  return 2;
}

int PPCTTIImpl::getArithmeticInstrCost(
    unsigned Opcode, Type *Ty, TTI::OperandValueKind Op1Info,
    TTI::OperandValueKind Op2Info, TTI::OperandValueProperties Opd1PropInfo,
    TTI::OperandValueProperties Opd2PropInfo, ArrayRef<const Value *> Args) {
  assert(TLI->InstructionOpcodeToISD(Opcode) && "Invalid opcode");

  // Fallback to the default implementation.
  return BaseT::getArithmeticInstrCost(Opcode, Ty, Op1Info, Op2Info,
                                       Opd1PropInfo, Opd2PropInfo);
}

int PPCTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
                               Type *SubTp) {
  // Legalize the type.
  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);

  // PPC, for both Altivec/VSX and QPX, supports cheap arbitrary permutations
  // (at least in the sense that there need only be one non-loop-invariant
  // instruction). We need one such shuffle instruction for each actual
  // register (this is not true for arbitrary shuffles, but is true for the
  // structured types of shuffles covered by TTI::ShuffleKind).
  return LT.first;
}

int PPCTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
                                 const Instruction *I) {
  assert(TLI->InstructionOpcodeToISD(Opcode) && "Invalid opcode");

  return BaseT::getCastInstrCost(Opcode, Dst, Src);
}

int PPCTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy,
                                   const Instruction *I) {
  return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, I);
}

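// Worked example for the penalties below: without the VSX/QPX fast paths, an
// Altivec element extract is costed as 2 + the base cost, and an element
// insert as (2 + 7) + the base cost, reflecting the store/reload sequence
// these operations require.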
int PPCTTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index) {
  assert(Val->isVectorTy() && "This must be a vector type");

  int ISD = TLI->InstructionOpcodeToISD(Opcode);
  assert(ISD && "Invalid opcode");

  if (ST->hasVSX() && Val->getScalarType()->isDoubleTy()) {
    // Double-precision scalars are already located in index #0.
    if (Index == 0)
      return 0;

    return BaseT::getVectorInstrCost(Opcode, Val, Index);
  } else if (ST->hasQPX() && Val->getScalarType()->isFloatingPointTy()) {
    // Floating point scalars are already located in index #0.
    if (Index == 0)
      return 0;

    return BaseT::getVectorInstrCost(Opcode, Val, Index);
  }

  // Estimated cost of a load-hit-store delay. This was obtained
  // experimentally as a minimum needed to prevent unprofitable
  // vectorization for the paq8p benchmark. It may need to be
  // raised further if other unprofitable cases remain.
  unsigned LHSPenalty = 2;
  if (ISD == ISD::INSERT_VECTOR_ELT)
    LHSPenalty += 7;

  // Vector element insert/extract with Altivec is very expensive,
  // because they require store and reload with the attendant
  // processor stall for load-hit-store. Until VSX is available,
  // these need to be estimated as very costly.
  if (ISD == ISD::EXTRACT_VECTOR_ELT ||
      ISD == ISD::INSERT_VECTOR_ELT)
    return LHSPenalty + BaseT::getVectorInstrCost(Opcode, Val, Index);

  return BaseT::getVectorInstrCost(Opcode, Val, Index);
}

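// Worked example for the unaligned fallback at the end of this function: a
// v4i32 store (SrcBytes = 16) with 4-byte alignment adds
// LT.first * (16/4 - 1) scalar accesses to the base cost, plus one element
// extract per lane.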
int PPCTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
                                unsigned AddressSpace, const Instruction *I) {
  // Legalize the type.
  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Src);
  assert((Opcode == Instruction::Load || Opcode == Instruction::Store) &&
         "Invalid Opcode");

  int Cost = BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace);

  bool IsAltivecType = ST->hasAltivec() &&
                       (LT.second == MVT::v16i8 || LT.second == MVT::v8i16 ||
                        LT.second == MVT::v4i32 || LT.second == MVT::v4f32);
  bool IsVSXType = ST->hasVSX() &&
                   (LT.second == MVT::v2f64 || LT.second == MVT::v2i64);
  bool IsQPXType = ST->hasQPX() &&
                   (LT.second == MVT::v4f64 || LT.second == MVT::v4f32);

  // VSX has 32b/64b load instructions. Legalization can handle loading of
  // 32b/64b to VSR correctly and cheaply. But BaseT::getMemoryOpCost and
  // PPCTargetLowering can't compute the cost appropriately. So here we
  // explicitly check this case.
  unsigned MemBytes = Src->getPrimitiveSizeInBits();
  if (Opcode == Instruction::Load && ST->hasVSX() && IsAltivecType &&
      (MemBytes == 64 || (ST->hasP8Vector() && MemBytes == 32)))
    return 1;

  // Aligned loads and stores are easy.
  unsigned SrcBytes = LT.second.getStoreSize();
  if (!SrcBytes || !Alignment || Alignment >= SrcBytes)
    return Cost;

  // If we can use the permutation-based load sequence, then this is also
  // relatively cheap (not counting loop-invariant instructions): one load plus
  // one permute (the last load in a series has extra cost, but we're
  // neglecting that here). Note that on the P7, we could do unaligned loads
  // for Altivec types using the VSX instructions, but that's more expensive
  // than using the permutation-based load sequence. On the P8, that's no
  // longer true.
  if (Opcode == Instruction::Load &&
      ((!ST->hasP8Vector() && IsAltivecType) || IsQPXType) &&
      Alignment >= LT.second.getScalarType().getStoreSize())
    return Cost + LT.first; // Add the cost of the permutations.

  // For VSX, we can do unaligned loads and stores on Altivec/VSX types. On the
  // P7, unaligned vector loads are more expensive than the permutation-based
  // load sequence, so that might be used instead, but regardless, the net cost
  // is about the same (not counting loop-invariant instructions).
  if (IsVSXType || (ST->hasVSX() && IsAltivecType))
    return Cost;

  // Newer PPC supports unaligned memory access.
  if (TLI->allowsMisalignedMemoryAccesses(LT.second, 0))
    return Cost;

  // PPC in general does not support unaligned loads and stores. They'll need
  // to be decomposed based on the alignment factor.

  // Add the cost of each scalar load or store.
  Cost += LT.first*(SrcBytes/Alignment-1);

  // For a vector type, there is also scalarization overhead (only for
  // stores; loads are expanded using the vector-load + permutation sequence,
  // which is much less expensive).
  if (Src->isVectorTy() && Opcode == Instruction::Store)
    for (int i = 0, e = Src->getVectorNumElements(); i < e; ++i)
      Cost += getVectorInstrCost(Instruction::ExtractElement, Src, i);

  return Cost;
}

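// Example for the formula below: de-interleaving Factor = 2 streams from a
// type that legalizes to LT.first registers costs the underlying wide memory
// operation plus 2 * (LT.first - 1) permutations.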
int PPCTTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
                                           unsigned Factor,
                                           ArrayRef<unsigned> Indices,
                                           unsigned Alignment,
                                           unsigned AddressSpace) {
  assert(isa<VectorType>(VecTy) &&
         "Expect a vector type for interleaved memory op");

  // Legalize the type.
  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, VecTy);

  // First, the cost of the load/store operation.
  int Cost = getMemoryOpCost(Opcode, VecTy, Alignment, AddressSpace);

  // PPC, for both Altivec/VSX and QPX, supports cheap arbitrary permutations
  // (at least in the sense that there need only be one non-loop-invariant
  // instruction). For each result vector, we need one shuffle per incoming
  // vector (except that the first shuffle can take two incoming vectors
  // because it does not need to take itself).
  Cost += Factor*(LT.first-1);

  return Cost;
}