Coverage Report

Created: 2017-10-03 07:32

/Users/buildslave/jenkins/sharedspace/clang-stage2-coverage-R@2/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
//===-- PPCTargetTransformInfo.cpp - PPC specific TTI ---------------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//

#include "PPCTargetTransformInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/CodeGen/BasicTTIImpl.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Target/CostTable.h"
#include "llvm/Target/TargetLowering.h"
using namespace llvm;

#define DEBUG_TYPE "ppctti"

static cl::opt<bool> DisablePPCConstHoist("disable-ppc-constant-hoisting",
cl::desc("disable constant hoisting on PPC"), cl::init(false), cl::Hidden);

// This is currently only used for the data prefetch pass which is only enabled
// for BG/Q by default.
static cl::opt<unsigned>
CacheLineSize("ppc-loop-prefetch-cache-line", cl::Hidden, cl::init(64),
              cl::desc("The loop prefetch cache line size"));

//===----------------------------------------------------------------------===//
//
// PPC cost model.
//
//===----------------------------------------------------------------------===//

TargetTransformInfo::PopcntSupportKind
PPCTTIImpl::getPopcntSupport(unsigned TyWidth) {
  assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
  if (ST->hasPOPCNTD() != PPCSubtarget::POPCNTD_Unavailable && TyWidth <= 64)
    return ST->hasPOPCNTD() == PPCSubtarget::POPCNTD_Slow ?
             TTI::PSK_SlowHardware : TTI::PSK_FastHardware;
  return TTI::PSK_Software;
}

int PPCTTIImpl::getIntImmCost(const APInt &Imm, Type *Ty) {
  if (DisablePPCConstHoist)
    return BaseT::getIntImmCost(Imm, Ty);

  assert(Ty->isIntegerTy());

  unsigned BitSize = Ty->getPrimitiveSizeInBits();
  if (BitSize == 0)
    return ~0U;

  if (Imm == 0)
    return TTI::TCC_Free;

  if (Imm.getBitWidth() <= 64) {
    if (isInt<16>(Imm.getSExtValue()))
      return TTI::TCC_Basic;

    if (isInt<32>(Imm.getSExtValue())) {
      // A constant that can be materialized using lis.
      if ((Imm.getZExtValue() & 0xFFFF) == 0)
        return TTI::TCC_Basic;

      return 2 * TTI::TCC_Basic;
    }
  }

  return 4 * TTI::TCC_Basic;
}

int PPCTTIImpl::getIntImmCost(Intrinsic::ID IID, unsigned Idx, const APInt &Imm,
                              Type *Ty) {
  if (DisablePPCConstHoist)
    return BaseT::getIntImmCost(IID, Idx, Imm, Ty);

  assert(Ty->isIntegerTy());

  unsigned BitSize = Ty->getPrimitiveSizeInBits();
  if (BitSize == 0)
    return ~0U;

  switch (IID) {
  default:
    return TTI::TCC_Free;
  case Intrinsic::sadd_with_overflow:
  case Intrinsic::uadd_with_overflow:
  case Intrinsic::ssub_with_overflow:
  case Intrinsic::usub_with_overflow:
    if ((Idx == 1) && Imm.getBitWidth() <= 64 && isInt<16>(Imm.getSExtValue()))
      return TTI::TCC_Free;
    break;
  case Intrinsic::experimental_stackmap:
    if ((Idx < 2) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
      return TTI::TCC_Free;
    break;
  case Intrinsic::experimental_patchpoint_void:
  case Intrinsic::experimental_patchpoint_i64:
    if ((Idx < 4) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
      return TTI::TCC_Free;
    break;
  }
  return PPCTTIImpl::getIntImmCost(Imm, Ty);
}

int PPCTTIImpl::getIntImmCost(unsigned Opcode, unsigned Idx, const APInt &Imm,
                              Type *Ty) {
  if (DisablePPCConstHoist)
    return BaseT::getIntImmCost(Opcode, Idx, Imm, Ty);

  assert(Ty->isIntegerTy());

  unsigned BitSize = Ty->getPrimitiveSizeInBits();
  if (BitSize == 0)
    return ~0U;

  unsigned ImmIdx = ~0U;
  bool ShiftedFree = false, RunFree = false, UnsignedFree = false,
       ZeroFree = false;
  switch (Opcode) {
  default:
    return TTI::TCC_Free;
  case Instruction::GetElementPtr:
    // Always hoist the base address of a GetElementPtr. This prevents the
    // creation of new constants for every base constant that gets constant
    // folded with the offset.
    if (Idx == 0)
      return 2 * TTI::TCC_Basic;
    return TTI::TCC_Free;
  case Instruction::And:
    RunFree = true; // (for the rotate-and-mask instructions)
    LLVM_FALLTHROUGH;
  case Instruction::Add:
  case Instruction::Or:
  case Instruction::Xor:
    ShiftedFree = true;
    LLVM_FALLTHROUGH;
  case Instruction::Sub:
  case Instruction::Mul:
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
    ImmIdx = 1;
    break;
  case Instruction::ICmp:
    UnsignedFree = true;
    ImmIdx = 1;
    // Zero comparisons can use record-form instructions.
    LLVM_FALLTHROUGH;
  case Instruction::Select:
    ZeroFree = true;
    break;
  case Instruction::PHI:
  case Instruction::Call:
  case Instruction::Ret:
  case Instruction::Load:
  case Instruction::Store:
    break;
  }

  if (ZeroFree && Imm == 0)
    return TTI::TCC_Free;

  if (Idx == ImmIdx && Imm.getBitWidth() <= 64) {
    if (isInt<16>(Imm.getSExtValue()))
      return TTI::TCC_Free;

    if (RunFree) {
      if (Imm.getBitWidth() <= 32 &&
          (isShiftedMask_32(Imm.getZExtValue()) ||
           isShiftedMask_32(~Imm.getZExtValue())))
        return TTI::TCC_Free;

      if (ST->isPPC64() &&
          (isShiftedMask_64(Imm.getZExtValue()) ||
           isShiftedMask_64(~Imm.getZExtValue())))
        return TTI::TCC_Free;
    }

    if (UnsignedFree && isUInt<16>(Imm.getZExtValue()))
      return TTI::TCC_Free;

    if (ShiftedFree && (Imm.getZExtValue() & 0xFFFF) == 0)
      return TTI::TCC_Free;
  }

  return PPCTTIImpl::getIntImmCost(Imm, Ty);
}

void PPCTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                                         TTI::UnrollingPreferences &UP) {
  if (ST->getDarwinDirective() == PPC::DIR_A2) {
    // The A2 is in-order with a deep pipeline, and concatenation unrolling
    // helps expose latency-hiding opportunities to the instruction scheduler.
    UP.Partial = UP.Runtime = true;

    // We unroll a lot on the A2 (hundreds of instructions), and the benefits
    // often outweigh the cost of a division to compute the trip count.
    UP.AllowExpensiveTripCount = true;
  }

  BaseT::getUnrollingPreferences(L, SE, UP);
}

bool PPCTTIImpl::enableAggressiveInterleaving(bool LoopHasReductions) {
  // On the A2, always unroll aggressively. For QPX unaligned loads, we depend
  // on combining the loads generated for consecutive accesses, and failure to
  // do so is particularly expensive. This makes it much more likely (compared
  // to only using concatenation unrolling).
  if (ST->getDarwinDirective() == PPC::DIR_A2)
    return true;

  return LoopHasReductions;
}

bool PPCTTIImpl::enableMemCmpExpansion(unsigned &MaxLoadSize) {
  MaxLoadSize = 8;
  return true;
}

bool PPCTTIImpl::enableInterleavedAccessVectorization() {
  return true;
}

unsigned PPCTTIImpl::getNumberOfRegisters(bool Vector) {
  if (Vector && !ST->hasAltivec() && !ST->hasQPX())
    return 0;
  return ST->hasVSX() ? 64 : 32;
}

unsigned PPCTTIImpl::getRegisterBitWidth(bool Vector) const {
  if (Vector) {
    if (ST->hasQPX()) return 256;
    if (ST->hasAltivec()) return 128;
    return 0;
  }

  if (ST->isPPC64())
    return 64;
  return 32;
}

unsigned PPCTTIImpl::getCacheLineSize() {
  // Check first if the user specified a custom line size.
  if (CacheLineSize.getNumOccurrences() > 0)
    return CacheLineSize;

  // On P7, P8 or P9 we have a cache line size of 128.
  unsigned Directive = ST->getDarwinDirective();
  if (Directive == PPC::DIR_PWR7 || Directive == PPC::DIR_PWR8 ||
      Directive == PPC::DIR_PWR9)
    return 128;

  // On other processors return a default of 64 bytes.
  return 64;
}

unsigned PPCTTIImpl::getPrefetchDistance() {
  // This seems like a reasonable default for the BG/Q (this pass is enabled, by
  // default, only on the BG/Q).
  return 300;
}

unsigned PPCTTIImpl::getMaxInterleaveFactor(unsigned VF) {
  unsigned Directive = ST->getDarwinDirective();
  // The 440 has no SIMD support, but floating-point instructions
  // have a 5-cycle latency, so unroll by 5x for latency hiding.
  if (Directive == PPC::DIR_440)
    return 5;

  // The A2 has no SIMD support, but floating-point instructions
  // have a 6-cycle latency, so unroll by 6x for latency hiding.
  if (Directive == PPC::DIR_A2)
    return 6;

  // FIXME: For lack of any better information, do no harm...
  if (Directive == PPC::DIR_E500mc || Directive == PPC::DIR_E5500)
    return 1;

  // For P7 and P8, floating-point instructions have a 6-cycle latency and
  // there are two execution units, so unroll by 12x for latency hiding.
  // FIXME: the same for P9 as previous gen until POWER9 scheduling is ready
  if (Directive == PPC::DIR_PWR7 || Directive == PPC::DIR_PWR8 ||
      Directive == PPC::DIR_PWR9)
    return 12;

  // For most things, modern systems have two execution units (and
  // out-of-order execution).
  return 2;
}

int PPCTTIImpl::getArithmeticInstrCost(
    unsigned Opcode, Type *Ty, TTI::OperandValueKind Op1Info,
    TTI::OperandValueKind Op2Info, TTI::OperandValueProperties Opd1PropInfo,
    TTI::OperandValueProperties Opd2PropInfo, ArrayRef<const Value *> Args) {
  assert(TLI->InstructionOpcodeToISD(Opcode) && "Invalid opcode");

  // Fallback to the default implementation.
  return BaseT::getArithmeticInstrCost(Opcode, Ty, Op1Info, Op2Info,
                                       Opd1PropInfo, Opd2PropInfo);
}

int PPCTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
                               Type *SubTp) {
  // Legalize the type.
  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);

  // PPC, for both Altivec/VSX and QPX, support cheap arbitrary permutations
  // (at least in the sense that there need only be one non-loop-invariant
  // instruction). We need one such shuffle instruction for each actual
  // register (this is not true for arbitrary shuffles, but is true for the
  // structured types of shuffles covered by TTI::ShuffleKind).
  return LT.first;
}

int PPCTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
                                 const Instruction *I) {
  assert(TLI->InstructionOpcodeToISD(Opcode) && "Invalid opcode");

  return BaseT::getCastInstrCost(Opcode, Dst, Src);
}

int PPCTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy,
                                   const Instruction *I) {
  return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, I);
}

int PPCTTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index) {
  assert(Val->isVectorTy() && "This must be a vector type");

  int ISD = TLI->InstructionOpcodeToISD(Opcode);
  assert(ISD && "Invalid opcode");

  if (ST->hasVSX() && Val->getScalarType()->isDoubleTy()) {
    // Double-precision scalars are already located in index #0.
    if (Index == 0)
      return 0;

    return BaseT::getVectorInstrCost(Opcode, Val, Index);
  } else if (ST->hasQPX() && Val->getScalarType()->isFloatingPointTy()) {
    // Floating point scalars are already located in index #0.
    if (Index == 0)
      return 0;

    return BaseT::getVectorInstrCost(Opcode, Val, Index);
  }

  // Estimated cost of a load-hit-store delay.  This was obtained
  // experimentally as a minimum needed to prevent unprofitable
  // vectorization for the paq8p benchmark.  It may need to be
  // raised further if other unprofitable cases remain.
  unsigned LHSPenalty = 2;
  if (ISD == ISD::INSERT_VECTOR_ELT)
    LHSPenalty += 7;

  // Vector element insert/extract with Altivec is very expensive,
  // because they require store and reload with the attendant
  // processor stall for load-hit-store.  Until VSX is available,
  // these need to be estimated as very costly.
  if (ISD == ISD::EXTRACT_VECTOR_ELT ||
      ISD == ISD::INSERT_VECTOR_ELT)
    return LHSPenalty + BaseT::getVectorInstrCost(Opcode, Val, Index);

  return BaseT::getVectorInstrCost(Opcode, Val, Index);
}

int PPCTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
                                unsigned AddressSpace, const Instruction *I) {
  // Legalize the type.
  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Src);
  assert((Opcode == Instruction::Load || Opcode == Instruction::Store) &&
         "Invalid Opcode");

  int Cost = BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace);

  bool IsAltivecType = ST->hasAltivec() &&
                       (LT.second == MVT::v16i8 || LT.second == MVT::v8i16 ||
                        LT.second == MVT::v4i32 || LT.second == MVT::v4f32);
  bool IsVSXType = ST->hasVSX() &&
                   (LT.second == MVT::v2f64 || LT.second == MVT::v2i64);
  bool IsQPXType = ST->hasQPX() &&
                   (LT.second == MVT::v4f64 || LT.second == MVT::v4f32);

  // VSX has 32b/64b load instructions. Legalization can handle loading of
  // 32b/64b to VSR correctly and cheaply. But BaseT::getMemoryOpCost and
  // PPCTargetLowering can't compute the cost appropriately. So here we
  // explicitly check this case.
  unsigned MemBytes = Src->getPrimitiveSizeInBits();
  if (Opcode == Instruction::Load && ST->hasVSX() && IsAltivecType &&
      (MemBytes == 64 || (ST->hasP8Vector() && MemBytes == 32)))
    return 1;

  // Aligned loads and stores are easy.
  unsigned SrcBytes = LT.second.getStoreSize();
  if (!SrcBytes || !Alignment || Alignment >= SrcBytes)
    return Cost;

  // If we can use the permutation-based load sequence, then this is also
  // relatively cheap (not counting loop-invariant instructions): one load plus
  // one permute (the last load in a series has extra cost, but we're
  // neglecting that here). Note that on the P7, we could do unaligned loads
  // for Altivec types using the VSX instructions, but that's more expensive
  // than using the permutation-based load sequence. On the P8, that's no
  // longer true.
  if (Opcode == Instruction::Load &&
      ((!ST->hasP8Vector() && IsAltivecType) || IsQPXType) &&
      Alignment >= LT.second.getScalarType().getStoreSize())
    return Cost + LT.first; // Add the cost of the permutations.

  // For VSX, we can do unaligned loads and stores on Altivec/VSX types. On the
  // P7, unaligned vector loads are more expensive than the permutation-based
  // load sequence, so that might be used instead, but regardless, the net cost
  // is about the same (not counting loop-invariant instructions).
  if (IsVSXType || (ST->hasVSX() && IsAltivecType))
    return Cost;

  // Newer PPC supports unaligned memory access.
  if (TLI->allowsMisalignedMemoryAccesses(LT.second, 0))
    return Cost;

  // PPC in general does not support unaligned loads and stores. They'll need
  // to be decomposed based on the alignment factor.

  // Add the cost of each scalar load or store.
  Cost += LT.first*(SrcBytes/Alignment-1);

  // For a vector type, there is also scalarization overhead (only for
  // stores, loads are expanded using the vector-load + permutation sequence,
  // which is much less expensive).
  if (Src->isVectorTy() && Opcode == Instruction::Store)
    for (int i = 0, e = Src->getVectorNumElements(); i < e; ++i)
      Cost += getVectorInstrCost(Instruction::ExtractElement, Src, i);

  return Cost;
}

int PPCTTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
                                           unsigned Factor,
                                           ArrayRef<unsigned> Indices,
                                           unsigned Alignment,
                                           unsigned AddressSpace) {
  assert(isa<VectorType>(VecTy) &&
         "Expect a vector type for interleaved memory op");

  // Legalize the type.
  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, VecTy);

  // Firstly, the cost of load/store operation.
  int Cost = getMemoryOpCost(Opcode, VecTy, Alignment, AddressSpace);

  // PPC, for both Altivec/VSX and QPX, support cheap arbitrary permutations
  // (at least in the sense that there need only be one non-loop-invariant
  // instruction). For each result vector, we need one shuffle per incoming
  // vector (except that the first shuffle can take two incoming vectors
  // because it does not need to take itself).
  Cost += Factor*(LT.first-1);

  return Cost;
}