Coverage Report

Created: 2019-07-24 05:18

/Users/buildslave/jenkins/workspace/clang-stage2-coverage-R/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
//===-- PPCTargetTransformInfo.cpp - PPC specific TTI ---------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "PPCTargetTransformInfo.h"
#include "llvm/Analysis/CodeMetrics.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/CodeGen/BasicTTIImpl.h"
#include "llvm/CodeGen/CostTable.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/CodeGen/TargetSchedule.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
using namespace llvm;

#define DEBUG_TYPE "ppctti"

static cl::opt<bool> DisablePPCConstHoist("disable-ppc-constant-hoisting",
cl::desc("disable constant hoisting on PPC"), cl::init(false), cl::Hidden);

// This is currently only used for the data prefetch pass which is only enabled
// for BG/Q by default.
static cl::opt<unsigned>
CacheLineSize("ppc-loop-prefetch-cache-line", cl::Hidden, cl::init(64),
              cl::desc("The loop prefetch cache line size"));

static cl::opt<bool>
EnablePPCColdCC("ppc-enable-coldcc", cl::Hidden, cl::init(false),
                cl::desc("Enable using coldcc calling conv for cold "
                         "internal functions"));

// The latency of mtctr is only justified if there are more than 4
// comparisons that will be removed as a result.
static cl::opt<unsigned>
SmallCTRLoopThreshold("min-ctr-loop-threshold", cl::init(4), cl::Hidden,
                      cl::desc("Loops with a constant trip count smaller than "
                               "this value will not use the count register."));

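The options above are ordinary cl::opt flags registered by the PowerPC backend, so they can be toggled from any tool that links the target in. A hypothetical invocation (the triple and input file are made up for illustration) might look like:

llc -mtriple=powerpc64le-unknown-linux-gnu -ppc-enable-coldcc -min-ctr-loop-threshold=2 test.ll -o test.s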
//===----------------------------------------------------------------------===//
//
// PPC cost model.
//
//===----------------------------------------------------------------------===//

TargetTransformInfo::PopcntSupportKind
PPCTTIImpl::getPopcntSupport(unsigned TyWidth) {
  assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
  if (ST->hasPOPCNTD() != PPCSubtarget::POPCNTD_Unavailable && TyWidth <= 64)
    return ST->hasPOPCNTD() == PPCSubtarget::POPCNTD_Slow ?
             TTI::PSK_SlowHardware : TTI::PSK_FastHardware;
  return TTI::PSK_Software;
}

int PPCTTIImpl::getIntImmCost(const APInt &Imm, Type *Ty) {
  if (DisablePPCConstHoist)
    return BaseT::getIntImmCost(Imm, Ty);

  assert(Ty->isIntegerTy());

  unsigned BitSize = Ty->getPrimitiveSizeInBits();
  if (BitSize == 0)
    return ~0U;

  if (Imm == 0)
    return TTI::TCC_Free;

  if (Imm.getBitWidth() <= 64) {
    if (isInt<16>(Imm.getSExtValue()))
      return TTI::TCC_Basic;

    if (isInt<32>(Imm.getSExtValue())) {
      // A constant that can be materialized using lis.
      if ((Imm.getZExtValue() & 0xFFFF) == 0)
        return TTI::TCC_Basic;

      return 2 * TTI::TCC_Basic;
    }
  }

  return 4 * TTI::TCC_Basic;
}

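The tiers above map directly onto how many instructions the backend needs to materialize the constant (li, lis, lis+ori, or a longer 64-bit sequence). Below is a minimal sketch of querying this hook through the TargetTransformInfo wrapper; it is not part of this file, and it assumes a TargetTransformInfo that was constructed for a PPC64 function.

#include "llvm/ADT/APInt.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Type.h"
#include <cstdint>

// Expected tiers, per the logic above:
//   0            -> TCC_Free      (no materialization needed)
//   42           -> TCC_Basic     (fits a signed 16-bit "li")
//   0x12340000   -> TCC_Basic     (low halfword clear, a single "lis")
//   0x12345      -> 2 * TCC_Basic ("lis" + "ori")
//   0x123456789  -> 4 * TCC_Basic (full 64-bit sequence)
static void showImmCostTiers(const llvm::TargetTransformInfo &TTI,
                             llvm::LLVMContext &Ctx) {
  llvm::Type *I64 = llvm::Type::getInt64Ty(Ctx);
  for (uint64_t V : {0ULL, 42ULL, 0x12340000ULL, 0x12345ULL, 0x123456789ULL})
    (void)TTI.getIntImmCost(llvm::APInt(64, V), I64); // cost per the tiers above
}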
int PPCTTIImpl::getIntImmCost(Intrinsic::ID IID, unsigned Idx, const APInt &Imm,
                              Type *Ty) {
  if (DisablePPCConstHoist)
    return BaseT::getIntImmCost(IID, Idx, Imm, Ty);

  assert(Ty->isIntegerTy());

  unsigned BitSize = Ty->getPrimitiveSizeInBits();
  if (BitSize == 0)
    return ~0U;

  switch (IID) {
  default:
    return TTI::TCC_Free;
  case Intrinsic::sadd_with_overflow:
  case Intrinsic::uadd_with_overflow:
  case Intrinsic::ssub_with_overflow:
  case Intrinsic::usub_with_overflow:
    if ((Idx == 1) && Imm.getBitWidth() <= 64 && isInt<16>(Imm.getSExtValue()))
      return TTI::TCC_Free;
    break;
  case Intrinsic::experimental_stackmap:
    if ((Idx < 2) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
      return TTI::TCC_Free;
    break;
  case Intrinsic::experimental_patchpoint_void:
  case Intrinsic::experimental_patchpoint_i64:
    if ((Idx < 4) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
      return TTI::TCC_Free;
    break;
  }
  return PPCTTIImpl::getIntImmCost(Imm, Ty);
}

int PPCTTIImpl::getIntImmCost(unsigned Opcode, unsigned Idx, const APInt &Imm,
                              Type *Ty) {
  if (DisablePPCConstHoist)
    return BaseT::getIntImmCost(Opcode, Idx, Imm, Ty);

  assert(Ty->isIntegerTy());

  unsigned BitSize = Ty->getPrimitiveSizeInBits();
  if (BitSize == 0)
    return ~0U;

  unsigned ImmIdx = ~0U;
  bool ShiftedFree = false, RunFree = false, UnsignedFree = false,
       ZeroFree = false;
  switch (Opcode) {
  default:
    return TTI::TCC_Free;
  case Instruction::GetElementPtr:
    // Always hoist the base address of a GetElementPtr. This prevents the
    // creation of new constants for every base constant that gets constant
    // folded with the offset.
    if (Idx == 0)
      return 2 * TTI::TCC_Basic;
    return TTI::TCC_Free;
  case Instruction::And:
    RunFree = true; // (for the rotate-and-mask instructions)
    LLVM_FALLTHROUGH;
  case Instruction::Add:
  case Instruction::Or:
  case Instruction::Xor:
    ShiftedFree = true;
    LLVM_FALLTHROUGH;
  case Instruction::Sub:
  case Instruction::Mul:
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
    ImmIdx = 1;
    break;
  case Instruction::ICmp:
    UnsignedFree = true;
    ImmIdx = 1;
    // Zero comparisons can use record-form instructions.
    LLVM_FALLTHROUGH;
  case Instruction::Select:
    ZeroFree = true;
    break;
  case Instruction::PHI:
  case Instruction::Call:
  case Instruction::Ret:
  case Instruction::Load:
  case Instruction::Store:
    break;
  }

  if (ZeroFree && Imm == 0)
    return TTI::TCC_Free;

  if (Idx == ImmIdx && Imm.getBitWidth() <= 64) {
    if (isInt<16>(Imm.getSExtValue()))
      return TTI::TCC_Free;

    if (RunFree) {
      if (Imm.getBitWidth() <= 32 &&
          (isShiftedMask_32(Imm.getZExtValue()) ||
           isShiftedMask_32(~Imm.getZExtValue())))
        return TTI::TCC_Free;

      if (ST->isPPC64() &&
          (isShiftedMask_64(Imm.getZExtValue()) ||
           isShiftedMask_64(~Imm.getZExtValue())))
        return TTI::TCC_Free;
    }

    if (UnsignedFree && isUInt<16>(Imm.getZExtValue()))
      return TTI::TCC_Free;

    if (ShiftedFree && (Imm.getZExtValue() & 0xFFFF) == 0)
      return TTI::TCC_Free;
  }

  return PPCTTIImpl::getIntImmCost(Imm, Ty);
}

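The "RunFree" path above relies on llvm::isShiftedMask_32/64 from llvm/Support/MathExtras.h: an AND immediate is treated as free when either the value or its complement is a single contiguous run of set bits, i.e. something a rotate-and-mask instruction (rlwinm/rldicl and friends) can encode. A small self-contained illustration, not part of this file:

#include "llvm/Support/MathExtras.h"
#include <cassert>

static void runFreeMaskExamples() {
  assert(llvm::isShiftedMask_32(0x00FF0000u));   // one contiguous run -> free
  assert(llvm::isShiftedMask_32(~0xFF0000FFu));  // complement is a run -> free
  assert(!llvm::isShiftedMask_32(0x00FF00FFu) && // two separate runs, in both the
         !llvm::isShiftedMask_32(~0x00FF00FFu)); // value and its complement -> not free
}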
unsigned PPCTTIImpl::getUserCost(const User *U,
                                 ArrayRef<const Value *> Operands) {
  if (U->getType()->isVectorTy()) {
    // Instructions that need to be split should cost more.
    std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, U->getType());
    return LT.first * BaseT::getUserCost(U, Operands);
  }

  return BaseT::getUserCost(U, Operands);
}

bool PPCTTIImpl::mightUseCTR(BasicBlock *BB,
                             TargetLibraryInfo *LibInfo) {
  const PPCTargetMachine &TM = ST->getTargetMachine();

  // Loop through the inline asm constraints and look for something that
  // clobbers ctr.
  auto asmClobbersCTR = [](InlineAsm *IA) {
    InlineAsm::ConstraintInfoVector CIV = IA->ParseConstraints();
    for (unsigned i = 0, ie = CIV.size(); i < ie; ++i) {
      InlineAsm::ConstraintInfo &C = CIV[i];
      if (C.Type != InlineAsm::isInput)
        for (unsigned j = 0, je = C.Codes.size(); j < je; ++j)
          if (StringRef(C.Codes[j]).equals_lower("{ctr}"))
            return true;
    }
    return false;
  };

  // Determining the address of a TLS variable results in a function call in
  // certain TLS models.
  std::function<bool(const Value*)> memAddrUsesCTR =
    [&memAddrUsesCTR, &TM](const Value *MemAddr) -> bool {
    const auto *GV = dyn_cast<GlobalValue>(MemAddr);
    if (!GV) {
      // Recurse to check for constants that refer to TLS global variables.
      if (const auto *CV = dyn_cast<Constant>(MemAddr))
        for (const auto &CO : CV->operands())
          if (memAddrUsesCTR(CO))
            return true;

      return false;
    }

    if (!GV->isThreadLocal())
      return false;
    TLSModel::Model Model = TM.getTLSModel(GV);
    return Model == TLSModel::GeneralDynamic ||
      Model == TLSModel::LocalDynamic;
  };

  auto isLargeIntegerTy = [](bool Is32Bit, Type *Ty) {
    if (IntegerType *ITy = dyn_cast<IntegerType>(Ty))
      return ITy->getBitWidth() > (Is32Bit ? 32U : 64U);

    return false;
  };

  for (BasicBlock::iterator J = BB->begin(), JE = BB->end();
       J != JE; ++J) {
    if (CallInst *CI = dyn_cast<CallInst>(J)) {
      // Inline ASM is okay, unless it clobbers the ctr register.
      if (InlineAsm *IA = dyn_cast<InlineAsm>(CI->getCalledValue())) {
        if (asmClobbersCTR(IA))
          return true;
        continue;
      }

      if (Function *F = CI->getCalledFunction()) {
        // Most intrinsics don't become function calls, but some might.
        // sin, cos, exp and log are always calls.
        unsigned Opcode = 0;
        if (F->getIntrinsicID() != Intrinsic::not_intrinsic) {
          switch (F->getIntrinsicID()) {
          default: continue;
          // If we have a call to ppc_is_decremented_ctr_nonzero, or ppc_mtctr
          // we're definitely using CTR.
          case Intrinsic::set_loop_iterations:
          case Intrinsic::loop_decrement:
            return true;

// VisualStudio defines setjmp as _setjmp
#if defined(_MSC_VER) && defined(setjmp) && \
                       !defined(setjmp_undefined_for_msvc)
#  pragma push_macro("setjmp")
#  undef setjmp
#  define setjmp_undefined_for_msvc
#endif

          case Intrinsic::setjmp:

#if defined(_MSC_VER) && defined(setjmp_undefined_for_msvc)
 // let's return it to _setjmp state
#  pragma pop_macro("setjmp")
#  undef setjmp_undefined_for_msvc
#endif

          case Intrinsic::longjmp:

          // Exclude eh_sjlj_setjmp; we don't need to exclude eh_sjlj_longjmp
          // because, although it does clobber the counter register, the
          // control can't then return to inside the loop unless there is also
          // an eh_sjlj_setjmp.
          case Intrinsic::eh_sjlj_setjmp:

          case Intrinsic::memcpy:
          case Intrinsic::memmove:
          case Intrinsic::memset:
          case Intrinsic::powi:
          case Intrinsic::log:
          case Intrinsic::log2:
          case Intrinsic::log10:
          case Intrinsic::exp:
          case Intrinsic::exp2:
          case Intrinsic::pow:
          case Intrinsic::sin:
          case Intrinsic::cos:
            return true;
          case Intrinsic::copysign:
            if (CI->getArgOperand(0)->getType()->getScalarType()->
                isPPC_FP128Ty())
              return true;
            else
              continue; // ISD::FCOPYSIGN is never a library call.
          case Intrinsic::sqrt:               Opcode = ISD::FSQRT;      break;
          case Intrinsic::floor:              Opcode = ISD::FFLOOR;     break;
          case Intrinsic::ceil:               Opcode = ISD::FCEIL;      break;
          case Intrinsic::trunc:              Opcode = ISD::FTRUNC;     break;
          case Intrinsic::rint:               Opcode = ISD::FRINT;      break;
          case Intrinsic::nearbyint:          Opcode = ISD::FNEARBYINT; break;
          case Intrinsic::round:              Opcode = ISD::FROUND;     break;
          case Intrinsic::minnum:             Opcode = ISD::FMINNUM;    break;
          case Intrinsic::maxnum:             Opcode = ISD::FMAXNUM;    break;
          case Intrinsic::umul_with_overflow: Opcode = ISD::UMULO;      break;
          case Intrinsic::smul_with_overflow: Opcode = ISD::SMULO;      break;
          }
        }

        // PowerPC does not use [US]DIVREM or other library calls for
        // operations on regular types which are not otherwise library calls
        // (i.e. soft float or atomics). If adapting for targets that do,
        // additional care is required here.

        LibFunc Func;
        if (!F->hasLocalLinkage() && F->hasName() && LibInfo &&
            LibInfo->getLibFunc(F->getName(), Func) &&
            LibInfo->hasOptimizedCodeGen(Func)) {
          // Non-read-only functions are never treated as intrinsics.
          if (!CI->onlyReadsMemory())
            return true;

          // Conversion happens only for FP calls.
          if (!CI->getArgOperand(0)->getType()->isFloatingPointTy())
            return true;

          switch (Func) {
          default: return true;
          case LibFunc_copysign:
          case LibFunc_copysignf:
            continue; // ISD::FCOPYSIGN is never a library call.
          case LibFunc_copysignl:
            return true;
          case LibFunc_fabs:
          case LibFunc_fabsf:
          case LibFunc_fabsl:
            continue; // ISD::FABS is never a library call.
          case LibFunc_sqrt:
          case LibFunc_sqrtf:
          case LibFunc_sqrtl:
            Opcode = ISD::FSQRT; break;
          case LibFunc_floor:
          case LibFunc_floorf:
          case LibFunc_floorl:
            Opcode = ISD::FFLOOR; break;
          case LibFunc_nearbyint:
          case LibFunc_nearbyintf:
          case LibFunc_nearbyintl:
            Opcode = ISD::FNEARBYINT; break;
          case LibFunc_ceil:
          case LibFunc_ceilf:
          case LibFunc_ceill:
            Opcode = ISD::FCEIL; break;
          case LibFunc_rint:
          case LibFunc_rintf:
          case LibFunc_rintl:
            Opcode = ISD::FRINT; break;
          case LibFunc_round:
          case LibFunc_roundf:
          case LibFunc_roundl:
            Opcode = ISD::FROUND; break;
          case LibFunc_trunc:
          case LibFunc_truncf:
          case LibFunc_truncl:
            Opcode = ISD::FTRUNC; break;
          case LibFunc_fmin:
          case LibFunc_fminf:
          case LibFunc_fminl:
            Opcode = ISD::FMINNUM; break;
          case LibFunc_fmax:
          case LibFunc_fmaxf:
          case LibFunc_fmaxl:
            Opcode = ISD::FMAXNUM; break;
          }
        }

        if (Opcode) {
          EVT EVTy =
              TLI->getValueType(DL, CI->getArgOperand(0)->getType(), true);

          if (EVTy == MVT::Other)
            return true;

          if (TLI->isOperationLegalOrCustom(Opcode, EVTy))
            continue;
          else if (EVTy.isVector() &&
                   TLI->isOperationLegalOrCustom(Opcode, EVTy.getScalarType()))
            continue;

          return true;
        }
      }

      return true;
    } else if (isa<BinaryOperator>(J) &&
               J->getType()->getScalarType()->isPPC_FP128Ty()) {
      // Most operations on ppc_f128 values become calls.
      return true;
    } else if (isa<UIToFPInst>(J) || isa<SIToFPInst>(J) ||
               isa<FPToUIInst>(J) || isa<FPToSIInst>(J)) {
      CastInst *CI = cast<CastInst>(J);
      if (CI->getSrcTy()->getScalarType()->isPPC_FP128Ty() ||
          CI->getDestTy()->getScalarType()->isPPC_FP128Ty() ||
          isLargeIntegerTy(!TM.isPPC64(), CI->getSrcTy()->getScalarType()) ||
          isLargeIntegerTy(!TM.isPPC64(), CI->getDestTy()->getScalarType()))
        return true;
    } else if (isLargeIntegerTy(!TM.isPPC64(),
                                J->getType()->getScalarType()) &&
               (J->getOpcode() == Instruction::UDiv ||
                J->getOpcode() == Instruction::SDiv ||
                J->getOpcode() == Instruction::URem ||
                J->getOpcode() == Instruction::SRem)) {
      return true;
    } else if (!TM.isPPC64() &&
               isLargeIntegerTy(false, J->getType()->getScalarType()) &&
               (J->getOpcode() == Instruction::Shl ||
                J->getOpcode() == Instruction::AShr ||
                J->getOpcode() == Instruction::LShr)) {
      // Only on PPC32, for 128-bit integers (specifically not 64-bit
      // integers), these might be runtime calls.
      return true;
    } else if (isa<IndirectBrInst>(J) || isa<InvokeInst>(J)) {
      // On PowerPC, indirect jumps use the counter register.
      return true;
    } else if (SwitchInst *SI = dyn_cast<SwitchInst>(J)) {
      if (SI->getNumCases() + 1 >= (unsigned)TLI->getMinimumJumpTableEntries())
        return true;
    }

    // FREM is always a call.
    if (J->getOpcode() == Instruction::FRem)
      return true;

    if (ST->useSoftFloat()) {
      switch(J->getOpcode()) {
      case Instruction::FAdd:
      case Instruction::FSub:
      case Instruction::FMul:
      case Instruction::FDiv:
      case Instruction::FPTrunc:
      case Instruction::FPExt:
      case Instruction::FPToUI:
      case Instruction::FPToSI:
      case Instruction::UIToFP:
      case Instruction::SIToFP:
      case Instruction::FCmp:
        return true;
      }
    }

    for (Value *Operand : J->operands())
      if (memAddrUsesCTR(Operand))
        return true;
  }

  return false;
}

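As a concrete illustration of the inline-asm check: a loop body like the following would be rejected by mightUseCTR(), because the clobber list names the counter register and asmClobbersCTR matches the "{ctr}" constraint code case-insensitively. The snippet is illustrative only and is not part of this file.

// Illustrative only (not part of PPCTargetTransformInfo.cpp).
void spin(unsigned n) {
  for (unsigned i = 0; i < n; ++i)
    asm volatile("nop" ::: "ctr"); // clobbering ctr forbids a mtctr/bdnz loop
}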
bool PPCTTIImpl::isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE,
                                          AssumptionCache &AC,
                                          TargetLibraryInfo *LibInfo,
                                          HardwareLoopInfo &HWLoopInfo) {
  const PPCTargetMachine &TM = ST->getTargetMachine();
  TargetSchedModel SchedModel;
  SchedModel.init(ST);

  // Do not convert small short loops to CTR loop.
  unsigned ConstTripCount = SE.getSmallConstantTripCount(L);
  if (ConstTripCount && ConstTripCount < SmallCTRLoopThreshold) {
    SmallPtrSet<const Value *, 32> EphValues;
    CodeMetrics::collectEphemeralValues(L, &AC, EphValues);
    CodeMetrics Metrics;
    for (BasicBlock *BB : L->blocks())
      Metrics.analyzeBasicBlock(BB, *this, EphValues);
    // 6 is an approximate latency for the mtctr instruction.
    if (Metrics.NumInsts <= (6 * SchedModel.getIssueWidth()))
      return false;
  }

  // We don't want to spill/restore the counter register, and so we don't
  // want to use the counter register if the loop contains calls.
  for (Loop::block_iterator I = L->block_begin(), IE = L->block_end();
       I != IE; ++I)
    if (mightUseCTR(*I, LibInfo))
      return false;

  SmallVector<BasicBlock*, 4> ExitingBlocks;
  L->getExitingBlocks(ExitingBlocks);

  // If there is an exit edge known to be frequently taken,
  // we should not transform this loop.
  for (auto &BB : ExitingBlocks) {
    Instruction *TI = BB->getTerminator();
    if (!TI) continue;

    if (BranchInst *BI = dyn_cast<BranchInst>(TI)) {
      uint64_t TrueWeight = 0, FalseWeight = 0;
      if (!BI->isConditional() ||
          !BI->extractProfMetadata(TrueWeight, FalseWeight))
        continue;

      // If the exit path is more frequent than the loop path,
      // we return here without further analysis for this loop.
      bool TrueIsExit = !L->contains(BI->getSuccessor(0));
      if ((TrueIsExit && FalseWeight < TrueWeight) ||
          (!TrueIsExit && FalseWeight > TrueWeight))
        return false;
    }
  }

  LLVMContext &C = L->getHeader()->getContext();
  HWLoopInfo.CountType = TM.isPPC64() ?
    Type::getInt64Ty(C) : Type::getInt32Ty(C);
  HWLoopInfo.LoopDecrement = ConstantInt::get(HWLoopInfo.CountType, 1);
  return true;
}

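Putting the checks together, here is a hedged illustration of the kind of loops this hook rejects or accepts; the function names and bounds are invented, and the second loop is only a typical candidate, subject to the mightUseCTR checks above.

// Illustrative only (not part of this file).
void tiny(double *A) {
  for (int i = 0; i < 3; ++i)   // constant trip count below -min-ctr-loop-threshold,
    A[i] += 1.0;                // small body: kept as a compare-and-branch loop
}
void longer(double *A, long N) {
  for (long i = 0; i < N; ++i)  // unknown trip count, no calls: a typical
    A[i] += 1.0;                // candidate for the mtctr/bdnz form on PPC64
}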
void PPCTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                                         TTI::UnrollingPreferences &UP) {
  if (ST->getDarwinDirective() == PPC::DIR_A2) {
    // The A2 is in-order with a deep pipeline, and concatenation unrolling
    // helps expose latency-hiding opportunities to the instruction scheduler.
    UP.Partial = UP.Runtime = true;

    // We unroll a lot on the A2 (hundreds of instructions), and the benefits
    // often outweigh the cost of a division to compute the trip count.
    UP.AllowExpensiveTripCount = true;
  }

  BaseT::getUnrollingPreferences(L, SE, UP);
}

// This function returns true to allow using the coldcc calling convention.
// Returning true results in coldcc being used for functions which are cold at
// all call sites when the callers of the functions are not calling any other
// non-coldcc functions.
bool PPCTTIImpl::useColdCCForColdCall(Function &F) {
  return EnablePPCColdCC;
}

bool PPCTTIImpl::enableAggressiveInterleaving(bool LoopHasReductions) {
  // On the A2, always unroll aggressively. For QPX unaligned loads, we depend
  // on combining the loads generated for consecutive accesses, and failure to
  // do so is particularly expensive. Aggressive interleaving makes that
  // combining much more likely (compared to only using concatenation unrolling).
  if (ST->getDarwinDirective() == PPC::DIR_A2)
    return true;

  return LoopHasReductions;
}

PPCTTIImpl::TTI::MemCmpExpansionOptions
PPCTTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const {
  TTI::MemCmpExpansionOptions Options;
  Options.LoadSizes = {8, 4, 2, 1};
  Options.MaxNumLoads = TLI->getMaxExpandSizeMemcmp(OptSize);
  return Options;
}

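These options feed the target-independent ExpandMemCmp pass. An illustrative (not authoritative) consequence of LoadSizes being {8, 4, 2, 1} is that a small fixed-size memcmp can be inlined as a couple of wide load/compare pairs instead of a library call, provided the load count stays within MaxNumLoads:

// Illustrative only (not part of this file).
#include <cstring>

bool sameHeader(const char *a, const char *b) {
  return std::memcmp(a, b, 16) == 0; // 16 bytes -> two 8-byte load/compare pairs
}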
bool PPCTTIImpl::enableInterleavedAccessVectorization() {
  return true;
}

unsigned PPCTTIImpl::getNumberOfRegisters(bool Vector) {
  if (Vector && !ST->hasAltivec() && !ST->hasQPX())
    return 0;
  return ST->hasVSX() ? 64 : 32;
}

unsigned PPCTTIImpl::getRegisterBitWidth(bool Vector) const {
  if (Vector) {
    if (ST->hasQPX()) return 256;
    if (ST->hasAltivec()) return 128;
    return 0;
  }

  if (ST->isPPC64())
    return 64;
  return 32;

}

unsigned PPCTTIImpl::getCacheLineSize() {
  // Check first if the user specified a custom line size.
  if (CacheLineSize.getNumOccurrences() > 0)
    return CacheLineSize;

  // On P7, P8 or P9 we have a cache line size of 128.
  unsigned Directive = ST->getDarwinDirective();
  if (Directive == PPC::DIR_PWR7 || Directive == PPC::DIR_PWR8 ||
      Directive == PPC::DIR_PWR9)
    return 128;

  // On other processors return a default of 64 bytes.
  return 64;
}

unsigned PPCTTIImpl::getPrefetchDistance() {
  // This seems like a reasonable default for the BG/Q (this pass is enabled, by
  // default, only on the BG/Q).
  return 300;
}

unsigned PPCTTIImpl::getMaxInterleaveFactor(unsigned VF) {
  unsigned Directive = ST->getDarwinDirective();
  // The 440 has no SIMD support, but floating-point instructions
  // have a 5-cycle latency, so unroll by 5x for latency hiding.
  if (Directive == PPC::DIR_440)
    return 5;

  // The A2 has no SIMD support, but floating-point instructions
  // have a 6-cycle latency, so unroll by 6x for latency hiding.
  if (Directive == PPC::DIR_A2)
    return 6;

  // FIXME: For lack of any better information, do no harm...
  if (Directive == PPC::DIR_E500mc || Directive == PPC::DIR_E5500)
    return 1;

  // For P7 and P8, floating-point instructions have a 6-cycle latency and
  // there are two execution units, so unroll by 12x for latency hiding.
  // FIXME: the same for P9 as previous gen until POWER9 scheduling is ready
  if (Directive == PPC::DIR_PWR7 || Directive == PPC::DIR_PWR8 ||
      Directive == PPC::DIR_PWR9)
    return 12;

  // For most things, modern systems have two execution units (and
  // out-of-order execution).
  return 2;
}

// Adjust the cost of vector instructions on targets where there is overlap
// between the vector and scalar units, thereby reducing the overall throughput
// of vector code wrt. scalar code.
int PPCTTIImpl::vectorCostAdjustment(int Cost, unsigned Opcode, Type *Ty1,
                                     Type *Ty2) {
  if (!ST->vectorsUseTwoUnits() || !Ty1->isVectorTy())
    return Cost;

  std::pair<int, MVT> LT1 = TLI->getTypeLegalizationCost(DL, Ty1);
  // If type legalization involves splitting the vector, we don't want to
  // double the cost at every step - only the last step.
  if (LT1.first != 1 || !LT1.second.isVector())
    return Cost;

  int ISD = TLI->InstructionOpcodeToISD(Opcode);
  if (TLI->isOperationExpand(ISD, LT1.second))
    return Cost;

  if (Ty2) {
    std::pair<int, MVT> LT2 = TLI->getTypeLegalizationCost(DL, Ty2);
    if (LT2.first != 1 || !LT2.second.isVector())
      return Cost;
  }

  return Cost * 2;
}

int PPCTTIImpl::getArithmeticInstrCost(
    unsigned Opcode, Type *Ty, TTI::OperandValueKind Op1Info,
    TTI::OperandValueKind Op2Info, TTI::OperandValueProperties Opd1PropInfo,
    TTI::OperandValueProperties Opd2PropInfo, ArrayRef<const Value *> Args) {
  assert(TLI->InstructionOpcodeToISD(Opcode) && "Invalid opcode");

  // Fallback to the default implementation.
  int Cost = BaseT::getArithmeticInstrCost(Opcode, Ty, Op1Info, Op2Info,
                                           Opd1PropInfo, Opd2PropInfo);
  return vectorCostAdjustment(Cost, Opcode, Ty, nullptr);
}

int PPCTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
                               Type *SubTp) {
  // Legalize the type.
  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);

  // PPC, for both Altivec/VSX and QPX, support cheap arbitrary permutations
  // (at least in the sense that there need only be one non-loop-invariant
  // instruction). We need one such shuffle instruction for each actual
  // register (this is not true for arbitrary shuffles, but is true for the
  // structured types of shuffles covered by TTI::ShuffleKind).
  return vectorCostAdjustment(LT.first, Instruction::ShuffleVector, Tp,
                              nullptr);
}

int PPCTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
                                 const Instruction *I) {
  assert(TLI->InstructionOpcodeToISD(Opcode) && "Invalid opcode");

  int Cost = BaseT::getCastInstrCost(Opcode, Dst, Src);
  return vectorCostAdjustment(Cost, Opcode, Dst, Src);
}

int PPCTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy,
                                   const Instruction *I) {
  int Cost = BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, I);
  return vectorCostAdjustment(Cost, Opcode, ValTy, nullptr);
}

int PPCTTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index) {
  assert(Val->isVectorTy() && "This must be a vector type");

  int ISD = TLI->InstructionOpcodeToISD(Opcode);
  assert(ISD && "Invalid opcode");

  int Cost = BaseT::getVectorInstrCost(Opcode, Val, Index);
  Cost = vectorCostAdjustment(Cost, Opcode, Val, nullptr);

  if (ST->hasVSX() && Val->getScalarType()->isDoubleTy()) {
    // Double-precision scalars are already located in index #0 (or #1 if LE).
    if (ISD == ISD::EXTRACT_VECTOR_ELT &&
        Index == (ST->isLittleEndian() ? 1 : 0))
      return 0;

    return Cost;

  } else if (ST->hasQPX() && Val->getScalarType()->isFloatingPointTy()) {
    // Floating point scalars are already located in index #0.
    if (Index == 0)
      return 0;

    return Cost;
  }

  // Estimated cost of a load-hit-store delay.  This was obtained
  // experimentally as a minimum needed to prevent unprofitable
  // vectorization for the paq8p benchmark.  It may need to be
  // raised further if other unprofitable cases remain.
  unsigned LHSPenalty = 2;
  if (ISD == ISD::INSERT_VECTOR_ELT)
    LHSPenalty += 7;

  // Vector element insert/extract with Altivec is very expensive,
  // because they require store and reload with the attendant
  // processor stall for load-hit-store.  Until VSX is available,
  // these need to be estimated as very costly.
  if (ISD == ISD::EXTRACT_VECTOR_ELT ||
      ISD == ISD::INSERT_VECTOR_ELT)
    return LHSPenalty + Cost;

  return Cost;
}

int PPCTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
                                unsigned AddressSpace, const Instruction *I) {
  // Legalize the type.
  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Src);
  assert((Opcode == Instruction::Load || Opcode == Instruction::Store) &&
         "Invalid Opcode");

  int Cost = BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace);
  Cost = vectorCostAdjustment(Cost, Opcode, Src, nullptr);

  bool IsAltivecType = ST->hasAltivec() &&
                       (LT.second == MVT::v16i8 || LT.second == MVT::v8i16 ||
                        LT.second == MVT::v4i32 || LT.second == MVT::v4f32);
  bool IsVSXType = ST->hasVSX() &&
                   (LT.second == MVT::v2f64 || LT.second == MVT::v2i64);
  bool IsQPXType = ST->hasQPX() &&
                   (LT.second == MVT::v4f64 || LT.second == MVT::v4f32);

  // VSX has 32b/64b load instructions. Legalization can handle loading of
  // 32b/64b to VSR correctly and cheaply. But BaseT::getMemoryOpCost and
  // PPCTargetLowering can't compute the cost appropriately. So here we
  // explicitly check this case.
  unsigned MemBytes = Src->getPrimitiveSizeInBits();
  if (Opcode == Instruction::Load && ST->hasVSX() && IsAltivecType &&
      (MemBytes == 64 || (ST->hasP8Vector() && MemBytes == 32)))
    return 1;

  // Aligned loads and stores are easy.
  unsigned SrcBytes = LT.second.getStoreSize();
  if (!SrcBytes || !Alignment || Alignment >= SrcBytes)
    return Cost;

  // If we can use the permutation-based load sequence, then this is also
  // relatively cheap (not counting loop-invariant instructions): one load plus
  // one permute (the last load in a series has extra cost, but we're
  // neglecting that here). Note that on the P7, we could do unaligned loads
  // for Altivec types using the VSX instructions, but that's more expensive
  // than using the permutation-based load sequence. On the P8, that's no
  // longer true.
  if (Opcode == Instruction::Load &&
      ((!ST->hasP8Vector() && IsAltivecType) || IsQPXType) &&
      Alignment >= LT.second.getScalarType().getStoreSize())
    return Cost + LT.first; // Add the cost of the permutations.

  // For VSX, we can do unaligned loads and stores on Altivec/VSX types. On the
  // P7, unaligned vector loads are more expensive than the permutation-based
  // load sequence, so that might be used instead, but regardless, the net cost
  // is about the same (not counting loop-invariant instructions).
  if (IsVSXType || (ST->hasVSX() && IsAltivecType))
    return Cost;

  // Newer PPC supports unaligned memory access.
  if (TLI->allowsMisalignedMemoryAccesses(LT.second, 0))
    return Cost;

  // PPC in general does not support unaligned loads and stores. They'll need
  // to be decomposed based on the alignment factor.

  // Add the cost of each scalar load or store.
  Cost += LT.first*(SrcBytes/Alignment-1);

  // For a vector type, there is also scalarization overhead (only for
  // stores, loads are expanded using the vector-load + permutation sequence,
  // which is much less expensive).
  if (Src->isVectorTy() && Opcode == Instruction::Store)
    for (int i = 0, e = Src->getVectorNumElements(); i < e; ++i)
      Cost += getVectorInstrCost(Instruction::ExtractElement, Src, i);

  return Cost;
}

int PPCTTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
                                           unsigned Factor,
                                           ArrayRef<unsigned> Indices,
                                           unsigned Alignment,
                                           unsigned AddressSpace,
                                           bool UseMaskForCond,
                                           bool UseMaskForGaps) {
  if (UseMaskForCond || UseMaskForGaps)
    return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
                                             Alignment, AddressSpace,
                                             UseMaskForCond, UseMaskForGaps);

  assert(isa<VectorType>(VecTy) &&
         "Expect a vector type for interleaved memory op");

  // Legalize the type.
  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, VecTy);

  // Firstly, the cost of load/store operation.
  int Cost = getMemoryOpCost(Opcode, VecTy, Alignment, AddressSpace);

  // PPC, for both Altivec/VSX and QPX, support cheap arbitrary permutations
  // (at least in the sense that there need only be one non-loop-invariant
  // instruction). For each result vector, we need one shuffle per incoming
  // vector (except that the first shuffle can take two incoming vectors
  // because it does not need to take itself).
  Cost += Factor*(LT.first-1);

  return Cost;
}

bool PPCTTIImpl::canSaveCmp(Loop *L, BranchInst **BI, ScalarEvolution *SE,
                            LoopInfo *LI, DominatorTree *DT,
                            AssumptionCache *AC, TargetLibraryInfo *LibInfo) {
  // Process nested loops first.
  for (Loop::iterator I = L->begin(), E = L->end(); I != E; ++I)
    if (canSaveCmp(*I, BI, SE, LI, DT, AC, LibInfo))
      return false; // Stop search.

  HardwareLoopInfo HWLoopInfo(L);

  if (!HWLoopInfo.canAnalyze(*LI))
    return false;

  if (!isHardwareLoopProfitable(L, *SE, *AC, LibInfo, HWLoopInfo))
    return false;

  if (!HWLoopInfo.isHardwareLoopCandidate(*SE, *LI, *DT))
    return false;

  *BI = HWLoopInfo.ExitBranch;
  return true;
}
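As a closing illustration of how the constants in getVectorInstrCost and getMemoryOpCost combine, here is a hedged worked example; the BaseT contributions are left symbolic because they depend on the subtarget and on type legalization, so only the target-specific additions shown above are spelled out.

// Worked example (illustrative only, not part of this file):
//
//   insertelement into <4 x float> on a pre-VSX Altivec subtarget:
//     Cost = BaseCost + LHSPenalty = BaseCost + (2 + 7)
//
//   store <4 x float> with 4-byte alignment, no fast unaligned access:
//     Cost = BaseCost
//            + LT.first * (SrcBytes/Alignment - 1)        // 16/4 - 1 = 3
//            + sum over the 4 elements of
//              getVectorInstrCost(ExtractElement, <4 x float>, i)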