Coverage Report

Created: 2017-10-03 07:32

/Users/buildslave/jenkins/sharedspace/clang-stage2-coverage-R@2/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp

//===-- AArch64TargetTransformInfo.cpp - AArch64 specific TTI -------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//

#include "AArch64TargetTransformInfo.h"
#include "MCTargetDesc/AArch64AddressingModes.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/CodeGen/BasicTTIImpl.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/Support/Debug.h"
#include "llvm/Target/CostTable.h"
#include "llvm/Target/TargetLowering.h"
#include <algorithm>
using namespace llvm;

#define DEBUG_TYPE "aarch64tti"

static cl::opt<bool> EnableFalkorHWPFUnrollFix("enable-falkor-hwpf-unroll-fix",
                                               cl::init(true), cl::Hidden);

// Coverage: 1.95M executions.
bool AArch64TTIImpl::areInlineCompatible(const Function *Caller,
                                         const Function *Callee) const {
  const TargetMachine &TM = getTLI()->getTargetMachine();

  const FeatureBitset &CallerBits =
      TM.getSubtargetImpl(*Caller)->getFeatureBits();
  const FeatureBitset &CalleeBits =
      TM.getSubtargetImpl(*Callee)->getFeatureBits();

  // Inline a callee if its target-features are a subset of the caller's
  // target-features.
  return (CallerBits & CalleeBits) == CalleeBits;
}

// Coverage: 3.43M executions.
/// \brief Calculate the cost of materializing a 64-bit value. This helper
/// method might only calculate a fraction of a larger immediate. Therefore it
/// is valid to return a cost of ZERO.
int AArch64TTIImpl::getIntImmCost(int64_t Val) {
  // Check if the immediate can be encoded within an instruction.
  if (Val == 0 || AArch64_AM::isLogicalImmediate(Val, 64))
    return 0;

  if (Val < 0)
    Val = ~Val;

  // Calculate how many moves we will need to materialize this constant.
  unsigned LZ = countLeadingZeros((uint64_t)Val);
  return (64 - LZ + 15) / 16;
}
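
// Example: Val = 0x12345678 is neither zero nor a logical immediate and has 35
// leading zeros, so the cost is (64 - 35 + 15) / 16 = 2, i.e. one MOVZ plus one
// MOVK to materialize it.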

// Coverage: 3.43M executions.
/// \brief Calculate the cost of materializing the given constant.
int AArch64TTIImpl::getIntImmCost(const APInt &Imm, Type *Ty) {
  assert(Ty->isIntegerTy());

  unsigned BitSize = Ty->getPrimitiveSizeInBits();
  if (BitSize == 0)
    return ~0U;

  // Sign-extend all constants to a multiple of 64-bit.
  APInt ImmVal = Imm;
  if (BitSize & 0x3f)
    ImmVal = Imm.sext((BitSize + 63) & ~0x3fU);

  // Split the constant into 64-bit chunks and calculate the cost for each
  // chunk.
  int Cost = 0;
  for (unsigned ShiftVal = 0; ShiftVal < BitSize; ShiftVal += 64) {
    APInt Tmp = ImmVal.ashr(ShiftVal).sextOrTrunc(64);
    int64_t Val = Tmp.getSExtValue();
    Cost += getIntImmCost(Val);
  }
  // We need at least one instruction to materialize the constant.
  return std::max(1, Cost);
}
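
// Example: an i128 immediate is costed as two 64-bit chunks whose individual
// costs are summed; an all-zero constant still reports a cost of 1 because at
// least one instruction is needed to materialize it.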

// Coverage: 6.27M executions.
int AArch64TTIImpl::getIntImmCost(unsigned Opcode, unsigned Idx,
                                  const APInt &Imm, Type *Ty) {
  assert(Ty->isIntegerTy());

  unsigned BitSize = Ty->getPrimitiveSizeInBits();
  // There is no cost model for constants with a bit size of 0. Return TCC_Free
  // here, so that constant hoisting will ignore this constant.
  if (BitSize == 0)
    return TTI::TCC_Free;

  unsigned ImmIdx = ~0U;
  switch (Opcode) {
  default:
    return TTI::TCC_Free;
  case Instruction::GetElementPtr:
    // Always hoist the base address of a GetElementPtr.
    if (Idx == 0)
      return 2 * TTI::TCC_Basic;
    return TTI::TCC_Free;
  case Instruction::Store:
    ImmIdx = 0;
    break;
  case Instruction::Add:
  case Instruction::Sub:
  case Instruction::Mul:
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::URem:
  case Instruction::SRem:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor:
  case Instruction::ICmp:
    ImmIdx = 1;
    break;
  // Always return TCC_Free for the shift value of a shift instruction.
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
    if (Idx == 1)
      return TTI::TCC_Free;
    break;
  case Instruction::Trunc:
  case Instruction::ZExt:
  case Instruction::SExt:
  case Instruction::IntToPtr:
  case Instruction::PtrToInt:
  case Instruction::BitCast:
  case Instruction::PHI:
  case Instruction::Call:
  case Instruction::Select:
  case Instruction::Ret:
  case Instruction::Load:
    break;
  }

  if (Idx == ImmIdx) {
    int NumConstants = (BitSize + 63) / 64;
    int Cost = AArch64TTIImpl::getIntImmCost(Imm, Ty);
    return (Cost <= NumConstants * TTI::TCC_Basic)
               ? static_cast<int>(TTI::TCC_Free)
               : Cost;
  }
  return AArch64TTIImpl::getIntImmCost(Imm, Ty);
}
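
// Example: for an i64 "icmp eq" against the constant 0x1234567890AB, Idx == 1
// matches ImmIdx. The immediate needs three 16-bit chunks (cost 3), which is
// greater than NumConstants * TTI::TCC_Basic = 1, so it is not reported as
// TCC_Free and constant hoisting will consider hoisting it; a small immediate
// such as 42 costs 1 and stays TCC_Free.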

// Coverage: 338k executions.
int AArch64TTIImpl::getIntImmCost(Intrinsic::ID IID, unsigned Idx,
                                  const APInt &Imm, Type *Ty) {
  assert(Ty->isIntegerTy());

  unsigned BitSize = Ty->getPrimitiveSizeInBits();
  // There is no cost model for constants with a bit size of 0. Return TCC_Free
  // here, so that constant hoisting will ignore this constant.
  if (BitSize == 0)
    return TTI::TCC_Free;

  switch (IID) {
  default:
    return TTI::TCC_Free;
  case Intrinsic::sadd_with_overflow:
  case Intrinsic::uadd_with_overflow:
  case Intrinsic::ssub_with_overflow:
  case Intrinsic::usub_with_overflow:
  case Intrinsic::smul_with_overflow:
  case Intrinsic::umul_with_overflow:
    if (Idx == 1) {
      int NumConstants = (BitSize + 63) / 64;
      int Cost = AArch64TTIImpl::getIntImmCost(Imm, Ty);
      return (Cost <= NumConstants * TTI::TCC_Basic)
                 ? static_cast<int>(TTI::TCC_Free)
                 : Cost;
    }
    break;
  case Intrinsic::experimental_stackmap:
    if ((Idx < 2) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
      return TTI::TCC_Free;
    break;
  case Intrinsic::experimental_patchpoint_void:
  case Intrinsic::experimental_patchpoint_i64:
    if ((Idx < 4) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
      return TTI::TCC_Free;
    break;
  }
  return AArch64TTIImpl::getIntImmCost(Imm, Ty);
}

// Coverage: 219k executions.
TargetTransformInfo::PopcntSupportKind
AArch64TTIImpl::getPopcntSupport(unsigned TyWidth) {
  assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
  if (TyWidth == 32 || TyWidth == 64)
    return TTI::PSK_FastHardware;
  // TODO: AArch64TargetLowering::LowerCTPOP() supports 128bit popcount.
  return TTI::PSK_Software;
}

// Coverage: 980k executions.
bool AArch64TTIImpl::isWideningInstruction(Type *DstTy, unsigned Opcode,
                                           ArrayRef<const Value *> Args) {

  // A helper that returns a vector type from the given type. The number of
  // elements in type Ty determines the vector width.
  auto toVectorTy = [&](Type *ArgTy) {
    return VectorType::get(ArgTy->getScalarType(),
                           DstTy->getVectorNumElements());
  };

  // Exit early if DstTy is not a vector type whose elements are at least
  // 16-bits wide.
  if (!DstTy->isVectorTy() || DstTy->getScalarSizeInBits() < 16)
    return false;

  // Determine if the operation has a widening variant. We consider both the
  // "long" (e.g., usubl) and "wide" (e.g., usubw) versions of the
  // instructions.
  //
  // TODO: Add additional widening operations (e.g., mul, shl, etc.) once we
  //       verify that their extending operands are eliminated during code
  //       generation.
  switch (Opcode) {
  case Instruction::Add: // UADDL(2), SADDL(2), UADDW(2), SADDW(2).
  case Instruction::Sub: // USUBL(2), SSUBL(2), USUBW(2), SSUBW(2).
    break;
  default:
    return false;
  }

  // To be a widening instruction (either the "wide" or "long" version), the
  // second operand must be a sign- or zero-extend having a single user. We
  // only consider extends having a single user because they may otherwise not
  // be eliminated.
  if (Args.size() != 2 ||
      (!isa<SExtInst>(Args[1]) && !isa<ZExtInst>(Args[1])) ||
      !Args[1]->hasOneUse())
    return false;
  auto *Extend = cast<CastInst>(Args[1]);

  // Legalize the destination type and ensure it can be used in a widening
  // operation.
  auto DstTyL = TLI->getTypeLegalizationCost(DL, DstTy);
  unsigned DstElTySize = DstTyL.second.getScalarSizeInBits();
  if (!DstTyL.second.isVector() || DstElTySize != DstTy->getScalarSizeInBits())
    return false;

  // Legalize the source type and ensure it can be used in a widening
  // operation.
  Type *SrcTy = toVectorTy(Extend->getSrcTy());
  auto SrcTyL = TLI->getTypeLegalizationCost(DL, SrcTy);
  unsigned SrcElTySize = SrcTyL.second.getScalarSizeInBits();
  if (!SrcTyL.second.isVector() || SrcElTySize != SrcTy->getScalarSizeInBits())
    return false;

  // Get the total number of vector elements in the legalized types.
  unsigned NumDstEls = DstTyL.first * DstTyL.second.getVectorNumElements();
  unsigned NumSrcEls = SrcTyL.first * SrcTyL.second.getVectorNumElements();

  // Return true if the legalized types have the same number of vector elements
  // and the destination element type size is twice that of the source type.
  return NumDstEls == NumSrcEls && 2 * SrcElTySize == DstElTySize;
}
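
// Example (hypothetical IR): in the sequence
//   %e = zext <4 x i16> %x to <4 x i32>
//   %a = add <4 x i32> %y, %e
// the zext is the add's second operand and has a single use, both types
// legalize to four-element vectors, and 2 * 16 == 32, so the add qualifies as a
// widening instruction (selectable as UADDW) and the extend is treated as free.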

// Coverage: 241k executions.
int AArch64TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
                                     const Instruction *I) {
  int ISD = TLI->InstructionOpcodeToISD(Opcode);
  assert(ISD && "Invalid opcode");

  // If the cast is observable, and it is used by a widening instruction (e.g.,
  // uaddl, saddw, etc.), it may be free.
  if (I && I->hasOneUse()) {
    auto *SingleUser = cast<Instruction>(*I->user_begin());
    SmallVector<const Value *, 4> Operands(SingleUser->operand_values());
    if (isWideningInstruction(Dst, SingleUser->getOpcode(), Operands)) {
      // If the cast is the second operand, it is free. We will generate either
      // a "wide" or "long" version of the widening instruction.
      if (I == SingleUser->getOperand(1))
        return 0;
      // If the cast is not the second operand, it will be free if it looks the
      // same as the second operand. In this case, we will generate a "long"
      // version of the widening instruction.
      if (auto *Cast = dyn_cast<CastInst>(SingleUser->getOperand(1)))
        if (I->getOpcode() == Cast->getOpcode() &&
            cast<CastInst>(I)->getSrcTy() == Cast->getSrcTy())
          return 0;
    }
  }

  EVT SrcTy = TLI->getValueType(DL, Src);
  EVT DstTy = TLI->getValueType(DL, Dst);

  if (!SrcTy.isSimple() || !DstTy.isSimple())
    return BaseT::getCastInstrCost(Opcode, Dst, Src);

  static const TypeConversionCostTblEntry
  ConversionTbl[] = {
    { ISD::TRUNCATE, MVT::v4i16, MVT::v4i32,  1 },
    { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64,  0 },
    { ISD::TRUNCATE, MVT::v8i8,  MVT::v8i32,  3 },
    { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 6 },

    // The number of shll instructions for the extension.
    { ISD::SIGN_EXTEND, MVT::v4i64,  MVT::v4i16, 3 },
    { ISD::ZERO_EXTEND, MVT::v4i64,  MVT::v4i16, 3 },
    { ISD::SIGN_EXTEND, MVT::v4i64,  MVT::v4i32, 2 },
    { ISD::ZERO_EXTEND, MVT::v4i64,  MVT::v4i32, 2 },
    { ISD::SIGN_EXTEND, MVT::v8i32,  MVT::v8i8,  3 },
    { ISD::ZERO_EXTEND, MVT::v8i32,  MVT::v8i8,  3 },
    { ISD::SIGN_EXTEND, MVT::v8i32,  MVT::v8i16, 2 },
    { ISD::ZERO_EXTEND, MVT::v8i32,  MVT::v8i16, 2 },
    { ISD::SIGN_EXTEND, MVT::v8i64,  MVT::v8i8,  7 },
    { ISD::ZERO_EXTEND, MVT::v8i64,  MVT::v8i8,  7 },
    { ISD::SIGN_EXTEND, MVT::v8i64,  MVT::v8i16, 6 },
    { ISD::ZERO_EXTEND, MVT::v8i64,  MVT::v8i16, 6 },
    { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 2 },
    { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 2 },
    { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 6 },
    { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 6 },

    // LowerVectorINT_TO_FP:
    { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i32, 1 },
    { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 },
    { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i64, 1 },
    { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 1 },
    { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 },
    { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 1 },

    // Complex: to v2f32
    { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i8,  3 },
    { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i16, 3 },
    { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i64, 2 },
    { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i8,  3 },
    { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i16, 3 },
    { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i64, 2 },

    // Complex: to v4f32
    { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i8,  4 },
    { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i16, 2 },
    { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i8,  3 },
    { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i16, 2 },

    // Complex: to v8f32
    { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i8,  10 },
    { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i16, 4 },
    { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i8,  10 },
    { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, 4 },

    // Complex: to v16f32
    { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i8, 21 },
    { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i8, 21 },

    // Complex: to v2f64
    { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i8,  4 },
    { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i16, 4 },
    { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i32, 2 },
    { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i8,  4 },
    { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i16, 4 },
    { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, 2 },


    // LowerVectorFP_TO_INT
    { ISD::FP_TO_SINT, MVT::v2i32, MVT::v2f32, 1 },
    { ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f32, 1 },
    { ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f64, 1 },
    { ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f32, 1 },
    { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 1 },
    { ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f64, 1 },

    // Complex, from v2f32: legal type is v2i32 (no cost) or v2i64 (1 ext).
    { ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f32, 2 },
    { ISD::FP_TO_SINT, MVT::v2i16, MVT::v2f32, 1 },
    { ISD::FP_TO_SINT, MVT::v2i8,  MVT::v2f32, 1 },
    { ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f32, 2 },
    { ISD::FP_TO_UINT, MVT::v2i16, MVT::v2f32, 1 },
    { ISD::FP_TO_UINT, MVT::v2i8,  MVT::v2f32, 1 },

    // Complex, from v4f32: legal type is v4i16, 1 narrowing => ~2
    { ISD::FP_TO_SINT, MVT::v4i16, MVT::v4f32, 2 },
    { ISD::FP_TO_SINT, MVT::v4i8,  MVT::v4f32, 2 },
    { ISD::FP_TO_UINT, MVT::v4i16, MVT::v4f32, 2 },
    { ISD::FP_TO_UINT, MVT::v4i8,  MVT::v4f32, 2 },

    // Complex, from v2f64: legal type is v2i32, 1 narrowing => ~2.
    { ISD::FP_TO_SINT, MVT::v2i32, MVT::v2f64, 2 },
    { ISD::FP_TO_SINT, MVT::v2i16, MVT::v2f64, 2 },
    { ISD::FP_TO_SINT, MVT::v2i8,  MVT::v2f64, 2 },
    { ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f64, 2 },
    { ISD::FP_TO_UINT, MVT::v2i16, MVT::v2f64, 2 },
    { ISD::FP_TO_UINT, MVT::v2i8,  MVT::v2f64, 2 },
  };

  if (const auto *Entry = ConvertCostTableLookup(ConversionTbl, ISD,
                                                 DstTy.getSimpleVT(),
                                                 SrcTy.getSimpleVT()))
    return Entry->Cost;

  return BaseT::getCastInstrCost(Opcode, Dst, Src);
}
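
// Example of a table hit: sign-extending <8 x i8> to <8 x i32> matches the
// { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 3 } entry and therefore costs 3,
// corresponding to the number of shll steps; conversions not listed in the
// table fall back to BaseT::getCastInstrCost.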

// Coverage: 12.4k executions.
int AArch64TTIImpl::getExtractWithExtendCost(unsigned Opcode, Type *Dst,
                                             VectorType *VecTy,
                                             unsigned Index) {

  // Make sure we were given a valid extend opcode.
  assert((Opcode == Instruction::SExt || Opcode == Instruction::ZExt) &&
         "Invalid opcode");

  // We are extending an element we extract from a vector, so the source type
  // of the extend is the element type of the vector.
  auto *Src = VecTy->getElementType();

  // Sign- and zero-extends are for integer types only.
  assert(isa<IntegerType>(Dst) && isa<IntegerType>(Src) && "Invalid type");

  // Get the cost for the extract. We compute the cost (if any) for the extend
  // below.
  auto Cost = getVectorInstrCost(Instruction::ExtractElement, VecTy, Index);

  // Legalize the types.
  auto VecLT = TLI->getTypeLegalizationCost(DL, VecTy);
  auto DstVT = TLI->getValueType(DL, Dst);
  auto SrcVT = TLI->getValueType(DL, Src);

  // If the resulting type is still a vector and the destination type is legal,
  // we may get the extension for free. If not, get the default cost for the
  // extend.
  if (!VecLT.second.isVector() || !TLI->isTypeLegal(DstVT))
    return Cost + getCastInstrCost(Opcode, Dst, Src);

  // The destination type should be larger than the element type. If not, get
  // the default cost for the extend.
  if (DstVT.getSizeInBits() < SrcVT.getSizeInBits())
    return Cost + getCastInstrCost(Opcode, Dst, Src);

  switch (Opcode) {
  default:
    llvm_unreachable("Opcode should be either SExt or ZExt");

  // For sign-extends, we only need a smov, which performs the extension
  // automatically.
  case Instruction::SExt:
    return Cost;

  // For zero-extends, the extend is performed automatically by a umov unless
  // the destination type is i64 and the element type is i8 or i16.
  case Instruction::ZExt:
    if (DstVT.getSizeInBits() != 64u || SrcVT.getSizeInBits() == 32u)
      return Cost;
  }

  // If we are unable to perform the extend for free, get the default cost.
  return Cost + getCastInstrCost(Opcode, Dst, Src);
}
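
// Example: sign-extending a lane extracted from <4 x i32> to i64 costs only the
// extract itself, since SMOV extends while moving; zero-extending an i8 lane to
// i64 does not get the extension for free, so the cast cost is added on top.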

// Coverage: 2.29M executions.
int AArch64TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
                                       unsigned Index) {
  assert(Val->isVectorTy() && "This must be a vector type");

  if (Index != -1U) {
    // Legalize the type.
    std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Val);

    // This type is legalized to a scalar type.
    if (!LT.second.isVector())
      return 0;

    // The type may be split. Normalize the index to the new type.
    unsigned Width = LT.second.getVectorNumElements();
    Index = Index % Width;

    // The element at index zero is already inside the vector.
    if (Index == 0)
      return 0;
  }

  // All other insert/extracts cost this much.
  return ST->getVectorInsertExtractBaseCost();
}
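
// Example: extracting lane 0 of a <4 x i32> vector is free, and so is lane 4 of
// an <8 x i32> vector, because that type splits into two v4i32 registers and
// the normalized index 4 % 4 becomes 0; any other lane costs the subtarget's
// insert/extract base cost.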

// Coverage: 759k executions.
int AArch64TTIImpl::getArithmeticInstrCost(
    unsigned Opcode, Type *Ty, TTI::OperandValueKind Opd1Info,
    TTI::OperandValueKind Opd2Info, TTI::OperandValueProperties Opd1PropInfo,
    TTI::OperandValueProperties Opd2PropInfo, ArrayRef<const Value *> Args) {
  // Legalize the type.
  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);

  // If the instruction is a widening instruction (e.g., uaddl, saddw, etc.),
  // add in the widening overhead specified by the sub-target. Since the
  // extends feeding widening instructions are performed automatically, they
  // aren't present in the generated code and have a zero cost. By adding a
  // widening overhead here, we attach the total cost of the combined operation
  // to the widening instruction.
  int Cost = 0;
  if (isWideningInstruction(Ty, Opcode, Args))
    Cost += ST->getWideningBaseCost();

  int ISD = TLI->InstructionOpcodeToISD(Opcode);

  if (ISD == ISD::SDIV &&
      Opd2Info == TargetTransformInfo::OK_UniformConstantValue &&
      Opd2PropInfo == TargetTransformInfo::OP_PowerOf2) {
    // On AArch64, scalar signed division by a power-of-two constant is
    // normally expanded to the sequence ADD + CMP + SELECT + SRA.
    // The OperandValue properties may not be the same as those of the previous
    // operation; conservatively assume OP_None.
    Cost += getArithmeticInstrCost(Instruction::Add, Ty, Opd1Info, Opd2Info,
                                   TargetTransformInfo::OP_None,
                                   TargetTransformInfo::OP_None);
    Cost += getArithmeticInstrCost(Instruction::Sub, Ty, Opd1Info, Opd2Info,
                                   TargetTransformInfo::OP_None,
                                   TargetTransformInfo::OP_None);
    Cost += getArithmeticInstrCost(Instruction::Select, Ty, Opd1Info, Opd2Info,
                                   TargetTransformInfo::OP_None,
                                   TargetTransformInfo::OP_None);
    Cost += getArithmeticInstrCost(Instruction::AShr, Ty, Opd1Info, Opd2Info,
                                   TargetTransformInfo::OP_None,
                                   TargetTransformInfo::OP_None);
    return Cost;
  }

  switch (ISD) {
  default:
    return Cost + BaseT::getArithmeticInstrCost(Opcode, Ty, Opd1Info, Opd2Info,
                                                Opd1PropInfo, Opd2PropInfo);
  case ISD::ADD:
  case ISD::MUL:
  case ISD::XOR:
  case ISD::OR:
  case ISD::AND:
    // These nodes are marked as 'custom' for combining purposes only.
    // We know that they are legal. See LowerAdd in ISelLowering.
    return (Cost + 1) * LT.first;
  }
}
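
// Example: an add on the legal type <4 x i32> returns (0 + 1) * 1 = 1, while an
// add on <8 x i32>, which is split into two v4i32 operations, returns
// (0 + 1) * 2 = 2; a scalar sdiv by a power-of-two constant instead sums the
// costs of the four instructions in the expansion computed above.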

// Coverage: 76.7k executions.
int AArch64TTIImpl::getAddressComputationCost(Type *Ty, ScalarEvolution *SE,
                                              const SCEV *Ptr) {
  // Address computations in vectorized code with non-consecutive addresses will
  // likely result in more instructions compared to scalar code where the
  // computation can more often be merged into the index mode. The resulting
  // extra micro-ops can significantly decrease throughput.
  unsigned NumVectorInstToHideOverhead = 10;
  int MaxMergeDistance = 64;

  if (Ty->isVectorTy() && SE &&
      !BaseT::isConstantStridedAccessLessThan(SE, Ptr, MaxMergeDistance + 1))
    return NumVectorInstToHideOverhead;

  // In many cases the address computation is not merged into the instruction
  // addressing mode.
  return 1;
}

// Coverage: 307k executions.
int AArch64TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
                                       Type *CondTy, const Instruction *I) {

  int ISD = TLI->InstructionOpcodeToISD(Opcode);
  // We don't lower some vector selects well that are wider than the register
  // width.
  if (ValTy->isVectorTy() && ISD == ISD::SELECT) {
    // We would need this many instructions to hide the scalarization happening.
    const int AmortizationCost = 20;
    static const TypeConversionCostTblEntry
    VectorSelectTbl[] = {
      { ISD::SELECT, MVT::v16i1, MVT::v16i16, 16 },
      { ISD::SELECT, MVT::v8i1, MVT::v8i32, 8 },
      { ISD::SELECT, MVT::v16i1, MVT::v16i32, 16 },
      { ISD::SELECT, MVT::v4i1, MVT::v4i64, 4 * AmortizationCost },
      { ISD::SELECT, MVT::v8i1, MVT::v8i64, 8 * AmortizationCost },
      { ISD::SELECT, MVT::v16i1, MVT::v16i64, 16 * AmortizationCost }
    };

    EVT SelCondTy = TLI->getValueType(DL, CondTy);
    EVT SelValTy = TLI->getValueType(DL, ValTy);
    if (SelCondTy.isSimple() && SelValTy.isSimple()) {
      if (const auto *Entry = ConvertCostTableLookup(VectorSelectTbl, ISD,
                                                     SelCondTy.getSimpleVT(),
                                                     SelValTy.getSimpleVT()))
        return Entry->Cost;
    }
  }
  return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, I);
}

// Coverage: 948k executions.
int AArch64TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Ty,
                                    unsigned Alignment, unsigned AddressSpace,
                                    const Instruction *I) {
  auto LT = TLI->getTypeLegalizationCost(DL, Ty);

  if (ST->isMisaligned128StoreSlow() && Opcode == Instruction::Store &&
      LT.second.is128BitVector() && Alignment < 16) {
    // Unaligned stores are extremely inefficient. We don't split all
    // unaligned 128-bit stores because of the negative impact that has been
    // shown in practice on inlined block copy code.
    // We make such stores expensive so that we will only vectorize if there
    // are 6 other instructions getting vectorized.
    const int AmortizationCost = 6;

    return LT.first * 2 * AmortizationCost;
  }

  if (Ty->isVectorTy() && Ty->getVectorElementType()->isIntegerTy(8) &&
      Ty->getVectorNumElements() < 8) {
    // We scalarize the loads/stores because there is no v.4b register and we
    // have to promote the elements to v.4h.
    unsigned NumVecElts = Ty->getVectorNumElements();
    unsigned NumVectorizableInstsToAmortize = NumVecElts * 2;
    // We generate 2 instructions per vector element.
    return NumVectorizableInstsToAmortize * NumVecElts * 2;
  }

  return LT.first;
}
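
// Example: on a subtarget where misaligned 128-bit stores are slow, storing a
// <4 x i32> with alignment below 16 costs LT.first * 2 * 6 = 12, whereas the
// same store with 16-byte alignment costs just LT.first = 1.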

// Coverage: 2.46k executions.
int AArch64TTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
                                               unsigned Factor,
                                               ArrayRef<unsigned> Indices,
                                               unsigned Alignment,
                                               unsigned AddressSpace) {
  assert(Factor >= 2 && "Invalid interleave factor");
  assert(isa<VectorType>(VecTy) && "Expect a vector type");

  if (Factor <= TLI->getMaxSupportedInterleaveFactor()) {
    unsigned NumElts = VecTy->getVectorNumElements();
    auto *SubVecTy = VectorType::get(VecTy->getScalarType(), NumElts / Factor);

    // ldN/stN only support legal vector types of size 64 or 128 in bits.
    // Accesses having vector types that are a multiple of 128 bits can be
    // matched to more than one ldN/stN instruction.
    if (NumElts % Factor == 0 &&
        TLI->isLegalInterleavedAccessType(SubVecTy, DL))
      return Factor * TLI->getNumInterleavedAccesses(SubVecTy, DL);
  }

  return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
                                           Alignment, AddressSpace);
}

// Coverage: 182k executions.
int AArch64TTIImpl::getCostOfKeepingLiveOverCall(ArrayRef<Type *> Tys) {
  int Cost = 0;
  for (auto *I : Tys) {
    if (!I->isVectorTy())
      continue;
    if (I->getScalarSizeInBits() * I->getVectorNumElements() == 128)
      Cost += getMemoryOpCost(Instruction::Store, I, 128, 0) +
        getMemoryOpCost(Instruction::Load, I, 128, 0);
  }
  return Cost;
}

// Coverage: 25.9k executions.
unsigned AArch64TTIImpl::getMaxInterleaveFactor(unsigned VF) {
  return ST->getMaxInterleaveFactor();
}

// For Falkor, we want to avoid having too many strided loads in a loop since
// that can exhaust the HW prefetcher resources.  We adjust the unroller
// MaxCount preference below to attempt to ensure unrolling doesn't create too
// many strided loads.
// Coverage: 3 executions.
static void
getFalkorUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                              TargetTransformInfo::UnrollingPreferences &UP) {
  enum { MaxStridedLoads = 7 };
  auto countStridedLoads = [](Loop *L, ScalarEvolution &SE) {
    int StridedLoads = 0;
    // FIXME? We could make this more precise by looking at the CFG and
    // e.g. not counting loads in each side of an if-then-else diamond.
    for (const auto BB : L->blocks()) {
      for (auto &I : *BB) {
        LoadInst *LMemI = dyn_cast<LoadInst>(&I);
        if (!LMemI)
          continue;

        Value *PtrValue = LMemI->getPointerOperand();
        if (L->isLoopInvariant(PtrValue))
          continue;

        const SCEV *LSCEV = SE.getSCEV(PtrValue);
        const SCEVAddRecExpr *LSCEVAddRec = dyn_cast<SCEVAddRecExpr>(LSCEV);
        if (!LSCEVAddRec || !LSCEVAddRec->isAffine())
          continue;

        // FIXME? We could take pairing of unrolled load copies into account
        // by looking at the AddRec, but we would probably have to limit this
        // to loops with no stores or other memory optimization barriers.
        ++StridedLoads;
        // We've seen enough strided loads that seeing more won't make a
        // difference.
        if (StridedLoads > MaxStridedLoads / 2)
          return StridedLoads;
      }
    }
    return StridedLoads;
  };

  int StridedLoads = countStridedLoads(L, SE);
  DEBUG(dbgs() << "falkor-hwpf: detected " << StridedLoads
               << " strided loads\n");
  // Pick the largest power of 2 unroll count that won't result in too many
  // strided loads.
  if (StridedLoads) {
    UP.MaxCount = 1 << Log2_32(MaxStridedLoads / StridedLoads);
    DEBUG(dbgs() << "falkor-hwpf: setting unroll MaxCount to " << UP.MaxCount
                 << '\n');
  }
}
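
// Example: if three strided loads are found, MaxCount becomes
// 1 << Log2_32(7 / 3) = 2, so the unroller will not unroll the loop more than
// twice; with a single strided load the cap is 1 << Log2_32(7) = 4.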

// Coverage: 636k executions.
void AArch64TTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                                             TTI::UnrollingPreferences &UP) {
  // Enable partial unrolling and runtime unrolling.
  BaseT::getUnrollingPreferences(L, SE, UP);

  // An inner loop is more likely to be hot, and the runtime check can be
  // promoted out from the LICM pass, so the overhead is less; let's try a
  // larger threshold to unroll more loops.
  if (L->getLoopDepth() > 1)
    UP.PartialThreshold *= 2;

  // Disable partial & runtime unrolling on -Os.
  UP.PartialOptSizeThreshold = 0;

  if (ST->getProcFamily() == AArch64Subtarget::Falkor &&
      EnableFalkorHWPFUnrollFix)
    getFalkorUnrollingPreferences(L, SE, UP);
}

// Coverage: 136 executions.
Value *AArch64TTIImpl::getOrCreateResultFromMemIntrinsic(IntrinsicInst *Inst,
                                                         Type *ExpectedType) {
  switch (Inst->getIntrinsicID()) {
  default:
    return nullptr;
  case Intrinsic::aarch64_neon_st2:
  case Intrinsic::aarch64_neon_st3:
  case Intrinsic::aarch64_neon_st4: {
    // Create a struct type
    StructType *ST = dyn_cast<StructType>(ExpectedType);
    if (!ST)
      return nullptr;
    unsigned NumElts = Inst->getNumArgOperands() - 1;
    if (ST->getNumElements() != NumElts)
      return nullptr;
    for (unsigned i = 0, e = NumElts; i != e; ++i) {
      if (Inst->getArgOperand(i)->getType() != ST->getElementType(i))
        return nullptr;
    }
    Value *Res = UndefValue::get(ExpectedType);
    IRBuilder<> Builder(Inst);
    for (unsigned i = 0, e = NumElts; i != e; ++i) {
      Value *L = Inst->getArgOperand(i);
      Res = Builder.CreateInsertValue(Res, L, i);
    }
    return Res;
  }
  case Intrinsic::aarch64_neon_ld2:
  case Intrinsic::aarch64_neon_ld3:
  case Intrinsic::aarch64_neon_ld4:
    if (Inst->getType() == ExpectedType)
      return Inst;
    return nullptr;
  }
}

// Coverage: 369k executions.
bool AArch64TTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst,
                                        MemIntrinsicInfo &Info) {
  switch (Inst->getIntrinsicID()) {
  default:
    break;
  case Intrinsic::aarch64_neon_ld2:
  case Intrinsic::aarch64_neon_ld3:
  case Intrinsic::aarch64_neon_ld4:
    Info.ReadMem = true;
    Info.WriteMem = false;
    Info.PtrVal = Inst->getArgOperand(0);
    break;
  case Intrinsic::aarch64_neon_st2:
  case Intrinsic::aarch64_neon_st3:
  case Intrinsic::aarch64_neon_st4:
    Info.ReadMem = false;
    Info.WriteMem = true;
    Info.PtrVal = Inst->getArgOperand(Inst->getNumArgOperands() - 1);
    break;
  }

  switch (Inst->getIntrinsicID()) {
  default:
    return false;
  case Intrinsic::aarch64_neon_ld2:
  case Intrinsic::aarch64_neon_st2:
    Info.MatchingId = VECTOR_LDST_TWO_ELEMENTS;
    break;
  case Intrinsic::aarch64_neon_ld3:
  case Intrinsic::aarch64_neon_st3:
    Info.MatchingId = VECTOR_LDST_THREE_ELEMENTS;
    break;
  case Intrinsic::aarch64_neon_ld4:
  case Intrinsic::aarch64_neon_st4:
    Info.MatchingId = VECTOR_LDST_FOUR_ELEMENTS;
    break;
  }
  return true;
}

// Coverage: 1.38M executions.
/// See if \p I should be considered for address type promotion. We check if \p
/// I is a sext with the right type and used in memory accesses. If it is used
/// in a "complex" getelementptr, we allow it to be promoted without finding
/// other sext instructions that sign extended the same initial value. A
/// getelementptr is considered as "complex" if it has more than 2 operands.
bool AArch64TTIImpl::shouldConsiderAddressTypePromotion(
    const Instruction &I, bool &AllowPromotionWithoutCommonHeader) {
  bool Considerable = false;
  AllowPromotionWithoutCommonHeader = false;
  if (!isa<SExtInst>(&I))
    return false;
  Type *ConsideredSExtType =
      Type::getInt64Ty(I.getParent()->getParent()->getContext());
  if (I.getType() != ConsideredSExtType)
    return false;
  // See if the sext is the one with the right type and used in at least one
  // GetElementPtrInst.
  for (const User *U : I.users()) {
    if (const GetElementPtrInst *GEPInst = dyn_cast<GetElementPtrInst>(U)) {
      Considerable = true;
      // A getelementptr is considered as "complex" if it has more than 2
      // operands. We will promote a SExt used in such complex GEP as we
      // expect some computation to be merged if they are done on 64 bits.
      if (GEPInst->getNumOperands() > 2) {
        AllowPromotionWithoutCommonHeader = true;
        break;
      }
    }
  }
  return Considerable;
}

// Coverage: 0 executions (never called in this profile).
unsigned AArch64TTIImpl::getCacheLineSize() {
  return ST->getCacheLineSize();
}

// Coverage: 734k executions.
unsigned AArch64TTIImpl::getPrefetchDistance() {
  return ST->getPrefetchDistance();
}

// Coverage: 16.6k executions.
unsigned AArch64TTIImpl::getMinPrefetchStride() {
  return ST->getMinPrefetchStride();
}

// Coverage: 278k executions.
unsigned AArch64TTIImpl::getMaxPrefetchIterationsAhead() {
  return ST->getMaxPrefetchIterationsAhead();
}

// Coverage: 958 executions.
bool AArch64TTIImpl::useReductionIntrinsic(unsigned Opcode, Type *Ty,
                                           TTI::ReductionFlags Flags) const {
  assert(isa<VectorType>(Ty) && "Expected Ty to be a vector type");
  unsigned ScalarBits = Ty->getScalarSizeInBits();
  switch (Opcode) {
  case Instruction::FAdd:
  case Instruction::FMul:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor:
  case Instruction::Mul:
    return false;
  case Instruction::Add:
    return ScalarBits * Ty->getVectorNumElements() >= 128;
  case Instruction::ICmp:
    return (ScalarBits < 64) &&
           (ScalarBits * Ty->getVectorNumElements() >= 128);
  case Instruction::FCmp:
    return Flags.NoNaN;
  default:
    llvm_unreachable("Unhandled reduction opcode");
  }
  return false;
}
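
// Example: an add reduction over <4 x i32> uses the reduction intrinsic
// (4 * 32 == 128), while one over <4 x i16> does not (64 < 128); integer
// compare reductions additionally require an element type narrower than 64
// bits.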