Coverage Report

Created: 2019-07-24 05:18

/Users/buildslave/jenkins/workspace/clang-stage2-coverage-R/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp

//===-- AArch64TargetTransformInfo.cpp - AArch64 specific TTI -------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "AArch64ExpandImm.h"
#include "AArch64TargetTransformInfo.h"
#include "MCTargetDesc/AArch64AddressingModes.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/CodeGen/BasicTTIImpl.h"
#include "llvm/CodeGen/CostTable.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/Support/Debug.h"
#include <algorithm>
using namespace llvm;

#define DEBUG_TYPE "aarch64tti"

static cl::opt<bool> EnableFalkorHWPFUnrollFix("enable-falkor-hwpf-unroll-fix",
                                               cl::init(true), cl::Hidden);

bool AArch64TTIImpl::areInlineCompatible(const Function *Caller,
                                         const Function *Callee) const {
  const TargetMachine &TM = getTLI()->getTargetMachine();

  const FeatureBitset &CallerBits =
      TM.getSubtargetImpl(*Caller)->getFeatureBits();
  const FeatureBitset &CalleeBits =
      TM.getSubtargetImpl(*Callee)->getFeatureBits();

  // Inline a callee if its target-features are a subset of the caller's
  // target-features.
  return (CallerBits & CalleeBits) == CalleeBits;
}

/// Calculate the cost of materializing a 64-bit value. This helper
/// method might only calculate a fraction of a larger immediate. Therefore it
/// is valid to return a cost of ZERO.
int AArch64TTIImpl::getIntImmCost(int64_t Val) {
  // Check if the immediate can be encoded within an instruction.
  if (Val == 0 || AArch64_AM::isLogicalImmediate(Val, 64))
    return 0;

  if (Val < 0)
    Val = ~Val;

  // Calculate how many moves we will need to materialize this constant.
  SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn;
  AArch64_IMM::expandMOVImm(Val, 64, Insn);
  return Insn.size();
}

/// Calculate the cost of materializing the given constant.
int AArch64TTIImpl::getIntImmCost(const APInt &Imm, Type *Ty) {
  assert(Ty->isIntegerTy());

  unsigned BitSize = Ty->getPrimitiveSizeInBits();
  if (BitSize == 0)
    return ~0U;

  // Sign-extend all constants to a multiple of 64-bit.
  APInt ImmVal = Imm;
  if (BitSize & 0x3f)
    ImmVal = Imm.sext((BitSize + 63) & ~0x3fU);

  // Split the constant into 64-bit chunks and calculate the cost for each
  // chunk.
  int Cost = 0;
  for (unsigned ShiftVal = 0; ShiftVal < BitSize; ShiftVal += 64) {
    APInt Tmp = ImmVal.ashr(ShiftVal).sextOrTrunc(64);
    int64_t Val = Tmp.getSExtValue();
    Cost += getIntImmCost(Val);
  }
  // We need at least one instruction to materialize the constant.
  return std::max(1, Cost);
}
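
The chunked cost computation above can be illustrated with a small standalone sketch (not part of the original file). It splits a constant, given as little-endian 64-bit words, into chunks and sums a per-chunk cost; the per-chunk cost used here, one move per non-zero 16-bit halfword, is only a rough stand-in for what AArch64_IMM::expandMOVImm actually computes (it ignores logical-immediate encodings and the negated-value handling of the real helper).

#include <algorithm>
#include <cstdint>
#include <vector>

// Approximate cost of one 64-bit chunk: number of non-zero 16-bit halfwords
// (roughly one MOVZ plus one MOVK per additional halfword).
static int approxChunkCost(uint64_t Val) {
  if (Val == 0)
    return 0;
  int Cost = 0;
  for (int Shift = 0; Shift < 64; Shift += 16)
    if ((Val >> Shift) & 0xffffULL)
      ++Cost;
  return Cost;
}

// Sum the per-chunk costs of a constant given as little-endian 64-bit words,
// mirroring the loop in getIntImmCost(const APInt &, Type *).
static int approxIntImmCost(const std::vector<uint64_t> &Words) {
  int Cost = 0;
  for (uint64_t W : Words)
    Cost += approxChunkCost(W);
  // At least one instruction is always needed to materialize the constant.
  return std::max(1, Cost);
}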

int AArch64TTIImpl::getIntImmCost(unsigned Opcode, unsigned Idx,
                                  const APInt &Imm, Type *Ty) {
  assert(Ty->isIntegerTy());

  unsigned BitSize = Ty->getPrimitiveSizeInBits();
  // There is no cost model for constants with a bit size of 0. Return TCC_Free
  // here, so that constant hoisting will ignore this constant.
  if (BitSize == 0)
    return TTI::TCC_Free;

  unsigned ImmIdx = ~0U;
  switch (Opcode) {
  default:
    return TTI::TCC_Free;
  case Instruction::GetElementPtr:
    // Always hoist the base address of a GetElementPtr.
    if (Idx == 0)
      return 2 * TTI::TCC_Basic;
    return TTI::TCC_Free;
  case Instruction::Store:
    ImmIdx = 0;
    break;
  case Instruction::Add:
  case Instruction::Sub:
  case Instruction::Mul:
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::URem:
  case Instruction::SRem:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor:
  case Instruction::ICmp:
    ImmIdx = 1;
    break;
  // Always return TCC_Free for the shift value of a shift instruction.
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
    if (Idx == 1)
      return TTI::TCC_Free;
    break;
  case Instruction::Trunc:
  case Instruction::ZExt:
  case Instruction::SExt:
  case Instruction::IntToPtr:
  case Instruction::PtrToInt:
  case Instruction::BitCast:
  case Instruction::PHI:
  case Instruction::Call:
  case Instruction::Select:
  case Instruction::Ret:
  case Instruction::Load:
    break;
  }

  if (Idx == ImmIdx) {
    int NumConstants = (BitSize + 63) / 64;
    int Cost = AArch64TTIImpl::getIntImmCost(Imm, Ty);
    return (Cost <= NumConstants * TTI::TCC_Basic)
               ? static_cast<int>(TTI::TCC_Free)
               : Cost;
  }
  return AArch64TTIImpl::getIntImmCost(Imm, Ty);
}

int AArch64TTIImpl::getIntImmCost(Intrinsic::ID IID, unsigned Idx,
                                  const APInt &Imm, Type *Ty) {
  assert(Ty->isIntegerTy());

  unsigned BitSize = Ty->getPrimitiveSizeInBits();
  // There is no cost model for constants with a bit size of 0. Return TCC_Free
  // here, so that constant hoisting will ignore this constant.
  if (BitSize == 0)
    return TTI::TCC_Free;

  switch (IID) {
  default:
    return TTI::TCC_Free;
  case Intrinsic::sadd_with_overflow:
  case Intrinsic::uadd_with_overflow:
  case Intrinsic::ssub_with_overflow:
  case Intrinsic::usub_with_overflow:
  case Intrinsic::smul_with_overflow:
  case Intrinsic::umul_with_overflow:
    if (Idx == 1) {
      int NumConstants = (BitSize + 63) / 64;
      int Cost = AArch64TTIImpl::getIntImmCost(Imm, Ty);
      return (Cost <= NumConstants * TTI::TCC_Basic)
                 ? static_cast<int>(TTI::TCC_Free)
                 : Cost;
    }
    break;
  case Intrinsic::experimental_stackmap:
    if ((Idx < 2) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
      return TTI::TCC_Free;
    break;
  case Intrinsic::experimental_patchpoint_void:
  case Intrinsic::experimental_patchpoint_i64:
    if ((Idx < 4) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
      return TTI::TCC_Free;
    break;
  }
  return AArch64TTIImpl::getIntImmCost(Imm, Ty);
}

TargetTransformInfo::PopcntSupportKind
AArch64TTIImpl::getPopcntSupport(unsigned TyWidth) {
  assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
  if (TyWidth == 32 || TyWidth == 64)
    return TTI::PSK_FastHardware;
  // TODO: AArch64TargetLowering::LowerCTPOP() supports 128bit popcount.
  return TTI::PSK_Software;
}

bool AArch64TTIImpl::isWideningInstruction(Type *DstTy, unsigned Opcode,
                                           ArrayRef<const Value *> Args) {

  // A helper that returns a vector type from the given type. The number of
  // elements in type Ty determines the vector width.
  auto toVectorTy = [&](Type *ArgTy) {
    return VectorType::get(ArgTy->getScalarType(),
                           DstTy->getVectorNumElements());
  };

  // Exit early if DstTy is not a vector type whose elements are at least
  // 16 bits wide.
  if (!DstTy->isVectorTy() || DstTy->getScalarSizeInBits() < 16)
    return false;

  // Determine if the operation has a widening variant. We consider both the
  // "long" (e.g., usubl) and "wide" (e.g., usubw) versions of the
  // instructions.
  //
  // TODO: Add additional widening operations (e.g., mul, shl, etc.) once we
  //       verify that their extending operands are eliminated during code
  //       generation.
  switch (Opcode) {
  case Instruction::Add: // UADDL(2), SADDL(2), UADDW(2), SADDW(2).
  case Instruction::Sub: // USUBL(2), SSUBL(2), USUBW(2), SSUBW(2).
    break;
  default:
    return false;
  }

  // To be a widening instruction (either the "wide" or "long" version), the
  // second operand must be a sign- or zero-extend having a single user. We
  // only consider extends having a single user because they may otherwise not
  // be eliminated.
  if (Args.size() != 2 ||
      (!isa<SExtInst>(Args[1]) && !isa<ZExtInst>(Args[1])) ||
      !Args[1]->hasOneUse())
    return false;
  auto *Extend = cast<CastInst>(Args[1]);

  // Legalize the destination type and ensure it can be used in a widening
  // operation.
  auto DstTyL = TLI->getTypeLegalizationCost(DL, DstTy);
  unsigned DstElTySize = DstTyL.second.getScalarSizeInBits();
  if (!DstTyL.second.isVector() || DstElTySize != DstTy->getScalarSizeInBits())
    return false;

  // Legalize the source type and ensure it can be used in a widening
  // operation.
  Type *SrcTy = toVectorTy(Extend->getSrcTy());
  auto SrcTyL = TLI->getTypeLegalizationCost(DL, SrcTy);
  unsigned SrcElTySize = SrcTyL.second.getScalarSizeInBits();
  if (!SrcTyL.second.isVector() || SrcElTySize != SrcTy->getScalarSizeInBits())
    return false;

  // Get the total number of vector elements in the legalized types.
  unsigned NumDstEls = DstTyL.first * DstTyL.second.getVectorNumElements();
  unsigned NumSrcEls = SrcTyL.first * SrcTyL.second.getVectorNumElements();

  // Return true if the legalized types have the same number of vector elements
  // and the destination element type size is twice that of the source type.
  return NumDstEls == NumSrcEls && 2 * SrcElTySize == DstElTySize;
}
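
To make the widening check concrete, here is a small self-contained sketch (illustrative only, not from this file, and assuming the types involved are already legal so no type splitting occurs): the final test boils down to "same element count, and destination elements exactly twice as wide as the source elements", which is the shape uaddl/saddw-style instructions accept.

#include <cassert>

// Simplified form of the final check in isWideningInstruction(), under the
// assumption that both types are already legal (no splitting): the source and
// destination must have the same element count, and destination elements must
// be exactly twice as wide as source elements.
static bool looksWidenable(unsigned DstElems, unsigned DstElBits,
                           unsigned SrcElems, unsigned SrcElBits) {
  return DstElems == SrcElems && 2 * SrcElBits == DstElBits;
}

int main() {
  // add <8 x i16> %a, (zext <8 x i8> %b) -> candidate for uaddw.
  assert(looksWidenable(8, 16, 8, 8));
  // add <4 x i32> %a, (zext <4 x i8> %b) -> not a single widening step.
  assert(!looksWidenable(4, 32, 4, 8));
  return 0;
}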

int AArch64TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
                                     const Instruction *I) {
  int ISD = TLI->InstructionOpcodeToISD(Opcode);
  assert(ISD && "Invalid opcode");

  // If the cast is observable, and it is used by a widening instruction (e.g.,
  // uaddl, saddw, etc.), it may be free.
  if (I && I->hasOneUse()) {
    auto *SingleUser = cast<Instruction>(*I->user_begin());
    SmallVector<const Value *, 4> Operands(SingleUser->operand_values());
    if (isWideningInstruction(Dst, SingleUser->getOpcode(), Operands)) {
      // If the cast is the second operand, it is free. We will generate either
      // a "wide" or "long" version of the widening instruction.
      if (I == SingleUser->getOperand(1))
        return 0;
      // If the cast is not the second operand, it will be free if it looks the
      // same as the second operand. In this case, we will generate a "long"
      // version of the widening instruction.
      if (auto *Cast = dyn_cast<CastInst>(SingleUser->getOperand(1)))
        if (I->getOpcode() == unsigned(Cast->getOpcode()) &&
            cast<CastInst>(I)->getSrcTy() == Cast->getSrcTy())
          return 0;
    }
  }

  EVT SrcTy = TLI->getValueType(DL, Src);
  EVT DstTy = TLI->getValueType(DL, Dst);

  if (!SrcTy.isSimple() || !DstTy.isSimple())
    return BaseT::getCastInstrCost(Opcode, Dst, Src);

  static const TypeConversionCostTblEntry
  ConversionTbl[] = {
    { ISD::TRUNCATE, MVT::v4i16, MVT::v4i32,  1 },
    { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64,  0 },
    { ISD::TRUNCATE, MVT::v8i8,  MVT::v8i32,  3 },
    { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 6 },

    // The number of shll instructions for the extension.
    { ISD::SIGN_EXTEND, MVT::v4i64,  MVT::v4i16, 3 },
    { ISD::ZERO_EXTEND, MVT::v4i64,  MVT::v4i16, 3 },
    { ISD::SIGN_EXTEND, MVT::v4i64,  MVT::v4i32, 2 },
    { ISD::ZERO_EXTEND, MVT::v4i64,  MVT::v4i32, 2 },
    { ISD::SIGN_EXTEND, MVT::v8i32,  MVT::v8i8,  3 },
    { ISD::ZERO_EXTEND, MVT::v8i32,  MVT::v8i8,  3 },
    { ISD::SIGN_EXTEND, MVT::v8i32,  MVT::v8i16, 2 },
    { ISD::ZERO_EXTEND, MVT::v8i32,  MVT::v8i16, 2 },
    { ISD::SIGN_EXTEND, MVT::v8i64,  MVT::v8i8,  7 },
    { ISD::ZERO_EXTEND, MVT::v8i64,  MVT::v8i8,  7 },
    { ISD::SIGN_EXTEND, MVT::v8i64,  MVT::v8i16, 6 },
    { ISD::ZERO_EXTEND, MVT::v8i64,  MVT::v8i16, 6 },
    { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 2 },
    { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 2 },
    { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 6 },
    { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 6 },

    // LowerVectorINT_TO_FP:
    { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i32, 1 },
    { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 },
    { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i64, 1 },
    { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 1 },
    { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 },
    { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 1 },

    // Complex: to v2f32
    { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i8,  3 },
    { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i16, 3 },
    { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i64, 2 },
    { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i8,  3 },
    { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i16, 3 },
    { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i64, 2 },

    // Complex: to v4f32
    { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i8,  4 },
    { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i16, 2 },
    { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i8,  3 },
    { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i16, 2 },

    // Complex: to v8f32
    { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i8,  10 },
    { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i16, 4 },
    { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i8,  10 },
    { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, 4 },

    // Complex: to v16f32
    { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i8, 21 },
    { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i8, 21 },

    // Complex: to v2f64
    { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i8,  4 },
    { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i16, 4 },
    { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i32, 2 },
    { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i8,  4 },
    { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i16, 4 },
    { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, 2 },

    // LowerVectorFP_TO_INT
    { ISD::FP_TO_SINT, MVT::v2i32, MVT::v2f32, 1 },
    { ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f32, 1 },
    { ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f64, 1 },
    { ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f32, 1 },
    { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 1 },
    { ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f64, 1 },

    // Complex, from v2f32: legal type is v2i32 (no cost) or v2i64 (1 ext).
    { ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f32, 2 },
    { ISD::FP_TO_SINT, MVT::v2i16, MVT::v2f32, 1 },
    { ISD::FP_TO_SINT, MVT::v2i8,  MVT::v2f32, 1 },
    { ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f32, 2 },
    { ISD::FP_TO_UINT, MVT::v2i16, MVT::v2f32, 1 },
    { ISD::FP_TO_UINT, MVT::v2i8,  MVT::v2f32, 1 },

    // Complex, from v4f32: legal type is v4i16, 1 narrowing => ~2
    { ISD::FP_TO_SINT, MVT::v4i16, MVT::v4f32, 2 },
    { ISD::FP_TO_SINT, MVT::v4i8,  MVT::v4f32, 2 },
    { ISD::FP_TO_UINT, MVT::v4i16, MVT::v4f32, 2 },
    { ISD::FP_TO_UINT, MVT::v4i8,  MVT::v4f32, 2 },

    // Complex, from v2f64: legal type is v2i32, 1 narrowing => ~2.
    { ISD::FP_TO_SINT, MVT::v2i32, MVT::v2f64, 2 },
    { ISD::FP_TO_SINT, MVT::v2i16, MVT::v2f64, 2 },
    { ISD::FP_TO_SINT, MVT::v2i8,  MVT::v2f64, 2 },
    { ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f64, 2 },
    { ISD::FP_TO_UINT, MVT::v2i16, MVT::v2f64, 2 },
    { ISD::FP_TO_UINT, MVT::v2i8,  MVT::v2f64, 2 },
  };

  if (const auto *Entry = ConvertCostTableLookup(ConversionTbl, ISD,
                                                 DstTy.getSimpleVT(),
                                                 SrcTy.getSimpleVT()))
    return Entry->Cost;

  return BaseT::getCastInstrCost(Opcode, Dst, Src);
}

int AArch64TTIImpl::getExtractWithExtendCost(unsigned Opcode, Type *Dst,
                                             VectorType *VecTy,
                                             unsigned Index) {

  // Make sure we were given a valid extend opcode.
  assert((Opcode == Instruction::SExt || Opcode == Instruction::ZExt) &&
         "Invalid opcode");

  // We are extending an element we extract from a vector, so the source type
  // of the extend is the element type of the vector.
  auto *Src = VecTy->getElementType();

  // Sign- and zero-extends are for integer types only.
  assert(isa<IntegerType>(Dst) && isa<IntegerType>(Src) && "Invalid type");

  // Get the cost for the extract. We compute the cost (if any) for the extend
  // below.
  auto Cost = getVectorInstrCost(Instruction::ExtractElement, VecTy, Index);

  // Legalize the types.
  auto VecLT = TLI->getTypeLegalizationCost(DL, VecTy);
  auto DstVT = TLI->getValueType(DL, Dst);
  auto SrcVT = TLI->getValueType(DL, Src);

  // If the resulting type is still a vector and the destination type is legal,
  // we may get the extension for free. If not, get the default cost for the
  // extend.
  if (!VecLT.second.isVector() || !TLI->isTypeLegal(DstVT))
    return Cost + getCastInstrCost(Opcode, Dst, Src);

  // The destination type should be larger than the element type. If not, get
  // the default cost for the extend.
  if (DstVT.getSizeInBits() < SrcVT.getSizeInBits())
    return Cost + getCastInstrCost(Opcode, Dst, Src);

  switch (Opcode) {
  default:
    llvm_unreachable("Opcode should be either SExt or ZExt");

  // For sign-extends, we only need a smov, which performs the extension
  // automatically.
  case Instruction::SExt:
    return Cost;

  // For zero-extends, the extend is performed automatically by a umov unless
  // the destination type is i64 and the element type is i8 or i16.
  case Instruction::ZExt:
    if (DstVT.getSizeInBits() != 64u || SrcVT.getSizeInBits() == 32u)
      return Cost;
  }

  // If we are unable to perform the extend for free, get the default cost.
  return Cost + getCastInstrCost(Opcode, Dst, Src);
}

int AArch64TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
                                       unsigned Index) {
  assert(Val->isVectorTy() && "This must be a vector type");

  if (Index != -1U) {
    // Legalize the type.
    std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Val);

    // This type is legalized to a scalar type.
    if (!LT.second.isVector())
      return 0;

    // The type may be split. Normalize the index to the new type.
    unsigned Width = LT.second.getVectorNumElements();
    Index = Index % Width;

    // The element at index zero is already inside the vector.
    if (Index == 0)
      return 0;
  }

  // All other inserts/extracts cost this much.
  return ST->getVectorInsertExtractBaseCost();
}

int AArch64TTIImpl::getArithmeticInstrCost(
    unsigned Opcode, Type *Ty, TTI::OperandValueKind Opd1Info,
    TTI::OperandValueKind Opd2Info, TTI::OperandValueProperties Opd1PropInfo,
    TTI::OperandValueProperties Opd2PropInfo, ArrayRef<const Value *> Args) {
  // Legalize the type.
  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);

  // If the instruction is a widening instruction (e.g., uaddl, saddw, etc.),
  // add in the widening overhead specified by the sub-target. Since the
  // extends feeding widening instructions are performed automatically, they
  // aren't present in the generated code and have a zero cost. By adding a
  // widening overhead here, we attach the total cost of the combined operation
  // to the widening instruction.
  int Cost = 0;
  if (isWideningInstruction(Ty, Opcode, Args))
    Cost += ST->getWideningBaseCost();

  int ISD = TLI->InstructionOpcodeToISD(Opcode);

  switch (ISD) {
  default:
    return Cost + BaseT::getArithmeticInstrCost(Opcode, Ty, Opd1Info, Opd2Info,
                                                Opd1PropInfo, Opd2PropInfo);
  case ISD::SDIV:
    if (Opd2Info == TargetTransformInfo::OK_UniformConstantValue &&
        Opd2PropInfo == TargetTransformInfo::OP_PowerOf2) {
      // On AArch64, scalar signed division by a power-of-two constant is
      // normally expanded to the sequence ADD + CMP + SELECT + SRA.
      // The OperandValue properties may not be the same as those of the
      // previous operation; conservatively assume OP_None.
      Cost += getArithmeticInstrCost(Instruction::Add, Ty, Opd1Info, Opd2Info,
                                     TargetTransformInfo::OP_None,
                                     TargetTransformInfo::OP_None);
      Cost += getArithmeticInstrCost(Instruction::Sub, Ty, Opd1Info, Opd2Info,
                                     TargetTransformInfo::OP_None,
                                     TargetTransformInfo::OP_None);
      Cost += getArithmeticInstrCost(Instruction::Select, Ty, Opd1Info, Opd2Info,
                                     TargetTransformInfo::OP_None,
                                     TargetTransformInfo::OP_None);
      Cost += getArithmeticInstrCost(Instruction::AShr, Ty, Opd1Info, Opd2Info,
                                     TargetTransformInfo::OP_None,
                                     TargetTransformInfo::OP_None);
      return Cost;
    }
    LLVM_FALLTHROUGH;
  case ISD::UDIV:
    if (Opd2Info == TargetTransformInfo::OK_UniformConstantValue) {
      auto VT = TLI->getValueType(DL, Ty);
      if (TLI->isOperationLegalOrCustom(ISD::MULHU, VT)) {
        // Vector signed division by a constant is expanded to the
        // sequence MULHS + ADD/SUB + SRA + SRL + ADD, and unsigned division
        // to MULHS + SUB + SRL + ADD + SRL.
        int MulCost = getArithmeticInstrCost(Instruction::Mul, Ty, Opd1Info,
                                             Opd2Info,
                                             TargetTransformInfo::OP_None,
                                             TargetTransformInfo::OP_None);
        int AddCost = getArithmeticInstrCost(Instruction::Add, Ty, Opd1Info,
                                             Opd2Info,
                                             TargetTransformInfo::OP_None,
                                             TargetTransformInfo::OP_None);
        int ShrCost = getArithmeticInstrCost(Instruction::AShr, Ty, Opd1Info,
                                             Opd2Info,
                                             TargetTransformInfo::OP_None,
                                             TargetTransformInfo::OP_None);
        return MulCost * 2 + AddCost * 2 + ShrCost * 2 + 1;
      }
    }

    Cost += BaseT::getArithmeticInstrCost(Opcode, Ty, Opd1Info, Opd2Info,
                                          Opd1PropInfo, Opd2PropInfo);
    if (Ty->isVectorTy()) {
      // On AArch64, vector divisions are not supported natively and are
      // expanded into scalar divisions of each pair of elements.
      Cost += getArithmeticInstrCost(Instruction::ExtractElement, Ty, Opd1Info,
                                     Opd2Info, Opd1PropInfo, Opd2PropInfo);
      Cost += getArithmeticInstrCost(Instruction::InsertElement, Ty, Opd1Info,
                                     Opd2Info, Opd1PropInfo, Opd2PropInfo);
      // TODO: if one of the arguments is scalar, then it's not necessary to
      // double the cost of handling the vector elements.
      Cost += Cost;
    }
    return Cost;

  case ISD::ADD:
  case ISD::MUL:
  case ISD::XOR:
  case ISD::OR:
  case ISD::AND:
    // These nodes are marked as 'custom' for combining purposes only.
    // We know that they are legal. See LowerAdd in ISelLowering.
    return (Cost + 1) * LT.first;
  }
}

int AArch64TTIImpl::getAddressComputationCost(Type *Ty, ScalarEvolution *SE,
                                              const SCEV *Ptr) {
  // Address computations in vectorized code with non-consecutive addresses will
  // likely result in more instructions compared to scalar code where the
  // computation can more often be merged into the index mode. The resulting
  // extra micro-ops can significantly decrease throughput.
  unsigned NumVectorInstToHideOverhead = 10;
  int MaxMergeDistance = 64;

  if (Ty->isVectorTy() && SE &&
      !BaseT::isConstantStridedAccessLessThan(SE, Ptr, MaxMergeDistance + 1))
    return NumVectorInstToHideOverhead;

  // In many cases the address computation is not merged into the instruction
  // addressing mode.
  return 1;
}

int AArch64TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
                                       Type *CondTy, const Instruction *I) {

  int ISD = TLI->InstructionOpcodeToISD(Opcode);
  // We don't lower some vector selects well when they are wider than the
  // register width.
  if (ValTy->isVectorTy() && ISD == ISD::SELECT) {
    // We would need this many instructions to hide the scalarization happening.
    const int AmortizationCost = 20;
    static const TypeConversionCostTblEntry
    VectorSelectTbl[] = {
      { ISD::SELECT, MVT::v16i1, MVT::v16i16, 16 },
      { ISD::SELECT, MVT::v8i1, MVT::v8i32, 8 },
      { ISD::SELECT, MVT::v16i1, MVT::v16i32, 16 },
      { ISD::SELECT, MVT::v4i1, MVT::v4i64, 4 * AmortizationCost },
      { ISD::SELECT, MVT::v8i1, MVT::v8i64, 8 * AmortizationCost },
      { ISD::SELECT, MVT::v16i1, MVT::v16i64, 16 * AmortizationCost }
    };

    EVT SelCondTy = TLI->getValueType(DL, CondTy);
    EVT SelValTy = TLI->getValueType(DL, ValTy);
    if (SelCondTy.isSimple() && SelValTy.isSimple()) {
      if (const auto *Entry = ConvertCostTableLookup(VectorSelectTbl, ISD,
                                                     SelCondTy.getSimpleVT(),
                                                     SelValTy.getSimpleVT()))
        return Entry->Cost;
    }
  }
  return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, I);
}

int AArch64TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Ty,
                                    unsigned Alignment, unsigned AddressSpace,
                                    const Instruction *I) {
  auto LT = TLI->getTypeLegalizationCost(DL, Ty);

  if (ST->isMisaligned128StoreSlow() && Opcode == Instruction::Store &&
      LT.second.is128BitVector() && Alignment < 16) {
    // Unaligned stores are extremely inefficient. We don't split all
    // unaligned 128-bit stores because of the negative impact that has been
    // shown in practice on inlined block copy code.
    // We make such stores expensive so that we will only vectorize if there
    // are 6 other instructions getting vectorized.
    const int AmortizationCost = 6;

    return LT.first * 2 * AmortizationCost;
  }

  if (Ty->isVectorTy() && Ty->getVectorElementType()->isIntegerTy(8)) {
    unsigned ProfitableNumElements;
    if (Opcode == Instruction::Store)
      // We use a custom trunc store lowering so v.4b should be profitable.
      ProfitableNumElements = 4;
    else
      // We scalarize the loads because there is no v.4b register and we
      // have to promote the elements to v.2.
      ProfitableNumElements = 8;

    if (Ty->getVectorNumElements() < ProfitableNumElements) {
      unsigned NumVecElts = Ty->getVectorNumElements();
      unsigned NumVectorizableInstsToAmortize = NumVecElts * 2;
      // We generate 2 instructions per vector element.
      return NumVectorizableInstsToAmortize * NumVecElts * 2;
    }
  }

  return LT.first;
}
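
As a worked restatement of the small-i8-vector branch above (an illustration, not part of the original file; the element counts below are made-up inputs), the returned penalty is (NumVecElts * 2) * NumVecElts * 2, so it grows quadratically with the element count:

#include <cassert>

// Cost returned for an i8 vector smaller than the profitable element count:
// 2 instructions per element, scaled by an amortization factor of
// NumVecElts * 2, exactly as in getMemoryOpCost() above.
static unsigned smallByteVectorCost(unsigned NumVecElts) {
  unsigned NumVectorizableInstsToAmortize = NumVecElts * 2;
  return NumVectorizableInstsToAmortize * NumVecElts * 2;
}

int main() {
  assert(smallByteVectorCost(2) == 16); // e.g. a <2 x i8> load.
  assert(smallByteVectorCost(4) == 64); // e.g. a <4 x i8> load (threshold is 8 for loads).
  return 0;
}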

int AArch64TTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
                                               unsigned Factor,
                                               ArrayRef<unsigned> Indices,
                                               unsigned Alignment,
                                               unsigned AddressSpace,
                                               bool UseMaskForCond,
                                               bool UseMaskForGaps) {
  assert(Factor >= 2 && "Invalid interleave factor");
  assert(isa<VectorType>(VecTy) && "Expect a vector type");

  if (!UseMaskForCond && !UseMaskForGaps &&
      Factor <= TLI->getMaxSupportedInterleaveFactor()) {
    unsigned NumElts = VecTy->getVectorNumElements();
    auto *SubVecTy = VectorType::get(VecTy->getScalarType(), NumElts / Factor);

    // ldN/stN only support legal vector types of size 64 or 128 in bits.
    // Accesses having vector types that are a multiple of 128 bits can be
    // matched to more than one ldN/stN instruction.
    if (NumElts % Factor == 0 &&
        TLI->isLegalInterleavedAccessType(SubVecTy, DL))
      return Factor * TLI->getNumInterleavedAccesses(SubVecTy, DL);
  }

  return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
                                           Alignment, AddressSpace,
                                           UseMaskForCond, UseMaskForGaps);
}
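
A hedged worked example of the ldN/stN fast path (the helper below is an illustrative simplification, not from this file; the real legality and access-count decisions are made by isLegalInterleavedAccessType and getNumInterleavedAccesses): assuming the per-lane sub-vector is legal and each 128-bit chunk of it maps to one ldN/stN, the cost is the interleave factor times the number of chunks.

#include <cassert>

// Simplified model of the fast path above: split the group into Factor lanes,
// then charge one ldN/stN per 128-bit chunk of a lane's sub-vector.
static unsigned approxInterleavedCost(unsigned NumElts, unsigned EltBits,
                                      unsigned Factor) {
  unsigned SubVecBits = (NumElts / Factor) * EltBits;
  unsigned AccessesPerLane = SubVecBits <= 128 ? 1 : SubVecBits / 128;
  return Factor * AccessesPerLane;
}

int main() {
  // <8 x i32> group de-interleaved with factor 2: one ld2 -> cost 2.
  assert(approxInterleavedCost(8, 32, 2) == 2);
  // <16 x i32> group with factor 4: each <4 x i32> lane fits one ld4 -> cost 4.
  assert(approxInterleavedCost(16, 32, 4) == 4);
  return 0;
}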

int AArch64TTIImpl::getCostOfKeepingLiveOverCall(ArrayRef<Type *> Tys) {
  int Cost = 0;
  for (auto *I : Tys) {
    if (!I->isVectorTy())
      continue;
    if (I->getScalarSizeInBits() * I->getVectorNumElements() == 128)
      Cost += getMemoryOpCost(Instruction::Store, I, 128, 0) +
        getMemoryOpCost(Instruction::Load, I, 128, 0);
  }
  return Cost;
}

unsigned AArch64TTIImpl::getMaxInterleaveFactor(unsigned VF) {
  return ST->getMaxInterleaveFactor();
}

// For Falkor, we want to avoid having too many strided loads in a loop since
// that can exhaust the HW prefetcher resources. We adjust the unroller
// MaxCount preference below to attempt to ensure unrolling doesn't create too
// many strided loads.
static void
getFalkorUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                              TargetTransformInfo::UnrollingPreferences &UP) {
  enum { MaxStridedLoads = 7 };
  auto countStridedLoads = [](Loop *L, ScalarEvolution &SE) {
    int StridedLoads = 0;
    // FIXME? We could make this more precise by looking at the CFG and
    // e.g. not counting loads in each side of an if-then-else diamond.
    for (const auto BB : L->blocks()) {
      for (auto &I : *BB) {
        LoadInst *LMemI = dyn_cast<LoadInst>(&I);
        if (!LMemI)
          continue;

        Value *PtrValue = LMemI->getPointerOperand();
        if (L->isLoopInvariant(PtrValue))
          continue;

        const SCEV *LSCEV = SE.getSCEV(PtrValue);
        const SCEVAddRecExpr *LSCEVAddRec = dyn_cast<SCEVAddRecExpr>(LSCEV);
        if (!LSCEVAddRec || !LSCEVAddRec->isAffine())
          continue;

        // FIXME? We could take pairing of unrolled load copies into account
        // by looking at the AddRec, but we would probably have to limit this
        // to loops with no stores or other memory optimization barriers.
        ++StridedLoads;
        // We've seen enough strided loads that seeing more won't make a
        // difference.
        if (StridedLoads > MaxStridedLoads / 2)
          return StridedLoads;
      }
    }
    return StridedLoads;
  };

  int StridedLoads = countStridedLoads(L, SE);
  LLVM_DEBUG(dbgs() << "falkor-hwpf: detected " << StridedLoads
                    << " strided loads\n");
  // Pick the largest power of 2 unroll count that won't result in too many
  // strided loads.
  if (StridedLoads) {
    UP.MaxCount = 1 << Log2_32(MaxStridedLoads / StridedLoads);
    LLVM_DEBUG(dbgs() << "falkor-hwpf: setting unroll MaxCount to "
                      << UP.MaxCount << '\n');
  }
}
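
To see how the MaxCount clamp above behaves, here is a standalone restatement (illustrative only) with a plain floor-log2 in place of llvm::Log2_32; the strided-load counts in main are made-up inputs.

#include <cassert>

// MaxCount = 1 << floor(log2(MaxStridedLoads / StridedLoads)), with
// MaxStridedLoads fixed at 7 as in getFalkorUnrollingPreferences().
static unsigned maxUnrollCount(int StridedLoads) {
  const int MaxStridedLoads = 7;
  unsigned Ratio = static_cast<unsigned>(MaxStridedLoads / StridedLoads);
  unsigned Log2 = 0;
  while (Ratio >> (Log2 + 1))
    ++Log2;
  return 1u << Log2;
}

int main() {
  assert(maxUnrollCount(1) == 4); // 7/1 = 7 -> floor(log2) = 2 -> unroll by 4.
  assert(maxUnrollCount(3) == 2); // 7/3 = 2 -> unroll by 2.
  assert(maxUnrollCount(4) == 1); // 7/4 = 1 -> no extra unrolling.
  return 0;
}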

void AArch64TTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                                             TTI::UnrollingPreferences &UP) {
  // Enable partial unrolling and runtime unrolling.
  BaseT::getUnrollingPreferences(L, SE, UP);

  // An inner loop is more likely to be hot, and the runtime check can be
  // promoted out of it by the LICM pass, so the overhead is lower; try a
  // larger threshold to unroll more loops.
  if (L->getLoopDepth() > 1)
    UP.PartialThreshold *= 2;

  // Disable partial & runtime unrolling on -Os.
  UP.PartialOptSizeThreshold = 0;

  if (ST->getProcFamily() == AArch64Subtarget::Falkor &&
      EnableFalkorHWPFUnrollFix)
    getFalkorUnrollingPreferences(L, SE, UP);
}

Value *AArch64TTIImpl::getOrCreateResultFromMemIntrinsic(IntrinsicInst *Inst,
                                                         Type *ExpectedType) {
  switch (Inst->getIntrinsicID()) {
  default:
    return nullptr;
  case Intrinsic::aarch64_neon_st2:
  case Intrinsic::aarch64_neon_st3:
  case Intrinsic::aarch64_neon_st4: {
    // Create a struct type
    StructType *ST = dyn_cast<StructType>(ExpectedType);
    if (!ST)
      return nullptr;
    unsigned NumElts = Inst->getNumArgOperands() - 1;
    if (ST->getNumElements() != NumElts)
      return nullptr;
    for (unsigned i = 0, e = NumElts; i != e; ++i) {
      if (Inst->getArgOperand(i)->getType() != ST->getElementType(i))
        return nullptr;
    }
    Value *Res = UndefValue::get(ExpectedType);
    IRBuilder<> Builder(Inst);
    for (unsigned i = 0, e = NumElts; i != e; ++i) {
      Value *L = Inst->getArgOperand(i);
      Res = Builder.CreateInsertValue(Res, L, i);
    }
    return Res;
  }
  case Intrinsic::aarch64_neon_ld2:
  case Intrinsic::aarch64_neon_ld3:
  case Intrinsic::aarch64_neon_ld4:
    if (Inst->getType() == ExpectedType)
      return Inst;
    return nullptr;
  }
}

bool AArch64TTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst,
                                        MemIntrinsicInfo &Info) {
  switch (Inst->getIntrinsicID()) {
  default:
    break;
  case Intrinsic::aarch64_neon_ld2:
  case Intrinsic::aarch64_neon_ld3:
  case Intrinsic::aarch64_neon_ld4:
    Info.ReadMem = true;
    Info.WriteMem = false;
    Info.PtrVal = Inst->getArgOperand(0);
    break;
  case Intrinsic::aarch64_neon_st2:
  case Intrinsic::aarch64_neon_st3:
  case Intrinsic::aarch64_neon_st4:
    Info.ReadMem = false;
    Info.WriteMem = true;
    Info.PtrVal = Inst->getArgOperand(Inst->getNumArgOperands() - 1);
    break;
  }

  switch (Inst->getIntrinsicID()) {
  default:
    return false;
  case Intrinsic::aarch64_neon_ld2:
  case Intrinsic::aarch64_neon_st2:
    Info.MatchingId = VECTOR_LDST_TWO_ELEMENTS;
    break;
  case Intrinsic::aarch64_neon_ld3:
  case Intrinsic::aarch64_neon_st3:
    Info.MatchingId = VECTOR_LDST_THREE_ELEMENTS;
    break;
  case Intrinsic::aarch64_neon_ld4:
  case Intrinsic::aarch64_neon_st4:
    Info.MatchingId = VECTOR_LDST_FOUR_ELEMENTS;
    break;
  }
  return true;
}

/// See if \p I should be considered for address type promotion. We check if \p
/// I is a sext with the right type and used in memory accesses. If it is used
/// in a "complex" getelementptr, we allow it to be promoted without finding
/// other sext instructions that sign-extended the same initial value. A
/// getelementptr is considered as "complex" if it has more than 2 operands.
bool AArch64TTIImpl::shouldConsiderAddressTypePromotion(
    const Instruction &I, bool &AllowPromotionWithoutCommonHeader) {
  bool Considerable = false;
  AllowPromotionWithoutCommonHeader = false;
  if (!isa<SExtInst>(&I))
    return false;
  Type *ConsideredSExtType =
      Type::getInt64Ty(I.getParent()->getParent()->getContext());
  if (I.getType() != ConsideredSExtType)
    return false;
  // See if the sext is the one with the right type and used in at least one
  // GetElementPtrInst.
  for (const User *U : I.users()) {
    if (const GetElementPtrInst *GEPInst = dyn_cast<GetElementPtrInst>(U)) {
      Considerable = true;
      // A getelementptr is considered as "complex" if it has more than 2
      // operands. We will promote a SExt used in such a complex GEP as we
      // expect some computation to be merged if it is done on 64 bits.
      if (GEPInst->getNumOperands() > 2) {
        AllowPromotionWithoutCommonHeader = true;
        break;
      }
    }
  }
  return Considerable;
}

unsigned AArch64TTIImpl::getCacheLineSize() {
  return ST->getCacheLineSize();
}

unsigned AArch64TTIImpl::getPrefetchDistance() {
  return ST->getPrefetchDistance();
}

unsigned AArch64TTIImpl::getMinPrefetchStride() {
  return ST->getMinPrefetchStride();
}

unsigned AArch64TTIImpl::getMaxPrefetchIterationsAhead() {
  return ST->getMaxPrefetchIterationsAhead();
}

bool AArch64TTIImpl::useReductionIntrinsic(unsigned Opcode, Type *Ty,
                                           TTI::ReductionFlags Flags) const {
  assert(isa<VectorType>(Ty) && "Expected Ty to be a vector type");
  unsigned ScalarBits = Ty->getScalarSizeInBits();
  switch (Opcode) {
  case Instruction::FAdd:
  case Instruction::FMul:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor:
  case Instruction::Mul:
    return false;
  case Instruction::Add:
    return ScalarBits * Ty->getVectorNumElements() >= 128;
  case Instruction::ICmp:
    return (ScalarBits < 64) &&
           (ScalarBits * Ty->getVectorNumElements() >= 128);
  case Instruction::FCmp:
    return Flags.NoNaN;
  default:
    llvm_unreachable("Unhandled reduction opcode");
  }
  return false;
}

int AArch64TTIImpl::getArithmeticReductionCost(unsigned Opcode, Type *ValTy,
                                               bool IsPairwiseForm) {

  if (IsPairwiseForm)
    return BaseT::getArithmeticReductionCost(Opcode, ValTy, IsPairwiseForm);

  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy);
  MVT MTy = LT.second;
  int ISD = TLI->InstructionOpcodeToISD(Opcode);
  assert(ISD && "Invalid opcode");

  // Horizontal adds can use the 'addv' instruction. We model the cost of these
  // instructions as normal vector adds. This is the only arithmetic vector
  // reduction operation for which we have an instruction.
  static const CostTblEntry CostTblNoPairwise[]{
      {ISD::ADD, MVT::v8i8,  1},
      {ISD::ADD, MVT::v16i8, 1},
      {ISD::ADD, MVT::v4i16, 1},
      {ISD::ADD, MVT::v8i16, 1},
      {ISD::ADD, MVT::v4i32, 1},
  };

  if (const auto *Entry = CostTableLookup(CostTblNoPairwise, ISD, MTy))
    return LT.first * Entry->Cost;

  return BaseT::getArithmeticReductionCost(Opcode, ValTy, IsPairwiseForm);
}

int AArch64TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
                                   Type *SubTp) {
  if (Kind == TTI::SK_Broadcast || Kind == TTI::SK_Transpose ||
      Kind == TTI::SK_Select || Kind == TTI::SK_PermuteSingleSrc) {
    static const CostTblEntry ShuffleTbl[] = {
      // Broadcast shuffle kinds can be performed with 'dup'.
      { TTI::SK_Broadcast, MVT::v8i8,  1 },
      { TTI::SK_Broadcast, MVT::v16i8, 1 },
      { TTI::SK_Broadcast, MVT::v4i16, 1 },
      { TTI::SK_Broadcast, MVT::v8i16, 1 },
      { TTI::SK_Broadcast, MVT::v2i32, 1 },
      { TTI::SK_Broadcast, MVT::v4i32, 1 },
      { TTI::SK_Broadcast, MVT::v2i64, 1 },
      { TTI::SK_Broadcast, MVT::v2f32, 1 },
      { TTI::SK_Broadcast, MVT::v4f32, 1 },
      { TTI::SK_Broadcast, MVT::v2f64, 1 },
      // Transpose shuffle kinds can be performed with 'trn1/trn2' and
      // 'zip1/zip2' instructions.
      { TTI::SK_Transpose, MVT::v8i8,  1 },
      { TTI::SK_Transpose, MVT::v16i8, 1 },
      { TTI::SK_Transpose, MVT::v4i16, 1 },
      { TTI::SK_Transpose, MVT::v8i16, 1 },
      { TTI::SK_Transpose, MVT::v2i32, 1 },
      { TTI::SK_Transpose, MVT::v4i32, 1 },
      { TTI::SK_Transpose, MVT::v2i64, 1 },
      { TTI::SK_Transpose, MVT::v2f32, 1 },
      { TTI::SK_Transpose, MVT::v4f32, 1 },
      { TTI::SK_Transpose, MVT::v2f64, 1 },
      // Select shuffle kinds.
      // TODO: handle vXi8/vXi16.
      { TTI::SK_Select, MVT::v2i32, 1 }, // mov.
      { TTI::SK_Select, MVT::v4i32, 2 }, // rev+trn (or similar).
      { TTI::SK_Select, MVT::v2i64, 1 }, // mov.
      { TTI::SK_Select, MVT::v2f32, 1 }, // mov.
      { TTI::SK_Select, MVT::v4f32, 2 }, // rev+trn (or similar).
      { TTI::SK_Select, MVT::v2f64, 1 }, // mov.
      // PermuteSingleSrc shuffle kinds.
      // TODO: handle vXi8/vXi16.
      { TTI::SK_PermuteSingleSrc, MVT::v2i32, 1 }, // mov.
      { TTI::SK_PermuteSingleSrc, MVT::v4i32, 3 }, // perfectshuffle worst case.
      { TTI::SK_PermuteSingleSrc, MVT::v2i64, 1 }, // mov.
      { TTI::SK_PermuteSingleSrc, MVT::v2f32, 1 }, // mov.
      { TTI::SK_PermuteSingleSrc, MVT::v4f32, 3 }, // perfectshuffle worst case.
      { TTI::SK_PermuteSingleSrc, MVT::v2f64, 1 }, // mov.
    };
    std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
    if (const auto *Entry = CostTableLookup(ShuffleTbl, Kind, LT.second))
      return LT.first * Entry->Cost;
  }

  return BaseT::getShuffleCost(Kind, Tp, Index, SubTp);
}