Coverage Report

Created: 2017-10-03 07:32

/Users/buildslave/jenkins/sharedspace/clang-stage2-coverage-R@2/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
//===-- X86TargetTransformInfo.cpp - X86 specific TTI pass ----------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
/// \file
/// This file implements a TargetTransformInfo analysis pass specific to the
/// X86 target machine. It uses the target's detailed information to provide
/// more precise answers to certain TTI queries, while letting the target
/// independent and default TTI implementations handle the rest.
///
//===----------------------------------------------------------------------===//
/// A note about the Cost Model numbers used below: the numbers correspond to
/// some "generic" X86 CPU rather than to a concrete CPU model. Usually the
/// numbers correspond to the CPU where the feature first appeared. For
/// example, if we check Subtarget.hasSSE42() in the lookups below, the cost
/// is based on Nehalem, as that was the first CPU to support that feature
/// level and thus most likely has the worst-case cost.
/// Some examples of other technologies/CPUs:
///   SSE 3   - Pentium4 / Athlon64
///   SSE 4.1 - Penryn
///   SSE 4.2 - Nehalem
///   AVX     - Sandy Bridge
///   AVX2    - Haswell
///   AVX-512 - Xeon Phi / Skylake
/// And some examples of instruction target dependent costs (latency):
///                   divss     sqrtss          rsqrtss
///   AMD K7            11-16     19              3
///   Piledriver        9-24      13-15           5
///   Jaguar            14        16              2
///   Pentium II,III    18        30              2
///   Nehalem           7-14      7-18            3
///   Haswell           10-13     11              5
/// TODO: Develop and implement the target dependent cost model and specialize
/// cost numbers for different Cost Model Targets such as throughput, code
/// size, latency and uop count.
//===----------------------------------------------------------------------===//
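
// (Illustrative example of the scheme above: on a target with SSE4.2 but no
// AVX, an FDIV on v4f32 resolves through the SSE42 table below to a cost of
// 14, the Nehalem number taken from Agner Fog's tables.)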

#include "X86TargetTransformInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/CodeGen/BasicTTIImpl.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/Support/Debug.h"
#include "llvm/Target/CostTable.h"
#include "llvm/Target/TargetLowering.h"

using namespace llvm;

#define DEBUG_TYPE "x86tti"

//===----------------------------------------------------------------------===//
//
// X86 cost model.
//
//===----------------------------------------------------------------------===//

TargetTransformInfo::PopcntSupportKind
X86TTIImpl::getPopcntSupport(unsigned TyWidth) {
  assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
  // TODO: Currently the __builtin_popcount() implementation using SSE3
  //   instructions is inefficient. Once the problem is fixed, we should
  //   call ST->hasSSE3() instead of ST->hasPOPCNT().
  return ST->hasPOPCNT() ? TTI::PSK_FastHardware : TTI::PSK_Software;
}

llvm::Optional<unsigned> X86TTIImpl::getCacheSize(
  TargetTransformInfo::CacheLevel Level) const {
  switch (Level) {
  case TargetTransformInfo::CacheLevel::L1D:
    //   - Penryn
    //   - Nehalem
    //   - Westmere
    //   - Sandy Bridge
    //   - Ivy Bridge
    //   - Haswell
    //   - Broadwell
    //   - Skylake
    //   - Kabylake
    return 32 * 1024;  //  32 KByte
  case TargetTransformInfo::CacheLevel::L2D:
    //   - Penryn
    //   - Nehalem
    //   - Westmere
    //   - Sandy Bridge
    //   - Ivy Bridge
    //   - Haswell
    //   - Broadwell
    //   - Skylake
    //   - Kabylake
    return 256 * 1024; // 256 KByte
  }

  llvm_unreachable("Unknown TargetTransformInfo::CacheLevel");
}

llvm::Optional<unsigned> X86TTIImpl::getCacheAssociativity(
  TargetTransformInfo::CacheLevel Level) const {
  //   - Penryn
  //   - Nehalem
  //   - Westmere
  //   - Sandy Bridge
  //   - Ivy Bridge
  //   - Haswell
  //   - Broadwell
  //   - Skylake
  //   - Kabylake
  switch (Level) {
  case TargetTransformInfo::CacheLevel::L1D:
    LLVM_FALLTHROUGH;
  case TargetTransformInfo::CacheLevel::L2D:
    return 8;
  }

  llvm_unreachable("Unknown TargetTransformInfo::CacheLevel");
}

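// (For reference: the sizes above match the per-core caches of mainstream
// Intel CPUs from Nehalem through Kaby Lake, a 32 KiB L1 data cache and a
// 256 KiB unified L2.)
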
unsigned X86TTIImpl::getNumberOfRegisters(bool Vector) {
  if (Vector && !ST->hasSSE1())
    return 0;

  if (ST->is64Bit()) {
    if (Vector && ST->hasAVX512())
      return 32;
    return 16;
  }
  return 8;
}

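// (Note: these counts mirror the architectural register files: 8 GPRs/XMM
// registers in 32-bit mode, 16 in 64-bit mode, and 32 ZMM vector registers
// when AVX-512 is available.)
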
unsigned X86TTIImpl::getRegisterBitWidth(bool Vector) const {
  if (Vector) {
    if (ST->hasAVX512())
      return 512;
    if (ST->hasAVX())
      return 256;
    if (ST->hasSSE1())
      return 128;
    return 0;
  }

  if (ST->is64Bit())
    return 64;

  return 32;
}

unsigned X86TTIImpl::getLoadStoreVecRegBitWidth(unsigned) const {
  return getRegisterBitWidth(true);
}

unsigned X86TTIImpl::getMaxInterleaveFactor(unsigned VF) {
  // If the loop will not be vectorized, don't interleave the loop.
  // Let the regular unroller unroll the loop, which saves the overflow
  // check and memory check cost.
  if (VF == 1)
    return 1;

  if (ST->isAtom())
    return 1;

  // Sandybridge and Haswell have multiple execution ports and pipelined
  // vector units.
  if (ST->hasAVX())
    return 4;

  return 2;
}

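// (In effect, vectorized loops are interleaved 4x on AVX-class cores and 2x
// on older SSE targets, while Atom and non-vectorized (VF == 1) loops are
// not interleaved at all.)
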
int X86TTIImpl::getArithmeticInstrCost(
    unsigned Opcode, Type *Ty,
    TTI::OperandValueKind Op1Info, TTI::OperandValueKind Op2Info,
    TTI::OperandValueProperties Opd1PropInfo,
    TTI::OperandValueProperties Opd2PropInfo,
    ArrayRef<const Value *> Args) {
  // Legalize the type.
  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);

  int ISD = TLI->InstructionOpcodeToISD(Opcode);
  assert(ISD && "Invalid opcode");

  static const CostTblEntry SLMCostTable[] = {
    { ISD::MUL,  MVT::v4i32, 11 }, // pmulld
    { ISD::MUL,  MVT::v8i16, 2  }, // pmullw
    { ISD::MUL,  MVT::v16i8, 14 }, // extend/pmullw/trunc sequence.
    { ISD::FMUL, MVT::f64,   2  }, // mulsd
    { ISD::FMUL, MVT::v2f64, 4  }, // mulpd
    { ISD::FMUL, MVT::v4f32, 2  }, // mulps
    { ISD::FDIV, MVT::f32,   17 }, // divss
    { ISD::FDIV, MVT::v4f32, 39 }, // divps
    { ISD::FDIV, MVT::f64,   32 }, // divsd
    { ISD::FDIV, MVT::v2f64, 69 }, // divpd
    { ISD::FADD, MVT::v2f64, 2  }, // addpd
    { ISD::FSUB, MVT::v2f64, 2  }, // subpd
    // v2i64/v4i64 mul is custom lowered as a series of long:
    // multiplies(3), shifts(3) and adds(2)
    // slm muldq version throughput is 2 and addq throughput 4
    // thus: 3X2 (muldq throughput) + 3X1 (shift throughput) +
    //       3X4 (addq throughput) = 17
    { ISD::MUL,  MVT::v2i64, 17 },
    // slm addq\subq throughput is 4
    { ISD::ADD,  MVT::v2i64, 4  },
    { ISD::SUB,  MVT::v2i64, 4  },
  };

  if (ST->isSLM()) {
    if (Args.size() == 2 && ISD == ISD::MUL && LT.second == MVT::v4i32) {
      // Check if the operands can be shrunk into a smaller datatype.
      bool Op1Signed = false;
      unsigned Op1MinSize = BaseT::minRequiredElementSize(Args[0], Op1Signed);
      bool Op2Signed = false;
      unsigned Op2MinSize = BaseT::minRequiredElementSize(Args[1], Op2Signed);

      bool signedMode = Op1Signed | Op2Signed;
      unsigned OpMinSize = std::max(Op1MinSize, Op2MinSize);

      if (OpMinSize <= 7)
        return LT.first * 3; // pmullw/sext
      if (!signedMode && OpMinSize <= 8)
        return LT.first * 3; // pmullw/zext
      if (OpMinSize <= 15)
        return LT.first * 5; // pmullw/pmulhw/pshuf
      if (!signedMode && OpMinSize <= 16)
        return LT.first * 5; // pmullw/pmulhw/pshuf
    }
    if (const auto *Entry = CostTableLookup(SLMCostTable, ISD,
                                            LT.second)) {
      return LT.first * Entry->Cost;
    }
  }

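  // (For instance, on SLM a v4i32 multiply whose operands provably fit in
  // unsigned 8 bits is costed as LT.first * 3 for the pmullw/zext sequence,
  // rather than the 11 of the pmulld table entry above.)
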
  if (ISD == ISD::SDIV &&
      Op2Info == TargetTransformInfo::OK_UniformConstantValue &&
      Opd2PropInfo == TargetTransformInfo::OP_PowerOf2) {
    // On X86, vector signed division by a constant power-of-two is
    // normally expanded to the sequence SRA + SRL + ADD + SRA.
    // The OperandValue properties may not be the same as those of the
    // previous operation; conservatively assume OP_None.
    int Cost = 2 * getArithmeticInstrCost(Instruction::AShr, Ty, Op1Info,
                                          Op2Info, TargetTransformInfo::OP_None,
                                          TargetTransformInfo::OP_None);
    Cost += getArithmeticInstrCost(Instruction::LShr, Ty, Op1Info, Op2Info,
                                   TargetTransformInfo::OP_None,
                                   TargetTransformInfo::OP_None);
    Cost += getArithmeticInstrCost(Instruction::Add, Ty, Op1Info, Op2Info,
                                   TargetTransformInfo::OP_None,
                                   TargetTransformInfo::OP_None);

    return Cost;
  }

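  // (Worked example: sdiv by 4 follows the standard expansion: extract the
  // sign mask with SRA, shift it logically with SRL, ADD the rounding bias,
  // then do the final arithmetic shift; hence 2 * AShr + LShr + Add above.)
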
  static const CostTblEntry AVX512BWUniformConstCostTable[] = {
    { ISD::SHL,  MVT::v64i8,   2 }, // psllw + pand.
    { ISD::SRL,  MVT::v64i8,   2 }, // psrlw + pand.
    { ISD::SRA,  MVT::v64i8,   4 }, // psrlw, pand, pxor, psubb.

    { ISD::SDIV, MVT::v32i16,  6 }, // vpmulhw sequence
    { ISD::UDIV, MVT::v32i16,  6 }, // vpmulhuw sequence
  };

  if (Op2Info == TargetTransformInfo::OK_UniformConstantValue &&
      ST->hasBWI()) {
    if (const auto *Entry = CostTableLookup(AVX512BWUniformConstCostTable, ISD,
                                            LT.second))
      return LT.first * Entry->Cost;
  }

  static const CostTblEntry AVX512UniformConstCostTable[] = {
    { ISD::SRA,  MVT::v2i64,   1 },
    { ISD::SRA,  MVT::v4i64,   1 },
    { ISD::SRA,  MVT::v8i64,   1 },

    { ISD::SDIV, MVT::v16i32, 15 }, // vpmuldq sequence
    { ISD::UDIV, MVT::v16i32, 15 }, // vpmuludq sequence
  };

  if (Op2Info == TargetTransformInfo::OK_UniformConstantValue &&
      ST->hasAVX512()) {
    if (const auto *Entry = CostTableLookup(AVX512UniformConstCostTable, ISD,
                                            LT.second))
      return LT.first * Entry->Cost;
  }

  static const CostTblEntry AVX2UniformConstCostTable[] = {
    { ISD::SHL,  MVT::v32i8,   2 }, // psllw + pand.
    { ISD::SRL,  MVT::v32i8,   2 }, // psrlw + pand.
    { ISD::SRA,  MVT::v32i8,   4 }, // psrlw, pand, pxor, psubb.

    { ISD::SRA,  MVT::v4i64,   4 }, // 2 x psrad + shuffle.

    { ISD::SDIV, MVT::v16i16,  6 }, // vpmulhw sequence
    { ISD::UDIV, MVT::v16i16,  6 }, // vpmulhuw sequence
    { ISD::SDIV, MVT::v8i32,  15 }, // vpmuldq sequence
    { ISD::UDIV, MVT::v8i32,  15 }, // vpmuludq sequence
  };

  if (Op2Info == TargetTransformInfo::OK_UniformConstantValue &&
      ST->hasAVX2()) {
    if (const auto *Entry = CostTableLookup(AVX2UniformConstCostTable, ISD,
                                            LT.second))
      return LT.first * Entry->Cost;
  }

  static const CostTblEntry SSE2UniformConstCostTable[] = {
    { ISD::SHL,  MVT::v16i8,     2 }, // psllw + pand.
    { ISD::SRL,  MVT::v16i8,     2 }, // psrlw + pand.
    { ISD::SRA,  MVT::v16i8,     4 }, // psrlw, pand, pxor, psubb.

    { ISD::SHL,  MVT::v32i8,   4+2 }, // 2*(psllw + pand) + split.
    { ISD::SRL,  MVT::v32i8,   4+2 }, // 2*(psrlw + pand) + split.
    { ISD::SRA,  MVT::v32i8,   8+2 }, // 2*(psrlw, pand, pxor, psubb) + split.

    { ISD::SDIV, MVT::v16i16, 12+2 }, // 2*pmulhw sequence + split.
    { ISD::SDIV, MVT::v8i16,     6 }, // pmulhw sequence
    { ISD::UDIV, MVT::v16i16, 12+2 }, // 2*pmulhuw sequence + split.
    { ISD::UDIV, MVT::v8i16,     6 }, // pmulhuw sequence
    { ISD::SDIV, MVT::v8i32,  38+2 }, // 2*pmuludq sequence + split.
    { ISD::SDIV, MVT::v4i32,    19 }, // pmuludq sequence
    { ISD::UDIV, MVT::v8i32,  30+2 }, // 2*pmuludq sequence + split.
    { ISD::UDIV, MVT::v4i32,    15 }, // pmuludq sequence
  };

  if (Op2Info == TargetTransformInfo::OK_UniformConstantValue &&
      ST->hasSSE2()) {
    // pmuldq sequence.
    if (ISD == ISD::SDIV && LT.second == MVT::v8i32 && ST->hasAVX())
      return LT.first * 32;
    if (ISD == ISD::SDIV && LT.second == MVT::v4i32 && ST->hasSSE41())
      return LT.first * 15;

    // XOP has faster vXi8 shifts.
    if ((ISD != ISD::SHL && ISD != ISD::SRL && ISD != ISD::SRA) ||
        !ST->hasXOP())
      if (const auto *Entry =
              CostTableLookup(SSE2UniformConstCostTable, ISD, LT.second))
        return LT.first * Entry->Cost;
  }

  static const CostTblEntry AVX2UniformCostTable[] = {
    // Uniform splats are cheaper for the following instructions.
    { ISD::SHL,  MVT::v16i16, 1 }, // psllw.
    { ISD::SRL,  MVT::v16i16, 1 }, // psrlw.
    { ISD::SRA,  MVT::v16i16, 1 }, // psraw.
  };

  if (ST->hasAVX2() &&
      ((Op2Info == TargetTransformInfo::OK_UniformConstantValue) ||
       (Op2Info == TargetTransformInfo::OK_UniformValue))) {
    if (const auto *Entry =
            CostTableLookup(AVX2UniformCostTable, ISD, LT.second))
      return LT.first * Entry->Cost;
  }

  static const CostTblEntry SSE2UniformCostTable[] = {
    // Uniform splats are cheaper for the following instructions.
    { ISD::SHL,  MVT::v8i16,  1 }, // psllw.
    { ISD::SHL,  MVT::v4i32,  1 }, // pslld
    { ISD::SHL,  MVT::v2i64,  1 }, // psllq.

    { ISD::SRL,  MVT::v8i16,  1 }, // psrlw.
    { ISD::SRL,  MVT::v4i32,  1 }, // psrld.
    { ISD::SRL,  MVT::v2i64,  1 }, // psrlq.

    { ISD::SRA,  MVT::v8i16,  1 }, // psraw.
    { ISD::SRA,  MVT::v4i32,  1 }, // psrad.
  };

  if (ST->hasSSE2() &&
      ((Op2Info == TargetTransformInfo::OK_UniformConstantValue) ||
       (Op2Info == TargetTransformInfo::OK_UniformValue))) {
    if (const auto *Entry =
            CostTableLookup(SSE2UniformCostTable, ISD, LT.second))
      return LT.first * Entry->Cost;
  }

  static const CostTblEntry AVX512DQCostTable[] = {
    { ISD::MUL,  MVT::v2i64, 1 },
    { ISD::MUL,  MVT::v4i64, 1 },
    { ISD::MUL,  MVT::v8i64, 1 }
  };

  // Look for AVX512DQ lowering tricks for custom cases.
  if (ST->hasDQI())
    if (const auto *Entry = CostTableLookup(AVX512DQCostTable, ISD, LT.second))
      return LT.first * Entry->Cost;

  static const CostTblEntry AVX512BWCostTable[] = {
    { ISD::SHL,   MVT::v8i16,      1 }, // vpsllvw
    { ISD::SRL,   MVT::v8i16,      1 }, // vpsrlvw
    { ISD::SRA,   MVT::v8i16,      1 }, // vpsravw

    { ISD::SHL,   MVT::v16i16,     1 }, // vpsllvw
    { ISD::SRL,   MVT::v16i16,     1 }, // vpsrlvw
    { ISD::SRA,   MVT::v16i16,     1 }, // vpsravw

    { ISD::SHL,   MVT::v32i16,     1 }, // vpsllvw
    { ISD::SRL,   MVT::v32i16,     1 }, // vpsrlvw
    { ISD::SRA,   MVT::v32i16,     1 }, // vpsravw

    { ISD::SHL,   MVT::v64i8,     11 }, // vpblendvb sequence.
    { ISD::SRL,   MVT::v64i8,     11 }, // vpblendvb sequence.
    { ISD::SRA,   MVT::v64i8,     24 }, // vpblendvb sequence.

    { ISD::MUL,   MVT::v64i8,     11 }, // extend/pmullw/trunc sequence.
    { ISD::MUL,   MVT::v32i8,      4 }, // extend/pmullw/trunc sequence.
    { ISD::MUL,   MVT::v16i8,      4 }, // extend/pmullw/trunc sequence.

    // Vectorizing division is a bad idea. See the SSE2 table for more comments.
    { ISD::SDIV,  MVT::v64i8,  64*20 },
    { ISD::SDIV,  MVT::v32i16, 32*20 },
    { ISD::UDIV,  MVT::v64i8,  64*20 },
    { ISD::UDIV,  MVT::v32i16, 32*20 }
  };

  // Look for AVX512BW lowering tricks for custom cases.
  if (ST->hasBWI())
    if (const auto *Entry = CostTableLookup(AVX512BWCostTable, ISD, LT.second))
      return LT.first * Entry->Cost;

  static const CostTblEntry AVX512CostTable[] = {
    { ISD::SHL,     MVT::v16i32,     1 },
    { ISD::SRL,     MVT::v16i32,     1 },
    { ISD::SRA,     MVT::v16i32,     1 },

    { ISD::SHL,     MVT::v8i64,      1 },
    { ISD::SRL,     MVT::v8i64,      1 },

    { ISD::SRA,     MVT::v2i64,      1 },
    { ISD::SRA,     MVT::v4i64,      1 },
    { ISD::SRA,     MVT::v8i64,      1 },

    { ISD::MUL,     MVT::v32i8,     13 }, // extend/pmullw/trunc sequence.
    { ISD::MUL,     MVT::v16i8,      5 }, // extend/pmullw/trunc sequence.
    { ISD::MUL,     MVT::v16i32,     1 }, // pmulld
    { ISD::MUL,     MVT::v8i64,      8 }, // 3*pmuludq/3*shift/2*add

    // Vectorizing division is a bad idea. See the SSE2 table for more comments.
    { ISD::SDIV,    MVT::v16i32, 16*20 },
    { ISD::SDIV,    MVT::v8i64,   8*20 },
    { ISD::UDIV,    MVT::v16i32, 16*20 },
    { ISD::UDIV,    MVT::v8i64,   8*20 }
  };

  if (ST->hasAVX512())
    if (const auto *Entry = CostTableLookup(AVX512CostTable, ISD, LT.second))
      return LT.first * Entry->Cost;

  static const CostTblEntry AVX2ShiftCostTable[] = {
    // Shifts on v4i64/v8i32 on AVX2 are legal even though we declare to
    // customize them to detect the cases where the shift amount is a scalar.
    { ISD::SHL,     MVT::v4i32,    1 },
    { ISD::SRL,     MVT::v4i32,    1 },
    { ISD::SRA,     MVT::v4i32,    1 },
    { ISD::SHL,     MVT::v8i32,    1 },
    { ISD::SRL,     MVT::v8i32,    1 },
    { ISD::SRA,     MVT::v8i32,    1 },
    { ISD::SHL,     MVT::v2i64,    1 },
    { ISD::SRL,     MVT::v2i64,    1 },
    { ISD::SHL,     MVT::v4i64,    1 },
    { ISD::SRL,     MVT::v4i64,    1 },
  };

  // Look for AVX2 lowering tricks.
  if (ST->hasAVX2()) {
    if (ISD == ISD::SHL && LT.second == MVT::v16i16 &&
        (Op2Info == TargetTransformInfo::OK_UniformConstantValue ||
         Op2Info == TargetTransformInfo::OK_NonUniformConstantValue))
      // On AVX2, a packed v16i16 shift left by a constant build_vector
      // is lowered into a vector multiply (vpmullw).
      return LT.first;

    if (const auto *Entry = CostTableLookup(AVX2ShiftCostTable, ISD, LT.second))
      return LT.first * Entry->Cost;
  }

  static const CostTblEntry XOPShiftCostTable[] = {
    // 128bit shifts take 1cy, but right shifts require negation beforehand.
    { ISD::SHL,     MVT::v16i8,    1 },
    { ISD::SRL,     MVT::v16i8,    2 },
    { ISD::SRA,     MVT::v16i8,    2 },
    { ISD::SHL,     MVT::v8i16,    1 },
    { ISD::SRL,     MVT::v8i16,    2 },
    { ISD::SRA,     MVT::v8i16,    2 },
    { ISD::SHL,     MVT::v4i32,    1 },
    { ISD::SRL,     MVT::v4i32,    2 },
    { ISD::SRA,     MVT::v4i32,    2 },
    { ISD::SHL,     MVT::v2i64,    1 },
    { ISD::SRL,     MVT::v2i64,    2 },
    { ISD::SRA,     MVT::v2i64,    2 },
    // 256bit shifts require splitting if AVX2 didn't catch them above.
    { ISD::SHL,     MVT::v32i8,  2+2 },
    { ISD::SRL,     MVT::v32i8,  4+2 },
    { ISD::SRA,     MVT::v32i8,  4+2 },
    { ISD::SHL,     MVT::v16i16, 2+2 },
    { ISD::SRL,     MVT::v16i16, 4+2 },
    { ISD::SRA,     MVT::v16i16, 4+2 },
    { ISD::SHL,     MVT::v8i32,  2+2 },
    { ISD::SRL,     MVT::v8i32,  4+2 },
    { ISD::SRA,     MVT::v8i32,  4+2 },
    { ISD::SHL,     MVT::v4i64,  2+2 },
    { ISD::SRL,     MVT::v4i64,  4+2 },
    { ISD::SRA,     MVT::v4i64,  4+2 },
  };

  // Look for XOP lowering tricks.
  if (ST->hasXOP())
    if (const auto *Entry = CostTableLookup(XOPShiftCostTable, ISD, LT.second))
      return LT.first * Entry->Cost;

  static const CostTblEntry SSE2UniformShiftCostTable[] = {
    // Uniform splats are cheaper for the following instructions.
    { ISD::SHL,  MVT::v16i16, 2+2 }, // 2*psllw + split.
    { ISD::SHL,  MVT::v8i32,  2+2 }, // 2*pslld + split.
    { ISD::SHL,  MVT::v4i64,  2+2 }, // 2*psllq + split.

    { ISD::SRL,  MVT::v16i16, 2+2 }, // 2*psrlw + split.
    { ISD::SRL,  MVT::v8i32,  2+2 }, // 2*psrld + split.
    { ISD::SRL,  MVT::v4i64,  2+2 }, // 2*psrlq + split.

    { ISD::SRA,  MVT::v16i16, 2+2 }, // 2*psraw + split.
    { ISD::SRA,  MVT::v8i32,  2+2 }, // 2*psrad + split.
    { ISD::SRA,  MVT::v2i64,    4 }, // 2*psrad + shuffle.
    { ISD::SRA,  MVT::v4i64,  8+2 }, // 2*(2*psrad + shuffle) + split.
  };

  if (ST->hasSSE2() &&
      ((Op2Info == TargetTransformInfo::OK_UniformConstantValue) ||
       (Op2Info == TargetTransformInfo::OK_UniformValue))) {

    // Handle AVX2 uniform v4i64 ISD::SRA, it's not worth a table.
    if (ISD == ISD::SRA && LT.second == MVT::v4i64 && ST->hasAVX2())
      return LT.first * 4; // 2*psrad + shuffle.

    if (const auto *Entry =
            CostTableLookup(SSE2UniformShiftCostTable, ISD, LT.second))
      return LT.first * Entry->Cost;
  }

  if (ISD == ISD::SHL &&
      Op2Info == TargetTransformInfo::OK_NonUniformConstantValue) {
    MVT VT = LT.second;
    // A vector shift left by a non-uniform constant can be lowered into a
    // vector multiply.
    if (((VT == MVT::v8i16 || VT == MVT::v4i32) && ST->hasSSE2()) ||
        ((VT == MVT::v16i16 || VT == MVT::v8i32) && ST->hasAVX()))
      ISD = ISD::MUL;
  }

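  // (E.g. a v8i16 shl by the constant vector <1,2,3,...> is lowered as a
  // pmullw by <2,4,8,...>, so it is costed through the MUL table entries
  // below.)
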
  static const CostTblEntry AVX2CostTable[] = {
    { ISD::SHL,  MVT::v32i8,     11 }, // vpblendvb sequence.
    { ISD::SHL,  MVT::v16i16,    10 }, // extend/vpsrlvd/pack sequence.

    { ISD::SRL,  MVT::v32i8,     11 }, // vpblendvb sequence.
    { ISD::SRL,  MVT::v16i16,    10 }, // extend/vpsrlvd/pack sequence.

    { ISD::SRA,  MVT::v32i8,     24 }, // vpblendvb sequence.
    { ISD::SRA,  MVT::v16i16,    10 }, // extend/vpsravd/pack sequence.
    { ISD::SRA,  MVT::v2i64,      4 }, // srl/xor/sub sequence.
    { ISD::SRA,  MVT::v4i64,      4 }, // srl/xor/sub sequence.

    { ISD::SUB,  MVT::v32i8,      1 }, // psubb
    { ISD::ADD,  MVT::v32i8,      1 }, // paddb
    { ISD::SUB,  MVT::v16i16,     1 }, // psubw
    { ISD::ADD,  MVT::v16i16,     1 }, // paddw
    { ISD::SUB,  MVT::v8i32,      1 }, // psubd
    { ISD::ADD,  MVT::v8i32,      1 }, // paddd
    { ISD::SUB,  MVT::v4i64,      1 }, // psubq
    { ISD::ADD,  MVT::v4i64,      1 }, // paddq

    { ISD::MUL,  MVT::v32i8,     17 }, // extend/pmullw/trunc sequence.
    { ISD::MUL,  MVT::v16i8,      7 }, // extend/pmullw/trunc sequence.
    { ISD::MUL,  MVT::v16i16,     1 }, // pmullw
    { ISD::MUL,  MVT::v8i32,      1 }, // pmulld
    { ISD::MUL,  MVT::v4i64,      8 }, // 3*pmuludq/3*shift/2*add

    { ISD::FDIV, MVT::f32,        7 }, // Haswell from http://www.agner.org/
    { ISD::FDIV, MVT::v4f32,      7 }, // Haswell from http://www.agner.org/
    { ISD::FDIV, MVT::v8f32,     14 }, // Haswell from http://www.agner.org/
    { ISD::FDIV, MVT::f64,       14 }, // Haswell from http://www.agner.org/
    { ISD::FDIV, MVT::v2f64,     14 }, // Haswell from http://www.agner.org/
    { ISD::FDIV, MVT::v4f64,     28 }, // Haswell from http://www.agner.org/
  };

  // Look for AVX2 lowering tricks for custom cases.
  if (ST->hasAVX2())
    if (const auto *Entry = CostTableLookup(AVX2CostTable, ISD, LT.second))
      return LT.first * Entry->Cost;

  static const CostTblEntry AVX1CostTable[] = {
    // We don't have to scalarize unsupported ops. We can issue two half-sized
    // operations and we only need to extract the upper YMM half.
    // Two ops + 1 extract + 1 insert = 4.
    { ISD::MUL,     MVT::v16i16,     4 },
    { ISD::MUL,     MVT::v8i32,      4 },
    { ISD::SUB,     MVT::v32i8,      4 },
    { ISD::ADD,     MVT::v32i8,      4 },
    { ISD::SUB,     MVT::v16i16,     4 },
    { ISD::ADD,     MVT::v16i16,     4 },
    { ISD::SUB,     MVT::v8i32,      4 },
    { ISD::ADD,     MVT::v8i32,      4 },
    { ISD::SUB,     MVT::v4i64,      4 },
    { ISD::ADD,     MVT::v4i64,      4 },

    // A v4i64 multiply is custom lowered as two split v2i64 vectors that then
    // are lowered as a series of long multiplies(3), shifts(3) and adds(2)
    // Because we believe v4i64 to be a legal type, we must also include the
    // extract+insert in the cost table. Therefore, the cost here is 18
    // instead of 8.
    { ISD::MUL,     MVT::v4i64,     18 },

    { ISD::MUL,     MVT::v32i8,     26 }, // extend/pmullw/trunc sequence.

    { ISD::FDIV,    MVT::f32,       14 }, // SNB from http://www.agner.org/
    { ISD::FDIV,    MVT::v4f32,     14 }, // SNB from http://www.agner.org/
    { ISD::FDIV,    MVT::v8f32,     28 }, // SNB from http://www.agner.org/
    { ISD::FDIV,    MVT::f64,       22 }, // SNB from http://www.agner.org/
    { ISD::FDIV,    MVT::v2f64,     22 }, // SNB from http://www.agner.org/
    { ISD::FDIV,    MVT::v4f64,     44 }, // SNB from http://www.agner.org/

    // Vectorizing division is a bad idea. See the SSE2 table for more comments.
    { ISD::SDIV,    MVT::v32i8,  32*20 },
    { ISD::SDIV,    MVT::v16i16, 16*20 },
    { ISD::SDIV,    MVT::v8i32,   8*20 },
    { ISD::SDIV,    MVT::v4i64,   4*20 },
    { ISD::UDIV,    MVT::v32i8,  32*20 },
    { ISD::UDIV,    MVT::v16i16, 16*20 },
    { ISD::UDIV,    MVT::v8i32,   8*20 },
    { ISD::UDIV,    MVT::v4i64,   4*20 },
  };

  if (ST->hasAVX())
    if (const auto *Entry = CostTableLookup(AVX1CostTable, ISD, LT.second))
      return LT.first * Entry->Cost;

  static const CostTblEntry SSE42CostTable[] = {
    { ISD::FDIV,  MVT::f32,   14 }, // Nehalem from http://www.agner.org/
    { ISD::FDIV,  MVT::v4f32, 14 }, // Nehalem from http://www.agner.org/
    { ISD::FDIV,  MVT::f64,   22 }, // Nehalem from http://www.agner.org/
    { ISD::FDIV,  MVT::v2f64, 22 }, // Nehalem from http://www.agner.org/
  };

  if (ST->hasSSE42())
    if (const auto *Entry = CostTableLookup(SSE42CostTable, ISD, LT.second))
      return LT.first * Entry->Cost;

  static const CostTblEntry SSE41CostTable[] = {
    { ISD::SHL,  MVT::v16i8,      11 }, // pblendvb sequence.
    { ISD::SHL,  MVT::v32i8,  2*11+2 }, // pblendvb sequence + split.
    { ISD::SHL,  MVT::v8i16,      14 }, // pblendvb sequence.
    { ISD::SHL,  MVT::v16i16, 2*14+2 }, // pblendvb sequence + split.
    { ISD::SHL,  MVT::v4i32,       4 }, // pslld/paddd/cvttps2dq/pmulld
    { ISD::SHL,  MVT::v8i32,   2*4+2 }, // pslld/paddd/cvttps2dq/pmulld + split

    { ISD::SRL,  MVT::v16i8,      12 }, // pblendvb sequence.
    { ISD::SRL,  MVT::v32i8,  2*12+2 }, // pblendvb sequence + split.
    { ISD::SRL,  MVT::v8i16,      14 }, // pblendvb sequence.
    { ISD::SRL,  MVT::v16i16, 2*14+2 }, // pblendvb sequence + split.
    { ISD::SRL,  MVT::v4i32,      11 }, // Shift each lane + blend.
    { ISD::SRL,  MVT::v8i32,  2*11+2 }, // Shift each lane + blend + split.

    { ISD::SRA,  MVT::v16i8,      24 }, // pblendvb sequence.
    { ISD::SRA,  MVT::v32i8,  2*24+2 }, // pblendvb sequence + split.
    { ISD::SRA,  MVT::v8i16,      14 }, // pblendvb sequence.
    { ISD::SRA,  MVT::v16i16, 2*14+2 }, // pblendvb sequence + split.
    { ISD::SRA,  MVT::v4i32,      12 }, // Shift each lane + blend.
    { ISD::SRA,  MVT::v8i32,  2*12+2 }, // Shift each lane + blend + split.

    { ISD::MUL,  MVT::v4i32,       1 }  // pmulld
  };

  if (ST->hasSSE41())
    if (const auto *Entry = CostTableLookup(SSE41CostTable, ISD, LT.second))
      return LT.first * Entry->Cost;

  static const CostTblEntry SSE2CostTable[] = {
    // We don't correctly identify costs of casts because they are marked as
    // custom.
    { ISD::SHL,  MVT::v16i8,      26 }, // cmpgtb sequence.
    { ISD::SHL,  MVT::v8i16,      32 }, // cmpgtb sequence.
    { ISD::SHL,  MVT::v4i32,     2*5 }, // We optimized this using mul.
    { ISD::SHL,  MVT::v2i64,       4 }, // splat+shuffle sequence.
    { ISD::SHL,  MVT::v4i64,   2*4+2 }, // splat+shuffle sequence + split.

    { ISD::SRL,  MVT::v16i8,      26 }, // cmpgtb sequence.
    { ISD::SRL,  MVT::v8i16,      32 }, // cmpgtb sequence.
    { ISD::SRL,  MVT::v4i32,      16 }, // Shift each lane + blend.
    { ISD::SRL,  MVT::v2i64,       4 }, // splat+shuffle sequence.
    { ISD::SRL,  MVT::v4i64,   2*4+2 }, // splat+shuffle sequence + split.

    { ISD::SRA,  MVT::v16i8,      54 }, // unpacked cmpgtb sequence.
    { ISD::SRA,  MVT::v8i16,      32 }, // cmpgtb sequence.
    { ISD::SRA,  MVT::v4i32,      16 }, // Shift each lane + blend.
    { ISD::SRA,  MVT::v2i64,      12 }, // srl/xor/sub sequence.
    { ISD::SRA,  MVT::v4i64,  2*12+2 }, // srl/xor/sub sequence+split.

    { ISD::MUL,  MVT::v16i8,      12 }, // extend/pmullw/trunc sequence.
    { ISD::MUL,  MVT::v8i16,       1 }, // pmullw
    { ISD::MUL,  MVT::v4i32,       6 }, // 3*pmuludq/4*shuffle
    { ISD::MUL,  MVT::v2i64,       8 }, // 3*pmuludq/3*shift/2*add

    { ISD::FDIV, MVT::f32,        23 }, // Pentium IV from http://www.agner.org/
    { ISD::FDIV, MVT::v4f32,      39 }, // Pentium IV from http://www.agner.org/
    { ISD::FDIV, MVT::f64,        38 }, // Pentium IV from http://www.agner.org/
    { ISD::FDIV, MVT::v2f64,      69 }, // Pentium IV from http://www.agner.org/

    // It is not a good idea to vectorize division. We have to scalarize it and
    // in the process we will often end up having to spill regular
    // registers. The overhead of division is going to dominate most kernels
    // anyways so try hard to prevent vectorization of division - it is
    // generally a bad idea. Assume somewhat arbitrarily that we have to be able
    // to hide "20 cycles" for each lane.
    { ISD::SDIV,  MVT::v16i8,  16*20 },
    { ISD::SDIV,  MVT::v8i16,   8*20 },
    { ISD::SDIV,  MVT::v4i32,   4*20 },
    { ISD::SDIV,  MVT::v2i64,   2*20 },
    { ISD::UDIV,  MVT::v16i8,  16*20 },
    { ISD::UDIV,  MVT::v8i16,   8*20 },
    { ISD::UDIV,  MVT::v4i32,   4*20 },
    { ISD::UDIV,  MVT::v2i64,   2*20 },
  };

  if (ST->hasSSE2())
    if (const auto *Entry = CostTableLookup(SSE2CostTable, ISD, LT.second))
      return LT.first * Entry->Cost;

  static const CostTblEntry SSE1CostTable[] = {
    { ISD::FDIV, MVT::f32,   17 }, // Pentium III from http://www.agner.org/
    { ISD::FDIV, MVT::v4f32, 34 }, // Pentium III from http://www.agner.org/
  };

  if (ST->hasSSE1())
    if (const auto *Entry = CostTableLookup(SSE1CostTable, ISD, LT.second))
      return LT.first * Entry->Cost;

  // Fallback to the default implementation.
  return BaseT::getArithmeticInstrCost(Opcode, Ty, Op1Info, Op2Info);
}

int X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
                               Type *SubTp) {
  // 64-bit packed float vectors (v2f32) are widened to type v4f32.
  // 64-bit packed integer vectors (v2i32) are promoted to type v2i64.
  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);

  // For Broadcasts we are splatting the first element from the first input
  // register, so we only need to reference that input and all the output
  // registers are the same.
  if (Kind == TTI::SK_Broadcast)
    LT.first = 1;

  // We are going to permute multiple sources and the result will be in
  // multiple destinations. We provide an accurate cost only for splits where
  // the element type remains the same.
  if (Kind == TTI::SK_PermuteSingleSrc && LT.first != 1) {
    MVT LegalVT = LT.second;
    if (LegalVT.getVectorElementType().getSizeInBits() ==
            Tp->getVectorElementType()->getPrimitiveSizeInBits() &&
        LegalVT.getVectorNumElements() < Tp->getVectorNumElements()) {

      unsigned VecTySize = DL.getTypeStoreSize(Tp);
      unsigned LegalVTSize = LegalVT.getStoreSize();
      // Number of source vectors after legalization:
      unsigned NumOfSrcs = (VecTySize + LegalVTSize - 1) / LegalVTSize;
      // Number of destination vectors after legalization:
      unsigned NumOfDests = LT.first;

      Type *SingleOpTy = VectorType::get(Tp->getVectorElementType(),
                                         LegalVT.getVectorNumElements());

      unsigned NumOfShuffles = (NumOfSrcs - 1) * NumOfDests;
      return NumOfShuffles *
             getShuffleCost(TTI::SK_PermuteTwoSrc, SingleOpTy, 0, nullptr);
    }

    return BaseT::getShuffleCost(Kind, Tp, Index, SubTp);
  }
779
4.83k
  // For 2-input shuffles, we must account for splitting the 2 inputs into many.
780
4.83k
  
if (4.83k
Kind == TTI::SK_PermuteTwoSrc && 4.83k
LT.first != 1385
) {
781
101
    // We assume that source and destination have the same vector type.
782
101
    int NumOfDests = LT.first;
783
101
    int NumOfShufflesPerDest = LT.first * 2 - 1;
784
101
    LT.first = NumOfDests * NumOfShufflesPerDest;
785
101
  }
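  // (E.g. with LT.first == 2 this models 2 destinations with 3 shuffles each,
  // i.e. 6 two-input shuffles on the legalized type.)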

  static const CostTblEntry AVX512VBMIShuffleTbl[] = {
    { TTI::SK_Reverse,          MVT::v64i8,  1 }, // vpermb
    { TTI::SK_Reverse,          MVT::v32i8,  1 }, // vpermb

    { TTI::SK_PermuteSingleSrc, MVT::v64i8,  1 }, // vpermb
    { TTI::SK_PermuteSingleSrc, MVT::v32i8,  1 }, // vpermb

    { TTI::SK_PermuteTwoSrc,    MVT::v64i8,  1 }, // vpermt2b
    { TTI::SK_PermuteTwoSrc,    MVT::v32i8,  1 }, // vpermt2b
    { TTI::SK_PermuteTwoSrc,    MVT::v16i8,  1 }  // vpermt2b
  };

  if (ST->hasVBMI())
    if (const auto *Entry =
            CostTableLookup(AVX512VBMIShuffleTbl, Kind, LT.second))
      return LT.first * Entry->Cost;

  static const CostTblEntry AVX512BWShuffleTbl[] = {
    { TTI::SK_Broadcast,        MVT::v32i16, 1 }, // vpbroadcastw
    { TTI::SK_Broadcast,        MVT::v64i8,  1 }, // vpbroadcastb

    { TTI::SK_Reverse,          MVT::v32i16, 1 }, // vpermw
    { TTI::SK_Reverse,          MVT::v16i16, 1 }, // vpermw
    { TTI::SK_Reverse,          MVT::v64i8,  2 }, // pshufb + vshufi64x2

    { TTI::SK_PermuteSingleSrc, MVT::v32i16, 1 }, // vpermw
    { TTI::SK_PermuteSingleSrc, MVT::v16i16, 1 }, // vpermw
    { TTI::SK_PermuteSingleSrc, MVT::v8i16,  1 }, // vpermw
    { TTI::SK_PermuteSingleSrc, MVT::v64i8,  8 }, // extend to v32i16
    { TTI::SK_PermuteSingleSrc, MVT::v32i8,  3 }, // vpermw + zext/trunc

    { TTI::SK_PermuteTwoSrc,    MVT::v32i16, 1 }, // vpermt2w
    { TTI::SK_PermuteTwoSrc,    MVT::v16i16, 1 }, // vpermt2w
    { TTI::SK_PermuteTwoSrc,    MVT::v8i16,  1 }, // vpermt2w
    { TTI::SK_PermuteTwoSrc,    MVT::v32i8,  3 }, // zext + vpermt2w + trunc
    { TTI::SK_PermuteTwoSrc,    MVT::v64i8, 19 }, // 6 * v32i8 + 1
    { TTI::SK_PermuteTwoSrc,    MVT::v16i8,  3 }  // zext + vpermt2w + trunc
  };

  if (ST->hasBWI())
    if (const auto *Entry =
            CostTableLookup(AVX512BWShuffleTbl, Kind, LT.second))
      return LT.first * Entry->Cost;

  static const CostTblEntry AVX512ShuffleTbl[] = {
    { TTI::SK_Broadcast,        MVT::v8f64,  1 }, // vbroadcastpd
    { TTI::SK_Broadcast,        MVT::v16f32, 1 }, // vbroadcastps
    { TTI::SK_Broadcast,        MVT::v8i64,  1 }, // vpbroadcastq
    { TTI::SK_Broadcast,        MVT::v16i32, 1 }, // vpbroadcastd

    { TTI::SK_Reverse,          MVT::v8f64,  1 }, // vpermpd
    { TTI::SK_Reverse,          MVT::v16f32, 1 }, // vpermps
    { TTI::SK_Reverse,          MVT::v8i64,  1 }, // vpermq
    { TTI::SK_Reverse,          MVT::v16i32, 1 }, // vpermd

    { TTI::SK_PermuteSingleSrc, MVT::v8f64,  1 }, // vpermpd
    { TTI::SK_PermuteSingleSrc, MVT::v4f64,  1 }, // vpermpd
    { TTI::SK_PermuteSingleSrc, MVT::v2f64,  1 }, // vpermpd
    { TTI::SK_PermuteSingleSrc, MVT::v16f32, 1 }, // vpermps
    { TTI::SK_PermuteSingleSrc, MVT::v8f32,  1 }, // vpermps
    { TTI::SK_PermuteSingleSrc, MVT::v4f32,  1 }, // vpermps
    { TTI::SK_PermuteSingleSrc, MVT::v8i64,  1 }, // vpermq
    { TTI::SK_PermuteSingleSrc, MVT::v4i64,  1 }, // vpermq
    { TTI::SK_PermuteSingleSrc, MVT::v2i64,  1 }, // vpermq
    { TTI::SK_PermuteSingleSrc, MVT::v16i32, 1 }, // vpermd
    { TTI::SK_PermuteSingleSrc, MVT::v8i32,  1 }, // vpermd
    { TTI::SK_PermuteSingleSrc, MVT::v4i32,  1 }, // vpermd
    { TTI::SK_PermuteSingleSrc, MVT::v16i8,  1 }, // pshufb

    { TTI::SK_PermuteTwoSrc,    MVT::v8f64,  1 }, // vpermt2pd
    { TTI::SK_PermuteTwoSrc,    MVT::v16f32, 1 }, // vpermt2ps
    { TTI::SK_PermuteTwoSrc,    MVT::v8i64,  1 }, // vpermt2q
    { TTI::SK_PermuteTwoSrc,    MVT::v16i32, 1 }, // vpermt2d
    { TTI::SK_PermuteTwoSrc,    MVT::v4f64,  1 }, // vpermt2pd
    { TTI::SK_PermuteTwoSrc,    MVT::v8f32,  1 }, // vpermt2ps
    { TTI::SK_PermuteTwoSrc,    MVT::v4i64,  1 }, // vpermt2q
    { TTI::SK_PermuteTwoSrc,    MVT::v8i32,  1 }, // vpermt2d
    { TTI::SK_PermuteTwoSrc,    MVT::v2f64,  1 }, // vpermt2pd
    { TTI::SK_PermuteTwoSrc,    MVT::v4f32,  1 }, // vpermt2ps
    { TTI::SK_PermuteTwoSrc,    MVT::v2i64,  1 }, // vpermt2q
    { TTI::SK_PermuteTwoSrc,    MVT::v4i32,  1 }  // vpermt2d
  };

  if (ST->hasAVX512())
    if (const auto *Entry = CostTableLookup(AVX512ShuffleTbl, Kind, LT.second))
      return LT.first * Entry->Cost;

  static const CostTblEntry AVX2ShuffleTbl[] = {
    { TTI::SK_Broadcast, MVT::v4f64,  1 }, // vbroadcastpd
    { TTI::SK_Broadcast, MVT::v8f32,  1 }, // vbroadcastps
    { TTI::SK_Broadcast, MVT::v4i64,  1 }, // vpbroadcastq
    { TTI::SK_Broadcast, MVT::v8i32,  1 }, // vpbroadcastd
    { TTI::SK_Broadcast, MVT::v16i16, 1 }, // vpbroadcastw
    { TTI::SK_Broadcast, MVT::v32i8,  1 }, // vpbroadcastb

    { TTI::SK_Reverse,   MVT::v4f64,  1 }, // vpermpd
    { TTI::SK_Reverse,   MVT::v8f32,  1 }, // vpermps
    { TTI::SK_Reverse,   MVT::v4i64,  1 }, // vpermq
    { TTI::SK_Reverse,   MVT::v8i32,  1 }, // vpermd
    { TTI::SK_Reverse,   MVT::v16i16, 2 }, // vperm2i128 + pshufb
    { TTI::SK_Reverse,   MVT::v32i8,  2 }, // vperm2i128 + pshufb

    { TTI::SK_Alternate, MVT::v16i16, 1 }, // vpblendw
    { TTI::SK_Alternate, MVT::v32i8,  1 }, // vpblendvb

    { TTI::SK_PermuteSingleSrc, MVT::v4f64,  1 }, // vpermpd
    { TTI::SK_PermuteSingleSrc, MVT::v8f32,  1 }, // vpermps
    { TTI::SK_PermuteSingleSrc, MVT::v4i64,  1 }, // vpermq
    { TTI::SK_PermuteSingleSrc, MVT::v8i32,  1 }, // vpermd
    { TTI::SK_PermuteSingleSrc, MVT::v16i16, 4 }, // vperm2i128 + 2*vpshufb
                                                  // + vpblendvb
    { TTI::SK_PermuteSingleSrc, MVT::v32i8,  4 }, // vperm2i128 + 2*vpshufb
                                                  // + vpblendvb

    { TTI::SK_PermuteTwoSrc,    MVT::v4f64,  3 }, // 2*vpermpd + vblendpd
    { TTI::SK_PermuteTwoSrc,    MVT::v8f32,  3 }, // 2*vpermps + vblendps
    { TTI::SK_PermuteTwoSrc,    MVT::v4i64,  3 }, // 2*vpermq + vpblendd
    { TTI::SK_PermuteTwoSrc,    MVT::v8i32,  3 }, // 2*vpermd + vpblendd
    { TTI::SK_PermuteTwoSrc,    MVT::v16i16, 7 }, // 2*vperm2i128 + 4*vpshufb
                                                  // + vpblendvb
    { TTI::SK_PermuteTwoSrc,    MVT::v32i8,  7 }, // 2*vperm2i128 + 4*vpshufb
                                                  // + vpblendvb
  };

  if (ST->hasAVX2())
    if (const auto *Entry = CostTableLookup(AVX2ShuffleTbl, Kind, LT.second))
      return LT.first * Entry->Cost;

  static const CostTblEntry XOPShuffleTbl[] = {
    { TTI::SK_PermuteSingleSrc, MVT::v4f64,   2 }, // vperm2f128 + vpermil2pd
    { TTI::SK_PermuteSingleSrc, MVT::v8f32,   2 }, // vperm2f128 + vpermil2ps
    { TTI::SK_PermuteSingleSrc, MVT::v4i64,   2 }, // vperm2f128 + vpermil2pd
    { TTI::SK_PermuteSingleSrc, MVT::v8i32,   2 }, // vperm2f128 + vpermil2ps
    { TTI::SK_PermuteSingleSrc, MVT::v16i16,  4 }, // vextractf128 + 2*vpperm
                                                   // + vinsertf128
    { TTI::SK_PermuteSingleSrc, MVT::v32i8,   4 }, // vextractf128 + 2*vpperm
                                                   // + vinsertf128

    { TTI::SK_PermuteTwoSrc,    MVT::v16i16,  9 }, // 2*vextractf128 + 6*vpperm
                                                   // + vinsertf128
    { TTI::SK_PermuteTwoSrc,    MVT::v8i16,   1 }, // vpperm
    { TTI::SK_PermuteTwoSrc,    MVT::v32i8,   9 }, // 2*vextractf128 + 6*vpperm
                                                   // + vinsertf128
    { TTI::SK_PermuteTwoSrc,    MVT::v16i8,   1 }, // vpperm
  };

  if (ST->hasXOP())
    if (const auto *Entry = CostTableLookup(XOPShuffleTbl, Kind, LT.second))
      return LT.first * Entry->Cost;

  static const CostTblEntry AVX1ShuffleTbl[] = {
    { TTI::SK_Broadcast, MVT::v4f64,  2 }, // vperm2f128 + vpermilpd
    { TTI::SK_Broadcast, MVT::v8f32,  2 }, // vperm2f128 + vpermilps
    { TTI::SK_Broadcast, MVT::v4i64,  2 }, // vperm2f128 + vpermilpd
    { TTI::SK_Broadcast, MVT::v8i32,  2 }, // vperm2f128 + vpermilps
    { TTI::SK_Broadcast, MVT::v16i16, 3 }, // vpshuflw + vpshufd + vinsertf128
    { TTI::SK_Broadcast, MVT::v32i8,  2 }, // vpshufb + vinsertf128

    { TTI::SK_Reverse,   MVT::v4f64,  2 }, // vperm2f128 + vpermilpd
    { TTI::SK_Reverse,   MVT::v8f32,  2 }, // vperm2f128 + vpermilps
    { TTI::SK_Reverse,   MVT::v4i64,  2 }, // vperm2f128 + vpermilpd
    { TTI::SK_Reverse,   MVT::v8i32,  2 }, // vperm2f128 + vpermilps
    { TTI::SK_Reverse,   MVT::v16i16, 4 }, // vextractf128 + 2*pshufb
                                           // + vinsertf128
    { TTI::SK_Reverse,   MVT::v32i8,  4 }, // vextractf128 + 2*pshufb
                                           // + vinsertf128

    { TTI::SK_Alternate, MVT::v4i64,  1 }, // vblendpd
    { TTI::SK_Alternate, MVT::v4f64,  1 }, // vblendpd
    { TTI::SK_Alternate, MVT::v8i32,  1 }, // vblendps
    { TTI::SK_Alternate, MVT::v8f32,  1 }, // vblendps
    { TTI::SK_Alternate, MVT::v16i16, 3 }, // vpand + vpandn + vpor
    { TTI::SK_Alternate, MVT::v32i8,  3 }, // vpand + vpandn + vpor

    { TTI::SK_PermuteSingleSrc, MVT::v4f64,  3 }, // 2*vperm2f128 + vshufpd
    { TTI::SK_PermuteSingleSrc, MVT::v4i64,  3 }, // 2*vperm2f128 + vshufpd
    { TTI::SK_PermuteSingleSrc, MVT::v8f32,  4 }, // 2*vperm2f128 + 2*vshufps
    { TTI::SK_PermuteSingleSrc, MVT::v8i32,  4 }, // 2*vperm2f128 + 2*vshufps
    { TTI::SK_PermuteSingleSrc, MVT::v16i16, 8 }, // vextractf128 + 4*pshufb
                                                  // + 2*por + vinsertf128
    { TTI::SK_PermuteSingleSrc, MVT::v32i8,  8 }, // vextractf128 + 4*pshufb
                                                  // + 2*por + vinsertf128

    { TTI::SK_PermuteTwoSrc,    MVT::v4f64,   4 }, // 2*vperm2f128 + 2*vshufpd
    { TTI::SK_PermuteTwoSrc,    MVT::v8f32,   4 }, // 2*vperm2f128 + 2*vshufps
    { TTI::SK_PermuteTwoSrc,    MVT::v4i64,   4 }, // 2*vperm2f128 + 2*vshufpd
    { TTI::SK_PermuteTwoSrc,    MVT::v8i32,   4 }, // 2*vperm2f128 + 2*vshufps
    { TTI::SK_PermuteTwoSrc,    MVT::v16i16, 15 }, // 2*vextractf128 + 8*pshufb
                                                   // + 4*por + vinsertf128
    { TTI::SK_PermuteTwoSrc,    MVT::v32i8,  15 }, // 2*vextractf128 + 8*pshufb
                                                   // + 4*por + vinsertf128
  };

  if (ST->hasAVX())
    if (const auto *Entry = CostTableLookup(AVX1ShuffleTbl, Kind, LT.second))
      return LT.first * Entry->Cost;

  static const CostTblEntry SSE41ShuffleTbl[] = {
    { TTI::SK_Alternate, MVT::v2i64,  1 }, // pblendw
    { TTI::SK_Alternate, MVT::v2f64,  1 }, // movsd
    { TTI::SK_Alternate, MVT::v4i32,  1 }, // pblendw
    { TTI::SK_Alternate, MVT::v4f32,  1 }, // blendps
    { TTI::SK_Alternate, MVT::v8i16,  1 }, // pblendw
    { TTI::SK_Alternate, MVT::v16i8,  1 }  // pblendvb
  };

  if (ST->hasSSE41())
    if (const auto *Entry = CostTableLookup(SSE41ShuffleTbl, Kind, LT.second))
      return LT.first * Entry->Cost;

  static const CostTblEntry SSSE3ShuffleTbl[] = {
    { TTI::SK_Broadcast, MVT::v8i16,  1 }, // pshufb
    { TTI::SK_Broadcast, MVT::v16i8,  1 }, // pshufb

    { TTI::SK_Reverse,   MVT::v8i16,  1 }, // pshufb
    { TTI::SK_Reverse,   MVT::v16i8,  1 }, // pshufb

    { TTI::SK_Alternate, MVT::v8i16,  3 }, // 2*pshufb + por
    { TTI::SK_Alternate, MVT::v16i8,  3 }, // 2*pshufb + por

    { TTI::SK_PermuteSingleSrc, MVT::v8i16, 1 }, // pshufb
    { TTI::SK_PermuteSingleSrc, MVT::v16i8, 1 }, // pshufb

    { TTI::SK_PermuteTwoSrc,    MVT::v8i16, 3 }, // 2*pshufb + por
    { TTI::SK_PermuteTwoSrc,    MVT::v16i8, 3 }, // 2*pshufb + por
  };

  if (ST->hasSSSE3())
    if (const auto *Entry = CostTableLookup(SSSE3ShuffleTbl, Kind, LT.second))
      return LT.first * Entry->Cost;

  static const CostTblEntry SSE2ShuffleTbl[] = {
    { TTI::SK_Broadcast, MVT::v2f64,  1 }, // shufpd
    { TTI::SK_Broadcast, MVT::v2i64,  1 }, // pshufd
    { TTI::SK_Broadcast, MVT::v4i32,  1 }, // pshufd
    { TTI::SK_Broadcast, MVT::v8i16,  2 }, // pshuflw + pshufd
    { TTI::SK_Broadcast, MVT::v16i8,  3 }, // unpck + pshuflw + pshufd

    { TTI::SK_Reverse,   MVT::v2f64,  1 }, // shufpd
    { TTI::SK_Reverse,   MVT::v2i64,  1 }, // pshufd
    { TTI::SK_Reverse,   MVT::v4i32,  1 }, // pshufd
    { TTI::SK_Reverse,   MVT::v8i16,  3 }, // pshuflw + pshufhw + pshufd
    { TTI::SK_Reverse,   MVT::v16i8,  9 }, // 2*pshuflw + 2*pshufhw
                                           // + 2*pshufd + 2*unpck + packus

    { TTI::SK_Alternate, MVT::v2i64,  1 }, // movsd
    { TTI::SK_Alternate, MVT::v2f64,  1 }, // movsd
    { TTI::SK_Alternate, MVT::v4i32,  2 }, // 2*shufps
    { TTI::SK_Alternate, MVT::v8i16,  3 }, // pand + pandn + por
    { TTI::SK_Alternate, MVT::v16i8,  3 }, // pand + pandn + por

    { TTI::SK_PermuteSingleSrc, MVT::v2f64,  1 }, // shufpd
    { TTI::SK_PermuteSingleSrc, MVT::v2i64,  1 }, // pshufd
    { TTI::SK_PermuteSingleSrc, MVT::v4i32,  1 }, // pshufd
    { TTI::SK_PermuteSingleSrc, MVT::v8i16,  5 }, // 2*pshuflw + 2*pshufhw
                                                  // + pshufd/unpck
    { TTI::SK_PermuteSingleSrc, MVT::v16i8, 10 }, // 2*pshuflw + 2*pshufhw
                                                  // + 2*pshufd + 2*unpck + 2*packus

    { TTI::SK_PermuteTwoSrc,    MVT::v2f64,  1 }, // shufpd
    { TTI::SK_PermuteTwoSrc,    MVT::v2i64,  1 }, // shufpd
    { TTI::SK_PermuteTwoSrc,    MVT::v4i32,  2 }, // 2*{unpck,movsd,pshufd}
    { TTI::SK_PermuteTwoSrc,    MVT::v8i16,  8 }, // blend+permute
    { TTI::SK_PermuteTwoSrc,    MVT::v16i8, 13 }, // blend+permute
  };

  if (ST->hasSSE2())
    if (const auto *Entry = CostTableLookup(SSE2ShuffleTbl, Kind, LT.second))
      return LT.first * Entry->Cost;

  static const CostTblEntry SSE1ShuffleTbl[] = {
    { TTI::SK_Broadcast,        MVT::v4f32, 1 }, // shufps
    { TTI::SK_Reverse,          MVT::v4f32, 1 }, // shufps
    { TTI::SK_Alternate,        MVT::v4f32, 2 }, // 2*shufps
    { TTI::SK_PermuteSingleSrc, MVT::v4f32, 1 }, // shufps
    { TTI::SK_PermuteTwoSrc,    MVT::v4f32, 2 }, // 2*shufps
  };

  if (ST->hasSSE1())
    if (const auto *Entry = CostTableLookup(SSE1ShuffleTbl, Kind, LT.second))
      return LT.first * Entry->Cost;

  return BaseT::getShuffleCost(Kind, Tp, Index, SubTp);
}

int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
                                 const Instruction *I) {
  int ISD = TLI->InstructionOpcodeToISD(Opcode);
  assert(ISD && "Invalid opcode");

  // FIXME: Need a better design of the cost table to handle non-simple types of
  // potential massive combinations (elem_num x src_type x dst_type).

  static const TypeConversionCostTblEntry AVX512DQConversionTbl[] = {
    { ISD::SINT_TO_FP,  MVT::v2f32,  MVT::v2i64,  1 },
    { ISD::SINT_TO_FP,  MVT::v2f64,  MVT::v2i64,  1 },
    { ISD::SINT_TO_FP,  MVT::v4f32,  MVT::v4i64,  1 },
    { ISD::SINT_TO_FP,  MVT::v4f64,  MVT::v4i64,  1 },
    { ISD::SINT_TO_FP,  MVT::v8f32,  MVT::v8i64,  1 },
    { ISD::SINT_TO_FP,  MVT::v8f64,  MVT::v8i64,  1 },

    { ISD::UINT_TO_FP,  MVT::v2f32,  MVT::v2i64,  1 },
    { ISD::UINT_TO_FP,  MVT::v2f64,  MVT::v2i64,  1 },
    { ISD::UINT_TO_FP,  MVT::v4f32,  MVT::v4i64,  1 },
    { ISD::UINT_TO_FP,  MVT::v4f64,  MVT::v4i64,  1 },
    { ISD::UINT_TO_FP,  MVT::v8f32,  MVT::v8i64,  1 },
    { ISD::UINT_TO_FP,  MVT::v8f64,  MVT::v8i64,  1 },

    { ISD::FP_TO_SINT,  MVT::v2i64,  MVT::v2f32,  1 },
    { ISD::FP_TO_SINT,  MVT::v4i64,  MVT::v4f32,  1 },
    { ISD::FP_TO_SINT,  MVT::v8i64,  MVT::v8f32,  1 },
    { ISD::FP_TO_SINT,  MVT::v2i64,  MVT::v2f64,  1 },
    { ISD::FP_TO_SINT,  MVT::v4i64,  MVT::v4f64,  1 },
    { ISD::FP_TO_SINT,  MVT::v8i64,  MVT::v8f64,  1 },

    { ISD::FP_TO_UINT,  MVT::v2i64,  MVT::v2f32,  1 },
    { ISD::FP_TO_UINT,  MVT::v4i64,  MVT::v4f32,  1 },
    { ISD::FP_TO_UINT,  MVT::v8i64,  MVT::v8f32,  1 },
    { ISD::FP_TO_UINT,  MVT::v2i64,  MVT::v2f64,  1 },
    { ISD::FP_TO_UINT,  MVT::v4i64,  MVT::v4f64,  1 },
    { ISD::FP_TO_UINT,  MVT::v8i64,  MVT::v8f64,  1 },
  };

  // TODO: For AVX512DQ + AVX512VL, we also have cheap casts for 128-bit and
  // 256-bit wide vectors.

  static const TypeConversionCostTblEntry AVX512FConversionTbl[] = {
    { ISD::FP_EXTEND, MVT::v8f64,   MVT::v8f32,  1 },
    { ISD::FP_EXTEND, MVT::v8f64,   MVT::v16f32, 3 },
    { ISD::FP_ROUND,  MVT::v8f32,   MVT::v8f64,  1 },

    { ISD::TRUNCATE,  MVT::v16i8,   MVT::v16i32, 1 },
    { ISD::TRUNCATE,  MVT::v16i16,  MVT::v16i32, 1 },
    { ISD::TRUNCATE,  MVT::v8i16,   MVT::v8i64,  1 },
    { ISD::TRUNCATE,  MVT::v8i32,   MVT::v8i64,  1 },

    // v16i1 -> v16i32 - load + broadcast
    { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i1,  2 },
    { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i1,  2 },
    { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8,  1 },
    { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8,  1 },
    { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i16, 1 },
    { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i16, 1 },
    { ISD::ZERO_EXTEND, MVT::v8i64,  MVT::v8i16,  1 },
    { ISD::SIGN_EXTEND, MVT::v8i64,  MVT::v8i16,  1 },
    { ISD::SIGN_EXTEND, MVT::v8i64,  MVT::v8i32,  1 },
    { ISD::ZERO_EXTEND, MVT::v8i64,  MVT::v8i32,  1 },

    { ISD::SINT_TO_FP,  MVT::v8f64,  MVT::v8i1,   4 },
    { ISD::SINT_TO_FP,  MVT::v16f32, MVT::v16i1,  3 },
    { ISD::SINT_TO_FP,  MVT::v8f64,  MVT::v8i8,   2 },
    { ISD::SINT_TO_FP,  MVT::v16f32, MVT::v16i8,  2 },
    { ISD::SINT_TO_FP,  MVT::v8f64,  MVT::v8i16,  2 },
    { ISD::SINT_TO_FP,  MVT::v16f32, MVT::v16i16, 2 },
    { ISD::SINT_TO_FP,  MVT::v16f32, MVT::v16i32, 1 },
    { ISD::SINT_TO_FP,  MVT::v8f64,  MVT::v8i32,  1 },
    { ISD::UINT_TO_FP,  MVT::v8f32,  MVT::v8i64, 26 },
    { ISD::UINT_TO_FP,  MVT::v8f64,  MVT::v8i64, 26 },

    { ISD::UINT_TO_FP,  MVT::v8f64,  MVT::v8i1,   4 },
    { ISD::UINT_TO_FP,  MVT::v16f32, MVT::v16i1,  3 },
    { ISD::UINT_TO_FP,  MVT::v2f64,  MVT::v2i8,   2 },
    { ISD::UINT_TO_FP,  MVT::v4f64,  MVT::v4i8,   2 },
    { ISD::UINT_TO_FP,  MVT::v8f32,  MVT::v8i8,   2 },
1151
5.05k
    { ISD::UINT_TO_FP,  MVT::v8f64,  MVT::v8i8,   2 },
1152
5.05k
    { ISD::UINT_TO_FP,  MVT::v16f32, MVT::v16i8,  2 },
1153
5.05k
    { ISD::UINT_TO_FP,  MVT::v2f64,  MVT::v2i16,  5 },
1154
5.05k
    { ISD::UINT_TO_FP,  MVT::v4f64,  MVT::v4i16,  2 },
1155
5.05k
    { ISD::UINT_TO_FP,  MVT::v8f32,  MVT::v8i16,  2 },
1156
5.05k
    { ISD::UINT_TO_FP,  MVT::v8f64,  MVT::v8i16,  2 },
1157
5.05k
    { ISD::UINT_TO_FP,  MVT::v16f32, MVT::v16i16, 2 },
1158
5.05k
    { ISD::UINT_TO_FP,  MVT::v2f32,  MVT::v2i32,  2 },
1159
5.05k
    { ISD::UINT_TO_FP,  MVT::v2f64,  MVT::v2i32,  1 },
1160
5.05k
    { ISD::UINT_TO_FP,  MVT::v4f32,  MVT::v4i32,  1 },
1161
5.05k
    { ISD::UINT_TO_FP,  MVT::v4f64,  MVT::v4i32,  1 },
1162
5.05k
    { ISD::UINT_TO_FP,  MVT::v8f32,  MVT::v8i32,  1 },
1163
5.05k
    { ISD::UINT_TO_FP,  MVT::v8f64,  MVT::v8i32,  1 },
1164
5.05k
    { ISD::UINT_TO_FP,  MVT::v16f32, MVT::v16i32, 1 },
1165
5.05k
    { ISD::UINT_TO_FP,  MVT::v2f32,  MVT::v2i64,  5 },
1166
5.05k
    { ISD::UINT_TO_FP,  MVT::v2f64,  MVT::v2i64,  5 },
1167
5.05k
    { ISD::UINT_TO_FP,  MVT::v4f64,  MVT::v4i64, 12 },
1168
5.05k
    { ISD::UINT_TO_FP,  MVT::v8f64,  MVT::v8i64, 26 },
1169
5.05k
1170
5.05k
    { ISD::FP_TO_UINT,  MVT::v2i32,  MVT::v2f32,  1 },
1171
5.05k
    { ISD::FP_TO_UINT,  MVT::v4i32,  MVT::v4f32,  1 },
1172
5.05k
    { ISD::FP_TO_UINT,  MVT::v8i32,  MVT::v8f32,  1 },
1173
5.05k
    { ISD::FP_TO_UINT,  MVT::v8i16,  MVT::v8f64,  2 },
1174
5.05k
    { ISD::FP_TO_UINT,  MVT::v8i8,   MVT::v8f64,  2 },
1175
5.05k
    { ISD::FP_TO_UINT,  MVT::v16i32, MVT::v16f32, 1 },
1176
5.05k
    { ISD::FP_TO_UINT,  MVT::v16i16, MVT::v16f32, 2 },
1177
5.05k
    { ISD::FP_TO_UINT,  MVT::v16i8,  MVT::v16f32, 2 },
1178
5.05k
  };
1179
5.05k
1180
5.05k
  static const TypeConversionCostTblEntry AVX2ConversionTbl[] = {
1181
5.05k
    { ISD::SIGN_EXTEND, MVT::v4i64,  MVT::v4i1,   3 },
1182
5.05k
    { ISD::ZERO_EXTEND, MVT::v4i64,  MVT::v4i1,   3 },
1183
5.05k
    { ISD::SIGN_EXTEND, MVT::v8i32,  MVT::v8i1,   3 },
1184
5.05k
    { ISD::ZERO_EXTEND, MVT::v8i32,  MVT::v8i1,   3 },
1185
5.05k
    { ISD::SIGN_EXTEND, MVT::v4i64,  MVT::v4i8,   3 },
1186
5.05k
    { ISD::ZERO_EXTEND, MVT::v4i64,  MVT::v4i8,   3 },
1187
5.05k
    { ISD::SIGN_EXTEND, MVT::v8i32,  MVT::v8i8,   3 },
1188
5.05k
    { ISD::ZERO_EXTEND, MVT::v8i32,  MVT::v8i8,   3 },
1189
5.05k
    { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8,  1 },
1190
5.05k
    { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8,  1 },
1191
5.05k
    { ISD::SIGN_EXTEND, MVT::v4i64,  MVT::v4i16,  3 },
1192
5.05k
    { ISD::ZERO_EXTEND, MVT::v4i64,  MVT::v4i16,  3 },
1193
5.05k
    { ISD::SIGN_EXTEND, MVT::v8i32,  MVT::v8i16,  1 },
1194
5.05k
    { ISD::ZERO_EXTEND, MVT::v8i32,  MVT::v8i16,  1 },
1195
5.05k
    { ISD::SIGN_EXTEND, MVT::v4i64,  MVT::v4i32,  1 },
1196
5.05k
    { ISD::ZERO_EXTEND, MVT::v4i64,  MVT::v4i32,  1 },
1197
5.05k
1198
5.05k
    { ISD::TRUNCATE,    MVT::v4i8,   MVT::v4i64,  2 },
1199
5.05k
    { ISD::TRUNCATE,    MVT::v4i16,  MVT::v4i64,  2 },
1200
5.05k
    { ISD::TRUNCATE,    MVT::v4i32,  MVT::v4i64,  2 },
1201
5.05k
    { ISD::TRUNCATE,    MVT::v8i8,   MVT::v8i32,  2 },
1202
5.05k
    { ISD::TRUNCATE,    MVT::v8i16,  MVT::v8i32,  2 },
1203
5.05k
    { ISD::TRUNCATE,    MVT::v8i32,  MVT::v8i64,  4 },
1204
5.05k
1205
5.05k
    { ISD::FP_EXTEND,   MVT::v8f64,  MVT::v8f32,  3 },
1206
5.05k
    { ISD::FP_ROUND,    MVT::v8f32,  MVT::v8f64,  3 },
1207
5.05k
1208
5.05k
    { ISD::UINT_TO_FP,  MVT::v8f32,  MVT::v8i32,  8 },
1209
5.05k
  };
1210
5.05k
1211
5.05k
  static const TypeConversionCostTblEntry AVXConversionTbl[] = {
1212
5.05k
    { ISD::SIGN_EXTEND, MVT::v4i64,  MVT::v4i1,  6 },
1213
5.05k
    { ISD::ZERO_EXTEND, MVT::v4i64,  MVT::v4i1,  4 },
1214
5.05k
    { ISD::SIGN_EXTEND, MVT::v8i32,  MVT::v8i1,  7 },
1215
5.05k
    { ISD::ZERO_EXTEND, MVT::v8i32,  MVT::v8i1,  4 },
1216
5.05k
    { ISD::SIGN_EXTEND, MVT::v4i64,  MVT::v4i8,  6 },
1217
5.05k
    { ISD::ZERO_EXTEND, MVT::v4i64,  MVT::v4i8,  4 },
1218
5.05k
    { ISD::SIGN_EXTEND, MVT::v8i32,  MVT::v8i8,  7 },
1219
5.05k
    { ISD::ZERO_EXTEND, MVT::v8i32,  MVT::v8i8,  4 },
1220
5.05k
    { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 4 },
1221
5.05k
    { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 4 },
1222
5.05k
    { ISD::SIGN_EXTEND, MVT::v4i64,  MVT::v4i16, 6 },
1223
5.05k
    { ISD::ZERO_EXTEND, MVT::v4i64,  MVT::v4i16, 3 },
1224
5.05k
    { ISD::SIGN_EXTEND, MVT::v8i32,  MVT::v8i16, 4 },
1225
5.05k
    { ISD::ZERO_EXTEND, MVT::v8i32,  MVT::v8i16, 4 },
1226
5.05k
    { ISD::SIGN_EXTEND, MVT::v4i64,  MVT::v4i32, 4 },
1227
5.05k
    { ISD::ZERO_EXTEND, MVT::v4i64,  MVT::v4i32, 4 },
1228
5.05k
1229
5.05k
    { ISD::TRUNCATE,    MVT::v16i8, MVT::v16i16, 4 },
1230
5.05k
    { ISD::TRUNCATE,    MVT::v8i8,  MVT::v8i32,  4 },
1231
5.05k
    { ISD::TRUNCATE,    MVT::v8i16, MVT::v8i32,  5 },
1232
5.05k
    { ISD::TRUNCATE,    MVT::v4i8,  MVT::v4i64,  4 },
1233
5.05k
    { ISD::TRUNCATE,    MVT::v4i16, MVT::v4i64,  4 },
1234
5.05k
    { ISD::TRUNCATE,    MVT::v4i32, MVT::v4i64,  4 },
1235
5.05k
    { ISD::TRUNCATE,    MVT::v8i32, MVT::v8i64,  9 },
1236
5.05k
1237
5.05k
    { ISD::SINT_TO_FP,  MVT::v4f32, MVT::v4i1,  3 },
1238
5.05k
    { ISD::SINT_TO_FP,  MVT::v4f64, MVT::v4i1,  3 },
1239
5.05k
    { ISD::SINT_TO_FP,  MVT::v8f32, MVT::v8i1,  8 },
1240
5.05k
    { ISD::SINT_TO_FP,  MVT::v4f32, MVT::v4i8,  3 },
1241
5.05k
    { ISD::SINT_TO_FP,  MVT::v4f64, MVT::v4i8,  3 },
1242
5.05k
    { ISD::SINT_TO_FP,  MVT::v8f32, MVT::v8i8,  8 },
1243
5.05k
    { ISD::SINT_TO_FP,  MVT::v4f32, MVT::v4i16, 3 },
1244
5.05k
    { ISD::SINT_TO_FP,  MVT::v4f64, MVT::v4i16, 3 },
1245
5.05k
    { ISD::SINT_TO_FP,  MVT::v8f32, MVT::v8i16, 5 },
1246
5.05k
    { ISD::SINT_TO_FP,  MVT::v4f32, MVT::v4i32, 1 },
1247
5.05k
    { ISD::SINT_TO_FP,  MVT::v4f64, MVT::v4i32, 1 },
1248
5.05k
    { ISD::SINT_TO_FP,  MVT::v8f32, MVT::v8i32, 1 },
1249
5.05k
1250
5.05k
    { ISD::UINT_TO_FP,  MVT::v4f32, MVT::v4i1,  7 },
1251
5.05k
    { ISD::UINT_TO_FP,  MVT::v4f64, MVT::v4i1,  7 },
1252
5.05k
    { ISD::UINT_TO_FP,  MVT::v8f32, MVT::v8i1,  6 },
1253
5.05k
    { ISD::UINT_TO_FP,  MVT::v4f32, MVT::v4i8,  2 },
1254
5.05k
    { ISD::UINT_TO_FP,  MVT::v4f64, MVT::v4i8,  2 },
1255
5.05k
    { ISD::UINT_TO_FP,  MVT::v8f32, MVT::v8i8,  5 },
1256
5.05k
    { ISD::UINT_TO_FP,  MVT::v4f32, MVT::v4i16, 2 },
1257
5.05k
    { ISD::UINT_TO_FP,  MVT::v4f64, MVT::v4i16, 2 },
1258
5.05k
    { ISD::UINT_TO_FP,  MVT::v8f32, MVT::v8i16, 5 },
1259
5.05k
    { ISD::UINT_TO_FP,  MVT::v2f64, MVT::v2i32, 6 },
1260
5.05k
    { ISD::UINT_TO_FP,  MVT::v4f32, MVT::v4i32, 6 },
1261
5.05k
    { ISD::UINT_TO_FP,  MVT::v4f64, MVT::v4i32, 6 },
1262
5.05k
    { ISD::UINT_TO_FP,  MVT::v8f32, MVT::v8i32, 9 },
1263
5.05k
    // The generic code to compute the scalar overhead is currently broken.
1264
5.05k
    // Workaround this limitation by estimating the scalarization overhead
1265
5.05k
    // here. We have roughly 10 instructions per scalar element.
1266
5.05k
    // Multiply that by the vector width.
1267
5.05k
    // FIXME: remove that when PR19268 is fixed.
1268
5.05k
    { ISD::UINT_TO_FP,  MVT::v2f64, MVT::v2i64, 10 },
1269
5.05k
    { ISD::UINT_TO_FP,  MVT::v4f64, MVT::v4i64, 20 },
1270
5.05k
    { ISD::SINT_TO_FP,  MVT::v4f64, MVT::v4i64, 13 },
1272
5.05k
1273
5.05k
    { ISD::FP_TO_SINT,  MVT::v4i8,  MVT::v4f32, 1 },
1274
5.05k
    { ISD::FP_TO_SINT,  MVT::v8i8,  MVT::v8f32, 7 },
1275
5.05k
    // This node is expanded into scalarized operations but BasicTTI is overly
1276
5.05k
    // optimistic estimating its cost.  It computes 3 per element (one
1277
5.05k
    // vector-extract, one scalar conversion and one vector-insert).  The
1278
5.05k
    // problem is that the inserts form a read-modify-write chain so latency
1279
5.05k
    // should be factored in too.  Inflating the cost per element by 1.
1280
5.05k
    { ISD::FP_TO_UINT,  MVT::v8i32, MVT::v8f32, 8*4 },
1281
5.05k
    { ISD::FP_TO_UINT,  MVT::v4i32, MVT::v4f64, 4*4 },
1282
5.05k
1283
5.05k
    { ISD::FP_EXTEND,   MVT::v4f64,  MVT::v4f32,  1 },
1284
5.05k
    { ISD::FP_ROUND,    MVT::v4f32,  MVT::v4f64,  1 },
1285
5.05k
  };
1286
5.05k
1287
5.05k
  static const TypeConversionCostTblEntry SSE41ConversionTbl[] = {
1288
5.05k
    { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i8,    2 },
1289
5.05k
    { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i8,    2 },
1290
5.05k
    { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i16,   2 },
1291
5.05k
    { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16,   2 },
1292
5.05k
    { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32,   2 },
1293
5.05k
    { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32,   2 },
1294
5.05k
1295
5.05k
    { ISD::ZERO_EXTEND, MVT::v4i16,  MVT::v4i8,   1 },
1296
5.05k
    { ISD::SIGN_EXTEND, MVT::v4i16,  MVT::v4i8,   2 },
1297
5.05k
    { ISD::ZERO_EXTEND, MVT::v4i32,  MVT::v4i8,   1 },
1298
5.05k
    { ISD::SIGN_EXTEND, MVT::v4i32,  MVT::v4i8,   1 },
1299
5.05k
    { ISD::ZERO_EXTEND, MVT::v8i16,  MVT::v8i8,   1 },
1300
5.05k
    { ISD::SIGN_EXTEND, MVT::v8i16,  MVT::v8i8,   1 },
1301
5.05k
    { ISD::ZERO_EXTEND, MVT::v8i32,  MVT::v8i8,   2 },
1302
5.05k
    { ISD::SIGN_EXTEND, MVT::v8i32,  MVT::v8i8,   2 },
1303
5.05k
    { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8,  2 },
1304
5.05k
    { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8,  2 },
1305
5.05k
    { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8,  4 },
1306
5.05k
    { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8,  4 },
1307
5.05k
    { ISD::ZERO_EXTEND, MVT::v4i32,  MVT::v4i16,  1 },
1308
5.05k
    { ISD::SIGN_EXTEND, MVT::v4i32,  MVT::v4i16,  1 },
1309
5.05k
    { ISD::ZERO_EXTEND, MVT::v8i32,  MVT::v8i16,  2 },
1310
5.05k
    { ISD::SIGN_EXTEND, MVT::v8i32,  MVT::v8i16,  2 },
1311
5.05k
    { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i16, 4 },
1312
5.05k
    { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i16, 4 },
1313
5.05k
1314
5.05k
    { ISD::TRUNCATE,    MVT::v4i8,   MVT::v4i16,  2 },
1315
5.05k
    { ISD::TRUNCATE,    MVT::v8i8,   MVT::v8i16,  1 },
1316
5.05k
    { ISD::TRUNCATE,    MVT::v4i8,   MVT::v4i32,  1 },
1317
5.05k
    { ISD::TRUNCATE,    MVT::v4i16,  MVT::v4i32,  1 },
1318
5.05k
    { ISD::TRUNCATE,    MVT::v8i8,   MVT::v8i32,  3 },
1319
5.05k
    { ISD::TRUNCATE,    MVT::v8i16,  MVT::v8i32,  3 },
1320
5.05k
    { ISD::TRUNCATE,    MVT::v16i16, MVT::v16i32, 6 },
1321
5.05k
1322
5.05k
  };
1323
5.05k
1324
5.05k
  static const TypeConversionCostTblEntry SSE2ConversionTbl[] = {
1325
5.05k
    // These are somewhat magic numbers justified by looking at the output of
1326
5.05k
    // Intel's IACA, running some kernels, and making sure that, when we take
1327
5.05k
    // legalization into account, the throughput will be overestimated.
1328
5.05k
    { ISD::SINT_TO_FP, MVT::v4f32, MVT::v16i8, 8 },
1329
5.05k
    { ISD::SINT_TO_FP, MVT::v2f64, MVT::v16i8, 16*10 },
1330
5.05k
    { ISD::SINT_TO_FP, MVT::v4f32, MVT::v8i16, 15 },
1331
5.05k
    { ISD::SINT_TO_FP, MVT::v2f64, MVT::v8i16, 8*10 },
1332
5.05k
    { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, 5 },
1333
5.05k
    { ISD::SINT_TO_FP, MVT::v2f64, MVT::v4i32, 4*10 },
1334
5.05k
    { ISD::SINT_TO_FP, MVT::v4f32, MVT::v2i64, 15 },
1335
5.05k
    { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i64, 2*10 },
1336
5.05k
1337
5.05k
    { ISD::UINT_TO_FP, MVT::v2f64, MVT::v16i8, 16*10 },
1338
5.05k
    { ISD::UINT_TO_FP, MVT::v4f32, MVT::v16i8, 8 },
1339
5.05k
    { ISD::UINT_TO_FP, MVT::v4f32, MVT::v8i16, 15 },
1340
5.05k
    { ISD::UINT_TO_FP, MVT::v2f64, MVT::v8i16, 8*10 },
1341
5.05k
    { ISD::UINT_TO_FP, MVT::v2f64, MVT::v4i32, 4*10 },
1342
5.05k
    { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 8 },
1343
5.05k
    { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 2*10 },
1344
5.05k
    { ISD::UINT_TO_FP, MVT::v4f32, MVT::v2i64, 15 },
1345
5.05k
1346
5.05k
    { ISD::FP_TO_SINT,  MVT::v2i32,  MVT::v2f64,  3 },
1347
5.05k
1348
5.05k
    { ISD::ZERO_EXTEND, MVT::v4i16,  MVT::v4i8,   1 },
1349
5.05k
    { ISD::SIGN_EXTEND, MVT::v4i16,  MVT::v4i8,   6 },
1350
5.05k
    { ISD::ZERO_EXTEND, MVT::v4i32,  MVT::v4i8,   2 },
1351
5.05k
    { ISD::SIGN_EXTEND, MVT::v4i32,  MVT::v4i8,   3 },
1352
5.05k
    { ISD::ZERO_EXTEND, MVT::v4i64,  MVT::v4i8,   4 },
1353
5.05k
    { ISD::SIGN_EXTEND, MVT::v4i64,  MVT::v4i8,   8 },
1354
5.05k
    { ISD::ZERO_EXTEND, MVT::v8i16,  MVT::v8i8,   1 },
1355
5.05k
    { ISD::SIGN_EXTEND, MVT::v8i16,  MVT::v8i8,   2 },
1356
5.05k
    { ISD::ZERO_EXTEND, MVT::v8i32,  MVT::v8i8,   6 },
1357
5.05k
    { ISD::SIGN_EXTEND, MVT::v8i32,  MVT::v8i8,   6 },
1358
5.05k
    { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8,  3 },
1359
5.05k
    { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8,  4 },
1360
5.05k
    { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8,  9 },
1361
5.05k
    { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8,  12 },
1362
5.05k
    { ISD::ZERO_EXTEND, MVT::v4i32,  MVT::v4i16,  1 },
1363
5.05k
    { ISD::SIGN_EXTEND, MVT::v4i32,  MVT::v4i16,  2 },
1364
5.05k
    { ISD::ZERO_EXTEND, MVT::v4i64,  MVT::v4i16,  3 },
1365
5.05k
    { ISD::SIGN_EXTEND, MVT::v4i64,  MVT::v4i16,  10 },
1366
5.05k
    { ISD::ZERO_EXTEND, MVT::v8i32,  MVT::v8i16,  3 },
1367
5.05k
    { ISD::SIGN_EXTEND, MVT::v8i32,  MVT::v8i16,  4 },
1368
5.05k
    { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i16, 6 },
1369
5.05k
    { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i16, 8 },
1370
5.05k
    { ISD::ZERO_EXTEND, MVT::v4i64,  MVT::v4i32,  3 },
1371
5.05k
    { ISD::SIGN_EXTEND, MVT::v4i64,  MVT::v4i32,  5 },
1372
5.05k
1373
5.05k
    { ISD::TRUNCATE,    MVT::v4i8,   MVT::v4i16,  4 },
1374
5.05k
    { ISD::TRUNCATE,    MVT::v8i8,   MVT::v8i16,  2 },
1375
5.05k
    { ISD::TRUNCATE,    MVT::v16i8,  MVT::v16i16, 3 },
1376
5.05k
    { ISD::TRUNCATE,    MVT::v4i8,   MVT::v4i32,  3 },
1377
5.05k
    { ISD::TRUNCATE,    MVT::v4i16,  MVT::v4i32,  3 },
1378
5.05k
    { ISD::TRUNCATE,    MVT::v8i8,   MVT::v8i32,  4 },
1379
5.05k
    { ISD::TRUNCATE,    MVT::v16i8,  MVT::v16i32, 7 },
1380
5.05k
    { ISD::TRUNCATE,    MVT::v8i16,  MVT::v8i32,  5 },
1381
5.05k
    { ISD::TRUNCATE,    MVT::v16i16, MVT::v16i32, 10 },
1382
5.05k
  };
1383
5.05k
1384
5.05k
  std::pair<int, MVT> LTSrc = TLI->getTypeLegalizationCost(DL, Src);
1385
5.05k
  std::pair<int, MVT> LTDest = TLI->getTypeLegalizationCost(DL, Dst);
1386
5.05k
1387
5.05k
  if (ST->hasSSE2() && !ST->hasAVX()) {
1388
2.20k
    if (const auto *Entry = ConvertCostTableLookup(SSE2ConversionTbl, ISD,
1389
2.20k
                                                   LTDest.second, LTSrc.second))
1390
250
      return LTSrc.first * Entry->Cost;
1391
4.80k
  }
1392
4.80k
1393
4.80k
  EVT SrcTy = TLI->getValueType(DL, Src);
1394
4.80k
  EVT DstTy = TLI->getValueType(DL, Dst);
1395
4.80k
1396
4.80k
  // The function getSimpleVT only handles simple value types.
1397
4.80k
  if (!SrcTy.isSimple() || !DstTy.isSimple())
1398
75
    return BaseT::getCastInstrCost(Opcode, Dst, Src);
1399
4.73k
1400
4.73k
  if (ST->hasDQI())
1401
280
    if (const auto *Entry = ConvertCostTableLookup(AVX512DQConversionTbl, ISD,
1402
280
                                                   DstTy.getSimpleVT(),
1403
280
                                                   SrcTy.getSimpleVT()))
1404
42
      return Entry->Cost;
1405
4.68k
1406
4.68k
  if (ST->hasAVX512())
1407
552
    if (const auto *Entry = ConvertCostTableLookup(AVX512FConversionTbl, ISD,
1408
552
                                                   DstTy.getSimpleVT(),
1409
552
                                                   SrcTy.getSimpleVT()))
1410
126
      return Entry->Cost;
1411
4.56k
1412
4.56k
  if (ST->hasAVX2()) {
1413
1.44k
    if (const auto *Entry = ConvertCostTableLookup(AVX2ConversionTbl, ISD,
1414
1.44k
                                                   DstTy.getSimpleVT(),
1415
1.44k
                                                   SrcTy.getSimpleVT()))
1416
95
      return Entry->Cost;
1417
4.46k
  }
1418
4.46k
1419
4.46k
  if (ST->hasAVX()) {
1420
2.56k
    if (const auto *Entry = ConvertCostTableLookup(AVXConversionTbl, ISD,
1421
2.56k
                                                   DstTy.getSimpleVT(),
1422
2.56k
                                                   SrcTy.getSimpleVT()))
1423
474
      return Entry->Cost;
1424
3.99k
  }
1425
3.99k
1426
3.99k
  if (ST->hasSSE41()) {
1427
2.35k
    if (const auto *Entry = ConvertCostTableLookup(SSE41ConversionTbl, ISD,
1428
2.35k
                                                   DstTy.getSimpleVT(),
1429
2.35k
                                                   SrcTy.getSimpleVT()))
1430
84
      return Entry->Cost;
1431
3.90k
  }
1432
3.90k
1433
3.90k
  if (ST->hasSSE2()) {
1434
3.90k
    if (const auto *Entry = ConvertCostTableLookup(SSE2ConversionTbl, ISD,
1435
3.90k
                                                   DstTy.getSimpleVT(),
1436
3.90k
                                                   SrcTy.getSimpleVT()))
1437
141
      return Entry->Cost;
1438
3.76k
  }
1439
3.76k
1440
3.76k
  return BaseT::getCastInstrCost(Opcode, Dst, Src);
1441
3.76k
}
1442
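// Worked example (a sketch, assuming an AVX2 subtarget without AVX-512): a
// sitofp from <4 x i64> to <4 x double> misses the AVX2 table, falls through
// to the AVX table entry { SINT_TO_FP, v4f64, v4i64, 13 }, and so costs 13.
// With AVX512DQ the same cast hits { SINT_TO_FP, v4f64, v4i64, 1 } instead.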
1443
int X86TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy,
1444
11.2k
                                   const Instruction *I) {
1445
11.2k
  // Legalize the type.
1446
11.2k
  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy);
1447
11.2k
1448
11.2k
  MVT MTy = LT.second;
1449
11.2k
1450
11.2k
  int ISD = TLI->InstructionOpcodeToISD(Opcode);
1451
11.2k
  assert(ISD && "Invalid opcode");
1452
11.2k
1453
11.2k
  static const CostTblEntry SSE2CostTbl[] = {
1454
11.2k
    { ISD::SETCC,   MVT::v2i64,   8 },
1455
11.2k
    { ISD::SETCC,   MVT::v4i32,   1 },
1456
11.2k
    { ISD::SETCC,   MVT::v8i16,   1 },
1457
11.2k
    { ISD::SETCC,   MVT::v16i8,   1 },
1458
11.2k
  };
1459
11.2k
1460
11.2k
  static const CostTblEntry SSE42CostTbl[] = {
1461
11.2k
    { ISD::SETCC,   MVT::v2f64,   1 },
1462
11.2k
    { ISD::SETCC,   MVT::v4f32,   1 },
1463
11.2k
    { ISD::SETCC,   MVT::v2i64,   1 },
1464
11.2k
  };
1465
11.2k
1466
11.2k
  static const CostTblEntry AVX1CostTbl[] = {
1467
11.2k
    { ISD::SETCC,   MVT::v4f64,   1 },
1468
11.2k
    { ISD::SETCC,   MVT::v8f32,   1 },
1469
11.2k
    // AVX1 does not support 8-wide integer compare.
1470
11.2k
    { ISD::SETCC,   MVT::v4i64,   4 },
1471
11.2k
    { ISD::SETCC,   MVT::v8i32,   4 },
1472
11.2k
    { ISD::SETCC,   MVT::v16i16,  4 },
1473
11.2k
    { ISD::SETCC,   MVT::v32i8,   4 },
1474
11.2k
  };
1475
11.2k
1476
11.2k
  static const CostTblEntry AVX2CostTbl[] = {
1477
11.2k
    { ISD::SETCC,   MVT::v4i64,   1 },
1478
11.2k
    { ISD::SETCC,   MVT::v8i32,   1 },
1479
11.2k
    { ISD::SETCC,   MVT::v16i16,  1 },
1480
11.2k
    { ISD::SETCC,   MVT::v32i8,   1 },
1481
11.2k
  };
1482
11.2k
1483
11.2k
  static const CostTblEntry AVX512CostTbl[] = {
1484
11.2k
    { ISD::SETCC,   MVT::v8i64,   1 },
1485
11.2k
    { ISD::SETCC,   MVT::v16i32,  1 },
1486
11.2k
    { ISD::SETCC,   MVT::v8f64,   1 },
1487
11.2k
    { ISD::SETCC,   MVT::v16f32,  1 },
1488
11.2k
  };
1489
11.2k
1490
11.2k
  if (ST->hasAVX512())
1491
292
    if (const auto *Entry = CostTableLookup(AVX512CostTbl, ISD, MTy))
1492
17
      return LT.first * Entry->Cost;
1493
11.2k
1494
11.2k
  if (ST->hasAVX2())
1495
1.04k
    if (const auto *Entry = CostTableLookup(AVX2CostTbl, ISD, MTy))
1496
33
      return LT.first * Entry->Cost;
1497
11.2k
1498
11.2k
  if (ST->hasAVX())
1499
1.23k
    if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy))
1500
24
      return LT.first * Entry->Cost;
1501
11.2k
1502
11.2k
  if (ST->hasSSE42())
1503
1.34k
    if (const auto *Entry = CostTableLookup(SSE42CostTbl, ISD, MTy))
1504
222
      return LT.first * Entry->Cost;
1505
10.9k
1506
10.9k
  if (ST->hasSSE2())
1507
10.9k
    if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy))
1508
530
      return LT.first * Entry->Cost;
1509
10.4k
1510
10.4k
  return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, I);
1511
10.4k
}
1512
1513
8
unsigned X86TTIImpl::getAtomicMemIntrinsicMaxElementSize() const { return 16; }
1514
1515
int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
1516
                                      ArrayRef<Type *> Tys, FastMathFlags FMF,
1517
4.15k
                                      unsigned ScalarizationCostPassed) {
1518
4.15k
  // Costs should match the codegen from:
1519
4.15k
  // BITREVERSE: llvm\test\CodeGen\X86\vector-bitreverse.ll
1520
4.15k
  // BSWAP: llvm\test\CodeGen\X86\bswap-vector.ll
1521
4.15k
  // CTLZ: llvm\test\CodeGen\X86\vector-lzcnt-*.ll
1522
4.15k
  // CTPOP: llvm\test\CodeGen\X86\vector-popcnt-*.ll
1523
4.15k
  // CTTZ: llvm\test\CodeGen\X86\vector-tzcnt-*.ll
1524
4.15k
  static const CostTblEntry AVX512CDCostTbl[] = {
1525
4.15k
    { ISD::CTLZ,       MVT::v8i64,   1 },
1526
4.15k
    { ISD::CTLZ,       MVT::v16i32,  1 },
1527
4.15k
    { ISD::CTLZ,       MVT::v32i16,  8 },
1528
4.15k
    { ISD::CTLZ,       MVT::v64i8,  20 },
1529
4.15k
    { ISD::CTLZ,       MVT::v4i64,   1 },
1530
4.15k
    { ISD::CTLZ,       MVT::v8i32,   1 },
1531
4.15k
    { ISD::CTLZ,       MVT::v16i16,  4 },
1532
4.15k
    { ISD::CTLZ,       MVT::v32i8,  10 },
1533
4.15k
    { ISD::CTLZ,       MVT::v2i64,   1 },
1534
4.15k
    { ISD::CTLZ,       MVT::v4i32,   1 },
1535
4.15k
    { ISD::CTLZ,       MVT::v8i16,   4 },
1536
4.15k
    { ISD::CTLZ,       MVT::v16i8,   4 },
1537
4.15k
  };
1538
4.15k
  static const CostTblEntry AVX512BWCostTbl[] = {
1539
4.15k
    { ISD::BITREVERSE, MVT::v8i64,   5 },
1540
4.15k
    { ISD::BITREVERSE, MVT::v16i32,  5 },
1541
4.15k
    { ISD::BITREVERSE, MVT::v32i16,  5 },
1542
4.15k
    { ISD::BITREVERSE, MVT::v64i8,   5 },
1543
4.15k
    { ISD::CTLZ,       MVT::v8i64,  23 },
1544
4.15k
    { ISD::CTLZ,       MVT::v16i32, 22 },
1545
4.15k
    { ISD::CTLZ,       MVT::v32i16, 18 },
1546
4.15k
    { ISD::CTLZ,       MVT::v64i8,  17 },
1547
4.15k
    { ISD::CTPOP,      MVT::v8i64,   7 },
1548
4.15k
    { ISD::CTPOP,      MVT::v16i32, 11 },
1549
4.15k
    { ISD::CTPOP,      MVT::v32i16,  9 },
1550
4.15k
    { ISD::CTPOP,      MVT::v64i8,   6 },
1551
4.15k
    { ISD::CTTZ,       MVT::v8i64,  10 },
1552
4.15k
    { ISD::CTTZ,       MVT::v16i32, 14 },
1553
4.15k
    { ISD::CTTZ,       MVT::v32i16, 12 },
1554
4.15k
    { ISD::CTTZ,       MVT::v64i8,   9 },
1555
4.15k
  };
1556
4.15k
  static const CostTblEntry AVX512CostTbl[] = {
1557
4.15k
    { ISD::BITREVERSE, MVT::v8i64,  36 },
1558
4.15k
    { ISD::BITREVERSE, MVT::v16i32, 24 },
1559
4.15k
    { ISD::CTLZ,       MVT::v8i64,  29 },
1560
4.15k
    { ISD::CTLZ,       MVT::v16i32, 35 },
1561
4.15k
    { ISD::CTPOP,      MVT::v8i64,  16 },
1562
4.15k
    { ISD::CTPOP,      MVT::v16i32, 24 },
1563
4.15k
    { ISD::CTTZ,       MVT::v8i64,  20 },
1564
4.15k
    { ISD::CTTZ,       MVT::v16i32, 28 },
1565
4.15k
  };
1566
4.15k
  static const CostTblEntry XOPCostTbl[] = {
1567
4.15k
    { ISD::BITREVERSE, MVT::v4i64,   4 },
1568
4.15k
    { ISD::BITREVERSE, MVT::v8i32,   4 },
1569
4.15k
    { ISD::BITREVERSE, MVT::v16i16,  4 },
1570
4.15k
    { ISD::BITREVERSE, MVT::v32i8,   4 },
1571
4.15k
    { ISD::BITREVERSE, MVT::v2i64,   1 },
1572
4.15k
    { ISD::BITREVERSE, MVT::v4i32,   1 },
1573
4.15k
    { ISD::BITREVERSE, MVT::v8i16,   1 },
1574
4.15k
    { ISD::BITREVERSE, MVT::v16i8,   1 },
1575
4.15k
    { ISD::BITREVERSE, MVT::i64,     3 },
1576
4.15k
    { ISD::BITREVERSE, MVT::i32,     3 },
1577
4.15k
    { ISD::BITREVERSE, MVT::i16,     3 },
1578
4.15k
    { ISD::BITREVERSE, MVT::i8,      3 }
1579
4.15k
  };
1580
4.15k
  static const CostTblEntry AVX2CostTbl[] = {
1581
4.15k
    { ISD::BITREVERSE, MVT::v4i64,   5 },
1582
4.15k
    { ISD::BITREVERSE, MVT::v8i32,   5 },
1583
4.15k
    { ISD::BITREVERSE, MVT::v16i16,  5 },
1584
4.15k
    { ISD::BITREVERSE, MVT::v32i8,   5 },
1585
4.15k
    { ISD::BSWAP,      MVT::v4i64,   1 },
1586
4.15k
    { ISD::BSWAP,      MVT::v8i32,   1 },
1587
4.15k
    { ISD::BSWAP,      MVT::v16i16,  1 },
1588
4.15k
    { ISD::CTLZ,       MVT::v4i64,  23 },
1589
4.15k
    { ISD::CTLZ,       MVT::v8i32,  18 },
1590
4.15k
    { ISD::CTLZ,       MVT::v16i16, 14 },
1591
4.15k
    { ISD::CTLZ,       MVT::v32i8,   9 },
1592
4.15k
    { ISD::CTPOP,      MVT::v4i64,   7 },
1593
4.15k
    { ISD::CTPOP,      MVT::v8i32,  11 },
1594
4.15k
    { ISD::CTPOP,      MVT::v16i16,  9 },
1595
4.15k
    { ISD::CTPOP,      MVT::v32i8,   6 },
1596
4.15k
    { ISD::CTTZ,       MVT::v4i64,  10 },
1597
4.15k
    { ISD::CTTZ,       MVT::v8i32,  14 },
1598
4.15k
    { ISD::CTTZ,       MVT::v16i16, 12 },
1599
4.15k
    { ISD::CTTZ,       MVT::v32i8,   9 },
1600
4.15k
    { ISD::FSQRT,      MVT::f32,     7 }, // Haswell from http://www.agner.org/
1601
4.15k
    { ISD::FSQRT,      MVT::v4f32,   7 }, // Haswell from http://www.agner.org/
1602
4.15k
    { ISD::FSQRT,      MVT::v8f32,  14 }, // Haswell from http://www.agner.org/
1603
4.15k
    { ISD::FSQRT,      MVT::f64,    14 }, // Haswell from http://www.agner.org/
1604
4.15k
    { ISD::FSQRT,      MVT::v2f64,  14 }, // Haswell from http://www.agner.org/
1605
4.15k
    { ISD::FSQRT,      MVT::v4f64,  28 }, // Haswell from http://www.agner.org/
1606
4.15k
  };
1607
4.15k
  static const CostTblEntry AVX1CostTbl[] = {
1608
4.15k
    { ISD::BITREVERSE, MVT::v4i64,  12 }, // 2 x 128-bit Op + extract/insert
1609
4.15k
    { ISD::BITREVERSE, MVT::v8i32,  12 }, // 2 x 128-bit Op + extract/insert
1610
4.15k
    { ISD::BITREVERSE, MVT::v16i16, 12 }, // 2 x 128-bit Op + extract/insert
1611
4.15k
    { ISD::BITREVERSE, MVT::v32i8,  12 }, // 2 x 128-bit Op + extract/insert
1612
4.15k
    { ISD::BSWAP,      MVT::v4i64,   4 },
1613
4.15k
    { ISD::BSWAP,      MVT::v8i32,   4 },
1614
4.15k
    { ISD::BSWAP,      MVT::v16i16,  4 },
1615
4.15k
    { ISD::CTLZ,       MVT::v4i64,  48 }, // 2 x 128-bit Op + extract/insert
1616
4.15k
    { ISD::CTLZ,       MVT::v8i32,  38 }, // 2 x 128-bit Op + extract/insert
1617
4.15k
    { ISD::CTLZ,       MVT::v16i16, 30 }, // 2 x 128-bit Op + extract/insert
1618
4.15k
    { ISD::CTLZ,       MVT::v32i8,  20 }, // 2 x 128-bit Op + extract/insert
1619
4.15k
    { ISD::CTPOP,      MVT::v4i64,  16 }, // 2 x 128-bit Op + extract/insert
1620
4.15k
    { ISD::CTPOP,      MVT::v8i32,  24 }, // 2 x 128-bit Op + extract/insert
1621
4.15k
    { ISD::CTPOP,      MVT::v16i16, 20 }, // 2 x 128-bit Op + extract/insert
1622
4.15k
    { ISD::CTPOP,      MVT::v32i8,  14 }, // 2 x 128-bit Op + extract/insert
1623
4.15k
    { ISD::CTTZ,       MVT::v4i64,  22 }, // 2 x 128-bit Op + extract/insert
1624
4.15k
    { ISD::CTTZ,       MVT::v8i32,  30 }, // 2 x 128-bit Op + extract/insert
1625
4.15k
    { ISD::CTTZ,       MVT::v16i16, 26 }, // 2 x 128-bit Op + extract/insert
1626
4.15k
    { ISD::CTTZ,       MVT::v32i8,  20 }, // 2 x 128-bit Op + extract/insert
1627
4.15k
    { ISD::FSQRT,      MVT::f32,    14 }, // SNB from http://www.agner.org/
1628
4.15k
    { ISD::FSQRT,      MVT::v4f32,  14 }, // SNB from http://www.agner.org/
1629
4.15k
    { ISD::FSQRT,      MVT::v8f32,  28 }, // SNB from http://www.agner.org/
1630
4.15k
    { ISD::FSQRT,      MVT::f64,    21 }, // SNB from http://www.agner.org/
1631
4.15k
    { ISD::FSQRT,      MVT::v2f64,  21 }, // SNB from http://www.agner.org/
1632
4.15k
    { ISD::FSQRT,      MVT::v4f64,  43 }, // SNB from http://www.agner.org/
1633
4.15k
  };
1634
4.15k
  static const CostTblEntry SSE42CostTbl[] = {
1635
4.15k
    { ISD::FSQRT,      MVT::f32,    18 }, // Nehalem from http://www.agner.org/
1636
4.15k
    { ISD::FSQRT,      MVT::v4f32,  18 }, // Nehalem from http://www.agner.org/
1637
4.15k
  };
1638
4.15k
  static const CostTblEntry SSSE3CostTbl[] = {
1639
4.15k
    { ISD::BITREVERSE, MVT::v2i64,   5 },
1640
4.15k
    { ISD::BITREVERSE, MVT::v4i32,   5 },
1641
4.15k
    { ISD::BITREVERSE, MVT::v8i16,   5 },
1642
4.15k
    { ISD::BITREVERSE, MVT::v16i8,   5 },
1643
4.15k
    { ISD::BSWAP,      MVT::v2i64,   1 },
1644
4.15k
    { ISD::BSWAP,      MVT::v4i32,   1 },
1645
4.15k
    { ISD::BSWAP,      MVT::v8i16,   1 },
1646
4.15k
    { ISD::CTLZ,       MVT::v2i64,  23 },
1647
4.15k
    { ISD::CTLZ,       MVT::v4i32,  18 },
1648
4.15k
    { ISD::CTLZ,       MVT::v8i16,  14 },
1649
4.15k
    { ISD::CTLZ,       MVT::v16i8,   9 },
1650
4.15k
    { ISD::CTPOP,      MVT::v2i64,   7 },
1651
4.15k
    { ISD::CTPOP,      MVT::v4i32,  11 },
1652
4.15k
    { ISD::CTPOP,      MVT::v8i16,   9 },
1653
4.15k
    { ISD::CTPOP,      MVT::v16i8,   6 },
1654
4.15k
    { ISD::CTTZ,       MVT::v2i64,  10 },
1655
4.15k
    { ISD::CTTZ,       MVT::v4i32,  14 },
1656
4.15k
    { ISD::CTTZ,       MVT::v8i16,  12 },
1657
4.15k
    { ISD::CTTZ,       MVT::v16i8,   9 }
1658
4.15k
  };
1659
4.15k
  static const CostTblEntry SSE2CostTbl[] = {
1660
4.15k
    { ISD::BITREVERSE, MVT::v2i64,  29 },
1661
4.15k
    { ISD::BITREVERSE, MVT::v4i32,  27 },
1662
4.15k
    { ISD::BITREVERSE, MVT::v8i16,  27 },
1663
4.15k
    { ISD::BITREVERSE, MVT::v16i8,  20 },
1664
4.15k
    { ISD::BSWAP,      MVT::v2i64,   7 },
1665
4.15k
    { ISD::BSWAP,      MVT::v4i32,   7 },
1666
4.15k
    { ISD::BSWAP,      MVT::v8i16,   7 },
1667
4.15k
    { ISD::CTLZ,       MVT::v2i64,  25 },
1668
4.15k
    { ISD::CTLZ,       MVT::v4i32,  26 },
1669
4.15k
    { ISD::CTLZ,       MVT::v8i16,  20 },
1670
4.15k
    { ISD::CTLZ,       MVT::v16i8,  17 },
1671
4.15k
    { ISD::CTPOP,      MVT::v2i64,  12 },
1672
4.15k
    { ISD::CTPOP,      MVT::v4i32,  15 },
1673
4.15k
    { ISD::CTPOP,      MVT::v8i16,  13 },
1674
4.15k
    { ISD::CTPOP,      MVT::v16i8,  10 },
1675
4.15k
    { ISD::CTTZ,       MVT::v2i64,  14 },
1676
4.15k
    { ISD::CTTZ,       MVT::v4i32,  18 },
1677
4.15k
    { ISD::CTTZ,       MVT::v8i16,  16 },
1678
4.15k
    { ISD::CTTZ,       MVT::v16i8,  13 },
1679
4.15k
    { ISD::FSQRT,      MVT::f64,    32 }, // Nehalem from http://www.agner.org/
1680
4.15k
    { ISD::FSQRT,      MVT::v2f64,  32 }, // Nehalem from http://www.agner.org/
1681
4.15k
  };
1682
4.15k
  static const CostTblEntry SSE1CostTbl[] = {
1683
4.15k
    { ISD::FSQRT,      MVT::f32,    28 }, // Pentium III from http://www.agner.org/
1684
4.15k
    { ISD::FSQRT,      MVT::v4f32,  56 }, // Pentium III from http://www.agner.org/
1685
4.15k
  };
1686
4.15k
  static const CostTblEntry X64CostTbl[] = { // 64-bit targets
1687
4.15k
    { ISD::BITREVERSE, MVT::i64,    14 }
1688
4.15k
  };
1689
4.15k
  static const CostTblEntry X86CostTbl[] = { // 32 or 64-bit targets
1690
4.15k
    { ISD::BITREVERSE, MVT::i32,    14 },
1691
4.15k
    { ISD::BITREVERSE, MVT::i16,    14 },
1692
4.15k
    { ISD::BITREVERSE, MVT::i8,     11 }
1693
4.15k
  };
1694
4.15k
1695
4.15k
  unsigned ISD = ISD::DELETED_NODE;
1696
4.15k
  switch (IID) {
1697
2.22k
  default:
1698
2.22k
    break;
1699
320
  case Intrinsic::bitreverse:
1700
320
    ISD = ISD::BITREVERSE;
1701
320
    break;
1702
86
  case Intrinsic::bswap:
1703
86
    ISD = ISD::BSWAP;
1704
86
    break;
1705
642
  case Intrinsic::ctlz:
1706
642
    ISD = ISD::CTLZ;
1707
642
    break;
1708
238
  case Intrinsic::ctpop:
1709
238
    ISD = ISD::CTPOP;
1710
238
    break;
1711
506
  case Intrinsic::cttz:
1712
506
    ISD = ISD::CTTZ;
1713
506
    break;
1714
138
  case Intrinsic::sqrt:
1715
138
    ISD = ISD::FSQRT;
1716
138
    break;
1717
4.15k
  }
1718
4.15k
1719
4.15k
  // Legalize the type.
1720
4.15k
  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, RetTy);
1721
4.15k
  MVT MTy = LT.second;
1722
4.15k
1723
4.15k
  // Attempt to lookup cost.
1724
4.15k
  if (ST->hasCDI())
1725
316
    if (const auto *Entry = CostTableLookup(AVX512CDCostTbl, ISD, MTy))
1726
24
      return LT.first * Entry->Cost;
1727
4.12k
1728
4.12k
  if (ST->hasBWI())
1729
268
    if (const auto *Entry = CostTableLookup(AVX512BWCostTbl, ISD, MTy))
1730
28
      return LT.first * Entry->Cost;
1731
4.10k
1732
4.10k
  if (ST->hasAVX512())
1733
392
    if (const auto *Entry = CostTableLookup(AVX512CostTbl, ISD, MTy))
1734
14
      return LT.first * Entry->Cost;
1735
4.08k
1736
4.08k
  if (ST->hasXOP())
1737
304
    if (const auto *Entry = CostTableLookup(XOPCostTbl, ISD, MTy))
1738
68
      return LT.first * Entry->Cost;
1739
4.01k
1740
4.01k
  if (ST->hasAVX2())
1741
989
    if (const auto *Entry = CostTableLookup(AVX2CostTbl, ISD, MTy))
1742
239
      return LT.first * Entry->Cost;
1743
3.77k
1744
3.77k
  if (ST->hasAVX())
1745
1.55k
    if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy))
1746
165
      return LT.first * Entry->Cost;
1747
3.61k
1748
3.61k
  if (ST->hasSSE42())
1749
1.84k
    if (const auto *Entry = CostTableLookup(SSE42CostTbl, ISD, MTy))
1750
4
      return LT.first * Entry->Cost;
1751
3.61k
1752
3.61k
  if (ST->hasSSSE3())
1753
2.22k
    if (const auto *Entry = CostTableLookup(SSSE3CostTbl, ISD, MTy))
1754
517
      return LT.first * Entry->Cost;
1755
3.09k
1756
3.09k
  if (ST->hasSSE2())
1757
3.09k
    if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy))
1758
223
      return LT.first * Entry->Cost;
1759
2.87k
1760
2.87k
  if (ST->hasSSE1())
1761
2.87k
    if (const auto *Entry = CostTableLookup(SSE1CostTbl, ISD, MTy))
1762
18
      return LT.first * Entry->Cost;
1763
2.85k
1764
2.85k
  if (ST->is64Bit())
1765
2.60k
    if (const auto *Entry = CostTableLookup(X64CostTbl, ISD, MTy))
1766
13
      return LT.first * Entry->Cost;
1767
2.83k
1768
2.83k
  if (const auto *Entry = CostTableLookup(X86CostTbl, ISD, MTy))
1769
65
    return LT.first * Entry->Cost;
1770
2.77k
1771
2.77k
  return BaseT::getIntrinsicInstrCost(IID, RetTy, Tys, FMF, ScalarizationCostPassed);
1772
2.77k
}
1773
1774
int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
1775
2.57k
                     ArrayRef<Value *> Args, FastMathFlags FMF, unsigned VF) {
1776
2.57k
  return BaseT::getIntrinsicInstrCost(IID, RetTy, Args, FMF, VF);
1777
2.57k
}
1778
1779
89.6k
int X86TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index) {
1780
89.6k
  assert(Val->isVectorTy() && "This must be a vector type");
1781
89.6k
1782
89.6k
  Type *ScalarType = Val->getScalarType();
1783
89.6k
1784
89.6k
  if (Index != -1U) {
1785
89.6k
    // Legalize the type.
1786
89.6k
    std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Val);
1787
89.6k
1788
89.6k
    // This type is legalized to a scalar type.
1789
89.6k
    if (!LT.second.isVector())
1790
921
      return 0;
1791
88.7k
1792
88.7k
    // The type may be split. Normalize the index to the new type.
1793
88.7k
    unsigned Width = LT.second.getVectorNumElements();
1794
88.7k
    Index = Index % Width;
1795
88.7k
1796
88.7k
    // Floating point scalars are already located in index #0.
1797
88.7k
    if (ScalarType->isFloatingPointTy() && Index == 0)
1798
9.70k
      return 0;
1799
79.0k
  }
1800
79.0k
1801
79.0k
  // Add to the base cost if we know that the extracted element of a vector is
1802
79.0k
  // destined to be moved to and used in the integer register file.
1803
79.0k
  int RegisterFileMoveCost = 0;
1804
79.0k
  if (Opcode == Instruction::ExtractElement && ScalarType->isPointerTy())
1805
1.72k
    RegisterFileMoveCost = 1;
1806
89.6k
1807
89.6k
  return BaseT::getVectorInstrCost(Opcode, Val, Index) + RegisterFileMoveCost;
1808
89.6k
}
1809
1810
int X86TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
1811
17.4k
                                unsigned AddressSpace, const Instruction *I) {
1812
17.4k
  // Handle non-power-of-two vectors such as <3 x float>
1813
17.4k
  if (VectorType *VTy = dyn_cast<VectorType>(Src)) {
1814
9.24k
    unsigned NumElem = VTy->getVectorNumElements();
1815
9.24k
1816
9.24k
    // Handle a few common cases:
1817
9.24k
    // <3 x float>
1818
9.24k
    if (NumElem == 3 && VTy->getScalarSizeInBits() == 32)
1819
9.24k
      // Cost = 64 bit store + extract + 32 bit store.
1820
2
      return 3;
1821
9.24k
1822
9.24k
    // <3 x double>
1823
9.24k
    if (NumElem == 3 && VTy->getScalarSizeInBits() == 64)
1824
9.24k
      // Cost = 128 bit store + unpack + 64 bit store.
1825
2
      return 3;
1826
9.23k
1827
9.23k
    // Assume that all other non-power-of-two numbers are scalarized.
1828
9.23k
    if (!isPowerOf2_32(NumElem)) {
1829
4
      int Cost = BaseT::getMemoryOpCost(Opcode, VTy->getScalarType(), Alignment,
1830
4
                                        AddressSpace);
1831
4
      int SplitCost = getScalarizationOverhead(Src, Opcode == Instruction::Load,
1832
4
                                               Opcode == Instruction::Store);
1833
4
      return NumElem * Cost + SplitCost;
1834
4
    }
1835
17.4k
  }
1836
17.4k
1837
17.4k
  // Legalize the type.
1838
17.4k
  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Src);
1839
17.4k
  assert((Opcode == Instruction::Load || Opcode == Instruction::Store) &&
1840
17.4k
         "Invalid Opcode");
1841
17.4k
1842
17.4k
  // Each load/store unit costs 1.
1843
17.4k
  int Cost = LT.first * 1;
1844
17.4k
1845
17.4k
  // This isn't exactly right. We're using slow unaligned 32-byte accesses as a
1846
17.4k
  // proxy for a double-pumped AVX memory interface such as on Sandybridge.
1847
17.4k
  if (LT.second.getStoreSize() == 32 && ST->isUnalignedMem32Slow())
1848
553
    Cost *= 2;
1849
17.4k
1850
17.4k
  return Cost;
1851
17.4k
}
1852
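// Worked example (a sketch): a load of <8 x float> on an AVX target with slow
// unaligned 32-byte accesses legalizes to one v8f32 op (LT.first == 1) whose
// 32-byte store size triggers the doubling above, so the cost is 1 * 2 = 2.
// A store of <3 x float> short-circuits earlier to the fixed cost of 3.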
1853
int X86TTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *SrcTy,
1854
                                      unsigned Alignment,
1855
149
                                      unsigned AddressSpace) {
1856
149
  VectorType *SrcVTy = dyn_cast<VectorType>(SrcTy);
1857
149
  if (!SrcVTy)
1858
149
    // For a scalar, take the regular cost without the mask.
1859
0
    return getMemoryOpCost(Opcode, SrcTy, Alignment, AddressSpace);
1860
149
1861
149
  unsigned NumElem = SrcVTy->getVectorNumElements();
1862
149
  VectorType *MaskTy =
1863
149
    VectorType::get(Type::getInt8Ty(SrcVTy->getContext()), NumElem);
1864
149
  if ((Opcode == Instruction::Load && !isLegalMaskedLoad(SrcVTy)) ||
1865
149
      (Opcode == Instruction::Store && !isLegalMaskedStore(SrcVTy)) ||
1866
149
      !isPowerOf2_32(NumElem)) {
1867
0
    // Scalarization
1868
0
    int MaskSplitCost = getScalarizationOverhead(MaskTy, false, true);
1869
0
    int ScalarCompareCost = getCmpSelInstrCost(
1870
0
        Instruction::ICmp, Type::getInt8Ty(SrcVTy->getContext()), nullptr);
1871
0
    int BranchCost = getCFInstrCost(Instruction::Br);
1872
0
    int MaskCmpCost = NumElem * (BranchCost + ScalarCompareCost);
1873
0
1874
0
    int ValueSplitCost = getScalarizationOverhead(
1875
0
        SrcVTy, Opcode == Instruction::Load, Opcode == Instruction::Store);
1876
0
    int MemopCost =
1877
0
        NumElem * BaseT::getMemoryOpCost(Opcode, SrcVTy->getScalarType(),
1878
0
                                         Alignment, AddressSpace);
1879
0
    return MemopCost + ValueSplitCost + MaskSplitCost + MaskCmpCost;
1880
0
  }
1881
149
1882
149
  // Legalize the type.
1883
149
  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, SrcVTy);
1884
149
  auto VT = TLI->getValueType(DL, SrcVTy);
1885
149
  int Cost = 0;
1886
149
  if (VT.isSimple() && LT.second != VT.getSimpleVT() &&
1887
33
      LT.second.getVectorNumElements() == NumElem)
1888
149
    // Promotion requires expand/truncate for data and a shuffle for mask.
1889
20
    Cost += getShuffleCost(TTI::SK_Alternate, SrcVTy, 0, nullptr) +
1890
20
            getShuffleCost(TTI::SK_Alternate, MaskTy, 0, nullptr);
1891
149
1892
129
  else if (LT.second.getVectorNumElements() > NumElem) {
1893
13
    VectorType *NewMaskTy = VectorType::get(MaskTy->getVectorElementType(),
1894
13
                                            LT.second.getVectorNumElements());
1895
13
    // Expanding requires filling the mask with zeroes.
1896
13
    Cost += getShuffleCost(TTI::SK_InsertSubvector, NewMaskTy, 0, MaskTy);
1897
13
  }
1898
149
  if (!ST->hasAVX512())
1899
77
    return Cost + LT.first*4; // Each maskmov costs 4
1900
72
1901
72
  // AVX-512 masked load/store is cheaper.
1902
72
  return Cost + LT.first;
1903
72
}
1904
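// Worked example (a sketch, assuming a legal masked type that needs neither
// promotion nor expansion, so Cost starts at 0): without AVX-512 each maskmov
// is modeled at 4, giving 0 + LT.first * 4; with AVX-512 the same access
// costs just LT.first.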
1905
int X86TTIImpl::getAddressComputationCost(Type *Ty, ScalarEvolution *SE,
1906
1.15k
                                          const SCEV *Ptr) {
1907
1.15k
  // Address computations in vectorized code with non-consecutive addresses will
1908
1.15k
  // likely result in more instructions compared to scalar code where the
1909
1.15k
  // computation can more often be merged into the index mode. The resulting
1910
1.15k
  // extra micro-ops can significantly decrease throughput.
1911
1.15k
  unsigned NumVectorInstToHideOverhead = 10;
1912
1.15k
1913
1.15k
  // Cost modeling of Strided Access Computation is hidden by the indexing
1914
1.15k
  // modes of X86 regardless of the stride value. We don't believe that there
1915
1.15k
  // is a difference between constant strided access in general and a constant
1916
1.15k
  // stride value that is less than or equal to 64.
1917
1.15k
  // Even in the case of (loop invariant) stride whose value is not known at
1918
1.15k
  // compile time, the address computation will not incur more than one extra
1919
1.15k
  // ADD instruction.
1920
1.15k
  if (Ty->isVectorTy() && SE) {
1921
208
    if (!BaseT::isStridedAccess(Ptr))
1922
38
      return NumVectorInstToHideOverhead;
1923
170
    if (!BaseT::getConstantStrideStep(SE, Ptr))
1924
0
      return 1;
1925
1.11k
  }
1926
1.11k
1927
1.11k
  return BaseT::getAddressComputationCost(Ty, SE, Ptr);
1928
1.11k
}
1929
1930
int X86TTIImpl::getArithmeticReductionCost(unsigned Opcode, Type *ValTy,
1931
200
                                           bool IsPairwise) {
1932
200
1933
200
  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy);
1934
200
1935
200
  MVT MTy = LT.second;
1936
200
1937
200
  int ISD = TLI->InstructionOpcodeToISD(Opcode);
1938
200
  assert(ISD && "Invalid opcode");
1939
200
1940
200
  // We use the Intel Architecture Code Analyzer (IACA) to measure the throughput
1941
200
  // and use it as the cost.
1942
200
1943
200
  static const CostTblEntry SSE42CostTblPairWise[] = {
1944
200
    { ISD::FADD,  MVT::v2f64,   2 },
1945
200
    { ISD::FADD,  MVT::v4f32,   4 },
1946
200
    { ISD::ADD,   MVT::v2i64,   2 },      // The data reported by the IACA tool is "1.6".
1947
200
    { ISD::ADD,   MVT::v4i32,   3 },      // The data reported by the IACA tool is "3.5".
1948
200
    { ISD::ADD,   MVT::v8i16,   5 },
1949
200
  };
1950
200
1951
200
  static const CostTblEntry AVX1CostTblPairWise[] = {
1952
200
    { ISD::FADD,  MVT::v4f32,   4 },
1953
200
    { ISD::FADD,  MVT::v4f64,   5 },
1954
200
    { ISD::FADD,  MVT::v8f32,   7 },
1955
200
    { ISD::ADD,   MVT::v2i64,   1 },      // The data reported by the IACA tool is "1.5".
1956
200
    { ISD::ADD,   MVT::v4i32,   3 },      // The data reported by the IACA tool is "3.5".
1957
200
    { ISD::ADD,   MVT::v4i64,   5 },      // The data reported by the IACA tool is "4.8".
1958
200
    { ISD::ADD,   MVT::v8i16,   5 },
1959
200
    { ISD::ADD,   MVT::v8i32,   5 },
1960
200
  };
1961
200
1962
200
  static const CostTblEntry SSE42CostTblNoPairWise[] = {
1963
200
    { ISD::FADD,  MVT::v2f64,   2 },
1964
200
    { ISD::FADD,  MVT::v4f32,   4 },
1965
200
    { ISD::ADD,   MVT::v2i64,   2 },      // The data reported by the IACA tool is "1.6".
1966
200
    { ISD::ADD,   MVT::v4i32,   3 },      // The data reported by the IACA tool is "3.3".
1967
200
    { ISD::ADD,   MVT::v8i16,   4 },      // The data reported by the IACA tool is "4.3".
1968
200
  };
1969
200
1970
200
  static const CostTblEntry AVX1CostTblNoPairWise[] = {
1971
200
    { ISD::FADD,  MVT::v4f32,   3 },
1972
200
    { ISD::FADD,  MVT::v4f64,   3 },
1973
200
    { ISD::FADD,  MVT::v8f32,   4 },
1974
200
    { ISD::ADD,   MVT::v2i64,   1 },      // The data reported by the IACA tool is "1.5".
1975
200
    { ISD::ADD,   MVT::v4i32,   3 },      // The data reported by the IACA tool is "2.8".
1976
200
    { ISD::ADD,   MVT::v4i64,   3 },
1977
200
    { ISD::ADD,   MVT::v8i16,   4 },
1978
200
    { ISD::ADD,   MVT::v8i32,   5 },
1979
200
  };
1980
200
1981
200
  if (IsPairwise) {
1982
102
    if (ST->hasAVX())
1983
73
      if (const auto *Entry = CostTableLookup(AVX1CostTblPairWise, ISD, MTy))
1984
71
        return LT.first * Entry->Cost;
1985
31
1986
31
    if (ST->hasSSE42())
1987
19
      if (const auto *Entry = CostTableLookup(SSE42CostTblPairWise, ISD, MTy))
1988
19
        return LT.first * Entry->Cost;
1989
98
  } else {
1990
98
    if (ST->hasAVX())
1991
71
      if (const auto *Entry = CostTableLookup(AVX1CostTblNoPairWise, ISD, MTy))
1992
69
        return LT.first * Entry->Cost;
1993
29
1994
29
    if (ST->hasSSE42())
1995
18
      if (const auto *Entry = CostTableLookup(SSE42CostTblNoPairWise, ISD, MTy))
1996
18
        return LT.first * Entry->Cost;
1997
23
  }
1998
23
1999
23
  return BaseT::getArithmeticReductionCost(Opcode, ValTy, IsPairwise);
2000
23
}
2001
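// Worked example (a sketch, assuming an AVX subtarget): a pairwise FADD
// reduction of <8 x float> hits { ISD::FADD, v8f32, 7 } in the pairwise table
// (LT.first == 1, so cost 7), while the non-pairwise form costs only 4.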
2002
int X86TTIImpl::getMinMaxReductionCost(Type *ValTy, Type *CondTy,
2003
888
                                       bool IsPairwise, bool IsUnsigned) {
2004
888
  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy);
2005
888
2006
888
  MVT MTy = LT.second;
2007
888
2008
888
  int ISD;
2009
888
  if (ValTy->isIntOrIntVectorTy()) {
2010
92
    ISD = IsUnsigned ? 
ISD::UMIN0
:
ISD::SMIN92
;
2011
888
  } else {
2012
796
    assert(ValTy->isFPOrFPVectorTy() &&
2013
796
           "Expected float point or integer vector type.");
2014
796
    ISD = ISD::FMINNUM;
2015
796
  }
2016
888
2017
888
  // We use the Intel Architecture Code Analyzer (IACA) to measure the throughput
2018
888
  // and use it as the cost.
2019
888
2020
888
  static const CostTblEntry SSE42CostTblPairWise[] = {
2021
888
      {ISD::FMINNUM, MVT::v2f64, 3},
2022
888
      {ISD::FMINNUM, MVT::v4f32, 2},
2023
888
      {ISD::SMIN, MVT::v2i64, 7}, // The data reported by the IACA is "6.8"
2024
888
      {ISD::UMIN, MVT::v2i64, 8}, // The data reported by the IACA is "8.6"
2025
888
      {ISD::SMIN, MVT::v4i32, 1}, // The data reported by the IACA is "1.5"
2026
888
      {ISD::UMIN, MVT::v4i32, 2}, // The data reported by the IACA is "1.8"
2027
888
      {ISD::SMIN, MVT::v8i16, 2},
2028
888
      {ISD::UMIN, MVT::v8i16, 2},
2029
888
  };
2030
888
2031
888
  static const CostTblEntry AVX1CostTblPairWise[] = {
2032
888
      {ISD::FMINNUM, MVT::v4f32, 1},
2033
888
      {ISD::FMINNUM, MVT::v4f64, 1},
2034
888
      {ISD::FMINNUM, MVT::v8f32, 2},
2035
888
      {ISD::SMIN, MVT::v2i64, 3},
2036
888
      {ISD::UMIN, MVT::v2i64, 3},
2037
888
      {ISD::SMIN, MVT::v4i32, 1},
2038
888
      {ISD::UMIN, MVT::v4i32, 1},
2039
888
      {ISD::SMIN, MVT::v8i16, 1},
2040
888
      {ISD::UMIN, MVT::v8i16, 1},
2041
888
      {ISD::SMIN, MVT::v8i32, 3},
2042
888
      {ISD::UMIN, MVT::v8i32, 3},
2043
888
  };
2044
888
2045
888
  static const CostTblEntry AVX2CostTblPairWise[] = {
2046
888
      {ISD::SMIN, MVT::v4i64, 2},
2047
888
      {ISD::UMIN, MVT::v4i64, 2},
2048
888
      {ISD::SMIN, MVT::v8i32, 1},
2049
888
      {ISD::UMIN, MVT::v8i32, 1},
2050
888
      {ISD::SMIN, MVT::v16i16, 1},
2051
888
      {ISD::UMIN, MVT::v16i16, 1},
2052
888
      {ISD::SMIN, MVT::v32i8, 2},
2053
888
      {ISD::UMIN, MVT::v32i8, 2},
2054
888
  };
2055
888
2056
888
  static const CostTblEntry AVX512CostTblPairWise[] = {
2057
888
      {ISD::FMINNUM, MVT::v8f64, 1},
2058
888
      {ISD::FMINNUM, MVT::v16f32, 2},
2059
888
      {ISD::SMIN, MVT::v8i64, 2},
2060
888
      {ISD::UMIN, MVT::v8i64, 2},
2061
888
      {ISD::SMIN, MVT::v16i32, 1},
2062
888
      {ISD::UMIN, MVT::v16i32, 1},
2063
888
  };
2064
888
2065
888
  static const CostTblEntry SSE42CostTblNoPairWise[] = {
2066
888
      {ISD::FMINNUM, MVT::v2f64, 3},
2067
888
      {ISD::FMINNUM, MVT::v4f32, 3},
2068
888
      {ISD::SMIN, MVT::v2i64, 7}, // The data reported by the IACA is "6.8"
2069
888
      {ISD::UMIN, MVT::v2i64, 9}, // The data reported by the IACA is "8.6"
2070
888
      {ISD::SMIN, MVT::v4i32, 1}, // The data reported by the IACA is "1.5"
2071
888
      {ISD::UMIN, MVT::v4i32, 2}, // The data reported by the IACA is "1.8"
2072
888
      {ISD::SMIN, MVT::v8i16, 1}, // The data reported by the IACA is "1.5"
2073
888
      {ISD::UMIN, MVT::v8i16, 2}, // The data reported by the IACA is "1.8"
2074
888
  };
2075
888
2076
888
  static const CostTblEntry AVX1CostTblNoPairWise[] = {
2077
888
      {ISD::FMINNUM, MVT::v4f32, 1},
2078
888
      {ISD::FMINNUM, MVT::v4f64, 1},
2079
888
      {ISD::FMINNUM, MVT::v8f32, 1},
2080
888
      {ISD::SMIN, MVT::v2i64, 3},
2081
888
      {ISD::UMIN, MVT::v2i64, 3},
2082
888
      {ISD::SMIN, MVT::v4i32, 1},
2083
888
      {ISD::UMIN, MVT::v4i32, 1},
2084
888
      {ISD::SMIN, MVT::v8i16, 1},
2085
888
      {ISD::UMIN, MVT::v8i16, 1},
2086
888
      {ISD::SMIN, MVT::v8i32, 2},
2087
888
      {ISD::UMIN, MVT::v8i32, 2},
2088
888
  };
2089
888
2090
888
  static const CostTblEntry AVX2CostTblNoPairWise[] = {
2091
888
      {ISD::SMIN, MVT::v4i64, 1},
2092
888
      {ISD::UMIN, MVT::v4i64, 1},
2093
888
      {ISD::SMIN, MVT::v8i32, 1},
2094
888
      {ISD::UMIN, MVT::v8i32, 1},
2095
888
      {ISD::SMIN, MVT::v16i16, 1},
2096
888
      {ISD::UMIN, MVT::v16i16, 1},
2097
888
      {ISD::SMIN, MVT::v32i8, 1},
2098
888
      {ISD::UMIN, MVT::v32i8, 1},
2099
888
  };
2100
888
2101
888
  static const CostTblEntry AVX512CostTblNoPairWise[] = {
2102
888
      {ISD::FMINNUM, MVT::v8f64, 1},
2103
888
      {ISD::FMINNUM, MVT::v16f32, 2},
2104
888
      {ISD::SMIN, MVT::v8i64, 1},
2105
888
      {ISD::UMIN, MVT::v8i64, 1},
2106
888
      {ISD::SMIN, MVT::v16i32, 1},
2107
888
      {ISD::UMIN, MVT::v16i32, 1},
2108
888
  };
2109
888
2110
888
  if (IsPairwise) {
2111
444
    if (ST->hasAVX512())
2112
8
      if (const auto *Entry = CostTableLookup(AVX512CostTblPairWise, ISD, MTy))
2113
4
        return LT.first * Entry->Cost;
2114
440
2115
440
    if (ST->hasAVX2())
2116
12
      if (const auto *Entry = CostTableLookup(AVX2CostTblPairWise, ISD, MTy))
2117
4
        return LT.first * Entry->Cost;
2118
436
2119
436
    if (ST->hasAVX())
2120
20
      if (const auto *Entry = CostTableLookup(AVX1CostTblPairWise, ISD, MTy))
2121
20
        return LT.first * Entry->Cost;
2122
416
2123
416
    if (ST->hasSSE42())
2124
0
      if (const auto *Entry = CostTableLookup(SSE42CostTblPairWise, ISD, MTy))
2125
0
        return LT.first * Entry->Cost;
2126
444
  } else {
2127
444
    if (ST->hasAVX512())
2128
8
      if (const auto *Entry =
2129
8
              CostTableLookup(AVX512CostTblNoPairWise, ISD, MTy))
2130
4
        return LT.first * Entry->Cost;
2131
440
2132
440
    if (ST->hasAVX2())
2133
12
      if (const auto *Entry = CostTableLookup(AVX2CostTblNoPairWise, ISD, MTy))
2134
4
        return LT.first * Entry->Cost;
2135
436
2136
436
    if (ST->hasAVX())
2137
20
      if (const auto *Entry = CostTableLookup(AVX1CostTblNoPairWise, ISD, MTy))
2138
20
        return LT.first * Entry->Cost;
2139
416
2140
416
    if (ST->hasSSE42())
2141
0
      if (const auto *Entry = CostTableLookup(SSE42CostTblNoPairWise, ISD, MTy))
2142
0
        return LT.first * Entry->Cost;
2143
832
  }
2144
832
2145
832
  return BaseT::getMinMaxReductionCost(ValTy, CondTy, IsPairwise, IsUnsigned);
2146
832
}
2147
2148
/// \brief Calculate the cost of materializing a 64-bit value. This helper
2149
/// method might only calculate a fraction of a larger immediate. Therefore it
2150
/// is valid to return a cost of ZERO.
2151
50.4k
int X86TTIImpl::getIntImmCost(int64_t Val) {
2152
50.4k
  if (Val == 0)
2153
234
    return TTI::TCC_Free;
2154
50.2k
2155
50.2k
  if (isInt<32>(Val))
2156
47.3k
    return TTI::TCC_Basic;
2157
2.86k
2158
2.86k
  return 2 * TTI::TCC_Basic;
2159
2.86k
}
2160
2161
79.7k
int X86TTIImpl::getIntImmCost(const APInt &Imm, Type *Ty) {
2162
79.7k
  assert(Ty->isIntegerTy());
2163
79.7k
2164
79.7k
  unsigned BitSize = Ty->getPrimitiveSizeInBits();
2165
79.7k
  if (BitSize == 0)
2166
0
    return ~0U;
2167
79.7k
2168
79.7k
  // Never hoist constants larger than 128 bits, because this might lead to
2169
79.7k
  // incorrect code generation or assertions in codegen.
2170
79.7k
  // FIXME: Create a cost model for types larger than i128 once the codegen
2171
79.7k
  // issues have been fixed.
2172
79.7k
  if (BitSize > 128)
2173
93
    return TTI::TCC_Free;
2174
79.6k
2175
79.6k
  if (Imm == 0)
2176
29.5k
    return TTI::TCC_Free;
2177
50.1k
2178
50.1k
  // Sign-extend all constants to a multiple of 64-bit.
2179
50.1k
  APInt ImmVal = Imm;
2180
50.1k
  if (BitSize & 0x3f)
2181
33.5k
    ImmVal = Imm.sext((BitSize + 63) & ~0x3fU);
2182
50.1k
2183
50.1k
  // Split the constant into 64-bit chunks and calculate the cost for each
2184
50.1k
  // chunk.
2185
50.1k
  int Cost = 0;
2186
100k
  for (unsigned ShiftVal = 0; 
ShiftVal < BitSize100k
;
ShiftVal += 6450.4k
) {
2187
50.4k
    APInt Tmp = ImmVal.ashr(ShiftVal).sextOrTrunc(64);
2188
50.4k
    int64_t Val = Tmp.getSExtValue();
2189
50.4k
    Cost += getIntImmCost(Val);
2190
50.4k
  }
2191
79.7k
  // We need at least one instruction to materialize the constant.
2192
79.7k
  return std::max(1, Cost);
2193
79.7k
}
2194
2195
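// Illustrative sketch (not part of the original file): the chunked pricing
// above, mirrored with plain integers. `chunkCost` is a hypothetical stand-in
// for getIntImmCost(int64_t); a 128-bit value is split into two 64-bit chunks
// that are priced independently, then clamped to at least 1.
#include <algorithm>
#include <cstdint>

static int chunkCost(int64_t Val) {
  if (Val == 0)
    return 0;                                  // TCC_Free
  if (Val >= INT32_MIN && Val <= INT32_MAX)
    return 1;                                  // TCC_Basic: fits in an imm32
  return 2;                                    // needs a movabs-like sequence
}

static int exampleCost128(int64_t Lo, int64_t Hi) {
  // A constant whose low chunk is 0 and whose high chunk needs all 64 bits
  // is priced 0 + 2, clamped to at least one instruction.
  return std::max(1, chunkCost(Lo) + chunkCost(Hi));
}
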
int X86TTIImpl::getIntImmCost(unsigned Opcode, unsigned Idx, const APInt &Imm,
                              Type *Ty) {
  assert(Ty->isIntegerTy());

  unsigned BitSize = Ty->getPrimitiveSizeInBits();
  // There is no cost model for constants with a bit size of 0. Return TCC_Free
  // here, so that constant hoisting will ignore this constant.
  if (BitSize == 0)
    return TTI::TCC_Free;

  unsigned ImmIdx = ~0U;
  switch (Opcode) {
  default:
    return TTI::TCC_Free;
  case Instruction::GetElementPtr:
    // Always hoist the base address of a GetElementPtr. This prevents the
    // creation of new constants for every base constant that gets constant
    // folded with the offset.
    if (Idx == 0)
      return 2 * TTI::TCC_Basic;
    return TTI::TCC_Free;
  case Instruction::Store:
    ImmIdx = 0;
    break;
  case Instruction::ICmp:
    // This is an imperfect hack to prevent constant hoisting of
    // compares that might be trying to check if a 64-bit value fits in
    // 32 bits. The backend can optimize these cases using a right shift by 32.
    // Ideally we would check the compare predicate here. There are also other
    // similar immediates the backend can use shifts for.
    if (Idx == 1 && Imm.getBitWidth() == 64) {
      uint64_t ImmVal = Imm.getZExtValue();
      if (ImmVal == 0x100000000ULL || ImmVal == 0xffffffff)
        return TTI::TCC_Free;
    }
    ImmIdx = 1;
    break;
  case Instruction::And:
    // We support 64-bit ANDs with immediates with 32 bits of leading zeroes
    // by using a 32-bit operation with implicit zero extension. Detect such
    // immediates here as the normal path expects bit 31 to be sign extended.
    if (Idx == 1 && Imm.getBitWidth() == 64 && isUInt<32>(Imm.getZExtValue()))
      return TTI::TCC_Free;
    LLVM_FALLTHROUGH;
  case Instruction::Add:
  case Instruction::Sub:
  case Instruction::Mul:
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::URem:
  case Instruction::SRem:
  case Instruction::Or:
  case Instruction::Xor:
    ImmIdx = 1;
    break;
  // Always return TCC_Free for the shift value of a shift instruction.
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
    if (Idx == 1)
      return TTI::TCC_Free;
    break;
  case Instruction::Trunc:
  case Instruction::ZExt:
  case Instruction::SExt:
  case Instruction::IntToPtr:
  case Instruction::PtrToInt:
  case Instruction::BitCast:
  case Instruction::PHI:
  case Instruction::Call:
  case Instruction::Select:
  case Instruction::Ret:
  case Instruction::Load:
    break;
  }

  if (Idx == ImmIdx) {
    int NumConstants = (BitSize + 63) / 64;
    int Cost = X86TTIImpl::getIntImmCost(Imm, Ty);
    return (Cost <= NumConstants * TTI::TCC_Basic)
               ? static_cast<int>(TTI::TCC_Free)
               : Cost;
  }

  return X86TTIImpl::getIntImmCost(Imm, Ty);
}

int X86TTIImpl::getIntImmCost(Intrinsic::ID IID, unsigned Idx, const APInt &Imm,
                              Type *Ty) {
  assert(Ty->isIntegerTy());

  unsigned BitSize = Ty->getPrimitiveSizeInBits();
  // There is no cost model for constants with a bit size of 0. Return TCC_Free
  // here, so that constant hoisting will ignore this constant.
  if (BitSize == 0)
    return TTI::TCC_Free;

  switch (IID) {
  default:
    return TTI::TCC_Free;
  case Intrinsic::sadd_with_overflow:
  case Intrinsic::uadd_with_overflow:
  case Intrinsic::ssub_with_overflow:
  case Intrinsic::usub_with_overflow:
  case Intrinsic::smul_with_overflow:
  case Intrinsic::umul_with_overflow:
    if ((Idx == 1) && Imm.getBitWidth() <= 64 && isInt<32>(Imm.getSExtValue()))
      return TTI::TCC_Free;
    break;
  case Intrinsic::experimental_stackmap:
    if ((Idx < 2) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
      return TTI::TCC_Free;
    break;
  case Intrinsic::experimental_patchpoint_void:
  case Intrinsic::experimental_patchpoint_i64:
    if ((Idx < 4) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
      return TTI::TCC_Free;
    break;
  }
  return X86TTIImpl::getIntImmCost(Imm, Ty);
}

unsigned X86TTIImpl::getUserCost(const User *U,
                                 ArrayRef<const Value *> Operands) {
  if (isa<StoreInst>(U)) {
    Value *Ptr = U->getOperand(1);
    // Store instruction with index and scale costs 2 Uops.
    // Check the preceding GEP to identify non-const indices.
    if (auto GEP = dyn_cast<GetElementPtrInst>(Ptr)) {
      if (!all_of(GEP->indices(), [](Value *V) { return isa<Constant>(V); }))
        return TTI::TCC_Basic * 2;
    }
    return TTI::TCC_Basic;
  }
  return BaseT::getUserCost(U, Operands);
}

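// Illustrative sketch (not part of the original file): hypothetical IR of the
// two store shapes the check above distinguishes.
//
//   %p = getelementptr inbounds i32, i32* %base, i64 %i  ; variable index
//   store i32 %v, i32* %p                                ; TTI::TCC_Basic * 2
//
//   %q = getelementptr inbounds i32, i32* %base, i64 4   ; constant index
//   store i32 %v, i32* %q                                ; TTI::TCC_Basic
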
// Return an average cost of Gather / Scatter instruction; may be improved later.
int X86TTIImpl::getGSVectorCost(unsigned Opcode, Type *SrcVTy, Value *Ptr,
                                unsigned Alignment, unsigned AddressSpace) {

  assert(isa<VectorType>(SrcVTy) && "Unexpected type in getGSVectorCost");
  unsigned VF = SrcVTy->getVectorNumElements();

  // Try to reduce index size from 64 bit (default for GEP)
  // to 32. It is essential for VF 16. If the index can't be reduced to 32, the
  // operation will use 16 x 64 indices, which do not fit in a zmm and need
  // to be split. Also check that the base pointer is the same for all lanes,
  // and that there's at most one variable index.
  auto getIndexSizeInBits = [](Value *Ptr, const DataLayout &DL) {
    unsigned IndexSize = DL.getPointerSizeInBits();
    GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Ptr);
    if (IndexSize < 64 || !GEP)
      return IndexSize;

    unsigned NumOfVarIndices = 0;
    Value *Ptrs = GEP->getPointerOperand();
    if (Ptrs->getType()->isVectorTy() && !getSplatValue(Ptrs))
      return IndexSize;
    for (unsigned i = 1; i < GEP->getNumOperands(); ++i) {
      if (isa<Constant>(GEP->getOperand(i)))
        continue;
      Type *IndxTy = GEP->getOperand(i)->getType();
      if (IndxTy->isVectorTy())
        IndxTy = IndxTy->getVectorElementType();
      if ((IndxTy->getPrimitiveSizeInBits() == 64 &&
          !isa<SExtInst>(GEP->getOperand(i))) ||
         ++NumOfVarIndices > 1)
        return IndexSize; // 64
    }
    return (unsigned)32;
  };

  // Trying to reduce IndexSize to 32 bits for vector 16.
  // By default the IndexSize is equal to pointer size.
  unsigned IndexSize = (VF >= 16) ? getIndexSizeInBits(Ptr, DL)
                                  : DL.getPointerSizeInBits();

  Type *IndexVTy = VectorType::get(IntegerType::get(SrcVTy->getContext(),
                                                    IndexSize), VF);
  std::pair<int, MVT> IdxsLT = TLI->getTypeLegalizationCost(DL, IndexVTy);
  std::pair<int, MVT> SrcLT = TLI->getTypeLegalizationCost(DL, SrcVTy);
  int SplitFactor = std::max(IdxsLT.first, SrcLT.first);
  if (SplitFactor > 1) {
    // Handle splitting of vector of pointers
    Type *SplitSrcTy = VectorType::get(SrcVTy->getScalarType(), VF / SplitFactor);
    return SplitFactor * getGSVectorCost(Opcode, SplitSrcTy, Ptr, Alignment,
                                         AddressSpace);
  }

  // The gather / scatter cost is given by Intel architects. It is a rough
  // number since we are looking at one instruction at a time.
  const int GSOverhead = 2;
  return GSOverhead + VF * getMemoryOpCost(Opcode, SrcVTy->getScalarType(),
                                           Alignment, AddressSpace);
}

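// Illustrative sketch (not part of the original file): the resulting formula
// for a non-split gather, with assumed numbers. For a v16f32 gather whose
// scalar loads are priced at 1 each, the cost comes out as
// GSOverhead + VF * MemOpCost = 2 + 16 * 1 = 18.
static int exampleGatherCost(int VF, int ScalarMemOpCost) {
  const int GSOverhead = 2; // rough per-instruction overhead, as above
  return GSOverhead + VF * ScalarMemOpCost;
}
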
/// Return the cost of full scalarization of gather / scatter operation.
///
/// Opcode - Load or Store instruction.
/// SrcVTy - The type of the data vector that should be gathered or scattered.
/// VariableMask - The mask is non-constant at compile time.
/// Alignment - Alignment for one element.
/// AddressSpace - pointer[s] address space.
///
int X86TTIImpl::getGSScalarCost(unsigned Opcode, Type *SrcVTy,
                                bool VariableMask, unsigned Alignment,
                                unsigned AddressSpace) {
  unsigned VF = SrcVTy->getVectorNumElements();

  int MaskUnpackCost = 0;
  if (VariableMask) {
    VectorType *MaskTy =
      VectorType::get(Type::getInt1Ty(SrcVTy->getContext()), VF);
    MaskUnpackCost = getScalarizationOverhead(MaskTy, false, true);
    int ScalarCompareCost =
      getCmpSelInstrCost(Instruction::ICmp, Type::getInt1Ty(SrcVTy->getContext()),
                         nullptr);
    int BranchCost = getCFInstrCost(Instruction::Br);
    MaskUnpackCost += VF * (BranchCost + ScalarCompareCost);
  }

  // The cost of the scalar loads/stores.
  int MemoryOpCost = VF * getMemoryOpCost(Opcode, SrcVTy->getScalarType(),
                                          Alignment, AddressSpace);

  int InsertExtractCost = 0;
  if (Opcode == Instruction::Load)
    for (unsigned i = 0; i < VF; ++i)
      // Add the cost of inserting each scalar load into the vector
      InsertExtractCost +=
        getVectorInstrCost(Instruction::InsertElement, SrcVTy, i);
  else
    for (unsigned i = 0; i < VF; ++i)
      // Add the cost of extracting each element out of the data vector
      InsertExtractCost +=
        getVectorInstrCost(Instruction::ExtractElement, SrcVTy, i);

  return MemoryOpCost + MaskUnpackCost + InsertExtractCost;
}

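// Illustrative sketch (not part of the original file): how the scalarized
// pieces above add up, under assumed unit costs. For a variable-mask gather
// of VF == 8 where each scalar load, compare, and branch costs 1, the total
// is the scalar memory ops, plus mask unpacking with one compare-and-branch
// per lane, plus the insertelement sequence.
static int exampleScalarizedGatherCost(int VF, int MaskUnpack, int InsExtSum) {
  const int ScalarCompareCost = 1; // assumed
  const int BranchCost = 1;        // assumed
  const int ScalarLoadCost = 1;    // assumed
  return VF * ScalarLoadCost +
         (MaskUnpack + VF * (BranchCost + ScalarCompareCost)) + InsExtSum;
}
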
/// Calculate the cost of Gather / Scatter operation
int X86TTIImpl::getGatherScatterOpCost(unsigned Opcode, Type *SrcVTy,
                                       Value *Ptr, bool VariableMask,
                                       unsigned Alignment) {
  assert(SrcVTy->isVectorTy() && "Unexpected data type for Gather/Scatter");
  unsigned VF = SrcVTy->getVectorNumElements();
  PointerType *PtrTy = dyn_cast<PointerType>(Ptr->getType());
  if (!PtrTy && Ptr->getType()->isVectorTy())
    PtrTy = dyn_cast<PointerType>(Ptr->getType()->getVectorElementType());
  assert(PtrTy && "Unexpected type for Ptr argument");
  unsigned AddressSpace = PtrTy->getAddressSpace();

  bool Scalarize = false;
  if ((Opcode == Instruction::Load && !isLegalMaskedGather(SrcVTy)) ||
      (Opcode == Instruction::Store && !isLegalMaskedScatter(SrcVTy)))
    Scalarize = true;
  // Gather / Scatter for vector 2 is not profitable on KNL / SKX.
  // A vector-4 gather/scatter instruction does not exist on KNL.
  // We can extend it to 8 elements, but zeroing the upper bits of
  // the mask vector will add more instructions. Right now we give the scalar
  // cost of vector-4 for KNL. TODO: Check, maybe the gather/scatter instruction
  // is better in the VariableMask case.
  if (VF == 2 || (VF == 4 && !ST->hasVLX()))
    Scalarize = true;

  if (Scalarize)
    return getGSScalarCost(Opcode, SrcVTy, VariableMask, Alignment,
                           AddressSpace);

  return getGSVectorCost(Opcode, SrcVTy, Ptr, Alignment, AddressSpace);
}

bool X86TTIImpl::isLSRCostLess(TargetTransformInfo::LSRCost &C1,
                               TargetTransformInfo::LSRCost &C2) {
  // What is X86-specific here is that the instruction count gets 1st priority.
  return std::tie(C1.Insns, C1.NumRegs, C1.AddRecCost,
                  C1.NumIVMuls, C1.NumBaseAdds,
                  C1.ScaleCost, C1.ImmCost, C1.SetupCost) <
         std::tie(C2.Insns, C2.NumRegs, C2.AddRecCost,
                  C2.NumIVMuls, C2.NumBaseAdds,
                  C2.ScaleCost, C2.ImmCost, C2.SetupCost);
}

bool X86TTIImpl::isLegalMaskedLoad(Type *DataTy) {
  Type *ScalarTy = DataTy->getScalarType();
  int DataWidth = isa<PointerType>(ScalarTy) ?
    DL.getPointerSizeInBits() : ScalarTy->getPrimitiveSizeInBits();

  return ((DataWidth == 32 || DataWidth == 64) && ST->hasAVX()) ||
         ((DataWidth == 8 || DataWidth == 16) && ST->hasBWI());
}

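// Illustrative sketch (not part of the original file): the same predicate in
// a free-standing form, for quick reasoning about what it accepts.
static bool exampleMaskedLoadLegal(int DataWidth, bool HasAVX, bool HasBWI) {
  return ((DataWidth == 32 || DataWidth == 64) && HasAVX) ||
         ((DataWidth == 8 || DataWidth == 16) && HasBWI);
}
// exampleMaskedLoadLegal(32, /*HasAVX=*/true, /*HasBWI=*/false) -> true
// exampleMaskedLoadLegal(8,  /*HasAVX=*/true, /*HasBWI=*/false) -> false
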
bool X86TTIImpl::isLegalMaskedStore(Type *DataType) {
  return isLegalMaskedLoad(DataType);
}

bool X86TTIImpl::isLegalMaskedGather(Type *DataTy) {
  // This function is called now in two cases: from the Loop Vectorizer
  // and from the Scalarizer.
  // When the Loop Vectorizer asks about legality of the feature,
  // the vectorization factor is not calculated yet. The Loop Vectorizer
  // sends a scalar type and the decision is based on the width of the
  // scalar element.
  // Later on, the cost model will estimate usage of this intrinsic based on
  // the vector type.
  // The Scalarizer asks again about legality. It sends a vector type.
  // In this case we can reject non-power-of-2 vectors.
  if (isa<VectorType>(DataTy) && !isPowerOf2_32(DataTy->getVectorNumElements()))
    return false;
  Type *ScalarTy = DataTy->getScalarType();
  int DataWidth = isa<PointerType>(ScalarTy) ?
    DL.getPointerSizeInBits() : ScalarTy->getPrimitiveSizeInBits();

  // AVX-512 allows gather and scatter
  return (DataWidth == 32 || DataWidth == 64) && ST->hasAVX512();
}

bool X86TTIImpl::isLegalMaskedScatter(Type *DataType) {
  return isLegalMaskedGather(DataType);
}

bool X86TTIImpl::hasDivRemOp(Type *DataType, bool IsSigned) {
  EVT VT = TLI->getValueType(DL, DataType);
  return TLI->isOperationLegal(IsSigned ? ISD::SDIVREM : ISD::UDIVREM, VT);
}

bool X86TTIImpl::areInlineCompatible(const Function *Caller,
                                     const Function *Callee) const {
  const TargetMachine &TM = getTLI()->getTargetMachine();

  // Work this as a subsetting of subtarget features.
  const FeatureBitset &CallerBits =
      TM.getSubtargetImpl(*Caller)->getFeatureBits();
  const FeatureBitset &CalleeBits =
      TM.getSubtargetImpl(*Callee)->getFeatureBits();

  // FIXME: This is likely too limiting as it will include subtarget features
  // that we might not care about for inlining, but it is conservatively
  // correct.
  return (CallerBits & CalleeBits) == CalleeBits;
}

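// Illustrative sketch (not part of the original file): the subset test above,
// shown with std::bitset in place of FeatureBitset. Inlining is allowed only
// when every feature the callee was compiled with is also available in the
// caller; the width of 64 is an arbitrary choice for the example.
#include <bitset>

static bool isFeatureSubset(const std::bitset<64> &CallerBits,
                            const std::bitset<64> &CalleeBits) {
  return (CallerBits & CalleeBits) == CalleeBits;
}
// Example: caller has {AVX2, FMA}, callee has {AVX2} -> true (can inline).
//          caller has {SSE4.2},    callee has {AVX2} -> false.
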
bool X86TTIImpl::enableMemCmpExpansion(unsigned &MaxLoadSize) {
  // TODO: We can increase these based on available vector ops.
  MaxLoadSize = ST->is64Bit() ? 8 : 4;
  return true;
}

bool X86TTIImpl::enableInterleavedAccessVectorization() {
  // TODO: We expect this to be beneficial regardless of arch,
  // but there are currently some unexplained performance artifacts on Atom.
  // As a temporary solution, disable on Atom.
  return !(ST->isAtom());
}

// Get estimation for interleaved load/store operations for AVX2.
// \p Factor is the interleaved-access factor (stride) - number of
// (interleaved) elements in the group.
// \p Indices contains the indices for a strided load: when the
// interleaved load has gaps they indicate which elements are used.
// If Indices is empty (or if the number of indices is equal to the size
// of the interleaved-access as given in \p Factor) the access has no gaps.
//
// As opposed to AVX-512, AVX2 does not have generic shuffles that allow
// computing the cost using a generic formula as a function of generic
// shuffles. We therefore use a lookup table instead, filled according to
// the instruction sequences that codegen currently generates.
int X86TTIImpl::getInterleavedMemoryOpCostAVX2(unsigned Opcode, Type *VecTy,
                                               unsigned Factor,
                                               ArrayRef<unsigned> Indices,
                                               unsigned Alignment,
                                               unsigned AddressSpace) {

  // We currently support only fully-interleaved groups, with no gaps.
  // TODO: Support also strided loads (interleaved-groups with gaps).
  if (Indices.size() && Indices.size() != Factor)
    return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
                                             Alignment, AddressSpace);

  // VecTy for interleave memop is <VF*Factor x Elt>.
  // So, for VF=4, Interleave Factor = 3, Element type = i32 we have
  // VecTy = <12 x i32>.
  MVT LegalVT = getTLI()->getTypeLegalizationCost(DL, VecTy).second;

  // This function can be called with VecTy=<6xi128>, Factor=3, in which case
  // the VF=2, while v2i128 is an unsupported MVT vector type
  // (see MachineValueType.h::getVectorVT()).
  if (!LegalVT.isVector())
    return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
                                             Alignment, AddressSpace);

  unsigned VF = VecTy->getVectorNumElements() / Factor;
  Type *ScalarTy = VecTy->getVectorElementType();

  // Calculate the number of memory operations (NumOfMemOps), required
  // to load/store the VecTy.
  unsigned VecTySize = DL.getTypeStoreSize(VecTy);
  unsigned LegalVTSize = LegalVT.getStoreSize();
  unsigned NumOfMemOps = (VecTySize + LegalVTSize - 1) / LegalVTSize;

  // Get the cost of one memory operation.
  Type *SingleMemOpTy = VectorType::get(VecTy->getVectorElementType(),
                                        LegalVT.getVectorNumElements());
  unsigned MemOpCost =
      getMemoryOpCost(Opcode, SingleMemOpTy, Alignment, AddressSpace);

  VectorType *VT = VectorType::get(ScalarTy, VF);
  EVT ETy = TLI->getValueType(DL, VT);
  if (!ETy.isSimple())
    return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
                                             Alignment, AddressSpace);

  // TODO: Complete for other data-types and strides.
  // Each combination of Stride, ElementTy and VF results in a different
  // sequence; the cost tables are therefore accessed with:
  // Factor (stride) and VectorType=VFxElemType.
  // The Cost accounts only for the shuffle sequence;
  // the cost of the loads/stores is accounted for separately.
  //
  static const CostTblEntry AVX2InterleavedLoadTbl[] = {
    { 3, MVT::v2i8,  10 }, //(load 6i8 and)  deinterleave into 3 x 2i8
    { 3, MVT::v4i8,  4 },  //(load 12i8 and) deinterleave into 3 x 4i8
    { 3, MVT::v8i8,  9 },  //(load 24i8 and) deinterleave into 3 x 8i8
    { 3, MVT::v16i8, 18 }, //(load 48i8 and) deinterleave into 3 x 16i8
    { 3, MVT::v32i8, 42 }, //(load 96i8 and) deinterleave into 3 x 32i8

    { 4, MVT::v2i8,  12 }, //(load 8i8 and)   deinterleave into 4 x 2i8
    { 4, MVT::v4i8,  4 },  //(load 16i8 and)  deinterleave into 4 x 4i8
    { 4, MVT::v8i8,  20 }, //(load 32i8 and)  deinterleave into 4 x 8i8
    { 4, MVT::v16i8, 39 }, //(load 64i8 and)  deinterleave into 4 x 16i8
    { 4, MVT::v32i8, 80 }  //(load 128i8 and) deinterleave into 4 x 32i8
  };

  static const CostTblEntry AVX2InterleavedStoreTbl[] = {
    { 3, MVT::v2i8,  7 },  //interleave 3 x 2i8  into 6i8 (and store)
    { 3, MVT::v4i8,  8 },  //interleave 3 x 4i8  into 12i8 (and store)
    { 3, MVT::v8i8,  11 }, //interleave 3 x 8i8  into 24i8 (and store)
    { 3, MVT::v16i8, 17 }, //interleave 3 x 16i8 into 48i8 (and store)
    { 3, MVT::v32i8, 32 }, //interleave 3 x 32i8 into 96i8 (and store)

    { 4, MVT::v2i8,  12 }, //interleave 4 x 2i8  into 8i8 (and store)
    { 4, MVT::v4i8,  9 },  //interleave 4 x 4i8  into 16i8 (and store)
    { 4, MVT::v8i8,  16 }, //interleave 4 x 8i8  into 32i8 (and store)
    { 4, MVT::v16i8, 20 }, //interleave 4 x 16i8 into 64i8 (and store)
    { 4, MVT::v32i8, 40 }  //interleave 4 x 32i8 into 128i8 (and store)
  };

  if (Opcode == Instruction::Load) {
    if (const auto *Entry =
            CostTableLookup(AVX2InterleavedLoadTbl, Factor, ETy.getSimpleVT()))
      return NumOfMemOps * MemOpCost + Entry->Cost;
  } else {
    assert(Opcode == Instruction::Store &&
           "Expected Store Instruction at this point");
    if (const auto *Entry =
            CostTableLookup(AVX2InterleavedStoreTbl, Factor, ETy.getSimpleVT()))
      return NumOfMemOps * MemOpCost + Entry->Cost;
  }

  return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
                                           Alignment, AddressSpace);
}

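// Illustrative sketch (not part of the original file): pricing a stride-3
// load of <24 x i8> (VF == 8) with the tables above, under assumed numbers.
// NumOfMemOps covers loading the 24 bytes at the legalized vector width, and
// the table entry { 3, MVT::v8i8, 9 } supplies the shuffle-sequence cost:
//   Cost = NumOfMemOps * MemOpCost + 9
static int exampleAVX2InterleavedLoadCost(int NumOfMemOps, int MemOpCost) {
  const int ShuffleSeqCost = 9; // from the entry { 3, MVT::v8i8, 9 }
  return NumOfMemOps * MemOpCost + ShuffleSeqCost;
}
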
// Get estimation for interleaved load/store operations and strided load.
// \p Indices contains indices for strided load.
// \p Factor - the factor of interleaving.
// AVX-512 provides 3-src shuffles that significantly reduce the cost.
int X86TTIImpl::getInterleavedMemoryOpCostAVX512(unsigned Opcode, Type *VecTy,
                                                 unsigned Factor,
                                                 ArrayRef<unsigned> Indices,
                                                 unsigned Alignment,
                                                 unsigned AddressSpace) {

  // VecTy for interleave memop is <VF*Factor x Elt>.
  // So, for VF=4, Interleave Factor = 3, Element type = i32 we have
  // VecTy = <12 x i32>.

  // Calculate the number of memory operations (NumOfMemOps), required
  // to load/store the VecTy.
  MVT LegalVT = getTLI()->getTypeLegalizationCost(DL, VecTy).second;
  unsigned VecTySize = DL.getTypeStoreSize(VecTy);
  unsigned LegalVTSize = LegalVT.getStoreSize();
  unsigned NumOfMemOps = (VecTySize + LegalVTSize - 1) / LegalVTSize;

  // Get the cost of one memory operation.
  Type *SingleMemOpTy = VectorType::get(VecTy->getVectorElementType(),
                                        LegalVT.getVectorNumElements());
  unsigned MemOpCost =
      getMemoryOpCost(Opcode, SingleMemOpTy, Alignment, AddressSpace);

  if (Opcode == Instruction::Load) {
    // The kind of shuffle depends on the number of loaded values.
    // If we load the entire data in one register, we can use a 1-src shuffle.
    // Otherwise, we'll merge 2 sources in each operation.
    TTI::ShuffleKind ShuffleKind =
        (NumOfMemOps > 1) ? TTI::SK_PermuteTwoSrc : TTI::SK_PermuteSingleSrc;

    unsigned ShuffleCost =
        getShuffleCost(ShuffleKind, SingleMemOpTy, 0, nullptr);

    unsigned NumOfLoadsInInterleaveGrp =
        Indices.size() ? Indices.size() : Factor;
    Type *ResultTy = VectorType::get(VecTy->getVectorElementType(),
                                     VecTy->getVectorNumElements() / Factor);
    unsigned NumOfResults =
        getTLI()->getTypeLegalizationCost(DL, ResultTy).first *
        NumOfLoadsInInterleaveGrp;

    // About half of the loads may be folded into shuffles when we have only
    // one result. If we have more than one result, we do not fold loads at all.
    unsigned NumOfUnfoldedLoads =
        NumOfResults > 1 ? NumOfMemOps : NumOfMemOps / 2;

    // Get a number of shuffle operations per result.
    unsigned NumOfShufflesPerResult =
        std::max((unsigned)1, (unsigned)(NumOfMemOps - 1));

    // The SK_PermuteTwoSrc shuffle clobbers one of its src operands.
    // When we have more than one destination, we need additional instructions
    // to keep sources.
    unsigned NumOfMoves = 0;
    if (NumOfResults > 1 && ShuffleKind == TTI::SK_PermuteTwoSrc)
      NumOfMoves = NumOfResults * NumOfShufflesPerResult / 2;

    int Cost = NumOfResults * NumOfShufflesPerResult * ShuffleCost +
               NumOfUnfoldedLoads * MemOpCost + NumOfMoves;

    return Cost;
  }

  // Store.
  assert(Opcode == Instruction::Store &&
         "Expected Store Instruction at this point");

  // There are no strided stores at the moment, and a store can't be folded
  // into a shuffle.
  unsigned NumOfSources = Factor; // The number of values to be merged.
  unsigned ShuffleCost =
      getShuffleCost(TTI::SK_PermuteTwoSrc, SingleMemOpTy, 0, nullptr);
  unsigned NumOfShufflesPerStore = NumOfSources - 1;

  // The SK_PermuteTwoSrc shuffle clobbers one of its src operands.
  // We need additional instructions to keep sources.
  unsigned NumOfMoves = NumOfMemOps * NumOfShufflesPerStore / 2;
  int Cost = NumOfMemOps * (MemOpCost + NumOfShufflesPerStore * ShuffleCost) +
             NumOfMoves;
  return Cost;
}

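// Illustrative sketch (not part of the original file): the load-side formula
// above with assumed inputs. With a single result (NumOfResults == 1), half
// of the memory ops are expected to fold into shuffles:
//   Cost = NumOfResults * NumOfShufflesPerResult * ShuffleCost
//        + (NumOfMemOps / 2) * MemOpCost + NumOfMoves
static int exampleAVX512InterleavedLoadCost(int NumOfMemOps, int MemOpCost,
                                            int ShuffleCost) {
  int NumOfResults = 1;                                     // assumed
  int NumOfShufflesPerResult = NumOfMemOps > 1 ? NumOfMemOps - 1 : 1;
  int NumOfMoves = 0; // only needed with more than one result
  return NumOfResults * NumOfShufflesPerResult * ShuffleCost +
         (NumOfMemOps / 2) * MemOpCost + NumOfMoves;
}
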
int X86TTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
                                           unsigned Factor,
                                           ArrayRef<unsigned> Indices,
                                           unsigned Alignment,
                                           unsigned AddressSpace) {
  auto isSupportedOnAVX512 = [](Type *VecTy, bool &RequiresBW) {
    RequiresBW = false;
    Type *EltTy = VecTy->getVectorElementType();
    if (EltTy->isFloatTy() || EltTy->isDoubleTy() || EltTy->isIntegerTy(64) ||
        EltTy->isIntegerTy(32) || EltTy->isPointerTy())
      return true;
    if (EltTy->isIntegerTy(16) || EltTy->isIntegerTy(8)) {
      RequiresBW = true;
      return true;
    }
    return false;
  };
  bool RequiresBW;
  bool HasAVX512Solution = isSupportedOnAVX512(VecTy, RequiresBW);
  if (ST->hasAVX512() && HasAVX512Solution && (!RequiresBW || ST->hasBWI()))
    return getInterleavedMemoryOpCostAVX512(Opcode, VecTy, Factor, Indices,
                                            Alignment, AddressSpace);
  if (ST->hasAVX2())
    return getInterleavedMemoryOpCostAVX2(Opcode, VecTy, Factor, Indices,
                                          Alignment, AddressSpace);

  return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
                                           Alignment, AddressSpace);
}