Coverage Report

Created: 2019-07-24 05:18

/Users/buildslave/jenkins/workspace/clang-stage2-coverage-R/llvm/lib/Target/X86/X86TargetTransformInfo.cpp

//===-- X86TargetTransformInfo.cpp - X86 specific TTI pass ----------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
/// This file implements a TargetTransformInfo analysis pass specific to the
/// X86 target machine. It uses the target's detailed information to provide
/// more precise answers to certain TTI queries, while letting the target
/// independent and default TTI implementations handle the rest.
///
//===----------------------------------------------------------------------===//
/// About the cost model numbers used below it's necessary to say the
/// following: the numbers correspond to some "generic" X86 CPU rather than a
/// concrete CPU model. Usually the numbers correspond to the CPU where the
/// feature first appeared. For example, if we do Subtarget.hasSSE42() in
/// the lookups below, the cost is based on Nehalem as that was the first CPU
/// to support that feature level, and thus most likely has the worst-case
/// cost. Some examples of other technologies/CPUs:
///   SSE 3   - Pentium4 / Athlon64
///   SSE 4.1 - Penryn
///   SSE 4.2 - Nehalem
///   AVX     - Sandy Bridge
///   AVX2    - Haswell
///   AVX-512 - Xeon Phi / Skylake
/// And some examples of instruction target dependent costs (latency)
///                   divss     sqrtss          rsqrtss
///   AMD K7            11-16     19              3
///   Piledriver        9-24      13-15           5
///   Jaguar            14        16              2
///   Pentium II,III    18        30              2
///   Nehalem           7-14      7-18            3
///   Haswell           10-13     11              5
/// TODO: Develop and implement the target dependent cost model and
/// specialize cost numbers for different Cost Model Targets such as throughput,
/// code size, latency and uop count.
//===----------------------------------------------------------------------===//

#include "X86TargetTransformInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/CodeGen/BasicTTIImpl.h"
#include "llvm/CodeGen/CostTable.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/Support/Debug.h"

using namespace llvm;

#define DEBUG_TYPE "x86tti"

//===----------------------------------------------------------------------===//
//
// X86 cost model.
//
//===----------------------------------------------------------------------===//

TargetTransformInfo::PopcntSupportKind
X86TTIImpl::getPopcntSupport(unsigned TyWidth) {
  assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
  // TODO: Currently the __builtin_popcount() implementation using SSE3
  //   instructions is inefficient. Once the problem is fixed, we should
  //   call ST->hasSSE3() instead of ST->hasPOPCNT().
  return ST->hasPOPCNT() ? TTI::PSK_FastHardware : TTI::PSK_Software;
}

llvm::Optional<unsigned> X86TTIImpl::getCacheSize(
  TargetTransformInfo::CacheLevel Level) const {
  switch (Level) {
  case TargetTransformInfo::CacheLevel::L1D:
    //   - Penryn
    //   - Nehalem
    //   - Westmere
    //   - Sandy Bridge
    //   - Ivy Bridge
    //   - Haswell
    //   - Broadwell
    //   - Skylake
    //   - Kabylake
    return 32 * 1024;  //  32 KByte
  case TargetTransformInfo::CacheLevel::L2D:
    //   - Penryn
    //   - Nehalem
    //   - Westmere
    //   - Sandy Bridge
    //   - Ivy Bridge
    //   - Haswell
    //   - Broadwell
    //   - Skylake
    //   - Kabylake
    return 256 * 1024; // 256 KByte
  }

  llvm_unreachable("Unknown TargetTransformInfo::CacheLevel");
}

llvm::Optional<unsigned> X86TTIImpl::getCacheAssociativity(
  TargetTransformInfo::CacheLevel Level) const {
  //   - Penryn
  //   - Nehalem
  //   - Westmere
  //   - Sandy Bridge
  //   - Ivy Bridge
  //   - Haswell
  //   - Broadwell
  //   - Skylake
  //   - Kabylake
  switch (Level) {
  case TargetTransformInfo::CacheLevel::L1D:
    LLVM_FALLTHROUGH;
  case TargetTransformInfo::CacheLevel::L2D:
    return 8;
  }

  llvm_unreachable("Unknown TargetTransformInfo::CacheLevel");
}

unsigned X86TTIImpl::getNumberOfRegisters(bool Vector) {
  if (Vector && !ST->hasSSE1())
    return 0;

  if (ST->is64Bit()) {
    if (Vector && ST->hasAVX512())
      return 32;
    return 16;
  }
  return 8;
}
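
// For reference: x86-64 provides 16 general-purpose and 16 XMM/YMM registers
// (hence the two 16s above), AVX-512 widens the vector register file to 32
// (zmm0-zmm31), and 32-bit mode only exposes 8 of each.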

unsigned X86TTIImpl::getRegisterBitWidth(bool Vector) const {
  unsigned PreferVectorWidth = ST->getPreferVectorWidth();
  if (Vector) {
    if (ST->hasAVX512() && PreferVectorWidth >= 512)
      return 512;
    if (ST->hasAVX() && PreferVectorWidth >= 256)
      return 256;
    if (ST->hasSSE1() && PreferVectorWidth >= 128)
      return 128;
    return 0;
  }

  if (ST->is64Bit())
    return 64;

  return 32;
}
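
// Example: on an AVX-512 subtarget built with a preferred vector width of 256
// (e.g. -mprefer-vector-width=256), the 512-bit check fails and this returns
// 256, steering the vectorizers toward YMM-sized operations.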

unsigned X86TTIImpl::getLoadStoreVecRegBitWidth(unsigned) const {
  return getRegisterBitWidth(true);
}

unsigned X86TTIImpl::getMaxInterleaveFactor(unsigned VF) {
  // If the loop will not be vectorized, don't interleave the loop.
  // Let the regular unroller unroll the loop instead, which saves the
  // overflow check and memory check cost.
  if (VF == 1)
    return 1;

  if (ST->isAtom())
    return 1;

  // Sandybridge and Haswell have multiple execution ports and pipelined
  // vector units.
  if (ST->hasAVX())
    return 4;

  return 2;
}
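
// The value returned here bounds the loop vectorizer's interleave count: it
// may emit up to this many independent copies of the vector loop body to
// hide instruction latency, so AVX targets can keep several vector chains
// in flight at once.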

int X86TTIImpl::getArithmeticInstrCost(
    unsigned Opcode, Type *Ty,
    TTI::OperandValueKind Op1Info, TTI::OperandValueKind Op2Info,
    TTI::OperandValueProperties Opd1PropInfo,
    TTI::OperandValueProperties Opd2PropInfo,
    ArrayRef<const Value *> Args) {
  // Legalize the type.
  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);

  int ISD = TLI->InstructionOpcodeToISD(Opcode);
  assert(ISD && "Invalid opcode");

  static const CostTblEntry GLMCostTable[] = {
    { ISD::FDIV,  MVT::f32,   18 }, // divss
    { ISD::FDIV,  MVT::v4f32, 35 }, // divps
    { ISD::FDIV,  MVT::f64,   33 }, // divsd
    { ISD::FDIV,  MVT::v2f64, 65 }, // divpd
  };

  if (ST->isGLM())
    if (const auto *Entry = CostTableLookup(GLMCostTable, ISD,
                                            LT.second))
      return LT.first * Entry->Cost;
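
  // Worked example of the table pattern used throughout this function: an
  // fdiv on <8 x float> legalizes on a 128-bit Goldmont target to
  // LT = {2, v4f32}, the lookup of {ISD::FDIV, v4f32} yields 35, and the
  // reported cost is LT.first * Entry->Cost = 2 * 35 = 70.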

  static const CostTblEntry SLMCostTable[] = {
    { ISD::MUL,   MVT::v4i32, 11 }, // pmulld
    { ISD::MUL,   MVT::v8i16, 2  }, // pmullw
    { ISD::MUL,   MVT::v16i8, 14 }, // extend/pmullw/trunc sequence.
    { ISD::FMUL,  MVT::f64,   2  }, // mulsd
    { ISD::FMUL,  MVT::v2f64, 4  }, // mulpd
    { ISD::FMUL,  MVT::v4f32, 2  }, // mulps
    { ISD::FDIV,  MVT::f32,   17 }, // divss
    { ISD::FDIV,  MVT::v4f32, 39 }, // divps
    { ISD::FDIV,  MVT::f64,   32 }, // divsd
    { ISD::FDIV,  MVT::v2f64, 69 }, // divpd
    { ISD::FADD,  MVT::v2f64, 2  }, // addpd
    { ISD::FSUB,  MVT::v2f64, 2  }, // subpd
    // v2i64/v4i64 mul is custom lowered as a series of long:
    // multiplies(3), shifts(3) and adds(2)
    // slm muldq version throughput is 2 and addq throughput 4
    // thus: 3X2 (muldq throughput) + 3X1 (shift throughput) +
    //       2X4 (addq throughput) = 17
    { ISD::MUL,   MVT::v2i64, 17 },
    // slm addq\subq throughput is 4
    { ISD::ADD,   MVT::v2i64, 4  },
    { ISD::SUB,   MVT::v2i64, 4  },
  };

  if (ST->isSLM()) {
    if (Args.size() == 2 && ISD == ISD::MUL && LT.second == MVT::v4i32) {
      // Check if the operands can be shrunk into a smaller datatype.
      bool Op1Signed = false;
      unsigned Op1MinSize = BaseT::minRequiredElementSize(Args[0], Op1Signed);
      bool Op2Signed = false;
      unsigned Op2MinSize = BaseT::minRequiredElementSize(Args[1], Op2Signed);

      bool signedMode = Op1Signed | Op2Signed;
      unsigned OpMinSize = std::max(Op1MinSize, Op2MinSize);

      if (OpMinSize <= 7)
        return LT.first * 3; // pmullw/sext
      if (!signedMode && OpMinSize <= 8)
        return LT.first * 3; // pmullw/zext
      if (OpMinSize <= 15)
        return LT.first * 5; // pmullw/pmulhw/pshuf
      if (!signedMode && OpMinSize <= 16)
        return LT.first * 5; // pmullw/pmulhw/pshuf
    }
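
    // Rationale for the thresholds above: a lane whose value provably fits
    // in at most 15 signed (or 16 unsigned) bits can be multiplied with
    // 16-bit pmullw/pmulhw instead of SLM's slow pmulld; e.g. operands known
    // to need at most 8 unsigned bits cost 3 (zext/pmullw/trunc) rather than
    // the 11 the table assigns to pmulld.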

    if (const auto *Entry = CostTableLookup(SLMCostTable, ISD,
                                            LT.second)) {
      return LT.first * Entry->Cost;
    }
  }

  if ((ISD == ISD::SDIV || ISD == ISD::SREM || ISD == ISD::UDIV ||
       ISD == ISD::UREM) &&
      (Op2Info == TargetTransformInfo::OK_UniformConstantValue ||
       Op2Info == TargetTransformInfo::OK_NonUniformConstantValue) &&
      Opd2PropInfo == TargetTransformInfo::OP_PowerOf2) {
    if (ISD == ISD::SDIV || ISD == ISD::SREM) {
      // On X86, vector signed division by a power-of-two constant is
      // normally expanded to the sequence SRA + SRL + ADD + SRA.
      // The OperandValue properties may not be the same as that of the previous
      // operation; conservatively assume OP_None.
      int Cost =
          2 * getArithmeticInstrCost(Instruction::AShr, Ty, Op1Info, Op2Info,
                                     TargetTransformInfo::OP_None,
                                     TargetTransformInfo::OP_None);
      Cost += getArithmeticInstrCost(Instruction::LShr, Ty, Op1Info, Op2Info,
                                     TargetTransformInfo::OP_None,
                                     TargetTransformInfo::OP_None);
      Cost += getArithmeticInstrCost(Instruction::Add, Ty, Op1Info, Op2Info,
                                     TargetTransformInfo::OP_None,
                                     TargetTransformInfo::OP_None);

      if (ISD == ISD::SREM) {
        // For SREM: (X % C) is the equivalent of (X - (X/C)*C)
        Cost += getArithmeticInstrCost(Instruction::Mul, Ty, Op1Info, Op2Info);
        Cost += getArithmeticInstrCost(Instruction::Sub, Ty, Op1Info, Op2Info);
      }

      return Cost;
    }

    // Vector unsigned division/remainder will be simplified to shifts/masks.
    if (ISD == ISD::UDIV)
      return getArithmeticInstrCost(Instruction::LShr, Ty, Op1Info, Op2Info,
                                    TargetTransformInfo::OP_None,
                                    TargetTransformInfo::OP_None);

    if (ISD == ISD::UREM)
      return getArithmeticInstrCost(Instruction::And, Ty, Op1Info, Op2Info,
                                    TargetTransformInfo::OP_None,
                                    TargetTransformInfo::OP_None);
  }
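
  // Example of the expansion costed above: 'sdiv <4 x i32> %X, 8' becomes
  // roughly
  //   %s = ashr %X, 31    ; broadcast the sign bit
  //   %b = lshr %s, 29    ; bias = sign ? 8-1 : 0  (29 == 32 - log2(8))
  //   %a = add  %X, %b    ; round negative inputs toward zero
  //   %r = ashr %a, 3     ; log2(8)
  // i.e. 2*SRA + SRL + ADD, with SREM additionally paying for the
  // (X - (X/C)*C) multiply/subtract pair.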

  static const CostTblEntry AVX512BWUniformConstCostTable[] = {
    { ISD::SHL,  MVT::v64i8,   2 }, // psllw + pand.
    { ISD::SRL,  MVT::v64i8,   2 }, // psrlw + pand.
    { ISD::SRA,  MVT::v64i8,   4 }, // psrlw, pand, pxor, psubb.
  };

  if (Op2Info == TargetTransformInfo::OK_UniformConstantValue &&
      ST->hasBWI()) {
    if (const auto *Entry = CostTableLookup(AVX512BWUniformConstCostTable, ISD,
                                            LT.second))
      return LT.first * Entry->Cost;
  }

  static const CostTblEntry AVX512UniformConstCostTable[] = {
    { ISD::SRA,  MVT::v2i64,   1 },
    { ISD::SRA,  MVT::v4i64,   1 },
    { ISD::SRA,  MVT::v8i64,   1 },
  };

  if (Op2Info == TargetTransformInfo::OK_UniformConstantValue &&
      ST->hasAVX512()) {
    if (const auto *Entry = CostTableLookup(AVX512UniformConstCostTable, ISD,
                                            LT.second))
      return LT.first * Entry->Cost;
  }

  static const CostTblEntry AVX2UniformConstCostTable[] = {
    { ISD::SHL,  MVT::v32i8,   2 }, // psllw + pand.
    { ISD::SRL,  MVT::v32i8,   2 }, // psrlw + pand.
    { ISD::SRA,  MVT::v32i8,   4 }, // psrlw, pand, pxor, psubb.

    { ISD::SRA,  MVT::v4i64,   4 }, // 2 x psrad + shuffle.
  };

  if (Op2Info == TargetTransformInfo::OK_UniformConstantValue &&
      ST->hasAVX2()) {
    if (const auto *Entry = CostTableLookup(AVX2UniformConstCostTable, ISD,
                                            LT.second))
      return LT.first * Entry->Cost;
  }

  static const CostTblEntry SSE2UniformConstCostTable[] = {
    { ISD::SHL,  MVT::v16i8,     2 }, // psllw + pand.
    { ISD::SRL,  MVT::v16i8,     2 }, // psrlw + pand.
    { ISD::SRA,  MVT::v16i8,     4 }, // psrlw, pand, pxor, psubb.

    { ISD::SHL,  MVT::v32i8,   4+2 }, // 2*(psllw + pand) + split.
    { ISD::SRL,  MVT::v32i8,   4+2 }, // 2*(psrlw + pand) + split.
    { ISD::SRA,  MVT::v32i8,   8+2 }, // 2*(psrlw, pand, pxor, psubb) + split.
  };

  // XOP has faster vXi8 shifts.
  if (Op2Info == TargetTransformInfo::OK_UniformConstantValue &&
      ST->hasSSE2() && !ST->hasXOP()) {
    if (const auto *Entry =
            CostTableLookup(SSE2UniformConstCostTable, ISD, LT.second))
      return LT.first * Entry->Cost;
  }
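
  // The psllw/psrlw + pand entries above reflect that x86 has no 8-bit
  // vector shift: a uniform vXi8 shift is emulated with a 16-bit shift plus
  // a mask that clears the bits leaking across byte-lane boundaries, and the
  // arithmetic variant restores the sign bits with the pxor/psubb trick.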

  static const CostTblEntry AVX512BWConstCostTable[] = {
    { ISD::SDIV, MVT::v64i8,  14 }, // 2*ext+2*pmulhw sequence
    { ISD::SREM, MVT::v64i8,  16 }, // 2*ext+2*pmulhw+mul+sub sequence
    { ISD::UDIV, MVT::v64i8,  14 }, // 2*ext+2*pmulhw sequence
    { ISD::UREM, MVT::v64i8,  16 }, // 2*ext+2*pmulhw+mul+sub sequence
    { ISD::SDIV, MVT::v32i16,  6 }, // vpmulhw sequence
    { ISD::SREM, MVT::v32i16,  8 }, // vpmulhw+mul+sub sequence
    { ISD::UDIV, MVT::v32i16,  6 }, // vpmulhuw sequence
    { ISD::UREM, MVT::v32i16,  8 }, // vpmulhuw+mul+sub sequence
  };

  if ((Op2Info == TargetTransformInfo::OK_UniformConstantValue ||
       Op2Info == TargetTransformInfo::OK_NonUniformConstantValue) &&
      ST->hasBWI()) {
    if (const auto *Entry =
            CostTableLookup(AVX512BWConstCostTable, ISD, LT.second))
      return LT.first * Entry->Cost;
  }

  static const CostTblEntry AVX512ConstCostTable[] = {
    { ISD::SDIV, MVT::v16i32, 15 }, // vpmuldq sequence
    { ISD::SREM, MVT::v16i32, 17 }, // vpmuldq+mul+sub sequence
    { ISD::UDIV, MVT::v16i32, 15 }, // vpmuludq sequence
    { ISD::UREM, MVT::v16i32, 17 }, // vpmuludq+mul+sub sequence
  };

  if ((Op2Info == TargetTransformInfo::OK_UniformConstantValue ||
       Op2Info == TargetTransformInfo::OK_NonUniformConstantValue) &&
      ST->hasAVX512()) {
    if (const auto *Entry =
            CostTableLookup(AVX512ConstCostTable, ISD, LT.second))
      return LT.first * Entry->Cost;
  }

  static const CostTblEntry AVX2ConstCostTable[] = {
    { ISD::SDIV, MVT::v32i8,  14 }, // 2*ext+2*pmulhw sequence
    { ISD::SREM, MVT::v32i8,  16 }, // 2*ext+2*pmulhw+mul+sub sequence
    { ISD::UDIV, MVT::v32i8,  14 }, // 2*ext+2*pmulhw sequence
    { ISD::UREM, MVT::v32i8,  16 }, // 2*ext+2*pmulhw+mul+sub sequence
    { ISD::SDIV, MVT::v16i16,  6 }, // vpmulhw sequence
    { ISD::SREM, MVT::v16i16,  8 }, // vpmulhw+mul+sub sequence
    { ISD::UDIV, MVT::v16i16,  6 }, // vpmulhuw sequence
    { ISD::UREM, MVT::v16i16,  8 }, // vpmulhuw+mul+sub sequence
    { ISD::SDIV, MVT::v8i32,  15 }, // vpmuldq sequence
    { ISD::SREM, MVT::v8i32,  19 }, // vpmuldq+mul+sub sequence
    { ISD::UDIV, MVT::v8i32,  15 }, // vpmuludq sequence
    { ISD::UREM, MVT::v8i32,  19 }, // vpmuludq+mul+sub sequence
  };

  if ((Op2Info == TargetTransformInfo::OK_UniformConstantValue ||
       Op2Info == TargetTransformInfo::OK_NonUniformConstantValue) &&
      ST->hasAVX2()) {
    if (const auto *Entry = CostTableLookup(AVX2ConstCostTable, ISD, LT.second))
      return LT.first * Entry->Cost;
  }

  static const CostTblEntry SSE2ConstCostTable[] = {
    { ISD::SDIV, MVT::v32i8,  28+2 }, // 4*ext+4*pmulhw sequence + split.
    { ISD::SREM, MVT::v32i8,  32+2 }, // 4*ext+4*pmulhw+mul+sub sequence + split.
    { ISD::SDIV, MVT::v16i8,    14 }, // 2*ext+2*pmulhw sequence
    { ISD::SREM, MVT::v16i8,    16 }, // 2*ext+2*pmulhw+mul+sub sequence
    { ISD::UDIV, MVT::v32i8,  28+2 }, // 4*ext+4*pmulhw sequence + split.
    { ISD::UREM, MVT::v32i8,  32+2 }, // 4*ext+4*pmulhw+mul+sub sequence + split.
    { ISD::UDIV, MVT::v16i8,    14 }, // 2*ext+2*pmulhw sequence
    { ISD::UREM, MVT::v16i8,    16 }, // 2*ext+2*pmulhw+mul+sub sequence
    { ISD::SDIV, MVT::v16i16, 12+2 }, // 2*pmulhw sequence + split.
    { ISD::SREM, MVT::v16i16, 16+2 }, // 2*pmulhw+mul+sub sequence + split.
    { ISD::SDIV, MVT::v8i16,     6 }, // pmulhw sequence
    { ISD::SREM, MVT::v8i16,     8 }, // pmulhw+mul+sub sequence
    { ISD::UDIV, MVT::v16i16, 12+2 }, // 2*pmulhuw sequence + split.
    { ISD::UREM, MVT::v16i16, 16+2 }, // 2*pmulhuw+mul+sub sequence + split.
    { ISD::UDIV, MVT::v8i16,     6 }, // pmulhuw sequence
    { ISD::UREM, MVT::v8i16,     8 }, // pmulhuw+mul+sub sequence
    { ISD::SDIV, MVT::v8i32,  38+2 }, // 2*pmuludq sequence + split.
    { ISD::SREM, MVT::v8i32,  48+2 }, // 2*pmuludq+mul+sub sequence + split.
    { ISD::SDIV, MVT::v4i32,    19 }, // pmuludq sequence
    { ISD::SREM, MVT::v4i32,    24 }, // pmuludq+mul+sub sequence
    { ISD::UDIV, MVT::v8i32,  30+2 }, // 2*pmuludq sequence + split.
    { ISD::UREM, MVT::v8i32,  40+2 }, // 2*pmuludq+mul+sub sequence + split.
    { ISD::UDIV, MVT::v4i32,    15 }, // pmuludq sequence
    { ISD::UREM, MVT::v4i32,    20 }, // pmuludq+mul+sub sequence
  };

  if ((Op2Info == TargetTransformInfo::OK_UniformConstantValue ||
       Op2Info == TargetTransformInfo::OK_NonUniformConstantValue) &&
      ST->hasSSE2()) {
    // pmuldq sequence.
    if (ISD == ISD::SDIV && LT.second == MVT::v8i32 && ST->hasAVX())
      return LT.first * 32;
    if (ISD == ISD::SREM && LT.second == MVT::v8i32 && ST->hasAVX())
      return LT.first * 38;
    if (ISD == ISD::SDIV && LT.second == MVT::v4i32 && ST->hasSSE41())
      return LT.first * 15;
    if (ISD == ISD::SREM && LT.second == MVT::v4i32 && ST->hasSSE41())
      return LT.first * 20;

    if (const auto *Entry = CostTableLookup(SSE2ConstCostTable, ISD, LT.second))
      return LT.first * Entry->Cost;
  }
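
  // These constant-divisor entries assume the usual strength reduction of a
  // division by a constant into a widening multiply by a fixed-point
  // reciprocal (pmulh(u)w / pmul(u)dq) plus shift fixups; the SREM/UREM rows
  // additionally pay for the X - (X/C)*C multiply and subtract.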

  static const CostTblEntry AVX2UniformCostTable[] = {
    // Uniform splats are cheaper for the following instructions.
    { ISD::SHL,  MVT::v16i16, 1 }, // psllw.
    { ISD::SRL,  MVT::v16i16, 1 }, // psrlw.
    { ISD::SRA,  MVT::v16i16, 1 }, // psraw.
  };

  if (ST->hasAVX2() &&
      ((Op2Info == TargetTransformInfo::OK_UniformConstantValue) ||
       (Op2Info == TargetTransformInfo::OK_UniformValue))) {
    if (const auto *Entry =
            CostTableLookup(AVX2UniformCostTable, ISD, LT.second))
      return LT.first * Entry->Cost;
  }

  static const CostTblEntry SSE2UniformCostTable[] = {
    // Uniform splats are cheaper for the following instructions.
    { ISD::SHL,  MVT::v8i16,  1 }, // psllw.
    { ISD::SHL,  MVT::v4i32,  1 }, // pslld
    { ISD::SHL,  MVT::v2i64,  1 }, // psllq.

    { ISD::SRL,  MVT::v8i16,  1 }, // psrlw.
    { ISD::SRL,  MVT::v4i32,  1 }, // psrld.
    { ISD::SRL,  MVT::v2i64,  1 }, // psrlq.

    { ISD::SRA,  MVT::v8i16,  1 }, // psraw.
    { ISD::SRA,  MVT::v4i32,  1 }, // psrad.
  };

  if (ST->hasSSE2() &&
      ((Op2Info == TargetTransformInfo::OK_UniformConstantValue) ||
       (Op2Info == TargetTransformInfo::OK_UniformValue))) {
    if (const auto *Entry =
            CostTableLookup(SSE2UniformCostTable, ISD, LT.second))
      return LT.first * Entry->Cost;
  }
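
  // Uniform (splat) shift amounts are cheap because psllw/pslld/psllq and
  // friends take a single count for all lanes, either as an immediate or
  // from the low bits of an XMM register, so no per-element emulation is
  // required.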

  static const CostTblEntry AVX512DQCostTable[] = {
    { ISD::MUL,  MVT::v2i64, 1 },
    { ISD::MUL,  MVT::v4i64, 1 },
    { ISD::MUL,  MVT::v8i64, 1 }
  };

  // Look for AVX512DQ lowering tricks for custom cases.
  if (ST->hasDQI())
    if (const auto *Entry = CostTableLookup(AVX512DQCostTable, ISD, LT.second))
      return LT.first * Entry->Cost;

  static const CostTblEntry AVX512BWCostTable[] = {
    { ISD::SHL,   MVT::v8i16,      1 }, // vpsllvw
    { ISD::SRL,   MVT::v8i16,      1 }, // vpsrlvw
    { ISD::SRA,   MVT::v8i16,      1 }, // vpsravw

    { ISD::SHL,   MVT::v16i16,     1 }, // vpsllvw
    { ISD::SRL,   MVT::v16i16,     1 }, // vpsrlvw
    { ISD::SRA,   MVT::v16i16,     1 }, // vpsravw

    { ISD::SHL,   MVT::v32i16,     1 }, // vpsllvw
    { ISD::SRL,   MVT::v32i16,     1 }, // vpsrlvw
    { ISD::SRA,   MVT::v32i16,     1 }, // vpsravw

    { ISD::SHL,   MVT::v64i8,     11 }, // vpblendvb sequence.
    { ISD::SRL,   MVT::v64i8,     11 }, // vpblendvb sequence.
    { ISD::SRA,   MVT::v64i8,     24 }, // vpblendvb sequence.

    { ISD::MUL,   MVT::v64i8,     11 }, // extend/pmullw/trunc sequence.
    { ISD::MUL,   MVT::v32i8,      4 }, // extend/pmullw/trunc sequence.
    { ISD::MUL,   MVT::v16i8,      4 }, // extend/pmullw/trunc sequence.
  };

  // Look for AVX512BW lowering tricks for custom cases.
  if (ST->hasBWI())
    if (const auto *Entry = CostTableLookup(AVX512BWCostTable, ISD, LT.second))
      return LT.first * Entry->Cost;

  static const CostTblEntry AVX512CostTable[] = {
    { ISD::SHL,     MVT::v16i32,     1 },
    { ISD::SRL,     MVT::v16i32,     1 },
    { ISD::SRA,     MVT::v16i32,     1 },

    { ISD::SHL,     MVT::v8i64,      1 },
    { ISD::SRL,     MVT::v8i64,      1 },

    { ISD::SRA,     MVT::v2i64,      1 },
    { ISD::SRA,     MVT::v4i64,      1 },
    { ISD::SRA,     MVT::v8i64,      1 },

    { ISD::MUL,     MVT::v32i8,     13 }, // extend/pmullw/trunc sequence.
    { ISD::MUL,     MVT::v16i8,      5 }, // extend/pmullw/trunc sequence.
    { ISD::MUL,     MVT::v16i32,     1 }, // pmulld (Skylake from agner.org)
    { ISD::MUL,     MVT::v8i32,      1 }, // pmulld (Skylake from agner.org)
    { ISD::MUL,     MVT::v4i32,      1 }, // pmulld (Skylake from agner.org)
    { ISD::MUL,     MVT::v8i64,      8 }, // 3*pmuludq/3*shift/2*add

    { ISD::FADD,    MVT::v8f64,      1 }, // Skylake from http://www.agner.org/
    { ISD::FSUB,    MVT::v8f64,      1 }, // Skylake from http://www.agner.org/
    { ISD::FMUL,    MVT::v8f64,      1 }, // Skylake from http://www.agner.org/

    { ISD::FADD,    MVT::v16f32,     1 }, // Skylake from http://www.agner.org/
    { ISD::FSUB,    MVT::v16f32,     1 }, // Skylake from http://www.agner.org/
    { ISD::FMUL,    MVT::v16f32,     1 }, // Skylake from http://www.agner.org/
  };

  if (ST->hasAVX512())
    if (const auto *Entry = CostTableLookup(AVX512CostTable, ISD, LT.second))
      return LT.first * Entry->Cost;

  static const CostTblEntry AVX2ShiftCostTable[] = {
    // Shifts on v4i64/v8i32 are legal on AVX2 even though we declare them as
    // custom so we can detect the cases where the shift amount is a scalar.
    { ISD::SHL,     MVT::v4i32,    1 },
    { ISD::SRL,     MVT::v4i32,    1 },
    { ISD::SRA,     MVT::v4i32,    1 },
    { ISD::SHL,     MVT::v8i32,    1 },
    { ISD::SRL,     MVT::v8i32,    1 },
    { ISD::SRA,     MVT::v8i32,    1 },
    { ISD::SHL,     MVT::v2i64,    1 },
    { ISD::SRL,     MVT::v2i64,    1 },
    { ISD::SHL,     MVT::v4i64,    1 },
    { ISD::SRL,     MVT::v4i64,    1 },
  };

  // Look for AVX2 lowering tricks.
  if (ST->hasAVX2()) {
    if (ISD == ISD::SHL && LT.second == MVT::v16i16 &&
        (Op2Info == TargetTransformInfo::OK_UniformConstantValue ||
         Op2Info == TargetTransformInfo::OK_NonUniformConstantValue))
      // On AVX2, a packed v16i16 shift left by a constant build_vector
      // is lowered into a vector multiply (vpmullw).
      return getArithmeticInstrCost(Instruction::Mul, Ty, Op1Info, Op2Info,
                                    TargetTransformInfo::OP_None,
                                    TargetTransformInfo::OP_None);

    if (const auto *Entry = CostTableLookup(AVX2ShiftCostTable, ISD, LT.second))
      return LT.first * Entry->Cost;
  }
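
  // The v16i16 special case above rests on the identity x << c == x * (1 << c):
  // with a constant build_vector of shift amounts the whole shift becomes a
  // single vpmullw, so the query is re-costed as a multiply.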

  static const CostTblEntry XOPShiftCostTable[] = {
    // 128bit shifts take 1cy, but right shifts require negation beforehand.
    { ISD::SHL,     MVT::v16i8,    1 },
    { ISD::SRL,     MVT::v16i8,    2 },
    { ISD::SRA,     MVT::v16i8,    2 },
    { ISD::SHL,     MVT::v8i16,    1 },
    { ISD::SRL,     MVT::v8i16,    2 },
    { ISD::SRA,     MVT::v8i16,    2 },
    { ISD::SHL,     MVT::v4i32,    1 },
    { ISD::SRL,     MVT::v4i32,    2 },
    { ISD::SRA,     MVT::v4i32,    2 },
    { ISD::SHL,     MVT::v2i64,    1 },
    { ISD::SRL,     MVT::v2i64,    2 },
    { ISD::SRA,     MVT::v2i64,    2 },
    // 256bit shifts require splitting if AVX2 didn't catch them above.
    { ISD::SHL,     MVT::v32i8,  2+2 },
    { ISD::SRL,     MVT::v32i8,  4+2 },
    { ISD::SRA,     MVT::v32i8,  4+2 },
    { ISD::SHL,     MVT::v16i16, 2+2 },
    { ISD::SRL,     MVT::v16i16, 4+2 },
    { ISD::SRA,     MVT::v16i16, 4+2 },
    { ISD::SHL,     MVT::v8i32,  2+2 },
    { ISD::SRL,     MVT::v8i32,  4+2 },
    { ISD::SRA,     MVT::v8i32,  4+2 },
    { ISD::SHL,     MVT::v4i64,  2+2 },
    { ISD::SRL,     MVT::v4i64,  4+2 },
    { ISD::SRA,     MVT::v4i64,  4+2 },
  };

  // Look for XOP lowering tricks.
  if (ST->hasXOP()) {
    // If the right shift is constant then we'll fold the negation so
    // it's as cheap as a left shift.
    int ShiftISD = ISD;
    if ((ShiftISD == ISD::SRL || ShiftISD == ISD::SRA) &&
        (Op2Info == TargetTransformInfo::OK_UniformConstantValue ||
         Op2Info == TargetTransformInfo::OK_NonUniformConstantValue))
      ShiftISD = ISD::SHL;
    if (const auto *Entry =
            CostTableLookup(XOPShiftCostTable, ShiftISD, LT.second))
      return LT.first * Entry->Cost;
  }
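
  // XOP's vpshl/vpsha instructions shift left for positive per-lane amounts
  // and right for negative ones; a constant right-shift amount can be
  // negated at compile time, which is why ShiftISD is canonicalized to
  // ISD::SHL above.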

  static const CostTblEntry SSE2UniformShiftCostTable[] = {
    // Uniform splats are cheaper for the following instructions.
    { ISD::SHL,  MVT::v16i16, 2+2 }, // 2*psllw + split.
    { ISD::SHL,  MVT::v8i32,  2+2 }, // 2*pslld + split.
    { ISD::SHL,  MVT::v4i64,  2+2 }, // 2*psllq + split.

    { ISD::SRL,  MVT::v16i16, 2+2 }, // 2*psrlw + split.
    { ISD::SRL,  MVT::v8i32,  2+2 }, // 2*psrld + split.
    { ISD::SRL,  MVT::v4i64,  2+2 }, // 2*psrlq + split.

    { ISD::SRA,  MVT::v16i16, 2+2 }, // 2*psraw + split.
    { ISD::SRA,  MVT::v8i32,  2+2 }, // 2*psrad + split.
    { ISD::SRA,  MVT::v2i64,    4 }, // 2*psrad + shuffle.
    { ISD::SRA,  MVT::v4i64,  8+2 }, // 2*(2*psrad + shuffle) + split.
  };

  if (ST->hasSSE2() &&
      ((Op2Info == TargetTransformInfo::OK_UniformConstantValue) ||
       (Op2Info == TargetTransformInfo::OK_UniformValue))) {

    // Handle AVX2 uniform v4i64 ISD::SRA, it's not worth a table.
    if (ISD == ISD::SRA && LT.second == MVT::v4i64 && ST->hasAVX2())
      return LT.first * 4; // 2*psrad + shuffle.

    if (const auto *Entry =
            CostTableLookup(SSE2UniformShiftCostTable, ISD, LT.second))
      return LT.first * Entry->Cost;
  }

  if (ISD == ISD::SHL &&
      Op2Info == TargetTransformInfo::OK_NonUniformConstantValue) {
    MVT VT = LT.second;
    // Vector shift left by a non-uniform constant can be lowered
    // into a vector multiply.
    if (((VT == MVT::v8i16 || VT == MVT::v4i32) && ST->hasSSE2()) ||
        ((VT == MVT::v16i16 || VT == MVT::v8i32) && ST->hasAVX()))
      ISD = ISD::MUL;
  }

  static const CostTblEntry AVX2CostTable[] = {
    { ISD::SHL,  MVT::v32i8,     11 }, // vpblendvb sequence.
    { ISD::SHL,  MVT::v16i16,    10 }, // extend/vpsrlvd/pack sequence.

    { ISD::SRL,  MVT::v32i8,     11 }, // vpblendvb sequence.
    { ISD::SRL,  MVT::v16i16,    10 }, // extend/vpsrlvd/pack sequence.

    { ISD::SRA,  MVT::v32i8,     24 }, // vpblendvb sequence.
    { ISD::SRA,  MVT::v16i16,    10 }, // extend/vpsravd/pack sequence.
    { ISD::SRA,  MVT::v2i64,      4 }, // srl/xor/sub sequence.
    { ISD::SRA,  MVT::v4i64,      4 }, // srl/xor/sub sequence.

    { ISD::SUB,  MVT::v32i8,      1 }, // psubb
    { ISD::ADD,  MVT::v32i8,      1 }, // paddb
    { ISD::SUB,  MVT::v16i16,     1 }, // psubw
    { ISD::ADD,  MVT::v16i16,     1 }, // paddw
    { ISD::SUB,  MVT::v8i32,      1 }, // psubd
    { ISD::ADD,  MVT::v8i32,      1 }, // paddd
    { ISD::SUB,  MVT::v4i64,      1 }, // psubq
    { ISD::ADD,  MVT::v4i64,      1 }, // paddq

    { ISD::MUL,  MVT::v32i8,     17 }, // extend/pmullw/trunc sequence.
    { ISD::MUL,  MVT::v16i8,      7 }, // extend/pmullw/trunc sequence.
    { ISD::MUL,  MVT::v16i16,     1 }, // pmullw
    { ISD::MUL,  MVT::v8i32,      2 }, // pmulld (Haswell from agner.org)
    { ISD::MUL,  MVT::v4i64,      8 }, // 3*pmuludq/3*shift/2*add

    { ISD::FADD, MVT::v4f64,      1 }, // Haswell from http://www.agner.org/
    { ISD::FADD, MVT::v8f32,      1 }, // Haswell from http://www.agner.org/
    { ISD::FSUB, MVT::v4f64,      1 }, // Haswell from http://www.agner.org/
    { ISD::FSUB, MVT::v8f32,      1 }, // Haswell from http://www.agner.org/
    { ISD::FMUL, MVT::v4f64,      1 }, // Haswell from http://www.agner.org/
    { ISD::FMUL, MVT::v8f32,      1 }, // Haswell from http://www.agner.org/

    { ISD::FDIV, MVT::f32,        7 }, // Haswell from http://www.agner.org/
    { ISD::FDIV, MVT::v4f32,      7 }, // Haswell from http://www.agner.org/
    { ISD::FDIV, MVT::v8f32,     14 }, // Haswell from http://www.agner.org/
    { ISD::FDIV, MVT::f64,       14 }, // Haswell from http://www.agner.org/
    { ISD::FDIV, MVT::v2f64,     14 }, // Haswell from http://www.agner.org/
    { ISD::FDIV, MVT::v4f64,     28 }, // Haswell from http://www.agner.org/
  };

  // Look for AVX2 lowering tricks for custom cases.
  if (ST->hasAVX2())
    if (const auto *Entry = CostTableLookup(AVX2CostTable, ISD, LT.second))
      return LT.first * Entry->Cost;

  static const CostTblEntry AVX1CostTable[] = {
    // We don't have to scalarize unsupported ops. We can issue two half-sized
    // operations and we only need to extract the upper YMM half.
    // Two ops + 1 extract + 1 insert = 4.
    { ISD::MUL,     MVT::v16i16,     4 },
    { ISD::MUL,     MVT::v8i32,      4 },
    { ISD::SUB,     MVT::v32i8,      4 },
    { ISD::ADD,     MVT::v32i8,      4 },
    { ISD::SUB,     MVT::v16i16,     4 },
    { ISD::ADD,     MVT::v16i16,     4 },
    { ISD::SUB,     MVT::v8i32,      4 },
    { ISD::ADD,     MVT::v8i32,      4 },
    { ISD::SUB,     MVT::v4i64,      4 },
    { ISD::ADD,     MVT::v4i64,      4 },

    // A v4i64 multiply is custom lowered as two split v2i64 vectors that then
    // are lowered as a series of long multiplies(3), shifts(3) and adds(2)
    // Because we believe v4i64 to be a legal type, we must also include the
    // extract+insert in the cost table. Therefore, the cost here is 18
    // instead of 8.
    { ISD::MUL,     MVT::v4i64,     18 },

    { ISD::MUL,     MVT::v32i8,     26 }, // extend/pmullw/trunc sequence.

    { ISD::FDIV,    MVT::f32,       14 }, // SNB from http://www.agner.org/
    { ISD::FDIV,    MVT::v4f32,     14 }, // SNB from http://www.agner.org/
    { ISD::FDIV,    MVT::v8f32,     28 }, // SNB from http://www.agner.org/
    { ISD::FDIV,    MVT::f64,       22 }, // SNB from http://www.agner.org/
    { ISD::FDIV,    MVT::v2f64,     22 }, // SNB from http://www.agner.org/
    { ISD::FDIV,    MVT::v4f64,     44 }, // SNB from http://www.agner.org/
  };

  if (ST->hasAVX())
    if (const auto *Entry = CostTableLookup(AVX1CostTable, ISD, LT.second))
      return LT.first * Entry->Cost;

  static const CostTblEntry SSE42CostTable[] = {
    { ISD::FADD, MVT::f64,     1 }, // Nehalem from http://www.agner.org/
    { ISD::FADD, MVT::f32,     1 }, // Nehalem from http://www.agner.org/
    { ISD::FADD, MVT::v2f64,   1 }, // Nehalem from http://www.agner.org/
    { ISD::FADD, MVT::v4f32,   1 }, // Nehalem from http://www.agner.org/

    { ISD::FSUB, MVT::f64,     1 }, // Nehalem from http://www.agner.org/
    { ISD::FSUB, MVT::f32,     1 }, // Nehalem from http://www.agner.org/
    { ISD::FSUB, MVT::v2f64,   1 }, // Nehalem from http://www.agner.org/
    { ISD::FSUB, MVT::v4f32,   1 }, // Nehalem from http://www.agner.org/

    { ISD::FMUL, MVT::f64,     1 }, // Nehalem from http://www.agner.org/
    { ISD::FMUL, MVT::f32,     1 }, // Nehalem from http://www.agner.org/
    { ISD::FMUL, MVT::v2f64,   1 }, // Nehalem from http://www.agner.org/
    { ISD::FMUL, MVT::v4f32,   1 }, // Nehalem from http://www.agner.org/

    { ISD::FDIV,  MVT::f32,   14 }, // Nehalem from http://www.agner.org/
    { ISD::FDIV,  MVT::v4f32, 14 }, // Nehalem from http://www.agner.org/
    { ISD::FDIV,  MVT::f64,   22 }, // Nehalem from http://www.agner.org/
    { ISD::FDIV,  MVT::v2f64, 22 }, // Nehalem from http://www.agner.org/
  };

  if (ST->hasSSE42())
    if (const auto *Entry = CostTableLookup(SSE42CostTable, ISD, LT.second))
      return LT.first * Entry->Cost;

  static const CostTblEntry SSE41CostTable[] = {
    { ISD::SHL,  MVT::v16i8,      11 }, // pblendvb sequence.
    { ISD::SHL,  MVT::v32i8,  2*11+2 }, // pblendvb sequence + split.
    { ISD::SHL,  MVT::v8i16,      14 }, // pblendvb sequence.
    { ISD::SHL,  MVT::v16i16, 2*14+2 }, // pblendvb sequence + split.
    { ISD::SHL,  MVT::v4i32,       4 }, // pslld/paddd/cvttps2dq/pmulld
    { ISD::SHL,  MVT::v8i32,   2*4+2 }, // pslld/paddd/cvttps2dq/pmulld + split

    { ISD::SRL,  MVT::v16i8,      12 }, // pblendvb sequence.
    { ISD::SRL,  MVT::v32i8,  2*12+2 }, // pblendvb sequence + split.
    { ISD::SRL,  MVT::v8i16,      14 }, // pblendvb sequence.
    { ISD::SRL,  MVT::v16i16, 2*14+2 }, // pblendvb sequence + split.
    { ISD::SRL,  MVT::v4i32,      11 }, // Shift each lane + blend.
    { ISD::SRL,  MVT::v8i32,  2*11+2 }, // Shift each lane + blend + split.

    { ISD::SRA,  MVT::v16i8,      24 }, // pblendvb sequence.
    { ISD::SRA,  MVT::v32i8,  2*24+2 }, // pblendvb sequence + split.
    { ISD::SRA,  MVT::v8i16,      14 }, // pblendvb sequence.
    { ISD::SRA,  MVT::v16i16, 2*14+2 }, // pblendvb sequence + split.
    { ISD::SRA,  MVT::v4i32,      12 }, // Shift each lane + blend.
    { ISD::SRA,  MVT::v8i32,  2*12+2 }, // Shift each lane + blend + split.

    { ISD::MUL,  MVT::v4i32,       2 }  // pmulld (Nehalem from agner.org)
  };

  if (ST->hasSSE41())
    if (const auto *Entry = CostTableLookup(SSE41CostTable, ISD, LT.second))
      return LT.first * Entry->Cost;

  static const CostTblEntry SSE2CostTable[] = {
    // We don't correctly identify costs of casts because they are marked as
    // custom.
    { ISD::SHL,  MVT::v16i8,      26 }, // cmpgtb sequence.
    { ISD::SHL,  MVT::v8i16,      32 }, // cmpgtb sequence.
    { ISD::SHL,  MVT::v4i32,     2*5 }, // We optimized this using mul.
    { ISD::SHL,  MVT::v2i64,       4 }, // splat+shuffle sequence.
    { ISD::SHL,  MVT::v4i64,   2*4+2 }, // splat+shuffle sequence + split.

    { ISD::SRL,  MVT::v16i8,      26 }, // cmpgtb sequence.
    { ISD::SRL,  MVT::v8i16,      32 }, // cmpgtb sequence.
    { ISD::SRL,  MVT::v4i32,      16 }, // Shift each lane + blend.
    { ISD::SRL,  MVT::v2i64,       4 }, // splat+shuffle sequence.
    { ISD::SRL,  MVT::v4i64,   2*4+2 }, // splat+shuffle sequence + split.

    { ISD::SRA,  MVT::v16i8,      54 }, // unpacked cmpgtb sequence.
    { ISD::SRA,  MVT::v8i16,      32 }, // cmpgtb sequence.
    { ISD::SRA,  MVT::v4i32,      16 }, // Shift each lane + blend.
    { ISD::SRA,  MVT::v2i64,      12 }, // srl/xor/sub sequence.
    { ISD::SRA,  MVT::v4i64,  2*12+2 }, // srl/xor/sub sequence+split.

    { ISD::MUL,  MVT::v16i8,      12 }, // extend/pmullw/trunc sequence.
    { ISD::MUL,  MVT::v8i16,       1 }, // pmullw
    { ISD::MUL,  MVT::v4i32,       6 }, // 3*pmuludq/4*shuffle
    { ISD::MUL,  MVT::v2i64,       8 }, // 3*pmuludq/3*shift/2*add

    { ISD::FDIV, MVT::f32,        23 }, // Pentium IV from http://www.agner.org/
    { ISD::FDIV, MVT::v4f32,      39 }, // Pentium IV from http://www.agner.org/
    { ISD::FDIV, MVT::f64,        38 }, // Pentium IV from http://www.agner.org/
    { ISD::FDIV, MVT::v2f64,      69 }, // Pentium IV from http://www.agner.org/

    { ISD::FADD, MVT::f32,         2 }, // Pentium IV from http://www.agner.org/
    { ISD::FADD, MVT::f64,         2 }, // Pentium IV from http://www.agner.org/

    { ISD::FSUB, MVT::f32,         2 }, // Pentium IV from http://www.agner.org/
    { ISD::FSUB, MVT::f64,         2 }, // Pentium IV from http://www.agner.org/
  };

  if (ST->hasSSE2())
    if (const auto *Entry = CostTableLookup(SSE2CostTable, ISD, LT.second))
      return LT.first * Entry->Cost;

  static const CostTblEntry SSE1CostTable[] = {
    { ISD::FDIV, MVT::f32,   17 }, // Pentium III from http://www.agner.org/
    { ISD::FDIV, MVT::v4f32, 34 }, // Pentium III from http://www.agner.org/

    { ISD::FADD, MVT::f32,    1 }, // Pentium III from http://www.agner.org/
    { ISD::FADD, MVT::v4f32,  2 }, // Pentium III from http://www.agner.org/

    { ISD::FSUB, MVT::f32,    1 }, // Pentium III from http://www.agner.org/
    { ISD::FSUB, MVT::v4f32,  2 }, // Pentium III from http://www.agner.org/

    { ISD::ADD, MVT::i8,      1 }, // Pentium III from http://www.agner.org/
    { ISD::ADD, MVT::i16,     1 }, // Pentium III from http://www.agner.org/
    { ISD::ADD, MVT::i32,     1 }, // Pentium III from http://www.agner.org/

    { ISD::SUB, MVT::i8,      1 }, // Pentium III from http://www.agner.org/
    { ISD::SUB, MVT::i16,     1 }, // Pentium III from http://www.agner.org/
    { ISD::SUB, MVT::i32,     1 }, // Pentium III from http://www.agner.org/
  };

  if (ST->hasSSE1())
    if (const auto *Entry = CostTableLookup(SSE1CostTable, ISD, LT.second))
      return LT.first * Entry->Cost;

  // It is not a good idea to vectorize division. We have to scalarize it and
  // in the process we will often end up having to spill regular
  // registers. The overhead of division is going to dominate most kernels
  // anyway, so try hard to prevent vectorization of division - it is
  // generally a bad idea. Assume somewhat arbitrarily that we have to be able
  // to hide "20 cycles" for each lane.
  if (LT.second.isVector() && (ISD == ISD::SDIV || ISD == ISD::SREM ||
                               ISD == ISD::UDIV || ISD == ISD::UREM)) {
    int ScalarCost = getArithmeticInstrCost(
        Opcode, Ty->getScalarType(), Op1Info, Op2Info,
        TargetTransformInfo::OP_None, TargetTransformInfo::OP_None);
    return 20 * LT.first * LT.second.getVectorNumElements() * ScalarCost;
  }

  // Fallback to the default implementation.
  return BaseT::getArithmeticInstrCost(Opcode, Ty, Op1Info, Op2Info);
}
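
// To see how punitive the scalarization penalty is: assuming a scalar cost
// of 1 for illustration, a non-constant 'sdiv <4 x i32>' is reported as
// 20 * 1 * 4 * 1 = 80, which all but guarantees the vectorizers leave
// division scalar.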

int X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
                               Type *SubTp) {
  // 64-bit packed float vectors (v2f32) are widened to type v4f32.
  // 64-bit packed integer vectors (v2i32) are promoted to type v2i64.
  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);

  // Treat Transpose as 2-op shuffles - there's no difference in lowering.
  if (Kind == TTI::SK_Transpose)
    Kind = TTI::SK_PermuteTwoSrc;

  // For Broadcasts we are splatting the first element from the first input
  // register, so only need to reference that input and all the output
  // registers are the same.
  if (Kind == TTI::SK_Broadcast)
    LT.first = 1;

  // Subvector extractions are free if they start at the beginning of a
  // vector and cheap if the subvectors are aligned.
  if (Kind == TTI::SK_ExtractSubvector && LT.second.isVector()) {
    int NumElts = LT.second.getVectorNumElements();
    if ((Index % NumElts) == 0)
      return 0;
    std::pair<int, MVT> SubLT = TLI->getTypeLegalizationCost(DL, SubTp);
    if (SubLT.second.isVector()) {
      int NumSubElts = SubLT.second.getVectorNumElements();
      if ((Index % NumSubElts) == 0 && (NumElts % NumSubElts) == 0)
        return SubLT.first;
    }
  }
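
  // E.g. extracting the upper <4 x float> half of a <8 x float> (Index = 4)
  // does not start at offset 0, but 4 is a multiple of the subvector width,
  // so it is an aligned extract costing SubLT.first = 1; an unaligned index
  // such as 3 falls through to the generic shuffle costing below.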

  // We are going to permute multiple sources and the result will be in multiple
  // destinations. Providing an accurate cost only for splits where the element
  // type remains the same.
  if (Kind == TTI::SK_PermuteSingleSrc && LT.first != 1) {
    MVT LegalVT = LT.second;
    if (LegalVT.isVector() &&
        LegalVT.getVectorElementType().getSizeInBits() ==
            Tp->getVectorElementType()->getPrimitiveSizeInBits() &&
        LegalVT.getVectorNumElements() < Tp->getVectorNumElements()) {

      unsigned VecTySize = DL.getTypeStoreSize(Tp);
      unsigned LegalVTSize = LegalVT.getStoreSize();
      // Number of source vectors after legalization:
      unsigned NumOfSrcs = (VecTySize + LegalVTSize - 1) / LegalVTSize;
      // Number of destination vectors after legalization:
      unsigned NumOfDests = LT.first;

      Type *SingleOpTy = VectorType::get(Tp->getVectorElementType(),
                                         LegalVT.getVectorNumElements());

      unsigned NumOfShuffles = (NumOfSrcs - 1) * NumOfDests;
      return NumOfShuffles *
             getShuffleCost(TTI::SK_PermuteTwoSrc, SingleOpTy, 0, nullptr);
    }

    return BaseT::getShuffleCost(Kind, Tp, Index, SubTp);
  }
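
  // E.g. a single-source shuffle of <32 x i8> on an SSE-only target splits
  // into NumOfSrcs = NumOfDests = 2 v16i8 registers; each destination may
  // draw from both sources, giving (2 - 1) * 2 = 2 two-source v16i8
  // shuffles, each costed recursively as SK_PermuteTwoSrc.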

  // For 2-input shuffles, we must account for splitting the 2 inputs into many.
  if (Kind == TTI::SK_PermuteTwoSrc && LT.first != 1) {
    // We assume that source and destination have the same vector type.
    int NumOfDests = LT.first;
    int NumOfShufflesPerDest = LT.first * 2 - 1;
    LT.first = NumOfDests * NumOfShufflesPerDest;
  }
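
  // Each split destination can draw elements from any of the 2 * LT.first
  // legalized source registers, and blending k registers takes k - 1
  // two-source shuffles, hence LT.first * 2 - 1 per destination; e.g. a
  // two-register split is costed as 2 * 3 = 6 shuffles.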

  static const CostTblEntry AVX512VBMIShuffleTbl[] = {
      {TTI::SK_Reverse, MVT::v64i8, 1}, // vpermb
      {TTI::SK_Reverse, MVT::v32i8, 1}, // vpermb

      {TTI::SK_PermuteSingleSrc, MVT::v64i8, 1}, // vpermb
      {TTI::SK_PermuteSingleSrc, MVT::v32i8, 1}, // vpermb

      {TTI::SK_PermuteTwoSrc, MVT::v64i8, 1}, // vpermt2b
      {TTI::SK_PermuteTwoSrc, MVT::v32i8, 1}, // vpermt2b
      {TTI::SK_PermuteTwoSrc, MVT::v16i8, 1}  // vpermt2b
  };

  if (ST->hasVBMI())
    if (const auto *Entry =
            CostTableLookup(AVX512VBMIShuffleTbl, Kind, LT.second))
      return LT.first * Entry->Cost;

  static const CostTblEntry AVX512BWShuffleTbl[] = {
      {TTI::SK_Broadcast, MVT::v32i16, 1}, // vpbroadcastw
      {TTI::SK_Broadcast, MVT::v64i8, 1},  // vpbroadcastb

      {TTI::SK_Reverse, MVT::v32i16, 1}, // vpermw
      {TTI::SK_Reverse, MVT::v16i16, 1}, // vpermw
      {TTI::SK_Reverse, MVT::v64i8, 2},  // pshufb + vshufi64x2

      {TTI::SK_PermuteSingleSrc, MVT::v32i16, 1}, // vpermw
      {TTI::SK_PermuteSingleSrc, MVT::v16i16, 1}, // vpermw
      {TTI::SK_PermuteSingleSrc, MVT::v8i16, 1},  // vpermw
      {TTI::SK_PermuteSingleSrc, MVT::v64i8, 8},  // extend to v32i16
      {TTI::SK_PermuteSingleSrc, MVT::v32i8, 3},  // vpermw + zext/trunc

      {TTI::SK_PermuteTwoSrc, MVT::v32i16, 1}, // vpermt2w
      {TTI::SK_PermuteTwoSrc, MVT::v16i16, 1}, // vpermt2w
      {TTI::SK_PermuteTwoSrc, MVT::v8i16, 1},  // vpermt2w
      {TTI::SK_PermuteTwoSrc, MVT::v32i8, 3},  // zext + vpermt2w + trunc
      {TTI::SK_PermuteTwoSrc, MVT::v64i8, 19}, // 6 * v32i8 + 1
      {TTI::SK_PermuteTwoSrc, MVT::v16i8, 3}   // zext + vpermt2w + trunc
  };

  if (ST->hasBWI())
    if (const auto *Entry =
            CostTableLookup(AVX512BWShuffleTbl, Kind, LT.second))
      return LT.first * Entry->Cost;

  static const CostTblEntry AVX512ShuffleTbl[] = {
      {TTI::SK_Broadcast, MVT::v8f64, 1},  // vbroadcastpd
      {TTI::SK_Broadcast, MVT::v16f32, 1}, // vbroadcastps
      {TTI::SK_Broadcast, MVT::v8i64, 1},  // vpbroadcastq
      {TTI::SK_Broadcast, MVT::v16i32, 1}, // vpbroadcastd

      {TTI::SK_Reverse, MVT::v8f64, 1},  // vpermpd
      {TTI::SK_Reverse, MVT::v16f32, 1}, // vpermps
      {TTI::SK_Reverse, MVT::v8i64, 1},  // vpermq
      {TTI::SK_Reverse, MVT::v16i32, 1}, // vpermd

      {TTI::SK_PermuteSingleSrc, MVT::v8f64, 1},  // vpermpd
      {TTI::SK_PermuteSingleSrc, MVT::v4f64, 1},  // vpermpd
      {TTI::SK_PermuteSingleSrc, MVT::v2f64, 1},  // vpermpd
      {TTI::SK_PermuteSingleSrc, MVT::v16f32, 1}, // vpermps
      {TTI::SK_PermuteSingleSrc, MVT::v8f32, 1},  // vpermps
      {TTI::SK_PermuteSingleSrc, MVT::v4f32, 1},  // vpermps
      {TTI::SK_PermuteSingleSrc, MVT::v8i64, 1},  // vpermq
      {TTI::SK_PermuteSingleSrc, MVT::v4i64, 1},  // vpermq
      {TTI::SK_PermuteSingleSrc, MVT::v2i64, 1},  // vpermq
      {TTI::SK_PermuteSingleSrc, MVT::v16i32, 1}, // vpermd
      {TTI::SK_PermuteSingleSrc, MVT::v8i32, 1},  // vpermd
      {TTI::SK_PermuteSingleSrc, MVT::v4i32, 1},  // vpermd
      {TTI::SK_PermuteSingleSrc, MVT::v16i8, 1},  // pshufb

      {TTI::SK_PermuteTwoSrc, MVT::v8f64, 1},  // vpermt2pd
      {TTI::SK_PermuteTwoSrc, MVT::v16f32, 1}, // vpermt2ps
      {TTI::SK_PermuteTwoSrc, MVT::v8i64, 1},  // vpermt2q
      {TTI::SK_PermuteTwoSrc, MVT::v16i32, 1}, // vpermt2d
      {TTI::SK_PermuteTwoSrc, MVT::v4f64, 1},  // vpermt2pd
      {TTI::SK_PermuteTwoSrc, MVT::v8f32, 1},  // vpermt2ps
      {TTI::SK_PermuteTwoSrc, MVT::v4i64, 1},  // vpermt2q
      {TTI::SK_PermuteTwoSrc, MVT::v8i32, 1},  // vpermt2d
      {TTI::SK_PermuteTwoSrc, MVT::v2f64, 1},  // vpermt2pd
      {TTI::SK_PermuteTwoSrc, MVT::v4f32, 1},  // vpermt2ps
      {TTI::SK_PermuteTwoSrc, MVT::v2i64, 1},  // vpermt2q
      {TTI::SK_PermuteTwoSrc, MVT::v4i32, 1}   // vpermt2d
  };

  if (ST->hasAVX512())
    if (const auto *Entry = CostTableLookup(AVX512ShuffleTbl, Kind, LT.second))
      return LT.first * Entry->Cost;

  static const CostTblEntry AVX2ShuffleTbl[] = {
      {TTI::SK_Broadcast, MVT::v4f64, 1},  // vbroadcastpd
      {TTI::SK_Broadcast, MVT::v8f32, 1},  // vbroadcastps
      {TTI::SK_Broadcast, MVT::v4i64, 1},  // vpbroadcastq
      {TTI::SK_Broadcast, MVT::v8i32, 1},  // vpbroadcastd
      {TTI::SK_Broadcast, MVT::v16i16, 1}, // vpbroadcastw
      {TTI::SK_Broadcast, MVT::v32i8, 1},  // vpbroadcastb

      {TTI::SK_Reverse, MVT::v4f64, 1},  // vpermpd
      {TTI::SK_Reverse, MVT::v8f32, 1},  // vpermps
      {TTI::SK_Reverse, MVT::v4i64, 1},  // vpermq
      {TTI::SK_Reverse, MVT::v8i32, 1},  // vpermd
      {TTI::SK_Reverse, MVT::v16i16, 2}, // vperm2i128 + pshufb
      {TTI::SK_Reverse, MVT::v32i8, 2},  // vperm2i128 + pshufb

      {TTI::SK_Select, MVT::v16i16, 1}, // vpblendvb
      {TTI::SK_Select, MVT::v32i8, 1},  // vpblendvb

      {TTI::SK_PermuteSingleSrc, MVT::v4f64, 1},  // vpermpd
      {TTI::SK_PermuteSingleSrc, MVT::v8f32, 1},  // vpermps
      {TTI::SK_PermuteSingleSrc, MVT::v4i64, 1},  // vpermq
      {TTI::SK_PermuteSingleSrc, MVT::v8i32, 1},  // vpermd
      {TTI::SK_PermuteSingleSrc, MVT::v16i16, 4}, // vperm2i128 + 2*vpshufb
                                                  // + vpblendvb
      {TTI::SK_PermuteSingleSrc, MVT::v32i8, 4},  // vperm2i128 + 2*vpshufb
                                                  // + vpblendvb

      {TTI::SK_PermuteTwoSrc, MVT::v4f64, 3},  // 2*vpermpd + vblendpd
      {TTI::SK_PermuteTwoSrc, MVT::v8f32, 3},  // 2*vpermps + vblendps
      {TTI::SK_PermuteTwoSrc, MVT::v4i64, 3},  // 2*vpermq + vpblendd
      {TTI::SK_PermuteTwoSrc, MVT::v8i32, 3},  // 2*vpermd + vpblendd
      {TTI::SK_PermuteTwoSrc, MVT::v16i16, 7}, // 2*vperm2i128 + 4*vpshufb
                                               // + vpblendvb
      {TTI::SK_PermuteTwoSrc, MVT::v32i8, 7},  // 2*vperm2i128 + 4*vpshufb
                                               // + vpblendvb
  };

  if (ST->hasAVX2())
    if (const auto *Entry = CostTableLookup(AVX2ShuffleTbl, Kind, LT.second))
      return LT.first * Entry->Cost;

  static const CostTblEntry XOPShuffleTbl[] = {
      {TTI::SK_PermuteSingleSrc, MVT::v4f64, 2},  // vperm2f128 + vpermil2pd
      {TTI::SK_PermuteSingleSrc, MVT::v8f32, 2},  // vperm2f128 + vpermil2ps
      {TTI::SK_PermuteSingleSrc, MVT::v4i64, 2},  // vperm2f128 + vpermil2pd
      {TTI::SK_PermuteSingleSrc, MVT::v8i32, 2},  // vperm2f128 + vpermil2ps
      {TTI::SK_PermuteSingleSrc, MVT::v16i16, 4}, // vextractf128 + 2*vpperm
                                                  // + vinsertf128
      {TTI::SK_PermuteSingleSrc, MVT::v32i8, 4},  // vextractf128 + 2*vpperm
                                                  // + vinsertf128

      {TTI::SK_PermuteTwoSrc, MVT::v16i16, 9}, // 2*vextractf128 + 6*vpperm
                                               // + vinsertf128
      {TTI::SK_PermuteTwoSrc, MVT::v8i16, 1},  // vpperm
      {TTI::SK_PermuteTwoSrc, MVT::v32i8, 9},  // 2*vextractf128 + 6*vpperm
                                               // + vinsertf128
      {TTI::SK_PermuteTwoSrc, MVT::v16i8, 1},  // vpperm
  };

  if (ST->hasXOP())
    if (const auto *Entry = CostTableLookup(XOPShuffleTbl, Kind, LT.second))
      return LT.first * Entry->Cost;

  static const CostTblEntry AVX1ShuffleTbl[] = {
      {TTI::SK_Broadcast, MVT::v4f64, 2},  // vperm2f128 + vpermilpd
      {TTI::SK_Broadcast, MVT::v8f32, 2},  // vperm2f128 + vpermilps
      {TTI::SK_Broadcast, MVT::v4i64, 2},  // vperm2f128 + vpermilpd
      {TTI::SK_Broadcast, MVT::v8i32, 2},  // vperm2f128 + vpermilps
      {TTI::SK_Broadcast, MVT::v16i16, 3}, // vpshuflw + vpshufd + vinsertf128
      {TTI::SK_Broadcast, MVT::v32i8, 2},  // vpshufb + vinsertf128

      {TTI::SK_Reverse, MVT::v4f64, 2},  // vperm2f128 + vpermilpd
      {TTI::SK_Reverse, MVT::v8f32, 2},  // vperm2f128 + vpermilps
      {TTI::SK_Reverse, MVT::v4i64, 2},  // vperm2f128 + vpermilpd
      {TTI::SK_Reverse, MVT::v8i32, 2},  // vperm2f128 + vpermilps
      {TTI::SK_Reverse, MVT::v16i16, 4}, // vextractf128 + 2*pshufb
                                         // + vinsertf128
      {TTI::SK_Reverse, MVT::v32i8, 4},  // vextractf128 + 2*pshufb
                                         // + vinsertf128

      {TTI::SK_Select, MVT::v4i64, 1},  // vblendpd
      {TTI::SK_Select, MVT::v4f64, 1},  // vblendpd
      {TTI::SK_Select, MVT::v8i32, 1},  // vblendps
      {TTI::SK_Select, MVT::v8f32, 1},  // vblendps
      {TTI::SK_Select, MVT::v16i16, 3}, // vpand + vpandn + vpor
      {TTI::SK_Select, MVT::v32i8, 3},  // vpand + vpandn + vpor

      {TTI::SK_PermuteSingleSrc, MVT::v4f64, 2},  // vperm2f128 + vshufpd
      {TTI::SK_PermuteSingleSrc, MVT::v4i64, 2},  // vperm2f128 + vshufpd
      {TTI::SK_PermuteSingleSrc, MVT::v8f32, 4},  // 2*vperm2f128 + 2*vshufps
      {TTI::SK_PermuteSingleSrc, MVT::v8i32, 4},  // 2*vperm2f128 + 2*vshufps
      {TTI::SK_PermuteSingleSrc, MVT::v16i16, 8}, // vextractf128 + 4*pshufb
                                                  // + 2*por + vinsertf128
      {TTI::SK_PermuteSingleSrc, MVT::v32i8, 8},  // vextractf128 + 4*pshufb
                                                  // + 2*por + vinsertf128

      {TTI::SK_PermuteTwoSrc, MVT::v4f64, 3},   // 2*vperm2f128 + vshufpd
      {TTI::SK_PermuteTwoSrc, MVT::v4i64, 3},   // 2*vperm2f128 + vshufpd
      {TTI::SK_PermuteTwoSrc, MVT::v8f32, 4},   // 2*vperm2f128 + 2*vshufps
      {TTI::SK_PermuteTwoSrc, MVT::v8i32, 4},   // 2*vperm2f128 + 2*vshufps
      {TTI::SK_PermuteTwoSrc, MVT::v16i16, 15}, // 2*vextractf128 + 8*pshufb
1141
26.7k
                                                // + 4*por + vinsertf128
1142
26.7k
      {TTI::SK_PermuteTwoSrc, MVT::v32i8, 15},  // 2*vextractf128 + 8*pshufb
1143
26.7k
                                                // + 4*por + vinsertf128
1144
26.7k
  };
1145
26.7k
1146
26.7k
  if (ST->hasAVX())
1147
8.65k
    if (const auto *Entry = CostTableLookup(AVX1ShuffleTbl, Kind, LT.second))
1148
673
      return LT.first * Entry->Cost;
1149
26.0k
1150
26.0k
  static const CostTblEntry SSE41ShuffleTbl[] = {
1151
26.0k
      {TTI::SK_Select, MVT::v2i64, 1}, // pblendw
1152
26.0k
      {TTI::SK_Select, MVT::v2f64, 1}, // movsd
1153
26.0k
      {TTI::SK_Select, MVT::v4i32, 1}, // pblendw
1154
26.0k
      {TTI::SK_Select, MVT::v4f32, 1}, // blendps
1155
26.0k
      {TTI::SK_Select, MVT::v8i16, 1}, // pblendw
1156
26.0k
      {TTI::SK_Select, MVT::v16i8, 1}  // pblendvb
1157
26.0k
  };
1158
26.0k
1159
26.0k
  if (ST->hasSSE41())
1160
9.57k
    if (const auto *Entry = CostTableLookup(SSE41ShuffleTbl, Kind, LT.second))
1161
5.25k
      return LT.first * Entry->Cost;
1162
20.7k
1163
20.7k
  static const CostTblEntry SSSE3ShuffleTbl[] = {
1164
20.7k
      {TTI::SK_Broadcast, MVT::v8i16, 1}, // pshufb
1165
20.7k
      {TTI::SK_Broadcast, MVT::v16i8, 1}, // pshufb
1166
20.7k
1167
20.7k
      {TTI::SK_Reverse, MVT::v8i16, 1}, // pshufb
1168
20.7k
      {TTI::SK_Reverse, MVT::v16i8, 1}, // pshufb
1169
20.7k
1170
20.7k
      {TTI::SK_Select, MVT::v8i16, 3}, // 2*pshufb + por
1171
20.7k
      {TTI::SK_Select, MVT::v16i8, 3}, // 2*pshufb + por
1172
20.7k
1173
20.7k
      {TTI::SK_PermuteSingleSrc, MVT::v8i16, 1}, // pshufb
1174
20.7k
      {TTI::SK_PermuteSingleSrc, MVT::v16i8, 1}, // pshufb
1175
20.7k
1176
20.7k
      {TTI::SK_PermuteTwoSrc, MVT::v8i16, 3}, // 2*pshufb + por
1177
20.7k
      {TTI::SK_PermuteTwoSrc, MVT::v16i8, 3}, // 2*pshufb + por
1178
20.7k
  };
1179
20.7k
1180
20.7k
  if (ST->hasSSSE3())
1181
18.4k
    if (const auto *Entry = CostTableLookup(SSSE3ShuffleTbl, Kind, LT.second))
1182
1.12k
      return LT.first * Entry->Cost;
1183
19.6k
1184
19.6k
  static const CostTblEntry SSE2ShuffleTbl[] = {
1185
19.6k
      {TTI::SK_Broadcast, MVT::v2f64, 1}, // shufpd
1186
19.6k
      {TTI::SK_Broadcast, MVT::v2i64, 1}, // pshufd
1187
19.6k
      {TTI::SK_Broadcast, MVT::v4i32, 1}, // pshufd
1188
19.6k
      {TTI::SK_Broadcast, MVT::v8i16, 2}, // pshuflw + pshufd
1189
19.6k
      {TTI::SK_Broadcast, MVT::v16i8, 3}, // unpck + pshuflw + pshufd
1190
19.6k
1191
19.6k
      {TTI::SK_Reverse, MVT::v2f64, 1}, // shufpd
1192
19.6k
      {TTI::SK_Reverse, MVT::v2i64, 1}, // pshufd
1193
19.6k
      {TTI::SK_Reverse, MVT::v4i32, 1}, // pshufd
1194
19.6k
      {TTI::SK_Reverse, MVT::v8i16, 3}, // pshuflw + pshufhw + pshufd
1195
19.6k
      {TTI::SK_Reverse, MVT::v16i8, 9}, // 2*pshuflw + 2*pshufhw
1196
19.6k
                                        // + 2*pshufd + 2*unpck + packus
1197
19.6k
1198
19.6k
      {TTI::SK_Select, MVT::v2i64, 1}, // movsd
1199
19.6k
      {TTI::SK_Select, MVT::v2f64, 1}, // movsd
1200
19.6k
      {TTI::SK_Select, MVT::v4i32, 2}, // 2*shufps
1201
19.6k
      {TTI::SK_Select, MVT::v8i16, 3}, // pand + pandn + por
1202
19.6k
      {TTI::SK_Select, MVT::v16i8, 3}, // pand + pandn + por
1203
19.6k
1204
19.6k
      {TTI::SK_PermuteSingleSrc, MVT::v2f64, 1}, // shufpd
1205
19.6k
      {TTI::SK_PermuteSingleSrc, MVT::v2i64, 1}, // pshufd
1206
19.6k
      {TTI::SK_PermuteSingleSrc, MVT::v4i32, 1}, // pshufd
1207
19.6k
      {TTI::SK_PermuteSingleSrc, MVT::v8i16, 5}, // 2*pshuflw + 2*pshufhw
1208
19.6k
                                                  // + pshufd/unpck
1209
19.6k
    { TTI::SK_PermuteSingleSrc, MVT::v16i8, 10 }, // 2*pshuflw + 2*pshufhw
1210
19.6k
                                                  // + 2*pshufd + 2*unpck + 2*packus
1211
19.6k
1212
19.6k
    { TTI::SK_PermuteTwoSrc,    MVT::v2f64,  1 }, // shufpd
1213
19.6k
    { TTI::SK_PermuteTwoSrc,    MVT::v2i64,  1 }, // shufpd
1214
19.6k
    { TTI::SK_PermuteTwoSrc,    MVT::v4i32,  2 }, // 2*{unpck,movsd,pshufd}
1215
19.6k
    { TTI::SK_PermuteTwoSrc,    MVT::v8i16,  8 }, // blend+permute
1216
19.6k
    { TTI::SK_PermuteTwoSrc,    MVT::v16i8, 13 }, // blend+permute
1217
19.6k
  };
1218
19.6k
1219
19.6k
  if (ST->hasSSE2())
1220
19.6k
    if (const auto *Entry = CostTableLookup(SSE2ShuffleTbl, Kind, LT.second))
1221
18.5k
      return LT.first * Entry->Cost;
1222
1.06k
1223
1.06k
  static const CostTblEntry SSE1ShuffleTbl[] = {
1224
1.06k
    { TTI::SK_Broadcast,        MVT::v4f32, 1 }, // shufps
1225
1.06k
    { TTI::SK_Reverse,          MVT::v4f32, 1 }, // shufps
1226
1.06k
    { TTI::SK_Select,           MVT::v4f32, 2 }, // 2*shufps
1227
1.06k
    { TTI::SK_PermuteSingleSrc, MVT::v4f32, 1 }, // shufps
1228
1.06k
    { TTI::SK_PermuteTwoSrc,    MVT::v4f32, 2 }, // 2*shufps
1229
1.06k
  };
1230
1.06k
1231
1.06k
  if (ST->hasSSE1())
1232
1.06k
    if (const auto *Entry = CostTableLookup(SSE1ShuffleTbl, Kind, LT.second))
1233
558
      return LT.first * Entry->Cost;
1234
511
1235
511
  return BaseT::getShuffleCost(Kind, Tp, Index, SubTp);
1236
511
}
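
For reference, callers reach these shuffle tables through the public TargetTransformInfo wrapper rather than X86TTIImpl directly. A minimal sketch of such a query, assuming an LLVM-9-era tree (the helper name exampleShuffleCost and the chosen vector type are illustrative, not part of this file):

#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/DerivedTypes.h"

int exampleShuffleCost(const llvm::TargetTransformInfo &TTI,
                       llvm::LLVMContext &Ctx) {
  // Reverse a v8f32: with AVX2 this hits the AVX2ShuffleTbl entry above
  // (vpermps, cost 1); with AVX1 only, it costs 2 (vperm2f128 + vpermilps).
  llvm::Type *V8F32 = llvm::VectorType::get(llvm::Type::getFloatTy(Ctx), 8);
  return TTI.getShuffleCost(llvm::TargetTransformInfo::SK_Reverse, V8F32);
}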
1237
1238
int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
1239
20.4k
                                 const Instruction *I) {
1240
20.4k
  int ISD = TLI->InstructionOpcodeToISD(Opcode);
1241
20.4k
  assert(ISD && "Invalid opcode");
1242
20.4k
1243
20.4k
  // FIXME: Need a better design of the cost table to handle non-simple types
1244
20.4k
  // and the potentially massive number of combinations (elem_num x src_type x dst_type).
1245
20.4k
1246
20.4k
  static const TypeConversionCostTblEntry AVX512BWConversionTbl[] {
1247
20.4k
    { ISD::SIGN_EXTEND, MVT::v32i16, MVT::v32i8, 1 },
1248
20.4k
    { ISD::ZERO_EXTEND, MVT::v32i16, MVT::v32i8, 1 },
1249
20.4k
1250
20.4k
    // Mask sign extend has an instruction.
1251
20.4k
    { ISD::SIGN_EXTEND, MVT::v8i16,  MVT::v8i1,  1 },
1252
20.4k
    { ISD::SIGN_EXTEND, MVT::v16i8,  MVT::v16i1, 1 },
1253
20.4k
    { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, 1 },
1254
20.4k
    { ISD::SIGN_EXTEND, MVT::v32i8,  MVT::v32i1, 1 },
1255
20.4k
    { ISD::SIGN_EXTEND, MVT::v32i16, MVT::v32i1, 1 },
1256
20.4k
    { ISD::SIGN_EXTEND, MVT::v64i8,  MVT::v64i1, 1 },
1257
20.4k
1258
20.4k
    // Mask zero extend is a load + broadcast.
1259
20.4k
    { ISD::ZERO_EXTEND, MVT::v8i16,  MVT::v8i1,  2 },
1260
20.4k
    { ISD::ZERO_EXTEND, MVT::v16i8,  MVT::v16i1, 2 },
1261
20.4k
    { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, 2 },
1262
20.4k
    { ISD::ZERO_EXTEND, MVT::v32i8,  MVT::v32i1, 2 },
1263
20.4k
    { ISD::ZERO_EXTEND, MVT::v32i16, MVT::v32i1, 2 },
1264
20.4k
    { ISD::ZERO_EXTEND, MVT::v64i8,  MVT::v64i1, 2 },
1265
20.4k
  };
1266
20.4k
1267
20.4k
  static const TypeConversionCostTblEntry AVX512DQConversionTbl[] = {
1268
20.4k
    { ISD::SINT_TO_FP,  MVT::v2f32,  MVT::v2i64,  1 },
1269
20.4k
    { ISD::SINT_TO_FP,  MVT::v2f64,  MVT::v2i64,  1 },
1270
20.4k
    { ISD::SINT_TO_FP,  MVT::v4f32,  MVT::v4i64,  1 },
1271
20.4k
    { ISD::SINT_TO_FP,  MVT::v4f64,  MVT::v4i64,  1 },
1272
20.4k
    { ISD::SINT_TO_FP,  MVT::v8f32,  MVT::v8i64,  1 },
1273
20.4k
    { ISD::SINT_TO_FP,  MVT::v8f64,  MVT::v8i64,  1 },
1274
20.4k
1275
20.4k
    { ISD::UINT_TO_FP,  MVT::v2f32,  MVT::v2i64,  1 },
1276
20.4k
    { ISD::UINT_TO_FP,  MVT::v2f64,  MVT::v2i64,  1 },
1277
20.4k
    { ISD::UINT_TO_FP,  MVT::v4f32,  MVT::v4i64,  1 },
1278
20.4k
    { ISD::UINT_TO_FP,  MVT::v4f64,  MVT::v4i64,  1 },
1279
20.4k
    { ISD::UINT_TO_FP,  MVT::v8f32,  MVT::v8i64,  1 },
1280
20.4k
    { ISD::UINT_TO_FP,  MVT::v8f64,  MVT::v8i64,  1 },
1281
20.4k
1282
20.4k
    { ISD::FP_TO_SINT,  MVT::v2i64,  MVT::v2f32,  1 },
1283
20.4k
    { ISD::FP_TO_SINT,  MVT::v4i64,  MVT::v4f32,  1 },
1284
20.4k
    { ISD::FP_TO_SINT,  MVT::v8i64,  MVT::v8f32,  1 },
1285
20.4k
    { ISD::FP_TO_SINT,  MVT::v2i64,  MVT::v2f64,  1 },
1286
20.4k
    { ISD::FP_TO_SINT,  MVT::v4i64,  MVT::v4f64,  1 },
1287
20.4k
    { ISD::FP_TO_SINT,  MVT::v8i64,  MVT::v8f64,  1 },
1288
20.4k
1289
20.4k
    { ISD::FP_TO_UINT,  MVT::v2i64,  MVT::v2f32,  1 },
1290
20.4k
    { ISD::FP_TO_UINT,  MVT::v4i64,  MVT::v4f32,  1 },
1291
20.4k
    { ISD::FP_TO_UINT,  MVT::v8i64,  MVT::v8f32,  1 },
1292
20.4k
    { ISD::FP_TO_UINT,  MVT::v2i64,  MVT::v2f64,  1 },
1293
20.4k
    { ISD::FP_TO_UINT,  MVT::v4i64,  MVT::v4f64,  1 },
1294
20.4k
    { ISD::FP_TO_UINT,  MVT::v8i64,  MVT::v8f64,  1 },
1295
20.4k
  };
1296
20.4k
1297
20.4k
  // TODO: For AVX512DQ + AVX512VL, we also have cheap casts for 128-bit and
1298
20.4k
  // 256-bit wide vectors.
1299
20.4k
1300
20.4k
  static const TypeConversionCostTblEntry AVX512FConversionTbl[] = {
1301
20.4k
    { ISD::FP_EXTEND, MVT::v8f64,   MVT::v8f32,  1 },
1302
20.4k
    { ISD::FP_EXTEND, MVT::v8f64,   MVT::v16f32, 3 },
1303
20.4k
    { ISD::FP_ROUND,  MVT::v8f32,   MVT::v8f64,  1 },
1304
20.4k
1305
20.4k
    { ISD::TRUNCATE,  MVT::v16i8,   MVT::v16i32, 1 },
1306
20.4k
    { ISD::TRUNCATE,  MVT::v16i16,  MVT::v16i32, 1 },
1307
20.4k
    { ISD::TRUNCATE,  MVT::v8i16,   MVT::v8i64,  1 },
1308
20.4k
    { ISD::TRUNCATE,  MVT::v8i32,   MVT::v8i64,  1 },
1309
20.4k
1310
20.4k
    // v16i1 -> v16i32 - load + broadcast
1311
20.4k
    { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i1,  2 },
1312
20.4k
    { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i1,  2 },
1313
20.4k
    { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8,  1 },
1314
20.4k
    { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8,  1 },
1315
20.4k
    { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i16, 1 },
1316
20.4k
    { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i16, 1 },
1317
20.4k
    { ISD::ZERO_EXTEND, MVT::v8i64,  MVT::v8i16,  1 },
1318
20.4k
    { ISD::SIGN_EXTEND, MVT::v8i64,  MVT::v8i16,  1 },
1319
20.4k
    { ISD::SIGN_EXTEND, MVT::v8i64,  MVT::v8i32,  1 },
1320
20.4k
    { ISD::ZERO_EXTEND, MVT::v8i64,  MVT::v8i32,  1 },
1321
20.4k
1322
20.4k
    { ISD::SINT_TO_FP,  MVT::v8f64,  MVT::v8i1,   4 },
1323
20.4k
    { ISD::SINT_TO_FP,  MVT::v16f32, MVT::v16i1,  3 },
1324
20.4k
    { ISD::SINT_TO_FP,  MVT::v8f64,  MVT::v8i8,   2 },
1325
20.4k
    { ISD::SINT_TO_FP,  MVT::v16f32, MVT::v16i8,  2 },
1326
20.4k
    { ISD::SINT_TO_FP,  MVT::v8f64,  MVT::v8i16,  2 },
1327
20.4k
    { ISD::SINT_TO_FP,  MVT::v16f32, MVT::v16i16, 2 },
1328
20.4k
    { ISD::SINT_TO_FP,  MVT::v16f32, MVT::v16i32, 1 },
1329
20.4k
    { ISD::SINT_TO_FP,  MVT::v8f64,  MVT::v8i32,  1 },
1330
20.4k
1331
20.4k
    { ISD::UINT_TO_FP,  MVT::v8f64,  MVT::v8i1,   4 },
1332
20.4k
    { ISD::UINT_TO_FP,  MVT::v16f32, MVT::v16i1,  3 },
1333
20.4k
    { ISD::UINT_TO_FP,  MVT::v2f64,  MVT::v2i8,   2 },
1334
20.4k
    { ISD::UINT_TO_FP,  MVT::v4f64,  MVT::v4i8,   2 },
1335
20.4k
    { ISD::UINT_TO_FP,  MVT::v8f32,  MVT::v8i8,   2 },
1336
20.4k
    { ISD::UINT_TO_FP,  MVT::v8f64,  MVT::v8i8,   2 },
1337
20.4k
    { ISD::UINT_TO_FP,  MVT::v16f32, MVT::v16i8,  2 },
1338
20.4k
    { ISD::UINT_TO_FP,  MVT::v2f64,  MVT::v2i16,  5 },
1339
20.4k
    { ISD::UINT_TO_FP,  MVT::v4f64,  MVT::v4i16,  2 },
1340
20.4k
    { ISD::UINT_TO_FP,  MVT::v8f32,  MVT::v8i16,  2 },
1341
20.4k
    { ISD::UINT_TO_FP,  MVT::v8f64,  MVT::v8i16,  2 },
1342
20.4k
    { ISD::UINT_TO_FP,  MVT::v16f32, MVT::v16i16, 2 },
1343
20.4k
    { ISD::UINT_TO_FP,  MVT::v2f32,  MVT::v2i32,  2 },
1344
20.4k
    { ISD::UINT_TO_FP,  MVT::v2f64,  MVT::v2i32,  1 },
1345
20.4k
    { ISD::UINT_TO_FP,  MVT::v4f32,  MVT::v4i32,  1 },
1346
20.4k
    { ISD::UINT_TO_FP,  MVT::v4f64,  MVT::v4i32,  1 },
1347
20.4k
    { ISD::UINT_TO_FP,  MVT::v8f32,  MVT::v8i32,  1 },
1348
20.4k
    { ISD::UINT_TO_FP,  MVT::v8f64,  MVT::v8i32,  1 },
1349
20.4k
    { ISD::UINT_TO_FP,  MVT::v16f32, MVT::v16i32, 1 },
1350
20.4k
    { ISD::UINT_TO_FP,  MVT::v2f32,  MVT::v2i64,  5 },
1351
20.4k
    { ISD::UINT_TO_FP,  MVT::v8f32,  MVT::v8i64, 26 },
1352
20.4k
    { ISD::UINT_TO_FP,  MVT::v2f64,  MVT::v2i64,  5 },
1353
20.4k
    { ISD::UINT_TO_FP,  MVT::v4f64,  MVT::v4i64,  5 },
1354
20.4k
    { ISD::UINT_TO_FP,  MVT::v8f64,  MVT::v8i64,  5 },
1355
20.4k
1356
20.4k
    { ISD::UINT_TO_FP,  MVT::f64,    MVT::i64,    1 },
1357
20.4k
1358
20.4k
    { ISD::FP_TO_UINT,  MVT::v2i32,  MVT::v2f32,  1 },
1359
20.4k
    { ISD::FP_TO_UINT,  MVT::v4i32,  MVT::v4f32,  1 },
1360
20.4k
    { ISD::FP_TO_UINT,  MVT::v4i32,  MVT::v4f64,  1 },
1361
20.4k
    { ISD::FP_TO_UINT,  MVT::v8i32,  MVT::v8f32,  1 },
1362
20.4k
    { ISD::FP_TO_UINT,  MVT::v8i16,  MVT::v8f64,  2 },
1363
20.4k
    { ISD::FP_TO_UINT,  MVT::v8i8,   MVT::v8f64,  2 },
1364
20.4k
    { ISD::FP_TO_UINT,  MVT::v16i32, MVT::v16f32, 1 },
1365
20.4k
    { ISD::FP_TO_UINT,  MVT::v16i16, MVT::v16f32, 2 },
1366
20.4k
    { ISD::FP_TO_UINT,  MVT::v16i8,  MVT::v16f32, 2 },
1367
20.4k
  };
1368
20.4k
1369
20.4k
  static const TypeConversionCostTblEntry AVX2ConversionTbl[] = {
1370
20.4k
    { ISD::SIGN_EXTEND, MVT::v4i64,  MVT::v4i1,   3 },
1371
20.4k
    { ISD::ZERO_EXTEND, MVT::v4i64,  MVT::v4i1,   3 },
1372
20.4k
    { ISD::SIGN_EXTEND, MVT::v8i32,  MVT::v8i1,   3 },
1373
20.4k
    { ISD::ZERO_EXTEND, MVT::v8i32,  MVT::v8i1,   3 },
1374
20.4k
    { ISD::SIGN_EXTEND, MVT::v4i64,  MVT::v4i8,   3 },
1375
20.4k
    { ISD::ZERO_EXTEND, MVT::v4i64,  MVT::v4i8,   3 },
1376
20.4k
    { ISD::SIGN_EXTEND, MVT::v8i32,  MVT::v8i8,   3 },
1377
20.4k
    { ISD::ZERO_EXTEND, MVT::v8i32,  MVT::v8i8,   3 },
1378
20.4k
    { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8,  1 },
1379
20.4k
    { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8,  1 },
1380
20.4k
    { ISD::SIGN_EXTEND, MVT::v4i64,  MVT::v4i16,  3 },
1381
20.4k
    { ISD::ZERO_EXTEND, MVT::v4i64,  MVT::v4i16,  3 },
1382
20.4k
    { ISD::SIGN_EXTEND, MVT::v8i32,  MVT::v8i16,  1 },
1383
20.4k
    { ISD::ZERO_EXTEND, MVT::v8i32,  MVT::v8i16,  1 },
1384
20.4k
    { ISD::SIGN_EXTEND, MVT::v4i64,  MVT::v4i32,  1 },
1385
20.4k
    { ISD::ZERO_EXTEND, MVT::v4i64,  MVT::v4i32,  1 },
1386
20.4k
1387
20.4k
    { ISD::TRUNCATE,    MVT::v4i8,   MVT::v4i64,  2 },
1388
20.4k
    { ISD::TRUNCATE,    MVT::v4i16,  MVT::v4i64,  2 },
1389
20.4k
    { ISD::TRUNCATE,    MVT::v4i32,  MVT::v4i64,  2 },
1390
20.4k
    { ISD::TRUNCATE,    MVT::v8i8,   MVT::v8i32,  2 },
1391
20.4k
    { ISD::TRUNCATE,    MVT::v8i16,  MVT::v8i32,  2 },
1392
20.4k
    { ISD::TRUNCATE,    MVT::v8i32,  MVT::v8i64,  4 },
1393
20.4k
1394
20.4k
    { ISD::FP_EXTEND,   MVT::v8f64,  MVT::v8f32,  3 },
1395
20.4k
    { ISD::FP_ROUND,    MVT::v8f32,  MVT::v8f64,  3 },
1396
20.4k
1397
20.4k
    { ISD::UINT_TO_FP,  MVT::v8f32,  MVT::v8i32,  8 },
1398
20.4k
  };
1399
20.4k
1400
20.4k
  static const TypeConversionCostTblEntry AVXConversionTbl[] = {
1401
20.4k
    { ISD::SIGN_EXTEND, MVT::v4i64,  MVT::v4i1,  6 },
1402
20.4k
    { ISD::ZERO_EXTEND, MVT::v4i64,  MVT::v4i1,  4 },
1403
20.4k
    { ISD::SIGN_EXTEND, MVT::v8i32,  MVT::v8i1,  7 },
1404
20.4k
    { ISD::ZERO_EXTEND, MVT::v8i32,  MVT::v8i1,  4 },
1405
20.4k
    { ISD::SIGN_EXTEND, MVT::v4i64,  MVT::v4i8,  6 },
1406
20.4k
    { ISD::ZERO_EXTEND, MVT::v4i64,  MVT::v4i8,  4 },
1407
20.4k
    { ISD::SIGN_EXTEND, MVT::v8i32,  MVT::v8i8,  7 },
1408
20.4k
    { ISD::ZERO_EXTEND, MVT::v8i32,  MVT::v8i8,  4 },
1409
20.4k
    { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 4 },
1410
20.4k
    { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 4 },
1411
20.4k
    { ISD::SIGN_EXTEND, MVT::v4i64,  MVT::v4i16, 6 },
1412
20.4k
    { ISD::ZERO_EXTEND, MVT::v4i64,  MVT::v4i16, 3 },
1413
20.4k
    { ISD::SIGN_EXTEND, MVT::v8i32,  MVT::v8i16, 4 },
1414
20.4k
    { ISD::ZERO_EXTEND, MVT::v8i32,  MVT::v8i16, 4 },
1415
20.4k
    { ISD::SIGN_EXTEND, MVT::v4i64,  MVT::v4i32, 4 },
1416
20.4k
    { ISD::ZERO_EXTEND, MVT::v4i64,  MVT::v4i32, 4 },
1417
20.4k
1418
20.4k
    { ISD::TRUNCATE,    MVT::v16i8, MVT::v16i16, 4 },
1419
20.4k
    { ISD::TRUNCATE,    MVT::v8i8,  MVT::v8i32,  4 },
1420
20.4k
    { ISD::TRUNCATE,    MVT::v8i16, MVT::v8i32,  5 },
1421
20.4k
    { ISD::TRUNCATE,    MVT::v4i8,  MVT::v4i64,  4 },
1422
20.4k
    { ISD::TRUNCATE,    MVT::v4i16, MVT::v4i64,  4 },
1423
20.4k
    { ISD::TRUNCATE,    MVT::v4i32, MVT::v4i64,  4 },
1424
20.4k
    { ISD::TRUNCATE,    MVT::v8i32, MVT::v8i64,  9 },
1425
20.4k
1426
20.4k
    { ISD::SINT_TO_FP,  MVT::v4f32, MVT::v4i1,  3 },
1427
20.4k
    { ISD::SINT_TO_FP,  MVT::v4f64, MVT::v4i1,  3 },
1428
20.4k
    { ISD::SINT_TO_FP,  MVT::v8f32, MVT::v8i1,  8 },
1429
20.4k
    { ISD::SINT_TO_FP,  MVT::v4f32, MVT::v4i8,  3 },
1430
20.4k
    { ISD::SINT_TO_FP,  MVT::v4f64, MVT::v4i8,  3 },
1431
20.4k
    { ISD::SINT_TO_FP,  MVT::v8f32, MVT::v8i8,  8 },
1432
20.4k
    { ISD::SINT_TO_FP,  MVT::v4f32, MVT::v4i16, 3 },
1433
20.4k
    { ISD::SINT_TO_FP,  MVT::v4f64, MVT::v4i16, 3 },
1434
20.4k
    { ISD::SINT_TO_FP,  MVT::v8f32, MVT::v8i16, 5 },
1435
20.4k
    { ISD::SINT_TO_FP,  MVT::v4f32, MVT::v4i32, 1 },
1436
20.4k
    { ISD::SINT_TO_FP,  MVT::v4f64, MVT::v4i32, 1 },
1437
20.4k
    { ISD::SINT_TO_FP,  MVT::v8f32, MVT::v8i32, 1 },
1438
20.4k
1439
20.4k
    { ISD::UINT_TO_FP,  MVT::v4f32, MVT::v4i1,  7 },
1440
20.4k
    { ISD::UINT_TO_FP,  MVT::v4f64, MVT::v4i1,  7 },
1441
20.4k
    { ISD::UINT_TO_FP,  MVT::v8f32, MVT::v8i1,  6 },
1442
20.4k
    { ISD::UINT_TO_FP,  MVT::v4f32, MVT::v4i8,  2 },
1443
20.4k
    { ISD::UINT_TO_FP,  MVT::v4f64, MVT::v4i8,  2 },
1444
20.4k
    { ISD::UINT_TO_FP,  MVT::v8f32, MVT::v8i8,  5 },
1445
20.4k
    { ISD::UINT_TO_FP,  MVT::v4f32, MVT::v4i16, 2 },
1446
20.4k
    { ISD::UINT_TO_FP,  MVT::v4f64, MVT::v4i16, 2 },
1447
20.4k
    { ISD::UINT_TO_FP,  MVT::v8f32, MVT::v8i16, 5 },
1448
20.4k
    { ISD::UINT_TO_FP,  MVT::v2f64, MVT::v2i32, 6 },
1449
20.4k
    { ISD::UINT_TO_FP,  MVT::v4f32, MVT::v4i32, 6 },
1450
20.4k
    { ISD::UINT_TO_FP,  MVT::v4f64, MVT::v4i32, 6 },
1451
20.4k
    { ISD::UINT_TO_FP,  MVT::v8f32, MVT::v8i32, 9 },
1452
20.4k
    { ISD::UINT_TO_FP,  MVT::v2f64, MVT::v2i64, 5 },
1453
20.4k
    { ISD::UINT_TO_FP,  MVT::v4f64, MVT::v4i64, 6 },
1454
20.4k
    // The generic code to compute the scalar overhead is currently broken.
1455
20.4k
    // Work around this limitation by estimating the scalarization overhead
1456
20.4k
    // here. We have roughly 10 instructions per scalar element.
1457
20.4k
    // Multiply that by the vector width.
1458
20.4k
    // FIXME: remove that when PR19268 is fixed.
1459
20.4k
    { ISD::SINT_TO_FP,  MVT::v4f64, MVT::v4i64, 13 },
1460
20.4k
    { ISD::SINT_TO_FP,  MVT::v4f64, MVT::v4i64, 13 },
1461
20.4k
1462
20.4k
    { ISD::FP_TO_SINT,  MVT::v4i8,  MVT::v4f32, 1 },
1463
20.4k
    { ISD::FP_TO_SINT,  MVT::v8i8,  MVT::v8f32, 7 },
1464
20.4k
    // This node is expanded into scalarized operations but BasicTTI is overly
1465
20.4k
    // optimistic in estimating its cost. It computes 3 per element (one
1466
20.4k
    // vector-extract, one scalar conversion and one vector-insert).  The
1467
20.4k
    // problem is that the inserts form a read-modify-write chain so latency
1468
20.4k
    // should be factored in too.  Inflating the cost per element by 1.
1469
20.4k
    { ISD::FP_TO_UINT,  MVT::v8i32, MVT::v8f32, 8*4 },
1470
20.4k
    { ISD::FP_TO_UINT,  MVT::v4i32, MVT::v4f64, 4*4 },
1471
20.4k
1472
20.4k
    { ISD::FP_EXTEND,   MVT::v4f64,  MVT::v4f32,  1 },
1473
20.4k
    { ISD::FP_ROUND,    MVT::v4f32,  MVT::v4f64,  1 },
1474
20.4k
  };
1475
20.4k
1476
20.4k
  static const TypeConversionCostTblEntry SSE41ConversionTbl[] = {
1477
20.4k
    { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i8,    2 },
1478
20.4k
    { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i8,    2 },
1479
20.4k
    { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i16,   2 },
1480
20.4k
    { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16,   2 },
1481
20.4k
    { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32,   2 },
1482
20.4k
    { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32,   2 },
1483
20.4k
1484
20.4k
    { ISD::ZERO_EXTEND, MVT::v4i16,  MVT::v4i8,   1 },
1485
20.4k
    { ISD::SIGN_EXTEND, MVT::v4i16,  MVT::v4i8,   2 },
1486
20.4k
    { ISD::ZERO_EXTEND, MVT::v4i32,  MVT::v4i8,   1 },
1487
20.4k
    { ISD::SIGN_EXTEND, MVT::v4i32,  MVT::v4i8,   1 },
1488
20.4k
    { ISD::ZERO_EXTEND, MVT::v8i16,  MVT::v8i8,   1 },
1489
20.4k
    { ISD::SIGN_EXTEND, MVT::v8i16,  MVT::v8i8,   1 },
1490
20.4k
    { ISD::ZERO_EXTEND, MVT::v8i32,  MVT::v8i8,   2 },
1491
20.4k
    { ISD::SIGN_EXTEND, MVT::v8i32,  MVT::v8i8,   2 },
1492
20.4k
    { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8,  2 },
1493
20.4k
    { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8,  2 },
1494
20.4k
    { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8,  4 },
1495
20.4k
    { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8,  4 },
1496
20.4k
    { ISD::ZERO_EXTEND, MVT::v4i32,  MVT::v4i16,  1 },
1497
20.4k
    { ISD::SIGN_EXTEND, MVT::v4i32,  MVT::v4i16,  1 },
1498
20.4k
    { ISD::ZERO_EXTEND, MVT::v8i32,  MVT::v8i16,  2 },
1499
20.4k
    { ISD::SIGN_EXTEND, MVT::v8i32,  MVT::v8i16,  2 },
1500
20.4k
    { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i16, 4 },
1501
20.4k
    { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i16, 4 },
1502
20.4k
1503
20.4k
    { ISD::TRUNCATE,    MVT::v4i8,   MVT::v4i16,  2 },
1504
20.4k
    { ISD::TRUNCATE,    MVT::v8i8,   MVT::v8i16,  1 },
1505
20.4k
    { ISD::TRUNCATE,    MVT::v4i8,   MVT::v4i32,  1 },
1506
20.4k
    { ISD::TRUNCATE,    MVT::v4i16,  MVT::v4i32,  1 },
1507
20.4k
    { ISD::TRUNCATE,    MVT::v8i8,   MVT::v8i32,  3 },
1508
20.4k
    { ISD::TRUNCATE,    MVT::v8i16,  MVT::v8i32,  3 },
1509
20.4k
    { ISD::TRUNCATE,    MVT::v16i16, MVT::v16i32, 6 },
1510
20.4k
1511
20.4k
    { ISD::UINT_TO_FP,  MVT::f64,    MVT::i64,    4 },
1512
20.4k
  };
1513
20.4k
1514
20.4k
  static const TypeConversionCostTblEntry SSE2ConversionTbl[] = {
1515
20.4k
    // These are somewhat magic numbers justified by looking at the output of
1516
20.4k
    // Intel's IACA, running some kernels and making sure when we take
1517
20.4k
    // legalization into account the throughput will be overestimated.
1518
20.4k
    { ISD::SINT_TO_FP, MVT::v4f32, MVT::v16i8, 8 },
1519
20.4k
    { ISD::SINT_TO_FP, MVT::v2f64, MVT::v16i8, 16*10 },
1520
20.4k
    { ISD::SINT_TO_FP, MVT::v4f32, MVT::v8i16, 15 },
1521
20.4k
    { ISD::SINT_TO_FP, MVT::v2f64, MVT::v8i16, 8*10 },
1522
20.4k
    { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, 5 },
1523
20.4k
    { ISD::SINT_TO_FP, MVT::v2f64, MVT::v4i32, 4*10 },
1524
20.4k
    { ISD::SINT_TO_FP, MVT::v4f32, MVT::v2i64, 15 },
1525
20.4k
    { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i64, 2*10 },
1526
20.4k
1527
20.4k
    { ISD::UINT_TO_FP, MVT::v2f64, MVT::v16i8, 16*10 },
1528
20.4k
    { ISD::UINT_TO_FP, MVT::v4f32, MVT::v16i8, 8 },
1529
20.4k
    { ISD::UINT_TO_FP, MVT::v4f32, MVT::v8i16, 15 },
1530
20.4k
    { ISD::UINT_TO_FP, MVT::v2f64, MVT::v8i16, 8*10 },
1531
20.4k
    { ISD::UINT_TO_FP, MVT::v2f64, MVT::v4i32, 4*10 },
1532
20.4k
    { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 8 },
1533
20.4k
    { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 6 },
1534
20.4k
    { ISD::UINT_TO_FP, MVT::v4f32, MVT::v2i64, 15 },
1535
20.4k
1536
20.4k
    { ISD::FP_TO_SINT,  MVT::v2i32,  MVT::v2f64,  3 },
1537
20.4k
1538
20.4k
    { ISD::UINT_TO_FP,  MVT::f64,    MVT::i64,    6 },
1539
20.4k
1540
20.4k
    { ISD::ZERO_EXTEND, MVT::v4i16,  MVT::v4i8,   1 },
1541
20.4k
    { ISD::SIGN_EXTEND, MVT::v4i16,  MVT::v4i8,   6 },
1542
20.4k
    { ISD::ZERO_EXTEND, MVT::v4i32,  MVT::v4i8,   2 },
1543
20.4k
    { ISD::SIGN_EXTEND, MVT::v4i32,  MVT::v4i8,   3 },
1544
20.4k
    { ISD::ZERO_EXTEND, MVT::v4i64,  MVT::v4i8,   4 },
1545
20.4k
    { ISD::SIGN_EXTEND, MVT::v4i64,  MVT::v4i8,   8 },
1546
20.4k
    { ISD::ZERO_EXTEND, MVT::v8i16,  MVT::v8i8,   1 },
1547
20.4k
    { ISD::SIGN_EXTEND, MVT::v8i16,  MVT::v8i8,   2 },
1548
20.4k
    { ISD::ZERO_EXTEND, MVT::v8i32,  MVT::v8i8,   6 },
1549
20.4k
    { ISD::SIGN_EXTEND, MVT::v8i32,  MVT::v8i8,   6 },
1550
20.4k
    { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8,  3 },
1551
20.4k
    { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8,  4 },
1552
20.4k
    { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8,  9 },
1553
20.4k
    { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8,  12 },
1554
20.4k
    { ISD::ZERO_EXTEND, MVT::v4i32,  MVT::v4i16,  1 },
1555
20.4k
    { ISD::SIGN_EXTEND, MVT::v4i32,  MVT::v4i16,  2 },
1556
20.4k
    { ISD::ZERO_EXTEND, MVT::v4i64,  MVT::v4i16,  3 },
1557
20.4k
    { ISD::SIGN_EXTEND, MVT::v4i64,  MVT::v4i16,  10 },
1558
20.4k
    { ISD::ZERO_EXTEND, MVT::v8i32,  MVT::v8i16,  3 },
1559
20.4k
    { ISD::SIGN_EXTEND, MVT::v8i32,  MVT::v8i16,  4 },
1560
20.4k
    { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i16, 6 },
1561
20.4k
    { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i16, 8 },
1562
20.4k
    { ISD::ZERO_EXTEND, MVT::v4i64,  MVT::v4i32,  3 },
1563
20.4k
    { ISD::SIGN_EXTEND, MVT::v4i64,  MVT::v4i32,  5 },
1564
20.4k
1565
20.4k
    { ISD::TRUNCATE,    MVT::v4i8,   MVT::v4i16,  4 },
1566
20.4k
    { ISD::TRUNCATE,    MVT::v8i8,   MVT::v8i16,  2 },
1567
20.4k
    { ISD::TRUNCATE,    MVT::v16i8,  MVT::v16i16, 3 },
1568
20.4k
    { ISD::TRUNCATE,    MVT::v4i8,   MVT::v4i32,  3 },
1569
20.4k
    { ISD::TRUNCATE,    MVT::v4i16,  MVT::v4i32,  3 },
1570
20.4k
    { ISD::TRUNCATE,    MVT::v8i8,   MVT::v8i32,  4 },
1571
20.4k
    { ISD::TRUNCATE,    MVT::v16i8,  MVT::v16i32, 7 },
1572
20.4k
    { ISD::TRUNCATE,    MVT::v8i16,  MVT::v8i32,  5 },
1573
20.4k
    { ISD::TRUNCATE,    MVT::v16i16, MVT::v16i32, 10 },
1574
20.4k
  };
1575
20.4k
1576
20.4k
  std::pair<int, MVT> LTSrc = TLI->getTypeLegalizationCost(DL, Src);
1577
20.4k
  std::pair<int, MVT> LTDest = TLI->getTypeLegalizationCost(DL, Dst);
1578
20.4k
1579
20.4k
  if (ST->hasSSE2() && !ST->hasAVX()) {
1580
10.3k
    if (const auto *Entry = ConvertCostTableLookup(SSE2ConversionTbl, ISD,
1581
698
                                                   LTDest.second, LTSrc.second))
1582
698
      return LTSrc.first * Entry->Cost;
1583
19.7k
  }
1584
19.7k
1585
19.7k
  EVT SrcTy = TLI->getValueType(DL, Src);
1586
19.7k
  EVT DstTy = TLI->getValueType(DL, Dst);
1587
19.7k
1588
19.7k
  // The function getSimpleVT only handles simple value types.
1589
19.7k
  if (!SrcTy.isSimple() || !DstTy.isSimple())
1590
1.05k
    return BaseT::getCastInstrCost(Opcode, Dst, Src);
1591
18.6k
1592
18.6k
  MVT SimpleSrcTy = SrcTy.getSimpleVT();
1593
18.6k
  MVT SimpleDstTy = DstTy.getSimpleVT();
1594
18.6k
1595
18.6k
  // Make sure that neither type is going to be split before using the
1596
18.6k
  // AVX512 tables. This handles -mprefer-vector-width=256
1597
18.6k
  // with -min-legal-vector-width<=256
1598
18.6k
  if (TLI->getTypeAction(SimpleSrcTy) != TargetLowering::TypeSplitVector &&
1599
18.6k
      TLI->getTypeAction(SimpleDstTy) != TargetLowering::TypeSplitVector) {
1600
15.4k
    if (ST->hasBWI())
1601
1.02k
      if (const auto *Entry = ConvertCostTableLookup(AVX512BWConversionTbl, ISD,
1602
28
                                                     SimpleDstTy, SimpleSrcTy))
1603
28
        return Entry->Cost;
1604
15.4k
1605
15.4k
    if (ST->hasDQI())
1606
1.14k
      if (const auto *Entry = ConvertCostTableLookup(AVX512DQConversionTbl, ISD,
1607
90
                                                     SimpleDstTy, SimpleSrcTy))
1608
90
        return Entry->Cost;
1609
15.3k
1610
15.3k
    if (ST->hasAVX512())
1611
2.43k
      if (const auto *Entry = ConvertCostTableLookup(AVX512FConversionTbl, ISD,
1612
503
                                                     SimpleDstTy, SimpleSrcTy))
1613
503
        return Entry->Cost;
1614
18.0k
  }
1615
18.0k
1616
18.0k
  if (ST->hasAVX2()) {
1617
5.81k
    if (const auto *Entry = ConvertCostTableLookup(AVX2ConversionTbl, ISD,
1618
632
                                                   SimpleDstTy, SimpleSrcTy))
1619
632
      return Entry->Cost;
1620
17.4k
  }
1621
17.4k
1622
17.4k
  if (ST->hasAVX()) {
1623
8.31k
    if (const auto *Entry = ConvertCostTableLookup(AVXConversionTbl, ISD,
1624
1.26k
                                                   SimpleDstTy, SimpleSrcTy))
1625
1.26k
      return Entry->Cost;
1626
16.1k
  }
1627
16.1k
1628
16.1k
  if (ST->hasSSE41()) {
1629
10.1k
    if (const auto *Entry = ConvertCostTableLookup(SSE41ConversionTbl, ISD,
1630
671
                                                   SimpleDstTy, SimpleSrcTy))
1631
671
      return Entry->Cost;
1632
15.4k
  }
1633
15.4k
1634
15.4k
  if (ST->hasSSE2()) {
1635
15.4k
    if (const auto *Entry = ConvertCostTableLookup(SSE2ConversionTbl, ISD,
1636
508
                                                   SimpleDstTy, SimpleSrcTy))
1637
508
      return Entry->Cost;
1638
14.9k
  }
1639
14.9k
1640
14.9k
  return BaseT::getCastInstrCost(Opcode, Dst, Src, I);
1641
14.9k
}
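
The cast-cost query is reachable the same way; a minimal sketch under the same assumptions (exampleCastCost is a hypothetical helper):

#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Instruction.h"

int exampleCastCost(const llvm::TargetTransformInfo &TTI,
                    llvm::LLVMContext &Ctx) {
  // Sign-extend v8i16 -> v8i32: a single vpmovsxwd under AVX2, so the
  // AVX2ConversionTbl above yields 1; the SSE41ConversionTbl charges 2.
  llvm::Type *Src = llvm::VectorType::get(llvm::Type::getInt16Ty(Ctx), 8);
  llvm::Type *Dst = llvm::VectorType::get(llvm::Type::getInt32Ty(Ctx), 8);
  return TTI.getCastInstrCost(llvm::Instruction::SExt, Dst, Src);
}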
1642
1643
int X86TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy,
1644
23.4k
                                   const Instruction *I) {
1645
23.4k
  // Legalize the type.
1646
23.4k
  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy);
1647
23.4k
1648
23.4k
  MVT MTy = LT.second;
1649
23.4k
1650
23.4k
  int ISD = TLI->InstructionOpcodeToISD(Opcode);
1651
23.4k
  assert(ISD && "Invalid opcode");
1652
23.4k
1653
23.4k
  unsigned ExtraCost = 0;
1654
23.4k
  if (I && (Opcode == Instruction::ICmp || Opcode == Instruction::FCmp)) {
1655
14.5k
    // Some vector comparison predicates cost extra instructions.
1656
14.5k
    if (MTy.isVector() &&
1657
14.5k
        !((ST->hasXOP() && (!ST->hasAVX2() || MTy.is128BitVector())) ||
1658
6.88k
          (ST->hasAVX512() && 32 <= MTy.getScalarSizeInBits()) ||
1659
6.88k
          ST->hasBWI())) {
1660
6.05k
      switch (cast<CmpInst>(I)->getPredicate()) {
1661
6.05k
      case CmpInst::Predicate::ICMP_NE:
1662
308
        // xor(cmpeq(x,y),-1)
1663
308
        ExtraCost = 1;
1664
308
        break;
1665
6.05k
      case CmpInst::Predicate::ICMP_SGE:
1666
360
      case CmpInst::Predicate::ICMP_SLE:
1667
360
        // xor(cmpgt(x,y),-1)
1668
360
        ExtraCost = 1;
1669
360
        break;
1670
1.92k
      case CmpInst::Predicate::ICMP_ULT:
1671
1.92k
      case CmpInst::Predicate::ICMP_UGT:
1672
1.92k
        // cmpgt(xor(x,signbit),xor(y,signbit))
1673
1.92k
        // xor(cmpeq(pmaxu(x,y),x),-1)
1674
1.92k
        ExtraCost = 2;
1675
1.92k
        break;
1676
1.92k
      case CmpInst::Predicate::ICMP_ULE:
1677
475
      case CmpInst::Predicate::ICMP_UGE:
1678
475
        if ((ST->hasSSE41() && MTy.getScalarSizeInBits() == 32) ||
1679
475
            (ST->hasSSE2() && MTy.getScalarSizeInBits() < 32)) {
1680
251
          // cmpeq(psubus(x,y),0)
1681
251
          // cmpeq(pminu(x,y),x)
1682
251
          ExtraCost = 1;
1683
251
        } else {
1684
224
          // xor(cmpgt(xor(x,signbit),xor(y,signbit)),-1)
1685
224
          ExtraCost = 3;
1686
224
        }
1687
475
        break;
1688
2.99k
      default:
1689
2.99k
        break;
1690
23.4k
      }
1691
23.4k
    }
1692
14.5k
  }
1693
23.4k
1694
23.4k
  static const CostTblEntry AVX512BWCostTbl[] = {
1695
23.4k
    { ISD::SETCC,   MVT::v32i16,  1 },
1696
23.4k
    { ISD::SETCC,   MVT::v64i8,   1 },
1697
23.4k
1698
23.4k
    { ISD::SELECT,  MVT::v32i16,  1 },
1699
23.4k
    { ISD::SELECT,  MVT::v64i8,   1 },
1700
23.4k
  };
1701
23.4k
1702
23.4k
  static const CostTblEntry AVX512CostTbl[] = {
1703
23.4k
    { ISD::SETCC,   MVT::v8i64,   1 },
1704
23.4k
    { ISD::SETCC,   MVT::v16i32,  1 },
1705
23.4k
    { ISD::SETCC,   MVT::v8f64,   1 },
1706
23.4k
    { ISD::SETCC,   MVT::v16f32,  1 },
1707
23.4k
1708
23.4k
    { ISD::SELECT,  MVT::v8i64,   1 },
1709
23.4k
    { ISD::SELECT,  MVT::v16i32,  1 },
1710
23.4k
    { ISD::SELECT,  MVT::v8f64,   1 },
1711
23.4k
    { ISD::SELECT,  MVT::v16f32,  1 },
1712
23.4k
  };
1713
23.4k
1714
23.4k
  static const CostTblEntry AVX2CostTbl[] = {
1715
23.4k
    { ISD::SETCC,   MVT::v4i64,   1 },
1716
23.4k
    { ISD::SETCC,   MVT::v8i32,   1 },
1717
23.4k
    { ISD::SETCC,   MVT::v16i16,  1 },
1718
23.4k
    { ISD::SETCC,   MVT::v32i8,   1 },
1719
23.4k
1720
23.4k
    { ISD::SELECT,  MVT::v4i64,   1 }, // pblendvb
1721
23.4k
    { ISD::SELECT,  MVT::v8i32,   1 }, // pblendvb
1722
23.4k
    { ISD::SELECT,  MVT::v16i16,  1 }, // pblendvb
1723
23.4k
    { ISD::SELECT,  MVT::v32i8,   1 }, // pblendvb
1724
23.4k
  };
1725
23.4k
1726
23.4k
  static const CostTblEntry AVX1CostTbl[] = {
1727
23.4k
    { ISD::SETCC,   MVT::v4f64,   1 },
1728
23.4k
    { ISD::SETCC,   MVT::v8f32,   1 },
1729
23.4k
    // AVX1 does not support 8-wide integer compare.
1730
23.4k
    { ISD::SETCC,   MVT::v4i64,   4 },
1731
23.4k
    { ISD::SETCC,   MVT::v8i32,   4 },
1732
23.4k
    { ISD::SETCC,   MVT::v16i16,  4 },
1733
23.4k
    { ISD::SETCC,   MVT::v32i8,   4 },
1734
23.4k
1735
23.4k
    { ISD::SELECT,  MVT::v4f64,   1 }, // vblendvpd
1736
23.4k
    { ISD::SELECT,  MVT::v8f32,   1 }, // vblendvps
1737
23.4k
    { ISD::SELECT,  MVT::v4i64,   1 }, // vblendvpd
1738
23.4k
    { ISD::SELECT,  MVT::v8i32,   1 }, // vblendvps
1739
23.4k
    { ISD::SELECT,  MVT::v16i16,  3 }, // vandps + vandnps + vorps
1740
23.4k
    { ISD::SELECT,  MVT::v32i8,   3 }, // vandps + vandnps + vorps
1741
23.4k
  };
1742
23.4k
1743
23.4k
  static const CostTblEntry SSE42CostTbl[] = {
1744
23.4k
    { ISD::SETCC,   MVT::v2f64,   1 },
1745
23.4k
    { ISD::SETCC,   MVT::v4f32,   1 },
1746
23.4k
    { ISD::SETCC,   MVT::v2i64,   1 },
1747
23.4k
  };
1748
23.4k
1749
23.4k
  static const CostTblEntry SSE41CostTbl[] = {
1750
23.4k
    { ISD::SELECT,  MVT::v2f64,   1 }, // blendvpd
1751
23.4k
    { ISD::SELECT,  MVT::v4f32,   1 }, // blendvps
1752
23.4k
    { ISD::SELECT,  MVT::v2i64,   1 }, // pblendvb
1753
23.4k
    { ISD::SELECT,  MVT::v4i32,   1 }, // pblendvb
1754
23.4k
    { ISD::SELECT,  MVT::v8i16,   1 }, // pblendvb
1755
23.4k
    { ISD::SELECT,  MVT::v16i8,   1 }, // pblendvb
1756
23.4k
  };
1757
23.4k
1758
23.4k
  static const CostTblEntry SSE2CostTbl[] = {
1759
23.4k
    { ISD::SETCC,   MVT::v2f64,   2 },
1760
23.4k
    { ISD::SETCC,   MVT::f64,     1 },
1761
23.4k
    { ISD::SETCC,   MVT::v2i64,   8 },
1762
23.4k
    { ISD::SETCC,   MVT::v4i32,   1 },
1763
23.4k
    { ISD::SETCC,   MVT::v8i16,   1 },
1764
23.4k
    { ISD::SETCC,   MVT::v16i8,   1 },
1765
23.4k
1766
23.4k
    { ISD::SELECT,  MVT::v2f64,   3 }, // andpd + andnpd + orpd
1767
23.4k
    { ISD::SELECT,  MVT::v2i64,   3 }, // pand + pandn + por
1768
23.4k
    { ISD::SELECT,  MVT::v4i32,   3 }, // pand + pandn + por
1769
23.4k
    { ISD::SELECT,  MVT::v8i16,   3 }, // pand + pandn + por
1770
23.4k
    { ISD::SELECT,  MVT::v16i8,   3 }, // pand + pandn + por
1771
23.4k
  };
1772
23.4k
1773
23.4k
  static const CostTblEntry SSE1CostTbl[] = {
1774
23.4k
    { ISD::SETCC,   MVT::v4f32,   2 },
1775
23.4k
    { ISD::SETCC,   MVT::f32,     1 },
1776
23.4k
1777
23.4k
    { ISD::SELECT,  MVT::v4f32,   3 }, // andps + andnps + orps
1778
23.4k
  };
1779
23.4k
1780
23.4k
  if (ST->hasBWI())
1781
1.15k
    if (const auto *Entry = CostTableLookup(AVX512BWCostTbl, ISD, MTy))
1782
186
      return LT.first * (ExtraCost + Entry->Cost);
1783
23.3k
1784
23.3k
  if (ST->hasAVX512())
1785
2.56k
    if (const auto *Entry = CostTableLookup(AVX512CostTbl, ISD, MTy))
1786
388
      return LT.first * (ExtraCost + Entry->Cost);
1787
22.9k
1788
22.9k
  if (ST->hasAVX2())
1789
7.07k
    if (const auto *Entry = CostTableLookup(AVX2CostTbl, ISD, MTy))
1790
1.30k
      return LT.first * (ExtraCost + Entry->Cost);
1791
21.6k
1792
21.6k
  if (ST->hasAVX())
1793
8.28k
    if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy))
1794
1.30k
      return LT.first * (ExtraCost + Entry->Cost);
1795
20.3k
1796
20.3k
  if (ST->hasSSE42())
1797
9.66k
    if (const auto *Entry = CostTableLookup(SSE42CostTbl, ISD, MTy))
1798
2.05k
      return LT.first * (ExtraCost + Entry->Cost);
1799
18.2k
1800
18.2k
  if (ST->hasSSE41())
1801
8.18k
    if (const auto *Entry = CostTableLookup(SSE41CostTbl, ISD, MTy))
1802
1.13k
      return LT.first * (ExtraCost + Entry->Cost);
1803
17.1k
1804
17.1k
  if (ST->hasSSE2())
1805
17.1k
    if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy))
1806
6.15k
      return LT.first * (ExtraCost + Entry->Cost);
1807
10.9k
1808
10.9k
  if (ST->hasSSE1())
1809
10.9k
    if (const auto *Entry = CostTableLookup(SSE1CostTbl, ISD, MTy))
1810
767
      return LT.first * (ExtraCost + Entry->Cost);
1811
10.1k
1812
10.1k
  return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, I);
1813
10.1k
}
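
Note that the predicate surcharge above only applies when the actual instruction is passed through, since the predicate is read off the CmpInst. A minimal sketch (exampleCmpCost is a hypothetical helper):

#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/Instructions.h"

int exampleCmpCost(const llvm::TargetTransformInfo &TTI, llvm::CmpInst *Cmp) {
  // E.g. an unsigned <= on v8i16 with bare SSE2 adds ExtraCost 1 for the
  // cmpeq(psubus(x,y),0) expansion on top of the SETCC table cost of 1.
  return TTI.getCmpSelInstrCost(Cmp->getOpcode(),
                                Cmp->getOperand(0)->getType(),
                                /*CondTy=*/nullptr, Cmp);
}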
1814
1815
8
unsigned X86TTIImpl::getAtomicMemIntrinsicMaxElementSize() const { return 16; }
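
The 16-byte cap corresponds to one XMM register and bounds the element size accepted when lowering element-unordered-atomic memory intrinsics. A hypothetical guard using it (a sketch, not code from this file):

#include "llvm/Analysis/TargetTransformInfo.h"

bool fitsAtomicElementCap(const llvm::TargetTransformInfo &TTI,
                          unsigned ElemSizeInBytes) {
  return ElemSizeInBytes <= TTI.getAtomicMemIntrinsicMaxElementSize();
}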
1816
1817
int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
1818
                                      ArrayRef<Type *> Tys, FastMathFlags FMF,
1819
10.6k
                                      unsigned ScalarizationCostPassed) {
1820
10.6k
  // Costs should match the codegen from:
1821
10.6k
  // BITREVERSE: llvm\test\CodeGen\X86\vector-bitreverse.ll
1822
10.6k
  // BSWAP: llvm\test\CodeGen\X86\bswap-vector.ll
1823
10.6k
  // CTLZ: llvm\test\CodeGen\X86\vector-lzcnt-*.ll
1824
10.6k
  // CTPOP: llvm\test\CodeGen\X86\vector-popcnt-*.ll
1825
10.6k
  // CTTZ: llvm\test\CodeGen\X86\vector-tzcnt-*.ll
1826
10.6k
  static const CostTblEntry AVX512CDCostTbl[] = {
1827
10.6k
    { ISD::CTLZ,       MVT::v8i64,   1 },
1828
10.6k
    { ISD::CTLZ,       MVT::v16i32,  1 },
1829
10.6k
    { ISD::CTLZ,       MVT::v32i16,  8 },
1830
10.6k
    { ISD::CTLZ,       MVT::v64i8,  20 },
1831
10.6k
    { ISD::CTLZ,       MVT::v4i64,   1 },
1832
10.6k
    { ISD::CTLZ,       MVT::v8i32,   1 },
1833
10.6k
    { ISD::CTLZ,       MVT::v16i16,  4 },
1834
10.6k
    { ISD::CTLZ,       MVT::v32i8,  10 },
1835
10.6k
    { ISD::CTLZ,       MVT::v2i64,   1 },
1836
10.6k
    { ISD::CTLZ,       MVT::v4i32,   1 },
1837
10.6k
    { ISD::CTLZ,       MVT::v8i16,   4 },
1838
10.6k
    { ISD::CTLZ,       MVT::v16i8,   4 },
1839
10.6k
  };
1840
10.6k
  static const CostTblEntry AVX512BWCostTbl[] = {
1841
10.6k
    { ISD::BITREVERSE, MVT::v8i64,   5 },
1842
10.6k
    { ISD::BITREVERSE, MVT::v16i32,  5 },
1843
10.6k
    { ISD::BITREVERSE, MVT::v32i16,  5 },
1844
10.6k
    { ISD::BITREVERSE, MVT::v64i8,   5 },
1845
10.6k
    { ISD::CTLZ,       MVT::v8i64,  23 },
1846
10.6k
    { ISD::CTLZ,       MVT::v16i32, 22 },
1847
10.6k
    { ISD::CTLZ,       MVT::v32i16, 18 },
1848
10.6k
    { ISD::CTLZ,       MVT::v64i8,  17 },
1849
10.6k
    { ISD::CTPOP,      MVT::v8i64,   7 },
1850
10.6k
    { ISD::CTPOP,      MVT::v16i32, 11 },
1851
10.6k
    { ISD::CTPOP,      MVT::v32i16,  9 },
1852
10.6k
    { ISD::CTPOP,      MVT::v64i8,   6 },
1853
10.6k
    { ISD::CTTZ,       MVT::v8i64,  10 },
1854
10.6k
    { ISD::CTTZ,       MVT::v16i32, 14 },
1855
10.6k
    { ISD::CTTZ,       MVT::v32i16, 12 },
1856
10.6k
    { ISD::CTTZ,       MVT::v64i8,   9 },
1857
10.6k
    { ISD::SADDSAT,    MVT::v32i16,  1 },
1858
10.6k
    { ISD::SADDSAT,    MVT::v64i8,   1 },
1859
10.6k
    { ISD::SSUBSAT,    MVT::v32i16,  1 },
1860
10.6k
    { ISD::SSUBSAT,    MVT::v64i8,   1 },
1861
10.6k
    { ISD::UADDSAT,    MVT::v32i16,  1 },
1862
10.6k
    { ISD::UADDSAT,    MVT::v64i8,   1 },
1863
10.6k
    { ISD::USUBSAT,    MVT::v32i16,  1 },
1864
10.6k
    { ISD::USUBSAT,    MVT::v64i8,   1 },
1865
10.6k
  };
1866
10.6k
  static const CostTblEntry AVX512CostTbl[] = {
1867
10.6k
    { ISD::BITREVERSE, MVT::v8i64,  36 },
1868
10.6k
    { ISD::BITREVERSE, MVT::v16i32, 24 },
1869
10.6k
    { ISD::CTLZ,       MVT::v8i64,  29 },
1870
10.6k
    { ISD::CTLZ,       MVT::v16i32, 35 },
1871
10.6k
    { ISD::CTPOP,      MVT::v8i64,  16 },
1872
10.6k
    { ISD::CTPOP,      MVT::v16i32, 24 },
1873
10.6k
    { ISD::CTTZ,       MVT::v8i64,  20 },
1874
10.6k
    { ISD::CTTZ,       MVT::v16i32, 28 },
1875
10.6k
    { ISD::USUBSAT,    MVT::v16i32,  2 }, // pmaxud + psubd
1876
10.6k
    { ISD::USUBSAT,    MVT::v2i64,   2 }, // pmaxuq + psubq
1877
10.6k
    { ISD::USUBSAT,    MVT::v4i64,   2 }, // pmaxuq + psubq
1878
10.6k
    { ISD::USUBSAT,    MVT::v8i64,   2 }, // pmaxuq + psubq
1879
10.6k
    { ISD::UADDSAT,    MVT::v16i32,  3 }, // not + pminud + paddd
1880
10.6k
    { ISD::UADDSAT,    MVT::v2i64,   3 }, // not + pminuq + paddq
1881
10.6k
    { ISD::UADDSAT,    MVT::v4i64,   3 }, // not + pminuq + paddq
1882
10.6k
    { ISD::UADDSAT,    MVT::v8i64,   3 }, // not + pminuq + paddq
1883
10.6k
  };
1884
10.6k
  static const CostTblEntry XOPCostTbl[] = {
1885
10.6k
    { ISD::BITREVERSE, MVT::v4i64,   4 },
1886
10.6k
    { ISD::BITREVERSE, MVT::v8i32,   4 },
1887
10.6k
    { ISD::BITREVERSE, MVT::v16i16,  4 },
1888
10.6k
    { ISD::BITREVERSE, MVT::v32i8,   4 },
1889
10.6k
    { ISD::BITREVERSE, MVT::v2i64,   1 },
1890
10.6k
    { ISD::BITREVERSE, MVT::v4i32,   1 },
1891
10.6k
    { ISD::BITREVERSE, MVT::v8i16,   1 },
1892
10.6k
    { ISD::BITREVERSE, MVT::v16i8,   1 },
1893
10.6k
    { ISD::BITREVERSE, MVT::i64,     3 },
1894
10.6k
    { ISD::BITREVERSE, MVT::i32,     3 },
1895
10.6k
    { ISD::BITREVERSE, MVT::i16,     3 },
1896
10.6k
    { ISD::BITREVERSE, MVT::i8,      3 }
1897
10.6k
  };
1898
10.6k
  static const CostTblEntry AVX2CostTbl[] = {
1899
10.6k
    { ISD::BITREVERSE, MVT::v4i64,   5 },
1900
10.6k
    { ISD::BITREVERSE, MVT::v8i32,   5 },
1901
10.6k
    { ISD::BITREVERSE, MVT::v16i16,  5 },
1902
10.6k
    { ISD::BITREVERSE, MVT::v32i8,   5 },
1903
10.6k
    { ISD::BSWAP,      MVT::v4i64,   1 },
1904
10.6k
    { ISD::BSWAP,      MVT::v8i32,   1 },
1905
10.6k
    { ISD::BSWAP,      MVT::v16i16,  1 },
1906
10.6k
    { ISD::CTLZ,       MVT::v4i64,  23 },
1907
10.6k
    { ISD::CTLZ,       MVT::v8i32,  18 },
1908
10.6k
    { ISD::CTLZ,       MVT::v16i16, 14 },
1909
10.6k
    { ISD::CTLZ,       MVT::v32i8,   9 },
1910
10.6k
    { ISD::CTPOP,      MVT::v4i64,   7 },
1911
10.6k
    { ISD::CTPOP,      MVT::v8i32,  11 },
1912
10.6k
    { ISD::CTPOP,      MVT::v16i16,  9 },
1913
10.6k
    { ISD::CTPOP,      MVT::v32i8,   6 },
1914
10.6k
    { ISD::CTTZ,       MVT::v4i64,  10 },
1915
10.6k
    { ISD::CTTZ,       MVT::v8i32,  14 },
1916
10.6k
    { ISD::CTTZ,       MVT::v16i16, 12 },
1917
10.6k
    { ISD::CTTZ,       MVT::v32i8,   9 },
1918
10.6k
    { ISD::SADDSAT,    MVT::v16i16,  1 },
1919
10.6k
    { ISD::SADDSAT,    MVT::v32i8,   1 },
1920
10.6k
    { ISD::SSUBSAT,    MVT::v16i16,  1 },
1921
10.6k
    { ISD::SSUBSAT,    MVT::v32i8,   1 },
1922
10.6k
    { ISD::UADDSAT,    MVT::v16i16,  1 },
1923
10.6k
    { ISD::UADDSAT,    MVT::v32i8,   1 },
1924
10.6k
    { ISD::UADDSAT,    MVT::v8i32,   3 }, // not + pminud + paddd
1925
10.6k
    { ISD::USUBSAT,    MVT::v16i16,  1 },
1926
10.6k
    { ISD::USUBSAT,    MVT::v32i8,   1 },
1927
10.6k
    { ISD::USUBSAT,    MVT::v8i32,   2 }, // pmaxud + psubd
1928
10.6k
    { ISD::FSQRT,      MVT::f32,     7 }, // Haswell from http://www.agner.org/
1929
10.6k
    { ISD::FSQRT,      MVT::v4f32,   7 }, // Haswell from http://www.agner.org/
1930
10.6k
    { ISD::FSQRT,      MVT::v8f32,  14 }, // Haswell from http://www.agner.org/
1931
10.6k
    { ISD::FSQRT,      MVT::f64,    14 }, // Haswell from http://www.agner.org/
1932
10.6k
    { ISD::FSQRT,      MVT::v2f64,  14 }, // Haswell from http://www.agner.org/
1933
10.6k
    { ISD::FSQRT,      MVT::v4f64,  28 }, // Haswell from http://www.agner.org/
1934
10.6k
  };
1935
10.6k
  static const CostTblEntry AVX1CostTbl[] = {
1936
10.6k
    { ISD::BITREVERSE, MVT::v4i64,  12 }, // 2 x 128-bit Op + extract/insert
1937
10.6k
    { ISD::BITREVERSE, MVT::v8i32,  12 }, // 2 x 128-bit Op + extract/insert
1938
10.6k
    { ISD::BITREVERSE, MVT::v16i16, 12 }, // 2 x 128-bit Op + extract/insert
1939
10.6k
    { ISD::BITREVERSE, MVT::v32i8,  12 }, // 2 x 128-bit Op + extract/insert
1940
10.6k
    { ISD::BSWAP,      MVT::v4i64,   4 },
1941
10.6k
    { ISD::BSWAP,      MVT::v8i32,   4 },
1942
10.6k
    { ISD::BSWAP,      MVT::v16i16,  4 },
1943
10.6k
    { ISD::CTLZ,       MVT::v4i64,  48 }, // 2 x 128-bit Op + extract/insert
1944
10.6k
    { ISD::CTLZ,       MVT::v8i32,  38 }, // 2 x 128-bit Op + extract/insert
1945
10.6k
    { ISD::CTLZ,       MVT::v16i16, 30 }, // 2 x 128-bit Op + extract/insert
1946
10.6k
    { ISD::CTLZ,       MVT::v32i8,  20 }, // 2 x 128-bit Op + extract/insert
1947
10.6k
    { ISD::CTPOP,      MVT::v4i64,  16 }, // 2 x 128-bit Op + extract/insert
1948
10.6k
    { ISD::CTPOP,      MVT::v8i32,  24 }, // 2 x 128-bit Op + extract/insert
1949
10.6k
    { ISD::CTPOP,      MVT::v16i16, 20 }, // 2 x 128-bit Op + extract/insert
1950
10.6k
    { ISD::CTPOP,      MVT::v32i8,  14 }, // 2 x 128-bit Op + extract/insert
1951
10.6k
    { ISD::CTTZ,       MVT::v4i64,  22 }, // 2 x 128-bit Op + extract/insert
1952
10.6k
    { ISD::CTTZ,       MVT::v8i32,  30 }, // 2 x 128-bit Op + extract/insert
1953
10.6k
    { ISD::CTTZ,       MVT::v16i16, 26 }, // 2 x 128-bit Op + extract/insert
1954
10.6k
    { ISD::CTTZ,       MVT::v32i8,  20 }, // 2 x 128-bit Op + extract/insert
1955
10.6k
    { ISD::SADDSAT,    MVT::v16i16,  4 }, // 2 x 128-bit Op + extract/insert
1956
10.6k
    { ISD::SADDSAT,    MVT::v32i8,   4 }, // 2 x 128-bit Op + extract/insert
1957
10.6k
    { ISD::SSUBSAT,    MVT::v16i16,  4 }, // 2 x 128-bit Op + extract/insert
1958
10.6k
    { ISD::SSUBSAT,    MVT::v32i8,   4 }, // 2 x 128-bit Op + extract/insert
1959
10.6k
    { ISD::UADDSAT,    MVT::v16i16,  4 }, // 2 x 128-bit Op + extract/insert
1960
10.6k
    { ISD::UADDSAT,    MVT::v32i8,   4 }, // 2 x 128-bit Op + extract/insert
1961
10.6k
    { ISD::UADDSAT,    MVT::v8i32,   8 }, // 2 x 128-bit Op + extract/insert
1962
10.6k
    { ISD::USUBSAT,    MVT::v16i16,  4 }, // 2 x 128-bit Op + extract/insert
1963
10.6k
    { ISD::USUBSAT,    MVT::v32i8,   4 }, // 2 x 128-bit Op + extract/insert
1964
10.6k
    { ISD::USUBSAT,    MVT::v8i32,   6 }, // 2 x 128-bit Op + extract/insert
1965
10.6k
    { ISD::FSQRT,      MVT::f32,    14 }, // SNB from http://www.agner.org/
1966
10.6k
    { ISD::FSQRT,      MVT::v4f32,  14 }, // SNB from http://www.agner.org/
1967
10.6k
    { ISD::FSQRT,      MVT::v8f32,  28 }, // SNB from http://www.agner.org/
1968
10.6k
    { ISD::FSQRT,      MVT::f64,    21 }, // SNB from http://www.agner.org/
1969
10.6k
    { ISD::FSQRT,      MVT::v2f64,  21 }, // SNB from http://www.agner.org/
1970
10.6k
    { ISD::FSQRT,      MVT::v4f64,  43 }, // SNB from http://www.agner.org/
1971
10.6k
  };
1972
10.6k
  static const CostTblEntry GLMCostTbl[] = {
1973
10.6k
    { ISD::FSQRT, MVT::f32,   19 }, // sqrtss
1974
10.6k
    { ISD::FSQRT, MVT::v4f32, 37 }, // sqrtps
1975
10.6k
    { ISD::FSQRT, MVT::f64,   34 }, // sqrtsd
1976
10.6k
    { ISD::FSQRT, MVT::v2f64, 67 }, // sqrtpd
1977
10.6k
  };
1978
10.6k
  static const CostTblEntry SLMCostTbl[] = {
1979
10.6k
    { ISD::FSQRT, MVT::f32,   20 }, // sqrtss
1980
10.6k
    { ISD::FSQRT, MVT::v4f32, 40 }, // sqrtps
1981
10.6k
    { ISD::FSQRT, MVT::f64,   35 }, // sqrtsd
1982
10.6k
    { ISD::FSQRT, MVT::v2f64, 70 }, // sqrtpd
1983
10.6k
  };
1984
10.6k
  static const CostTblEntry SSE42CostTbl[] = {
1985
10.6k
    { ISD::USUBSAT,    MVT::v4i32,   2 }, // pmaxud + psubd
1986
10.6k
    { ISD::UADDSAT,    MVT::v4i32,   3 }, // not + pminud + paddd
1987
10.6k
    { ISD::FSQRT,      MVT::f32,    18 }, // Nehalem from http://www.agner.org/
1988
10.6k
    { ISD::FSQRT,      MVT::v4f32,  18 }, // Nehalem from http://www.agner.org/
1989
10.6k
  };
1990
10.6k
  static const CostTblEntry SSSE3CostTbl[] = {
1991
10.6k
    { ISD::BITREVERSE, MVT::v2i64,   5 },
1992
10.6k
    { ISD::BITREVERSE, MVT::v4i32,   5 },
1993
10.6k
    { ISD::BITREVERSE, MVT::v8i16,   5 },
1994
10.6k
    { ISD::BITREVERSE, MVT::v16i8,   5 },
1995
10.6k
    { ISD::BSWAP,      MVT::v2i64,   1 },
1996
10.6k
    { ISD::BSWAP,      MVT::v4i32,   1 },
1997
10.6k
    { ISD::BSWAP,      MVT::v8i16,   1 },
1998
10.6k
    { ISD::CTLZ,       MVT::v2i64,  23 },
1999
10.6k
    { ISD::CTLZ,       MVT::v4i32,  18 },
2000
10.6k
    { ISD::CTLZ,       MVT::v8i16,  14 },
2001
10.6k
    { ISD::CTLZ,       MVT::v16i8,   9 },
2002
10.6k
    { ISD::CTPOP,      MVT::v2i64,   7 },
2003
10.6k
    { ISD::CTPOP,      MVT::v4i32,  11 },
2004
10.6k
    { ISD::CTPOP,      MVT::v8i16,   9 },
2005
10.6k
    { ISD::CTPOP,      MVT::v16i8,   6 },
2006
10.6k
    { ISD::CTTZ,       MVT::v2i64,  10 },
2007
10.6k
    { ISD::CTTZ,       MVT::v4i32,  14 },
2008
10.6k
    { ISD::CTTZ,       MVT::v8i16,  12 },
2009
10.6k
    { ISD::CTTZ,       MVT::v16i8,   9 }
2010
10.6k
  };
2011
10.6k
  static const CostTblEntry SSE2CostTbl[] = {
2012
10.6k
    { ISD::BITREVERSE, MVT::v2i64,  29 },
2013
10.6k
    { ISD::BITREVERSE, MVT::v4i32,  27 },
2014
10.6k
    { ISD::BITREVERSE, MVT::v8i16,  27 },
2015
10.6k
    { ISD::BITREVERSE, MVT::v16i8,  20 },
2016
10.6k
    { ISD::BSWAP,      MVT::v2i64,   7 },
2017
10.6k
    { ISD::BSWAP,      MVT::v4i32,   7 },
2018
10.6k
    { ISD::BSWAP,      MVT::v8i16,   7 },
2019
10.6k
    { ISD::CTLZ,       MVT::v2i64,  25 },
2020
10.6k
    { ISD::CTLZ,       MVT::v4i32,  26 },
2021
10.6k
    { ISD::CTLZ,       MVT::v8i16,  20 },
2022
10.6k
    { ISD::CTLZ,       MVT::v16i8,  17 },
2023
10.6k
    { ISD::CTPOP,      MVT::v2i64,  12 },
2024
10.6k
    { ISD::CTPOP,      MVT::v4i32,  15 },
2025
10.6k
    { ISD::CTPOP,      MVT::v8i16,  13 },
2026
10.6k
    { ISD::CTPOP,      MVT::v16i8,  10 },
2027
10.6k
    { ISD::CTTZ,       MVT::v2i64,  14 },
2028
10.6k
    { ISD::CTTZ,       MVT::v4i32,  18 },
2029
10.6k
    { ISD::CTTZ,       MVT::v8i16,  16 },
2030
10.6k
    { ISD::CTTZ,       MVT::v16i8,  13 },
2031
10.6k
    { ISD::SADDSAT,    MVT::v8i16,   1 },
2032
10.6k
    { ISD::SADDSAT,    MVT::v16i8,   1 },
2033
10.6k
    { ISD::SSUBSAT,    MVT::v8i16,   1 },
2034
10.6k
    { ISD::SSUBSAT,    MVT::v16i8,   1 },
2035
10.6k
    { ISD::UADDSAT,    MVT::v8i16,   1 },
2036
10.6k
    { ISD::UADDSAT,    MVT::v16i8,   1 },
2037
10.6k
    { ISD::USUBSAT,    MVT::v8i16,   1 },
2038
10.6k
    { ISD::USUBSAT,    MVT::v16i8,   1 },
2039
10.6k
    { ISD::FSQRT,      MVT::f64,    32 }, // Nehalem from http://www.agner.org/
2040
10.6k
    { ISD::FSQRT,      MVT::v2f64,  32 }, // Nehalem from http://www.agner.org/
2041
10.6k
  };
2042
10.6k
  static const CostTblEntry SSE1CostTbl[] = {
2043
10.6k
    { ISD::FSQRT,      MVT::f32,    28 }, // Pentium III from http://www.agner.org/
2044
10.6k
    { ISD::FSQRT,      MVT::v4f32,  56 }, // Pentium III from http://www.agner.org/
2045
10.6k
  };
2046
10.6k
  static const CostTblEntry X64CostTbl[] = { // 64-bit targets
2047
10.6k
    { ISD::BITREVERSE, MVT::i64,    14 },
2048
10.6k
    { ISD::SADDO,      MVT::i64,     1 },
2049
10.6k
    { ISD::UADDO,      MVT::i64,     1 },
2050
10.6k
  };
2051
10.6k
  static const CostTblEntry X86CostTbl[] = { // 32 or 64-bit targets
2052
10.6k
    { ISD::BITREVERSE, MVT::i32,    14 },
2053
10.6k
    { ISD::BITREVERSE, MVT::i16,    14 },
2054
10.6k
    { ISD::BITREVERSE, MVT::i8,     11 },
2055
10.6k
    { ISD::SADDO,      MVT::i32,     1 },
2056
10.6k
    { ISD::SADDO,      MVT::i16,     1 },
2057
10.6k
    { ISD::SADDO,      MVT::i8,      1 },
2058
10.6k
    { ISD::UADDO,      MVT::i32,     1 },
2059
10.6k
    { ISD::UADDO,      MVT::i16,     1 },
2060
10.6k
    { ISD::UADDO,      MVT::i8,      1 },
2061
10.6k
  };
2062
10.6k
2063
10.6k
  Type *OpTy = RetTy;
2064
10.6k
  unsigned ISD = ISD::DELETED_NODE;
2065
10.6k
  switch (IID) {
2066
10.6k
  default:
2067
6.01k
    break;
2068
10.6k
  case Intrinsic::bitreverse:
2069
320
    ISD = ISD::BITREVERSE;
2070
320
    break;
2071
10.6k
  case Intrinsic::bswap:
2072
92
    ISD = ISD::BSWAP;
2073
92
    break;
2074
10.6k
  case Intrinsic::ctlz:
2075
566
    ISD = ISD::CTLZ;
2076
566
    break;
2077
10.6k
  case Intrinsic::ctpop:
2078
222
    ISD = ISD::CTPOP;
2079
222
    break;
2080
10.6k
  case Intrinsic::cttz:
2081
442
    ISD = ISD::CTTZ;
2082
442
    break;
2083
10.6k
  case Intrinsic::sadd_sat:
2084
342
    ISD = ISD::SADDSAT;
2085
342
    break;
2086
10.6k
  case Intrinsic::ssub_sat:
2087
342
    ISD = ISD::SSUBSAT;
2088
342
    break;
2089
10.6k
  case Intrinsic::uadd_sat:
2090
322
    ISD = ISD::UADDSAT;
2091
322
    break;
2092
10.6k
  case Intrinsic::usub_sat:
2093
322
    ISD = ISD::USUBSAT;
2094
322
    break;
2095
10.6k
  case Intrinsic::sqrt:
2096
182
    ISD = ISD::FSQRT;
2097
182
    break;
2098
10.6k
  case Intrinsic::sadd_with_overflow:
2099
792
  case Intrinsic::ssub_with_overflow:
2100
792
    // SSUBO has same costs so don't duplicate.
2101
792
    ISD = ISD::SADDO;
2102
792
    OpTy = RetTy->getContainedType(0);
2103
792
    break;
2104
792
  case Intrinsic::uadd_with_overflow:
2105
648
  case Intrinsic::usub_with_overflow:
2106
648
    // USUBO has same costs so don't duplicate.
2107
648
    ISD = ISD::UADDO;
2108
648
    OpTy = RetTy->getContainedType(0);
2109
648
    break;
2110
10.6k
  }
2111
10.6k
2112
10.6k
  if (ISD != ISD::DELETED_NODE) {
2113
4.59k
    // Legalize the type.
2114
4.59k
    std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, OpTy);
2115
4.59k
    MVT MTy = LT.second;
2116
4.59k
2117
4.59k
    // Attempt to lookup cost.
2118
4.59k
    if (ST->isGLM())
2119
170
      if (const auto *Entry = CostTableLookup(GLMCostTbl, ISD, MTy))
2120
8
        return LT.first * Entry->Cost;
2121
4.58k
2122
4.58k
    if (ST->isSLM())
2123
410
      if (const auto *Entry = CostTableLookup(SLMCostTbl, ISD, MTy))
2124
8
        return LT.first * Entry->Cost;
2125
4.57k
2126
4.57k
    if (ST->hasCDI())
2127
372
      if (const auto *Entry = CostTableLookup(AVX512CDCostTbl, ISD, MTy))
2128
24
        return LT.first * Entry->Cost;
2129
4.55k
2130
4.55k
    if (ST->hasBWI())
2131
524
      if (const auto *Entry = CostTableLookup(AVX512BWCostTbl, ISD, MTy))
2132
36
        return LT.first * Entry->Cost;
2133
4.51k
2134
4.51k
    if (ST->hasAVX512())
2135
1.02k
      if (const auto *Entry = CostTableLookup(AVX512CostTbl, ISD, MTy))
2136
50
        return LT.first * Entry->Cost;
2137
4.46k
2138
4.46k
    if (ST->hasXOP())
2139
96
      if (const auto *Entry = CostTableLookup(XOPCostTbl, ISD, MTy))
2140
68
        return LT.first * Entry->Cost;
2141
4.39k
2142
4.39k
    if (ST->hasAVX2())
2143
1.57k
      if (const auto *Entry = CostTableLookup(AVX2CostTbl, ISD, MTy))
2144
305
        return LT.first * Entry->Cost;
2145
4.09k
2146
4.09k
    if (ST->hasAVX())
2147
2.17k
      if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy))
2148
185
        return LT.first * Entry->Cost;
2149
3.90k
2150
3.90k
    if (ST->hasSSE42())
2151
3.00k
      if (const auto *Entry = CostTableLookup(SSE42CostTbl, ISD, MTy))
2152
42
        return LT.first * Entry->Cost;
2153
3.86k
2154
3.86k
    if (ST->hasSSSE3())
2155
3.19k
      if (const auto *Entry = CostTableLookup(SSSE3CostTbl, ISD, MTy))
2156
480
        return LT.first * Entry->Cost;
2157
3.38k
2158
3.38k
    if (ST->hasSSE2())
2159
3.37k
      if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy))
2160
517
        return LT.first * Entry->Cost;
2161
2.86k
2162
2.86k
    if (ST->hasSSE1())
2163
2.86k
      if (const auto *Entry = CostTableLookup(SSE1CostTbl, ISD, MTy))
2164
34
        return LT.first * Entry->Cost;
2165
2.83k
2166
2.83k
    if (ST->is64Bit())
2167
2.80k
      if (const auto *Entry = CostTableLookup(X64CostTbl, ISD, MTy))
2168
189
        return LT.first * Entry->Cost;
2169
2.64k
2170
2.64k
    if (const auto *Entry = CostTableLookup(X86CostTbl, ISD, MTy))
2171
553
      return LT.first * Entry->Cost;
2172
8.10k
  }
2173
8.10k
2174
8.10k
  return BaseT::getIntrinsicInstrCost(IID, RetTy, Tys, FMF, ScalarizationCostPassed);
2175
8.10k
}
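// A minimal illustrative sketch of the lookup pattern used above; the helper
// name is hypothetical and this block is not part of the upstream file. Each
// table is consulted from the most specific subtarget feature downwards, and
// a type that legalizes to several registers pays the entry cost once per
// split part.
static int lookupScaledCostSketch(ArrayRef<CostTblEntry> Tbl, int ISD,
                                  MVT MTy, int SplitFactor) {
  if (const auto *Entry = CostTableLookup(Tbl, ISD, MTy))
    return SplitFactor * Entry->Cost; // mirrors LT.first * Entry->Cost above
  return -1; // no entry: the caller falls back to the BaseT implementation
}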
2176
2177
int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
2178
                                      ArrayRef<Value *> Args, FastMathFlags FMF,
2179
14.0k
                                      unsigned VF) {
2180
14.0k
  static const CostTblEntry AVX512CostTbl[] = {
2181
14.0k
    { ISD::ROTL,       MVT::v8i64,   1 },
2182
14.0k
    { ISD::ROTL,       MVT::v4i64,   1 },
2183
14.0k
    { ISD::ROTL,       MVT::v2i64,   1 },
2184
14.0k
    { ISD::ROTL,       MVT::v16i32,  1 },
2185
14.0k
    { ISD::ROTL,       MVT::v8i32,   1 },
2186
14.0k
    { ISD::ROTL,       MVT::v4i32,   1 },
2187
14.0k
    { ISD::ROTR,       MVT::v8i64,   1 },
2188
14.0k
    { ISD::ROTR,       MVT::v4i64,   1 },
2189
14.0k
    { ISD::ROTR,       MVT::v2i64,   1 },
2190
14.0k
    { ISD::ROTR,       MVT::v16i32,  1 },
2191
14.0k
    { ISD::ROTR,       MVT::v8i32,   1 },
2192
14.0k
    { ISD::ROTR,       MVT::v4i32,   1 }
2193
14.0k
  };
2194
14.0k
  // XOP: ROTL = VPROT(X,Y), ROTR = VPROT(X,SUB(0,Y))
2195
14.0k
  static const CostTblEntry XOPCostTbl[] = {
2196
14.0k
    { ISD::ROTL,       MVT::v4i64,   4 },
2197
14.0k
    { ISD::ROTL,       MVT::v8i32,   4 },
2198
14.0k
    { ISD::ROTL,       MVT::v16i16,  4 },
2199
14.0k
    { ISD::ROTL,       MVT::v32i8,   4 },
2200
14.0k
    { ISD::ROTL,       MVT::v2i64,   1 },
2201
14.0k
    { ISD::ROTL,       MVT::v4i32,   1 },
2202
14.0k
    { ISD::ROTL,       MVT::v8i16,   1 },
2203
14.0k
    { ISD::ROTL,       MVT::v16i8,   1 },
2204
14.0k
    { ISD::ROTR,       MVT::v4i64,   6 },
2205
14.0k
    { ISD::ROTR,       MVT::v8i32,   6 },
2206
14.0k
    { ISD::ROTR,       MVT::v16i16,  6 },
2207
14.0k
    { ISD::ROTR,       MVT::v32i8,   6 },
2208
14.0k
    { ISD::ROTR,       MVT::v2i64,   2 },
2209
14.0k
    { ISD::ROTR,       MVT::v4i32,   2 },
2210
14.0k
    { ISD::ROTR,       MVT::v8i16,   2 },
2211
14.0k
    { ISD::ROTR,       MVT::v16i8,   2 }
2212
14.0k
  };
2213
14.0k
  static const CostTblEntry X64CostTbl[] = { // 64-bit targets
2214
14.0k
    { ISD::ROTL,       MVT::i64,     1 },
2215
14.0k
    { ISD::ROTR,       MVT::i64,     1 },
2216
14.0k
    { ISD::FSHL,       MVT::i64,     4 }
2217
14.0k
  };
2218
14.0k
  static const CostTblEntry X86CostTbl[] = { // 32 or 64-bit targets
2219
14.0k
    { ISD::ROTL,       MVT::i32,     1 },
2220
14.0k
    { ISD::ROTL,       MVT::i16,     1 },
2221
14.0k
    { ISD::ROTL,       MVT::i8,      1 },
2222
14.0k
    { ISD::ROTR,       MVT::i32,     1 },
2223
14.0k
    { ISD::ROTR,       MVT::i16,     1 },
2224
14.0k
    { ISD::ROTR,       MVT::i8,      1 },
2225
14.0k
    { ISD::FSHL,       MVT::i32,     4 },
2226
14.0k
    { ISD::FSHL,       MVT::i16,     4 },
2227
14.0k
    { ISD::FSHL,       MVT::i8,      4 }
2228
14.0k
  };
2229
14.0k
2230
14.0k
  unsigned ISD = ISD::DELETED_NODE;
2231
14.0k
  switch (IID) {
2232
14.0k
  default:
2233
11.4k
    break;
2234
14.0k
  case Intrinsic::fshl:
2235
1.32k
    ISD = ISD::FSHL;
2236
1.32k
    if (Args[0] == Args[1])
2237
660
      ISD = ISD::ROTL;
2238
1.32k
    break;
2239
14.0k
  case Intrinsic::fshr:
2240
1.32k
    // FSHR has the same costs, so don't duplicate.
2241
1.32k
    ISD = ISD::FSHL;
2242
1.32k
    if (Args[0] == Args[1])
2243
660
      ISD = ISD::ROTR;
2244
1.32k
    break;
2245
14.0k
  }
2246
14.0k
2247
14.0k
  if (ISD != ISD::DELETED_NODE) {
2248
2.64k
    // Legalize the type.
2249
2.64k
    std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, RetTy);
2250
2.64k
    MVT MTy = LT.second;
2251
2.64k
2252
2.64k
    // Attempt to lookup cost.
2253
2.64k
    if (ST->hasAVX512())
2254
720
      if (const auto *Entry = CostTableLookup(AVX512CostTbl, ISD, MTy))
2255
144
        return LT.first * Entry->Cost;
2256
2.49k
2257
2.49k
    if (ST->hasXOP())
2258
240
      if (const auto *Entry = CostTableLookup(XOPCostTbl, ISD, MTy))
2259
96
        return LT.first * Entry->Cost;
2260
2.40k
2261
2.40k
    if (ST->is64Bit())
2262
2.40k
      if (const auto *Entry = CostTableLookup(X64CostTbl, ISD, MTy))
2263
132
        return LT.first * Entry->Cost;
2264
2.26k
2265
2.26k
    if (const auto *Entry = CostTableLookup(X86CostTbl, ISD, MTy))
2266
396
      return LT.first * Entry->Cost;
2267
13.2k
  }
2268
13.2k
2269
13.2k
  return BaseT::getIntrinsicInstrCost(IID, RetTy, Args, FMF, VF);
2270
13.2k
}
2271
2272
322k
int X86TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index) {
2273
322k
  assert(Val->isVectorTy() && "This must be a vector type");
2274
322k
2275
322k
  Type *ScalarType = Val->getScalarType();
2276
322k
2277
322k
  if (Index != -1U) {
2278
322k
    // Legalize the type.
2279
322k
    std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Val);
2280
322k
2281
322k
    // This type is legalized to a scalar type.
2282
322k
    if (!LT.second.isVector())
2283
4.68k
      return 0;
2284
317k
2285
317k
    // The type may be split. Normalize the index to the new type.
2286
317k
    unsigned Width = LT.second.getVectorNumElements();
2287
317k
    Index = Index % Width;
2288
317k
2289
317k
    // Floating point scalars are already located in index #0.
2290
317k
    if (ScalarType->isFloatingPointTy() && Index == 0)
2291
11.2k
      return 0;
2292
306k
  }
2293
306k
2294
306k
  // Add to the base cost if we know that the extracted element of a vector is
2295
306k
  // destined to be moved to and used in the integer register file.
2296
306k
  int RegisterFileMoveCost = 0;
2297
306k
  if (Opcode == Instruction::ExtractElement && ScalarType->isPointerTy())
2298
13.3k
    RegisterFileMoveCost = 1;
2299
306k
2300
306k
  return BaseT::getVectorInstrCost(Opcode, Val, Index) + RegisterFileMoveCost;
2301
306k
}
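// A hedged usage sketch (hypothetical helper, not in the upstream file). The
// normalization above means an extract from a split vector is priced like an
// extract from one legal part: on an SSE2-only target <8 x i32> legalizes to
// two v4i32 registers, so element 5 is costed as element 5 % 4 == 1.
static int extractCostExample(X86TTIImpl &TTI, LLVMContext &Ctx) {
  Type *V8i32 = VectorType::get(Type::getInt32Ty(Ctx), 8);
  return TTI.getVectorInstrCost(Instruction::ExtractElement, V8i32, 5);
}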
2302
2303
int X86TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
2304
45.7k
                                unsigned AddressSpace, const Instruction *I) {
2305
45.7k
  // Handle non-power-of-two vectors such as <3 x float>
2306
45.7k
  if (VectorType *VTy = dyn_cast<VectorType>(Src)) {
2307
22.8k
    unsigned NumElem = VTy->getVectorNumElements();
2308
22.8k
2309
22.8k
    // Handle a few common cases:
2310
22.8k
    // <3 x float>
2311
22.8k
    if (NumElem == 3 && VTy->getScalarSizeInBits() == 32)
2312
2
      // Cost = 64 bit store + extract + 32 bit store.
2313
2
      return 3;
2314
22.8k
2315
22.8k
    // <3 x double>
2316
22.8k
    if (NumElem == 3 && VTy->getScalarSizeInBits() == 64)
2317
2
      // Cost = 128 bit store + unpack + 64 bit store.
2318
2
      return 3;
2319
22.8k
2320
22.8k
    // Assume that all other non-power-of-two numbers are scalarized.
2321
22.8k
    if (!isPowerOf2_32(NumElem)) {
2322
4
      int Cost = BaseT::getMemoryOpCost(Opcode, VTy->getScalarType(), Alignment,
2323
4
                                        AddressSpace);
2324
4
      int SplitCost = getScalarizationOverhead(Src, Opcode == Instruction::Load,
2325
4
                                               Opcode == Instruction::Store);
2326
4
      return NumElem * Cost + SplitCost;
2327
4
    }
2328
45.7k
  }
2329
45.7k
2330
45.7k
  // Legalize the type.
2331
45.7k
  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Src);
2332
45.7k
  assert((Opcode == Instruction::Load || Opcode == Instruction::Store) &&
2333
45.7k
         "Invalid Opcode");
2334
45.7k
2335
45.7k
  // Each load/store unit costs 1.
2336
45.7k
  int Cost = LT.first * 1;
2337
45.7k
2338
45.7k
  // This isn't exactly right. We're using slow unaligned 32-byte accesses as a
2339
45.7k
  // proxy for a double-pumped AVX memory interface such as on Sandybridge.
2340
45.7k
  if (LT.second.getStoreSize() == 32 && ST->isUnalignedMem32Slow())
2341
747
    Cost *= 2;
2342
45.7k
2343
45.7k
  return Cost;
2344
45.7k
}
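// A hedged usage sketch (hypothetical helper, not in the upstream file): the
// <3 x float> special case above yields a flat cost of 3 for a store, namely
// a 64-bit store, an extract, and a 32-bit store.
static int nonPow2StoreCostExample(X86TTIImpl &TTI, LLVMContext &Ctx) {
  Type *V3f32 = VectorType::get(Type::getFloatTy(Ctx), 3);
  return TTI.getMemoryOpCost(Instruction::Store, V3f32, /*Alignment=*/4,
                             /*AddressSpace=*/0);
}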
2345
2346
int X86TTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *SrcTy,
2347
                                      unsigned Alignment,
2348
981
                                      unsigned AddressSpace) {
2349
981
  bool IsLoad = (Instruction::Load == Opcode);
2350
981
  bool IsStore = (Instruction::Store == Opcode);
2351
981
2352
981
  VectorType *SrcVTy = dyn_cast<VectorType>(SrcTy);
2353
981
  if (!SrcVTy)
2354
0
    // To calculate scalar take the regular cost, without mask
2355
0
    return getMemoryOpCost(Opcode, SrcTy, Alignment, AddressSpace);
2356
981
2357
981
  unsigned NumElem = SrcVTy->getVectorNumElements();
2358
981
  VectorType *MaskTy =
2359
981
      VectorType::get(Type::getInt8Ty(SrcVTy->getContext()), NumElem);
2360
981
  if ((IsLoad && !isLegalMaskedLoad(SrcVTy)) ||
2361
981
      (IsStore && !isLegalMaskedStore(SrcVTy)) || !isPowerOf2_32(NumElem)) {
2362
393
    // Scalarization
2363
393
    int MaskSplitCost = getScalarizationOverhead(MaskTy, false, true);
2364
393
    int ScalarCompareCost = getCmpSelInstrCost(
2365
393
        Instruction::ICmp, Type::getInt8Ty(SrcVTy->getContext()), nullptr);
2366
393
    int BranchCost = getCFInstrCost(Instruction::Br);
2367
393
    int MaskCmpCost = NumElem * (BranchCost + ScalarCompareCost);
2368
393
2369
393
    int ValueSplitCost = getScalarizationOverhead(SrcVTy, IsLoad, IsStore);
2370
393
    int MemopCost =
2371
393
        NumElem * BaseT::getMemoryOpCost(Opcode, SrcVTy->getScalarType(),
2372
393
                                         Alignment, AddressSpace);
2373
393
    return MemopCost + ValueSplitCost + MaskSplitCost + MaskCmpCost;
2374
393
  }
2375
588
2376
588
  // Legalize the type.
2377
588
  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, SrcVTy);
2378
588
  auto VT = TLI->getValueType(DL, SrcVTy);
2379
588
  int Cost = 0;
2380
588
  if (VT.isSimple() && LT.second != VT.getSimpleVT() &&
2381
588
      LT.second.getVectorNumElements() == NumElem)
2382
62
    // Promotion requires expand/truncate for data and a shuffle for mask.
2383
62
    Cost += getShuffleCost(TTI::SK_PermuteTwoSrc, SrcVTy, 0, nullptr) +
2384
62
            getShuffleCost(TTI::SK_PermuteTwoSrc, MaskTy, 0, nullptr);
2385
526
2386
526
  else if (LT.second.getVectorNumElements() > NumElem) {
2387
73
    VectorType *NewMaskTy = VectorType::get(MaskTy->getVectorElementType(),
2388
73
                                            LT.second.getVectorNumElements());
2389
73
    // Expanding requires filling the mask with zeroes.
2390
73
    Cost += getShuffleCost(TTI::SK_InsertSubvector, NewMaskTy, 0, MaskTy);
2391
73
  }
2392
588
2393
588
  // Pre-AVX512 - each maskmov load costs 2 + store costs ~8.
2394
588
  if (!ST->hasAVX512())
2395
300
    return Cost + LT.first * (IsLoad ? 2 : 8);
2396
288
2397
288
  // AVX-512 masked load/store is cheaper.
2398
288
  return Cost + LT.first;
2399
288
}
2400
2401
int X86TTIImpl::getAddressComputationCost(Type *Ty, ScalarEvolution *SE,
2402
3.44k
                                          const SCEV *Ptr) {
2403
3.44k
  // Address computations in vectorized code with non-consecutive addresses will
2404
3.44k
  // likely result in more instructions compared to scalar code where the
2405
3.44k
  // computation can more often be merged into the index mode. The resulting
2406
3.44k
  // extra micro-ops can significantly decrease throughput.
2407
3.44k
  const unsigned NumVectorInstToHideOverhead = 10;
2408
3.44k
2409
3.44k
  // Cost modeling of Strided Access Computation is hidden by the indexing
2410
3.44k
  // modes of X86 regardless of the stride value. We don't believe that there
2411
3.44k
  // is a difference between constant strided access in general and constant
2412
3.44k
  // strided value which is less than or equal to 64.
2413
3.44k
  // Even in the case of (loop invariant) stride whose value is not known at
2414
3.44k
  // compile time, the address computation will not incur more than one extra
2415
3.44k
  // ADD instruction.
2416
3.44k
  if (Ty->isVectorTy() && SE) {
2417
1.00k
    if (!BaseT::isStridedAccess(Ptr))
2418
270
      return NumVectorInstToHideOverhead;
2419
735
    if (!BaseT::getConstantStrideStep(SE, Ptr))
2420
0
      return 1;
2421
3.17k
  }
2422
3.17k
2423
3.17k
  return BaseT::getAddressComputationCost(Ty, SE, Ptr);
2424
3.17k
}
2425
2426
int X86TTIImpl::getArithmeticReductionCost(unsigned Opcode, Type *ValTy,
2427
2.54k
                                           bool IsPairwise) {
2428
2.54k
2429
2.54k
  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy);
2430
2.54k
2431
2.54k
  MVT MTy = LT.second;
2432
2.54k
2433
2.54k
  int ISD = TLI->InstructionOpcodeToISD(Opcode);
2434
2.54k
  assert(ISD && "Invalid opcode");
2435
2.54k
2436
2.54k
  // We use the Intel Architecture Code Analyzer (IACA) to measure the throughput
2437
2.54k
  // and use it as the cost.
2438
2.54k
2439
2.54k
  static const CostTblEntry SSE42CostTblPairWise[] = {
2440
2.54k
    { ISD::FADD,  MVT::v2f64,   2 },
2441
2.54k
    { ISD::FADD,  MVT::v4f32,   4 },
2442
2.54k
    { ISD::ADD,   MVT::v2i64,   2 },      // The data reported by the IACA tool is "1.6".
2443
2.54k
    { ISD::ADD,   MVT::v4i32,   3 },      // The data reported by the IACA tool is "3.5".
2444
2.54k
    { ISD::ADD,   MVT::v8i16,   5 },
2445
2.54k
  };
2446
2.54k
2447
2.54k
  static const CostTblEntry AVX1CostTblPairWise[] = {
2448
2.54k
    { ISD::FADD,  MVT::v4f32,   4 },
2449
2.54k
    { ISD::FADD,  MVT::v4f64,   5 },
2450
2.54k
    { ISD::FADD,  MVT::v8f32,   7 },
2451
2.54k
    { ISD::ADD,   MVT::v2i64,   1 },      // The data reported by the IACA tool is "1.5".
2452
2.54k
    { ISD::ADD,   MVT::v4i32,   3 },      // The data reported by the IACA tool is "3.5".
2453
2.54k
    { ISD::ADD,   MVT::v4i64,   5 },      // The data reported by the IACA tool is "4.8".
2454
2.54k
    { ISD::ADD,   MVT::v8i16,   5 },
2455
2.54k
    { ISD::ADD,   MVT::v8i32,   5 },
2456
2.54k
  };
2457
2.54k
2458
2.54k
  static const CostTblEntry SSE42CostTblNoPairWise[] = {
2459
2.54k
    { ISD::FADD,  MVT::v2f64,   2 },
2460
2.54k
    { ISD::FADD,  MVT::v4f32,   4 },
2461
2.54k
    { ISD::ADD,   MVT::v2i64,   2 },      // The data reported by the IACA tool is "1.6".
2462
2.54k
    { ISD::ADD,   MVT::v4i32,   3 },      // The data reported by the IACA tool is "3.3".
2463
2.54k
    { ISD::ADD,   MVT::v8i16,   4 },      // The data reported by the IACA tool is "4.3".
2464
2.54k
  };
2465
2.54k
2466
2.54k
  static const CostTblEntry AVX1CostTblNoPairWise[] = {
2467
2.54k
    { ISD::FADD,  MVT::v4f32,   3 },
2468
2.54k
    { ISD::FADD,  MVT::v4f64,   3 },
2469
2.54k
    { ISD::FADD,  MVT::v8f32,   4 },
2470
2.54k
    { ISD::ADD,   MVT::v2i64,   1 },      // The data reported by the IACA tool is "1.5".
2471
2.54k
    { ISD::ADD,   MVT::v4i32,   3 },      // The data reported by the IACA tool is "2.8".
2472
2.54k
    { ISD::ADD,   MVT::v4i64,   3 },
2473
2.54k
    { ISD::ADD,   MVT::v8i16,   4 },
2474
2.54k
    { ISD::ADD,   MVT::v8i32,   5 },
2475
2.54k
  };
2476
2.54k
2477
2.54k
  if (IsPairwise) {
2478
161
    if (ST->hasAVX())
2479
90
      if (const auto *Entry = CostTableLookup(AVX1CostTblPairWise, ISD, MTy))
2480
79
        return LT.first * Entry->Cost;
2481
82
2482
82
    if (ST->hasSSE42())
2483
31
      if (const auto *Entry = CostTableLookup(SSE42CostTblPairWise, ISD, MTy))
2484
22
        return LT.first * Entry->Cost;
2485
2.38k
  } else {
2486
2.38k
    if (ST->hasAVX())
2487
1.47k
      if (const auto *Entry = CostTableLookup(AVX1CostTblNoPairWise, ISD, MTy))
2488
188
        return LT.first * Entry->Cost;
2489
2.19k
2490
2.19k
    if (ST->hasSSE42())
2491
1.58k
      if (const auto *Entry = CostTableLookup(SSE42CostTblNoPairWise, ISD, MTy))
2492
54
        return LT.first * Entry->Cost;
2493
2.19k
  }
2494
2.19k
2495
2.19k
  static const CostTblEntry AVX2BoolReduction[] = {
2496
2.19k
    { ISD::AND,  MVT::v16i16,  2 }, // vpmovmskb + cmp
2497
2.19k
    { ISD::AND,  MVT::v32i8,   2 }, // vpmovmskb + cmp
2498
2.19k
    { ISD::OR,   MVT::v16i16,  2 }, // vpmovmskb + cmp
2499
2.19k
    { ISD::OR,   MVT::v32i8,   2 }, // vpmovmskb + cmp
2500
2.19k
  };
2501
2.19k
2502
2.19k
  static const CostTblEntry AVX1BoolReduction[] = {
2503
2.19k
    { ISD::AND,  MVT::v4i64,   2 }, // vmovmskpd + cmp
2504
2.19k
    { ISD::AND,  MVT::v8i32,   2 }, // vmovmskps + cmp
2505
2.19k
    { ISD::AND,  MVT::v16i16,  4 }, // vextractf128 + vpand + vpmovmskb + cmp
2506
2.19k
    { ISD::AND,  MVT::v32i8,   4 }, // vextractf128 + vpand + vpmovmskb + cmp
2507
2.19k
    { ISD::OR,   MVT::v4i64,   2 }, // vmovmskpd + cmp
2508
2.19k
    { ISD::OR,   MVT::v8i32,   2 }, // vmovmskps + cmp
2509
2.19k
    { ISD::OR,   MVT::v16i16,  4 }, // vextractf128 + vpor + vpmovmskb + cmp
2510
2.19k
    { ISD::OR,   MVT::v32i8,   4 }, // vextractf128 + vpor + vpmovmskb + cmp
2511
2.19k
  };
2512
2.19k
2513
2.19k
  static const CostTblEntry SSE2BoolReduction[] = {
2514
2.19k
    { ISD::AND,  MVT::v2i64,   2 }, // movmskpd + cmp
2515
2.19k
    { ISD::AND,  MVT::v4i32,   2 }, // movmskps + cmp
2516
2.19k
    { ISD::AND,  MVT::v8i16,   2 }, // pmovmskb + cmp
2517
2.19k
    { ISD::AND,  MVT::v16i8,   2 }, // pmovmskb + cmp
2518
2.19k
    { ISD::OR,   MVT::v2i64,   2 }, // movmskpd + cmp
2519
2.19k
    { ISD::OR,   MVT::v4i32,   2 }, // movmskps + cmp
2520
2.19k
    { ISD::OR,   MVT::v8i16,   2 }, // pmovmskb + cmp
2521
2.19k
    { ISD::OR,   MVT::v16i8,   2 }, // pmovmskb + cmp
2522
2.19k
  };
2523
2.19k
2524
2.19k
  // Handle bool allof/anyof patterns.
2525
2.19k
  if (ValTy->getVectorElementType()->isIntegerTy(1)) {
2526
430
    if (ST->hasAVX2())
2527
202
      if (const auto *Entry = CostTableLookup(AVX2BoolReduction, ISD, MTy))
2528
12
        return LT.first * Entry->Cost;
2529
418
    if (ST->hasAVX())
2530
238
      if (const auto *Entry = CostTableLookup(AVX1BoolReduction, ISD, MTy))
2531
12
        return LT.first * Entry->Cost;
2532
406
    if (ST->hasSSE2())
2533
406
      if (const auto *Entry = CostTableLookup(SSE2BoolReduction, ISD, MTy))
2534
162
        return LT.first * Entry->Cost;
2535
2.01k
  }
2536
2.01k
2537
2.01k
  return BaseT::getArithmeticReductionCost(Opcode, ValTy, IsPairwise);
2538
2.01k
}
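// A hedged usage sketch (hypothetical helper, not in the upstream file): an
// AND reduction over <32 x i1> hits the AVX2BoolReduction table above when
// the mask legalizes to v32i8, i.e. vpmovmskb + cmp for a cost of 2.
static int boolReductionCostExample(X86TTIImpl &TTI, LLVMContext &Ctx) {
  Type *V32i1 = VectorType::get(Type::getInt1Ty(Ctx), 32);
  return TTI.getArithmeticReductionCost(Instruction::And, V32i1,
                                        /*IsPairwise=*/false);
}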
2539
2540
int X86TTIImpl::getMinMaxReductionCost(Type *ValTy, Type *CondTy,
2541
1.54k
                                       bool IsPairwise, bool IsUnsigned) {
2542
1.54k
  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy);
2543
1.54k
2544
1.54k
  MVT MTy = LT.second;
2545
1.54k
2546
1.54k
  int ISD;
2547
1.54k
  if (ValTy->isIntOrIntVectorTy()) {
2548
1.51k
    ISD = IsUnsigned ? ISD::UMIN : ISD::SMIN;
2549
1.51k
  } else {
2550
28
    assert(ValTy->isFPOrFPVectorTy() &&
2551
28
           "Expected float point or integer vector type.");
2552
28
    ISD = ISD::FMINNUM;
2553
28
  }
2554
1.54k
2555
1.54k
  // We use the Intel Architecture Code Analyzer (IACA) to measure the throughput
2556
1.54k
  // and use it as the cost.
2557
1.54k
2558
1.54k
  static const CostTblEntry SSE1CostTblPairWise[] = {
2559
1.54k
      {ISD::FMINNUM, MVT::v4f32, 4},
2560
1.54k
  };
2561
1.54k
2562
1.54k
  static const CostTblEntry SSE2CostTblPairWise[] = {
2563
1.54k
      {ISD::FMINNUM, MVT::v2f64, 3},
2564
1.54k
      {ISD::SMIN, MVT::v2i64, 6},
2565
1.54k
      {ISD::UMIN, MVT::v2i64, 8},
2566
1.54k
      {ISD::SMIN, MVT::v4i32, 6},
2567
1.54k
      {ISD::UMIN, MVT::v4i32, 8},
2568
1.54k
      {ISD::SMIN, MVT::v8i16, 4},
2569
1.54k
      {ISD::UMIN, MVT::v8i16, 6},
2570
1.54k
      {ISD::SMIN, MVT::v16i8, 8},
2571
1.54k
      {ISD::UMIN, MVT::v16i8, 6},
2572
1.54k
  };
2573
1.54k
2574
1.54k
  static const CostTblEntry SSE41CostTblPairWise[] = {
2575
1.54k
      {ISD::FMINNUM, MVT::v4f32, 2},
2576
1.54k
      {ISD::SMIN, MVT::v2i64, 9},
2577
1.54k
      {ISD::UMIN, MVT::v2i64,10},
2578
1.54k
      {ISD::SMIN, MVT::v4i32, 1}, // The data reported by the IACA is "1.5"
2579
1.54k
      {ISD::UMIN, MVT::v4i32, 2}, // The data reported by the IACA is "1.8"
2580
1.54k
      {ISD::SMIN, MVT::v8i16, 2},
2581
1.54k
      {ISD::UMIN, MVT::v8i16, 2},
2582
1.54k
      {ISD::SMIN, MVT::v16i8, 3},
2583
1.54k
      {ISD::UMIN, MVT::v16i8, 3},
2584
1.54k
  };
2585
1.54k
2586
1.54k
  static const CostTblEntry SSE42CostTblPairWise[] = {
2587
1.54k
      {ISD::SMIN, MVT::v2i64, 7}, // The data reported by the IACA is "6.8"
2588
1.54k
      {ISD::UMIN, MVT::v2i64, 8}, // The data reported by the IACA is "8.6"
2589
1.54k
  };
2590
1.54k
2591
1.54k
  static const CostTblEntry AVX1CostTblPairWise[] = {
2592
1.54k
      {ISD::FMINNUM, MVT::v4f32, 1},
2593
1.54k
      {ISD::FMINNUM, MVT::v4f64, 1},
2594
1.54k
      {ISD::FMINNUM, MVT::v8f32, 2},
2595
1.54k
      {ISD::SMIN, MVT::v2i64, 3},
2596
1.54k
      {ISD::UMIN, MVT::v2i64, 3},
2597
1.54k
      {ISD::SMIN, MVT::v4i32, 1},
2598
1.54k
      {ISD::UMIN, MVT::v4i32, 1},
2599
1.54k
      {ISD::SMIN, MVT::v8i16, 1},
2600
1.54k
      {ISD::UMIN, MVT::v8i16, 1},
2601
1.54k
      {ISD::SMIN, MVT::v16i8, 2},
2602
1.54k
      {ISD::UMIN, MVT::v16i8, 2},
2603
1.54k
      {ISD::SMIN, MVT::v4i64, 7},
2604
1.54k
      {ISD::UMIN, MVT::v4i64, 7},
2605
1.54k
      {ISD::SMIN, MVT::v8i32, 3},
2606
1.54k
      {ISD::UMIN, MVT::v8i32, 3},
2607
1.54k
      {ISD::SMIN, MVT::v16i16, 3},
2608
1.54k
      {ISD::UMIN, MVT::v16i16, 3},
2609
1.54k
      {ISD::SMIN, MVT::v32i8, 3},
2610
1.54k
      {ISD::UMIN, MVT::v32i8, 3},
2611
1.54k
  };
2612
1.54k
2613
1.54k
  static const CostTblEntry AVX2CostTblPairWise[] = {
2614
1.54k
      {ISD::SMIN, MVT::v4i64, 2},
2615
1.54k
      {ISD::UMIN, MVT::v4i64, 2},
2616
1.54k
      {ISD::SMIN, MVT::v8i32, 1},
2617
1.54k
      {ISD::UMIN, MVT::v8i32, 1},
2618
1.54k
      {ISD::SMIN, MVT::v16i16, 1},
2619
1.54k
      {ISD::UMIN, MVT::v16i16, 1},
2620
1.54k
      {ISD::SMIN, MVT::v32i8, 2},
2621
1.54k
      {ISD::UMIN, MVT::v32i8, 2},
2622
1.54k
  };
2623
1.54k
2624
1.54k
  static const CostTblEntry AVX512CostTblPairWise[] = {
2625
1.54k
      {ISD::FMINNUM, MVT::v8f64, 1},
2626
1.54k
      {ISD::FMINNUM, MVT::v16f32, 2},
2627
1.54k
      {ISD::SMIN, MVT::v8i64, 2},
2628
1.54k
      {ISD::UMIN, MVT::v8i64, 2},
2629
1.54k
      {ISD::SMIN, MVT::v16i32, 1},
2630
1.54k
      {ISD::UMIN, MVT::v16i32, 1},
2631
1.54k
  };
2632
1.54k
2633
1.54k
  static const CostTblEntry SSE1CostTblNoPairWise[] = {
2634
1.54k
      {ISD::FMINNUM, MVT::v4f32, 4},
2635
1.54k
  };
2636
1.54k
2637
1.54k
  static const CostTblEntry SSE2CostTblNoPairWise[] = {
2638
1.54k
      {ISD::FMINNUM, MVT::v2f64, 3},
2639
1.54k
      {ISD::SMIN, MVT::v2i64, 6},
2640
1.54k
      {ISD::UMIN, MVT::v2i64, 8},
2641
1.54k
      {ISD::SMIN, MVT::v4i32, 6},
2642
1.54k
      {ISD::UMIN, MVT::v4i32, 8},
2643
1.54k
      {ISD::SMIN, MVT::v8i16, 4},
2644
1.54k
      {ISD::UMIN, MVT::v8i16, 6},
2645
1.54k
      {ISD::SMIN, MVT::v16i8, 8},
2646
1.54k
      {ISD::UMIN, MVT::v16i8, 6},
2647
1.54k
  };
2648
1.54k
2649
1.54k
  static const CostTblEntry SSE41CostTblNoPairWise[] = {
2650
1.54k
      {ISD::FMINNUM, MVT::v4f32, 3},
2651
1.54k
      {ISD::SMIN, MVT::v2i64, 9},
2652
1.54k
      {ISD::UMIN, MVT::v2i64,11},
2653
1.54k
      {ISD::SMIN, MVT::v4i32, 1}, // The data reported by the IACA is "1.5"
2654
1.54k
      {ISD::UMIN, MVT::v4i32, 2}, // The data reported by the IACA is "1.8"
2655
1.54k
      {ISD::SMIN, MVT::v8i16, 1}, // The data reported by the IACA is "1.5"
2656
1.54k
      {ISD::UMIN, MVT::v8i16, 2}, // The data reported by the IACA is "1.8"
2657
1.54k
      {ISD::SMIN, MVT::v16i8, 3},
2658
1.54k
      {ISD::UMIN, MVT::v16i8, 3},
2659
1.54k
  };
2660
1.54k
2661
1.54k
  static const CostTblEntry SSE42CostTblNoPairWise[] = {
2662
1.54k
      {ISD::SMIN, MVT::v2i64, 7}, // The data reported by the IACA is "6.8"
2663
1.54k
      {ISD::UMIN, MVT::v2i64, 9}, // The data reported by the IACA is "8.6"
2664
1.54k
  };
2665
1.54k
2666
1.54k
  static const CostTblEntry AVX1CostTblNoPairWise[] = {
2667
1.54k
      {ISD::FMINNUM, MVT::v4f32, 1},
2668
1.54k
      {ISD::FMINNUM, MVT::v4f64, 1},
2669
1.54k
      {ISD::FMINNUM, MVT::v8f32, 1},
2670
1.54k
      {ISD::SMIN, MVT::v2i64, 3},
2671
1.54k
      {ISD::UMIN, MVT::v2i64, 3},
2672
1.54k
      {ISD::SMIN, MVT::v4i32, 1},
2673
1.54k
      {ISD::UMIN, MVT::v4i32, 1},
2674
1.54k
      {ISD::SMIN, MVT::v8i16, 1},
2675
1.54k
      {ISD::UMIN, MVT::v8i16, 1},
2676
1.54k
      {ISD::SMIN, MVT::v16i8, 2},
2677
1.54k
      {ISD::UMIN, MVT::v16i8, 2},
2678
1.54k
      {ISD::SMIN, MVT::v4i64, 7},
2679
1.54k
      {ISD::UMIN, MVT::v4i64, 7},
2680
1.54k
      {ISD::SMIN, MVT::v8i32, 2},
2681
1.54k
      {ISD::UMIN, MVT::v8i32, 2},
2682
1.54k
      {ISD::SMIN, MVT::v16i16, 2},
2683
1.54k
      {ISD::UMIN, MVT::v16i16, 2},
2684
1.54k
      {ISD::SMIN, MVT::v32i8, 2},
2685
1.54k
      {ISD::UMIN, MVT::v32i8, 2},
2686
1.54k
  };
2687
1.54k
2688
1.54k
  static const CostTblEntry AVX2CostTblNoPairWise[] = {
2689
1.54k
      {ISD::SMIN, MVT::v4i64, 1},
2690
1.54k
      {ISD::UMIN, MVT::v4i64, 1},
2691
1.54k
      {ISD::SMIN, MVT::v8i32, 1},
2692
1.54k
      {ISD::UMIN, MVT::v8i32, 1},
2693
1.54k
      {ISD::SMIN, MVT::v16i16, 1},
2694
1.54k
      {ISD::UMIN, MVT::v16i16, 1},
2695
1.54k
      {ISD::SMIN, MVT::v32i8, 1},
2696
1.54k
      {ISD::UMIN, MVT::v32i8, 1},
2697
1.54k
  };
2698
1.54k
2699
1.54k
  static const CostTblEntry AVX512CostTblNoPairWise[] = {
2700
1.54k
      {ISD::FMINNUM, MVT::v8f64, 1},
2701
1.54k
      {ISD::FMINNUM, MVT::v16f32, 2},
2702
1.54k
      {ISD::SMIN, MVT::v8i64, 1},
2703
1.54k
      {ISD::UMIN, MVT::v8i64, 1},
2704
1.54k
      {ISD::SMIN, MVT::v16i32, 1},
2705
1.54k
      {ISD::UMIN, MVT::v16i32, 1},
2706
1.54k
  };
2707
1.54k
2708
1.54k
  if (IsPairwise) {
2709
41
    if (ST->hasAVX512())
2710
8
      if (const auto *Entry = CostTableLookup(AVX512CostTblPairWise, ISD, MTy))
2711
4
        return LT.first * Entry->Cost;
2712
37
2713
37
    if (ST->hasAVX2())
2714
12
      if (const auto *Entry = CostTableLookup(AVX2CostTblPairWise, ISD, MTy))
2715
4
        return LT.first * Entry->Cost;
2716
33
2717
33
    if (ST->hasAVX())
2718
25
      if (const auto *Entry = CostTableLookup(AVX1CostTblPairWise, ISD, MTy))
2719
25
        return LT.first * Entry->Cost;
2720
8
2721
8
    if (ST->hasSSE42())
2722
0
      if (const auto *Entry = CostTableLookup(SSE42CostTblPairWise, ISD, MTy))
2723
0
        return LT.first * Entry->Cost;
2724
8
2725
8
    if (ST->hasSSE41())
2726
0
      if (const auto *Entry = CostTableLookup(SSE41CostTblPairWise, ISD, MTy))
2727
0
        return LT.first * Entry->Cost;
2728
8
2729
8
    if (ST->hasSSE2())
2730
8
      if (const auto *Entry = CostTableLookup(SSE2CostTblPairWise, ISD, MTy))
2731
5
        return LT.first * Entry->Cost;
2732
3
2733
3
    if (ST->hasSSE1())
2734
3
      if (const auto *Entry = CostTableLookup(SSE1CostTblPairWise, ISD, MTy))
2735
3
        return LT.first * Entry->Cost;
2736
1.50k
  } else {
2737
1.50k
    if (ST->hasAVX512())
2738
557
      if (const auto *Entry =
2739
100
              CostTableLookup(AVX512CostTblNoPairWise, ISD, MTy))
2740
100
        return LT.first * Entry->Cost;
2741
1.40k
2742
1.40k
    if (ST->hasAVX2())
2743
648
      if (const auto *Entry = CostTableLookup(AVX2CostTblNoPairWise, ISD, MTy))
2744
260
        return LT.first * Entry->Cost;
2745
1.14k
2746
1.14k
    if (ST->hasAVX())
2747
588
      if (const auto *Entry = CostTableLookup(AVX1CostTblNoPairWise, ISD, MTy))
2748
516
        return LT.first * Entry->Cost;
2749
629
2750
629
    if (ST->hasSSE42())
2751
255
      if (const auto *Entry = CostTableLookup(SSE42CostTblNoPairWise, ISD, MTy))
2752
44
        return LT.first * Entry->Cost;
2753
585
2754
585
    if (ST->hasSSE41())
2755
211
      if (const auto *Entry = CostTableLookup(SSE41CostTblNoPairWise, ISD, MTy))
2756
131
        return LT.first * Entry->Cost;
2757
454
2758
454
    if (ST->hasSSE2())
2759
454
      if (const auto *Entry = CostTableLookup(SSE2CostTblNoPairWise, ISD, MTy))
2760
355
        return LT.first * Entry->Cost;
2761
99
2762
99
    if (ST->hasSSE1())
2763
99
      if (const auto *Entry = CostTableLookup(SSE1CostTblNoPairWise, ISD, MTy))
2764
3
        return LT.first * Entry->Cost;
2765
96
  }
2766
96
2767
96
  return BaseT::getMinMaxReductionCost(ValTy, CondTy, IsPairwise, IsUnsigned);
2768
96
}
2769
2770
/// Calculate the cost of materializing a 64-bit value. This helper
2771
/// method might only calculate a fraction of a larger immediate. Therefore it
2772
/// is valid to return a cost of ZERO.
2773
157k
int X86TTIImpl::getIntImmCost(int64_t Val) {
2774
157k
  if (Val == 0)
2775
434
    return TTI::TCC_Free;
2776
156k
2777
156k
  if (isInt<32>(Val))
2778
137k
    return TTI::TCC_Basic;
2779
19.6k
2780
19.6k
  return 2 * TTI::TCC_Basic;
2781
19.6k
}
2782
2783
300k
int X86TTIImpl::getIntImmCost(const APInt &Imm, Type *Ty) {
2784
300k
  assert(Ty->isIntegerTy());
2785
300k
2786
300k
  unsigned BitSize = Ty->getPrimitiveSizeInBits();
2787
300k
  if (BitSize == 0)
2788
0
    return ~0U;
2789
300k
2790
300k
  // Never hoist constants larger than 128bit, because this might lead to
2791
300k
  // incorrect code generation or assertions in codegen.
2792
300k
  // Fixme: Create a cost model for types larger than i128 once the codegen
2793
300k
  // issues have been fixed.
2794
300k
  if (BitSize > 128)
2795
164
    return TTI::TCC_Free;
2796
300k
2797
300k
  if (Imm == 0)
2798
143k
    return TTI::TCC_Free;
2799
156k
2800
156k
  // Sign-extend all constants to a multiple of 64-bit.
2801
156k
  APInt ImmVal = Imm;
2802
156k
  if (BitSize % 64 != 0)
2803
83.1k
    ImmVal = Imm.sext(alignTo(BitSize, 64));
2804
156k
2805
156k
  // Split the constant into 64-bit chunks and calculate the cost for each
2806
156k
  // chunk.
2807
156k
  int Cost = 0;
2808
313k
  for (unsigned ShiftVal = 0; ShiftVal < BitSize; ShiftVal += 64) {
2809
157k
    APInt Tmp = ImmVal.ashr(ShiftVal).sextOrTrunc(64);
2810
157k
    int64_t Val = Tmp.getSExtValue();
2811
157k
    Cost += getIntImmCost(Val);
2812
157k
  }
2813
156k
  // We need at least one instruction to materialize the constant.
2814
156k
  return std::max(1, Cost);
2815
156k
}
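// A hedged worked example (hypothetical helper, not in the upstream file):
// the loop above prices one 64-bit chunk at a time, so an i64 immediate that
// does not fit in a signed 32-bit field costs 2 * TCC_Basic, an i32 costs
// TCC_Basic, and a 128-bit constant is simply the sum of its two chunks.
static int immCostExample(X86TTIImpl &TTI, LLVMContext &Ctx) {
  APInt Big(64, 0x100000000ULL); // needs a full 64-bit materialization
  return TTI.getIntImmCost(Big, Type::getInt64Ty(Ctx));
}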
2816
2817
int X86TTIImpl::getIntImmCost(unsigned Opcode, unsigned Idx, const APInt &Imm,
2818
568k
                              Type *Ty) {
2819
568k
  assert(Ty->isIntegerTy());
2820
568k
2821
568k
  unsigned BitSize = Ty->getPrimitiveSizeInBits();
2822
568k
  // There is no cost model for constants with a bit size of 0. Return TCC_Free
2823
568k
  // here, so that constant hoisting will ignore this constant.
2824
568k
  if (BitSize == 0)
2825
0
    return TTI::TCC_Free;
2826
568k
2827
568k
  unsigned ImmIdx = ~0U;
2828
568k
  switch (Opcode) {
2829
568k
  default:
2830
43.3k
    return TTI::TCC_Free;
2831
568k
  case Instruction::GetElementPtr:
2832
183k
    // Always hoist the base address of a GetElementPtr. This prevents the
2833
183k
    // creation of new constants for every base constant that gets constant
2834
183k
    // folded with the offset.
2835
183k
    if (Idx == 0)
2836
339
      return 2 * TTI::TCC_Basic;
2837
183k
    return TTI::TCC_Free;
2838
183k
  case Instruction::Store:
2839
35.8k
    ImmIdx = 0;
2840
35.8k
    break;
2841
183k
  case Instruction::ICmp:
2842
96.2k
    // This is an imperfect hack to prevent constant hoisting of
2843
96.2k
    // compares that might be trying to check if a 64-bit value fits in
2844
96.2k
    // 32-bits. The backend can optimize these cases using a right shift by 32.
2845
96.2k
    // Ideally we would check the compare predicate here. There are also other
2846
96.2k
    // similar immediates the backend can use shifts for.
2847
96.2k
    if (Idx == 1 && Imm.getBitWidth() == 64) {
2848
35.0k
      uint64_t ImmVal = Imm.getZExtValue();
2849
35.0k
      if (ImmVal == 0x100000000ULL || ImmVal == 0xffffffff)
2850
70
        return TTI::TCC_Free;
2851
96.1k
    }
2852
96.1k
    ImmIdx = 1;
2853
96.1k
    break;
2854
96.1k
  case Instruction::And:
2855
28.4k
    // We support 64-bit ANDs with immediates with 32-bits of leading zeroes
2856
28.4k
    // by using a 32-bit operation with implicit zero extension. Detect such
2857
28.4k
    // immediates here as the normal path expects bit 31 to be sign extended.
2858
28.4k
    if (Idx == 1 && Imm.getBitWidth() == 64 && isUInt<32>(Imm.getZExtValue()))
2859
8.68k
      return TTI::TCC_Free;
2860
19.7k
    ImmIdx = 1;
2861
19.7k
    break;
2862
43.5k
  case Instruction::Add:
2863
43.5k
  case Instruction::Sub:
2864
43.5k
    // For add/sub, we can use the opposite instruction for INT32_MIN.
2865
43.5k
    if (Idx == 1 && Imm.getBitWidth() == 64 && Imm.getZExtValue() == 0x80000000)
2866
17
      return TTI::TCC_Free;
2867
43.5k
    ImmIdx = 1;
2868
43.5k
    break;
2869
43.5k
  case Instruction::UDiv:
2870
2.33k
  case Instruction::SDiv:
2871
2.33k
  case Instruction::URem:
2872
2.33k
  case Instruction::SRem:
2873
2.33k
    // Division by constant is typically expanded later into a different
2874
2.33k
    // instruction sequence. This completely changes the constants.
2875
2.33k
    // Report them as "free" to stop ConstantHoist from marking them as opaque.
2876
2.33k
    return TTI::TCC_Free;
2877
12.2k
  case Instruction::Mul:
2878
12.2k
  case Instruction::Or:
2879
12.2k
  case Instruction::Xor:
2880
12.2k
    ImmIdx = 1;
2881
12.2k
    break;
2882
12.2k
  // Always return TCC_Free for the shift value of a shift instruction.
2883
35.2k
  case Instruction::Shl:
2884
35.2k
  case Instruction::LShr:
2885
35.2k
  case Instruction::AShr:
2886
35.2k
    if (Idx == 1)
2887
30.6k
      return TTI::TCC_Free;
2888
4.61k
    break;
2889
87.9k
  case Instruction::Trunc:
2890
87.9k
  case Instruction::ZExt:
2891
87.9k
  case Instruction::SExt:
2892
87.9k
  case Instruction::IntToPtr:
2893
87.9k
  case Instruction::PtrToInt:
2894
87.9k
  case Instruction::BitCast:
2895
87.9k
  case Instruction::PHI:
2896
87.9k
  case Instruction::Call:
2897
87.9k
  case Instruction::Select:
2898
87.9k
  case Instruction::Ret:
2899
87.9k
  case Instruction::Load:
2900
87.9k
    break;
2901
300k
  }
2902
300k
2903
300k
  if (Idx == ImmIdx) {
2904
201k
    int NumConstants = divideCeil(BitSize, 64);
2905
201k
    int Cost = X86TTIImpl::getIntImmCost(Imm, Ty);
2906
201k
    return (Cost <= NumConstants * TTI::TCC_Basic)
2907
201k
               ? static_cast<int>(TTI::TCC_Free)
2908
201k
               : Cost;
2909
201k
  }
2910
99.1k
2911
99.1k
  return X86TTIImpl::getIntImmCost(Imm, Ty);
2912
99.1k
}
2913
2914
int X86TTIImpl::getIntImmCost(Intrinsic::ID IID, unsigned Idx, const APInt &Imm,
2915
63.8k
                              Type *Ty) {
2916
63.8k
  assert(Ty->isIntegerTy());
2917
63.8k
2918
63.8k
  unsigned BitSize = Ty->getPrimitiveSizeInBits();
2919
63.8k
  // There is no cost model for constants with a bit size of 0. Return TCC_Free
2920
63.8k
  // here, so that constant hoisting will ignore this constant.
2921
63.8k
  if (BitSize == 0)
2922
0
    return TTI::TCC_Free;
2923
63.8k
2924
63.8k
  switch (IID) {
2925
63.8k
  default:
2926
63.2k
    return TTI::TCC_Free;
2927
63.8k
  case Intrinsic::sadd_with_overflow:
2928
114
  case Intrinsic::uadd_with_overflow:
2929
114
  case Intrinsic::ssub_with_overflow:
2930
114
  case Intrinsic::usub_with_overflow:
2931
114
  case Intrinsic::smul_with_overflow:
2932
114
  case Intrinsic::umul_with_overflow:
2933
114
    if ((Idx == 1) && Imm.getBitWidth() <= 64 && isInt<32>(Imm.getSExtValue()))
2934
87
      return TTI::TCC_Free;
2935
27
    break;
2936
237
  case Intrinsic::experimental_stackmap:
2937
237
    if ((Idx < 2) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
2938
236
      return TTI::TCC_Free;
2939
1
    break;
2940
283
  case Intrinsic::experimental_patchpoint_void:
2941
283
  case Intrinsic::experimental_patchpoint_i64:
2942
283
    if ((Idx < 4) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
2943
283
      return TTI::TCC_Free;
2944
0
    break;
2945
28
  }
2946
28
  return X86TTIImpl::getIntImmCost(Imm, Ty);
2947
28
}
2948
2949
unsigned X86TTIImpl::getUserCost(const User *U,
2950
3.35M
                                 ArrayRef<const Value *> Operands) {
2951
3.35M
  if (isa<StoreInst>(U)) {
2952
134k
    Value *Ptr = U->getOperand(1);
2953
134k
    // Store instruction with index and scale costs 2 Uops.
2954
134k
    // Check the preceding GEP to identify non-const indices.
2955
134k
    if (auto GEP = dyn_cast<GetElementPtrInst>(Ptr)) {
2956
249k
      if (!all_of(GEP->indices(), [](Value *V) { return isa<Constant>(V); }))
2957
18.1k
        return TTI::TCC_Basic * 2;
2958
116k
    }
2959
116k
    return TTI::TCC_Basic;
2960
116k
  }
2961
3.22M
  return BaseT::getUserCost(U, Operands);
2962
3.22M
}
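// A hedged reading of the heuristic above: a store whose address is a GEP
// with any non-constant index is priced at 2 * TCC_Basic, because a store
// with index and scale costs two micro-ops; all other stores stay at
// TCC_Basic.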
2963
2964
// Return an average cost of a Gather / Scatter instruction; may be improved later.
2965
int X86TTIImpl::getGSVectorCost(unsigned Opcode, Type *SrcVTy, Value *Ptr,
2966
264
                                unsigned Alignment, unsigned AddressSpace) {
2967
264
2968
264
  assert(isa<VectorType>(SrcVTy) && "Unexpected type in getGSVectorCost");
2969
264
  unsigned VF = SrcVTy->getVectorNumElements();
2970
264
2971
264
  // Try to reduce index size from 64 bit (default for GEP)
2972
264
  // to 32. It is essential for VF 16. If the index can't be reduced to 32, the
2973
264
  // operation will use 16 x 64 indices which do not fit in a zmm and needs
2974
264
  // to be split. Also check that the base pointer is the same for all lanes,
2975
264
  // and that there's at most one variable index.
2976
264
  auto getIndexSizeInBits = [](Value *Ptr, const DataLayout& DL) {
2977
61
    unsigned IndexSize = DL.getPointerSizeInBits();
2978
61
    GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Ptr);
2979
61
    if (IndexSize < 64 || !GEP)
2980
19
      return IndexSize;
2981
42
2982
42
    unsigned NumOfVarIndices = 0;
2983
42
    Value *Ptrs = GEP->getPointerOperand();
2984
42
    if (Ptrs->getType()->isVectorTy() && !getSplatValue(Ptrs))
2985
4
      return IndexSize;
2986
55
    for (unsigned i = 1; i < GEP->getNumOperands(); ++i) {
2987
38
      if (isa<Constant>(GEP->getOperand(i)))
2988
0
        continue;
2989
38
      Type *IndxTy = GEP->getOperand(i)->getType();
2990
38
      if (IndxTy->isVectorTy())
2991
16
        IndxTy = IndxTy->getVectorElementType();
2992
38
      if ((IndxTy->getPrimitiveSizeInBits() == 64 &&
2993
38
          !isa<SExtInst>(GEP->getOperand(i))) ||
2994
38
         ++NumOfVarIndices > 1)
2995
21
        return IndexSize; // 64
2996
38
    }
2997
38
    return (unsigned)32;
2998
38
  };
2999
264
3000
264
3001
264
  // Trying to reduce IndexSize to 32 bits for vector 16.
3002
264
  // By default the IndexSize is equal to pointer size.
3003
264
  unsigned IndexSize = (ST->hasAVX512() && VF >= 16)
3004
264
                           ? getIndexSizeInBits(Ptr, DL)
3005
264
                           : DL.getPointerSizeInBits();
3006
264
3007
264
  Type *IndexVTy = VectorType::get(IntegerType::get(SrcVTy->getContext(),
3008
264
                                                    IndexSize), VF);
3009
264
  std::pair<int, MVT> IdxsLT = TLI->getTypeLegalizationCost(DL, IndexVTy);
3010
264
  std::pair<int, MVT> SrcLT = TLI->getTypeLegalizationCost(DL, SrcVTy);
3011
264
  int SplitFactor = std::max(IdxsLT.first, SrcLT.first);
3012
264
  if (SplitFactor > 1) {
3013
64
    // Handle splitting of vector of pointers
3014
64
    Type *SplitSrcTy = VectorType::get(SrcVTy->getScalarType(), VF / SplitFactor);
3015
64
    return SplitFactor * getGSVectorCost(Opcode, SplitSrcTy, Ptr, Alignment,
3016
64
                                         AddressSpace);
3017
64
  }
3018
200
3019
200
  // The gather / scatter cost is given by Intel architects. It is a rough
3020
200
  // number since we are looking at one instruction at a time.
3021
200
  const int GSOverhead = (Opcode == Instruction::Load)
3022
200
                             ? ST->getGatherOverhead()
3023
200
                             : ST->getScatterOverhead();
3024
200
  return GSOverhead + VF * getMemoryOpCost(Opcode, SrcVTy->getScalarType(),
3025
200
                                           Alignment, AddressSpace);
3026
200
}
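// A hedged worked example of the splitting above: for a v16f32 gather on
// AVX-512, 32-bit indices keep the index type in one register and a single
// zmm gather is issued, while 16 x i64 indices legalize to two registers
// (SplitFactor == 2) and the recursion prices two v8f32 gathers instead.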
3027
3028
/// Return the cost of full scalarization of gather / scatter operation.
3029
///
3030
/// Opcode - Load or Store instruction.
3031
/// SrcVTy - The type of the data vector that should be gathered or scattered.
3032
/// VariableMask - The mask is non-constant at compile time.
3033
/// Alignment - Alignment for one element.
3034
/// AddressSpace - pointer[s] address space.
3035
///
3036
int X86TTIImpl::getGSScalarCost(unsigned Opcode, Type *SrcVTy,
3037
                                bool VariableMask, unsigned Alignment,
3038
773
                                unsigned AddressSpace) {
3039
773
  unsigned VF = SrcVTy->getVectorNumElements();
3040
773
3041
773
  int MaskUnpackCost = 0;
3042
773
  if (VariableMask) {
3043
131
    VectorType *MaskTy =
3044
131
      VectorType::get(Type::getInt1Ty(SrcVTy->getContext()), VF);
3045
131
    MaskUnpackCost = getScalarizationOverhead(MaskTy, false, true);
3046
131
    int ScalarCompareCost =
3047
131
      getCmpSelInstrCost(Instruction::ICmp, Type::getInt1Ty(SrcVTy->getContext()),
3048
131
                         nullptr);
3049
131
    int BranchCost = getCFInstrCost(Instruction::Br);
3050
131
    MaskUnpackCost += VF * (BranchCost + ScalarCompareCost);
3051
131
  }
3052
773
3053
773
  // The cost of the scalar loads/stores.
3054
773
  int MemoryOpCost = VF * getMemoryOpCost(Opcode, SrcVTy->getScalarType(),
3055
773
                                          Alignment, AddressSpace);
3056
773
3057
773
  int InsertExtractCost = 0;
3058
773
  if (Opcode == Instruction::Load)
3059
4.53k
    for (unsigned i = 0; i < VF; ++i)
3060
4.12k
      // Add the cost of inserting each scalar load into the vector
3061
4.12k
      InsertExtractCost +=
3062
4.12k
        getVectorInstrCost(Instruction::InsertElement, SrcVTy, i);
3063
371
  else
3064
4.25k
    for (unsigned i = 0; i < VF; ++i)
3065
3.88k
      // Add the cost of extracting each element out of the data vector
3066
3.88k
      InsertExtractCost +=
3067
3.88k
        getVectorInstrCost(Instruction::ExtractElement, SrcVTy, i);
3068
773
3069
773
  return MemoryOpCost + MaskUnpackCost + InsertExtractCost;
3070
773
}
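// In sum, the scalarization above prices (editorial recap, not upstream):
//   VF * scalar MemOpCost                      // the scalar loads/stores
// + MaskUnpackCost + VF * (Br + ICmp)          // only for a variable mask
// + VF * insert/extract element cost           // vector <-> scalar traffic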
3071
3072
/// Calculate the cost of Gather / Scatter operation
3073
int X86TTIImpl::getGatherScatterOpCost(unsigned Opcode, Type *SrcVTy,
3074
                                       Value *Ptr, bool VariableMask,
3075
973
                                       unsigned Alignment) {
3076
973
  assert(SrcVTy->isVectorTy() && "Unexpected data type for Gather/Scatter");
3077
973
  unsigned VF = SrcVTy->getVectorNumElements();
3078
973
  PointerType *PtrTy = dyn_cast<PointerType>(Ptr->getType());
3079
973
  if (!PtrTy && Ptr->getType()->isVectorTy())
3080
841
    PtrTy = dyn_cast<PointerType>(Ptr->getType()->getVectorElementType());
3081
973
  assert(PtrTy && "Unexpected type for Ptr argument");
3082
973
  unsigned AddressSpace = PtrTy->getAddressSpace();
3083
973
3084
973
  bool Scalarize = false;
3085
973
  if ((Opcode == Instruction::Load && !isLegalMaskedGather(SrcVTy)) ||
3086
973
      (Opcode == Instruction::Store && !isLegalMaskedScatter(SrcVTy)))
3087
634
    Scalarize = true;
3088
973
  // Gather / Scatter for a vector of 2 elements is not profitable on KNL / SKX.
3089
973
  // A vector-4 gather/scatter instruction does not exist on KNL.
3090
973
  // We can extend it to 8 elements, but zeroing upper bits of
3091
973
  // the mask vector will add more instructions. Right now we give the scalar
3092
973
  // cost of vector-4 for KNL. TODO: Check, maybe the gather/scatter instruction
3093
973
  // is better in the VariableMask case.
3094
973
  if (ST->hasAVX512() && (VF == 2 || (VF == 4 && !ST->hasVLX())))
3095
143
    Scalarize = true;
3096
973
3097
973
  if (Scalarize)
3098
773
    return getGSScalarCost(Opcode, SrcVTy, VariableMask, Alignment,
3099
773
                           AddressSpace);
3100
200
3101
200
  return getGSVectorCost(Opcode, SrcVTy, Ptr, Alignment, AddressSpace);
3102
200
}
3103
3104
bool X86TTIImpl::isLSRCostLess(TargetTransformInfo::LSRCost &C1,
3105
397k
                               TargetTransformInfo::LSRCost &C2) {
3106
397k
    // X86 specific here are "instruction number 1st priority".
3107
397k
    return std::tie(C1.Insns, C1.NumRegs, C1.AddRecCost,
3108
397k
                    C1.NumIVMuls, C1.NumBaseAdds,
3109
397k
                    C1.ScaleCost, C1.ImmCost, C1.SetupCost) <
3110
397k
           std::tie(C2.Insns, C2.NumRegs, C2.AddRecCost,
3111
397k
                    C2.NumIVMuls, C2.NumBaseAdds,
3112
397k
                    C2.ScaleCost, C2.ImmCost, C2.SetupCost);
3113
397k
}
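// A minimal sketch (hypothetical helper, not in the upstream file) of what
// the lexicographic std::tie comparison above implies: fewer instructions
// always win, and register pressure only breaks ties.
static bool fewerInsnsWinExample(X86TTIImpl &TTI) {
  TargetTransformInfo::LSRCost A = {}, B = {};
  A.Insns = 3; A.NumRegs = 9; // more registers...
  B.Insns = 4; B.NumRegs = 2; // ...but one extra instruction
  return TTI.isLSRCostLess(A, B); // true: Insns is compared first
}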
3114
3115
113k
bool X86TTIImpl::canMacroFuseCmp() {
3116
113k
  return ST->hasMacroFusion() || ST->hasBranchFusion();
3117
113k
}
3118
3119
3.00k
bool X86TTIImpl::isLegalMaskedLoad(Type *DataTy) {
3120
3.00k
  if (!ST->hasAVX())
3121
647
    return false;
3122
2.36k
3123
2.36k
  // The backend can't handle a single element vector.
3124
2.36k
  if (isa<VectorType>(DataTy) && DataTy->getVectorNumElements() == 1)
3125
65
    return false;
3126
2.29k
  Type *ScalarTy = DataTy->getScalarType();
3127
2.29k
3128
2.29k
  if (ScalarTy->isPointerTy())
3129
58
    return true;
3130
2.23k
3131
2.23k
  if (ScalarTy->isFloatTy() || ScalarTy->isDoubleTy())
3132
804
    return true;
3133
1.43k
3134
1.43k
  if (!ScalarTy->isIntegerTy())
3135
2
    return false;
3136
1.43k
3137
1.43k
  unsigned IntWidth = ScalarTy->getIntegerBitWidth();
3138
1.43k
  return IntWidth == 32 || IntWidth == 64 ||
3139
1.43k
         ((IntWidth == 8 || IntWidth == 16) && ST->hasBWI());
3140
1.43k
}
3141
3142
1.72k
bool X86TTIImpl::isLegalMaskedStore(Type *DataType) {
3143
1.72k
  return isLegalMaskedLoad(DataType);
3144
1.72k
}
3145
3146
1
bool X86TTIImpl::isLegalNTLoad(Type *DataType, unsigned Alignment) {
3147
1
  unsigned DataSize = DL.getTypeStoreSize(DataType);
3148
1
  // The only supported nontemporal loads are for aligned vectors of 16 or 32
3149
1
  // bytes.  Note that 32-byte nontemporal vector loads are supported by AVX2
3150
1
  // (the equivalent stores only require AVX).
3151
1
  if (Alignment >= DataSize && (DataSize == 16 || DataSize == 32))
3152
0
    return DataSize == 16 ? ST->hasSSE1() : ST->hasAVX2();
3153
1
3154
1
  return false;
3155
1
}
3156
3157
1
bool X86TTIImpl::isLegalNTStore(Type *DataType, unsigned Alignment) {
3158
1
  unsigned DataSize = DL.getTypeStoreSize(DataType);
3159
1
3160
1
  // SSE4A supports nontemporal stores of float and double at arbitrary
3161
1
  // alignment.
3162
1
  if (ST->hasSSE4A() && (DataType->isFloatTy() || DataType->isDoubleTy()))
3163
0
    return true;
3164
1
3165
1
  // Besides the SSE4A subtarget exception above, only aligned stores are
3166
1
  // available nontemporally on any other subtarget. And only stores with a size
3167
1
  // of 4..32 bytes (powers of 2, only) are permitted.
3168
1
  if (Alignment < DataSize || DataSize < 4 || DataSize > 32 ||
3169
1
      !isPowerOf2_32(DataSize))
3170
1
    return false;
3171
0
3172
0
  // 32-byte vector nontemporal stores are supported by AVX (the equivalent
3173
0
  // loads require AVX2).
3174
0
  if (DataSize == 32)
3175
0
    return ST->hasAVX();
3176
0
  else if (DataSize == 16)
3177
0
    return ST->hasSSE1();
3178
0
  return true;
3179
0
}
3180
3181
543
bool X86TTIImpl::isLegalMaskedExpandLoad(Type *DataTy) {
3182
543
  if (!isa<VectorType>(DataTy))
3183
0
    return false;
3184
543
3185
543
  if (!ST->hasAVX512())
3186
115
    return false;
3187
428
3188
428
  // The backend can't handle a single element vector.
3189
428
  if (DataTy->getVectorNumElements() == 1)
3190
0
    return false;
3191
428
3192
428
  Type *ScalarTy = DataTy->getVectorElementType();
3193
428
3194
428
  if (ScalarTy->isFloatTy() || ScalarTy->isDoubleTy())
3195
146
    return true;
3196
282
3197
282
  if (!ScalarTy->isIntegerTy())
3198
0
    return false;
3199
282
3200
282
  unsigned IntWidth = ScalarTy->getIntegerBitWidth();
3201
282
  return IntWidth == 32 || IntWidth == 64 ||
3202
282
         ((IntWidth == 8 || IntWidth == 16) && ST->hasVBMI2());
3203
282
}
3204
3205
229
bool X86TTIImpl::isLegalMaskedCompressStore(Type *DataTy) {
3206
229
  return isLegalMaskedExpandLoad(DataTy);
3207
229
}
3208
3209
1.91k
bool X86TTIImpl::isLegalMaskedGather(Type *DataTy) {
3210
1.91k
  // Some CPUs have better gather performance than others.
3211
1.91k
  // TODO: Remove the explicit ST->hasAVX512()? That would mean we would only
3212
1.91k
  // enable gather with a -march.
3213
1.91k
  if (!(ST->hasAVX512() || (ST->hasFastGather() && ST->hasAVX2())))
3214
635
    return false;
3215
1.27k
3216
1.27k
  // This function is currently called in two cases: from the Loop Vectorizer
3217
1.27k
  // and from the Scalarizer.
3218
1.27k
  // When the Loop Vectorizer asks about legality of the feature,
3219
1.27k
  // the vectorization factor is not calculated yet. The Loop Vectorizer
3220
1.27k
  // sends a scalar type and the decision is based on the width of the
3221
1.27k
  // scalar element.
3222
1.27k
  // Later on, the cost model will estimate usage of this intrinsic based on
3223
1.27k
  // the vector type.
3224
1.27k
  // The Scalarizer asks again about legality. It sends a vector type.
3225
1.27k
  // In this case we can reject non-power-of-2 vectors.
3226
1.27k
  // We also reject single element vectors as the type legalizer can't
3227
1.27k
  // scalarize it.
3228
1.27k
  if (isa<VectorType>(DataTy)) {
3229
885
    unsigned NumElts = DataTy->getVectorNumElements();
3230
885
    if (NumElts == 1 || !isPowerOf2_32(NumElts))
3231
38
      return false;
3232
1.24k
  }
3233
1.24k
  Type *ScalarTy = DataTy->getScalarType();
3234
1.24k
  if (ScalarTy->isPointerTy())
3235
6
    return true;
3236
1.23k
3237
1.23k
  if (ScalarTy->isFloatTy() || ScalarTy->isDoubleTy())
3238
557
    return true;
3239
678
3240
678
  if (!ScalarTy->isIntegerTy())
3241
0
    return false;
3242
678
3243
678
  unsigned IntWidth = ScalarTy->getIntegerBitWidth();
3244
678
  return IntWidth == 32 || IntWidth == 64;
3245
678
}
3246
3247
1.65k
bool X86TTIImpl::isLegalMaskedScatter(Type *DataType) {
3248
1.65k
  // AVX2 doesn't support scatter
3249
1.65k
  if (!ST->hasAVX512())
3250
1.15k
    return false;
3251
506
  return isLegalMaskedGather(DataType);
3252
506
}
3253
3254
424
bool X86TTIImpl::hasDivRemOp(Type *DataType, bool IsSigned) {
3255
424
  EVT VT = TLI->getValueType(DL, DataType);
3256
424
  return TLI->isOperationLegal(IsSigned ? ISD::SDIVREM : ISD::UDIVREM, VT);
3257
424
}
3258
3259
5
bool X86TTIImpl::isFCmpOrdCheaperThanFCmpZero(Type *Ty) {
3260
5
  return false;
3261
5
}
3262
3263
bool X86TTIImpl::areInlineCompatible(const Function *Caller,
3264
207k
                                     const Function *Callee) const {
3265
207k
  const TargetMachine &TM = getTLI()->getTargetMachine();
3266
207k
3267
207k
  // Work this as a subsetting of subtarget features.
3268
207k
  const FeatureBitset &CallerBits =
3269
207k
      TM.getSubtargetImpl(*Caller)->getFeatureBits();
3270
207k
  const FeatureBitset &CalleeBits =
3271
207k
      TM.getSubtargetImpl(*Callee)->getFeatureBits();
3272
207k
3273
207k
  FeatureBitset RealCallerBits = CallerBits & ~InlineFeatureIgnoreList;
3274
207k
  FeatureBitset RealCalleeBits = CalleeBits & ~InlineFeatureIgnoreList;
3275
207k
  return (RealCallerBits & RealCalleeBits) == RealCalleeBits;
3276
207k
}
3277
3278
bool X86TTIImpl::areFunctionArgsABICompatible(
3279
    const Function *Caller, const Function *Callee,
3280
1.53k
    SmallPtrSetImpl<Argument *> &Args) const {
3281
1.53k
  if (!BaseT::areFunctionArgsABICompatible(Caller, Callee, Args))
3282
2
    return false;
3283
1.53k
3284
1.53k
  // If we get here, we know the target features match. If one function
3285
1.53k
  // considers 512-bit vectors legal and the other does not, consider them
3286
1.53k
  // incompatible.
3287
1.53k
  // FIXME Look at the arguments and only consider 512 bit or larger vectors?
3288
1.53k
  const TargetMachine &TM = getTLI()->getTargetMachine();
3289
1.53k
3290
1.53k
  return TM.getSubtarget<X86Subtarget>(*Caller).useAVX512Regs() ==
3291
1.53k
         TM.getSubtarget<X86Subtarget>(*Callee).useAVX512Regs();
3292
1.53k
}
3293
3294
X86TTIImpl::TTI::MemCmpExpansionOptions
3295
135k
X86TTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const {
3296
135k
  TTI::MemCmpExpansionOptions Options;
3297
135k
  Options.MaxNumLoads = TLI->getMaxExpandSizeMemcmp(OptSize);
3298
135k
  Options.NumLoadsPerBlock = 2;
3299
135k
  if (IsZeroCmp) {
3300
135k
    // Only enable vector loads for equality comparison. Right now the vector
3301
135k
    // version is not as fast for three-way compare (see #33329).
3302
135k
    // TODO: enable AVX512 when the DAG is ready.
3303
135k
    // if (ST->hasAVX512()) Options.LoadSizes.push_back(64);
3304
135k
    const unsigned PreferredWidth = ST->getPreferVectorWidth();
3305
135k
    if (PreferredWidth >= 256 && ST->hasAVX2()) Options.LoadSizes.push_back(32);
3306
135k
    if (PreferredWidth >= 128 && ST->hasSSE2()) Options.LoadSizes.push_back(16);
3307
135k
    // All GPR and vector loads can be unaligned. SIMD compare requires integer
3308
135k
    // vectors (SSE2/AVX2).
3309
135k
    Options.AllowOverlappingLoads = true;
3310
135k
  }
3311
135k
  if (ST->is64Bit()) {
3312
109k
    Options.LoadSizes.push_back(8);
3313
109k
  }
3314
135k
  Options.LoadSizes.push_back(4);
3315
135k
  Options.LoadSizes.push_back(2);
3316
135k
  Options.LoadSizes.push_back(1);
3317
135k
  return Options;
3318
135k
}
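// A hedged worked example: on a 64-bit AVX2 subtarget with a preferred
// vector width of at least 256, the options built above come out as
// LoadSizes = {32, 16, 8, 4, 2, 1} with overlapping loads allowed, so an
// equality memcmp of 31 bytes lowers to two overlapping 16-byte vector
// compares rather than a scalar loop.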
3319
3320
1.20k
bool X86TTIImpl::enableInterleavedAccessVectorization() {
3321
1.20k
  // TODO: We expect this to be beneficial regardless of arch,
3322
1.20k
  // but there are currently some unexplained performance artifacts on Atom.
3323
1.20k
  // As a temporary solution, disable on Atom.
3324
1.20k
  return !(ST->isAtom());
3325
1.20k
}
3326
3327
// Get estimation for interleaved load/store operations for AVX2.
3328
// \p Factor is the interleaved-access factor (stride) - number of
3329
// (interleaved) elements in the group.
3330
// \p Indices contains the indices for a strided load: when the
3331
// interleaved load has gaps they indicate which elements are used.
3332
// If Indices is empty (or if the number of indices is equal to the size
3333
// of the interleaved-access as given in \p Factor) the access has no gaps.
3334
//
3335
// As opposed to AVX-512, AVX2 does not have generic shuffles that allow
3336
// computing the cost using a generic formula as a function of generic
3337
// shuffles. We therefore use a lookup table instead, filled according to
3338
// the instruction sequences that codegen currently generates.
3339
int X86TTIImpl::getInterleavedMemoryOpCostAVX2(unsigned Opcode, Type *VecTy,
3340
                                               unsigned Factor,
3341
                                               ArrayRef<unsigned> Indices,
3342
                                               unsigned Alignment,
3343
                                               unsigned AddressSpace,
3344
                                               bool UseMaskForCond,
3345
28
                                               bool UseMaskForGaps) {
3346
28
3347
28
  if (UseMaskForCond || UseMaskForGaps)
3348
0
    return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
3349
0
                                             Alignment, AddressSpace,
3350
0
                                             UseMaskForCond, UseMaskForGaps);
3351
28
3352
28
  // We currently support only fully-interleaved groups, with no gaps.
3353
28
  // TODO: Support also strided loads (interleaved-groups with gaps).
3354
28
  if (Indices.size() && Indices.size() != Factor)
3355
4
    return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
3356
4
                                             Alignment, AddressSpace);
3357
24
3358
24
  // VecTy for interleave memop is <VF*Factor x Elt>.
3359
24
  // So, for VF=4, Interleave Factor = 3, Element type = i32 we have
3360
24
  // VecTy = <12 x i32>.
3361
24
  MVT LegalVT = getTLI()->getTypeLegalizationCost(DL, VecTy).second;
3362
24
3363
24
  // This function can be called with VecTy=<6xi128>, Factor=3, in which case
3364
24
  // VF=2, while v2i128 is an unsupported MVT vector type
3365
24
  // (see MachineValueType.h::getVectorVT()).
3366
24
  if (!LegalVT.isVector())
3367
2
    return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
3368
2
                                             Alignment, AddressSpace);
3369
22
3370
22
  unsigned VF = VecTy->getVectorNumElements() / Factor;
3371
22
  Type *ScalarTy = VecTy->getVectorElementType();
3372
22
3373
22
  // Calculate the number of memory operations (NumOfMemOps), required
3374
22
  // for load/store the VecTy.
3375
22
  unsigned VecTySize = DL.getTypeStoreSize(VecTy);
3376
22
  unsigned LegalVTSize = LegalVT.getStoreSize();
3377
22
  unsigned NumOfMemOps = (VecTySize + LegalVTSize - 1) / LegalVTSize;
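  // (Illustrative arithmetic, not in the original source: with AVX2, a
  //  VecTy of <48 x i8> legalizes to v32i8 parts, so VecTySize = 48,
  //  LegalVTSize = 32, and NumOfMemOps = (48 + 32 - 1) / 32 = 2; the
  //  expression above is a ceiling division.)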
3378
22
3379
22
  // Get the cost of one memory operation.
3380
22
  Type *SingleMemOpTy = VectorType::get(VecTy->getVectorElementType(),
3381
22
                                        LegalVT.getVectorNumElements());
3382
22
  unsigned MemOpCost =
3383
22
      getMemoryOpCost(Opcode, SingleMemOpTy, Alignment, AddressSpace);
3384
22
3385
22
  VectorType *VT = VectorType::get(ScalarTy, VF);
3386
22
  EVT ETy = TLI->getValueType(DL, VT);
3387
22
  if (!ETy.isSimple())
3388
0
    return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
3389
0
                                             Alignment, AddressSpace);
3390
22
3391
22
  // TODO: Complete for other data-types and strides.
3392
22
  // Each combination of Stride, ElementTy and VF results in a different
3393
22
  // sequence; the cost tables are therefore accessed with:
3394
22
  // Factor (stride) and VectorType=VFxElemType.
3395
22
  // The Cost accounts only for the shuffle sequence;
3396
22
  // The cost of the loads/stores is accounted for separately.
3397
22
  //
3398
22
  static const CostTblEntry AVX2InterleavedLoadTbl[] = {
3399
22
    { 2, MVT::v4i64, 6 }, //(load 8i64 and) deinterleave into 2 x 4i64
3400
22
    { 2, MVT::v4f64, 6 }, //(load 8f64 and) deinterleave into 2 x 4f64
3401
22
3402
22
    { 3, MVT::v2i8,  10 }, //(load 6i8 and)  deinterleave into 3 x 2i8
3403
22
    { 3, MVT::v4i8,  4 },  //(load 12i8 and) deinterleave into 3 x 4i8
3404
22
    { 3, MVT::v8i8,  9 },  //(load 24i8 and) deinterleave into 3 x 8i8
3405
22
    { 3, MVT::v16i8, 11},  //(load 48i8 and) deinterleave into 3 x 16i8
3406
22
    { 3, MVT::v32i8, 13},  //(load 96i8 and) deinterleave into 3 x 32i8
3407
22
    { 3, MVT::v8f32, 17 }, //(load 24f32 and)deinterleave into 3 x 8f32
3408
22
3409
22
    { 4, MVT::v2i8,  12 }, //(load 8i8 and)   deinterleave into 4 x 2i8
3410
22
    { 4, MVT::v4i8,  4 },  //(load 16i8 and)  deinterleave into 4 x 4i8
3411
22
    { 4, MVT::v8i8,  20 }, //(load 32i8 and)  deinterleave into 4 x 8i8
3412
22
    { 4, MVT::v16i8, 39 }, //(load 64i8 and)  deinterleave into 4 x 16i8
3413
22
    { 4, MVT::v32i8, 80 }, //(load 128i8 and) deinterleave into 4 x 32i8
3414
22
3415
22
    { 8, MVT::v8f32, 40 }  //(load 64f32 and)deinterleave into 8 x 8f32
3416
22
  };
3417
22
3418
22
  static const CostTblEntry AVX2InterleavedStoreTbl[] = {
3419
22
    { 2, MVT::v4i64, 6 }, //interleave into 2 x 4i64 into 8i64 (and store)
3420
22
    { 2, MVT::v4f64, 6 }, //interleave into 2 x 4f64 into 8f64 (and store)
3421
22
3422
22
    { 3, MVT::v2i8,  7 },  //interleave 3 x 2i8  into 6i8 (and store)
3423
22
    { 3, MVT::v4i8,  8 },  //interleave 3 x 4i8  into 12i8 (and store)
3424
22
    { 3, MVT::v8i8,  11 }, //interleave 3 x 8i8  into 24i8 (and store)
3425
22
    { 3, MVT::v16i8, 11 }, //interleave 3 x 16i8 into 48i8 (and store)
3426
22
    { 3, MVT::v32i8, 13 }, //interleave 3 x 32i8 into 96i8 (and store)
3427
22
3428
22
    { 4, MVT::v2i8,  12 }, //interleave 4 x 2i8  into 8i8 (and store)
3429
22
    { 4, MVT::v4i8,  9 },  //interleave 4 x 4i8  into 16i8 (and store)
3430
22
    { 4, MVT::v8i8,  10 }, //interleave 4 x 8i8  into 32i8 (and store)
3431
22
    { 4, MVT::v16i8, 10 }, //interleave 4 x 16i8 into 64i8 (and store)
3432
22
    { 4, MVT::v32i8, 12 }  //interleave 4 x 32i8 into 128i8 (and store)
3433
22
  };
3434
22
3435
22
  if (Opcode == Instruction::Load) {
3436
2
    if (const auto *Entry =
3437
0
            CostTableLookup(AVX2InterleavedLoadTbl, Factor, ETy.getSimpleVT()))
3438
0
      return NumOfMemOps * MemOpCost + Entry->Cost;
3439
20
  } else {
3440
20
    assert(Opcode == Instruction::Store &&
3441
20
           "Expected Store Instruction at this  point");
3442
20
    if (const auto *Entry =
3443
6
            CostTableLookup(AVX2InterleavedStoreTbl, Factor, ETy.getSimpleVT()))
3444
6
      return NumOfMemOps * MemOpCost + Entry->Cost;
3445
16
  }
3446
16
3447
16
  return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
3448
16
                                           Alignment, AddressSpace);
3449
16
}
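// Worked example (illustrative, not part of the original source): an
// interleaved load of VecTy = <48 x i8> with Factor = 3 on AVX2 gives
// VF = 16 and ETy = v16i8, hitting { 3, MVT::v16i8, 11 } in
// AVX2InterleavedLoadTbl. With NumOfMemOps = 2 and an assumed MemOpCost of
// 1 per legalized load, the returned cost would be 2 * 1 + 11 = 13.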
3450
3451
// Get an estimate for interleaved load/store operations and strided loads.
3452
// \p Indices contains the indices for a strided load.
3453
// \p Factor is the interleaving factor (stride).
3454
// AVX-512 provides 3-src shuffles that significantly reduce the cost.
3455
int X86TTIImpl::getInterleavedMemoryOpCostAVX512(unsigned Opcode, Type *VecTy,
3456
                                                 unsigned Factor,
3457
                                                 ArrayRef<unsigned> Indices,
3458
                                                 unsigned Alignment,
3459
                                                 unsigned AddressSpace,
3460
                                                 bool UseMaskForCond,
3461
15
                                                 bool UseMaskForGaps) {
3462
15
3463
15
  if (UseMaskForCond || UseMaskForGaps)
3464
12
    return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
3465
12
                                             Alignment, AddressSpace,
3466
12
                                             UseMaskForCond, UseMaskForGaps);
3467
3
3468
3
  // VecTy for interleave memop is <VF*Factor x Elt>.
3469
3
  // So, for VF=4, Interleave Factor = 3, Element type = i32 we have
3470
3
  // VecTy = <12 x i32>.
3471
3
3472
3
  // Calculate the number of memory operations (NumOfMemOps), required
3473
3
  // for load/store the VecTy.
3474
3
  MVT LegalVT = getTLI()->getTypeLegalizationCost(DL, VecTy).second;
3475
3
  unsigned VecTySize = DL.getTypeStoreSize(VecTy);
3476
3
  unsigned LegalVTSize = LegalVT.getStoreSize();
3477
3
  unsigned NumOfMemOps = (VecTySize + LegalVTSize - 1) / LegalVTSize;
3478
3
3479
3
  // Get the cost of one memory operation.
3480
3
  Type *SingleMemOpTy = VectorType::get(VecTy->getVectorElementType(),
3481
3
                                        LegalVT.getVectorNumElements());
3482
3
  unsigned MemOpCost =
3483
3
      getMemoryOpCost(Opcode, SingleMemOpTy, Alignment, AddressSpace);
3484
3
3485
3
  unsigned VF = VecTy->getVectorNumElements() / Factor;
3486
3
  MVT VT = MVT::getVectorVT(MVT::getVT(VecTy->getScalarType()), VF);
3487
3
3488
3
  if (Opcode == Instruction::Load) {
3489
3
    // The tables (AVX512InterleavedLoadTbl and AVX512InterleavedStoreTbl)
3490
3
    // contain the cost of the optimized shuffle sequence that the
3491
3
    // X86InterleavedAccess pass will generate.
3492
3
    // The cost of the loads and stores is computed separately from the table.
3493
3
3494
3
    // X86InterleavedAccess supports only the following interleaved-access groups.
3495
3
    static const CostTblEntry AVX512InterleavedLoadTbl[] = {
3496
3
        {3, MVT::v16i8, 12}, //(load 48i8 and) deinterleave into 3 x 16i8
3497
3
        {3, MVT::v32i8, 14}, //(load 96i8 and) deinterleave into 3 x 32i8
3498
3
        {3, MVT::v64i8, 22}, //(load 192i8 and) deinterleave into 3 x 64i8
3499
3
    };
3500
3
3501
3
    if (const auto *Entry =
3502
0
            CostTableLookup(AVX512InterleavedLoadTbl, Factor, VT))
3503
0
      return NumOfMemOps * MemOpCost + Entry->Cost;
3504
3
    // If an entry does not exist, fall back to the default implementation.
3505
3
3506
3
    // The kind of shuffle depends on the number of loaded values.
3507
3
    // If we load the entire data in one register, we can use a 1-src shuffle.
3508
3
    // Otherwise, we'll merge 2 sources in each operation.
3509
3
    TTI::ShuffleKind ShuffleKind =
3510
3
        (NumOfMemOps > 1) ? TTI::SK_PermuteTwoSrc : TTI::SK_PermuteSingleSrc;
3511
3
3512
3
    unsigned ShuffleCost =
3513
3
        getShuffleCost(ShuffleKind, SingleMemOpTy, 0, nullptr);
3514
3
3515
3
    unsigned NumOfLoadsInInterleaveGrp =
3516
3
        Indices.size() ? Indices.size() : Factor;
3517
3
    Type *ResultTy = VectorType::get(VecTy->getVectorElementType(),
3518
3
                                     VecTy->getVectorNumElements() / Factor);
3519
3
    unsigned NumOfResults =
3520
3
        getTLI()->getTypeLegalizationCost(DL, ResultTy).first *
3521
3
        NumOfLoadsInInterleaveGrp;
3522
3
3523
3
    // About half of the loads may be folded into shuffles when we have only
3524
3
    // one result. If we have more than one result, we do not fold loads at all.
3525
3
    unsigned NumOfUnfoldedLoads =
3526
3
        NumOfResults > 1 ? NumOfMemOps : NumOfMemOps / 2;
3527
3
3528
3
    // Get a number of shuffle operations per result.
3529
3
    unsigned NumOfShufflesPerResult =
3530
3
        std::max((unsigned)1, (unsigned)(NumOfMemOps - 1));
3531
3
3532
3
    // The SK_PermuteTwoSrc shuffle clobbers one of its source operands.
3533
3
    // When we have more than one destination, we need additional instructions
3534
3
    // to keep sources.
3535
3
    unsigned NumOfMoves = 0;
3536
3
    if (NumOfResults > 1 && ShuffleKind == TTI::SK_PermuteTwoSrc)
3537
0
      NumOfMoves = NumOfResults * NumOfShufflesPerResult / 2;
3538
3
3539
3
    int Cost = NumOfResults * NumOfShufflesPerResult * ShuffleCost +
3540
3
               NumOfUnfoldedLoads * MemOpCost + NumOfMoves;
3541
3
3542
3
    return Cost;
3543
3
  }
3544
0
3545
0
  // Store.
3546
0
  assert(Opcode == Instruction::Store &&
3547
0
         "Expected Store Instruction at this  point");
3548
0
  // X86InterleavedAccess supports only the following interleaved-access groups.
3549
0
  static const CostTblEntry AVX512InterleavedStoreTbl[] = {
3550
0
      {3, MVT::v16i8, 12}, // interleave 3 x 16i8 into 48i8 (and store)
3551
0
      {3, MVT::v32i8, 14}, // interleave 3 x 32i8 into 96i8 (and store)
3552
0
      {3, MVT::v64i8, 26}, // interleave 3 x 64i8 into 192i8 (and store)
3553
0
3554
0
      {4, MVT::v8i8, 10},  // interleave 4 x 8i8  into 32i8  (and store)
3555
0
      {4, MVT::v16i8, 11}, // interleave 4 x 16i8 into 64i8  (and store)
3556
0
      {4, MVT::v32i8, 14}, // interleave 4 x 32i8 into 128i8 (and store)
3557
0
      {4, MVT::v64i8, 24}  // interleave 4 x 64i8 into 256i8 (and store)
3558
0
  };
3559
0
3560
0
  if (const auto *Entry =
3561
0
          CostTableLookup(AVX512InterleavedStoreTbl, Factor, VT))
3562
0
    return NumOfMemOps * MemOpCost + Entry->Cost;
3563
0
  // If an entry does not exist, fall back to the default implementation.
3564
0
3565
0
  // There are no strided stores at the moment, and a store can't be folded
3566
0
  // into a shuffle.
3567
0
  unsigned NumOfSources = Factor; // The number of values to be merged.
3568
0
  unsigned ShuffleCost =
3569
0
      getShuffleCost(TTI::SK_PermuteTwoSrc, SingleMemOpTy, 0, nullptr);
3570
0
  unsigned NumOfShufflesPerStore = NumOfSources - 1;
3571
0
3572
0
  // The SK_PermuteTwoSrc shuffle clobbers one of its source operands.
3573
0
  // We need additional instructions to keep sources.
3574
0
  unsigned NumOfMoves = NumOfMemOps * NumOfShufflesPerStore / 2;
3575
0
  int Cost = NumOfMemOps * (MemOpCost + NumOfShufflesPerStore * ShuffleCost) +
3576
0
             NumOfMoves;
3577
0
  return Cost;
3578
0
}
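// Worked example for the store path (illustrative, with assumed unit costs):
// storing a Factor = 2 interleaved group of VecTy = <16 x i32> on AVX-512
// gives NumOfMemOps = 1 (one ZMM-sized v16i32 part), NumOfShufflesPerStore =
// Factor - 1 = 1, and NumOfMoves = 1 * 1 / 2 = 0; with MemOpCost = 1 and
// ShuffleCost = 1 the formula yields Cost = 1 * (1 + 1 * 1) + 0 = 2.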
3579
3580
int X86TTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
3581
                                           unsigned Factor,
3582
                                           ArrayRef<unsigned> Indices,
3583
                                           unsigned Alignment,
3584
                                           unsigned AddressSpace,
3585
                                           bool UseMaskForCond,
3586
82
                                           bool UseMaskForGaps) {
3587
82
  auto isSupportedOnAVX512 = [](Type *VecTy, bool HasBW) {
3588
17
    Type *EltTy = VecTy->getVectorElementType();
3589
17
    if (EltTy->isFloatTy() || EltTy->isDoubleTy() || EltTy->isIntegerTy(64) ||
3590
17
        EltTy->isIntegerTy(32) || EltTy->isPointerTy())
3591
3
      return true;
3592
14
    if (EltTy->isIntegerTy(16) || EltTy->isIntegerTy(8))
3593
12
      return HasBW;
3594
2
    return false;
3595
2
  };
3596
82
  if (ST->hasAVX512() && isSupportedOnAVX512(VecTy, ST->hasBWI()))
3597
15
    return getInterleavedMemoryOpCostAVX512(Opcode, VecTy, Factor, Indices,
3598
15
                                            Alignment, AddressSpace,
3599
15
                                            UseMaskForCond, UseMaskForGaps);
3600
67
  if (ST->hasAVX2())
3601
28
    return getInterleavedMemoryOpCostAVX2(Opcode, VecTy, Factor, Indices,
3602
28
                                          Alignment, AddressSpace,
3603
28
                                          UseMaskForCond, UseMaskForGaps);
3604
39
3605
39
  return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
3606
39
                                           Alignment, AddressSpace,
3607
39
                                           UseMaskForCond, UseMaskForGaps);
3608
39
}
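// Dispatch sketch (illustrative, not part of the original source):
//
//   // Factor = 3, VF = 8 -> VecTy = <24 x float>
//   Type *VecTy = VectorType::get(Type::getFloatTy(Ctx), 24);
//   // hasAVX512()    -> getInterleavedMemoryOpCostAVX512 (f32 supported)
//   // hasAVX2() only -> getInterleavedMemoryOpCostAVX2
//   // neither        -> BaseT::getInterleavedMemoryOpCost
//   // i8/i16 elements additionally require hasBWI() for the AVX-512 path.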