/Users/buildslave/jenkins/sharedspace/clang-stage2-coverage-R@2/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
Line | Count | Source |
1 | | //===-- X86TargetTransformInfo.cpp - X86 specific TTI pass ----------------===// |
2 | | // |
3 | | // The LLVM Compiler Infrastructure |
4 | | // |
5 | | // This file is distributed under the University of Illinois Open Source |
6 | | // License. See LICENSE.TXT for details. |
7 | | // |
8 | | //===----------------------------------------------------------------------===// |
9 | | /// \file |
10 | | /// This file implements a TargetTransformInfo analysis pass specific to the |
11 | | /// X86 target machine. It uses the target's detailed information to provide |
12 | | /// more precise answers to certain TTI queries, while letting the target |
13 | | /// independent and default TTI implementations handle the rest. |
14 | | /// |
15 | | //===----------------------------------------------------------------------===// |
16 | | /// A note about the cost model numbers used below: they correspond to a |
17 | | /// "generic" X86 CPU rather than to a concrete CPU model. Usually the numbers |
18 | | /// correspond to the CPU where the feature first appeared. For example, if we |
19 | | /// check Subtarget.hasSSE42() in the lookups below, the cost is based on |
20 | | /// Nehalem, as that was the first CPU to support that feature level and thus |
21 | | /// most likely has the worst-case cost. |
22 | | /// Some examples of other technologies/CPUs: |
23 | | /// SSE 3 - Pentium4 / Athlon64 |
24 | | /// SSE 4.1 - Penryn |
25 | | /// SSE 4.2 - Nehalem |
26 | | /// AVX - Sandy Bridge |
27 | | /// AVX2 - Haswell |
28 | | /// AVX-512 - Xeon Phi / Skylake |
29 | | /// And some examples of instruction target dependent costs (latency) |
30 | | /// divss sqrtss rsqrtss |
31 | | /// AMD K7 11-16 19 3 |
32 | | /// Piledriver 9-24 13-15 5 |
33 | | /// Jaguar 14 16 2 |
34 | | /// Pentium II,III 18 30 2 |
35 | | /// Nehalem 7-14 7-18 3 |
36 | | /// Haswell 10-13 11 5 |
37 | | /// TODO: Develop and implement the target dependent cost model and |
38 | | /// specialize cost numbers for different Cost Model Targets such as throughput, |
39 | | /// code size, latency and uop count. |
40 | | //===----------------------------------------------------------------------===// |
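The hooks in this file are reached through the TargetTransformInfo wrapper rather than being called directly. A minimal sketch of how a pass might query this cost model (illustrative only; the function name is hypothetical):

    #include "llvm/Analysis/TargetTransformInfo.h"
    using namespace llvm;

    static int mulCostSketch(LLVMContext &Ctx, const TargetTransformInfo &TTI) {
      // Cost of a <4 x i32> multiply; dispatches to X86TTIImpl on X86 targets.
      Type *V4I32 = VectorType::get(Type::getInt32Ty(Ctx), 4);
      return TTI.getArithmeticInstrCost(Instruction::Mul, V4I32);
    }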
41 | | |
42 | | #include "X86TargetTransformInfo.h" |
43 | | #include "llvm/Analysis/TargetTransformInfo.h" |
44 | | #include "llvm/CodeGen/BasicTTIImpl.h" |
45 | | #include "llvm/IR/IntrinsicInst.h" |
46 | | #include "llvm/Support/Debug.h" |
47 | | #include "llvm/Target/CostTable.h" |
48 | | #include "llvm/Target/TargetLowering.h" |
49 | | |
50 | | using namespace llvm; |
51 | | |
52 | | #define DEBUG_TYPE "x86tti" |
53 | | |
54 | | //===----------------------------------------------------------------------===// |
55 | | // |
56 | | // X86 cost model. |
57 | | // |
58 | | //===----------------------------------------------------------------------===// |
59 | | |
60 | | TargetTransformInfo::PopcntSupportKind |
61 | 3.41k | X86TTIImpl::getPopcntSupport(unsigned TyWidth) { |
62 | 3.41k | assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2"); |
63 | 3.41k | // TODO: Currently the __builtin_popcount() implementation using SSE3 |
64 | 3.41k | // instructions is inefficient. Once the problem is fixed, we should |
65 | 3.41k | // call ST->hasSSE3() instead of ST->hasPOPCNT(). |
66 | 3.41k | return ST->hasPOPCNT() ? TTI::PSK_FastHardware : TTI::PSK_Software; |
67 | 3.41k | } |
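A usage sketch (illustrative; `TTI` is an assumed TargetTransformInfo handle, in the way passes such as LoopIdiomRecognize consult this hook before forming a ctpop):

    // Cheap only when the subtarget really has POPCNT (e.g. -mattr=+popcnt).
    bool CheapPopcnt =
        TTI.getPopcntSupport(32) == TargetTransformInfo::PSK_FastHardware;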
68 | | |
69 | | llvm::Optional<unsigned> X86TTIImpl::getCacheSize( |
70 | 2 | TargetTransformInfo::CacheLevel Level) const { |
71 | 2 | switch (Level) { |
72 | 0 | case TargetTransformInfo::CacheLevel::L1D: |
73 | 0 | // - Penry |
74 | 0 | // - Nehalem |
75 | 0 | // - Westmere |
76 | 0 | // - Sandy Bridge |
77 | 0 | // - Ivy Bridge |
78 | 0 | // - Haswell |
79 | 0 | // - Broadwell |
80 | 0 | // - Skylake |
81 | 0 | // - Kabylake |
82 | 0 | return 32 * 1024; // 32 KByte |
83 | 2 | case TargetTransformInfo::CacheLevel::L2D: |
84 | 2 | // - Penryn |
85 | 2 | // - Nehalem |
86 | 2 | // - Westmere |
87 | 2 | // - Sandy Bridge |
88 | 2 | // - Ivy Bridge |
89 | 2 | // - Haswell |
90 | 2 | // - Broadwell |
91 | 2 | // - Skylake |
92 | 2 | // - Kabylake |
93 | 2 | return 256 * 1024; // 256 KByte |
94 | 0 | } |
95 | 0 |  |
96 | 0 | llvm_unreachable("Unknown TargetTransformInfo::CacheLevel"); |
97 | 0 | } |
98 | | |
99 | | llvm::Optional<unsigned> X86TTIImpl::getCacheAssociativity( |
100 | 4 | TargetTransformInfo::CacheLevel Level) const { |
101 | 4 | // - Penryn |
102 | 4 | // - Nehalem |
103 | 4 | // - Westmere |
104 | 4 | // - Sandy Bridge |
105 | 4 | // - Ivy Bridge |
106 | 4 | // - Haswell |
107 | 4 | // - Broadwell |
108 | 4 | // - Skylake |
109 | 4 | // - Kabylake |
110 | 4 | switch (Level) { |
111 | 2 | case TargetTransformInfo::CacheLevel::L1D: |
112 | 2 | LLVM_FALLTHROUGH; |
113 | 4 | case TargetTransformInfo::CacheLevel::L2D: |
114 | 4 | return 8; |
115 | 0 | } |
116 | 0 |  |
117 | 0 | llvm_unreachable("Unknown TargetTransformInfo::CacheLevel"); |
118 | 0 | } |
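A sketch of querying these cache parameters through the wrapper (illustrative; `TTI` is an assumed TargetTransformInfo instance):

    // Per the tables above, the generic X86 answer is a 256 KByte, 8-way L2.
    llvm::Optional<unsigned> L2Size =
        TTI.getCacheSize(TargetTransformInfo::CacheLevel::L2D);          // 256 * 1024
    llvm::Optional<unsigned> L2Ways =
        TTI.getCacheAssociativity(TargetTransformInfo::CacheLevel::L2D); // 8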
119 | | |
120 | 331k | unsigned X86TTIImpl::getNumberOfRegisters(bool Vector) { |
121 | 331k | if (Vector && !ST->hasSSE1()) |
122 | 1.40k | return 0; |
123 | 330k | |
124 | 330k | if (ST->is64Bit()) { |
125 | 202k | if (Vector && ST->hasAVX512()) |
126 | 280 | return 32; |
127 | 202k | return 16; |
128 | 202k | } |
129 | 127k | return 8; |
130 | 127k | } |
131 | | |
132 | 9.72k | unsigned X86TTIImpl::getRegisterBitWidth(bool Vector) const { |
133 | 9.72k | if (Vector) { |
134 | 9.72k | if (ST->hasAVX512()) |
135 | 261 | return 512; |
136 | 9.45k | if (ST->hasAVX()) |
137 | 2.31k | return 256; |
138 | 7.14k | if (ST->hasSSE1()) |
139 | 7.14k | return 128; |
140 | 0 | return 0; |
141 | 0 | } |
142 | 0 |  |
143 | 0 | if (ST->is64Bit()) |
144 | 0 | return 64; |
145 | 0 |  |
146 | 0 | return 32; |
147 | 0 | } |
148 | | |
149 | 86 | unsigned X86TTIImpl::getLoadStoreVecRegBitWidth(unsigned) const { |
150 | 86 | return getRegisterBitWidth(true); |
151 | 86 | } |
152 | | |
153 | 1.58k | unsigned X86TTIImpl::getMaxInterleaveFactor(unsigned VF) { |
154 | 1.58k | // If the loop will not be vectorized, don't interleave the loop. |
155 | 1.58k | // Let the regular unroller unroll the loop, which saves the overflow |
156 | 1.58k | // check and memory check cost. |
157 | 1.58k | if (VF == 1) |
158 | 1.23k | return 1; |
159 | 352 | |
160 | 352 | if (ST->isAtom()) |
161 | 0 | return 1; |
162 | 352 | |
163 | 352 | // Sandybridge and Haswell have multiple execution ports and pipelined |
164 | 352 | // vector units. |
165 | 352 | if (ST->hasAVX()) |
166 | 98 | return 4; |
167 | 254 | |
168 | 254 | return 2; |
169 | 254 | } |
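Illustrative use (the loop vectorizer treats this as an upper bound on its interleave count; `TTI` is an assumed TargetTransformInfo instance):

    // With a vectorization factor of 8: 4 on AVX-capable cores, 2 on older
    // SSE-only cores, 1 on Atom or when the loop is not vectorized (VF == 1).
    unsigned IC = TTI.getMaxInterleaveFactor(/*VF=*/8);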
170 | | |
171 | | int X86TTIImpl::getArithmeticInstrCost( |
172 | | unsigned Opcode, Type *Ty, |
173 | | TTI::OperandValueKind Op1Info, TTI::OperandValueKind Op2Info, |
174 | | TTI::OperandValueProperties Opd1PropInfo, |
175 | | TTI::OperandValueProperties Opd2PropInfo, |
176 | 13.5k | ArrayRef<const Value *> Args) { |
177 | 13.5k | // Legalize the type. |
178 | 13.5k | std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty); |
179 | 13.5k | |
180 | 13.5k | int ISD = TLI->InstructionOpcodeToISD(Opcode); |
181 | 13.5k | assert(ISD && "Invalid opcode"); |
182 | 13.5k | |
183 | 13.5k | static const CostTblEntry SLMCostTable[] = { |
184 | 13.5k | { ISD::MUL, MVT::v4i32, 11 }, // pmulld |
185 | 13.5k | { ISD::MUL, MVT::v8i16, 2 }, // pmullw |
186 | 13.5k | { ISD::MUL, MVT::v16i8, 14 }, // extend/pmullw/trunc sequence. |
187 | 13.5k | { ISD::FMUL, MVT::f64, 2 }, // mulsd |
188 | 13.5k | { ISD::FMUL, MVT::v2f64, 4 }, // mulpd |
189 | 13.5k | { ISD::FMUL, MVT::v4f32, 2 }, // mulps |
190 | 13.5k | { ISD::FDIV, MVT::f32, 17 }, // divss |
191 | 13.5k | { ISD::FDIV, MVT::v4f32, 39 }, // divps |
192 | 13.5k | { ISD::FDIV, MVT::f64, 32 }, // divsd |
193 | 13.5k | { ISD::FDIV, MVT::v2f64, 69 }, // divpd |
194 | 13.5k | { ISD::FADD, MVT::v2f64, 2 }, // addpd |
195 | 13.5k | { ISD::FSUB, MVT::v2f64, 2 }, // subpd |
196 | 13.5k | // v2i64/v4i64 mul is custom lowered as a series of long: |
197 | 13.5k | // multiplies(3), shifts(3) and adds(2) |
198 | 13.5k | // slm muldq version throughput is 2 and addq throughput 4 |
199 | 13.5k | // thus: 3X2 (muldq throughput) + 3X1 (shift throughput) + |
200 | 13.5k | // 2X4 (addq throughput) = 17 |
201 | 13.5k | { ISD::MUL, MVT::v2i64, 17 }, |
202 | 13.5k | // slm addq\subq throughput is 4 |
203 | 13.5k | { ISD::ADD, MVT::v2i64, 4 }, |
204 | 13.5k | { ISD::SUB, MVT::v2i64, 4 }, |
205 | 13.5k | }; |
206 | 13.5k | |
207 | 13.5k | if (ST->isSLM()) { |
208 | 303 | if (Args.size() == 2 && ISD == ISD::MUL && LT.second == MVT::v4i32) { |
209 | 21 | // Check if the operands can be shrunk into a smaller datatype. |
210 | 21 | bool Op1Signed = false; |
211 | 21 | unsigned Op1MinSize = BaseT::minRequiredElementSize(Args[0], Op1Signed); |
212 | 21 | bool Op2Signed = false; |
213 | 21 | unsigned Op2MinSize = BaseT::minRequiredElementSize(Args[1], Op2Signed); |
214 | 21 | |
215 | 21 | bool signedMode = Op1Signed | Op2Signed; |
216 | 21 | unsigned OpMinSize = std::max(Op1MinSize, Op2MinSize); |
217 | 21 | |
218 | 21 | if (OpMinSize <= 7) |
219 | 1 | return LT.first * 3; // pmullw/sext |
220 | 20 | if (!signedMode && OpMinSize <= 8) |
221 | 2 | return LT.first * 3; // pmullw/zext |
222 | 18 | if (OpMinSize <= 15) |
223 | 5 | return LT.first * 5; // pmullw/pmulhw/pshuf |
224 | 13 | if (!signedMode && OpMinSize <= 16) |
225 | 2 | return LT.first * 5; // pmullw/pmulhw/pshuf |
226 | 293 | } |
227 | 293 | if (const auto *Entry = CostTableLookup(SLMCostTable, ISD, |
228 | 182 | LT.second)) { |
229 | 182 | return LT.first * Entry->Cost; |
230 | 182 | } |
231 | 13.3k | } |
232 | 13.3k | |
233 | 13.3k | if (ISD == ISD::SDIV && |
234 | 399 | Op2Info == TargetTransformInfo::OK_UniformConstantValue && |
235 | 13.3k | Opd2PropInfo == TargetTransformInfo::OP_PowerOf2) { |
236 | 5 | // On X86, vector signed division by a power-of-two constant is |
237 | 5 | // normally expanded to the sequence SRA + SRL + ADD + SRA. |
238 | 5 | // The OperandValue properties may not be the same as those of the |
239 | 5 | // previous operation; conservatively assume OP_None. |
240 | 5 | int Cost = 2 * getArithmeticInstrCost(Instruction::AShr, Ty, Op1Info, |
241 | 5 | Op2Info, TargetTransformInfo::OP_None, |
242 | 5 | TargetTransformInfo::OP_None); |
243 | 5 | Cost += getArithmeticInstrCost(Instruction::LShr, Ty, Op1Info, Op2Info, |
244 | 5 | TargetTransformInfo::OP_None, |
245 | 5 | TargetTransformInfo::OP_None); |
246 | 5 | Cost += getArithmeticInstrCost(Instruction::Add, Ty, Op1Info, Op2Info, |
247 | 5 | TargetTransformInfo::OP_None, |
248 | 5 | TargetTransformInfo::OP_None); |
249 | 5 | |
250 | 5 | return Cost; |
251 | 5 | } |
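  // For example, a signed i32 divide by 8 expands roughly as:
  //   %sgn  = ashr i32 %x, 31     ; SRA - broadcast the sign bit
  //   %bias = lshr i32 %sgn, 29   ; SRL - 0 for non-negative x, 7 otherwise
  //   %sum  = add  i32 %x, %bias  ; ADD - bias so the shift rounds toward zero
  //   %res  = ashr i32 %sum, 3    ; SRA - the divide itself
  // which is why the cost above sums 2*AShr + LShr + Add.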
252 | 13.3k | |
253 | 13.3k | static const CostTblEntry AVX512BWUniformConstCostTable[] = { |
254 | 13.3k | { ISD::SHL, MVT::v64i8, 2 }, // psllw + pand. |
255 | 13.3k | { ISD::SRL, MVT::v64i8, 2 }, // psrlw + pand. |
256 | 13.3k | { ISD::SRA, MVT::v64i8, 4 }, // psrlw, pand, pxor, psubb. |
257 | 13.3k | |
258 | 13.3k | { ISD::SDIV, MVT::v32i16, 6 }, // vpmulhw sequence |
259 | 13.3k | { ISD::UDIV, MVT::v32i16, 6 }, // vpmulhuw sequence |
260 | 13.3k | }; |
261 | 13.3k | |
262 | 13.3k | if (Op2Info == TargetTransformInfo::OK_UniformConstantValue && |
263 | 13.3k | ST->hasBWI()) { |
264 | 168 | if (const auto *Entry = CostTableLookup(AVX512BWUniformConstCostTable, ISD, |
265 | 168 | LT.second)) |
266 | 10 | return LT.first * Entry->Cost; |
267 | 13.3k | } |
268 | 13.3k | |
269 | 13.3k | static const CostTblEntry AVX512UniformConstCostTable[] = { |
270 | 13.3k | { ISD::SRA, MVT::v2i64, 1 }, |
271 | 13.3k | { ISD::SRA, MVT::v4i64, 1 }, |
272 | 13.3k | { ISD::SRA, MVT::v8i64, 1 }, |
273 | 13.3k | |
274 | 13.3k | { ISD::SDIV, MVT::v16i32, 15 }, // vpmuldq sequence |
275 | 13.3k | { ISD::UDIV, MVT::v16i32, 15 }, // vpmuludq sequence |
276 | 13.3k | }; |
277 | 13.3k | |
278 | 13.3k | if (Op2Info == TargetTransformInfo::OK_UniformConstantValue && |
279 | 13.3k | ST->hasAVX512()) { |
280 | 505 | if (const auto *Entry = CostTableLookup(AVX512UniformConstCostTable, ISD, |
281 | 505 | LT.second)) |
282 | 26 | return LT.first * Entry->Cost; |
283 | 13.2k | } |
284 | 13.2k | |
285 | 13.2k | static const CostTblEntry AVX2UniformConstCostTable[] = { |
286 | 13.2k | { ISD::SHL, MVT::v32i8, 2 }, // psllw + pand. |
287 | 13.2k | { ISD::SRL, MVT::v32i8, 2 }, // psrlw + pand. |
288 | 13.2k | { ISD::SRA, MVT::v32i8, 4 }, // psrlw, pand, pxor, psubb. |
289 | 13.2k | |
290 | 13.2k | { ISD::SRA, MVT::v4i64, 4 }, // 2 x psrad + shuffle. |
291 | 13.2k | |
292 | 13.2k | { ISD::SDIV, MVT::v16i16, 6 }, // vpmulhw sequence |
293 | 13.2k | { ISD::UDIV, MVT::v16i16, 6 }, // vpmulhuw sequence |
294 | 13.2k | { ISD::SDIV, MVT::v8i32, 15 }, // vpmuldq sequence |
295 | 13.2k | { ISD::UDIV, MVT::v8i32, 15 }, // vpmuludq sequence |
296 | 13.2k | }; |
297 | 13.2k | |
298 | 13.2k | if (Op2Info == TargetTransformInfo::OK_UniformConstantValue && |
299 | 13.2k | ST->hasAVX2()) { |
300 | 1.17k | if (const auto *Entry = CostTableLookup(AVX2UniformConstCostTable, ISD, |
301 | 1.17k | LT.second)) |
302 | 98 | return LT.first * Entry->Cost; |
303 | 13.1k | } |
304 | 13.1k | |
305 | 13.1k | static const CostTblEntry SSE2UniformConstCostTable[] = { |
306 | 13.1k | { ISD::SHL, MVT::v16i8, 2 }, // psllw + pand. |
307 | 13.1k | { ISD::SRL, MVT::v16i8, 2 }, // psrlw + pand. |
308 | 13.1k | { ISD::SRA, MVT::v16i8, 4 }, // psrlw, pand, pxor, psubb. |
309 | 13.1k | |
310 | 13.1k | { ISD::SHL, MVT::v32i8, 4+2 }, // 2*(psllw + pand) + split. |
311 | 13.1k | { ISD::SRL, MVT::v32i8, 4+2 }, // 2*(psrlw + pand) + split. |
312 | 13.1k | { ISD::SRA, MVT::v32i8, 8+2 }, // 2*(psrlw, pand, pxor, psubb) + split. |
313 | 13.1k | |
314 | 13.1k | { ISD::SDIV, MVT::v16i16, 12+2 }, // 2*pmulhw sequence + split. |
315 | 13.1k | { ISD::SDIV, MVT::v8i16, 6 }, // pmulhw sequence |
316 | 13.1k | { ISD::UDIV, MVT::v16i16, 12+2 }, // 2*pmulhuw sequence + split. |
317 | 13.1k | { ISD::UDIV, MVT::v8i16, 6 }, // pmulhuw sequence |
318 | 13.1k | { ISD::SDIV, MVT::v8i32, 38+2 }, // 2*pmuludq sequence + split. |
319 | 13.1k | { ISD::SDIV, MVT::v4i32, 19 }, // pmuludq sequence |
320 | 13.1k | { ISD::UDIV, MVT::v8i32, 30+2 }, // 2*pmuludq sequence + split. |
321 | 13.1k | { ISD::UDIV, MVT::v4i32, 15 }, // pmuludq sequence |
322 | 13.1k | }; |
323 | 13.1k | |
324 | 13.1k | if (Op2Info == TargetTransformInfo::OK_UniformConstantValue && |
325 | 13.1k | ST->hasSSE2()) { |
326 | 3.92k | // pmuldq sequence. |
327 | 3.92k | if (ISD == ISD::SDIV && LT.second == MVT::v8i32 && ST->hasAVX()) |
328 | 5 | return LT.first * 32; |
329 | 3.92k | if (ISD == ISD::SDIV && LT.second == MVT::v4i32 && ST->hasSSE41()) |
330 | 21 | return LT.first * 15; |
331 | 3.90k | |
332 | 3.90k | // XOP has faster vXi8 shifts. |
333 | 3.90k | if ((ISD != ISD::SHL && ISD != ISD::SRL && ISD != ISD::SRA) || |
334 | 978 | !ST->hasXOP()) |
335 | 3.83k | if (const auto *Entry = |
336 | 3.83k | CostTableLookup(SSE2UniformConstCostTable, ISD, LT.second)) |
337 | 196 | return LT.first * Entry->Cost; |
338 | 12.9k | } |
339 | 12.9k | |
340 | 12.9k | static const CostTblEntry AVX2UniformCostTable[] = { |
341 | 12.9k | // Uniform splats are cheaper for the following instructions. |
342 | 12.9k | { ISD::SHL, MVT::v16i16, 1 }, // psllw. |
343 | 12.9k | { ISD::SRL, MVT::v16i16, 1 }, // psrlw. |
344 | 12.9k | { ISD::SRA, MVT::v16i16, 1 }, // psraw. |
345 | 12.9k | }; |
346 | 12.9k | |
347 | 12.9k | if (ST->hasAVX2() && |
348 | 4.16k | ((Op2Info == TargetTransformInfo::OK_UniformConstantValue) || |
349 | 12.9k | (Op2Info == TargetTransformInfo::OK_UniformValue))) { |
350 | 1.31k | if (const auto *Entry = |
351 | 1.31k | CostTableLookup(AVX2UniformCostTable, ISD, LT.second)) |
352 | 84 | return LT.first * Entry->Cost; |
353 | 12.8k | } |
354 | 12.8k | |
355 | 12.8k | static const CostTblEntry SSE2UniformCostTable[] = { |
356 | 12.8k | // Uniform splats are cheaper for the following instructions. |
357 | 12.8k | { ISD::SHL, MVT::v8i16, 1 }, // psllw. |
358 | 12.8k | { ISD::SHL, MVT::v4i32, 1 }, // pslld |
359 | 12.8k | { ISD::SHL, MVT::v2i64, 1 }, // psllq. |
360 | 12.8k | |
361 | 12.8k | { ISD::SRL, MVT::v8i16, 1 }, // psrlw. |
362 | 12.8k | { ISD::SRL, MVT::v4i32, 1 }, // psrld. |
363 | 12.8k | { ISD::SRL, MVT::v2i64, 1 }, // psrlq. |
364 | 12.8k | |
365 | 12.8k | { ISD::SRA, MVT::v8i16, 1 }, // psraw. |
366 | 12.8k | { ISD::SRA, MVT::v4i32, 1 }, // psrad. |
367 | 12.8k | }; |
368 | 12.8k | |
369 | 12.8k | if (ST->hasSSE2() && |
370 | 12.8k | ((Op2Info == TargetTransformInfo::OK_UniformConstantValue) || |
371 | 12.8k | (Op2Info == TargetTransformInfo::OK_UniformValue))) { |
372 | 4.15k | if (const auto *Entry = |
373 | 4.15k | CostTableLookup(SSE2UniformCostTable, ISD, LT.second)) |
374 | 475 | return LT.first * Entry->Cost; |
375 | 12.4k | } |
376 | 12.4k | |
377 | 12.4k | static const CostTblEntry AVX512DQCostTable[] = { |
378 | 12.4k | { ISD::MUL, MVT::v2i64, 1 }, |
379 | 12.4k | { ISD::MUL, MVT::v4i64, 1 }, |
380 | 12.4k | { ISD::MUL, MVT::v8i64, 1 } |
381 | 12.4k | }; |
382 | 12.4k | |
383 | 12.4k | // Look for AVX512DQ lowering tricks for custom cases. |
384 | 12.4k | if (ST->hasDQI()) |
385 | 517 | if (const auto *Entry = CostTableLookup(AVX512DQCostTable, ISD, LT.second)) |
386 | 5 | return LT.first * Entry->Cost; |
387 | 12.3k | |
388 | 12.3k | static const CostTblEntry AVX512BWCostTable[] = { |
389 | 12.3k | { ISD::SHL, MVT::v8i16, 1 }, // vpsllvw |
390 | 12.3k | { ISD::SRL, MVT::v8i16, 1 }, // vpsrlvw |
391 | 12.3k | { ISD::SRA, MVT::v8i16, 1 }, // vpsravw |
392 | 12.3k | |
393 | 12.3k | { ISD::SHL, MVT::v16i16, 1 }, // vpsllvw |
394 | 12.3k | { ISD::SRL, MVT::v16i16, 1 }, // vpsrlvw |
395 | 12.3k | { ISD::SRA, MVT::v16i16, 1 }, // vpsravw |
396 | 12.3k | |
397 | 12.3k | { ISD::SHL, MVT::v32i16, 1 }, // vpsllvw |
398 | 12.3k | { ISD::SRL, MVT::v32i16, 1 }, // vpsrlvw |
399 | 12.3k | { ISD::SRA, MVT::v32i16, 1 }, // vpsravw |
400 | 12.3k | |
401 | 12.3k | { ISD::SHL, MVT::v64i8, 11 }, // vpblendvb sequence. |
402 | 12.3k | { ISD::SRL, MVT::v64i8, 11 }, // vpblendvb sequence. |
403 | 12.3k | { ISD::SRA, MVT::v64i8, 24 }, // vpblendvb sequence. |
404 | 12.3k | |
405 | 12.3k | { ISD::MUL, MVT::v64i8, 11 }, // extend/pmullw/trunc sequence. |
406 | 12.3k | { ISD::MUL, MVT::v32i8, 4 }, // extend/pmullw/trunc sequence. |
407 | 12.3k | { ISD::MUL, MVT::v16i8, 4 }, // extend/pmullw/trunc sequence. |
408 | 12.3k | |
409 | 12.3k | // Vectorizing division is a bad idea. See the SSE2 table for more comments. |
410 | 12.3k | { ISD::SDIV, MVT::v64i8, 64*20 }, |
411 | 12.3k | { ISD::SDIV, MVT::v32i16, 32*20 }, |
412 | 12.3k | { ISD::UDIV, MVT::v64i8, 64*20 }, |
413 | 12.3k | { ISD::UDIV, MVT::v32i16, 32*20 } |
414 | 12.3k | }; |
415 | 12.3k | |
416 | 12.3k | // Look for AVX512BW lowering tricks for custom cases. |
417 | 12.3k | if (ST->hasBWI()) |
418 | 702 | if (const auto *Entry = CostTableLookup(AVX512BWCostTable, ISD, LT.second)) |
419 | 95 | return LT.first * Entry->Cost; |
420 | 12.3k | |
421 | 12.3k | static const CostTblEntry AVX512CostTable[] = { |
422 | 12.3k | { ISD::SHL, MVT::v16i32, 1 }, |
423 | 12.3k | { ISD::SRL, MVT::v16i32, 1 }, |
424 | 12.3k | { ISD::SRA, MVT::v16i32, 1 }, |
425 | 12.3k | |
426 | 12.3k | { ISD::SHL, MVT::v8i64, 1 }, |
427 | 12.3k | { ISD::SRL, MVT::v8i64, 1 }, |
428 | 12.3k | |
429 | 12.3k | { ISD::SRA, MVT::v2i64, 1 }, |
430 | 12.3k | { ISD::SRA, MVT::v4i64, 1 }, |
431 | 12.3k | { ISD::SRA, MVT::v8i64, 1 }, |
432 | 12.3k | |
433 | 12.3k | { ISD::MUL, MVT::v32i8, 13 }, // extend/pmullw/trunc sequence. |
434 | 12.3k | { ISD::MUL, MVT::v16i8, 5 }, // extend/pmullw/trunc sequence. |
435 | 12.3k | { ISD::MUL, MVT::v16i32, 1 }, // pmulld |
436 | 12.3k | { ISD::MUL, MVT::v8i64, 8 }, // 3*pmuludq/3*shift/2*add |
437 | 12.3k | |
438 | 12.3k | // Vectorizing division is a bad idea. See the SSE2 table for more comments. |
439 | 12.3k | { ISD::SDIV, MVT::v16i32, 16*20 }, |
440 | 12.3k | { ISD::SDIV, MVT::v8i64, 8*20 }, |
441 | 12.3k | { ISD::UDIV, MVT::v16i32, 16*20 }, |
442 | 12.3k | { ISD::UDIV, MVT::v8i64, 8*20 } |
443 | 12.3k | }; |
444 | 12.3k | |
445 | 12.3k | if (ST->hasAVX512()) |
446 | 1.80k | if (const auto *Entry = CostTableLookup(AVX512CostTable, ISD, LT.second)) |
447 | 236 | return LT.first * Entry->Cost; |
448 | 12.0k | |
449 | 12.0k | static const CostTblEntry AVX2ShiftCostTable[] = { |
450 | 12.0k | // Shifts on v4i64/v8i32 on AVX2 are legal even though we declare them |
451 | 12.0k | // custom, so that we can detect the cases where the shift amount is a scalar. |
452 | 12.0k | { ISD::SHL, MVT::v4i32, 1 }, |
453 | 12.0k | { ISD::SRL, MVT::v4i32, 1 }, |
454 | 12.0k | { ISD::SRA, MVT::v4i32, 1 }, |
455 | 12.0k | { ISD::SHL, MVT::v8i32, 1 }, |
456 | 12.0k | { ISD::SRL, MVT::v8i32, 1 }, |
457 | 12.0k | { ISD::SRA, MVT::v8i32, 1 }, |
458 | 12.0k | { ISD::SHL, MVT::v2i64, 1 }, |
459 | 12.0k | { ISD::SRL, MVT::v2i64, 1 }, |
460 | 12.0k | { ISD::SHL, MVT::v4i64, 1 }, |
461 | 12.0k | { ISD::SRL, MVT::v4i64, 1 }, |
462 | 12.0k | }; |
463 | 12.0k | |
464 | 12.0k | // Look for AVX2 lowering tricks. |
465 | 12.0k | if (ST->hasAVX2()) { |
466 | 3.58k | if (ISD == ISD::SHL && LT.second == MVT::v16i16 && |
467 | 42 | (Op2Info == TargetTransformInfo::OK_UniformConstantValue || |
468 | 42 | Op2Info == TargetTransformInfo::OK_NonUniformConstantValue)) |
469 | 3.58k | // On AVX2, a packed v16i16 shift left by a constant build_vector |
470 | 3.58k | // is lowered into a vector multiply (vpmullw). |
471 | 24 | return LT.first; |
472 | 3.56k | |
473 | 3.56k | if (const auto *Entry = CostTableLookup(AVX2ShiftCostTable, ISD, LT.second)) |
474 | 370 | return LT.first * Entry->Cost; |
475 | 11.6k | } |
476 | 11.6k | |
477 | 11.6k | static const CostTblEntry XOPShiftCostTable[] = { |
478 | 11.6k | // 128bit shifts take 1cy, but right shifts require negation beforehand. |
479 | 11.6k | { ISD::SHL, MVT::v16i8, 1 }, |
480 | 11.6k | { ISD::SRL, MVT::v16i8, 2 }, |
481 | 11.6k | { ISD::SRA, MVT::v16i8, 2 }, |
482 | 11.6k | { ISD::SHL, MVT::v8i16, 1 }, |
483 | 11.6k | { ISD::SRL, MVT::v8i16, 2 }, |
484 | 11.6k | { ISD::SRA, MVT::v8i16, 2 }, |
485 | 11.6k | { ISD::SHL, MVT::v4i32, 1 }, |
486 | 11.6k | { ISD::SRL, MVT::v4i32, 2 }, |
487 | 11.6k | { ISD::SRA, MVT::v4i32, 2 }, |
488 | 11.6k | { ISD::SHL, MVT::v2i64, 1 }, |
489 | 11.6k | { ISD::SRL, MVT::v2i64, 2 }, |
490 | 11.6k | { ISD::SRA, MVT::v2i64, 2 }, |
491 | 11.6k | // 256bit shifts require splitting if AVX2 didn't catch them above. |
492 | 11.6k | { ISD::SHL, MVT::v32i8, 2+2 }, |
493 | 11.6k | { ISD::SRL, MVT::v32i8, 4+2 }, |
494 | 11.6k | { ISD::SRA, MVT::v32i8, 4+2 }, |
495 | 11.6k | { ISD::SHL, MVT::v16i16, 2+2 }, |
496 | 11.6k | { ISD::SRL, MVT::v16i16, 4+2 }, |
497 | 11.6k | { ISD::SRA, MVT::v16i16, 4+2 }, |
498 | 11.6k | { ISD::SHL, MVT::v8i32, 2+2 }, |
499 | 11.6k | { ISD::SRL, MVT::v8i32, 4+2 }, |
500 | 11.6k | { ISD::SRA, MVT::v8i32, 4+2 }, |
501 | 11.6k | { ISD::SHL, MVT::v4i64, 2+2 }, |
502 | 11.6k | { ISD::SRL, MVT::v4i64, 4+2 }, |
503 | 11.6k | { ISD::SRA, MVT::v4i64, 4+2 }, |
504 | 11.6k | }; |
505 | 11.6k | |
506 | 11.6k | // Look for XOP lowering tricks. |
507 | 11.6k | if (ST->hasXOP()) |
508 | 324 | if (const auto *Entry = CostTableLookup(XOPShiftCostTable, ISD, LT.second)) |
509 | 218 | return LT.first * Entry->Cost; |
510 | 11.4k | |
511 | 11.4k | static const CostTblEntry SSE2UniformShiftCostTable[] = { |
512 | 11.4k | // Uniform splats are cheaper for the following instructions. |
513 | 11.4k | { ISD::SHL, MVT::v16i16, 2+2 }, // 2*psllw + split. |
514 | 11.4k | { ISD::SHL, MVT::v8i32, 2+2 }, // 2*pslld + split. |
515 | 11.4k | { ISD::SHL, MVT::v4i64, 2+2 }, // 2*psllq + split. |
516 | 11.4k | |
517 | 11.4k | { ISD::SRL, MVT::v16i16, 2+2 }, // 2*psrlw + split. |
518 | 11.4k | { ISD::SRL, MVT::v8i32, 2+2 }, // 2*psrld + split. |
519 | 11.4k | { ISD::SRL, MVT::v4i64, 2+2 }, // 2*psrlq + split. |
520 | 11.4k | |
521 | 11.4k | { ISD::SRA, MVT::v16i16, 2+2 }, // 2*psraw + split. |
522 | 11.4k | { ISD::SRA, MVT::v8i32, 2+2 }, // 2*psrad + split. |
523 | 11.4k | { ISD::SRA, MVT::v2i64, 4 }, // 2*psrad + shuffle. |
524 | 11.4k | { ISD::SRA, MVT::v4i64, 8+2 }, // 2*(2*psrad + shuffle) + split. |
525 | 11.4k | }; |
526 | 11.4k | |
527 | 11.4k | if (ST->hasSSE2() && |
528 | 11.4k | ((Op2Info == TargetTransformInfo::OK_UniformConstantValue) || |
529 | 11.4k | (Op2Info == TargetTransformInfo::OK_UniformValue))) { |
530 | 3.40k | |
531 | 3.40k | // Handle AVX2 uniform v4i64 ISD::SRA, it's not worth a table. |
532 | 3.40k | if (ISD == ISD::SRA && LT.second == MVT::v4i64 && ST->hasAVX2()) |
533 | 2 | return LT.first * 4; // 2*psrad + shuffle. |
534 | 3.39k | |
535 | 3.39k | if (const auto *Entry = |
536 | 3.39k | CostTableLookup(SSE2UniformShiftCostTable, ISD, LT.second)) |
537 | 131 | return LT.first * Entry->Cost; |
538 | 11.3k | } |
539 | 11.3k | |
540 | 11.3k | if (ISD == ISD::SHL && |
541 | 11.3k | Op2Info == TargetTransformInfo::OK_NonUniformConstantValue) { |
542 | 104 | MVT VT = LT.second; |
543 | 104 | // Vector shift left by a non-uniform constant can be lowered |
544 | 104 | // into a vector multiply. |
545 | 104 | if (((VT == MVT::v8i16 || VT == MVT::v4i32) && ST->hasSSE2()) || |
546 | 55 | ((VT == MVT::v16i16 || VT == MVT::v8i32) && ST->hasAVX())) |
547 | 57 | ISD = ISD::MUL; |
548 | 104 | } |
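  // For example, a v4i32 shift left by the constant vector <1, 2, 3, 4> can be
  // rewritten as a multiply by <2, 4, 8, 16>, so the MUL rows of the tables
  // below give a closer cost than the generic variable-shift entries.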
549 | 11.3k | |
550 | 11.3k | static const CostTblEntry AVX2CostTable[] = { |
551 | 11.3k | { ISD::SHL, MVT::v32i8, 11 }, // vpblendvb sequence. |
552 | 11.3k | { ISD::SHL, MVT::v16i16, 10 }, // extend/vpsrlvd/pack sequence. |
553 | 11.3k | |
554 | 11.3k | { ISD::SRL, MVT::v32i8, 11 }, // vpblendvb sequence. |
555 | 11.3k | { ISD::SRL, MVT::v16i16, 10 }, // extend/vpsrlvd/pack sequence. |
556 | 11.3k | |
557 | 11.3k | { ISD::SRA, MVT::v32i8, 24 }, // vpblendvb sequence. |
558 | 11.3k | { ISD::SRA, MVT::v16i16, 10 }, // extend/vpsravd/pack sequence. |
559 | 11.3k | { ISD::SRA, MVT::v2i64, 4 }, // srl/xor/sub sequence. |
560 | 11.3k | { ISD::SRA, MVT::v4i64, 4 }, // srl/xor/sub sequence. |
561 | 11.3k | |
562 | 11.3k | { ISD::SUB, MVT::v32i8, 1 }, // psubb |
563 | 11.3k | { ISD::ADD, MVT::v32i8, 1 }, // paddb |
564 | 11.3k | { ISD::SUB, MVT::v16i16, 1 }, // psubw |
565 | 11.3k | { ISD::ADD, MVT::v16i16, 1 }, // paddw |
566 | 11.3k | { ISD::SUB, MVT::v8i32, 1 }, // psubd |
567 | 11.3k | { ISD::ADD, MVT::v8i32, 1 }, // paddd |
568 | 11.3k | { ISD::SUB, MVT::v4i64, 1 }, // psubq |
569 | 11.3k | { ISD::ADD, MVT::v4i64, 1 }, // paddq |
570 | 11.3k | |
571 | 11.3k | { ISD::MUL, MVT::v32i8, 17 }, // extend/pmullw/trunc sequence. |
572 | 11.3k | { ISD::MUL, MVT::v16i8, 7 }, // extend/pmullw/trunc sequence. |
573 | 11.3k | { ISD::MUL, MVT::v16i16, 1 }, // pmullw |
574 | 11.3k | { ISD::MUL, MVT::v8i32, 1 }, // pmulld |
575 | 11.3k | { ISD::MUL, MVT::v4i64, 8 }, // 3*pmuludq/3*shift/2*add |
576 | 11.3k | |
577 | 11.3k | { ISD::FDIV, MVT::f32, 7 }, // Haswell from http://www.agner.org/ |
578 | 11.3k | { ISD::FDIV, MVT::v4f32, 7 }, // Haswell from http://www.agner.org/ |
579 | 11.3k | { ISD::FDIV, MVT::v8f32, 14 }, // Haswell from http://www.agner.org/ |
580 | 11.3k | { ISD::FDIV, MVT::f64, 14 }, // Haswell from http://www.agner.org/ |
581 | 11.3k | { ISD::FDIV, MVT::v2f64, 14 }, // Haswell from http://www.agner.org/ |
582 | 11.3k | { ISD::FDIV, MVT::v4f64, 28 }, // Haswell from http://www.agner.org/ |
583 | 11.3k | }; |
584 | 11.3k | |
585 | 11.3k | // Look for AVX2 lowering tricks for custom cases. |
586 | 11.3k | if (ST->hasAVX2()) |
587 | 3.10k | if (const auto *Entry = CostTableLookup(AVX2CostTable, ISD, LT.second)) |
588 | 368 | return LT.first * Entry->Cost; |
589 | 10.9k | |
590 | 10.9k | static const CostTblEntry AVX1CostTable[] = { |
591 | 10.9k | // We don't have to scalarize unsupported ops. We can issue two half-sized |
592 | 10.9k | // operations and we only need to extract the upper YMM half. |
593 | 10.9k | // Two ops + 1 extract + 1 insert = 4. |
594 | 10.9k | { ISD::MUL, MVT::v16i16, 4 }, |
595 | 10.9k | { ISD::MUL, MVT::v8i32, 4 }, |
596 | 10.9k | { ISD::SUB, MVT::v32i8, 4 }, |
597 | 10.9k | { ISD::ADD, MVT::v32i8, 4 }, |
598 | 10.9k | { ISD::SUB, MVT::v16i16, 4 }, |
599 | 10.9k | { ISD::ADD, MVT::v16i16, 4 }, |
600 | 10.9k | { ISD::SUB, MVT::v8i32, 4 }, |
601 | 10.9k | { ISD::ADD, MVT::v8i32, 4 }, |
602 | 10.9k | { ISD::SUB, MVT::v4i64, 4 }, |
603 | 10.9k | { ISD::ADD, MVT::v4i64, 4 }, |
604 | 10.9k | |
605 | 10.9k | // A v4i64 multiply is custom lowered as two split v2i64 vectors that then |
606 | 10.9k | // are lowered as a series of long multiplies(3), shifts(3) and adds(2) |
607 | 10.9k | // Because we believe v4i64 to be a legal type, we must also include the |
608 | 10.9k | // extract+insert in the cost table. Therefore, the cost here is 18 |
609 | 10.9k | // instead of 8. |
610 | 10.9k | { ISD::MUL, MVT::v4i64, 18 }, |
611 | 10.9k | |
612 | 10.9k | { ISD::MUL, MVT::v32i8, 26 }, // extend/pmullw/trunc sequence. |
613 | 10.9k | |
614 | 10.9k | { ISD::FDIV, MVT::f32, 14 }, // SNB from http://www.agner.org/ |
615 | 10.9k | { ISD::FDIV, MVT::v4f32, 14 }, // SNB from http://www.agner.org/ |
616 | 10.9k | { ISD::FDIV, MVT::v8f32, 28 }, // SNB from http://www.agner.org/ |
617 | 10.9k | { ISD::FDIV, MVT::f64, 22 }, // SNB from http://www.agner.org/ |
618 | 10.9k | { ISD::FDIV, MVT::v2f64, 22 }, // SNB from http://www.agner.org/ |
619 | 10.9k | { ISD::FDIV, MVT::v4f64, 44 }, // SNB from http://www.agner.org/ |
620 | 10.9k | |
621 | 10.9k | // Vectorizing division is a bad idea. See the SSE2 table for more comments. |
622 | 10.9k | { ISD::SDIV, MVT::v32i8, 32*20 }, |
623 | 10.9k | { ISD::SDIV, MVT::v16i16, 16*20 }, |
624 | 10.9k | { ISD::SDIV, MVT::v8i32, 8*20 }, |
625 | 10.9k | { ISD::SDIV, MVT::v4i64, 4*20 }, |
626 | 10.9k | { ISD::UDIV, MVT::v32i8, 32*20 }, |
627 | 10.9k | { ISD::UDIV, MVT::v16i16, 16*20 }, |
628 | 10.9k | { ISD::UDIV, MVT::v8i32, 8*20 }, |
629 | 10.9k | { ISD::UDIV, MVT::v4i64, 4*20 }, |
630 | 10.9k | }; |
631 | 10.9k | |
632 | 10.9k | if (ST->hasAVX()) |
633 | 4.11k | if (const auto *Entry = CostTableLookup(AVX1CostTable, ISD, LT.second)) |
634 | 225 | return LT.first * Entry->Cost; |
635 | 10.7k | |
636 | 10.7k | static const CostTblEntry SSE42CostTable[] = { |
637 | 10.7k | { ISD::FDIV, MVT::f32, 14 }, // Nehalem from http://www.agner.org/ |
638 | 10.7k | { ISD::FDIV, MVT::v4f32, 14 }, // Nehalem from http://www.agner.org/ |
639 | 10.7k | { ISD::FDIV, MVT::f64, 22 }, // Nehalem from http://www.agner.org/ |
640 | 10.7k | { ISD::FDIV, MVT::v2f64, 22 }, // Nehalem from http://www.agner.org/ |
641 | 10.7k | }; |
642 | 10.7k | |
643 | 10.7k | if (ST->hasSSE42()) |
644 | 4.65k | if (const auto *Entry = CostTableLookup(SSE42CostTable, ISD, LT.second)) |
645 | 16 | return LT.first * Entry->Cost; |
646 | 10.7k | |
647 | 10.7k | static const CostTblEntry SSE41CostTable[] = { |
648 | 10.7k | { ISD::SHL, MVT::v16i8, 11 }, // pblendvb sequence. |
649 | 10.7k | { ISD::SHL, MVT::v32i8, 2*11+2 }, // pblendvb sequence + split. |
650 | 10.7k | { ISD::SHL, MVT::v8i16, 14 }, // pblendvb sequence. |
651 | 10.7k | { ISD::SHL, MVT::v16i16, 2*14+2 }, // pblendvb sequence + split. |
652 | 10.7k | { ISD::SHL, MVT::v4i32, 4 }, // pslld/paddd/cvttps2dq/pmulld |
653 | 10.7k | { ISD::SHL, MVT::v8i32, 2*4+2 }, // pslld/paddd/cvttps2dq/pmulld + split |
654 | 10.7k | |
655 | 10.7k | { ISD::SRL, MVT::v16i8, 12 }, // pblendvb sequence. |
656 | 10.7k | { ISD::SRL, MVT::v32i8, 2*12+2 }, // pblendvb sequence + split. |
657 | 10.7k | { ISD::SRL, MVT::v8i16, 14 }, // pblendvb sequence. |
658 | 10.7k | { ISD::SRL, MVT::v16i16, 2*14+2 }, // pblendvb sequence + split. |
659 | 10.7k | { ISD::SRL, MVT::v4i32, 11 }, // Shift each lane + blend. |
660 | 10.7k | { ISD::SRL, MVT::v8i32, 2*11+2 }, // Shift each lane + blend + split. |
661 | 10.7k | |
662 | 10.7k | { ISD::SRA, MVT::v16i8, 24 }, // pblendvb sequence. |
663 | 10.7k | { ISD::SRA, MVT::v32i8, 2*24+2 }, // pblendvb sequence + split. |
664 | 10.7k | { ISD::SRA, MVT::v8i16, 14 }, // pblendvb sequence. |
665 | 10.7k | { ISD::SRA, MVT::v16i16, 2*14+2 }, // pblendvb sequence + split. |
666 | 10.7k | { ISD::SRA, MVT::v4i32, 12 }, // Shift each lane + blend. |
667 | 10.7k | { ISD::SRA, MVT::v8i32, 2*12+2 }, // Shift each lane + blend + split. |
668 | 10.7k | |
669 | 10.7k | { ISD::MUL, MVT::v4i32, 1 } // pmulld |
670 | 10.7k | }; |
671 | 10.7k | |
672 | 10.7k | if (ST->hasSSE41()) |
673 | 4.73k | if (const auto *Entry = CostTableLookup(SSE41CostTable, ISD, LT.second)) |
674 | 302 | return LT.first * Entry->Cost; |
675 | 10.4k | |
676 | 10.4k | static const CostTblEntry SSE2CostTable[] = { |
677 | 10.4k | // We don't correctly identify costs of casts because they are marked as |
678 | 10.4k | // custom. |
679 | 10.4k | { ISD::SHL, MVT::v16i8, 26 }, // cmpgtb sequence. |
680 | 10.4k | { ISD::SHL, MVT::v8i16, 32 }, // cmpgtb sequence. |
681 | 10.4k | { ISD::SHL, MVT::v4i32, 2*5 }, // We optimized this using mul. |
682 | 10.4k | { ISD::SHL, MVT::v2i64, 4 }, // splat+shuffle sequence. |
683 | 10.4k | { ISD::SHL, MVT::v4i64, 2*4+2 }, // splat+shuffle sequence + split. |
684 | 10.4k | |
685 | 10.4k | { ISD::SRL, MVT::v16i8, 26 }, // cmpgtb sequence. |
686 | 10.4k | { ISD::SRL, MVT::v8i16, 32 }, // cmpgtb sequence. |
687 | 10.4k | { ISD::SRL, MVT::v4i32, 16 }, // Shift each lane + blend. |
688 | 10.4k | { ISD::SRL, MVT::v2i64, 4 }, // splat+shuffle sequence. |
689 | 10.4k | { ISD::SRL, MVT::v4i64, 2*4+2 }, // splat+shuffle sequence + split. |
690 | 10.4k | |
691 | 10.4k | { ISD::SRA, MVT::v16i8, 54 }, // unpacked cmpgtb sequence. |
692 | 10.4k | { ISD::SRA, MVT::v8i16, 32 }, // cmpgtb sequence. |
693 | 10.4k | { ISD::SRA, MVT::v4i32, 16 }, // Shift each lane + blend. |
694 | 10.4k | { ISD::SRA, MVT::v2i64, 12 }, // srl/xor/sub sequence. |
695 | 10.4k | { ISD::SRA, MVT::v4i64, 2*12+2 }, // srl/xor/sub sequence+split. |
696 | 10.4k | |
697 | 10.4k | { ISD::MUL, MVT::v16i8, 12 }, // extend/pmullw/trunc sequence. |
698 | 10.4k | { ISD::MUL, MVT::v8i16, 1 }, // pmullw |
699 | 10.4k | { ISD::MUL, MVT::v4i32, 6 }, // 3*pmuludq/4*shuffle |
700 | 10.4k | { ISD::MUL, MVT::v2i64, 8 }, // 3*pmuludq/3*shift/2*add |
701 | 10.4k | |
702 | 10.4k | { ISD::FDIV, MVT::f32, 23 }, // Pentium IV from http://www.agner.org/ |
703 | 10.4k | { ISD::FDIV, MVT::v4f32, 39 }, // Pentium IV from http://www.agner.org/ |
704 | 10.4k | { ISD::FDIV, MVT::f64, 38 }, // Pentium IV from http://www.agner.org/ |
705 | 10.4k | { ISD::FDIV, MVT::v2f64, 69 }, // Pentium IV from http://www.agner.org/ |
706 | 10.4k | |
707 | 10.4k | // It is not a good idea to vectorize division. We have to scalarize it and |
708 | 10.4k | // in the process we will often end up having to spill regular |
709 | 10.4k | // registers. The overhead of division is going to dominate most kernels |
710 | 10.4k | // anyways so try hard to prevent vectorization of division - it is |
711 | 10.4k | // generally a bad idea. Assume somewhat arbitrarily that we have to be able |
712 | 10.4k | // to hide "20 cycles" for each lane. |
713 | 10.4k | { ISD::SDIV, MVT::v16i8, 16*20 }, |
714 | 10.4k | { ISD::SDIV, MVT::v8i16, 8*20 }, |
715 | 10.4k | { ISD::SDIV, MVT::v4i32, 4*20 }, |
716 | 10.4k | { ISD::SDIV, MVT::v2i64, 2*20 }, |
717 | 10.4k | { ISD::UDIV, MVT::v16i8, 16*20 }, |
718 | 10.4k | { ISD::UDIV, MVT::v8i16, 8*20 }, |
719 | 10.4k | { ISD::UDIV, MVT::v4i32, 4*20 }, |
720 | 10.4k | { ISD::UDIV, MVT::v2i64, 2*20 }, |
721 | 10.4k | }; |
722 | 10.4k | |
723 | 10.4k | if (ST->hasSSE2()) |
724 | 10.3k | if (const auto *Entry = CostTableLookup(SSE2CostTable, ISD, LT.second)) |
725 | 809 | return LT.first * Entry->Cost; |
726 | 9.59k | |
727 | 9.59k | static const CostTblEntry SSE1CostTable[] = { |
728 | 9.59k | { ISD::FDIV, MVT::f32, 17 }, // Pentium III from http://www.agner.org/ |
729 | 9.59k | { ISD::FDIV, MVT::v4f32, 34 }, // Pentium III from http://www.agner.org/ |
730 | 9.59k | }; |
731 | 9.59k | |
732 | 9.59k | if (ST->hasSSE1()) |
733 | 9.59k | if (const auto *Entry = CostTableLookup(SSE1CostTable, ISD, LT.second)) |
734 | 0 | return LT.first * Entry->Cost; |
735 | 9.59k | |
736 | 9.59k | // Fallback to the default implementation. |
737 | 9.59k | return BaseT::getArithmeticInstrCost(Opcode, Ty, Op1Info, Op2Info); |
738 | 9.59k | } |
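An illustrative check of the lookup order above (hypothetical snippet; `Ctx` and `TTI` are assumed, with the function compiled for an SSE4.1 subtarget without AVX):

    Type *V4I32 = VectorType::get(Type::getInt32Ty(Ctx), 4);
    int Cost = TTI.getArithmeticInstrCost(Instruction::Mul, V4I32);
    // v4i32 is already legal, none of the earlier tables match, and the SSE41
    // table holds { ISD::MUL, MVT::v4i32, 1 }, so the expected cost is 1 (pmulld).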
739 | | |
740 | | int X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, |
741 | 5.01k | Type *SubTp) { |
742 | 5.01k | // 64-bit packed float vectors (v2f32) are widened to type v4f32. |
743 | 5.01k | // 64-bit packed integer vectors (v2i32) are promoted to type v2i64. |
744 | 5.01k | std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp); |
745 | 5.01k | |
746 | 5.01k | // For Broadcasts we are splatting the first element from the first input |
747 | 5.01k | // register, so we only need to reference that input, and all the output |
748 | 5.01k | // registers are the same. |
749 | 5.01k | if (Kind == TTI::SK_Broadcast) |
750 | 1.41k | LT.first = 1; |
751 | 5.01k | |
752 | 5.01k | // We are going to permute multiple sources and the result will be in multiple |
753 | 5.01k | // destinations. Provide an accurate cost only for splits where the element |
754 | 5.01k | // type remains the same. |
755 | 5.01k | if (Kind == TTI::SK_PermuteSingleSrc && LT.first != 1) { |
756 | 175 | MVT LegalVT = LT.second; |
757 | 175 | if (LegalVT.getVectorElementType().getSizeInBits() == |
758 | 175 | Tp->getVectorElementType()->getPrimitiveSizeInBits() && |
759 | 175 | LegalVT.getVectorNumElements() < Tp->getVectorNumElements()) { |
760 | 175 | |
761 | 175 | unsigned VecTySize = DL.getTypeStoreSize(Tp); |
762 | 175 | unsigned LegalVTSize = LegalVT.getStoreSize(); |
763 | 175 | // Number of source vectors after legalization: |
764 | 175 | unsigned NumOfSrcs = (VecTySize + LegalVTSize - 1) / LegalVTSize; |
765 | 175 | // Number of destination vectors after legalization: |
766 | 175 | unsigned NumOfDests = LT.first; |
767 | 175 | |
768 | 175 | Type *SingleOpTy = VectorType::get(Tp->getVectorElementType(), |
769 | 175 | LegalVT.getVectorNumElements()); |
770 | 175 | |
771 | 175 | unsigned NumOfShuffles = (NumOfSrcs - 1) * NumOfDests; |
772 | 175 | return NumOfShuffles * |
773 | 175 | getShuffleCost(TTI::SK_PermuteTwoSrc, SingleOpTy, 0, nullptr); |
774 | 175 | } |
775 | 0 |  |
776 | 0 | return BaseT::getShuffleCost(Kind, Tp, Index, SubTp); |
777 | 0 | } |
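  // Worked example of the split math above: a single-source permute of v16i32
  // on an AVX2 target legalizes to v8i32, so LT.first == 2. Then NumOfSrcs and
  // NumOfDests are both 2, NumOfShuffles == (2 - 1) * 2 == 2, and the returned
  // cost is 2 * getShuffleCost(SK_PermuteTwoSrc, v8i32).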
778 | 4.83k | |
779 | 4.83k | // For 2-input shuffles, we must account for splitting the 2 inputs into many. |
780 | 4.83k | if (Kind == TTI::SK_PermuteTwoSrc && LT.first != 1) { |
781 | 101 | // We assume that source and destination have the same vector type. |
782 | 101 | int NumOfDests = LT.first; |
783 | 101 | int NumOfShufflesPerDest = LT.first * 2 - 1; |
784 | 101 | LT.first = NumOfDests * NumOfShufflesPerDest; |
785 | 101 | } |
786 | 4.83k | |
787 | 4.83k | static const CostTblEntry AVX512VBMIShuffleTbl[] = { |
788 | 4.83k | { TTI::SK_Reverse, MVT::v64i8, 1 }, // vpermb |
789 | 4.83k | { TTI::SK_Reverse, MVT::v32i8, 1 }, // vpermb |
790 | 4.83k | |
791 | 4.83k | { TTI::SK_PermuteSingleSrc, MVT::v64i8, 1 }, // vpermb |
792 | 4.83k | { TTI::SK_PermuteSingleSrc, MVT::v32i8, 1 }, // vpermb |
793 | 4.83k | |
794 | 4.83k | { TTI::SK_PermuteTwoSrc, MVT::v64i8, 1 }, // vpermt2b |
795 | 4.83k | { TTI::SK_PermuteTwoSrc, MVT::v32i8, 1 }, // vpermt2b |
796 | 4.83k | { TTI::SK_PermuteTwoSrc, MVT::v16i8, 1 } // vpermt2b |
797 | 4.83k | }; |
798 | 4.83k | |
799 | 4.83k | if (ST->hasVBMI()) |
800 | 84 | if (const auto *Entry = |
801 | 84 | CostTableLookup(AVX512VBMIShuffleTbl, Kind, LT.second)) |
802 | 7 | return LT.first * Entry->Cost; |
803 | 4.83k | |
804 | 4.83k | static const CostTblEntry AVX512BWShuffleTbl[] = { |
805 | 4.83k | { TTI::SK_Broadcast, MVT::v32i16, 1 }, // vpbroadcastw |
806 | 4.83k | { TTI::SK_Broadcast, MVT::v64i8, 1 }, // vpbroadcastb |
807 | 4.83k | |
808 | 4.83k | { TTI::SK_Reverse, MVT::v32i16, 1 }, // vpermw |
809 | 4.83k | { TTI::SK_Reverse, MVT::v16i16, 1 }, // vpermw |
810 | 4.83k | { TTI::SK_Reverse, MVT::v64i8, 2 }, // pshufb + vshufi64x2 |
811 | 4.83k | |
812 | 4.83k | { TTI::SK_PermuteSingleSrc, MVT::v32i16, 1 }, // vpermw |
813 | 4.83k | { TTI::SK_PermuteSingleSrc, MVT::v16i16, 1 }, // vpermw |
814 | 4.83k | { TTI::SK_PermuteSingleSrc, MVT::v8i16, 1 }, // vpermw |
815 | 4.83k | { TTI::SK_PermuteSingleSrc, MVT::v64i8, 8 }, // extend to v32i16 |
816 | 4.83k | { TTI::SK_PermuteSingleSrc, MVT::v32i8, 3 }, // vpermw + zext/trunc |
817 | 4.83k | |
818 | 4.83k | { TTI::SK_PermuteTwoSrc, MVT::v32i16, 1 }, // vpermt2w |
819 | 4.83k | { TTI::SK_PermuteTwoSrc, MVT::v16i16, 1 }, // vpermt2w |
820 | 4.83k | { TTI::SK_PermuteTwoSrc, MVT::v8i16, 1 }, // vpermt2w |
821 | 4.83k | { TTI::SK_PermuteTwoSrc, MVT::v32i8, 3 }, // zext + vpermt2w + trunc |
822 | 4.83k | { TTI::SK_PermuteTwoSrc, MVT::v64i8, 19 }, // 6 * v32i8 + 1 |
823 | 4.83k | { TTI::SK_PermuteTwoSrc, MVT::v16i8, 3 } // zext + vpermt2w + trunc |
824 | 4.83k | }; |
825 | 4.83k | |
826 | 4.83k | if (ST->hasBWI()) |
827 | 244 | if (const auto *Entry = |
828 | 244 | CostTableLookup(AVX512BWShuffleTbl, Kind, LT.second)) |
829 | 42 | return LT.first * Entry->Cost; |
830 | 4.78k | |
831 | 4.78k | static const CostTblEntry AVX512ShuffleTbl[] = { |
832 | 4.78k | { TTI::SK_Broadcast, MVT::v8f64, 1 }, // vbroadcastpd |
833 | 4.78k | { TTI::SK_Broadcast, MVT::v16f32, 1 }, // vbroadcastps |
834 | 4.78k | { TTI::SK_Broadcast, MVT::v8i64, 1 }, // vpbroadcastq |
835 | 4.78k | { TTI::SK_Broadcast, MVT::v16i32, 1 }, // vpbroadcastd |
836 | 4.78k | |
837 | 4.78k | { TTI::SK_Reverse, MVT::v8f64, 1 }, // vpermpd |
838 | 4.78k | { TTI::SK_Reverse, MVT::v16f32, 1 }, // vpermps |
839 | 4.78k | { TTI::SK_Reverse, MVT::v8i64, 1 }, // vpermq |
840 | 4.78k | { TTI::SK_Reverse, MVT::v16i32, 1 }, // vpermd |
841 | 4.78k | |
842 | 4.78k | { TTI::SK_PermuteSingleSrc, MVT::v8f64, 1 }, // vpermpd |
843 | 4.78k | { TTI::SK_PermuteSingleSrc, MVT::v4f64, 1 }, // vpermpd |
844 | 4.78k | { TTI::SK_PermuteSingleSrc, MVT::v2f64, 1 }, // vpermpd |
845 | 4.78k | { TTI::SK_PermuteSingleSrc, MVT::v16f32, 1 }, // vpermps |
846 | 4.78k | { TTI::SK_PermuteSingleSrc, MVT::v8f32, 1 }, // vpermps |
847 | 4.78k | { TTI::SK_PermuteSingleSrc, MVT::v4f32, 1 }, // vpermps |
848 | 4.78k | { TTI::SK_PermuteSingleSrc, MVT::v8i64, 1 }, // vpermq |
849 | 4.78k | { TTI::SK_PermuteSingleSrc, MVT::v4i64, 1 }, // vpermq |
850 | 4.78k | { TTI::SK_PermuteSingleSrc, MVT::v2i64, 1 }, // vpermq |
851 | 4.78k | { TTI::SK_PermuteSingleSrc, MVT::v16i32, 1 }, // vpermd |
852 | 4.78k | { TTI::SK_PermuteSingleSrc, MVT::v8i32, 1 }, // vpermd |
853 | 4.78k | { TTI::SK_PermuteSingleSrc, MVT::v4i32, 1 }, // vpermd |
854 | 4.78k | { TTI::SK_PermuteSingleSrc, MVT::v16i8, 1 }, // pshufb |
855 | 4.78k | |
856 | 4.78k | { TTI::SK_PermuteTwoSrc, MVT::v8f64, 1 }, // vpermt2pd |
857 | 4.78k | { TTI::SK_PermuteTwoSrc, MVT::v16f32, 1 }, // vpermt2ps |
858 | 4.78k | { TTI::SK_PermuteTwoSrc, MVT::v8i64, 1 }, // vpermt2q |
859 | 4.78k | { TTI::SK_PermuteTwoSrc, MVT::v16i32, 1 }, // vpermt2d |
860 | 4.78k | { TTI::SK_PermuteTwoSrc, MVT::v4f64, 1 }, // vpermt2pd |
861 | 4.78k | { TTI::SK_PermuteTwoSrc, MVT::v8f32, 1 }, // vpermt2ps |
862 | 4.78k | { TTI::SK_PermuteTwoSrc, MVT::v4i64, 1 }, // vpermt2q |
863 | 4.78k | { TTI::SK_PermuteTwoSrc, MVT::v8i32, 1 }, // vpermt2d |
864 | 4.78k | { TTI::SK_PermuteTwoSrc, MVT::v2f64, 1 }, // vpermt2pd |
865 | 4.78k | { TTI::SK_PermuteTwoSrc, MVT::v4f32, 1 }, // vpermt2ps |
866 | 4.78k | { TTI::SK_PermuteTwoSrc, MVT::v2i64, 1 }, // vpermt2q |
867 | 4.78k | { TTI::SK_PermuteTwoSrc, MVT::v4i32, 1 } // vpermt2d |
868 | 4.78k | }; |
869 | 4.78k | |
870 | 4.78k | if (ST->hasAVX512()) |
871 | 469 | if (const auto *Entry = CostTableLookup(AVX512ShuffleTbl, Kind, LT.second)) |
872 | 162 | return LT.first * Entry->Cost; |
873 | 4.62k | |
874 | 4.62k | static const CostTblEntry AVX2ShuffleTbl[] = { |
875 | 4.62k | { TTI::SK_Broadcast, MVT::v4f64, 1 }, // vbroadcastpd |
876 | 4.62k | { TTI::SK_Broadcast, MVT::v8f32, 1 }, // vbroadcastps |
877 | 4.62k | { TTI::SK_Broadcast, MVT::v4i64, 1 }, // vpbroadcastq |
878 | 4.62k | { TTI::SK_Broadcast, MVT::v8i32, 1 }, // vpbroadcastd |
879 | 4.62k | { TTI::SK_Broadcast, MVT::v16i16, 1 }, // vpbroadcastw |
880 | 4.62k | { TTI::SK_Broadcast, MVT::v32i8, 1 }, // vpbroadcastb |
881 | 4.62k | |
882 | 4.62k | { TTI::SK_Reverse, MVT::v4f64, 1 }, // vpermpd |
883 | 4.62k | { TTI::SK_Reverse, MVT::v8f32, 1 }, // vpermps |
884 | 4.62k | { TTI::SK_Reverse, MVT::v4i64, 1 }, // vpermq |
885 | 4.62k | { TTI::SK_Reverse, MVT::v8i32, 1 }, // vpermd |
886 | 4.62k | { TTI::SK_Reverse, MVT::v16i16, 2 }, // vperm2i128 + pshufb |
887 | 4.62k | { TTI::SK_Reverse, MVT::v32i8, 2 }, // vperm2i128 + pshufb |
888 | 4.62k | |
889 | 4.62k | { TTI::SK_Alternate, MVT::v16i16, 1 }, // vpblendw |
890 | 4.62k | { TTI::SK_Alternate, MVT::v32i8, 1 }, // vpblendvb |
891 | 4.62k | |
892 | 4.62k | { TTI::SK_PermuteSingleSrc, MVT::v4f64, 1 }, // vpermpd |
893 | 4.62k | { TTI::SK_PermuteSingleSrc, MVT::v8f32, 1 }, // vpermps |
894 | 4.62k | { TTI::SK_PermuteSingleSrc, MVT::v4i64, 1 }, // vpermq |
895 | 4.62k | { TTI::SK_PermuteSingleSrc, MVT::v8i32, 1 }, // vpermd |
896 | 4.62k | { TTI::SK_PermuteSingleSrc, MVT::v16i16, 4 }, // vperm2i128 + 2*vpshufb |
897 | 4.62k | // + vpblendvb |
898 | 4.62k | { TTI::SK_PermuteSingleSrc, MVT::v32i8, 4 }, // vperm2i128 + 2*vpshufb |
899 | 4.62k | // + vpblendvb |
900 | 4.62k | |
901 | 4.62k | { TTI::SK_PermuteTwoSrc, MVT::v4f64, 3 }, // 2*vpermpd + vblendpd |
902 | 4.62k | { TTI::SK_PermuteTwoSrc, MVT::v8f32, 3 }, // 2*vpermps + vblendps |
903 | 4.62k | { TTI::SK_PermuteTwoSrc, MVT::v4i64, 3 }, // 2*vpermq + vpblendd |
904 | 4.62k | { TTI::SK_PermuteTwoSrc, MVT::v8i32, 3 }, // 2*vpermd + vpblendd |
905 | 4.62k | { TTI::SK_PermuteTwoSrc, MVT::v16i16, 7 }, // 2*vperm2i128 + 4*vpshufb |
906 | 4.62k | // + vpblendvb |
907 | 4.62k | { TTI::SK_PermuteTwoSrc, MVT::v32i8, 7 }, // 2*vperm2i128 + 4*vpshufb |
908 | 4.62k | // + vpblendvb |
909 | 4.62k | }; |
910 | 4.62k | |
911 | 4.62k | if (ST->hasAVX2()) |
912 | 848 | if (const auto *Entry = CostTableLookup(AVX2ShuffleTbl, Kind, LT.second)) |
913 | 309 | return LT.first * Entry->Cost; |
914 | 4.31k | |
915 | 4.31k | static const CostTblEntry XOPShuffleTbl[] = { |
916 | 4.31k | { TTI::SK_PermuteSingleSrc, MVT::v4f64, 2 }, // vperm2f128 + vpermil2pd |
917 | 4.31k | { TTI::SK_PermuteSingleSrc, MVT::v8f32, 2 }, // vperm2f128 + vpermil2ps |
918 | 4.31k | { TTI::SK_PermuteSingleSrc, MVT::v4i64, 2 }, // vperm2f128 + vpermil2pd |
919 | 4.31k | { TTI::SK_PermuteSingleSrc, MVT::v8i32, 2 }, // vperm2f128 + vpermil2ps |
920 | 4.31k | { TTI::SK_PermuteSingleSrc, MVT::v16i16, 4 }, // vextractf128 + 2*vpperm |
921 | 4.31k | // + vinsertf128 |
922 | 4.31k | { TTI::SK_PermuteSingleSrc, MVT::v32i8, 4 }, // vextractf128 + 2*vpperm |
923 | 4.31k | // + vinsertf128 |
924 | 4.31k | |
925 | 4.31k | { TTI::SK_PermuteTwoSrc, MVT::v16i16, 9 }, // 2*vextractf128 + 6*vpperm |
926 | 4.31k | // + vinsertf128 |
927 | 4.31k | { TTI::SK_PermuteTwoSrc, MVT::v8i16, 1 }, // vpperm |
928 | 4.31k | { TTI::SK_PermuteTwoSrc, MVT::v32i8, 9 }, // 2*vextractf128 + 6*vpperm |
929 | 4.31k | // + vinsertf128 |
930 | 4.31k | { TTI::SK_PermuteTwoSrc, MVT::v16i8, 1 }, // vpperm |
931 | 4.31k | }; |
932 | 4.31k | |
933 | 4.31k | if (ST->hasXOP()) |
934 | 96 | if (const auto *Entry = CostTableLookup(XOPShuffleTbl, Kind, LT.second)) |
935 | 16 | return LT.first * Entry->Cost; |
936 | 4.30k | |
937 | 4.30k | static const CostTblEntry AVX1ShuffleTbl[] = { |
938 | 4.30k | { TTI::SK_Broadcast, MVT::v4f64, 2 }, // vperm2f128 + vpermilpd |
939 | 4.30k | { TTI::SK_Broadcast, MVT::v8f32, 2 }, // vperm2f128 + vpermilps |
940 | 4.30k | { TTI::SK_Broadcast, MVT::v4i64, 2 }, // vperm2f128 + vpermilpd |
941 | 4.30k | { TTI::SK_Broadcast, MVT::v8i32, 2 }, // vperm2f128 + vpermilps |
942 | 4.30k | { TTI::SK_Broadcast, MVT::v16i16, 3 }, // vpshuflw + vpshufd + vinsertf128 |
943 | 4.30k | { TTI::SK_Broadcast, MVT::v32i8, 2 }, // vpshufb + vinsertf128 |
944 | 4.30k | |
945 | 4.30k | { TTI::SK_Reverse, MVT::v4f64, 2 }, // vperm2f128 + vpermilpd |
946 | 4.30k | { TTI::SK_Reverse, MVT::v8f32, 2 }, // vperm2f128 + vpermilps |
947 | 4.30k | { TTI::SK_Reverse, MVT::v4i64, 2 }, // vperm2f128 + vpermilpd |
948 | 4.30k | { TTI::SK_Reverse, MVT::v8i32, 2 }, // vperm2f128 + vpermilps |
949 | 4.30k | { TTI::SK_Reverse, MVT::v16i16, 4 }, // vextractf128 + 2*pshufb |
950 | 4.30k | // + vinsertf128 |
951 | 4.30k | { TTI::SK_Reverse, MVT::v32i8, 4 }, // vextractf128 + 2*pshufb |
952 | 4.30k | // + vinsertf128 |
953 | 4.30k | |
954 | 4.30k | { TTI::SK_Alternate, MVT::v4i64, 1 }, // vblendpd |
955 | 4.30k | { TTI::SK_Alternate, MVT::v4f64, 1 }, // vblendpd |
956 | 4.30k | { TTI::SK_Alternate, MVT::v8i32, 1 }, // vblendps |
957 | 4.30k | { TTI::SK_Alternate, MVT::v8f32, 1 }, // vblendps |
958 | 4.30k | { TTI::SK_Alternate, MVT::v16i16, 3 }, // vpand + vpandn + vpor |
959 | 4.30k | { TTI::SK_Alternate, MVT::v32i8, 3 }, // vpand + vpandn + vpor |
960 | 4.30k | |
961 | 4.30k | { TTI::SK_PermuteSingleSrc, MVT::v4f64, 3 }, // 2*vperm2f128 + vshufpd |
962 | 4.30k | { TTI::SK_PermuteSingleSrc, MVT::v4i64, 3 }, // 2*vperm2f128 + vshufpd |
963 | 4.30k | { TTI::SK_PermuteSingleSrc, MVT::v8f32, 4 }, // 2*vperm2f128 + 2*vshufps |
964 | 4.30k | { TTI::SK_PermuteSingleSrc, MVT::v8i32, 4 }, // 2*vperm2f128 + 2*vshufps |
965 | 4.30k | { TTI::SK_PermuteSingleSrc, MVT::v16i16, 8 }, // vextractf128 + 4*pshufb |
966 | 4.30k | // + 2*por + vinsertf128 |
967 | 4.30k | { TTI::SK_PermuteSingleSrc, MVT::v32i8, 8 }, // vextractf128 + 4*pshufb |
968 | 4.30k | // + 2*por + vinsertf128 |
969 | 4.30k | |
970 | 4.30k | { TTI::SK_PermuteTwoSrc, MVT::v4f64, 4 }, // 2*vperm2f128 + 2*vshufpd |
971 | 4.30k | { TTI::SK_PermuteTwoSrc, MVT::v8f32, 4 }, // 2*vperm2f128 + 2*vshufps |
972 | 4.30k | { TTI::SK_PermuteTwoSrc, MVT::v4i64, 4 }, // 2*vperm2f128 + 2*vshufpd |
973 | 4.30k | { TTI::SK_PermuteTwoSrc, MVT::v8i32, 4 }, // 2*vperm2f128 + 2*vshufps |
974 | 4.30k | { TTI::SK_PermuteTwoSrc, MVT::v16i16, 15 }, // 2*vextractf128 + 8*pshufb |
975 | 4.30k | // + 4*por + vinsertf128 |
976 | 4.30k | { TTI::SK_PermuteTwoSrc, MVT::v32i8, 15 }, // 2*vextractf128 + 8*pshufb |
977 | 4.30k | // + 4*por + vinsertf128 |
978 | 4.30k | }; |
979 | 4.30k | |
980 | 4.30k | if (ST->hasAVX()) |
981 | 887 | if (const auto *Entry = CostTableLookup(AVX1ShuffleTbl, Kind, LT.second)) |
982 | 186 | return LT.first * Entry->Cost; |
983 | 4.11k | |
984 | 4.11k | static const CostTblEntry SSE41ShuffleTbl[] = { |
985 | 4.11k | { TTI::SK_Alternate, MVT::v2i64, 1 }, // pblendw |
986 | 4.11k | { TTI::SK_Alternate, MVT::v2f64, 1 }, // movsd |
987 | 4.11k | { TTI::SK_Alternate, MVT::v4i32, 1 }, // pblendw |
988 | 4.11k | { TTI::SK_Alternate, MVT::v4f32, 1 }, // blendps |
989 | 4.11k | { TTI::SK_Alternate, MVT::v8i16, 1 }, // pblendw |
990 | 4.11k | { TTI::SK_Alternate, MVT::v16i8, 1 } // pblendvb |
991 | 4.11k | }; |
992 | 4.11k | |
993 | 4.11k | if (ST->hasSSE41()) |
994 | 1.05k | if (const auto *Entry = CostTableLookup(SSE41ShuffleTbl, Kind, LT.second)) |
995 | 136 | return LT.first * Entry->Cost; |
996 | 3.97k | |
997 | 3.97k | static const CostTblEntry SSSE3ShuffleTbl[] = { |
998 | 3.97k | { TTI::SK_Broadcast, MVT::v8i16, 1 }, // pshufb |
999 | 3.97k | { TTI::SK_Broadcast, MVT::v16i8, 1 }, // pshufb |
1000 | 3.97k | |
1001 | 3.97k | { TTI::SK_Reverse, MVT::v8i16, 1 }, // pshufb |
1002 | 3.97k | { TTI::SK_Reverse, MVT::v16i8, 1 }, // pshufb |
1003 | 3.97k | |
1004 | 3.97k | { TTI::SK_Alternate, MVT::v8i16, 3 }, // 2*pshufb + por |
1005 | 3.97k | { TTI::SK_Alternate, MVT::v16i8, 3 }, // 2*pshufb + por |
1006 | 3.97k | |
1007 | 3.97k | { TTI::SK_PermuteSingleSrc, MVT::v8i16, 1 }, // pshufb |
1008 | 3.97k | { TTI::SK_PermuteSingleSrc, MVT::v16i8, 1 }, // pshufb |
1009 | 3.97k | |
1010 | 3.97k | { TTI::SK_PermuteTwoSrc, MVT::v8i16, 3 }, // 2*pshufb + por |
1011 | 3.97k | { TTI::SK_PermuteTwoSrc, MVT::v16i8, 3 }, // 2*pshufb + por |
1012 | 3.97k | }; |
1013 | 3.97k | |
1014 | 3.97k | if (ST->hasSSSE3()) |
1015 | 1.71k | if (const auto *Entry = CostTableLookup(SSSE3ShuffleTbl, Kind, LT.second)) |
1016 | 324 | return LT.first * Entry->Cost; |
1017 | 3.65k | |
1018 | 3.65k | static const CostTblEntry SSE2ShuffleTbl[] = { |
1019 | 3.65k | { TTI::SK_Broadcast, MVT::v2f64, 1 }, // shufpd |
1020 | 3.65k | { TTI::SK_Broadcast, MVT::v2i64, 1 }, // pshufd |
1021 | 3.65k | { TTI::SK_Broadcast, MVT::v4i32, 1 }, // pshufd |
1022 | 3.65k | { TTI::SK_Broadcast, MVT::v8i16, 2 }, // pshuflw + pshufd |
1023 | 3.65k | { TTI::SK_Broadcast, MVT::v16i8, 3 }, // unpck + pshuflw + pshufd |
1024 | 3.65k | |
1025 | 3.65k | { TTI::SK_Reverse, MVT::v2f64, 1 }, // shufpd |
1026 | 3.65k | { TTI::SK_Reverse, MVT::v2i64, 1 }, // pshufd |
1027 | 3.65k | { TTI::SK_Reverse, MVT::v4i32, 1 }, // pshufd |
1028 | 3.65k | { TTI::SK_Reverse, MVT::v8i16, 3 }, // pshuflw + pshufhw + pshufd |
1029 | 3.65k | { TTI::SK_Reverse, MVT::v16i8, 9 }, // 2*pshuflw + 2*pshufhw |
1030 | 3.65k | // + 2*pshufd + 2*unpck + packus |
1031 | 3.65k | |
1032 | 3.65k | { TTI::SK_Alternate, MVT::v2i64, 1 }, // movsd |
1033 | 3.65k | { TTI::SK_Alternate, MVT::v2f64, 1 }, // movsd |
1034 | 3.65k | { TTI::SK_Alternate, MVT::v4i32, 2 }, // 2*shufps |
1035 | 3.65k | { TTI::SK_Alternate, MVT::v8i16, 3 }, // pand + pandn + por |
1036 | 3.65k | { TTI::SK_Alternate, MVT::v16i8, 3 }, // pand + pandn + por |
1037 | 3.65k | |
1038 | 3.65k | { TTI::SK_PermuteSingleSrc, MVT::v2f64, 1 }, // shufpd |
1039 | 3.65k | { TTI::SK_PermuteSingleSrc, MVT::v2i64, 1 }, // pshufd |
1040 | 3.65k | { TTI::SK_PermuteSingleSrc, MVT::v4i32, 1 }, // pshufd |
1041 | 3.65k | { TTI::SK_PermuteSingleSrc, MVT::v8i16, 5 }, // 2*pshuflw + 2*pshufhw |
1042 | 3.65k | // + pshufd/unpck |
1043 | 3.65k | { TTI::SK_PermuteSingleSrc, MVT::v16i8, 10 }, // 2*pshuflw + 2*pshufhw |
1044 | 3.65k | // + 2*pshufd + 2*unpck + 2*packus |
1045 | 3.65k | |
1046 | 3.65k | { TTI::SK_PermuteTwoSrc, MVT::v2f64, 1 }, // shufpd |
1047 | 3.65k | { TTI::SK_PermuteTwoSrc, MVT::v2i64, 1 }, // shufpd |
1048 | 3.65k | { TTI::SK_PermuteTwoSrc, MVT::v4i32, 2 }, // 2*{unpck,movsd,pshufd} |
1049 | 3.65k | { TTI::SK_PermuteTwoSrc, MVT::v8i16, 8 }, // blend+permute |
1050 | 3.65k | { TTI::SK_PermuteTwoSrc, MVT::v16i8, 13 }, // blend+permute |
1051 | 3.65k | }; |
1052 | 3.65k | |
1053 | 3.65k | if (ST->hasSSE2()) |
1054 | 3.65k | if (const auto *Entry = CostTableLookup(SSE2ShuffleTbl, Kind, LT.second)) |
1055 | 1.59k | return LT.first * Entry->Cost; |
1056 | 2.06k | |
1057 | 2.06k | static const CostTblEntry SSE1ShuffleTbl[] = { |
1058 | 2.06k | { TTI::SK_Broadcast, MVT::v4f32, 1 }, // shufps |
1059 | 2.06k | { TTI::SK_Reverse, MVT::v4f32, 1 }, // shufps |
1060 | 2.06k | { TTI::SK_Alternate, MVT::v4f32, 2 }, // 2*shufps |
1061 | 2.06k | { TTI::SK_PermuteSingleSrc, MVT::v4f32, 1 }, // shufps |
1062 | 2.06k | { TTI::SK_PermuteTwoSrc, MVT::v4f32, 2 }, // 2*shufps |
1063 | 2.06k | }; |
1064 | 2.06k | |
1065 | 2.06k | if (ST->hasSSE1()) |
1066 | 2.06k | if (const auto *Entry = CostTableLookup(SSE1ShuffleTbl, Kind, LT.second)) |
1067 | 316 | return LT.first * Entry->Cost; |
1068 | 1.74k | |
1069 | 1.74k | return BaseT::getShuffleCost(Kind, Tp, Index, SubTp); |
1070 | 1.74k | } |
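A usage sketch (illustrative; `F` and `TTI` are assumed to be a Function and its TargetTransformInfo):

    // Broadcast of the first lane of an <8 x float> vector: per the tables
    // above this costs 1 with AVX2 (vbroadcastps) and 2 with plain AVX
    // (vperm2f128 + vpermilps).
    Type *V8F32 = VectorType::get(Type::getFloatTy(F.getContext()), 8);
    int BcastCost = TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, V8F32);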
1071 | | |
1072 | | int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, |
1073 | 5.05k | const Instruction *I) { |
1074 | 5.05k | int ISD = TLI->InstructionOpcodeToISD(Opcode); |
1075 | 5.05k | assert(ISD && "Invalid opcode"); |
1076 | 5.05k | |
1077 | 5.05k | // FIXME: Need a better design of the cost table to handle non-simple types of |
1078 | 5.05k | // potential massive combinations (elem_num x src_type x dst_type). |
1079 | 5.05k | |
1080 | 5.05k | static const TypeConversionCostTblEntry AVX512DQConversionTbl[] = { |
1081 | 5.05k | { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i64, 1 }, |
1082 | 5.05k | { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i64, 1 }, |
1083 | 5.05k | { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i64, 1 }, |
1084 | 5.05k | { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i64, 1 }, |
1085 | 5.05k | { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i64, 1 }, |
1086 | 5.05k | { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i64, 1 }, |
1087 | 5.05k | |
1088 | 5.05k | { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i64, 1 }, |
1089 | 5.05k | { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 1 }, |
1090 | 5.05k | { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i64, 1 }, |
1091 | 5.05k | { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i64, 1 }, |
1092 | 5.05k | { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i64, 1 }, |
1093 | 5.05k | { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i64, 1 }, |
1094 | 5.05k | |
1095 | 5.05k | { ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f32, 1 }, |
1096 | 5.05k | { ISD::FP_TO_SINT, MVT::v4i64, MVT::v4f32, 1 }, |
1097 | 5.05k | { ISD::FP_TO_SINT, MVT::v8i64, MVT::v8f32, 1 }, |
1098 | 5.05k | { ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f64, 1 }, |
1099 | 5.05k | { ISD::FP_TO_SINT, MVT::v4i64, MVT::v4f64, 1 }, |
1100 | 5.05k | { ISD::FP_TO_SINT, MVT::v8i64, MVT::v8f64, 1 }, |
1101 | 5.05k | |
1102 | 5.05k | { ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f32, 1 }, |
1103 | 5.05k | { ISD::FP_TO_UINT, MVT::v4i64, MVT::v4f32, 1 }, |
1104 | 5.05k | { ISD::FP_TO_UINT, MVT::v8i64, MVT::v8f32, 1 }, |
1105 | 5.05k | { ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f64, 1 }, |
1106 | 5.05k | { ISD::FP_TO_UINT, MVT::v4i64, MVT::v4f64, 1 }, |
1107 | 5.05k | { ISD::FP_TO_UINT, MVT::v8i64, MVT::v8f64, 1 }, |
1108 | 5.05k | }; |
1109 | 5.05k | |
1110 | 5.05k | // TODO: For AVX512DQ + AVX512VL, we also have cheap casts for 128-bit and |
1111 | 5.05k | // 256-bit wide vectors. |
1112 | 5.05k | |
1113 | 5.05k | static const TypeConversionCostTblEntry AVX512FConversionTbl[] = { |
1114 | 5.05k | { ISD::FP_EXTEND, MVT::v8f64, MVT::v8f32, 1 }, |
1115 | 5.05k | { ISD::FP_EXTEND, MVT::v8f64, MVT::v16f32, 3 }, |
1116 | 5.05k | { ISD::FP_ROUND, MVT::v8f32, MVT::v8f64, 1 }, |
1117 | 5.05k | |
1118 | 5.05k | { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 1 }, |
1119 | 5.05k | { ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, 1 }, |
1120 | 5.05k | { ISD::TRUNCATE, MVT::v8i16, MVT::v8i64, 1 }, |
1121 | 5.05k | { ISD::TRUNCATE, MVT::v8i32, MVT::v8i64, 1 }, |
1122 | 5.05k | |
1123 | 5.05k | // v16i1 -> v16i32 - load + broadcast |
1124 | 5.05k | { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i1, 2 }, |
1125 | 5.05k | { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i1, 2 }, |
1126 | 5.05k | { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 1 }, |
1127 | 5.05k | { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 1 }, |
1128 | 5.05k | { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i16, 1 }, |
1129 | 5.05k | { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i16, 1 }, |
1130 | 5.05k | { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i16, 1 }, |
1131 | 5.05k | { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i16, 1 }, |
1132 | 5.05k | { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i32, 1 }, |
1133 | 5.05k | { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i32, 1 }, |
1134 | 5.05k | |
1135 | 5.05k | { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i1, 4 }, |
1136 | 5.05k | { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i1, 3 }, |
1137 | 5.05k | { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i8, 2 }, |
1138 | 5.05k | { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i8, 2 }, |
1139 | 5.05k | { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i16, 2 }, |
1140 | 5.05k | { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i16, 2 }, |
1141 | 5.05k | { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i32, 1 }, |
1142 | 5.05k | { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i32, 1 }, |
1143 | 5.05k | { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i64, 26 }, |
1144 | 5.05k | { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i64, 26 }, |
1145 | 5.05k | |
1146 | 5.05k | { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i1, 4 }, |
1147 | 5.05k | { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i1, 3 }, |
1148 | 5.05k | { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i8, 2 }, |
1149 | 5.05k | { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i8, 2 }, |
1150 | 5.05k | { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i8, 2 }, |
1151 | 5.05k | { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i8, 2 }, |
1152 | 5.05k | { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i8, 2 }, |
1153 | 5.05k | { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i16, 5 }, |
1154 | 5.05k | { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i16, 2 }, |
1155 | 5.05k | { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, 2 }, |
1156 | 5.05k | { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i16, 2 }, |
1157 | 5.05k | { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i16, 2 }, |
1158 | 5.05k | { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 2 }, |
1159 | 5.05k | { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, 1 }, |
1160 | 5.05k | { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 }, |
1161 | 5.05k | { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i32, 1 }, |
1162 | 5.05k | { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i32, 1 }, |
1163 | 5.05k | { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i32, 1 }, |
1164 | 5.05k | { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i32, 1 }, |
1165 | 5.05k | { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i64, 5 }, |
1166 | 5.05k | { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 5 }, |
1167 | 5.05k | { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i64, 12 }, |
1168 | 5.05k | { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i64, 26 }, |
1169 | 5.05k | |
1170 | 5.05k | { ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f32, 1 }, |
1171 | 5.05k | { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 1 }, |
1172 | 5.05k | { ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f32, 1 }, |
1173 | 5.05k | { ISD::FP_TO_UINT, MVT::v8i16, MVT::v8f64, 2 }, |
1174 | 5.05k | { ISD::FP_TO_UINT, MVT::v8i8, MVT::v8f64, 2 }, |
1175 | 5.05k | { ISD::FP_TO_UINT, MVT::v16i32, MVT::v16f32, 1 }, |
1176 | 5.05k | { ISD::FP_TO_UINT, MVT::v16i16, MVT::v16f32, 2 }, |
1177 | 5.05k | { ISD::FP_TO_UINT, MVT::v16i8, MVT::v16f32, 2 }, |
1178 | 5.05k | }; |
1179 | 5.05k | |
1180 | 5.05k | static const TypeConversionCostTblEntry AVX2ConversionTbl[] = { |
1181 | 5.05k | { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, 3 }, |
1182 | 5.05k | { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, 3 }, |
1183 | 5.05k | { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, 3 }, |
1184 | 5.05k | { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, 3 }, |
1185 | 5.05k | { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i8, 3 }, |
1186 | 5.05k | { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i8, 3 }, |
1187 | 5.05k | { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 3 }, |
1188 | 5.05k | { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i8, 3 }, |
1189 | 5.05k | { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 1 }, |
1190 | 5.05k | { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 1 }, |
1191 | 5.05k | { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 3 }, |
1192 | 5.05k | { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i16, 3 }, |
1193 | 5.05k | { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 1 }, |
1194 | 5.05k | { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 1 }, |
1195 | 5.05k | { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, 1 }, |
1196 | 5.05k | { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, 1 }, |
1197 | 5.05k | |
1198 | 5.05k | { ISD::TRUNCATE, MVT::v4i8, MVT::v4i64, 2 }, |
1199 | 5.05k | { ISD::TRUNCATE, MVT::v4i16, MVT::v4i64, 2 }, |
1200 | 5.05k | { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 2 }, |
1201 | 5.05k | { ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, 2 }, |
1202 | 5.05k | { ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, 2 }, |
1203 | 5.05k | { ISD::TRUNCATE, MVT::v8i32, MVT::v8i64, 4 }, |
1204 | 5.05k | |
1205 | 5.05k | { ISD::FP_EXTEND, MVT::v8f64, MVT::v8f32, 3 }, |
1206 | 5.05k | { ISD::FP_ROUND, MVT::v8f32, MVT::v8f64, 3 }, |
1207 | 5.05k | |
1208 | 5.05k | { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i32, 8 }, |
1209 | 5.05k | }; |
1210 | 5.05k | |
1211 | 5.05k | static const TypeConversionCostTblEntry AVXConversionTbl[] = { |
1212 | 5.05k | { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, 6 }, |
1213 | 5.05k | { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, 4 }, |
1214 | 5.05k | { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, 7 }, |
1215 | 5.05k | { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, 4 }, |
1216 | 5.05k | { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i8, 6 }, |
1217 | 5.05k | { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i8, 4 }, |
1218 | 5.05k | { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 7 }, |
1219 | 5.05k | { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i8, 4 }, |
1220 | 5.05k | { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 4 }, |
1221 | 5.05k | { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 4 }, |
1222 | 5.05k | { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 6 }, |
1223 | 5.05k | { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i16, 3 }, |
1224 | 5.05k | { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 4 }, |
1225 | 5.05k | { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 4 }, |
1226 | 5.05k | { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, 4 }, |
1227 | 5.05k | { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, 4 }, |
1228 | 5.05k | |
1229 | 5.05k | { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 4 }, |
1230 | 5.05k | { ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, 4 }, |
1231 | 5.05k | { ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, 5 }, |
1232 | 5.05k | { ISD::TRUNCATE, MVT::v4i8, MVT::v4i64, 4 }, |
1233 | 5.05k | { ISD::TRUNCATE, MVT::v4i16, MVT::v4i64, 4 }, |
1234 | 5.05k | { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 4 }, |
1235 | 5.05k | { ISD::TRUNCATE, MVT::v8i32, MVT::v8i64, 9 }, |
1236 | 5.05k | |
1237 | 5.05k | { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i1, 3 }, |
1238 | 5.05k | { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i1, 3 }, |
1239 | 5.05k | { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i1, 8 }, |
1240 | 5.05k | { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i8, 3 }, |
1241 | 5.05k | { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i8, 3 }, |
1242 | 5.05k | { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i8, 8 }, |
1243 | 5.05k | { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i16, 3 }, |
1244 | 5.05k | { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i16, 3 }, |
1245 | 5.05k | { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i16, 5 }, |
1246 | 5.05k | { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 }, |
1247 | 5.05k | { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i32, 1 }, |
1248 | 5.05k | { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i32, 1 }, |
1249 | 5.05k | |
1250 | 5.05k | { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i1, 7 }, |
1251 | 5.05k | { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i1, 7 }, |
1252 | 5.05k | { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i1, 6 }, |
1253 | 5.05k | { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i8, 2 }, |
1254 | 5.05k | { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i8, 2 }, |
1255 | 5.05k | { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i8, 5 }, |
1256 | 5.05k | { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i16, 2 }, |
1257 | 5.05k | { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i16, 2 }, |
1258 | 5.05k | { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, 5 }, |
1259 | 5.05k | { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, 6 }, |
1260 | 5.05k | { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 6 }, |
1261 | 5.05k | { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i32, 6 }, |
1262 | 5.05k | { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i32, 9 }, |
1263 | 5.05k | // The generic code to compute the scalar overhead is currently broken. |
1264 | 5.05k | // Work around this limitation by estimating the scalarization overhead
1265 | 5.05k | // here. We have roughly 10 instructions per scalar element. |
1266 | 5.05k | // Multiply that by the vector width. |
1267 | 5.05k | // FIXME: remove that when PR19268 is fixed. |
1268 | 5.05k | { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 10 }, |
1269 | 5.05k | { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i64, 20 }, |
1270 | 5.05k | { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i64, 13 }, |
1271 | 5.05k | { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i64, 13 }, |
1272 | 5.05k | |
1273 | 5.05k | { ISD::FP_TO_SINT, MVT::v4i8, MVT::v4f32, 1 }, |
1274 | 5.05k | { ISD::FP_TO_SINT, MVT::v8i8, MVT::v8f32, 7 }, |
1275 | 5.05k | // This node is expanded into scalarized operations, but BasicTTI is overly
1276 | 5.05k | // optimistic in estimating its cost. It computes 3 per element (one
1277 | 5.05k | // vector-extract, one scalar conversion and one vector-insert). The
1278 | 5.05k | // problem is that the inserts form a read-modify-write chain, so latency
1279 | 5.05k | // should be factored in too. Inflate the cost per element by 1.
1280 | 5.05k | { ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f32, 8*4 }, |
1281 | 5.05k | { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f64, 4*4 }, |
1282 | 5.05k | |
1283 | 5.05k | { ISD::FP_EXTEND, MVT::v4f64, MVT::v4f32, 1 }, |
1284 | 5.05k | { ISD::FP_ROUND, MVT::v4f32, MVT::v4f64, 1 }, |
1285 | 5.05k | }; |
1286 | 5.05k | |
1287 | 5.05k | static const TypeConversionCostTblEntry SSE41ConversionTbl[] = { |
1288 | 5.05k | { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i8, 2 }, |
1289 | 5.05k | { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i8, 2 }, |
1290 | 5.05k | { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i16, 2 }, |
1291 | 5.05k | { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 2 }, |
1292 | 5.05k | { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, 2 }, |
1293 | 5.05k | { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, 2 }, |
1294 | 5.05k | |
1295 | 5.05k | { ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i8, 1 }, |
1296 | 5.05k | { ISD::SIGN_EXTEND, MVT::v4i16, MVT::v4i8, 2 }, |
1297 | 5.05k | { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i8, 1 }, |
1298 | 5.05k | { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i8, 1 }, |
1299 | 5.05k | { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i8, 1 }, |
1300 | 5.05k | { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i8, 1 }, |
1301 | 5.05k | { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i8, 2 }, |
1302 | 5.05k | { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 2 }, |
1303 | 5.05k | { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 2 }, |
1304 | 5.05k | { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 2 }, |
1305 | 5.05k | { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 4 }, |
1306 | 5.05k | { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 4 }, |
1307 | 5.05k | { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i16, 1 }, |
1308 | 5.05k | { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i16, 1 }, |
1309 | 5.05k | { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 2 }, |
1310 | 5.05k | { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 2 }, |
1311 | 5.05k | { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i16, 4 }, |
1312 | 5.05k | { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i16, 4 }, |
1313 | 5.05k | |
1314 | 5.05k | { ISD::TRUNCATE, MVT::v4i8, MVT::v4i16, 2 }, |
1315 | 5.05k | { ISD::TRUNCATE, MVT::v8i8, MVT::v8i16, 1 }, |
1316 | 5.05k | { ISD::TRUNCATE, MVT::v4i8, MVT::v4i32, 1 }, |
1317 | 5.05k | { ISD::TRUNCATE, MVT::v4i16, MVT::v4i32, 1 }, |
1318 | 5.05k | { ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, 3 }, |
1319 | 5.05k | { ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, 3 }, |
1320 | 5.05k | { ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, 6 }, |
1321 | 5.05k | |
1322 | 5.05k | }; |
1323 | 5.05k | |
1324 | 5.05k | static const TypeConversionCostTblEntry SSE2ConversionTbl[] = { |
1325 | 5.05k | // These are somewhat magic numbers justified by looking at the output of |
1326 | 5.05k | // Intel's IACA, running some kernels, and making sure that, when we take
1327 | 5.05k | // legalization into account, the throughput will be overestimated.
1328 | 5.05k | { ISD::SINT_TO_FP, MVT::v4f32, MVT::v16i8, 8 }, |
1329 | 5.05k | { ISD::SINT_TO_FP, MVT::v2f64, MVT::v16i8, 16*10 }, |
1330 | 5.05k | { ISD::SINT_TO_FP, MVT::v4f32, MVT::v8i16, 15 }, |
1331 | 5.05k | { ISD::SINT_TO_FP, MVT::v2f64, MVT::v8i16, 8*10 }, |
1332 | 5.05k | { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, 5 }, |
1333 | 5.05k | { ISD::SINT_TO_FP, MVT::v2f64, MVT::v4i32, 4*10 }, |
1334 | 5.05k | { ISD::SINT_TO_FP, MVT::v4f32, MVT::v2i64, 15 }, |
1335 | 5.05k | { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i64, 2*10 }, |
1336 | 5.05k | |
1337 | 5.05k | { ISD::UINT_TO_FP, MVT::v2f64, MVT::v16i8, 16*10 }, |
1338 | 5.05k | { ISD::UINT_TO_FP, MVT::v4f32, MVT::v16i8, 8 }, |
1339 | 5.05k | { ISD::UINT_TO_FP, MVT::v4f32, MVT::v8i16, 15 }, |
1340 | 5.05k | { ISD::UINT_TO_FP, MVT::v2f64, MVT::v8i16, 8*10 }, |
1341 | 5.05k | { ISD::UINT_TO_FP, MVT::v2f64, MVT::v4i32, 4*10 }, |
1342 | 5.05k | { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 8 }, |
1343 | 5.05k | { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 2*10 }, |
1344 | 5.05k | { ISD::UINT_TO_FP, MVT::v4f32, MVT::v2i64, 15 }, |
1345 | 5.05k | |
1346 | 5.05k | { ISD::FP_TO_SINT, MVT::v2i32, MVT::v2f64, 3 }, |
1347 | 5.05k | |
1348 | 5.05k | { ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i8, 1 }, |
1349 | 5.05k | { ISD::SIGN_EXTEND, MVT::v4i16, MVT::v4i8, 6 }, |
1350 | 5.05k | { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i8, 2 }, |
1351 | 5.05k | { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i8, 3 }, |
1352 | 5.05k | { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i8, 4 }, |
1353 | 5.05k | { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i8, 8 }, |
1354 | 5.05k | { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i8, 1 }, |
1355 | 5.05k | { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i8, 2 }, |
1356 | 5.05k | { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i8, 6 }, |
1357 | 5.05k | { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 6 }, |
1358 | 5.05k | { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 3 }, |
1359 | 5.05k | { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 4 }, |
1360 | 5.05k | { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 9 }, |
1361 | 5.05k | { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 12 }, |
1362 | 5.05k | { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i16, 1 }, |
1363 | 5.05k | { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i16, 2 }, |
1364 | 5.05k | { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i16, 3 }, |
1365 | 5.05k | { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 10 }, |
1366 | 5.05k | { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 3 }, |
1367 | 5.05k | { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 4 }, |
1368 | 5.05k | { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i16, 6 }, |
1369 | 5.05k | { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i16, 8 }, |
1370 | 5.05k | { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, 3 }, |
1371 | 5.05k | { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, 5 }, |
1372 | 5.05k | |
1373 | 5.05k | { ISD::TRUNCATE, MVT::v4i8, MVT::v4i16, 4 }, |
1374 | 5.05k | { ISD::TRUNCATE, MVT::v8i8, MVT::v8i16, 2 }, |
1375 | 5.05k | { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 3 }, |
1376 | 5.05k | { ISD::TRUNCATE, MVT::v4i8, MVT::v4i32, 3 }, |
1377 | 5.05k | { ISD::TRUNCATE, MVT::v4i16, MVT::v4i32, 3 }, |
1378 | 5.05k | { ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, 4 }, |
1379 | 5.05k | { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 7 }, |
1380 | 5.05k | { ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, 5 }, |
1381 | 5.05k | { ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, 10 }, |
1382 | 5.05k | }; |
1383 | 5.05k | |
1384 | 5.05k | std::pair<int, MVT> LTSrc = TLI->getTypeLegalizationCost(DL, Src); |
1385 | 5.05k | std::pair<int, MVT> LTDest = TLI->getTypeLegalizationCost(DL, Dst); |
1386 | 5.05k | |
1387 | 5.05k | if (ST->hasSSE2() && 5.05k !ST->hasAVX()5.05k ) { |
1388 | 2.20k | if (const auto *Entry = ConvertCostTableLookup(SSE2ConversionTbl, ISD, |
1389 | 2.20k | LTDest.second, LTSrc.second)) |
1390 | 250 | return LTSrc.first * Entry->Cost; |
1391 | 4.80k | } |
1392 | 4.80k | |
1393 | 4.80k | EVT SrcTy = TLI->getValueType(DL, Src); |
1394 | 4.80k | EVT DstTy = TLI->getValueType(DL, Dst); |
1395 | 4.80k | |
1396 | 4.80k | // The function getSimpleVT only handles simple value types. |
1397 | 4.80k | if (!SrcTy.isSimple() || 4.80k !DstTy.isSimple()4.76k ) |
1398 | 75 | return BaseT::getCastInstrCost(Opcode, Dst, Src); |
1399 | 4.73k | |
1400 | 4.73k | if (4.73k ST->hasDQI()4.73k ) |
1401 | 280 | if (const auto *280 Entry280 = ConvertCostTableLookup(AVX512DQConversionTbl, ISD, |
1402 | 280 | DstTy.getSimpleVT(), |
1403 | 280 | SrcTy.getSimpleVT())) |
1404 | 42 | return Entry->Cost; |
1405 | 4.68k | |
1406 | 4.68k | if (4.68k ST->hasAVX512()4.68k ) |
1407 | 552 | if (const auto *552 Entry552 = ConvertCostTableLookup(AVX512FConversionTbl, ISD, |
1408 | 552 | DstTy.getSimpleVT(), |
1409 | 552 | SrcTy.getSimpleVT())) |
1410 | 126 | return Entry->Cost; |
1411 | 4.56k | |
1412 | 4.56k | if (4.56k ST->hasAVX2()4.56k ) { |
1413 | 1.44k | if (const auto *Entry = ConvertCostTableLookup(AVX2ConversionTbl, ISD, |
1414 | 1.44k | DstTy.getSimpleVT(), |
1415 | 1.44k | SrcTy.getSimpleVT())) |
1416 | 95 | return Entry->Cost; |
1417 | 4.46k | } |
1418 | 4.46k | |
1419 | 4.46k | if (4.46k ST->hasAVX()4.46k ) { |
1420 | 2.56k | if (const auto *Entry = ConvertCostTableLookup(AVXConversionTbl, ISD, |
1421 | 2.56k | DstTy.getSimpleVT(), |
1422 | 2.56k | SrcTy.getSimpleVT())) |
1423 | 474 | return Entry->Cost; |
1424 | 3.99k | } |
1425 | 3.99k | |
1426 | 3.99k | if (3.99k ST->hasSSE41()3.99k ) { |
1427 | 2.35k | if (const auto *Entry = ConvertCostTableLookup(SSE41ConversionTbl, ISD, |
1428 | 2.35k | DstTy.getSimpleVT(), |
1429 | 2.35k | SrcTy.getSimpleVT())) |
1430 | 84 | return Entry->Cost; |
1431 | 3.90k | } |
1432 | 3.90k | |
1433 | 3.90k | if (3.90k ST->hasSSE2()3.90k ) { |
1434 | 3.90k | if (const auto *Entry = ConvertCostTableLookup(SSE2ConversionTbl, ISD, |
1435 | 3.90k | DstTy.getSimpleVT(), |
1436 | 3.90k | SrcTy.getSimpleVT())) |
1437 | 141 | return Entry->Cost; |
1438 | 3.76k | } |
1439 | 3.76k | |
1440 | 3.76k | return BaseT::getCastInstrCost(Opcode, Dst, Src); |
1441 | 3.76k | } |
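
The conversion-cost lookups above all follow the same pattern: legalize the types, then probe per-feature-level tables from the newest ISA extension down to SSE2, returning the first hit scaled by the legalization factor. Below is a minimal, self-contained sketch of that table-lookup pattern; the enum values and helper names are illustrative stand-ins, not LLVM's actual MVT/CostTblEntry/ConvertCostTableLookup declarations.

#include <cstddef>
#include <optional>

enum class SimpleVT { v4i32, v8i32, v4f32, v4f64 };

struct ConvEntry {
  int ISD;        // conversion opcode tag (e.g. a SINT_TO_FP marker)
  SimpleVT Dst;   // destination vector type
  SimpleVT Src;   // source vector type
  int Cost;       // cost of one legalized conversion
};

// Linear scan over one tier's table; a miss falls through to the next
// (older) feature level, and ultimately to the base implementation.
template <std::size_t N>
std::optional<int> lookupConvCost(const ConvEntry (&Tbl)[N], int ISD,
                                  SimpleVT Dst, SimpleVT Src) {
  for (const ConvEntry &E : Tbl)
    if (E.ISD == ISD && E.Dst == Dst && E.Src == Src)
      return E.Cost;
  return std::nullopt;
}

Consulting the newest tier first matters because later ISA levels usually have dedicated instructions, so their entries are cheaper and should win over the generic fallbacks.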
1442 | | |
1443 | | int X86TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, |
1444 | 11.2k | const Instruction *I) { |
1445 | 11.2k | // Legalize the type. |
1446 | 11.2k | std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy); |
1447 | 11.2k | |
1448 | 11.2k | MVT MTy = LT.second; |
1449 | 11.2k | |
1450 | 11.2k | int ISD = TLI->InstructionOpcodeToISD(Opcode); |
1451 | 11.2k | assert(ISD && "Invalid opcode"); |
1452 | 11.2k | |
1453 | 11.2k | static const CostTblEntry SSE2CostTbl[] = { |
1454 | 11.2k | { ISD::SETCC, MVT::v2i64, 8 }, |
1455 | 11.2k | { ISD::SETCC, MVT::v4i32, 1 }, |
1456 | 11.2k | { ISD::SETCC, MVT::v8i16, 1 }, |
1457 | 11.2k | { ISD::SETCC, MVT::v16i8, 1 }, |
1458 | 11.2k | }; |
1459 | 11.2k | |
1460 | 11.2k | static const CostTblEntry SSE42CostTbl[] = { |
1461 | 11.2k | { ISD::SETCC, MVT::v2f64, 1 }, |
1462 | 11.2k | { ISD::SETCC, MVT::v4f32, 1 }, |
1463 | 11.2k | { ISD::SETCC, MVT::v2i64, 1 }, |
1464 | 11.2k | }; |
1465 | 11.2k | |
1466 | 11.2k | static const CostTblEntry AVX1CostTbl[] = { |
1467 | 11.2k | { ISD::SETCC, MVT::v4f64, 1 }, |
1468 | 11.2k | { ISD::SETCC, MVT::v8f32, 1 }, |
1469 | 11.2k | // AVX1 does not support 8-wide integer compare. |
1470 | 11.2k | { ISD::SETCC, MVT::v4i64, 4 }, |
1471 | 11.2k | { ISD::SETCC, MVT::v8i32, 4 }, |
1472 | 11.2k | { ISD::SETCC, MVT::v16i16, 4 }, |
1473 | 11.2k | { ISD::SETCC, MVT::v32i8, 4 }, |
1474 | 11.2k | }; |
1475 | 11.2k | |
1476 | 11.2k | static const CostTblEntry AVX2CostTbl[] = { |
1477 | 11.2k | { ISD::SETCC, MVT::v4i64, 1 }, |
1478 | 11.2k | { ISD::SETCC, MVT::v8i32, 1 }, |
1479 | 11.2k | { ISD::SETCC, MVT::v16i16, 1 }, |
1480 | 11.2k | { ISD::SETCC, MVT::v32i8, 1 }, |
1481 | 11.2k | }; |
1482 | 11.2k | |
1483 | 11.2k | static const CostTblEntry AVX512CostTbl[] = { |
1484 | 11.2k | { ISD::SETCC, MVT::v8i64, 1 }, |
1485 | 11.2k | { ISD::SETCC, MVT::v16i32, 1 }, |
1486 | 11.2k | { ISD::SETCC, MVT::v8f64, 1 }, |
1487 | 11.2k | { ISD::SETCC, MVT::v16f32, 1 }, |
1488 | 11.2k | }; |
1489 | 11.2k | |
1490 | 11.2k | if (ST->hasAVX512()) |
1491 | 292 | if (const auto *292 Entry292 = CostTableLookup(AVX512CostTbl, ISD, MTy)) |
1492 | 17 | return LT.first * Entry->Cost; |
1493 | 11.2k | |
1494 | 11.2k | if (11.2k ST->hasAVX2()11.2k ) |
1495 | 1.04k | if (const auto *1.04k Entry1.04k = CostTableLookup(AVX2CostTbl, ISD, MTy)) |
1496 | 33 | return LT.first * Entry->Cost; |
1497 | 11.2k | |
1498 | 11.2k | if (11.2k ST->hasAVX()11.2k ) |
1499 | 1.23k | if (const auto *1.23k Entry1.23k = CostTableLookup(AVX1CostTbl, ISD, MTy)) |
1500 | 24 | return LT.first * Entry->Cost; |
1501 | 11.2k | |
1502 | 11.2k | if (11.2k ST->hasSSE42()11.2k ) |
1503 | 1.34k | if (const auto *1.34k Entry1.34k = CostTableLookup(SSE42CostTbl, ISD, MTy)) |
1504 | 222 | return LT.first * Entry->Cost; |
1505 | 10.9k | |
1506 | 10.9k | if (10.9k ST->hasSSE2()10.9k ) |
1507 | 10.9k | if (const auto *10.9k Entry10.9k = CostTableLookup(SSE2CostTbl, ISD, MTy)) |
1508 | 530 | return LT.first * Entry->Cost; |
1509 | 10.4k | |
1510 | 10.4k | return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, I); |
1511 | 10.4k | } |
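
A small worked example of the LT.first scaling used in the lookups above, with assumed (not measured) numbers: a v16i32 compare on an SSE2-only target is legalized into four v4i32 compares, so the table cost of 1 is multiplied by a legalization factor of 4.

// Hypothetical illustration of LT.first * Entry->Cost; the values below are
// assumed for the example, not taken from a real legalization query.
int cmpSelCostExample() {
  const int LegalizedCopies = 4; // LT.first: v16i32 splits into 4 x v4i32
  const int TableCost = 1;       // SSE2 entry { ISD::SETCC, MVT::v4i32, 1 }
  return LegalizedCopies * TableCost; // 4
}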
1512 | | |
1513 | 8 | unsigned X86TTIImpl::getAtomicMemIntrinsicMaxElementSize() const { return 16; } |
1514 | | |
1515 | | int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy, |
1516 | | ArrayRef<Type *> Tys, FastMathFlags FMF, |
1517 | 4.15k | unsigned ScalarizationCostPassed) { |
1518 | 4.15k | // Costs should match the codegen from: |
1519 | 4.15k | // BITREVERSE: llvm\test\CodeGen\X86\vector-bitreverse.ll |
1520 | 4.15k | // BSWAP: llvm\test\CodeGen\X86\bswap-vector.ll |
1521 | 4.15k | // CTLZ: llvm\test\CodeGen\X86\vector-lzcnt-*.ll |
1522 | 4.15k | // CTPOP: llvm\test\CodeGen\X86\vector-popcnt-*.ll |
1523 | 4.15k | // CTTZ: llvm\test\CodeGen\X86\vector-tzcnt-*.ll |
1524 | 4.15k | static const CostTblEntry AVX512CDCostTbl[] = { |
1525 | 4.15k | { ISD::CTLZ, MVT::v8i64, 1 }, |
1526 | 4.15k | { ISD::CTLZ, MVT::v16i32, 1 }, |
1527 | 4.15k | { ISD::CTLZ, MVT::v32i16, 8 }, |
1528 | 4.15k | { ISD::CTLZ, MVT::v64i8, 20 }, |
1529 | 4.15k | { ISD::CTLZ, MVT::v4i64, 1 }, |
1530 | 4.15k | { ISD::CTLZ, MVT::v8i32, 1 }, |
1531 | 4.15k | { ISD::CTLZ, MVT::v16i16, 4 }, |
1532 | 4.15k | { ISD::CTLZ, MVT::v32i8, 10 }, |
1533 | 4.15k | { ISD::CTLZ, MVT::v2i64, 1 }, |
1534 | 4.15k | { ISD::CTLZ, MVT::v4i32, 1 }, |
1535 | 4.15k | { ISD::CTLZ, MVT::v8i16, 4 }, |
1536 | 4.15k | { ISD::CTLZ, MVT::v16i8, 4 }, |
1537 | 4.15k | }; |
1538 | 4.15k | static const CostTblEntry AVX512BWCostTbl[] = { |
1539 | 4.15k | { ISD::BITREVERSE, MVT::v8i64, 5 }, |
1540 | 4.15k | { ISD::BITREVERSE, MVT::v16i32, 5 }, |
1541 | 4.15k | { ISD::BITREVERSE, MVT::v32i16, 5 }, |
1542 | 4.15k | { ISD::BITREVERSE, MVT::v64i8, 5 }, |
1543 | 4.15k | { ISD::CTLZ, MVT::v8i64, 23 }, |
1544 | 4.15k | { ISD::CTLZ, MVT::v16i32, 22 }, |
1545 | 4.15k | { ISD::CTLZ, MVT::v32i16, 18 }, |
1546 | 4.15k | { ISD::CTLZ, MVT::v64i8, 17 }, |
1547 | 4.15k | { ISD::CTPOP, MVT::v8i64, 7 }, |
1548 | 4.15k | { ISD::CTPOP, MVT::v16i32, 11 }, |
1549 | 4.15k | { ISD::CTPOP, MVT::v32i16, 9 }, |
1550 | 4.15k | { ISD::CTPOP, MVT::v64i8, 6 }, |
1551 | 4.15k | { ISD::CTTZ, MVT::v8i64, 10 }, |
1552 | 4.15k | { ISD::CTTZ, MVT::v16i32, 14 }, |
1553 | 4.15k | { ISD::CTTZ, MVT::v32i16, 12 }, |
1554 | 4.15k | { ISD::CTTZ, MVT::v64i8, 9 }, |
1555 | 4.15k | }; |
1556 | 4.15k | static const CostTblEntry AVX512CostTbl[] = { |
1557 | 4.15k | { ISD::BITREVERSE, MVT::v8i64, 36 }, |
1558 | 4.15k | { ISD::BITREVERSE, MVT::v16i32, 24 }, |
1559 | 4.15k | { ISD::CTLZ, MVT::v8i64, 29 }, |
1560 | 4.15k | { ISD::CTLZ, MVT::v16i32, 35 }, |
1561 | 4.15k | { ISD::CTPOP, MVT::v8i64, 16 }, |
1562 | 4.15k | { ISD::CTPOP, MVT::v16i32, 24 }, |
1563 | 4.15k | { ISD::CTTZ, MVT::v8i64, 20 }, |
1564 | 4.15k | { ISD::CTTZ, MVT::v16i32, 28 }, |
1565 | 4.15k | }; |
1566 | 4.15k | static const CostTblEntry XOPCostTbl[] = { |
1567 | 4.15k | { ISD::BITREVERSE, MVT::v4i64, 4 }, |
1568 | 4.15k | { ISD::BITREVERSE, MVT::v8i32, 4 }, |
1569 | 4.15k | { ISD::BITREVERSE, MVT::v16i16, 4 }, |
1570 | 4.15k | { ISD::BITREVERSE, MVT::v32i8, 4 }, |
1571 | 4.15k | { ISD::BITREVERSE, MVT::v2i64, 1 }, |
1572 | 4.15k | { ISD::BITREVERSE, MVT::v4i32, 1 }, |
1573 | 4.15k | { ISD::BITREVERSE, MVT::v8i16, 1 }, |
1574 | 4.15k | { ISD::BITREVERSE, MVT::v16i8, 1 }, |
1575 | 4.15k | { ISD::BITREVERSE, MVT::i64, 3 }, |
1576 | 4.15k | { ISD::BITREVERSE, MVT::i32, 3 }, |
1577 | 4.15k | { ISD::BITREVERSE, MVT::i16, 3 }, |
1578 | 4.15k | { ISD::BITREVERSE, MVT::i8, 3 } |
1579 | 4.15k | }; |
1580 | 4.15k | static const CostTblEntry AVX2CostTbl[] = { |
1581 | 4.15k | { ISD::BITREVERSE, MVT::v4i64, 5 }, |
1582 | 4.15k | { ISD::BITREVERSE, MVT::v8i32, 5 }, |
1583 | 4.15k | { ISD::BITREVERSE, MVT::v16i16, 5 }, |
1584 | 4.15k | { ISD::BITREVERSE, MVT::v32i8, 5 }, |
1585 | 4.15k | { ISD::BSWAP, MVT::v4i64, 1 }, |
1586 | 4.15k | { ISD::BSWAP, MVT::v8i32, 1 }, |
1587 | 4.15k | { ISD::BSWAP, MVT::v16i16, 1 }, |
1588 | 4.15k | { ISD::CTLZ, MVT::v4i64, 23 }, |
1589 | 4.15k | { ISD::CTLZ, MVT::v8i32, 18 }, |
1590 | 4.15k | { ISD::CTLZ, MVT::v16i16, 14 }, |
1591 | 4.15k | { ISD::CTLZ, MVT::v32i8, 9 }, |
1592 | 4.15k | { ISD::CTPOP, MVT::v4i64, 7 }, |
1593 | 4.15k | { ISD::CTPOP, MVT::v8i32, 11 }, |
1594 | 4.15k | { ISD::CTPOP, MVT::v16i16, 9 }, |
1595 | 4.15k | { ISD::CTPOP, MVT::v32i8, 6 }, |
1596 | 4.15k | { ISD::CTTZ, MVT::v4i64, 10 }, |
1597 | 4.15k | { ISD::CTTZ, MVT::v8i32, 14 }, |
1598 | 4.15k | { ISD::CTTZ, MVT::v16i16, 12 }, |
1599 | 4.15k | { ISD::CTTZ, MVT::v32i8, 9 }, |
1600 | 4.15k | { ISD::FSQRT, MVT::f32, 7 }, // Haswell from http://www.agner.org/ |
1601 | 4.15k | { ISD::FSQRT, MVT::v4f32, 7 }, // Haswell from http://www.agner.org/ |
1602 | 4.15k | { ISD::FSQRT, MVT::v8f32, 14 }, // Haswell from http://www.agner.org/ |
1603 | 4.15k | { ISD::FSQRT, MVT::f64, 14 }, // Haswell from http://www.agner.org/ |
1604 | 4.15k | { ISD::FSQRT, MVT::v2f64, 14 }, // Haswell from http://www.agner.org/ |
1605 | 4.15k | { ISD::FSQRT, MVT::v4f64, 28 }, // Haswell from http://www.agner.org/ |
1606 | 4.15k | }; |
1607 | 4.15k | static const CostTblEntry AVX1CostTbl[] = { |
1608 | 4.15k | { ISD::BITREVERSE, MVT::v4i64, 12 }, // 2 x 128-bit Op + extract/insert |
1609 | 4.15k | { ISD::BITREVERSE, MVT::v8i32, 12 }, // 2 x 128-bit Op + extract/insert |
1610 | 4.15k | { ISD::BITREVERSE, MVT::v16i16, 12 }, // 2 x 128-bit Op + extract/insert |
1611 | 4.15k | { ISD::BITREVERSE, MVT::v32i8, 12 }, // 2 x 128-bit Op + extract/insert |
1612 | 4.15k | { ISD::BSWAP, MVT::v4i64, 4 }, |
1613 | 4.15k | { ISD::BSWAP, MVT::v8i32, 4 }, |
1614 | 4.15k | { ISD::BSWAP, MVT::v16i16, 4 }, |
1615 | 4.15k | { ISD::CTLZ, MVT::v4i64, 48 }, // 2 x 128-bit Op + extract/insert |
1616 | 4.15k | { ISD::CTLZ, MVT::v8i32, 38 }, // 2 x 128-bit Op + extract/insert |
1617 | 4.15k | { ISD::CTLZ, MVT::v16i16, 30 }, // 2 x 128-bit Op + extract/insert |
1618 | 4.15k | { ISD::CTLZ, MVT::v32i8, 20 }, // 2 x 128-bit Op + extract/insert |
1619 | 4.15k | { ISD::CTPOP, MVT::v4i64, 16 }, // 2 x 128-bit Op + extract/insert |
1620 | 4.15k | { ISD::CTPOP, MVT::v8i32, 24 }, // 2 x 128-bit Op + extract/insert |
1621 | 4.15k | { ISD::CTPOP, MVT::v16i16, 20 }, // 2 x 128-bit Op + extract/insert |
1622 | 4.15k | { ISD::CTPOP, MVT::v32i8, 14 }, // 2 x 128-bit Op + extract/insert |
1623 | 4.15k | { ISD::CTTZ, MVT::v4i64, 22 }, // 2 x 128-bit Op + extract/insert |
1624 | 4.15k | { ISD::CTTZ, MVT::v8i32, 30 }, // 2 x 128-bit Op + extract/insert |
1625 | 4.15k | { ISD::CTTZ, MVT::v16i16, 26 }, // 2 x 128-bit Op + extract/insert |
1626 | 4.15k | { ISD::CTTZ, MVT::v32i8, 20 }, // 2 x 128-bit Op + extract/insert |
1627 | 4.15k | { ISD::FSQRT, MVT::f32, 14 }, // SNB from http://www.agner.org/ |
1628 | 4.15k | { ISD::FSQRT, MVT::v4f32, 14 }, // SNB from http://www.agner.org/ |
1629 | 4.15k | { ISD::FSQRT, MVT::v8f32, 28 }, // SNB from http://www.agner.org/ |
1630 | 4.15k | { ISD::FSQRT, MVT::f64, 21 }, // SNB from http://www.agner.org/ |
1631 | 4.15k | { ISD::FSQRT, MVT::v2f64, 21 }, // SNB from http://www.agner.org/ |
1632 | 4.15k | { ISD::FSQRT, MVT::v4f64, 43 }, // SNB from http://www.agner.org/ |
1633 | 4.15k | }; |
1634 | 4.15k | static const CostTblEntry SSE42CostTbl[] = { |
1635 | 4.15k | { ISD::FSQRT, MVT::f32, 18 }, // Nehalem from http://www.agner.org/ |
1636 | 4.15k | { ISD::FSQRT, MVT::v4f32, 18 }, // Nehalem from http://www.agner.org/ |
1637 | 4.15k | }; |
1638 | 4.15k | static const CostTblEntry SSSE3CostTbl[] = { |
1639 | 4.15k | { ISD::BITREVERSE, MVT::v2i64, 5 }, |
1640 | 4.15k | { ISD::BITREVERSE, MVT::v4i32, 5 }, |
1641 | 4.15k | { ISD::BITREVERSE, MVT::v8i16, 5 }, |
1642 | 4.15k | { ISD::BITREVERSE, MVT::v16i8, 5 }, |
1643 | 4.15k | { ISD::BSWAP, MVT::v2i64, 1 }, |
1644 | 4.15k | { ISD::BSWAP, MVT::v4i32, 1 }, |
1645 | 4.15k | { ISD::BSWAP, MVT::v8i16, 1 }, |
1646 | 4.15k | { ISD::CTLZ, MVT::v2i64, 23 }, |
1647 | 4.15k | { ISD::CTLZ, MVT::v4i32, 18 }, |
1648 | 4.15k | { ISD::CTLZ, MVT::v8i16, 14 }, |
1649 | 4.15k | { ISD::CTLZ, MVT::v16i8, 9 }, |
1650 | 4.15k | { ISD::CTPOP, MVT::v2i64, 7 }, |
1651 | 4.15k | { ISD::CTPOP, MVT::v4i32, 11 }, |
1652 | 4.15k | { ISD::CTPOP, MVT::v8i16, 9 }, |
1653 | 4.15k | { ISD::CTPOP, MVT::v16i8, 6 }, |
1654 | 4.15k | { ISD::CTTZ, MVT::v2i64, 10 }, |
1655 | 4.15k | { ISD::CTTZ, MVT::v4i32, 14 }, |
1656 | 4.15k | { ISD::CTTZ, MVT::v8i16, 12 }, |
1657 | 4.15k | { ISD::CTTZ, MVT::v16i8, 9 } |
1658 | 4.15k | }; |
1659 | 4.15k | static const CostTblEntry SSE2CostTbl[] = { |
1660 | 4.15k | { ISD::BITREVERSE, MVT::v2i64, 29 }, |
1661 | 4.15k | { ISD::BITREVERSE, MVT::v4i32, 27 }, |
1662 | 4.15k | { ISD::BITREVERSE, MVT::v8i16, 27 }, |
1663 | 4.15k | { ISD::BITREVERSE, MVT::v16i8, 20 }, |
1664 | 4.15k | { ISD::BSWAP, MVT::v2i64, 7 }, |
1665 | 4.15k | { ISD::BSWAP, MVT::v4i32, 7 }, |
1666 | 4.15k | { ISD::BSWAP, MVT::v8i16, 7 }, |
1667 | 4.15k | { ISD::CTLZ, MVT::v2i64, 25 }, |
1668 | 4.15k | { ISD::CTLZ, MVT::v4i32, 26 }, |
1669 | 4.15k | { ISD::CTLZ, MVT::v8i16, 20 }, |
1670 | 4.15k | { ISD::CTLZ, MVT::v16i8, 17 }, |
1671 | 4.15k | { ISD::CTPOP, MVT::v2i64, 12 }, |
1672 | 4.15k | { ISD::CTPOP, MVT::v4i32, 15 }, |
1673 | 4.15k | { ISD::CTPOP, MVT::v8i16, 13 }, |
1674 | 4.15k | { ISD::CTPOP, MVT::v16i8, 10 }, |
1675 | 4.15k | { ISD::CTTZ, MVT::v2i64, 14 }, |
1676 | 4.15k | { ISD::CTTZ, MVT::v4i32, 18 }, |
1677 | 4.15k | { ISD::CTTZ, MVT::v8i16, 16 }, |
1678 | 4.15k | { ISD::CTTZ, MVT::v16i8, 13 }, |
1679 | 4.15k | { ISD::FSQRT, MVT::f64, 32 }, // Nehalem from http://www.agner.org/ |
1680 | 4.15k | { ISD::FSQRT, MVT::v2f64, 32 }, // Nehalem from http://www.agner.org/ |
1681 | 4.15k | }; |
1682 | 4.15k | static const CostTblEntry SSE1CostTbl[] = { |
1683 | 4.15k | { ISD::FSQRT, MVT::f32, 28 }, // Pentium III from http://www.agner.org/ |
1684 | 4.15k | { ISD::FSQRT, MVT::v4f32, 56 }, // Pentium III from http://www.agner.org/ |
1685 | 4.15k | }; |
1686 | 4.15k | static const CostTblEntry X64CostTbl[] = { // 64-bit targets |
1687 | 4.15k | { ISD::BITREVERSE, MVT::i64, 14 } |
1688 | 4.15k | }; |
1689 | 4.15k | static const CostTblEntry X86CostTbl[] = { // 32 or 64-bit targets |
1690 | 4.15k | { ISD::BITREVERSE, MVT::i32, 14 }, |
1691 | 4.15k | { ISD::BITREVERSE, MVT::i16, 14 }, |
1692 | 4.15k | { ISD::BITREVERSE, MVT::i8, 11 } |
1693 | 4.15k | }; |
1694 | 4.15k | |
1695 | 4.15k | unsigned ISD = ISD::DELETED_NODE; |
1696 | 4.15k | switch (IID) { |
1697 | 2.22k | default: |
1698 | 2.22k | break; |
1699 | 320 | case Intrinsic::bitreverse: |
1700 | 320 | ISD = ISD::BITREVERSE; |
1701 | 320 | break; |
1702 | 86 | case Intrinsic::bswap: |
1703 | 86 | ISD = ISD::BSWAP; |
1704 | 86 | break; |
1705 | 642 | case Intrinsic::ctlz: |
1706 | 642 | ISD = ISD::CTLZ; |
1707 | 642 | break; |
1708 | 238 | case Intrinsic::ctpop: |
1709 | 238 | ISD = ISD::CTPOP; |
1710 | 238 | break; |
1711 | 506 | case Intrinsic::cttz: |
1712 | 506 | ISD = ISD::CTTZ; |
1713 | 506 | break; |
1714 | 138 | case Intrinsic::sqrt: |
1715 | 138 | ISD = ISD::FSQRT; |
1716 | 138 | break; |
1717 | 4.15k | } |
1718 | 4.15k | |
1719 | 4.15k | // Legalize the type. |
1720 | 4.15k | std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, RetTy); |
1721 | 4.15k | MVT MTy = LT.second; |
1722 | 4.15k | |
1723 | 4.15k | // Attempt to lookup cost. |
1724 | 4.15k | if (ST->hasCDI()) |
1725 | 316 | if (const auto *316 Entry316 = CostTableLookup(AVX512CDCostTbl, ISD, MTy)) |
1726 | 24 | return LT.first * Entry->Cost; |
1727 | 4.12k | |
1728 | 4.12k | if (4.12k ST->hasBWI()4.12k ) |
1729 | 268 | if (const auto *268 Entry268 = CostTableLookup(AVX512BWCostTbl, ISD, MTy)) |
1730 | 28 | return LT.first * Entry->Cost; |
1731 | 4.10k | |
1732 | 4.10k | if (4.10k ST->hasAVX512()4.10k ) |
1733 | 392 | if (const auto *392 Entry392 = CostTableLookup(AVX512CostTbl, ISD, MTy)) |
1734 | 14 | return LT.first * Entry->Cost; |
1735 | 4.08k | |
1736 | 4.08k | if (4.08k ST->hasXOP()4.08k ) |
1737 | 304 | if (const auto *304 Entry304 = CostTableLookup(XOPCostTbl, ISD, MTy)) |
1738 | 68 | return LT.first * Entry->Cost; |
1739 | 4.01k | |
1740 | 4.01k | if (4.01k ST->hasAVX2()4.01k ) |
1741 | 989 | if (const auto *989 Entry989 = CostTableLookup(AVX2CostTbl, ISD, MTy)) |
1742 | 239 | return LT.first * Entry->Cost; |
1743 | 3.77k | |
1744 | 3.77k | if (3.77k ST->hasAVX()3.77k ) |
1745 | 1.55k | if (const auto *1.55k Entry1.55k = CostTableLookup(AVX1CostTbl, ISD, MTy)) |
1746 | 165 | return LT.first * Entry->Cost; |
1747 | 3.61k | |
1748 | 3.61k | if (3.61k ST->hasSSE42()3.61k ) |
1749 | 1.84k | if (const auto *1.84k Entry1.84k = CostTableLookup(SSE42CostTbl, ISD, MTy)) |
1750 | 4 | return LT.first * Entry->Cost; |
1751 | 3.61k | |
1752 | 3.61k | if (3.61k ST->hasSSSE3()3.61k ) |
1753 | 2.22k | if (const auto *2.22k Entry2.22k = CostTableLookup(SSSE3CostTbl, ISD, MTy)) |
1754 | 517 | return LT.first * Entry->Cost; |
1755 | 3.09k | |
1756 | 3.09k | if (3.09k ST->hasSSE2()3.09k ) |
1757 | 3.09k | if (const auto *3.09k Entry3.09k = CostTableLookup(SSE2CostTbl, ISD, MTy)) |
1758 | 223 | return LT.first * Entry->Cost; |
1759 | 2.87k | |
1760 | 2.87k | if (2.87k ST->hasSSE1()2.87k ) |
1761 | 2.87k | if (const auto *2.87k Entry2.87k = CostTableLookup(SSE1CostTbl, ISD, MTy)) |
1762 | 18 | return LT.first * Entry->Cost; |
1763 | 2.85k | |
1764 | 2.85k | if (2.85k ST->is64Bit()2.85k ) |
1765 | 2.60k | if (const auto *2.60k Entry2.60k = CostTableLookup(X64CostTbl, ISD, MTy)) |
1766 | 13 | return LT.first * Entry->Cost; |
1767 | 2.83k | |
1768 | 2.83k | if (const auto *2.83k Entry2.83k = CostTableLookup(X86CostTbl, ISD, MTy)) |
1769 | 65 | return LT.first * Entry->Cost; |
1770 | 2.77k | |
1771 | 2.77k | return BaseT::getIntrinsicInstrCost(IID, RetTy, Tys, FMF, ScalarizationCostPassed); |
1772 | 2.77k | } |
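
The intrinsic costing above first maps the intrinsic ID onto an ISD node and then walks the same newest-feature-first table cascade. The sketch below restates only that mapping step with stand-in enums (not LLVM's Intrinsic or ISD namespaces); unknown intrinsics keep a "no node" value, so every table probe misses and the base cost is used.

enum class Intr { Bitreverse, Bswap, Ctlz, Ctpop, Cttz, Sqrt, Other };
enum class Node { BITREVERSE, BSWAP, CTLZ, CTPOP, CTTZ, FSQRT, NONE };

Node mapIntrinsicToNode(Intr IID) {
  switch (IID) {
  case Intr::Bitreverse: return Node::BITREVERSE;
  case Intr::Bswap:      return Node::BSWAP;
  case Intr::Ctlz:       return Node::CTLZ;
  case Intr::Ctpop:      return Node::CTPOP;
  case Intr::Cttz:       return Node::CTTZ;
  case Intr::Sqrt:       return Node::FSQRT;
  default:               return Node::NONE; // no table entry will ever match
  }
}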
1773 | | |
1774 | | int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy, |
1775 | 2.57k | ArrayRef<Value *> Args, FastMathFlags FMF, unsigned VF) { |
1776 | 2.57k | return BaseT::getIntrinsicInstrCost(IID, RetTy, Args, FMF, VF); |
1777 | 2.57k | } |
1778 | | |
1779 | 89.6k | int X86TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index) { |
1780 | 89.6k | assert(Val->isVectorTy() && "This must be a vector type"); |
1781 | 89.6k | |
1782 | 89.6k | Type *ScalarType = Val->getScalarType(); |
1783 | 89.6k | |
1784 | 89.6k | if (Index != -1U89.6k ) { |
1785 | 89.6k | // Legalize the type. |
1786 | 89.6k | std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Val); |
1787 | 89.6k | |
1788 | 89.6k | // This type is legalized to a scalar type. |
1789 | 89.6k | if (!LT.second.isVector()) |
1790 | 921 | return 0; |
1791 | 88.7k | |
1792 | 88.7k | // The type may be split. Normalize the index to the new type. |
1793 | 88.7k | unsigned Width = LT.second.getVectorNumElements(); |
1794 | 88.7k | Index = Index % Width; |
1795 | 88.7k | |
1796 | 88.7k | // Floating point scalars are already located in index #0. |
1797 | 88.7k | if (ScalarType->isFloatingPointTy() && 88.7k Index == 033.6k ) |
1798 | 9.70k | return 0; |
1799 | 79.0k | } |
1800 | 79.0k | |
1801 | 79.0k | // Add to the base cost if we know that the extracted element of a vector is |
1802 | 79.0k | // destined to be moved to and used in the integer register file. |
1803 | 79.0k | int RegisterFileMoveCost = 0; |
1804 | 79.0k | if (Opcode == Instruction::ExtractElement && 79.0k ScalarType->isPointerTy()36.4k ) |
1805 | 1.72k | RegisterFileMoveCost = 1; |
1806 | 89.6k | |
1807 | 89.6k | return BaseT::getVectorInstrCost(Opcode, Val, Index) + RegisterFileMoveCost; |
1808 | 89.6k | } |
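
When a vector type is split during legalization, the element index is folded back into the width of one legal register before costing. A tiny sketch with illustrative values: a v8f32 extract of element 6 on a 4-wide target becomes element 2 of one of the halves.

#include <cassert>

// Index normalization after type splitting; LegalWidth is the element count
// of one legalized register (assumed non-zero, i.e. still a vector type).
unsigned normalizeExtractIndex(unsigned Index, unsigned LegalWidth) {
  assert(LegalWidth != 0 && "legalized type must still be a vector");
  return Index % LegalWidth; // e.g. 6 % 4 == 2 for v8f32 split into 2 x v4f32
}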
1809 | | |
1810 | | int X86TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment, |
1811 | 17.4k | unsigned AddressSpace, const Instruction *I) { |
1812 | 17.4k | // Handle non-power-of-two vectors such as <3 x float> |
1813 | 17.4k | if (VectorType *VTy17.4k = dyn_cast<VectorType>(Src)) { |
1814 | 9.24k | unsigned NumElem = VTy->getVectorNumElements(); |
1815 | 9.24k | |
1816 | 9.24k | // Handle a few common cases: |
1817 | 9.24k | // <3 x float> |
1818 | 9.24k | if (NumElem == 3 && 9.24k VTy->getScalarSizeInBits() == 324 ) |
1819 | 9.24k | // Cost = 64 bit store + extract + 32 bit store. |
1820 | 2 | return 3; |
1821 | 9.24k | |
1822 | 9.24k | // <3 x double> |
1823 | 9.24k | if (9.24k NumElem == 3 && 9.24k VTy->getScalarSizeInBits() == 642 ) |
1824 | 9.24k | // Cost = 128 bit store + unpack + 64 bit store. |
1825 | 2 | return 3; |
1826 | 9.23k | |
1827 | 9.23k | // Assume that all other non-power-of-two numbers are scalarized. |
1828 | 9.23k | if (9.23k !isPowerOf2_32(NumElem)9.23k ) { |
1829 | 4 | int Cost = BaseT::getMemoryOpCost(Opcode, VTy->getScalarType(), Alignment, |
1830 | 4 | AddressSpace); |
1831 | 4 | int SplitCost = getScalarizationOverhead(Src, Opcode == Instruction::Load, |
1832 | 4 | Opcode == Instruction::Store); |
1833 | 4 | return NumElem * Cost + SplitCost; |
1834 | 4 | } |
1835 | 17.4k | } |
1836 | 17.4k | |
1837 | 17.4k | // Legalize the type. |
1838 | 17.4k | std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Src); |
1839 | 17.4k | assert((Opcode == Instruction::Load || Opcode == Instruction::Store) && |
1840 | 17.4k | "Invalid Opcode"); |
1841 | 17.4k | |
1842 | 17.4k | // Each load/store unit costs 1. |
1843 | 17.4k | int Cost = LT.first * 1; |
1844 | 17.4k | |
1845 | 17.4k | // This isn't exactly right. We're using slow unaligned 32-byte accesses as a |
1846 | 17.4k | // proxy for a double-pumped AVX memory interface such as on Sandybridge. |
1847 | 17.4k | if (LT.second.getStoreSize() == 32 && 17.4k ST->isUnalignedMem32Slow()1.54k ) |
1848 | 553 | Cost *= 2; |
1849 | 17.4k | |
1850 | 17.4k | return Cost; |
1851 | 17.4k | } |
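
A condensed sketch of the load/store costing above with the TTI plumbing stripped away; the parameters are stand-ins for the values the real code obtains from type legalization and the subtarget.

// One unit per legalized load/store, doubled for slow unaligned 32-byte
// accesses (the double-pumped AVX memory interface proxy described above).
int memoryOpCostSketch(int NumLegalizedOps, unsigned StoreSizeBytes,
                       bool UnalignedMem32Slow) {
  int Cost = NumLegalizedOps * 1;
  if (StoreSizeBytes == 32 && UnalignedMem32Slow)
    Cost *= 2;
  return Cost;
}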
1852 | | |
1853 | | int X86TTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *SrcTy, |
1854 | | unsigned Alignment, |
1855 | 149 | unsigned AddressSpace) { |
1856 | 149 | VectorType *SrcVTy = dyn_cast<VectorType>(SrcTy); |
1857 | 149 | if (!SrcVTy) |
1858 | 149 | // To calculate the scalar cost, take the regular cost without the mask.
1859 | 0 | return getMemoryOpCost(Opcode, SrcTy, Alignment, AddressSpace); |
1860 | 149 | |
1861 | 149 | unsigned NumElem = SrcVTy->getVectorNumElements(); |
1862 | 149 | VectorType *MaskTy = |
1863 | 149 | VectorType::get(Type::getInt8Ty(SrcVTy->getContext()), NumElem); |
1864 | 149 | if ((Opcode == Instruction::Load && 149 !isLegalMaskedLoad(SrcVTy)78 ) || |
1865 | 149 | (Opcode == Instruction::Store && 149 !isLegalMaskedStore(SrcVTy)71 ) || |
1866 | 149 | !isPowerOf2_32(NumElem)149 ) { |
1867 | 0 | // Scalarization |
1868 | 0 | int MaskSplitCost = getScalarizationOverhead(MaskTy, false, true); |
1869 | 0 | int ScalarCompareCost = getCmpSelInstrCost( |
1870 | 0 | Instruction::ICmp, Type::getInt8Ty(SrcVTy->getContext()), nullptr); |
1871 | 0 | int BranchCost = getCFInstrCost(Instruction::Br); |
1872 | 0 | int MaskCmpCost = NumElem * (BranchCost + ScalarCompareCost); |
1873 | 0 |
1874 | 0 | int ValueSplitCost = getScalarizationOverhead( |
1875 | 0 | SrcVTy, Opcode == Instruction::Load, Opcode == Instruction::Store); |
1876 | 0 | int MemopCost = |
1877 | 0 | NumElem * BaseT::getMemoryOpCost(Opcode, SrcVTy->getScalarType(), |
1878 | 0 | Alignment, AddressSpace); |
1879 | 0 | return MemopCost + ValueSplitCost + MaskSplitCost + MaskCmpCost; |
1880 | 0 | } |
1881 | 149 | |
1882 | 149 | // Legalize the type. |
1883 | 149 | std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, SrcVTy); |
1884 | 149 | auto VT = TLI->getValueType(DL, SrcVTy); |
1885 | 149 | int Cost = 0; |
1886 | 149 | if (VT.isSimple() && 149 LT.second != VT.getSimpleVT()149 && |
1887 | 33 | LT.second.getVectorNumElements() == NumElem) |
1888 | 149 | // Promotion requires expand/truncate for data and a shuffle for mask. |
1889 | 20 | Cost += getShuffleCost(TTI::SK_Alternate, SrcVTy, 0, nullptr) + |
1890 | 20 | getShuffleCost(TTI::SK_Alternate, MaskTy, 0, nullptr); |
1891 | 149 | |
1892 | 129 | else if (129 LT.second.getVectorNumElements() > NumElem129 ) { |
1893 | 13 | VectorType *NewMaskTy = VectorType::get(MaskTy->getVectorElementType(), |
1894 | 13 | LT.second.getVectorNumElements()); |
1895 | 13 | // Expanding requires filling the mask with zeroes.
1896 | 13 | Cost += getShuffleCost(TTI::SK_InsertSubvector, NewMaskTy, 0, MaskTy); |
1897 | 13 | } |
1898 | 149 | if (!ST->hasAVX512()) |
1899 | 77 | return Cost + LT.first*4; // Each maskmov costs 4 |
1900 | 72 | |
1901 | 72 | // AVX-512 masked load/store is cheaper.
1902 | 72 | return Cost+LT.first; |
1903 | 72 | } |
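
The scalarization branch above combines four components; the sketch below restates that arithmetic with plain integer parameters (the names are illustrative, not the TTI interface).

// Scalarized masked load/store: per-element mask compare + branch, plus
// per-element scalar memory ops, plus splitting both the mask and the data.
int scalarizedMaskedMemOpCost(int NumElem, int BranchCost, int ScalarCmpCost,
                              int ScalarMemOpCost, int MaskSplitCost,
                              int ValueSplitCost) {
  const int MaskCmpCost = NumElem * (BranchCost + ScalarCmpCost);
  const int MemopCost = NumElem * ScalarMemOpCost;
  return MemopCost + ValueSplitCost + MaskSplitCost + MaskCmpCost;
}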
1904 | | |
1905 | | int X86TTIImpl::getAddressComputationCost(Type *Ty, ScalarEvolution *SE, |
1906 | 1.15k | const SCEV *Ptr) { |
1907 | 1.15k | // Address computations in vectorized code with non-consecutive addresses will |
1908 | 1.15k | // likely result in more instructions compared to scalar code where the |
1909 | 1.15k | // computation can more often be merged into the index mode. The resulting |
1910 | 1.15k | // extra micro-ops can significantly decrease throughput. |
1911 | 1.15k | unsigned NumVectorInstToHideOverhead = 10; |
1912 | 1.15k | |
1913 | 1.15k | // Cost modeling of Strided Access Computation is hidden by the indexing |
1914 | 1.15k | // modes of X86 regardless of the stride value. We don't believe that there
1915 | 1.15k | // is a difference between constant strided access in general and a constant
1916 | 1.15k | // stride value which is less than or equal to 64.
1917 | 1.15k | // Even in the case of (loop invariant) stride whose value is not known at |
1918 | 1.15k | // compile time, the address computation will not incur more than one extra |
1919 | 1.15k | // ADD instruction. |
1920 | 1.15k | if (Ty->isVectorTy() && 1.15k SE311 ) { |
1921 | 208 | if (!BaseT::isStridedAccess(Ptr)) |
1922 | 38 | return NumVectorInstToHideOverhead; |
1923 | 170 | if (170 !BaseT::getConstantStrideStep(SE, Ptr)170 ) |
1924 | 0 | return 1; |
1925 | 1.11k | } |
1926 | 1.11k | |
1927 | 1.11k | return BaseT::getAddressComputationCost(Ty, SE, Ptr); |
1928 | 1.11k | } |
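
The decision tree above can be summarized as follows; this is a behavioral sketch with boolean inputs standing in for the SCEV queries, not the real interface.

int addressComputationCostSketch(bool IsVectorTy, bool IsStrided,
                                 bool HasConstantStep, int BaseCost) {
  const int NumVectorInstToHideOverhead = 10;
  if (IsVectorTy) {
    if (!IsStrided)
      return NumVectorInstToHideOverhead; // gather-like addressing is costly
    if (!HasConstantStep)
      return 1;                           // at most one extra ADD per access
  }
  return BaseCost;                        // otherwise defer to the base model
}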
1929 | | |
1930 | | int X86TTIImpl::getArithmeticReductionCost(unsigned Opcode, Type *ValTy, |
1931 | 200 | bool IsPairwise) { |
1932 | 200 | |
1933 | 200 | std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy); |
1934 | 200 | |
1935 | 200 | MVT MTy = LT.second; |
1936 | 200 | |
1937 | 200 | int ISD = TLI->InstructionOpcodeToISD(Opcode); |
1938 | 200 | assert(ISD && "Invalid opcode"); |
1939 | 200 | |
1940 | 200 | // We use the Intel Architecture Code Analyzer (IACA) to measure the throughput
1941 | 200 | // and use that as the cost.
1942 | 200 | |
1943 | 200 | static const CostTblEntry SSE42CostTblPairWise[] = { |
1944 | 200 | { ISD::FADD, MVT::v2f64, 2 }, |
1945 | 200 | { ISD::FADD, MVT::v4f32, 4 }, |
1946 | 200 | { ISD::ADD, MVT::v2i64, 2 }, // The data reported by the IACA tool is "1.6". |
1947 | 200 | { ISD::ADD, MVT::v4i32, 3 }, // The data reported by the IACA tool is "3.5". |
1948 | 200 | { ISD::ADD, MVT::v8i16, 5 }, |
1949 | 200 | }; |
1950 | 200 | |
1951 | 200 | static const CostTblEntry AVX1CostTblPairWise[] = { |
1952 | 200 | { ISD::FADD, MVT::v4f32, 4 }, |
1953 | 200 | { ISD::FADD, MVT::v4f64, 5 }, |
1954 | 200 | { ISD::FADD, MVT::v8f32, 7 }, |
1955 | 200 | { ISD::ADD, MVT::v2i64, 1 }, // The data reported by the IACA tool is "1.5". |
1956 | 200 | { ISD::ADD, MVT::v4i32, 3 }, // The data reported by the IACA tool is "3.5". |
1957 | 200 | { ISD::ADD, MVT::v4i64, 5 }, // The data reported by the IACA tool is "4.8". |
1958 | 200 | { ISD::ADD, MVT::v8i16, 5 }, |
1959 | 200 | { ISD::ADD, MVT::v8i32, 5 }, |
1960 | 200 | }; |
1961 | 200 | |
1962 | 200 | static const CostTblEntry SSE42CostTblNoPairWise[] = { |
1963 | 200 | { ISD::FADD, MVT::v2f64, 2 }, |
1964 | 200 | { ISD::FADD, MVT::v4f32, 4 }, |
1965 | 200 | { ISD::ADD, MVT::v2i64, 2 }, // The data reported by the IACA tool is "1.6". |
1966 | 200 | { ISD::ADD, MVT::v4i32, 3 }, // The data reported by the IACA tool is "3.3". |
1967 | 200 | { ISD::ADD, MVT::v8i16, 4 }, // The data reported by the IACA tool is "4.3". |
1968 | 200 | }; |
1969 | 200 | |
1970 | 200 | static const CostTblEntry AVX1CostTblNoPairWise[] = { |
1971 | 200 | { ISD::FADD, MVT::v4f32, 3 }, |
1972 | 200 | { ISD::FADD, MVT::v4f64, 3 }, |
1973 | 200 | { ISD::FADD, MVT::v8f32, 4 }, |
1974 | 200 | { ISD::ADD, MVT::v2i64, 1 }, // The data reported by the IACA tool is "1.5". |
1975 | 200 | { ISD::ADD, MVT::v4i32, 3 }, // The data reported by the IACA tool is "2.8". |
1976 | 200 | { ISD::ADD, MVT::v4i64, 3 }, |
1977 | 200 | { ISD::ADD, MVT::v8i16, 4 }, |
1978 | 200 | { ISD::ADD, MVT::v8i32, 5 }, |
1979 | 200 | }; |
1980 | 200 | |
1981 | 200 | if (IsPairwise200 ) { |
1982 | 102 | if (ST->hasAVX()) |
1983 | 73 | if (const auto *73 Entry73 = CostTableLookup(AVX1CostTblPairWise, ISD, MTy)) |
1984 | 71 | return LT.first * Entry->Cost; |
1985 | 31 | |
1986 | 31 | if (31 ST->hasSSE42()31 ) |
1987 | 19 | if (const auto *19 Entry19 = CostTableLookup(SSE42CostTblPairWise, ISD, MTy)) |
1988 | 19 | return LT.first * Entry->Cost; |
1989 | 98 | } else { |
1990 | 98 | if (ST->hasAVX()) |
1991 | 71 | if (const auto *71 Entry71 = CostTableLookup(AVX1CostTblNoPairWise, ISD, MTy)) |
1992 | 69 | return LT.first * Entry->Cost; |
1993 | 29 | |
1994 | 29 | if (29 ST->hasSSE42()29 ) |
1995 | 18 | if (const auto *18 Entry18 = CostTableLookup(SSE42CostTblNoPairWise, ISD, MTy)) |
1996 | 18 | return LT.first * Entry->Cost; |
1997 | 23 | } |
1998 | 23 | |
1999 | 23 | return BaseT::getArithmeticReductionCost(Opcode, ValTy, IsPairwise); |
2000 | 23 | } |
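
As a worked example of the lookup above (values chosen for illustration, not measured): a non-pairwise FADD reduction of v8f32 on an AVX target legalizes in one piece, so LT.first is 1 and the AVX1CostTblNoPairWise entry of 4 gives a total cost of 4.

// Hypothetical example: LT.first (1) * { ISD::FADD, MVT::v8f32, 4 } == 4.
constexpr int ExampleReductionCost = 1 * 4;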
2001 | | |
2002 | | int X86TTIImpl::getMinMaxReductionCost(Type *ValTy, Type *CondTy, |
2003 | 888 | bool IsPairwise, bool IsUnsigned) { |
2004 | 888 | std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy); |
2005 | 888 | |
2006 | 888 | MVT MTy = LT.second; |
2007 | 888 | |
2008 | 888 | int ISD; |
2009 | 888 | if (ValTy->isIntOrIntVectorTy()888 ) { |
2010 | 92 | ISD = IsUnsigned ? ISD::UMIN0 : ISD::SMIN92 ; |
2011 | 888 | } else { |
2012 | 796 | assert(ValTy->isFPOrFPVectorTy() && |
2013 | 796 | "Expected floating point or integer vector type.");
2014 | 796 | ISD = ISD::FMINNUM; |
2015 | 796 | } |
2016 | 888 | |
2017 | 888 | // We use the Intel Architecture Code Analyzer (IACA) to measure the throughput
2018 | 888 | // and use that as the cost.
2019 | 888 | |
2020 | 888 | static const CostTblEntry SSE42CostTblPairWise[] = { |
2021 | 888 | {ISD::FMINNUM, MVT::v2f64, 3}, |
2022 | 888 | {ISD::FMINNUM, MVT::v4f32, 2}, |
2023 | 888 | {ISD::SMIN, MVT::v2i64, 7}, // The data reported by the IACA is "6.8" |
2024 | 888 | {ISD::UMIN, MVT::v2i64, 8}, // The data reported by the IACA is "8.6" |
2025 | 888 | {ISD::SMIN, MVT::v4i32, 1}, // The data reported by the IACA is "1.5" |
2026 | 888 | {ISD::UMIN, MVT::v4i32, 2}, // The data reported by the IACA is "1.8" |
2027 | 888 | {ISD::SMIN, MVT::v8i16, 2}, |
2028 | 888 | {ISD::UMIN, MVT::v8i16, 2}, |
2029 | 888 | }; |
2030 | 888 | |
2031 | 888 | static const CostTblEntry AVX1CostTblPairWise[] = { |
2032 | 888 | {ISD::FMINNUM, MVT::v4f32, 1}, |
2033 | 888 | {ISD::FMINNUM, MVT::v4f64, 1}, |
2034 | 888 | {ISD::FMINNUM, MVT::v8f32, 2}, |
2035 | 888 | {ISD::SMIN, MVT::v2i64, 3}, |
2036 | 888 | {ISD::UMIN, MVT::v2i64, 3}, |
2037 | 888 | {ISD::SMIN, MVT::v4i32, 1}, |
2038 | 888 | {ISD::UMIN, MVT::v4i32, 1}, |
2039 | 888 | {ISD::SMIN, MVT::v8i16, 1}, |
2040 | 888 | {ISD::UMIN, MVT::v8i16, 1}, |
2041 | 888 | {ISD::SMIN, MVT::v8i32, 3}, |
2042 | 888 | {ISD::UMIN, MVT::v8i32, 3}, |
2043 | 888 | }; |
2044 | 888 | |
2045 | 888 | static const CostTblEntry AVX2CostTblPairWise[] = { |
2046 | 888 | {ISD::SMIN, MVT::v4i64, 2}, |
2047 | 888 | {ISD::UMIN, MVT::v4i64, 2}, |
2048 | 888 | {ISD::SMIN, MVT::v8i32, 1}, |
2049 | 888 | {ISD::UMIN, MVT::v8i32, 1}, |
2050 | 888 | {ISD::SMIN, MVT::v16i16, 1}, |
2051 | 888 | {ISD::UMIN, MVT::v16i16, 1}, |
2052 | 888 | {ISD::SMIN, MVT::v32i8, 2}, |
2053 | 888 | {ISD::UMIN, MVT::v32i8, 2}, |
2054 | 888 | }; |
2055 | 888 | |
2056 | 888 | static const CostTblEntry AVX512CostTblPairWise[] = { |
2057 | 888 | {ISD::FMINNUM, MVT::v8f64, 1}, |
2058 | 888 | {ISD::FMINNUM, MVT::v16f32, 2}, |
2059 | 888 | {ISD::SMIN, MVT::v8i64, 2}, |
2060 | 888 | {ISD::UMIN, MVT::v8i64, 2}, |
2061 | 888 | {ISD::SMIN, MVT::v16i32, 1}, |
2062 | 888 | {ISD::UMIN, MVT::v16i32, 1}, |
2063 | 888 | }; |
2064 | 888 | |
2065 | 888 | static const CostTblEntry SSE42CostTblNoPairWise[] = { |
2066 | 888 | {ISD::FMINNUM, MVT::v2f64, 3}, |
2067 | 888 | {ISD::FMINNUM, MVT::v4f32, 3}, |
2068 | 888 | {ISD::SMIN, MVT::v2i64, 7}, // The data reported by the IACA is "6.8" |
2069 | 888 | {ISD::UMIN, MVT::v2i64, 9}, // The data reported by the IACA is "8.6" |
2070 | 888 | {ISD::SMIN, MVT::v4i32, 1}, // The data reported by the IACA is "1.5" |
2071 | 888 | {ISD::UMIN, MVT::v4i32, 2}, // The data reported by the IACA is "1.8" |
2072 | 888 | {ISD::SMIN, MVT::v8i16, 1}, // The data reported by the IACA is "1.5" |
2073 | 888 | {ISD::UMIN, MVT::v8i16, 2}, // The data reported by the IACA is "1.8" |
2074 | 888 | }; |
2075 | 888 | |
2076 | 888 | static const CostTblEntry AVX1CostTblNoPairWise[] = { |
2077 | 888 | {ISD::FMINNUM, MVT::v4f32, 1}, |
2078 | 888 | {ISD::FMINNUM, MVT::v4f64, 1}, |
2079 | 888 | {ISD::FMINNUM, MVT::v8f32, 1}, |
2080 | 888 | {ISD::SMIN, MVT::v2i64, 3}, |
2081 | 888 | {ISD::UMIN, MVT::v2i64, 3}, |
2082 | 888 | {ISD::SMIN, MVT::v4i32, 1}, |
2083 | 888 | {ISD::UMIN, MVT::v4i32, 1}, |
2084 | 888 | {ISD::SMIN, MVT::v8i16, 1}, |
2085 | 888 | {ISD::UMIN, MVT::v8i16, 1}, |
2086 | 888 | {ISD::SMIN, MVT::v8i32, 2}, |
2087 | 888 | {ISD::UMIN, MVT::v8i32, 2}, |
2088 | 888 | }; |
2089 | 888 | |
2090 | 888 | static const CostTblEntry AVX2CostTblNoPairWise[] = { |
2091 | 888 | {ISD::SMIN, MVT::v4i64, 1}, |
2092 | 888 | {ISD::UMIN, MVT::v4i64, 1}, |
2093 | 888 | {ISD::SMIN, MVT::v8i32, 1}, |
2094 | 888 | {ISD::UMIN, MVT::v8i32, 1}, |
2095 | 888 | {ISD::SMIN, MVT::v16i16, 1}, |
2096 | 888 | {ISD::UMIN, MVT::v16i16, 1}, |
2097 | 888 | {ISD::SMIN, MVT::v32i8, 1}, |
2098 | 888 | {ISD::UMIN, MVT::v32i8, 1}, |
2099 | 888 | }; |
2100 | 888 | |
2101 | 888 | static const CostTblEntry AVX512CostTblNoPairWise[] = { |
2102 | 888 | {ISD::FMINNUM, MVT::v8f64, 1}, |
2103 | 888 | {ISD::FMINNUM, MVT::v16f32, 2}, |
2104 | 888 | {ISD::SMIN, MVT::v8i64, 1}, |
2105 | 888 | {ISD::UMIN, MVT::v8i64, 1}, |
2106 | 888 | {ISD::SMIN, MVT::v16i32, 1}, |
2107 | 888 | {ISD::UMIN, MVT::v16i32, 1}, |
2108 | 888 | }; |
2109 | 888 | |
2110 | 888 | if (IsPairwise888 ) { |
2111 | 444 | if (ST->hasAVX512()) |
2112 | 8 | if (const auto *8 Entry8 = CostTableLookup(AVX512CostTblPairWise, ISD, MTy)) |
2113 | 4 | return LT.first * Entry->Cost; |
2114 | 440 | |
2115 | 440 | if (440 ST->hasAVX2()440 ) |
2116 | 12 | if (const auto *12 Entry12 = CostTableLookup(AVX2CostTblPairWise, ISD, MTy)) |
2117 | 4 | return LT.first * Entry->Cost; |
2118 | 436 | |
2119 | 436 | if (436 ST->hasAVX()436 ) |
2120 | 20 | if (const auto *20 Entry20 = CostTableLookup(AVX1CostTblPairWise, ISD, MTy)) |
2121 | 20 | return LT.first * Entry->Cost; |
2122 | 416 | |
2123 | 416 | if (416 ST->hasSSE42()416 ) |
2124 | 0 | if (const auto *0 Entry0 = CostTableLookup(SSE42CostTblPairWise, ISD, MTy)) |
2125 | 0 | return LT.first * Entry->Cost; |
2126 | 444 | } else { |
2127 | 444 | if (ST->hasAVX512()) |
2128 | 8 | if (const auto *8 Entry8 = |
2129 | 8 | CostTableLookup(AVX512CostTblNoPairWise, ISD, MTy)) |
2130 | 4 | return LT.first * Entry->Cost; |
2131 | 440 | |
2132 | 440 | if (440 ST->hasAVX2()440 ) |
2133 | 12 | if (const auto *12 Entry12 = CostTableLookup(AVX2CostTblNoPairWise, ISD, MTy)) |
2134 | 4 | return LT.first * Entry->Cost; |
2135 | 436 | |
2136 | 436 | if (436 ST->hasAVX()436 ) |
2137 | 20 | if (const auto *20 Entry20 = CostTableLookup(AVX1CostTblNoPairWise, ISD, MTy)) |
2138 | 20 | return LT.first * Entry->Cost; |
2139 | 416 | |
2140 | 416 | if (416 ST->hasSSE42()416 ) |
2141 | 0 | if (const auto *0 Entry0 = CostTableLookup(SSE42CostTblNoPairWise, ISD, MTy)) |
2142 | 0 | return LT.first * Entry->Cost; |
2143 | 832 | } |
2144 | 832 | |
2145 | 832 | return BaseT::getMinMaxReductionCost(ValTy, CondTy, IsPairwise, IsUnsigned); |
2146 | 832 | } |
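
The only new step relative to the arithmetic reduction is choosing the ISD opcode from the value type and signedness; a tiny sketch of that selection (with stand-in enums rather than LLVM's ISD namespace) follows.

enum class MinMaxNode { SMIN, UMIN, FMINNUM };

MinMaxNode pickMinMaxNode(bool IsIntegerVector, bool IsUnsigned) {
  if (IsIntegerVector)
    return IsUnsigned ? MinMaxNode::UMIN : MinMaxNode::SMIN;
  return MinMaxNode::FMINNUM; // FP reductions use the FMINNUM tables
}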
2147 | | |
2148 | | /// \brief Calculate the cost of materializing a 64-bit value. This helper |
2149 | | /// method might only calculate a fraction of a larger immediate. Therefore it |
2150 | | /// is valid to return a cost of ZERO. |
2151 | 50.4k | int X86TTIImpl::getIntImmCost(int64_t Val) { |
2152 | 50.4k | if (Val == 0) |
2153 | 234 | return TTI::TCC_Free; |
2154 | 50.2k | |
2155 | 50.2k | if (50.2k isInt<32>(Val)50.2k ) |
2156 | 47.3k | return TTI::TCC_Basic; |
2157 | 2.86k | |
2158 | 2.86k | return 2 * TTI::TCC_Basic; |
2159 | 2.86k | } |
2160 | | |
2161 | 79.7k | int X86TTIImpl::getIntImmCost(const APInt &Imm, Type *Ty) { |
2162 | 79.7k | assert(Ty->isIntegerTy()); |
2163 | 79.7k | |
2164 | 79.7k | unsigned BitSize = Ty->getPrimitiveSizeInBits(); |
2165 | 79.7k | if (BitSize == 0) |
2166 | 0 | return ~0U; |
2167 | 79.7k | |
2168 | 79.7k | // Never hoist constants larger than 128bit, because this might lead to |
2169 | 79.7k | // incorrect code generation or assertions in codegen. |
2170 | 79.7k | // Fixme: Create a cost model for types larger than i128 once the codegen |
2171 | 79.7k | // issues have been fixed. |
2172 | 79.7k | if (79.7k BitSize > 12879.7k ) |
2173 | 93 | return TTI::TCC_Free; |
2174 | 79.6k | |
2175 | 79.6k | if (79.6k Imm == 079.6k ) |
2176 | 29.5k | return TTI::TCC_Free; |
2177 | 50.1k | |
2178 | 50.1k | // Sign-extend all constants to a multiple of 64-bit. |
2179 | 50.1k | APInt ImmVal = Imm; |
2180 | 50.1k | if (BitSize & 0x3f) |
2181 | 33.5k | ImmVal = Imm.sext((BitSize + 63) & ~0x3fU); |
2182 | 50.1k | |
2183 | 50.1k | // Split the constant into 64-bit chunks and calculate the cost for each |
2184 | 50.1k | // chunk. |
2185 | 50.1k | int Cost = 0; |
2186 | 100k | for (unsigned ShiftVal = 0; ShiftVal < BitSize100k ; ShiftVal += 6450.4k ) { |
2187 | 50.4k | APInt Tmp = ImmVal.ashr(ShiftVal).sextOrTrunc(64); |
2188 | 50.4k | int64_t Val = Tmp.getSExtValue(); |
2189 | 50.4k | Cost += getIntImmCost(Val); |
2190 | 50.4k | } |
2191 | 79.7k | // We need at least one instruction to materialize the constant. |
2192 | 79.7k | return std::max(1, Cost); |
2193 | 79.7k | } |
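
The wide-immediate path above sign-extends to a multiple of 64 bits, costs each 64-bit chunk separately, and clamps the total to at least one instruction. A standalone sketch for the 128-bit case, with the two chunks passed in directly (hypothetical helper, not the LLVM API):

#include <cstdint>
#include <cstdio>
#include <algorithm>

int chunkCost(int64_t Val) {              // same tiering as the previous sketch
  if (Val == 0) return 0;
  if (Val >= INT32_MIN && Val <= INT32_MAX) return 1;
  return 2;
}

int immCost128(int64_t Lo, int64_t Hi) {
  int Cost = chunkCost(Lo) + chunkCost(Hi);
  return std::max(1, Cost);               // at least one instruction overall
}

int main() {
  std::printf("%d\n", immCost128(/*Lo=*/7, /*Hi=*/0));          // 1: high half is free
  std::printf("%d\n", immCost128(/*Lo=*/0, /*Hi=*/0));          // 1: clamped to 1
  std::printf("%d\n", immCost128(/*Lo=*/-1, /*Hi=*/1LL << 40)); // 1 + 2 = 3
}
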
2194 | | |
2195 | | int X86TTIImpl::getIntImmCost(unsigned Opcode, unsigned Idx, const APInt &Imm, |
2196 | 143k | Type *Ty) { |
2197 | 143k | assert(Ty->isIntegerTy()); |
2198 | 143k | |
2199 | 143k | unsigned BitSize = Ty->getPrimitiveSizeInBits(); |
2200 | 143k | // There is no cost model for constants with a bit size of 0. Return TCC_Free |
2201 | 143k | // here, so that constant hoisting will ignore this constant. |
2202 | 143k | if (BitSize == 0) |
2203 | 0 | return TTI::TCC_Free; |
2204 | 143k | |
2205 | 143k | unsigned ImmIdx = ~0U; |
2206 | 143k | switch (Opcode) { |
2207 | 20.5k | default: |
2208 | 20.5k | return TTI::TCC_Free; |
2209 | 33.5k | case Instruction::GetElementPtr: |
2210 | 33.5k | // Always hoist the base address of a GetElementPtr. This prevents the |
2211 | 33.5k | // creation of new constants for every base constant that gets constant |
2212 | 33.5k | // folded with the offset. |
2213 | 33.5k | if (Idx == 0) |
2214 | 9 | return 2 * TTI::TCC_Basic; |
2215 | 33.5k | return TTI::TCC_Free; |
2216 | 6.36k | case Instruction::Store: |
2217 | 6.36k | ImmIdx = 0; |
2218 | 6.36k | break; |
2219 | 19.6k | case Instruction::ICmp: |
2220 | 19.6k | // This is an imperfect hack to prevent constant hoisting of |
2221 | 19.6k | // compares that might be trying to check if a 64-bit value fits in |
2222 | 19.6k | // 32-bits. The backend can optimize these cases using a right shift by 32. |
2223 | 19.6k | // Ideally we would check the compare predicate here. There are also
2224 | 19.6k | // other similar immediates that the backend can handle with shifts.
2225 | 19.6k | if (Idx == 1 && 19.6k Imm.getBitWidth() == 6419.5k ) { |
2226 | 5.91k | uint64_t ImmVal = Imm.getZExtValue(); |
2227 | 5.91k | if (ImmVal == 0x100000000ULL || 5.91k ImmVal == 0xffffffff5.91k ) |
2228 | 37 | return TTI::TCC_Free; |
2229 | 19.6k | } |
2230 | 19.6k | ImmIdx = 1; |
2231 | 19.6k | break; |
2232 | 6.59k | case Instruction::And: |
2233 | 6.59k | // We support 64-bit ANDs with immediates with 32-bits of leading zeroes |
2234 | 6.59k | // by using a 32-bit operation with implicit zero extension. Detect such |
2235 | 6.59k | // immediates here as the normal path expects bit 31 to be sign extended. |
2236 | 6.59k | if (Idx == 1 && 6.59k Imm.getBitWidth() == 646.54k && isUInt<32>(Imm.getZExtValue())2.82k ) |
2237 | 1.55k | return TTI::TCC_Free; |
2238 | 5.04k | LLVM_FALLTHROUGH5.04k ; |
2239 | 24.5k | case Instruction::Add: |
2240 | 24.5k | case Instruction::Sub: |
2241 | 24.5k | case Instruction::Mul: |
2242 | 24.5k | case Instruction::UDiv: |
2243 | 24.5k | case Instruction::SDiv: |
2244 | 24.5k | case Instruction::URem: |
2245 | 24.5k | case Instruction::SRem: |
2246 | 24.5k | case Instruction::Or: |
2247 | 24.5k | case Instruction::Xor: |
2248 | 24.5k | ImmIdx = 1; |
2249 | 24.5k | break; |
2250 | 24.5k | // Always return TCC_Free for the shift value of a shift instruction. |
2251 | 9.39k | case Instruction::Shl: |
2252 | 9.39k | case Instruction::LShr: |
2253 | 9.39k | case Instruction::AShr: |
2254 | 9.39k | if (Idx == 1) |
2255 | 8.50k | return TTI::TCC_Free; |
2256 | 887 | break; |
2257 | 28.2k | case Instruction::Trunc: |
2258 | 28.2k | case Instruction::ZExt: |
2259 | 28.2k | case Instruction::SExt: |
2260 | 28.2k | case Instruction::IntToPtr: |
2261 | 28.2k | case Instruction::PtrToInt: |
2262 | 28.2k | case Instruction::BitCast: |
2263 | 28.2k | case Instruction::PHI: |
2264 | 28.2k | case Instruction::Call: |
2265 | 28.2k | case Instruction::Select: |
2266 | 28.2k | case Instruction::Ret: |
2267 | 28.2k | case Instruction::Load: |
2268 | 28.2k | break; |
2269 | 79.6k | } |
2270 | 79.6k | |
2271 | 79.6k | if (79.6k Idx == ImmIdx79.6k ) { |
2272 | 47.7k | int NumConstants = (BitSize + 63) / 64; |
2273 | 47.7k | int Cost = X86TTIImpl::getIntImmCost(Imm, Ty); |
2274 | 47.7k | return (Cost <= NumConstants * TTI::TCC_Basic) |
2275 | 45.2k | ? static_cast<int>(TTI::TCC_Free) |
2276 | 2.45k | : Cost; |
2277 | 47.7k | } |
2278 | 31.9k | |
2279 | 31.9k | return X86TTIImpl::getIntImmCost(Imm, Ty); |
2280 | 31.9k | } |
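
One of the special cases above: a 64-bit AND whose immediate has 32 leading zero bits is reported as free, because the backend can encode it as the 32-bit form with implicit zero extension, so constant hoisting should leave it alone. A standalone sketch of just that predicate (plain uint64_t instead of APInt):

#include <cstdint>
#include <cstdio>

bool andImmIsFree(uint64_t Imm) {
  return Imm <= UINT32_MAX;   // mirrors isUInt<32>(Imm.getZExtValue())
}

int main() {
  std::printf("%d %d\n", andImmIsFree(0x00000000FFFFFFFFULL),   // 1 (free)
                         andImmIsFree(0x0000000100000000ULL));  // 0 (not free)
}
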
2281 | | |
2282 | | int X86TTIImpl::getIntImmCost(Intrinsic::ID IID, unsigned Idx, const APInt &Imm, |
2283 | 13.7k | Type *Ty) { |
2284 | 13.7k | assert(Ty->isIntegerTy()); |
2285 | 13.7k | |
2286 | 13.7k | unsigned BitSize = Ty->getPrimitiveSizeInBits(); |
2287 | 13.7k | // There is no cost model for constants with a bit size of 0. Return TCC_Free |
2288 | 13.7k | // here, so that constant hoisting will ignore this constant. |
2289 | 13.7k | if (BitSize == 0) |
2290 | 0 | return TTI::TCC_Free; |
2291 | 13.7k | |
2292 | 13.7k | switch (IID) { |
2293 | 13.1k | default: |
2294 | 13.1k | return TTI::TCC_Free; |
2295 | 86 | case Intrinsic::sadd_with_overflow: |
2296 | 86 | case Intrinsic::uadd_with_overflow: |
2297 | 86 | case Intrinsic::ssub_with_overflow: |
2298 | 86 | case Intrinsic::usub_with_overflow: |
2299 | 86 | case Intrinsic::smul_with_overflow: |
2300 | 86 | case Intrinsic::umul_with_overflow: |
2301 | 86 | if ((Idx == 1) && 86 Imm.getBitWidth() <= 6468 && isInt<32>(Imm.getSExtValue())68 ) |
2302 | 61 | return TTI::TCC_Free; |
2303 | 25 | break; |
2304 | 237 | case Intrinsic::experimental_stackmap: |
2305 | 237 | if ((Idx < 2) || 237 (Imm.getBitWidth() <= 64 && 31 isInt<64>(Imm.getSExtValue())30 )) |
2306 | 236 | return TTI::TCC_Free; |
2307 | 1 | break; |
2308 | 273 | case Intrinsic::experimental_patchpoint_void: |
2309 | 273 | case Intrinsic::experimental_patchpoint_i64: |
2310 | 273 | if ((Idx < 4) || 273 (Imm.getBitWidth() <= 64 && 37 isInt<64>(Imm.getSExtValue())37 )) |
2311 | 273 | return TTI::TCC_Free; |
2312 | 0 | break; |
2313 | 26 | } |
2314 | 26 | return X86TTIImpl::getIntImmCost(Imm, Ty); |
2315 | 26 | } |
2316 | | |
2317 | | unsigned X86TTIImpl::getUserCost(const User *U, |
2318 | 512k | ArrayRef<const Value *> Operands) { |
2319 | 512k | if (isa<StoreInst>(U)512k ) { |
2320 | 12.1k | Value *Ptr = U->getOperand(1); |
2321 | 12.1k | // A store instruction with index and scale addressing costs 2 uops.
2322 | 12.1k | // Check the preceding GEP to identify non-const indices. |
2323 | 12.1k | if (auto GEP12.1k = dyn_cast<GetElementPtrInst>(Ptr)) { |
2324 | 21.9k | if (!all_of(GEP->indices(), [](Value *V) 8.55k { return isa<Constant>(V); }21.9k )) |
2325 | 5.14k | return TTI::TCC_Basic * 2; |
2326 | 6.95k | } |
2327 | 6.95k | return TTI::TCC_Basic; |
2328 | 6.95k | } |
2329 | 500k | return BaseT::getUserCost(U, Operands); |
2330 | 500k | } |
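
The store path above charges two uops when the address comes from a GEP with any non-constant index (index-plus-scale addressing) and one uop otherwise. A standalone sketch with a bool-per-index stand-in for the "is this GEP index a Constant?" check:

#include <algorithm>
#include <cstdio>
#include <vector>

int storeUserCost(const std::vector<bool> &IndexIsConstant) {
  const int TCC_Basic = 1;
  // Any variable index means index+scale addressing: model the store as 2 uops.
  bool AllConst = std::all_of(IndexIsConstant.begin(), IndexIsConstant.end(),
                              [](bool C) { return C; });
  return AllConst ? TCC_Basic : TCC_Basic * 2;
}

int main() {
  std::printf("%d\n", storeUserCost({true, true}));   // 1
  std::printf("%d\n", storeUserCost({true, false}));  // 2
}
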
2331 | | |
2332 | | // Return an average cost of a Gather / Scatter instruction; may be improved later.
2333 | | int X86TTIImpl::getGSVectorCost(unsigned Opcode, Type *SrcVTy, Value *Ptr, |
2334 | 91 | unsigned Alignment, unsigned AddressSpace) { |
2335 | 91 | |
2336 | 91 | assert(isa<VectorType>(SrcVTy) && "Unexpected type in getGSVectorCost"); |
2337 | 91 | unsigned VF = SrcVTy->getVectorNumElements(); |
2338 | 91 | |
2339 | 91 | // Try to reduce index size from 64 bit (default for GEP) |
2340 | 91 | // to 32. It is essential for VF 16. If the index can't be reduced to 32, the |
2341 | 91 | // operation will use 16 x 64-bit indices, which do not fit in a zmm register
2342 | 91 | // and need to be split. Also check that the base pointer is the same for all lanes,
2343 | 91 | // and that there's at most one variable index. |
2344 | 32 | auto getIndexSizeInBits = [](Value *Ptr, const DataLayout& DL) { |
2345 | 32 | unsigned IndexSize = DL.getPointerSizeInBits(); |
2346 | 32 | GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Ptr); |
2347 | 32 | if (IndexSize < 64 || 32 !GEP32 ) |
2348 | 0 | return IndexSize; |
2349 | 32 | |
2350 | 32 | unsigned NumOfVarIndices = 0; |
2351 | 32 | Value *Ptrs = GEP->getPointerOperand(); |
2352 | 32 | if (Ptrs->getType()->isVectorTy() && 32 !getSplatValue(Ptrs)6 ) |
2353 | 2 | return IndexSize; |
2354 | 39 | for (unsigned i = 1; 30 i < GEP->getNumOperands()39 ; ++i9 ) { |
2355 | 30 | if (isa<Constant>(GEP->getOperand(i))) |
2356 | 0 | continue; |
2357 | 30 | Type *IndxTy = GEP->getOperand(i)->getType(); |
2358 | 30 | if (IndxTy->isVectorTy()) |
2359 | 8 | IndxTy = IndxTy->getVectorElementType(); |
2360 | 30 | if ((IndxTy->getPrimitiveSizeInBits() == 64 && |
2361 | 28 | !isa<SExtInst>(GEP->getOperand(i))) || |
2362 | 9 | ++NumOfVarIndices > 1) |
2363 | 21 | return IndexSize; // 64 |
2364 | 30 | } |
2365 | 9 | return (unsigned)32; |
2366 | 32 | }; |
2367 | 91 | |
2368 | 91 | |
2369 | 91 | // Trying to reduce IndexSize to 32 bits for vector 16. |
2370 | 91 | // By default the IndexSize is equal to pointer size. |
2371 | 32 | unsigned IndexSize = (VF >= 16) ? getIndexSizeInBits(Ptr, DL) : |
2372 | 59 | DL.getPointerSizeInBits(); |
2373 | 91 | |
2374 | 91 | Type *IndexVTy = VectorType::get(IntegerType::get(SrcVTy->getContext(), |
2375 | 91 | IndexSize), VF); |
2376 | 91 | std::pair<int, MVT> IdxsLT = TLI->getTypeLegalizationCost(DL, IndexVTy); |
2377 | 91 | std::pair<int, MVT> SrcLT = TLI->getTypeLegalizationCost(DL, SrcVTy); |
2378 | 91 | int SplitFactor = std::max(IdxsLT.first, SrcLT.first); |
2379 | 91 | if (SplitFactor > 191 ) { |
2380 | 23 | // Handle splitting of vector of pointers |
2381 | 23 | Type *SplitSrcTy = VectorType::get(SrcVTy->getScalarType(), VF / SplitFactor); |
2382 | 23 | return SplitFactor * getGSVectorCost(Opcode, SplitSrcTy, Ptr, Alignment, |
2383 | 23 | AddressSpace); |
2384 | 23 | } |
2385 | 68 | |
2386 | 68 | // The gather / scatter cost is given by Intel architects. It is a rough |
2387 | 68 | // number since we are looking at one instruction at a time.
2388 | 68 | const int GSOverhead = 2; |
2389 | 68 | return GSOverhead + VF * getMemoryOpCost(Opcode, SrcVTy->getScalarType(), |
2390 | 68 | Alignment, AddressSpace); |
2391 | 68 | } |
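
Once the split factor is known, the vector gather/scatter cost above is a fixed overhead plus one scalar memory operation per lane, applied recursively when the type has to be split. A standalone sketch with made-up inputs standing in for getMemoryOpCost() and getTypeLegalizationCost():

#include <cstdio>

int gsVectorCost(unsigned VF, int SplitFactor, int ScalarMemCost) {
  if (SplitFactor > 1)   // type does not fit one register: split and recurse
    return SplitFactor * gsVectorCost(VF / SplitFactor, 1, ScalarMemCost);
  const int GSOverhead = 2;        // rough per-instruction overhead, as above
  return GSOverhead + VF * ScalarMemCost;
}

int main() {
  std::printf("%d\n", gsVectorCost(/*VF=*/8,  /*SplitFactor=*/1, /*Mem=*/1)); // 2 + 8 = 10
  std::printf("%d\n", gsVectorCost(/*VF=*/16, /*SplitFactor=*/2, /*Mem=*/1)); // 2 * (2 + 8) = 20
}
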
2392 | | |
2393 | | /// Return the cost of full scalarization of gather / scatter operation. |
2394 | | /// |
2395 | | /// Opcode - Load or Store instruction. |
2396 | | /// SrcVTy - The type of the data vector that should be gathered or scattered. |
2397 | | /// VariableMask - The mask is non-constant at compile time. |
2398 | | /// Alignment - Alignment for one element. |
2399 | | /// AddressSpace - pointer[s] address space. |
2400 | | /// |
2401 | | int X86TTIImpl::getGSScalarCost(unsigned Opcode, Type *SrcVTy, |
2402 | | bool VariableMask, unsigned Alignment, |
2403 | 72 | unsigned AddressSpace) { |
2404 | 72 | unsigned VF = SrcVTy->getVectorNumElements(); |
2405 | 72 | |
2406 | 72 | int MaskUnpackCost = 0; |
2407 | 72 | if (VariableMask72 ) { |
2408 | 47 | VectorType *MaskTy = |
2409 | 47 | VectorType::get(Type::getInt1Ty(SrcVTy->getContext()), VF); |
2410 | 47 | MaskUnpackCost = getScalarizationOverhead(MaskTy, false, true); |
2411 | 47 | int ScalarCompareCost = |
2412 | 47 | getCmpSelInstrCost(Instruction::ICmp, Type::getInt1Ty(SrcVTy->getContext()), |
2413 | 47 | nullptr); |
2414 | 47 | int BranchCost = getCFInstrCost(Instruction::Br); |
2415 | 47 | MaskUnpackCost += VF * (BranchCost + ScalarCompareCost); |
2416 | 47 | } |
2417 | 72 | |
2418 | 72 | // The cost of the scalar loads/stores. |
2419 | 72 | int MemoryOpCost = VF * getMemoryOpCost(Opcode, SrcVTy->getScalarType(), |
2420 | 72 | Alignment, AddressSpace); |
2421 | 72 | |
2422 | 72 | int InsertExtractCost = 0; |
2423 | 72 | if (Opcode == Instruction::Load) |
2424 | 229 | for (unsigned i = 0; 43 i < VF229 ; ++i186 ) |
2425 | 43 | // Add the cost of inserting each scalar load into the vector |
2426 | 186 | InsertExtractCost += |
2427 | 186 | getVectorInstrCost(Instruction::InsertElement, SrcVTy, i); |
2428 | 72 | else |
2429 | 135 | for (unsigned i = 0; 29 i < VF135 ; ++i106 ) |
2430 | 29 | // Add the cost of extracting each element out of the data vector |
2431 | 106 | InsertExtractCost += |
2432 | 106 | getVectorInstrCost(Instruction::ExtractElement, SrcVTy, i); |
2433 | 72 | |
2434 | 72 | return MemoryOpCost + MaskUnpackCost + InsertExtractCost; |
2435 | 72 | } |
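
The scalarized estimate above adds up three parts: unpacking and testing the mask (only when it is variable), one scalar load or store per lane, and one insert or extract per lane. A standalone sketch with flat per-operation costs standing in for the real TTI queries:

#include <cstdio>

int gsScalarCost(unsigned VF, bool VariableMask, int MemCost, int InsExtCost,
                 int CmpCost, int BrCost, int MaskUnpack) {
  int Cost = 0;
  if (VariableMask)                       // test each mask bit and branch on it
    Cost += MaskUnpack + VF * (BrCost + CmpCost);
  Cost += VF * MemCost;                   // one scalar load/store per lane
  Cost += VF * InsExtCost;                // insert (load) or extract (store) per lane
  return Cost;
}

int main() {
  // e.g. VF=4 with a variable mask and unit costs: 4 + 4*(1+1) + 4 + 4 = 20
  std::printf("%d\n", gsScalarCost(4, true, 1, 1, 1, 1, 4));
}
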
2436 | | |
2437 | | /// Calculate the cost of Gather / Scatter operation |
2438 | | int X86TTIImpl::getGatherScatterOpCost(unsigned Opcode, Type *SrcVTy, |
2439 | | Value *Ptr, bool VariableMask, |
2440 | 140 | unsigned Alignment) { |
2441 | 140 | assert(SrcVTy->isVectorTy() && "Unexpected data type for Gather/Scatter"); |
2442 | 140 | unsigned VF = SrcVTy->getVectorNumElements(); |
2443 | 140 | PointerType *PtrTy = dyn_cast<PointerType>(Ptr->getType()); |
2444 | 140 | if (!PtrTy && 140 Ptr->getType()->isVectorTy()37 ) |
2445 | 37 | PtrTy = dyn_cast<PointerType>(Ptr->getType()->getVectorElementType()); |
2446 | 140 | assert(PtrTy && "Unexpected type for Ptr argument"); |
2447 | 140 | unsigned AddressSpace = PtrTy->getAddressSpace(); |
2448 | 140 | |
2449 | 140 | bool Scalarize = false; |
2450 | 140 | if ((Opcode == Instruction::Load && 140 !isLegalMaskedGather(SrcVTy)81 ) || |
2451 | 131 | (Opcode == Instruction::Store && 131 !isLegalMaskedScatter(SrcVTy)59 )) |
2452 | 12 | Scalarize = true; |
2453 | 140 | // Gather / Scatter with a vector of 2 elements is not profitable on KNL / SKX.
2454 | 140 | // A 4-element gather/scatter instruction does not exist on KNL.
2455 | 140 | // We could extend it to 8 elements, but zeroing the upper bits of
2456 | 140 | // the mask vector would add more instructions. Right now we report the scalar
2457 | 140 | // cost for vector-4 on KNL. TODO: Check whether the gather/scatter instruction
2458 | 140 | // is better in the VariableMask case.
2459 | 140 | if (VF == 2 || 140 (VF == 4 && 110 !ST->hasVLX()43 )) |
2460 | 66 | Scalarize = true; |
2461 | 140 | |
2462 | 140 | if (Scalarize) |
2463 | 72 | return getGSScalarCost(Opcode, SrcVTy, VariableMask, Alignment, |
2464 | 72 | AddressSpace); |
2465 | 68 | |
2466 | 68 | return getGSVectorCost(Opcode, SrcVTy, Ptr, Alignment, AddressSpace); |
2467 | 68 | } |
2468 | | |
2469 | | bool X86TTIImpl::isLSRCostLess(TargetTransformInfo::LSRCost &C1, |
2470 | 170k | TargetTransformInfo::LSRCost &C2) { |
2471 | 170k | // X86-specific: the number of instructions has first priority.
2472 | 170k | return std::tie(C1.Insns, C1.NumRegs, C1.AddRecCost, |
2473 | 170k | C1.NumIVMuls, C1.NumBaseAdds, |
2474 | 170k | C1.ScaleCost, C1.ImmCost, C1.SetupCost) < |
2475 | 170k | std::tie(C2.Insns, C2.NumRegs, C2.AddRecCost, |
2476 | 170k | C2.NumIVMuls, C2.NumBaseAdds, |
2477 | 170k | C2.ScaleCost, C2.ImmCost, C2.SetupCost); |
2478 | 170k | } |
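
std::tie gives the lexicographic ordering used above: a formula with fewer instructions always wins, register count only breaks ties, and so on down the field list. A standalone sketch with a truncated field list:

#include <cstdio>
#include <tuple>

struct LSRCost { unsigned Insns, NumRegs, AddRecCost; };  // truncated field list

bool costLess(const LSRCost &A, const LSRCost &B) {
  return std::tie(A.Insns, A.NumRegs, A.AddRecCost) <
         std::tie(B.Insns, B.NumRegs, B.AddRecCost);
}

int main() {
  LSRCost A{3, 10, 0}, B{4, 2, 0};
  std::printf("%d\n", costLess(A, B));  // 1: fewer instructions wins despite more registers
}
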
2479 | | |
2480 | 526 | bool X86TTIImpl::isLegalMaskedLoad(Type *DataTy) { |
2481 | 526 | Type *ScalarTy = DataTy->getScalarType(); |
2482 | 526 | int DataWidth = isa<PointerType>(ScalarTy) ? |
2483 | 526 | DL.getPointerSizeInBits()34 : ScalarTy->getPrimitiveSizeInBits()492 ; |
2484 | 526 | |
2485 | 526 | return ((DataWidth == 32 || DataWidth == 64257 ) && ST->hasAVX()482 ) || |
2486 | 61 | ((DataWidth == 8 || 61 DataWidth == 1639 ) && ST->hasBWI()44 ); |
2487 | 526 | } |
2488 | | |
2489 | 194 | bool X86TTIImpl::isLegalMaskedStore(Type *DataType) { |
2490 | 194 | return isLegalMaskedLoad(DataType); |
2491 | 194 | } |
2492 | | |
2493 | 849 | bool X86TTIImpl::isLegalMaskedGather(Type *DataTy) { |
2494 | 849 | // This function is called now in two cases: from the Loop Vectorizer |
2495 | 849 | // and from the Scalarizer. |
2496 | 849 | // When the Loop Vectorizer asks about legality of the feature, |
2497 | 849 | // the vectorization factor is not calculated yet. The Loop Vectorizer |
2498 | 849 | // sends a scalar type and the decision is based on the width of the |
2499 | 849 | // scalar element. |
2500 | 849 | // Later on, the cost model will estimate usage of this intrinsic based on
2501 | 849 | // the vector type. |
2502 | 849 | // The Scalarizer asks again about legality. It sends a vector type. |
2503 | 849 | // In this case we can reject non-power-of-2 vectors. |
2504 | 849 | if (isa<VectorType>(DataTy) && 849 !isPowerOf2_32(DataTy->getVectorNumElements())451 ) |
2505 | 6 | return false; |
2506 | 843 | Type *ScalarTy = DataTy->getScalarType(); |
2507 | 843 | int DataWidth = isa<PointerType>(ScalarTy) ? |
2508 | 843 | DL.getPointerSizeInBits()64 : ScalarTy->getPrimitiveSizeInBits()779 ; |
2509 | 843 | |
2510 | 843 | // AVX-512 allows gather and scatter |
2511 | 843 | return (DataWidth == 32 || DataWidth == 64263 ) && ST->hasAVX512()814 ; |
2512 | 849 | } |
2513 | | |
2514 | 380 | bool X86TTIImpl::isLegalMaskedScatter(Type *DataType) { |
2515 | 380 | return isLegalMaskedGather(DataType); |
2516 | 380 | } |
2517 | | |
2518 | 62 | bool X86TTIImpl::hasDivRemOp(Type *DataType, bool IsSigned) { |
2519 | 62 | EVT VT = TLI->getValueType(DL, DataType); |
2520 | 62 | return TLI->isOperationLegal(IsSigned ? ISD::SDIVREM14 : ISD::UDIVREM48 , VT); |
2521 | 62 | } |
2522 | | |
2523 | | bool X86TTIImpl::areInlineCompatible(const Function *Caller, |
2524 | 38.2k | const Function *Callee) const { |
2525 | 38.2k | const TargetMachine &TM = getTLI()->getTargetMachine(); |
2526 | 38.2k | |
2527 | 38.2k | // Model this as a subset check on subtarget features.
2528 | 38.2k | const FeatureBitset &CallerBits = |
2529 | 38.2k | TM.getSubtargetImpl(*Caller)->getFeatureBits(); |
2530 | 38.2k | const FeatureBitset &CalleeBits = |
2531 | 38.2k | TM.getSubtargetImpl(*Callee)->getFeatureBits(); |
2532 | 38.2k | |
2533 | 38.2k | // FIXME: This is likely too limiting as it will include subtarget features |
2534 | 38.2k | // that we might not care about for inlining, but it is conservatively |
2535 | 38.2k | // correct. |
2536 | 38.2k | return (CallerBits & CalleeBits) == CalleeBits; |
2537 | 38.2k | } |
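
The inline-compatibility test above is a subset check on feature bits: inlining is allowed only if every feature the callee was compiled with is also available in the caller. A standalone sketch with std::bitset in place of FeatureBitset:

#include <bitset>
#include <cstdio>

bool calleeFeaturesAreSubset(const std::bitset<8> &Caller,
                             const std::bitset<8> &Callee) {
  return (Caller & Callee) == Callee;   // every feature the callee needs, the caller has
}

int main() {
  std::bitset<8> Caller("00001111"), CalleeOK("00000101"), CalleeBad("00010001");
  std::printf("%d %d\n", calleeFeaturesAreSubset(Caller, CalleeOK),    // 1
                         calleeFeaturesAreSubset(Caller, CalleeBad));  // 0
}
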
2538 | | |
2539 | 400 | bool X86TTIImpl::enableMemCmpExpansion(unsigned &MaxLoadSize) { |
2540 | 400 | // TODO: We can increase these based on available vector ops. |
2541 | 400 | MaxLoadSize = ST->is64Bit() ? 8201 : 4199 ; |
2542 | 400 | return true; |
2543 | 400 | } |
2544 | | |
2545 | 475 | bool X86TTIImpl::enableInterleavedAccessVectorization() { |
2546 | 475 | // TODO: We expect this to be beneficial regardless of arch, |
2547 | 475 | // but there are currently some unexplained performance artifacts on Atom. |
2548 | 475 | // As a temporary solution, disable on Atom. |
2549 | 475 | return !(ST->isAtom()); |
2550 | 475 | } |
2551 | | |
2552 | | // Get a cost estimate for interleaved load/store operations for AVX2.
2553 | | // \p Factor is the interleaved-access factor (stride) - number of |
2554 | | // (interleaved) elements in the group. |
2555 | | // \p Indices contains the indices for a strided load: when the |
2556 | | // interleaved load has gaps they indicate which elements are used. |
2557 | | // If Indices is empty (or if the number of indices is equal to the size |
2558 | | // of the interleaved-access as given in \p Factor) the access has no gaps. |
2559 | | // |
2560 | | // As opposed to AVX-512, AVX2 does not have generic shuffles that allow |
2561 | | // computing the cost using a generic formula as a function of generic |
2562 | | // shuffles. We therefore use a lookup table instead, filled according to |
2563 | | // the instruction sequences that codegen currently generates. |
2564 | | int X86TTIImpl::getInterleavedMemoryOpCostAVX2(unsigned Opcode, Type *VecTy, |
2565 | | unsigned Factor, |
2566 | | ArrayRef<unsigned> Indices, |
2567 | | unsigned Alignment, |
2568 | 2 | unsigned AddressSpace) { |
2569 | 2 | |
2570 | 2 | // We currently support only fully-interleaved groups, with no gaps.
2571 | 2 | // TODO: Support also strided loads (interleaved-groups with gaps). |
2572 | 2 | if (Indices.size() && 2 Indices.size() != Factor0 ) |
2573 | 0 | return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices, |
2574 | 0 | Alignment, AddressSpace); |
2575 | 2 | |
2576 | 2 | // VecTy for interleave memop is <VF*Factor x Elt>. |
2577 | 2 | // So, for VF=4, Interleave Factor = 3, Element type = i32 we have |
2578 | 2 | // VecTy = <12 x i32>. |
2579 | 2 | MVT LegalVT = getTLI()->getTypeLegalizationCost(DL, VecTy).second; |
2580 | 2 | |
2581 | 2 | // This function can be called with VecTy=<6xi128>, Factor=3, in which case |
2582 | 2 | // the VF=2, while v2i128 is an unsupported MVT vector type |
2583 | 2 | // (see MachineValueType.h::getVectorVT()). |
2584 | 2 | if (!LegalVT.isVector()) |
2585 | 2 | return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices, |
2586 | 2 | Alignment, AddressSpace); |
2587 | 0 |
2588 | 0 | unsigned VF = VecTy->getVectorNumElements() / Factor; |
2589 | 0 | Type *ScalarTy = VecTy->getVectorElementType(); |
2590 | 0 |
2591 | 0 | // Calculate the number of memory operations (NumOfMemOps), required |
2592 | 0 | // for load/store the VecTy. |
2593 | 0 | unsigned VecTySize = DL.getTypeStoreSize(VecTy); |
2594 | 0 | unsigned LegalVTSize = LegalVT.getStoreSize(); |
2595 | 0 | unsigned NumOfMemOps = (VecTySize + LegalVTSize - 1) / LegalVTSize; |
2596 | 0 |
2597 | 0 | // Get the cost of one memory operation. |
2598 | 0 | Type *SingleMemOpTy = VectorType::get(VecTy->getVectorElementType(), |
2599 | 0 | LegalVT.getVectorNumElements()); |
2600 | 0 | unsigned MemOpCost = |
2601 | 0 | getMemoryOpCost(Opcode, SingleMemOpTy, Alignment, AddressSpace); |
2602 | 0 |
2603 | 0 | VectorType *VT = VectorType::get(ScalarTy, VF); |
2604 | 0 | EVT ETy = TLI->getValueType(DL, VT); |
2605 | 0 | if (!ETy.isSimple()) |
2606 | 0 | return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices, |
2607 | 0 | Alignment, AddressSpace); |
2608 | 0 |
2609 | 0 | // TODO: Complete for other data-types and strides. |
2610 | 0 | // Each combination of Stride, ElementTy and VF results in a different |
2611 | 0 | // sequence; The cost tables are therefore accessed with: |
2612 | 0 | // Factor (stride) and VectorType=VFxElemType. |
2613 | 0 | // The Cost accounts only for the shuffle sequence; |
2614 | 0 | // The cost of the loads/stores is accounted for separately. |
2615 | 0 | // |
2616 | 0 | static const CostTblEntry AVX2InterleavedLoadTbl[] = { |
2617 | 0 | { 3, MVT::v2i8, 10 }, //(load 6i8 and) deinterleave into 3 x 2i8 |
2618 | 0 | { 3, MVT::v4i8, 4 }, //(load 12i8 and) deinterleave into 3 x 4i8 |
2619 | 0 | { 3, MVT::v8i8, 9 }, //(load 24i8 and) deinterleave into 3 x 8i8 |
2620 | 0 | { 3, MVT::v16i8, 18}, //(load 48i8 and) deinterleave into 3 x 16i8 |
2621 | 0 | { 3, MVT::v32i8, 42 }, //(load 96i8 and) deinterleave into 3 x 32i8 |
2622 | 0 |
2623 | 0 | { 4, MVT::v2i8, 12 }, //(load 8i8 and) deinterleave into 4 x 2i8 |
2624 | 0 | { 4, MVT::v4i8, 4 }, //(load 16i8 and) deinterleave into 4 x 4i8 |
2625 | 0 | { 4, MVT::v8i8, 20 }, //(load 32i8 and) deinterleave into 4 x 8i8 |
2626 | 0 | { 4, MVT::v16i8, 39 }, //(load 64i8 and) deinterleave into 4 x 16i8 |
2627 | 0 | { 4, MVT::v32i8, 80 } //(load 128i8 and) deinterleave into 4 x 32i8 |
2628 | 0 | }; |
2629 | 0 |
2630 | 0 | static const CostTblEntry AVX2InterleavedStoreTbl[] = { |
2631 | 0 | { 3, MVT::v2i8, 7 }, //interleave 3 x 2i8 into 6i8 (and store) |
2632 | 0 | { 3, MVT::v4i8, 8 }, //interleave 3 x 4i8 into 12i8 (and store) |
2633 | 0 | { 3, MVT::v8i8, 11 }, //interleave 3 x 8i8 into 24i8 (and store) |
2634 | 0 | { 3, MVT::v16i8, 17 }, //interleave 3 x 16i8 into 48i8 (and store) |
2635 | 0 | { 3, MVT::v32i8, 32 }, //interleave 3 x 32i8 into 96i8 (and store) |
2636 | 0 |
2637 | 0 | { 4, MVT::v2i8, 12 }, //interleave 4 x 2i8 into 8i8 (and store) |
2638 | 0 | { 4, MVT::v4i8, 9 }, //interleave 4 x 4i8 into 16i8 (and store) |
2639 | 0 | { 4, MVT::v8i8, 16 }, //interleave 4 x 8i8 into 32i8 (and store) |
2640 | 0 | { 4, MVT::v16i8, 20 }, //interleave 4 x 16i8 into 64i8 (and store) |
2641 | 0 | { 4, MVT::v32i8, 40 } //interleave 4 x 32i8 into 128i8 (and store) |
2642 | 0 | }; |
2643 | 0 |
2644 | 0 | if (Opcode == Instruction::Load0 ) { |
2645 | 0 | if (const auto *Entry = |
2646 | 0 | CostTableLookup(AVX2InterleavedLoadTbl, Factor, ETy.getSimpleVT())) |
2647 | 0 | return NumOfMemOps * MemOpCost + Entry->Cost; |
2648 | 0 | } else { |
2649 | 0 | assert(Opcode == Instruction::Store && |
2650 | 0 | "Expected Store Instruction at this point"); |
2651 | 0 | if (const auto *Entry = |
2652 | 0 | CostTableLookup(AVX2InterleavedStoreTbl, Factor, ETy.getSimpleVT())) |
2653 | 0 | return NumOfMemOps * MemOpCost + Entry->Cost; |
2654 | 0 | } |
2655 | 0 |
2656 | 0 | return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices, |
2657 | 0 | Alignment, AddressSpace); |
2658 | 0 | } |
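
For a supported stride/type pair, the AVX2 estimate above is the number of memory operations needed to move the whole interleaved vector times the per-operation cost, plus the table-driven cost of the shuffle sequence. A standalone sketch; the stride-3 v8i8 shuffle cost of 9 mirrors one row of AVX2InterleavedLoadTbl, the remaining numbers are illustrative:

#include <cstdio>

int interleavedLoadCostAVX2(unsigned VecTyBytes, unsigned LegalVTBytes,
                            int MemOpCost, int ShuffleSeqCost) {
  // Ceiling division: how many legal-width loads cover the interleaved vector.
  unsigned NumOfMemOps = (VecTyBytes + LegalVTBytes - 1) / LegalVTBytes;
  return NumOfMemOps * MemOpCost + ShuffleSeqCost;
}

int main() {
  // Factor 3, VF 8, i8 elements: 24 bytes of data over 16-byte legal vectors
  // gives 2 loads; the deinterleave sequence is costed at 9 as in the table.
  std::printf("%d\n", interleavedLoadCostAVX2(24, 16, /*MemOpCost=*/1, /*Shuffle=*/9)); // 11
}
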
2659 | | |
2660 | | // Get a cost estimate for interleaved load/store operations and strided loads.
2661 | | // \p Indices contains indices for strided load. |
2662 | | // \p Factor - the factor of interleaving. |
2663 | | // AVX-512 provides 3-src shuffles that significantly reduce the cost.
2664 | | int X86TTIImpl::getInterleavedMemoryOpCostAVX512(unsigned Opcode, Type *VecTy, |
2665 | | unsigned Factor, |
2666 | | ArrayRef<unsigned> Indices, |
2667 | | unsigned Alignment, |
2668 | 3 | unsigned AddressSpace) { |
2669 | 3 | |
2670 | 3 | // VecTy for interleave memop is <VF*Factor x Elt>. |
2671 | 3 | // So, for VF=4, Interleave Factor = 3, Element type = i32 we have |
2672 | 3 | // VecTy = <12 x i32>. |
2673 | 3 | |
2674 | 3 | // Calculate the number of memory operations (NumOfMemOps), required |
2675 | 3 | // for load/store the VecTy. |
2676 | 3 | MVT LegalVT = getTLI()->getTypeLegalizationCost(DL, VecTy).second; |
2677 | 3 | unsigned VecTySize = DL.getTypeStoreSize(VecTy); |
2678 | 3 | unsigned LegalVTSize = LegalVT.getStoreSize(); |
2679 | 3 | unsigned NumOfMemOps = (VecTySize + LegalVTSize - 1) / LegalVTSize; |
2680 | 3 | |
2681 | 3 | // Get the cost of one memory operation. |
2682 | 3 | Type *SingleMemOpTy = VectorType::get(VecTy->getVectorElementType(), |
2683 | 3 | LegalVT.getVectorNumElements()); |
2684 | 3 | unsigned MemOpCost = |
2685 | 3 | getMemoryOpCost(Opcode, SingleMemOpTy, Alignment, AddressSpace); |
2686 | 3 | |
2687 | 3 | if (Opcode == Instruction::Load3 ) { |
2688 | 3 | // Kind of shuffle depends on number of loaded values. |
2689 | 3 | // If we load the entire data in one register, we can use a 1-src shuffle. |
2690 | 3 | // Otherwise, we'll merge 2 sources in each operation. |
2691 | 3 | TTI::ShuffleKind ShuffleKind = |
2692 | 3 | (NumOfMemOps > 1) ? TTI::SK_PermuteTwoSrc3 : TTI::SK_PermuteSingleSrc0 ; |
2693 | 3 | |
2694 | 3 | unsigned ShuffleCost = |
2695 | 3 | getShuffleCost(ShuffleKind, SingleMemOpTy, 0, nullptr); |
2696 | 3 | |
2697 | 3 | unsigned NumOfLoadsInInterleaveGrp = |
2698 | 3 | Indices.size() ? Indices.size()3 : Factor0 ; |
2699 | 3 | Type *ResultTy = VectorType::get(VecTy->getVectorElementType(), |
2700 | 3 | VecTy->getVectorNumElements() / Factor); |
2701 | 3 | unsigned NumOfResults = |
2702 | 3 | getTLI()->getTypeLegalizationCost(DL, ResultTy).first * |
2703 | 3 | NumOfLoadsInInterleaveGrp; |
2704 | 3 | |
2705 | 3 | // About half of the loads may be folded into shuffles when we have only
2706 | 3 | // one result. If we have more than one result, we do not fold loads at all. |
2707 | 3 | unsigned NumOfUnfoldedLoads = |
2708 | 3 | NumOfResults > 1 ? NumOfMemOps0 : NumOfMemOps / 23 ; |
2709 | 3 | |
2710 | 3 | // Get a number of shuffle operations per result. |
2711 | 3 | unsigned NumOfShufflesPerResult = |
2712 | 3 | std::max((unsigned)1, (unsigned)(NumOfMemOps - 1)); |
2713 | 3 | |
2714 | 3 | // The SK_PermuteTwoSrc shuffle clobbers one of its source operands.
2715 | 3 | // When we have more than one destination, we need additional instructions |
2716 | 3 | // to keep sources. |
2717 | 3 | unsigned NumOfMoves = 0; |
2718 | 3 | if (NumOfResults > 1 && 3 ShuffleKind == TTI::SK_PermuteTwoSrc0 ) |
2719 | 0 | NumOfMoves = NumOfResults * NumOfShufflesPerResult / 2; |
2720 | 3 | |
2721 | 3 | int Cost = NumOfResults * NumOfShufflesPerResult * ShuffleCost + |
2722 | 3 | NumOfUnfoldedLoads * MemOpCost + NumOfMoves; |
2723 | 3 | |
2724 | 3 | return Cost; |
2725 | 3 | } |
2726 | 0 |
2727 | 0 | // Store. |
2728 | 3 | assert(Opcode == Instruction::Store && |
2729 | 0 | "Expected Store Instruction at this point"); |
2730 | 0 |
2731 | 0 | // There are no strided stores at the moment, and a store can't be folded
2732 | 0 | // into a shuffle.
2733 | 0 | unsigned NumOfSources = Factor; // The number of values to be merged. |
2734 | 0 | unsigned ShuffleCost = |
2735 | 0 | getShuffleCost(TTI::SK_PermuteTwoSrc, SingleMemOpTy, 0, nullptr); |
2736 | 0 | unsigned NumOfShufflesPerStore = NumOfSources - 1; |
2737 | 0 |
2738 | 0 | // The SK_PermuteTwoSrc shuffle clobbers one of its source operands.
2739 | 0 | // We need additional instructions to keep sources. |
2740 | 0 | unsigned NumOfMoves = NumOfMemOps * NumOfShufflesPerStore / 2; |
2741 | 0 | int Cost = NumOfMemOps * (MemOpCost + NumOfShufflesPerStore * ShuffleCost) + |
2742 | 0 | NumOfMoves; |
2743 | 0 | return Cost; |
2744 | 0 | } |
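
The AVX-512 load path above combines shuffle, load and register-move counts into one formula. A standalone sketch with every input reduced to a plain integer parameter (the values in main are illustrative):

#include <algorithm>
#include <cstdio>

int interleavedLoadCostAVX512(unsigned NumOfMemOps, unsigned NumOfResults,
                              int MemOpCost, int ShuffleCost) {
  // With a single result, roughly half of the loads fold into the shuffles.
  unsigned NumOfUnfoldedLoads = NumOfResults > 1 ? NumOfMemOps : NumOfMemOps / 2;
  unsigned NumOfShufflesPerResult = std::max(1u, NumOfMemOps - 1);
  unsigned NumOfMoves = 0;
  if (NumOfResults > 1 && NumOfMemOps > 1)   // two-source shuffles clobber a source
    NumOfMoves = NumOfResults * NumOfShufflesPerResult / 2;
  return NumOfResults * NumOfShufflesPerResult * ShuffleCost +
         NumOfUnfoldedLoads * MemOpCost + NumOfMoves;
}

int main() {
  // e.g. 2 memory ops, 1 result, unit costs: 1*1*1 + 1*1 + 0 = 2
  std::printf("%d\n", interleavedLoadCostAVX512(2, 1, 1, 1));
}
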
2745 | | |
2746 | | int X86TTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, |
2747 | | unsigned Factor, |
2748 | | ArrayRef<unsigned> Indices, |
2749 | | unsigned Alignment, |
2750 | 9 | unsigned AddressSpace) { |
2751 | 9 | auto isSupportedOnAVX512 = [](Type *VecTy, bool &RequiresBW) { |
2752 | 9 | RequiresBW = false; |
2753 | 9 | Type *EltTy = VecTy->getVectorElementType(); |
2754 | 9 | if (EltTy->isFloatTy() || 9 EltTy->isDoubleTy()9 || EltTy->isIntegerTy(64)9 || |
2755 | 9 | EltTy->isIntegerTy(32)6 || EltTy->isPointerTy()2 ) |
2756 | 7 | return true; |
2757 | 2 | if (2 EltTy->isIntegerTy(16) || 2 EltTy->isIntegerTy(8)2 ) { |
2758 | 0 | RequiresBW = true; |
2759 | 0 | return true; |
2760 | 0 | } |
2761 | 2 | return false; |
2762 | 2 | }; |
2763 | 9 | bool RequiresBW; |
2764 | 9 | bool HasAVX512Solution = isSupportedOnAVX512(VecTy, RequiresBW); |
2765 | 9 | if (ST->hasAVX512() && 9 HasAVX512Solution5 && (!RequiresBW || 3 ST->hasBWI()0 )) |
2766 | 3 | return getInterleavedMemoryOpCostAVX512(Opcode, VecTy, Factor, Indices, |
2767 | 3 | Alignment, AddressSpace); |
2768 | 6 | if (6 ST->hasAVX2()6 ) |
2769 | 2 | return getInterleavedMemoryOpCostAVX2(Opcode, VecTy, Factor, Indices, |
2770 | 2 | Alignment, AddressSpace); |
2771 | 4 | |
2772 | 4 | return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices, |
2773 | 4 | Alignment, AddressSpace); |
2774 | 4 | } |